pulldown_cmark/
firstpass.rs

Help
1//! The first pass resolves all block structure, generating an AST. Within a block, items
2//! are in a linear chain with potential inline markup identified.
3
4use std::cmp::max;
5use std::ops::Range;
6
7use crate::parse::{
8    scan_containers, Allocations, FootnoteDef, HeadingAttributes, Item, ItemBody, LinkDef,
9    LINK_MAX_NESTED_PARENS,
10};
11use crate::strings::CowStr;
12use crate::tree::{Tree, TreeIndex};
13use crate::Options;
14use crate::{
15    linklabel::{scan_link_label_rest, LinkLabel},
16    HeadingLevel,
17};
18use crate::{scanners::*, MetadataBlockKind};
19
20use unicase::UniCase;
21
22/// Runs the first pass, which resolves the block structure of the document,
23/// and returns the resulting tree.
24pub(crate) fn run_first_pass(text: &str, options: Options) -> (Tree<Item>, Allocations<'_>) {
25    // This is a very naive heuristic for the number of nodes
26    // we'll need.
27    let start_capacity = max(128, text.len() / 32);
28    let lookup_table = &create_lut(&options);
29    let first_pass = FirstPass {
30        text,
31        tree: Tree::with_capacity(start_capacity),
32        begin_list_item: None,
33        last_line_blank: false,
34        allocs: Allocations::new(),
35        options,
36        lookup_table,
37        next_paragraph_task: None,
38        brace_context_next: 0,
39        brace_context_stack: Vec::new(),
40    };
41    first_pass.run()
42}
43
44// Each level of brace nesting adds another entry to a hash table.
45// To limit the amount of memory consumed, do not create a new brace
46// context beyond some amount deep.
47//
48// There are actually two limits at play here: this one,
49// and the one where the maximum amount of distinct contexts passes
50// the 255 item limit imposed by using `u8`. When over 255 distinct
51// contexts are created, it wraps around, while this one instead makes it
52// saturate, which is a better behavior.
53const MATH_BRACE_CONTEXT_MAX_NESTING: usize = 25;
54
55/// State for the first parsing pass.
56struct FirstPass<'a, 'b> {
57    text: &'a str,
58    tree: Tree<Item>,
59    begin_list_item: Option<usize>,
60    last_line_blank: bool,
61    allocs: Allocations<'a>,
62    options: Options,
63    lookup_table: &'b LookupTable,
64    /// This is `Some(item)` when the next paragraph
65    /// starts with a task list marker.
66    next_paragraph_task: Option<Item>,
67    /// Math environment brace nesting.
68    brace_context_stack: Vec<u8>,
69    brace_context_next: usize,
70}
71
72impl<'a, 'b> FirstPass<'a, 'b> {
73    fn run(mut self) -> (Tree<Item>, Allocations<'a>) {
74        let mut ix = 0;
75        while ix < self.text.len() {
76            ix = self.parse_block(ix);
77        }
78        while self.tree.spine_len() > 0 {
79            self.pop(ix);
80        }
81        (self.tree, self.allocs)
82    }
83
84    /// Returns offset after block.
85    fn parse_block(&mut self, mut start_ix: usize) -> usize {
86        let bytes = self.text.as_bytes();
87        let mut line_start = LineStart::new(&bytes[start_ix..]);
88
89        // math spans and their braces are tracked only within a single block
90        self.brace_context_stack.clear();
91        self.brace_context_next = 0;
92
93        let i = scan_containers(
94            &self.tree,
95            &mut line_start,
96            self.options.has_gfm_footnotes(),
97        );
98        for _ in i..self.tree.spine_len() {
99            self.pop(start_ix);
100        }
101
102        if self.options.contains(Options::ENABLE_OLD_FOOTNOTES) {
103            // finish footnote if it's still open and was preceded by blank line
104            if let Some(node_ix) = self.tree.peek_up() {
105                if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
106                    if self.last_line_blank {
107                        self.pop(start_ix);
108                    }
109                }
110            }
111
112            // Footnote definitions of the form
113            // [^bar]:
114            // * anything really
115            let container_start = start_ix + line_start.bytes_scanned();
116            if let Some(bytecount) = self.parse_footnote(container_start) {
117                start_ix = container_start + bytecount;
118                start_ix += scan_blank_line(&bytes[start_ix..]).unwrap_or(0);
119                line_start = LineStart::new(&bytes[start_ix..]);
120            }
121        }
122
123        // Process new containers
124        loop {
125            let save = line_start.clone();
126            let outer_indent = line_start.scan_space_upto(4);
127            if outer_indent >= 4 {
128                line_start = save;
129                break;
130            }
131            if self.options.has_gfm_footnotes()
132                || self.options.contains(Options::ENABLE_OLD_FOOTNOTES)
133            {
134                // Footnote definitions of the form
135                // [^bar]:
136                //     * anything really
137                let container_start = start_ix + line_start.bytes_scanned();
138                if let Some(bytecount) = self.parse_footnote(container_start) {
139                    start_ix = container_start + bytecount;
140                    line_start = LineStart::new(&bytes[start_ix..]);
141                    continue;
142                }
143            }
144            let container_start = start_ix + line_start.bytes_scanned();
145            if let Some((ch, index, indent)) = line_start.scan_list_marker_with_indent(outer_indent)
146            {
147                let after_marker_index = start_ix + line_start.bytes_scanned();
148                self.continue_list(container_start - outer_indent, ch, index);
149                self.tree.append(Item {
150                    start: container_start - outer_indent,
151                    end: after_marker_index, // will get updated later if item not empty
152                    body: ItemBody::ListItem(indent),
153                });
154                self.tree.push();
155                if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) {
156                    self.begin_list_item = Some(after_marker_index + n);
157                    return after_marker_index + n;
158                }
159                if self.options.contains(Options::ENABLE_TASKLISTS) {
160                    let task_list_marker =
161                        line_start.scan_task_list_marker().map(|is_checked| Item {
162                            start: after_marker_index,
163                            end: start_ix + line_start.bytes_scanned(),
164                            body: ItemBody::TaskListMarker(is_checked),
165                        });
166                    if let Some(task_list_marker) = task_list_marker {
167                        if let Some(n) = scan_blank_line(&bytes[task_list_marker.end..]) {
168                            self.tree.append(task_list_marker);
169                            self.begin_list_item = Some(task_list_marker.end + n);
170                            return task_list_marker.end + n;
171                        } else {
172                            self.next_paragraph_task = Some(task_list_marker);
173                        }
174                    }
175                }
176            } else if let Some((indent, child, item)) = self
177                .options
178                .contains(Options::ENABLE_DEFINITION_LIST)
179                .then(|| {
180                    self.tree
181                        .cur()
182                        .map(|cur| (self.tree[cur].child, &mut self.tree[cur].item))
183                })
184                .flatten()
185                .filter(|(_, item)| {
186                    matches!(
187                        item,
188                        Item {
189                            body: ItemBody::Paragraph
190                                | ItemBody::MaybeDefinitionListTitle
191                                | ItemBody::DefinitionListDefinition(_),
192                            ..
193                        }
194                    )
195                })
196                .and_then(|item| {
197                    Some((
198                        line_start
199                            .scan_definition_list_definition_marker_with_indent(outer_indent)?,
200                        item.0,
201                        item.1,
202                    ))
203                })
204            {
205                match item.body {
206                    ItemBody::Paragraph => {
207                        item.body = ItemBody::DefinitionList(true);
208                        let Item { start, end, .. } = *item;
209                        let list_idx = self.tree.cur().unwrap();
210                        let title_idx = self.tree.create_node(Item {
211                            start,
212                            end, // will get updated later if item not empty
213                            body: ItemBody::DefinitionListTitle,
214                        });
215                        self.tree[title_idx].child = child;
216                        self.tree[list_idx].child = Some(title_idx);
217                        self.tree.push();
218                    }
219                    ItemBody::MaybeDefinitionListTitle => {
220                        item.body = ItemBody::DefinitionListTitle;
221                    }
222                    ItemBody::DefinitionListDefinition(_) => {}
223                    _ => unreachable!(),
224                }
225                let after_marker_index = start_ix + line_start.bytes_scanned();
226                self.tree.append(Item {
227                    start: container_start - outer_indent,
228                    end: after_marker_index, // will get updated later if item not empty
229                    body: ItemBody::DefinitionListDefinition(indent),
230                });
231                if let Some(ItemBody::DefinitionList(ref mut is_tight)) =
232                    self.tree.peek_up().map(|cur| &mut self.tree[cur].item.body)
233                {
234                    if self.last_line_blank {
235                        *is_tight = false;
236                        self.last_line_blank = false;
237                    }
238                }
239                self.tree.push();
240                if let Some(n) = scan_blank_line(&bytes[after_marker_index..]) {
241                    self.begin_list_item = Some(after_marker_index + n);
242                    return after_marker_index + n;
243                }
244            } else if line_start.scan_blockquote_marker() {
245                let kind = if self.options.contains(Options::ENABLE_GFM) {
246                    line_start.scan_blockquote_tag()
247                } else {
248                    None
249                };
250                self.finish_list(start_ix);
251                self.tree.append(Item {
252                    start: container_start,
253                    end: 0, // will get set later
254                    body: ItemBody::BlockQuote(kind),
255                });
256                self.tree.push();
257                if kind.is_some() {
258                    // blockquote tag leaves us at the end of the line
259                    // we need to scan through all the container syntax for the next line
260                    // and break out if we can't re-scan all of them
261                    let ix = start_ix + line_start.bytes_scanned();
262                    let mut lazy_line_start = LineStart::new(&bytes[ix..]);
263                    let current_container = scan_containers(
264                        &self.tree,
265                        &mut lazy_line_start,
266                        self.options.has_gfm_footnotes(),
267                    ) == self.tree.spine_len();
268                    if !lazy_line_start.scan_space(4)
269                        && self.scan_paragraph_interrupt(
270                            &bytes[ix + lazy_line_start.bytes_scanned()..],
271                            current_container,
272                        )
273                    {
274                        return ix;
275                    } else {
276                        // blockquote tags act as if they were nested in a paragraph
277                        // so you can lazily continue the imaginary paragraph off of them
278                        line_start = lazy_line_start;
279                        line_start.scan_all_space();
280                        start_ix = ix;
281                        break;
282                    }
283                }
284            } else {
285                line_start = save;
286                break;
287            }
288        }
289
290        let ix = start_ix + line_start.bytes_scanned();
291
292        if let Some(n) = scan_blank_line(&bytes[ix..]) {
293            if let Some(node_ix) = self.tree.peek_up() {
294                match &mut self.tree[node_ix].item.body {
295                    ItemBody::BlockQuote(..) => (),
296                    ItemBody::ListItem(indent) | ItemBody::DefinitionListDefinition(indent)
297                        if self.begin_list_item.is_some() =>
298                    {
299                        self.last_line_blank = true;
300                        // This is a blank list item.
301                        // While the list itself can be continued no matter how many blank lines
302                        // there are between this one and the next one, it cannot nest anything
303                        // else, so its indentation should not be subtracted from the line start.
304                        *indent = 0;
305                    }
306                    _ => {
307                        self.last_line_blank = true;
308                    }
309                }
310            } else {
311                self.last_line_blank = true;
312            }
313            return ix + n;
314        }
315
316        // Save `remaining_space` here to avoid needing to backtrack `line_start` for HTML blocks
317        let remaining_space = line_start.remaining_space();
318
319        let indent = line_start.scan_space_upto(4);
320        if indent == 4 {
321            self.finish_list(start_ix);
322            let ix = start_ix + line_start.bytes_scanned();
323            let remaining_space = line_start.remaining_space();
324            return self.parse_indented_code_block(ix, remaining_space);
325        }
326
327        let ix = start_ix + line_start.bytes_scanned();
328
329        // metadata blocks cannot be indented
330        if indent == 0 {
331            if let Some((_n, metadata_block_ch)) = scan_metadata_block(
332                &bytes[ix..],
333                self.options
334                    .contains(Options::ENABLE_YAML_STYLE_METADATA_BLOCKS),
335                self.options
336                    .contains(Options::ENABLE_PLUSES_DELIMITED_METADATA_BLOCKS),
337            ) {
338                self.finish_list(start_ix);
339                return self.parse_metadata_block(ix, metadata_block_ch);
340            }
341        }
342
343        // HTML Blocks
344        if bytes[ix] == b'<' {
345            // Types 1-5 are all detected by one function and all end with the same
346            // pattern
347            if let Some(html_end_tag) = get_html_end_tag(&bytes[(ix + 1)..]) {
348                self.finish_list(start_ix);
349                return self.parse_html_block_type_1_to_5(
350                    ix,
351                    html_end_tag,
352                    remaining_space,
353                    indent,
354                );
355            }
356
357            // Detect type 6
358            if starts_html_block_type_6(&bytes[(ix + 1)..]) {
359                self.finish_list(start_ix);
360                return self.parse_html_block_type_6_or_7(ix, remaining_space, indent);
361            }
362
363            // Detect type 7
364            if let Some(_html_bytes) = scan_html_type_7(&bytes[ix..]) {
365                self.finish_list(start_ix);
366                return self.parse_html_block_type_6_or_7(ix, remaining_space, indent);
367            }
368        }
369
370        if let Ok(n) = scan_hrule(&bytes[ix..]) {
371            self.finish_list(start_ix);
372            return self.parse_hrule(n, ix);
373        }
374
375        if let Some(atx_size) = scan_atx_heading(&bytes[ix..]) {
376            self.finish_list(start_ix);
377            return self.parse_atx_heading(ix, atx_size);
378        }
379
380        if let Some((n, fence_ch)) = scan_code_fence(&bytes[ix..]) {
381            self.finish_list(start_ix);
382            return self.parse_fenced_code_block(ix, indent, fence_ch, n);
383        }
384
385        // parse refdef
386        while let Some((bytecount, label, link_def)) =
387            self.parse_refdef_total(start_ix + line_start.bytes_scanned())
388        {
389            self.allocs.refdefs.0.entry(label).or_insert(link_def);
390            let container_start = start_ix + line_start.bytes_scanned();
391            let mut ix = container_start + bytecount;
392            // Refdefs act as if they were contained within a paragraph, for purposes of lazy
393            // continuations. For example:
394            //
395            // ```
396            // > [foo]: http://example.com
397            // bar
398            // ```
399            //
400            // is equivalent to
401            //
402            // ```
403            // > bar
404            //
405            // [foo]: http://example.com
406            // ```
407            if let Some(nl) = scan_blank_line(&bytes[ix..]) {
408                ix += nl;
409                let mut lazy_line_start = LineStart::new(&bytes[ix..]);
410                let current_container = scan_containers(
411                    &self.tree,
412                    &mut lazy_line_start,
413                    self.options.has_gfm_footnotes(),
414                ) == self.tree.spine_len();
415                if !lazy_line_start.scan_space(4)
416                    && self.scan_paragraph_interrupt(
417                        &bytes[ix + lazy_line_start.bytes_scanned()..],
418                        current_container,
419                    )
420                {
421                    self.finish_list(start_ix);
422                    return ix;
423                } else {
424                    line_start = lazy_line_start;
425                    line_start.scan_all_space();
426                    start_ix = ix;
427                }
428            } else {
429                self.finish_list(start_ix);
430                return ix;
431            }
432        }
433
434        let ix = start_ix + line_start.bytes_scanned();
435
436        self.parse_paragraph(ix)
437    }
438
439    /// Returns the offset of the first line after the table.
440    /// Assumptions: current focus is a table element and the table header
441    /// matches the separator line (same number of columns).
442    fn parse_table(
443        &mut self,
444        table_cols: usize,
445        head_start: usize,
446        body_start: usize,
447    ) -> Option<usize> {
448        // filled empty cells are limited to protect against quadratic growth
449        // https://github.com/raphlinus/pulldown-cmark/issues/832
450        let mut missing_empty_cells = 0;
451        // parse header. this shouldn't fail because we made sure the table header is ok
452        let (_sep_start, thead_ix) =
453            self.parse_table_row_inner(head_start, table_cols, &mut missing_empty_cells)?;
454        self.tree[thead_ix].item.body = ItemBody::TableHead;
455
456        // parse body
457        let mut ix = body_start;
458        while let Some((next_ix, _row_ix)) =
459            self.parse_table_row(ix, table_cols, &mut missing_empty_cells)
460        {
461            ix = next_ix;
462        }
463
464        self.pop(ix);
465        Some(ix)
466    }
467
468    /// Call this when containers are taken care of.
469    /// Returns bytes scanned, row_ix
470    fn parse_table_row_inner(
471        &mut self,
472        mut ix: usize,
473        row_cells: usize,
474        missing_empty_cells: &mut usize,
475    ) -> Option<(usize, TreeIndex)> {
476        // Limit to prevent a malicious input from causing a denial of service.
477        const MAX_AUTOCOMPLETED_CELLS: usize = 1 << 18; // = 0x40000
478
479        let bytes = self.text.as_bytes();
480        let mut cells = 0;
481        let mut final_cell_ix = None;
482
483        let old_cur = self.tree.cur();
484        let row_ix = self.tree.append(Item {
485            start: ix,
486            end: 0, // set at end of this function
487            body: ItemBody::TableRow,
488        });
489        self.tree.push();
490
491        loop {
492            ix += scan_ch(&bytes[ix..], b'|');
493            let start_ix = ix;
494            ix += scan_whitespace_no_nl(&bytes[ix..]);
495
496            if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
497                ix += eol_bytes;
498                break;
499            }
500
501            let cell_ix = self.tree.append(Item {
502                start: start_ix,
503                end: ix,
504                body: ItemBody::TableCell,
505            });
506            self.tree.push();
507            let (next_ix, _brk) = self.parse_line(ix, None, TableParseMode::Active);
508
509            self.tree[cell_ix].item.end = next_ix;
510            self.tree.pop();
511
512            ix = next_ix;
513            cells += 1;
514
515            if cells == row_cells {
516                final_cell_ix = Some(cell_ix);
517            }
518        }
519
520        if let (Some(cur), 0) = (old_cur, cells) {
521            self.pop(ix);
522            self.tree[cur].next = None;
523            return None;
524        }
525
526        // fill empty cells if needed
527        // note: this is where GFM and commonmark-extra diverge. we follow
528        // GFM here
529        for _ in cells..row_cells {
530            if *missing_empty_cells >= MAX_AUTOCOMPLETED_CELLS {
531                return None;
532            }
533            *missing_empty_cells += 1;
534            self.tree.append(Item {
535                start: ix,
536                end: ix,
537                body: ItemBody::TableCell,
538            });
539        }
540
541        // drop excess cells
542        if let Some(cell_ix) = final_cell_ix {
543            self.tree[cell_ix].next = None;
544        }
545
546        self.pop(ix);
547
548        Some((ix, row_ix))
549    }
550
551    /// Returns first offset after the row and the tree index of the row.
552    fn parse_table_row(
553        &mut self,
554        mut ix: usize,
555        row_cells: usize,
556        missing_empty_cells: &mut usize,
557    ) -> Option<(usize, TreeIndex)> {
558        let bytes = self.text.as_bytes();
559        let mut line_start = LineStart::new(&bytes[ix..]);
560        let current_container = scan_containers(
561            &self.tree,
562            &mut line_start,
563            self.options.has_gfm_footnotes(),
564        ) == self.tree.spine_len();
565        if !current_container {
566            return None;
567        }
568        line_start.scan_all_space();
569        ix += line_start.bytes_scanned();
570        if scan_paragraph_interrupt_no_table(
571            &bytes[ix..],
572            current_container,
573            self.options.contains(Options::ENABLE_FOOTNOTES),
574            self.options.contains(Options::ENABLE_DEFINITION_LIST),
575            &self.tree,
576        ) {
577            return None;
578        }
579
580        let (ix, row_ix) = self.parse_table_row_inner(ix, row_cells, missing_empty_cells)?;
581        Some((ix, row_ix))
582    }
583
584    /// Returns offset of line start after paragraph.
585    fn parse_paragraph(&mut self, start_ix: usize) -> usize {
586        let body = if let Some(ItemBody::DefinitionList(_)) =
587            self.tree.peek_up().map(|idx| self.tree[idx].item.body)
588        {
589            if self.tree.cur().map_or(true, |idx| {
590                matches!(
591                    &self.tree[idx].item.body,
592                    ItemBody::DefinitionListDefinition(..)
593                )
594            }) {
595                // blank lines between the previous definition and this one don't count
596                self.last_line_blank = false;
597                ItemBody::MaybeDefinitionListTitle
598            } else {
599                self.finish_list(start_ix);
600                ItemBody::Paragraph
601            }
602        } else {
603            self.finish_list(start_ix);
604            ItemBody::Paragraph
605        };
606        let node_ix = self.tree.append(Item {
607            start: start_ix,
608            end: 0, // will get set later
609            body,
610        });
611        self.tree.push();
612
613        if let Some(item) = self.next_paragraph_task {
614            self.tree.append(item);
615            self.next_paragraph_task = None;
616        }
617
618        let bytes = self.text.as_bytes();
619        let mut ix = start_ix;
620        loop {
621            let scan_mode = if self.options.contains(Options::ENABLE_TABLES) && ix == start_ix {
622                TableParseMode::Scan
623            } else {
624                TableParseMode::Disabled
625            };
626            let (next_ix, brk) = self.parse_line(ix, None, scan_mode);
627
628            // break out when we find a table
629            if let Some(Item {
630                body: ItemBody::Table(alignment_ix),
631                ..
632            }) = brk
633            {
634                let table_cols = self.allocs[alignment_ix].len();
635                self.tree[node_ix].item.body = ItemBody::Table(alignment_ix);
636                // this clears out any stuff we may have appended - but there may
637                // be a cleaner way
638                self.tree[node_ix].child = None;
639                self.tree.pop();
640                if body == ItemBody::MaybeDefinitionListTitle {
641                    self.finish_list(ix);
642                }
643                self.tree.push();
644                if let Some(ix) = self.parse_table(table_cols, ix, next_ix) {
645                    return ix;
646                }
647            }
648
649            ix = next_ix;
650            let mut line_start = LineStart::new(&bytes[ix..]);
651            let current_container = scan_containers(
652                &self.tree,
653                &mut line_start,
654                self.options.has_gfm_footnotes(),
655            ) == self.tree.spine_len();
656            let trailing_backslash_pos = match brk {
657                Some(Item {
658                    start,
659                    body: ItemBody::HardBreak(true),
660                    ..
661                }) if bytes[start] == b'\\' => Some(start),
662                _ => None,
663            };
664            if !line_start.scan_space(4) {
665                let ix_new = ix + line_start.bytes_scanned();
666                if current_container {
667                    if let Some(ix_setext) =
668                        self.parse_setext_heading(ix_new, node_ix, trailing_backslash_pos.is_some())
669                    {
670                        if let Some(pos) = trailing_backslash_pos {
671                            self.tree.append_text(pos, pos + 1, false);
672                        }
673                        self.pop(ix_setext);
674                        if body == ItemBody::MaybeDefinitionListTitle {
675                            self.finish_list(ix);
676                        }
677                        return ix_setext;
678                    }
679                }
680                // first check for non-empty lists, then for other interrupts
681                let suffix = &bytes[ix_new..];
682                if self.scan_paragraph_interrupt(suffix, current_container) {
683                    if let Some(pos) = trailing_backslash_pos {
684                        self.tree.append_text(pos, pos + 1, false);
685                    }
686                    break;
687                }
688            }
689            line_start.scan_all_space();
690            if line_start.is_at_eol() {
691                if let Some(pos) = trailing_backslash_pos {
692                    self.tree.append_text(pos, pos + 1, false);
693                }
694                break;
695            }
696            ix = next_ix + line_start.bytes_scanned();
697            if let Some(item) = brk {
698                self.tree.append(item);
699            }
700        }
701
702        self.pop(ix);
703        ix
704    }
705
706    /// Returns end ix of setext_heading on success.
707    fn parse_setext_heading(
708        &mut self,
709        ix: usize,
710        node_ix: TreeIndex,
711        has_trailing_content: bool,
712    ) -> Option<usize> {
713        let bytes = self.text.as_bytes();
714        let (n, level) = scan_setext_heading(&bytes[ix..])?;
715        let mut attrs = None;
716
717        if let Some(cur_ix) = self.tree.cur() {
718            let parent_ix = self.tree.peek_up().unwrap();
719            let header_start = self.tree[parent_ix].item.start;
720            // Note that `self.tree[parent_ix].item.end` might be zero at this point.
721            // Use the end position of the current node (i.e. the last known child
722            // of the parent) instead.
723            let header_end = self.tree[cur_ix].item.end;
724
725            // extract the trailing attribute block
726            let (content_end, attrs_) =
727                self.extract_and_parse_heading_attribute_block(header_start, header_end);
728            attrs = attrs_;
729
730            // strip trailing whitespace
731            let new_end = if has_trailing_content {
732                content_end
733            } else {
734                let mut last_line_start = header_start;
735                if attrs.is_some() {
736                    loop {
737                        let next_line_start =
738                            last_line_start + scan_nextline(&bytes[last_line_start..content_end]);
739                        if next_line_start >= content_end {
740                            break;
741                        }
742                        let mut line_start = LineStart::new(&bytes[next_line_start..content_end]);
743                        if scan_containers(
744                            &self.tree,
745                            &mut line_start,
746                            self.options.has_gfm_footnotes(),
747                        ) != self.tree.spine_len()
748                        {
749                            break;
750                        }
751                        last_line_start = next_line_start + line_start.bytes_scanned();
752                    }
753                }
754                let trailing_ws = scan_rev_while(
755                    &bytes[last_line_start..content_end],
756                    is_ascii_whitespace_no_nl,
757                );
758                content_end - trailing_ws
759            };
760
761            if attrs.is_some() {
762                // remove trailing block attributes
763                self.tree.truncate_siblings(new_end);
764            }
765
766            if let Some(cur_ix) = self.tree.cur() {
767                self.tree[cur_ix].item.end = new_end;
768            }
769        }
770
771        self.tree[node_ix].item.body = ItemBody::Heading(
772            level,
773            attrs.map(|attrs| self.allocs.allocate_heading(attrs)),
774        );
775
776        Some(ix + n)
777    }
778
779    /// Parse a line of input, appending text and items to tree.
780    ///
781    /// Returns: index after line and an item representing the break.
782    fn parse_line(
783        &mut self,
784        start: usize,
785        end: Option<usize>,
786        mode: TableParseMode,
787    ) -> (usize, Option<Item>) {
788        let bytes = self.text.as_bytes();
789        let bytes = match end {
790            Some(end) => &bytes[..end],
791            None => bytes,
792        };
793        let bytes_len = bytes.len();
794        let mut pipes = 0;
795        let mut last_pipe_ix = start;
796        let mut begin_text = start;
797        let mut backslash_escaped = false;
798
799        let (final_ix, brk) = iterate_special_bytes(self.lookup_table, bytes, start, |ix, byte| {
800            match byte {
801                b'\n' | b'\r' => {
802                    if let TableParseMode::Active = mode {
803                        return LoopInstruction::BreakAtWith(ix, None);
804                    }
805
806                    let mut i = ix;
807                    let eol_bytes = scan_eol(&bytes[ix..]).unwrap();
808
809                    let end_ix = ix + eol_bytes;
810                    let trailing_backslashes = scan_rev_while(&bytes[..ix], |b| b == b'\\');
811                    if trailing_backslashes % 2 == 1 && end_ix < bytes_len {
812                        i -= 1;
813                        self.tree.append_text(begin_text, i, backslash_escaped);
814                        backslash_escaped = false;
815                        return LoopInstruction::BreakAtWith(
816                            end_ix,
817                            Some(Item {
818                                start: i,
819                                end: end_ix,
820                                body: ItemBody::HardBreak(true),
821                            }),
822                        );
823                    }
824
825                    if mode == TableParseMode::Scan && pipes > 0 {
826                        // check if we may be parsing a table
827                        let next_line_ix = ix + eol_bytes;
828                        let mut line_start = LineStart::new(&bytes[next_line_ix..]);
829                        if scan_containers(
830                            &self.tree,
831                            &mut line_start,
832                            self.options.has_gfm_footnotes(),
833                        ) == self.tree.spine_len()
834                        {
835                            let table_head_ix = next_line_ix + line_start.bytes_scanned();
836                            let (table_head_bytes, alignment) =
837                                scan_table_head(&bytes[table_head_ix..]);
838
839                            if table_head_bytes > 0 {
840                                // computing header count from number of pipes
841                                let header_count =
842                                    count_header_cols(bytes, pipes, start, last_pipe_ix);
843
844                                // make sure they match the number of columns we find in separator line
845                                if alignment.len() == header_count {
846                                    let alignment_ix = self.allocs.allocate_alignment(alignment);
847                                    let end_ix = table_head_ix + table_head_bytes;
848                                    return LoopInstruction::BreakAtWith(
849                                        end_ix,
850                                        Some(Item {
851                                            start: i,
852                                            end: end_ix, // must update later
853                                            body: ItemBody::Table(alignment_ix),
854                                        }),
855                                    );
856                                }
857                            }
858                        }
859                    }
860
861                    let trailing_whitespace =
862                        scan_rev_while(&bytes[..ix], is_ascii_whitespace_no_nl);
863                    if trailing_whitespace >= 2 {
864                        i -= trailing_whitespace;
865                        self.tree.append_text(begin_text, i, backslash_escaped);
866                        backslash_escaped = false;
867                        return LoopInstruction::BreakAtWith(
868                            end_ix,
869                            Some(Item {
870                                start: i,
871                                end: end_ix,
872                                body: ItemBody::HardBreak(false),
873                            }),
874                        );
875                    }
876
877                    self.tree
878                        .append_text(begin_text, ix - trailing_whitespace, backslash_escaped);
879                    backslash_escaped = false;
880
881                    LoopInstruction::BreakAtWith(
882                        end_ix,
883                        Some(Item {
884                            start: i,
885                            end: end_ix,
886                            body: ItemBody::SoftBreak,
887                        }),
888                    )
889                }
890                b'\\' => {
891                    if ix + 1 < bytes_len && is_ascii_punctuation(bytes[ix + 1]) {
892                        self.tree.append_text(begin_text, ix, backslash_escaped);
893                        if bytes[ix + 1] == b'`' {
894                            let count = 1 + scan_ch_repeat(&bytes[(ix + 2)..], b'`');
895                            self.tree.append(Item {
896                                start: ix + 1,
897                                end: ix + count + 1,
898                                body: ItemBody::MaybeCode(count, true),
899                            });
900                            begin_text = ix + 1 + count;
901                            backslash_escaped = false;
902                            LoopInstruction::ContinueAndSkip(count)
903                        } else if bytes[ix + 1] == b'|' && TableParseMode::Active == mode {
904                            // Yeah, it's super weird that backslash escaped pipes in tables aren't "real"
905                            // backslash escapes.
906                            //
907                            // This tree structure is intended for the benefit of inline analysis, and it
908                            // is supposed to operate as-if backslash escaped pipes were stripped out in a
909                            // separate pass.
910                            begin_text = ix + 1;
911                            backslash_escaped = false;
912                            LoopInstruction::ContinueAndSkip(1)
913                        } else if ix + 2 < bytes_len
914                            && bytes[ix + 1] == b'\\'
915                            && bytes[ix + 2] == b'|'
916                            && TableParseMode::Active == mode
917                        {
918                            // To parse `\\|`, discard the backslashes and parse the `|` that follows it.
919                            begin_text = ix + 2;
920                            backslash_escaped = true;
921                            LoopInstruction::ContinueAndSkip(2)
922                        } else {
923                            begin_text = ix + 1;
924                            backslash_escaped = true;
925                            LoopInstruction::ContinueAndSkip(1)
926                        }
927                    } else {
928                        LoopInstruction::ContinueAndSkip(0)
929                    }
930                }
931                c @ b'*' | c @ b'_' | c @ b'~' => {
932                    let string_suffix = &self.text[ix..];
933                    let count = 1 + scan_ch_repeat(&string_suffix.as_bytes()[1..], c);
934                    let can_open = delim_run_can_open(
935                        &self.text[start..],
936                        string_suffix,
937                        count,
938                        ix - start,
939                        mode,
940                    );
941                    let can_close = delim_run_can_close(
942                        &self.text[start..],
943                        string_suffix,
944                        count,
945                        ix - start,
946                        mode,
947                    );
948                    let is_valid_seq = c != b'~' || count <= 2;
949
950                    if (can_open || can_close) && is_valid_seq {
951                        self.tree.append_text(begin_text, ix, backslash_escaped);
952                        backslash_escaped = false;
953                        for i in 0..count {
954                            self.tree.append(Item {
955                                start: ix + i,
956                                end: ix + i + 1,
957                                body: ItemBody::MaybeEmphasis(count - i, can_open, can_close),
958                            });
959                        }
960                        begin_text = ix + count;
961                    }
962                    LoopInstruction::ContinueAndSkip(count - 1)
963                }
964                b'$' => {
965                    let string_suffix = &self.text[ix..];
966                    let can_open = !string_suffix[1..]
967                        .as_bytes()
968                        .first()
969                        .copied()
970                        .map_or(true, is_ascii_whitespace);
971                    let can_close = ix > start
972                        && !self.text[..ix]
973                            .as_bytes()
974                            .last()
975                            .copied()
976                            .map_or(true, is_ascii_whitespace);
977
978                    // 0xFFFF_FFFF... represents the root brace context. Using None would require
979                    // storing Option<u8>, which is bigger than u8.
980                    //
981                    // These shouldn't conflict unless you have 255 levels of nesting, which is
982                    // past the intended limit anyway.
983                    //
984                    // Unbalanced braces will cause the root to be changed, which is why it gets
985                    // stored here.
986                    let brace_context =
987                        if self.brace_context_stack.len() > MATH_BRACE_CONTEXT_MAX_NESTING {
988                            self.brace_context_next as u8
989                        } else {
990                            self.brace_context_stack.last().copied().unwrap_or_else(|| {
991                                self.brace_context_stack.push(!0);
992                                !0
993                            })
994                        };
995
996                    self.tree.append_text(begin_text, ix, backslash_escaped);
997                    self.tree.append(Item {
998                        start: ix,
999                        end: ix + 1,
1000                        body: ItemBody::MaybeMath(can_open, can_close, brace_context),
1001                    });
1002                    begin_text = ix + 1;
1003                    LoopInstruction::ContinueAndSkip(0)
1004                }
1005                b'{' => {
1006                    if self.brace_context_stack.len() == MATH_BRACE_CONTEXT_MAX_NESTING {
1007                        self.brace_context_stack.push(self.brace_context_next as u8);
1008                        self.brace_context_next = MATH_BRACE_CONTEXT_MAX_NESTING;
1009                    } else if self.brace_context_stack.len() > MATH_BRACE_CONTEXT_MAX_NESTING {
1010                        // When we reach the limit of nesting, switch from actually matching
1011                        // braces to just counting them.
1012                        self.brace_context_next += 1;
1013                    } else if !self.brace_context_stack.is_empty() {
1014                        // Store nothing if no math environment has been reached yet.
1015                        self.brace_context_stack.push(self.brace_context_next as u8);
1016                        self.brace_context_next += 1;
1017                    }
1018                    LoopInstruction::ContinueAndSkip(0)
1019                }
1020                b'}' => {
1021                    if let &mut [ref mut top_level_context] = &mut self.brace_context_stack[..] {
1022                        // Unbalanced Braces
1023                        //
1024                        // The initial, root top-level brace context is -1, but this is changed whenever an unbalanced
1025                        // close brace is encountered:
1026                        //
1027                        //     This is not a math environment: $}$
1028                        //     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^|^
1029                        //     -1                               |-2
1030                        //
1031                        // To ensure this can't get parsed as math, each side of the unbalanced
1032                        // brace is an irreversibly separate brace context. As long as the math
1033                        // environment itself contains balanced braces, they should share a top level context.
1034                        //
1035                        //     Math environment contains 2+2: $}$2+2$
1036                        //                                       ^^^ this is a math environment
1037                        *top_level_context = top_level_context.wrapping_sub(1);
1038                    } else if self.brace_context_stack.len() > MATH_BRACE_CONTEXT_MAX_NESTING {
1039                        // When we exceed 25 levels of nesting, switch from accurately balancing braces
1040                        // to just counting them. When we dip back below the limit, switch back.
1041                        if self.brace_context_next <= MATH_BRACE_CONTEXT_MAX_NESTING {
1042                            self.brace_context_stack.pop();
1043                        } else {
1044                            self.brace_context_next -= 1;
1045                        }
1046                    } else {
1047                        self.brace_context_stack.pop();
1048                    }
1049                    LoopInstruction::ContinueAndSkip(0)
1050                }
1051                b'`' => {
1052                    self.tree.append_text(begin_text, ix, backslash_escaped);
1053                    backslash_escaped = false;
1054                    let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'`');
1055                    self.tree.append(Item {
1056                        start: ix,
1057                        end: ix + count,
1058                        body: ItemBody::MaybeCode(count, false),
1059                    });
1060                    begin_text = ix + count;
1061                    LoopInstruction::ContinueAndSkip(count - 1)
1062                }
1063                b'<' if bytes.get(ix + 1) != Some(&b'\\') => {
1064                    // Note: could detect some non-HTML cases and early escape here, but not
1065                    // clear that's a win.
1066                    self.tree.append_text(begin_text, ix, backslash_escaped);
1067                    backslash_escaped = false;
1068                    self.tree.append(Item {
1069                        start: ix,
1070                        end: ix + 1,
1071                        body: ItemBody::MaybeHtml,
1072                    });
1073                    begin_text = ix + 1;
1074                    LoopInstruction::ContinueAndSkip(0)
1075                }
1076                b'!' => {
1077                    if ix + 1 < bytes_len && bytes[ix + 1] == b'[' {
1078                        self.tree.append_text(begin_text, ix, backslash_escaped);
1079                        backslash_escaped = false;
1080                        self.tree.append(Item {
1081                            start: ix,
1082                            end: ix + 2,
1083                            body: ItemBody::MaybeImage,
1084                        });
1085                        begin_text = ix + 2;
1086                        LoopInstruction::ContinueAndSkip(1)
1087                    } else {
1088                        LoopInstruction::ContinueAndSkip(0)
1089                    }
1090                }
1091                b'[' => {
1092                    self.tree.append_text(begin_text, ix, backslash_escaped);
1093                    backslash_escaped = false;
1094                    self.tree.append(Item {
1095                        start: ix,
1096                        end: ix + 1,
1097                        body: ItemBody::MaybeLinkOpen,
1098                    });
1099                    begin_text = ix + 1;
1100                    LoopInstruction::ContinueAndSkip(0)
1101                }
1102                b']' => {
1103                    self.tree.append_text(begin_text, ix, backslash_escaped);
1104                    backslash_escaped = false;
1105                    self.tree.append(Item {
1106                        start: ix,
1107                        end: ix + 1,
1108                        body: ItemBody::MaybeLinkClose(true),
1109                    });
1110                    begin_text = ix + 1;
1111                    LoopInstruction::ContinueAndSkip(0)
1112                }
1113                b'&' => match scan_entity(&bytes[ix..]) {
1114                    (n, Some(value)) => {
1115                        self.tree.append_text(begin_text, ix, backslash_escaped);
1116                        backslash_escaped = false;
1117                        self.tree.append(Item {
1118                            start: ix,
1119                            end: ix + n,
1120                            body: ItemBody::SynthesizeText(self.allocs.allocate_cow(value)),
1121                        });
1122                        begin_text = ix + n;
1123                        LoopInstruction::ContinueAndSkip(n - 1)
1124                    }
1125                    _ => LoopInstruction::ContinueAndSkip(0),
1126                },
1127                b'|' => {
1128                    if ix != 0 && bytes[ix - 1] == b'\\' {
1129                        LoopInstruction::ContinueAndSkip(0)
1130                    } else if let TableParseMode::Active = mode {
1131                        LoopInstruction::BreakAtWith(ix, None)
1132                    } else {
1133                        last_pipe_ix = ix;
1134                        pipes += 1;
1135                        LoopInstruction::ContinueAndSkip(0)
1136                    }
1137                }
1138                b'.' => {
1139                    if ix + 2 < bytes.len() && bytes[ix + 1] == b'.' && bytes[ix + 2] == b'.' {
1140                        self.tree.append_text(begin_text, ix, backslash_escaped);
1141                        backslash_escaped = false;
1142                        self.tree.append(Item {
1143                            start: ix,
1144                            end: ix + 3,
1145                            body: ItemBody::SynthesizeChar('…'),
1146                        });
1147                        begin_text = ix + 3;
1148                        LoopInstruction::ContinueAndSkip(2)
1149                    } else {
1150                        LoopInstruction::ContinueAndSkip(0)
1151                    }
1152                }
1153                b'-' => {
1154                    let count = 1 + scan_ch_repeat(&bytes[(ix + 1)..], b'-');
1155                    if count == 1 {
1156                        LoopInstruction::ContinueAndSkip(0)
1157                    } else {
1158                        let itembody = if count == 2 {
1159                            ItemBody::SynthesizeChar('–')
1160                        } else if count == 3 {
1161                            ItemBody::SynthesizeChar('—')
1162                        } else {
1163                            let (ems, ens) = match count % 6 {
1164                                0 | 3 => (count / 3, 0),
1165                                2 | 4 => (0, count / 2),
1166                                1 => (count / 3 - 1, 2),
1167                                _ => (count / 3, 1),
1168                            };
1169                            // – and — are 3 bytes each in utf8
1170                            let mut buf = String::with_capacity(3 * (ems + ens));
1171                            for _ in 0..ems {
1172                                buf.push('—');
1173                            }
1174                            for _ in 0..ens {
1175                                buf.push('–');
1176                            }
1177                            ItemBody::SynthesizeText(self.allocs.allocate_cow(buf.into()))
1178                        };
1179
1180                        self.tree.append_text(begin_text, ix, backslash_escaped);
1181                        backslash_escaped = false;
1182                        self.tree.append(Item {
1183                            start: ix,
1184                            end: ix + count,
1185                            body: itembody,
1186                        });
1187                        begin_text = ix + count;
1188                        LoopInstruction::ContinueAndSkip(count - 1)
1189                    }
1190                }
1191                c @ b'\'' | c @ b'"' => {
1192                    let string_suffix = &self.text[ix..];
1193                    let can_open =
1194                        delim_run_can_open(&self.text[start..], string_suffix, 1, ix - start, mode);
1195                    let can_close = delim_run_can_close(
1196                        &self.text[start..],
1197                        string_suffix,
1198                        1,
1199                        ix - start,
1200                        mode,
1201                    );
1202
1203                    self.tree.append_text(begin_text, ix, backslash_escaped);
1204                    backslash_escaped = false;
1205                    self.tree.append(Item {
1206                        start: ix,
1207                        end: ix + 1,
1208                        body: ItemBody::MaybeSmartQuote(c, can_open, can_close),
1209                    });
1210                    begin_text = ix + 1;
1211
1212                    LoopInstruction::ContinueAndSkip(0)
1213                }
1214                _ => LoopInstruction::ContinueAndSkip(0),
1215            }
1216        });
1217
1218        if brk.is_none() {
1219            let trailing_whitespace =
1220                scan_rev_while(&bytes[begin_text..final_ix], is_ascii_whitespace_no_nl);
1221            // need to close text at eof
1222            self.tree.append_text(
1223                begin_text,
1224                final_ix - trailing_whitespace,
1225                backslash_escaped,
1226            );
1227        }
1228        (final_ix, brk)
1229    }
1230
1231    /// When start_ix is at the beginning of an HTML block of type 1 to 5,
1232    /// this will find the end of the block, adding the block itself to the
1233    /// tree and also keeping track of the lines of HTML within the block.
1234    ///
1235    /// The html_end_tag is the tag that must be found on a line to end the block.
1236    fn parse_html_block_type_1_to_5(
1237        &mut self,
1238        start_ix: usize,
1239        html_end_tag: &str,
1240        mut remaining_space: usize,
1241        mut indent: usize,
1242    ) -> usize {
1243        self.tree.append(Item {
1244            start: start_ix,
1245            end: 0, // set later
1246            body: ItemBody::HtmlBlock,
1247        });
1248        self.tree.push();
1249
1250        let bytes = self.text.as_bytes();
1251        let mut ix = start_ix;
1252        let end_ix;
1253        loop {
1254            let line_start_ix = ix;
1255            ix += scan_nextline(&bytes[ix..]);
1256            self.append_html_line(remaining_space.max(indent), line_start_ix, ix);
1257
1258            let mut line_start = LineStart::new(&bytes[ix..]);
1259            let n_containers = scan_containers(
1260                &self.tree,
1261                &mut line_start,
1262                self.options.has_gfm_footnotes(),
1263            );
1264            if n_containers < self.tree.spine_len() {
1265                end_ix = ix;
1266                break;
1267            }
1268
1269            if self.text[line_start_ix..ix].contains(html_end_tag) {
1270                end_ix = ix;
1271                break;
1272            }
1273
1274            let next_line_ix = ix + line_start.bytes_scanned();
1275            if next_line_ix == self.text.len() {
1276                end_ix = next_line_ix;
1277                break;
1278            }
1279            ix = next_line_ix;
1280            remaining_space = line_start.remaining_space();
1281            indent = 0;
1282        }
1283        self.pop(end_ix);
1284        ix
1285    }
1286
1287    /// When start_ix is at the beginning of an HTML block of type 6 or 7,
1288    /// this will consume lines until there is a blank line and keep track of
1289    /// the HTML within the block.
1290    fn parse_html_block_type_6_or_7(
1291        &mut self,
1292        start_ix: usize,
1293        mut remaining_space: usize,
1294        mut indent: usize,
1295    ) -> usize {
1296        self.tree.append(Item {
1297            start: start_ix,
1298            end: 0, // set later
1299            body: ItemBody::HtmlBlock,
1300        });
1301        self.tree.push();
1302
1303        let bytes = self.text.as_bytes();
1304        let mut ix = start_ix;
1305        let end_ix;
1306        loop {
1307            let line_start_ix = ix;
1308            ix += scan_nextline(&bytes[ix..]);
1309            self.append_html_line(remaining_space.max(indent), line_start_ix, ix);
1310
1311            let mut line_start = LineStart::new(&bytes[ix..]);
1312            let n_containers = scan_containers(
1313                &self.tree,
1314                &mut line_start,
1315                self.options.has_gfm_footnotes(),
1316            );
1317            if n_containers < self.tree.spine_len() || line_start.is_at_eol() {
1318                end_ix = ix;
1319                break;
1320            }
1321
1322            let next_line_ix = ix + line_start.bytes_scanned();
1323            if next_line_ix == self.text.len() || scan_blank_line(&bytes[next_line_ix..]).is_some()
1324            {
1325                end_ix = next_line_ix;
1326                break;
1327            }
1328            ix = next_line_ix;
1329            remaining_space = line_start.remaining_space();
1330            indent = 0;
1331        }
1332        self.pop(end_ix);
1333        ix
1334    }
1335
1336    fn parse_indented_code_block(&mut self, start_ix: usize, mut remaining_space: usize) -> usize {
1337        self.tree.append(Item {
1338            start: start_ix,
1339            end: 0, // will get set later
1340            body: ItemBody::IndentCodeBlock,
1341        });
1342        self.tree.push();
1343        let bytes = self.text.as_bytes();
1344        let mut last_nonblank_child = None;
1345        let mut last_nonblank_ix = 0;
1346        let mut end_ix = 0;
1347        self.last_line_blank = false;
1348
1349        let mut ix = start_ix;
1350        loop {
1351            let line_start_ix = ix;
1352            ix += scan_nextline(&bytes[ix..]);
1353            self.append_code_text(remaining_space, line_start_ix, ix);
1354            // TODO(spec clarification): should we synthesize newline at EOF?
1355
1356            if !self.last_line_blank {
1357                last_nonblank_child = self.tree.cur();
1358                last_nonblank_ix = ix;
1359                end_ix = ix;
1360            }
1361
1362            let mut line_start = LineStart::new(&bytes[ix..]);
1363            let n_containers = scan_containers(
1364                &self.tree,
1365                &mut line_start,
1366                self.options.has_gfm_footnotes(),
1367            );
1368            if n_containers < self.tree.spine_len()
1369                || !(line_start.scan_space(4) || line_start.is_at_eol())
1370            {
1371                break;
1372            }
1373            let next_line_ix = ix + line_start.bytes_scanned();
1374            if next_line_ix == self.text.len() {
1375                break;
1376            }
1377            ix = next_line_ix;
1378            remaining_space = line_start.remaining_space();
1379            self.last_line_blank = scan_blank_line(&bytes[ix..]).is_some();
1380        }
1381
1382        // Trim trailing blank lines.
1383        if let Some(child) = last_nonblank_child {
1384            self.tree[child].next = None;
1385            self.tree[child].item.end = last_nonblank_ix;
1386        }
1387        self.pop(end_ix);
1388        ix
1389    }
1390
1391    fn parse_fenced_code_block(
1392        &mut self,
1393        start_ix: usize,
1394        indent: usize,
1395        fence_ch: u8,
1396        n_fence_char: usize,
1397    ) -> usize {
1398        let bytes = self.text.as_bytes();
1399        let mut info_start = start_ix + n_fence_char;
1400        info_start += scan_whitespace_no_nl(&bytes[info_start..]);
1401        // TODO: info strings are typically very short. wouldn't it be faster
1402        // to just do a forward scan here?
1403        let mut ix = info_start + scan_nextline(&bytes[info_start..]);
1404        let info_end = ix - scan_rev_while(&bytes[info_start..ix], is_ascii_whitespace);
1405        let info_string = unescape(&self.text[info_start..info_end], self.tree.is_in_table());
1406        self.tree.append(Item {
1407            start: start_ix,
1408            end: 0, // will get set later
1409            body: ItemBody::FencedCodeBlock(self.allocs.allocate_cow(info_string)),
1410        });
1411        self.tree.push();
1412        loop {
1413            let mut line_start = LineStart::new(&bytes[ix..]);
1414            let n_containers = scan_containers(
1415                &self.tree,
1416                &mut line_start,
1417                self.options.has_gfm_footnotes(),
1418            );
1419            if n_containers < self.tree.spine_len() {
1420                // this line will get parsed again as not being part of the code
1421                // if it's blank, it should be parsed as a blank line
1422                self.pop(ix);
1423                return ix;
1424            }
1425            line_start.scan_space(indent);
1426            let mut close_line_start = line_start.clone();
1427            if !close_line_start.scan_space(4 - indent) {
1428                let close_ix = ix + close_line_start.bytes_scanned();
1429                if let Some(n) = scan_closing_code_fence(&bytes[close_ix..], fence_ch, n_fence_char)
1430                {
1431                    ix = close_ix + n;
1432                    self.pop(ix);
1433                    // try to read trailing whitespace or it will register as a completely blank line
1434                    return ix + scan_blank_line(&bytes[ix..]).unwrap_or(0);
1435                }
1436            }
1437            let remaining_space = line_start.remaining_space();
1438            ix += line_start.bytes_scanned();
1439            let next_ix = ix + scan_nextline(&bytes[ix..]);
1440            self.append_code_text(remaining_space, ix, next_ix);
1441            ix = next_ix;
1442        }
1443    }
1444
1445    fn parse_metadata_block(&mut self, start_ix: usize, metadata_block_ch: u8) -> usize {
1446        let bytes = self.text.as_bytes();
1447        let metadata_block_kind = match metadata_block_ch {
1448            b'-' => MetadataBlockKind::YamlStyle,
1449            b'+' => MetadataBlockKind::PlusesStyle,
1450            _ => panic!("Erroneous metadata block character when parsing metadata block"),
1451        };
1452        // 3 delimiter characters
1453        let mut ix = start_ix + 3 + scan_nextline(&bytes[start_ix + 3..]);
1454        self.tree.append(Item {
1455            start: start_ix,
1456            end: 0, // will get set later
1457            body: ItemBody::MetadataBlock(metadata_block_kind),
1458        });
1459        self.tree.push();
1460        loop {
1461            let mut line_start = LineStart::new(&bytes[ix..]);
1462            let n_containers = scan_containers(
1463                &self.tree,
1464                &mut line_start,
1465                self.options.has_gfm_footnotes(),
1466            );
1467            if n_containers < self.tree.spine_len() {
1468                break;
1469            }
1470            if let (_, 0) = calc_indent(&bytes[ix..], 4) {
1471                if let Some(n) = scan_closing_metadata_block(&bytes[ix..], metadata_block_ch) {
1472                    ix += n;
1473                    break;
1474                }
1475            }
1476            let remaining_space = line_start.remaining_space();
1477            ix += line_start.bytes_scanned();
1478            let next_ix = ix + scan_nextline(&bytes[ix..]);
1479            self.append_code_text(remaining_space, ix, next_ix);
1480            ix = next_ix;
1481        }
1482
1483        self.pop(ix);
1484
1485        // try to read trailing whitespace or it will register as a completely blank line
1486        ix + scan_blank_line(&bytes[ix..]).unwrap_or(0)
1487    }
1488
1489    fn append_code_text(&mut self, remaining_space: usize, start: usize, end: usize) {
1490        if remaining_space > 0 {
1491            let cow_ix = self.allocs.allocate_cow("   "[..remaining_space].into());
1492            self.tree.append(Item {
1493                start,
1494                end: start,
1495                body: ItemBody::SynthesizeText(cow_ix),
1496            });
1497        }
1498        if self.text.as_bytes()[end - 2] == b'\r' {
1499            // Normalize CRLF to LF
1500            self.tree.append_text(start, end - 2, false);
1501            self.tree.append_text(end - 1, end, false);
1502        } else {
1503            self.tree.append_text(start, end, false);
1504        }
1505    }
1506
1507    /// Appends a line of HTML to the tree.
1508    fn append_html_line(&mut self, remaining_space: usize, start: usize, end: usize) {
1509        if remaining_space > 0 {
1510            let cow_ix = self.allocs.allocate_cow("   "[..remaining_space].into());
1511            self.tree.append(Item {
1512                start,
1513                end: start,
1514                body: ItemBody::SynthesizeText(cow_ix),
1515            });
1516        }
1517        if self.text.as_bytes()[end - 2] == b'\r' {
1518            // Normalize CRLF to LF
1519            self.tree.append(Item {
1520                start,
1521                end: end - 2,
1522                body: ItemBody::Html,
1523            });
1524            self.tree.append(Item {
1525                start: end - 1,
1526                end,
1527                body: ItemBody::Html,
1528            });
1529        } else {
1530            self.tree.append(Item {
1531                start,
1532                end,
1533                body: ItemBody::Html,
1534            });
1535        }
1536    }
1537
1538    /// Pop a container, setting its end.
1539    fn pop(&mut self, ix: usize) {
1540        let cur_ix = self.tree.pop().unwrap();
1541        self.tree[cur_ix].item.end = ix;
1542        if let ItemBody::DefinitionList(_) = self.tree[cur_ix].item.body {
1543            fixup_end_of_definition_list(&mut self.tree, cur_ix);
1544            self.begin_list_item = None;
1545        }
1546        if let ItemBody::List(true, _, _) | ItemBody::DefinitionList(true) =
1547            self.tree[cur_ix].item.body
1548        {
1549            surgerize_tight_list(&mut self.tree, cur_ix);
1550            self.begin_list_item = None;
1551        }
1552    }
1553
1554    /// Close a list if it's open. Also set loose if last line was blank
1555    /// and end current list if it's a lone, empty item
1556    fn finish_list(&mut self, ix: usize) {
1557        self.finish_empty_list_item();
1558        if let Some(node_ix) = self.tree.peek_up() {
1559            if let ItemBody::List(_, _, _) | ItemBody::DefinitionList(_) =
1560                self.tree[node_ix].item.body
1561            {
1562                self.pop(ix);
1563            }
1564        }
1565        if self.last_line_blank {
1566            if let Some(node_ix) = self.tree.peek_grandparent() {
1567                if let ItemBody::List(ref mut is_tight, _, _)
1568                | ItemBody::DefinitionList(ref mut is_tight) = self.tree[node_ix].item.body
1569                {
1570                    *is_tight = false;
1571                }
1572            }
1573            self.last_line_blank = false;
1574        }
1575    }
1576
1577    fn finish_empty_list_item(&mut self) {
1578        if let Some(begin_list_item) = self.begin_list_item {
1579            if self.last_line_blank {
1580                // A list item can begin with at most one blank line.
1581                if let Some(node_ix) = self.tree.peek_up() {
1582                    if let ItemBody::ListItem(_) | ItemBody::DefinitionListDefinition(_) =
1583                        self.tree[node_ix].item.body
1584                    {
1585                        self.pop(begin_list_item);
1586                    }
1587                }
1588            }
1589        }
1590        self.begin_list_item = None;
1591    }
1592
1593    /// Continue an existing list or start a new one if there's not an open
1594    /// list that matches.
1595    fn continue_list(&mut self, start: usize, ch: u8, index: u64) {
1596        self.finish_empty_list_item();
1597        if let Some(node_ix) = self.tree.peek_up() {
1598            if let ItemBody::List(ref mut is_tight, existing_ch, _) = self.tree[node_ix].item.body {
1599                if existing_ch == ch {
1600                    if self.last_line_blank {
1601                        *is_tight = false;
1602                        self.last_line_blank = false;
1603                    }
1604                    return;
1605                }
1606            }
1607            // TODO: this is not the best choice for end; maybe get end from last list item.
1608            self.finish_list(start);
1609        }
1610        self.tree.append(Item {
1611            start,
1612            end: 0, // will get set later
1613            body: ItemBody::List(true, ch, index),
1614        });
1615        self.tree.push();
1616        self.last_line_blank = false;
1617    }
1618
1619    /// Parse a thematic break.
1620    ///
1621    /// Returns index of start of next line.
1622    fn parse_hrule(&mut self, hrule_size: usize, ix: usize) -> usize {
1623        self.tree.append(Item {
1624            start: ix,
1625            end: ix + hrule_size,
1626            body: ItemBody::Rule,
1627        });
1628        ix + hrule_size
1629    }
1630
1631    /// Parse an ATX heading.
1632    ///
1633    /// Returns index of start of next line.
1634    fn parse_atx_heading(&mut self, start: usize, atx_level: HeadingLevel) -> usize {
1635        let mut ix = start;
1636        let heading_ix = self.tree.append(Item {
1637            start,
1638            end: 0,                    // set later
1639            body: ItemBody::default(), // set later
1640        });
1641        ix += atx_level as usize;
1642        // next char is space or eol (guaranteed by scan_atx_heading)
1643        let bytes = self.text.as_bytes();
1644        if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
1645            self.tree[heading_ix].item.end = ix + eol_bytes;
1646            self.tree[heading_ix].item.body = ItemBody::Heading(atx_level, None);
1647            return ix + eol_bytes;
1648        }
1649        // skip leading spaces
1650        let skip_spaces = scan_whitespace_no_nl(&bytes[ix..]);
1651        ix += skip_spaces;
1652
1653        // now handle the header text
1654        let header_start = ix;
1655        let header_node_idx = self.tree.push(); // so that we can set the endpoint later
1656
1657        // trim the trailing attribute block before parsing the entire line, if necessary
1658        let (end, content_end, attrs) = if self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES)
1659        {
1660            // the start of the next line is the end of the header since the
1661            // header cannot have line breaks
1662            let header_end = header_start + scan_nextline(&bytes[header_start..]);
1663            let (content_end, attrs) =
1664                self.extract_and_parse_heading_attribute_block(header_start, header_end);
1665            self.parse_line(ix, Some(content_end), TableParseMode::Disabled);
1666            (header_end, content_end, attrs)
1667        } else {
1668            let (line_ix, line_brk) = self.parse_line(ix, None, TableParseMode::Disabled);
1669            ix = line_ix;
1670            // Backslash at end is actually hard line break
1671            if let Some(Item {
1672                start,
1673                end,
1674                body: ItemBody::HardBreak(true),
1675            }) = line_brk
1676            {
1677                self.tree.append_text(start, end, false);
1678            }
1679            (ix, ix, None)
1680        };
1681        self.tree[header_node_idx].item.end = end;
1682
1683        // remove trailing matter from header text
1684        let mut empty_text_node = false;
1685        if let Some(cur_ix) = self.tree.cur() {
1686            // remove closing of the ATX heading
1687            let header_text = &bytes[header_start..content_end];
1688            let mut limit = header_text
1689                .iter()
1690                .rposition(|&b| !(b == b'\n' || b == b'\r' || b == b' '))
1691                .map_or(0, |i| i + 1);
1692            let closer = header_text[..limit]
1693                .iter()
1694                .rposition(|&b| b != b'#')
1695                .map_or(0, |i| i + 1);
1696            if closer == 0 {
1697                limit = closer;
1698            } else {
1699                let spaces = scan_rev_while(&header_text[..closer], |b| b == b' ');
1700                if spaces > 0 {
1701                    limit = closer - spaces;
1702                }
1703            }
1704            // if text is only spaces, then remove them
1705            self.tree[cur_ix].item.end = limit + header_start;
1706
1707            // limit = 0 when text is empty after removing spaces
1708            if limit == 0 {
1709                empty_text_node = true;
1710            }
1711        }
1712
1713        if empty_text_node {
1714            self.tree.remove_node();
1715        } else {
1716            self.tree.pop();
1717        }
1718        self.tree[heading_ix].item.body = ItemBody::Heading(
1719            atx_level,
1720            attrs.map(|attrs| self.allocs.allocate_heading(attrs)),
1721        );
1722
1723        end
1724    }
1725
1726    /// Returns the number of bytes scanned on success.
1727    fn parse_footnote(&mut self, start: usize) -> Option<usize> {
1728        let bytes = &self.text.as_bytes()[start..];
1729        if !bytes.starts_with(b"[^") {
1730            return None;
1731        }
1732        let (mut i, label) = if self.options.has_gfm_footnotes() {
1733            // GitHub doesn't allow footnote definition labels to contain line breaks.
1734            // It actually does allow this for link definitions under certain circumstances,
1735            // but for this it's simpler to avoid it.
1736            scan_link_label_rest(&self.text[start + 2..], &|_| None, self.tree.is_in_table())?
1737        } else {
1738            self.parse_refdef_label(start + 2)?
1739        };
1740        if self.options.has_gfm_footnotes() && label.bytes().any(|b| b == b'\r' || b == b'\n') {
1741            // GitHub doesn't allow footnote definition labels to contain line breaks,
1742            // even if they're escaped.
1743            return None;
1744        }
1745        i += 2;
1746        if scan_ch(&bytes[i..], b':') == 0 {
1747            return None;
1748        }
1749        i += 1;
1750        self.finish_list(start);
1751        if let Some(node_ix) = self.tree.peek_up() {
1752            if let ItemBody::FootnoteDefinition(..) = self.tree[node_ix].item.body {
1753                // finish previous footnote if it's still open
1754                self.pop(start);
1755            }
1756        }
1757        if self.options.has_gfm_footnotes() {
1758            i += scan_whitespace_no_nl(&bytes[i..]);
1759        }
1760        self.allocs
1761            .footdefs
1762            .0
1763            .insert(UniCase::new(label.clone()), FootnoteDef { use_count: 0 });
1764        self.tree.append(Item {
1765            start,
1766            end: 0, // will get set later
1767            // TODO: check whether the label here is strictly necessary
1768            body: ItemBody::FootnoteDefinition(self.allocs.allocate_cow(label)),
1769        });
1770        self.tree.push();
1771        Some(i)
1772    }
1773
1774    /// Tries to parse a reference label, which can be interrupted by new blocks.
1775    /// On success, returns the number of bytes of the label and the label itself.
1776    fn parse_refdef_label(&self, start: usize) -> Option<(usize, CowStr<'a>)> {
1777        scan_link_label_rest(
1778            &self.text[start..],
1779            &|bytes| {
1780                let mut line_start = LineStart::new(bytes);
1781                let current_container = scan_containers(
1782                    &self.tree,
1783                    &mut line_start,
1784                    self.options.has_gfm_footnotes(),
1785                ) == self.tree.spine_len();
1786                if line_start.scan_space(4) {
1787                    return Some(line_start.bytes_scanned());
1788                }
1789                let bytes_scanned = line_start.bytes_scanned();
1790                let suffix = &bytes[bytes_scanned..];
1791                if self.scan_paragraph_interrupt(suffix, current_container)
1792                    || (current_container && scan_setext_heading(suffix).is_some())
1793                {
1794                    None
1795                } else {
1796                    Some(bytes_scanned)
1797                }
1798            },
1799            self.tree.is_in_table(),
1800        )
1801    }
1802
1803    /// Returns number of bytes scanned, label and definition on success.
1804    fn parse_refdef_total(&mut self, start: usize) -> Option<(usize, LinkLabel<'a>, LinkDef<'a>)> {
1805        let bytes = &self.text.as_bytes()[start..];
1806        if scan_ch(bytes, b'[') == 0 {
1807            return None;
1808        }
1809        let (mut i, label) = self.parse_refdef_label(start + 1)?;
1810        i += 1;
1811        if scan_ch(&bytes[i..], b':') == 0 {
1812            return None;
1813        }
1814        i += 1;
1815        let (bytecount, link_def) = self.scan_refdef(start, start + i)?;
1816        Some((bytecount + i, UniCase::new(label), link_def))
1817    }
1818
1819    /// Returns number of bytes and number of newlines
1820    fn scan_refdef_space(&self, bytes: &[u8], mut i: usize) -> Option<(usize, usize)> {
1821        let mut newlines = 0;
1822        loop {
1823            let whitespaces = scan_whitespace_no_nl(&bytes[i..]);
1824            i += whitespaces;
1825            if let Some(eol_bytes) = scan_eol(&bytes[i..]) {
1826                i += eol_bytes;
1827                newlines += 1;
1828                if newlines > 1 {
1829                    return None;
1830                }
1831            } else {
1832                break;
1833            }
1834            let mut line_start = LineStart::new(&bytes[i..]);
1835            let current_container = scan_containers(
1836                &self.tree,
1837                &mut line_start,
1838                self.options.has_gfm_footnotes(),
1839            ) == self.tree.spine_len();
1840            if !line_start.scan_space(4) {
1841                let suffix = &bytes[i + line_start.bytes_scanned()..];
1842                if self.scan_paragraph_interrupt(suffix, current_container)
1843                    || scan_setext_heading(suffix).is_some()
1844                {
1845                    return None;
1846                }
1847            }
1848            i += line_start.bytes_scanned();
1849        }
1850        Some((i, newlines))
1851    }
1852
1853    // returns (bytelength, title_str)
1854    fn scan_refdef_title<'t>(&self, text: &'t str) -> Option<(usize, CowStr<'t>)> {
1855        let bytes = text.as_bytes();
1856        let closing_delim = match bytes.first()? {
1857            b'\'' => b'\'',
1858            b'"' => b'"',
1859            b'(' => b')',
1860            _ => return None,
1861        };
1862        let mut bytecount = 1;
1863        let mut linestart = 1;
1864
1865        let mut linebuf = None;
1866
1867        while let Some(&c) = bytes.get(bytecount) {
1868            match c {
1869                b'(' if closing_delim == b')' => {
1870                    // https://spec.commonmark.org/0.30/#link-title
1871                    // a sequence of zero or more characters between matching parentheses ((...)),
1872                    // including a ( or ) character only if it is backslash-escaped.
1873                    return None;
1874                }
1875                b'\n' | b'\r' => {
1876                    // push text to line buffer
1877                    // this is used to strip the block formatting:
1878                    //
1879                    // > [first]: http://example.com "
1880                    // > second"
1881                    //
1882                    // should get turned into `<a href="example.com" title=" second">first</a>`
1883                    let linebuf = if let Some(linebuf) = &mut linebuf {
1884                        linebuf
1885                    } else {
1886                        linebuf = Some(String::new());
1887                        linebuf.as_mut().unwrap()
1888                    };
1889                    linebuf.push_str(&text[linestart..bytecount]);
1890                    linebuf.push('\n'); // normalize line breaks
1891                                        // skip line break
1892                    bytecount += 1;
1893                    if c == b'\r' && bytes.get(bytecount) == Some(&b'\n') {
1894                        bytecount += 1;
1895                    }
1896                    let mut line_start = LineStart::new(&bytes[bytecount..]);
1897                    let current_container = scan_containers(
1898                        &self.tree,
1899                        &mut line_start,
1900                        self.options.has_gfm_footnotes(),
1901                    ) == self.tree.spine_len();
1902                    if !line_start.scan_space(4) {
1903                        let suffix = &bytes[bytecount + line_start.bytes_scanned()..];
1904                        if self.scan_paragraph_interrupt(suffix, current_container)
1905                            || scan_setext_heading(suffix).is_some()
1906                        {
1907                            return None;
1908                        }
1909                    }
1910                    line_start.scan_all_space();
1911                    bytecount += line_start.bytes_scanned();
1912                    linestart = bytecount;
1913                    if scan_blank_line(&bytes[bytecount..]).is_some() {
1914                        // blank line - not allowed
1915                        return None;
1916                    }
1917                }
1918                b'\\' => {
1919                    bytecount += 1;
1920                    if let Some(c) = bytes.get(bytecount) {
1921                        if c != &b'\r' && c != &b'\n' {
1922                            bytecount += 1;
1923                        }
1924                    }
1925                }
1926                c if c == closing_delim => {
1927                    let cow = if let Some(mut linebuf) = linebuf {
1928                        linebuf.push_str(&text[linestart..bytecount]);
1929                        CowStr::from(linebuf)
1930                    } else {
1931                        CowStr::from(&text[linestart..bytecount])
1932                    };
1933                    return Some((bytecount + 1, cow));
1934                }
1935                _ => {
1936                    bytecount += 1;
1937                }
1938            }
1939        }
1940        None
1941    }
1942
1943    /// Returns # of bytes and definition.
1944    /// Assumes the label of the reference including colon has already been scanned.
1945    fn scan_refdef(&self, span_start: usize, start: usize) -> Option<(usize, LinkDef<'a>)> {
1946        let bytes = self.text.as_bytes();
1947
1948        // whitespace between label and url (including up to one newline)
1949        let (mut i, _newlines) = self.scan_refdef_space(bytes, start)?;
1950
1951        // scan link dest
1952        let (dest_length, dest) = scan_link_dest(self.text, i, LINK_MAX_NESTED_PARENS)?;
1953        if dest_length == 0 {
1954            return None;
1955        }
1956        let dest = unescape(dest, self.tree.is_in_table());
1957        i += dest_length;
1958
1959        // no title
1960        let mut backup = (
1961            i - start,
1962            LinkDef {
1963                dest,
1964                title: None,
1965                span: span_start..i,
1966            },
1967        );
1968
1969        // scan whitespace between dest and label
1970        let (mut i, newlines) =
1971            if let Some((new_i, mut newlines)) = self.scan_refdef_space(bytes, i) {
1972                if i == self.text.len() {
1973                    newlines += 1;
1974                }
1975                if new_i == i && newlines == 0 {
1976                    return None;
1977                }
1978                if newlines > 1 {
1979                    return Some(backup);
1980                };
1981                (new_i, newlines)
1982            } else {
1983                return Some(backup);
1984            };
1985
1986        // scan title
1987        // if this fails but newline == 1, return also a refdef without title
1988        if let Some((title_length, title)) = self.scan_refdef_title(&self.text[i..]) {
1989            i += title_length;
1990            if scan_blank_line(&bytes[i..]).is_some() {
1991                backup.0 = i - start;
1992                backup.1.span = span_start..i;
1993                backup.1.title = Some(unescape(title, self.tree.is_in_table()));
1994                return Some(backup);
1995            }
1996        }
1997        if newlines > 0 {
1998            Some(backup)
1999        } else {
2000            None
2001        }
2002    }
2003
2004    /// Checks whether we should break a paragraph on the given input.
2005    fn scan_paragraph_interrupt(&self, bytes: &[u8], current_container: bool) -> bool {
2006        if scan_paragraph_interrupt_no_table(
2007            bytes,
2008            current_container,
2009            self.options.contains(Options::ENABLE_FOOTNOTES),
2010            self.options.contains(Options::ENABLE_DEFINITION_LIST),
2011            &self.tree,
2012        ) {
2013            return true;
2014        }
2015        // pulldown-cmark allows heavy tables, that have a `|` on the header row,
2016        // to interrupt paragraphs.
2017        //
2018        // ```markdown
2019        // This is a table
2020        // | a | b | c |
2021        // |---|---|---|
2022        // | d | e | f |
2023        //
2024        // This is not a table
2025        //  a | b | c
2026        // ---|---|---
2027        //  d | e | f
2028        // ```
2029        if !self.options.contains(Options::ENABLE_TABLES) || !bytes.starts_with(b"|") {
2030            return false;
2031        }
2032
2033        // Checking if something's a valid table or not requires looking at two lines.
2034        // First line, count unescaped pipes.
2035        let mut pipes = 0;
2036        let mut next_line_ix = 0;
2037        let mut bsesc = false;
2038        let mut last_pipe_ix = 0;
2039        for (i, &byte) in bytes.iter().enumerate() {
2040            match byte {
2041                b'\\' => {
2042                    bsesc = true;
2043                    continue;
2044                }
2045                b'|' if !bsesc => {
2046                    pipes += 1;
2047                    last_pipe_ix = i;
2048                }
2049                b'\r' | b'\n' => {
2050                    next_line_ix = i + scan_eol(&bytes[i..]).unwrap();
2051                    break;
2052                }
2053                _ => {}
2054            }
2055            bsesc = false;
2056        }
2057
2058        // scan_eol can't return 0, so this can't be zero
2059        if next_line_ix == 0 {
2060            return false;
2061        }
2062
2063        // Scan the table head. The part that looks like:
2064        //
2065        //     |---|---|---|
2066        //
2067        // Also scan any containing items, since it's on its own line, and
2068        // might be nested inside a block quote or something
2069        //
2070        //     > Table: First
2071        //     > | first col | second col |
2072        //     > |-----------|------------|
2073        //     ^
2074        //     | need to skip over the `>` when checking for the table
2075        let mut line_start = LineStart::new(&bytes[next_line_ix..]);
2076        if scan_containers(
2077            &self.tree,
2078            &mut line_start,
2079            self.options.has_gfm_footnotes(),
2080        ) != self.tree.spine_len()
2081        {
2082            return false;
2083        }
2084        let table_head_ix = next_line_ix + line_start.bytes_scanned();
2085        let (table_head_bytes, alignment) = scan_table_head(&bytes[table_head_ix..]);
2086
2087        if table_head_bytes == 0 {
2088            return false;
2089        }
2090
2091        // computing header count from number of pipes
2092        let header_count = count_header_cols(bytes, pipes, 0, last_pipe_ix);
2093
2094        // make sure they match the number of columns we find in separator line
2095        alignment.len() == header_count
2096    }
2097
2098    /// Extracts and parses a heading attribute block if exists.
2099    ///
2100    /// Returns `(end_offset_of_heading_content, (id, classes))`.
2101    ///
2102    /// If `header_end` is less than or equal to `header_start`, the given
2103    /// input is considered as empty.
2104    fn extract_and_parse_heading_attribute_block(
2105        &mut self,
2106        header_start: usize,
2107        header_end: usize,
2108    ) -> (usize, Option<HeadingAttributes<'a>>) {
2109        if !self.options.contains(Options::ENABLE_HEADING_ATTRIBUTES) {
2110            return (header_end, None);
2111        }
2112
2113        // extract the trailing attribute block
2114        let header_bytes = &self.text.as_bytes()[header_start..header_end];
2115        let (content_len, attr_block_range_rel) =
2116            extract_attribute_block_content_from_header_text(header_bytes);
2117        let content_end = header_start + content_len;
2118        let attrs = attr_block_range_rel.and_then(|r| {
2119            parse_inside_attribute_block(
2120                &self.text[(header_start + r.start)..(header_start + r.end)],
2121            )
2122        });
2123        (content_end, attrs)
2124    }
2125}
2126
2127/// Scanning modes for `Parser`'s `parse_line` method.
2128#[derive(PartialEq, Eq, Copy, Clone)]
2129enum TableParseMode {
2130    /// Inside a paragraph, scanning for table headers.
2131    Scan,
2132    /// Inside a table.
2133    Active,
2134    /// Inside a paragraph, not scanning for table headers.
2135    Disabled,
2136}
2137
2138/// Computes the number of header columns in a table line by computing the number of dividing pipes
2139/// that aren't followed or preceded by whitespace.
2140fn count_header_cols(
2141    bytes: &[u8],
2142    mut pipes: usize,
2143    mut start: usize,
2144    last_pipe_ix: usize,
2145) -> usize {
2146    // was first pipe preceded by whitespace? if so, subtract one
2147    start += scan_whitespace_no_nl(&bytes[start..]);
2148    if bytes[start] == b'|' {
2149        pipes -= 1;
2150    }
2151
2152    // was last pipe followed by whitespace? if so, sub one
2153    if scan_blank_line(&bytes[(last_pipe_ix + 1)..]).is_some() {
2154        pipes
2155    } else {
2156        pipes + 1
2157    }
2158}
2159
2160/// Checks whether we should break a paragraph on the given input.
2161///
2162/// Use `FirstPass::scan_paragraph_interrupt` in any context that allows
2163/// tables to interrupt the paragraph.
2164fn scan_paragraph_interrupt_no_table(
2165    bytes: &[u8],
2166    current_container: bool,
2167    has_footnote: bool,
2168    definition_list: bool,
2169    tree: &Tree<Item>,
2170) -> bool {
2171    scan_eol(bytes).is_some()
2172        || scan_hrule(bytes).is_ok()
2173        || scan_atx_heading(bytes).is_some()
2174        || scan_code_fence(bytes).is_some()
2175        || scan_blockquote_start(bytes).is_some()
2176        || scan_listitem(bytes).map_or(false, |(ix, delim, index, _)| {
2177            ! current_container ||
2178            tree.is_in_table() ||
2179            // we don't allow interruption by either empty lists or
2180            // numbered lists starting at an index other than 1
2181            (delim == b'*' || delim == b'-' || delim == b'+' || index == 1)
2182                && (scan_blank_line(&bytes[ix..]).is_none())
2183        })
2184        || bytes.starts_with(b"<")
2185            && (get_html_end_tag(&bytes[1..]).is_some() || starts_html_block_type_6(&bytes[1..]))
2186        || definition_list && bytes.starts_with(b":")
2187        || (has_footnote
2188            && bytes.starts_with(b"[^")
2189            && scan_link_label_rest(
2190                std::str::from_utf8(&bytes[2..]).unwrap(),
2191                &|_| None,
2192                tree.is_in_table(),
2193            )
2194            .map_or(false, |(len, _)| bytes.get(2 + len) == Some(&b':')))
2195}
2196
2197/// Assumes `text_bytes` is preceded by `<`.
2198fn get_html_end_tag(text_bytes: &[u8]) -> Option<&'static str> {
2199    static BEGIN_TAGS: &[&[u8]; 4] = &[b"pre", b"style", b"script", b"textarea"];
2200    static ST_BEGIN_TAGS: &[&[u8]; 3] = &[b"!--", b"?", b"![CDATA["];
2201
2202    for (beg_tag, end_tag) in BEGIN_TAGS
2203        .iter()
2204        .zip(["</pre>", "</style>", "</script>", "</textarea>"].iter())
2205    {
2206        let tag_len = beg_tag.len();
2207
2208        if text_bytes.len() < tag_len {
2209            // begin tags are increasing in size
2210            break;
2211        }
2212
2213        if !text_bytes[..tag_len].eq_ignore_ascii_case(beg_tag) {
2214            continue;
2215        }
2216
2217        // Must either be the end of the line...
2218        if text_bytes.len() == tag_len {
2219            return Some(end_tag);
2220        }
2221
2222        // ...or be followed by whitespace, newline, or '>'.
2223        let s = text_bytes[tag_len];
2224        if is_ascii_whitespace(s) || s == b'>' {
2225            return Some(end_tag);
2226        }
2227    }
2228
2229    for (beg_tag, end_tag) in ST_BEGIN_TAGS.iter().zip(["-->", "?>", "]]>"].iter()) {
2230        if text_bytes.starts_with(beg_tag) {
2231            return Some(end_tag);
2232        }
2233    }
2234
2235    if text_bytes.len() > 1 && text_bytes[0] == b'!' && text_bytes[1].is_ascii_alphabetic() {
2236        Some(">")
2237    } else {
2238        None
2239    }
2240}
2241
2242// https://english.stackexchange.com/a/285573
2243fn surgerize_tight_list(tree: &mut Tree<Item>, list_ix: TreeIndex) {
2244    let mut list_item = tree[list_ix].child;
2245    while let Some(listitem_ix) = list_item {
2246        // first child is special, controls how we repoint list_item.child
2247        let list_item_firstborn = tree[listitem_ix].child;
2248
2249        // Check that list item has children - this is not necessarily the case!
2250        if let Some(firstborn_ix) = list_item_firstborn {
2251            if let ItemBody::Paragraph = tree[firstborn_ix].item.body {
2252                tree[listitem_ix].child = tree[firstborn_ix].child;
2253            }
2254
2255            let mut list_item_child = Some(firstborn_ix);
2256            let mut node_to_repoint = None;
2257            while let Some(child_ix) = list_item_child {
2258                // surgerize paragraphs
2259                let repoint_ix = if let ItemBody::Paragraph = tree[child_ix].item.body {
2260                    if let Some(child_firstborn) = tree[child_ix].child {
2261                        if let Some(repoint_ix) = node_to_repoint {
2262                            tree[repoint_ix].next = Some(child_firstborn);
2263                        }
2264                        let mut child_lastborn = child_firstborn;
2265                        while let Some(lastborn_next_ix) = tree[child_lastborn].next {
2266                            child_lastborn = lastborn_next_ix;
2267                        }
2268                        child_lastborn
2269                    } else {
2270                        child_ix
2271                    }
2272                } else {
2273                    child_ix
2274                };
2275
2276                node_to_repoint = Some(repoint_ix);
2277                tree[repoint_ix].next = tree[child_ix].next;
2278                list_item_child = tree[child_ix].next;
2279            }
2280        }
2281
2282        list_item = tree[listitem_ix].next;
2283    }
2284}
2285
2286fn fixup_end_of_definition_list(tree: &mut Tree<Item>, list_ix: TreeIndex) {
2287    let mut list_item = tree[list_ix].child;
2288    let mut previous_list_item = None;
2289    while let Some(listitem_ix) = list_item {
2290        match &mut tree[listitem_ix].item.body {
2291            ItemBody::DefinitionListTitle | ItemBody::DefinitionListDefinition(_) => {
2292                previous_list_item = list_item;
2293                list_item = tree[listitem_ix].next;
2294            }
2295            body @ ItemBody::MaybeDefinitionListTitle => {
2296                *body = ItemBody::Paragraph;
2297                break;
2298            }
2299            _ => break,
2300        }
2301    }
2302    if let Some(previous_list_item) = previous_list_item {
2303        tree.truncate_to_parent(previous_list_item);
2304    }
2305}
2306
2307/// Determines whether the delimiter run starting at given index is
2308/// left-flanking, as defined by the commonmark spec (and isn't intraword
2309/// for _ delims).
2310/// suffix is &s[ix..], which is passed in as an optimization, since taking
2311/// a string subslice is O(n).
2312fn delim_run_can_open(
2313    s: &str,
2314    suffix: &str,
2315    run_len: usize,
2316    ix: usize,
2317    mode: TableParseMode,
2318) -> bool {
2319    let next_char = if let Some(c) = suffix[run_len..].chars().next() {
2320        c
2321    } else {
2322        return false;
2323    };
2324    if next_char.is_whitespace() {
2325        return false;
2326    }
2327    if ix == 0 {
2328        return true;
2329    }
2330    if mode == TableParseMode::Active {
2331        if s.as_bytes()[..ix].ends_with(b"|") && !s.as_bytes()[..ix].ends_with(br"\|") {
2332            return true;
2333        }
2334        if next_char == '|' {
2335            return false;
2336        }
2337    }
2338    let delim = suffix.bytes().next().unwrap();
2339    // `*` and `~~` can be intraword, `_` and `~` cannot
2340    if delim == b'*' && !is_punctuation(next_char) {
2341        return true;
2342    }
2343    if delim == b'~' && run_len > 1 {
2344        return true;
2345    }
2346    let prev_char = s[..ix].chars().last().unwrap();
2347    if delim == b'~' && prev_char == '~' && !is_punctuation(next_char) {
2348        return true;
2349    }
2350
2351    prev_char.is_whitespace()
2352        || is_punctuation(prev_char) && (delim != b'\'' || ![']', ')'].contains(&prev_char))
2353}
2354
2355/// Determines whether the delimiter run starting at given index is
2356/// right-flanking, as defined by the commonmark spec (and isn't intraword
2357/// for _ delims)
2358fn delim_run_can_close(
2359    s: &str,
2360    suffix: &str,
2361    run_len: usize,
2362    ix: usize,
2363    mode: TableParseMode,
2364) -> bool {
2365    if ix == 0 {
2366        return false;
2367    }
2368    let prev_char = s[..ix].chars().last().unwrap();
2369    if prev_char.is_whitespace() {
2370        return false;
2371    }
2372    let next_char = if let Some(c) = suffix[run_len..].chars().next() {
2373        c
2374    } else {
2375        return true;
2376    };
2377    if mode == TableParseMode::Active {
2378        if s.as_bytes()[..ix].ends_with(b"|") && !s.as_bytes()[..ix].ends_with(br"\|") {
2379            return false;
2380        }
2381        if next_char == '|' {
2382            return true;
2383        }
2384    }
2385    let delim = suffix.bytes().next().unwrap();
2386    // `*` and `~~` can be intraword, `_` and `~` cannot
2387    if (delim == b'*' || (delim == b'~' && run_len > 1)) && !is_punctuation(prev_char) {
2388        return true;
2389    }
2390    if delim == b'~' && prev_char == '~' {
2391        return true;
2392    }
2393
2394    next_char.is_whitespace() || is_punctuation(next_char)
2395}
2396
2397fn create_lut(options: &Options) -> LookupTable {
2398    #[cfg(all(target_arch = "x86_64", feature = "simd"))]
2399    {
2400        LookupTable {
2401            simd: simd::compute_lookup(options),
2402            scalar: special_bytes(options),
2403        }
2404    }
2405    #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
2406    {
2407        special_bytes(options)
2408    }
2409}
2410
2411fn special_bytes(options: &Options) -> [bool; 256] {
2412    let mut bytes = [false; 256];
2413    let standard_bytes = [
2414        b'\n', b'\r', b'*', b'_', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
2415    ];
2416
2417    for &byte in &standard_bytes {
2418        bytes[byte as usize] = true;
2419    }
2420    if options.contains(Options::ENABLE_TABLES) {
2421        bytes[b'|' as usize] = true;
2422    }
2423    if options.contains(Options::ENABLE_STRIKETHROUGH) {
2424        bytes[b'~' as usize] = true;
2425    }
2426    if options.contains(Options::ENABLE_MATH) {
2427        bytes[b'$' as usize] = true;
2428        bytes[b'{' as usize] = true;
2429        bytes[b'}' as usize] = true;
2430    }
2431    if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
2432        for &byte in &[b'.', b'-', b'"', b'\''] {
2433            bytes[byte as usize] = true;
2434        }
2435    }
2436
2437    bytes
2438}
2439
2440enum LoopInstruction<T> {
2441    /// Continue looking for more special bytes, but skip next few bytes.
2442    ContinueAndSkip(usize),
2443    /// Break looping immediately, returning with the given index and value.
2444    BreakAtWith(usize, T),
2445}
2446
2447#[cfg(all(target_arch = "x86_64", feature = "simd"))]
2448struct LookupTable {
2449    simd: [u8; 16],
2450    scalar: [bool; 256],
2451}
2452
2453#[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
2454type LookupTable = [bool; 256];
2455
2456/// This function walks the byte slices from the given index and
2457/// calls the callback function on all bytes (and their indices) that are in the following set:
2458/// `` ` ``, `\`, `&`, `*`, `_`, `~`, `!`, `<`, `[`, `]`, `|`, `\r`, `\n`
2459/// It is guaranteed not call the callback on other bytes.
2460/// Whenever `callback(ix, byte)` returns a `ContinueAndSkip(n)` value, the callback
2461/// will not be called with an index that is less than `ix + n + 1`.
2462/// When the callback returns a `BreakAtWith(end_ix, opt+val)`, no more callbacks will be
2463/// called and the function returns immediately with the return value `(end_ix, opt_val)`.
2464/// If `BreakAtWith(..)` is never returned, this function will return the first
2465/// index that is outside the byteslice bound and a `None` value.
2466fn iterate_special_bytes<F, T>(
2467    lut: &LookupTable,
2468    bytes: &[u8],
2469    ix: usize,
2470    callback: F,
2471) -> (usize, Option<T>)
2472where
2473    F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
2474{
2475    #[cfg(all(target_arch = "x86_64", feature = "simd"))]
2476    {
2477        simd::iterate_special_bytes(lut, bytes, ix, callback)
2478    }
2479    #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
2480    {
2481        scalar_iterate_special_bytes(lut, bytes, ix, callback)
2482    }
2483}
2484
2485fn scalar_iterate_special_bytes<F, T>(
2486    lut: &[bool; 256],
2487    bytes: &[u8],
2488    mut ix: usize,
2489    mut callback: F,
2490) -> (usize, Option<T>)
2491where
2492    F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
2493{
2494    while ix < bytes.len() {
2495        let b = bytes[ix];
2496        if lut[b as usize] {
2497            match callback(ix, b) {
2498                LoopInstruction::ContinueAndSkip(skip) => {
2499                    ix += skip;
2500                }
2501                LoopInstruction::BreakAtWith(ix, val) => {
2502                    return (ix, val);
2503                }
2504            }
2505        }
2506        ix += 1;
2507    }
2508
2509    (ix, None)
2510}
2511
2512/// Split the usual heading content range and the content inside the trailing attribute block.
2513///
2514/// Returns `(leading_content_len, Option<trailing_attr_block_range>)`.
2515///
2516/// Note that `trailing_attr_block_range` will be empty range when the block
2517/// is `{}`, since the range is content inside the wrapping `{` and `}`.
2518///
2519/// The closing `}` of an attribute block can have trailing whitespaces.
2520/// They are automatically trimmed when the attribute block is being searched.
2521///
2522/// However, this method does not trim the trailing whitespaces of heading content.
2523/// It is callers' responsibility to trim them if necessary.
2524fn extract_attribute_block_content_from_header_text(
2525    heading: &[u8],
2526) -> (usize, Option<Range<usize>>) {
2527    let heading_len = heading.len();
2528    let mut ix = heading_len;
2529    ix -= scan_rev_while(heading, |b| {
2530        b == b'\n' || b == b'\r' || b == b' ' || b == b'\t'
2531    });
2532    if ix == 0 {
2533        return (heading_len, None);
2534    }
2535
2536    let attr_block_close = ix - 1;
2537    if heading.get(attr_block_close) != Some(&b'}') {
2538        // The last character is not `}`. No attribute blocks found.
2539        return (heading_len, None);
2540    }
2541    // move cursor before the closing right brace (`}`)
2542    ix -= 1;
2543
2544    ix -= scan_rev_while(&heading[..ix], |b| {
2545        // Characters to be excluded:
2546        //  * `{` and `}`: special characters to open and close an attribute block.
2547        //  * `\\`: a special character to escape many characters and disable some syntaxes.
2548        //      + Handling of this escape character differs among markdown processors.
2549        //      + Escaped characters will be separate text node from neighbors, so
2550        //        it is not easy to handle unescaped string and trim the trailing block.
2551        //  * `<` and `>`: special characters to start and end HTML tag.
2552        //      + No known processors converts `{#<i>foo</i>}` into
2553        //        `id="&lt;i&gt;foo&lt;/&gt;"` as of this writing, so hopefully
2554        //        this restriction won't cause compatibility issues.
2555        //  * `\n` and `\r`: a newline character.
2556        //      + Setext heading can have multiple lines. However it is hard to support
2557        //        attribute blocks that have newline inside, since the parsing proceeds line by
2558        //        line and lines will be separate nodes even they are logically a single text.
2559        !matches!(b, b'{' | b'}' | b'<' | b'>' | b'\\' | b'\n' | b'\r')
2560    });
2561    if ix == 0 {
2562        // `{` is not found. No attribute blocks available.
2563        return (heading_len, None);
2564    }
2565    let attr_block_open = ix - 1;
2566    if heading[attr_block_open] != b'{' {
2567        // `{` is not found. No attribute blocks available.
2568        return (heading_len, None);
2569    }
2570
2571    (attr_block_open, Some(ix..attr_block_close))
2572}
2573
2574/// Parses an attribute block content, such as `.class1 #id .class2`.
2575///
2576/// Returns `(id, classes)`.
2577///
2578/// It is callers' responsibility to find opening and closing characters of the attribute
2579/// block. Usually [`extract_attribute_block_content_from_header_text`] function does it for you.
2580///
2581/// Note that this parsing requires explicit whitespace separators between
2582/// attributes. This is intentional design with the reasons below:
2583///
2584/// * to keep conversion simple and easy to understand for any possible input,
2585/// * to avoid adding less obvious conversion rule that can reduce compatibility
2586///   with other implementations more, and
2587/// * to follow the major design of implementations with the support for the
2588///   attribute blocks extension (as of this writing).
2589///
2590/// See also: [`Options::ENABLE_HEADING_ATTRIBUTES`].
2591///
2592/// [`Options::ENABLE_HEADING_ATTRIBUTES`]: `crate::Options::ENABLE_HEADING_ATTRIBUTES`
2593fn parse_inside_attribute_block(inside_attr_block: &str) -> Option<HeadingAttributes<'_>> {
2594    let mut id = None;
2595    let mut classes = Vec::new();
2596    let mut attrs = Vec::new();
2597
2598    for attr in inside_attr_block.split_ascii_whitespace() {
2599        // iterator returned by `str::split_ascii_whitespace` never emits empty
2600        // strings, so taking first byte won't panic.
2601        if attr.len() > 1 {
2602            let first_byte = attr.as_bytes()[0];
2603            if first_byte == b'#' {
2604                id = Some(attr[1..].into());
2605            } else if first_byte == b'.' {
2606                classes.push(attr[1..].into());
2607            } else {
2608                let split = attr.split_once('=');
2609                if let Some((key, value)) = split {
2610                    attrs.push((key.into(), Some(value.into())));
2611                } else {
2612                    attrs.push((attr.into(), None));
2613                }
2614            }
2615        }
2616    }
2617
2618    Some(HeadingAttributes { id, classes, attrs })
2619}
2620
2621#[cfg(all(target_arch = "x86_64", feature = "simd"))]
2622mod simd {
2623    //! SIMD byte scanning logic.
2624    //!
2625    //! This module provides functions that allow walking through byteslices, calling
2626    //! provided callback functions on special bytes and their indices using SIMD.
2627    //! The byteset is defined in `compute_lookup`.
2628    //!
2629    //! The idea is to load in a chunk of 16 bytes and perform a lookup into a set of
2630    //! bytes on all the bytes in this chunk simultaneously. We produce a 16 bit bitmask
2631    //! from this and call the callback on every index corresponding to a 1 in this mask
2632    //! before moving on to the next chunk. This allows us to move quickly when there
2633    //! are no or few matches.
2634    //!
2635    //! The table lookup is inspired by this [great overview]. However, since all of the
2636    //! bytes we're interested in are ASCII, we don't quite need the full generality of
2637    //! the universal algorithm and are hence able to skip a few instructions.
2638    //!
2639    //! [great overview]: http://0x80.pl/articles/simd-byte-lookup.html
2640
2641    use super::{LookupTable, LoopInstruction};
2642    use crate::Options;
2643    use core::arch::x86_64::*;
2644
2645    const VECTOR_SIZE: usize = std::mem::size_of::<__m128i>();
2646
2647    /// Generates a lookup table containing the bitmaps for our
2648    /// special marker bytes. This is effectively a 128 element 2d bitvector,
2649    /// that can be indexed by a four bit row index (the lower nibble)
2650    /// and a three bit column index (upper nibble).
2651    pub(super) fn compute_lookup(options: &Options) -> [u8; 16] {
2652        let mut lookup = [0u8; 16];
2653        let standard_bytes = [
2654            b'\n', b'\r', b'*', b'_', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
2655        ];
2656
2657        for &byte in &standard_bytes {
2658            add_lookup_byte(&mut lookup, byte);
2659        }
2660        if options.contains(Options::ENABLE_TABLES) {
2661            add_lookup_byte(&mut lookup, b'|');
2662        }
2663        if options.contains(Options::ENABLE_STRIKETHROUGH) {
2664            add_lookup_byte(&mut lookup, b'~');
2665        }
2666        if options.contains(Options::ENABLE_MATH) {
2667            add_lookup_byte(&mut lookup, b'$');
2668            add_lookup_byte(&mut lookup, b'{');
2669            add_lookup_byte(&mut lookup, b'}');
2670        }
2671        if options.contains(Options::ENABLE_SMART_PUNCTUATION) {
2672            for &byte in &[b'.', b'-', b'"', b'\''] {
2673                add_lookup_byte(&mut lookup, byte);
2674            }
2675        }
2676
2677        lookup
2678    }
2679
2680    fn add_lookup_byte(lookup: &mut [u8; 16], byte: u8) {
2681        lookup[(byte & 0x0f) as usize] |= 1 << (byte >> 4);
2682    }
2683
2684    /// Computes a bit mask for the given byteslice starting from the given index,
2685    /// where the 16 least significant bits indicate (by value of 1) whether or not
2686    /// there is a special character at that byte position. The least significant bit
2687    /// corresponds to `bytes[ix]` and the most significant bit corresponds to
2688    /// `bytes[ix + 15]`.
2689    /// It is only safe to call this function when `bytes.len() >= ix + VECTOR_SIZE`.
2690    #[target_feature(enable = "ssse3")]
2691    #[inline]
2692    unsafe fn compute_mask(lut: &[u8; 16], bytes: &[u8], ix: usize) -> i32 {
2693        debug_assert!(bytes.len() >= ix + VECTOR_SIZE);
2694
2695        let bitmap = _mm_loadu_si128(lut.as_ptr() as *const __m128i);
2696        // Small lookup table to compute single bit bitshifts
2697        // for 16 bytes at once.
2698        let bitmask_lookup =
2699            _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, -1, -1, -1, -1, -1, -1, -1, -1);
2700
2701        // Load input from memory.
2702        let raw_ptr = bytes.as_ptr().add(ix) as *const __m128i;
2703        let input = _mm_loadu_si128(raw_ptr);
2704        // Compute the bitmap using the bottom nibble as an index
2705        // into the lookup table. Note that non-ascii bytes will have
2706        // their most significant bit set and will map to lookup[0].
2707        let bitset = _mm_shuffle_epi8(bitmap, input);
2708        // Compute the high nibbles of the input using a 16-bit rightshift of four
2709        // and a mask to prevent most-significant bit issues.
2710        let higher_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), _mm_set1_epi8(0x0f));
2711        // Create a bitmask for the bitmap by perform a left shift of the value
2712        // of the higher nibble. Bytes with their most significant set are mapped
2713        // to -1 (all ones).
2714        let bitmask = _mm_shuffle_epi8(bitmask_lookup, higher_nibbles);
2715        // Test the bit of the bitmap by AND'ing the bitmap and the mask together.
2716        let tmp = _mm_and_si128(bitset, bitmask);
2717        // Check whether the result was not null. NEQ is not a SIMD intrinsic,
2718        // but comparing to the bitmask is logically equivalent. This also prevents us
2719        // from matching any non-ASCII bytes since none of the bitmaps were all ones
2720        // (-1).
2721        let result = _mm_cmpeq_epi8(tmp, bitmask);
2722
2723        // Return the resulting bitmask.
2724        _mm_movemask_epi8(result)
2725    }
2726
2727    /// Calls callback on byte indices and their value.
2728    /// Breaks when callback returns LoopInstruction::BreakAtWith(ix, val). And skips the
2729    /// number of bytes in callback return value otherwise.
2730    /// Returns the final index and a possible break value.
2731    pub(super) fn iterate_special_bytes<F, T>(
2732        lut: &LookupTable,
2733        bytes: &[u8],
2734        ix: usize,
2735        callback: F,
2736    ) -> (usize, Option<T>)
2737    where
2738        F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
2739    {
2740        if is_x86_feature_detected!("ssse3") && bytes.len() >= VECTOR_SIZE {
2741            unsafe { simd_iterate_special_bytes(&lut.simd, bytes, ix, callback) }
2742        } else {
2743            super::scalar_iterate_special_bytes(&lut.scalar, bytes, ix, callback)
2744        }
2745    }
2746
2747    /// Calls the callback function for every 1 in the given bitmask with
2748    /// the index `offset + ix`, where `ix` is the position of the 1 in the mask.
2749    /// Returns `Ok(ix)` to continue from index `ix`, `Err((end_ix, opt_val)` to break with
2750    /// final index `end_ix` and optional value `opt_val`.
2751    unsafe fn process_mask<F, T>(
2752        mut mask: i32,
2753        bytes: &[u8],
2754        mut offset: usize,
2755        callback: &mut F,
2756    ) -> Result<usize, (usize, Option<T>)>
2757    where
2758        F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
2759    {
2760        while mask != 0 {
2761            let mask_ix = mask.trailing_zeros() as usize;
2762            offset += mask_ix;
2763            match callback(offset, *bytes.get_unchecked(offset)) {
2764                LoopInstruction::ContinueAndSkip(skip) => {
2765                    offset += skip + 1;
2766                    mask = mask.wrapping_shr((skip + 1 + mask_ix) as u32);
2767                }
2768                LoopInstruction::BreakAtWith(ix, val) => return Err((ix, val)),
2769            }
2770        }
2771        Ok(offset)
2772    }
2773
2774    #[target_feature(enable = "ssse3")]
2775    /// Important: only call this function when `bytes.len() >= 16`. Doing
2776    /// so otherwise may exhibit undefined behaviour.
2777    unsafe fn simd_iterate_special_bytes<F, T>(
2778        lut: &[u8; 16],
2779        bytes: &[u8],
2780        mut ix: usize,
2781        mut callback: F,
2782    ) -> (usize, Option<T>)
2783    where
2784        F: FnMut(usize, u8) -> LoopInstruction<Option<T>>,
2785    {
2786        debug_assert!(bytes.len() >= VECTOR_SIZE);
2787        let upperbound = bytes.len() - VECTOR_SIZE;
2788
2789        while ix < upperbound {
2790            let mask = compute_mask(lut, bytes, ix);
2791            let block_start = ix;
2792            ix = match process_mask(mask, bytes, ix, &mut callback) {
2793                Ok(ix) => std::cmp::max(ix, VECTOR_SIZE + block_start),
2794                Err((end_ix, val)) => return (end_ix, val),
2795            };
2796        }
2797
2798        if bytes.len() > ix {
2799            // shift off the bytes at start we have already scanned
2800            let mask = compute_mask(lut, bytes, upperbound) >> ix - upperbound;
2801            if let Err((end_ix, val)) = process_mask(mask, bytes, ix, &mut callback) {
2802                return (end_ix, val);
2803            }
2804        }
2805
2806        (bytes.len(), None)
2807    }
2808
2809    #[cfg(test)]
2810    mod simd_test {
2811        use super::super::create_lut;
2812        use super::{iterate_special_bytes, LoopInstruction};
2813        use crate::Options;
2814
2815        fn check_expected_indices(bytes: &[u8], expected: &[usize], skip: usize) {
2816            let mut opts = Options::empty();
2817            opts.insert(Options::ENABLE_MATH);
2818            opts.insert(Options::ENABLE_TABLES);
2819            opts.insert(Options::ENABLE_FOOTNOTES);
2820            opts.insert(Options::ENABLE_STRIKETHROUGH);
2821            opts.insert(Options::ENABLE_TASKLISTS);
2822
2823            let lut = create_lut(&opts);
2824            let mut indices = vec![];
2825
2826            iterate_special_bytes::<_, i32>(&lut, bytes, 0, |ix, _byte_ty| {
2827                indices.push(ix);
2828                LoopInstruction::ContinueAndSkip(skip)
2829            });
2830
2831            assert_eq!(&indices[..], expected);
2832        }
2833
2834        #[test]
2835        fn simple_no_match() {
2836            check_expected_indices("abcdef0123456789".as_bytes(), &[], 0);
2837        }
2838
2839        #[test]
2840        fn simple_match() {
2841            check_expected_indices("*bcd&f0123456789".as_bytes(), &[0, 4], 0);
2842        }
2843
2844        #[test]
2845        fn single_open_fish() {
2846            check_expected_indices("<".as_bytes(), &[0], 0);
2847        }
2848
2849        #[test]
2850        fn long_match() {
2851            check_expected_indices("0123456789abcde~*bcd&f0".as_bytes(), &[15, 16, 20], 0);
2852        }
2853
2854        #[test]
2855        fn border_skip() {
2856            check_expected_indices("0123456789abcde~~~~d&f0".as_bytes(), &[15, 20], 3);
2857        }
2858
2859        #[test]
2860        fn exhaustive_search() {
2861            let chars = [
2862                b'\n', b'\r', b'*', b'_', b'~', b'|', b'&', b'\\', b'[', b']', b'<', b'!', b'`',
2863                b'$', b'{', b'}',
2864            ];
2865
2866            for &c in &chars {
2867                for i in 0u8..=255 {
2868                    if !chars.contains(&i) {
2869                        // full match
2870                        let mut buf = [i; 18];
2871                        buf[3] = c;
2872                        buf[6] = c;
2873
2874                        check_expected_indices(&buf[..], &[3, 6], 0);
2875                    }
2876                }
2877            }
2878        }
2879    }
2880}
pulldown_cmark/firstpass.rs

pulldown_cmark/
firstpass.rs