1use std::char;
24
25use crate::parse::HtmlScanGuard;
26pub(crate) use crate::puncttable::{is_ascii_punctuation, is_punctuation};
27use crate::strings::CowStr;
28use crate::{entities, BlockQuoteKind, HeadingLevel};
29use crate::{Alignment, LinkType};
30
31use memchr::memchr;
32
/// Lowercase tag names, grouped here by first letter.
///
/// The entries MUST stay in ascending byte order: `is_html_tag` looks names
/// up with a binary search over this array.
#[rustfmt::skip]
const HTML_TAGS: [&str; 62] = [
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "search", "section", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
];
98
/// Cursor over the beginning of a line, used to consume indentation and
/// container markers while accounting for tabs.
///
/// `Clone` is used by the scanning methods to snapshot the cursor and rewind
/// on failure; `Debug` is derived so the cursor state can be printed in
/// diagnostics and assertion messages.
#[derive(Clone, Debug)]
pub(crate) struct LineStart<'a> {
    /// The bytes being scanned.
    bytes: &'a [u8],
    /// Index into `bytes` of the next unconsumed byte.
    ix: usize,

    /// Index of the byte just after the most recently consumed tab (0 if no
    /// tab has been consumed). Tab widths are measured from this position.
    tab_start: usize,

    /// Columns of the most recently consumed tab that have not yet been
    /// handed out to a space scan.
    spaces_remaining: usize,

    /// Offset recorded from the last failed `scan_hrule` call made while
    /// scanning a bullet list marker; while `ix` is below it, the
    /// thematic-break check is skipped.
    min_hrule_offset: usize,
}
131
132impl<'a> LineStart<'a> {
133 pub(crate) fn new(bytes: &[u8]) -> LineStart<'_> {
134 LineStart {
135 bytes,
136 tab_start: 0,
137 ix: 0,
138 spaces_remaining: 0,
139 min_hrule_offset: 0,
140 }
141 }
142
143 pub(crate) fn scan_space(&mut self, n_space: usize) -> bool {
149 self.scan_space_inner(n_space) == 0
150 }
151
152 pub(crate) fn scan_space_upto(&mut self, n_space: usize) -> usize {
156 n_space - self.scan_space_inner(n_space)
157 }
158
159 fn scan_space_inner(&mut self, mut n_space: usize) -> usize {
161 let n_from_remaining = self.spaces_remaining.min(n_space);
164 self.spaces_remaining -= n_from_remaining;
165 n_space -= n_from_remaining;
166
167 while n_space > 0 && self.ix < self.bytes.len() {
168 match self.bytes[self.ix] {
169 b' ' => {
170 self.ix += 1;
171 n_space -= 1;
172 }
173 b'\t' => {
174 let spaces = 4 - (self.ix - self.tab_start) % 4;
175 self.ix += 1;
176 self.tab_start = self.ix;
177 let n = spaces.min(n_space);
178 n_space -= n;
179
180 self.spaces_remaining = spaces - n;
182 }
183 _ => break,
184 }
185 }
186 n_space
187 }
188
189 pub(crate) fn scan_all_space(&mut self) {
191 self.spaces_remaining = 0;
192 self.ix += self.bytes[self.ix..]
193 .iter()
194 .take_while(|&&b| b == b' ' || b == b'\t')
195 .count();
196 }
197
198 pub(crate) fn is_at_eol(&self) -> bool {
200 self.bytes
201 .get(self.ix)
202 .map(|&c| c == b'\r' || c == b'\n')
203 .unwrap_or(true)
204 }
205
206 fn scan_ch(&mut self, c: u8) -> bool {
207 if self.ix < self.bytes.len() && self.bytes[self.ix] == c {
208 self.ix += 1;
209 true
210 } else {
211 false
212 }
213 }
214
215 fn scan_case_insensitive(&mut self, tag: &[u8]) -> bool {
216 if self.bytes.len() - self.ix < tag.len() {
217 return false;
218 }
219 let prefix = &self.bytes[self.ix..self.ix + tag.len()];
220 let ok = prefix.eq_ignore_ascii_case(tag);
221 if ok {
222 self.ix += tag.len();
223 }
224 ok
225 }
226
227 pub(crate) fn scan_blockquote_tag(&mut self) -> Option<BlockQuoteKind> {
228 let saved_ix = self.ix;
229 let tag = if self.scan_ch(b'[') && self.scan_ch(b'!') {
230 let tag = if self.scan_case_insensitive(b"note") {
231 Some(BlockQuoteKind::Note)
232 } else if self.scan_case_insensitive(b"tip") {
233 Some(BlockQuoteKind::Tip)
234 } else if self.scan_case_insensitive(b"important") {
235 Some(BlockQuoteKind::Important)
236 } else if self.scan_case_insensitive(b"warning") {
237 Some(BlockQuoteKind::Warning)
238 } else if self.scan_case_insensitive(b"caution") {
239 Some(BlockQuoteKind::Caution)
240 } else {
241 None
242 };
243 if tag.is_some() && self.scan_ch(b']') {
244 if let Some(nl) = scan_blank_line(&self.bytes[self.ix..]) {
245 self.ix += nl;
246 tag
247 } else {
248 None
249 }
250 } else {
251 None
252 }
253 } else {
254 None
255 };
256 if tag.is_none() {
257 self.ix = saved_ix;
258 }
259 tag
260 }
261
262 pub(crate) fn scan_blockquote_marker(&mut self) -> bool {
263 if self.scan_ch(b'>') {
264 let _ = self.scan_space(1);
265 true
266 } else {
267 false
268 }
269 }
270
271 pub(crate) fn scan_definition_list_definition_marker_with_indent(
283 &mut self,
284 indent: usize,
285 ) -> Option<usize> {
286 let save = self.clone();
287 if self.scan_ch(b':') {
288 let remaining = 4 - (indent + 1);
289 Some(indent + 1 + self.scan_space_upto(remaining))
290 } else {
291 *self = save;
292 None
293 }
294 }
295
296 pub(crate) fn scan_list_marker_with_indent(
302 &mut self,
303 indent: usize,
304 ) -> Option<(u8, u64, usize)> {
305 let save = self.clone();
306 if self.ix < self.bytes.len() {
307 let c = self.bytes[self.ix];
308 if c == b'-' || c == b'+' || c == b'*' {
309 if self.ix >= self.min_hrule_offset {
310 if let Err(min_offset) = scan_hrule(&self.bytes[self.ix..]) {
312 self.min_hrule_offset = min_offset;
313 } else {
314 *self = save;
315 return None;
316 }
317 }
318 self.ix += 1;
319 if self.scan_space(1) || self.is_at_eol() {
320 return self.finish_list_marker(c, 0, indent + 2);
321 }
322 } else if c.is_ascii_digit() {
323 let start_ix = self.ix;
324 let mut ix = self.ix + 1;
325 let mut val = u64::from(c - b'0');
326 while ix < self.bytes.len() && ix - start_ix < 10 {
327 let c = self.bytes[ix];
328 ix += 1;
329 if c.is_ascii_digit() {
330 val = val * 10 + u64::from(c - b'0');
331 } else if c == b')' || c == b'.' {
332 self.ix = ix;
333 if self.scan_space(1) || self.is_at_eol() {
334 return self.finish_list_marker(c, val, indent + 1 + ix - start_ix);
335 } else {
336 break;
337 }
338 } else {
339 break;
340 }
341 }
342 }
343 }
344 *self = save;
345 None
346 }
347
348 fn finish_list_marker(
349 &mut self,
350 c: u8,
351 start: u64,
352 mut indent: usize,
353 ) -> Option<(u8, u64, usize)> {
354 let save = self.clone();
355
356 if scan_blank_line(&self.bytes[self.ix..]).is_some() {
358 return Some((c, start, indent));
359 }
360
361 let post_indent = self.scan_space_upto(4);
362 if post_indent < 4 {
363 indent += post_indent;
364 } else {
365 *self = save;
366 }
367 Some((c, start, indent))
368 }
369
370 pub(crate) fn scan_task_list_marker(&mut self) -> Option<bool> {
373 let save = self.clone();
374 self.scan_space_upto(3);
375
376 if !self.scan_ch(b'[') {
377 *self = save;
378 return None;
379 }
380 let is_checked = match self.bytes.get(self.ix) {
381 Some(&c) if is_ascii_whitespace_no_nl(c) => {
382 self.ix += 1;
383 false
384 }
385 Some(b'x') | Some(b'X') => {
386 self.ix += 1;
387 true
388 }
389 _ => {
390 *self = save;
391 return None;
392 }
393 };
394 if !self.scan_ch(b']') {
395 *self = save;
396 return None;
397 }
398 if !self
399 .bytes
400 .get(self.ix)
401 .map(|&b| is_ascii_whitespace_no_nl(b))
402 .unwrap_or(false)
403 {
404 *self = save;
405 return None;
406 }
407 Some(is_checked)
408 }
409
410 pub(crate) fn bytes_scanned(&self) -> usize {
411 self.ix
412 }
413
414 pub(crate) fn remaining_space(&self) -> usize {
415 self.spaces_remaining
416 }
417}
418
/// Returns `true` for the bytes `0x09..=0x0d` (tab, line feed, vertical tab,
/// form feed, carriage return) and for the space character.
pub(crate) fn is_ascii_whitespace(c: u8) -> bool {
    matches!(c, b'\t'..=b'\r' | b' ')
}
422
/// Returns `true` for ASCII whitespace that does not end a line: tab,
/// vertical tab (`0x0b`), form feed (`0x0c`) and space.
pub(crate) fn is_ascii_whitespace_no_nl(c: u8) -> bool {
    matches!(c, b'\t' | 0x0b | 0x0c | b' ')
}
426
/// Returns `true` if `c` is an ASCII letter (`a-z` or `A-Z`).
fn is_ascii_alpha(c: u8) -> bool {
    matches!(c, b'a'..=b'z' | b'A'..=b'Z')
}
430
/// Returns `true` if `c` is an ASCII letter or an ASCII decimal digit.
fn is_ascii_alphanumeric(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}
434
/// Returns `true` if `c` is an ASCII letter, an ASCII digit, or `-`.
fn is_ascii_letterdigitdash(c: u8) -> bool {
    c.is_ascii_alphanumeric() || c == b'-'
}
438
/// Returns `true` if `c` is an ASCII decimal digit (`0-9`).
fn is_digit(c: u8) -> bool {
    matches!(c, b'0'..=b'9')
}
442
/// Returns `true` if `c` may appear in an unquoted HTML attribute value,
/// i.e. it is none of `'`, `"`, space, `=`, `>`, `<`, a backtick, `\n`, `\r`.
fn is_valid_unquoted_attr_value_char(c: u8) -> bool {
    const FORBIDDEN: &[u8] = b"'\" =><`\n\r";
    !FORBIDDEN.contains(&c)
}
449
/// Returns 1 if `data` starts with the byte `c`, and 0 otherwise (including
/// when `data` is empty).
pub(crate) fn scan_ch(data: &[u8], c: u8) -> usize {
    usize::from(data.first() == Some(&c))
}
458
/// Returns the length of the longest prefix of `data` whose bytes all satisfy
/// the predicate `f`.
pub(crate) fn scan_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    // The first rejected byte marks the end of the prefix; if no byte is
    // rejected the whole slice matches.
    data.iter()
        .position(|&byte| !f(byte))
        .unwrap_or(data.len())
}
465
/// Returns the length of the longest suffix of `data` whose bytes all satisfy
/// the predicate `f`. Bytes are tested from the end towards the start.
pub(crate) fn scan_rev_while<F>(data: &[u8], mut f: F) -> usize
where
    F: FnMut(u8) -> bool,
{
    data.iter()
        .rev()
        .position(|&byte| !f(byte))
        .unwrap_or(data.len())
}
472
/// Counts how many consecutive copies of the byte `c` start `data`.
pub(crate) fn scan_ch_repeat(data: &[u8], c: u8) -> usize {
    data.iter().take_while(|&&byte| byte == c).count()
}
476
477pub(crate) fn scan_whitespace_no_nl(data: &[u8]) -> usize {
480 scan_while(data, is_ascii_whitespace_no_nl)
481}
482
483fn scan_attr_value_chars(data: &[u8]) -> usize {
484 scan_while(data, is_valid_unquoted_attr_value_char)
485}
486
/// Recognises a line ending at the start of `bytes`.
///
/// Returns `Some(0)` for empty input (end of input counts as a line ending),
/// `Some(1)` for `\n` or a lone `\r`, `Some(2)` for `\r\n`, and `None` when
/// `bytes` starts with anything else.
pub(crate) fn scan_eol(bytes: &[u8]) -> Option<usize> {
    match bytes {
        [] => Some(0),
        [b'\n', ..] => Some(1),
        [b'\r', b'\n', ..] => Some(2),
        [b'\r', ..] => Some(1),
        _ => None,
    }
}
497
498pub(crate) fn scan_blank_line(bytes: &[u8]) -> Option<usize> {
499 let i = scan_whitespace_no_nl(bytes);
500 scan_eol(&bytes[i..]).map(|n| i + n)
501}
502
503pub(crate) fn scan_nextline(bytes: &[u8]) -> usize {
504 memchr(b'\n', bytes).map_or(bytes.len(), |x| x + 1)
505}
506
507pub(crate) fn scan_closing_code_fence(
510 bytes: &[u8],
511 fence_char: u8,
512 n_fence_char: usize,
513) -> Option<usize> {
514 if bytes.is_empty() {
515 return Some(0);
516 }
517 let mut i = 0;
518 let num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
519 if num_fence_chars_found < n_fence_char {
520 return None;
521 }
522 i += num_fence_chars_found;
523 let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
524 i += num_trailing_spaces;
525 scan_eol(&bytes[i..]).map(|_| i)
526}
527
528pub(crate) fn scan_closing_metadata_block(bytes: &[u8], fence_char: u8) -> Option<usize> {
531 let mut i = 0;
532 let mut num_fence_chars_found = scan_ch_repeat(&bytes[i..], fence_char);
533 if num_fence_chars_found != 3 {
534 if fence_char == b'-' {
536 num_fence_chars_found = scan_ch_repeat(&bytes[i..], b'.');
537 if num_fence_chars_found != 3 {
538 return None;
539 }
540 } else {
541 return None;
542 }
543 }
544 i += num_fence_chars_found;
545 let num_trailing_spaces = scan_ch_repeat(&bytes[i..], b' ');
546 i += num_trailing_spaces;
547 scan_eol(&bytes[i..]).map(|_| i)
548}
549
/// Measures leading indentation in `text`, stopping once `max` columns have
/// been reached.
///
/// Returns `(offset, columns)`. `columns` is the indentation width counted so
/// far, with each tab advancing to the next multiple of four. `offset` is the
/// index of the last byte that was examined: the byte that stopped the scan,
/// or the last byte of `text` if the scan ran off the end (0 for empty
/// input). A tab that would push the width past `max` stops the scan without
/// being counted.
pub(crate) fn calc_indent(text: &[u8], max: usize) -> (usize, usize) {
    let mut width = 0;
    let mut last_ix = 0;

    for (ix, &byte) in text.iter().enumerate() {
        last_ix = ix;
        if byte == b' ' {
            width += 1;
            if width == max {
                break;
            }
        } else if byte == b'\t' {
            // Round up to the next tab stop.
            let tab_stop = (width & !3) + 4;
            if tab_stop > max {
                break;
            }
            width = tab_stop;
        } else {
            break;
        }
    }

    (last_ix, width)
}
577
578pub(crate) fn scan_hrule(bytes: &[u8]) -> Result<usize, usize> {
586 if bytes.len() < 3 {
587 return Err(0);
588 }
589 let c = bytes[0];
590 if !(c == b'*' || c == b'-' || c == b'_') {
591 return Err(0);
592 }
593 let mut n = 0;
594 let mut i = 0;
595
596 while i < bytes.len() {
597 match bytes[i] {
598 b'\n' | b'\r' => {
599 i += scan_eol(&bytes[i..]).unwrap_or(0);
600 break;
601 }
602 c2 if c2 == c => {
603 n += 1;
604 }
605 b' ' | b'\t' => (),
606 _ => return Err(i),
607 }
608 i += 1;
609 }
610 if n >= 3 {
611 Ok(i)
612 } else {
613 Err(i)
614 }
615}
616
617pub(crate) fn scan_atx_heading(data: &[u8]) -> Option<HeadingLevel> {
621 let level = scan_ch_repeat(data, b'#');
622 if data.get(level).copied().map_or(true, is_ascii_whitespace) {
623 HeadingLevel::try_from(level).ok()
624 } else {
625 None
626 }
627}
628
629pub(crate) fn scan_setext_heading(data: &[u8]) -> Option<(usize, HeadingLevel)> {
633 let c = *data.first()?;
634 let level = if c == b'=' {
635 HeadingLevel::H1
636 } else if c == b'-' {
637 HeadingLevel::H2
638 } else {
639 return None;
640 };
641 let mut i = 1 + scan_ch_repeat(&data[1..], c);
642 i += scan_blank_line(&data[i..])?;
643 Some((i, level))
644}
645
646pub(crate) fn scan_table_head(data: &[u8]) -> (usize, Vec<Alignment>) {
649 let (mut i, spaces) = calc_indent(data, 4);
650 if spaces > 3 || i == data.len() {
651 return (0, vec![]);
652 }
653 let mut cols = vec![];
654 let mut active_col = Alignment::None;
655 let mut start_col = true;
656 let mut found_pipe = false;
657 let mut found_hyphen = false;
658 let mut found_hyphen_in_col = false;
659 if data[i] == b'|' {
660 i += 1;
661 found_pipe = true;
662 }
663 for c in &data[i..] {
664 if let Some(n) = scan_eol(&data[i..]) {
665 i += n;
666 break;
667 }
668 match *c {
669 b' ' => (),
670 b':' => {
671 active_col = match (start_col, active_col) {
672 (true, Alignment::None) => Alignment::Left,
673 (false, Alignment::Left) => Alignment::Center,
674 (false, Alignment::None) => Alignment::Right,
675 _ => active_col,
676 };
677 start_col = false;
678 }
679 b'-' => {
680 start_col = false;
681 found_hyphen = true;
682 found_hyphen_in_col = true;
683 }
684 b'|' => {
685 start_col = true;
686 found_pipe = true;
687 cols.push(active_col);
688 active_col = Alignment::None;
689 if !found_hyphen_in_col {
690 return (0, vec![]);
692 }
693 found_hyphen_in_col = false;
694 }
695 _ => {
696 return (0, vec![]);
698 }
699 }
700 i += 1;
701 }
702
703 if !start_col {
704 cols.push(active_col);
705 }
706 if !found_pipe || !found_hyphen {
707 return (0, vec![]);
710 }
711
712 (i, cols)
713}
714
715pub(crate) fn scan_code_fence(data: &[u8]) -> Option<(usize, u8)> {
719 let c = *data.first()?;
720 if !(c == b'`' || c == b'~') {
721 return None;
722 }
723 let i = 1 + scan_ch_repeat(&data[1..], c);
724 if i >= 3 {
725 if c == b'`' {
726 let suffix = &data[i..];
727 let next_line = i + scan_nextline(suffix);
728 if suffix[..(next_line - i)].iter().any(|&b| b == b'`') {
730 return None;
731 }
732 }
733 Some((i, c))
734 } else {
735 None
736 }
737}
738
739pub(crate) fn scan_metadata_block(
748 data: &[u8],
749 yaml_style_enabled: bool,
750 pluses_style_enabled: bool,
751) -> Option<(usize, u8)> {
752 if yaml_style_enabled || pluses_style_enabled {
754 let c = *data.first()?;
755 if !((c == b'-' && yaml_style_enabled) || (c == b'+' && pluses_style_enabled)) {
756 return None;
757 }
758 let i = 1 + scan_ch_repeat(&data[1..], c);
759 let next_line = scan_nextline(&data[i..]);
761 for c in &data[i..i + next_line] {
762 if !c.is_ascii_whitespace() {
763 return None;
764 }
765 }
766 if i == 3 {
767 let mut j = i;
769 let mut first_line = true;
770 while j < data.len() {
771 j += scan_nextline(&data[j..]);
772 let closed = scan_closing_metadata_block(&data[j..], c).is_some();
773 if first_line {
776 if closed || scan_blank_line(&data[j..]).is_some() {
777 return None;
778 }
779 first_line = false;
780 }
781 if closed {
782 return Some((i, c));
783 }
784 }
785 None
786 } else {
787 None
788 }
789 } else {
790 None
791 }
792}
793
/// Scans a blockquote marker at the start of `data`.
///
/// Returns `Some(2)` for `>` followed by a space, `Some(1)` for a bare `>`,
/// and `None` if `data` does not start with `>`.
pub(crate) fn scan_blockquote_start(data: &[u8]) -> Option<usize> {
    match data {
        [b'>', b' ', ..] => Some(2),
        [b'>', ..] => Some(1),
        _ => None,
    }
}
806
807pub(crate) fn scan_listitem(bytes: &[u8]) -> Option<(usize, u8, usize, usize)> {
809 let mut c = *bytes.first()?;
810 let (w, start) = match c {
811 b'-' | b'+' | b'*' => (1, 0),
812 b'0'..=b'9' => {
813 let (length, start) = parse_decimal(bytes, 9);
814 c = *bytes.get(length)?;
815 if !(c == b'.' || c == b')') {
816 return None;
817 }
818 (length + 1, start)
819 }
820 _ => {
821 return None;
822 }
823 };
824 let (mut postn, mut postindent) = calc_indent(&bytes[w..], 5);
826 if postindent == 0 {
827 scan_eol(&bytes[w..])?;
828 postindent += 1;
829 } else if postindent > 4 {
830 postn = 1;
831 postindent = 1;
832 }
833 if scan_blank_line(&bytes[w..]).is_some() {
834 postn = 0;
835 postindent = 1;
836 }
837 Some((w + postn, c, start, w + postindent))
838}
839
/// Parses a run of ASCII decimal digits at the start of `bytes`, looking at
/// no more than `limit` bytes.
///
/// Returns `(digits_consumed, value)`. Parsing stops at the first non-digit
/// or as soon as the next digit would overflow `usize`; in the overflow case
/// the count and value accumulated before that digit are returned.
fn parse_decimal(bytes: &[u8], limit: usize) -> (usize, usize) {
    let mut count = 0;
    let mut value = 0usize;
    for &byte in bytes.iter().take(limit) {
        if !byte.is_ascii_digit() {
            break;
        }
        let digit = usize::from(byte - b'0');
        match value.checked_mul(10).and_then(|v| v.checked_add(digit)) {
            Some(next) => {
                value = next;
                count += 1;
            }
            None => break,
        }
    }
    (count, value)
}
860
/// Parses a run of ASCII hexadecimal digits (either case) at the start of
/// `bytes`, looking at no more than `limit` bytes.
///
/// Returns `(digits_consumed, value)`. Parsing stops at the first byte that
/// is not a hex digit or as soon as the next digit would overflow `usize`; in
/// the overflow case the count and value accumulated before that digit are
/// returned.
fn parse_hex(bytes: &[u8], limit: usize) -> (usize, usize) {
    let mut count = 0;
    let mut value = 0usize;
    for &byte in bytes.iter().take(limit) {
        let nibble = match byte {
            b'0'..=b'9' => byte - b'0',
            b'a'..=b'f' => byte - b'a' + 10,
            b'A'..=b'F' => byte - b'A' + 10,
            _ => break,
        };
        match value
            .checked_mul(16)
            .and_then(|v| v.checked_add(usize::from(nibble)))
        {
            Some(next) => {
                value = next;
                count += 1;
            }
            None => break,
        }
    }
    (count, value)
}
891
/// Converts a numeric code point to a `char`.
///
/// Returns `None` for 0, for values that do not fit in a `u32`, and for
/// values that are not valid Unicode scalar values (such as surrogates).
fn char_from_codepoint(input: usize) -> Option<char> {
    u32::try_from(input)
        .ok()
        .filter(|&codepoint| codepoint != 0)
        .and_then(char::from_u32)
}
899
900pub(crate) fn scan_entity(bytes: &[u8]) -> (usize, Option<CowStr<'static>>) {
902 let mut end = 1;
903 if scan_ch(&bytes[end..], b'#') == 1 {
904 end += 1;
905 let (bytecount, codepoint) = if end < bytes.len() && bytes[end] | 0x20 == b'x' {
906 end += 1;
907 parse_hex(&bytes[end..], 6)
908 } else {
909 parse_decimal(&bytes[end..], 7)
910 };
911 end += bytecount;
912 return if bytecount == 0 || scan_ch(&bytes[end..], b';') == 0 {
913 (0, None)
914 } else {
915 (
916 end + 1,
917 Some(char_from_codepoint(codepoint).unwrap_or('\u{FFFD}').into()),
918 )
919 };
920 }
921 end += scan_while(&bytes[end..], is_ascii_alphanumeric);
922 if scan_ch(&bytes[end..], b';') == 1 {
923 if let Some(value) = entities::get_entity(&bytes[1..end]) {
924 return (end + 1, Some(value.into()));
925 }
926 }
927 (0, None)
928}
929
930pub(crate) fn scan_link_dest(
934 data: &str,
935 start_ix: usize,
936 max_next: usize,
937) -> Option<(usize, &str)> {
938 let bytes = &data.as_bytes()[start_ix..];
939 let mut i = scan_ch(bytes, b'<');
940
941 if i != 0 {
942 while i < bytes.len() {
944 match bytes[i] {
945 b'\n' | b'\r' | b'<' => return None,
946 b'>' => return Some((i + 1, &data[(start_ix + 1)..(start_ix + i)])),
947 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
948 i += 1;
949 }
950 _ => {}
951 }
952 i += 1;
953 }
954 None
955 } else {
956 let mut nest = 0;
958 while i < bytes.len() {
959 match bytes[i] {
960 0x0..=0x20 => {
961 break;
962 }
963 b'(' => {
964 if nest > max_next {
965 return None;
966 }
967 nest += 1;
968 }
969 b')' => {
970 if nest == 0 {
971 break;
972 }
973 nest -= 1;
974 }
975 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
976 i += 1;
977 }
978 _ => {}
979 }
980 i += 1;
981 }
982 if nest != 0 {
983 return None;
984 }
985 Some((i, &data[start_ix..(start_ix + i)]))
986 }
987}
988
/// Scans an HTML attribute name at the start of `data`.
///
/// The name must begin with an ASCII letter, `_` or `:`, and may continue
/// with ASCII letters, digits, `_`, `.`, `:` or `-`. Returns its length, or
/// `None` if `data` does not start with a valid first character.
fn scan_attribute_name(data: &[u8]) -> Option<usize> {
    match data.split_first() {
        Some((&first, rest)) if first.is_ascii_alphabetic() || matches!(first, b'_' | b':') => {
            let tail_len = rest
                .iter()
                .take_while(|&&b| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'.' | b':' | b'-'))
                .count();
            Some(1 + tail_len)
        }
        _ => None,
    }
}
1002
1003fn scan_attribute(
1007 data: &[u8],
1008 mut ix: usize,
1009 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1010 buffer: &mut Vec<u8>,
1011 buffer_ix: &mut usize,
1012) -> Option<usize> {
1013 ix += scan_attribute_name(&data[ix..])?;
1014 let ix_after_attribute = ix;
1015 ix = scan_whitespace_with_newline_handler_without_buffer(data, ix, newline_handler)?;
1016 if scan_ch(&data[ix..], b'=') == 1 {
1017 ix = scan_whitespace_with_newline_handler(
1018 data,
1019 ix_after_attribute,
1020 newline_handler,
1021 buffer,
1022 buffer_ix,
1023 )?;
1024 ix += 1;
1025 ix = scan_whitespace_with_newline_handler(data, ix, newline_handler, buffer, buffer_ix)?;
1026 ix = scan_attribute_value(data, ix, newline_handler, buffer, buffer_ix)?;
1027 Some(ix)
1028 } else {
1029 Some(ix_after_attribute)
1031 }
1032}
1033
1034fn scan_whitespace_with_newline_handler(
1038 data: &[u8],
1039 mut i: usize,
1040 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1041 buffer: &mut Vec<u8>,
1042 buffer_ix: &mut usize,
1043) -> Option<usize> {
1044 while i < data.len() {
1045 if !is_ascii_whitespace(data[i]) {
1046 return Some(i);
1047 }
1048 if let Some(eol_bytes) = scan_eol(&data[i..]) {
1049 let handler = newline_handler?;
1050 i += eol_bytes;
1051 let skipped_bytes = handler(&data[i..]);
1052
1053 if skipped_bytes > 0 {
1054 buffer.extend(&data[*buffer_ix..i]);
1055 *buffer_ix = i + skipped_bytes;
1056 }
1057
1058 i += skipped_bytes;
1059 } else {
1060 i += 1;
1061 }
1062 }
1063
1064 Some(i)
1065}
1066
1067fn scan_whitespace_with_newline_handler_without_buffer(
1075 data: &[u8],
1076 mut i: usize,
1077 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1078) -> Option<usize> {
1079 while i < data.len() {
1080 if !is_ascii_whitespace(data[i]) {
1081 return Some(i);
1082 }
1083 if let Some(eol_bytes) = scan_eol(&data[i..]) {
1084 let handler = newline_handler?;
1085 i += eol_bytes;
1086 let skipped_bytes = handler(&data[i..]);
1087 i += skipped_bytes;
1088 } else {
1089 i += 1;
1090 }
1091 }
1092
1093 Some(i)
1094}
1095
1096fn scan_attribute_value(
1098 data: &[u8],
1099 mut i: usize,
1100 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1101 buffer: &mut Vec<u8>,
1102 buffer_ix: &mut usize,
1103) -> Option<usize> {
1104 match *data.get(i)? {
1105 b @ b'"' | b @ b'\'' => {
1106 i += 1;
1107 while i < data.len() {
1108 if data[i] == b {
1109 return Some(i + 1);
1110 }
1111 if let Some(eol_bytes) = scan_eol(&data[i..]) {
1112 let handler = newline_handler?;
1113 i += eol_bytes;
1114 let skipped_bytes = handler(&data[i..]);
1115
1116 if skipped_bytes > 0 {
1117 buffer.extend(&data[*buffer_ix..i]);
1118 *buffer_ix = i + skipped_bytes;
1119 }
1120 i += skipped_bytes;
1121 } else {
1122 i += 1;
1123 }
1124 }
1125 return None;
1126 }
1127 b' ' | b'=' | b'>' | b'<' | b'`' | b'\n' | b'\r' => {
1128 return None;
1129 }
1130 _ => {
1131 i += scan_attr_value_chars(&data[i..]);
1133 }
1134 }
1135
1136 Some(i)
1137}
1138
1139pub(crate) fn unescape<'a, I: Into<CowStr<'a>>>(input: I, is_in_table: bool) -> CowStr<'a> {
1141 let input = input.into();
1142 let mut result = String::new();
1143 let mut mark = 0;
1144 let mut i = 0;
1145 let bytes = input.as_bytes();
1146 while i < bytes.len() {
1147 match bytes[i] {
1148 b'\\'
1152 if is_in_table
1153 && i + 2 < bytes.len()
1154 && bytes[i + 1] == b'\\'
1155 && bytes[i + 2] == b'|' =>
1156 {
1157 result.push_str(&input[mark..i]);
1160 mark = i + 2;
1161 i += 3;
1162 }
1163 b'\\' if i + 1 < bytes.len() && is_ascii_punctuation(bytes[i + 1]) => {
1164 result.push_str(&input[mark..i]);
1165 mark = i + 1;
1166 i += 2;
1167 }
1168 b'&' => match scan_entity(&bytes[i..]) {
1169 (n, Some(value)) => {
1170 result.push_str(&input[mark..i]);
1171 result.push_str(&value);
1172 i += n;
1173 mark = i;
1174 }
1175 _ => i += 1,
1176 },
1177 b'\r' => {
1178 result.push_str(&input[mark..i]);
1179 i += 1;
1180 mark = i;
1181 }
1182 _ => i += 1,
1183 }
1184 }
1185 if mark == 0 {
1186 input
1187 } else {
1188 result.push_str(&input[mark..]);
1189 result.into()
1190 }
1191}
1192
1193pub(crate) fn starts_html_block_type_6(data: &[u8]) -> bool {
1195 let i = scan_ch(data, b'/');
1196 let tail = &data[i..];
1197 let n = scan_while(tail, is_ascii_alphanumeric);
1198 if !is_html_tag(&tail[..n]) {
1199 return false;
1200 }
1201 let tail = &tail[n..];
1204 tail.is_empty()
1205 || tail[0] == b' '
1206 || tail[0] == b'\t'
1207 || tail[0] == b'\r'
1208 || tail[0] == b'\n'
1209 || tail[0] == b'>'
1210 || tail.len() >= 2 && &tail[..2] == b"/>"
1211}
1212
1213fn is_html_tag(tag: &[u8]) -> bool {
1214 HTML_TAGS
1215 .binary_search_by(|probe| {
1216 let probe_bytes_iter = probe.as_bytes().iter();
1217 let tag_bytes_iter = tag.iter();
1218
1219 probe_bytes_iter
1220 .zip(tag_bytes_iter)
1221 .find_map(|(&a, &b)| {
1222 match a.cmp(&(b | 0x20)) {
1225 std::cmp::Ordering::Equal => None,
1226 inequality => Some(inequality),
1227 }
1228 })
1229 .unwrap_or_else(|| probe.len().cmp(&tag.len()))
1230 })
1231 .is_ok()
1232}
1233
1234pub(crate) fn scan_html_type_7(data: &[u8]) -> Option<usize> {
1237 let (_span, i) = scan_html_block_inner(data, None)?;
1240 scan_blank_line(&data[i..])?;
1241 Some(i)
1242}
1243
1244pub(crate) fn scan_html_block_inner(
1252 data: &[u8],
1253 newline_handler: Option<&dyn Fn(&[u8]) -> usize>,
1254) -> Option<(Vec<u8>, usize)> {
1255 let mut buffer = Vec::new();
1256 let mut last_buf_index = 0;
1257
1258 let close_tag_bytes = scan_ch(&data[1..], b'/');
1259 let l = scan_while(&data[(1 + close_tag_bytes)..], is_ascii_alpha);
1260 if l == 0 {
1261 return None;
1262 }
1263 let mut i = 1 + close_tag_bytes + l;
1264 i += scan_while(&data[i..], is_ascii_letterdigitdash);
1265
1266 if close_tag_bytes == 0 {
1267 loop {
1268 let old_i = i;
1269 loop {
1270 i += scan_whitespace_no_nl(&data[i..]);
1271 if let Some(eol_bytes) = scan_eol(&data[i..]) {
1272 if eol_bytes == 0 {
1273 return None;
1274 }
1275 let handler = newline_handler?;
1276 i += eol_bytes;
1277 let skipped_bytes = handler(&data[i..]);
1278
1279 let data_len = data.len() - i;
1280
1281 debug_assert!(
1282 skipped_bytes <= data_len,
1283 "Handler tried to skip too many bytes, fed {}, skipped {}",
1284 data_len,
1285 skipped_bytes
1286 );
1287
1288 if skipped_bytes > 0 {
1289 buffer.extend(&data[last_buf_index..i]);
1290 i += skipped_bytes;
1291 last_buf_index = i;
1292 }
1293 } else {
1294 break;
1295 }
1296 }
1297 if let Some(b'/') | Some(b'>') = data.get(i) {
1298 break;
1299 }
1300 if old_i == i {
1301 return None;
1303 }
1304 i = scan_attribute(data, i, newline_handler, &mut buffer, &mut last_buf_index)?;
1305 }
1306 }
1307
1308 i += scan_whitespace_no_nl(&data[i..]);
1309
1310 if close_tag_bytes == 0 {
1311 i += scan_ch(&data[i..], b'/');
1312 }
1313
1314 if scan_ch(&data[i..], b'>') == 0 {
1315 None
1316 } else {
1317 i += 1;
1318 if !buffer.is_empty() {
1319 buffer.extend(&data[last_buf_index..i]);
1320 }
1321 Some((buffer, i))
1322 }
1323}
1324
1325pub(crate) fn scan_autolink(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>, LinkType)> {
1327 scan_uri(text, start_ix)
1328 .map(|(bytes, uri)| (bytes, uri, LinkType::Autolink))
1329 .or_else(|| scan_email(text, start_ix).map(|(bytes, uri)| (bytes, uri, LinkType::Email)))
1330}
1331
1332fn scan_uri(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1334 let bytes = &text.as_bytes()[start_ix..];
1335
1336 if bytes.is_empty() || !is_ascii_alpha(bytes[0]) {
1338 return None;
1339 }
1340
1341 let mut i = 1;
1342
1343 while i < bytes.len() {
1344 let c = bytes[i];
1345 i += 1;
1346 match c {
1347 c if is_ascii_alphanumeric(c) => (),
1348 b'.' | b'-' | b'+' => (),
1349 b':' => break,
1350 _ => return None,
1351 }
1352 }
1353
1354 if !(3..=33).contains(&i) {
1357 return None;
1358 }
1359
1360 while i < bytes.len() {
1361 match bytes[i] {
1362 b'>' => return Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into())),
1363 b'\0'..=b' ' | b'<' => return None,
1364 _ => (),
1365 }
1366 i += 1;
1367 }
1368
1369 None
1370}
1371
1372fn scan_email(text: &str, start_ix: usize) -> Option<(usize, CowStr<'_>)> {
1374 let bytes = &text.as_bytes()[start_ix..];
1376 let mut i = 0;
1377
1378 while i < bytes.len() {
1379 let c = bytes[i];
1380 i += 1;
1381 match c {
1382 c if is_ascii_alphanumeric(c) => (),
1383 b'.' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'/' | b'=' | b'?'
1384 | b'^' | b'_' | b'`' | b'{' | b'|' | b'}' | b'~' | b'-' => (),
1385 b'@' if i > 1 => break,
1386 _ => return None,
1387 }
1388 }
1389
1390 loop {
1391 let label_start_ix = i;
1392 let mut fresh_label = true;
1393
1394 while i < bytes.len() {
1395 match bytes[i] {
1396 c if is_ascii_alphanumeric(c) => (),
1397 b'-' if fresh_label => {
1398 return None;
1399 }
1400 b'-' => (),
1401 _ => break,
1402 }
1403 fresh_label = false;
1404 i += 1;
1405 }
1406
1407 if i == label_start_ix || i - label_start_ix > 63 || bytes[i - 1] == b'-' {
1408 return None;
1409 }
1410
1411 if scan_ch(&bytes[i..], b'.') == 0 {
1412 break;
1413 }
1414 i += 1;
1415 }
1416
1417 if scan_ch(&bytes[i..], b'>') == 0 {
1418 return None;
1419 }
1420
1421 Some((start_ix + i + 1, text[start_ix..(start_ix + i)].into()))
1422}
1423
1424pub(crate) fn scan_inline_html_comment(
1427 bytes: &[u8],
1428 mut ix: usize,
1429 scan_guard: &mut HtmlScanGuard,
1430) -> Option<usize> {
1431 let c = *bytes.get(ix)?;
1432 ix += 1;
1433 match c {
1434 b'-' if ix > scan_guard.comment => {
1437 if *bytes.get(ix)? != b'-' {
1439 return None;
1440 }
1441 ix -= 1;
1449
1450 while let Some(x) = memchr(b'-', &bytes[ix..]) {
1451 ix += x + 1;
1452 scan_guard.comment = ix;
1453 if scan_ch(&bytes[ix..], b'-') == 1 && scan_ch(&bytes[ix + 1..], b'>') == 1 {
1454 return Some(ix + 2);
1455 }
1456 }
1457 None
1458 }
1459 b'[' if bytes[ix..].starts_with(b"CDATA[") && ix > scan_guard.cdata => {
1462 ix += b"CDATA[".len();
1463 ix = memchr(b']', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1464 let close_brackets = scan_ch_repeat(&bytes[ix..], b']');
1465 ix += close_brackets;
1466
1467 if close_brackets == 0 || scan_ch(&bytes[ix..], b'>') == 0 {
1468 scan_guard.cdata = ix;
1469 None
1470 } else {
1471 Some(ix + 1)
1472 }
1473 }
1474 _ if c.is_ascii_alphabetic() && ix > scan_guard.declaration => {
1477 ix = memchr(b'>', &bytes[ix..]).map_or(bytes.len(), |x| ix + x);
1478 if scan_ch(&bytes[ix..], b'>') == 0 {
1479 scan_guard.declaration = ix;
1480 None
1481 } else {
1482 Some(ix + 1)
1483 }
1484 }
1485 _ => None,
1486 }
1487}
1488
1489pub(crate) fn scan_inline_html_processing(
1492 bytes: &[u8],
1493 mut ix: usize,
1494 scan_guard: &mut HtmlScanGuard,
1495) -> Option<usize> {
1496 if ix <= scan_guard.processing {
1497 return None;
1498 }
1499 while let Some(offset) = memchr(b'?', &bytes[ix..]) {
1500 ix += offset + 1;
1501 if scan_ch(&bytes[ix..], b'>') == 1 {
1502 return Some(ix + 1);
1503 }
1504 }
1505 scan_guard.processing = ix;
1506 None
1507}
1508
#[cfg(test)]
mod test {
    use super::*;

    /// A digit run far longer than the nine digits `scan_listitem` parses is
    /// not a list marker.
    #[test]
    fn overflow_list() {
        let line = b"4444444444444444444444444444444444444444444444444444444444!";
        assert!(scan_listitem(line).is_none());
    }

    /// A digit run followed by `!` is rejected rather than overflowing.
    #[test]
    fn overflow_by_addition() {
        let line = b"1844674407370955161615!";
        assert!(scan_listitem(line).is_none());
    }

    /// Addresses that `scan_email` must accept (scanning starts after `<`).
    #[test]
    fn good_emails() {
        let accepted = [
            "<a@b.c>",
            "<a@b>",
            "<a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-@example.com>",
            "<a@sixty-three-letters-in-this-identifier-----------------------63>",
        ];
        for email in accepted.iter() {
            assert!(scan_email(email, 1).is_some());
        }
    }

    /// Addresses that `scan_email` must reject (scanning starts after `<`).
    #[test]
    fn bad_emails() {
        let rejected = [
            "<@b.c>",
            "<foo@-example.com>",
            "<foo@example-.com>",
            "<a@notrailingperiod.>",
            "<a(noparens)@example.com>",
            "<\"noquotes\"@example.com>",
            "<a@sixty-four-letters-in-this-identifier-------------------------64>",
        ];
        for email in rejected.iter() {
            assert!(scan_email(email, 1).is_none());
        }
    }
}