pulldown_cmark/linklabel.rs
1// Copyright 2018 Google LLC
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21//! Link label parsing and matching.
22
23use unicase::UniCase;
24
25use crate::scanners::{is_ascii_punctuation, is_ascii_whitespace, scan_eol};
26use crate::strings::CowStr;
27
28#[derive(Debug)]
29pub(crate) enum ReferenceLabel<'a> {
30 Link(CowStr<'a>),
31 Footnote(CowStr<'a>),
32}
33
34pub(crate) type LinkLabel<'a> = UniCase<CowStr<'a>>;
35
36pub(crate) type FootnoteLabel<'a> = UniCase<CowStr<'a>>;
37
38/// Assumes the opening bracket has already been scanned.
39/// The line break handler determines what happens when a linebreak
40/// is found. It is passed the bytes following the line break and
41/// either returns `Some(k)`, where `k` is the number of bytes to skip,
42/// or `None` to abort parsing the label.
43/// Returns the number of bytes read (including closing bracket) and label on success.
44pub(crate) fn scan_link_label_rest<'t>(
45 text: &'t str,
46 linebreak_handler: &dyn Fn(&[u8]) -> Option<usize>,
47 is_in_table: bool,
48) -> Option<(usize, CowStr<'t>)> {
49 let bytes = text.as_bytes();
50 let mut ix = 0;
51 let mut only_white_space = true;
52 let mut codepoints = 0;
53 // no worries, doesn't allocate until we push things onto it
54 let mut label = String::new();
55 let mut mark = 0;
56
57 loop {
58 if codepoints >= 1000 {
59 return None;
60 }
61 match *bytes.get(ix)? {
62 b'[' => return None,
63 b']' => break,
64 // Backslash escapes in link references are normally untouched, but
65 // tables are an exception, because they're parsed as-if the tables
66 // were parsed in a discrete pass, changing `\|` to `|`, and then
67 // passing the changed string to the inline parser.
68 b'|' if is_in_table && ix != 0 && bytes.get(ix - 1) == Some(&b'\\') => {
69 // only way to reach this spot is to have `\\|` (even number of `\` before `|`)
70 label.push_str(&text[mark..ix - 1]);
71 label.push('|');
72 ix += 1;
73 only_white_space = false;
74 mark = ix;
75 }
76 b'\\' if is_in_table && bytes.get(ix + 1) == Some(&b'|') => {
77 // only way to reach this spot is to have `\|` (odd number of `\` before `|`)
78 label.push_str(&text[mark..ix]);
79 label.push('|');
80 ix += 2;
81 codepoints += 1;
82 only_white_space = false;
83 mark = ix;
84 }
85 b'\\' if is_ascii_punctuation(*bytes.get(ix + 1)?) => {
86 ix += 2;
87 codepoints += 2;
88 only_white_space = false;
89 }
90 b if is_ascii_whitespace(b) => {
91 // normalize labels by collapsing whitespaces, including linebreaks
92 let mut whitespaces = 0;
93 let mut linebreaks = 0;
94 let whitespace_start = ix;
95
96 while ix < bytes.len() && is_ascii_whitespace(bytes[ix]) {
97 if let Some(eol_bytes) = scan_eol(&bytes[ix..]) {
98 linebreaks += 1;
99 if linebreaks > 1 {
100 return None;
101 }
102 ix += eol_bytes;
103 ix += linebreak_handler(&bytes[ix..])?;
104 whitespaces += 2; // indicate that we need to replace
105 } else {
106 whitespaces += if bytes[ix] == b' ' { 1 } else { 2 };
107 ix += 1;
108 }
109 }
110 if whitespaces > 1 {
111 label.push_str(&text[mark..whitespace_start]);
112 label.push(' ');
113 mark = ix;
114 codepoints += ix - whitespace_start;
115 } else {
116 codepoints += 1;
117 }
118 }
119 b => {
120 only_white_space = false;
121 ix += 1;
122 if b & 0b1000_0000 != 0 {
123 codepoints += 1;
124 }
125 }
126 }
127 }
128
129 if only_white_space {
130 None
131 } else {
132 let cow = if mark == 0 {
133 let asciiws = &[' ', '\r', '\n', '\t'][..];
134 text[..ix].trim_matches(asciiws).into()
135 } else {
136 label.push_str(&text[mark..ix]);
137 while matches!(
138 label.as_bytes().last(),
139 Some(&b' ' | &b'\r' | &b'\n' | &b'\t')
140 ) {
141 label.pop();
142 }
143 while matches!(
144 label.as_bytes().first(),
145 Some(&b' ' | &b'\r' | &b'\n' | &b'\t')
146 ) {
147 label.remove(0);
148 }
149 label.into()
150 };
151 Some((ix + 1, cow))
152 }
153}
154
#[cfg(test)]
mod test {
    use super::scan_link_label_rest;

    /// Runs of tabs inside a label come back as single regular spaces.
    #[test]
    fn whitespace_normalization() {
        // A handler that aborts on any line break; the scan must succeed anyway.
        let abort_on_linebreak = |_: &[u8]| -> Option<usize> { None };

        let scanned = scan_link_label_rest(
            "«\t\tBlurry Eyes\t\t»][blurry_eyes]",
            &abort_on_linebreak,
            false,
        );
        let (_consumed, label) = scanned.unwrap();

        // regular spaces!
        assert_eq!("« Blurry Eyes »", label.as_ref());
    }

    /// A label containing `\r\n` line breaks is accepted when the handler allows them.
    #[test]
    fn return_carriage_linefeed_ok() {
        // A handler that accepts every line break and skips no extra bytes.
        let accept_linebreak = |_: &[u8]| -> Option<usize> { Some(0) };

        let scanned = scan_link_label_rest("hello\r\nworld\r\n]", &accept_linebreak, false);
        assert!(scanned.is_some());
    }
}
173}