quick_xml/reader/state.rs
1#[cfg(feature = "encoding")]
2use encoding_rs::UTF_8;
3
4use crate::encoding::Decoder;
5use crate::errors::{Error, IllFormedError, Result, SyntaxError};
6use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
7#[cfg(feature = "encoding")]
8use crate::reader::EncodingRef;
9use crate::reader::{is_whitespace, BangType, Config, ParseState};
10
11/// A struct that holds a current reader state and a parser configuration.
12/// It is independent on a way of reading data: the reader feed data into it and
13/// get back produced [`Event`]s.
14#[derive(Clone, Debug)]
15pub(super) struct ReaderState {
16 /// Number of bytes read from the source of data since the reader was created
17 pub offset: usize,
18 /// A snapshot of an `offset` of the last error returned. It can be less than
19 /// `offset`, because some errors conveniently report at earlier position,
20 /// and changing `offset` is not possible, because `Error::IllFormed` errors
21 /// are recoverable.
22 pub last_error_offset: usize,
23 /// Defines how to process next byte
24 pub state: ParseState,
25 /// User-defined settings that affect parsing
26 pub config: Config,
27 /// All currently Started elements which didn't have a matching
28 /// End element yet.
29 ///
30 /// For an XML
31 ///
32 /// ```xml
33 /// <root><one/><inner attr="value">|<tag></inner></root>
34 /// ```
35 /// when cursor at the `|` position buffer contains:
36 ///
37 /// ```text
38 /// rootinner
39 /// ^ ^
40 /// ```
41 ///
42 /// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
43 /// (0 and 4 in that case).
44 opened_buffer: Vec<u8>,
45 /// Opened name start indexes into [`Self::opened_buffer`]. See documentation
46 /// for that field for details
47 opened_starts: Vec<usize>,
48
49 #[cfg(feature = "encoding")]
50 /// Reference to the encoding used to read an XML
51 pub encoding: EncodingRef,
52}
53
54impl ReaderState {
55 /// Trims end whitespaces from `bytes`, if required, and returns a [`Text`]
56 /// event or an [`Eof`] event, if text after trimming is empty.
57 ///
58 /// # Parameters
59 /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
60 ///
61 /// [`Text`]: Event::Text
62 /// [`Eof`]: Event::Eof
63 pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
64 let mut content = bytes;
65
66 if self.config.trim_text_end {
67 // Skip the ending '<'
68 let len = bytes
69 .iter()
70 .rposition(|&b| !is_whitespace(b))
71 .map_or_else(|| bytes.len(), |p| p + 1);
72 content = &bytes[..len];
73 }
74
75 if content.is_empty() {
76 Ok(Event::Eof)
77 } else {
78 Ok(Event::Text(BytesText::wrap(content, self.decoder())))
79 }
80 }
81
82 /// reads `BytesElement` starting with a `!`,
83 /// return `Comment`, `CData` or `DocType` event
84 pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
85 let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
86 string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
87 };
88
89 let len = buf.len();
90 match bang_type {
91 BangType::Comment if buf.starts_with(b"!--") => {
92 debug_assert!(buf.ends_with(b"--"));
93 if self.config.check_comments {
94 // search if '--' not in comments
95 let mut haystack = &buf[3..len - 2];
96 let mut off = 0;
97 while let Some(p) = memchr::memchr(b'-', haystack) {
98 off += p + 1;
99 // if next byte after `-` is also `-`, return an error
100 if buf[3 + off] == b'-' {
101 // Explanation of the magic:
102 //
103 // - `self.offset`` just after `>`,
104 // - `buf` contains `!-- con--tent --`
105 // - `p` is counted from byte after `<!--`
106 //
107 // <!-- con--tent -->:
108 // ~~~~~~~~~~~~~~~~ : - buf
109 // : =========== : - zone of search (possible values of `p`)
110 // : |---p : - p is counted from | (| is 0)
111 // : : : ^ - self.offset
112 // ^ : : - self.offset - len
113 // ^ : - self.offset - len + 2
114 // ^ - self.offset - len + 2 + p
115 self.last_error_offset = self.offset - len + 2 + p;
116 return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment));
117 }
118 // Continue search after single `-` (+1 to skip it)
119 haystack = &haystack[p + 1..];
120 }
121 }
122 Ok(Event::Comment(BytesText::wrap(
123 // Cut of `!--` and `--` from start and end
124 &buf[3..len - 2],
125 self.decoder(),
126 )))
127 }
128 BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
129 debug_assert!(buf.ends_with(b"]]"));
130 Ok(Event::CData(BytesCData::wrap(
131 // Cut of `![CDATA[` and `]]` from start and end
132 &buf[8..len - 2],
133 self.decoder(),
134 )))
135 }
136 BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
137 match buf[8..].iter().position(|&b| !is_whitespace(b)) {
138 Some(start) => Ok(Event::DocType(BytesText::wrap(
139 // Cut of `!DOCTYPE` and any number of spaces from start
140 &buf[8 + start..],
141 self.decoder(),
142 ))),
143 None => {
144 // Because we here, we at least read `<!DOCTYPE>` and offset after `>`.
145 // We want report error at place where name is expected - this is just
146 // before `>`
147 self.last_error_offset = self.offset - 1;
148 return Err(Error::IllFormed(IllFormedError::MissingDoctypeName));
149 }
150 }
151 }
152 _ => {
153 // <!....>
154 // ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`.
155 // ^------- We report error at that position, so we need to subtract 2 and buf len
156 self.last_error_offset = self.offset - len - 2;
157 Err(bang_type.to_err())
158 }
159 }
160 }
161
162 /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
163 /// end name matches the last opened start name if `self.config.check_end_names` is set.
164 pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
165 // Strip the `/` character. `content` contains data between `</` and `>`
166 let content = &buf[1..];
167 // XML standard permits whitespaces after the markup name in closing tags.
168 // Let's strip them from the buffer before comparing tag names.
169 let name = if self.config.trim_markup_names_in_closing_tags {
170 if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {
171 &content[..pos_end_name + 1]
172 } else {
173 content
174 }
175 } else {
176 content
177 };
178
179 let decoder = self.decoder();
180
181 // Get the index in self.opened_buffer of the name of the last opened tag
182 match self.opened_starts.pop() {
183 Some(start) => {
184 if self.config.check_end_names {
185 let expected = &self.opened_buffer[start..];
186 if name != expected {
187 let expected = decoder.decode(expected).unwrap_or_default().into_owned();
188 // #513: In order to allow error recovery we should drop content of the buffer
189 self.opened_buffer.truncate(start);
190
191 // Report error at start of the end tag at `<` character
192 // -2 for `<` and `>`
193 self.last_error_offset = self.offset - buf.len() - 2;
194 return Err(Error::IllFormed(IllFormedError::MismatchedEndTag {
195 expected,
196 found: decoder.decode(name).unwrap_or_default().into_owned(),
197 }));
198 }
199 }
200
201 self.opened_buffer.truncate(start);
202 }
203 None => {
204 // Report error at start of the end tag at `<` character
205 // -2 for `<` and `>`
206 self.last_error_offset = self.offset - buf.len() - 2;
207 return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag(
208 decoder.decode(name).unwrap_or_default().into_owned(),
209 )));
210 }
211 }
212
213 Ok(Event::End(BytesEnd::wrap(name.into())))
214 }
215
216 /// `buf` contains data between `<` and `>` and the first byte is `?`.
217 /// `self.offset` already after the `>`
218 ///
219 /// Returns `Decl` or `PI` event
220 pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
221 debug_assert!(buf.len() > 0);
222 debug_assert_eq!(buf[0], b'?');
223
224 let len = buf.len();
225 // We accept at least <??>
226 // ~~ - len = 2
227 if len > 1 && buf[len - 1] == b'?' {
228 // Cut of `?` and `?` from start and end
229 let content = &buf[1..len - 1];
230 let len = content.len();
231
232 if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) {
233 let event = BytesDecl::from_start(BytesStart::wrap(content, 3));
234
235 // Try getting encoding from the declaration event
236 #[cfg(feature = "encoding")]
237 if self.encoding.can_be_refined() {
238 if let Some(encoding) = event.encoder() {
239 self.encoding = EncodingRef::XmlDetected(encoding);
240 }
241 }
242
243 Ok(Event::Decl(event))
244 } else {
245 Ok(Event::PI(BytesText::wrap(content, self.decoder())))
246 }
247 } else {
248 // <?....EOF
249 // ^^^^^ - `buf` does not contains `<`, but we want to report error at `<`,
250 // so we move offset to it (-2 for `<` and `>`)
251 self.last_error_offset = self.offset - len - 2;
252 Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl))
253 }
254 }
255
256 /// Converts content of a tag to a `Start` or an `Empty` event
257 ///
258 /// # Parameters
259 /// - `content`: Content of a tag between `<` and `>`
260 pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result<Event<'b>> {
261 let len = content.len();
262 let name_end = content
263 .iter()
264 .position(|&b| is_whitespace(b))
265 .unwrap_or(len);
266 if let Some(&b'/') = content.last() {
267 // This is self-closed tag `<something/>`
268 let name_len = if name_end < len { name_end } else { len - 1 };
269 let event = BytesStart::wrap(&content[..len - 1], name_len);
270
271 if self.config.expand_empty_elements {
272 self.state = ParseState::Empty;
273 self.opened_starts.push(self.opened_buffer.len());
274 self.opened_buffer.extend(&content[..name_len]);
275 Ok(Event::Start(event))
276 } else {
277 Ok(Event::Empty(event))
278 }
279 } else {
280 // #514: Always store names event when .check_end_names == false,
281 // because checks can be temporary disabled and when they would be
282 // enabled, we should have that information
283 self.opened_starts.push(self.opened_buffer.len());
284 self.opened_buffer.extend(&content[..name_end]);
285 Ok(Event::Start(BytesStart::wrap(content, name_end)))
286 }
287 }
288
289 #[inline]
290 pub fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
291 self.state = ParseState::ClosedTag;
292 let name = self
293 .opened_buffer
294 .split_off(self.opened_starts.pop().unwrap());
295 Ok(Event::End(BytesEnd::wrap(name.into())))
296 }
297
298 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
299 ///
300 /// If [`encoding`] feature is enabled, the used encoding may change after
301 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
302 ///
303 /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
304 /// defaults to UTF-8.
305 ///
306 /// [`encoding`]: ../../index.html#encoding
307 pub fn decoder(&self) -> Decoder {
308 Decoder {
309 #[cfg(feature = "encoding")]
310 encoding: self.encoding.encoding(),
311 }
312 }
313}
314
315impl Default for ReaderState {
316 fn default() -> Self {
317 Self {
318 offset: 0,
319 last_error_offset: 0,
320 state: ParseState::Init,
321 config: Config::default(),
322 opened_buffer: Vec::new(),
323 opened_starts: Vec::new(),
324
325 #[cfg(feature = "encoding")]
326 encoding: EncodingRef::Implicit(UTF_8),
327 }
328 }
329}