quick_xml/reader/
state.rs

1#[cfg(feature = "encoding")]
2use encoding_rs::UTF_8;
3
4use crate::encoding::Decoder;
5use crate::errors::{Error, IllFormedError, Result, SyntaxError};
6use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
7#[cfg(feature = "encoding")]
8use crate::reader::EncodingRef;
9use crate::reader::{is_whitespace, BangType, Config, ParseState};
10
11/// A struct that holds a current reader state and a parser configuration.
12/// It is independent on a way of reading data: the reader feed data into it and
13/// get back produced [`Event`]s.
14#[derive(Clone, Debug)]
15pub(super) struct ReaderState {
16    /// Number of bytes read from the source of data since the reader was created
17    pub offset: usize,
18    /// A snapshot of an `offset` of the last error returned. It can be less than
19    /// `offset`, because some errors conveniently report at earlier position,
20    /// and changing `offset` is not possible, because `Error::IllFormed` errors
21    /// are recoverable.
22    pub last_error_offset: usize,
23    /// Defines how to process next byte
24    pub state: ParseState,
25    /// User-defined settings that affect parsing
26    pub config: Config,
27    /// All currently Started elements which didn't have a matching
28    /// End element yet.
29    ///
30    /// For an XML
31    ///
32    /// ```xml
33    /// <root><one/><inner attr="value">|<tag></inner></root>
34    /// ```
35    /// when cursor at the `|` position buffer contains:
36    ///
37    /// ```text
38    /// rootinner
39    /// ^   ^
40    /// ```
41    ///
42    /// The `^` symbols shows which positions stored in the [`Self::opened_starts`]
43    /// (0 and 4 in that case).
44    opened_buffer: Vec<u8>,
45    /// Opened name start indexes into [`Self::opened_buffer`]. See documentation
46    /// for that field for details
47    opened_starts: Vec<usize>,
48
49    #[cfg(feature = "encoding")]
50    /// Reference to the encoding used to read an XML
51    pub encoding: EncodingRef,
52}
53
54impl ReaderState {
55    /// Trims end whitespaces from `bytes`, if required, and returns a [`Text`]
56    /// event or an [`Eof`] event, if text after trimming is empty.
57    ///
58    /// # Parameters
59    /// - `bytes`: data from the start of stream to the first `<` or from `>` to `<`
60    ///
61    /// [`Text`]: Event::Text
62    /// [`Eof`]: Event::Eof
63    pub fn emit_text<'b>(&mut self, bytes: &'b [u8]) -> Result<Event<'b>> {
64        let mut content = bytes;
65
66        if self.config.trim_text_end {
67            // Skip the ending '<'
68            let len = bytes
69                .iter()
70                .rposition(|&b| !is_whitespace(b))
71                .map_or_else(|| bytes.len(), |p| p + 1);
72            content = &bytes[..len];
73        }
74
75        if content.is_empty() {
76            Ok(Event::Eof)
77        } else {
78            Ok(Event::Text(BytesText::wrap(content, self.decoder())))
79        }
80    }
81
82    /// reads `BytesElement` starting with a `!`,
83    /// return `Comment`, `CData` or `DocType` event
84    pub fn emit_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result<Event<'b>> {
85        let uncased_starts_with = |string: &[u8], prefix: &[u8]| {
86            string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix)
87        };
88
89        let len = buf.len();
90        match bang_type {
91            BangType::Comment if buf.starts_with(b"!--") => {
92                debug_assert!(buf.ends_with(b"--"));
93                if self.config.check_comments {
94                    // search if '--' not in comments
95                    let mut haystack = &buf[3..len - 2];
96                    let mut off = 0;
97                    while let Some(p) = memchr::memchr(b'-', haystack) {
98                        off += p + 1;
99                        // if next byte after `-` is also `-`, return an error
100                        if buf[3 + off] == b'-' {
101                            // Explanation of the magic:
102                            //
103                            // - `self.offset`` just after `>`,
104                            // - `buf` contains `!-- con--tent --`
105                            // - `p` is counted from byte after `<!--`
106                            //
107                            // <!-- con--tent -->:
108                            //  ~~~~~~~~~~~~~~~~ : - buf
109                            //   : ===========   : - zone of search (possible values of `p`)
110                            //   : |---p         : - p is counted from | (| is 0)
111                            //   : :   :         ^ - self.offset
112                            //   ^ :   :           - self.offset - len
113                            //     ^   :           - self.offset - len + 2
114                            //         ^           - self.offset - len + 2 + p
115                            self.last_error_offset = self.offset - len + 2 + p;
116                            return Err(Error::IllFormed(IllFormedError::DoubleHyphenInComment));
117                        }
118                        // Continue search after single `-` (+1 to skip it)
119                        haystack = &haystack[p + 1..];
120                    }
121                }
122                Ok(Event::Comment(BytesText::wrap(
123                    // Cut of `!--` and `--` from start and end
124                    &buf[3..len - 2],
125                    self.decoder(),
126                )))
127            }
128            BangType::CData if uncased_starts_with(buf, b"![CDATA[") => {
129                debug_assert!(buf.ends_with(b"]]"));
130                Ok(Event::CData(BytesCData::wrap(
131                    // Cut of `![CDATA[` and `]]` from start and end
132                    &buf[8..len - 2],
133                    self.decoder(),
134                )))
135            }
136            BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => {
137                match buf[8..].iter().position(|&b| !is_whitespace(b)) {
138                    Some(start) => Ok(Event::DocType(BytesText::wrap(
139                        // Cut of `!DOCTYPE` and any number of spaces from start
140                        &buf[8 + start..],
141                        self.decoder(),
142                    ))),
143                    None => {
144                        // Because we here, we at least read `<!DOCTYPE>` and offset after `>`.
145                        // We want report error at place where name is expected - this is just
146                        // before `>`
147                        self.last_error_offset = self.offset - 1;
148                        return Err(Error::IllFormed(IllFormedError::MissingDoctypeName));
149                    }
150                }
151            }
152            _ => {
153                // <!....>
154                //  ^^^^^ - `buf` does not contain `<` and `>`, but `self.offset` is after `>`.
155                // ^------- We report error at that position, so we need to subtract 2 and buf len
156                self.last_error_offset = self.offset - len - 2;
157                Err(bang_type.to_err())
158            }
159        }
160    }
161
162    /// Wraps content of `buf` into the [`Event::End`] event. Does the check that
163    /// end name matches the last opened start name if `self.config.check_end_names` is set.
164    pub fn emit_end<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
165        // Strip the `/` character. `content` contains data between `</` and `>`
166        let content = &buf[1..];
167        // XML standard permits whitespaces after the markup name in closing tags.
168        // Let's strip them from the buffer before comparing tag names.
169        let name = if self.config.trim_markup_names_in_closing_tags {
170            if let Some(pos_end_name) = content.iter().rposition(|&b| !is_whitespace(b)) {
171                &content[..pos_end_name + 1]
172            } else {
173                content
174            }
175        } else {
176            content
177        };
178
179        let decoder = self.decoder();
180
181        // Get the index in self.opened_buffer of the name of the last opened tag
182        match self.opened_starts.pop() {
183            Some(start) => {
184                if self.config.check_end_names {
185                    let expected = &self.opened_buffer[start..];
186                    if name != expected {
187                        let expected = decoder.decode(expected).unwrap_or_default().into_owned();
188                        // #513: In order to allow error recovery we should drop content of the buffer
189                        self.opened_buffer.truncate(start);
190
191                        // Report error at start of the end tag at `<` character
192                        // -2 for `<` and `>`
193                        self.last_error_offset = self.offset - buf.len() - 2;
194                        return Err(Error::IllFormed(IllFormedError::MismatchedEndTag {
195                            expected,
196                            found: decoder.decode(name).unwrap_or_default().into_owned(),
197                        }));
198                    }
199                }
200
201                self.opened_buffer.truncate(start);
202            }
203            None => {
204                // Report error at start of the end tag at `<` character
205                // -2 for `<` and `>`
206                self.last_error_offset = self.offset - buf.len() - 2;
207                return Err(Error::IllFormed(IllFormedError::UnmatchedEndTag(
208                    decoder.decode(name).unwrap_or_default().into_owned(),
209                )));
210            }
211        }
212
213        Ok(Event::End(BytesEnd::wrap(name.into())))
214    }
215
216    /// `buf` contains data between `<` and `>` and the first byte is `?`.
217    /// `self.offset` already after the `>`
218    ///
219    /// Returns `Decl` or `PI` event
220    pub fn emit_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
221        debug_assert!(buf.len() > 0);
222        debug_assert_eq!(buf[0], b'?');
223
224        let len = buf.len();
225        // We accept at least <??>
226        //                     ~~ - len = 2
227        if len > 1 && buf[len - 1] == b'?' {
228            // Cut of `?` and `?` from start and end
229            let content = &buf[1..len - 1];
230            let len = content.len();
231
232            if content.starts_with(b"xml") && (len == 3 || is_whitespace(content[3])) {
233                let event = BytesDecl::from_start(BytesStart::wrap(content, 3));
234
235                // Try getting encoding from the declaration event
236                #[cfg(feature = "encoding")]
237                if self.encoding.can_be_refined() {
238                    if let Some(encoding) = event.encoder() {
239                        self.encoding = EncodingRef::XmlDetected(encoding);
240                    }
241                }
242
243                Ok(Event::Decl(event))
244            } else {
245                Ok(Event::PI(BytesText::wrap(content, self.decoder())))
246            }
247        } else {
248            // <?....EOF
249            //  ^^^^^ - `buf` does not contains `<`, but we want to report error at `<`,
250            //          so we move offset to it (-2 for `<` and `>`)
251            self.last_error_offset = self.offset - len - 2;
252            Err(Error::Syntax(SyntaxError::UnclosedPIOrXmlDecl))
253        }
254    }
255
256    /// Converts content of a tag to a `Start` or an `Empty` event
257    ///
258    /// # Parameters
259    /// - `content`: Content of a tag between `<` and `>`
260    pub fn emit_start<'b>(&mut self, content: &'b [u8]) -> Result<Event<'b>> {
261        let len = content.len();
262        let name_end = content
263            .iter()
264            .position(|&b| is_whitespace(b))
265            .unwrap_or(len);
266        if let Some(&b'/') = content.last() {
267            // This is self-closed tag `<something/>`
268            let name_len = if name_end < len { name_end } else { len - 1 };
269            let event = BytesStart::wrap(&content[..len - 1], name_len);
270
271            if self.config.expand_empty_elements {
272                self.state = ParseState::Empty;
273                self.opened_starts.push(self.opened_buffer.len());
274                self.opened_buffer.extend(&content[..name_len]);
275                Ok(Event::Start(event))
276            } else {
277                Ok(Event::Empty(event))
278            }
279        } else {
280            // #514: Always store names event when .check_end_names == false,
281            // because checks can be temporary disabled and when they would be
282            // enabled, we should have that information
283            self.opened_starts.push(self.opened_buffer.len());
284            self.opened_buffer.extend(&content[..name_end]);
285            Ok(Event::Start(BytesStart::wrap(content, name_end)))
286        }
287    }
288
289    #[inline]
290    pub fn close_expanded_empty(&mut self) -> Result<Event<'static>> {
291        self.state = ParseState::ClosedTag;
292        let name = self
293            .opened_buffer
294            .split_off(self.opened_starts.pop().unwrap());
295        Ok(Event::End(BytesEnd::wrap(name.into())))
296    }
297
298    /// Get the decoder, used to decode bytes, read by this reader, to the strings.
299    ///
300    /// If [`encoding`] feature is enabled, the used encoding may change after
301    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
302    ///
303    /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
304    /// defaults to UTF-8.
305    ///
306    /// [`encoding`]: ../../index.html#encoding
307    pub fn decoder(&self) -> Decoder {
308        Decoder {
309            #[cfg(feature = "encoding")]
310            encoding: self.encoding.encoding(),
311        }
312    }
313}
314
315impl Default for ReaderState {
316    fn default() -> Self {
317        Self {
318            offset: 0,
319            last_error_offset: 0,
320            state: ParseState::Init,
321            config: Config::default(),
322            opened_buffer: Vec::new(),
323            opened_starts: Vec::new(),
324
325            #[cfg(feature = "encoding")]
326            encoding: EncodingRef::Implicit(UTF_8),
327        }
328    }
329}