quick_xml/reader/
slice_reader.rs

1//! This is an implementation of [`Reader`] for reading from a `&[u8]` as
2//! underlying byte stream. This implementation supports not using an
3//! intermediate buffer as the byte slice itself can be used to borrow from.
4
5use std::borrow::Cow;
6
7#[cfg(feature = "encoding")]
8use crate::reader::EncodingRef;
9#[cfg(feature = "encoding")]
10use encoding_rs::{Encoding, UTF_8};
11
12use crate::errors::{Error, Result};
13use crate::events::Event;
14use crate::name::QName;
15use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource};
16
17/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
18/// This implementation supports not using an intermediate buffer as the byte slice
19/// itself can be used to borrow from.
20impl<'a> Reader<&'a [u8]> {
21    /// Creates an XML reader from a string slice.
22    #[allow(clippy::should_implement_trait)]
23    pub fn from_str(s: &'a str) -> Self {
24        // Rust strings are guaranteed to be UTF-8, so lock the encoding
25        #[cfg(feature = "encoding")]
26        {
27            let mut reader = Self::from_reader(s.as_bytes());
28            reader.state.encoding = EncodingRef::Explicit(UTF_8);
29            reader
30        }
31
32        #[cfg(not(feature = "encoding"))]
33        Self::from_reader(s.as_bytes())
34    }
35
36    /// Read an event that borrows from the input rather than a buffer.
37    ///
38    /// There is no asynchronous `read_event_async()` version of this function,
39    /// because it is not necessary -- the contents are already in memory and no IO
40    /// is needed, therefore there is no potential for blocking.
41    ///
42    /// # Examples
43    ///
44    /// ```
45    /// # use pretty_assertions::assert_eq;
46    /// use quick_xml::events::Event;
47    /// use quick_xml::reader::Reader;
48    ///
49    /// let mut reader = Reader::from_str(r#"
50    ///     <tag1 att1 = "test">
51    ///        <tag2><!--Test comment-->Test</tag2>
52    ///        <tag2>Test 2</tag2>
53    ///     </tag1>
54    /// "#);
55    /// reader.config_mut().trim_text(true);
56    ///
57    /// let mut count = 0;
58    /// let mut txt = Vec::new();
59    /// loop {
60    ///     match reader.read_event().unwrap() {
61    ///         Event::Start(e) => count += 1,
62    ///         Event::Text(e) => txt.push(e.unescape().unwrap().into_owned()),
63    ///         Event::Eof => break,
64    ///         _ => (),
65    ///     }
66    /// }
67    /// assert_eq!(count, 3);
68    /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
69    /// ```
70    #[inline]
71    pub fn read_event(&mut self) -> Result<Event<'a>> {
72        self.read_event_impl(())
73    }
74
75    /// Reads until end element is found. This function is supposed to be called
76    /// after you already read a [`Start`] event.
77    ///
78    /// Returns a span that cover content between `>` of an opening tag and `<` of
79    /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
80    /// this method was called after reading expanded [`Start`] event.
81    ///
82    /// Manages nested cases where parent and child elements have the _literally_
83    /// same name.
84    ///
85    /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
86    /// will be returned. In particularly, that error will be returned if you call
87    /// this method without consuming the corresponding [`Start`] event first.
88    ///
89    /// The `end` parameter should contain name of the end element _in the reader
90    /// encoding_. It is good practice to always get that parameter using
91    /// [`BytesStart::to_end()`] method.
92    ///
93    /// The correctness of the skipped events does not checked, if you disabled
94    /// the [`check_end_names`] option.
95    ///
96    /// There is no asynchronous `read_to_end_async()` version of this function,
97    /// because it is not necessary -- the contents are already in memory and no IO
98    /// is needed, therefore there is no potential for blocking.
99    ///
100    /// # Namespaces
101    ///
102    /// While the `Reader` does not support namespace resolution, namespaces
103    /// does not change the algorithm for comparing names. Although the names
104    /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
105    /// same namespace, are semantically equivalent, `</b:name>` cannot close
106    /// `<a:name>`, because according to [the specification]
107    ///
108    /// > The end of every element that begins with a **start-tag** MUST be marked
109    /// > by an **end-tag** containing a name that echoes the element's type as
110    /// > given in the **start-tag**
111    ///
112    /// # Examples
113    ///
114    /// This example shows, how you can skip XML content after you read the
115    /// start event.
116    ///
117    /// ```
118    /// # use pretty_assertions::assert_eq;
119    /// use quick_xml::events::{BytesStart, Event};
120    /// use quick_xml::reader::Reader;
121    ///
122    /// let mut reader = Reader::from_str(r#"
123    ///     <outer>
124    ///         <inner>
125    ///             <inner></inner>
126    ///             <inner/>
127    ///             <outer></outer>
128    ///             <outer/>
129    ///         </inner>
130    ///     </outer>
131    /// "#);
132    /// reader.config_mut().trim_text(true);
133    ///
134    /// let start = BytesStart::new("outer");
135    /// let end   = start.to_end().into_owned();
136    ///
137    /// // First, we read a start event...
138    /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
139    ///
140    /// // ...then, we could skip all events to the corresponding end event.
141    /// // This call will correctly handle nested <outer> elements.
142    /// // Note, however, that this method does not handle namespaces.
143    /// reader.read_to_end(end.name()).unwrap();
144    ///
145    /// // At the end we should get an Eof event, because we ate the whole XML
146    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
147    /// ```
148    ///
149    /// [`Start`]: Event::Start
150    /// [`End`]: Event::End
151    /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
152    /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
153    /// [`check_end_names`]: crate::reader::Config::check_end_names
154    /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
155    pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
156        Ok(read_to_end!(self, end, (), read_event_impl, {}))
157    }
158
159    /// Reads content between start and end tags, including any markup. This
160    /// function is supposed to be called after you already read a [`Start`] event.
161    ///
162    /// Manages nested cases where parent and child elements have the _literally_
163    /// same name.
164    ///
165    /// This method does not unescape read data, instead it returns content
166    /// "as is" of the XML document. This is because it has no idea what text
167    /// it reads, and if, for example, it contains CDATA section, attempt to
168    /// unescape it content will spoil data.
169    ///
170    /// Any text will be decoded using the XML current [`decoder()`].
171    ///
172    /// Actually, this method perform the following code:
173    ///
174    /// ```ignore
175    /// let span = reader.read_to_end(end)?;
176    /// let text = reader.decoder().decode(&reader.inner_slice[span]);
177    /// ```
178    ///
179    /// # Examples
180    ///
181    /// This example shows, how you can read a HTML content from your XML document.
182    ///
183    /// ```
184    /// # use pretty_assertions::assert_eq;
185    /// # use std::borrow::Cow;
186    /// use quick_xml::events::{BytesStart, Event};
187    /// use quick_xml::reader::Reader;
188    ///
189    /// let mut reader = Reader::from_str("
190    ///     <html>
191    ///         <title>This is a HTML text</title>
192    ///         <p>Usual XML rules does not apply inside it
193    ///         <p>For example, elements not needed to be &quot;closed&quot;
194    ///     </html>
195    /// ");
196    /// reader.config_mut().trim_text(true);
197    ///
198    /// let start = BytesStart::new("html");
199    /// let end   = start.to_end().into_owned();
200    ///
201    /// // First, we read a start event...
202    /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
203    /// // ...and disable checking of end names because we expect HTML further...
204    /// reader.config_mut().check_end_names = false;
205    ///
206    /// // ...then, we could read text content until close tag.
207    /// // This call will correctly handle nested <html> elements.
208    /// let text = reader.read_text(end.name()).unwrap();
209    /// assert_eq!(text, Cow::Borrowed(r#"
210    ///         <title>This is a HTML text</title>
211    ///         <p>Usual XML rules does not apply inside it
212    ///         <p>For example, elements not needed to be &quot;closed&quot;
213    ///     "#));
214    /// assert!(matches!(text, Cow::Borrowed(_)));
215    ///
216    /// // Now we can enable checks again
217    /// reader.config_mut().check_end_names = true;
218    ///
219    /// // At the end we should get an Eof event, because we ate the whole XML
220    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
221    /// ```
222    ///
223    /// [`Start`]: Event::Start
224    /// [`decoder()`]: Self::decoder()
225    pub fn read_text(&mut self, end: QName) -> Result<Cow<'a, str>> {
226        // self.reader will be changed, so store original reference
227        let buffer = self.reader;
228        let span = self.read_to_end(end)?;
229
230        self.decoder().decode(&buffer[0..span.len()])
231    }
232}
233
234////////////////////////////////////////////////////////////////////////////////////////////////////
235
236/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
237/// that will be borrowed by events. This implementation provides a zero-copy deserialization
238impl<'a> XmlSource<'a, ()> for &'a [u8] {
239    #[cfg(not(feature = "encoding"))]
240    #[inline]
241    fn remove_utf8_bom(&mut self) -> Result<()> {
242        if self.starts_with(crate::encoding::UTF8_BOM) {
243            *self = &self[crate::encoding::UTF8_BOM.len()..];
244        }
245        Ok(())
246    }
247
248    #[cfg(feature = "encoding")]
249    #[inline]
250    fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>> {
251        if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
252            *self = &self[bom_len..];
253            return Ok(Some(enc));
254        }
255        Ok(None)
256    }
257
258    #[inline]
259    fn read_bytes_until(
260        &mut self,
261        byte: u8,
262        _buf: (),
263        position: &mut usize,
264    ) -> Result<(&'a [u8], bool)> {
265        // search byte must be within the ascii range
266        debug_assert!(byte.is_ascii());
267
268        if let Some(i) = memchr::memchr(byte, self) {
269            *position += i + 1;
270            let bytes = &self[..i];
271            *self = &self[i + 1..];
272            Ok((bytes, true))
273        } else {
274            *position += self.len();
275            let bytes = &self[..];
276            *self = &[];
277            Ok((bytes, false))
278        }
279    }
280
281    #[inline]
282    fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut usize) -> Result<&'a [u8]>
283    where
284        P: Parser,
285    {
286        if let Some(i) = parser.feed(self) {
287            // +1 for `>` which we do not include
288            *position += i + 1;
289            let bytes = &self[..i];
290            *self = &self[i + 1..];
291            return Ok(bytes);
292        }
293
294        *position += self.len();
295        Err(Error::Syntax(P::eof_error()))
296    }
297
298    #[inline]
299    fn read_bang_element(
300        &mut self,
301        _buf: (),
302        position: &mut usize,
303    ) -> Result<(BangType, &'a [u8])> {
304        // Peeked one bang ('!') before being called, so it's guaranteed to
305        // start with it.
306        debug_assert_eq!(self[0], b'!');
307
308        let bang_type = BangType::new(self[1..].first().copied())?;
309
310        if let Some((bytes, i)) = bang_type.parse(&[], self) {
311            *position += i;
312            *self = &self[i..];
313            return Ok((bang_type, bytes));
314        }
315
316        *position += self.len();
317        Err(bang_type.to_err())
318    }
319
320    #[inline]
321    fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
322        let whitespaces = self
323            .iter()
324            .position(|b| !is_whitespace(*b))
325            .unwrap_or(self.len());
326        *position += whitespaces;
327        *self = &self[whitespaces..];
328        Ok(())
329    }
330
331    #[inline]
332    fn skip_one(&mut self, byte: u8) -> Result<bool> {
333        // search byte must be within the ascii range
334        debug_assert!(byte.is_ascii());
335        if self.first() == Some(&byte) {
336            *self = &self[1..];
337            Ok(true)
338        } else {
339            Ok(false)
340        }
341    }
342
343    #[inline]
344    fn peek_one(&mut self) -> Result<Option<u8>> {
345        Ok(self.first().copied())
346    }
347}
348
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Default buffer constructor for the shared `check!` tests: it hands the
    /// byte array from the test back unchanged.
    fn identity<T>(value: T) -> T {
        value
    }

    // Instantiate the common reader test-suite for the slice-based source
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        ()
    );

    #[cfg(feature = "encoding")]
    mod encoding {
        use crate::events::Event;
        use crate::reader::Reader;
        use encoding_rs::UTF_8;
        use pretty_assertions::assert_eq;

        /// A `Reader` built with `from_str` must keep reporting UTF-8 even
        /// when the XML declaration names a different encoding.
        #[test]
        fn str_always_has_utf8() {
            let mut xml = Reader::from_str("<?xml encoding='UTF-16'?>");

            // UTF-8 before the declaration is read...
            assert_eq!(xml.decoder().encoding(), UTF_8);
            let _ = xml.read_event().unwrap();
            // ...and still UTF-8 after it
            assert_eq!(xml.decoder().encoding(), UTF_8);

            assert_eq!(xml.read_event().unwrap(), Event::Eof);
        }
    }
}