quick_xml/reader/slice_reader.rs
1//! This is an implementation of [`Reader`] for reading from a `&[u8]` as
2//! underlying byte stream. This implementation supports not using an
3//! intermediate buffer as the byte slice itself can be used to borrow from.
4
5use std::borrow::Cow;
6
7#[cfg(feature = "encoding")]
8use crate::reader::EncodingRef;
9#[cfg(feature = "encoding")]
10use encoding_rs::{Encoding, UTF_8};
11
12use crate::errors::{Error, Result};
13use crate::events::Event;
14use crate::name::QName;
15use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource};
16
17/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
18/// This implementation supports not using an intermediate buffer as the byte slice
19/// itself can be used to borrow from.
20impl<'a> Reader<&'a [u8]> {
21 /// Creates an XML reader from a string slice.
22 #[allow(clippy::should_implement_trait)]
23 pub fn from_str(s: &'a str) -> Self {
24 // Rust strings are guaranteed to be UTF-8, so lock the encoding
25 #[cfg(feature = "encoding")]
26 {
27 let mut reader = Self::from_reader(s.as_bytes());
28 reader.state.encoding = EncodingRef::Explicit(UTF_8);
29 reader
30 }
31
32 #[cfg(not(feature = "encoding"))]
33 Self::from_reader(s.as_bytes())
34 }
35
36 /// Read an event that borrows from the input rather than a buffer.
37 ///
38 /// There is no asynchronous `read_event_async()` version of this function,
39 /// because it is not necessary -- the contents are already in memory and no IO
40 /// is needed, therefore there is no potential for blocking.
41 ///
42 /// # Examples
43 ///
44 /// ```
45 /// # use pretty_assertions::assert_eq;
46 /// use quick_xml::events::Event;
47 /// use quick_xml::reader::Reader;
48 ///
49 /// let mut reader = Reader::from_str(r#"
50 /// <tag1 att1 = "test">
51 /// <tag2><!--Test comment-->Test</tag2>
52 /// <tag2>Test 2</tag2>
53 /// </tag1>
54 /// "#);
55 /// reader.config_mut().trim_text(true);
56 ///
57 /// let mut count = 0;
58 /// let mut txt = Vec::new();
59 /// loop {
60 /// match reader.read_event().unwrap() {
61 /// Event::Start(e) => count += 1,
62 /// Event::Text(e) => txt.push(e.unescape().unwrap().into_owned()),
63 /// Event::Eof => break,
64 /// _ => (),
65 /// }
66 /// }
67 /// assert_eq!(count, 3);
68 /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
69 /// ```
70 #[inline]
71 pub fn read_event(&mut self) -> Result<Event<'a>> {
72 self.read_event_impl(())
73 }
74
75 /// Reads until end element is found. This function is supposed to be called
76 /// after you already read a [`Start`] event.
77 ///
78 /// Returns a span that cover content between `>` of an opening tag and `<` of
79 /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
80 /// this method was called after reading expanded [`Start`] event.
81 ///
82 /// Manages nested cases where parent and child elements have the _literally_
83 /// same name.
84 ///
85 /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
86 /// will be returned. In particularly, that error will be returned if you call
87 /// this method without consuming the corresponding [`Start`] event first.
88 ///
89 /// The `end` parameter should contain name of the end element _in the reader
90 /// encoding_. It is good practice to always get that parameter using
91 /// [`BytesStart::to_end()`] method.
92 ///
93 /// The correctness of the skipped events does not checked, if you disabled
94 /// the [`check_end_names`] option.
95 ///
96 /// There is no asynchronous `read_to_end_async()` version of this function,
97 /// because it is not necessary -- the contents are already in memory and no IO
98 /// is needed, therefore there is no potential for blocking.
99 ///
100 /// # Namespaces
101 ///
102 /// While the `Reader` does not support namespace resolution, namespaces
103 /// does not change the algorithm for comparing names. Although the names
104 /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
105 /// same namespace, are semantically equivalent, `</b:name>` cannot close
106 /// `<a:name>`, because according to [the specification]
107 ///
108 /// > The end of every element that begins with a **start-tag** MUST be marked
109 /// > by an **end-tag** containing a name that echoes the element's type as
110 /// > given in the **start-tag**
111 ///
112 /// # Examples
113 ///
114 /// This example shows, how you can skip XML content after you read the
115 /// start event.
116 ///
117 /// ```
118 /// # use pretty_assertions::assert_eq;
119 /// use quick_xml::events::{BytesStart, Event};
120 /// use quick_xml::reader::Reader;
121 ///
122 /// let mut reader = Reader::from_str(r#"
123 /// <outer>
124 /// <inner>
125 /// <inner></inner>
126 /// <inner/>
127 /// <outer></outer>
128 /// <outer/>
129 /// </inner>
130 /// </outer>
131 /// "#);
132 /// reader.config_mut().trim_text(true);
133 ///
134 /// let start = BytesStart::new("outer");
135 /// let end = start.to_end().into_owned();
136 ///
137 /// // First, we read a start event...
138 /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
139 ///
140 /// // ...then, we could skip all events to the corresponding end event.
141 /// // This call will correctly handle nested <outer> elements.
142 /// // Note, however, that this method does not handle namespaces.
143 /// reader.read_to_end(end.name()).unwrap();
144 ///
145 /// // At the end we should get an Eof event, because we ate the whole XML
146 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
147 /// ```
148 ///
149 /// [`Start`]: Event::Start
150 /// [`End`]: Event::End
151 /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
152 /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
153 /// [`check_end_names`]: crate::reader::Config::check_end_names
154 /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
155 pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
156 Ok(read_to_end!(self, end, (), read_event_impl, {}))
157 }
158
159 /// Reads content between start and end tags, including any markup. This
160 /// function is supposed to be called after you already read a [`Start`] event.
161 ///
162 /// Manages nested cases where parent and child elements have the _literally_
163 /// same name.
164 ///
165 /// This method does not unescape read data, instead it returns content
166 /// "as is" of the XML document. This is because it has no idea what text
167 /// it reads, and if, for example, it contains CDATA section, attempt to
168 /// unescape it content will spoil data.
169 ///
170 /// Any text will be decoded using the XML current [`decoder()`].
171 ///
172 /// Actually, this method perform the following code:
173 ///
174 /// ```ignore
175 /// let span = reader.read_to_end(end)?;
176 /// let text = reader.decoder().decode(&reader.inner_slice[span]);
177 /// ```
178 ///
179 /// # Examples
180 ///
181 /// This example shows, how you can read a HTML content from your XML document.
182 ///
183 /// ```
184 /// # use pretty_assertions::assert_eq;
185 /// # use std::borrow::Cow;
186 /// use quick_xml::events::{BytesStart, Event};
187 /// use quick_xml::reader::Reader;
188 ///
189 /// let mut reader = Reader::from_str("
190 /// <html>
191 /// <title>This is a HTML text</title>
192 /// <p>Usual XML rules does not apply inside it
193 /// <p>For example, elements not needed to be "closed"
194 /// </html>
195 /// ");
196 /// reader.config_mut().trim_text(true);
197 ///
198 /// let start = BytesStart::new("html");
199 /// let end = start.to_end().into_owned();
200 ///
201 /// // First, we read a start event...
202 /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
203 /// // ...and disable checking of end names because we expect HTML further...
204 /// reader.config_mut().check_end_names = false;
205 ///
206 /// // ...then, we could read text content until close tag.
207 /// // This call will correctly handle nested <html> elements.
208 /// let text = reader.read_text(end.name()).unwrap();
209 /// assert_eq!(text, Cow::Borrowed(r#"
210 /// <title>This is a HTML text</title>
211 /// <p>Usual XML rules does not apply inside it
212 /// <p>For example, elements not needed to be "closed"
213 /// "#));
214 /// assert!(matches!(text, Cow::Borrowed(_)));
215 ///
216 /// // Now we can enable checks again
217 /// reader.config_mut().check_end_names = true;
218 ///
219 /// // At the end we should get an Eof event, because we ate the whole XML
220 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
221 /// ```
222 ///
223 /// [`Start`]: Event::Start
224 /// [`decoder()`]: Self::decoder()
225 pub fn read_text(&mut self, end: QName) -> Result<Cow<'a, str>> {
226 // self.reader will be changed, so store original reference
227 let buffer = self.reader;
228 let span = self.read_to_end(end)?;
229
230 self.decoder().decode(&buffer[0..span.len()])
231 }
232}
233
234////////////////////////////////////////////////////////////////////////////////////////////////////
235
236/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
237/// that will be borrowed by events. This implementation provides a zero-copy deserialization
238impl<'a> XmlSource<'a, ()> for &'a [u8] {
239 #[cfg(not(feature = "encoding"))]
240 #[inline]
241 fn remove_utf8_bom(&mut self) -> Result<()> {
242 if self.starts_with(crate::encoding::UTF8_BOM) {
243 *self = &self[crate::encoding::UTF8_BOM.len()..];
244 }
245 Ok(())
246 }
247
248 #[cfg(feature = "encoding")]
249 #[inline]
250 fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>> {
251 if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
252 *self = &self[bom_len..];
253 return Ok(Some(enc));
254 }
255 Ok(None)
256 }
257
258 #[inline]
259 fn read_bytes_until(
260 &mut self,
261 byte: u8,
262 _buf: (),
263 position: &mut usize,
264 ) -> Result<(&'a [u8], bool)> {
265 // search byte must be within the ascii range
266 debug_assert!(byte.is_ascii());
267
268 if let Some(i) = memchr::memchr(byte, self) {
269 *position += i + 1;
270 let bytes = &self[..i];
271 *self = &self[i + 1..];
272 Ok((bytes, true))
273 } else {
274 *position += self.len();
275 let bytes = &self[..];
276 *self = &[];
277 Ok((bytes, false))
278 }
279 }
280
281 #[inline]
282 fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut usize) -> Result<&'a [u8]>
283 where
284 P: Parser,
285 {
286 if let Some(i) = parser.feed(self) {
287 // +1 for `>` which we do not include
288 *position += i + 1;
289 let bytes = &self[..i];
290 *self = &self[i + 1..];
291 return Ok(bytes);
292 }
293
294 *position += self.len();
295 Err(Error::Syntax(P::eof_error()))
296 }
297
298 #[inline]
299 fn read_bang_element(
300 &mut self,
301 _buf: (),
302 position: &mut usize,
303 ) -> Result<(BangType, &'a [u8])> {
304 // Peeked one bang ('!') before being called, so it's guaranteed to
305 // start with it.
306 debug_assert_eq!(self[0], b'!');
307
308 let bang_type = BangType::new(self[1..].first().copied())?;
309
310 if let Some((bytes, i)) = bang_type.parse(&[], self) {
311 *position += i;
312 *self = &self[i..];
313 return Ok((bang_type, bytes));
314 }
315
316 *position += self.len();
317 Err(bang_type.to_err())
318 }
319
320 #[inline]
321 fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
322 let whitespaces = self
323 .iter()
324 .position(|b| !is_whitespace(*b))
325 .unwrap_or(self.len());
326 *position += whitespaces;
327 *self = &self[whitespaces..];
328 Ok(())
329 }
330
331 #[inline]
332 fn skip_one(&mut self, byte: u8) -> Result<bool> {
333 // search byte must be within the ascii range
334 debug_assert!(byte.is_ascii());
335 if self.first() == Some(&byte) {
336 *self = &self[1..];
337 Ok(true)
338 } else {
339 Ok(false)
340 }
341 }
342
343 #[inline]
344 fn peek_one(&mut self) -> Result<Option<u8>> {
345 Ok(self.first().copied())
346 }
347}
348
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Default buffer constructor: simply passes through the byte array
    /// received from the test
    fn identity<T>(value: T) -> T {
        value
    }

    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        ()
    );

    #[cfg(feature = "encoding")]
    mod encoding {
        use crate::events::Event;
        use crate::reader::Reader;
        use encoding_rs::UTF_8;
        use pretty_assertions::assert_eq;

        /// A `Reader` created by `from_str` must keep UTF-8 as its encoding,
        /// even if the XML declaration names another one
        #[test]
        fn str_always_has_utf8() {
            let xml = "<?xml encoding='UTF-16'?>";
            let mut reader = Reader::from_str(xml);

            // UTF-8 before anything was read...
            assert_eq!(reader.decoder().encoding(), UTF_8);

            // ...and still UTF-8 after the declaration has been consumed
            let _declaration = reader.read_event().unwrap();
            assert_eq!(reader.decoder().encoding(), UTF_8);

            assert_eq!(reader.read_event().unwrap(), Event::Eof);
        }
    }
}