quick_xml/reader/
buffered_reader.rs

1//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
2//! underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use crate::errors::{Error, Result};
9use crate::events::Event;
10use crate::name::QName;
11use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource};
12
// Generates the buffered `XmlSource`-style methods on top of `fill_buf` / `consume`.
//
// Invoked without arguments it produces synchronous methods that call
// `fill_buf` / `consume` directly on `self`. Invoked with
// `(lifetime, field, async, await)` it produces `async` methods that go through
// `self.<field>` and `.await` every `fill_buf` (and `peek_one`) call.
//
// Every method retries `fill_buf` when it fails with `io::ErrorKind::Interrupted`
// and wraps any other I/O failure into `Error::Io`.
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        /// Consumes a UTF-8 BOM if the currently buffered data starts with one.
        #[cfg(not(feature = "encoding"))]
        #[inline]
        $($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        if chunk.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        return Ok(());
                    }
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(Error::Io(err.into())),
                }
            }
        }

        /// Runs encoding detection on the currently buffered data. When an
        /// encoding is reported, the BOM length reported with it is consumed
        /// and the encoding is returned; otherwise `None` is returned and
        /// nothing is consumed.
        #[cfg(feature = "encoding")]
        #[inline]
        $($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        return match crate::encoding::detect_encoding(chunk) {
                            Some((encoding, bom_len)) => {
                                self $(.$reader)? .consume(bom_len);
                                Ok(Some(encoding))
                            }
                            None => Ok(None),
                        };
                    }
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(Error::Io(err.into())),
                }
            }
        }

        /// Appends input to `buf` up to (but excluding) the first occurrence
        /// of `byte`, which is consumed as well.
        ///
        /// Returns the part of `buf` written by this call together with a flag
        /// telling whether `byte` was found (`false` means the input ended
        /// first). `position` is advanced by the number of consumed bytes,
        /// including on the I/O error path.
        #[inline]
        $($async)? fn read_bytes_until $(<$lf>)? (
            &mut self,
            byte: u8,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<(&'b [u8], bool)> {
            // search byte must be within the ascii range
            debug_assert!(byte.is_ascii());

            let start = buf.len();
            let mut consumed_total = 0;
            let found = loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // End of input: the delimiter was never seen
                    Ok(data) if data.is_empty() => break false,
                    Ok(data) => data,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed_total;
                        return Err(Error::Io(err.into()));
                    }
                };

                let hit = memchr::memchr(byte, chunk);
                // The delimiter itself is consumed, but never stored in `buf`
                let (payload_len, used) = match hit {
                    Some(i) => (i, i + 1),
                    None => (chunk.len(), chunk.len()),
                };
                buf.extend_from_slice(&chunk[..payload_len]);

                self $(.$reader)? .consume(used);
                consumed_total += used;

                if hit.is_some() {
                    break true;
                }
            };
            *position += consumed_total;

            Ok((&buf[start..], found))
        }

        /// Feeds buffered chunks to `parser` until it reports an end index,
        /// appending everything before that index to `buf`.
        ///
        /// Returns the part of `buf` written by this call. If the input ends
        /// before the parser reports an index, `Error::Syntax` built from
        /// `P::eof_error()` is returned. `position` is advanced by the number
        /// of consumed bytes on every exit path.
        #[inline]
        $($async)? fn read_with<$($lf,)? P: Parser>(
            &mut self,
            mut parser: P,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<&'b [u8]> {
            let start = buf.len();
            let mut consumed_total = 0;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(data) if data.is_empty() => break,
                    Ok(data) => data,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed_total;
                        return Err(Error::Io(err.into()));
                    }
                };

                match parser.feed(chunk) {
                    Some(i) => {
                        buf.extend_from_slice(&chunk[..i]);

                        // One extra byte for the closing `>`, which is
                        // consumed but not stored
                        let used = i + 1;
                        self $(.$reader)? .consume(used);

                        *position += consumed_total + used;
                        return Ok(&buf[start..]);
                    }
                    None => {
                        // No `>` in this chunk: keep all of it and read more
                        buf.extend_from_slice(chunk);

                        let used = chunk.len();
                        self $(.$reader)? .consume(used);
                        consumed_total += used;
                    }
                }
            }

            *position += consumed_total;
            Err(Error::Syntax(P::eof_error()))
        }

        /// Reads a construct that starts with `!` into `buf`.
        ///
        /// The caller must already have peeked the `!`; it is pushed to `buf`
        /// and consumed here. The following byte selects the `BangType`, whose
        /// `parse` method then decides where the construct ends.
        ///
        /// Returns the detected type and the part of `buf` written by this
        /// call. If the input ends first, `bang_type.to_err()` is returned.
        /// `position` is advanced by the number of consumed bytes on every
        /// exit path taken after the type was detected.
        #[inline]
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<(BangType, &'b [u8])> {
            // The caller peeked one bang ('!') before calling us, so the
            // input is guaranteed to start with it.
            let start = buf.len();
            buf.push(b'!');
            self $(.$reader)? .consume(1);
            // Accounts for the `!` consumed above
            let mut consumed_total = 1;

            let bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // End of input: leave the loop; the bytes consumed so far
                    // are still added to `position` before the error is returned
                    Ok(data) if data.is_empty() => break,
                    Ok(data) => data,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed_total;
                        return Err(Error::Io(err.into()));
                    }
                };

                // Only `buf[start..]` is handed to the parser: whatever was in
                // the buffer before the bang element must not be considered
                match bang_type.parse(&buf[start..], chunk) {
                    Some((payload, used)) => {
                        buf.extend_from_slice(payload);

                        self $(.$reader)? .consume(used);

                        *position += consumed_total + used;
                        return Ok((bang_type, &buf[start..]));
                    }
                    None => {
                        buf.extend_from_slice(chunk);

                        let used = chunk.len();
                        self $(.$reader)? .consume(used);
                        consumed_total += used;
                    }
                }
            }

            *position += consumed_total;
            Err(bang_type.to_err())
        }

        /// Consumes leading whitespace bytes (as defined by `is_whitespace`),
        /// advancing `position` by the number of skipped bytes. Stops at the
        /// first non-whitespace byte or when no more data is available.
        #[inline]
        $($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => {
                        let skipped = chunk.iter().take_while(|&&b| is_whitespace(b)).count();
                        if skipped == 0 {
                            return Ok(());
                        }
                        // The whole chunk may have been whitespace, so look at
                        // the next one as well
                        self $(.$reader)? .consume(skipped);
                        *position += skipped;
                    }
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(Error::Io(err.into())),
                }
            }
        }

        /// Consumes the next byte if it equals `byte` and reports whether it
        /// did so.
        #[inline]
        $($async)? fn skip_one(&mut self, byte: u8) -> Result<bool> {
            // search byte must be within the ascii range
            debug_assert!(byte.is_ascii());

            let matched = self.peek_one() $(.$await)? ? == Some(byte);
            if matched {
                self $(.$reader)? .consume(1);
            }
            Ok(matched)
        }

        /// Returns the next byte without consuming it, or `None` when the
        /// buffer returned by `fill_buf` is empty.
        #[inline]
        $($async)? fn peek_one(&mut self) -> Result<Option<u8>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(chunk) => return Ok(chunk.first().copied()),
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(Error::Io(err.into())),
                }
            }
        }
    };
}
236
237// Make it public for use in async implementations.
238// New rustc reports
239// > warning: the item `impl_buffered_source` is imported redundantly
240// so make it public only when async feature is enabled
241#[cfg(feature = "async-tokio")]
242pub(super) use impl_buffered_source;
243
244/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
245/// `Vec<u8>` as buffer that will be borrowed by events.
246impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
247    impl_buffered_source!();
248}
249
250////////////////////////////////////////////////////////////////////////////////////////////////////
251
252/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
253impl<R: BufRead> Reader<R> {
254    /// Reads the next `Event`.
255    ///
256    /// This is the main entry point for reading XML `Event`s.
257    ///
258    /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
259    /// internally).
260    ///
261    /// Having the possibility to control the internal buffers gives you some additional benefits
262    /// such as:
263    ///
264    /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
265    ///   you can call `buf.clear()` once you are done with processing the event (typically at the
266    ///   end of your loop).
267    /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
268    ///
269    /// # Examples
270    ///
271    /// ```
272    /// # use pretty_assertions::assert_eq;
273    /// use quick_xml::events::Event;
274    /// use quick_xml::reader::Reader;
275    ///
276    /// let xml = r#"<tag1 att1 = "test">
277    ///                 <tag2><!--Test comment-->Test</tag2>
278    ///                 <tag2>Test 2</tag2>
279    ///              </tag1>"#;
280    /// let mut reader = Reader::from_str(xml);
281    /// reader.config_mut().trim_text(true);
282    /// let mut count = 0;
283    /// let mut buf = Vec::new();
284    /// let mut txt = Vec::new();
285    /// loop {
286    ///     match reader.read_event_into(&mut buf) {
287    ///         Ok(Event::Start(_)) => count += 1,
288    ///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
289    ///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
290    ///         Ok(Event::Eof) => break,
291    ///         _ => (),
292    ///     }
293    ///     buf.clear();
294    /// }
295    /// assert_eq!(count, 3);
296    /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
297    /// ```
298    #[inline]
299    pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
300        self.read_event_impl(buf)
301    }
302
303    /// Reads until end element is found using provided buffer as intermediate
304    /// storage for events content. This function is supposed to be called after
305    /// you already read a [`Start`] event.
306    ///
307    /// Returns a span that cover content between `>` of an opening tag and `<` of
308    /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
309    /// this method was called after reading expanded [`Start`] event.
310    ///
311    /// Manages nested cases where parent and child elements have the _literally_
312    /// same name.
313    ///
314    /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
315    /// will be returned. In particularly, that error will be returned if you call
316    /// this method without consuming the corresponding [`Start`] event first.
317    ///
318    /// If your reader created from a string slice or byte array slice, it is
319    /// better to use [`read_to_end()`] method, because it will not copy bytes
320    /// into intermediate buffer.
321    ///
322    /// The provided `buf` buffer will be filled only by one event content at time.
323    /// Before reading of each event the buffer will be cleared. If you know an
324    /// appropriate size of each event, you can preallocate the buffer to reduce
325    /// number of reallocations.
326    ///
327    /// The `end` parameter should contain name of the end element _in the reader
328    /// encoding_. It is good practice to always get that parameter using
329    /// [`BytesStart::to_end()`] method.
330    ///
331    /// The correctness of the skipped events does not checked, if you disabled
332    /// the [`check_end_names`] option.
333    ///
334    /// # Namespaces
335    ///
336    /// While the `Reader` does not support namespace resolution, namespaces
337    /// does not change the algorithm for comparing names. Although the names
338    /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
339    /// same namespace, are semantically equivalent, `</b:name>` cannot close
340    /// `<a:name>`, because according to [the specification]
341    ///
342    /// > The end of every element that begins with a **start-tag** MUST be marked
343    /// > by an **end-tag** containing a name that echoes the element's type as
344    /// > given in the **start-tag**
345    ///
346    /// # Examples
347    ///
348    /// This example shows, how you can skip XML content after you read the
349    /// start event.
350    ///
351    /// ```
352    /// # use pretty_assertions::assert_eq;
353    /// use quick_xml::events::{BytesStart, Event};
354    /// use quick_xml::reader::Reader;
355    ///
356    /// let mut reader = Reader::from_str(r#"
357    ///     <outer>
358    ///         <inner>
359    ///             <inner></inner>
360    ///             <inner/>
361    ///             <outer></outer>
362    ///             <outer/>
363    ///         </inner>
364    ///     </outer>
365    /// "#);
366    /// reader.config_mut().trim_text(true);
367    /// let mut buf = Vec::new();
368    ///
369    /// let start = BytesStart::new("outer");
370    /// let end   = start.to_end().into_owned();
371    ///
372    /// // First, we read a start event...
373    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
374    ///
375    /// // ...then, we could skip all events to the corresponding end event.
376    /// // This call will correctly handle nested <outer> elements.
377    /// // Note, however, that this method does not handle namespaces.
378    /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
379    ///
380    /// // At the end we should get an Eof event, because we ate the whole XML
381    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
382    /// ```
383    ///
384    /// [`Start`]: Event::Start
385    /// [`End`]: Event::End
386    /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
387    /// [`read_to_end()`]: Self::read_to_end
388    /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
389    /// [`check_end_names`]: crate::reader::Config::check_end_names
390    /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
391    pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
392        Ok(read_to_end!(self, end, buf, read_event_impl, {
393            buf.clear();
394        }))
395    }
396}
397
398impl Reader<BufReader<File>> {
399    /// Creates an XML reader from a file path.
400    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
401        let file = File::open(path)?;
402        let reader = BufReader::new(file);
403        Ok(Self::from_reader(reader))
404    }
405}
406
#[cfg(test)]
mod test {
    use crate::reader::test::{check, small_buffers};
    use crate::reader::XmlSource;

    /// Default buffer constructor: hands the byte array from the test back
    /// unchanged.
    fn identity<T>(value: T) -> T {
        value
    }

    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        &mut Vec::new()
    );

    small_buffers!(
        #[test]
        read_event_into: std::io::BufReader<_>
    );

    #[cfg(feature = "encoding")]
    mod encoding {
        use crate::events::Event;
        use crate::reader::Reader;
        use encoding_rs::{UTF_16LE, UTF_8, WINDOWS_1251};
        use pretty_assertions::assert_eq;

        /// The input starts with a UTF-16LE BOM, while the XML declaration names
        /// windows-1251. The decoder starts as UTF-8 and must report
        /// windows-1251 once the first event has been read.
        #[test]
        fn bom_detected() {
            let input = b"\xFF\xFE<?xml encoding='windows-1251'?>".as_ref();
            let mut xml = Reader::from_reader(input);
            let mut scratch = Vec::new();

            assert_eq!(xml.decoder().encoding(), UTF_8);
            xml.read_event_into(&mut scratch).unwrap();
            assert_eq!(xml.decoder().encoding(), WINDOWS_1251);

            assert_eq!(xml.read_event_into(&mut scratch).unwrap(), Event::Eof);
        }

        /// The XML declaration changes the encoding, but only the first
        /// declaration does: the second one must leave it untouched.
        #[test]
        fn xml_declaration() {
            let input = b"<?xml encoding='UTF-16'?><?xml encoding='windows-1251'?>".as_ref();
            let mut xml = Reader::from_reader(input);
            let mut scratch = Vec::new();

            assert_eq!(xml.decoder().encoding(), UTF_8);
            xml.read_event_into(&mut scratch).unwrap();
            assert_eq!(xml.decoder().encoding(), UTF_16LE);

            xml.read_event_into(&mut scratch).unwrap();
            assert_eq!(xml.decoder().encoding(), UTF_16LE);

            assert_eq!(xml.read_event_into(&mut scratch).unwrap(), Event::Eof);
        }
    }
}