quick_xml/reader/buffered_reader.rs
1//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
2//! underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use crate::errors::{Error, Result};
9use crate::events::Event;
10use crate::name::QName;
11use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource};
12
/// Generates the bodies of the buffered-source methods (`remove_utf8_bom` /
/// `detect_encoding`, `read_bytes_until`, `read_with`, `read_bang_element`,
/// `skip_whitespace`, `skip_one`, `peek_one`) on top of `fill_buf()` / `consume()`.
///
/// The macro takes one optional argument group:
///
/// - invoked as `impl_buffered_source!()` it produces plain `fn`s that call
///   `fill_buf()` / `consume()` directly on `self`;
/// - invoked with `$lf, $reader, $async, $await` it produces `$async fn`s that
///   call `self.$reader.fill_buf().$await` instead, and declares `$lf` as a
///   generic lifetime on the methods that borrow the output buffer.
///
/// The method signatures refer to the lifetime `'b` literally, so `'b` has to be
/// in scope at the expansion site (presumably `$lf` is always `'b` -- confirm at
/// the async call site, which is not part of this file).
///
/// Every method retries `fill_buf()` when it fails with
/// `io::ErrorKind::Interrupted`; any other I/O error is returned as `Error::Io`.
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        // Consumes the bytes of `UTF8_BOM` from the reader if the currently
        // buffered data starts with them; otherwise leaves the reader untouched.
        // Only the slice returned by a single successful `fill_buf()` is inspected.
        #[cfg(not(feature = "encoding"))]
        #[inline]
        $($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        if n.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        Ok(())
                    },
                    // An interrupted read is not an error: try again
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }

        // Passes the currently buffered data to `crate::encoding::detect_encoding`.
        // When it reports an encoding, `bom_len` bytes are consumed from the reader
        // and the encoding is returned; otherwise nothing is consumed and `None`
        // is returned.
        #[cfg(feature = "encoding")]
        #[inline]
        $($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
                        self $(.$reader)? .consume(bom_len);
                        Ok(Some(enc))
                    } else {
                        Ok(None)
                    },
                    // An interrupted read is not an error: try again
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }

        // Copies bytes from the reader to the end of `buf` until `byte` is found
        // or the reader is exhausted.
        //
        // The delimiter itself is consumed from the reader but is not copied into
        // `buf`. `position` is increased by the number of bytes consumed (the
        // delimiter included), also when an I/O error is returned.
        //
        // Returns the part of `buf` appended by this call and a flag telling
        // whether the delimiter was found (`false` means the input ended first).
        #[inline]
        $($async)? fn read_bytes_until $(<$lf>)? (
            &mut self,
            byte: u8,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<(&'b [u8], bool)> {
            // search byte must be within the ascii range
            debug_assert!(byte.is_ascii());

            // Number of bytes consumed from the reader by this call
            let mut read = 0;
            let mut done = false;
            // Data that is already in `buf` does not belong to the result
            let start = buf.len();
            while !done {
                let used = {
                    let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                        // Empty buffer means end of input
                        Ok(n) if n.is_empty() => break,
                        Ok(n) => n,
                        Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                        Err(e) => {
                            *position += read;
                            return Err(Error::Io(e.into()));
                        }
                    };

                    match memchr::memchr(byte, available) {
                        Some(i) => {
                            buf.extend_from_slice(&available[..i]);
                            done = true;
                            // +1 to consume the delimiter as well
                            i + 1
                        }
                        None => {
                            // Delimiter is not in this chunk: take all of it and refill
                            buf.extend_from_slice(available);
                            available.len()
                        }
                    }
                };
                self $(.$reader)? .consume(used);
                read += used;
            }
            *position += read;

            Ok((&buf[start..], done))
        }

        // Copies bytes from the reader to the end of `buf` until `parser.feed()`
        // reports the index of the terminating byte in the chunk it was given.
        //
        // The bytes before that index are copied into `buf`; the terminating byte
        // is consumed from the reader but not copied. `position` is increased by
        // the number of bytes consumed, on success as well as on failure.
        //
        // Returns the part of `buf` appended by this call, or
        // `Error::Syntax(P::eof_error())` if the input ends before the parser
        // reports a terminator.
        #[inline]
        $($async)? fn read_with<$($lf,)? P: Parser>(
            &mut self,
            mut parser: P,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<&'b [u8]> {
            // Number of bytes consumed from the reader by this call
            let mut read = 0;
            // Data that is already in `buf` does not belong to the result
            let start = buf.len();
            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // Empty buffer means end of input
                    Ok(n) if n.is_empty() => break,
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                };

                if let Some(i) = parser.feed(available) {
                    buf.extend_from_slice(&available[..i]);

                    // +1 for `>` which we do not include
                    self $(.$reader)? .consume(i + 1);
                    read += i + 1;

                    *position += read;
                    return Ok(&buf[start..]);
                }

                // The `>` symbol not yet found, continue reading
                buf.extend_from_slice(available);

                let used = available.len();
                self $(.$reader)? .consume(used);
                read += used;
            }

            // Input ended before the parser found the end of the construct
            *position += read;
            Err(Error::Syntax(P::eof_error()))
        }

        // Reads a construct that starts with `!` into `buf`. The caller has already
        // peeked the `!`, which is still unconsumed in the reader.
        //
        // The kind of the construct is decided by `BangType::new()` from the byte
        // that follows the `!`. Chunks are then handed to `BangType::parse()`
        // together with what was accumulated so far, until it reports which part
        // of the chunk belongs to the construct and how many bytes to consume.
        //
        // `position` is increased by the number of bytes consumed (the `!`
        // included) on success, on an I/O error inside the loop and on EOF.
        //
        // Returns the detected `BangType` and the part of `buf` appended by this
        // call (it starts with `!`), or `bang_type.to_err()` if the input ends
        // before the construct is closed.
        #[inline]
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut usize,
        ) -> Result<(BangType, &'b [u8])> {
            // Peeked one bang ('!') before being called, so it's guaranteed to
            // start with it.
            let start = buf.len();
            // The `!` consumed below is already counted
            let mut read = 1;
            buf.push(b'!');
            self $(.$reader)? .consume(1);

            let bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    // End of input before the construct was closed: leave the
                    // loop, the error is produced below (after `position` has
                    // been advanced by the bytes consumed so far)
                    Ok(n) if n.is_empty() => break,
                    Ok(available) => {
                        // We only parse from start because we don't want to consider
                        // whatever is in the buffer before the bang element
                        if let Some((consumed, used)) = bang_type.parse(&buf[start..], available) {
                            buf.extend_from_slice(consumed);

                            self $(.$reader)? .consume(used);
                            read += used;

                            *position += read;
                            return Ok((bang_type, &buf[start..]));
                        } else {
                            // End is not in this chunk: take all of it and refill
                            buf.extend_from_slice(available);

                            let used = available.len();
                            self $(.$reader)? .consume(used);
                            read += used;
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                }
            }

            *position += read;
            Err(bang_type.to_err())
        }

        // Consumes bytes from the reader for as long as `is_whitespace()` accepts
        // them, increasing `position` by the number of bytes skipped.
        //
        // Stops at the first byte that is not whitespace (which stays in the
        // reader) or at the end of input.
        #[inline]
        $($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        // Length of the whitespace prefix of the buffered data;
                        // the whole chunk if it contains nothing but whitespace
                        let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len());
                        if count > 0 {
                            self $(.$reader)? .consume(count);
                            *position += count;
                            // The whitespace may continue in the next chunk
                            continue;
                        } else {
                            // Either a non-whitespace byte is first, or the buffer
                            // is empty (end of input)
                            Ok(())
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }

        // Consumes the next byte from the reader if it is equal to `byte`.
        //
        // Returns `true` if the byte was consumed and `false` if the next byte is
        // different or the input has ended. This method has no `position`
        // argument, so accounting for the skipped byte is left to the caller.
        #[inline]
        $($async)? fn skip_one(&mut self, byte: u8) -> Result<bool> {
            // search byte must be within the ascii range
            debug_assert!(byte.is_ascii());

            match self.peek_one() $(.$await)? ? {
                Some(b) if b == byte => {
                    self $(.$reader)? .consume(1);
                    Ok(true)
                }
                _ => Ok(false),
            }
        }

        // Returns the next byte of the input without consuming it, or `None` if
        // `fill_buf()` returns an empty buffer (end of input).
        #[inline]
        $($async)? fn peek_one(&mut self) -> Result<Option<u8>> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => Ok(n.first().cloned()),
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(Error::Io(e.into())),
                };
            }
        }
    };
}
236
237// Make it public for use in async implementations.
238// New rustc reports
239// > warning: the item `impl_buffered_source` is imported redundantly
240// so make it public only when async feature is enabled
241#[cfg(feature = "async-tokio")]
242pub(super) use impl_buffered_source;
243
/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
/// `Vec<u8>` as buffer that will be borrowed by events.
///
/// All methods are generated by `impl_buffered_source!` invoked without
/// arguments, i.e. as ordinary (non-`async`) functions that call `fill_buf()`
/// and `consume()` directly on the reader. The lifetime `'b` declared here is
/// the one the generated signatures use for the borrowed buffer.
impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
    impl_buffered_source!();
}
249
250////////////////////////////////////////////////////////////////////////////////////////////////////
251
/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
impl<R: BufRead> Reader<R> {
    /// Reads the next `Event`.
    ///
    /// This is the main entry point for reading XML `Event`s.
    ///
    /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
    /// internally).
    ///
    /// Having the possibility to control the internal buffers gives you some additional benefits
    /// such as:
    ///
    /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
    ///   you can call `buf.clear()` once you are done with processing the event (typically at the
    ///   end of your loop).
    /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
    ///
    /// # Examples
    ///
    /// ```
    /// # use pretty_assertions::assert_eq;
    /// use quick_xml::events::Event;
    /// use quick_xml::reader::Reader;
    ///
    /// let xml = r#"<tag1 att1 = "test">
    ///                 <tag2><!--Test comment-->Test</tag2>
    ///                 <tag2>Test 2</tag2>
    ///              </tag1>"#;
    /// let mut reader = Reader::from_str(xml);
    /// reader.config_mut().trim_text(true);
    /// let mut count = 0;
    /// let mut buf = Vec::new();
    /// let mut txt = Vec::new();
    /// loop {
    ///     match reader.read_event_into(&mut buf) {
    ///         Ok(Event::Start(_)) => count += 1,
    ///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
    ///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
    ///         Ok(Event::Eof) => break,
    ///         _ => (),
    ///     }
    ///     buf.clear();
    /// }
    /// assert_eq!(count, 3);
    /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
    /// ```
    #[inline]
    pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
        // The event is appended to `buf` and borrows from it for `'b`
        self.read_event_impl(buf)
    }

    /// Reads until the end element is found, using the provided buffer as
    /// intermediate storage for the content of events. This function is supposed
    /// to be called after you have already read a [`Start`] event.
    ///
    /// Returns a span that covers the content between the `>` of the opening tag
    /// and the `<` of the closing tag, or an empty span if [`expand_empty_elements`]
    /// is set and this method was called after reading an expanded [`Start`] event.
    ///
    /// Manages nested cases where parent and child elements have the _literally_
    /// same name.
    ///
    /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
    /// will be returned. In particular, that error will be returned if you call
    /// this method without consuming the corresponding [`Start`] event first.
    ///
    /// If your reader was created from a string slice or a byte array slice, it is
    /// better to use the [`read_to_end()`] method, because it will not copy bytes
    /// into an intermediate buffer.
    ///
    /// The provided `buf` buffer will be filled with the content of only one event
    /// at a time. Before reading each event the buffer will be cleared. If you know
    /// an appropriate size of each event, you can preallocate the buffer to reduce
    /// the number of reallocations.
    ///
    /// The `end` parameter should contain the name of the end element _in the reader
    /// encoding_. It is good practice to always get that parameter using the
    /// [`BytesStart::to_end()`] method.
    ///
    /// The correctness of the skipped events is not checked if you disabled
    /// the [`check_end_names`] option.
    ///
    /// # Namespaces
    ///
    /// While the `Reader` does not support namespace resolution, namespaces
    /// do not change the algorithm for comparing names. Although the names
    /// `a:name` and `b:name`, where both prefixes `a` and `b` resolve to the
    /// same namespace, are semantically equivalent, `</b:name>` cannot close
    /// `<a:name>`, because according to [the specification]
    ///
    /// > The end of every element that begins with a **start-tag** MUST be marked
    /// > by an **end-tag** containing a name that echoes the element's type as
    /// > given in the **start-tag**
    ///
    /// # Examples
    ///
    /// This example shows how you can skip XML content after you read the
    /// start event.
    ///
    /// ```
    /// # use pretty_assertions::assert_eq;
    /// use quick_xml::events::{BytesStart, Event};
    /// use quick_xml::reader::Reader;
    ///
    /// let mut reader = Reader::from_str(r#"
    ///     <outer>
    ///         <inner>
    ///             <inner></inner>
    ///             <inner/>
    ///             <outer></outer>
    ///             <outer/>
    ///         </inner>
    ///     </outer>
    /// "#);
    /// reader.config_mut().trim_text(true);
    /// let mut buf = Vec::new();
    ///
    /// let start = BytesStart::new("outer");
    /// let end   = start.to_end().into_owned();
    ///
    /// // First, we read a start event...
    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
    ///
    /// // ...then, we could skip all events to the corresponding end event.
    /// // This call will correctly handle nested <outer> elements.
    /// // Note, however, that this method does not handle namespaces.
    /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
    ///
    /// // At the end we should get an Eof event, because we ate the whole XML
    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
    /// ```
    ///
    /// [`Start`]: Event::Start
    /// [`End`]: Event::End
    /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
    /// [`read_to_end()`]: Self::read_to_end
    /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
    /// [`check_end_names`]: crate::reader::Config::check_end_names
    /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
    pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
        // The trailing block is handed to `read_to_end!` (defined outside of this
        // file) and empties `buf`; presumably it is run once per skipped event,
        // as described in the documentation above -- confirm in the macro
        Ok(read_to_end!(self, end, buf, read_event_impl, {
            buf.clear();
        }))
    }
}
397
398impl Reader<BufReader<File>> {
399 /// Creates an XML reader from a file path.
400 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
401 let file = File::open(path)?;
402 let reader = BufReader::new(file);
403 Ok(Self::from_reader(reader))
404 }
405}
406
// Tests of the `BufRead`-based reader. The generic test suites come from the
// `check!` and `small_buffers!` macros of `crate::reader::test`; their
// definitions are not part of this file.
#[cfg(test)]
mod test {
    use crate::reader::test::{check, small_buffers};
    use crate::reader::XmlSource;

    /// Default buffer constructor: just passes the input from the test
    /// through unchanged
    fn identity<T>(input: T) -> T {
        input
    }

    // Instantiates the `check!` suite for the buffered implementation:
    // `identity` is used to build the source from the test input and a fresh
    // `Vec` serves as the event buffer (presumably once per generated test --
    // see the macro definition).
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        &mut Vec::new()
    );

    // Instantiates the `small_buffers!` suite for `read_event_into` on top of
    // a `std::io::BufReader`.
    small_buffers!(
        #[test]
        read_event_into: std::io::BufReader<_>
    );

    // Encoding detection is available only with the `encoding` feature
    #[cfg(feature = "encoding")]
    mod encoding {
        use crate::events::Event;
        use crate::reader::Reader;
        use encoding_rs::{UTF_16LE, UTF_8, WINDOWS_1251};
        use pretty_assertions::assert_eq;

        /// Checks that encoding is detected by BOM and changed after XML declaration
        /// BOM indicates UTF-16LE, but XML - windows-1251
        #[test]
        fn bom_detected() {
            let mut reader =
                Reader::from_reader(b"\xFF\xFE<?xml encoding='windows-1251'?>".as_ref());
            let mut buf = Vec::new();

            // Nothing has been read yet, so the decoder reports UTF-8
            assert_eq!(reader.decoder().encoding(), UTF_8);
            reader.read_event_into(&mut buf).unwrap();
            // After the first event the encoding named in the declaration is used
            assert_eq!(reader.decoder().encoding(), WINDOWS_1251);

            assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
        }

        /// Checks that encoding is changed by XML declaration, but only once
        #[test]
        fn xml_declaration() {
            let mut reader = Reader::from_reader(
                b"<?xml encoding='UTF-16'?><?xml encoding='windows-1251'?>".as_ref(),
            );
            let mut buf = Vec::new();

            // Nothing has been read yet, so the decoder reports UTF-8
            assert_eq!(reader.decoder().encoding(), UTF_8);
            reader.read_event_into(&mut buf).unwrap();
            // The first declaration switches the decoder to UTF-16LE...
            assert_eq!(reader.decoder().encoding(), UTF_16LE);

            reader.read_event_into(&mut buf).unwrap();
            // ...and the second declaration does not change it again
            assert_eq!(reader.decoder().encoding(), UTF_16LE);

            assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
        }
    }
}