quick_xml/reader/mod.rs
1//! Contains high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::ops::Range;
6
7use crate::encoding::Decoder;
8use crate::errors::{Error, Result, SyntaxError};
9use crate::events::Event;
10use crate::reader::state::ReaderState;
11
/// Parser configuration.
///
/// The configuration currently in effect can be inspected via
/// [`Reader::config()`]. To change it, modify the fields of the object
/// returned by [`Reader::config_mut()`].
///
/// [`Reader::config()`]: crate::reader::Reader::config
/// [`Reader::config_mut()`]: crate::reader::Reader::config_mut
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
#[non_exhaustive]
pub struct Config {
    /// Whether comments should be validated. If enabled, an invalid comment makes
    /// the read methods return [`Error::IllFormed(DoubleHyphenInComment)`].
    ///
    /// When set to `true`, every [`Comment`] event is checked for the absence
    /// of `--`, which [is not allowed] in XML comments. Most of the time
    /// comments are not wanted at all, so their correctness does not matter;
    /// hence the default value is `false` to improve performance.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(DoubleHyphenInComment)`]: crate::errors::IllFormedError::DoubleHyphenInComment
    /// [`Comment`]: crate::events::Event::Comment
    /// [is not allowed]: https://www.w3.org/TR/xml11/#sec-comments
    pub check_comments: bool,

    /// Whether mismatched closing tag names should be detected. If enabled, a
    /// mismatch makes the read methods return
    /// [`Error::IllFormed(MismatchedEndTag)`].
    ///
    /// Note that start and end tags [should match literally][spec]: they cannot
    /// have different prefixes even if both prefixes resolve to the same namespace.
    /// The XML
    ///
    /// ```xml
    /// <outer xmlns="namespace" xmlns:p="namespace">
    /// </p:outer>
    /// ```
    ///
    /// is not valid, even though semantically the start tag is the same as the
    /// end tag. The reason is that namespaces are an extension of the original
    /// XML specification (without namespaces) and it should be backward-compatible.
    ///
    /// When set to `false`, the reader does not check whether a closing tag
    /// matches the corresponding opening tag. For example,
    /// `<mytag></different_tag>` will be permitted.
    ///
    /// If the XML is known to be sane (already processed, etc.) this saves extra time.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is disabled,
    /// i.e. it will contain the data of the mismatched end tag.
    ///
    /// Note that setting this to `true` leads to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`expand_empty_elements`] is also set, only one additional allocation
    /// is performed, which supports both options.
    ///
    /// Default: `true`
    ///
    /// [`Error::IllFormed(MismatchedEndTag)`]: crate::errors::IllFormedError::MismatchedEndTag
    /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
    /// [`End`]: crate::events::Event::End
    /// [`expand_empty_elements`]: Self::expand_empty_elements
    pub check_end_names: bool,

    /// Whether empty elements should be split into an `Open` and a `Close` event.
    ///
    /// When set to `true`, every [`Empty`] event produced by a self-closing tag
    /// like `<tag/>` is expanded into a [`Start`] event followed by an [`End`]
    /// event. When set to `false` (the default), those tags are represented by
    /// an [`Empty`] event instead.
    ///
    /// Note that setting this to `true` leads to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`check_end_names`] is also set, only one additional allocation is
    /// performed, which supports both options.
    ///
    /// Default: `false`
    ///
    /// [`Empty`]: crate::events::Event::Empty
    /// [`Start`]: crate::events::Event::Start
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub expand_empty_elements: bool,

    /// Whether trailing whitespace after the markup name is trimmed in closing
    /// tags such as `</a >`.
    ///
    /// If `true`, the emitted [`End`] event is stripped of trailing whitespace
    /// after the markup name.
    ///
    /// Note that if this is set to `false` and [`check_end_names`] is `true`,
    /// the comparison of markup names is going to fail erroneously when a
    /// closing tag contains trailing whitespace.
    ///
    /// Default: `true`
    ///
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub trim_markup_names_in_closing_tags: bool,

    /// Whether whitespace before character data should be removed.
    ///
    /// When set to `true`, leading whitespace is trimmed in [`Text`] events.
    /// If the event is empty after that, it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To trim data correctly, manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to the necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_start: bool,

    /// Whether whitespace after character data should be removed.
    ///
    /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
    /// If the event is empty after that, it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To trim data correctly, manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to the necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_end: bool,
}

impl Config {
    /// Sets both [`trim_text_start`] and [`trim_text_end`] to the same value.
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To trim data correctly, manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to the necessary events.
    /// </div>
    ///
    /// [`trim_text_start`]: Self::trim_text_start
    /// [`trim_text_end`]: Self::trim_text_end
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    #[inline]
    pub fn trim_text(&mut self, trim: bool) {
        self.trim_text_end = trim;
        self.trim_text_start = trim;
    }

    /// Turns all well-formedness checks on or off. Currently these are the settings:
    /// - [`check_comments`](Self::check_comments)
    /// - [`check_end_names`](Self::check_end_names)
    #[inline]
    pub fn enable_all_checks(&mut self, enable: bool) {
        self.check_end_names = enable;
        self.check_comments = enable;
    }
}

impl Default for Config {
    /// Builds the configuration with the default value documented on each field.
    fn default() -> Self {
        Self {
            // Options that are on by default
            check_end_names: true,
            trim_markup_names_in_closing_tags: true,
            // Options that are off by default
            check_comments: false,
            expand_empty_elements: false,
            trim_text_start: false,
            trim_text_end: false,
        }
    }
}
201
202////////////////////////////////////////////////////////////////////////////////////////////////////
203
/// Body shared by the sync and async `read_event_impl` methods: drives the
/// parser state machine until one event (or an error) is produced.
///
/// The macro loops over `$self.state.state`:
/// - `Init`: detects the encoding (feature `encoding`) or removes the UTF-8 BOM
///   (no `encoding` feature), then calls `$read_until_open`;
/// - `ClosedTag`: calls `$read_until_open`;
/// - `OpenedTag`: returns the result of `$read_until_close`;
/// - `Empty`: returns the result of `close_expanded_empty()`;
/// - `Exit`: returns `Event::Eof`.
///
/// `$read_until_open` may hand the buffer back (`Ok(Err(buf))`) without producing
/// an event; in that case the loop continues with the returned buffer.
///
/// After the loop, any error other than `Error::IllFormed` and the `Eof` event
/// move the parser to the `Exit` state.
///
/// # Parameters
/// - `$self`: the reader whose `state` is driven
/// - `$buf`: identifier of a mutable buffer binding; reassigned inside the loop
/// - `$reader`: expression yielding the data source
/// - `$read_until_open`, `$read_until_close`: names of the methods to call on `$self`
/// - `$await`: optional `await` token for the async variant
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_open:ident,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            match $self.state.state {
                ParseState::Init => { // Go to OpenedTag state
                    // If the encoding was set explicitly, we do not need to detect it.
                    // For example, explicit UTF-8 is set automatically if the Reader
                    // was created using `from_str`. But we still need to remove the
                    // BOM for consistency with the code path used when the `encoding`
                    // feature is disabled
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    // Go to OpenedTag state
                    match $self.$read_until_open($buf) $(.$await)? {
                        Ok(Ok(ev)) => break Ok(ev),
                        // No event was produced: take the buffer back and loop again
                        Ok(Err(b)) => $buf = b,
                        Err(err) => break Err(err),
                    }
                },
                ParseState::ClosedTag => { // Go to OpenedTag state
                    match $self.$read_until_open($buf) $(.$await)? {
                        Ok(Ok(ev)) => break Ok(ev),
                        // No event was produced: take the buffer back and loop again
                        Ok(Err(b)) => $buf = b,
                        Err(err) => break Err(err),
                    }
                },
                // Go to ClosedTag state in next two arms
                ParseState::OpenedTag => break $self.$read_until_close($buf) $(.$await)?,
                ParseState::Empty => break $self.state.close_expanded_empty(),
                ParseState::Exit => break Ok(Event::Eof),
            };
        };
        match event {
            // #513: In case of ill-formed errors we have already consumed the wrong
            // data and changed the state. We can continue parsing if we wish
            Err(Error::IllFormed(_)) => {}
            // Any other error, as well as Eof, is final
            Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Exit,
            _ => {}
        }
        event
    }};
}
260
/// Read bytes up to `<` and skip it. If the current byte (after skipping all
/// whitespace when [`Config::trim_text_start`] is `true`) is already `<`, no
/// event is produced: the `<` is consumed, the buffer is handed back to the
/// caller as `Ok(Err($buf))` and the next event is read on the next iteration
/// of the parsing loop. Otherwise the bytes before `<` are passed to
/// `emit_text` and the parser stays at the position just after the `<` symbol.
///
/// Moves the parser to the `OpenedTag` state when `<` was found.
///
/// This code is executed in two cases:
/// - at the start of parsing, just after skipping the BOM if it is present
/// - after parsing `</tag>` or `<tag>`
///
/// Note that the macro contains a `return`, so it must be the body of a method
/// returning `Result<std::result::Result<Event, B>>`.
///
/// # Parameters
/// - `$self`: the reader whose `state` is updated
/// - `$buf`: buffer passed to the source, or handed back to the caller
/// - `$reader`: expression yielding the data source
/// - `$read_event`: accepted for the callers' sake, but not used by the expansion
/// - `$await`: optional `await` token for the async variant
macro_rules! read_until_open {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_event:ident
        $(, $await:ident)?
    ) => {{
        if $self.state.config.trim_text_start {
            $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
        }

        // If we are already at the `<` symbol, do not try to return an empty Text event
        if $reader.skip_one(b'<') $(.$await)? ? {
            // Account for the consumed `<`
            $self.state.offset += 1;
            $self.state.state = ParseState::OpenedTag;
            // Pass $buf to the next iteration of the parsing loop
            return Ok(Err($buf));
        }

        match $reader
            .read_bytes_until(b'<', $buf, &mut $self.state.offset)
            $(.$await)?
        {
            Ok((bytes, found)) => {
                // If `<` was not found the input is exhausted and the state is left as is
                if found {
                    $self.state.state = ParseState::OpenedTag;
                }
                // Return Text event with `bytes` content or Eof if bytes is empty
                $self.state.emit_text(bytes).map(Ok)
            }
            Err(e) => Err(e),
        }
    }};
}
304
/// Read bytes up to the `>` and skip it. This macro is expected to be used
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// Moves the parser to the `ClosedTag` state before anything is read.
///
/// When markup is not finished, `last_error_offset` is set to the position of
/// the `<` that opened it (the offset at entry minus one), so that
/// `error_position()` points to the start of the broken markup. This is done
/// for every kind of markup, including opening and self-closed tags.
///
/// # Parameters
/// - `$self`: the reader whose `state` is updated
/// - `$buf`: buffer passed to the source
/// - `$reader`: expression yielding the data source
/// - `$await`: optional `await` token for the async variant
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        $self.state.state = ParseState::ClosedTag;

        // Position just after the `<`; `start - 1` is the position of `<` itself
        let start = $self.state.offset;
        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok((bang_type, bytes)) => $self.state.emit_bang(bang_type, bytes),
                Err(e) => {
                    // <!....EOF
                    //  ^^^^^ - `buf` does not contain `<`, but we want to report error at `<`,
                    //          so we move offset to it (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `</` - closing tag
            Ok(Some(b'/')) => match $reader
                .read_bytes_until(b'>', $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok((bytes, true)) => $self.state.emit_end(bytes),
                Ok((_, false)) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(Error::Syntax(SyntaxError::UnclosedTag))
                }
                Err(e) => Err(e),
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_with(PiParser::default(), $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_question_mark(bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_with(ElementParser::default(), $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_start(bytes),
                Err(e) => {
                    // Same as for the other kinds of markup: report the error
                    // at `<`, not at the stale position of a previous error
                    // (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<` - syntax error, tag not closed
            Ok(None) => {
                // We want to report error at `<`, but offset was increased,
                // so return it back (-1 for `<`)
                $self.state.last_error_offset = start - 1;
                Err(Error::Syntax(SyntaxError::UnclosedTag))
            }
            Err(e) => Err(e),
        }
    }};
}
395
/// Generalization of the `read_to_end` method for buffered and borrowed readers.
///
/// Reads events with `$read_event` until an `End` event whose name equals `$end`
/// is found at nesting depth zero. `Start` events with the same name increase
/// the depth and matching `End` events decrease it, so nested elements with the
/// same name are skipped as a whole.
///
/// The value of the macro is the range `start..end`, where `start` is
/// `buffer_position()` before the first read and `end` is `buffer_position()`
/// taken just before reading the matching `End` event.
///
/// Any read error is returned from the enclosing function as is; reaching `Eof`
/// before the matching `End` returns `Error::missed_end($end, ...)` from the
/// enclosing function.
///
/// # Parameters
/// - `$self`: the reader to read from
/// - `$end`: name of the end tag to search for; evaluated on every comparison
/// - `$buf`: buffer passed to `$read_event`
/// - `$read_event`: name of the method used to read events
/// - `$clear`: block executed before each read
/// - `$await`: optional `await` token for the async variant
macro_rules! read_to_end {
    (
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        let start = $self.buffer_position();
        // Number of currently open nested elements named `$end`
        let mut depth = 0;
        loop {
            $clear
            // Position before the event that is about to be read
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => return Err(e),

                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => return Err(Error::missed_end($end, $self.decoder())),
                _ => (),
            }
        }
    }};
}
426
427#[cfg(feature = "async-tokio")]
428mod async_tokio;
429mod buffered_reader;
430mod element;
431mod ns_reader;
432mod pi;
433mod slice_reader;
434mod state;
435
436pub use element::ElementParser;
437pub use ns_reader::NsReader;
438pub use pi::PiParser;
439
/// Range of the input, in bytes, that corresponds to some piece of XML.
pub type Span = Range<usize>;
442
443////////////////////////////////////////////////////////////////////////////////////////////////////
444
/// Possible reader states. The state transition diagram (`true` and `false` show
/// the value of the [`Config::expand_empty_elements`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init      -- "(no event)"\n                                       --> OpenedTag
///     OpenedTag -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> ClosedTag
///     ClosedTag -- "#lt;false#gt;\n(no event)"\nText                    --> OpenedTag
///   end
///   ClosedTag -- "#lt;true#gt;"\nStart --> Empty
///   Empty     -- End                   --> ClosedTag
///   _ -. Eof .-> Exit
/// ```
#[derive(Clone, Debug)]
enum ParseState {
    /// Initial state in which the reader stays after creation. Reading from that
    /// state could produce a `Text`, `Decl`, `Comment` or `Start` event. The next
    /// state is always `OpenedTag`. The reader will never return to this state. The
    /// event emitted during the transition to `OpenedTag` is a `Text` event if the
    /// first symbol is not `<`, otherwise no event is emitted.
    Init,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `ClosedTag` state.
    OpenedTag,
    /// State in which the reader searches for the `<` symbol of a markup. All bytes
    /// before that symbol will be returned in the [`Event::Text`] event. After that
    /// the reader moves to the `OpenedTag` state.
    ClosedTag,
    /// This state is used only if the option [`expand_empty_elements`] is set to `true`.
    /// The reader enters this state when it is in the `ClosedTag` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `ClosedTag` state.
    ///
    /// [`expand_empty_elements`]: Config::expand_empty_elements
    Empty,
    /// The reader enters this state when the `Eof` event is generated or an error
    /// other than an ill-formed error occurred (after an ill-formed error parsing
    /// can be continued). This is the last state, the reader stays in it forever.
    Exit,
}
489
/// An encoding reference paired with the information about where it came from.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// The encoding was implicitly assumed to have the specified value. It can be
    /// refined using a BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// The encoding was explicitly set to the desired value. It can be changed
    /// neither by a BOM nor by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// The encoding was detected from a byte order mark (BOM) or from the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// The encoding was detected using the XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, regardless of how it was obtained.
    #[inline]
    fn encoding(&self) -> &'static Encoding {
        // Every variant carries exactly one encoding reference
        match *self {
            Self::Implicit(enc)
            | Self::Explicit(enc)
            | Self::BomDetected(enc)
            | Self::XmlDetected(enc) => enc,
        }
    }
    /// Returns `true` for the `Implicit` and `BomDetected` variants and `false`
    /// for the `Explicit` and `XmlDetected` ones.
    #[inline]
    fn can_be_refined(&self) -> bool {
        matches!(self, Self::Implicit(_) | Self::BomDetected(_))
    }
}
536
537////////////////////////////////////////////////////////////////////////////////////////////////////
538
/// A low-level, encoding-agnostic XML event reader.
///
/// Consumes bytes and streams XML [`Event`]s.
///
/// This reader does not manage namespace declarations and is not able to resolve
/// prefixes. If you want these features, use the [`NsReader`].
///
/// # Examples
///
/// ```
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// let xml = r#"<tag1 att1 = "test">
///                 <tag2><!--Test comment-->Test</tag2>
///                 <tag2>Test 2</tag2>
///              </tag1>"#;
/// let mut reader = Reader::from_str(xml);
/// reader.config_mut().trim_text(true);
///
/// let mut count = 0;
/// let mut txt = Vec::new();
/// let mut buf = Vec::new();
///
/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
/// loop {
///     // NOTE: this is the generic case when we don't know about the input BufRead.
///     // When the input is a &str or a &[u8], we don't actually need to use another
///     // buffer, we could directly call `reader.read_event()`
///     match reader.read_event_into(&mut buf) {
///         Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
///         // exits the loop when reaching end of file
///         Ok(Event::Eof) => break,
///
///         Ok(Event::Start(e)) => {
///             match e.name().as_ref() {
///                 b"tag1" => println!("attributes values: {:?}",
///                                     e.attributes().map(|a| a.unwrap().value)
///                                     .collect::<Vec<_>>()),
///                 b"tag2" => count += 1,
///                 _ => (),
///             }
///         }
///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
///
///         // There are several other `Event`s we do not consider here
///         _ => (),
///     }
///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
///     buf.clear();
/// }
/// ```
///
/// [`NsReader`]: crate::reader::NsReader
#[derive(Clone)]
pub struct Reader<R> {
    /// Source of the data to parse
    reader: R,
    /// Configuration and current parse state
    state: ReaderState,
}
600
601/// Builder methods
602impl<R> Reader<R> {
603 /// Creates a `Reader` that reads from a given reader.
604 pub fn from_reader(reader: R) -> Self {
605 Self {
606 reader,
607 state: ReaderState::default(),
608 }
609 }
610
611 /// Returns reference to the parser configuration
612 pub fn config(&self) -> &Config {
613 &self.state.config
614 }
615
616 /// Returns mutable reference to the parser configuration
617 pub fn config_mut(&mut self) -> &mut Config {
618 &mut self.state.config
619 }
620}
621
622/// Getters
623impl<R> Reader<R> {
624 /// Consumes `Reader` returning the underlying reader
625 ///
626 /// Can be used to compute line and column of a parsing error position
627 ///
628 /// # Examples
629 ///
630 /// ```
631 /// # use pretty_assertions::assert_eq;
632 /// use std::{str, io::Cursor};
633 /// use quick_xml::events::Event;
634 /// use quick_xml::reader::Reader;
635 ///
636 /// let xml = r#"<tag1 att1 = "test">
637 /// <tag2><!--Test comment-->Test</tag2>
638 /// <tag3>Test 2</tag3>
639 /// </tag1>"#;
640 /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
641 /// let mut buf = Vec::new();
642 ///
643 /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
644 /// let end_pos = reader.buffer_position();
645 /// let mut cursor = reader.into_inner();
646 /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
647 /// .expect("can't make a string");
648 /// let mut line = 1;
649 /// let mut column = 0;
650 /// for c in s.chars() {
651 /// if c == '\n' {
652 /// line += 1;
653 /// column = 0;
654 /// } else {
655 /// column += 1;
656 /// }
657 /// }
658 /// (line, column)
659 /// }
660 ///
661 /// loop {
662 /// match reader.read_event_into(&mut buf) {
663 /// Ok(Event::Start(ref e)) => match e.name().as_ref() {
664 /// b"tag1" | b"tag2" => (),
665 /// tag => {
666 /// assert_eq!(b"tag3", tag);
667 /// assert_eq!((3, 22), into_line_and_column(reader));
668 /// break;
669 /// }
670 /// },
671 /// Ok(Event::Eof) => unreachable!(),
672 /// _ => (),
673 /// }
674 /// buf.clear();
675 /// }
676 /// ```
677 pub fn into_inner(self) -> R {
678 self.reader
679 }
680
681 /// Gets a reference to the underlying reader.
682 pub fn get_ref(&self) -> &R {
683 &self.reader
684 }
685
686 /// Gets a mutable reference to the underlying reader.
687 pub fn get_mut(&mut self) -> &mut R {
688 &mut self.reader
689 }
690
691 /// Gets the current byte position in the input data.
692 pub fn buffer_position(&self) -> usize {
693 // when internal state is OpenedTag, we have actually read until '<',
694 // which we don't want to show
695 if let ParseState::OpenedTag = self.state.state {
696 self.state.offset - 1
697 } else {
698 self.state.offset
699 }
700 }
701
702 /// Gets the last error byte position in the input data. If there is no errors
703 /// yet, returns `0`.
704 ///
705 /// Unlike `buffer_position` it will point to the place where it is rational
706 /// to report error to the end user. For example, all [`SyntaxError`]s are
707 /// reported when the parser sees EOF inside of some kind of markup. The
708 /// `buffer_position()` will point to the last byte of input which is not
709 /// very useful. `error_position()` will point to the start of corresponding
710 /// markup element (i. e. to the `<` character).
711 ///
712 /// This position is always `<= buffer_position()`.
713 pub fn error_position(&self) -> usize {
714 self.state.last_error_offset
715 }
716
717 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
718 ///
719 /// If [`encoding`] feature is enabled, the used encoding may change after
720 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
721 ///
722 /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
723 /// defaults to UTF-8.
724 ///
725 /// [`encoding`]: ../index.html#encoding
726 #[inline]
727 pub fn decoder(&self) -> Decoder {
728 self.state.decoder()
729 }
730}
731
/// Private sync reading methods
impl<R> Reader<R> {
    /// Read text into the given buffer, and return an event that borrows from
    /// either that buffer or from the input itself, based on the type of the
    /// reader.
    ///
    /// The body is generated by the `read_event_impl!` macro, which calls
    /// [`Self::read_until_open`] and [`Self::read_until_close`] depending on the
    /// current parse state.
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_open, read_until_close)
    }

    /// Read until `<` is found, move the reader to the `OpenedTag` state and return a `Text` event.
    ///
    /// Returns the inner `Ok` if the parsing loop should be broken and an event returned.
    /// Returns the inner `Err` with the same `buf` (so the caller can reuse it on the
    /// next iteration) because the Rust borrow checker stumbles upon this case in particular.
    fn read_until_open<'i, B>(&mut self, buf: B) -> Result<std::result::Result<Event<'i>, B>>
    where
        R: XmlSource<'i, B>,
    {
        read_until_open!(self, buf, self.reader, read_event_impl)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it is called just after encountering a `<` symbol.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
763}
764
765////////////////////////////////////////////////////////////////////////////////////////////////////
766
/// Used to decouple reading data from a data source and parsing the XML structure
/// from it. This is the state preserved between getting chunks of bytes from the reader.
///
/// This trait is implemented for every parser that processes a piece of XML grammar.
pub trait Parser {
    /// Processes new data and tries to determine the end of the parsed thing.
    ///
    /// Returns the position of the end of the thing in `bytes` in case of a
    /// successful search and `None` otherwise.
    ///
    /// # Parameters
    /// - `bytes`: a slice in which to find the end of a thing.
    ///   Should contain text in an ASCII-compatible encoding
    fn feed(&mut self, bytes: &[u8]) -> Option<usize>;

    /// Returns the parse error produced by this parser when the end of input is
    /// reached without finding the end of the parsed thing.
    fn eof_error() -> SyntaxError;
}
786
787/// Represents an input for a reader that can return borrowed data.
788///
789/// There are two implementors of this trait: generic one that read data from
790/// `Self`, copies some part of it into a provided buffer of type `B` and then
791/// returns data that borrow from that buffer.
792///
793/// The other implementor is for `&[u8]` and instead of copying data returns
794/// borrowed data from `Self` instead. This implementation allows zero-copy
795/// deserialization.
796///
797/// # Parameters
798/// - `'r`: lifetime of a buffer from which events will borrow
799/// - `B`: a type of a buffer that can be used to store data read from `Self` and
800/// from which events can borrow
801trait XmlSource<'r, B> {
802 /// Removes UTF-8 BOM if it is present
803 #[cfg(not(feature = "encoding"))]
804 fn remove_utf8_bom(&mut self) -> Result<()>;
805
806 /// Determines encoding from the start of input and removes BOM if it is present
807 #[cfg(feature = "encoding")]
808 fn detect_encoding(&mut self) -> Result<Option<&'static Encoding>>;
809
810 /// Read input until `byte` is found or end of input is reached.
811 ///
812 /// Returns a slice of data read up to `byte` (exclusive),
813 /// and a flag noting whether `byte` was found in the input or not.
814 ///
815 /// # Example
816 ///
817 /// ```ignore
818 /// let mut position = 0;
819 /// let mut input = b"abc*def".as_ref();
820 /// // ^= 4
821 ///
822 /// assert_eq!(
823 /// input.read_bytes_until(b'*', (), &mut position).unwrap(),
824 /// (b"abc".as_ref(), true)
825 /// );
826 /// assert_eq!(position, 4); // position after the symbol matched
827 /// ```
828 ///
829 /// # Parameters
830 /// - `byte`: Byte for search
831 /// - `buf`: Buffer that could be filled from an input (`Self`) and
832 /// from which [events] could borrow their data
833 /// - `position`: Will be increased by amount of bytes consumed
834 ///
835 /// [events]: crate::events::Event
836 fn read_bytes_until(
837 &mut self,
838 byte: u8,
839 buf: B,
840 position: &mut usize,
841 ) -> Result<(&'r [u8], bool)>;
842
843 /// Read input until processing instruction is finished.
844 ///
845 /// This method expect that start sequence of a parser already was read.
846 ///
847 /// Returns a slice of data read up to the end of the thing being parsed.
848 /// The end of thing and the returned content is determined by the used parser.
849 ///
850 /// If input (`Self`) is exhausted and no bytes was read, or if the specified
851 /// parser could not find the ending sequence of the thing, returns `SyntaxError`.
852 ///
853 /// # Parameters
854 /// - `buf`: Buffer that could be filled from an input (`Self`) and
855 /// from which [events] could borrow their data
856 /// - `position`: Will be increased by amount of bytes consumed
857 ///
858 /// A `P` type parameter is used to preserve state between calls to the underlying
859 /// reader which provides bytes fed into the parser.
860 /// [events]: crate::events::Event
861 fn read_with<P>(&mut self, parser: P, buf: B, position: &mut usize) -> Result<&'r [u8]>
862 where
863 P: Parser;
864
865 /// Read input until comment or CDATA is finished.
866 ///
867 /// This method expect that `<` already was read.
868 ///
869 /// Returns a slice of data read up to end of comment or CDATA (`>`),
870 /// which does not include into result.
871 ///
872 /// If input (`Self`) is exhausted and nothing was read, returns `None`.
873 ///
874 /// # Parameters
875 /// - `buf`: Buffer that could be filled from an input (`Self`) and
876 /// from which [events] could borrow their data
877 /// - `position`: Will be increased by amount of bytes consumed
878 ///
879 /// [events]: crate::events::Event
880 fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result<(BangType, &'r [u8])>;
881
/// Consume and discard all the whitespace until the next non-whitespace
/// character or EOF.
///
/// Nothing is returned on success; the only observable effect for the caller
/// is the change of `position`.
///
/// # Parameters
/// - `position`: Will be increased by amount of bytes consumed
fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>;
888
/// Consume and discard one byte if it matches the given byte. Returns
/// `true` if it matched.
///
/// # Parameters
/// - `byte`: Byte to skip
fn skip_one(&mut self, byte: u8) -> Result<bool>;
895
/// Return the next byte without consuming it, so that future `read_*` calls
/// will still include it. On EOF, return `None`.
fn peek_one(&mut self) -> Result<Option<u8>>;
899}
900
/// Possible elements started with `<!`
///
/// The enum carries no data, so it is cheap to copy and supports total equality.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum BangType {
    /// `<![CDATA[...]]>`
    CData,
    /// `<!--...-->`
    Comment,
    /// `<!DOCTYPE...>`
    DocType,
}
911impl BangType {
912 #[inline(always)]
913 fn new(byte: Option<u8>) -> Result<Self> {
914 Ok(match byte {
915 Some(b'[') => Self::CData,
916 Some(b'-') => Self::Comment,
917 Some(b'D') | Some(b'd') => Self::DocType,
918 _ => return Err(Error::Syntax(SyntaxError::InvalidBangMarkup)),
919 })
920 }
921
922 /// If element is finished, returns its content up to `>` symbol and
923 /// an index of this symbol, otherwise returns `None`
924 ///
925 /// # Parameters
926 /// - `buf`: buffer with data consumed on previous iterations
927 /// - `chunk`: data read on current iteration and not yet consumed from reader
928 #[inline(always)]
929 fn parse<'b>(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
930 for i in memchr::memchr_iter(b'>', chunk) {
931 match self {
932 // Need to read at least 6 symbols (`!---->`) for properly finished comment
933 // <!----> - XML comment
934 // 012345 - i
935 Self::Comment if buf.len() + i > 4 => {
936 if chunk[..i].ends_with(b"--") {
937 // We cannot strip last `--` from the buffer because we need it in case of
938 // check_comments enabled option. XML standard requires that comment
939 // will not end with `--->` sequence because this is a special case of
940 // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
941 return Some((&chunk[..i], i + 1)); // +1 for `>`
942 }
943 // End sequence `-|->` was splitted at |
944 // buf --/ \-- chunk
945 if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
946 return Some((&chunk[..i], i + 1)); // +1 for `>`
947 }
948 // End sequence `--|>` was splitted at |
949 // buf --/ \-- chunk
950 if i == 0 && buf.ends_with(b"--") {
951 return Some((&[], i + 1)); // +1 for `>`
952 }
953 }
954 Self::Comment => {}
955 Self::CData => {
956 if chunk[..i].ends_with(b"]]") {
957 return Some((&chunk[..i], i + 1)); // +1 for `>`
958 }
959 // End sequence `]|]>` was splitted at |
960 // buf --/ \-- chunk
961 if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
962 return Some((&chunk[..i], i + 1)); // +1 for `>`
963 }
964 // End sequence `]]|>` was splitted at |
965 // buf --/ \-- chunk
966 if i == 0 && buf.ends_with(b"]]") {
967 return Some((&[], i + 1)); // +1 for `>`
968 }
969 }
970 Self::DocType => {
971 let content = &chunk[..i];
972 let balance = memchr::memchr2_iter(b'<', b'>', content)
973 .map(|p| if content[p] == b'<' { 1i32 } else { -1 })
974 .sum::<i32>();
975 if balance == 0 {
976 return Some((content, i + 1)); // +1 for `>`
977 }
978 }
979 }
980 }
981 None
982 }
983 #[inline]
984 fn to_err(&self) -> Error {
985 match self {
986 Self::CData => Error::Syntax(SyntaxError::UnclosedCData),
987 Self::Comment => Error::Syntax(SyntaxError::UnclosedComment),
988 Self::DocType => Error::Syntax(SyntaxError::UnclosedDoctype),
989 }
990 }
991}
992
/// Tells whether the byte is one of the four whitespace bytes recognized here:
/// blank (`0x20`), tab (`0x09`), new line (`0x0A`) or carriage return (`0x0D`).
#[inline]
pub(crate) const fn is_whitespace(b: u8) -> bool {
    b == b' ' || b == b'\t' || b == b'\n' || b == b'\r'
}
998
999////////////////////////////////////////////////////////////////////////////////////////////////////
1000
1001#[cfg(test)]
1002mod test {
1003 /// Checks the internal implementation of the various reader methods
1004 macro_rules! check {
1005 (
1006 #[$test:meta]
1007 $read_event:ident,
1008 $read_until_close:ident,
1009 // constructor of the XML source on which internal functions will be called
1010 $source:path,
1011 // constructor of the buffer to which read data will stored
1012 $buf:expr
1013 $(, $async:ident, $await:ident)?
1014 ) => {
mod read_bytes_until {
    use super::*;
    // Use Bytes for printing bytes as strings for ASCII range
    use crate::utils::Bytes;
    use pretty_assertions::assert_eq;

    /// Checks that search in the empty buffer returns an empty slice,
    /// reports that the byte was not found and does not move `position`
    #[$test]
    $($async)? fn empty() {
        let buf = $buf;
        let mut position = 0;
        let mut input = b"".as_ref();
        // position: 0 at the start, 0 at the end (nothing to consume)

        let (bytes, found) = $source(&mut input)
            .read_bytes_until(b'*', buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (Bytes(bytes), found),
            (Bytes(b""), false)
        );
        assert_eq!(position, 0);
    }

    /// Checks that search in the buffer non-existent value returns entire buffer
    /// as a result and set `position` to `len()`
    #[$test]
    $($async)? fn non_existent() {
        let buf = $buf;
        let mut position = 0;
        let mut input = b"abcdef".as_ref();
        // position: 0 at the start, 6 after the whole input

        let (bytes, found) = $source(&mut input)
            .read_bytes_until(b'*', buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (Bytes(bytes), found),
            (Bytes(b"abcdef"), false)
        );
        assert_eq!(position, 6);
    }

    /// Checks that search in the buffer an element that is located in the front of
    /// buffer returns empty slice as a result and set `position` to one symbol
    /// after match (`1`)
    #[$test]
    $($async)? fn at_the_start() {
        let buf = $buf;
        let mut position = 0;
        let mut input = b"*abcdef".as_ref();
        // position: 0 at the start, 1 right after `*`

        let (bytes, found) = $source(&mut input)
            .read_bytes_until(b'*', buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (Bytes(bytes), found),
            (Bytes(b""), true)
        );
        assert_eq!(position, 1); // position after the symbol matched
    }

    /// Checks that search in the buffer an element that is located in the middle of
    /// buffer returns slice before that symbol as a result and set `position` to one
    /// symbol after match
    #[$test]
    $($async)? fn inside() {
        let buf = $buf;
        let mut position = 0;
        let mut input = b"abc*def".as_ref();
        // position: 0 at the start, 4 right after `*`

        let (bytes, found) = $source(&mut input)
            .read_bytes_until(b'*', buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (Bytes(bytes), found),
            (Bytes(b"abc"), true)
        );
        assert_eq!(position, 4); // position after the symbol matched
    }

    /// Checks that search in the buffer an element that is located in the end of
    /// buffer returns slice before that symbol as a result and set `position` to one
    /// symbol after match (`len()`)
    #[$test]
    $($async)? fn in_the_end() {
        let buf = $buf;
        let mut position = 0;
        let mut input = b"abcdef*".as_ref();
        // position: 0 at the start, 7 right after `*`

        let (bytes, found) = $source(&mut input)
            .read_bytes_until(b'*', buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (Bytes(bytes), found),
            (Bytes(b"abcdef"), true)
        );
        assert_eq!(position, 7); // position after the symbol matched
    }
}
1123
1124 mod read_bang_element {
1125 use super::*;
1126 use crate::errors::{Error, SyntaxError};
1127 use crate::reader::BangType;
1128 use crate::utils::Bytes;
1129
/// Checks that reading CDATA content works correctly.
///
/// In every test `position` starts at `1`; the inputs begin with `!`.
mod cdata {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Checks that if input begins like CDATA element, but CDATA start sequence
    /// is not finished, parsing ends with an error
    #[$test]
    #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
    $($async)? fn not_properly_start() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"![]]>other content".as_ref();
        // position: 1 at the start and expected to stay there

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(SyntaxError::UnclosedCData)) => {}
            x => panic!(
                "Expected `Err(Syntax(UnclosedCData))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 1);
    }

    /// Checks that if CDATA startup sequence was matched, but an end sequence
    /// is not found, parsing ends with an error
    #[$test]
    $($async)? fn not_closed() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"![CDATA[other content".as_ref();
        // position: 1 at the start, 22 after all 21 bytes were consumed

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(SyntaxError::UnclosedCData)) => {}
            x => panic!(
                "Expected `Err(Syntax(UnclosedCData))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 22);
    }

    /// Checks that CDATA element without content inside parsed successfully
    #[$test]
    $($async)? fn empty() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"![CDATA[]]>other content".as_ref();
        // position: 1 at the start, 12 right after `]]>`

        let (ty, bytes) = $source(&mut input)
            .read_bang_element(buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (ty, Bytes(bytes)),
            (BangType::CData, Bytes(b"![CDATA[]]"))
        );
        assert_eq!(position, 12);
    }

    /// Checks that CDATA element with content parsed successfully.
    /// Additionally checks that sequences inside CDATA that may look like
    /// a CDATA end sequence do not interrupt CDATA parsing
    #[$test]
    $($async)? fn with_content() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
        // position: 1 at the start, 29 right after the first `]]>`

        let (ty, bytes) = $source(&mut input)
            .read_bang_element(buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (ty, Bytes(bytes)),
            (BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]"))
        );
        assert_eq!(position, 29);
    }
}
1214
/// Checks that reading XML comments works correctly. According to the [specification],
/// comment data can contain any sequence except `--`:
///
/// ```peg
/// comment = '<!--' (!'--' char)* '-->';
/// char = [#x1-#x2C]
///      / [#x2E-#xD7FF]
///      / [#xE000-#xFFFD]
///      / [#x10000-#x10FFFF]
/// ```
///
/// The presence of this limitation, however, is simply a poorly designed specification
/// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
/// presence of these sequences by default. These tests allow such content.
///
/// In every test `position` starts at `1`; the inputs begin with `!`.
///
/// [specification]: https://www.w3.org/TR/xml11/#dt-comment
mod comment {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Checks that an unfinished comment start sequence (`!-` followed by
    /// a space) is reported as an unclosed comment without consuming anything
    #[$test]
    #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
    $($async)? fn not_properly_start() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!- -->other content".as_ref();
        // position: 1 at the start and expected to stay there

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(SyntaxError::UnclosedComment)) => {}
            x => panic!(
                "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 1);
    }

    /// Checks that `->` is not accepted as the end of a comment:
    /// the whole input is consumed and an error is returned
    #[$test]
    $($async)? fn not_properly_end() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!->other content".as_ref();
        // position: 1 at the start, 17 after all 16 bytes were consumed

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(SyntaxError::UnclosedComment)) => {}
            x => panic!(
                "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 17);
    }

    /// Checks that a comment without any `>` is reported as unclosed
    #[$test]
    $($async)? fn not_closed1() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!--other content".as_ref();
        // position: 1 at the start, 17 after all 16 bytes were consumed

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(SyntaxError::UnclosedComment)) => {}
            x => panic!(
                "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 17);
    }

    /// Checks that `!-->` is not a finished comment: the `--` of the start
    /// sequence cannot serve as the `--` of the end sequence
    #[$test]
    $($async)? fn not_closed2() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!-->other content".as_ref();
        // position: 1 at the start, 18 after all 17 bytes were consumed

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(SyntaxError::UnclosedComment)) => {}
            x => panic!(
                "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 18);
    }

    /// Checks that `!--->` is not a finished comment either: one more `-`
    /// is required for the shortest comment `!---->`
    #[$test]
    $($async)? fn not_closed3() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!--->other content".as_ref();
        // position: 1 at the start, 19 after all 18 bytes were consumed

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(SyntaxError::UnclosedComment)) => {}
            x => panic!(
                "Expected `Err(Syntax(UnclosedComment))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 19);
    }

    /// Checks that the shortest possible comment is parsed successfully
    #[$test]
    $($async)? fn empty() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!---->other content".as_ref();
        // position: 1 at the start, 7 right after `-->`

        let (ty, bytes) = $source(&mut input)
            .read_bang_element(buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (ty, Bytes(bytes)),
            (BangType::Comment, Bytes(b"!----"))
        );
        assert_eq!(position, 7);
    }

    /// Checks that a comment with content is parsed successfully and that
    /// the leading `--->` is treated as content, not as the end of the comment
    #[$test]
    $($async)? fn with_content() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!--->comment<--->other content".as_ref();
        // position: 1 at the start, 18 right after the second `--->`

        let (ty, bytes) = $source(&mut input)
            .read_bang_element(buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (ty, Bytes(bytes)),
            (BangType::Comment, Bytes(b"!--->comment<---"))
        );
        assert_eq!(position, 18);
    }
}
1357
/// Checks that reading DOCTYPE definition works correctly.
///
/// The same set of tests is run for the uppercase and the lowercase spelling.
/// In every test `position` starts at `1`; the inputs begin with `!`.
mod doctype {
    use super::*;

    mod uppercase {
        use super::*;
        use pretty_assertions::assert_eq;

        /// Checks that input which starts with `!D` but has no `>` is
        /// reported as an unclosed DOCTYPE after consuming all the input
        #[$test]
        $($async)? fn not_properly_start() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!D other content".as_ref();
            // position: 1 at the start, 17 after all 16 bytes were consumed

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {}
                x => panic!(
                    "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 17);
        }

        /// Checks that `!DOCTYPE` directly followed by text and without `>`
        /// is reported as an unclosed DOCTYPE
        #[$test]
        $($async)? fn without_space() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!DOCTYPEother content".as_ref();
            // position: 1 at the start, 22 after all 21 bytes were consumed

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {}
                x => panic!(
                    "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 22);
        }

        /// Checks that a DOCTYPE without content is parsed successfully
        #[$test]
        $($async)? fn empty() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!DOCTYPE>other content".as_ref();
            // position: 1 at the start, 10 right after `>`

            let (ty, bytes) = $source(&mut input)
                .read_bang_element(buf, &mut position)
                $(.$await)?
                .unwrap();
            assert_eq!(
                (ty, Bytes(bytes)),
                (BangType::DocType, Bytes(b"!DOCTYPE"))
            );
            assert_eq!(position, 10);
        }

        /// Checks that a DOCTYPE without the closing `>` is reported as unclosed
        #[$test]
        $($async)? fn not_closed() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!DOCTYPE other content".as_ref();
            // position: 1 at the start, 23 after all 22 bytes were consumed

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {}
                x => panic!(
                    "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 23);
        }
    }

    mod lowercase {
        use super::*;
        use pretty_assertions::assert_eq;

        /// Checks that input which starts with `!d` but has no `>` is
        /// reported as an unclosed DOCTYPE after consuming all the input
        #[$test]
        $($async)? fn not_properly_start() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!d other content".as_ref();
            // position: 1 at the start, 17 after all 16 bytes were consumed

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {}
                x => panic!(
                    "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 17);
        }

        /// Checks that `!doctype` directly followed by text and without `>`
        /// is reported as an unclosed DOCTYPE
        #[$test]
        $($async)? fn without_space() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!doctypeother content".as_ref();
            // position: 1 at the start, 22 after all 21 bytes were consumed

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {}
                x => panic!(
                    "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 22);
        }

        /// Checks that a DOCTYPE without content is parsed successfully
        #[$test]
        $($async)? fn empty() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!doctype>other content".as_ref();
            // position: 1 at the start, 10 right after `>`

            let (ty, bytes) = $source(&mut input)
                .read_bang_element(buf, &mut position)
                $(.$await)?
                .unwrap();
            assert_eq!(
                (ty, Bytes(bytes)),
                (BangType::DocType, Bytes(b"!doctype"))
            );
            assert_eq!(position, 10);
        }

        /// Checks that a DOCTYPE without the closing `>` is reported as unclosed
        #[$test]
        $($async)? fn not_closed() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!doctype other content".as_ref();
            // position: 1 at the start, 23 after all 22 bytes were consumed

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(SyntaxError::UnclosedDoctype)) => {}
                x => panic!(
                    "Expected `Err(Syntax(UnclosedDoctype))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 23);
        }
    }
}
1510 }
1511
1512 mod read_element {
1513 use super::*;
1514 use crate::errors::{Error, SyntaxError};
1515 use crate::reader::ElementParser;
1516 use crate::utils::Bytes;
1517 use pretty_assertions::assert_eq;
1518
/// Checks that nothing was read from empty buffer: an unclosed tag is
/// reported and `position` is not changed
#[$test]
$($async)? fn empty() {
    let buf = $buf;
    let mut position = 1;
    let mut input = b"".as_ref();
    // position: 1 at the start and expected to stay there

    match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? {
        Err(Error::Syntax(SyntaxError::UnclosedTag)) => {}
        x => panic!(
            "Expected `Err(Syntax(UnclosedTag))`, but got `{:?}`",
            x
        ),
    }
    assert_eq!(position, 1);
}
1536
1537 mod open {
1538 use super::*;
1539 use pretty_assertions::assert_eq;
1540
1541 #[$test]
1542 $($async)? fn empty_tag() {
1543 let buf = $buf;
1544 let mut position = 1;
1545 let mut input = b">".as_ref();
1546 // ^= 2
1547
1548 assert_eq!(
1549 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1550 Bytes(b"")
1551 );
1552 assert_eq!(position, 2);
1553 }
1554
1555 #[$test]
1556 $($async)? fn normal() {
1557 let buf = $buf;
1558 let mut position = 1;
1559 let mut input = b"tag>".as_ref();
1560 // ^= 5
1561
1562 assert_eq!(
1563 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1564 Bytes(b"tag")
1565 );
1566 assert_eq!(position, 5);
1567 }
1568
1569 #[$test]
1570 $($async)? fn empty_ns_empty_tag() {
1571 let buf = $buf;
1572 let mut position = 1;
1573 let mut input = b":>".as_ref();
1574 // ^= 3
1575
1576 assert_eq!(
1577 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1578 Bytes(b":")
1579 );
1580 assert_eq!(position, 3);
1581 }
1582
1583 #[$test]
1584 $($async)? fn empty_ns() {
1585 let buf = $buf;
1586 let mut position = 1;
1587 let mut input = b":tag>".as_ref();
1588 // ^= 6
1589
1590 assert_eq!(
1591 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1592 Bytes(b":tag")
1593 );
1594 assert_eq!(position, 6);
1595 }
1596
1597 #[$test]
1598 $($async)? fn with_attributes() {
1599 let buf = $buf;
1600 let mut position = 1;
1601 let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
1602 // ^= 39
1603
1604 assert_eq!(
1605 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1606 Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)
1607 );
1608 assert_eq!(position, 39);
1609 }
1610 }
1611
1612 mod self_closed {
1613 use super::*;
1614 use pretty_assertions::assert_eq;
1615
1616 #[$test]
1617 $($async)? fn empty_tag() {
1618 let buf = $buf;
1619 let mut position = 1;
1620 let mut input = b"/>".as_ref();
1621 // ^= 3
1622
1623 assert_eq!(
1624 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1625 Bytes(b"/")
1626 );
1627 assert_eq!(position, 3);
1628 }
1629
1630 #[$test]
1631 $($async)? fn normal() {
1632 let buf = $buf;
1633 let mut position = 1;
1634 let mut input = b"tag/>".as_ref();
1635 // ^= 6
1636
1637 assert_eq!(
1638 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1639 Bytes(b"tag/")
1640 );
1641 assert_eq!(position, 6);
1642 }
1643
1644 #[$test]
1645 $($async)? fn empty_ns_empty_tag() {
1646 let buf = $buf;
1647 let mut position = 1;
1648 let mut input = b":/>".as_ref();
1649 // ^= 4
1650
1651 assert_eq!(
1652 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1653 Bytes(b":/")
1654 );
1655 assert_eq!(position, 4);
1656 }
1657
1658 #[$test]
1659 $($async)? fn empty_ns() {
1660 let buf = $buf;
1661 let mut position = 1;
1662 let mut input = b":tag/>".as_ref();
1663 // ^= 7
1664
1665 assert_eq!(
1666 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1667 Bytes(b":tag/")
1668 );
1669 assert_eq!(position, 7);
1670 }
1671
1672 #[$test]
1673 $($async)? fn with_attributes() {
1674 let buf = $buf;
1675 let mut position = 1;
1676 let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref();
1677 // ^= 42
1678
1679 assert_eq!(
1680 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1681 Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)
1682 );
1683 assert_eq!(position, 42);
1684 }
1685 }
1686 }
1687
/// Ensures, that no empty `Text` events are generated: every test asserts
/// the very first event returned for its input
mod $read_event {
    use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
    use crate::reader::Reader;
    use pretty_assertions::assert_eq;

    /// When `encoding` feature is enabled, encoding should be detected
    /// from BOM (UTF-8) and BOM should be stripped.
    ///
    /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
    /// character should be stripped for consistency
    #[$test]
    $($async)? fn bom_from_reader() {
        // Two BOM characters: only the first one is expected to be stripped
        let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Text(BytesText::from_escaped("\u{feff}"))
        );

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Eof
        );
    }

    /// When parsing from &str, encoding is fixed (UTF-8), so
    /// - when `encoding` feature is disabled, the behavior is the
    ///   same as in the `bom_from_reader` test
    /// - when `encoding` feature is enabled, the behavior should
    ///   stay consistent, so the first BOM character is stripped
    #[$test]
    $($async)? fn bom_from_str() {
        // Two BOM characters: only the first one is expected to be stripped
        let mut reader = Reader::from_str("\u{feff}\u{feff}");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Text(BytesText::from_escaped("\u{feff}"))
        );

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Eof
        );
    }

    /// An XML declaration is the first event
    #[$test]
    $($async)? fn declaration() {
        let mut reader = Reader::from_str("<?xml ?>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
        );
    }

    /// A DOCTYPE is the first event
    #[$test]
    $($async)? fn doctype() {
        let mut reader = Reader::from_str("<!DOCTYPE x>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::DocType(BytesText::from_escaped("x"))
        );
    }

    /// A processing instruction is the first event; the `?` and `>`
    /// that do not form `?>` are a part of its content
    #[$test]
    $($async)? fn processing_instruction() {
        let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::PI(BytesText::from_escaped("xml-stylesheet '? >\" "))
        );
    }

    /// Lone closing tags are not allowed, so testing it together with start tag
    #[$test]
    $($async)? fn start_and_end() {
        let mut reader = Reader::from_str("<tag></tag>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Start(BytesStart::new("tag"))
        );

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::End(BytesEnd::new("tag"))
        );
    }

    /// A self-closed tag is the first event
    #[$test]
    $($async)? fn empty() {
        let mut reader = Reader::from_str("<tag/>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Empty(BytesStart::new("tag"))
        );
    }

    /// A text is the first event
    #[$test]
    $($async)? fn text() {
        let mut reader = Reader::from_str("text");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Text(BytesText::from_escaped("text"))
        );
    }

    /// An empty CDATA section is the first event
    #[$test]
    $($async)? fn cdata() {
        let mut reader = Reader::from_str("<![CDATA[]]>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::CData(BytesCData::new(""))
        );
    }

    /// An empty comment is the first event
    #[$test]
    $($async)? fn comment() {
        let mut reader = Reader::from_str("<!---->");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Comment(BytesText::from_escaped(""))
        );
    }

    /// An empty input gives `Eof` right away
    #[$test]
    $($async)? fn eof() {
        let mut reader = Reader::from_str("");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Eof
        );
    }
}
1830 };
1831 }
1832
/// Tests for https://github.com/tafia/quick-xml/issues/469
///
/// Every test wraps the input into a buffered reader whose capacity ends
/// inside of the closing sequence of the markup, so that the sequence is
/// split between two reads of the underlying buffer.
///
/// # Parameters
/// - `#[$test]`: attribute that marks the generated functions as tests
/// - `$read_event`: name of the reader method that is called to get an event
/// - `$BufReader`: type of the buffered reader; it is created using
///   `with_capacity(size, bytes)`
/// - `$async`, `$await`: optional tokens that are put before `fn` and
///   after the call of `$read_event`
macro_rules! small_buffers {
    (
        #[$test:meta]
        $read_event:ident: $BufReader:ty
        $(, $async:ident, $await:ident)?
    ) => {
        mod small_buffers {
            use crate::events::{BytesCData, BytesDecl, BytesStart, BytesText, Event};
            use crate::reader::Reader;
            use pretty_assertions::assert_eq;

            /// `?>` of a declaration is split after `?`
            #[$test]
            $($async)? fn decl() {
                let xml = "<?xml ?>";
                //         ^^^^^^^ data that fit into buffer
                let size = xml.match_indices("?>").next().unwrap().0 + 1;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            /// `?>` of a processing instruction is split after `?`
            #[$test]
            $($async)? fn pi() {
                let xml = "<?pi?>";
                //         ^^^^^ data that fit into buffer
                let size = xml.match_indices("?>").next().unwrap().0 + 1;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::PI(BytesText::new("pi"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            /// `/>` of a self-closed tag is split after `/`
            #[$test]
            $($async)? fn empty() {
                let xml = "<empty/>";
                //         ^^^^^^^ data that fit into buffer
                let size = xml.match_indices("/>").next().unwrap().0 + 1;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Empty(BytesStart::new("empty"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            /// `]]>` of a CDATA section is split after the first `]`
            #[$test]
            $($async)? fn cdata1() {
                let xml = "<![CDATA[cdata]]>";
                //         ^^^^^^^^^^^^^^^ data that fit into buffer
                let size = xml.match_indices("]]>").next().unwrap().0 + 1;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::CData(BytesCData::new("cdata"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            /// `]]>` of a CDATA section is split after the second `]`
            #[$test]
            $($async)? fn cdata2() {
                let xml = "<![CDATA[cdata]]>";
                //         ^^^^^^^^^^^^^^^^ data that fit into buffer
                let size = xml.match_indices("]]>").next().unwrap().0 + 2;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::CData(BytesCData::new("cdata"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            /// `-->` of a comment is split after the first `-`
            #[$test]
            $($async)? fn comment1() {
                let xml = "<!--comment-->";
                //         ^^^^^^^^^^^^ data that fit into buffer
                let size = xml.match_indices("-->").next().unwrap().0 + 1;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Comment(BytesText::new("comment"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }

            /// `-->` of a comment is split after the second `-`
            #[$test]
            $($async)? fn comment2() {
                let xml = "<!--comment-->";
                //         ^^^^^^^^^^^^^ data that fit into buffer
                let size = xml.match_indices("-->").next().unwrap().0 + 2;
                let br = <$BufReader>::with_capacity(size, xml.as_bytes());
                let mut reader = Reader::from_reader(br);
                let mut buf = Vec::new();

                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Comment(BytesText::new("comment"))
                );
                assert_eq!(
                    reader.$read_event(&mut buf) $(.$await)? .unwrap(),
                    Event::Eof
                );
            }
        }
    };
}
1980
1981 // Export macros for the child modules:
1982 // - buffered_reader
1983 // - slice_reader
1984 pub(super) use check;
1985 pub(super) use small_buffers;
1986}