plist/stream/
ascii_reader.rs

1/// Ascii property lists are used in legacy settings and only support four
2/// datatypes: Array, Dictionary, String and Data.
3/// See [Apple
4/// Documentation](https://developer.apple.com/library/archive/documentation/Cocoa/Conceptual/PropertyLists/OldStylePlists/OldStylePLists.html)
5/// for more info.
/// However, this reader also supports integers as a first-class datatype.
/// This reader will accept certain ill-formed ASCII plists without complaining.
/// It does not check the integrity of the plist format.
9use crate::{
10    error::{Error, ErrorKind},
11    stream::{Event, OwnedEvent},
12    Integer,
13};
14use std::io::Read;
15
/// Streaming reader that turns an ASCII ("old-style") property list read from
/// `R` into a sequence of events, consumed through its [`Iterator`]
/// implementation.
pub struct AsciiReader<R: Read> {
    /// Underlying byte source; it is read one byte at a time.
    reader: R,
    /// Number of bytes consumed so far; attached to errors as their byte
    /// offset.
    current_pos: u64,

    /// lookahead char to avoid backtracking.
    peeked_char: Option<u8>,

    /// Byte most recently returned by `advance`; `None` before the first read
    /// and once the end of the input has been reached.
    current_char: Option<u8>,
}
25
26impl<R: Read> AsciiReader<R> {
27    pub fn new(reader: R) -> Self {
28        Self {
29            reader,
30            current_pos: 0,
31            peeked_char: None,
32            current_char: None,
33        }
34    }
35
36    pub fn into_inner(self) -> R {
37        self.reader
38    }
39
40    fn error(&self, kind: ErrorKind) -> Error {
41        kind.with_byte_offset(self.current_pos)
42    }
43
44    fn read_one(&mut self) -> Result<Option<u8>, Error> {
45        let mut buf: [u8; 1] = [0; 1];
46        match self.reader.read_exact(&mut buf) {
47            Ok(()) => Ok(Some(buf[0])),
48            Err(err) => {
49                if err.kind() == std::io::ErrorKind::UnexpectedEof {
50                    Ok(None)
51                } else {
52                    Err(self.error(ErrorKind::Io(err)))
53                }
54            }
55        }
56    }
57
58    /// Consume the reader and set [`Self::current_char`] and
59    /// [`Self::peeked_char`]. Returns the current character.
60    fn advance(&mut self) -> Result<Option<u8>, Error> {
61        self.current_char = self.peeked_char;
62        self.peeked_char = self.read_one()?;
63
64        // We need to read two chars to boot the process and fill the peeked
65        // char.
66        if self.current_pos == 0 {
67            self.current_char = self.peeked_char;
68            self.peeked_char = self.read_one()?;
69        }
70
71        if self.current_char.is_some() {
72            self.current_pos += 1;
73        }
74
75        Ok(self.current_char)
76    }
77
78    /// From Apple doc:
79    ///
80    /// > The quotation marks can be omitted if the string is composed strictly of alphanumeric
81    /// > characters and contains no white space (numbers are handled as
82    /// > strings in property lists). Though the property list format uses
83    /// > ASCII for strings, note that Cocoa uses Unicode. Since string
84    /// > encodings vary from region to region, this representation makes the
85    /// > format fragile. You may see strings containing unreadable sequences of
86    /// > ASCII characters; these are used to represent Unicode characters
87    ///
88    /// This function will naively try to convert the string to Integer.
89    fn unquoted_string_literal(&mut self, first: u8) -> Result<Option<OwnedEvent>, Error> {
90        let mut acc: Vec<u8> = Vec::new();
91        acc.push(first);
92
93        while {
94            match self.peeked_char {
95                Some(c) => {
96                    c != b' ' && c != b')' && c != b'\r' && c != b'\t' && c != b';' && c != b','
97                }
98                None => false,
99            }
100        } {
101            // consuming the string itself
102            self.advance()?;
103            match self.current_char {
104                Some(c) => acc.push(c),
105                None => return Err(self.error(ErrorKind::UnclosedString)),
106            };
107        }
108
109        let string_literal =
110            String::from_utf8(acc).map_err(|_e| self.error(ErrorKind::InvalidUtf8AsciiStream))?;
111
112        // Not ideal but does the trick for now
113        match Integer::from_str(&string_literal) {
114            Ok(i) => Ok(Some(Event::Integer(i))),
115            Err(_) => Ok(Some(Event::String(string_literal.into()))),
116        }
117    }
118
119    /// The process for decoding utf-16 escapes to utf-8 is:
120    /// 1. Convert the 4 hex characters to utf-16 code units (u16s).
121    ///    '\u006d' becomes 0x6d.
122    /// 2. Based on the first code unit, determine whether another code unit is
123    ///    required to form the complete code point.
124    ///    "\uD83D\uDCA9" becomes `[0xd73d, 0xdca9]`
125    /// 3. Convert the 1 or 2 u16 code point to utf-8.
126    ///    `[0xd73d, 0xdca9]` becomes '💩'.
127    ///
128    /// The standard library has some useful functions behind unstable feature
129    /// flags, we can simplify and optimize this a bit once they're stable.
130    /// - str_from_utf16_endian
131    /// - is_utf16_surrogate
132    fn utf16_escape(&mut self) -> Result<String, Error> {
133        let mut code_units: &mut [u16] = &mut [0u16; 2];
134
135        let Some(code_unit) = self.utf16_code_unit()? else {
136            return Err(self.error(ErrorKind::InvalidUtf16String));
137        };
138
139        code_units[0] = code_unit;
140
141        // This is the utf-16 surrogate range, indicating another code unit is
142        // necessary to form a complete code point.
143        if !matches!(code_unit, 0xD800..=0xDFFF) {
144            code_units = &mut code_units[0..1];
145        } else {
146            self.advance_quoted_string()?;
147
148            if self.current_char != Some(b'\\')
149                || !matches!(self.peeked_char, Some(b'u') | Some(b'U'))
150            {
151                return Err(self.error(ErrorKind::InvalidUtf16String));
152            }
153
154            self.advance_quoted_string()?;
155
156            if let Some(code_unit) = self.utf16_code_unit()? {
157                code_units[1] = code_unit;
158            }
159        }
160
161        let utf8 = String::from_utf16(code_units)
162            .map_err(|_| self.error(ErrorKind::InvalidUtf16String))?;
163
164        Ok(utf8)
165    }
166
167    /// Expects the reader's next read to return the first hex character of the
168    /// utf-16 hex string.
169    fn utf16_code_unit(&mut self) -> Result<Option<u16>, Error> {
170        let hex_chars = [
171            self.advance_quoted_string()?,
172            self.advance_quoted_string()?,
173            self.advance_quoted_string()?,
174            self.advance_quoted_string()?,
175        ];
176
177        let hex_str = std::str::from_utf8(&hex_chars)
178            .map_err(|_| self.error(ErrorKind::InvalidUtf16String))?;
179
180        let code_unit = u16::from_str_radix(hex_str, 16)
181            .map_err(|_| self.error(ErrorKind::InvalidUtf16String))?;
182
183        Ok(Some(code_unit))
184    }
185
186    #[inline]
187    fn advance_quoted_string(&mut self) -> Result<u8, Error> {
188        match self.advance()? {
189            Some(c) => Ok(c),
190            None => Err(self.error(ErrorKind::UnclosedString)),
191        }
192    }
193
194    fn quoted_string_literal(&mut self, quote: u8) -> Result<Option<OwnedEvent>, Error> {
195        let mut acc = String::new();
196
197        loop {
198            let c = self.advance_quoted_string()?;
199
200            if c == quote {
201                return Ok(Some(Event::String(acc.into())));
202            }
203
204            let replacement = if c == b'\\' {
205                let c = self.advance_quoted_string()?;
206
207                match c {
208                    b'\\' | b'"' => c as char,
209                    b'a' => '\u{7}',
210                    b'b' => '\u{8}',
211                    b'f' => '\u{c}',
212                    b'n' => '\n',
213                    b'r' => '\r',
214                    b't' => '\t',
215                    b'U' => {
216                        let utf8 = self.utf16_escape()?;
217                        acc.push_str(utf8.as_str());
218                        continue;
219                    }
220                    b'v' => '\u{b}',
221                    b'0' | b'1' | b'2' | b'3' | b'4' | b'5' | b'6' | b'7' => {
222                        let value = [
223                            c,
224                            self.advance_quoted_string()?,
225                            self.advance_quoted_string()?,
226                        ];
227
228                        let value = std::str::from_utf8(&value)
229                            .map_err(|_| self.error(ErrorKind::InvalidOctalString))?;
230
231                        let value = u16::from_str_radix(value, 8)
232                            .map_err(|_| self.error(ErrorKind::InvalidOctalString))?
233                            as u32;
234
235                        let value = char::from_u32(value)
236                            .ok_or(self.error(ErrorKind::InvalidOctalString))?;
237
238                        map_next_step_to_unicode(value)
239                    }
240                    _ => return Err(self.error(ErrorKind::InvalidUtf8AsciiStream)),
241                }
242            } else {
243                c as char
244            };
245
246            acc.push(replacement);
247        }
248    }
249
250    fn line_comment(&mut self) -> Result<(), Error> {
251        // Consumes up to the end of the line.
252        // There's no error in this a line comment can reach the EOF and there's
253        // no forbidden chars in comments.
254        while {
255            match self.peeked_char {
256                Some(c) => c != b'\n',
257                None => false,
258            }
259        } {
260            let _ = self.advance()?;
261        }
262
263        Ok(())
264    }
265
266    fn block_comment(&mut self) -> Result<(), Error> {
267        let mut latest_consume = b' ';
268        while {
269            latest_consume != b'*'
270                || match self.advance()? {
271                    Some(c) => c != b'/',
272                    None => false,
273                }
274        } {
275            latest_consume = self
276                .advance()?
277                .ok_or(self.error(ErrorKind::IncompleteComment))?;
278        }
279
280        Ok(())
281    }
282
283    /// Returns:
284    /// - Some(string) if '/' was the first character of a string
285    /// - None if '/' was the beginning of a comment.
286    fn potential_comment(&mut self) -> Result<Option<OwnedEvent>, Error> {
287        match self.peeked_char {
288            Some(c) => match c {
289                b'/' => self.line_comment().map(|_| None),
290                b'*' => self.block_comment().map(|_| None),
291                _ => self.unquoted_string_literal(c),
292            },
293            // EOF
294            None => Err(self.error(ErrorKind::IncompleteComment)),
295        }
296    }
297
298    /// Consumes the reader until it finds a valid Event
299    /// Possible events for Ascii plists:
300    ///  - `StartArray(Option<u64>)`,
301    ///  - `StartDictionary(Option<u64>)`,
302    ///  - `EndCollection`,
303    ///  - `Data(Vec<u8>)`,
304    fn read_next(&mut self) -> Result<Option<OwnedEvent>, Error> {
305        while let Some(c) = self.advance()? {
306            match c {
307                // Single char tokens
308                b'(' => return Ok(Some(Event::StartArray(None))),
309                b')' => return Ok(Some(Event::EndCollection)),
310                b'{' => return Ok(Some(Event::StartDictionary(None))),
311                b'}' => return Ok(Some(Event::EndCollection)),
312                b'\'' | b'"' => return self.quoted_string_literal(c),
313                b'/' => {
314                    match self.potential_comment() {
315                        Ok(Some(event)) => return Ok(Some(event)),
316                        Ok(None) => { /* Comment has been consumed */ }
317                        Err(e) => return Err(e),
318                    }
319                }
320                b',' | b';' | b'=' => { /* consume these without emitting anything */ }
321                b' ' | b'\r' | b'\t' | b'\n' => { /* whitespace is not significant */ }
322                _ => return self.unquoted_string_literal(c),
323            }
324        }
325
326        Ok(None)
327    }
328}
329
330impl<R: Read> Iterator for AsciiReader<R> {
331    type Item = Result<OwnedEvent, Error>;
332
333    fn next(&mut self) -> Option<Result<OwnedEvent, Error>> {
334        self.read_next().transpose()
335    }
336}
337
/// Translates a character of the NextStep encoding into Unicode. Only the
/// upper half (0x80..=0xFF) differs; every other character is returned
/// unchanged. See:
/// - <https://github.com/fonttools/openstep-plist/blob/master/src/openstep_plist/parser.pyx#L87-L106>
/// - <ftp://ftp.unicode.org/Public/MAPPINGS/VENDORS/NEXT/NEXTSTEP.TXT>
fn map_next_step_to_unicode(c: char) -> char {
    // One entry per NextStep code from 0x80 to 0xFF, in order.
    const NEXT_UNICODE_MAPPING: [char; 128] = [
        '\u{A0}', '\u{C0}', '\u{C1}', '\u{C2}', '\u{C3}', '\u{C4}', '\u{C5}', '\u{C7}', '\u{C8}',
        '\u{C9}', '\u{CA}', '\u{CB}', '\u{CC}', '\u{CD}', '\u{CE}', '\u{CF}', '\u{D0}', '\u{D1}',
        '\u{D2}', '\u{D3}', '\u{D4}', '\u{D5}', '\u{D6}', '\u{D9}', '\u{DA}', '\u{DB}', '\u{DC}',
        '\u{DD}', '\u{DE}', '\u{B5}', '\u{D7}', '\u{F7}', '\u{A9}', '\u{A1}', '\u{A2}', '\u{A3}',
        '\u{2044}', '\u{A5}', '\u{192}', '\u{A7}', '\u{A4}', '\u{2019}', '\u{201C}', '\u{AB}',
        '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}', '\u{AE}', '\u{2013}', '\u{2020}',
        '\u{2021}', '\u{B7}', '\u{A6}', '\u{B6}', '\u{2022}', '\u{201A}', '\u{201E}', '\u{201D}',
        '\u{BB}', '\u{2026}', '\u{2030}', '\u{AC}', '\u{BF}', '\u{B9}', '\u{2CB}', '\u{B4}',
        '\u{2C6}', '\u{2DC}', '\u{AF}', '\u{2D8}', '\u{2D9}', '\u{A8}', '\u{B2}', '\u{2DA}',
        '\u{B8}', '\u{B3}', '\u{2DD}', '\u{2DB}', '\u{2C7}', '\u{2014}', '\u{B1}', '\u{BC}',
        '\u{BD}', '\u{BE}', '\u{E0}', '\u{E1}', '\u{E2}', '\u{E3}', '\u{E4}', '\u{E5}', '\u{E7}',
        '\u{E8}', '\u{E9}', '\u{EA}', '\u{EB}', '\u{EC}', '\u{C6}', '\u{ED}', '\u{AA}', '\u{EE}',
        '\u{EF}', '\u{F0}', '\u{F1}', '\u{141}', '\u{D8}', '\u{152}', '\u{BA}', '\u{F2}', '\u{F3}',
        '\u{F4}', '\u{F5}', '\u{F6}', '\u{E6}', '\u{F9}', '\u{FA}', '\u{FB}', '\u{131}', '\u{FC}',
        '\u{FD}', '\u{142}', '\u{F8}', '\u{153}', '\u{DF}', '\u{FE}', '\u{FF}', '\u{FFFD}',
        '\u{FFFD}',
    ];

    match c as u32 {
        code @ 0x80..=0xff => NEXT_UNICODE_MAPPING[(code - 0x80) as usize],
        _ => c,
    }
}
369
#[cfg(test)]
mod tests {
    use std::{fs::File, io::Cursor};

    use super::*;
    use crate::stream::Event::*;

    /// Collects every item the reader yields for `reader`, errors included.
    fn read_all<R: Read>(reader: R) -> Vec<Result<OwnedEvent, Error>> {
        AsciiReader::new(reader).collect()
    }

    /// Collects the events the reader yields for `reader`, panicking on the
    /// first error.
    fn read_events<R: Read>(reader: R) -> Vec<OwnedEvent> {
        AsciiReader::new(reader)
            .map(|event| event.unwrap())
            .collect()
    }

    /// An empty input yields no event at all.
    #[test]
    fn empty_test() {
        let events = read_events(Cursor::new("".as_bytes()));
        assert_eq!(events, &[]);
    }

    /// The sample fixture is turned into the expected event stream.
    #[test]
    fn streaming_sample() {
        let fixture = File::open("./tests/data/ascii-sample.plist").unwrap();
        let events = read_events(fixture);

        let expected = &[
            StartDictionary(None),
            String("KeyName1".into()),
            String("Value1".into()),
            String("AnotherKeyName".into()),
            String("Value2".into()),
            String("Something".into()),
            StartArray(None),
            String("ArrayItem1".into()),
            String("ArrayItem2".into()),
            String("ArrayItem3".into()),
            EndCollection,
            String("Key4".into()),
            String("0.10".into()),
            String("KeyFive".into()),
            StartDictionary(None),
            String("Dictionary2Key1".into()),
            String("Something".into()),
            String("AnotherKey".into()),
            String("Somethingelse".into()),
            EndCollection,
            EndCollection,
        ];

        assert_eq!(events, expected);
    }

    /// Unquoted strings may contain multi-byte UTF-8 characters.
    #[test]
    fn utf8_strings() {
        let input = "{ names = (Léa, François, Żaklina, 王芳); }";
        let events = read_events(Cursor::new(input.as_bytes()));

        let expected = &[
            StartDictionary(None),
            String("names".into()),
            StartArray(None),
            String("Léa".into()),
            String("François".into()),
            String("Żaklina".into()),
            String("王芳".into()),
            EndCollection,
            EndCollection,
        ];

        assert_eq!(events, expected);
    }

    /// Truncated, unpaired and lowercase utf-16 escapes are rejected.
    #[test]
    fn invalid_utf16_escapes() {
        let input = br#"{
            key1 = "\U123";
            key2 = "\UD83D";
            key3 = "\u0080";
        }"#;
        let items = read_all(Cursor::new(input));

        // key1's value
        assert!(items[2].is_err());
        // key2's value
        assert!(items[4].is_err());
        // key3's value
        assert!(items[6].is_err());
    }

    /// Octal escapes need exactly three digits.
    #[test]
    fn invalid_octal_escapes() {
        let input = br#"{
            key1 = "\1";
            key2 = "\12";
        }"#;
        let items = read_all(Cursor::new(input));

        // key1's value
        assert!(items[2].is_err());
        // key2's value
        assert!(items[4].is_err());
    }

    /// Every supported escape sequence decodes to the expected characters.
    #[test]
    fn escaped_sequences_in_strings() {
        let input = br#"{
            key1 = "va\"lue";
            key2 = 'va"lue';
            key3 = "va\a\b\f\n\r\t\v\"\nlue";
            key4 = "a\012b";
            key5 = "\\UD83D\\UDCA9";
            key6 = "\UD83D\UDCA9";
            key7 = "\U0080";
            key8 = "\200\377";
        }"#;
        let events = read_events(Cursor::new(input));

        let expected = &[
            StartDictionary(None),
            String("key1".into()),
            String(r#"va"lue"#.into()),
            String("key2".into()),
            String(r#"va"lue"#.into()),
            String("key3".into()),
            String("va\u{7}\u{8}\u{c}\n\r\t\u{b}\"\nlue".into()),
            String("key4".into()),
            String("a\nb".into()),
            String("key5".into()),
            String("\\UD83D\\UDCA9".into()),
            String("key6".into()),
            String("💩".into()),
            String("key7".into()),
            String("\u{80}".into()),
            String("key8".into()),
            String("\u{a0}\u{fffd}".into()),
            EndCollection,
        ];

        assert_eq!(events, expected);
    }

    /// Unquoted literals that look like integers are reported as integers.
    #[test]
    fn integers_and_strings() {
        let input = "{ name = James, age = 42 }";
        let events = read_events(Cursor::new(input.as_bytes()));

        let expected = &[
            StartDictionary(None),
            String("name".into()),
            String("James".into()),
            String("age".into()),
            Integer(42.into()),
            EndCollection,
        ];

        assert_eq!(events, expected);
    }

    /// A real-world Xcode project file is read without any error.
    #[test]
    fn netnewswire_pbxproj() {
        let fixture = File::open("./tests/data/netnewswire.pbxproj").unwrap();

        // Ensure that we don't fail when reading the file
        let events = read_events(fixture);

        assert!(!events.is_empty());
    }
}
545}