flate2/gz/
mod.rs

1use std::ffi::CString;
2use std::io::{BufRead, Error, ErrorKind, Read, Result, Write};
3use std::time;
4
5use crate::bufreader::BufReader;
6use crate::{Compression, Crc};
7
/// Header flag bit: a 2-byte header checksum follows the optional fields.
pub static FHCRC: u8 = 1 << 1;
/// Header flag bit: a length-prefixed `extra` field is present.
pub static FEXTRA: u8 = 1 << 2;
/// Header flag bit: a NUL-terminated file name is present.
pub static FNAME: u8 = 1 << 3;
/// Header flag bit: a NUL-terminated comment is present.
pub static FCOMMENT: u8 = 1 << 4;
/// Mask of the reserved flag bits (5, 6 and 7); the header parser rejects any
/// header in which one of them is set.
pub static FRESERVED: u8 = 1 << 5 | 1 << 6 | 1 << 7;
13
14pub mod bufread;
15pub mod read;
16pub mod write;
17
// Upper bound, in bytes, on the NUL-terminated filename and comment header
// fields. This is more than enough for reasonable use; `read_to_nul` fails
// once a field would grow past it, so a hostile stream cannot force
// unbounded buffering.
const MAX_HEADER_BUF: usize = 65535;
21
/// A structure representing the header of a gzip stream.
///
/// The header can contain metadata about the file that was compressed, if
/// present.
#[derive(PartialEq, Clone, Debug, Default)]
pub struct GzHeader {
    // Raw bytes of the optional `extra` field; stays `None` unless the
    // parser saw the FEXTRA flag.
    extra: Option<Vec<u8>>,
    // File name bytes without the terminating NUL; stays `None` unless the
    // parser saw the FNAME flag.
    filename: Option<Vec<u8>>,
    // Comment bytes without the terminating NUL; stays `None` unless the
    // parser saw the FCOMMENT flag.
    comment: Option<Vec<u8>>,
    // The OS byte copied verbatim from the fixed part of the header.
    operating_system: u8,
    // Modification time in seconds since the Unix epoch; 0 means "not set".
    mtime: u32,
}
34
35impl GzHeader {
36    /// Returns the `filename` field of this gzip stream's header, if present.
37    pub fn filename(&self) -> Option<&[u8]> {
38        self.filename.as_ref().map(|s| &s[..])
39    }
40
41    /// Returns the `extra` field of this gzip stream's header, if present.
42    pub fn extra(&self) -> Option<&[u8]> {
43        self.extra.as_ref().map(|s| &s[..])
44    }
45
46    /// Returns the `comment` field of this gzip stream's header, if present.
47    pub fn comment(&self) -> Option<&[u8]> {
48        self.comment.as_ref().map(|s| &s[..])
49    }
50
51    /// Returns the `operating_system` field of this gzip stream's header.
52    ///
53    /// There are predefined values for various operating systems.
54    /// 255 means that the value is unknown.
55    pub fn operating_system(&self) -> u8 {
56        self.operating_system
57    }
58
59    /// This gives the most recent modification time of the original file being compressed.
60    ///
61    /// The time is in Unix format, i.e., seconds since 00:00:00 GMT, Jan. 1, 1970.
62    /// (Note that this may cause problems for MS-DOS and other systems that use local
63    /// rather than Universal time.) If the compressed data did not come from a file,
64    /// `mtime` is set to the time at which compression started.
65    /// `mtime` = 0 means no time stamp is available.
66    ///
67    /// The usage of `mtime` is discouraged because of Year 2038 problem.
68    pub fn mtime(&self) -> u32 {
69        self.mtime
70    }
71
72    /// Returns the most recent modification time represented by a date-time type.
73    /// Returns `None` if the value of the underlying counter is 0,
74    /// indicating no time stamp is available.
75    ///
76    ///
77    /// The time is measured as seconds since 00:00:00 GMT, Jan. 1 1970.
78    /// See [`mtime`](#method.mtime) for more detail.
79    pub fn mtime_as_datetime(&self) -> Option<time::SystemTime> {
80        if self.mtime == 0 {
81            None
82        } else {
83            let duration = time::Duration::new(u64::from(self.mtime), 0);
84            let datetime = time::UNIX_EPOCH + duration;
85            Some(datetime)
86        }
87    }
88}
89
/// Resumable position of `GzHeaderParser` within a gzip header.
///
/// Variants are visited in declaration order. Each one carries whatever is
/// needed to continue after a failed read: the running header checksum
/// (`Some` only when the FHCRC flag was set) and, where a fixed number of
/// bytes is expected, how many have been read so far.
#[derive(Debug, Default)]
pub enum GzHeaderState {
    /// Reading the 10-byte fixed header: (bytes read so far, buffer).
    Start(u8, [u8; 10]),
    /// Reading the 2-byte length of the `extra` field: (crc, bytes read, buffer).
    Xlen(Option<Box<Crc>>, u8, [u8; 2]),
    /// Reading the body of the `extra` field: (crc, bytes read so far).
    Extra(Option<Box<Crc>>, u16),
    /// Reading the NUL-terminated file name.
    Filename(Option<Box<Crc>>),
    /// Reading the NUL-terminated comment.
    Comment(Option<Box<Crc>>),
    /// Reading the 2-byte stored header checksum: (crc, bytes read, buffer).
    Crc(Option<Box<Crc>>, u8, [u8; 2]),
    /// The whole header has been parsed.
    #[default]
    Complete,
}
101
/// Incremental parser for a gzip header.
///
/// `parse` may be called repeatedly; progress is kept in `state`, so a call
/// that failed part-way continues where the previous one stopped.
#[derive(Debug, Default)]
pub struct GzHeaderParser {
    // Where in the header the parser currently is.
    state: GzHeaderState,
    // The FLG byte of the header, recorded once the fixed part has been read.
    flags: u8,
    // The header being assembled; only handed out once `state` is `Complete`.
    header: GzHeader,
}
108
109impl GzHeaderParser {
110    fn new() -> Self {
111        GzHeaderParser {
112            state: GzHeaderState::Start(0, [0; 10]),
113            flags: 0,
114            header: GzHeader::default(),
115        }
116    }
117
118    fn parse<R: BufRead>(&mut self, r: &mut R) -> Result<()> {
119        loop {
120            match &mut self.state {
121                GzHeaderState::Start(count, buffer) => {
122                    while (*count as usize) < buffer.len() {
123                        *count += read_into(r, &mut buffer[*count as usize..])? as u8;
124                    }
125                    // Gzip identification bytes
126                    if buffer[0] != 0x1f || buffer[1] != 0x8b {
127                        return Err(bad_header());
128                    }
129                    // Gzip compression method (8 = deflate)
130                    if buffer[2] != 8 {
131                        return Err(bad_header());
132                    }
133                    self.flags = buffer[3];
134                    // RFC1952: "must give an error indication if any reserved bit is non-zero"
135                    if self.flags & FRESERVED != 0 {
136                        return Err(bad_header());
137                    }
138                    self.header.mtime = ((buffer[4] as u32) << 0)
139                        | ((buffer[5] as u32) << 8)
140                        | ((buffer[6] as u32) << 16)
141                        | ((buffer[7] as u32) << 24);
142                    let _xfl = buffer[8];
143                    self.header.operating_system = buffer[9];
144                    let crc = if self.flags & FHCRC != 0 {
145                        let mut crc = Box::new(Crc::new());
146                        crc.update(buffer);
147                        Some(crc)
148                    } else {
149                        None
150                    };
151                    self.state = GzHeaderState::Xlen(crc, 0, [0; 2]);
152                }
153                GzHeaderState::Xlen(crc, count, buffer) => {
154                    if self.flags & FEXTRA != 0 {
155                        while (*count as usize) < buffer.len() {
156                            *count += read_into(r, &mut buffer[*count as usize..])? as u8;
157                        }
158                        if let Some(crc) = crc {
159                            crc.update(buffer);
160                        }
161                        let xlen = parse_le_u16(buffer);
162                        self.header.extra = Some(vec![0; xlen as usize]);
163                        self.state = GzHeaderState::Extra(crc.take(), 0);
164                    } else {
165                        self.state = GzHeaderState::Filename(crc.take());
166                    }
167                }
168                GzHeaderState::Extra(crc, count) => {
169                    debug_assert!(self.header.extra.is_some());
170                    let extra = self.header.extra.as_mut().unwrap();
171                    while (*count as usize) < extra.len() {
172                        *count += read_into(r, &mut extra[*count as usize..])? as u16;
173                    }
174                    if let Some(crc) = crc {
175                        crc.update(extra);
176                    }
177                    self.state = GzHeaderState::Filename(crc.take());
178                }
179                GzHeaderState::Filename(crc) => {
180                    if self.flags & FNAME != 0 {
181                        let filename = self.header.filename.get_or_insert_with(Vec::new);
182                        read_to_nul(r, filename)?;
183                        if let Some(crc) = crc {
184                            crc.update(filename);
185                            crc.update(b"\0");
186                        }
187                    }
188                    self.state = GzHeaderState::Comment(crc.take());
189                }
190                GzHeaderState::Comment(crc) => {
191                    if self.flags & FCOMMENT != 0 {
192                        let comment = self.header.comment.get_or_insert_with(Vec::new);
193                        read_to_nul(r, comment)?;
194                        if let Some(crc) = crc {
195                            crc.update(comment);
196                            crc.update(b"\0");
197                        }
198                    }
199                    self.state = GzHeaderState::Crc(crc.take(), 0, [0; 2]);
200                }
201                GzHeaderState::Crc(crc, count, buffer) => {
202                    if let Some(crc) = crc {
203                        debug_assert!(self.flags & FHCRC != 0);
204                        while (*count as usize) < buffer.len() {
205                            *count += read_into(r, &mut buffer[*count as usize..])? as u8;
206                        }
207                        let stored_crc = parse_le_u16(buffer);
208                        let calced_crc = crc.sum() as u16;
209                        if stored_crc != calced_crc {
210                            return Err(corrupt());
211                        }
212                    }
213                    self.state = GzHeaderState::Complete;
214                }
215                GzHeaderState::Complete => {
216                    return Ok(());
217                }
218            }
219        }
220    }
221
222    fn header(&self) -> Option<&GzHeader> {
223        match self.state {
224            GzHeaderState::Complete => Some(&self.header),
225            _ => None,
226        }
227    }
228}
229
230impl From<GzHeaderParser> for GzHeader {
231    fn from(parser: GzHeaderParser) -> Self {
232        debug_assert!(matches!(parser.state, GzHeaderState::Complete));
233        parser.header
234    }
235}
236
// Perform a single `read` from `r` into `buffer` and report how many bytes
// arrived. Hitting EOF (a zero-length read) is an `UnexpectedEof` error,
// because callers only ask for bytes they still need. Unlike `read`, a
// result of Ok(0) does not mean EOF: it means the read was interrupted and
// the caller should simply try again.
fn read_into<R: Read>(r: &mut R, buffer: &mut [u8]) -> Result<usize> {
    debug_assert!(!buffer.is_empty());
    match r.read(buffer) {
        Err(err) => match err.kind() {
            ErrorKind::Interrupted => Ok(0),
            _ => Err(err),
        },
        Ok(0) => Err(ErrorKind::UnexpectedEof.into()),
        Ok(read) => Ok(read),
    }
}
249
250// Read `r` up to the first nul byte, pushing non-nul bytes to `buffer`.
251fn read_to_nul<R: BufRead>(r: &mut R, buffer: &mut Vec<u8>) -> Result<()> {
252    let mut bytes = r.bytes();
253    loop {
254        match bytes.next().transpose()? {
255            Some(0) => return Ok(()),
256            Some(_) if buffer.len() == MAX_HEADER_BUF => {
257                return Err(Error::new(
258                    ErrorKind::InvalidInput,
259                    "gzip header field too long",
260                ));
261            }
262            Some(byte) => {
263                buffer.push(byte);
264            }
265            None => {
266                return Err(ErrorKind::UnexpectedEof.into());
267            }
268        }
269    }
270}
271
// Decode two bytes as a little-endian `u16` (least significant byte first).
fn parse_le_u16(buffer: &[u8; 2]) -> u16 {
    let [low, high] = *buffer;
    u16::from(low) | (u16::from(high) << 8)
}
275
// Error returned when the fixed part of a gzip header fails validation.
fn bad_header() -> Error {
    let message = "invalid gzip header";
    Error::new(ErrorKind::InvalidInput, message)
}
279
// Error returned when a stored checksum does not match the computed one.
fn corrupt() -> Error {
    let message = "corrupt gzip stream does not have a matching checksum";
    Error::new(ErrorKind::InvalidInput, message)
}
286
/// A builder structure to create a new gzip Encoder.
///
/// This structure controls header configuration options such as the filename.
///
/// # Examples
///
/// ```
/// use std::io::prelude::*;
/// # use std::io;
/// use std::fs::File;
/// use flate2::GzBuilder;
/// use flate2::Compression;
///
/// // GzBuilder opens a file and writes a sample string using GzBuilder pattern
///
/// # fn sample_builder() -> Result<(), io::Error> {
/// let f = File::create("examples/hello_world.gz")?;
/// let mut gz = GzBuilder::new()
///                 .filename("hello_world.txt")
///                 .comment("test file, please delete")
///                 .write(f, Compression::default());
/// gz.write_all(b"hello world")?;
/// gz.finish()?;
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Default)]
pub struct GzBuilder {
    // Payload of the optional `extra` header field; `None` omits the field.
    extra: Option<Vec<u8>>,
    // Optional file name; a `CString` so it cannot contain an interior nul.
    filename: Option<CString>,
    // Optional comment; a `CString` so it cannot contain an interior nul.
    comment: Option<CString>,
    // OS byte to write; `None` makes the header carry 255.
    operating_system: Option<u8>,
    // Modification time written to the header; defaults to 0.
    mtime: u32,
}
321
322impl GzBuilder {
323    /// Create a new blank builder with no header by default.
324    pub fn new() -> GzBuilder {
325        Self::default()
326    }
327
328    /// Configure the `mtime` field in the gzip header.
329    pub fn mtime(mut self, mtime: u32) -> GzBuilder {
330        self.mtime = mtime;
331        self
332    }
333
334    /// Configure the `operating_system` field in the gzip header.
335    pub fn operating_system(mut self, os: u8) -> GzBuilder {
336        self.operating_system = Some(os);
337        self
338    }
339
340    /// Configure the `extra` field in the gzip header.
341    pub fn extra<T: Into<Vec<u8>>>(mut self, extra: T) -> GzBuilder {
342        self.extra = Some(extra.into());
343        self
344    }
345
346    /// Configure the `filename` field in the gzip header.
347    ///
348    /// # Panics
349    ///
350    /// Panics if the `filename` slice contains a zero.
351    pub fn filename<T: Into<Vec<u8>>>(mut self, filename: T) -> GzBuilder {
352        self.filename = Some(CString::new(filename.into()).unwrap());
353        self
354    }
355
356    /// Configure the `comment` field in the gzip header.
357    ///
358    /// # Panics
359    ///
360    /// Panics if the `comment` slice contains a zero.
361    pub fn comment<T: Into<Vec<u8>>>(mut self, comment: T) -> GzBuilder {
362        self.comment = Some(CString::new(comment.into()).unwrap());
363        self
364    }
365
366    /// Consume this builder, creating a writer encoder in the process.
367    ///
368    /// The data written to the returned encoder will be compressed and then
369    /// written out to the supplied parameter `w`.
370    pub fn write<W: Write>(self, w: W, lvl: Compression) -> write::GzEncoder<W> {
371        write::gz_encoder(self.into_header(lvl), w, lvl)
372    }
373
374    /// Consume this builder, creating a reader encoder in the process.
375    ///
376    /// Data read from the returned encoder will be the compressed version of
377    /// the data read from the given reader.
378    pub fn read<R: Read>(self, r: R, lvl: Compression) -> read::GzEncoder<R> {
379        read::gz_encoder(self.buf_read(BufReader::new(r), lvl))
380    }
381
382    /// Consume this builder, creating a reader encoder in the process.
383    ///
384    /// Data read from the returned encoder will be the compressed version of
385    /// the data read from the given reader.
386    pub fn buf_read<R>(self, r: R, lvl: Compression) -> bufread::GzEncoder<R>
387    where
388        R: BufRead,
389    {
390        bufread::gz_encoder(self.into_header(lvl), r, lvl)
391    }
392
393    fn into_header(self, lvl: Compression) -> Vec<u8> {
394        let GzBuilder {
395            extra,
396            filename,
397            comment,
398            operating_system,
399            mtime,
400        } = self;
401        let mut flg = 0;
402        let mut header = vec![0u8; 10];
403        if let Some(v) = extra {
404            flg |= FEXTRA;
405            header.push((v.len() >> 0) as u8);
406            header.push((v.len() >> 8) as u8);
407            header.extend(v);
408        }
409        if let Some(filename) = filename {
410            flg |= FNAME;
411            header.extend(filename.as_bytes_with_nul().iter().copied());
412        }
413        if let Some(comment) = comment {
414            flg |= FCOMMENT;
415            header.extend(comment.as_bytes_with_nul().iter().copied());
416        }
417        header[0] = 0x1f;
418        header[1] = 0x8b;
419        header[2] = 8;
420        header[3] = flg;
421        header[4] = (mtime >> 0) as u8;
422        header[5] = (mtime >> 8) as u8;
423        header[6] = (mtime >> 16) as u8;
424        header[7] = (mtime >> 24) as u8;
425        header[8] = if lvl.0 >= Compression::best().0 {
426            2
427        } else if lvl.0 <= Compression::fast().0 {
428            4
429        } else {
430            0
431        };
432
433        // Typically this byte indicates what OS the gz stream was created on,
434        // but in an effort to have cross-platform reproducible streams just
435        // default this value to 255. I'm not sure that if we "correctly" set
436        // this it'd do anything anyway...
437        header[9] = operating_system.unwrap_or(255);
438        header
439    }
440}
441
// Unit tests for the gzip header builder/parser plus round trips through the
// reader- and writer-based encoders and decoders.
#[cfg(test)]
mod tests {
    use std::io::prelude::*;

    use super::{read, write, GzBuilder, GzHeaderParser};
    use crate::{Compression, GzHeader};
    use rand::{rng, Rng};

    // Compress a short string with the write encoder and decode it again
    // with the read decoder.
    #[test]
    fn roundtrip() {
        let mut e = write::GzEncoder::new(Vec::new(), Compression::default());
        e.write_all(b"foo bar baz").unwrap();
        let inner = e.finish().unwrap();
        let mut d = read::GzDecoder::new(&inner[..]);
        let mut s = String::new();
        d.read_to_string(&mut s).unwrap();
        assert_eq!(s, "foo bar baz");
    }

    // An encoder that is finished without any writes must decode to an
    // empty string.
    #[test]
    fn roundtrip_zero() {
        let e = write::GzEncoder::new(Vec::new(), Compression::default());
        let inner = e.finish().unwrap();
        let mut d = read::GzDecoder::new(&inner[..]);
        let mut s = String::new();
        d.read_to_string(&mut s).unwrap();
        assert_eq!(s, "");
    }

    // Feed the write encoder 200 randomly sized prefixes of a 1 KiB random
    // buffer and check the decoded output against what was written.
    #[test]
    fn roundtrip_big() {
        let mut real = Vec::new();
        let mut w = write::GzEncoder::new(Vec::new(), Compression::default());
        let v = crate::random_bytes().take(1024).collect::<Vec<_>>();
        for _ in 0..200 {
            let to_write = &v[..rng().random_range(0..v.len())];
            real.extend(to_write.iter().copied());
            w.write_all(to_write).unwrap();
        }
        let result = w.finish().unwrap();
        let mut r = read::GzDecoder::new(&result[..]);
        let mut v = Vec::new();
        r.read_to_end(&mut v).unwrap();
        assert_eq!(v, real);
    }

    // Chain the read encoder directly into the read decoder over 1 MiB of
    // random data.
    #[test]
    fn roundtrip_big2() {
        let v = crate::random_bytes().take(1024 * 1024).collect::<Vec<_>>();
        let mut r = read::GzDecoder::new(read::GzEncoder::new(&v[..], Compression::default()));
        let mut res = Vec::new();
        r.read_to_end(&mut res).unwrap();
        assert_eq!(res, v);
    }

    // A Rust implementation of CRC that closely matches the C code in RFC1952.
    // Only use this to create CRCs for tests.
    struct Rfc1952Crc {
        /* Table of CRCs of all 8-bit messages. */
        crc_table: [u32; 256],
    }

    impl Rfc1952Crc {
        fn new() -> Self {
            let mut crc = Rfc1952Crc {
                crc_table: [0; 256],
            };
            /* Make the table for a fast CRC. */
            for n in 0usize..256 {
                let mut c = n as u32;
                for _k in 0..8 {
                    if c & 1 != 0 {
                        c = 0xedb88320 ^ (c >> 1);
                    } else {
                        c = c >> 1;
                    }
                }
                crc.crc_table[n] = c;
            }
            crc
        }

        /*
         Update a running crc with the bytes buf and return
         the updated crc. The crc should be initialized to zero. Pre- and
         post-conditioning (one's complement) is performed within this
         function so it shouldn't be done by the caller.
        */
        fn update_crc(&self, crc: u32, buf: &[u8]) -> u32 {
            let mut c = crc ^ 0xffffffff;

            for b in buf {
                c = self.crc_table[(c as u8 ^ *b) as usize] ^ (c >> 8);
            }
            c ^ 0xffffffff
        }

        /* Return the CRC of the bytes buf. */
        fn crc(&self, buf: &[u8]) -> u32 {
            self.update_crc(0, buf)
        }
    }

    // Build a header with the builder, attach a header checksum by hand and
    // check that the parser recovers every field.
    #[test]
    fn roundtrip_header() {
        let mut header = GzBuilder::new()
            .mtime(1234)
            .operating_system(57)
            .filename("filename")
            .comment("comment")
            .into_header(Compression::fast());

        // Add a CRC to the header
        // (`into_header` never sets FHCRC, so the XOR switches the bit on; the
        // stored value is the low 16 bits of the CRC over the header bytes.)
        header[3] = header[3] ^ super::FHCRC;
        let rfc1952_crc = Rfc1952Crc::new();
        let crc32 = rfc1952_crc.crc(&header);
        let crc16 = crc32 as u16;
        header.extend(&crc16.to_le_bytes());

        let mut parser = GzHeaderParser::new();
        parser.parse(&mut header.as_slice()).unwrap();
        let actual = parser.header().unwrap();
        assert_eq!(
            actual,
            &GzHeader {
                extra: None,
                filename: Some("filename".as_bytes().to_vec()),
                comment: Some("comment".as_bytes().to_vec()),
                operating_system: 57,
                mtime: 1234
            }
        )
    }

    // Header fields configured on the builder must be visible on the decoder
    // before any payload has been read.
    #[test]
    fn fields() {
        let r = vec![0, 2, 4, 6];
        let e = GzBuilder::new()
            .filename("foo.rs")
            .comment("bar")
            .extra(vec![0, 1, 2, 3])
            .read(&r[..], Compression::default());
        let mut d = read::GzDecoder::new(e);
        assert_eq!(d.header().unwrap().filename(), Some(&b"foo.rs"[..]));
        assert_eq!(d.header().unwrap().comment(), Some(&b"bar"[..]));
        assert_eq!(d.header().unwrap().extra(), Some(&b"\x00\x01\x02\x03"[..]));
        let mut res = Vec::new();
        d.read_to_end(&mut res).unwrap();
        assert_eq!(res, vec![0, 2, 4, 6]);
    }

    // Reading again after the stream has ended must succeed and add nothing.
    #[test]
    fn keep_reading_after_end() {
        let mut e = write::GzEncoder::new(Vec::new(), Compression::default());
        e.write_all(b"foo bar baz").unwrap();
        let inner = e.finish().unwrap();
        let mut d = read::GzDecoder::new(&inner[..]);
        let mut s = String::new();
        d.read_to_string(&mut s).unwrap();
        assert_eq!(s, "foo bar baz");
        d.read_to_string(&mut s).unwrap();
        assert_eq!(s, "foo bar baz");
    }

    // Property test: arbitrary byte vectors survive the read encoder followed
    // by the read decoder.
    #[test]
    fn qc_reader() {
        ::quickcheck::quickcheck(test as fn(_) -> _);

        fn test(v: Vec<u8>) -> bool {
            let r = read::GzEncoder::new(&v[..], Compression::default());
            let mut r = read::GzDecoder::new(r);
            let mut v2 = Vec::new();
            r.read_to_end(&mut v2).unwrap();
            v == v2
        }
    }

    // Flushing the write encoder after a write must not fail.
    #[test]
    fn flush_after_write() {
        let mut f = write::GzEncoder::new(Vec::new(), Compression::default());
        write!(f, "Hello world").unwrap();
        f.flush().unwrap();
    }
}