// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package gzip implements reading and writing of gzip format compressed files,
// as specified in RFC 1952.
| 7 | package gzip |
| 8 | |
| 9 | import ( |
| 10 | "bufio" |
| 11 | "compress/gzip" |
| 12 | "encoding/binary" |
| 13 | "hash/crc32" |
| 14 | "io" |
| 15 | "time" |
| 16 | |
| 17 | "github.com/klauspost/compress/flate" |
| 18 | ) |
| 19 | |
// Fixed bytes that open every gzip member header.
const (
	gzipID1     = 0x1f // first magic byte
	gzipID2     = 0x8b // second magic byte
	gzipDeflate = 8    // compression method: deflate
)

// Bits of the header FLG byte, from least significant upwards.
const (
	flagText    = 1 << iota // bit 0
	flagHdrCrc              // bit 1: header CRC16 present
	flagExtra               // bit 2: extra field present
	flagName                // bit 3: file name present
	flagComment             // bit 4: comment present
)
| 30 | |
var (
	// ErrChecksum is returned when reading GZIP data that has an invalid checksum.
	// It is the same value as the standard library's gzip.ErrChecksum.
	ErrChecksum = gzip.ErrChecksum

	// ErrHeader is returned when reading GZIP data that has an invalid header.
	// It is the same value as the standard library's gzip.ErrHeader.
	ErrHeader = gzip.ErrHeader

	// le is shorthand for the little-endian byte order used to decode the
	// multi-byte integer fields of the gzip header and trailer.
	le = binary.LittleEndian
)
| 39 | |
// noEOF maps io.EOF to io.ErrUnexpectedEOF and passes every other error
// (including nil) through untouched. It is used where running out of input
// means the stream was truncated rather than finished.
func noEOF(err error) error {
	switch err {
	case io.EOF:
		return io.ErrUnexpectedEOF
	default:
		return err
	}
}
| 47 | |
// A Header holds the metadata that a gzip file stores about the compressed
// file. It is exposed as the fields of the Writer and Reader structs.
//
// Strings must be UTF-8 encoded and may only contain Unicode code points
// U+0001 through U+00FF, due to limitations of the GZIP file format.
type Header struct {
	// Comment is the free-form comment field.
	Comment string
	// Extra is the raw "extra data" field.
	Extra []byte
	// ModTime is the modification time.
	ModTime time.Time
	// Name is the file name.
	Name string
	// OS is the operating system type byte.
	OS byte
}
| 60 | |
| 61 | // A Reader is an io.Reader that can be read to retrieve |
| 62 | // uncompressed data from a gzip-format compressed file. |
| 63 | // |
| 64 | // In general, a gzip file can be a concatenation of gzip files, |
| 65 | // each with its own header. Reads from the Reader |
| 66 | // return the concatenation of the uncompressed data of each. |
| 67 | // Only the first header is recorded in the Reader fields. |
| 68 | // |
| 69 | // Gzip files store a length and checksum of the uncompressed data. |
| 70 | // The Reader will return a ErrChecksum when Read |
| 71 | // reaches the end of the uncompressed data if it does not |
| 72 | // have the expected length or checksum. Clients should treat data |
| 73 | // returned by Read as tentative until they receive the io.EOF |
| 74 | // marking the end of the data. |
| 75 | type Reader struct { |
| 76 | Header // valid after NewReader or Reader.Reset |
| 77 | r flate.Reader |
| 78 | br *bufio.Reader |
| 79 | decompressor io.ReadCloser |
| 80 | digest uint32 // CRC-32, IEEE polynomial (section 8) |
| 81 | size uint32 // Uncompressed size (section 2.3.1) |
| 82 | buf [512]byte |
| 83 | err error |
| 84 | multistream bool |
| 85 | } |
| 86 | |
| 87 | // NewReader creates a new Reader reading the given reader. |
| 88 | // If r does not also implement io.ByteReader, |
| 89 | // the decompressor may read more data than necessary from r. |
| 90 | // |
| 91 | // It is the caller's responsibility to call Close on the Reader when done. |
| 92 | // |
| 93 | // The Reader.Header fields will be valid in the Reader returned. |
| 94 | func NewReader(r io.Reader) (*Reader, error) { |
| 95 | z := new(Reader) |
| 96 | if err := z.Reset(r); err != nil { |
| 97 | return nil, err |
| 98 | } |
| 99 | return z, nil |
| 100 | } |
| 101 | |
| 102 | // Reset discards the Reader z's state and makes it equivalent to the |
| 103 | // result of its original state from NewReader, but reading from r instead. |
| 104 | // This permits reusing a Reader rather than allocating a new one. |
| 105 | func (z *Reader) Reset(r io.Reader) error { |
| 106 | *z = Reader{ |
| 107 | decompressor: z.decompressor, |
| 108 | multistream: true, |
| 109 | br: z.br, |
| 110 | } |
| 111 | if rr, ok := r.(flate.Reader); ok { |
| 112 | z.r = rr |
| 113 | } else { |
| 114 | // Reuse if we can. |
| 115 | if z.br != nil { |
| 116 | z.br.Reset(r) |
| 117 | } else { |
| 118 | z.br = bufio.NewReader(r) |
| 119 | } |
| 120 | z.r = z.br |
| 121 | } |
| 122 | z.Header, z.err = z.readHeader() |
| 123 | return z.err |
| 124 | } |
| 125 | |
| 126 | // Multistream controls whether the reader supports multistream files. |
| 127 | // |
| 128 | // If enabled (the default), the Reader expects the input to be a sequence |
| 129 | // of individually gzipped data streams, each with its own header and |
| 130 | // trailer, ending at EOF. The effect is that the concatenation of a sequence |
| 131 | // of gzipped files is treated as equivalent to the gzip of the concatenation |
| 132 | // of the sequence. This is standard behavior for gzip readers. |
| 133 | // |
| 134 | // Calling Multistream(false) disables this behavior; disabling the behavior |
| 135 | // can be useful when reading file formats that distinguish individual gzip |
| 136 | // data streams or mix gzip data streams with other data streams. |
| 137 | // In this mode, when the Reader reaches the end of the data stream, |
| 138 | // Read returns io.EOF. If the underlying reader implements io.ByteReader, |
| 139 | // it will be left positioned just after the gzip stream. |
| 140 | // To start the next stream, call z.Reset(r) followed by z.Multistream(false). |
| 141 | // If there is no next stream, z.Reset(r) will return io.EOF. |
| 142 | func (z *Reader) Multistream(ok bool) { |
| 143 | z.multistream = ok |
| 144 | } |
| 145 | |
| 146 | // readString reads a NUL-terminated string from z.r. |
| 147 | // It treats the bytes read as being encoded as ISO 8859-1 (Latin-1) and |
| 148 | // will output a string encoded using UTF-8. |
| 149 | // This method always updates z.digest with the data read. |
| 150 | func (z *Reader) readString() (string, error) { |
| 151 | var err error |
| 152 | needConv := false |
| 153 | for i := 0; ; i++ { |
| 154 | if i >= len(z.buf) { |
| 155 | return "", ErrHeader |
| 156 | } |
| 157 | z.buf[i], err = z.r.ReadByte() |
| 158 | if err != nil { |
| 159 | return "", err |
| 160 | } |
| 161 | if z.buf[i] > 0x7f { |
| 162 | needConv = true |
| 163 | } |
| 164 | if z.buf[i] == 0 { |
| 165 | // Digest covers the NUL terminator. |
| 166 | z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:i+1]) |
| 167 | |
| 168 | // Strings are ISO 8859-1, Latin-1 (RFC 1952, section 2.3.1). |
| 169 | if needConv { |
| 170 | s := make([]rune, 0, i) |
| 171 | for _, v := range z.buf[:i] { |
| 172 | s = append(s, rune(v)) |
| 173 | } |
| 174 | return string(s), nil |
| 175 | } |
| 176 | return string(z.buf[:i]), nil |
| 177 | } |
| 178 | } |
| 179 | } |
| 180 | |
| 181 | // readHeader reads the GZIP header according to section 2.3.1. |
| 182 | // This method does not set z.err. |
| 183 | func (z *Reader) readHeader() (hdr Header, err error) { |
| 184 | if _, err = io.ReadFull(z.r, z.buf[:10]); err != nil { |
| 185 | // RFC 1952, section 2.2, says the following: |
| 186 | // A gzip file consists of a series of "members" (compressed data sets). |
| 187 | // |
| 188 | // Other than this, the specification does not clarify whether a |
| 189 | // "series" is defined as "one or more" or "zero or more". To err on the |
| 190 | // side of caution, Go interprets this to mean "zero or more". |
| 191 | // Thus, it is okay to return io.EOF here. |
| 192 | return hdr, err |
| 193 | } |
| 194 | if z.buf[0] != gzipID1 || z.buf[1] != gzipID2 || z.buf[2] != gzipDeflate { |
| 195 | return hdr, ErrHeader |
| 196 | } |
| 197 | flg := z.buf[3] |
| 198 | hdr.ModTime = time.Unix(int64(le.Uint32(z.buf[4:8])), 0) |
| 199 | // z.buf[8] is XFL and is currently ignored. |
| 200 | hdr.OS = z.buf[9] |
| 201 | z.digest = crc32.ChecksumIEEE(z.buf[:10]) |
| 202 | |
| 203 | if flg&flagExtra != 0 { |
| 204 | if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil { |
| 205 | return hdr, noEOF(err) |
| 206 | } |
| 207 | z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:2]) |
| 208 | data := make([]byte, le.Uint16(z.buf[:2])) |
| 209 | if _, err = io.ReadFull(z.r, data); err != nil { |
| 210 | return hdr, noEOF(err) |
| 211 | } |
| 212 | z.digest = crc32.Update(z.digest, crc32.IEEETable, data) |
| 213 | hdr.Extra = data |
| 214 | } |
| 215 | |
| 216 | var s string |
| 217 | if flg&flagName != 0 { |
| 218 | if s, err = z.readString(); err != nil { |
| 219 | return hdr, err |
| 220 | } |
| 221 | hdr.Name = s |
| 222 | } |
| 223 | |
| 224 | if flg&flagComment != 0 { |
| 225 | if s, err = z.readString(); err != nil { |
| 226 | return hdr, err |
| 227 | } |
| 228 | hdr.Comment = s |
| 229 | } |
| 230 | |
| 231 | if flg&flagHdrCrc != 0 { |
| 232 | if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil { |
| 233 | return hdr, noEOF(err) |
| 234 | } |
| 235 | digest := le.Uint16(z.buf[:2]) |
| 236 | if digest != uint16(z.digest) { |
| 237 | return hdr, ErrHeader |
| 238 | } |
| 239 | } |
| 240 | |
| 241 | // Reserved FLG bits must be zero. |
| 242 | if flg>>5 != 0 { |
| 243 | return hdr, ErrHeader |
| 244 | } |
| 245 | |
| 246 | z.digest = 0 |
| 247 | if z.decompressor == nil { |
| 248 | z.decompressor = flate.NewReader(z.r) |
| 249 | } else { |
| 250 | z.decompressor.(flate.Resetter).Reset(z.r, nil) |
| 251 | } |
| 252 | return hdr, nil |
| 253 | } |
| 254 | |
| 255 | // Read implements io.Reader, reading uncompressed bytes from its underlying Reader. |
| 256 | func (z *Reader) Read(p []byte) (n int, err error) { |
| 257 | if z.err != nil { |
| 258 | return 0, z.err |
| 259 | } |
| 260 | |
| 261 | for n == 0 { |
| 262 | n, z.err = z.decompressor.Read(p) |
| 263 | z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n]) |
| 264 | z.size += uint32(n) |
| 265 | if z.err != io.EOF { |
| 266 | // In the normal case we return here. |
| 267 | return n, z.err |
| 268 | } |
| 269 | |
| 270 | // Finished file; check checksum and size. |
| 271 | if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil { |
| 272 | z.err = noEOF(err) |
| 273 | return n, z.err |
| 274 | } |
| 275 | digest := le.Uint32(z.buf[:4]) |
| 276 | size := le.Uint32(z.buf[4:8]) |
| 277 | if digest != z.digest || size != z.size { |
| 278 | z.err = ErrChecksum |
| 279 | return n, z.err |
| 280 | } |
| 281 | z.digest, z.size = 0, 0 |
| 282 | |
| 283 | // File is ok; check if there is another. |
| 284 | if !z.multistream { |
| 285 | return n, io.EOF |
| 286 | } |
| 287 | z.err = nil // Remove io.EOF |
| 288 | |
| 289 | if _, z.err = z.readHeader(); z.err != nil { |
| 290 | return n, z.err |
| 291 | } |
| 292 | } |
| 293 | |
| 294 | return n, nil |
| 295 | } |
| 296 | |
// crcer is the minimal checksum accumulator WriteTo needs: something that
// can be written to, queried for its current 32-bit sum, and cleared.
type crcer interface {
	io.Writer
	Reset()
	Sum32() uint32
}
| 302 | type crcUpdater struct { |
| 303 | z *Reader |
| 304 | } |
| 305 | |
| 306 | func (c *crcUpdater) Write(p []byte) (int, error) { |
| 307 | c.z.digest = crc32.Update(c.z.digest, crc32.IEEETable, p) |
| 308 | return len(p), nil |
| 309 | } |
| 310 | |
| 311 | func (c *crcUpdater) Sum32() uint32 { |
| 312 | return c.z.digest |
| 313 | } |
| 314 | |
| 315 | func (c *crcUpdater) Reset() { |
| 316 | c.z.digest = 0 |
| 317 | } |
| 318 | |
| 319 | // WriteTo support the io.WriteTo interface for io.Copy and friends. |
| 320 | func (z *Reader) WriteTo(w io.Writer) (int64, error) { |
| 321 | total := int64(0) |
| 322 | crcWriter := crcer(crc32.NewIEEE()) |
| 323 | if z.digest != 0 { |
| 324 | crcWriter = &crcUpdater{z: z} |
| 325 | } |
| 326 | for { |
| 327 | if z.err != nil { |
| 328 | if z.err == io.EOF { |
| 329 | return total, nil |
| 330 | } |
| 331 | return total, z.err |
| 332 | } |
| 333 | |
| 334 | // We write both to output and digest. |
| 335 | mw := io.MultiWriter(w, crcWriter) |
| 336 | n, err := z.decompressor.(io.WriterTo).WriteTo(mw) |
| 337 | total += n |
| 338 | z.size += uint32(n) |
| 339 | if err != nil { |
| 340 | z.err = err |
| 341 | return total, z.err |
| 342 | } |
| 343 | |
| 344 | // Finished file; check checksum + size. |
| 345 | if _, err := io.ReadFull(z.r, z.buf[0:8]); err != nil { |
| 346 | if err == io.EOF { |
| 347 | err = io.ErrUnexpectedEOF |
| 348 | } |
| 349 | z.err = err |
| 350 | return total, err |
| 351 | } |
| 352 | z.digest = crcWriter.Sum32() |
| 353 | digest := le.Uint32(z.buf[:4]) |
| 354 | size := le.Uint32(z.buf[4:8]) |
| 355 | if digest != z.digest || size != z.size { |
| 356 | z.err = ErrChecksum |
| 357 | return total, z.err |
| 358 | } |
| 359 | z.digest, z.size = 0, 0 |
| 360 | |
| 361 | // File is ok; check if there is another. |
| 362 | if !z.multistream { |
| 363 | return total, nil |
| 364 | } |
| 365 | crcWriter.Reset() |
| 366 | z.err = nil // Remove io.EOF |
| 367 | |
| 368 | if _, z.err = z.readHeader(); z.err != nil { |
| 369 | if z.err == io.EOF { |
| 370 | return total, nil |
| 371 | } |
| 372 | return total, z.err |
| 373 | } |
| 374 | } |
| 375 | } |
| 376 | |
| 377 | // Close closes the Reader. It does not close the underlying io.Reader. |
| 378 | // In order for the GZIP checksum to be verified, the reader must be |
| 379 | // fully consumed until the io.EOF. |
| 380 | func (z *Reader) Close() error { return z.decompressor.Close() } |