| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 1 | // Copyright 2009 The Go Authors. All rights reserved. |
| 2 | // Use of this source code is governed by a BSD-style |
| 3 | // license that can be found in the LICENSE file. |
| 4 | |
| 5 | package flate |
| 6 | |
| 7 | import ( |
| 8 | "bytes" |
| 9 | "encoding/binary" |
| 10 | "fmt" |
| 11 | "io" |
| 12 | "math" |
| 13 | ) |
| 14 | |
| 15 | const ( |
| 16 | // bits 0-16 xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits |
| 17 | // bits 16-22 offsetcode - 5 bits |
| 18 | // bits 22-30 xlength = length - MIN_MATCH_LENGTH - 8 bits |
| 19 | // bits 30-32 type 0 = literal 1=EOF 2=Match 3=Unused - 2 bits |
| 20 | lengthShift = 22 |
| 21 | offsetMask = 1<<lengthShift - 1 |
| 22 | typeMask = 3 << 30 |
| 23 | literalType = 0 << 30 |
| 24 | matchType = 1 << 30 |
| 25 | matchOffsetOnlyMask = 0xffff |
| 26 | ) |
| 27 | |
| 28 | // The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH) |
| 29 | // is lengthCodes[length - MIN_MATCH_LENGTH] |
| 30 | var lengthCodes = [256]uint8{ |
| 31 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, |
| 32 | 9, 9, 10, 10, 11, 11, 12, 12, 12, 12, |
| 33 | 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, |
| 34 | 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, |
| 35 | 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, |
| 36 | 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, |
| 37 | 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, |
| 38 | 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, |
| 39 | 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, |
| 40 | 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, |
| 41 | 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, |
| 42 | 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, |
| 43 | 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, |
| 44 | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
| 45 | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
| 46 | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
| 47 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
| 48 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
| 49 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
| 50 | 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, |
| 51 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, |
| 52 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, |
| 53 | 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, |
| 54 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| 55 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| 56 | 27, 27, 27, 27, 27, 28, |
| 57 | } |
| 58 | |
| 59 | // lengthCodes1 is length codes, but starting at 1. |
| 60 | var lengthCodes1 = [256]uint8{ |
| 61 | 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, |
| 62 | 10, 10, 11, 11, 12, 12, 13, 13, 13, 13, |
| 63 | 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, |
| 64 | 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, |
| 65 | 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, |
| 66 | 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, |
| 67 | 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, |
| 68 | 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, |
| 69 | 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, |
| 70 | 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, |
| 71 | 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, |
| 72 | 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, |
| 73 | 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, |
| 74 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
| 75 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
| 76 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
| 77 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, |
| 78 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, |
| 79 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, |
| 80 | 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, |
| 81 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| 82 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| 83 | 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, |
| 84 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, |
| 85 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, |
| 86 | 28, 28, 28, 28, 28, 29, |
| 87 | } |
| 88 | |
| 89 | var offsetCodes = [256]uint32{ |
| 90 | 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, |
| 91 | 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, |
| 92 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
| 93 | 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, |
| 94 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, |
| 95 | 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, |
| 96 | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, |
| 97 | 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, |
| 98 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
| 99 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
| 100 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
| 101 | 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, |
| 102 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
| 103 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
| 104 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
| 105 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
| 106 | } |
| 107 | |
| 108 | // offsetCodes14 are offsetCodes, but with 14 added. |
| 109 | var offsetCodes14 = [256]uint32{ |
| 110 | 14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, |
| 111 | 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, |
| 112 | 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, |
| 113 | 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
| 114 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, |
| 115 | 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, |
| 116 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| 117 | 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, |
| 118 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, |
| 119 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, |
| 120 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, |
| 121 | 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, |
| 122 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, |
| 123 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, |
| 124 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, |
| 125 | 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, |
| 126 | } |
| 127 | |
| 128 | type token uint32 |
| 129 | |
| 130 | type tokens struct { |
| 131 | extraHist [32]uint16 // codes 256->maxnumlit |
| 132 | offHist [32]uint16 // offset codes |
| 133 | litHist [256]uint16 // codes 0->255 |
| 134 | nFilled int |
| 135 | n uint16 // Must be able to contain maxStoreBlockSize |
| 136 | tokens [maxStoreBlockSize + 1]token |
| 137 | } |
| 138 | |
| 139 | func (t *tokens) Reset() { |
| 140 | if t.n == 0 { |
| 141 | return |
| 142 | } |
| 143 | t.n = 0 |
| 144 | t.nFilled = 0 |
| 145 | for i := range t.litHist[:] { |
| 146 | t.litHist[i] = 0 |
| 147 | } |
| 148 | for i := range t.extraHist[:] { |
| 149 | t.extraHist[i] = 0 |
| 150 | } |
| 151 | for i := range t.offHist[:] { |
| 152 | t.offHist[i] = 0 |
| 153 | } |
| 154 | } |
| 155 | |
| 156 | func (t *tokens) Fill() { |
| 157 | if t.n == 0 { |
| 158 | return |
| 159 | } |
| 160 | for i, v := range t.litHist[:] { |
| 161 | if v == 0 { |
| 162 | t.litHist[i] = 1 |
| 163 | t.nFilled++ |
| 164 | } |
| 165 | } |
| 166 | for i, v := range t.extraHist[:literalCount-256] { |
| 167 | if v == 0 { |
| 168 | t.nFilled++ |
| 169 | t.extraHist[i] = 1 |
| 170 | } |
| 171 | } |
| 172 | for i, v := range t.offHist[:offsetCodeCount] { |
| 173 | if v == 0 { |
| 174 | t.offHist[i] = 1 |
| 175 | } |
| 176 | } |
| 177 | } |
| 178 | |
| 179 | func indexTokens(in []token) tokens { |
| 180 | var t tokens |
| 181 | t.indexTokens(in) |
| 182 | return t |
| 183 | } |
| 184 | |
| 185 | func (t *tokens) indexTokens(in []token) { |
| 186 | t.Reset() |
| 187 | for _, tok := range in { |
| 188 | if tok < matchType { |
| 189 | t.AddLiteral(tok.literal()) |
| 190 | continue |
| 191 | } |
| 192 | t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask) |
| 193 | } |
| 194 | } |
| 195 | |
| 196 | // emitLiteral writes a literal chunk and returns the number of bytes written. |
| 197 | func emitLiteral(dst *tokens, lit []byte) { |
| 198 | for _, v := range lit { |
| 199 | dst.tokens[dst.n] = token(v) |
| 200 | dst.litHist[v]++ |
| 201 | dst.n++ |
| 202 | } |
| 203 | } |
| 204 | |
| 205 | func (t *tokens) AddLiteral(lit byte) { |
| 206 | t.tokens[t.n] = token(lit) |
| 207 | t.litHist[lit]++ |
| 208 | t.n++ |
| 209 | } |
| 210 | |
| 211 | // from https://stackoverflow.com/a/28730362 |
| 212 | func mFastLog2(val float32) float32 { |
| 213 | ux := int32(math.Float32bits(val)) |
| 214 | log2 := (float32)(((ux >> 23) & 255) - 128) |
| 215 | ux &= -0x7f800001 |
| 216 | ux += 127 << 23 |
| 217 | uval := math.Float32frombits(uint32(ux)) |
| 218 | log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759 |
| 219 | return log2 |
| 220 | } |
| 221 | |
| 222 | // EstimatedBits will return an minimum size estimated by an *optimal* |
| 223 | // compression of the block. |
| 224 | // The size of the block |
| 225 | func (t *tokens) EstimatedBits() int { |
| 226 | shannon := float32(0) |
| 227 | bits := int(0) |
| 228 | nMatches := 0 |
| 229 | total := int(t.n) + t.nFilled |
| 230 | if total > 0 { |
| 231 | invTotal := 1.0 / float32(total) |
| 232 | for _, v := range t.litHist[:] { |
| 233 | if v > 0 { |
| 234 | n := float32(v) |
| 235 | shannon += atLeastOne(-mFastLog2(n*invTotal)) * n |
| 236 | } |
| 237 | } |
| 238 | // Just add 15 for EOB |
| 239 | shannon += 15 |
| 240 | for i, v := range t.extraHist[1 : literalCount-256] { |
| 241 | if v > 0 { |
| 242 | n := float32(v) |
| 243 | shannon += atLeastOne(-mFastLog2(n*invTotal)) * n |
| 244 | bits += int(lengthExtraBits[i&31]) * int(v) |
| 245 | nMatches += int(v) |
| 246 | } |
| 247 | } |
| 248 | } |
| 249 | if nMatches > 0 { |
| 250 | invTotal := 1.0 / float32(nMatches) |
| 251 | for i, v := range t.offHist[:offsetCodeCount] { |
| 252 | if v > 0 { |
| 253 | n := float32(v) |
| 254 | shannon += atLeastOne(-mFastLog2(n*invTotal)) * n |
| 255 | bits += int(offsetExtraBits[i&31]) * int(v) |
| 256 | } |
| 257 | } |
| 258 | } |
| 259 | return int(shannon) + bits |
| 260 | } |
| 261 | |
| 262 | // AddMatch adds a match to the tokens. |
| 263 | // This function is very sensitive to inlining and right on the border. |
| 264 | func (t *tokens) AddMatch(xlength uint32, xoffset uint32) { |
| 265 | if debugDeflate { |
| 266 | if xlength >= maxMatchLength+baseMatchLength { |
| 267 | panic(fmt.Errorf("invalid length: %v", xlength)) |
| 268 | } |
| 269 | if xoffset >= maxMatchOffset+baseMatchOffset { |
| 270 | panic(fmt.Errorf("invalid offset: %v", xoffset)) |
| 271 | } |
| 272 | } |
| 273 | oCode := offsetCode(xoffset) |
| 274 | xoffset |= oCode << 16 |
| 275 | |
| 276 | t.extraHist[lengthCodes1[uint8(xlength)]]++ |
| 277 | t.offHist[oCode&31]++ |
| 278 | t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset) |
| 279 | t.n++ |
| 280 | } |
| 281 | |
| 282 | // AddMatchLong adds a match to the tokens, potentially longer than max match length. |
| 283 | // Length should NOT have the base subtracted, only offset should. |
| 284 | func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) { |
| 285 | if debugDeflate { |
| 286 | if xoffset >= maxMatchOffset+baseMatchOffset { |
| 287 | panic(fmt.Errorf("invalid offset: %v", xoffset)) |
| 288 | } |
| 289 | } |
| 290 | oc := offsetCode(xoffset) |
| 291 | xoffset |= oc << 16 |
| 292 | for xlength > 0 { |
| 293 | xl := xlength |
| 294 | if xl > 258 { |
| 295 | // We need to have at least baseMatchLength left over for next loop. |
| 296 | if xl > 258+baseMatchLength { |
| 297 | xl = 258 |
| 298 | } else { |
| 299 | xl = 258 - baseMatchLength |
| 300 | } |
| 301 | } |
| 302 | xlength -= xl |
| 303 | xl -= baseMatchLength |
| 304 | t.extraHist[lengthCodes1[uint8(xl)]]++ |
| 305 | t.offHist[oc&31]++ |
| 306 | t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset) |
| 307 | t.n++ |
| 308 | } |
| 309 | } |
| 310 | |
| 311 | func (t *tokens) AddEOB() { |
| 312 | t.tokens[t.n] = token(endBlockMarker) |
| 313 | t.extraHist[0]++ |
| 314 | t.n++ |
| 315 | } |
| 316 | |
| 317 | func (t *tokens) Slice() []token { |
| 318 | return t.tokens[:t.n] |
| 319 | } |
| 320 | |
| 321 | // VarInt returns the tokens as varint encoded bytes. |
| 322 | func (t *tokens) VarInt() []byte { |
| 323 | var b = make([]byte, binary.MaxVarintLen32*int(t.n)) |
| 324 | var off int |
| 325 | for _, v := range t.tokens[:t.n] { |
| 326 | off += binary.PutUvarint(b[off:], uint64(v)) |
| 327 | } |
| 328 | return b[:off] |
| 329 | } |
| 330 | |
| 331 | // FromVarInt restores t to the varint encoded tokens provided. |
| 332 | // Any data in t is removed. |
| 333 | func (t *tokens) FromVarInt(b []byte) error { |
| 334 | var buf = bytes.NewReader(b) |
| 335 | var toks []token |
| 336 | for { |
| 337 | r, err := binary.ReadUvarint(buf) |
| 338 | if err == io.EOF { |
| 339 | break |
| 340 | } |
| 341 | if err != nil { |
| 342 | return err |
| 343 | } |
| 344 | toks = append(toks, token(r)) |
| 345 | } |
| 346 | t.indexTokens(toks) |
| 347 | return nil |
| 348 | } |
| 349 | |
| 350 | // Returns the type of a token |
| 351 | func (t token) typ() uint32 { return uint32(t) & typeMask } |
| 352 | |
| 353 | // Returns the literal of a literal token |
| 354 | func (t token) literal() uint8 { return uint8(t) } |
| 355 | |
| 356 | // Returns the extra offset of a match token |
| 357 | func (t token) offset() uint32 { return uint32(t) & offsetMask } |
| 358 | |
| 359 | func (t token) length() uint8 { return uint8(t >> lengthShift) } |
| 360 | |
| 361 | // Convert length to code. |
| 362 | func lengthCode(len uint8) uint8 { return lengthCodes[len] } |
| 363 | |
| 364 | // Returns the offset code corresponding to a specific offset |
| 365 | func offsetCode(off uint32) uint32 { |
| 366 | if false { |
| 367 | if off < uint32(len(offsetCodes)) { |
| 368 | return offsetCodes[off&255] |
| 369 | } else if off>>7 < uint32(len(offsetCodes)) { |
| 370 | return offsetCodes[(off>>7)&255] + 14 |
| 371 | } else { |
| 372 | return offsetCodes[(off>>14)&255] + 28 |
| 373 | } |
| 374 | } |
| 375 | if off < uint32(len(offsetCodes)) { |
| 376 | return offsetCodes[uint8(off)] |
| 377 | } |
| 378 | return offsetCodes14[uint8(off>>7)] |
| 379 | } |