package flate

import (
	"io"
	"math"
	"sync"

	"github.com/klauspost/compress/internal/le"
)

const (
	maxStatelessBlock = math.MaxInt16
	// dictionary will be taken from maxStatelessBlock, so limit it.
	maxStatelessDict = 8 << 10

	slTableBits  = 13
	slTableSize  = 1 << slTableBits
	slTableShift = 32 - slTableBits
)

type statelessWriter struct {
	dst    io.Writer
	closed bool
}

func (s *statelessWriter) Close() error {
	if s.closed {
		return nil
	}
	s.closed = true
	// Emit EOF block
	return StatelessDeflate(s.dst, nil, true, nil)
}

func (s *statelessWriter) Write(p []byte) (n int, err error) {
	err = StatelessDeflate(s.dst, p, false, nil)
	if err != nil {
		return 0, err
	}
	return len(p), nil
}

func (s *statelessWriter) Reset(w io.Writer) {
	s.dst = w
	s.closed = false
}

// NewStatelessWriter will do compression without maintaining any state
// between Write calls.
// No memory is kept between Write calls,
// so compression ratio and speed will be suboptimal.
// Because of this, the size of actual Write calls will affect output size.
func NewStatelessWriter(dst io.Writer) io.WriteCloser {
	return &statelessWriter{dst: dst}
}
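
// The sketch below is not part of the original file; it is a hypothetical
// usage example showing how data might be streamed through a stateless
// writer. The function name and the caller-provided chunking are assumptions;
// only NewStatelessWriter, Write and Close come from this package.
func exampleStatelessWriter(dst io.Writer, chunks [][]byte) error {
	w := NewStatelessWriter(dst)
	for _, c := range chunks {
		// Each Write is compressed as an independent block, so larger
		// chunks generally compress better than many small ones.
		if _, err := w.Write(c); err != nil {
			return err
		}
	}
	// Close emits the final EOF block and must be called to terminate
	// the stream.
	return w.Close()
}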

// bitWriterPool contains bit writers that can be reused.
var bitWriterPool = sync.Pool{
	New: func() interface{} {
		return newHuffmanBitWriter(nil)
	},
}

// StatelessDeflate allows compressing directly to a Writer without retaining state.
// When returning, everything will be flushed.
// Up to 8KB of an optional dictionary can be given, which is presumed to precede the block.
// Longer dictionaries will be truncated and will still produce valid output.
// Sending a nil dictionary is perfectly fine.
func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
	var dst tokens
	bw := bitWriterPool.Get().(*huffmanBitWriter)
	bw.reset(out)
	defer func() {
		// don't keep a reference to our output
		bw.reset(nil)
		bitWriterPool.Put(bw)
	}()
	if eof && len(in) == 0 {
		// Just write an EOF block.
		// Could be faster...
		bw.writeStoredHeader(0, true)
		bw.flush()
		return bw.err
	}

	// Truncate dict
	if len(dict) > maxStatelessDict {
		dict = dict[len(dict)-maxStatelessDict:]
	}

	// For subsequent loops, keep shallow dict reference to avoid alloc+copy.
	var inDict []byte

	for len(in) > 0 {
		todo := in
		if len(inDict) > 0 {
			if len(todo) > maxStatelessBlock-maxStatelessDict {
				todo = todo[:maxStatelessBlock-maxStatelessDict]
			}
		} else if len(todo) > maxStatelessBlock-len(dict) {
			todo = todo[:maxStatelessBlock-len(dict)]
		}
		inOrg := in
		in = in[len(todo):]
		uncompressed := todo
		if len(dict) > 0 {
			// combine dict and source
			bufLen := len(todo) + len(dict)
			combined := make([]byte, bufLen)
			copy(combined, dict)
			copy(combined[len(dict):], todo)
			todo = combined
		}
		// Compress
		if len(inDict) == 0 {
			statelessEnc(&dst, todo, int16(len(dict)))
		} else {
			statelessEnc(&dst, inDict[:maxStatelessDict+len(todo)], maxStatelessDict)
		}
		isEof := eof && len(in) == 0

		if dst.n == 0 {
			bw.writeStoredHeader(len(uncompressed), isEof)
			if bw.err != nil {
				return bw.err
			}
			bw.writeBytes(uncompressed)
		} else if int(dst.n) > len(uncompressed)-len(uncompressed)>>4 {
			// If we removed less than 1/16th, Huffman compress the block.
			bw.writeBlockHuff(isEof, uncompressed, len(in) == 0)
		} else {
			bw.writeBlockDynamic(&dst, isEof, uncompressed, len(in) == 0)
		}
		if len(in) > 0 {
			// Retain a dict if we have more
			inDict = inOrg[len(uncompressed)-maxStatelessDict:]
			dict = nil
			dst.Reset()
		}
		if bw.err != nil {
			return bw.err
		}
	}
	if !eof {
		// Align, only a stored block can do that.
		bw.writeStoredHeader(0, false)
	}
	bw.flush()
	return bw.err
}
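
// The sketch below is not part of the original file; it is a hypothetical
// example of a one-shot call with a preceding dictionary. The function name
// and parameters are assumptions; StatelessDeflate itself is the package API.
func exampleStatelessDeflate(out io.Writer, payload, dict []byte) error {
	// eof=true flushes and terminates the stream in a single call.
	// dict may be nil; only its last 8 KiB (maxStatelessDict) is used.
	return StatelessDeflate(out, payload, true, dict)
}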

// hashSL returns a table index for the low 4 bytes of u,
// using a multiplicative hash.
func hashSL(u uint32) uint32 {
	return (u * 0x1e35a7bd) >> slTableShift
}

// load3216 loads a little-endian uint32 from b at offset i.
func load3216(b []byte, i int16) uint32 {
	return le.Load32(b, i)
}

// load6416 loads a little-endian uint64 from b at offset i.
func load6416(b []byte, i int16) uint64 {
	return le.Load64(b, i)
}

// statelessEnc encodes src[startAt:] into dst, using src[:startAt] as a
// preset dictionary that is indexed but never emitted.
// If the block is too small or no matches are found, dst.n is left at 0 and
// the caller falls back to a stored block (see StatelessDeflate).
func statelessEnc(dst *tokens, src []byte, startAt int16) {
	const (
		inputMargin            = 12 - 1
		minNonLiteralBlockSize = 1 + 1 + inputMargin
	)

	type tableEntry struct {
		offset int16
	}

	var table [slTableSize]tableEntry

	// This check isn't in the Snappy implementation, but there, the caller
	// instead of the callee handles this case.
	if len(src)-int(startAt) < minNonLiteralBlockSize {
		// We do not fill the token table.
		// This will be picked up by caller.
		dst.n = 0
		return
	}
	// Index until startAt
	if startAt > 0 {
		cv := load3232(src, 0)
		for i := int16(0); i < startAt; i++ {
			table[hashSL(cv)] = tableEntry{offset: i}
			cv = (cv >> 8) | (uint32(src[i+4]) << 24)
		}
	}

	s := startAt + 1
	nextEmit := startAt
	// sLimit is when to stop looking for offset/length copies. The inputMargin
	// lets us use a fast path for emitLiteral in the main loop, while we are
	// looking for copies.
	sLimit := int16(len(src) - inputMargin)

	// nextEmit is where in src the next emitLiteral should start from.
	cv := load3216(src, s)

	for {
		const skipLog = 5
		const doEvery = 2

		nextS := s
		var candidate tableEntry
		for {
			nextHash := hashSL(cv)
			candidate = table[nextHash]
			nextS = s + doEvery + (s-nextEmit)>>skipLog
			if nextS > sLimit || nextS <= 0 {
				goto emitRemainder
			}

			now := load6416(src, nextS)
			table[nextHash] = tableEntry{offset: s}
			nextHash = hashSL(uint32(now))

			if cv == load3216(src, candidate.offset) {
				table[nextHash] = tableEntry{offset: nextS}
				break
			}

			// Do one right away...
			cv = uint32(now)
			s = nextS
			nextS++
			candidate = table[nextHash]
			now >>= 8
			table[nextHash] = tableEntry{offset: s}

			if cv == load3216(src, candidate.offset) {
				table[nextHash] = tableEntry{offset: nextS}
				break
			}
			cv = uint32(now)
			s = nextS
		}

		// A 4-byte match has been found. We'll later see if more than 4 bytes
		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
		// them as literal bytes.
		for {
			// Invariant: we have a 4-byte match at s, and no need to emit any
			// literal bytes prior to s.

			// Extend the 4-byte match as long as possible.
			t := candidate.offset
			l := int16(matchLen(src[s+4:], src[t+4:]) + 4)

			// Extend backwards
			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
				s--
				t--
				l++
			}
			if nextEmit < s {
				if false {
					emitLiteral(dst, src[nextEmit:s])
				} else {
					for _, v := range src[nextEmit:s] {
						dst.tokens[dst.n] = token(v)
						dst.litHist[v]++
						dst.n++
					}
				}
			}

			// Save the match found
			dst.AddMatchLong(int32(l), uint32(s-t-baseMatchOffset))
			s += l
			nextEmit = s
			if nextS >= s {
				s = nextS + 1
			}
			if s >= sLimit {
				goto emitRemainder
			}

			// We could immediately start working at s now, but to improve
			// compression we first update the hash table at s-2 and at s. If
			// another emitCopy is not our next move, also calculate nextHash
			// at s+1. At least on GOARCH=amd64, these three hash calculations
			// are faster as one load64 call (with some shifts) instead of
			// three load32 calls.
			x := load6416(src, s-2)
			o := s - 2
			prevHash := hashSL(uint32(x))
			table[prevHash] = tableEntry{offset: o}
			x >>= 16
			currHash := hashSL(uint32(x))
			candidate = table[currHash]
			table[currHash] = tableEntry{offset: o + 2}

			if uint32(x) != load3216(src, candidate.offset) {
				cv = uint32(x >> 8)
				s++
				break
			}
		}
	}

emitRemainder:
	if int(nextEmit) < len(src) {
		// If nothing was added, don't encode literals.
		if dst.n == 0 {
			return
		}
		emitLiteral(dst, src[nextEmit:])
	}
}
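
// The sketch below is not part of the original file; it is a hypothetical
// illustration of how statelessEnc is driven with a preset dictionary,
// mirroring what StatelessDeflate does internally. The function name and
// parameters are assumptions. It assumes len(dict)+len(payload) does not
// exceed maxStatelessBlock, since table offsets are int16.
func exampleStatelessEnc(dst *tokens, payload, dict []byte) {
	// Clear any tokens left from a previous use.
	dst.Reset()
	// The source buffer is the dictionary followed by the payload;
	// startAt marks where token emission begins, so the dictionary is
	// only indexed, never emitted.
	combined := make([]byte, 0, len(dict)+len(payload))
	combined = append(combined, dict...)
	combined = append(combined, payload...)
	statelessEnc(dst, combined, int16(len(dict)))
	// dst.n == 0 means no tokens were produced (block too small or no
	// matches); the caller should then emit a stored block instead.
}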