| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 1 | package zstd |
| 2 | |
| 3 | import ( |
| 4 | "errors" |
| 5 | "fmt" |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 6 | "math" |
| 7 | "math/bits" |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 8 | "runtime" |
| 9 | "strings" |
| 10 | ) |
| 11 | |
// EOption is an option for creating an encoder.
// It receives the accumulated encoderOptions and returns a non-nil error
// when the requested setting is rejected.
type EOption func(*encoderOptions) error
| 14 | |
// encoderOptions retains the accumulated state of multiple options.
type encoderOptions struct {
	// concurrent is the maximum number of encoders to run concurrently.
	concurrent int
	// level is the selected predefined compression level.
	level EncoderLevel
	// single is the "single segment" flag; nil means it was not specified.
	single *bool
	// pad makes the output size a multiple of this value.
	// A requested padding of 1 is stored as 0.
	pad int
	// blockSize is lowered to windowSize by WithWindowSize when it would
	// otherwise exceed it.
	blockSize int
	// windowSize is the maximum allowed back-reference distance.
	windowSize int
	// crc controls whether a CRC value is added to the output.
	crc bool
	// fullZero makes 0 length input encode as full frames.
	fullZero bool
	// noEntropy always skips entropy compression of literals.
	noEntropy bool
	// allLitEntropy applies entropy compression if no matches are found.
	allLitEntropy bool
	// customWindow is set by WithWindowSize and stops WithEncoderLevel
	// from replacing windowSize with a level-specific default.
	customWindow bool
	// customALEntropy is set by WithAllLitEntropyCompression and stops
	// WithEncoderLevel from replacing allLitEntropy.
	customALEntropy bool
	// customBlockSize is set when WithWindowSize lowers blockSize and stops
	// WithEncoderLevel from replacing blockSize.
	customBlockSize bool
	// lowMem trades slower encoding for lower memory usage.
	lowMem bool
	// dict is the dictionary registered through WithEncoderDict or
	// WithEncoderDictRaw; nil when no dictionary is registered.
	dict *dict
}
| 33 | |
| 34 | func (o *encoderOptions) setDefault() { |
| 35 | *o = encoderOptions{ |
| 36 | concurrent: runtime.GOMAXPROCS(0), |
| 37 | crc: true, |
| 38 | single: nil, |
| Akash Reddy Kankanala | cf04537 | 2025-06-10 14:11:24 +0530 | [diff] [blame] | 39 | blockSize: maxCompressedBlockSize, |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 40 | windowSize: 8 << 20, |
| 41 | level: SpeedDefault, |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 42 | allLitEntropy: false, |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 43 | lowMem: false, |
| 44 | } |
| 45 | } |
| 46 | |
| 47 | // encoder returns an encoder with the selected options. |
| 48 | func (o encoderOptions) encoder() encoder { |
| 49 | switch o.level { |
| 50 | case SpeedFastest: |
| 51 | if o.dict != nil { |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 52 | return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}} |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 53 | } |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 54 | return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}} |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 55 | |
| 56 | case SpeedDefault: |
| 57 | if o.dict != nil { |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 58 | return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}} |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 59 | } |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 60 | return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}} |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 61 | case SpeedBetterCompression: |
| 62 | if o.dict != nil { |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 63 | return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}} |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 64 | } |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 65 | return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}} |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 66 | case SpeedBestCompression: |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 67 | return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}} |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 68 | } |
| 69 | panic("unknown compression level") |
| 70 | } |
| 71 | |
| 72 | // WithEncoderCRC will add CRC value to output. |
| 73 | // Output will be 4 bytes larger. |
| 74 | func WithEncoderCRC(b bool) EOption { |
| 75 | return func(o *encoderOptions) error { o.crc = b; return nil } |
| 76 | } |
| 77 | |
| 78 | // WithEncoderConcurrency will set the concurrency, |
| 79 | // meaning the maximum number of encoders to run concurrently. |
| 80 | // The value supplied must be at least 1. |
| Akash Reddy Kankanala | cf04537 | 2025-06-10 14:11:24 +0530 | [diff] [blame] | 81 | // For streams, setting a value of 1 will disable async compression. |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 82 | // By default this will be set to GOMAXPROCS. |
| 83 | func WithEncoderConcurrency(n int) EOption { |
| 84 | return func(o *encoderOptions) error { |
| 85 | if n <= 0 { |
| 86 | return fmt.Errorf("concurrency must be at least 1") |
| 87 | } |
| 88 | o.concurrent = n |
| 89 | return nil |
| 90 | } |
| 91 | } |
| 92 | |
| 93 | // WithWindowSize will set the maximum allowed back-reference distance. |
| 94 | // The value must be a power of two between MinWindowSize and MaxWindowSize. |
| 95 | // A larger value will enable better compression but allocate more memory and, |
| 96 | // for above-default values, take considerably longer. |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 97 | // The default value is determined by the compression level and max 8MB. |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 98 | func WithWindowSize(n int) EOption { |
| 99 | return func(o *encoderOptions) error { |
| 100 | switch { |
| 101 | case n < MinWindowSize: |
| 102 | return fmt.Errorf("window size must be at least %d", MinWindowSize) |
| 103 | case n > MaxWindowSize: |
| 104 | return fmt.Errorf("window size must be at most %d", MaxWindowSize) |
| 105 | case (n & (n - 1)) != 0: |
| 106 | return errors.New("window size must be a power of 2") |
| 107 | } |
| 108 | |
| 109 | o.windowSize = n |
| 110 | o.customWindow = true |
| 111 | if o.blockSize > o.windowSize { |
| 112 | o.blockSize = o.windowSize |
| Akash Reddy Kankanala | cf04537 | 2025-06-10 14:11:24 +0530 | [diff] [blame] | 113 | o.customBlockSize = true |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 114 | } |
| 115 | return nil |
| 116 | } |
| 117 | } |
| 118 | |
| 119 | // WithEncoderPadding will add padding to all output so the size will be a multiple of n. |
| 120 | // This can be used to obfuscate the exact output size or make blocks of a certain size. |
| 121 | // The contents will be a skippable frame, so it will be invisible by the decoder. |
| 122 | // n must be > 0 and <= 1GB, 1<<30 bytes. |
| 123 | // The padded area will be filled with data from crypto/rand.Reader. |
| 124 | // If `EncodeAll` is used with data already in the destination, the total size will be multiple of this. |
| 125 | func WithEncoderPadding(n int) EOption { |
| 126 | return func(o *encoderOptions) error { |
| 127 | if n <= 0 { |
| 128 | return fmt.Errorf("padding must be at least 1") |
| 129 | } |
| 130 | // No need to waste our time. |
| 131 | if n == 1 { |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 132 | n = 0 |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 133 | } |
| 134 | if n > 1<<30 { |
| 135 | return fmt.Errorf("padding must less than 1GB (1<<30 bytes) ") |
| 136 | } |
| 137 | o.pad = n |
| 138 | return nil |
| 139 | } |
| 140 | } |
| 141 | |
// EncoderLevel predefines encoder compression levels.
// Only use the exported constants: the numeric values behind them are very
// likely to change, and your compression could change unpredictably when
// upgrading the library.
type EncoderLevel int

const (
	// speedNotSet is the zero value; it is not a selectable level.
	speedNotSet = EncoderLevel(iota)

	// SpeedFastest will choose the fastest reasonable compression.
	// This is roughly equivalent to the fastest Zstandard mode.
	SpeedFastest

	// SpeedDefault is the default "pretty fast" compression option.
	// This is roughly equivalent to the default Zstandard mode (level 3).
	SpeedDefault

	// SpeedBetterCompression will yield better compression than the default.
	// Currently it is about zstd level 7-8 with ~ 2x-3x the default CPU usage.
	// Be aware that CPU usage of this level may go up in the future.
	SpeedBetterCompression

	// SpeedBestCompression will choose the best available compression option.
	// This will offer the best compression no matter the CPU cost.
	SpeedBestCompression

	// speedLast should be kept as the last actual compression option.
	// This is not for external usage, but is used to keep track of the valid options.
	speedLast
)
| 172 | |
| 173 | // EncoderLevelFromString will convert a string representation of an encoding level back |
| 174 | // to a compression level. The compare is not case sensitive. |
| 175 | // If the string wasn't recognized, (false, SpeedDefault) will be returned. |
| 176 | func EncoderLevelFromString(s string) (bool, EncoderLevel) { |
| 177 | for l := speedNotSet + 1; l < speedLast; l++ { |
| 178 | if strings.EqualFold(s, l.String()) { |
| 179 | return true, l |
| 180 | } |
| 181 | } |
| 182 | return false, SpeedDefault |
| 183 | } |
| 184 | |
| 185 | // EncoderLevelFromZstd will return an encoder level that closest matches the compression |
| 186 | // ratio of a specific zstd compression level. |
| 187 | // Many input values will provide the same compression level. |
| 188 | func EncoderLevelFromZstd(level int) EncoderLevel { |
| 189 | switch { |
| 190 | case level < 3: |
| 191 | return SpeedFastest |
| 192 | case level >= 3 && level < 6: |
| 193 | return SpeedDefault |
| 194 | case level >= 6 && level < 10: |
| 195 | return SpeedBetterCompression |
| Akash Reddy Kankanala | cf04537 | 2025-06-10 14:11:24 +0530 | [diff] [blame] | 196 | default: |
| 197 | return SpeedBestCompression |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 198 | } |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 199 | } |
| 200 | |
| 201 | // String provides a string representation of the compression level. |
| 202 | func (e EncoderLevel) String() string { |
| 203 | switch e { |
| 204 | case SpeedFastest: |
| 205 | return "fastest" |
| 206 | case SpeedDefault: |
| 207 | return "default" |
| 208 | case SpeedBetterCompression: |
| 209 | return "better" |
| 210 | case SpeedBestCompression: |
| 211 | return "best" |
| 212 | default: |
| 213 | return "invalid" |
| 214 | } |
| 215 | } |
| 216 | |
| 217 | // WithEncoderLevel specifies a predefined compression level. |
| 218 | func WithEncoderLevel(l EncoderLevel) EOption { |
| 219 | return func(o *encoderOptions) error { |
| 220 | switch { |
| 221 | case l <= speedNotSet || l >= speedLast: |
| 222 | return fmt.Errorf("unknown encoder level") |
| 223 | } |
| 224 | o.level = l |
| 225 | if !o.customWindow { |
| 226 | switch o.level { |
| 227 | case SpeedFastest: |
| 228 | o.windowSize = 4 << 20 |
| Akash Reddy Kankanala | cf04537 | 2025-06-10 14:11:24 +0530 | [diff] [blame] | 229 | if !o.customBlockSize { |
| 230 | o.blockSize = 1 << 16 |
| 231 | } |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 232 | case SpeedDefault: |
| 233 | o.windowSize = 8 << 20 |
| 234 | case SpeedBetterCompression: |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 235 | o.windowSize = 8 << 20 |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 236 | case SpeedBestCompression: |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 237 | o.windowSize = 8 << 20 |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 238 | } |
| 239 | } |
| 240 | if !o.customALEntropy { |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 241 | o.allLitEntropy = l > SpeedDefault |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 242 | } |
| 243 | |
| 244 | return nil |
| 245 | } |
| 246 | } |
| 247 | |
| 248 | // WithZeroFrames will encode 0 length input as full frames. |
| 249 | // This can be needed for compatibility with zstandard usage, |
| 250 | // but is not needed for this package. |
| 251 | func WithZeroFrames(b bool) EOption { |
| 252 | return func(o *encoderOptions) error { |
| 253 | o.fullZero = b |
| 254 | return nil |
| 255 | } |
| 256 | } |
| 257 | |
| 258 | // WithAllLitEntropyCompression will apply entropy compression if no matches are found. |
| 259 | // Disabling this will skip incompressible data faster, but in cases with no matches but |
| 260 | // skewed character distribution compression is lost. |
| 261 | // Default value depends on the compression level selected. |
| 262 | func WithAllLitEntropyCompression(b bool) EOption { |
| 263 | return func(o *encoderOptions) error { |
| 264 | o.customALEntropy = true |
| 265 | o.allLitEntropy = b |
| 266 | return nil |
| 267 | } |
| 268 | } |
| 269 | |
| 270 | // WithNoEntropyCompression will always skip entropy compression of literals. |
| 271 | // This can be useful if content has matches, but unlikely to benefit from entropy |
| 272 | // compression. Usually the slight speed improvement is not worth enabling this. |
| 273 | func WithNoEntropyCompression(b bool) EOption { |
| 274 | return func(o *encoderOptions) error { |
| 275 | o.noEntropy = b |
| 276 | return nil |
| 277 | } |
| 278 | } |
| 279 | |
| 280 | // WithSingleSegment will set the "single segment" flag when EncodeAll is used. |
| 281 | // If this flag is set, data must be regenerated within a single continuous memory segment. |
| 282 | // In this case, Window_Descriptor byte is skipped, but Frame_Content_Size is necessarily present. |
| 283 | // As a consequence, the decoder must allocate a memory segment of size equal or larger than size of your content. |
| 284 | // In order to preserve the decoder from unreasonable memory requirements, |
| 285 | // a decoder is allowed to reject a compressed frame which requests a memory size beyond decoder's authorized range. |
| 286 | // For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB. |
| 287 | // This is only a recommendation, each decoder is free to support higher or lower limits, depending on local limitations. |
| Akash Reddy Kankanala | cf04537 | 2025-06-10 14:11:24 +0530 | [diff] [blame] | 288 | // If this is not specified, block encodes will automatically choose this based on the input size and the window size. |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 289 | // This setting has no effect on streamed encodes. |
| 290 | func WithSingleSegment(b bool) EOption { |
| 291 | return func(o *encoderOptions) error { |
| 292 | o.single = &b |
| 293 | return nil |
| 294 | } |
| 295 | } |
| 296 | |
| 297 | // WithLowerEncoderMem will trade in some memory cases trade less memory usage for |
| 298 | // slower encoding speed. |
| 299 | // This will not change the window size which is the primary function for reducing |
| 300 | // memory usage. See WithWindowSize. |
| 301 | func WithLowerEncoderMem(b bool) EOption { |
| 302 | return func(o *encoderOptions) error { |
| 303 | o.lowMem = b |
| 304 | return nil |
| 305 | } |
| 306 | } |
| 307 | |
| 308 | // WithEncoderDict allows to register a dictionary that will be used for the encode. |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 309 | // |
| 310 | // The slice dict must be in the [dictionary format] produced by |
| 311 | // "zstd --train" from the Zstandard reference implementation. |
| 312 | // |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 313 | // The encoder *may* choose to use no dictionary instead for certain payloads. |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 314 | // |
| 315 | // [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format |
| khenaidoo | d948f77 | 2021-08-11 17:49:24 -0400 | [diff] [blame] | 316 | func WithEncoderDict(dict []byte) EOption { |
| 317 | return func(o *encoderOptions) error { |
| 318 | d, err := loadDict(dict) |
| 319 | if err != nil { |
| 320 | return err |
| 321 | } |
| 322 | o.dict = d |
| 323 | return nil |
| 324 | } |
| 325 | } |
| Abhay Kumar | a2ae599 | 2025-11-10 14:02:24 +0000 | [diff] [blame^] | 326 | |
| 327 | // WithEncoderDictRaw registers a dictionary that may be used by the encoder. |
| 328 | // |
| 329 | // The slice content may contain arbitrary data. It will be used as an initial |
| 330 | // history. |
| 331 | func WithEncoderDictRaw(id uint32, content []byte) EOption { |
| 332 | return func(o *encoderOptions) error { |
| 333 | if bits.UintSize > 32 && uint(len(content)) > dictMaxLength { |
| 334 | return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content)) |
| 335 | } |
| 336 | o.dict = &dict{id: id, content: content, offsets: [3]int{1, 4, 8}} |
| 337 | return nil |
| 338 | } |
| 339 | } |