[VOL-5486] Fix deprecated versions

Change-Id: I3e03ea246020547ae75fa92ce8cf5cbba7e8f3bb
Signed-off-by: Abhay Kumar <abhay.kumar@radisys.com>
diff --git a/vendor/github.com/klauspost/compress/.goreleaser.yml b/vendor/github.com/klauspost/compress/.goreleaser.yml
index 0af08e6..4528059 100644
--- a/vendor/github.com/klauspost/compress/.goreleaser.yml
+++ b/vendor/github.com/klauspost/compress/.goreleaser.yml
@@ -1,9 +1,8 @@
-# This is an example goreleaser.yaml file with some sane defaults.
-# Make sure to check the documentation at http://goreleaser.com
+version: 2
+
 before:
   hooks:
     - ./gen.sh
-    - go install mvdan.cc/garble@latest
 
 builds:
   -
@@ -32,7 +31,6 @@
       - mips64le
     goarm:
       - 7
-    gobinary: garble
   -
     id: "s2d"
     binary: s2d
@@ -59,7 +57,6 @@
       - mips64le
     goarm:
       - 7
-    gobinary: garble
   -
     id: "s2sx"
     binary: s2sx
@@ -87,21 +84,11 @@
       - mips64le
     goarm:
       - 7
-    gobinary: garble
 
 archives:
   -
     id: s2-binaries
-    name_template: "s2-{{ .Os }}_{{ .Arch }}_{{ .Version }}"
-    replacements:
-      aix: AIX
-      darwin: OSX
-      linux: Linux
-      windows: Windows
-      386: i386
-      amd64: x86_64
-      freebsd: FreeBSD
-      netbsd: NetBSD
+    name_template: "s2-{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}"
     format_overrides:
       - goos: windows
         format: zip
@@ -112,7 +99,7 @@
 checksum:
   name_template: 'checksums.txt'
 snapshot:
-  name_template: "{{ .Tag }}-next"
+  version_template: "{{ .Tag }}-next"
 changelog:
   sort: asc
   filters:
@@ -125,7 +112,7 @@
 
 nfpms:
   -
-    file_name_template: "s2_package_{{ .Version }}_{{ .Os }}_{{ .Arch }}"
+    file_name_template: "s2_package__{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}"
     vendor: Klaus Post
     homepage: https://github.com/klauspost/compress
     maintainer: Klaus Post <klauspost@gmail.com>
@@ -134,8 +121,3 @@
     formats:
       - deb
       - rpm
-    replacements:
-      darwin: Darwin
-      linux: Linux
-      freebsd: FreeBSD
-      amd64: x86_64
diff --git a/vendor/github.com/klauspost/compress/README.md b/vendor/github.com/klauspost/compress/README.md
index ad5c63a..244ee19 100644
--- a/vendor/github.com/klauspost/compress/README.md
+++ b/vendor/github.com/klauspost/compress/README.md
@@ -9,14 +9,192 @@
 * [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding.

 * [gzhttp](https://github.com/klauspost/compress/tree/master/gzhttp) Provides client and server wrappers for handling gzipped requests efficiently.

 * [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation.

-* [fuzz package](https://github.com/klauspost/compress-fuzz) for fuzz testing all compressors/decompressors here.

 

 [![Go Reference](https://pkg.go.dev/badge/klauspost/compress.svg)](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories)

 [![Go](https://github.com/klauspost/compress/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/compress/actions/workflows/go.yml)

 [![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge)

 

+# package usage

+

+Use `go get github.com/klauspost/compress@latest` to add it to your project.

+

+This package will support the current Go version and 2 versions back.

+

+* Use the `nounsafe` tag to disable all use of the "unsafe" package.

+* Use the `noasm` tag to disable all assembly across packages.

+

+Use the links above for more information on each.
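As a quick orientation, here is a minimal sketch of the above: fetch the module, import a sub-package as a drop-in for the standard library, and optionally pass the opt-out tags at build time. The sample data and printed size are illustrative only.

```go
// Fetch the module:
//
//	go get github.com/klauspost/compress@latest
//
// Optional opt-outs at build time (the tags described above):
//
//	go build -tags=noasm .
//	go build -tags=nounsafe .
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/flate" // drop-in for compress/flate
)

func main() {
	var buf bytes.Buffer
	fw, err := flate.NewWriter(&buf, flate.BestSpeed)
	if err != nil {
		panic(err)
	}
	if _, err := fw.Write([]byte("hello, hello, hello gophers")); err != nil {
		panic(err)
	}
	if err := fw.Close(); err != nil {
		panic(err)
	}
	fmt.Println("compressed to", buf.Len(), "bytes")
}
```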

+

 # changelog

 

+* Feb 19th, 2025 - [1.18.0](https://github.com/klauspost/compress/releases/tag/v1.18.0)

+  * Add unsafe little endian loaders https://github.com/klauspost/compress/pull/1036

+  * fix: check `r.err != nil` but return a nil value error `err` by @alingse in https://github.com/klauspost/compress/pull/1028

+  * flate: Simplify L4-6 loading https://github.com/klauspost/compress/pull/1043

+  * flate: Simplify matchlen (remove asm) https://github.com/klauspost/compress/pull/1045

+  * s2: Improve small block compression speed w/o asm https://github.com/klauspost/compress/pull/1048

+  * flate: Fix matchlen L5+L6 https://github.com/klauspost/compress/pull/1049

+  * flate: Cleanup & reduce casts https://github.com/klauspost/compress/pull/1050

+

+* Oct 11th, 2024 - [1.17.11](https://github.com/klauspost/compress/releases/tag/v1.17.11)

+  * zstd: Fix extra CRC written with multiple Close calls https://github.com/klauspost/compress/pull/1017

+  * s2: Don't use stack for index tables https://github.com/klauspost/compress/pull/1014

+  * gzhttp: No content-type on no body response code by @juliens in https://github.com/klauspost/compress/pull/1011

+  * gzhttp: Do not set the content-type when response has no body by @kevinpollet in https://github.com/klauspost/compress/pull/1013

+

+* Sep 23rd, 2024 - [1.17.10](https://github.com/klauspost/compress/releases/tag/v1.17.10)

+	* gzhttp: Add TransportAlwaysDecompress option. https://github.com/klauspost/compress/pull/978

+	* gzhttp: Add supported decompress request body by @mirecl in https://github.com/klauspost/compress/pull/1002

+	* s2: Add EncodeBuffer buffer recycling callback https://github.com/klauspost/compress/pull/982

+	* zstd: Improve memory usage on small streaming encodes https://github.com/klauspost/compress/pull/1007

+	* flate: read data written with partial flush by @vajexal in https://github.com/klauspost/compress/pull/996

+

+* Jun 12th, 2024 - [1.17.9](https://github.com/klauspost/compress/releases/tag/v1.17.9)

+	* s2: Reduce ReadFrom temporary allocations https://github.com/klauspost/compress/pull/949

+	* flate, zstd: Shave some bytes off amd64 matchLen by @greatroar in https://github.com/klauspost/compress/pull/963

+	* Upgrade zip/zlib to 1.22.4 upstream https://github.com/klauspost/compress/pull/970 https://github.com/klauspost/compress/pull/971

+	* zstd: BuildDict fails with RLE table https://github.com/klauspost/compress/pull/951

+

+* Apr 9th, 2024 - [1.17.8](https://github.com/klauspost/compress/releases/tag/v1.17.8)

+	* zstd: Reject blocks where reserved values are not 0 https://github.com/klauspost/compress/pull/885

+	* zstd: Add RLE detection+encoding https://github.com/klauspost/compress/pull/938

+

+* Feb 21st, 2024 - [1.17.7](https://github.com/klauspost/compress/releases/tag/v1.17.7)

+	* s2: Add AsyncFlush method: Complete the block without flushing by @Jille in https://github.com/klauspost/compress/pull/927

+	* s2: Fix literal+repeat exceeds dst crash https://github.com/klauspost/compress/pull/930

+  

+* Feb 5th, 2024 - [1.17.6](https://github.com/klauspost/compress/releases/tag/v1.17.6)

+	* zstd: Fix incorrect repeat coding in best mode https://github.com/klauspost/compress/pull/923

+	* s2: Fix DecodeConcurrent deadlock on errors https://github.com/klauspost/compress/pull/925

+  

+* Jan 26th, 2024 - [v1.17.5](https://github.com/klauspost/compress/releases/tag/v1.17.5)

+	* flate: Fix reset with dictionary on custom window encodes https://github.com/klauspost/compress/pull/912

+	* zstd: Add Frame header encoding and stripping https://github.com/klauspost/compress/pull/908

+	* zstd: Limit better/best default window to 8MB https://github.com/klauspost/compress/pull/913

+	* zstd: Speed improvements by @greatroar in https://github.com/klauspost/compress/pull/896 https://github.com/klauspost/compress/pull/910

+	* s2: Fix callbacks for skippable blocks and disallow 0xfe (Padding) by @Jille in https://github.com/klauspost/compress/pull/916 https://github.com/klauspost/compress/pull/917

+	  https://github.com/klauspost/compress/pull/919 https://github.com/klauspost/compress/pull/918

+

+* Dec 1st, 2023 - [v1.17.4](https://github.com/klauspost/compress/releases/tag/v1.17.4)

+	* huff0: Speed up symbol counting by @greatroar in https://github.com/klauspost/compress/pull/887

+	* huff0: Remove byteReader by @greatroar in https://github.com/klauspost/compress/pull/886

+	* gzhttp: Allow overriding decompression on transport https://github.com/klauspost/compress/pull/892

+	* gzhttp: Clamp compression level https://github.com/klauspost/compress/pull/890

+	* gzip: Error out if reserved bits are set https://github.com/klauspost/compress/pull/891

+

+* Nov 15th, 2023 - [v1.17.3](https://github.com/klauspost/compress/releases/tag/v1.17.3)

+	* fse: Fix max header size https://github.com/klauspost/compress/pull/881

+	* zstd: Improve better/best compression https://github.com/klauspost/compress/pull/877

+	* gzhttp: Fix missing content type on Close https://github.com/klauspost/compress/pull/883

+

+* Oct 22nd, 2023 - [v1.17.2](https://github.com/klauspost/compress/releases/tag/v1.17.2)

+	* zstd: Fix rare *CORRUPTION* output in "best" mode. See https://github.com/klauspost/compress/pull/876

+

+* Oct 14th, 2023 - [v1.17.1](https://github.com/klauspost/compress/releases/tag/v1.17.1)

+	* s2: Fix S2 "best" dictionary wrong encoding https://github.com/klauspost/compress/pull/871

+	* flate: Reduce allocations in decompressor and minor code improvements by @fakefloordiv in https://github.com/klauspost/compress/pull/869

+	* s2: Fix EstimateBlockSize on 6&7 length input https://github.com/klauspost/compress/pull/867

+

+* Sept 19th, 2023 - [v1.17.0](https://github.com/klauspost/compress/releases/tag/v1.17.0)

+	* Add experimental dictionary builder  https://github.com/klauspost/compress/pull/853

+	* Add xerial snappy read/writer https://github.com/klauspost/compress/pull/838

+	* flate: Add limited window compression https://github.com/klauspost/compress/pull/843

+	* s2: Do 2 overlapping match checks https://github.com/klauspost/compress/pull/839

+	* flate: Add amd64 assembly matchlen https://github.com/klauspost/compress/pull/837

+	* gzip: Copy bufio.Reader on Reset by @thatguystone in https://github.com/klauspost/compress/pull/860

+

+<details>

+	<summary>See changes to v1.16.x</summary>

+

+   

+* July 1st, 2023 - [v1.16.7](https://github.com/klauspost/compress/releases/tag/v1.16.7)

+	* zstd: Fix default level first dictionary encode https://github.com/klauspost/compress/pull/829

+	* s2: add GetBufferCapacity() method by @GiedriusS in https://github.com/klauspost/compress/pull/832

+

+* June 13, 2023 - [v1.16.6](https://github.com/klauspost/compress/releases/tag/v1.16.6)

+	* zstd: correctly ignore WithEncoderPadding(1) by @ianlancetaylor in https://github.com/klauspost/compress/pull/806

+	* zstd: Add amd64 match length assembly https://github.com/klauspost/compress/pull/824

+	* gzhttp: Handle informational headers by @rtribotte in https://github.com/klauspost/compress/pull/815

+	* s2: Improve Better compression slightly https://github.com/klauspost/compress/pull/663

+

+* Apr 16, 2023 - [v1.16.5](https://github.com/klauspost/compress/releases/tag/v1.16.5)

+	* zstd: readByte needs to use io.ReadFull by @jnoxon in https://github.com/klauspost/compress/pull/802

+	* gzip: Fix WriterTo after initial read https://github.com/klauspost/compress/pull/804

+

+* Apr 5, 2023 - [v1.16.4](https://github.com/klauspost/compress/releases/tag/v1.16.4)

+	* zstd: Improve zstd best efficiency by @greatroar and @klauspost in https://github.com/klauspost/compress/pull/784

+	* zstd: Respect WithAllLitEntropyCompression https://github.com/klauspost/compress/pull/792

+	* zstd: Fix amd64 not always detecting corrupt data https://github.com/klauspost/compress/pull/785

+	* zstd: Various minor improvements by @greatroar in https://github.com/klauspost/compress/pull/788 https://github.com/klauspost/compress/pull/794 https://github.com/klauspost/compress/pull/795

+	* s2: Fix huge block overflow https://github.com/klauspost/compress/pull/779

+	* s2: Allow CustomEncoder fallback https://github.com/klauspost/compress/pull/780

+	* gzhttp: Support ResponseWriter Unwrap() in gzhttp handler by @jgimenez in https://github.com/klauspost/compress/pull/799

+

+* Mar 13, 2023 - [v1.16.1](https://github.com/klauspost/compress/releases/tag/v1.16.1)

+	* zstd: Speed up + improve best encoder by @greatroar in https://github.com/klauspost/compress/pull/776

+	* gzhttp: Add optional [BREACH mitigation](https://github.com/klauspost/compress/tree/master/gzhttp#breach-mitigation). https://github.com/klauspost/compress/pull/762 https://github.com/klauspost/compress/pull/768 https://github.com/klauspost/compress/pull/769 https://github.com/klauspost/compress/pull/770 https://github.com/klauspost/compress/pull/767

+	* s2: Add Intel LZ4s converter https://github.com/klauspost/compress/pull/766

+	* zstd: Minor bug fixes https://github.com/klauspost/compress/pull/771 https://github.com/klauspost/compress/pull/772 https://github.com/klauspost/compress/pull/773

+	* huff0: Speed up compress1xDo by @greatroar in https://github.com/klauspost/compress/pull/774

+

+* Feb 26, 2023 - [v1.16.0](https://github.com/klauspost/compress/releases/tag/v1.16.0)

+	* s2: Add [Dictionary](https://github.com/klauspost/compress/tree/master/s2#dictionaries) support.  https://github.com/klauspost/compress/pull/685

+	* s2: Add Compression Size Estimate.  https://github.com/klauspost/compress/pull/752

+	* s2: Add support for custom stream encoder. https://github.com/klauspost/compress/pull/755

+	* s2: Add LZ4 block converter. https://github.com/klauspost/compress/pull/748

+	* s2: Support io.ReaderAt in ReadSeeker. https://github.com/klauspost/compress/pull/747

+	* s2c/s2sx: Use concurrent decoding. https://github.com/klauspost/compress/pull/746

+</details>

+

+<details>

+	<summary>See changes to v1.15.x</summary>

+	

+* Jan 21st, 2023 (v1.15.15)

+	* deflate: Improve level 7-9 https://github.com/klauspost/compress/pull/739

+	* zstd: Add delta encoding support by @greatroar in https://github.com/klauspost/compress/pull/728

+	* zstd: Various speed improvements by @greatroar https://github.com/klauspost/compress/pull/741 https://github.com/klauspost/compress/pull/734 https://github.com/klauspost/compress/pull/736 https://github.com/klauspost/compress/pull/744 https://github.com/klauspost/compress/pull/743 https://github.com/klauspost/compress/pull/745

+	* gzhttp: Add SuffixETag() and DropETag() options to prevent ETag collisions on compressed responses by @willbicks in https://github.com/klauspost/compress/pull/740

+

+* Jan 3rd, 2023 (v1.15.14)

+

+	* flate: Improve speed in big stateless blocks https://github.com/klauspost/compress/pull/718

+	* zstd: Minor speed tweaks by @greatroar in https://github.com/klauspost/compress/pull/716 https://github.com/klauspost/compress/pull/720

+	* export NoGzipResponseWriter for custom ResponseWriter wrappers by @harshavardhana in https://github.com/klauspost/compress/pull/722

+	* s2: Add example for indexing and existing stream https://github.com/klauspost/compress/pull/723

+

+* Dec 11, 2022 (v1.15.13)

+	* zstd: Add [MaxEncodedSize](https://pkg.go.dev/github.com/klauspost/compress@v1.15.13/zstd#Encoder.MaxEncodedSize) to encoder  https://github.com/klauspost/compress/pull/691

+	* zstd: Various tweaks and improvements https://github.com/klauspost/compress/pull/693 https://github.com/klauspost/compress/pull/695 https://github.com/klauspost/compress/pull/696 https://github.com/klauspost/compress/pull/701 https://github.com/klauspost/compress/pull/702 https://github.com/klauspost/compress/pull/703 https://github.com/klauspost/compress/pull/704 https://github.com/klauspost/compress/pull/705 https://github.com/klauspost/compress/pull/706 https://github.com/klauspost/compress/pull/707 https://github.com/klauspost/compress/pull/708

+

+* Oct 26, 2022 (v1.15.12)

+

+	* zstd: Tweak decoder allocs. https://github.com/klauspost/compress/pull/680

+	* gzhttp: Always delete `HeaderNoCompression` https://github.com/klauspost/compress/pull/683

+

+* Sept 26, 2022 (v1.15.11)

+

+	* flate: Improve level 1-3 compression  https://github.com/klauspost/compress/pull/678

+	* zstd: Improve "best" compression by @nightwolfz in https://github.com/klauspost/compress/pull/677

+	* zstd: Fix+reduce decompression allocations https://github.com/klauspost/compress/pull/668

+	* zstd: Fix non-effective noescape tag https://github.com/klauspost/compress/pull/667

+

+* Sept 16, 2022 (v1.15.10)

+

+	* zstd: Add [WithDecodeAllCapLimit](https://pkg.go.dev/github.com/klauspost/compress@v1.15.10/zstd#WithDecodeAllCapLimit) https://github.com/klauspost/compress/pull/649

+	* Add Go 1.19 - deprecate Go 1.16  https://github.com/klauspost/compress/pull/651

+	* flate: Improve level 5+6 compression https://github.com/klauspost/compress/pull/656

+	* zstd: Improve "better" compression  https://github.com/klauspost/compress/pull/657

+	* s2: Improve "best" compression https://github.com/klauspost/compress/pull/658

+	* s2: Improve "better" compression. https://github.com/klauspost/compress/pull/635

+	* s2: Slightly faster non-assembly decompression https://github.com/klauspost/compress/pull/646

+	* Use arrays for constant size copies https://github.com/klauspost/compress/pull/659

+

+* July 21, 2022 (v1.15.9)

+

+	* zstd: Fix decoder crash on amd64 (no BMI) on invalid input https://github.com/klauspost/compress/pull/645

+	* zstd: Disable decoder extended memory copies (amd64) due to possible crashes https://github.com/klauspost/compress/pull/644

+	* zstd: Allow single segments up to "max decoded size" https://github.com/klauspost/compress/pull/643

+

 * July 13, 2022 (v1.15.8)

 

 	* gzip: fix stack exhaustion bug in Reader.Read https://github.com/klauspost/compress/pull/641

@@ -57,7 +235,7 @@
 	* zstd: Speed up when WithDecoderLowmem(false) https://github.com/klauspost/compress/pull/599

 	* zstd: faster next state update in BMI2 version of decode by @WojciechMula in https://github.com/klauspost/compress/pull/593

 	* huff0: Do not check max size when reading table. https://github.com/klauspost/compress/pull/586

-	* flate: Inplace hashing for level 7-9 by @klauspost in https://github.com/klauspost/compress/pull/590

+	* flate: Inplace hashing for level 7-9 https://github.com/klauspost/compress/pull/590

 

 

 * May 11, 2022 (v1.15.4)

@@ -84,27 +262,29 @@
 	* zstd: Add stricter block size checks in [#523](https://github.com/klauspost/compress/pull/523)

 

 * Mar 3, 2022 (v1.15.0)

-	* zstd: Refactor decoder by @klauspost in [#498](https://github.com/klauspost/compress/pull/498)

-	* zstd: Add stream encoding without goroutines by @klauspost in [#505](https://github.com/klauspost/compress/pull/505)

+	* zstd: Refactor decoder [#498](https://github.com/klauspost/compress/pull/498)

+	* zstd: Add stream encoding without goroutines [#505](https://github.com/klauspost/compress/pull/505)

 	* huff0: Prevent single blocks exceeding 16 bits by @klauspost in [#507](https://github.com/klauspost/compress/pull/507)

-	* flate: Inline literal emission by @klauspost in [#509](https://github.com/klauspost/compress/pull/509)

-	* gzhttp: Add zstd to transport by @klauspost in [#400](https://github.com/klauspost/compress/pull/400)

-	* gzhttp: Make content-type optional by @klauspost in [#510](https://github.com/klauspost/compress/pull/510)

+	* flate: Inline literal emission [#509](https://github.com/klauspost/compress/pull/509)

+	* gzhttp: Add zstd to transport [#400](https://github.com/klauspost/compress/pull/400)

+	* gzhttp: Make content-type optional [#510](https://github.com/klauspost/compress/pull/510)

 

-<details>

-	<summary>See  Details</summary>

 Both compression and decompression now support "synchronous" stream operations. This means that whenever "concurrency" is set to 1, they will operate without spawning goroutines.

 

 Stream decompression is now faster in asynchronous mode, since the goroutine allocation splits the workload much more effectively. Typical streams will use 2 cores fully for decompression. When a stream has finished decoding, no goroutines will be left over, so decoders can now safely be pooled and still be garbage collected.

 

 While the release has been extensively tested, it is recommended to test when upgrading.
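A minimal sketch of the synchronous mode described above, assuming the zstd sub-package's `WithEncoderConcurrency` option; with a concurrency of 1 the encoder spawns no goroutines, so it can be pooled after `Close`:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	var buf bytes.Buffer
	// Concurrency 1 selects the synchronous, goroutine-free stream path.
	enc, err := zstd.NewWriter(&buf, zstd.WithEncoderConcurrency(1))
	if err != nil {
		panic(err)
	}
	if _, err := enc.Write([]byte("stream encoded without goroutines")); err != nil {
		panic(err)
	}
	// After Close, no goroutines are left over, so the encoder can be pooled.
	if err := enc.Close(); err != nil {
		panic(err)
	}
	fmt.Println("compressed to", buf.Len(), "bytes")
}
```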

+

 </details>

 

+<details>

+	<summary>See changes to v1.14.x</summary>

+	

 * Feb 22, 2022 (v1.14.4)

 	* flate: Fix rare huffman only (-2) corruption. [#503](https://github.com/klauspost/compress/pull/503)

 	* zip: Update deprecated CreateHeaderRaw to correctly call CreateRaw by @saracen in [#502](https://github.com/klauspost/compress/pull/502)

 	* zip: don't read data descriptor early by @saracen in [#501](https://github.com/klauspost/compress/pull/501)  #501

-	* huff0: Use static decompression buffer up to 30% faster by @klauspost in [#499](https://github.com/klauspost/compress/pull/499) [#500](https://github.com/klauspost/compress/pull/500)

+	* huff0: Use static decompression buffer up to 30% faster [#499](https://github.com/klauspost/compress/pull/499) [#500](https://github.com/klauspost/compress/pull/500)

 

 * Feb 17, 2022 (v1.14.3)

 	* flate: Improve fastest levels compression speed ~10% more throughput. [#482](https://github.com/klauspost/compress/pull/482) [#489](https://github.com/klauspost/compress/pull/489) [#490](https://github.com/klauspost/compress/pull/490) [#491](https://github.com/klauspost/compress/pull/491) [#494](https://github.com/klauspost/compress/pull/494)  [#478](https://github.com/klauspost/compress/pull/478)

@@ -125,6 +305,7 @@
 	* zstd: Performance improvement in [#420]( https://github.com/klauspost/compress/pull/420) [#456](https://github.com/klauspost/compress/pull/456) [#437](https://github.com/klauspost/compress/pull/437) [#467](https://github.com/klauspost/compress/pull/467) [#468](https://github.com/klauspost/compress/pull/468)

 	* zstd: add arm64 xxhash assembly in [#464](https://github.com/klauspost/compress/pull/464)

 	* Add garbled for binaries for s2 in [#445](https://github.com/klauspost/compress/pull/445)

+</details>

 

 <details>

 	<summary>See changes to v1.13.x</summary>

@@ -205,7 +386,7 @@
 	* s2: Fix binaries.

 

 * Feb 25, 2021 (v1.11.8)

-	* s2: Fixed occational out-of-bounds write on amd64. Upgrade recommended.

+	* s2: Fixed occasional out-of-bounds write on amd64. Upgrade recommended.

 	* s2: Add AMD64 assembly for better mode. 25-50% faster. [#315](https://github.com/klauspost/compress/pull/315)

 	* s2: Less upfront decoder allocation. [#322](https://github.com/klauspost/compress/pull/322)

 	* zstd: Faster "compression" of incompressible data. [#314](https://github.com/klauspost/compress/pull/314)

@@ -384,7 +565,7 @@
 * Feb 19, 2016: Faster bit writer, level -2 is 15% faster, level 1 is 4% faster.

 * Feb 19, 2016: Handle small payloads faster in level 1-3.

 * Feb 19, 2016: Added faster level 2 + 3 compression modes.

-* Feb 19, 2016: [Rebalanced compression levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/), so there is a more even progresssion in terms of compression. New default level is 5.

+* Feb 19, 2016: [Rebalanced compression levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/), so there is a more even progression in terms of compression. New default level is 5.

 * Feb 14, 2016: Snappy: Merge upstream changes. 

 * Feb 14, 2016: Snappy: Fix aggressive skipping.

 * Feb 14, 2016: Snappy: Update benchmark.

@@ -410,12 +591,14 @@
 

 The packages are drop-in replacements for standard libraries. Simply replace the import path to use them:

 

-| old import         | new import                              | Documentation

-|--------------------|-----------------------------------------|--------------------|

-| `compress/gzip`    | `github.com/klauspost/compress/gzip`    | [gzip](https://pkg.go.dev/github.com/klauspost/compress/gzip?tab=doc)

-| `compress/zlib`    | `github.com/klauspost/compress/zlib`    | [zlib](https://pkg.go.dev/github.com/klauspost/compress/zlib?tab=doc)

-| `archive/zip`      | `github.com/klauspost/compress/zip`     | [zip](https://pkg.go.dev/github.com/klauspost/compress/zip?tab=doc)

-| `compress/flate`   | `github.com/klauspost/compress/flate`   | [flate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc)

+Typical speed is about 2x of the standard library packages.

+

+| old import       | new import                            | Documentation                                                           |

+|------------------|---------------------------------------|-------------------------------------------------------------------------|

+| `compress/gzip`  | `github.com/klauspost/compress/gzip`  | [gzip](https://pkg.go.dev/github.com/klauspost/compress/gzip?tab=doc)   |

+| `compress/zlib`  | `github.com/klauspost/compress/zlib`  | [zlib](https://pkg.go.dev/github.com/klauspost/compress/zlib?tab=doc)   |

+| `archive/zip`    | `github.com/klauspost/compress/zip`   | [zip](https://pkg.go.dev/github.com/klauspost/compress/zip?tab=doc)     |

+| `compress/flate` | `github.com/klauspost/compress/flate` | [flate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc) |
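Since the APIs match the standard library, switching is only an import-path change. A minimal round-trip sketch with the replacement `gzip` (sample data is illustrative):

```go
package main

import (
	"bytes"
	"fmt"
	"io"

	"github.com/klauspost/compress/gzip" // was: "compress/gzip"
)

func main() {
	var buf bytes.Buffer
	gw := gzip.NewWriter(&buf)
	if _, err := gw.Write([]byte("drop-in replacement")); err != nil {
		panic(err)
	}
	if err := gw.Close(); err != nil {
		panic(err)
	}

	gr, err := gzip.NewReader(&buf)
	if err != nil {
		panic(err)
	}
	defer gr.Close()
	out, err := io.ReadAll(gr)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", out)
}
```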

 

 * Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a dropin replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib).

 

@@ -431,6 +614,8 @@
 

 For compression performance, see: [this spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing).

 

+To disable all assembly add `-tags=noasm`. This works across all packages.

+

 # Stateless compression

 

 This package offers stateless compression as a special option for gzip/deflate. 

@@ -449,7 +634,7 @@
 

 A `bufio.Writer` can of course be used to control write sizes. For example, to use a 4KB buffer:

 

-```

+```go

 	// replace 'ioutil.Discard' with your output.

 	gzw, err := gzip.NewWriterLevel(ioutil.Discard, gzip.StatelessCompression)

 	if err != nil {

@@ -468,84 +653,6 @@
 Compression is almost always worse than the fastest compression level 

 and each write will allocate (a little) memory. 
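For deflate without the gzip wrapper, the package also exposes stateless one-shot helpers. A minimal sketch, assuming the `flate.StatelessDeflate(out, in, eof, dict)` helper; treat the exact signature as an assumption and check the flate documentation:

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/klauspost/compress/flate"
)

func main() {
	var buf bytes.Buffer
	// One-shot stateless block: no state is retained between calls, so no
	// long-lived flate.Writer is needed (assumed helper, see flate docs).
	if err := flate.StatelessDeflate(&buf, []byte("stateless block"), true, nil); err != nil {
		panic(err)
	}
	fmt.Println("compressed to", buf.Len(), "bytes")
}
```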

 

-# Performance Update 2018

-

-It has been a while since we have been looking at the speed of this package compared to the standard library, so I thought I would re-do my tests and give some overall recommendations based on the current state. All benchmarks have been performed with Go 1.10 on my Desktop Intel(R) Core(TM) i7-2600 CPU @3.40GHz. Since I last ran the tests, I have gotten more RAM, which means tests with big files are no longer limited by my SSD.

-

-The raw results are in my [updated spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing). Due to cgo changes and upstream updates i could not get the cgo version of gzip to compile. Instead I included the [zstd](https://github.com/datadog/zstd) cgo implementation. If I get cgo gzip to work again, I might replace the results in the sheet.

-

-The columns to take note of are: *MB/s* - the throughput. *Reduction* - the data size reduction in percent of the original. *Rel Speed* relative speed compared to the standard library at the same level. *Smaller* - how many percent smaller is the compressed output compared to stdlib. Negative means the output was bigger. *Loss* means the loss (or gain) in compression as a percentage difference of the input.

-

-The `gzstd` (standard library gzip) and `gzkp` (this package gzip) only uses one CPU core. [`pgzip`](https://github.com/klauspost/pgzip), [`bgzf`](https://github.com/biogo/hts/tree/master/bgzf) uses all 4 cores. [`zstd`](https://github.com/DataDog/zstd) uses one core, and is a beast (but not Go, yet).

-

-

-## Overall differences.

-

-There appears to be a roughly 5-10% speed advantage over the standard library when comparing at similar compression levels.

-

-The biggest difference you will see is the result of [re-balancing](https://blog.klauspost.com/rebalancing-deflate-compression-levels/) the compression levels. I wanted by library to give a smoother transition between the compression levels than the standard library.

-

-This package attempts to provide a more smooth transition, where "1" is taking a lot of shortcuts, "5" is the reasonable trade-off and "9" is the "give me the best compression", and the values in between gives something reasonable in between. The standard library has big differences in levels 1-4, but levels 5-9 having no significant gains - often spending a lot more time than can be justified by the achieved compression.

-

-There are links to all the test data in the [spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing) in the top left field on each tab.

-

-## Web Content

-

-This test set aims to emulate typical use in a web server. The test-set is 4GB data in 53k files, and is a mixture of (mostly) HTML, JS, CSS.

-

-Since level 1 and 9 are close to being the same code, they are quite close. But looking at the levels in-between the differences are quite big.

-

-Looking at level 6, this package is 88% faster, but will output about 6% more data. For a web server, this means you can serve 88% more data, but have to pay for 6% more bandwidth. You can draw your own conclusions on what would be the most expensive for your case.

-

-## Object files

-

-This test is for typical data files stored on a server. In this case it is a collection of Go precompiled objects. They are very compressible.

-

-The picture is similar to the web content, but with small differences since this is very compressible. Levels 2-3 offer good speed, but is sacrificing quite a bit of compression. 

-

-The standard library seems suboptimal on level 3 and 4 - offering both worse compression and speed than level 6 & 7 of this package respectively.

-

-## Highly Compressible File

-

-This is a JSON file with very high redundancy. The reduction starts at 95% on level 1, so in real life terms we are dealing with something like a highly redundant stream of data, etc.

-

-It is definitely visible that we are dealing with specialized content here, so the results are very scattered. This package does not do very well at levels 1-4, but picks up significantly at level 5 and levels 7 and 8 offering great speed for the achieved compression.

-

-So if you know you content is extremely compressible you might want to go slightly higher than the defaults. The standard library has a huge gap between levels 3 and 4 in terms of speed (2.75x slowdown), so it offers little "middle ground".

-

-## Medium-High Compressible

-

-This is a pretty common test corpus: [enwik9](http://mattmahoney.net/dc/textdata.html). It contains the first 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006. This is a very good test of typical text based compression and more data heavy streams.

-

-We see a similar picture here as in "Web Content". On equal levels some compression is sacrificed for more speed. Level 5 seems to be the best trade-off between speed and size, beating stdlib level 3 in both.

-

-## Medium Compressible

-

-I will combine two test sets, one [10GB file set](http://mattmahoney.net/dc/10gb.html) and a VM disk image (~8GB). Both contain different data types and represent a typical backup scenario.

-

-The most notable thing is how quickly the standard library drops to very low compression speeds around level 5-6 without any big gains in compression. Since this type of data is fairly common, this does not seem like good behavior.

-

-

-## Un-compressible Content

-

-This is mainly a test of how good the algorithms are at detecting un-compressible input. The standard library only offers this feature with very conservative settings at level 1. Obviously there is no reason for the algorithms to try to compress input that cannot be compressed.  The only downside is that it might skip some compressible data on false detections.

-

-

-## Huffman only compression

-

-This compression library adds a special compression level, named `HuffmanOnly`, which allows near linear time compression. This is done by completely disabling matching of previous data, and only reduce the number of bits to represent each character. 

-

-This means that often used characters, like 'e' and ' ' (space) in text use the fewest bits to represent, and rare characters like '¤' takes more bits to represent. For more information see [wikipedia](https://en.wikipedia.org/wiki/Huffman_coding) or this nice [video](https://youtu.be/ZdooBTdW5bM).

-

-Since this type of compression has much less variance, the compression speed is mostly unaffected by the input data, and is usually more than *180MB/s* for a single core.

-

-The downside is that the compression ratio is usually considerably worse than even the fastest conventional compression. The compression ratio can never be better than 8:1 (12.5%). 

-

-The linear time compression can be used as a "better than nothing" mode, where you cannot risk the encoder to slow down on some content. For comparison, the size of the "Twain" text is *233460 bytes* (+29% vs. level 1) and encode speed is 144MB/s (4.5x level 1). So in this case you trade a 30% size increase for a 4 times speedup.

-

-For more information see my blog post on [Fast Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/).

-

-This is implemented on Go 1.7 as "Huffman Only" mode, though not exposed for gzip.

 

 # Other packages

 

@@ -554,6 +661,10 @@
 * [github.com/pierrec/lz4](https://github.com/pierrec/lz4) - strong multithreaded LZ4 compression.

 * [github.com/cosnicolaou/pbzip2](https://github.com/cosnicolaou/pbzip2) - multithreaded bzip2 decompression.

 * [github.com/dsnet/compress](https://github.com/dsnet/compress) - brotli decompression, bzip2 writer.

+* [github.com/ronanh/intcomp](https://github.com/ronanh/intcomp) - Integer compression.

+* [github.com/spenczar/fpc](https://github.com/spenczar/fpc) - Float compression.

+* [github.com/minio/zipindex](https://github.com/minio/zipindex) - External ZIP directory index.

+* [github.com/ybirader/pzip](https://github.com/ybirader/pzip) - Fast concurrent zip archiver and extractor.

 

 # license

 

diff --git a/vendor/github.com/klauspost/compress/SECURITY.md b/vendor/github.com/klauspost/compress/SECURITY.md
new file mode 100644
index 0000000..ca6685e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/SECURITY.md
@@ -0,0 +1,25 @@
+# Security Policy
+
+## Supported Versions
+
+Security updates are applied only to the latest release.
+
+## Vulnerability Definition
+
+A security vulnerability is a bug that, given certain input, triggers a crash or an infinite loop. Most calls will have varying execution time, and only in rare cases will slow operation be considered a security vulnerability.
+
+Corrupted output generally is not considered a security vulnerability, unless independent operations are able to affect each other. Note that not all functionality is re-entrant and safe to use concurrently.
+
+Out-of-memory crashes only apply if the en/decoder uses an abnormal amount of memory even when appropriate options are applied to limit maximum window size, concurrency, etc. However, if you are in doubt, you are welcome to file a security issue.
+
+It is assumed that all callers are trusted, meaning internal data exposed through reflection or inspection of returned data structures is not considered a vulnerability.
+
+Vulnerabilities resulting from compiler/assembler errors should be reported upstream. Depending on the severity this package may or may not implement a workaround.
+
+## Reporting a Vulnerability
+
+If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
+
+Please disclose it at [security advisory](https://github.com/klauspost/compress/security/advisories/new). If possible, please provide a minimal reproducer. If the issue only applies to a single platform, it would be helpful to provide access to that platform.
+
+This project is maintained by a team of volunteers on a reasonable-effort basis. As such, vulnerabilities will be disclosed on a best-effort basis.
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go
new file mode 100644
index 0000000..af53fb8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -0,0 +1,1017 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright (c) 2015 Klaus Post
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"math"
+)
+
+const (
+	NoCompression      = 0
+	BestSpeed          = 1
+	BestCompression    = 9
+	DefaultCompression = -1
+
+	// HuffmanOnly disables Lempel-Ziv match searching and only performs Huffman
+	// entropy encoding. This mode is useful in compressing data that has
+	// already been compressed with an LZ style algorithm (e.g. Snappy or LZ4)
+	// that lacks an entropy encoder. Compression gains are achieved when
+	// certain bytes in the input stream occur more frequently than others.
+	//
+	// Note that HuffmanOnly produces a compressed output that is
+	// RFC 1951 compliant. That is, any valid DEFLATE decompressor will
+	// continue to be able to decompress this output.
+	HuffmanOnly         = -2
+	ConstantCompression = HuffmanOnly // compatibility alias.
+
+	logWindowSize    = 15
+	windowSize       = 1 << logWindowSize
+	windowMask       = windowSize - 1
+	logMaxOffsetSize = 15  // Standard DEFLATE
+	minMatchLength   = 4   // The smallest match that the compressor looks for
+	maxMatchLength   = 258 // The longest match for the compressor
+	minOffsetSize    = 1   // The shortest offset that makes any sense
+
+	// The maximum number of tokens we will encode at a time.
+	// Smaller sizes usually create less optimal blocks.
+	// Bigger can make context switching slow.
+	// We use this for levels 7-9, so we make it big.
+	maxFlateBlockTokens = 1 << 15
+	maxStoreBlockSize   = 65535
+	hashBits            = 17 // After 17 performance degrades
+	hashSize            = 1 << hashBits
+	hashMask            = (1 << hashBits) - 1
+	hashShift           = (hashBits + minMatchLength - 1) / minMatchLength
+	maxHashOffset       = 1 << 28
+
+	skipNever = math.MaxInt32
+
+	debugDeflate = false
+)
+
+type compressionLevel struct {
+	good, lazy, nice, chain, fastSkipHashing, level int
+}
+
+// Compression levels have been rebalanced from zlib deflate defaults
+// to give a bigger spread in speed and compression.
+// See https://blog.klauspost.com/rebalancing-deflate-compression-levels/
+var levels = []compressionLevel{
+	{}, // 0
+	// Levels 1-6 use a specialized algorithm - values not used
+	{0, 0, 0, 0, 0, 1},
+	{0, 0, 0, 0, 0, 2},
+	{0, 0, 0, 0, 0, 3},
+	{0, 0, 0, 0, 0, 4},
+	{0, 0, 0, 0, 0, 5},
+	{0, 0, 0, 0, 0, 6},
+	// Levels 7-9 use increasingly more lazy matching
+	// and increasingly stringent conditions for "good enough".
+	{8, 12, 16, 24, skipNever, 7},
+	{16, 30, 40, 64, skipNever, 8},
+	{32, 258, 258, 1024, skipNever, 9},
+}
+
+// advancedState contains state for the advanced levels, with bigger hash tables, etc.
+type advancedState struct {
+	// deflate state
+	length         int
+	offset         int
+	maxInsertIndex int
+	chainHead      int
+	hashOffset     int
+
+	ii uint16 // position of last match, intended to overflow to reset.
+
+	// input window: unprocessed data is window[index:windowEnd]
+	index     int
+	hashMatch [maxMatchLength + minMatchLength]uint32
+
+	// Input hash chains
+	// hashHead[hashValue] contains the largest inputIndex with the specified hash value
+	// If hashHead[hashValue] is within the current window, then
+	// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
+	// with the same hash value.
+	hashHead [hashSize]uint32
+	hashPrev [windowSize]uint32
+}
+
+type compressor struct {
+	compressionLevel
+
+	h *huffmanEncoder
+	w *huffmanBitWriter
+
+	// compression algorithm
+	fill func(*compressor, []byte) int // copy data to window
+	step func(*compressor)             // process window
+
+	window     []byte
+	windowEnd  int
+	blockStart int // window index where current tokens start
+	err        error
+
+	// queued output tokens
+	tokens tokens
+	fast   fastEnc
+	state  *advancedState
+
+	sync          bool // requesting flush
+	byteAvailable bool // if true, still need to process window[index-1].
+}
+
+func (d *compressor) fillDeflate(b []byte) int {
+	s := d.state
+	if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) {
+		// shift the window by windowSize
+		//copy(d.window[:], d.window[windowSize:2*windowSize])
+		*(*[windowSize]byte)(d.window) = *(*[windowSize]byte)(d.window[windowSize:])
+		s.index -= windowSize
+		d.windowEnd -= windowSize
+		if d.blockStart >= windowSize {
+			d.blockStart -= windowSize
+		} else {
+			d.blockStart = math.MaxInt32
+		}
+		s.hashOffset += windowSize
+		if s.hashOffset > maxHashOffset {
+			delta := s.hashOffset - 1
+			s.hashOffset -= delta
+			s.chainHead -= delta
+			// Iterate over slices instead of arrays to avoid copying
+			// the entire table onto the stack (Issue #18625).
+			for i, v := range s.hashPrev[:] {
+				if int(v) > delta {
+					s.hashPrev[i] = uint32(int(v) - delta)
+				} else {
+					s.hashPrev[i] = 0
+				}
+			}
+			for i, v := range s.hashHead[:] {
+				if int(v) > delta {
+					s.hashHead[i] = uint32(int(v) - delta)
+				} else {
+					s.hashHead[i] = 0
+				}
+			}
+		}
+	}
+	n := copy(d.window[d.windowEnd:], b)
+	d.windowEnd += n
+	return n
+}
+
+func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error {
+	if index > 0 || eof {
+		var window []byte
+		if d.blockStart <= index {
+			window = d.window[d.blockStart:index]
+		}
+		d.blockStart = index
+		//d.w.writeBlock(tok, eof, window)
+		d.w.writeBlockDynamic(tok, eof, window, d.sync)
+		return d.w.err
+	}
+	return nil
+}
+
+// writeBlockSkip writes the current block and uses the number of tokens
+// to determine if the block should be stored when there are no matches, or
+// only huffman encoded.
+func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error {
+	if index > 0 || eof {
+		if d.blockStart <= index {
+			window := d.window[d.blockStart:index]
+			// If we removed less than a 64th of all literals
+			// we huffman compress the block.
+			if int(tok.n) > len(window)-int(tok.n>>6) {
+				d.w.writeBlockHuff(eof, window, d.sync)
+			} else {
+				// Write a dynamic huffman block.
+				d.w.writeBlockDynamic(tok, eof, window, d.sync)
+			}
+		} else {
+			d.w.writeBlock(tok, eof, nil)
+		}
+		d.blockStart = index
+		return d.w.err
+	}
+	return nil
+}
+
+// fillWindow will fill the current window with the supplied
+// dictionary and calculate all hashes.
+// This is much faster than doing a full encode.
+// Should only be used after a start/reset.
+func (d *compressor) fillWindow(b []byte) {
+	// Do not fill window if we are in store-only or huffman mode.
+	if d.level <= 0 && d.level > -MinCustomWindowSize {
+		return
+	}
+	if d.fast != nil {
+		// encode the last data, but discard the result
+		if len(b) > maxMatchOffset {
+			b = b[len(b)-maxMatchOffset:]
+		}
+		d.fast.Encode(&d.tokens, b)
+		d.tokens.Reset()
+		return
+	}
+	s := d.state
+	// If we are given too much, cut it.
+	if len(b) > windowSize {
+		b = b[len(b)-windowSize:]
+	}
+	// Add all to window.
+	n := copy(d.window[d.windowEnd:], b)
+
+	// Calculate 256 hashes at a time (more L1 cache hits)
+	loops := (n + 256 - minMatchLength) / 256
+	for j := 0; j < loops; j++ {
+		startindex := j * 256
+		end := startindex + 256 + minMatchLength - 1
+		if end > n {
+			end = n
+		}
+		tocheck := d.window[startindex:end]
+		dstSize := len(tocheck) - minMatchLength + 1
+
+		if dstSize <= 0 {
+			continue
+		}
+
+		dst := s.hashMatch[:dstSize]
+		bulkHash4(tocheck, dst)
+		var newH uint32
+		for i, val := range dst {
+			di := i + startindex
+			newH = val & hashMask
+			// Get previous value with the same hash.
+			// Our chain should point to the previous value.
+			s.hashPrev[di&windowMask] = s.hashHead[newH]
+			// Set the head of the hash chain to us.
+			s.hashHead[newH] = uint32(di + s.hashOffset)
+		}
+	}
+	// Update window information.
+	d.windowEnd += n
+	s.index = n
+}
+
+// Try to find a match starting at pos whose length is greater than the
+// current best match. We only look at d.chain possibilities before giving up.
+// pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead
+func (d *compressor) findMatch(pos int, prevHead int, lookahead int) (length, offset int, ok bool) {
+	minMatchLook := maxMatchLength
+	if lookahead < minMatchLook {
+		minMatchLook = lookahead
+	}
+
+	win := d.window[0 : pos+minMatchLook]
+
+	// We quit when we get a match that's at least nice long
+	nice := len(win) - pos
+	if d.nice < nice {
+		nice = d.nice
+	}
+
+	// If we've got a match that's good enough, only look in 1/4 the chain.
+	tries := d.chain
+	length = minMatchLength - 1
+
+	wEnd := win[pos+length]
+	wPos := win[pos:]
+	minIndex := pos - windowSize
+	if minIndex < 0 {
+		minIndex = 0
+	}
+	offset = 0
+
+	if d.chain < 100 {
+		for i := prevHead; tries > 0; tries-- {
+			if wEnd == win[i+length] {
+				n := matchLen(win[i:i+minMatchLook], wPos)
+				if n > length {
+					length = n
+					offset = pos - i
+					ok = true
+					if n >= nice {
+						// The match is good enough that we don't try to find a better one.
+						break
+					}
+					wEnd = win[pos+n]
+				}
+			}
+			if i <= minIndex {
+				// hashPrev[i & windowMask] has already been overwritten, so stop now.
+				break
+			}
+			i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
+			if i < minIndex {
+				break
+			}
+		}
+		return
+	}
+
+	// Minimum gain to accept a match.
+	cGain := 4
+
+	// Some like it higher (CSV), some like it lower (JSON)
+	const baseCost = 3
+	// Base is 4 bytes with an additional cost.
+	// Matches must be better than this.
+
+	for i := prevHead; tries > 0; tries-- {
+		if wEnd == win[i+length] {
+			n := matchLen(win[i:i+minMatchLook], wPos)
+			if n > length {
+				// Calculate gain. Estimate
+				newGain := d.h.bitLengthRaw(wPos[:n]) - int(offsetExtraBits[offsetCode(uint32(pos-i))]) - baseCost - int(lengthExtraBits[lengthCodes[(n-3)&255]])
+
+				//fmt.Println("gain:", newGain, "prev:", cGain, "raw:", d.h.bitLengthRaw(wPos[:n]), "this-len:", n, "prev-len:", length)
+				if newGain > cGain {
+					length = n
+					offset = pos - i
+					cGain = newGain
+					ok = true
+					if n >= nice {
+						// The match is good enough that we don't try to find a better one.
+						break
+					}
+					wEnd = win[pos+n]
+				}
+			}
+		}
+		if i <= minIndex {
+			// hashPrev[i & windowMask] has already been overwritten, so stop now.
+			break
+		}
+		i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
+		if i < minIndex {
+			break
+		}
+	}
+	return
+}
+
+func (d *compressor) writeStoredBlock(buf []byte) error {
+	if d.w.writeStoredHeader(len(buf), false); d.w.err != nil {
+		return d.w.err
+	}
+	d.w.writeBytes(buf)
+	return d.w.err
+}
+
+// hash4 returns a hash representation of the first 4 bytes
+// of the supplied slice.
+// The caller must ensure that len(b) >= 4.
+func hash4(b []byte) uint32 {
+	return hash4u(binary.LittleEndian.Uint32(b), hashBits)
+}
+
+// hash4u returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4u(u uint32, h uint8) uint32 {
+	return (u * prime4bytes) >> (32 - h)
+}
+
+// bulkHash4 will compute hashes using the same
+// algorithm as hash4
+func bulkHash4(b []byte, dst []uint32) {
+	if len(b) < 4 {
+		return
+	}
+	hb := binary.LittleEndian.Uint32(b)
+
+	dst[0] = hash4u(hb, hashBits)
+	end := len(b) - 4 + 1
+	for i := 1; i < end; i++ {
+		hb = (hb >> 8) | uint32(b[i+3])<<24
+		dst[i] = hash4u(hb, hashBits)
+	}
+}
+
+func (d *compressor) initDeflate() {
+	d.window = make([]byte, 2*windowSize)
+	d.byteAvailable = false
+	d.err = nil
+	if d.state == nil {
+		return
+	}
+	s := d.state
+	s.index = 0
+	s.hashOffset = 1
+	s.length = minMatchLength - 1
+	s.offset = 0
+	s.chainHead = -1
+}
+
+// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever,
+// meaning it always has lazy matching on.
+func (d *compressor) deflateLazy() {
+	s := d.state
+	// Sanity enables additional runtime tests.
+	// It's intended to be used during development
+	// to supplement the currently ad-hoc unit tests.
+	const sanity = debugDeflate
+
+	if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync {
+		return
+	}
+	if d.windowEnd != s.index && d.chain > 100 {
+		// Get literal huffman coder.
+		if d.h == nil {
+			d.h = newHuffmanEncoder(maxFlateBlockTokens)
+		}
+		var tmp [256]uint16
+		for _, v := range d.window[s.index:d.windowEnd] {
+			tmp[v]++
+		}
+		d.h.generate(tmp[:], 15)
+	}
+
+	s.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
+
+	for {
+		if sanity && s.index > d.windowEnd {
+			panic("index > windowEnd")
+		}
+		lookahead := d.windowEnd - s.index
+		if lookahead < minMatchLength+maxMatchLength {
+			if !d.sync {
+				return
+			}
+			if sanity && s.index > d.windowEnd {
+				panic("index > windowEnd")
+			}
+			if lookahead == 0 {
+				// Flush current output block if any.
+				if d.byteAvailable {
+					// There is still one pending token that needs to be flushed
+					d.tokens.AddLiteral(d.window[s.index-1])
+					d.byteAvailable = false
+				}
+				if d.tokens.n > 0 {
+					if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+						return
+					}
+					d.tokens.Reset()
+				}
+				return
+			}
+		}
+		if s.index < s.maxInsertIndex {
+			// Update the hash
+			hash := hash4(d.window[s.index:])
+			ch := s.hashHead[hash]
+			s.chainHead = int(ch)
+			s.hashPrev[s.index&windowMask] = ch
+			s.hashHead[hash] = uint32(s.index + s.hashOffset)
+		}
+		prevLength := s.length
+		prevOffset := s.offset
+		s.length = minMatchLength - 1
+		s.offset = 0
+		minIndex := s.index - windowSize
+		if minIndex < 0 {
+			minIndex = 0
+		}
+
+		if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
+			if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, lookahead); ok {
+				s.length = newLength
+				s.offset = newOffset
+			}
+		}
+
+		if prevLength >= minMatchLength && s.length <= prevLength {
+			// No better match, but check for better match at end...
+			//
+			// Skip forward a number of bytes.
+			// Offset of 2 seems to yield best results. 3 is sometimes better.
+			const checkOff = 2
+
+			// Check all, except full length
+			if prevLength < maxMatchLength-checkOff {
+				prevIndex := s.index - 1
+				if prevIndex+prevLength < s.maxInsertIndex {
+					end := lookahead
+					if lookahead > maxMatchLength+checkOff {
+						end = maxMatchLength + checkOff
+					}
+					end += prevIndex
+
+					// Hash at match end.
+					h := hash4(d.window[prevIndex+prevLength:])
+					ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength
+					if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff {
+						length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:])
+						// It seems like a pure length metric is best.
+						if length > prevLength {
+							prevLength = length
+							prevOffset = prevIndex - ch2
+
+							// Extend back...
+							for i := checkOff - 1; i >= 0; i-- {
+								if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i] {
+									// Emit tokens we "owe"
+									for j := 0; j <= i; j++ {
+										d.tokens.AddLiteral(d.window[prevIndex+j])
+										if d.tokens.n == maxFlateBlockTokens {
+											// The block includes the current character
+											if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+												return
+											}
+											d.tokens.Reset()
+										}
+										s.index++
+										if s.index < s.maxInsertIndex {
+											h := hash4(d.window[s.index:])
+											ch := s.hashHead[h]
+											s.chainHead = int(ch)
+											s.hashPrev[s.index&windowMask] = ch
+											s.hashHead[h] = uint32(s.index + s.hashOffset)
+										}
+									}
+									break
+								} else {
+									prevLength++
+								}
+							}
+						} else if false {
+							// Check one further ahead.
+							// Only rarely better, disabled for now.
+							prevIndex++
+							h := hash4(d.window[prevIndex+prevLength:])
+							ch2 := int(s.hashHead[h]) - s.hashOffset - prevLength
+							if prevIndex-ch2 != prevOffset && ch2 > minIndex+checkOff {
+								length := matchLen(d.window[prevIndex+checkOff:end], d.window[ch2+checkOff:])
+								// It seems like a pure length metric is best.
+								if length > prevLength+checkOff {
+									prevLength = length
+									prevOffset = prevIndex - ch2
+									prevIndex--
+
+									// Extend back...
+									for i := checkOff; i >= 0; i-- {
+										if prevLength >= maxMatchLength || d.window[prevIndex+i] != d.window[ch2+i-1] {
+											// Emit tokens we "owe"
+											for j := 0; j <= i; j++ {
+												d.tokens.AddLiteral(d.window[prevIndex+j])
+												if d.tokens.n == maxFlateBlockTokens {
+													// The block includes the current character
+													if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+														return
+													}
+													d.tokens.Reset()
+												}
+												s.index++
+												if s.index < s.maxInsertIndex {
+													h := hash4(d.window[s.index:])
+													ch := s.hashHead[h]
+													s.chainHead = int(ch)
+													s.hashPrev[s.index&windowMask] = ch
+													s.hashHead[h] = uint32(s.index + s.hashOffset)
+												}
+											}
+											break
+										} else {
+											prevLength++
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+			}
+			// There was a match at the previous step, and the current match is
+			// not better. Output the previous match.
+			d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
+
+			// Insert in the hash table all strings up to the end of the match.
+			// index and index-1 are already inserted. If there is not enough
+			// lookahead, the last two strings are not inserted into the hash
+			// table.
+			newIndex := s.index + prevLength - 1
+			// Calculate missing hashes
+			end := newIndex
+			if end > s.maxInsertIndex {
+				end = s.maxInsertIndex
+			}
+			end += minMatchLength - 1
+			startindex := s.index + 1
+			if startindex > s.maxInsertIndex {
+				startindex = s.maxInsertIndex
+			}
+			tocheck := d.window[startindex:end]
+			dstSize := len(tocheck) - minMatchLength + 1
+			if dstSize > 0 {
+				dst := s.hashMatch[:dstSize]
+				bulkHash4(tocheck, dst)
+				var newH uint32
+				for i, val := range dst {
+					di := i + startindex
+					newH = val & hashMask
+					// Get previous value with the same hash.
+					// Our chain should point to the previous value.
+					s.hashPrev[di&windowMask] = s.hashHead[newH]
+					// Set the head of the hash chain to us.
+					s.hashHead[newH] = uint32(di + s.hashOffset)
+				}
+			}
+
+			s.index = newIndex
+			d.byteAvailable = false
+			s.length = minMatchLength - 1
+			if d.tokens.n == maxFlateBlockTokens {
+				// The block includes the current character
+				if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+					return
+				}
+				d.tokens.Reset()
+			}
+			s.ii = 0
+		} else {
+			// Reset, if we got a match this run.
+			if s.length >= minMatchLength {
+				s.ii = 0
+			}
+			// We have a byte waiting. Emit it.
+			if d.byteAvailable {
+				s.ii++
+				d.tokens.AddLiteral(d.window[s.index-1])
+				if d.tokens.n == maxFlateBlockTokens {
+					if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+						return
+					}
+					d.tokens.Reset()
+				}
+				s.index++
+
+				// If we have a long run of no matches, skip additional bytes
+				// Resets when s.ii overflows after 64KB.
+				if n := int(s.ii) - d.chain; n > 0 {
+					n = 1 + int(n>>6)
+					for j := 0; j < n; j++ {
+						if s.index >= d.windowEnd-1 {
+							break
+						}
+						d.tokens.AddLiteral(d.window[s.index-1])
+						if d.tokens.n == maxFlateBlockTokens {
+							if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+								return
+							}
+							d.tokens.Reset()
+						}
+						// Index...
+						if s.index < s.maxInsertIndex {
+							h := hash4(d.window[s.index:])
+							ch := s.hashHead[h]
+							s.chainHead = int(ch)
+							s.hashPrev[s.index&windowMask] = ch
+							s.hashHead[h] = uint32(s.index + s.hashOffset)
+						}
+						s.index++
+					}
+					// Flush last byte
+					d.tokens.AddLiteral(d.window[s.index-1])
+					d.byteAvailable = false
+					// s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength
+					if d.tokens.n == maxFlateBlockTokens {
+						if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+							return
+						}
+						d.tokens.Reset()
+					}
+				}
+			} else {
+				s.index++
+				d.byteAvailable = true
+			}
+		}
+	}
+}
+
+func (d *compressor) store() {
+	if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) {
+		d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+		d.windowEnd = 0
+	}
+}
+
+// fillBlock will fill the buffer with data for huffman-only compression.
+// The number of bytes copied is returned.
+func (d *compressor) fillBlock(b []byte) int {
+	n := copy(d.window[d.windowEnd:], b)
+	d.windowEnd += n
+	return n
+}
+
+// storeHuff will compress and store the currently added data,
+// if enough has been accumulated or we are at the end of the stream.
+// Any error that occurred will be in d.err.
+func (d *compressor) storeHuff() {
+	if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 {
+		return
+	}
+	d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync)
+	d.err = d.w.err
+	d.windowEnd = 0
+}
+
+// storeFast will compress and store the currently added data,
+// if enough has been accumulated or we are at the end of the stream.
+// Any error that occurred will be in d.err.
+func (d *compressor) storeFast() {
+	// We only compress if we have maxStoreBlockSize.
+	if d.windowEnd < len(d.window) {
+		if !d.sync {
+			return
+		}
+		// Handle extremely small sizes.
+		if d.windowEnd < 128 {
+			if d.windowEnd == 0 {
+				return
+			}
+			if d.windowEnd <= 32 {
+				d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+			} else {
+				d.w.writeBlockHuff(false, d.window[:d.windowEnd], true)
+				d.err = d.w.err
+			}
+			d.tokens.Reset()
+			d.windowEnd = 0
+			d.fast.Reset()
+			return
+		}
+	}
+
+	d.fast.Encode(&d.tokens, d.window[:d.windowEnd])
+	// If we made zero matches, store the block as is.
+	if d.tokens.n == 0 {
+		d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+		// If we removed less than 1/16th, huffman compress the block.
+	} else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) {
+		d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync)
+		d.err = d.w.err
+	} else {
+		d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync)
+		d.err = d.w.err
+	}
+	d.tokens.Reset()
+	d.windowEnd = 0
+}
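+
+// To summarize the selection above: zero matches produce a stored block;
+// a token count above 15/16ths of the input (savings under 1/16th) falls
+// back to a Huffman-only block; anything better gets a dynamic block.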
+
+// write will add input byte to the stream.
+// Unless an error occurs all bytes will be consumed.
+func (d *compressor) write(b []byte) (n int, err error) {
+	if d.err != nil {
+		return 0, d.err
+	}
+	n = len(b)
+	for len(b) > 0 {
+		if d.windowEnd == len(d.window) || d.sync {
+			d.step(d)
+		}
+		b = b[d.fill(d, b):]
+		if d.err != nil {
+			return 0, d.err
+		}
+	}
+	return n, d.err
+}
+
+func (d *compressor) syncFlush() error {
+	d.sync = true
+	if d.err != nil {
+		return d.err
+	}
+	d.step(d)
+	if d.err == nil {
+		d.w.writeStoredHeader(0, false)
+		d.w.flush()
+		d.err = d.w.err
+	}
+	d.sync = false
+	return d.err
+}
+
+func (d *compressor) init(w io.Writer, level int) (err error) {
+	d.w = newHuffmanBitWriter(w)
+
+	switch {
+	case level == NoCompression:
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).store
+	case level == ConstantCompression:
+		d.w.logNewTablePenalty = 10
+		d.window = make([]byte, 32<<10)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).storeHuff
+	case level == DefaultCompression:
+		level = 5
+		fallthrough
+	case level >= 1 && level <= 6:
+		d.w.logNewTablePenalty = 7
+		d.fast = newFastEnc(level)
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).storeFast
+	case 7 <= level && level <= 9:
+		d.w.logNewTablePenalty = 8
+		d.state = &advancedState{}
+		d.compressionLevel = levels[level]
+		d.initDeflate()
+		d.fill = (*compressor).fillDeflate
+		d.step = (*compressor).deflateLazy
+	case -level >= MinCustomWindowSize && -level <= MaxCustomWindowSize:
+		d.w.logNewTablePenalty = 7
+		d.fast = &fastEncL5Window{maxOffset: int32(-level), cur: maxStoreBlockSize}
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).storeFast
+	default:
+		return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level)
+	}
+	d.level = level
+	return nil
+}
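+
+// Summary of the dispatch above: level 0 stores raw blocks, level -2 is
+// Huffman-only, levels 1-6 use the table-based fast encoders, levels 7-9 use
+// lazy matching, and levels at or below -MinCustomWindowSize select a level 5
+// encoder with a custom window of -level bytes.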
+
+// reset the state of the compressor.
+func (d *compressor) reset(w io.Writer) {
+	d.w.reset(w)
+	d.sync = false
+	d.err = nil
+	// We only need to reset a few things for Snappy.
+	if d.fast != nil {
+		d.fast.Reset()
+		d.windowEnd = 0
+		d.tokens.Reset()
+		return
+	}
+	switch d.compressionLevel.chain {
+	case 0:
+		// level was NoCompression or ConstantCompression.
+		d.windowEnd = 0
+	default:
+		s := d.state
+		s.chainHead = -1
+		for i := range s.hashHead {
+			s.hashHead[i] = 0
+		}
+		for i := range s.hashPrev {
+			s.hashPrev[i] = 0
+		}
+		s.hashOffset = 1
+		s.index, d.windowEnd = 0, 0
+		d.blockStart, d.byteAvailable = 0, false
+		d.tokens.Reset()
+		s.length = minMatchLength - 1
+		s.offset = 0
+		s.ii = 0
+		s.maxInsertIndex = 0
+	}
+}
+
+func (d *compressor) close() error {
+	if d.err != nil {
+		return d.err
+	}
+	d.sync = true
+	d.step(d)
+	if d.err != nil {
+		return d.err
+	}
+	if d.w.writeStoredHeader(0, true); d.w.err != nil {
+		return d.w.err
+	}
+	d.w.flush()
+	d.w.reset(nil)
+	return d.w.err
+}
+
+// NewWriter returns a new Writer compressing data at the given level.
+// Following zlib, levels range from 1 (BestSpeed) to 9 (BestCompression);
+// higher levels typically run slower but compress more.
+// Level 0 (NoCompression) does not attempt any compression; it only adds the
+// necessary DEFLATE framing.
+// Level -1 (DefaultCompression) uses the default compression level.
+// Level -2 (ConstantCompression) will use Huffman compression only, giving
+// a very fast compression for all types of input, but sacrificing considerable
+// compression efficiency.
+//
+// If level is in the range [-2, 9] then the error returned will be nil.
+// Otherwise the error returned will be non-nil.
+func NewWriter(w io.Writer, level int) (*Writer, error) {
+	var dw Writer
+	if err := dw.d.init(w, level); err != nil {
+		return nil, err
+	}
+	return &dw, nil
+}
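+
+// A minimal round-trip sketch, assuming this package's NewReader as the
+// matching decompressor; error handling elided:
+//
+//	var buf bytes.Buffer
+//	zw, err := NewWriter(&buf, BestSpeed)
+//	if err != nil {
+//		panic(err) // only possible for levels outside [-2, 9]
+//	}
+//	zw.Write([]byte("hello hello hello hello"))
+//	zw.Close() // emits the final block and EOF marker
+//	zr := NewReader(&buf)
+//	plain, _ := io.ReadAll(zr)
+//	zr.Close()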
+
+// NewWriterDict is like NewWriter but initializes the new
+// Writer with a preset dictionary.  The returned Writer behaves
+// as if the dictionary had been written to it without producing
+// any compressed output.  The compressed data written to w
+// can only be decompressed by a Reader initialized with the
+// same dictionary.
+func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
+	zw, err := NewWriter(w, level)
+	if err != nil {
+		return nil, err
+	}
+	zw.d.fillWindow(dict)
+	zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method.
+	return zw, err
+}
+
+// MinCustomWindowSize is the minimum window size that can be sent to NewWriterWindow.
+const MinCustomWindowSize = 32
+
+// MaxCustomWindowSize is the maximum custom window that can be sent to NewWriterWindow.
+const MaxCustomWindowSize = windowSize
+
+// NewWriterWindow returns a new Writer compressing data with a custom window size.
+// windowSize must be from MinCustomWindowSize to MaxCustomWindowSize.
+func NewWriterWindow(w io.Writer, windowSize int) (*Writer, error) {
+	if windowSize < MinCustomWindowSize {
+		return nil, errors.New("flate: requested window size less than MinWindowSize")
+	}
+	if windowSize > MaxCustomWindowSize {
+		return nil, errors.New("flate: requested window size bigger than MaxCustomWindowSize")
+	}
+	var dw Writer
+	if err := dw.d.init(w, -windowSize); err != nil {
+		return nil, err
+	}
+	return &dw, nil
+}
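+
+// A minimal sketch, assuming data is any []byte: a smaller window bounds
+// encoder memory and the maximum match distance, trading some ratio for it.
+//
+//	zw, err := NewWriterWindow(io.Discard, 4<<10) // 4 KiB window
+//	if err != nil {
+//		panic(err) // window outside [MinCustomWindowSize, MaxCustomWindowSize]
+//	}
+//	zw.Write(data)
+//	zw.Close()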
+
+// A Writer takes data written to it and writes the compressed
+// form of that data to an underlying writer (see NewWriter).
+type Writer struct {
+	d    compressor
+	dict []byte
+}
+
+// Write writes data to w, which will eventually write the
+// compressed form of data to its underlying writer.
+func (w *Writer) Write(data []byte) (n int, err error) {
+	return w.d.write(data)
+}
+
+// Flush flushes any pending data to the underlying writer.
+// It is useful mainly in compressed network protocols, to ensure that
+// a remote reader has enough data to reconstruct a packet.
+// Flush does not return until the data has been written.
+// Calling Flush when there is no pending data still causes the Writer
+// to emit a sync marker of at least 4 bytes.
+// If the underlying writer returns an error, Flush returns that error.
+//
+// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH.
+func (w *Writer) Flush() error {
+	// For more about flushing:
+	// http://www.bolet.org/~pornin/deflate-flush.html
+	return w.d.syncFlush()
+}
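+
+// A minimal protocol sketch, assuming conn is a net.Conn and msg a framed
+// []byte: Flush guarantees the peer can decode everything written so far.
+//
+//	zw, _ := NewWriter(conn, DefaultCompression)
+//	zw.Write(msg)
+//	if err := zw.Flush(); err != nil { // equivalent to zlib Z_SYNC_FLUSH
+//		// handle write error
+//	}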
+
+// Close flushes and closes the writer.
+func (w *Writer) Close() error {
+	return w.d.close()
+}
+
+// Reset discards the writer's state and makes it equivalent to
+// the result of NewWriter or NewWriterDict called with dst
+// and w's level and dictionary.
+func (w *Writer) Reset(dst io.Writer) {
+	if len(w.dict) > 0 {
+		// w was created with NewWriterDict
+		w.d.reset(dst)
+		if dst != nil {
+			w.d.fillWindow(w.dict)
+		}
+	} else {
+		// w was created with NewWriter
+		w.d.reset(dst)
+	}
+}
+
+// ResetDict discards the writer's state and makes it equivalent to
+// the result of NewWriter or NewWriterDict called with dst
+// and w's level, but sets a specific dictionary.
+func (w *Writer) ResetDict(dst io.Writer, dict []byte) {
+	w.dict = dict
+	w.d.reset(dst)
+	w.d.fillWindow(w.dict)
+}
diff --git a/vendor/github.com/klauspost/compress/flate/dict_decoder.go b/vendor/github.com/klauspost/compress/flate/dict_decoder.go
new file mode 100644
index 0000000..bb36351
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/dict_decoder.go
@@ -0,0 +1,184 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// dictDecoder implements the LZ77 sliding dictionary as used in decompression.
+// LZ77 decompresses data through sequences of two forms of commands:
+//
+//   - Literal insertions: Runs of one or more symbols are inserted into the data
+//     stream as is. This is accomplished through the writeByte method for a
+//     single symbol, or combinations of writeSlice/writeMark for multiple symbols.
+//     Any valid stream must start with a literal insertion if no preset dictionary
+//     is used.
+//
+//   - Backward copies: Runs of one or more symbols are copied from previously
+//     emitted data. Backward copies come as the tuple (dist, length) where dist
+//     determines how far back in the stream to copy from and length determines how
+//     many bytes to copy. Note that it is valid for the length to be greater than
+//     the distance. Since LZ77 uses forward copies, that situation is used to
+//     perform a form of run-length encoding on repeated runs of symbols.
+//     The writeCopy and tryWriteCopy methods are used to implement this command.
+//
+// For performance reasons, this implementation performs little to no sanity
+// checking of the arguments. As such, the invariants documented for each
+// method call must be respected.
+type dictDecoder struct {
+	hist []byte // Sliding window history
+
+	// Invariant: 0 <= rdPos <= wrPos <= len(hist)
+	wrPos int  // Current output position in buffer
+	rdPos int  // Have emitted hist[:rdPos] already
+	full  bool // Has a full window length been written yet?
+}
+
+// init initializes dictDecoder to have a sliding window dictionary of the given
+// size. If a preset dict is provided, it will initialize the dictionary with
+// the contents of dict.
+func (dd *dictDecoder) init(size int, dict []byte) {
+	*dd = dictDecoder{hist: dd.hist}
+
+	if cap(dd.hist) < size {
+		dd.hist = make([]byte, size)
+	}
+	dd.hist = dd.hist[:size]
+
+	if len(dict) > len(dd.hist) {
+		dict = dict[len(dict)-len(dd.hist):]
+	}
+	dd.wrPos = copy(dd.hist, dict)
+	if dd.wrPos == len(dd.hist) {
+		dd.wrPos = 0
+		dd.full = true
+	}
+	dd.rdPos = dd.wrPos
+}
+
+// histSize reports the total amount of historical data in the dictionary.
+func (dd *dictDecoder) histSize() int {
+	if dd.full {
+		return len(dd.hist)
+	}
+	return dd.wrPos
+}
+
+// availRead reports the number of bytes that can be flushed by readFlush.
+func (dd *dictDecoder) availRead() int {
+	return dd.wrPos - dd.rdPos
+}
+
+// availWrite reports the available amount of output buffer space.
+func (dd *dictDecoder) availWrite() int {
+	return len(dd.hist) - dd.wrPos
+}
+
+// writeSlice returns a slice of the available buffer to write data to.
+//
+// This invariant will be kept: len(s) <= availWrite()
+func (dd *dictDecoder) writeSlice() []byte {
+	return dd.hist[dd.wrPos:]
+}
+
+// writeMark advances the writer pointer by cnt.
+//
+// This invariant must be kept: 0 <= cnt <= availWrite()
+func (dd *dictDecoder) writeMark(cnt int) {
+	dd.wrPos += cnt
+}
+
+// writeByte writes a single byte to the dictionary.
+//
+// This invariant must be kept: 0 < availWrite()
+func (dd *dictDecoder) writeByte(c byte) {
+	dd.hist[dd.wrPos] = c
+	dd.wrPos++
+}
+
+// writeCopy copies a string at a given (dist, length) to the output.
+// This returns the number of bytes copied and may be less than the requested
+// length if the available space in the output buffer is too small.
+//
+// This invariant must be kept: 0 < dist <= histSize()
+func (dd *dictDecoder) writeCopy(dist, length int) int {
+	dstBase := dd.wrPos
+	dstPos := dstBase
+	srcPos := dstPos - dist
+	endPos := dstPos + length
+	if endPos > len(dd.hist) {
+		endPos = len(dd.hist)
+	}
+
+	// Copy non-overlapping section after destination position.
+	//
+	// This section is non-overlapping in that the copy length for this section
+	// is always less than or equal to the backwards distance. This can occur
+	// if a distance refers to data that wraps around in the buffer.
+	// Thus, a backwards copy is performed here; that is, the exact bytes in
+	// the source prior to the copy are placed in the destination.
+	if srcPos < 0 {
+		srcPos += len(dd.hist)
+		dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:])
+		srcPos = 0
+	}
+
+	// Copy possibly overlapping section before destination position.
+	//
+	// This section can overlap if the copy length for this section is larger
+	// than the backwards distance. This is allowed by LZ77 so that repeated
+	// strings can be succinctly represented using (dist, length) pairs.
+	// Thus, a forwards copy is performed here; that is, the bytes copied are
+	// possibly dependent on the resulting bytes in the destination as the copy
+	// progresses along. This is functionally equivalent to the following:
+	//
+	//	for i := 0; i < endPos-dstPos; i++ {
+	//		dd.hist[dstPos+i] = dd.hist[srcPos+i]
+	//	}
+	//	dstPos = endPos
+	//
+	for dstPos < endPos {
+		dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos])
+	}
+
+	dd.wrPos = dstPos
+	return dstPos - dstBase
+}
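+
+// For example, with "ab" as the two most recent bytes in the window,
+// writeCopy(2, 6) copies forward past the write position and produces
+// "ababab": a length greater than the distance acts as run-length encoding.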
+
+// tryWriteCopy tries to copy a string at a given (distance, length) to the
+// output. This specialized version is optimized for short distances.
+//
+// This method is designed to be inlined for performance reasons.
+//
+// This invariant must be kept: 0 < dist <= histSize()
+func (dd *dictDecoder) tryWriteCopy(dist, length int) int {
+	dstPos := dd.wrPos
+	endPos := dstPos + length
+	if dstPos < dist || endPos > len(dd.hist) {
+		return 0
+	}
+	dstBase := dstPos
+	srcPos := dstPos - dist
+
+	// Copy possibly overlapping section before destination position.
+loop:
+	dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos])
+	if dstPos < endPos {
+		goto loop // Avoid for-loop so that this function can be inlined
+	}
+
+	dd.wrPos = dstPos
+	return dstPos - dstBase
+}
+
+// readFlush returns a slice of the historical buffer that is ready to be
+// emitted to the user. The data returned by readFlush must be fully consumed
+// before calling any other dictDecoder methods.
+func (dd *dictDecoder) readFlush() []byte {
+	toRead := dd.hist[dd.rdPos:dd.wrPos]
+	dd.rdPos = dd.wrPos
+	if dd.wrPos == len(dd.hist) {
+		dd.wrPos, dd.rdPos = 0, 0
+		dd.full = true
+	}
+	return toRead
+}
diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
new file mode 100644
index 0000000..0e8b163
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@@ -0,0 +1,232 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Modified for deflate by Klaus Post (c) 2015.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"fmt"
+	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
+)
+
+type fastEnc interface {
+	Encode(dst *tokens, src []byte)
+	Reset()
+}
+
+func newFastEnc(level int) fastEnc {
+	switch level {
+	case 1:
+		return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 2:
+		return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 3:
+		return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 4:
+		return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 5:
+		return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 6:
+		return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}}
+	default:
+		panic("invalid level specified")
+	}
+}
+
+const (
+	tableBits       = 15             // Bits used in the table
+	tableSize       = 1 << tableBits // Size of the table
+	tableShift      = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
+	baseMatchOffset = 1              // The smallest match offset
+	baseMatchLength = 3              // The smallest match length per the RFC section 3.2.5
+	maxMatchOffset  = 1 << 15        // The largest match offset
+
+	bTableBits   = 17                                               // Bits used in the big tables
+	bTableSize   = 1 << bTableBits                                  // Size of the table
+	allocHistory = maxStoreBlockSize * 5                            // Size to preallocate for history.
+	bufferReset  = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this.
+)
+
+const (
+	prime3bytes = 506832829
+	prime4bytes = 2654435761
+	prime5bytes = 889523592379
+	prime6bytes = 227718039650203
+	prime7bytes = 58295818150454627
+	prime8bytes = 0xcf1bbcdcb7a56463
+)
+
+func load3232(b []byte, i int32) uint32 {
+	return le.Load32(b, i)
+}
+
+func load6432(b []byte, i int32) uint64 {
+	return le.Load64(b, i)
+}
+
+type tableEntry struct {
+	offset int32
+}
+
+// fastGen maintains the table for matches,
+// and the previous byte block for level 2.
+// This is the generic implementation.
+type fastGen struct {
+	hist []byte
+	cur  int32
+}
+
+func (e *fastGen) addBlock(src []byte) int32 {
+	// check if we have space already
+	if len(e.hist)+len(src) > cap(e.hist) {
+		if cap(e.hist) == 0 {
+			e.hist = make([]byte, 0, allocHistory)
+		} else {
+			if cap(e.hist) < maxMatchOffset*2 {
+				panic("unexpected buffer size")
+			}
+			// Move down
+			offset := int32(len(e.hist)) - maxMatchOffset
+			// copy(e.hist[0:maxMatchOffset], e.hist[offset:])
+			*(*[maxMatchOffset]byte)(e.hist) = *(*[maxMatchOffset]byte)(e.hist[offset:])
+			e.cur += offset
+			e.hist = e.hist[:maxMatchOffset]
+		}
+	}
+	s := int32(len(e.hist))
+	e.hist = append(e.hist, src...)
+	return s
+}
+
+type tableEntryPrev struct {
+	Cur  tableEntry
+	Prev tableEntry
+}
+
+// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash7(u uint64, h uint8) uint32 {
+	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & reg8SizeMask64))
+}
+
+// hashLen returns a hash of the lowest mls bytes of u, with length output bits.
+// mls must be >=3 and <=8. Any other value will return a hash for 4 bytes.
+// length should always be < 32.
+// Preferably length and mls should be a constant for inlining.
+func hashLen(u uint64, length, mls uint8) uint32 {
+	switch mls {
+	case 3:
+		return (uint32(u<<8) * prime3bytes) >> (32 - length)
+	case 5:
+		return uint32(((u << (64 - 40)) * prime5bytes) >> (64 - length))
+	case 6:
+		return uint32(((u << (64 - 48)) * prime6bytes) >> (64 - length))
+	case 7:
+		return uint32(((u << (64 - 56)) * prime7bytes) >> (64 - length))
+	case 8:
+		return uint32((u * prime8bytes) >> (64 - length))
+	default:
+		return (uint32(u) * prime4bytes) >> (32 - length)
+	}
+}
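+
+// For instance, with mls=4 only the low 4 bytes of u contribute, so these
+// two inputs collide by construction (illustrative values):
+//
+//	h1 := hashLen(0x1122334400ddccbb, 15, 4)
+//	h2 := hashLen(0x9988776600ddccbb, 15, 4)
+//	// h1 == h2: uint32(u) truncates to the low 32 bits before hashing.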
+
+// matchlen will return the match length between offsets s and t in src.
+// The maximum length returned is maxMatchLength - 4.
+// It is assumed that s > t, that t >= 0 and that s < len(src).
+func (e *fastGen) matchlen(s, t int, src []byte) int32 {
+	if debugDeflate {
+		if t >= s {
+			panic(fmt.Sprint("t >=s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > maxMatchOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
+		}
+	}
+	s1 := min(s+maxMatchLength-4, len(src))
+	left := s1 - s
+	n := int32(0)
+	for left >= 8 {
+		diff := le.Load64(src, s) ^ le.Load64(src, t)
+		if diff != 0 {
+			return n + int32(bits.TrailingZeros64(diff)>>3)
+		}
+		s += 8
+		t += 8
+		n += 8
+		left -= 8
+	}
+
+	a := src[s:s1]
+	b := src[t:]
+	for i := range a {
+		if a[i] != b[i] {
+			break
+		}
+		n++
+	}
+	return n
+}
+
+// matchlenLong will return the match length between offsets s and t in src.
+// It is assumed that s > t, that t >= 0 and that s < len(src).
+func (e *fastGen) matchlenLong(s, t int, src []byte) int32 {
+	if debugDeflate {
+		if t >= s {
+			panic(fmt.Sprint("t >=s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > maxMatchOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
+		}
+	}
+	// Extend the match to be as long as possible.
+	left := len(src) - s
+	n := int32(0)
+	for left >= 8 {
+		diff := le.Load64(src, s) ^ le.Load64(src, t)
+		if diff != 0 {
+			return n + int32(bits.TrailingZeros64(diff)>>3)
+		}
+		s += 8
+		t += 8
+		n += 8
+		left -= 8
+	}
+
+	a := src[s:]
+	b := src[t:]
+	for i := range a {
+		if a[i] != b[i] {
+			break
+		}
+		n++
+	}
+	return n
+}
+
+// Reset the encoding table.
+func (e *fastGen) Reset() {
+	if cap(e.hist) < allocHistory {
+		e.hist = make([]byte, 0, allocHistory)
+	}
+	// We offset current position so everything will be out of reach.
+	// If we are above the buffer reset it will be cleared anyway since len(hist) == 0.
+	if e.cur <= bufferReset {
+		e.cur += maxMatchOffset + int32(len(e.hist))
+	}
+	e.hist = e.hist[:0]
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
new file mode 100644
index 0000000..afdc8c0
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -0,0 +1,1183 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"fmt"
+	"io"
+	"math"
+
+	"github.com/klauspost/compress/internal/le"
+)
+
+const (
+	// The largest offset code.
+	offsetCodeCount = 30
+
+	// The special code used to mark the end of a block.
+	endBlockMarker = 256
+
+	// The first length code.
+	lengthCodesStart = 257
+
+	// The number of codegen codes.
+	codegenCodeCount = 19
+	badCode          = 255
+
+	// maxPredefinedTokens is the maximum number of tokens
+	// below which we check whether the fixed encoding is smaller.
+	maxPredefinedTokens = 250
+
+	// bufferFlushSize indicates the buffer size
+	// after which bytes are flushed to the writer.
+	// Should preferably be a multiple of 6, since
+	// we accumulate 6 bytes between writes to the buffer.
+	bufferFlushSize = 246
+)
+
+// Minimum length code that emits bits.
+const lengthExtraBitsMinCode = 8
+
+// The number of extra bits needed by length code X - LENGTH_CODES_START.
+var lengthExtraBits = [32]uint8{
+	/* 257 */ 0, 0, 0,
+	/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
+	/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
+	/* 280 */ 4, 5, 5, 5, 5, 0,
+}
+
+// The length indicated by length code X - LENGTH_CODES_START.
+var lengthBase = [32]uint8{
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 10,
+	12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
+	64, 80, 96, 112, 128, 160, 192, 224, 255,
+}
+
+// Minimum offset code that emits bits.
+const offsetExtraBitsMinCode = 4
+
+// offset code word extra bits.
+var offsetExtraBits = [32]int8{
+	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
+	4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
+	9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+	/* extended window */
+	14, 14,
+}
+
+var offsetCombined = [32]uint32{}
+
+func init() {
+	var offsetBase = [32]uint32{
+		/* normal deflate */
+		0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
+		0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
+		0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
+		0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
+		0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
+		0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
+
+		/* extended window */
+		0x008000, 0x00c000,
+	}
+
+	for i := range offsetCombined[:] {
+		// Don't use extended window values...
+		if offsetExtraBits[i] == 0 || offsetBase[i] > 0x006000 {
+			continue
+		}
+		offsetCombined[i] = uint32(offsetExtraBits[i]) | (offsetBase[i] << 8)
+	}
+}
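+
+// offsetCombined therefore packs the extra-bit count into the low 8 bits and
+// the offset base into the upper bits; writeTokens unpacks them with a uint8
+// conversion and a ">> 8" respectively.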
+
+// The odd order in which the codegen code sizes are written.
+var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
+
+type huffmanBitWriter struct {
+	// writer is the underlying writer.
+	// Do not use it directly; use the write method, which ensures
+	// that Write errors are sticky.
+	writer io.Writer
+
+	// Data waiting to be written is bytes[0:nbytes]
+	// and then the low nbits of bits.
+	bits            uint64
+	nbits           uint8
+	nbytes          uint8
+	lastHuffMan     bool
+	literalEncoding *huffmanEncoder
+	tmpLitEncoding  *huffmanEncoder
+	offsetEncoding  *huffmanEncoder
+	codegenEncoding *huffmanEncoder
+	err             error
+	lastHeader      int
+	// logNewTablePenalty is the log2 of the penalty added to the estimated
+	// size of a new table (size >> logNewTablePenalty); smaller values
+	// penalize new tables more and favor reusing the previous one.
+	logNewTablePenalty uint
+	bytes              [256 + 8]byte
+	literalFreq        [lengthCodesStart + 32]uint16
+	offsetFreq         [32]uint16
+	codegenFreq        [codegenCodeCount]uint16
+
+	// codegen must have an extra space for the final symbol.
+	codegen [literalCount + offsetCodeCount + 1]uint8
+}
+
+// Huffman reuse.
+//
+// The huffmanBitWriter supports reusing huffman tables and thereby combining block sections.
+//
+// This is controlled by several variables:
+//
+// If lastHeader is non-zero the Huffman table can be reused.
+// This also indicates that a Huffman table has been generated that can output all
+// possible symbols.
+// It also indicates that an EOB has not yet been emitted, so if a new table is generated
+// an EOB with the previous table must be written.
+//
+// If lastHuffMan is set, a table for outputting literals has been generated and offsets are invalid.
+//
+// An incoming block estimates the output size of a fresh table by calculating the
+// optimal size and adding a penalty in 'logNewTablePenalty'.
+// A Huffman table is not optimal, which is why we add a penalty; generating a new table
+// is also slower both for compression and decompression.
+
+func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
+	return &huffmanBitWriter{
+		writer:          w,
+		literalEncoding: newHuffmanEncoder(literalCount),
+		tmpLitEncoding:  newHuffmanEncoder(literalCount),
+		codegenEncoding: newHuffmanEncoder(codegenCodeCount),
+		offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
+	}
+}
+
+func (w *huffmanBitWriter) reset(writer io.Writer) {
+	w.writer = writer
+	w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil
+	w.lastHeader = 0
+	w.lastHuffMan = false
+}
+
+func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
+	a := t.offHist[:offsetCodeCount]
+	b := w.offsetEncoding.codes
+	b = b[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].zero() {
+			return false
+		}
+	}
+
+	a = t.extraHist[:literalCount-256]
+	b = w.literalEncoding.codes[256:literalCount]
+	b = b[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].zero() {
+			return false
+		}
+	}
+
+	a = t.litHist[:256]
+	b = w.literalEncoding.codes[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].zero() {
+			return false
+		}
+	}
+	return true
+}
+
+func (w *huffmanBitWriter) flush() {
+	if w.err != nil {
+		w.nbits = 0
+		return
+	}
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
+	n := w.nbytes
+	for w.nbits != 0 {
+		w.bytes[n] = byte(w.bits)
+		w.bits >>= 8
+		if w.nbits > 8 { // Avoid underflow
+			w.nbits -= 8
+		} else {
+			w.nbits = 0
+		}
+		n++
+	}
+	w.bits = 0
+	w.write(w.bytes[:n])
+	w.nbytes = 0
+}
+
+func (w *huffmanBitWriter) write(b []byte) {
+	if w.err != nil {
+		return
+	}
+	_, w.err = w.writer.Write(b)
+}
+
+func (w *huffmanBitWriter) writeBits(b int32, nb uint8) {
+	w.bits |= uint64(b) << (w.nbits & 63)
+	w.nbits += nb
+	if w.nbits >= 48 {
+		w.writeOutBits()
+	}
+}
+
+func (w *huffmanBitWriter) writeBytes(bytes []byte) {
+	if w.err != nil {
+		return
+	}
+	n := w.nbytes
+	if w.nbits&7 != 0 {
+		w.err = InternalError("writeBytes with unfinished bits")
+		return
+	}
+	for w.nbits != 0 {
+		w.bytes[n] = byte(w.bits)
+		w.bits >>= 8
+		w.nbits -= 8
+		n++
+	}
+	if n != 0 {
+		w.write(w.bytes[:n])
+	}
+	w.nbytes = 0
+	w.write(bytes)
+}
+
+// RFC 1951 3.2.7 specifies a special run-length encoding for specifying
+// the literal and offset lengths arrays (which are concatenated into a single
+// array).  This method generates that run-length encoding.
+//
+// The result is written into the codegen array, and the frequencies
+// of each code are written into the codegenFreq array.
+// Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
+// information. Code badCode is an end marker.
+//
+//	numLiterals      The number of literals in literalEncoding
+//	numOffsets       The number of offsets in offsetEncoding
+//	litenc, offenc   The literal and offset encoder to use
+func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) {
+	for i := range w.codegenFreq {
+		w.codegenFreq[i] = 0
+	}
+	// Note that we are using codegen both as a temporary variable for holding
+	// a copy of the frequencies, and as the place where we put the result.
+	// This is fine because the output is always shorter than the input used
+	// so far.
+	codegen := w.codegen[:] // cache
+	// Copy the concatenated code sizes to codegen. Put a marker at the end.
+	cgnl := codegen[:numLiterals]
+	for i := range cgnl {
+		cgnl[i] = litEnc.codes[i].len()
+	}
+
+	cgnl = codegen[numLiterals : numLiterals+numOffsets]
+	for i := range cgnl {
+		cgnl[i] = offEnc.codes[i].len()
+	}
+	codegen[numLiterals+numOffsets] = badCode
+
+	size := codegen[0]
+	count := 1
+	outIndex := 0
+	for inIndex := 1; size != badCode; inIndex++ {
+		// INVARIANT: We have seen "count" copies of size that have not yet
+		// had output generated for them.
+		nextSize := codegen[inIndex]
+		if nextSize == size {
+			count++
+			continue
+		}
+		// We need to generate codegen indicating "count" of size.
+		if size != 0 {
+			codegen[outIndex] = size
+			outIndex++
+			w.codegenFreq[size]++
+			count--
+			for count >= 3 {
+				n := 6
+				if n > count {
+					n = count
+				}
+				codegen[outIndex] = 16
+				outIndex++
+				codegen[outIndex] = uint8(n - 3)
+				outIndex++
+				w.codegenFreq[16]++
+				count -= n
+			}
+		} else {
+			for count >= 11 {
+				n := 138
+				if n > count {
+					n = count
+				}
+				codegen[outIndex] = 18
+				outIndex++
+				codegen[outIndex] = uint8(n - 11)
+				outIndex++
+				w.codegenFreq[18]++
+				count -= n
+			}
+			if count >= 3 {
+				// count >= 3 && count <= 10
+				codegen[outIndex] = 17
+				outIndex++
+				codegen[outIndex] = uint8(count - 3)
+				outIndex++
+				w.codegenFreq[17]++
+				count = 0
+			}
+		}
+		count--
+		for ; count >= 0; count-- {
+			codegen[outIndex] = size
+			outIndex++
+			w.codegenFreq[size]++
+		}
+		// Set up invariant for next time through the loop.
+		size = nextSize
+		count = 1
+	}
+	// Marker indicating the end of the codegen.
+	codegen[outIndex] = badCode
+}
+
+func (w *huffmanBitWriter) codegens() int {
+	numCodegens := len(w.codegenFreq)
+	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
+		numCodegens--
+	}
+	return numCodegens
+}
+
+func (w *huffmanBitWriter) headerSize() (size, numCodegens int) {
+	numCodegens = len(w.codegenFreq)
+	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
+		numCodegens--
+	}
+	return 3 + 5 + 5 + 4 + (3 * numCodegens) +
+		w.codegenEncoding.bitLength(w.codegenFreq[:]) +
+		int(w.codegenFreq[16])*2 +
+		int(w.codegenFreq[17])*3 +
+		int(w.codegenFreq[18])*7, numCodegens
+}
+
+// dynamicReuseSize returns the size of dynamically encoded data in bits
+// when reusing the supplied encodings (excluding any header).
+func (w *huffmanBitWriter) dynamicReuseSize(litEnc, offEnc *huffmanEncoder) (size int) {
+	size = litEnc.bitLength(w.literalFreq[:]) +
+		offEnc.bitLength(w.offsetFreq[:])
+	return size
+}
+
+// dynamicSize returns the size of dynamically encoded data in bits.
+func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) {
+	header, numCodegens := w.headerSize()
+	size = header +
+		litEnc.bitLength(w.literalFreq[:]) +
+		offEnc.bitLength(w.offsetFreq[:]) +
+		extraBits
+	return size, numCodegens
+}
+
+// extraBitSize will return the number of bits that will be written
+// as "extra" bits on matches.
+func (w *huffmanBitWriter) extraBitSize() int {
+	total := 0
+	for i, n := range w.literalFreq[257:literalCount] {
+		total += int(n) * int(lengthExtraBits[i&31])
+	}
+	for i, n := range w.offsetFreq[:offsetCodeCount] {
+		total += int(n) * int(offsetExtraBits[i&31])
+	}
+	return total
+}
+
+// fixedSize returns the size of the fixed-table encoded data in bits.
+func (w *huffmanBitWriter) fixedSize(extraBits int) int {
+	return 3 +
+		fixedLiteralEncoding.bitLength(w.literalFreq[:]) +
+		fixedOffsetEncoding.bitLength(w.offsetFreq[:]) +
+		extraBits
+}
+
+// storedSize calculates the stored size, including the 5-byte header.
+// The function returns the size in bits and whether the input
+// fits inside a single stored block.
+func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
+	if in == nil {
+		return 0, false
+	}
+	if len(in) <= maxStoreBlockSize {
+		return (len(in) + 5) * 8, true
+	}
+	return 0, false
+}
+
+func (w *huffmanBitWriter) writeCode(c hcode) {
+	// The function does not get inlined if we "& 63" the shift.
+	w.bits |= c.code64() << (w.nbits & 63)
+	w.nbits += c.len()
+	if w.nbits >= 48 {
+		w.writeOutBits()
+	}
+}
+
+// writeOutBits will write bits to the buffer.
+func (w *huffmanBitWriter) writeOutBits() {
+	bits := w.bits
+	w.bits >>= 48
+	w.nbits -= 48
+	n := w.nbytes
+
+	// We over-write, but faster...
+	le.Store64(w.bytes[n:], bits)
+	n += 6
+
+	if n >= bufferFlushSize {
+		if w.err != nil {
+			n = 0
+			return
+		}
+		w.write(w.bytes[:n])
+		n = 0
+	}
+
+	w.nbytes = n
+}
+
+// Write the header of a dynamic Huffman block to the output stream.
+//
+//	numLiterals  The number of literals specified in codegen
+//	numOffsets   The number of offsets specified in codegen
+//	numCodegens  The number of codegens used in codegen
+func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) {
+	if w.err != nil {
+		return
+	}
+	var firstBits int32 = 4
+	if isEof {
+		firstBits = 5
+	}
+	w.writeBits(firstBits, 3)
+	w.writeBits(int32(numLiterals-257), 5)
+	w.writeBits(int32(numOffsets-1), 5)
+	w.writeBits(int32(numCodegens-4), 4)
+
+	for i := 0; i < numCodegens; i++ {
+		value := uint(w.codegenEncoding.codes[codegenOrder[i]].len())
+		w.writeBits(int32(value), 3)
+	}
+
+	i := 0
+	for {
+		var codeWord = uint32(w.codegen[i])
+		i++
+		if codeWord == badCode {
+			break
+		}
+		w.writeCode(w.codegenEncoding.codes[codeWord])
+
+		switch codeWord {
+		case 16:
+			w.writeBits(int32(w.codegen[i]), 2)
+			i++
+		case 17:
+			w.writeBits(int32(w.codegen[i]), 3)
+			i++
+		case 18:
+			w.writeBits(int32(w.codegen[i]), 7)
+			i++
+		}
+	}
+}
+
+// writeStoredHeader will write a stored header.
+// If the stored block is only used for EOF,
+// it is replaced with a fixed huffman block.
+func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
+	if w.err != nil {
+		return
+	}
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
+
+	// To write EOF, use a fixed encoding block. 10 bits instead of 5 bytes.
+	if length == 0 && isEof {
+		w.writeFixedHeader(isEof)
+		// EOB: 7 bits, value: 0
+		w.writeBits(0, 7)
+		w.flush()
+		return
+	}
+
+	var flag int32
+	if isEof {
+		flag = 1
+	}
+	w.writeBits(flag, 3)
+	w.flush()
+	w.writeBits(int32(length), 16)
+	w.writeBits(int32(^uint16(length)), 16)
+}
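+
+// Per RFC 1951 section 3.2.4 this matches the stored-block layout: the 3-bit
+// BFINAL/BTYPE prefix, padding to a byte boundary (the flush above), then LEN
+// and its one's complement NLEN, each 16 bits little-endian.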
+
+func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
+	if w.err != nil {
+		return
+	}
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
+
+	// Indicate that we are a fixed Huffman block
+	var value int32 = 2
+	if isEof {
+		value = 3
+	}
+	w.writeBits(value, 3)
+}
+
+// writeBlock will write a block of tokens with the smallest encoding.
+// The original input can be supplied, and if the huffman encoded data
+// is larger than the original bytes, the data will be written as a
+// stored block.
+// If the input is nil, the tokens will always be Huffman encoded.
+func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
+	if w.err != nil {
+		return
+	}
+
+	tokens.AddEOB()
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
+	numLiterals, numOffsets := w.indexTokens(tokens, false)
+	w.generate()
+	var extraBits int
+	storedSize, storable := w.storedSize(input)
+	if storable {
+		extraBits = w.extraBitSize()
+	}
+
+	// Figure out smallest code.
+	// Fixed Huffman baseline.
+	var literalEncoding = fixedLiteralEncoding
+	var offsetEncoding = fixedOffsetEncoding
+	var size = math.MaxInt32
+	if tokens.n < maxPredefinedTokens {
+		size = w.fixedSize(extraBits)
+	}
+
+	// Dynamic Huffman?
+	var numCodegens int
+
+	// Generate codegen and codegenFrequencies, which indicates how to encode
+	// the literalEncoding and the offsetEncoding.
+	w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
+	w.codegenEncoding.generate(w.codegenFreq[:], 7)
+	dynamicSize, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
+
+	if dynamicSize < size {
+		size = dynamicSize
+		literalEncoding = w.literalEncoding
+		offsetEncoding = w.offsetEncoding
+	}
+
+	// Stored bytes?
+	if storable && storedSize <= size {
+		w.writeStoredHeader(len(input), eof)
+		w.writeBytes(input)
+		return
+	}
+
+	// Huffman.
+	if literalEncoding == fixedLiteralEncoding {
+		w.writeFixedHeader(eof)
+	} else {
+		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+	}
+
+	// Write the tokens.
+	w.writeTokens(tokens.Slice(), literalEncoding.codes, offsetEncoding.codes)
+}
+
+// writeBlockDynamic encodes a block using a dynamic Huffman table.
+// This should be used if the symbols used have a disproportionate
+// histogram distribution.
+// If input is supplied and the compression savings are below 1/16th of the
+// input size the block is stored.
+func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []byte, sync bool) {
+	if w.err != nil {
+		return
+	}
+
+	sync = sync || eof
+	if sync {
+		tokens.AddEOB()
+	}
+
+	// We cannot reuse a pure huffman table, and must mark it as EOF.
+	if (w.lastHuffMan || eof) && w.lastHeader > 0 {
+		// We will not try to reuse.
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+		w.lastHuffMan = false
+	}
+
+	// fillReuse enables filling of empty values.
+	// This will make encodings always reusable without testing.
+	// However, this does not appear to benefit on most cases.
+	const fillReuse = false
+
+	// Check if we can reuse...
+	if !fillReuse && w.lastHeader > 0 && !w.canReuse(tokens) {
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
+
+	numLiterals, numOffsets := w.indexTokens(tokens, !sync)
+	extraBits := 0
+	ssize, storable := w.storedSize(input)
+
+	const usePrefs = true
+	if storable || w.lastHeader > 0 {
+		extraBits = w.extraBitSize()
+	}
+
+	var size int
+
+	// Check if we should reuse.
+	if w.lastHeader > 0 {
+		// Estimate size for using a new table.
+		// Use the previous header size as the best estimate.
+		newSize := w.lastHeader + tokens.EstimatedBits()
+		newSize += int(w.literalEncoding.codes[endBlockMarker].len()) + newSize>>w.logNewTablePenalty
+
+		// The estimated size is calculated as an optimal table.
+		// We add a penalty to make it more realistic and re-use a bit more.
+		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits
+
+		// Check if a new table is better.
+		if newSize < reuseSize {
+			// Write the EOB we owe.
+			w.writeCode(w.literalEncoding.codes[endBlockMarker])
+			size = newSize
+			w.lastHeader = 0
+		} else {
+			size = reuseSize
+		}
+
+		if tokens.n < maxPredefinedTokens {
+			if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size {
+				// Check if we get a reasonable size decrease.
+				if storable && ssize <= size {
+					w.writeStoredHeader(len(input), eof)
+					w.writeBytes(input)
+					return
+				}
+				w.writeFixedHeader(eof)
+				if !sync {
+					tokens.AddEOB()
+				}
+				w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+				return
+			}
+		}
+		// Check if we get a reasonable size decrease.
+		if storable && ssize <= size {
+			w.writeStoredHeader(len(input), eof)
+			w.writeBytes(input)
+			return
+		}
+	}
+
+	// We want a new block/table
+	if w.lastHeader == 0 {
+		if fillReuse && !sync {
+			w.fillTokens()
+			numLiterals, numOffsets = maxNumLit, maxNumDist
+		} else {
+			w.literalFreq[endBlockMarker] = 1
+		}
+
+		w.generate()
+		// Generate codegen and codegenFrequencies, which indicates how to encode
+		// the literalEncoding and the offsetEncoding.
+		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
+		w.codegenEncoding.generate(w.codegenFreq[:], 7)
+
+		var numCodegens int
+		if fillReuse && !sync {
+			// Reindex for accurate size...
+			w.indexTokens(tokens, true)
+		}
+		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
+
+		// Store predefined, if we don't get a reasonable improvement.
+		if tokens.n < maxPredefinedTokens {
+			if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size {
+				// Store bytes, if we don't get an improvement.
+				if storable && ssize <= preSize {
+					w.writeStoredHeader(len(input), eof)
+					w.writeBytes(input)
+					return
+				}
+				w.writeFixedHeader(eof)
+				if !sync {
+					tokens.AddEOB()
+				}
+				w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+				return
+			}
+		}
+
+		if storable && ssize <= size {
+			// Store bytes, if we don't get an improvement.
+			w.writeStoredHeader(len(input), eof)
+			w.writeBytes(input)
+			return
+		}
+
+		// Write Huffman table.
+		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+		if !sync {
+			w.lastHeader, _ = w.headerSize()
+		}
+		w.lastHuffMan = false
+	}
+
+	if sync {
+		w.lastHeader = 0
+	}
+	// Write the tokens.
+	w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes)
+}
+
+func (w *huffmanBitWriter) fillTokens() {
+	for i, v := range w.literalFreq[:literalCount] {
+		if v == 0 {
+			w.literalFreq[i] = 1
+		}
+	}
+	for i, v := range w.offsetFreq[:offsetCodeCount] {
+		if v == 0 {
+			w.offsetFreq[i] = 1
+		}
+	}
+}
+
+// indexTokens indexes a slice of tokens and updates
+// literalFreq and offsetFreq.
+// The number of literal and offset tokens is returned.
+func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) {
+	//copy(w.literalFreq[:], t.litHist[:])
+	*(*[256]uint16)(w.literalFreq[:]) = t.litHist
+	//copy(w.literalFreq[256:], t.extraHist[:])
+	*(*[32]uint16)(w.literalFreq[256:]) = t.extraHist
+	w.offsetFreq = t.offHist
+
+	if t.n == 0 {
+		return
+	}
+	if filled {
+		return maxNumLit, maxNumDist
+	}
+	// get the number of literals
+	numLiterals = len(w.literalFreq)
+	for w.literalFreq[numLiterals-1] == 0 {
+		numLiterals--
+	}
+	// get the number of offsets
+	numOffsets = len(w.offsetFreq)
+	for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 {
+		numOffsets--
+	}
+	if numOffsets == 0 {
+		// We haven't found a single match. If we want to go with the dynamic encoding,
+		// we should count at least one offset to be sure that the offset huffman tree could be encoded.
+		w.offsetFreq[0] = 1
+		numOffsets = 1
+	}
+	return
+}
+
+func (w *huffmanBitWriter) generate() {
+	w.literalEncoding.generate(w.literalFreq[:literalCount], 15)
+	w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15)
+}
+
+// writeTokens writes a slice of tokens to the output.
+// codes for literal and offset encoding must be supplied.
+func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) {
+	if w.err != nil {
+		return
+	}
+	if len(tokens) == 0 {
+		return
+	}
+
+	// Only last token should be endBlockMarker.
+	var deferEOB bool
+	if tokens[len(tokens)-1] == endBlockMarker {
+		tokens = tokens[:len(tokens)-1]
+		deferEOB = true
+	}
+
+	// Create slices up to the next power of two to avoid bounds checks.
+	lits := leCodes[:256]
+	offs := oeCodes[:32]
+	lengths := leCodes[lengthCodesStart:]
+	lengths = lengths[:32]
+
+	// Go 1.16 LOVES having these on stack.
+	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
+
+	for _, t := range tokens {
+		if t < 256 {
+			//w.writeCode(lits[t.literal()])
+			c := lits[t]
+			bits |= c.code64() << (nbits & 63)
+			nbits += c.len()
+			if nbits >= 48 {
+				le.Store64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
+			}
+			continue
+		}
+
+		// Write the length
+		length := t.length()
+		lengthCode := lengthCode(length) & 31
+		if false {
+			w.writeCode(lengths[lengthCode])
+		} else {
+			// inlined
+			c := lengths[lengthCode]
+			bits |= c.code64() << (nbits & 63)
+			nbits += c.len()
+			if nbits >= 48 {
+				le.Store64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
+			}
+		}
+
+		if lengthCode >= lengthExtraBitsMinCode {
+			extraLengthBits := lengthExtraBits[lengthCode]
+			//w.writeBits(extraLength, extraLengthBits)
+			extraLength := int32(length - lengthBase[lengthCode])
+			bits |= uint64(extraLength) << (nbits & 63)
+			nbits += extraLengthBits
+			if nbits >= 48 {
+				le.Store64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
+			}
+		}
+		// Write the offset
+		offset := t.offset()
+		offsetCode := (offset >> 16) & 31
+		if false {
+			w.writeCode(offs[offsetCode])
+		} else {
+			// inlined
+			c := offs[offsetCode]
+			bits |= c.code64() << (nbits & 63)
+			nbits += c.len()
+			if nbits >= 48 {
+				le.Store64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
+			}
+		}
+
+		if offsetCode >= offsetExtraBitsMinCode {
+			offsetComb := offsetCombined[offsetCode]
+			//w.writeBits(extraOffset, extraOffsetBits)
+			bits |= uint64((offset-(offsetComb>>8))&matchOffsetOnlyMask) << (nbits & 63)
+			nbits += uint8(offsetComb)
+			if nbits >= 48 {
+				le.Store64(w.bytes[nbytes:], bits)
+				//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+				bits >>= 48
+				nbits -= 48
+				nbytes += 6
+				if nbytes >= bufferFlushSize {
+					if w.err != nil {
+						nbytes = 0
+						return
+					}
+					_, w.err = w.writer.Write(w.bytes[:nbytes])
+					nbytes = 0
+				}
+			}
+		}
+	}
+	// Restore...
+	w.bits, w.nbits, w.nbytes = bits, nbits, nbytes
+
+	if deferEOB {
+		w.writeCode(leCodes[endBlockMarker])
+	}
+}
+
+// huffOffset is a static offset encoder used for huffman only encoding.
+// It can be reused since we will not be encoding offset values.
+var huffOffset *huffmanEncoder
+
+func init() {
+	w := newHuffmanBitWriter(nil)
+	w.offsetFreq[0] = 1
+	huffOffset = newHuffmanEncoder(offsetCodeCount)
+	huffOffset.generate(w.offsetFreq[:offsetCodeCount], 15)
+}
+
+// writeBlockHuff encodes a block of bytes as either
+// Huffman encoded literals or uncompressed bytes if the
+// result gains only very little from compression.
+func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
+	if w.err != nil {
+		return
+	}
+
+	// Clear histogram
+	for i := range w.literalFreq[:] {
+		w.literalFreq[i] = 0
+	}
+	if !w.lastHuffMan {
+		for i := range w.offsetFreq[:] {
+			w.offsetFreq[i] = 0
+		}
+	}
+
+	const numLiterals = endBlockMarker + 1
+	const numOffsets = 1
+
+	// Add everything as literals
+	// We have to estimate the header size.
+	// Assume header is around 70 bytes:
+	// https://stackoverflow.com/a/25454430
+	const guessHeaderSizeBits = 70 * 8
+	histogram(input, w.literalFreq[:numLiterals])
+	ssize, storable := w.storedSize(input)
+	if storable && len(input) > 1024 {
+		// Quick check for incompressible content.
+		abs := float64(0)
+		avg := float64(len(input)) / 256
+		max := float64(len(input) * 2)
+		for _, v := range w.literalFreq[:256] {
+			diff := float64(v) - avg
+			abs += diff * diff
+			if abs > max {
+				break
+			}
+		}
+		if abs < max {
+			if debugDeflate {
+				fmt.Println("stored", abs, "<", max)
+			}
+			// No chance we can compress this...
+			w.writeStoredHeader(len(input), eof)
+			w.writeBytes(input)
+			return
+		}
+	}
+	w.literalFreq[endBlockMarker] = 1
+	w.tmpLitEncoding.generate(w.literalFreq[:numLiterals], 15)
+	estBits := w.tmpLitEncoding.canReuseBits(w.literalFreq[:numLiterals])
+	if estBits < math.MaxInt32 {
+		estBits += w.lastHeader
+		if w.lastHeader == 0 {
+			estBits += guessHeaderSizeBits
+		}
+		estBits += estBits >> w.logNewTablePenalty
+	}
+
+	// Store bytes, if we don't get a reasonable improvement.
+	if storable && ssize <= estBits {
+		if debugDeflate {
+			fmt.Println("stored,", ssize, "<=", estBits)
+		}
+		w.writeStoredHeader(len(input), eof)
+		w.writeBytes(input)
+		return
+	}
+
+	if w.lastHeader > 0 {
+		reuseSize := w.literalEncoding.canReuseBits(w.literalFreq[:256])
+
+		if estBits < reuseSize {
+			if debugDeflate {
+				fmt.Println("NOT reusing, reuse:", reuseSize/8, "> new:", estBits/8, "header est:", w.lastHeader/8, "bytes")
+			}
+			// We owe an EOB
+			w.writeCode(w.literalEncoding.codes[endBlockMarker])
+			w.lastHeader = 0
+		} else if debugDeflate {
+			fmt.Println("reusing, reuse:", reuseSize/8, "> new:", estBits/8, "- header est:", w.lastHeader/8)
+		}
+	}
+
+	count := 0
+	if w.lastHeader == 0 {
+		// Use the temp encoding, so swap.
+		w.literalEncoding, w.tmpLitEncoding = w.tmpLitEncoding, w.literalEncoding
+		// Generate codegen and codegenFrequencies, which indicates how to encode
+		// the literalEncoding and the offsetEncoding.
+		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
+		w.codegenEncoding.generate(w.codegenFreq[:], 7)
+		numCodegens := w.codegens()
+
+		// Huffman.
+		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+		w.lastHuffMan = true
+		w.lastHeader, _ = w.headerSize()
+		if debugDeflate {
+			count += w.lastHeader
+			fmt.Println("header:", count/8)
+		}
+	}
+
+	encoding := w.literalEncoding.codes[:256]
+	// Go 1.16 LOVES having these on stack. At least 1.5x the speed.
+	bits, nbits, nbytes := w.bits, w.nbits, w.nbytes
+
+	if debugDeflate {
+		count -= int(nbytes)*8 + int(nbits)
+	}
+	// Unroll, write 3 codes/loop.
+	// Fastest number of unrolls.
+	for len(input) > 3 {
+		// We must have at least 48 bits free.
+		if nbits >= 8 {
+			n := nbits >> 3
+			le.Store64(w.bytes[nbytes:], bits)
+			bits >>= (n * 8) & 63
+			nbits -= n * 8
+			nbytes += n
+		}
+		if nbytes >= bufferFlushSize {
+			if w.err != nil {
+				nbytes = 0
+				return
+			}
+			if debugDeflate {
+				count += int(nbytes) * 8
+			}
+			_, w.err = w.writer.Write(w.bytes[:nbytes])
+			nbytes = 0
+		}
+		a, b := encoding[input[0]], encoding[input[1]]
+		bits |= a.code64() << (nbits & 63)
+		bits |= b.code64() << ((nbits + a.len()) & 63)
+		c := encoding[input[2]]
+		nbits += b.len() + a.len()
+		bits |= c.code64() << (nbits & 63)
+		nbits += c.len()
+		input = input[3:]
+	}
+
+	// Remaining...
+	for _, t := range input {
+		if nbits >= 48 {
+			le.Store64(w.bytes[nbytes:], bits)
+			//*(*uint64)(unsafe.Pointer(&w.bytes[nbytes])) = bits
+			bits >>= 48
+			nbits -= 48
+			nbytes += 6
+			if nbytes >= bufferFlushSize {
+				if w.err != nil {
+					nbytes = 0
+					return
+				}
+				if debugDeflate {
+					count += int(nbytes) * 8
+				}
+				_, w.err = w.writer.Write(w.bytes[:nbytes])
+				nbytes = 0
+			}
+		}
+		// Bitwriting inlined, ~30% speedup
+		c := encoding[t]
+		bits |= c.code64() << (nbits & 63)
+
+		nbits += c.len()
+		if debugDeflate {
+			count += int(c.len())
+		}
+	}
+	// Restore...
+	w.bits, w.nbits, w.nbytes = bits, nbits, nbytes
+
+	if debugDeflate {
+		nb := count + int(nbytes)*8 + int(nbits)
+		fmt.Println("wrote", nb, "bits,", nb/8, "bytes.")
+	}
+	// Flush if needed to have space.
+	if w.nbits >= 48 {
+		w.writeOutBits()
+	}
+
+	if eof || sync {
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+		w.lastHuffMan = false
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go
new file mode 100644
index 0000000..be7b58b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@@ -0,0 +1,417 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"math"
+	"math/bits"
+)
+
+const (
+	maxBitsLimit = 16
+	// number of valid literals
+	literalCount = 286
+)
+
+// hcode is a huffman code with a bit code and bit length.
+type hcode uint32
+
+func (h hcode) len() uint8 {
+	return uint8(h)
+}
+
+func (h hcode) code64() uint64 {
+	return uint64(h >> 8)
+}
+
+func (h hcode) zero() bool {
+	return h == 0
+}
+
+type huffmanEncoder struct {
+	codes    []hcode
+	bitCount [17]int32
+
+	// Allocate a reusable buffer with the longest possible frequency table.
+	// Possible lengths are codegenCodeCount, offsetCodeCount and literalCount.
+	// The largest of these is literalCount, so we allocate for that case.
+	freqcache [literalCount + 1]literalNode
+}
+
+type literalNode struct {
+	literal uint16
+	freq    uint16
+}
+
+// A levelInfo describes the state of the constructed tree for a given depth.
+type levelInfo struct {
+	// Our level, kept for better printing.
+	level int32
+
+	// The frequency of the last node at this level
+	lastFreq int32
+
+	// The frequency of the next character to add to this level
+	nextCharFreq int32
+
+	// The frequency of the next pair (from level below) to add to this level.
+	// Only valid if the "needed" value of the next lower level is 0.
+	nextPairFreq int32
+
+	// The number of chains remaining to generate for this level before moving
+	// up to the next level
+	needed int32
+}
+
+// set sets the code and length of an hcode.
+func (h *hcode) set(code uint16, length uint8) {
+	*h = hcode(length) | (hcode(code) << 8)
+}
+
+func newhcode(code uint16, length uint8) hcode {
+	return hcode(length) | (hcode(code) << 8)
+}
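+
+// For example, newhcode(0b10110, 5) keeps the length in the low byte and the
+// code above it, so len() returns 5 and code64() returns 0b10110.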
+
+func reverseBits(number uint16, bitLength byte) uint16 {
+	return bits.Reverse16(number << ((16 - bitLength) & 15))
+}
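+
+// DEFLATE emits codes LSB-first, so e.g. reverseBits(0b110, 3) == 0b011.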
+
+func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} }
+
+func newHuffmanEncoder(size int) *huffmanEncoder {
+	// Make capacity to next power of two.
+	c := uint(bits.Len32(uint32(size - 1)))
+	return &huffmanEncoder{codes: make([]hcode, size, 1<<c)}
+}
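+
+// For example, newHuffmanEncoder(literalCount) (size 286) rounds the
+// capacity up to 512, the next power of two.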
+
+// generateFixedLiteralEncoding generates the encoding corresponding to the fixed literal table.
+func generateFixedLiteralEncoding() *huffmanEncoder {
+	h := newHuffmanEncoder(literalCount)
+	codes := h.codes
+	var ch uint16
+	for ch = 0; ch < literalCount; ch++ {
+		var bits uint16
+		var size uint8
+		switch {
+		case ch < 144:
+			// size 8, 00110000 .. 10111111
+			bits = ch + 48
+			size = 8
+		case ch < 256:
+			// size 9, 110010000 .. 111111111
+			bits = ch + 400 - 144
+			size = 9
+		case ch < 280:
+			// size 7, 0000000 .. 0010111
+			bits = ch - 256
+			size = 7
+		default:
+			// size 8, 11000000 .. 11000111
+			bits = ch + 192 - 280
+			size = 8
+		}
+		codes[ch] = newhcode(reverseBits(bits, size), size)
+	}
+	return h
+}
+
+func generateFixedOffsetEncoding() *huffmanEncoder {
+	h := newHuffmanEncoder(30)
+	codes := h.codes
+	for ch := range codes {
+		codes[ch] = newhcode(reverseBits(uint16(ch), 5), 5)
+	}
+	return h
+}
+
+var fixedLiteralEncoding = generateFixedLiteralEncoding()
+var fixedOffsetEncoding = generateFixedOffsetEncoding()
+
+func (h *huffmanEncoder) bitLength(freq []uint16) int {
+	var total int
+	for i, f := range freq {
+		if f != 0 {
+			total += int(f) * int(h.codes[i].len())
+		}
+	}
+	return total
+}
+
+func (h *huffmanEncoder) bitLengthRaw(b []byte) int {
+	var total int
+	for _, f := range b {
+		total += int(h.codes[f].len())
+	}
+	return total
+}
+
+// canReuseBits returns the number of bits or math.MaxInt32 if the encoder cannot be reused.
+func (h *huffmanEncoder) canReuseBits(freq []uint16) int {
+	var total int
+	for i, f := range freq {
+		if f != 0 {
+			code := h.codes[i]
+			if code.zero() {
+				return math.MaxInt32
+			}
+			total += int(f) * int(code.len())
+		}
+	}
+	return total
+}
+
+// bitCounts returns the number of literals assigned to each bit size in the
+// Huffman encoding.
+//
+// This method is only called when len(list) >= 3.
+// The cases of 0, 1, and 2 literals are handled by special case code.
+//
+// list is an array of the literals with non-zero frequencies and their
+// associated frequencies, in order of increasing frequency, with a special
+// element of frequency MaxInt32 as its last entry.
+//
+// maxBits is the maximum number of bits that should be used to encode any
+// literal; it must be less than 16.
+//
+// The returned slice counts, in entry i, the number of literals that should
+// be encoded in i bits.
+func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
+	if maxBits >= maxBitsLimit {
+		panic("flate: maxBits too large")
+	}
+	n := int32(len(list))
+	list = list[0 : n+1]
+	list[n] = maxNode()
+
+	// The tree can't have greater depth than n - 1, no matter what. This
+	// saves a little bit of work in some small cases
+	if maxBits > n-1 {
+		maxBits = n - 1
+	}
+
+	// Create information about each of the levels.
+	// A bogus "Level 0" whose sole purpose is so that
+	// level1.prev.needed==0.  This makes level1.nextPairFreq
+	// be a legitimate value that never gets chosen.
+	var levels [maxBitsLimit]levelInfo
+	// leafCounts[i] counts the number of literals at the left
+	// of ancestors of the rightmost node at level i.
+	// leafCounts[i][j] is the number of literals at the left
+	// of the level j ancestor.
+	var leafCounts [maxBitsLimit][maxBitsLimit]int32
+
+	// Descending to only have 1 bounds check.
+	l2f := int32(list[2].freq)
+	l1f := int32(list[1].freq)
+	l0f := int32(list[0].freq) + int32(list[1].freq)
+
+	for level := int32(1); level <= maxBits; level++ {
+		// For every level, the first two items are the first two characters.
+		// We initialize the levels as if we had already figured this out.
+		levels[level] = levelInfo{
+			level:        level,
+			lastFreq:     l1f,
+			nextCharFreq: l2f,
+			nextPairFreq: l0f,
+		}
+		leafCounts[level][level] = 2
+		if level == 1 {
+			levels[level].nextPairFreq = math.MaxInt32
+		}
+	}
+
+	// We need a total of 2*n - 2 items at top level and have already generated 2.
+	levels[maxBits].needed = 2*n - 4
+
+	level := uint32(maxBits)
+	for level < 16 {
+		l := &levels[level]
+		if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 {
+			// We've run out of both leafs and pairs.
+			// End all calculations for this level.
+			// To make sure we never come back to this level or any lower level,
+			// set nextPairFreq impossibly large.
+			l.needed = 0
+			levels[level+1].nextPairFreq = math.MaxInt32
+			level++
+			continue
+		}
+
+		prevFreq := l.lastFreq
+		if l.nextCharFreq < l.nextPairFreq {
+			// The next item on this row is a leaf node.
+			n := leafCounts[level][level] + 1
+			l.lastFreq = l.nextCharFreq
+			// Lower leafCounts are the same as for the previous node.
+			leafCounts[level][level] = n
+			e := list[n]
+			if e.literal < math.MaxUint16 {
+				l.nextCharFreq = int32(e.freq)
+			} else {
+				l.nextCharFreq = math.MaxInt32
+			}
+		} else {
+			// The next item on this row is a pair from the previous row.
+			// nextPairFreq isn't valid until we generate two
+			// more values in the level below
+			l.lastFreq = l.nextPairFreq
+			// Take leaf counts from the lower level, except counts[level] remains the same.
+			save := leafCounts[level][level]
+			leafCounts[level] = leafCounts[level-1]
+			leafCounts[level][level] = save
+			// Alternative: copy(leafCounts[level][:level], leafCounts[level-1][:level])
+			levels[l.level-1].needed = 2
+		}
+
+		if l.needed--; l.needed == 0 {
+			// We've done everything we need to do for this level.
+			// Continue calculating one level up. Fill in nextPairFreq
+			// of that level with the sum of the two nodes we've just calculated on
+			// this level.
+			if l.level == maxBits {
+				// All done!
+				break
+			}
+			levels[l.level+1].nextPairFreq = prevFreq + l.lastFreq
+			level++
+		} else {
+			// If we stole from below, move down temporarily to replenish it.
+			for levels[level-1].needed > 0 {
+				level--
+			}
+		}
+	}
+
+	// Something is wrong if, at the end, the top level is null or hasn't used
+	// all of the leaves.
+	if leafCounts[maxBits][maxBits] != n {
+		panic("leafCounts[maxBits][maxBits] != n")
+	}
+
+	bitCount := h.bitCount[:maxBits+1]
+	bits := 1
+	counts := &leafCounts[maxBits]
+	for level := maxBits; level > 0; level-- {
+		// chain.leafCount gives the number of literals requiring at least "bits"
+		// bits to encode.
+		bitCount[bits] = counts[level] - counts[level-1]
+		bits++
+	}
+	return bitCount
+}
+
+// Look at the leaves and assign them a bit count and an encoding as specified
+// in RFC 1951 3.2.2
+func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalNode) {
+	code := uint16(0)
+	for n, bits := range bitCount {
+		code <<= 1
+		if n == 0 || bits == 0 {
+			continue
+		}
+		// The literals list[len(list)-bits] .. list[len(list)-1]
+		// are encoded using "bits" bits, and get the values
+		// code, code + 1, ....  The code values are
+		// assigned in literal order (not frequency order).
+		chunk := list[len(list)-int(bits):]
+
+		sortByLiteral(chunk)
+		for _, node := range chunk {
+			h.codes[node.literal] = newhcode(reverseBits(code, uint8(n)), uint8(n))
+			code++
+		}
+		list = list[0 : len(list)-int(bits)]
+	}
+}
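+
+// Worked example: with bitCount[1] = 1 and bitCount[2] = 2, the three symbols
+// receive the canonical codes 0, 10 and 11 (bit-reversed before storage,
+// since DEFLATE emits codes LSB-first).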
+
+// generate updates this huffmanEncoder to hold the minimum code for the
+// specified frequency count.
+//
+// freq is an array of frequencies, in which freq[i] gives the frequency of
+// literal i. maxBits is the maximum number of bits to use for any literal.
+func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
+	list := h.freqcache[:len(freq)+1]
+	codes := h.codes[:len(freq)]
+	// Number of non-zero literals
+	count := 0
+	// Set list to be the set of all non-zero literals and their frequencies
+	for i, f := range freq {
+		if f != 0 {
+			list[count] = literalNode{uint16(i), f}
+			count++
+		} else {
+			codes[i] = 0
+		}
+	}
+	list[count] = literalNode{}
+
+	list = list[:count]
+	if count <= 2 {
+		// Handle the small cases here, because they are awkward for the general case code. With
+		// two or fewer literals, everything has bit length 1.
+		for i, node := range list {
+			// "list" is in order of increasing literal value.
+			h.codes[node.literal].set(uint16(i), 1)
+		}
+		return
+	}
+	sortByFreq(list)
+
+	// Get the number of literals for each bit count
+	bitCount := h.bitCounts(list, maxBits)
+	// And do the assignment
+	h.assignEncodingAndSize(bitCount, list)
+}
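+
+// A minimal usage sketch (assuming freq is a []uint16 of per-literal counts
+// with length literalCount):
+//
+//	enc := newHuffmanEncoder(literalCount)
+//	enc.generate(freq[:literalCount], 15)
+//	// enc.codes[i] is now the canonical, bit-reversed code for literal i.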
+
+// atLeastOne clamps the result between 1 and 15.
+func atLeastOne(v float32) float32 {
+	if v < 1 {
+		return 1
+	}
+	if v > 15 {
+		return 15
+	}
+	return v
+}
+
+func histogram(b []byte, h []uint16) {
+	if len(b) >= 8<<10 {
+		// Split for bigger inputs
+		histogramSplit(b, h)
+	} else {
+		h = h[:256]
+		for _, t := range b {
+			h[t]++
+		}
+	}
+}
+
+func histogramSplit(b []byte, h []uint16) {
+	// Tested, and slightly faster than 2-way.
+	// Writing to separate arrays and combining is also slightly slower.
+	h = h[:256]
+	for len(b)&3 != 0 {
+		h[b[0]]++
+		b = b[1:]
+	}
+	n := len(b) / 4
+	x, y, z, w := b[:n], b[n:], b[n+n:], b[n+n+n:]
+	y, z, w = y[:len(x)], z[:len(x)], w[:len(x)]
+	for i, t := range x {
+		v0 := &h[t]
+		v1 := &h[y[i]]
+		v3 := &h[w[i]]
+		v2 := &h[z[i]]
+		*v0++
+		*v1++
+		*v2++
+		*v3++
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go b/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go
new file mode 100644
index 0000000..6c05ba8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go
@@ -0,0 +1,159 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// sortByFreq sorts data in increasing order of frequency, breaking ties by
+// literal value, in O(n*log(n)) time. The sort is not guaranteed to be stable.
+func sortByFreq(data []literalNode) {
+	n := len(data)
+	quickSortByFreq(data, 0, n, maxDepth(n))
+}
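+
+// The ordering key is (freq, literal), so e.g. {literal: 9, freq: 2} sorts
+// before {literal: 1, freq: 3}.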
+
+func quickSortByFreq(data []literalNode, a, b, maxDepth int) {
+	for b-a > 12 { // Use ShellSort for slices <= 12 elements
+		if maxDepth == 0 {
+			heapSort(data, a, b)
+			return
+		}
+		maxDepth--
+		mlo, mhi := doPivotByFreq(data, a, b)
+		// Avoiding recursion on the larger subproblem guarantees
+		// a stack depth of at most lg(b-a).
+		if mlo-a < b-mhi {
+			quickSortByFreq(data, a, mlo, maxDepth)
+			a = mhi // i.e., quickSortByFreq(data, mhi, b)
+		} else {
+			quickSortByFreq(data, mhi, b, maxDepth)
+			b = mlo // i.e., quickSortByFreq(data, a, mlo)
+		}
+	}
+	if b-a > 1 {
+		// Do ShellSort pass with gap 6
+		// It can be written in this simplified form because b-a <= 12.
+		for i := a + 6; i < b; i++ {
+			if data[i].freq == data[i-6].freq && data[i].literal < data[i-6].literal || data[i].freq < data[i-6].freq {
+				data[i], data[i-6] = data[i-6], data[i]
+			}
+		}
+		insertionSortByFreq(data, a, b)
+	}
+}
+
+func doPivotByFreq(data []literalNode, lo, hi int) (midlo, midhi int) {
+	m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow.
+	if hi-lo > 40 {
+		// Tukey's ``Ninther,'' median of three medians of three.
+		s := (hi - lo) / 8
+		medianOfThreeSortByFreq(data, lo, lo+s, lo+2*s)
+		medianOfThreeSortByFreq(data, m, m-s, m+s)
+		medianOfThreeSortByFreq(data, hi-1, hi-1-s, hi-1-2*s)
+	}
+	medianOfThreeSortByFreq(data, lo, m, hi-1)
+
+	// Invariants are:
+	//	data[lo] = pivot (set up by ChoosePivot)
+	//	data[lo < i < a] < pivot
+	//	data[a <= i < b] <= pivot
+	//	data[b <= i < c] unexamined
+	//	data[c <= i < hi-1] > pivot
+	//	data[hi-1] >= pivot
+	pivot := lo
+	a, c := lo+1, hi-1
+
+	for ; a < c && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ {
+	}
+	b := a
+	for {
+		for ; b < c && (data[pivot].freq == data[b].freq && data[pivot].literal > data[b].literal || data[pivot].freq > data[b].freq); b++ { // data[b] <= pivot
+		}
+		for ; b < c && (data[pivot].freq == data[c-1].freq && data[pivot].literal < data[c-1].literal || data[pivot].freq < data[c-1].freq); c-- { // data[c-1] > pivot
+		}
+		if b >= c {
+			break
+		}
+		// data[b] > pivot; data[c-1] <= pivot
+		data[b], data[c-1] = data[c-1], data[b]
+		b++
+		c--
+	}
+	// If hi-c<3 then there are duplicates (by property of median of nine).
+	// Let's be a bit more conservative, and set border to 5.
+	protect := hi-c < 5
+	if !protect && hi-c < (hi-lo)/4 {
+		// Let's test some points for equality to pivot.
+		dups := 0
+		if data[pivot].freq == data[hi-1].freq && data[pivot].literal > data[hi-1].literal || data[pivot].freq > data[hi-1].freq { // data[hi-1] = pivot
+			data[c], data[hi-1] = data[hi-1], data[c]
+			c++
+			dups++
+		}
+		if data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq { // data[b-1] = pivot
+			b--
+			dups++
+		}
+		// m-lo = (hi-lo)/2 > 6
+		// b-lo > (hi-lo)*3/4-1 > 8
+		// ==> m < b ==> data[m] <= pivot
+		if data[m].freq == data[pivot].freq && data[m].literal > data[pivot].literal || data[m].freq > data[pivot].freq { // data[m] = pivot
+			data[m], data[b-1] = data[b-1], data[m]
+			b--
+			dups++
+		}
+		// if at least 2 points are equal to pivot, assume skewed distribution
+		protect = dups > 1
+	}
+	if protect {
+		// Protect against a lot of duplicates
+		// Add invariant:
+		//	data[a <= i < b] unexamined
+		//	data[b <= i < c] = pivot
+		for {
+			for ; a < b && (data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq); b-- { // data[b] == pivot
+			}
+			for ; a < b && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { // data[a] < pivot
+			}
+			if a >= b {
+				break
+			}
+			// data[a] == pivot; data[b-1] < pivot
+			data[a], data[b-1] = data[b-1], data[a]
+			a++
+			b--
+		}
+	}
+	// Swap pivot into middle
+	data[pivot], data[b-1] = data[b-1], data[pivot]
+	return b - 1, c
+}
+
+// Insertion sort
+func insertionSortByFreq(data []literalNode, a, b int) {
+	for i := a + 1; i < b; i++ {
+		for j := i; j > a && (data[j].freq == data[j-1].freq && data[j].literal < data[j-1].literal || data[j].freq < data[j-1].freq); j-- {
+			data[j], data[j-1] = data[j-1], data[j]
+		}
+	}
+}
+
+// quickSortByFreq, loosely following Bentley and McIlroy,
+// ``Engineering a Sort Function,'' SP&E November 1993.
+
+// medianOfThreeSortByFreq moves the median of the three values data[m0], data[m1], data[m2] into data[m1].
+func medianOfThreeSortByFreq(data []literalNode, m1, m0, m2 int) {
+	// sort 3 elements
+	if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq {
+		data[m1], data[m0] = data[m0], data[m1]
+	}
+	// data[m0] <= data[m1]
+	if data[m2].freq == data[m1].freq && data[m2].literal < data[m1].literal || data[m2].freq < data[m1].freq {
+		data[m2], data[m1] = data[m1], data[m2]
+		// data[m0] <= data[m2] && data[m1] < data[m2]
+		if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq {
+			data[m1], data[m0] = data[m0], data[m1]
+		}
+	}
+	// now data[m0] <= data[m1] <= data[m2]
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_sortByLiteral.go b/vendor/github.com/klauspost/compress/flate/huffman_sortByLiteral.go
new file mode 100644
index 0000000..93f1aea
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_sortByLiteral.go
@@ -0,0 +1,201 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// sortByLiteral sorts data in increasing order of literal value in
+// O(n*log(n)) time. The sort is not guaranteed to be stable.
+func sortByLiteral(data []literalNode) {
+	n := len(data)
+	quickSort(data, 0, n, maxDepth(n))
+}
+
+func quickSort(data []literalNode, a, b, maxDepth int) {
+	for b-a > 12 { // Use ShellSort for slices <= 12 elements
+		if maxDepth == 0 {
+			heapSort(data, a, b)
+			return
+		}
+		maxDepth--
+		mlo, mhi := doPivot(data, a, b)
+		// Avoiding recursion on the larger subproblem guarantees
+		// a stack depth of at most lg(b-a).
+		if mlo-a < b-mhi {
+			quickSort(data, a, mlo, maxDepth)
+			a = mhi // i.e., quickSort(data, mhi, b)
+		} else {
+			quickSort(data, mhi, b, maxDepth)
+			b = mlo // i.e., quickSort(data, a, mlo)
+		}
+	}
+	if b-a > 1 {
+		// Do ShellSort pass with gap 6
+		// It can be written in this simplified form because b-a <= 12.
+		for i := a + 6; i < b; i++ {
+			if data[i].literal < data[i-6].literal {
+				data[i], data[i-6] = data[i-6], data[i]
+			}
+		}
+		insertionSort(data, a, b)
+	}
+}
+func heapSort(data []literalNode, a, b int) {
+	first := a
+	lo := 0
+	hi := b - a
+
+	// Build heap with greatest element at top.
+	for i := (hi - 1) / 2; i >= 0; i-- {
+		siftDown(data, i, hi, first)
+	}
+
+	// Pop elements, largest first, into end of data.
+	for i := hi - 1; i >= 0; i-- {
+		data[first], data[first+i] = data[first+i], data[first]
+		siftDown(data, lo, i, first)
+	}
+}
+
+// siftDown implements the heap property on data[lo, hi).
+// first is an offset into the array where the root of the heap lies.
+func siftDown(data []literalNode, lo, hi, first int) {
+	root := lo
+	for {
+		child := 2*root + 1
+		if child >= hi {
+			break
+		}
+		if child+1 < hi && data[first+child].literal < data[first+child+1].literal {
+			child++
+		}
+		if data[first+root].literal > data[first+child].literal {
+			return
+		}
+		data[first+root], data[first+child] = data[first+child], data[first+root]
+		root = child
+	}
+}
+func doPivot(data []literalNode, lo, hi int) (midlo, midhi int) {
+	m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow.
+	if hi-lo > 40 {
+		// Tukey's ``Ninther,'' median of three medians of three.
+		s := (hi - lo) / 8
+		medianOfThree(data, lo, lo+s, lo+2*s)
+		medianOfThree(data, m, m-s, m+s)
+		medianOfThree(data, hi-1, hi-1-s, hi-1-2*s)
+	}
+	medianOfThree(data, lo, m, hi-1)
+
+	// Invariants are:
+	//	data[lo] = pivot (set up by ChoosePivot)
+	//	data[lo < i < a] < pivot
+	//	data[a <= i < b] <= pivot
+	//	data[b <= i < c] unexamined
+	//	data[c <= i < hi-1] > pivot
+	//	data[hi-1] >= pivot
+	pivot := lo
+	a, c := lo+1, hi-1
+
+	for ; a < c && data[a].literal < data[pivot].literal; a++ {
+	}
+	b := a
+	for {
+		for ; b < c && data[pivot].literal > data[b].literal; b++ { // data[b] <= pivot
+		}
+		for ; b < c && data[pivot].literal < data[c-1].literal; c-- { // data[c-1] > pivot
+		}
+		if b >= c {
+			break
+		}
+		// data[b] > pivot; data[c-1] <= pivot
+		data[b], data[c-1] = data[c-1], data[b]
+		b++
+		c--
+	}
+	// If hi-c<3 then there are duplicates (by property of median of nine).
+	// Let's be a bit more conservative, and set border to 5.
+	protect := hi-c < 5
+	if !protect && hi-c < (hi-lo)/4 {
+		// Let's test some points for equality to pivot.
+		dups := 0
+		if data[pivot].literal > data[hi-1].literal { // data[hi-1] = pivot
+			data[c], data[hi-1] = data[hi-1], data[c]
+			c++
+			dups++
+		}
+		if data[b-1].literal > data[pivot].literal { // data[b-1] = pivot
+			b--
+			dups++
+		}
+		// m-lo = (hi-lo)/2 > 6
+		// b-lo > (hi-lo)*3/4-1 > 8
+		// ==> m < b ==> data[m] <= pivot
+		if data[m].literal > data[pivot].literal { // data[m] = pivot
+			data[m], data[b-1] = data[b-1], data[m]
+			b--
+			dups++
+		}
+		// if at least 2 points are equal to pivot, assume skewed distribution
+		protect = dups > 1
+	}
+	if protect {
+		// Protect against a lot of duplicates
+		// Add invariant:
+		//	data[a <= i < b] unexamined
+		//	data[b <= i < c] = pivot
+		for {
+			for ; a < b && data[b-1].literal > data[pivot].literal; b-- { // data[b] == pivot
+			}
+			for ; a < b && data[a].literal < data[pivot].literal; a++ { // data[a] < pivot
+			}
+			if a >= b {
+				break
+			}
+			// data[a] == pivot; data[b-1] < pivot
+			data[a], data[b-1] = data[b-1], data[a]
+			a++
+			b--
+		}
+	}
+	// Swap pivot into middle
+	data[pivot], data[b-1] = data[b-1], data[pivot]
+	return b - 1, c
+}
+
+// Insertion sort
+func insertionSort(data []literalNode, a, b int) {
+	for i := a + 1; i < b; i++ {
+		for j := i; j > a && data[j].literal < data[j-1].literal; j-- {
+			data[j], data[j-1] = data[j-1], data[j]
+		}
+	}
+}
+
+// maxDepth returns a threshold at which quicksort should switch
+// to heapsort. It returns 2*ceil(lg(n+1)).
+func maxDepth(n int) int {
+	var depth int
+	for i := n; i > 0; i >>= 1 {
+		depth++
+	}
+	return depth * 2
+}
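+
+// For example, maxDepth(100) returns 14, since 100 needs 7 halvings to reach
+// 0 and 2*7 == 2*ceil(lg(101)) == 14.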
+
+// medianOfThree moves the median of the three values data[m0], data[m1], data[m2] into data[m1].
+func medianOfThree(data []literalNode, m1, m0, m2 int) {
+	// sort 3 elements
+	if data[m1].literal < data[m0].literal {
+		data[m1], data[m0] = data[m0], data[m1]
+	}
+	// data[m0] <= data[m1]
+	if data[m2].literal < data[m1].literal {
+		data[m2], data[m1] = data[m1], data[m2]
+		// data[m0] <= data[m2] && data[m1] < data[m2]
+		if data[m1].literal < data[m0].literal {
+			data[m1], data[m0] = data[m0], data[m1]
+		}
+	}
+	// now data[m0] <= data[m1] <= data[m2]
+}
diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go
new file mode 100644
index 0000000..0d7b437
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/inflate.go
@@ -0,0 +1,865 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package flate implements the DEFLATE compressed data format, described in
+// RFC 1951.  The gzip and zlib packages implement access to DEFLATE-based file
+// formats.
+package flate
+
+import (
+	"bufio"
+	"compress/flate"
+	"fmt"
+	"io"
+	"math/bits"
+	"sync"
+)
+
+const (
+	maxCodeLen     = 16 // max length of Huffman code
+	maxCodeLenMask = 15 // mask for max length of Huffman code
+	// The next three numbers come from the RFC section 3.2.7, with the
+	// additional proviso in section 3.2.5 which implies that distance codes
+	// 30 and 31 should never occur in compressed data.
+	maxNumLit  = 286
+	maxNumDist = 30
+	numCodes   = 19 // number of codes in Huffman meta-code
+
+	debugDecode = false
+)
+
+// Value of length - 3 and extra bits.
+type lengthExtra struct {
+	length, extra uint8
+}
+
+var decCodeToLen = [32]lengthExtra{{length: 0x0, extra: 0x0}, {length: 0x1, extra: 0x0}, {length: 0x2, extra: 0x0}, {length: 0x3, extra: 0x0}, {length: 0x4, extra: 0x0}, {length: 0x5, extra: 0x0}, {length: 0x6, extra: 0x0}, {length: 0x7, extra: 0x0}, {length: 0x8, extra: 0x1}, {length: 0xa, extra: 0x1}, {length: 0xc, extra: 0x1}, {length: 0xe, extra: 0x1}, {length: 0x10, extra: 0x2}, {length: 0x14, extra: 0x2}, {length: 0x18, extra: 0x2}, {length: 0x1c, extra: 0x2}, {length: 0x20, extra: 0x3}, {length: 0x28, extra: 0x3}, {length: 0x30, extra: 0x3}, {length: 0x38, extra: 0x3}, {length: 0x40, extra: 0x4}, {length: 0x50, extra: 0x4}, {length: 0x60, extra: 0x4}, {length: 0x70, extra: 0x4}, {length: 0x80, extra: 0x5}, {length: 0xa0, extra: 0x5}, {length: 0xc0, extra: 0x5}, {length: 0xe0, extra: 0x5}, {length: 0xff, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}}
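+
+// Worked example: length code 265 maps to decCodeToLen[265-257] =
+// {length: 0x8, extra: 0x1}, i.e. a base length of 8+3 = 11 plus one extra
+// bit, covering lengths 11-12 as in RFC 1951 section 3.2.5.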
+
+var bitMask32 = [32]uint32{
+	0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF,
+	0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF,
+	0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF,
+	0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF,
+} // up to 32 bits
+
+// Initialize the fixedHuffmanDecoder only once upon first use.
+var fixedOnce sync.Once
+var fixedHuffmanDecoder huffmanDecoder
+
+// A CorruptInputError reports the presence of corrupt input at a given offset.
+type CorruptInputError = flate.CorruptInputError
+
+// An InternalError reports an error in the flate code itself.
+type InternalError string
+
+func (e InternalError) Error() string { return "flate: internal error: " + string(e) }
+
+// A ReadError reports an error encountered while reading input.
+//
+// Deprecated: No longer returned.
+type ReadError = flate.ReadError
+
+// A WriteError reports an error encountered while writing output.
+//
+// Deprecated: No longer returned.
+type WriteError = flate.WriteError
+
+// Resetter resets a ReadCloser returned by NewReader or NewReaderDict to
+// to switch to a new underlying Reader. This permits reusing a ReadCloser
+// instead of allocating a new one.
+type Resetter interface {
+	// Reset discards any buffered data and resets the Resetter as if it was
+	// newly initialized with the given reader.
+	Reset(r io.Reader, dict []byte) error
+}
+
+// The data structure for decoding Huffman tables is based on that of
+// zlib. There is a lookup table of a fixed bit width (huffmanChunkBits).
+// For codes smaller than the table width, there are multiple entries
+// (each combination of trailing bits has the same value). For codes
+// larger than the table width, the table contains a link to an overflow
+// table. The width of each entry in the link table is the maximum code
+// size minus the chunk width.
+//
+// Note that you can do a lookup in the table even without all bits
+// filled. Since the extra bits are zero, and the DEFLATE Huffman codes
+// have the property that shorter codes come before longer ones, the
+// bit length estimate in the result is a lower bound on the actual
+// number of bits.
+//
+// See the following:
+//	http://www.gzip.org/algorithm.txt
+
+// chunk & 15 is number of bits
+// chunk >> 4 is value, including table link
+
+const (
+	huffmanChunkBits  = 9
+	huffmanNumChunks  = 1 << huffmanChunkBits
+	huffmanCountMask  = 15
+	huffmanValueShift = 4
+)
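+
+// Example: a symbol with value 7 and a 6-bit code is stored as the chunk
+// 7<<huffmanValueShift | 6; chunk&huffmanCountMask recovers the bit length
+// and chunk>>huffmanValueShift the symbol value.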
+
+type huffmanDecoder struct {
+	maxRead  int                       // the maximum number of bits we can read and not overread
+	chunks   *[huffmanNumChunks]uint16 // chunks as described above
+	links    [][]uint16                // overflow links
+	linkMask uint32                    // mask the width of the link table
+}
+
+// Initialize Huffman decoding tables from array of code lengths.
+// Following this function, h is guaranteed to be initialized into a complete
+// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
+// degenerate case where the tree has only a single symbol with length 1. Empty
+// trees are permitted.
+func (h *huffmanDecoder) init(lengths []int) bool {
+	// Sanity enables additional runtime tests during Huffman
+	// table construction. It's intended to be used during
+	// development to supplement the currently ad-hoc unit tests.
+	const sanity = false
+
+	if h.chunks == nil {
+		h.chunks = new([huffmanNumChunks]uint16)
+	}
+
+	if h.maxRead != 0 {
+		*h = huffmanDecoder{chunks: h.chunks, links: h.links}
+	}
+
+	// Count number of codes of each length,
+	// compute maxRead and max length.
+	var count [maxCodeLen]int
+	var min, max int
+	for _, n := range lengths {
+		if n == 0 {
+			continue
+		}
+		if min == 0 || n < min {
+			min = n
+		}
+		if n > max {
+			max = n
+		}
+		count[n&maxCodeLenMask]++
+	}
+
+	// Empty tree. The decompressor.huffSym function will fail later if the tree
+	// is used. Technically, an empty tree is only valid for the HDIST tree and
+	// not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
+	// is guaranteed to fail since it will attempt to use the tree to decode the
+	// codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
+	// guaranteed to fail later since the compressed data section must be
+	// composed of at least one symbol (the end-of-block marker).
+	if max == 0 {
+		return true
+	}
+
+	code := 0
+	var nextcode [maxCodeLen]int
+	for i := min; i <= max; i++ {
+		code <<= 1
+		nextcode[i&maxCodeLenMask] = code
+		code += count[i&maxCodeLenMask]
+	}
+
+	// Check that the coding is complete (i.e., that we've
+	// assigned all 2-to-the-max possible bit sequences).
+	// Exception: To be compatible with zlib, we also need to
+	// accept degenerate single-code codings. See also
+	// TestDegenerateHuffmanCoding.
+	if code != 1<<uint(max) && !(code == 1 && max == 1) {
+		if debugDecode {
+			fmt.Println("coding failed, code, max:", code, max, code == 1<<uint(max), code == 1 && max == 1, "(one should be true)")
+		}
+		return false
+	}
+
+	h.maxRead = min
+
+	chunks := h.chunks[:]
+	for i := range chunks {
+		chunks[i] = 0
+	}
+
+	if max > huffmanChunkBits {
+		numLinks := 1 << (uint(max) - huffmanChunkBits)
+		h.linkMask = uint32(numLinks - 1)
+
+		// create link tables
+		link := nextcode[huffmanChunkBits+1] >> 1
+		if cap(h.links) < huffmanNumChunks-link {
+			h.links = make([][]uint16, huffmanNumChunks-link)
+		} else {
+			h.links = h.links[:huffmanNumChunks-link]
+		}
+		for j := uint(link); j < huffmanNumChunks; j++ {
+			reverse := int(bits.Reverse16(uint16(j)))
+			reverse >>= uint(16 - huffmanChunkBits)
+			off := j - uint(link)
+			if sanity && h.chunks[reverse] != 0 {
+				panic("impossible: overwriting existing chunk")
+			}
+			h.chunks[reverse] = uint16(off<<huffmanValueShift | (huffmanChunkBits + 1))
+			if cap(h.links[off]) < numLinks {
+				h.links[off] = make([]uint16, numLinks)
+			} else {
+				h.links[off] = h.links[off][:numLinks]
+			}
+		}
+	} else {
+		h.links = h.links[:0]
+	}
+
+	for i, n := range lengths {
+		if n == 0 {
+			continue
+		}
+		code := nextcode[n]
+		nextcode[n]++
+		chunk := uint16(i<<huffmanValueShift | n)
+		reverse := int(bits.Reverse16(uint16(code)))
+		reverse >>= uint(16 - n)
+		if n <= huffmanChunkBits {
+			for off := reverse; off < len(h.chunks); off += 1 << uint(n) {
+				// We should never need to overwrite
+				// an existing chunk. Also, 0 is
+				// never a valid chunk, because the
+				// lower 4 "count" bits should be
+				// between 1 and 15.
+				if sanity && h.chunks[off] != 0 {
+					panic("impossible: overwriting existing chunk")
+				}
+				h.chunks[off] = chunk
+			}
+		} else {
+			j := reverse & (huffmanNumChunks - 1)
+			if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 {
+				// Longer codes should have been
+				// associated with a link table above.
+				panic("impossible: not an indirect chunk")
+			}
+			value := h.chunks[j] >> huffmanValueShift
+			linktab := h.links[value]
+			reverse >>= huffmanChunkBits
+			for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) {
+				if sanity && linktab[off] != 0 {
+					panic("impossible: overwriting existing chunk")
+				}
+				linktab[off] = chunk
+			}
+		}
+	}
+
+	if sanity {
+		// Above we've sanity checked that we never overwrote
+		// an existing entry. Here we additionally check that
+		// we filled the tables completely.
+		for i, chunk := range h.chunks {
+			if chunk == 0 {
+				// As an exception, in the degenerate
+				// single-code case, we allow odd
+				// chunks to be missing.
+				if code == 1 && i%2 == 1 {
+					continue
+				}
+				panic("impossible: missing chunk")
+			}
+		}
+		for _, linktab := range h.links {
+			for _, chunk := range linktab {
+				if chunk == 0 {
+					panic("impossible: missing chunk")
+				}
+			}
+		}
+	}
+
+	return true
+}
+
+// Reader is the actual read interface needed by NewReader.
+// If the passed-in io.Reader does not also have ReadByte,
+// NewReader will introduce its own buffering.
+type Reader interface {
+	io.Reader
+	io.ByteReader
+}
+
+type step uint8
+
+const (
+	copyData step = iota + 1
+	nextBlock
+	huffmanBytesBuffer
+	huffmanBytesReader
+	huffmanBufioReader
+	huffmanStringsReader
+	huffmanGenericReader
+)
+
+// flushMode tells the decompressor when to return data.
+type flushMode uint8
+
+const (
+	syncFlush    flushMode = iota // return data after sync flush block
+	partialFlush                  // return data after each block
+)
+
+// Decompress state.
+type decompressor struct {
+	// Input source.
+	r       Reader
+	roffset int64
+
+	// Huffman decoders for literal/length, distance.
+	h1, h2 huffmanDecoder
+
+	// Length arrays used to define Huffman codes.
+	bits     *[maxNumLit + maxNumDist]int
+	codebits *[numCodes]int
+
+	// Output history, buffer.
+	dict dictDecoder
+
+	// Next step in the decompression,
+	// and decompression state.
+	step      step
+	stepState int
+	err       error
+	toRead    []byte
+	hl, hd    *huffmanDecoder
+	copyLen   int
+	copyDist  int
+
+	// Temporary buffer (avoids repeated allocation).
+	buf [4]byte
+
+	// Input bits, in top of b.
+	b uint32
+
+	nb    uint
+	final bool
+
+	flushMode flushMode
+}
+
+func (f *decompressor) nextBlock() {
+	for f.nb < 1+2 {
+		if f.err = f.moreBits(); f.err != nil {
+			return
+		}
+	}
+	f.final = f.b&1 == 1
+	f.b >>= 1
+	typ := f.b & 3
+	f.b >>= 2
+	f.nb -= 1 + 2
+	switch typ {
+	case 0:
+		f.dataBlock()
+		if debugDecode {
+			fmt.Println("stored block")
+		}
+	case 1:
+		// compressed, fixed Huffman tables
+		f.hl = &fixedHuffmanDecoder
+		f.hd = nil
+		f.huffmanBlockDecoder()
+		if debugDecode {
+			fmt.Println("predefinied huffman block")
+		}
+	case 2:
+		// compressed, dynamic Huffman tables
+		if f.err = f.readHuffman(); f.err != nil {
+			break
+		}
+		f.hl = &f.h1
+		f.hd = &f.h2
+		f.huffmanBlockDecoder()
+		if debugDecode {
+			fmt.Println("dynamic huffman block")
+		}
+	default:
+		// 3 is reserved.
+		if debugDecode {
+			fmt.Println("reserved data block encountered")
+		}
+		f.err = CorruptInputError(f.roffset)
+	}
+}
+
+func (f *decompressor) Read(b []byte) (int, error) {
+	for {
+		if len(f.toRead) > 0 {
+			n := copy(b, f.toRead)
+			f.toRead = f.toRead[n:]
+			if len(f.toRead) == 0 {
+				return n, f.err
+			}
+			return n, nil
+		}
+		if f.err != nil {
+			return 0, f.err
+		}
+
+		f.doStep()
+
+		if f.err != nil && len(f.toRead) == 0 {
+			f.toRead = f.dict.readFlush() // Flush what's left in case of error
+		}
+	}
+}
+
+// WriteTo implements the io.WriterTo interface for io.Copy and friends.
+func (f *decompressor) WriteTo(w io.Writer) (int64, error) {
+	total := int64(0)
+	flushed := false
+	for {
+		if len(f.toRead) > 0 {
+			n, err := w.Write(f.toRead)
+			total += int64(n)
+			if err != nil {
+				f.err = err
+				return total, err
+			}
+			if n != len(f.toRead) {
+				return total, io.ErrShortWrite
+			}
+			f.toRead = f.toRead[:0]
+		}
+		if f.err != nil && flushed {
+			if f.err == io.EOF {
+				return total, nil
+			}
+			return total, f.err
+		}
+		if f.err == nil {
+			f.doStep()
+		}
+		if len(f.toRead) == 0 && f.err != nil && !flushed {
+			f.toRead = f.dict.readFlush() // Flush what's left in case of error
+			flushed = true
+		}
+	}
+}
+
+func (f *decompressor) Close() error {
+	if f.err == io.EOF {
+		return nil
+	}
+	return f.err
+}
+
+// RFC 1951 section 3.2.7.
+// Compression with dynamic Huffman codes
+
+var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
+
+func (f *decompressor) readHuffman() error {
+	// HLIT[5], HDIST[5], HCLEN[4].
+	for f.nb < 5+5+4 {
+		if err := f.moreBits(); err != nil {
+			return err
+		}
+	}
+	nlit := int(f.b&0x1F) + 257
+	if nlit > maxNumLit {
+		if debugDecode {
+			fmt.Println("nlit > maxNumLit", nlit)
+		}
+		return CorruptInputError(f.roffset)
+	}
+	f.b >>= 5
+	ndist := int(f.b&0x1F) + 1
+	if ndist > maxNumDist {
+		if debugDecode {
+			fmt.Println("ndist > maxNumDist", ndist)
+		}
+		return CorruptInputError(f.roffset)
+	}
+	f.b >>= 5
+	nclen := int(f.b&0xF) + 4
+	// numCodes is 19, so nclen is always valid.
+	f.b >>= 4
+	f.nb -= 5 + 5 + 4
+
+	// (HCLEN+4)*3 bits: code lengths in the magic codeOrder order.
+	for i := 0; i < nclen; i++ {
+		for f.nb < 3 {
+			if err := f.moreBits(); err != nil {
+				return err
+			}
+		}
+		f.codebits[codeOrder[i]] = int(f.b & 0x7)
+		f.b >>= 3
+		f.nb -= 3
+	}
+	for i := nclen; i < len(codeOrder); i++ {
+		f.codebits[codeOrder[i]] = 0
+	}
+	if !f.h1.init(f.codebits[0:]) {
+		if debugDecode {
+			fmt.Println("init codebits failed")
+		}
+		return CorruptInputError(f.roffset)
+	}
+
+	// HLIT + 257 code lengths, HDIST + 1 code lengths,
+	// using the code length Huffman code.
+	for i, n := 0, nlit+ndist; i < n; {
+		x, err := f.huffSym(&f.h1)
+		if err != nil {
+			return err
+		}
+		if x < 16 {
+			// Actual length.
+			f.bits[i] = x
+			i++
+			continue
+		}
+		// Repeat previous length or zero.
+		var rep int
+		var nb uint
+		var b int
+		switch x {
+		default:
+			return InternalError("unexpected length code")
+		case 16:
+			rep = 3
+			nb = 2
+			if i == 0 {
+				if debugDecode {
+					fmt.Println("i==0")
+				}
+				return CorruptInputError(f.roffset)
+			}
+			b = f.bits[i-1]
+		case 17:
+			rep = 3
+			nb = 3
+			b = 0
+		case 18:
+			rep = 11
+			nb = 7
+			b = 0
+		}
+		for f.nb < nb {
+			if err := f.moreBits(); err != nil {
+				if debugDecode {
+					fmt.Println("morebits:", err)
+				}
+				return err
+			}
+		}
+		rep += int(f.b & uint32(1<<(nb&regSizeMaskUint32)-1))
+		f.b >>= nb & regSizeMaskUint32
+		f.nb -= nb
+		if i+rep > n {
+			if debugDecode {
+				fmt.Println("i+rep > n", i, rep, n)
+			}
+			return CorruptInputError(f.roffset)
+		}
+		for j := 0; j < rep; j++ {
+			f.bits[i] = b
+			i++
+		}
+	}
+
+	if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) {
+		if debugDecode {
+			fmt.Println("init2 failed")
+		}
+		return CorruptInputError(f.roffset)
+	}
+
+	// As an optimization, we can initialize maxRead for the HLIT tree to the
+	// bit length of the EOB marker, since we know that
+	// every block must terminate with one. This preserves the property that
+	// we never read any extra bytes after the end of the DEFLATE stream.
+	if f.h1.maxRead < f.bits[endBlockMarker] {
+		f.h1.maxRead = f.bits[endBlockMarker]
+	}
+	if !f.final {
+		// If not the final block, the smallest block possible is
+		// a predefined table, BTYPE=01, with a single EOB marker.
+		// This will take up 3 + 7 bits.
+		f.h1.maxRead += 10
+	}
+
+	return nil
+}
+
+// Copy a single uncompressed data block from input to output.
+func (f *decompressor) dataBlock() {
+	// Uncompressed.
+	// Discard current half-byte.
+	left := (f.nb) & 7
+	f.nb -= left
+	f.b >>= left
+
+	offBytes := f.nb >> 3
+	// Unfilled values will be overwritten.
+	f.buf[0] = uint8(f.b)
+	f.buf[1] = uint8(f.b >> 8)
+	f.buf[2] = uint8(f.b >> 16)
+	f.buf[3] = uint8(f.b >> 24)
+
+	f.roffset += int64(offBytes)
+	f.nb, f.b = 0, 0
+
+	// Length then ones-complement of length.
+	nr, err := io.ReadFull(f.r, f.buf[offBytes:4])
+	f.roffset += int64(nr)
+	if err != nil {
+		f.err = noEOF(err)
+		return
+	}
+	n := uint16(f.buf[0]) | uint16(f.buf[1])<<8
+	nn := uint16(f.buf[2]) | uint16(f.buf[3])<<8
+	if nn != ^n {
+		if debugDecode {
+			ncomp := ^n
+			fmt.Println("uint16(nn) != uint16(^n)", nn, ncomp)
+		}
+		f.err = CorruptInputError(f.roffset)
+		return
+	}
+
+	if n == 0 {
+		if f.flushMode == syncFlush {
+			f.toRead = f.dict.readFlush()
+		}
+
+		f.finishBlock()
+		return
+	}
+
+	f.copyLen = int(n)
+	f.copyData()
+}
+
+// copyData copies f.copyLen bytes from the underlying reader into f.dict.
+// It pauses for reads when f.dict is full.
+func (f *decompressor) copyData() {
+	buf := f.dict.writeSlice()
+	if len(buf) > f.copyLen {
+		buf = buf[:f.copyLen]
+	}
+
+	cnt, err := io.ReadFull(f.r, buf)
+	f.roffset += int64(cnt)
+	f.copyLen -= cnt
+	f.dict.writeMark(cnt)
+	if err != nil {
+		f.err = noEOF(err)
+		return
+	}
+
+	if f.dict.availWrite() == 0 || f.copyLen > 0 {
+		f.toRead = f.dict.readFlush()
+		f.step = copyData
+		return
+	}
+	f.finishBlock()
+}
+
+func (f *decompressor) finishBlock() {
+	if f.final {
+		if f.dict.availRead() > 0 {
+			f.toRead = f.dict.readFlush()
+		}
+
+		f.err = io.EOF
+	} else if f.flushMode == partialFlush && f.dict.availRead() > 0 {
+		f.toRead = f.dict.readFlush()
+	}
+
+	f.step = nextBlock
+}
+
+func (f *decompressor) doStep() {
+	switch f.step {
+	case copyData:
+		f.copyData()
+	case nextBlock:
+		f.nextBlock()
+	case huffmanBytesBuffer:
+		f.huffmanBytesBuffer()
+	case huffmanBytesReader:
+		f.huffmanBytesReader()
+	case huffmanBufioReader:
+		f.huffmanBufioReader()
+	case huffmanStringsReader:
+		f.huffmanStringsReader()
+	case huffmanGenericReader:
+		f.huffmanGenericReader()
+	default:
+		panic("BUG: unexpected step state")
+	}
+}
+
+// noEOF returns err, unless err == io.EOF, in which case it returns io.ErrUnexpectedEOF.
+func noEOF(e error) error {
+	if e == io.EOF {
+		return io.ErrUnexpectedEOF
+	}
+	return e
+}
+
+func (f *decompressor) moreBits() error {
+	c, err := f.r.ReadByte()
+	if err != nil {
+		return noEOF(err)
+	}
+	f.roffset++
+	f.b |= uint32(c) << (f.nb & regSizeMaskUint32)
+	f.nb += 8
+	return nil
+}
+
+// Read the next Huffman-encoded symbol from f according to h.
+func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
+	// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+	// with a single element, huffSym must error on these two edge cases. In
+	// both cases, the chunks slice will be 0 for the invalid sequence, leading
+	// it to satisfy the n == 0 check below.
+	n := uint(h.maxRead)
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	nb, b := f.nb, f.b
+	for {
+		for nb < n {
+			c, err := f.r.ReadByte()
+			if err != nil {
+				f.b = b
+				f.nb = nb
+				return 0, noEOF(err)
+			}
+			f.roffset++
+			b |= uint32(c) << (nb & regSizeMaskUint32)
+			nb += 8
+		}
+		chunk := h.chunks[b&(huffmanNumChunks-1)]
+		n = uint(chunk & huffmanCountMask)
+		if n > huffmanChunkBits {
+			chunk = h.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&h.linkMask]
+			n = uint(chunk & huffmanCountMask)
+		}
+		if n <= nb {
+			if n == 0 {
+				f.b = b
+				f.nb = nb
+				if debugDecode {
+					fmt.Println("huffsym: n==0")
+				}
+				f.err = CorruptInputError(f.roffset)
+				return 0, f.err
+			}
+			f.b = b >> (n & regSizeMaskUint32)
+			f.nb = nb - n
+			return int(chunk >> huffmanValueShift), nil
+		}
+	}
+}
+
+func makeReader(r io.Reader) Reader {
+	if rr, ok := r.(Reader); ok {
+		return rr
+	}
+	return bufio.NewReader(r)
+}
+
+func fixedHuffmanDecoderInit() {
+	fixedOnce.Do(func() {
+		// These come from the RFC section 3.2.6.
+		var bits [288]int
+		for i := 0; i < 144; i++ {
+			bits[i] = 8
+		}
+		for i := 144; i < 256; i++ {
+			bits[i] = 9
+		}
+		for i := 256; i < 280; i++ {
+			bits[i] = 7
+		}
+		for i := 280; i < 288; i++ {
+			bits[i] = 8
+		}
+		fixedHuffmanDecoder.init(bits[:])
+	})
+}
+
+func (f *decompressor) Reset(r io.Reader, dict []byte) error {
+	*f = decompressor{
+		r:        makeReader(r),
+		bits:     f.bits,
+		codebits: f.codebits,
+		h1:       f.h1,
+		h2:       f.h2,
+		dict:     f.dict,
+		step:     nextBlock,
+	}
+	f.dict.init(maxMatchOffset, dict)
+	return nil
+}
+
+type ReaderOpt func(*decompressor)
+
+// WithPartialBlock tells the decompressor to return after each block,
+// so it can read data written with partial flush.
+func WithPartialBlock() ReaderOpt {
+	return func(f *decompressor) {
+		f.flushMode = partialFlush
+	}
+}
+
+// WithDict initializes the reader with a preset dictionary.
+func WithDict(dict []byte) ReaderOpt {
+	return func(f *decompressor) {
+		f.dict.init(maxMatchOffset, dict)
+	}
+}
+
+// NewReaderOpts returns a new reader with the provided options.
+func NewReaderOpts(r io.Reader, opts ...ReaderOpt) io.ReadCloser {
+	fixedHuffmanDecoderInit()
+
+	var f decompressor
+	f.r = makeReader(r)
+	f.bits = new([maxNumLit + maxNumDist]int)
+	f.codebits = new([numCodes]int)
+	f.step = nextBlock
+	f.dict.init(maxMatchOffset, nil)
+
+	for _, opt := range opts {
+		opt(&f)
+	}
+
+	return &f
+}
+
+// NewReader returns a new ReadCloser that can be used
+// to read the uncompressed version of r.
+// If r does not also implement io.ByteReader,
+// the decompressor may read more data than necessary from r.
+// It is the caller's responsibility to call Close on the ReadCloser
+// when finished reading.
+//
+// The ReadCloser returned by NewReader also implements Resetter.
+func NewReader(r io.Reader) io.ReadCloser {
+	return NewReaderOpts(r)
+}
+
+// NewReaderDict is like NewReader but initializes the reader
+// with a preset dictionary. The returned Reader behaves as if
+// the uncompressed data stream started with the given dictionary,
+// which has already been read. NewReaderDict is typically used
+// to read data compressed by NewWriterDict.
+//
+// The ReadCloser returned by NewReaderDict also implements Resetter.
+func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser {
+	return NewReaderOpts(r, WithDict(dict))
+}
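+
+// A minimal usage sketch from another package (illustrative only, importing
+// github.com/klauspost/compress/flate):
+//
+//	fr := flate.NewReader(bytes.NewReader(compressed))
+//	defer fr.Close()
+//	out, err := io.ReadAll(fr)
+//
+// To receive data after every deflate block, e.g. for streams written with
+// partial flush, use NewReaderOpts(r, WithPartialBlock()) instead.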
diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
new file mode 100644
index 0000000..2b2f993
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
@@ -0,0 +1,1283 @@
+// Code generated by go generate gen_inflate.go. DO NOT EDIT.
+
+package flate
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"math/bits"
+	"strings"
+)
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, the fixed
+// distance encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanBytesBuffer() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bytes.Buffer)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with a single element, huffSym must error on these two edge cases.
+			// In both cases, the chunks slice will be 0 for the invalid sequence,
+			// leading it to satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var length int
+		switch {
+		case v < 256:
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
+				f.step = huffmanBytesBuffer
+				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.b, f.nb = fb, fnb
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+		case v < maxNumLit:
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
+		}
+
+		var dist uint32
+		if f.hd == nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
+		} else {
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with a single element, huffSym must error on these two edge cases.
+			// In both cases, the chunks slice will be 0 for the invalid sequence,
+			// leading it to satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = huffmanBytesBuffer // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, the fixed
+// distance encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanBytesReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bytes.Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with a single element, huffSym must error on these two edge cases.
+			// In both cases, the chunks slice will be 0 for the invalid sequence,
+			// leading it to satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var length int
+		switch {
+		case v < 256:
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
+				f.step = huffmanBytesReader
+				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.b, f.nb = fb, fnb
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+		case v < maxNumLit:
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
+		}
+
+		var dist uint32
+		if f.hd == nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
+		} else {
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with a single element, huffSym must error on these two edge cases.
+			// In both cases, the chunks slice will be 0 for the invalid sequence,
+			// leading it to satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
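+			// For example, distance code 6 gives nb = 2 and a base of 1<<3 + 1 = 9,
+			// so the two extra bits select a distance in 9..12.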
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = huffmanBytesReader // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, the fixed
+// distance encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanBufioReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bufio.Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it to
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var length int
+		switch {
+		case v < 256:
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
+				f.step = huffmanBufioReader
+				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.b, f.nb = fb, fnb
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+		case v < maxNumLit:
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
+		}
+
+		var dist uint32
+		if f.hd == nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
+		} else {
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it to
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = huffmanBufioReader // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, the fixed
+// distance encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanStringsReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*strings.Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it to
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var length int
+		switch {
+		case v < 256:
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
+				f.step = huffmanStringsReader
+				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.b, f.nb = fb, fnb
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+		case v < maxNumLit:
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
+		}
+
+		var dist uint32
+		if f.hd == nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
+		} else {
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it to
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = huffmanStringsReader // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, the fixed
+// distance encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanGenericReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it to
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var length int
+		switch {
+		case v < 256:
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
+				f.step = huffmanGenericReader
+				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.b, f.nb = fb, fnb
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+		case v < maxNumLit:
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & regSizeMaskUint32
+			fnb -= n
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
+		}
+
+		var dist uint32
+		if f.hd == nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
+		} else {
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it to
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & regSizeMaskUint32)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & regSizeMaskUint32)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << (nb & regSizeMaskUint32)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & regSizeMaskUint32)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & regSizeMaskUint32
+			fnb -= nb
+			dist = 1<<((nb+1)&regSizeMaskUint32) + 1 + extra
+			// slower: dist = bitMask32[nb+1] + 2 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			if debugDecode {
+				fmt.Println("dist > dict.histSize():", dist, dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = huffmanGenericReader // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
+
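+// huffmanBlockDecoder dispatches to the implementation specialized for the
+// concrete reader type, so the byte-reading hot loop can avoid an interface call.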
+func (f *decompressor) huffmanBlockDecoder() {
+	switch f.r.(type) {
+	case *bytes.Buffer:
+		f.huffmanBytesBuffer()
+	case *bytes.Reader:
+		f.huffmanBytesReader()
+	case *bufio.Reader:
+		f.huffmanBufioReader()
+	case *strings.Reader:
+		f.huffmanStringsReader()
+	case Reader:
+		f.huffmanGenericReader()
+	default:
+		f.huffmanGenericReader()
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go
new file mode 100644
index 0000000..c3581a3
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level1.go
@@ -0,0 +1,215 @@
+package flate
+
+import (
+	"fmt"
+
+	"github.com/klauspost/compress/internal/le"
+)
+
+// fastEncL1 is the level 1 encoder. It maintains a table of match candidates
+// and, via the embedded fastGen, the previous byte block (history).
+// This is the generic implementation.
+type fastEncL1 struct {
+	fastGen
+	table [tableSize]tableEntry
+}
+
+// Encode implements compression level 1 using a greedy, Snappy-like matching algorithm.
+func (e *fastEncL1) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashBytes              = 5
+	)
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+
+	for {
+		const skipLog = 5
+		const doEvery = 2
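+		// With skipLog = 5, the probe step grows by one for every 32 bytes scanned
+		// without a match, so incompressible input is skipped over quickly.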
+
+		nextS := s
+		var candidate tableEntry
+		var t int32
+		for {
+			nextHash := hashLen(cv, tableBits, hashBytes)
+			candidate = e.table[nextHash]
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+
+			now := load6432(src, nextS)
+			e.table[nextHash] = tableEntry{offset: s + e.cur}
+			nextHash = hashLen(now, tableBits, hashBytes)
+			t = candidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
+				break
+			}
+
+			// Do one right away...
+			cv = now
+			s = nextS
+			nextS++
+			candidate = e.table[nextHash]
+			now >>= 8
+			e.table[nextHash] = tableEntry{offset: s + e.cur}
+
+			t = candidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
+				break
+			}
+			cv = now
+			s = nextS
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			l := e.matchlenLong(int(s+4), int(t+4), src) + 4
+
+			// Extend backwards
+			for t > 0 && s > nextEmit && le.Load8(src, t-1) == le.Load8(src, s-1) {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
+			}
+
+			// Save the match found
+			if false {
+				dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			} else {
+				// Inlined...
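+				// A DEFLATE match token is limited to 258 bytes, so longer
+				// matches are emitted as multiple tokens below.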
+				xoffset := uint32(s - t - baseMatchOffset)
+				xlength := l
+				oc := offsetCode(xoffset)
+				xoffset |= oc << 16
+				for xlength > 0 {
+					xl := xlength
+					if xl > 258 {
+						if xl > 258+baseMatchLength {
+							xl = 258
+						} else {
+							xl = 258 - baseMatchLength
+						}
+					}
+					xlength -= xl
+					xl -= baseMatchLength
+					dst.extraHist[lengthCodes1[uint8(xl)]]++
+					dst.offHist[oc]++
+					dst.tokens[dst.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
+					dst.n++
+				}
+			}
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+			if s >= sLimit {
+				// Index first pair after match end.
+				if int(s+l+8) < len(src) {
+					cv := load6432(src, s)
+					e.table[hashLen(cv, tableBits, hashBytes)] = tableEntry{offset: s + e.cur}
+				}
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6432(src, s-2)
+			o := e.cur + s - 2
+			prevHash := hashLen(x, tableBits, hashBytes)
+			e.table[prevHash] = tableEntry{offset: o}
+			x >>= 16
+			currHash := hashLen(x, tableBits, hashBytes)
+			candidate = e.table[currHash]
+			e.table[currHash] = tableEntry{offset: o + 2}
+
+			t = candidate.offset - e.cur
+			if s-t > maxMatchOffset || uint32(x) != load3232(src, t) {
+				cv = x >> 8
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go
new file mode 100644
index 0000000..c8d047f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level2.go
@@ -0,0 +1,214 @@
+package flate
+
+import "fmt"
+
+// fastEncL2 is the level 2 encoder. It maintains a table of match candidates
+// and, via the embedded fastGen, the previous byte block (history).
+// This is the generic implementation.
+type fastEncL2 struct {
+	fastGen
+	table [bTableSize]tableEntry
+}
+
+// Encode uses a similar algorithm to level 1, but is capable of matching
+// across blocks, giving better compression at a small slowdown.
+func (e *fastEncL2) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashBytes              = 5
+	)
+
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		// When should we start skipping if we haven't found matches in a long while.
+		const skipLog = 5
+		const doEvery = 2
+
+		nextS := s
+		var candidate tableEntry
+		for {
+			nextHash := hashLen(cv, bTableBits, hashBytes)
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidate = e.table[nextHash]
+			now := load6432(src, nextS)
+			e.table[nextHash] = tableEntry{offset: s + e.cur}
+			nextHash = hashLen(now, bTableBits, hashBytes)
+
+			offset := s - (candidate.offset - e.cur)
+			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
+				break
+			}
+
+			// Do one right away...
+			cv = now
+			s = nextS
+			nextS++
+			candidate = e.table[nextHash]
+			now >>= 8
+			e.table[nextHash] = tableEntry{offset: s + e.cur}
+
+			offset = s - (candidate.offset - e.cur)
+			if offset < maxMatchOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
+				break
+			}
+			cv = now
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			t := candidate.offset - e.cur
+			l := e.matchlenLong(int(s+4), int(t+4), src) + 4
+
+			// Extend backwards
+			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
+			}
+
+			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+
+			if s >= sLimit {
+				// Index first pair after match end.
+				if int(s+l+8) < len(src) {
+					cv := load6432(src, s)
+					e.table[hashLen(cv, bTableBits, hashBytes)] = tableEntry{offset: s + e.cur}
+				}
+				goto emitRemainder
+			}
+
+			// Store every second hash in-between, but offset by 1.
+			for i := s - l + 2; i < s-5; i += 7 {
+				x := load6432(src, i)
+				nextHash := hashLen(x, bTableBits, hashBytes)
+				e.table[nextHash] = tableEntry{offset: e.cur + i}
+				// Skip one
+				x >>= 16
+				nextHash = hashLen(x, bTableBits, hashBytes)
+				e.table[nextHash] = tableEntry{offset: e.cur + i + 2}
+				// Skip one
+				x >>= 16
+				nextHash = hashLen(x, bTableBits, hashBytes)
+				e.table[nextHash] = tableEntry{offset: e.cur + i + 4}
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 to s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6432(src, s-2)
+			o := e.cur + s - 2
+			prevHash := hashLen(x, bTableBits, hashBytes)
+			prevHash2 := hashLen(x>>8, bTableBits, hashBytes)
+			e.table[prevHash] = tableEntry{offset: o}
+			e.table[prevHash2] = tableEntry{offset: o + 1}
+			currHash := hashLen(x>>16, bTableBits, hashBytes)
+			candidate = e.table[currHash]
+			e.table[currHash] = tableEntry{offset: o + 2}
+
+			offset := s - (candidate.offset - e.cur)
+			if offset > maxMatchOffset || uint32(x>>16) != load3232(src, candidate.offset-e.cur) {
+				cv = x >> 24
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go
new file mode 100644
index 0000000..33f9fb1
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level3.go
@@ -0,0 +1,241 @@
+package flate
+
+import "fmt"
+
+// fastEncL3 is the level 3 encoder. Its table keeps the current and
+// previous candidate for each hash.
+type fastEncL3 struct {
+	fastGen
+	table [1 << 16]tableEntryPrev
+}
+
+// Encode uses a similar algorithm to level 2, but will check up to two candidates.
+func (e *fastEncL3) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		tableBits              = 16
+		tableSize              = 1 << tableBits
+		hashBytes              = 5
+	)
+
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+			}
+			if v.Prev.offset <= minOff {
+				v.Prev.offset = 0
+			} else {
+				v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+			}
+			e.table[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// Skip if too small.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		const skipLog = 7
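+		// skipLog = 7: skip ahead more slowly than levels 1 and 2, trading speed for more match attempts.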
+		nextS := s
+		var candidate tableEntry
+		for {
+			nextHash := hashLen(cv, tableBits, hashBytes)
+			s = nextS
+			nextS = s + 1 + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidates := e.table[nextHash]
+			now := load6432(src, nextS)
+
+			// Safe offset distance until s + 4...
+			minOffset := e.cur + s - (maxMatchOffset - 4)
+			e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}}
+
+			// Check both candidates
+			candidate = candidates.Cur
+			if candidate.offset < minOffset {
+				cv = now
+				// Previous will also be invalid, we have nothing.
+				continue
+			}
+
+			if uint32(cv) == load3232(src, candidate.offset-e.cur) {
+				if candidates.Prev.offset < minOffset || uint32(cv) != load3232(src, candidates.Prev.offset-e.cur) {
+					break
+				}
+				// Both match and are valid, pick longest.
+				offset := s - (candidate.offset - e.cur)
+				o2 := s - (candidates.Prev.offset - e.cur)
+				l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:])
+				if l2 > l1 {
+					candidate = candidates.Prev
+				}
+				break
+			} else {
+				// We only check if value mismatches.
+				// Offset will always be invalid in other cases.
+				candidate = candidates.Prev
+				if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
+					break
+				}
+			}
+			cv = now
+		}
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			//
+			t := candidate.offset - e.cur
+			l := e.matchlenLong(int(s+4), int(t+4), src) + 4
+
+			// Extend backwards
+			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
+			}
+
+			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+
+			if s >= sLimit {
+				t += l
+				// Index first pair after match end.
+				if int(t+8) < len(src) && t > 0 {
+					cv = load6432(src, t)
+					nextHash := hashLen(cv, tableBits, hashBytes)
+					e.table[nextHash] = tableEntryPrev{
+						Prev: e.table[nextHash].Cur,
+						Cur:  tableEntry{offset: e.cur + t},
+					}
+				}
+				goto emitRemainder
+			}
+
+			// Store every 5th hash in-between.
+			for i := s - l + 2; i < s-5; i += 6 {
+				nextHash := hashLen(load6432(src, i), tableBits, hashBytes)
+				e.table[nextHash] = tableEntryPrev{
+					Prev: e.table[nextHash].Cur,
+					Cur:  tableEntry{offset: e.cur + i}}
+			}
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 to s.
+			x := load6432(src, s-2)
+			prevHash := hashLen(x, tableBits, hashBytes)
+
+			e.table[prevHash] = tableEntryPrev{
+				Prev: e.table[prevHash].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 2},
+			}
+			x >>= 8
+			prevHash = hashLen(x, tableBits, hashBytes)
+
+			e.table[prevHash] = tableEntryPrev{
+				Prev: e.table[prevHash].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 1},
+			}
+			x >>= 8
+			currHash := hashLen(x, tableBits, hashBytes)
+			candidates := e.table[currHash]
+			cv = x
+			e.table[currHash] = tableEntryPrev{
+				Prev: candidates.Cur,
+				Cur:  tableEntry{offset: s + e.cur},
+			}
+
+			// Check both candidates
+			candidate = candidates.Cur
+			minOffset := e.cur + s - (maxMatchOffset - 4)
+
+			if candidate.offset > minOffset {
+				if uint32(cv) == load3232(src, candidate.offset-e.cur) {
+					// Found a match...
+					continue
+				}
+				candidate = candidates.Prev
+				if candidate.offset > minOffset && uint32(cv) == load3232(src, candidate.offset-e.cur) {
+					// Match at prev...
+					continue
+				}
+			}
+			cv = x >> 8
+			s++
+			break
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go
new file mode 100644
index 0000000..88509e1
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level4.go
@@ -0,0 +1,221 @@
+package flate
+
+import "fmt"
+
+type fastEncL4 struct {
+	fastGen
+	table  [tableSize]tableEntry
+	bTable [tableSize]tableEntry
+}
+
+func (e *fastEncL4) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashShortBytes         = 4
+	)
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntry{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.bTable[i].offset = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		const skipLog = 6
+		const doEvery = 1
+
+		nextS := s
+		var t int32
+		for {
+			nextHashS := hashLen(cv, tableBits, hashShortBytes)
+			nextHashL := hash7(cv, tableBits)
+
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate
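+			// e.table is indexed by a 4-byte hash (short matches),
+			// e.bTable by a 7-byte hash (long matches).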
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur}
+			e.table[nextHashS] = entry
+			e.bTable[nextHashL] = entry
+
+			t = lCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+				// We got a long match. Use that.
+				break
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+				// Found a 4-byte match...
+				lCandidate = e.bTable[hash7(next, tableBits)]
+
+				// If the next long is a candidate, check if we should use that instead...
+				lOff := lCandidate.offset - e.cur
+				if nextS-lOff < maxMatchOffset && load3232(src, lOff) == uint32(next) {
+					l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:])
+					if l2 > l1 {
+						s = nextS
+						t = lCandidate.offset - e.cur
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		// Extend the 4-byte match as long as possible.
+		l := e.matchlenLong(int(s+4), int(t+4), src) + 4
+
+		// Extend backwards
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
+		}
+		if debugDeflate {
+			if t >= s {
+				panic("s-t")
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			// Index first pair after match end.
+			if int(s+8) < len(src) {
+				cv := load6432(src, s)
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: s + e.cur}
+				e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur}
+			}
+			goto emitRemainder
+		}
+
+		// Store every 3rd hash in-between
+		if true {
+			i := nextS
+			if i < s-1 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur}
+				t2 := tableEntry{offset: t.offset + 1}
+				e.bTable[hash7(cv, tableBits)] = t
+				e.bTable[hash7(cv>>8, tableBits)] = t2
+				e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2
+
+				i += 3
+				for ; i < s-1; i += 3 {
+					cv := load6432(src, i)
+					t := tableEntry{offset: i + e.cur}
+					t2 := tableEntry{offset: t.offset + 1}
+					e.bTable[hash7(cv, tableBits)] = t
+					e.bTable[hash7(cv>>8, tableBits)] = t2
+					e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2
+				}
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first update the hash table at s-1 and at s.
+		x := load6432(src, s-1)
+		o := e.cur + s - 1
+		prevHashS := hashLen(x, tableBits, hashShortBytes)
+		prevHashL := hash7(x, tableBits)
+		e.table[prevHashS] = tableEntry{offset: o}
+		e.bTable[prevHashL] = tableEntry{offset: o}
+		cv = x >> 8
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go
new file mode 100644
index 0000000..6e5c215
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level5.go
@@ -0,0 +1,708 @@
+package flate
+
+import "fmt"
+
+type fastEncL5 struct {
+	fastGen
+	table  [tableSize]tableEntry
+	bTable [tableSize]tableEntryPrev
+}
+
+func (e *fastEncL5) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashShortBytes         = 4
+	)
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+				v.Prev.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+				if v.Prev.offset <= minOff {
+					v.Prev.offset = 0
+				} else {
+					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+				}
+			}
+			e.bTable[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		const skipLog = 6
+		const doEvery = 1
+
+		nextS := s
+		var l int32
+		var t int32
+		for {
+			nextHashS := hashLen(cv, tableBits, hashShortBytes)
+			nextHashL := hash7(cv, tableBits)
+
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur}
+			e.table[nextHashS] = entry
+			eLong := &e.bTable[nextHashL]
+			eLong.Cur, eLong.Prev = entry, eLong.Cur
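+			// bTable keeps the two most recent entries per hash: the new entry
+			// becomes Cur and the old Cur is shifted into Prev.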
+
+			nextHashS = hashLen(next, tableBits, hashShortBytes)
+			nextHashL = hash7(next, tableBits)
+
+			t = lCandidate.Cur.offset - e.cur
+			if s-t < maxMatchOffset {
+				if uint32(cv) == load3232(src, t) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+					t2 := lCandidate.Prev.offset - e.cur
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, t2) {
+						l = e.matchlen(int(s+4), int(t+4), src) + 4
+						ml1 := e.matchlen(int(s+4), int(t2+4), src) + 4
+						if ml1 > l {
+							t = t2
+							l = ml1
+							break
+						}
+					}
+					break
+				}
+				t = lCandidate.Prev.offset - e.cur
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+					break
+				}
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+				// Found a 4-byte match...
+				l = e.matchlen(int(s+4), int(t+4), src) + 4
+				lCandidate = e.bTable[nextHashL]
+				// Store the next match
+
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+				eLong := &e.bTable[nextHashL]
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+				// If the next long is a candidate, use that...
+				t2 := lCandidate.Cur.offset - e.cur
+				if nextS-t2 < maxMatchOffset {
+					if load3232(src, t2) == uint32(next) {
+						ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+					// If the previous long is a candidate, use that...
+					t2 = lCandidate.Prev.offset - e.cur
+					if nextS-t2 < maxMatchOffset && load3232(src, t2) == uint32(next) {
+						ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		if l == 0 {
+			// Extend the 4-byte match as long as possible.
+			l = e.matchlenLong(int(s+4), int(t+4), src) + 4
+		} else if l == maxMatchLength {
+			l += e.matchlenLong(int(s+l), int(t+l), src)
+		}
+
+		// Try to locate a better match by checking the end of best match...
+		if sAt := s + l; l < 30 && sAt < sLimit {
+			// Allow some bytes at the beginning to mismatch.
+			// Sweet spot is 2/3 bytes depending on input.
+			// 3 is only a little better when it is but sometimes a lot worse.
+			// The skipped bytes are tested in Extend backwards,
+			// and still picked up as part of the match if they do.
+			const skipBeginning = 2
+			eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset
+			t2 := eLong - e.cur - l + skipBeginning
+			s2 := s + skipBeginning
+			off := s2 - t2
+			if t2 >= 0 && off < maxMatchOffset && off > 0 {
+				if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l {
+					t = t2
+					l = l2
+					s = s2
+				}
+			}
+		}
+
+		// Extend backwards
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
+		}
+		if debugDeflate {
+			if t >= s {
+				panic(fmt.Sprintln("s-t", s, t))
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", s-t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		// Store every 3rd hash in-between.
+		if true {
+			const hashEvery = 3
+			i := s - l + 1
+			if i < s-1 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// Do a long at i+1
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				eLong = &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// We only have enough bits for a short entry at i+2
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
+
+				// Skip one - otherwise we risk hitting 's'
+				i += 4
+				for ; i < s-1; i += hashEvery {
+					cv := load6432(src, i)
+					t := tableEntry{offset: i + e.cur}
+					t2 := tableEntry{offset: t.offset + 1}
+					eLong := &e.bTable[hash7(cv, tableBits)]
+					eLong.Cur, eLong.Prev = t, eLong.Cur
+					e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2
+				}
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first update the hash table at s-1 and at s.
+		x := load6432(src, s-1)
+		o := e.cur + s - 1
+		prevHashS := hashLen(x, tableBits, hashShortBytes)
+		prevHashL := hash7(x, tableBits)
+		e.table[prevHashS] = tableEntry{offset: o}
+		eLong := &e.bTable[prevHashL]
+		eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur
+		cv = x >> 8
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
+
+// fastEncL5Window is a level 5 encoder,
+// but with a custom window size.
+type fastEncL5Window struct {
+	hist      []byte
+	cur       int32
+	maxOffset int32
+	table     [tableSize]tableEntry
+	bTable    [tableSize]tableEntryPrev
+}
+
+func (e *fastEncL5Window) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashShortBytes         = 4
+	)
+	maxMatchOffset := e.maxOffset
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+				v.Prev.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+				if v.Prev.offset <= minOff {
+					v.Prev.offset = 0
+				} else {
+					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+				}
+			}
+			e.bTable[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		const skipLog = 6
+		const doEvery = 1
+
+		nextS := s
+		var l int32
+		var t int32
+		for {
+			nextHashS := hashLen(cv, tableBits, hashShortBytes)
+			nextHashL := hash7(cv, tableBits)
+
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur}
+			e.table[nextHashS] = entry
+			eLong := &e.bTable[nextHashL]
+			eLong.Cur, eLong.Prev = entry, eLong.Cur
+
+			nextHashS = hashLen(next, tableBits, hashShortBytes)
+			nextHashL = hash7(next, tableBits)
+
+			t = lCandidate.Cur.offset - e.cur
+			if s-t < maxMatchOffset {
+				if uint32(cv) == load3232(src, t) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+					t2 := lCandidate.Prev.offset - e.cur
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, t2) {
+						l = e.matchlen(s+4, t+4, src) + 4
+						ml1 := e.matchlen(s+4, t2+4, src) + 4
+						if ml1 > l {
+							t = t2
+							l = ml1
+							break
+						}
+					}
+					break
+				}
+				t = lCandidate.Prev.offset - e.cur
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+					break
+				}
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+				// Found a 4-byte match...
+				l = e.matchlen(s+4, t+4, src) + 4
+				lCandidate = e.bTable[nextHashL]
+				// Store the next match
+
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+				eLong := &e.bTable[nextHashL]
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+				// If the next long is a candidate, use that...
+				t2 := lCandidate.Cur.offset - e.cur
+				if nextS-t2 < maxMatchOffset {
+					if load3232(src, t2) == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+					// If the previous long is a candidate, use that...
+					t2 = lCandidate.Prev.offset - e.cur
+					if nextS-t2 < maxMatchOffset && load3232(src, t2) == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		if l == 0 {
+			// Extend the 4-byte match as long as possible.
+			l = e.matchlenLong(s+4, t+4, src) + 4
+		} else if l == maxMatchLength {
+			l += e.matchlenLong(s+l, t+l, src)
+		}
+
+		// Try to locate a better match by checking the end of the best match...
+		if sAt := s + l; l < 30 && sAt < sLimit {
+			// Allow some bytes at the beginning to mismatch.
+			// The sweet spot is 2-3 bytes, depending on input;
+			// 3 is only a little better when it helps, but sometimes a lot worse.
+			// The skipped bytes are tested in the "Extend backwards" step below,
+			// and are still picked up as part of the match if they match.
+			const skipBeginning = 2
+			eLong := e.bTable[hash7(load6432(src, sAt), tableBits)].Cur.offset
+			t2 := eLong - e.cur - l + skipBeginning
+			s2 := s + skipBeginning
+			off := s2 - t2
+			if t2 >= 0 && off < maxMatchOffset && off > 0 {
+				if l2 := e.matchlenLong(s2, t2, src); l2 > l {
+					t = t2
+					l = l2
+					s = s2
+				}
+			}
+		}
+
+		// Extend backwards
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
+		}
+		if debugDeflate {
+			if t >= s {
+				panic(fmt.Sprintln("s-t", s, t))
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", s-t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		// Store every 3rd hash in-between.
+		if true {
+			const hashEvery = 3
+			i := s - l + 1
+			if i < s-1 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// Do a long entry at i+1
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				eLong = &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// We only have enough bits for a short entry at i+2
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
+
+				// Skip one - otherwise we risk hitting 's'
+				i += 4
+				for ; i < s-1; i += hashEvery {
+					cv := load6432(src, i)
+					t := tableEntry{offset: i + e.cur}
+					t2 := tableEntry{offset: t.offset + 1}
+					eLong := &e.bTable[hash7(cv, tableBits)]
+					eLong.Cur, eLong.Prev = t, eLong.Cur
+					e.table[hashLen(cv>>8, tableBits, hashShortBytes)] = t2
+				}
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first update the hash table at s-1 and at s.
+		x := load6432(src, s-1)
+		o := e.cur + s - 1
+		prevHashS := hashLen(x, tableBits, hashShortBytes)
+		prevHashL := hash7(x, tableBits)
+		e.table[prevHashS] = tableEntry{offset: o}
+		eLong := &e.bTable[prevHashL]
+		eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur
+		cv = x >> 8
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
+
+// Reset the encoding table.
+func (e *fastEncL5Window) Reset() {
+	// We keep the same allocs, since we are compressing the same block sizes.
+	if cap(e.hist) < allocHistory {
+		e.hist = make([]byte, 0, allocHistory)
+	}
+
+	// We offset the current position so everything will be out of reach.
+	// If we are above bufferReset, it will be cleared anyway since len(hist) == 0.
+	if e.cur <= int32(bufferReset) {
+		e.cur += e.maxOffset + int32(len(e.hist))
+	}
+	e.hist = e.hist[:0]
+}
+
+func (e *fastEncL5Window) addBlock(src []byte) int32 {
+	// check if we have space already
+	maxMatchOffset := e.maxOffset
+
+	if len(e.hist)+len(src) > cap(e.hist) {
+		if cap(e.hist) == 0 {
+			e.hist = make([]byte, 0, allocHistory)
+		} else {
+			if cap(e.hist) < int(maxMatchOffset*2) {
+				panic("unexpected buffer size")
+			}
+			// Move down
+			offset := int32(len(e.hist)) - maxMatchOffset
+			copy(e.hist[0:maxMatchOffset], e.hist[offset:])
+			e.cur += offset
+			e.hist = e.hist[:maxMatchOffset]
+		}
+	}
+	s := int32(len(e.hist))
+	e.hist = append(e.hist, src...)
+	return s
+}
+
+// matchlen will return the match length between offsets s and t in src.
+// The maximum length returned is maxMatchLength - 4.
+// It is assumed that s > t, that t >= 0 and that s < len(src).
+func (e *fastEncL5Window) matchlen(s, t int32, src []byte) int32 {
+	if debugDecode {
+		if t >= s {
+			panic(fmt.Sprint("t >=s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > e.maxOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchOffset (", e.maxOffset, ")"))
+		}
+	}
+	s1 := int(s) + maxMatchLength - 4
+	if s1 > len(src) {
+		s1 = len(src)
+	}
+
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:s1], src[t:]))
+}
+
+// matchlenLong will return the match length between offsets s and t in src.
+// It is assumed that s > t, that t >= 0 and that s < len(src).
+func (e *fastEncL5Window) matchlenLong(s, t int32, src []byte) int32 {
+	if debugDeflate {
+		if t >= s {
+			panic(fmt.Sprint("t >=s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > e.maxOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchOffset (", e.maxOffset, ")"))
+		}
+	}
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:], src[t:]))
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go
new file mode 100644
index 0000000..96f5bb4
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level6.go
@@ -0,0 +1,325 @@
+package flate
+
+import "fmt"
+
+type fastEncL6 struct {
+	fastGen
+	table  [tableSize]tableEntry
+	bTable [tableSize]tableEntryPrev
+}
+
+func (e *fastEncL6) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+		hashShortBytes         = 4
+	)
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+				v.Prev.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+				if v.Prev.offset <= minOff {
+					v.Prev.offset = 0
+				} else {
+					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+				}
+			}
+			e.bTable[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	// Repeat MUST be > 1 and within range
+	repeat := int32(1)
+	for {
+		const skipLog = 7
+		const doEvery = 1
+
+		nextS := s
+		var l int32
+		var t int32
+		for {
+			nextHashS := hashLen(cv, tableBits, hashShortBytes)
+			nextHashL := hash7(cv, tableBits)
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur}
+			e.table[nextHashS] = entry
+			eLong := &e.bTable[nextHashL]
+			eLong.Cur, eLong.Prev = entry, eLong.Cur
+
+			// Calculate hashes of 'next'
+			nextHashS = hashLen(next, tableBits, hashShortBytes)
+			nextHashL = hash7(next, tableBits)
+
+			t = lCandidate.Cur.offset - e.cur
+			if s-t < maxMatchOffset {
+				if uint32(cv) == load3232(src, t) {
+					// Long candidate matches at least 4 bytes.
+
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+					// Check the previous long candidate as well.
+					t2 := lCandidate.Prev.offset - e.cur
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, t2) {
+						l = e.matchlen(int(s+4), int(t+4), src) + 4
+						ml1 := e.matchlen(int(s+4), int(t2+4), src) + 4
+						if ml1 > l {
+							t = t2
+							l = ml1
+							break
+						}
+					}
+					break
+				}
+				// Current value did not match, but check if previous long value does.
+				t = lCandidate.Prev.offset - e.cur
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+					break
+				}
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, t) {
+				// Found a 4-byte match...
+				l = e.matchlen(int(s+4), int(t+4), src) + 4
+
+				// Look up next long candidate (at nextS)
+				lCandidate = e.bTable[nextHashL]
+
+				// Store the next match
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+				eLong := &e.bTable[nextHashL]
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+				// Check repeat at s + repOff
+				const repOff = 1
+				t2 := s - repeat + repOff
+				if load3232(src, t2) == uint32(cv>>(8*repOff)) {
+					ml := e.matchlen(int(s+4+repOff), int(t2+4), src) + 4
+					if ml > l {
+						t = t2
+						l = ml
+						s += repOff
+						// Not worth checking more.
+						break
+					}
+				}
+
+				// If the next long is a candidate, use that...
+				t2 = lCandidate.Cur.offset - e.cur
+				if nextS-t2 < maxMatchOffset {
+					if load3232(src, t2) == uint32(next) {
+						ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							// This is ok, but check previous as well.
+						}
+					}
+					// If the previous long is a candidate, use that...
+					t2 = lCandidate.Prev.offset - e.cur
+					if nextS-t2 < maxMatchOffset && load3232(src, t2) == uint32(next) {
+						ml := e.matchlen(int(nextS+4), int(t2+4), src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		// Extend the 4-byte match as long as possible.
+		if l == 0 {
+			l = e.matchlenLong(int(s+4), int(t+4), src) + 4
+		} else if l == maxMatchLength {
+			l += e.matchlenLong(int(s+l), int(t+l), src)
+		}
+
+		// Try to locate a better match by checking the end-of-match...
+		if sAt := s + l; sAt < sLimit {
+			// Allow some bytes at the beginning to mismatch.
+			// The sweet spot is 2-3 bytes, depending on input;
+			// 3 is only a little better when it helps, but sometimes a lot worse.
+			// The skipped bytes are tested in the "Extend backwards" step below,
+			// and are still picked up as part of the match if they match.
+			const skipBeginning = 2
+			eLong := &e.bTable[hash7(load6432(src, sAt), tableBits)]
+			// Test current
+			t2 := eLong.Cur.offset - e.cur - l + skipBeginning
+			s2 := s + skipBeginning
+			off := s2 - t2
+			if off < maxMatchOffset {
+				if off > 0 && t2 >= 0 {
+					if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l {
+						t = t2
+						l = l2
+						s = s2
+					}
+				}
+				// Also test the previous long entry:
+				t2 = eLong.Prev.offset - e.cur - l + skipBeginning
+				off := s2 - t2
+				if off > 0 && off < maxMatchOffset && t2 >= 0 {
+					if l2 := e.matchlenLong(int(s2), int(t2), src); l2 > l {
+						t = t2
+						l = l2
+						s = s2
+					}
+				}
+			}
+		}
+
+		// Extend backwards
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			if false {
+				emitLiteral(dst, src[nextEmit:s])
+			} else {
+				for _, v := range src[nextEmit:s] {
+					dst.tokens[dst.n] = token(v)
+					dst.litHist[v]++
+					dst.n++
+				}
+			}
+		}
+		if false {
+			if t >= s {
+				panic(fmt.Sprintln("s-t", s, t))
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", s-t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		repeat = s - t
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			// Index after match end.
+			for i := nextS + 1; i < int32(len(src))-8; i += 2 {
+				cv := load6432(src, i)
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = tableEntry{offset: i + e.cur}
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur
+			}
+			goto emitRemainder
+		}
+
+		// Store every long hash in-between and every second short.
+		if true {
+			for i := nextS + 1; i < s-1; i += 2 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur}
+				t2 := tableEntry{offset: t.offset + 1}
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong2 := &e.bTable[hash7(cv>>8, tableBits)]
+				e.table[hashLen(cv, tableBits, hashShortBytes)] = t
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+				eLong2.Cur, eLong2.Prev = t2, eLong2.Cur
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first load the next hash input at s.
+		cv = load6432(src, s)
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/matchlen_generic.go b/vendor/github.com/klauspost/compress/flate/matchlen_generic.go
new file mode 100644
index 0000000..6149384
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/matchlen_generic.go
@@ -0,0 +1,42 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+
+package flate
+
+import (
+	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
+)
+
+// matchLen returns the maximum common prefix length of a and b.
+// a must be the shorter of the two.
+func matchLen(a, b []byte) (n int) {
+	left := len(a)
+	for left >= 8 {
+		diff := le.Load64(a, n) ^ le.Load64(b, n)
+		if diff != 0 {
+			return n + bits.TrailingZeros64(diff)>>3
+		}
+		n += 8
+		left -= 8
+	}
+
+	a = a[n:]
+	b = b[n:]
+	for i := range a {
+		if a[i] != b[i] {
+			break
+		}
+		n++
+	}
+	return n
+}
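+
+// Editor's sketch (not part of upstream): the 8-byte loop XORs the two loads,
+// so the result stays zero while the inputs agree, and the trailing-zero
+// count of the first nonzero XOR locates the first differing byte:
+//
+//	a := []byte("gopher gopher!")
+//	b := []byte("gopher gophers")
+//	n := matchLen(a, b) // n == 13; the inputs first differ at index 13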
diff --git a/vendor/github.com/klauspost/compress/flate/regmask_amd64.go b/vendor/github.com/klauspost/compress/flate/regmask_amd64.go
new file mode 100644
index 0000000..6ed2806
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/regmask_amd64.go
@@ -0,0 +1,44 @@
+package flate
+
+const (
+	// Masks for shifts with register sizes of the shift value.
+	// This can be used to work around the x86 design of shifting by mod register size.
+	// It can be used when a variable shift is always smaller than the register size.
+
+	// reg8SizeMaskX - shift value is 8 bits, shifted is X
+	reg8SizeMask8  = 7
+	reg8SizeMask16 = 15
+	reg8SizeMask32 = 31
+	reg8SizeMask64 = 63
+
+	// reg16SizeMaskX - shift value is 16 bits, shifted is X
+	reg16SizeMask8  = reg8SizeMask8
+	reg16SizeMask16 = reg8SizeMask16
+	reg16SizeMask32 = reg8SizeMask32
+	reg16SizeMask64 = reg8SizeMask64
+
+	// reg32SizeMaskX - shift value is 32 bits, shifted is X
+	reg32SizeMask8  = reg8SizeMask8
+	reg32SizeMask16 = reg8SizeMask16
+	reg32SizeMask32 = reg8SizeMask32
+	reg32SizeMask64 = reg8SizeMask64
+
+	// reg64SizeMaskX - shift value is 64 bits, shifted is X
+	reg64SizeMask8  = reg8SizeMask8
+	reg64SizeMask16 = reg8SizeMask16
+	reg64SizeMask32 = reg8SizeMask32
+	reg64SizeMask64 = reg8SizeMask64
+
+	// regSizeMaskUintX - shift value is uint, shifted is X
+	regSizeMaskUint8  = reg8SizeMask8
+	regSizeMaskUint16 = reg8SizeMask16
+	regSizeMaskUint32 = reg8SizeMask32
+	regSizeMaskUint64 = reg8SizeMask64
+)
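+
+// Editor's note (a hedged sketch, not upstream documentation): Go normally
+// guards variable shifts against counts >= the register width. Masking the
+// count with one of the constants above proves it is in range, so on amd64
+// a bare shift instruction can be emitted, e.g.:
+//
+//	low := v >> (n & regSizeMaskUint64) // n must already be < 64 here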
diff --git a/vendor/github.com/klauspost/compress/flate/regmask_other.go b/vendor/github.com/klauspost/compress/flate/regmask_other.go
new file mode 100644
index 0000000..1b7a2cb
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/regmask_other.go
@@ -0,0 +1,40 @@
+//go:build !amd64
+// +build !amd64
+
+package flate
+
+const (
+	// Masks for shifts with register sizes of the shift value.
+	// This can be used to work around the x86 design of shifting by mod register size.
+	// It can be used when a variable shift is always smaller than the register size.
+
+	// reg8SizeMaskX - shift value is 8 bits, shifted is X
+	reg8SizeMask8  = 0xff
+	reg8SizeMask16 = 0xff
+	reg8SizeMask32 = 0xff
+	reg8SizeMask64 = 0xff
+
+	// reg16SizeMaskX - shift value is 16 bits, shifted is X
+	reg16SizeMask8  = 0xffff
+	reg16SizeMask16 = 0xffff
+	reg16SizeMask32 = 0xffff
+	reg16SizeMask64 = 0xffff
+
+	// reg32SizeMaskX - shift value is 32 bits, shifted is X
+	reg32SizeMask8  = 0xffffffff
+	reg32SizeMask16 = 0xffffffff
+	reg32SizeMask32 = 0xffffffff
+	reg32SizeMask64 = 0xffffffff
+
+	// reg64SizeMaskX - shift value is 64 bits, shifted is X
+	reg64SizeMask8  = 0xffffffffffffffff
+	reg64SizeMask16 = 0xffffffffffffffff
+	reg64SizeMask32 = 0xffffffffffffffff
+	reg64SizeMask64 = 0xffffffffffffffff
+
+	// regSizeMaskUintX - shift value is uint, shifted is X
+	regSizeMaskUint8  = ^uint(0)
+	regSizeMaskUint16 = ^uint(0)
+	regSizeMaskUint32 = ^uint(0)
+	regSizeMaskUint64 = ^uint(0)
+)
diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go
new file mode 100644
index 0000000..13b9b10
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/stateless.go
@@ -0,0 +1,332 @@
+package flate
+
+import (
+	"io"
+	"math"
+	"sync"
+
+	"github.com/klauspost/compress/internal/le"
+)
+
+const (
+	maxStatelessBlock = math.MaxInt16
+	// The dictionary is taken from the tail of the previous maxStatelessBlock bytes, so limit its size.
+	maxStatelessDict = 8 << 10
+
+	slTableBits  = 13
+	slTableSize  = 1 << slTableBits
+	slTableShift = 32 - slTableBits
+)
+
+type statelessWriter struct {
+	dst    io.Writer
+	closed bool
+}
+
+func (s *statelessWriter) Close() error {
+	if s.closed {
+		return nil
+	}
+	s.closed = true
+	// Emit EOF block
+	return StatelessDeflate(s.dst, nil, true, nil)
+}
+
+func (s *statelessWriter) Write(p []byte) (n int, err error) {
+	err = StatelessDeflate(s.dst, p, false, nil)
+	if err != nil {
+		return 0, err
+	}
+	return len(p), nil
+}
+
+func (s *statelessWriter) Reset(w io.Writer) {
+	s.dst = w
+	s.closed = false
+}
+
+// NewStatelessWriter will do compression but without maintaining any state
+// between Write calls.
+// There will be no memory kept between Write calls,
+// but compression and speed will be suboptimal.
+// Because of this, the size of actual Write calls will affect output size.
+func NewStatelessWriter(dst io.Writer) io.WriteCloser {
+	return &statelessWriter{dst: dst}
+}
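+
+// A usage sketch (editor's addition; data is a hypothetical input slice):
+//
+//	var buf bytes.Buffer
+//	w := NewStatelessWriter(&buf)
+//	_, err := w.Write(data) // larger writes compress better here
+//	if err == nil {
+//		err = w.Close() // Close emits the final EOF block
+//	}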
+
+// bitWriterPool contains bit writers that can be reused.
+var bitWriterPool = sync.Pool{
+	New: func() interface{} {
+		return newHuffmanBitWriter(nil)
+	},
+}
+
+// StatelessDeflate allows compressing directly to a Writer without retaining state.
+// When returning everything will be flushed.
+// Up to 8KB of an optional dictionary can be given which is presumed to precede the block.
+// Longer dictionaries will be truncated and will still produce valid output.
+// Sending nil dictionary is perfectly fine.
+func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
+	var dst tokens
+	bw := bitWriterPool.Get().(*huffmanBitWriter)
+	bw.reset(out)
+	defer func() {
+		// don't keep a reference to our output
+		bw.reset(nil)
+		bitWriterPool.Put(bw)
+	}()
+	if eof && len(in) == 0 {
+		// Just write an EOF block.
+		// Could be faster...
+		bw.writeStoredHeader(0, true)
+		bw.flush()
+		return bw.err
+	}
+
+	// Truncate dict
+	if len(dict) > maxStatelessDict {
+		dict = dict[len(dict)-maxStatelessDict:]
+	}
+
+	// For subsequent loops, keep shallow dict reference to avoid alloc+copy.
+	var inDict []byte
+
+	for len(in) > 0 {
+		todo := in
+		if len(inDict) > 0 {
+			if len(todo) > maxStatelessBlock-maxStatelessDict {
+				todo = todo[:maxStatelessBlock-maxStatelessDict]
+			}
+		} else if len(todo) > maxStatelessBlock-len(dict) {
+			todo = todo[:maxStatelessBlock-len(dict)]
+		}
+		inOrg := in
+		in = in[len(todo):]
+		uncompressed := todo
+		if len(dict) > 0 {
+			// combine dict and source
+			bufLen := len(todo) + len(dict)
+			combined := make([]byte, bufLen)
+			copy(combined, dict)
+			copy(combined[len(dict):], todo)
+			todo = combined
+		}
+		// Compress
+		if len(inDict) == 0 {
+			statelessEnc(&dst, todo, int16(len(dict)))
+		} else {
+			statelessEnc(&dst, inDict[:maxStatelessDict+len(todo)], maxStatelessDict)
+		}
+		isEof := eof && len(in) == 0
+
+		if dst.n == 0 {
+			bw.writeStoredHeader(len(uncompressed), isEof)
+			if bw.err != nil {
+				return bw.err
+			}
+			bw.writeBytes(uncompressed)
+		} else if int(dst.n) > len(uncompressed)-len(uncompressed)>>4 {
+			// If we removed less than 1/16th, huffman compress the block.
+			bw.writeBlockHuff(isEof, uncompressed, len(in) == 0)
+		} else {
+			bw.writeBlockDynamic(&dst, isEof, uncompressed, len(in) == 0)
+		}
+		if len(in) > 0 {
+			// Retain a dict if we have more
+			inDict = inOrg[len(uncompressed)-maxStatelessDict:]
+			dict = nil
+			dst.Reset()
+		}
+		if bw.err != nil {
+			return bw.err
+		}
+	}
+	if !eof {
+		// Align, only a stored block can do that.
+		bw.writeStoredHeader(0, false)
+	}
+	bw.flush()
+	return bw.err
+}
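+
+// Editor's sketch of direct use (out, block and dict are hypothetical): only
+// the last maxStatelessDict (8KB) bytes of dict are used, and they are
+// presumed to immediately precede block in the original stream.
+//
+//	if err := StatelessDeflate(out, block, true, dict); err != nil {
+//		return err
+//	}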
+
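+// hashSL is a multiplicative hash: the odd constant spreads input bits toward
+// the high end, and the shift keeps the top slTableBits bits as the index.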
+func hashSL(u uint32) uint32 {
+	return (u * 0x1e35a7bd) >> slTableShift
+}
+
+func load3216(b []byte, i int16) uint32 {
+	return le.Load32(b, i)
+}
+
+func load6416(b []byte, i int16) uint64 {
+	return le.Load64(b, i)
+}
+
+func statelessEnc(dst *tokens, src []byte, startAt int16) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	type tableEntry struct {
+		offset int16
+	}
+
+	var table [slTableSize]tableEntry
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src)-int(startAt) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = 0
+		return
+	}
+	// Index until startAt
+	if startAt > 0 {
+		cv := load3232(src, 0)
+		for i := int16(0); i < startAt; i++ {
+			table[hashSL(cv)] = tableEntry{offset: i}
+			cv = (cv >> 8) | (uint32(src[i+4]) << 24)
+		}
+	}
+
+	s := startAt + 1
+	nextEmit := startAt
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int16(len(src) - inputMargin)
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	cv := load3216(src, s)
+
+	for {
+		const skipLog = 5
+		const doEvery = 2
+
+		nextS := s
+		var candidate tableEntry
+		for {
+			nextHash := hashSL(cv)
+			candidate = table[nextHash]
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit || nextS <= 0 {
+				goto emitRemainder
+			}
+
+			now := load6416(src, nextS)
+			table[nextHash] = tableEntry{offset: s}
+			nextHash = hashSL(uint32(now))
+
+			if cv == load3216(src, candidate.offset) {
+				table[nextHash] = tableEntry{offset: nextS}
+				break
+			}
+
+			// Do one right away...
+			cv = uint32(now)
+			s = nextS
+			nextS++
+			candidate = table[nextHash]
+			now >>= 8
+			table[nextHash] = tableEntry{offset: s}
+
+			if cv == load3216(src, candidate.offset) {
+				table[nextHash] = tableEntry{offset: nextS}
+				break
+			}
+			cv = uint32(now)
+			s = nextS
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			t := candidate.offset
+			l := int16(matchLen(src[s+4:], src[t+4:]) + 4)
+
+			// Extend backwards
+			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				if false {
+					emitLiteral(dst, src[nextEmit:s])
+				} else {
+					for _, v := range src[nextEmit:s] {
+						dst.tokens[dst.n] = token(v)
+						dst.litHist[v]++
+						dst.n++
+					}
+				}
+			}
+
+			// Save the match found
+			dst.AddMatchLong(int32(l), uint32(s-t-baseMatchOffset))
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6416(src, s-2)
+			o := s - 2
+			prevHash := hashSL(uint32(x))
+			table[prevHash] = tableEntry{offset: o}
+			x >>= 16
+			currHash := hashSL(uint32(x))
+			candidate = table[currHash]
+			table[currHash] = tableEntry{offset: o + 2}
+
+			if uint32(x) != load3216(src, candidate.offset) {
+				cv = uint32(x >> 8)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go
new file mode 100644
index 0000000..d818790
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/token.go
@@ -0,0 +1,407 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"math"
+)
+
+const (
+	// bits 0-16  	xoffset = offset - MIN_OFFSET_SIZE, or literal - 16 bits
+	// bits 16-22	offsetcode - 5 bits
+	// bits 22-30   xlength = length - MIN_MATCH_LENGTH - 8 bits
+	// bits 30-32   type   0 = literal  1=EOF  2=Match   3=Unused - 2 bits
+	lengthShift         = 22
+	offsetMask          = 1<<lengthShift - 1
+	typeMask            = 3 << 30
+	literalType         = 0 << 30
+	matchType           = 1 << 30
+	matchOffsetOnlyMask = 0xffff
+)
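+
+// Editor's worked example of the layout above: a match token with xlength 10
+// and xoffset 300 first folds the offset code (offsetCode(300) == 16, see
+// below) into bits 16-21 and is stored as
+//
+//	token(matchType | 10<<lengthShift | 16<<16 | 300)
+//
+// so length() recovers 10, offset()&matchOffsetOnlyMask recovers 300, and
+// typ() recovers matchType.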
+
+// The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
+// is lengthCodes[length - MIN_MATCH_LENGTH]
+var lengthCodes = [256]uint8{
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
+	9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
+	13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
+	15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+	17, 17, 17, 17, 17, 17, 17, 17, 18, 18,
+	18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
+	19, 19, 19, 19, 20, 20, 20, 20, 20, 20,
+	20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+	21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
+	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+	22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+	23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 28,
+}
+
+// lengthCodes1 is length codes, but starting at 1.
+var lengthCodes1 = [256]uint8{
+	1, 2, 3, 4, 5, 6, 7, 8, 9, 9,
+	10, 10, 11, 11, 12, 12, 13, 13, 13, 13,
+	14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
+	16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
+	18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
+	19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
+	20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
+	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+	22, 22, 22, 22, 22, 22, 23, 23, 23, 23,
+	23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+	23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 29,
+}
+
+var offsetCodes = [256]uint32{
+	0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
+	8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+}
+
+// offsetCodes14 are offsetCodes, but with 14 added.
+var offsetCodes14 = [256]uint32{
+	14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+	22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+}
+
+type token uint32
+
+type tokens struct {
+	extraHist [32]uint16  // codes 256->maxnumlit
+	offHist   [32]uint16  // offset codes
+	litHist   [256]uint16 // codes 0->255
+	nFilled   int
+	n         uint16 // Must be able to contain maxStoreBlockSize
+	tokens    [maxStoreBlockSize + 1]token
+}
+
+func (t *tokens) Reset() {
+	if t.n == 0 {
+		return
+	}
+	t.n = 0
+	t.nFilled = 0
+	for i := range t.litHist[:] {
+		t.litHist[i] = 0
+	}
+	for i := range t.extraHist[:] {
+		t.extraHist[i] = 0
+	}
+	for i := range t.offHist[:] {
+		t.offHist[i] = 0
+	}
+}
+
+func (t *tokens) Fill() {
+	if t.n == 0 {
+		return
+	}
+	for i, v := range t.litHist[:] {
+		if v == 0 {
+			t.litHist[i] = 1
+			t.nFilled++
+		}
+	}
+	for i, v := range t.extraHist[:literalCount-256] {
+		if v == 0 {
+			t.nFilled++
+			t.extraHist[i] = 1
+		}
+	}
+	for i, v := range t.offHist[:offsetCodeCount] {
+		if v == 0 {
+			t.offHist[i] = 1
+		}
+	}
+}
+
+func indexTokens(in []token) tokens {
+	var t tokens
+	t.indexTokens(in)
+	return t
+}
+
+func (t *tokens) indexTokens(in []token) {
+	t.Reset()
+	for _, tok := range in {
+		if tok < matchType {
+			t.AddLiteral(tok.literal())
+			continue
+		}
+		t.AddMatch(uint32(tok.length()), tok.offset()&matchOffsetOnlyMask)
+	}
+}
+
+// emitLiteral writes a literal chunk to the token stream.
+func emitLiteral(dst *tokens, lit []byte) {
+	for _, v := range lit {
+		dst.tokens[dst.n] = token(v)
+		dst.litHist[v]++
+		dst.n++
+	}
+}
+
+func (t *tokens) AddLiteral(lit byte) {
+	t.tokens[t.n] = token(lit)
+	t.litHist[lit]++
+	t.n++
+}
+
+// from https://stackoverflow.com/a/28730362
+func mFastLog2(val float32) float32 {
+	ux := int32(math.Float32bits(val))
+	log2 := (float32)(((ux >> 23) & 255) - 128)
+	ux &= -0x7f800001
+	ux += 127 << 23
+	uval := math.Float32frombits(uint32(ux))
+	log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759
+	return log2
+}
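+
+// Editor's note: the bit tricks above split the float into exponent and
+// mantissa, and the polynomial approximates log2 of the mantissa in [1, 2);
+// e.g. mFastLog2(8) returns about 3.005, which is plenty of precision for
+// the Shannon-entropy estimate in EstimatedBits below.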
+
+// EstimatedBits will return a minimum size, in bits, estimated by an
+// *optimal* compression of the block.
+// The estimate counts the entropy-coded symbols plus all extra bits.
+func (t *tokens) EstimatedBits() int {
+	shannon := float32(0)
+	bits := int(0)
+	nMatches := 0
+	total := int(t.n) + t.nFilled
+	if total > 0 {
+		invTotal := 1.0 / float32(total)
+		for _, v := range t.litHist[:] {
+			if v > 0 {
+				n := float32(v)
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
+			}
+		}
+		// Just add 15 for EOB
+		shannon += 15
+		for i, v := range t.extraHist[1 : literalCount-256] {
+			if v > 0 {
+				n := float32(v)
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
+				bits += int(lengthExtraBits[i&31]) * int(v)
+				nMatches += int(v)
+			}
+		}
+	}
+	if nMatches > 0 {
+		invTotal := 1.0 / float32(nMatches)
+		for i, v := range t.offHist[:offsetCodeCount] {
+			if v > 0 {
+				n := float32(v)
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
+				bits += int(offsetExtraBits[i&31]) * int(v)
+			}
+		}
+	}
+	return int(shannon) + bits
+}
+
+// AddMatch adds a match to the tokens.
+// This function is very sensitive to inlining and right on the border.
+func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
+	if debugDeflate {
+		if xlength >= maxMatchLength+baseMatchLength {
+			panic(fmt.Errorf("invalid length: %v", xlength))
+		}
+		if xoffset >= maxMatchOffset+baseMatchOffset {
+			panic(fmt.Errorf("invalid offset: %v", xoffset))
+		}
+	}
+	oCode := offsetCode(xoffset)
+	xoffset |= oCode << 16
+
+	t.extraHist[lengthCodes1[uint8(xlength)]]++
+	t.offHist[oCode&31]++
+	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
+	t.n++
+}
+
+// AddMatchLong adds a match to the tokens, potentially longer than max match length.
+// Length should NOT have the base subtracted, only offset should.
+func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
+	if debugDeflate {
+		if xoffset >= maxMatchOffset+baseMatchOffset {
+			panic(fmt.Errorf("invalid offset: %v", xoffset))
+		}
+	}
+	oc := offsetCode(xoffset)
+	xoffset |= oc << 16
+	for xlength > 0 {
+		xl := xlength
+		if xl > 258 {
+			// We need to have at least baseMatchLength left over for next loop.
+			if xl > 258+baseMatchLength {
+				xl = 258
+			} else {
+				xl = 258 - baseMatchLength
+			}
+		}
+		xlength -= xl
+		xl -= baseMatchLength
+		t.extraHist[lengthCodes1[uint8(xl)]]++
+		t.offHist[oc&31]++
+		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
+		t.n++
+	}
+}
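+
+// Editor's worked example: with baseMatchLength == 3, AddMatchLong(600, off)
+// splits the match into pieces of 258, 258 and 84 bytes (each stored with
+// baseMatchLength subtracted). The 258+baseMatchLength guard ensures the
+// remainder after a split never drops below the minimum encodable length.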
+
+func (t *tokens) AddEOB() {
+	t.tokens[t.n] = token(endBlockMarker)
+	t.extraHist[0]++
+	t.n++
+}
+
+func (t *tokens) Slice() []token {
+	return t.tokens[:t.n]
+}
+
+// VarInt returns the tokens as varint encoded bytes.
+func (t *tokens) VarInt() []byte {
+	var b = make([]byte, binary.MaxVarintLen32*int(t.n))
+	var off int
+	for _, v := range t.tokens[:t.n] {
+		off += binary.PutUvarint(b[off:], uint64(v))
+	}
+	return b[:off]
+}
+
+// FromVarInt restores t to the varint encoded tokens provided.
+// Any data in t is removed.
+func (t *tokens) FromVarInt(b []byte) error {
+	var buf = bytes.NewReader(b)
+	var toks []token
+	for {
+		r, err := binary.ReadUvarint(buf)
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return err
+		}
+		toks = append(toks, token(r))
+	}
+	t.indexTokens(toks)
+	return nil
+}
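+
+// Editor's sketch of a round trip (t is a populated *tokens):
+//
+//	b := t.VarInt()
+//	var t2 tokens
+//	if err := t2.FromVarInt(b); err != nil {
+//		return err
+//	}
+//	// t2 now holds the same token stream with rebuilt histograms.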
+
+// Returns the type of a token
+func (t token) typ() uint32 { return uint32(t) & typeMask }
+
+// Returns the literal of a literal token
+func (t token) literal() uint8 { return uint8(t) }
+
+// Returns the extra offset of a match token
+func (t token) offset() uint32 { return uint32(t) & offsetMask }
+
+func (t token) length() uint8 { return uint8(t >> lengthShift) }
+
+// Convert length to code.
+func lengthCode(len uint8) uint8 { return lengthCodes[len] }
+
+// Returns the offset code corresponding to a specific offset
+func offsetCode(off uint32) uint32 {
+	if false {
+		if off < uint32(len(offsetCodes)) {
+			return offsetCodes[off&255]
+		} else if off>>7 < uint32(len(offsetCodes)) {
+			return offsetCodes[(off>>7)&255] + 14
+		} else {
+			return offsetCodes[(off>>14)&255] + 28
+		}
+	}
+	if off < uint32(len(offsetCodes)) {
+		return offsetCodes[uint8(off)]
+	}
+	return offsetCodes14[uint8(off>>7)]
+}
diff --git a/vendor/github.com/klauspost/compress/fse/bitwriter.go b/vendor/github.com/klauspost/compress/fse/bitwriter.go
index 43e4636..e82fa3b 100644
--- a/vendor/github.com/klauspost/compress/fse/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/fse/bitwriter.go
@@ -152,12 +152,11 @@
 
 // close will write the alignment bit and write the final byte(s)
 // to the output.
-func (b *bitWriter) close() error {
+func (b *bitWriter) close() {
 	// End mark
 	b.addBits16Clean(1, 1)
 	// flush until next byte.
 	b.flushAlign()
-	return nil
 }
 
 // reset and continue writing by appending to out.
diff --git a/vendor/github.com/klauspost/compress/fse/compress.go b/vendor/github.com/klauspost/compress/fse/compress.go
index 6f34191..074018d 100644
--- a/vendor/github.com/klauspost/compress/fse/compress.go
+++ b/vendor/github.com/klauspost/compress/fse/compress.go
@@ -146,54 +146,51 @@
 		c1.encodeZero(tt[src[ip-2]])
 		ip -= 2
 	}
+	src = src[:ip]
 
 	// Main compression loop.
 	switch {
 	case !s.zeroBits && s.actualTableLog <= 8:
 		// We can encode 4 symbols without requiring a flush.
 		// We do not need to check if any output is 0 bits.
-		for ip >= 4 {
+		for ; len(src) >= 4; src = src[:len(src)-4] {
 			s.bw.flush32()
-			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
 			c2.encode(tt[v0])
 			c1.encode(tt[v1])
 			c2.encode(tt[v2])
 			c1.encode(tt[v3])
-			ip -= 4
 		}
 	case !s.zeroBits:
 		// We do not need to check if any output is 0 bits.
-		for ip >= 4 {
+		for ; len(src) >= 4; src = src[:len(src)-4] {
 			s.bw.flush32()
-			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
 			c2.encode(tt[v0])
 			c1.encode(tt[v1])
 			s.bw.flush32()
 			c2.encode(tt[v2])
 			c1.encode(tt[v3])
-			ip -= 4
 		}
 	case s.actualTableLog <= 8:
 		// We can encode 4 symbols without requiring a flush
-		for ip >= 4 {
+		for ; len(src) >= 4; src = src[:len(src)-4] {
 			s.bw.flush32()
-			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
 			c2.encodeZero(tt[v0])
 			c1.encodeZero(tt[v1])
 			c2.encodeZero(tt[v2])
 			c1.encodeZero(tt[v3])
-			ip -= 4
 		}
 	default:
-		for ip >= 4 {
+		for ; len(src) >= 4; src = src[:len(src)-4] {
 			s.bw.flush32()
-			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1]
 			c2.encodeZero(tt[v0])
 			c1.encodeZero(tt[v1])
 			s.bw.flush32()
 			c2.encodeZero(tt[v2])
 			c1.encodeZero(tt[v3])
-			ip -= 4
 		}
 	}
 
@@ -202,7 +199,8 @@
 	c2.flush(s.actualTableLog)
 	c1.flush(s.actualTableLog)
 
-	return s.bw.close()
+	s.bw.close()
+	return nil
 }
 
 // writeCount will write the normalized histogram count to header.
@@ -214,7 +212,7 @@
 		previous0 bool
 		charnum   uint16
 
-		maxHeaderSize = ((int(s.symbolLen) * int(tableLog)) >> 3) + 3
+		maxHeaderSize = ((int(s.symbolLen)*int(tableLog) + 4 + 2) >> 3) + 3
 
 		// Write Table Size
 		bitStream = uint32(tableLog - minTablelog)
@@ -459,15 +457,17 @@
 	for _, v := range in {
 		s.count[v]++
 	}
-	m := uint32(0)
+	m, symlen := uint32(0), s.symbolLen
 	for i, v := range s.count[:] {
+		if v == 0 {
+			continue
+		}
 		if v > m {
 			m = v
 		}
-		if v > 0 {
-			s.symbolLen = uint16(i) + 1
-		}
+		symlen = uint16(i) + 1
 	}
+	s.symbolLen = symlen
 	return int(m)
 }
 
diff --git a/vendor/github.com/klauspost/compress/fse/decompress.go b/vendor/github.com/klauspost/compress/fse/decompress.go
index 926f5f1..0c7dd4f 100644
--- a/vendor/github.com/klauspost/compress/fse/decompress.go
+++ b/vendor/github.com/klauspost/compress/fse/decompress.go
@@ -15,7 +15,7 @@
 // It is possible, but by no way guaranteed that corrupt data will
 // return an error.
 // It is up to the caller to verify integrity of the returned data.
-// Use a predefined Scrach to set maximum acceptable output size.
+// Use a predefined Scratch to set maximum acceptable output size.
 func Decompress(b []byte, s *Scratch) ([]byte, error) {
 	s, err := s.prepare(b)
 	if err != nil {
@@ -260,7 +260,9 @@
 // If the buffer is over-read an error is returned.
 func (s *Scratch) decompress() error {
 	br := &s.bits
-	br.init(s.br.unread())
+	if err := br.init(s.br.unread()); err != nil {
+		return err
+	}
 
 	var s1, s2 decoder
 	// Initialize and decode first state and symbol.
diff --git a/vendor/github.com/klauspost/compress/gzip/gunzip.go b/vendor/github.com/klauspost/compress/gzip/gunzip.go
new file mode 100644
index 0000000..00a0a2c
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/gzip/gunzip.go
@@ -0,0 +1,401 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package gzip implements reading and writing of gzip format compressed files,
+// as specified in RFC 1952.
+package gzip
+
+import (
+	"bufio"
+	"compress/gzip"
+	"encoding/binary"
+	"hash/crc32"
+	"io"
+	"time"
+
+	"github.com/klauspost/compress/flate"
+)
+
+const (
+	gzipID1     = 0x1f
+	gzipID2     = 0x8b
+	gzipDeflate = 8
+	flagText    = 1 << 0
+	flagHdrCrc  = 1 << 1
+	flagExtra   = 1 << 2
+	flagName    = 1 << 3
+	flagComment = 1 << 4
+)
+
+var (
+	// ErrChecksum is returned when reading GZIP data that has an invalid checksum.
+	ErrChecksum = gzip.ErrChecksum
+	// ErrHeader is returned when reading GZIP data that has an invalid header.
+	ErrHeader = gzip.ErrHeader
+)
+
+var le = binary.LittleEndian
+
+// noEOF converts io.EOF to io.ErrUnexpectedEOF.
+func noEOF(err error) error {
+	if err == io.EOF {
+		return io.ErrUnexpectedEOF
+	}
+	return err
+}
+
+// The gzip file stores a header giving metadata about the compressed file.
+// That header is exposed as the fields of the Writer and Reader structs.
+//
+// Strings must be UTF-8 encoded and may only contain Unicode code points
+// U+0001 through U+00FF, due to limitations of the GZIP file format.
+type Header struct {
+	Comment string    // comment
+	Extra   []byte    // "extra data"
+	ModTime time.Time // modification time
+	Name    string    // file name
+	OS      byte      // operating system type
+}
+
+// A Reader is an io.Reader that can be read to retrieve
+// uncompressed data from a gzip-format compressed file.
+//
+// In general, a gzip file can be a concatenation of gzip files,
+// each with its own header. Reads from the Reader
+// return the concatenation of the uncompressed data of each.
+// Only the first header is recorded in the Reader fields.
+//
+// Gzip files store a length and checksum of the uncompressed data.
+// The Reader will return a ErrChecksum when Read
+// reaches the end of the uncompressed data if it does not
+// have the expected length or checksum. Clients should treat data
+// returned by Read as tentative until they receive the io.EOF
+// marking the end of the data.
+type Reader struct {
+	Header       // valid after NewReader or Reader.Reset
+	r            flate.Reader
+	br           *bufio.Reader
+	decompressor io.ReadCloser
+	digest       uint32 // CRC-32, IEEE polynomial (section 8)
+	size         uint32 // Uncompressed size (section 2.3.1)
+	buf          [512]byte
+	err          error
+	multistream  bool
+}
+
+// NewReader creates a new Reader reading the given reader.
+// If r does not also implement io.ByteReader,
+// the decompressor may read more data than necessary from r.
+//
+// It is the caller's responsibility to call Close on the Reader when done.
+//
+// The Reader.Header fields will be valid in the Reader returned.
+func NewReader(r io.Reader) (*Reader, error) {
+	z := new(Reader)
+	if err := z.Reset(r); err != nil {
+		return nil, err
+	}
+	return z, nil
+}
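+
+// Minimal usage sketch (editor's addition; f and dst are hypothetical):
+//
+//	zr, err := NewReader(f)
+//	if err != nil {
+//		return err
+//	}
+//	defer zr.Close()
+//	_, err = io.Copy(dst, zr) // io.Copy uses the WriteTo fast path below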
+
+// Reset discards the Reader z's state and makes it equivalent to the
+// result of its original state from NewReader, but reading from r instead.
+// This permits reusing a Reader rather than allocating a new one.
+func (z *Reader) Reset(r io.Reader) error {
+	*z = Reader{
+		decompressor: z.decompressor,
+		multistream:  true,
+		br:           z.br,
+	}
+	if rr, ok := r.(flate.Reader); ok {
+		z.r = rr
+	} else {
+		// Reuse if we can.
+		if z.br != nil {
+			z.br.Reset(r)
+		} else {
+			z.br = bufio.NewReader(r)
+		}
+		z.r = z.br
+	}
+	z.Header, z.err = z.readHeader()
+	return z.err
+}
+
+// Multistream controls whether the reader supports multistream files.
+//
+// If enabled (the default), the Reader expects the input to be a sequence
+// of individually gzipped data streams, each with its own header and
+// trailer, ending at EOF. The effect is that the concatenation of a sequence
+// of gzipped files is treated as equivalent to the gzip of the concatenation
+// of the sequence. This is standard behavior for gzip readers.
+//
+// Calling Multistream(false) disables this behavior; disabling the behavior
+// can be useful when reading file formats that distinguish individual gzip
+// data streams or mix gzip data streams with other data streams.
+// In this mode, when the Reader reaches the end of the data stream,
+// Read returns io.EOF. If the underlying reader implements io.ByteReader,
+// it will be left positioned just after the gzip stream.
+// To start the next stream, call z.Reset(r) followed by z.Multistream(false).
+// If there is no next stream, z.Reset(r) will return io.EOF.
+func (z *Reader) Multistream(ok bool) {
+	z.multistream = ok
+}
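+
+// Editor's sketch of reading one member at a time from a concatenated file
+// (r should implement io.ByteReader, e.g. a *bufio.Reader; dst is hypothetical):
+//
+//	zr, err := NewReader(r)
+//	for err == nil {
+//		zr.Multistream(false)
+//		if _, err = io.Copy(dst, zr); err != nil {
+//			break
+//		}
+//		err = zr.Reset(r) // io.EOF once no further stream follows
+//	}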
+
+// readString reads a NUL-terminated string from z.r.
+// It treats the bytes read as being encoded as ISO 8859-1 (Latin-1) and
+// will output a string encoded using UTF-8.
+// This method always updates z.digest with the data read.
+func (z *Reader) readString() (string, error) {
+	var err error
+	needConv := false
+	for i := 0; ; i++ {
+		if i >= len(z.buf) {
+			return "", ErrHeader
+		}
+		z.buf[i], err = z.r.ReadByte()
+		if err != nil {
+			return "", err
+		}
+		if z.buf[i] > 0x7f {
+			needConv = true
+		}
+		if z.buf[i] == 0 {
+			// Digest covers the NUL terminator.
+			z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:i+1])
+
+			// Strings are ISO 8859-1, Latin-1 (RFC 1952, section 2.3.1).
+			if needConv {
+				s := make([]rune, 0, i)
+				for _, v := range z.buf[:i] {
+					s = append(s, rune(v))
+				}
+				return string(s), nil
+			}
+			return string(z.buf[:i]), nil
+		}
+	}
+}
+
+// readHeader reads the GZIP header according to section 2.3.1.
+// This method does not set z.err.
+func (z *Reader) readHeader() (hdr Header, err error) {
+	if _, err = io.ReadFull(z.r, z.buf[:10]); err != nil {
+		// RFC 1952, section 2.2, says the following:
+		//	A gzip file consists of a series of "members" (compressed data sets).
+		//
+		// Other than this, the specification does not clarify whether a
+		// "series" is defined as "one or more" or "zero or more". To err on the
+		// side of caution, Go interprets this to mean "zero or more".
+		// Thus, it is okay to return io.EOF here.
+		return hdr, err
+	}
+	if z.buf[0] != gzipID1 || z.buf[1] != gzipID2 || z.buf[2] != gzipDeflate {
+		return hdr, ErrHeader
+	}
+	flg := z.buf[3]
+	hdr.ModTime = time.Unix(int64(le.Uint32(z.buf[4:8])), 0)
+	// z.buf[8] is XFL and is currently ignored.
+	hdr.OS = z.buf[9]
+	z.digest = crc32.ChecksumIEEE(z.buf[:10])
+
+	if flg&flagExtra != 0 {
+		if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil {
+			return hdr, noEOF(err)
+		}
+		z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:2])
+		data := make([]byte, le.Uint16(z.buf[:2]))
+		if _, err = io.ReadFull(z.r, data); err != nil {
+			return hdr, noEOF(err)
+		}
+		z.digest = crc32.Update(z.digest, crc32.IEEETable, data)
+		hdr.Extra = data
+	}
+
+	var s string
+	if flg&flagName != 0 {
+		if s, err = z.readString(); err != nil {
+			return hdr, err
+		}
+		hdr.Name = s
+	}
+
+	if flg&flagComment != 0 {
+		if s, err = z.readString(); err != nil {
+			return hdr, err
+		}
+		hdr.Comment = s
+	}
+
+	if flg&flagHdrCrc != 0 {
+		if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil {
+			return hdr, noEOF(err)
+		}
+		digest := le.Uint16(z.buf[:2])
+		if digest != uint16(z.digest) {
+			return hdr, ErrHeader
+		}
+	}
+
+	// Reserved FLG bits must be zero.
+	if flg>>5 != 0 {
+		return hdr, ErrHeader
+	}
+
+	z.digest = 0
+	if z.decompressor == nil {
+		z.decompressor = flate.NewReader(z.r)
+	} else {
+		z.decompressor.(flate.Resetter).Reset(z.r, nil)
+	}
+	return hdr, nil
+}
+
+// Read implements io.Reader, reading uncompressed bytes from its underlying Reader.
+func (z *Reader) Read(p []byte) (n int, err error) {
+	if z.err != nil {
+		return 0, z.err
+	}
+
+	for n == 0 {
+		n, z.err = z.decompressor.Read(p)
+		z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n])
+		z.size += uint32(n)
+		if z.err != io.EOF {
+			// In the normal case we return here.
+			return n, z.err
+		}
+
+		// Finished file; check checksum and size.
+		if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil {
+			z.err = noEOF(err)
+			return n, z.err
+		}
+		digest := le.Uint32(z.buf[:4])
+		size := le.Uint32(z.buf[4:8])
+		if digest != z.digest || size != z.size {
+			z.err = ErrChecksum
+			return n, z.err
+		}
+		z.digest, z.size = 0, 0
+
+		// File is ok; check if there is another.
+		if !z.multistream {
+			return n, io.EOF
+		}
+		z.err = nil // Remove io.EOF
+
+		if _, z.err = z.readHeader(); z.err != nil {
+			return n, z.err
+		}
+	}
+
+	return n, nil
+}
+
+type crcer interface {
+	io.Writer
+	Sum32() uint32
+	Reset()
+}
+type crcUpdater struct {
+	z *Reader
+}
+
+func (c *crcUpdater) Write(p []byte) (int, error) {
+	c.z.digest = crc32.Update(c.z.digest, crc32.IEEETable, p)
+	return len(p), nil
+}
+
+func (c *crcUpdater) Sum32() uint32 {
+	return c.z.digest
+}
+
+func (c *crcUpdater) Reset() {
+	c.z.digest = 0
+}
+
+// WriteTo implements the io.WriterTo interface for io.Copy and friends.
+func (z *Reader) WriteTo(w io.Writer) (int64, error) {
+	total := int64(0)
+	crcWriter := crcer(crc32.NewIEEE())
+	if z.digest != 0 {
+		crcWriter = &crcUpdater{z: z}
+	}
+	for {
+		if z.err != nil {
+			if z.err == io.EOF {
+				return total, nil
+			}
+			return total, z.err
+		}
+
+		// We write both to output and digest.
+		mw := io.MultiWriter(w, crcWriter)
+		n, err := z.decompressor.(io.WriterTo).WriteTo(mw)
+		total += n
+		z.size += uint32(n)
+		if err != nil {
+			z.err = err
+			return total, z.err
+		}
+
+		// Finished file; check checksum + size.
+		if _, err := io.ReadFull(z.r, z.buf[0:8]); err != nil {
+			if err == io.EOF {
+				err = io.ErrUnexpectedEOF
+			}
+			z.err = err
+			return total, err
+		}
+		z.digest = crcWriter.Sum32()
+		digest := le.Uint32(z.buf[:4])
+		size := le.Uint32(z.buf[4:8])
+		if digest != z.digest || size != z.size {
+			z.err = ErrChecksum
+			return total, z.err
+		}
+		z.digest, z.size = 0, 0
+
+		// File is ok; check if there is another.
+		if !z.multistream {
+			return total, nil
+		}
+		crcWriter.Reset()
+		z.err = nil // Remove io.EOF
+
+		if _, z.err = z.readHeader(); z.err != nil {
+			if z.err == io.EOF {
+				return total, nil
+			}
+			return total, z.err
+		}
+	}
+}
+
+// Close closes the Reader. It does not close the underlying io.Reader.
+// In order for the GZIP checksum to be verified, the reader must be
+// fully consumed until io.EOF is returned.
+func (z *Reader) Close() error { return z.decompressor.Close() }
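The Read and WriteTo loops above chain multistream members, retrying readHeader after each verified CRC-32/size trailer. A minimal consumption sketch, assuming the package's stdlib-compatible NewReader and NewWriter entry points:

```go
package main

import (
	"bytes"
	"fmt"
	"io"
	"log"

	"github.com/klauspost/compress/gzip"
)

func main() {
	// Build a two-member (multistream) gzip payload.
	var buf bytes.Buffer
	for _, s := range []string{"hello ", "world"} {
		zw := gzip.NewWriter(&buf)
		if _, err := io.WriteString(zw, s); err != nil {
			log.Fatal(err)
		}
		if err := zw.Close(); err != nil {
			log.Fatal(err)
		}
	}

	zr, err := gzip.NewReader(&buf)
	if err != nil {
		log.Fatal(err)
	}
	defer zr.Close()

	// Reading through io.EOF decodes both members; each member's
	// CRC-32 and size trailer is verified as it is consumed.
	out, err := io.ReadAll(zr)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s\n", out) // hello world
}
```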
diff --git a/vendor/github.com/klauspost/compress/gzip/gzip.go b/vendor/github.com/klauspost/compress/gzip/gzip.go
new file mode 100644
index 0000000..5bc7205
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/gzip/gzip.go
@@ -0,0 +1,290 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gzip
+
+import (
+	"errors"
+	"fmt"
+	"hash/crc32"
+	"io"
+
+	"github.com/klauspost/compress/flate"
+)
+
+// These constants are copied from the flate package, so that code that imports
+// "compress/gzip" does not also have to import "compress/flate".
+const (
+	NoCompression       = flate.NoCompression
+	BestSpeed           = flate.BestSpeed
+	BestCompression     = flate.BestCompression
+	DefaultCompression  = flate.DefaultCompression
+	ConstantCompression = flate.ConstantCompression
+	HuffmanOnly         = flate.HuffmanOnly
+
+	// StatelessCompression will do compression but without maintaining any state
+	// between Write calls.
+	// There will be no memory kept between Write calls,
+	// but compression and speed will be suboptimal.
+	// Because of this, the size of actual Write calls will affect output size.
+	StatelessCompression = -3
+)
+
+// A Writer is an io.WriteCloser.
+// Writes to a Writer are compressed and written to w.
+type Writer struct {
+	Header      // written at first call to Write, Flush, or Close
+	w           io.Writer
+	level       int
+	err         error
+	compressor  *flate.Writer
+	digest      uint32 // CRC-32, IEEE polynomial (section 8)
+	size        uint32 // Uncompressed size (section 2.3.1)
+	wroteHeader bool
+	closed      bool
+	buf         [10]byte
+}
+
+// NewWriter returns a new Writer.
+// Writes to the returned writer are compressed and written to w.
+//
+// It is the caller's responsibility to call Close on the WriteCloser when done.
+// Writes may be buffered and not flushed until Close.
+//
+// Callers that wish to set the fields in Writer.Header must do so before
+// the first call to Write, Flush, or Close.
+func NewWriter(w io.Writer) *Writer {
+	z, _ := NewWriterLevel(w, DefaultCompression)
+	return z
+}
+
+// NewWriterLevel is like NewWriter but specifies the compression level instead
+// of assuming DefaultCompression.
+//
+// The compression level can be DefaultCompression, NoCompression, or any
+// integer value between BestSpeed and BestCompression inclusive. The error
+// returned will be nil if the level is valid.
+func NewWriterLevel(w io.Writer, level int) (*Writer, error) {
+	if level < StatelessCompression || level > BestCompression {
+		return nil, fmt.Errorf("gzip: invalid compression level: %d", level)
+	}
+	z := new(Writer)
+	z.init(w, level)
+	return z, nil
+}
+
+// MinCustomWindowSize is the minimum window size that can be sent to NewWriterWindow.
+const MinCustomWindowSize = flate.MinCustomWindowSize
+
+// MaxCustomWindowSize is the maximum custom window that can be sent to NewWriterWindow.
+const MaxCustomWindowSize = flate.MaxCustomWindowSize
+
+// NewWriterWindow returns a new Writer compressing data with a custom window size.
+// windowSize must be from MinCustomWindowSize to MaxCustomWindowSize.
+func NewWriterWindow(w io.Writer, windowSize int) (*Writer, error) {
+	if windowSize < MinCustomWindowSize {
+		return nil, errors.New("gzip: requested window size less than MinWindowSize")
+	}
+	if windowSize > MaxCustomWindowSize {
+		return nil, errors.New("gzip: requested window size bigger than MaxCustomWindowSize")
+	}
+
+	z := new(Writer)
+	z.init(w, -windowSize)
+	return z, nil
+}
+
+func (z *Writer) init(w io.Writer, level int) {
+	compressor := z.compressor
+	if level != StatelessCompression {
+		if compressor != nil {
+			compressor.Reset(w)
+		}
+	}
+
+	*z = Writer{
+		Header: Header{
+			OS: 255, // unknown
+		},
+		w:          w,
+		level:      level,
+		compressor: compressor,
+	}
+}
+
+// Reset discards the Writer z's state and makes it equivalent to the
+// result of its original state from NewWriter or NewWriterLevel, but
+// writing to w instead. This permits reusing a Writer rather than
+// allocating a new one.
+func (z *Writer) Reset(w io.Writer) {
+	z.init(w, z.level)
+}
+
+// writeBytes writes a length-prefixed byte slice to z.w.
+func (z *Writer) writeBytes(b []byte) error {
+	if len(b) > 0xffff {
+		return errors.New("gzip.Write: Extra data is too large")
+	}
+	le.PutUint16(z.buf[:2], uint16(len(b)))
+	_, err := z.w.Write(z.buf[:2])
+	if err != nil {
+		return err
+	}
+	_, err = z.w.Write(b)
+	return err
+}
+
+// writeString writes a UTF-8 string s in GZIP's format to z.w.
+// GZIP (RFC 1952) specifies that strings are NUL-terminated ISO 8859-1 (Latin-1).
+func (z *Writer) writeString(s string) (err error) {
+	// GZIP stores Latin-1 strings; error if non-Latin-1; convert if non-ASCII.
+	needconv := false
+	for _, v := range s {
+		if v == 0 || v > 0xff {
+			return errors.New("gzip.Write: non-Latin-1 header string")
+		}
+		if v > 0x7f {
+			needconv = true
+		}
+	}
+	if needconv {
+		b := make([]byte, 0, len(s))
+		for _, v := range s {
+			b = append(b, byte(v))
+		}
+		_, err = z.w.Write(b)
+	} else {
+		_, err = io.WriteString(z.w, s)
+	}
+	if err != nil {
+		return err
+	}
+	// GZIP strings are NUL-terminated.
+	z.buf[0] = 0
+	_, err = z.w.Write(z.buf[:1])
+	return err
+}
+
+// Write writes a compressed form of p to the underlying io.Writer. The
+// compressed bytes are not necessarily flushed until the Writer is closed.
+func (z *Writer) Write(p []byte) (int, error) {
+	if z.err != nil {
+		return 0, z.err
+	}
+	var n int
+	// Write the GZIP header lazily.
+	if !z.wroteHeader {
+		z.wroteHeader = true
+		z.buf[0] = gzipID1
+		z.buf[1] = gzipID2
+		z.buf[2] = gzipDeflate
+		z.buf[3] = 0
+		if z.Extra != nil {
+			z.buf[3] |= 0x04
+		}
+		if z.Name != "" {
+			z.buf[3] |= 0x08
+		}
+		if z.Comment != "" {
+			z.buf[3] |= 0x10
+		}
+		le.PutUint32(z.buf[4:8], uint32(z.ModTime.Unix()))
+		if z.level == BestCompression {
+			z.buf[8] = 2
+		} else if z.level == BestSpeed {
+			z.buf[8] = 4
+		} else {
+			z.buf[8] = 0
+		}
+		z.buf[9] = z.OS
+		n, z.err = z.w.Write(z.buf[:10])
+		if z.err != nil {
+			return n, z.err
+		}
+		if z.Extra != nil {
+			z.err = z.writeBytes(z.Extra)
+			if z.err != nil {
+				return n, z.err
+			}
+		}
+		if z.Name != "" {
+			z.err = z.writeString(z.Name)
+			if z.err != nil {
+				return n, z.err
+			}
+		}
+		if z.Comment != "" {
+			z.err = z.writeString(z.Comment)
+			if z.err != nil {
+				return n, z.err
+			}
+		}
+
+		if z.compressor == nil && z.level != StatelessCompression {
+			z.compressor, _ = flate.NewWriter(z.w, z.level)
+		}
+	}
+	z.size += uint32(len(p))
+	z.digest = crc32.Update(z.digest, crc32.IEEETable, p)
+	if z.level == StatelessCompression {
+		return len(p), flate.StatelessDeflate(z.w, p, false, nil)
+	}
+	n, z.err = z.compressor.Write(p)
+	return n, z.err
+}
+
+// Flush flushes any pending compressed data to the underlying writer.
+//
+// It is useful mainly in compressed network protocols, to ensure that
+// a remote reader has enough data to reconstruct a packet. Flush does
+// not return until the data has been written. If the underlying
+// writer returns an error, Flush returns that error.
+//
+// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH.
+func (z *Writer) Flush() error {
+	if z.err != nil {
+		return z.err
+	}
+	if z.closed || z.level == StatelessCompression {
+		return nil
+	}
+	if !z.wroteHeader {
+		z.Write(nil)
+		if z.err != nil {
+			return z.err
+		}
+	}
+	z.err = z.compressor.Flush()
+	return z.err
+}
+
+// Close closes the Writer, flushing any unwritten data to the underlying
+// io.Writer, but does not close the underlying io.Writer.
+func (z *Writer) Close() error {
+	if z.err != nil {
+		return z.err
+	}
+	if z.closed {
+		return nil
+	}
+	z.closed = true
+	if !z.wroteHeader {
+		z.Write(nil)
+		if z.err != nil {
+			return z.err
+		}
+	}
+	if z.level == StatelessCompression {
+		z.err = flate.StatelessDeflate(z.w, nil, true, nil)
+	} else {
+		z.err = z.compressor.Close()
+	}
+	if z.err != nil {
+		return z.err
+	}
+	le.PutUint32(z.buf[:4], z.digest)
+	le.PutUint32(z.buf[4:8], z.size)
+	_, z.err = z.w.Write(z.buf[:8])
+	return z.err
+}
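Because the header is emitted lazily on the first Write, Flush, or Close, header fields must be populated before that point, and only Close appends the CRC-32/size trailer. A short usage sketch under those assumptions:

```go
package main

import (
	"bytes"
	"fmt"
	"log"
	"time"

	"github.com/klauspost/compress/gzip"
)

func main() {
	var buf bytes.Buffer
	zw, err := gzip.NewWriterLevel(&buf, gzip.BestSpeed)
	if err != nil {
		log.Fatal(err)
	}

	// Set header fields before the first Write; afterwards they are ignored.
	zw.Name = "greeting.txt"
	zw.Comment = "example payload"
	zw.ModTime = time.Unix(1700000000, 0)

	if _, err := zw.Write([]byte("hello world")); err != nil {
		log.Fatal(err)
	}
	// Close flushes the compressor and appends the CRC-32 and size trailer.
	if err := zw.Close(); err != nil {
		log.Fatal(err)
	}
	fmt.Println(buf.Len(), "compressed bytes")
}
```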
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index 504a7be..bfc7a52 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -6,10 +6,11 @@
 package huff0
 
 import (
-	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 // bitReader reads a bitstream in reverse.
@@ -46,7 +47,7 @@
 	return nil
 }
 
-// peekBitsFast requires that at least one bit is requested every time.
+// peekByteFast requires that at least one byte is requested every time.
 // There are no checks if the buffer is filled.
 func (b *bitReaderBytes) peekByteFast() uint8 {
 	got := uint8(b.value >> 56)
@@ -66,9 +67,7 @@
 	}
 
 	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
-	v = v[:4]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	low := le.Load32(b.in, b.off-4)
 	b.value |= uint64(low) << (b.bitsRead - 32)
 	b.bitsRead -= 32
 	b.off -= 4
@@ -77,7 +76,7 @@
 // fillFastStart() assumes the bitReaderBytes is empty and there is at least 8 bytes to read.
 func (b *bitReaderBytes) fillFastStart() {
 	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.value = le.Load64(b.in, b.off-8)
 	b.bitsRead = 0
 	b.off -= 8
 }
@@ -87,10 +86,8 @@
 	if b.bitsRead < 32 {
 		return
 	}
-	if b.off > 4 {
-		v := b.in[b.off-4:]
-		v = v[:4]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	if b.off >= 4 {
+		low := le.Load32(b.in, b.off-4)
 		b.value |= uint64(low) << (b.bitsRead - 32)
 		b.bitsRead -= 32
 		b.off -= 4
@@ -177,10 +174,7 @@
 		return
 	}
 
-	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
-	v = v[:4]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	low := le.Load32(b.in, b.off-4)
 	b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
 	b.bitsRead -= 32
 	b.off -= 4
@@ -188,8 +182,7 @@
 
 // fillFastStart() assumes the bitReaderShifted is empty and there is at least 8 bytes to read.
 func (b *bitReaderShifted) fillFastStart() {
-	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.value = le.Load64(b.in, b.off-8)
 	b.bitsRead = 0
 	b.off -= 8
 }
@@ -200,9 +193,7 @@
 		return
 	}
 	if b.off > 4 {
-		v := b.in[b.off-4:]
-		v = v[:4]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+		low := le.Load32(b.in, b.off-4)
 		b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
 		b.bitsRead -= 32
 		b.off -= 4
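The hunks above swap the hand-assembled little-endian loads for the new internal le helpers. A small sketch of the equivalence being relied on; loadLE32 below reconstructs the removed pattern for illustration:

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// loadLE32 is the hand-written pattern the diff removes: two bounds
// checks plus four byte loads assembled into a little-endian uint32.
func loadLE32(b []byte, off int) uint32 {
	v := b[off-4 : off]
	v = v[:4]
	return uint32(v[0]) | uint32(v[1])<<8 | uint32(v[2])<<16 | uint32(v[3])<<24
}

func main() {
	b := []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}
	// le.Load32(b, off-4) is specified to return the same value; on the
	// 64-bit little-endian builds it compiles to a single unchecked load.
	old := loadLE32(b, 8)
	portable := binary.LittleEndian.Uint32(b[4:]) // the fallback implementation
	fmt.Printf("%#x %#x equal=%v\n", old, portable, old == portable)
}
```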
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
index ec71f7a..0ebc9aa 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
@@ -13,14 +13,6 @@
 	out          []byte
 }
 
-// bitMask16 is bitmasks. Has extra to avoid bounds check.
-var bitMask16 = [32]uint16{
-	0, 1, 3, 7, 0xF, 0x1F,
-	0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
-	0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0xFFFF,
-	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
-	0xFFFF, 0xFFFF} /* up to 16 bits */
-
 // addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
 // It will not check if there is space for them, so the caller must ensure that it has flushed recently.
 func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@@ -60,6 +52,22 @@
 	b.nBits += encA.nBits + encB.nBits
 }
 
+// encFourSymbols adds up to 32 bits from four symbols.
+// It will not check if there is space for them,
+// so the caller must ensure that b has been flushed recently.
+func (b *bitWriter) encFourSymbols(encA, encB, encC, encD cTableEntry) {
+	bitsA := encA.nBits
+	bitsB := bitsA + encB.nBits
+	bitsC := bitsB + encC.nBits
+	bitsD := bitsC + encD.nBits
+	combined := uint64(encA.val) |
+		(uint64(encB.val) << (bitsA & 63)) |
+		(uint64(encC.val) << (bitsB & 63)) |
+		(uint64(encD.val) << (bitsC & 63))
+	b.bitContainer |= combined << (b.nBits & 63)
+	b.nBits += bitsD
+}
+
 // flush32 will flush out, so there are at least 32 bits available for writing.
 func (b *bitWriter) flush32() {
 	if b.nBits < 32 {
@@ -86,10 +94,9 @@
 
 // close will write the alignment bit and write the final byte(s)
 // to the output.
-func (b *bitWriter) close() error {
+func (b *bitWriter) close() {
 	// End mark
 	b.addBits16Clean(1, 1)
 	// flush until next byte.
 	b.flushAlign()
-	return nil
 }
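encFourSymbols emits four Huffman codes with one shift/OR chain instead of two encTwoSymbols calls. A standalone sketch of the packing arithmetic, with entry standing in for huff0's cTableEntry:

```go
package main

import "fmt"

// entry mimics huff0's cTableEntry: a code value and its bit length.
type entry struct {
	val   uint16
	nBits uint8
}

// packFour mirrors encFourSymbols: each code is shifted left by the
// total bit length of the codes packed before it, then OR-ed together.
func packFour(a, b, c, d entry) (combined uint64, bits uint8) {
	bitsA := a.nBits
	bitsB := bitsA + b.nBits
	bitsC := bitsB + c.nBits
	combined = uint64(a.val) |
		uint64(b.val)<<(bitsA&63) |
		uint64(c.val)<<(bitsB&63) |
		uint64(d.val)<<(bitsC&63)
	return combined, bitsC + d.nBits
}

func main() {
	// Four 3-bit codes occupy 12 bits of the 64-bit container.
	v, n := packFour(entry{0b101, 3}, entry{0b011, 3}, entry{0b110, 3}, entry{0b001, 3})
	fmt.Printf("%012b (%d bits)\n", v, n) // 001110011101 (12 bits)
}
```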
diff --git a/vendor/github.com/klauspost/compress/huff0/bytereader.go b/vendor/github.com/klauspost/compress/huff0/bytereader.go
deleted file mode 100644
index 4dcab8d..0000000
--- a/vendor/github.com/klauspost/compress/huff0/bytereader.go
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-package huff0
-
-// byteReader provides a byte reader that reads
-// little endian values from a byte stream.
-// The input stream is manually advanced.
-// The reader performs no bounds checks.
-type byteReader struct {
-	b   []byte
-	off int
-}
-
-// init will initialize the reader and set the input.
-func (b *byteReader) init(in []byte) {
-	b.b = in
-	b.off = 0
-}
-
-// Int32 returns a little endian int32 starting at current offset.
-func (b byteReader) Int32() int32 {
-	v3 := int32(b.b[b.off+3])
-	v2 := int32(b.b[b.off+2])
-	v1 := int32(b.b[b.off+1])
-	v0 := int32(b.b[b.off])
-	return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
-}
-
-// Uint32 returns a little endian uint32 starting at current offset.
-func (b byteReader) Uint32() uint32 {
-	v3 := uint32(b.b[b.off+3])
-	v2 := uint32(b.b[b.off+2])
-	v1 := uint32(b.b[b.off+1])
-	v0 := uint32(b.b[b.off])
-	return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
-}
-
-// remain will return the number of bytes remaining.
-func (b byteReader) remain() int {
-	return len(b.b) - b.off
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index 4d14542..84aa3d1 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -227,10 +227,10 @@
 }
 
 func (s *Scratch) compress1X(src []byte) ([]byte, error) {
-	return s.compress1xDo(s.Out, src)
+	return s.compress1xDo(s.Out, src), nil
 }
 
-func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) {
+func (s *Scratch) compress1xDo(dst, src []byte) []byte {
 	var bw = bitWriter{out: dst}
 
 	// N is length divisible by 4.
@@ -248,8 +248,7 @@
 			tmp := src[n : n+4]
 			// tmp should be len 4
 			bw.flush32()
-			bw.encTwoSymbols(cTable, tmp[3], tmp[2])
-			bw.encTwoSymbols(cTable, tmp[1], tmp[0])
+			bw.encFourSymbols(cTable[tmp[3]], cTable[tmp[2]], cTable[tmp[1]], cTable[tmp[0]])
 		}
 	} else {
 		for ; n >= 0; n -= 4 {
@@ -261,8 +260,8 @@
 			bw.encTwoSymbols(cTable, tmp[1], tmp[0])
 		}
 	}
-	err := bw.close()
-	return bw.out, err
+	bw.close()
+	return bw.out
 }
 
 var sixZeros [6]byte
@@ -284,12 +283,8 @@
 		}
 		src = src[len(toDo):]
 
-		var err error
 		idx := len(s.Out)
-		s.Out, err = s.compress1xDo(s.Out, toDo)
-		if err != nil {
-			return nil, err
-		}
+		s.Out = s.compress1xDo(s.Out, toDo)
 		if len(s.Out)-idx > math.MaxUint16 {
 			// We cannot store the size in the jump table
 			return nil, ErrIncompressible
@@ -316,7 +311,6 @@
 
 	segmentSize := (len(src) + 3) / 4
 	var wg sync.WaitGroup
-	var errs [4]error
 	wg.Add(4)
 	for i := 0; i < 4; i++ {
 		toDo := src
@@ -327,15 +321,12 @@
 
 		// Separate goroutine for each block.
 		go func(i int) {
-			s.tmpOut[i], errs[i] = s.compress1xDo(s.tmpOut[i][:0], toDo)
+			s.tmpOut[i] = s.compress1xDo(s.tmpOut[i][:0], toDo)
 			wg.Done()
 		}(i)
 	}
 	wg.Wait()
 	for i := 0; i < 4; i++ {
-		if errs[i] != nil {
-			return nil, errs[i]
-		}
 		o := s.tmpOut[i]
 		if len(o) > math.MaxUint16 {
 			// We cannot store the size in the jump table
@@ -359,35 +350,36 @@
 // Does not update s.clearCount.
 func (s *Scratch) countSimple(in []byte) (max int, reuse bool) {
 	reuse = true
+	_ = s.count // Assert that s != nil to speed up the following loop.
 	for _, v := range in {
 		s.count[v]++
 	}
 	m := uint32(0)
 	if len(s.prevTable) > 0 {
 		for i, v := range s.count[:] {
+			if v == 0 {
+				continue
+			}
 			if v > m {
 				m = v
 			}
-			if v > 0 {
-				s.symbolLen = uint16(i) + 1
-				if i >= len(s.prevTable) {
-					reuse = false
-				} else {
-					if s.prevTable[i].nBits == 0 {
-						reuse = false
-					}
-				}
+			s.symbolLen = uint16(i) + 1
+			if i >= len(s.prevTable) {
+				reuse = false
+			} else if s.prevTable[i].nBits == 0 {
+				reuse = false
 			}
 		}
 		return int(m), reuse
 	}
 	for i, v := range s.count[:] {
+		if v == 0 {
+			continue
+		}
 		if v > m {
 			m = v
 		}
-		if v > 0 {
-			s.symbolLen = uint16(i) + 1
-		}
+		s.symbolLen = uint16(i) + 1
 	}
 	return int(m), false
 }
@@ -424,7 +416,7 @@
 
 // minTableLog provides the minimum logSize to safely represent a distribution.
 func (s *Scratch) minTableLog() uint8 {
-	minBitsSrc := highBit32(uint32(s.br.remain())) + 1
+	minBitsSrc := highBit32(uint32(s.srcLen)) + 1
 	minBitsSymbols := highBit32(uint32(s.symbolLen-1)) + 2
 	if minBitsSrc < minBitsSymbols {
 		return uint8(minBitsSrc)
@@ -436,7 +428,7 @@
 func (s *Scratch) optimalTableLog() {
 	tableLog := s.TableLog
 	minBits := s.minTableLog()
-	maxBitsSrc := uint8(highBit32(uint32(s.br.remain()-1))) - 1
+	maxBitsSrc := uint8(highBit32(uint32(s.srcLen-1))) - 1
 	if maxBitsSrc < tableLog {
 		// Accuracy can be reduced
 		tableLog = maxBitsSrc
@@ -484,34 +476,35 @@
 	// Different from reference implementation.
 	huffNode0 := s.nodes[0 : huffNodesLen+1]
 
-	for huffNode[nonNullRank].count == 0 {
+	for huffNode[nonNullRank].count() == 0 {
 		nonNullRank--
 	}
 
 	lowS := int16(nonNullRank)
 	nodeRoot := nodeNb + lowS - 1
 	lowN := nodeNb
-	huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count
-	huffNode[lowS].parent, huffNode[lowS-1].parent = uint16(nodeNb), uint16(nodeNb)
+	huffNode[nodeNb].setCount(huffNode[lowS].count() + huffNode[lowS-1].count())
+	huffNode[lowS].setParent(nodeNb)
+	huffNode[lowS-1].setParent(nodeNb)
 	nodeNb++
 	lowS -= 2
 	for n := nodeNb; n <= nodeRoot; n++ {
-		huffNode[n].count = 1 << 30
+		huffNode[n].setCount(1 << 30)
 	}
 	// fake entry, strong barrier
-	huffNode0[0].count = 1 << 31
+	huffNode0[0].setCount(1 << 31)
 
 	// create parents
 	for nodeNb <= nodeRoot {
 		var n1, n2 int16
-		if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
+		if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
 			n1 = lowS
 			lowS--
 		} else {
 			n1 = lowN
 			lowN++
 		}
-		if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
+		if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
 			n2 = lowS
 			lowS--
 		} else {
@@ -519,18 +512,19 @@
 			lowN++
 		}
 
-		huffNode[nodeNb].count = huffNode0[n1+1].count + huffNode0[n2+1].count
-		huffNode0[n1+1].parent, huffNode0[n2+1].parent = uint16(nodeNb), uint16(nodeNb)
+		huffNode[nodeNb].setCount(huffNode0[n1+1].count() + huffNode0[n2+1].count())
+		huffNode0[n1+1].setParent(nodeNb)
+		huffNode0[n2+1].setParent(nodeNb)
 		nodeNb++
 	}
 
 	// distribute weights (unlimited tree height)
-	huffNode[nodeRoot].nbBits = 0
+	huffNode[nodeRoot].setNbBits(0)
 	for n := nodeRoot - 1; n >= startNode; n-- {
-		huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
+		huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
 	}
 	for n := uint16(0); n <= nonNullRank; n++ {
-		huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
+		huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
 	}
 	s.actualTableLog = s.setMaxHeight(int(nonNullRank))
 	maxNbBits := s.actualTableLog
@@ -542,7 +536,7 @@
 	var nbPerRank [tableLogMax + 1]uint16
 	var valPerRank [16]uint16
 	for _, v := range huffNode[:nonNullRank+1] {
-		nbPerRank[v.nbBits]++
+		nbPerRank[v.nbBits()]++
 	}
 	// determine starting value per rank
 	{
@@ -557,7 +551,7 @@
 
 	// push nbBits per symbol, symbol order
 	for _, v := range huffNode[:nonNullRank+1] {
-		s.cTable[v.symbol].nBits = v.nbBits
+		s.cTable[v.symbol()].nBits = v.nbBits()
 	}
 
 	// assign value within rank, symbol order
@@ -603,12 +597,12 @@
 		pos := rank[r].current
 		rank[r].current++
 		prev := nodes[(pos-1)&huffNodesMask]
-		for pos > rank[r].base && c > prev.count {
+		for pos > rank[r].base && c > prev.count() {
 			nodes[pos&huffNodesMask] = prev
 			pos--
 			prev = nodes[(pos-1)&huffNodesMask]
 		}
-		nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)}
+		nodes[pos&huffNodesMask] = makeNodeElt(c, byte(n))
 	}
 }
 
@@ -617,7 +611,7 @@
 	huffNode := s.nodes[1 : huffNodesLen+1]
 	//huffNode = huffNode[: huffNodesLen]
 
-	largestBits := huffNode[lastNonNull].nbBits
+	largestBits := huffNode[lastNonNull].nbBits()
 
 	// early exit : no elt > maxNbBits
 	if largestBits <= maxNbBits {
@@ -627,14 +621,14 @@
 	baseCost := int(1) << (largestBits - maxNbBits)
 	n := uint32(lastNonNull)
 
-	for huffNode[n].nbBits > maxNbBits {
-		totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits))
-		huffNode[n].nbBits = maxNbBits
+	for huffNode[n].nbBits() > maxNbBits {
+		totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits()))
+		huffNode[n].setNbBits(maxNbBits)
 		n--
 	}
 	// n stops at huffNode[n].nbBits <= maxNbBits
 
-	for huffNode[n].nbBits == maxNbBits {
+	for huffNode[n].nbBits() == maxNbBits {
 		n--
 	}
 	// n ends at index of smallest symbol using < maxNbBits
@@ -655,10 +649,10 @@
 		{
 			currentNbBits := maxNbBits
 			for pos := int(n); pos >= 0; pos-- {
-				if huffNode[pos].nbBits >= currentNbBits {
+				if huffNode[pos].nbBits() >= currentNbBits {
 					continue
 				}
-				currentNbBits = huffNode[pos].nbBits // < maxNbBits
+				currentNbBits = huffNode[pos].nbBits() // < maxNbBits
 				rankLast[maxNbBits-currentNbBits] = uint32(pos)
 			}
 		}
@@ -675,8 +669,8 @@
 				if lowPos == noSymbol {
 					break
 				}
-				highTotal := huffNode[highPos].count
-				lowTotal := 2 * huffNode[lowPos].count
+				highTotal := huffNode[highPos].count()
+				lowTotal := 2 * huffNode[lowPos].count()
 				if highTotal <= lowTotal {
 					break
 				}
@@ -692,13 +686,14 @@
 				// this rank is no longer empty
 				rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]
 			}
-			huffNode[rankLast[nBitsToDecrease]].nbBits++
+			huffNode[rankLast[nBitsToDecrease]].setNbBits(1 +
+				huffNode[rankLast[nBitsToDecrease]].nbBits())
 			if rankLast[nBitsToDecrease] == 0 {
 				/* special case, reached largest symbol */
 				rankLast[nBitsToDecrease] = noSymbol
 			} else {
 				rankLast[nBitsToDecrease]--
-				if huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease {
+				if huffNode[rankLast[nBitsToDecrease]].nbBits() != maxNbBits-nBitsToDecrease {
 					rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */
 				}
 			}
@@ -706,15 +701,15 @@
 
 		for totalCost < 0 { /* Sometimes, cost correction overshoot */
 			if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
-				for huffNode[n].nbBits == maxNbBits {
+				for huffNode[n].nbBits() == maxNbBits {
 					n--
 				}
-				huffNode[n+1].nbBits--
+				huffNode[n+1].setNbBits(huffNode[n+1].nbBits() - 1)
 				rankLast[1] = n + 1
 				totalCost++
 				continue
 			}
-			huffNode[rankLast[1]+1].nbBits--
+			huffNode[rankLast[1]+1].setNbBits(huffNode[rankLast[1]+1].nbBits() - 1)
 			rankLast[1]++
 			totalCost++
 		}
@@ -722,9 +717,26 @@
 	return maxNbBits
 }
 
-type nodeElt struct {
-	count  uint32
-	parent uint16
-	symbol byte
-	nbBits uint8
+// A nodeElt is the fields
+//
+//	count  uint32
+//	parent uint16
+//	symbol byte
+//	nbBits uint8
+//
+// in some order, all squashed into an integer so that the compiler
+// always loads and stores entire nodeElts instead of separate fields.
+type nodeElt uint64
+
+func makeNodeElt(count uint32, symbol byte) nodeElt {
+	return nodeElt(count) | nodeElt(symbol)<<48
 }
+
+func (e *nodeElt) count() uint32  { return uint32(*e) }
+func (e *nodeElt) parent() uint16 { return uint16(*e >> 32) }
+func (e *nodeElt) symbol() byte   { return byte(*e >> 48) }
+func (e *nodeElt) nbBits() uint8  { return uint8(*e >> 56) }
+
+func (e *nodeElt) setCount(c uint32) { *e = (*e)&0xffffffff00000000 | nodeElt(c) }
+func (e *nodeElt) setParent(p int16) { *e = (*e)&0xffff0000ffffffff | nodeElt(uint16(p))<<32 }
+func (e *nodeElt) setNbBits(n uint8) { *e = (*e)&0x00ffffffffffffff | nodeElt(n)<<56 }
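The accessors above preserve the old field semantics while letting the compiler load and store whole nodes as single 64-bit words. A roundtrip sketch using a stand-in type with the same bit layout:

```go
package main

import "fmt"

// node mirrors the packed nodeElt layout: bits 0-31 count,
// 32-47 parent, 48-55 symbol, 56-63 nbBits.
type node uint64

func makeNode(count uint32, symbol byte) node { return node(count) | node(symbol)<<48 }

func (e *node) count() uint32  { return uint32(*e) }
func (e *node) parent() uint16 { return uint16(*e >> 32) }
func (e *node) symbol() byte   { return byte(*e >> 48) }
func (e *node) nbBits() uint8  { return uint8(*e >> 56) }

func (e *node) setParent(p int16) { *e = *e&0xffff0000ffffffff | node(uint16(p))<<32 }
func (e *node) setNbBits(n uint8) { *e = *e&0x00ffffffffffffff | node(n)<<56 }

func main() {
	e := makeNode(1234, 'A')
	e.setParent(7)
	e.setNbBits(11)
	// All four fields round-trip through one 64-bit word.
	fmt.Printf("count=%d parent=%d symbol=%c nbBits=%d\n",
		e.count(), e.parent(), e.symbol(), e.nbBits())
}
```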
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index c0c48bd..0f56b02 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -61,7 +61,7 @@
 		b, err := fse.Decompress(in[:iSize], s.fse)
 		s.fse.Out = nil
 		if err != nil {
-			return s, nil, err
+			return s, nil, fmt.Errorf("fse decompress returned: %w", err)
 		}
 		if len(b) > 255 {
 			return s, nil, errors.New("corrupt input: output table too large")
@@ -253,7 +253,7 @@
 
 	switch d.actualTableLog {
 	case 8:
-		const shift = 8 - 8
+		const shift = 0
 		for br.off >= 4 {
 			br.fillFast()
 			v := dt[uint8(br.value>>(56+shift))]
@@ -763,17 +763,20 @@
 				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(out, buf[0][:])
-			copy(out[dstEvery:], buf[1][:])
-			copy(out[dstEvery*2:], buf[2][:])
-			copy(out[dstEvery*3:], buf[3][:])
-			out = out[bufoff:]
-			decoded += bufoff * 4
 			// There must be at least 3 buffers left.
-			if len(out) < dstEvery*3 {
+			if len(out)-bufoff < dstEvery*3 {
 				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
+			//copy(out, buf[0][:])
+			//copy(out[dstEvery:], buf[1][:])
+			//copy(out[dstEvery*2:], buf[2][:])
+			*(*[bufoff]byte)(out) = buf[0]
+			*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
+			*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
+			*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
+			out = out[bufoff:]
+			decoded += bufoff * 4
 		}
 	}
 	if off > 0 {
@@ -997,17 +1000,22 @@
 				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(out, buf[0][:])
-			copy(out[dstEvery:], buf[1][:])
-			copy(out[dstEvery*2:], buf[2][:])
-			copy(out[dstEvery*3:], buf[3][:])
-			out = out[bufoff:]
-			decoded += bufoff * 4
 			// There must be at least 3 buffers left.
-			if len(out) < dstEvery*3 {
+			if len(out)-bufoff < dstEvery*3 {
 				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
+
+			//copy(out, buf[0][:])
+			//copy(out[dstEvery:], buf[1][:])
+			//copy(out[dstEvery*2:], buf[2][:])
+			// copy(out[dstEvery*3:], buf[3][:])
+			*(*[bufoff]byte)(out) = buf[0]
+			*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
+			*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
+			*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
+			out = out[bufoff:]
+			decoded += bufoff * 4
 		}
 	}
 	if off > 0 {
@@ -1128,7 +1136,7 @@
 			errs++
 		}
 		if errs > 0 {
-			fmt.Fprintf(w, "%d errros in base, stopping\n", errs)
+			fmt.Fprintf(w, "%d errors in base, stopping\n", errs)
 			continue
 		}
 		// Ensure that all combinations are covered.
@@ -1144,7 +1152,7 @@
 				errs++
 			}
 			if errs > 20 {
-				fmt.Fprintf(w, "%d errros, stopping\n", errs)
+				fmt.Fprintf(w, "%d errors, stopping\n", errs)
 				break
 			}
 		}
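The copies above now hoist the overrun check before the writes and use Go's slice-to-array-pointer conversion, trading four bounds-checked copy calls for single length checks. A toy illustration of the idiom, with bufoff shrunk for readability (huff0 uses a larger fixed buffer):

```go
package main

import "fmt"

const bufoff = 4 // stand-in for huff0's per-stream buffer size

func main() {
	out := make([]byte, 16)
	buf := [bufoff]byte{1, 2, 3, 4}

	// Equivalent to copy(out, buf[:]): the slice-to-array-pointer
	// conversion (Go 1.17+) does one length check, then a fixed-size
	// copy. It panics if len(out) < bufoff, which the hoisted
	// stream-overrun check rules out first.
	*(*[bufoff]byte)(out) = buf

	fmt.Println(out[:8]) // [1 2 3 4 0 0 0 0]
}
```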
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
index 9f3e9f7..ba7e8e6 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -14,12 +14,14 @@
 
 // decompress4x_main_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog > 8.
+//
 //go:noescape
 func decompress4x_main_loop_amd64(ctx *decompress4xContext)
 
 // decompress4x_8b_loop_x86 is an x86 assembler implementation
 // of Decompress4X when tablelog <= 8 which decodes 4 entries
 // per loop.
+//
 //go:noescape
 func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
 
@@ -145,11 +147,13 @@
 
 // decompress4x_main_loop_x86 is an x86 assembler implementation
 // of Decompress1X when tablelog > 8.
+//
 //go:noescape
 func decompress1x_main_loop_amd64(ctx *decompress1xContext)
 
 // decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
 // of Decompress1X when tablelog > 8.
+//
 //go:noescape
 func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
 
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
index dd1a5ae..c4c7ab2 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -1,364 +1,352 @@
 // Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
 
 //go:build amd64 && !appengine && !noasm && gc
-// +build amd64,!appengine,!noasm,gc
 
 // func decompress4x_main_loop_amd64(ctx *decompress4xContext)
 TEXT ·decompress4x_main_loop_amd64(SB), $0-8
-	XORQ DX, DX
-
 	// Preload values
 	MOVQ    ctx+0(FP), AX
 	MOVBQZX 8(AX), DI
-	MOVQ    16(AX), SI
-	MOVQ    48(AX), BX
-	MOVQ    24(AX), R9
-	MOVQ    32(AX), R10
-	MOVQ    (AX), R11
+	MOVQ    16(AX), BX
+	MOVQ    48(AX), SI
+	MOVQ    24(AX), R8
+	MOVQ    32(AX), R9
+	MOVQ    (AX), R10
 
 	// Main loop
 main_loop:
-	MOVQ  SI, R8
-	CMPQ  R8, BX
+	XORL  DX, DX
+	CMPQ  BX, SI
 	SETGE DL
 
 	// br0.fillFast32()
-	MOVQ    32(R11), R12
-	MOVBQZX 40(R11), R13
-	CMPQ    R13, $0x20
+	MOVQ    32(R10), R11
+	MOVBQZX 40(R10), R12
+	CMPQ    R12, $0x20
 	JBE     skip_fill0
-	MOVQ    24(R11), AX
-	SUBQ    $0x20, R13
+	MOVQ    24(R10), AX
+	SUBQ    $0x20, R12
 	SUBQ    $0x04, AX
-	MOVQ    (R11), R14
+	MOVQ    (R10), R13
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R14*1), R14
-	MOVQ R13, CX
-	SHLQ CL, R14
-	MOVQ AX, 24(R11)
-	ORQ  R14, R12
+	MOVL (AX)(R13*1), R13
+	MOVQ R12, CX
+	SHLQ CL, R13
+	MOVQ AX, 24(R10)
+	ORQ  R13, R11
 
-	// exhausted = exhausted || (br0.off < 4)
-	CMPQ  AX, $0x04
-	SETLT AL
-	ORB   AL, DL
+	// exhausted += (br0.off < 4)
+	CMPQ AX, $0x04
+	ADCB $+0, DL
 
 skip_fill0:
 	// val0 := br0.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v0 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br0.advance(uint8(v0.entry)
 	MOVB CH, AL
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val1 := br0.peekTopBits(peekBits)
 	MOVQ DI, CX
-	MOVQ R12, R14
-	SHRQ CL, R14
+	MOVQ R11, R13
+	SHRQ CL, R13
 
 	// v1 := table[val1&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br0.advance(uint8(v1.entry))
 	MOVB CH, AH
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// these two writes get coalesced
 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	MOVW AX, (R8)
+	MOVW AX, (BX)
 
 	// update the bitreader structure
-	MOVQ R12, 32(R11)
-	MOVB R13, 40(R11)
-	ADDQ R9, R8
+	MOVQ R11, 32(R10)
+	MOVB R12, 40(R10)
 
 	// br1.fillFast32()
-	MOVQ    80(R11), R12
-	MOVBQZX 88(R11), R13
-	CMPQ    R13, $0x20
+	MOVQ    80(R10), R11
+	MOVBQZX 88(R10), R12
+	CMPQ    R12, $0x20
 	JBE     skip_fill1
-	MOVQ    72(R11), AX
-	SUBQ    $0x20, R13
+	MOVQ    72(R10), AX
+	SUBQ    $0x20, R12
 	SUBQ    $0x04, AX
-	MOVQ    48(R11), R14
+	MOVQ    48(R10), R13
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R14*1), R14
-	MOVQ R13, CX
-	SHLQ CL, R14
-	MOVQ AX, 72(R11)
-	ORQ  R14, R12
+	MOVL (AX)(R13*1), R13
+	MOVQ R12, CX
+	SHLQ CL, R13
+	MOVQ AX, 72(R10)
+	ORQ  R13, R11
 
-	// exhausted = exhausted || (br1.off < 4)
-	CMPQ  AX, $0x04
-	SETLT AL
-	ORB   AL, DL
+	// exhausted += (br1.off < 4)
+	CMPQ AX, $0x04
+	ADCB $+0, DL
 
 skip_fill1:
 	// val0 := br1.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v0 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br1.advance(uint8(v0.entry)
 	MOVB CH, AL
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val1 := br1.peekTopBits(peekBits)
 	MOVQ DI, CX
-	MOVQ R12, R14
-	SHRQ CL, R14
+	MOVQ R11, R13
+	SHRQ CL, R13
 
 	// v1 := table[val1&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br1.advance(uint8(v1.entry))
 	MOVB CH, AH
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// these two writes get coalesced
 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	MOVW AX, (R8)
+	MOVW AX, (BX)(R8*1)
 
 	// update the bitreader structure
-	MOVQ R12, 80(R11)
-	MOVB R13, 88(R11)
-	ADDQ R9, R8
+	MOVQ R11, 80(R10)
+	MOVB R12, 88(R10)
 
 	// br2.fillFast32()
-	MOVQ    128(R11), R12
-	MOVBQZX 136(R11), R13
-	CMPQ    R13, $0x20
+	MOVQ    128(R10), R11
+	MOVBQZX 136(R10), R12
+	CMPQ    R12, $0x20
 	JBE     skip_fill2
-	MOVQ    120(R11), AX
-	SUBQ    $0x20, R13
+	MOVQ    120(R10), AX
+	SUBQ    $0x20, R12
 	SUBQ    $0x04, AX
-	MOVQ    96(R11), R14
+	MOVQ    96(R10), R13
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R14*1), R14
-	MOVQ R13, CX
-	SHLQ CL, R14
-	MOVQ AX, 120(R11)
-	ORQ  R14, R12
+	MOVL (AX)(R13*1), R13
+	MOVQ R12, CX
+	SHLQ CL, R13
+	MOVQ AX, 120(R10)
+	ORQ  R13, R11
 
-	// exhausted = exhausted || (br2.off < 4)
-	CMPQ  AX, $0x04
-	SETLT AL
-	ORB   AL, DL
+	// exhausted += (br2.off < 4)
+	CMPQ AX, $0x04
+	ADCB $+0, DL
 
 skip_fill2:
 	// val0 := br2.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v0 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br2.advance(uint8(v0.entry)
 	MOVB CH, AL
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val1 := br2.peekTopBits(peekBits)
 	MOVQ DI, CX
-	MOVQ R12, R14
-	SHRQ CL, R14
+	MOVQ R11, R13
+	SHRQ CL, R13
 
 	// v1 := table[val1&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br2.advance(uint8(v1.entry))
 	MOVB CH, AH
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// these two writes get coalesced
 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	MOVW AX, (R8)
+	MOVW AX, (BX)(R8*2)
 
 	// update the bitreader structure
-	MOVQ R12, 128(R11)
-	MOVB R13, 136(R11)
-	ADDQ R9, R8
+	MOVQ R11, 128(R10)
+	MOVB R12, 136(R10)
 
 	// br3.fillFast32()
-	MOVQ    176(R11), R12
-	MOVBQZX 184(R11), R13
-	CMPQ    R13, $0x20
+	MOVQ    176(R10), R11
+	MOVBQZX 184(R10), R12
+	CMPQ    R12, $0x20
 	JBE     skip_fill3
-	MOVQ    168(R11), AX
-	SUBQ    $0x20, R13
+	MOVQ    168(R10), AX
+	SUBQ    $0x20, R12
 	SUBQ    $0x04, AX
-	MOVQ    144(R11), R14
+	MOVQ    144(R10), R13
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (AX)(R14*1), R14
-	MOVQ R13, CX
-	SHLQ CL, R14
-	MOVQ AX, 168(R11)
-	ORQ  R14, R12
+	MOVL (AX)(R13*1), R13
+	MOVQ R12, CX
+	SHLQ CL, R13
+	MOVQ AX, 168(R10)
+	ORQ  R13, R11
 
-	// exhausted = exhausted || (br3.off < 4)
-	CMPQ  AX, $0x04
-	SETLT AL
-	ORB   AL, DL
+	// exhausted += (br3.off < 4)
+	CMPQ AX, $0x04
+	ADCB $+0, DL
 
 skip_fill3:
 	// val0 := br3.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v0 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br3.advance(uint8(v0.entry)
 	MOVB CH, AL
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val1 := br3.peekTopBits(peekBits)
 	MOVQ DI, CX
-	MOVQ R12, R14
-	SHRQ CL, R14
+	MOVQ R11, R13
+	SHRQ CL, R13
 
 	// v1 := table[val1&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br3.advance(uint8(v1.entry))
 	MOVB CH, AH
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// these two writes get coalesced
 	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
-	MOVW AX, (R8)
+	LEAQ (R8)(R8*2), CX
+	MOVW AX, (BX)(CX*1)
 
 	// update the bitreader structure
-	MOVQ  R12, 176(R11)
-	MOVB  R13, 184(R11)
-	ADDQ  $0x02, SI
+	MOVQ  R11, 176(R10)
+	MOVB  R12, 184(R10)
+	ADDQ  $0x02, BX
 	TESTB DL, DL
 	JZ    main_loop
 	MOVQ  ctx+0(FP), AX
-	SUBQ  16(AX), SI
-	SHLQ  $0x02, SI
-	MOVQ  SI, 40(AX)
+	SUBQ  16(AX), BX
+	SHLQ  $0x02, BX
+	MOVQ  BX, 40(AX)
 	RET
 
 // func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
 TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
-	XORQ DX, DX
-
 	// Preload values
 	MOVQ    ctx+0(FP), CX
 	MOVBQZX 8(CX), DI
 	MOVQ    16(CX), BX
 	MOVQ    48(CX), SI
-	MOVQ    24(CX), R9
-	MOVQ    32(CX), R10
-	MOVQ    (CX), R11
+	MOVQ    24(CX), R8
+	MOVQ    32(CX), R9
+	MOVQ    (CX), R10
 
 	// Main loop
 main_loop:
-	MOVQ  BX, R8
-	CMPQ  R8, SI
+	XORL  DX, DX
+	CMPQ  BX, SI
 	SETGE DL
 
 	// br0.fillFast32()
-	MOVQ    32(R11), R12
-	MOVBQZX 40(R11), R13
-	CMPQ    R13, $0x20
+	MOVQ    32(R10), R11
+	MOVBQZX 40(R10), R12
+	CMPQ    R12, $0x20
 	JBE     skip_fill0
-	MOVQ    24(R11), R14
-	SUBQ    $0x20, R13
-	SUBQ    $0x04, R14
-	MOVQ    (R11), R15
+	MOVQ    24(R10), R13
+	SUBQ    $0x20, R12
+	SUBQ    $0x04, R13
+	MOVQ    (R10), R14
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R14)(R15*1), R15
-	MOVQ R13, CX
-	SHLQ CL, R15
-	MOVQ R14, 24(R11)
-	ORQ  R15, R12
+	MOVL (R13)(R14*1), R14
+	MOVQ R12, CX
+	SHLQ CL, R14
+	MOVQ R13, 24(R10)
+	ORQ  R14, R11
 
-	// exhausted = exhausted || (br0.off < 4)
-	CMPQ  R14, $0x04
-	SETLT AL
-	ORB   AL, DL
+	// exhausted += (br0.off < 4)
+	CMPQ R13, $0x04
+	ADCB $+0, DL
 
 skip_fill0:
 	// val0 := br0.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v0 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br0.advance(uint8(v0.entry)
 	MOVB CH, AL
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val1 := br0.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v1 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br0.advance(uint8(v1.entry)
 	MOVB   CH, AH
-	SHLQ   CL, R12
-	ADDB   CL, R13
+	SHLQ   CL, R11
+	ADDB   CL, R12
 	BSWAPL AX
 
 	// val2 := br0.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v2 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br0.advance(uint8(v2.entry)
 	MOVB CH, AH
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val3 := br0.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v3 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br0.advance(uint8(v3.entry)
 	MOVB   CH, AL
-	SHLQ   CL, R12
-	ADDB   CL, R13
+	SHLQ   CL, R11
+	ADDB   CL, R12
 	BSWAPL AX
 
 	// these four writes get coalesced
@@ -366,88 +354,86 @@
 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
 	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	MOVL AX, (R8)
+	MOVL AX, (BX)
 
 	// update the bitreader structure
-	MOVQ R12, 32(R11)
-	MOVB R13, 40(R11)
-	ADDQ R9, R8
+	MOVQ R11, 32(R10)
+	MOVB R12, 40(R10)
 
 	// br1.fillFast32()
-	MOVQ    80(R11), R12
-	MOVBQZX 88(R11), R13
-	CMPQ    R13, $0x20
+	MOVQ    80(R10), R11
+	MOVBQZX 88(R10), R12
+	CMPQ    R12, $0x20
 	JBE     skip_fill1
-	MOVQ    72(R11), R14
-	SUBQ    $0x20, R13
-	SUBQ    $0x04, R14
-	MOVQ    48(R11), R15
+	MOVQ    72(R10), R13
+	SUBQ    $0x20, R12
+	SUBQ    $0x04, R13
+	MOVQ    48(R10), R14
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R14)(R15*1), R15
-	MOVQ R13, CX
-	SHLQ CL, R15
-	MOVQ R14, 72(R11)
-	ORQ  R15, R12
+	MOVL (R13)(R14*1), R14
+	MOVQ R12, CX
+	SHLQ CL, R14
+	MOVQ R13, 72(R10)
+	ORQ  R14, R11
 
-	// exhausted = exhausted || (br1.off < 4)
-	CMPQ  R14, $0x04
-	SETLT AL
-	ORB   AL, DL
+	// exhausted += (br1.off < 4)
+	CMPQ R13, $0x04
+	ADCB $+0, DL
 
 skip_fill1:
 	// val0 := br1.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v0 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br1.advance(uint8(v0.entry)
 	MOVB CH, AL
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val1 := br1.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v1 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br1.advance(uint8(v1.entry)
 	MOVB   CH, AH
-	SHLQ   CL, R12
-	ADDB   CL, R13
+	SHLQ   CL, R11
+	ADDB   CL, R12
 	BSWAPL AX
 
 	// val2 := br1.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v2 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br1.advance(uint8(v2.entry)
 	MOVB CH, AH
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val3 := br1.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v3 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br1.advance(uint8(v3.entry)
 	MOVB   CH, AL
-	SHLQ   CL, R12
-	ADDB   CL, R13
+	SHLQ   CL, R11
+	ADDB   CL, R12
 	BSWAPL AX
 
 	// these four writes get coalesced
@@ -455,88 +441,86 @@
 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
 	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	MOVL AX, (R8)
+	MOVL AX, (BX)(R8*1)
 
 	// update the bitreader structure
-	MOVQ R12, 80(R11)
-	MOVB R13, 88(R11)
-	ADDQ R9, R8
+	MOVQ R11, 80(R10)
+	MOVB R12, 88(R10)
 
 	// br2.fillFast32()
-	MOVQ    128(R11), R12
-	MOVBQZX 136(R11), R13
-	CMPQ    R13, $0x20
+	MOVQ    128(R10), R11
+	MOVBQZX 136(R10), R12
+	CMPQ    R12, $0x20
 	JBE     skip_fill2
-	MOVQ    120(R11), R14
-	SUBQ    $0x20, R13
-	SUBQ    $0x04, R14
-	MOVQ    96(R11), R15
+	MOVQ    120(R10), R13
+	SUBQ    $0x20, R12
+	SUBQ    $0x04, R13
+	MOVQ    96(R10), R14
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R14)(R15*1), R15
-	MOVQ R13, CX
-	SHLQ CL, R15
-	MOVQ R14, 120(R11)
-	ORQ  R15, R12
+	MOVL (R13)(R14*1), R14
+	MOVQ R12, CX
+	SHLQ CL, R14
+	MOVQ R13, 120(R10)
+	ORQ  R14, R11
 
-	// exhausted = exhausted || (br2.off < 4)
-	CMPQ  R14, $0x04
-	SETLT AL
-	ORB   AL, DL
+	// exhausted += (br2.off < 4)
+	CMPQ R13, $0x04
+	ADCB $+0, DL
 
 skip_fill2:
 	// val0 := br2.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v0 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br2.advance(uint8(v0.entry)
 	MOVB CH, AL
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val1 := br2.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v1 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br2.advance(uint8(v1.entry)
 	MOVB   CH, AH
-	SHLQ   CL, R12
-	ADDB   CL, R13
+	SHLQ   CL, R11
+	ADDB   CL, R12
 	BSWAPL AX
 
 	// val2 := br2.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v2 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br2.advance(uint8(v2.entry)
 	MOVB CH, AH
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val3 := br2.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v3 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br2.advance(uint8(v3.entry)
 	MOVB   CH, AL
-	SHLQ   CL, R12
-	ADDB   CL, R13
+	SHLQ   CL, R11
+	ADDB   CL, R12
 	BSWAPL AX
 
 	// these four writes get coalesced
@@ -544,88 +528,86 @@
 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
 	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	MOVL AX, (R8)
+	MOVL AX, (BX)(R8*2)
 
 	// update the bitreader structure
-	MOVQ R12, 128(R11)
-	MOVB R13, 136(R11)
-	ADDQ R9, R8
+	MOVQ R11, 128(R10)
+	MOVB R12, 136(R10)
 
 	// br3.fillFast32()
-	MOVQ    176(R11), R12
-	MOVBQZX 184(R11), R13
-	CMPQ    R13, $0x20
+	MOVQ    176(R10), R11
+	MOVBQZX 184(R10), R12
+	CMPQ    R12, $0x20
 	JBE     skip_fill3
-	MOVQ    168(R11), R14
-	SUBQ    $0x20, R13
-	SUBQ    $0x04, R14
-	MOVQ    144(R11), R15
+	MOVQ    168(R10), R13
+	SUBQ    $0x20, R12
+	SUBQ    $0x04, R13
+	MOVQ    144(R10), R14
 
 	// b.value |= uint64(low) << (b.bitsRead & 63)
-	MOVL (R14)(R15*1), R15
-	MOVQ R13, CX
-	SHLQ CL, R15
-	MOVQ R14, 168(R11)
-	ORQ  R15, R12
+	MOVL (R13)(R14*1), R14
+	MOVQ R12, CX
+	SHLQ CL, R14
+	MOVQ R13, 168(R10)
+	ORQ  R14, R11
 
-	// exhausted = exhausted || (br3.off < 4)
-	CMPQ  R14, $0x04
-	SETLT AL
-	ORB   AL, DL
+	// exhausted += (br3.off < 4)
+	CMPQ R13, $0x04
+	ADCB $+0, DL
 
 skip_fill3:
 	// val0 := br3.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v0 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br3.advance(uint8(v0.entry)
 	MOVB CH, AL
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val1 := br3.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v1 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br3.advance(uint8(v1.entry)
 	MOVB   CH, AH
-	SHLQ   CL, R12
-	ADDB   CL, R13
+	SHLQ   CL, R11
+	ADDB   CL, R12
 	BSWAPL AX
 
 	// val2 := br3.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v2 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br3.advance(uint8(v2.entry)
 	MOVB CH, AH
-	SHLQ CL, R12
-	ADDB CL, R13
+	SHLQ CL, R11
+	ADDB CL, R12
 
 	// val3 := br3.peekTopBits(peekBits)
-	MOVQ R12, R14
+	MOVQ R11, R13
 	MOVQ DI, CX
-	SHRQ CL, R14
+	SHRQ CL, R13
 
 	// v3 := table[val0&mask]
-	MOVW (R10)(R14*2), CX
+	MOVW (R9)(R13*2), CX
 
 	// br3.advance(uint8(v3.entry)
 	MOVB   CH, AL
-	SHLQ   CL, R12
-	ADDB   CL, R13
+	SHLQ   CL, R11
+	ADDB   CL, R12
 	BSWAPL AX
 
 	// these four writes get coalesced
@@ -633,11 +615,12 @@
 	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
 	// out[id * dstEvery + 3] = uint8(v2.entry >> 8)
 	// out[id * dstEvery + 4] = uint8(v3.entry >> 8)
-	MOVL AX, (R8)
+	LEAQ (R8)(R8*2), CX
+	MOVL AX, (BX)(CX*1)
 
 	// update the bitreader structure
-	MOVQ  R12, 176(R11)
-	MOVB  R13, 184(R11)
+	MOVQ  R11, 176(R10)
+	MOVB  R12, 184(R10)
 	ADDQ  $0x04, BX
 	TESTB DL, DL
 	JZ    main_loop
@@ -653,7 +636,7 @@
 	MOVQ    16(CX), DX
 	MOVQ    24(CX), BX
 	CMPQ    BX, $0x04
-	JB      error_max_decoded_size_exeeded
+	JB      error_max_decoded_size_exceeded
 	LEAQ    (DX)(BX*1), BX
 	MOVQ    (CX), SI
 	MOVQ    (SI), R8
@@ -668,7 +651,7 @@
 	// Check if we have room for 4 bytes in the output buffer
 	LEAQ 4(DX), CX
 	CMPQ CX, BX
-	JGE  error_max_decoded_size_exeeded
+	JGE  error_max_decoded_size_exceeded
 
 	// Decode 4 values
 	CMPQ R11, $0x20
@@ -745,7 +728,7 @@
 	RET
 
 	// Report error
-error_max_decoded_size_exeeded:
+error_max_decoded_size_exceeded:
 	MOVQ ctx+0(FP), AX
 	MOVQ $-1, CX
 	MOVQ CX, 40(AX)
@@ -758,7 +741,7 @@
 	MOVQ    16(CX), DX
 	MOVQ    24(CX), BX
 	CMPQ    BX, $0x04
-	JB      error_max_decoded_size_exeeded
+	JB      error_max_decoded_size_exceeded
 	LEAQ    (DX)(BX*1), BX
 	MOVQ    (CX), SI
 	MOVQ    (SI), R8
@@ -773,7 +756,7 @@
 	// Check if we have room for 4 bytes in the output buffer
 	LEAQ 4(DX), CX
 	CMPQ CX, BX
-	JGE  error_max_decoded_size_exeeded
+	JGE  error_max_decoded_size_exceeded
 
 	// Decode 4 values
 	CMPQ  R11, $0x20
@@ -840,7 +823,7 @@
 	RET
 
 	// Report error
-error_max_decoded_size_exeeded:
+error_max_decoded_size_exceeded:
 	MOVQ ctx+0(FP), AX
 	MOVQ $-1, CX
 	MOVQ CX, 40(AX)
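Throughout the rewritten loops, the exhausted flag is now accumulated with CMPQ/ADCB (the compare sets the carry flag, and add-with-carry folds it into DL) instead of SETLT/ORB, saving an instruction per stream. A Go-level sketch of the equivalence:

```go
package main

import "fmt"

func main() {
	// The assembly change replaces "exhausted = exhausted || (off < 4)"
	// (SETLT+ORB) with "exhausted += (off < 4)" (CMPQ off, $4 sets the
	// carry flag when off < 4; ADCB adds it). Any nonzero total still
	// means "stop the main loop".
	exhausted := 0
	for _, off := range []int{8, 6, 3} {
		if off < 4 { // carry from the unsigned compare
			exhausted++
		}
	}
	fmt.Println(exhausted != 0) // true
}
```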
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
index 4f6f37c..908c17d 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -122,17 +122,21 @@
 				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 1")
 			}
-			copy(out, buf[0][:])
-			copy(out[dstEvery:], buf[1][:])
-			copy(out[dstEvery*2:], buf[2][:])
-			copy(out[dstEvery*3:], buf[3][:])
-			out = out[bufoff:]
-			decoded += bufoff * 4
 			// There must be at least 3 buffers left.
-			if len(out) < dstEvery*3 {
+			if len(out)-bufoff < dstEvery*3 {
 				d.bufs.Put(buf)
 				return nil, errors.New("corruption detected: stream overrun 2")
 			}
+			//copy(out, buf[0][:])
+			//copy(out[dstEvery:], buf[1][:])
+			//copy(out[dstEvery*2:], buf[2][:])
+			//copy(out[dstEvery*3:], buf[3][:])
+			*(*[bufoff]byte)(out) = buf[0]
+			*(*[bufoff]byte)(out[dstEvery:]) = buf[1]
+			*(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
+			*(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
+			out = out[bufoff:]
+			decoded += bufoff * 4
 		}
 	}
 	if off > 0 {
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
index e8ad17a..77ecd68 100644
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ b/vendor/github.com/klauspost/compress/huff0/huff0.go
@@ -88,7 +88,7 @@
 	// Decoders will return ErrMaxDecodedSizeExceeded if this limit is exceeded.
 	MaxDecodedSize int
 
-	br byteReader
+	srcLen int
 
 	// MaxSymbolValue will override the maximum symbol value of the next block.
 	MaxSymbolValue uint8
@@ -170,7 +170,7 @@
 	if s.fse == nil {
 		s.fse = &fse.Scratch{}
 	}
-	s.br.init(in)
+	s.srcLen = len(in)
 
 	return s, nil
 }
diff --git a/vendor/github.com/klauspost/compress/internal/le/le.go b/vendor/github.com/klauspost/compress/internal/le/le.go
new file mode 100644
index 0000000..e54909e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/le/le.go
@@ -0,0 +1,5 @@
+package le
+
+type Indexer interface {
+	int | int8 | int16 | int32 | int64 | uint | uint8 | uint16 | uint32 | uint64
+}
diff --git a/vendor/github.com/klauspost/compress/internal/le/unsafe_disabled.go b/vendor/github.com/klauspost/compress/internal/le/unsafe_disabled.go
new file mode 100644
index 0000000..0cfb5c0
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/le/unsafe_disabled.go
@@ -0,0 +1,42 @@
+//go:build !(amd64 || arm64 || ppc64le || riscv64) || nounsafe || purego || appengine
+
+package le
+
+import (
+	"encoding/binary"
+)
+
+// Load8 will load from b at index i.
+func Load8[I Indexer](b []byte, i I) byte {
+	return b[i]
+}
+
+// Load16 will load from b at index i.
+func Load16[I Indexer](b []byte, i I) uint16 {
+	return binary.LittleEndian.Uint16(b[i:])
+}
+
+// Load32 will load from b at index i.
+func Load32[I Indexer](b []byte, i I) uint32 {
+	return binary.LittleEndian.Uint32(b[i:])
+}
+
+// Load64 will load from b at index i.
+func Load64[I Indexer](b []byte, i I) uint64 {
+	return binary.LittleEndian.Uint64(b[i:])
+}
+
+// Store16 will store v at b.
+func Store16(b []byte, v uint16) {
+	binary.LittleEndian.PutUint16(b, v)
+}
+
+// Store32 will store v at b.
+func Store32(b []byte, v uint32) {
+	binary.LittleEndian.PutUint32(b, v)
+}
+
+// Store64 will store v at b.
+func Store64(b []byte, v uint64) {
+	binary.LittleEndian.PutUint64(b, v)
+}
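The Indexer constraint lets hot loops pass offsets in whatever integer type they already track, avoiding conversions at each call site. A portable stand-in sketch; load32 below mirrors what le.Load32 computes in this fallback build:

```go
package main

import "fmt"

// load32 reads a little-endian uint32 from b at offset i. The inline
// union constraint plays the role of le.Indexer.
func load32[I int | int64 | uint | uint32](b []byte, i I) uint32 {
	return uint32(b[i]) | uint32(b[i+1])<<8 | uint32(b[i+2])<<16 | uint32(b[i+3])<<24
}

func main() {
	b := []byte{0x78, 0x56, 0x34, 0x12}
	var off uint32 // callers keep their native offset type
	fmt.Printf("%#x\n", load32(b, off)) // 0x12345678
}
```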
diff --git a/vendor/github.com/klauspost/compress/internal/le/unsafe_enabled.go b/vendor/github.com/klauspost/compress/internal/le/unsafe_enabled.go
new file mode 100644
index 0000000..ada45cd
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/internal/le/unsafe_enabled.go
@@ -0,0 +1,55 @@
+// We enable 64 bit LE platforms:
+
+//go:build (amd64 || arm64 || ppc64le || riscv64) && !nounsafe && !purego && !appengine
+
+package le
+
+import (
+	"unsafe"
+)
+
+// Load8 will load from b at index i.
+func Load8[I Indexer](b []byte, i I) byte {
+	//return binary.LittleEndian.Uint16(b[i:])
+	//return *(*uint16)(unsafe.Pointer(&b[i]))
+	return *(*byte)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i))
+}
+
+// Load16 will load from b at index i.
+func Load16[I Indexer](b []byte, i I) uint16 {
+	//return binary.LittleEndian.Uint16(b[i:])
+	//return *(*uint16)(unsafe.Pointer(&b[i]))
+	return *(*uint16)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i))
+}
+
+// Load32 will load from b at index i.
+func Load32[I Indexer](b []byte, i I) uint32 {
+	//return binary.LittleEndian.Uint32(b[i:])
+	//return *(*uint32)(unsafe.Pointer(&b[i]))
+	return *(*uint32)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i))
+}
+
+// Load64 will load from b at index i.
+func Load64[I Indexer](b []byte, i I) uint64 {
+	//return binary.LittleEndian.Uint64(b[i:])
+	//return *(*uint64)(unsafe.Pointer(&b[i]))
+	return *(*uint64)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i))
+}
+
+// Store16 will store v at b.
+func Store16(b []byte, v uint16) {
+	//binary.LittleEndian.PutUint16(b, v)
+	*(*uint16)(unsafe.Pointer(unsafe.SliceData(b))) = v
+}
+
+// Store32 will store v at b.
+func Store32(b []byte, v uint32) {
+	//binary.LittleEndian.PutUint32(b, v)
+	*(*uint32)(unsafe.Pointer(unsafe.SliceData(b))) = v
+}
+
+// Store64 will store v at b.
+func Store64(b []byte, v uint64) {
+	//binary.LittleEndian.PutUint64(b, v)
+	*(*uint64)(unsafe.Pointer(unsafe.SliceData(b))) = v
+}
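The unsafe variants drop bounds checks entirely, so the precondition (enough bytes at the given offset) moves to the call sites. A sketch of the fast-path load and its contract:

```go
package main

import (
	"fmt"
	"unsafe"
)

// load32 mirrors the fast path above: unsafe.SliceData (Go 1.20+) plus
// unsafe.Add elides the bounds check, so the caller must itself
// guarantee i+4 <= len(b), the contract le.Load32 relies on.
func load32(b []byte, i int) uint32 {
	return *(*uint32)(unsafe.Add(unsafe.Pointer(unsafe.SliceData(b)), i))
}

func main() {
	b := []byte{0x78, 0x56, 0x34, 0x12}
	// On the little-endian platforms this build targets, the raw load
	// matches the encoding/binary little-endian interpretation.
	fmt.Printf("%#x\n", load32(b, 0)) // 0x12345678
}
```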
diff --git a/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go b/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go
index 511bba6..2754bac 100644
--- a/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go
+++ b/vendor/github.com/klauspost/compress/internal/snapref/encode_other.go
@@ -18,6 +18,7 @@
 // emitLiteral writes a literal chunk and returns the number of bytes written.
 //
 // It assumes that:
+//
 //	dst is long enough to hold the encoded bytes
 //	1 <= len(lit) && len(lit) <= 65536
 func emitLiteral(dst, lit []byte) int {
@@ -42,6 +43,7 @@
 // emitCopy writes a copy chunk and returns the number of bytes written.
 //
 // It assumes that:
+//
 //	dst is long enough to hold the encoded bytes
 //	1 <= offset && offset <= 65535
 //	4 <= length && length <= 65535
@@ -49,7 +51,7 @@
 	i := 0
 	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
 	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
-	// length emitted down below is is a little lower (at 60 = 64 - 4), because
+	// length emitted down below is a little lower (at 60 = 64 - 4), because
 	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
 	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
 	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
@@ -85,28 +87,40 @@
 	return i + 2
 }
 
-// extendMatch returns the largest k such that k <= len(src) and that
-// src[i:i+k-j] and src[j:k] have the same contents.
-//
-// It assumes that:
-//	0 <= i && i < j && j <= len(src)
-func extendMatch(src []byte, i, j int) int {
-	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
-	}
-	return j
-}
-
 func hash(u, shift uint32) uint32 {
 	return (u * 0x1e35a7bd) >> shift
 }
 
+// EncodeBlockInto exposes encodeBlock but checks dst size.
+func EncodeBlockInto(dst, src []byte) (d int) {
+	if MaxEncodedLen(len(src)) > len(dst) {
+		return 0
+	}
+
+	// encodeBlock breaks on too big blocks, so split.
+	for len(src) > 0 {
+		p := src
+		src = nil
+		if len(p) > maxBlockSize {
+			p, src = p[:maxBlockSize], p[maxBlockSize:]
+		}
+		if len(p) < minNonLiteralBlockSize {
+			d += emitLiteral(dst[d:], p)
+		} else {
+			d += encodeBlock(dst[d:], p)
+		}
+	}
+	return d
+}
+
 // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
 // assumes that the varint-encoded length of the decompressed bytes has already
 // been written.
 //
 // It also assumes that:
+//
 //	len(dst) >= MaxEncodedLen(len(src)) &&
-// 	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
 func encodeBlock(dst, src []byte) (d int) {
 	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
 	// The table element type is uint16, as s < sLimit and sLimit < len(src)
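
Reviewer note: `EncodeBlockInto` lives in the internal `snapref` package and cannot be imported directly; the sketch below only reproduces its splitting loop, to show how oversized inputs are peeled into blocks (snappy's `maxBlockSize` is 65536, and `chunk` is a name local to the example).

```go
package main

import "fmt"

// chunk mirrors the loop in EncodeBlockInto above: peel off at most
// maxBlockSize bytes per iteration until src is consumed.
func chunk(src []byte, maxBlockSize int) [][]byte {
	var out [][]byte
	for len(src) > 0 {
		p := src
		src = nil
		if len(p) > maxBlockSize {
			p, src = p[:maxBlockSize], p[maxBlockSize:]
		}
		out = append(out, p)
	}
	return out
}

func main() {
	for _, p := range chunk(make([]byte, 150000), 65536) {
		fmt.Println(len(p)) // 65536, 65536, 18928
	}
}
```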
diff --git a/vendor/github.com/klauspost/compress/s2sx.mod b/vendor/github.com/klauspost/compress/s2sx.mod
index 2263853..81bda5e 100644
--- a/vendor/github.com/klauspost/compress/s2sx.mod
+++ b/vendor/github.com/klauspost/compress/s2sx.mod
@@ -1,4 +1,3 @@
 module github.com/klauspost/compress
 
-go 1.16
-
+go 1.22
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
index beb7fa8..c11d7fa 100644
--- a/vendor/github.com/klauspost/compress/zstd/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -6,12 +6,14 @@
 
 This package provides [compression](#Compressor) to and [decompression](#Decompressor) of Zstandard content. 
 
-This package is pure Go and without use of "unsafe". 
+This package is pure Go. Use the `noasm` build tag to disable the assembler code paths and `nounsafe` to disable use of the `unsafe` package.
 
 The `zstd` package is provided as open source software using a Go standard license.
 
 Currently the package is heavily optimized for 64 bit processors and will be significantly slower on 32 bit processors.
 
+For seekable zstd streams, see [this excellent package](https://github.com/SaveTheRbtz/zstd-seekable-format-go).
+
 ## Installation
 
 Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`.
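
Reviewer note: these are ordinary Go build tags, so a consumer opts out at build time, e.g. `go build -tags noasm` to skip the assembler code paths or `go build -tags "noasm nounsafe"` to also force the bounds-checked loads; the tag names match the `//go:build` constraints in the new `internal/le` files above.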
@@ -257,7 +259,7 @@
 
 ## Decompressor
 
-Staus: STABLE - there may still be subtle bugs, but a wide variety of content has been tested.
+Status: STABLE - there may still be subtle bugs, but a wide variety of content has been tested.
 
 This library is being continuously [fuzz-tested](https://github.com/klauspost/compress-fuzz),
 kindly supplied by [fuzzit.dev](https://fuzzit.dev/). 
@@ -302,7 +304,7 @@
 
 // Create a reader that caches decompressors.
 // For this operation type we supply a nil Reader.
-var decoder, _ = zstd.NewReader(nil, WithDecoderConcurrency(0))
+var decoder, _ = zstd.NewReader(nil, zstd.WithDecoderConcurrency(0))
 
 // Decompress a buffer. We don't supply a destination buffer,
 // so it will be allocated by the decoder.
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
index 97299d4..d41e3e1 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitreader.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@@ -5,11 +5,12 @@
 package zstd
 
 import (
-	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
 	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 // bitReader reads a bitstream in reverse.
@@ -17,8 +18,8 @@
 // for aligning the input.
 type bitReader struct {
 	in       []byte
-	off      uint   // next byte to read is at in[off - 1]
 	value    uint64 // Maybe use [16]byte, but shifting is awkward.
+	cursor   int    // offset where next read should end
 	bitsRead uint8
 }
 
@@ -28,12 +29,12 @@
 		return errors.New("corrupt stream: too short")
 	}
 	b.in = in
-	b.off = uint(len(in))
 	// The highest bit of the last byte indicates where to start
 	v := in[len(in)-1]
 	if v == 0 {
 		return errors.New("corrupt stream, did not find end of stream")
 	}
+	b.cursor = len(in)
 	b.bitsRead = 64
 	b.value = 0
 	if len(in) >= 8 {
@@ -69,21 +70,16 @@
 	if b.bitsRead < 32 {
 		return
 	}
-	// 2 bounds checks.
-	v := b.in[b.off-4:]
-	v = v[:4]
-	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-	b.value = (b.value << 32) | uint64(low)
+	b.cursor -= 4
+	b.value = (b.value << 32) | uint64(le.Load32(b.in, b.cursor))
 	b.bitsRead -= 32
-	b.off -= 4
 }
 
 // fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
 func (b *bitReader) fillFastStart() {
-	// Do single re-slice to avoid bounds checks.
-	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+	b.cursor -= 8
+	b.value = le.Load64(b.in, b.cursor)
 	b.bitsRead = 0
-	b.off -= 8
 }
 
 // fill() will make sure at least 32 bits are available.
@@ -91,25 +87,23 @@
 	if b.bitsRead < 32 {
 		return
 	}
-	if b.off >= 4 {
-		v := b.in[b.off-4:]
-		v = v[:4]
-		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
-		b.value = (b.value << 32) | uint64(low)
+	if b.cursor >= 4 {
+		b.cursor -= 4
+		b.value = (b.value << 32) | uint64(le.Load32(b.in, b.cursor))
 		b.bitsRead -= 32
-		b.off -= 4
 		return
 	}
-	for b.off > 0 {
-		b.value = (b.value << 8) | uint64(b.in[b.off-1])
-		b.bitsRead -= 8
-		b.off--
+
+	b.bitsRead -= uint8(8 * b.cursor)
+	for b.cursor > 0 {
+		b.cursor--
+		b.value = (b.value << 8) | uint64(b.in[b.cursor])
 	}
 }
 
 // finished returns true if all bits have been read from the bit stream.
 func (b *bitReader) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
+	return b.cursor == 0 && b.bitsRead >= 64
 }
 
 // overread returns true if more bits have been requested than is on the stream.
@@ -119,13 +113,14 @@
 
 // remain returns the number of bits remaining.
 func (b *bitReader) remain() uint {
-	return b.off*8 + 64 - uint(b.bitsRead)
+	return 8*uint(b.cursor) + 64 - uint(b.bitsRead)
 }
 
 // close the bitstream and returns an error if out-of-buffer reads occurred.
 func (b *bitReader) close() error {
 	// Release reference.
 	b.in = nil
+	b.cursor = 0
 	if !b.finished() {
 		return fmt.Errorf("%d extra bits on block, should be 0", b.remain())
 	}
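
Reviewer note: a standalone sketch of the refactored read pattern, assuming nothing beyond the standard library: the cursor starts at `len(in)` and moves toward zero, with each refill consuming the 4 bytes that end at the cursor (the bitstream is written forward but read in reverse).

```go
package main

import (
	"encoding/binary"
	"fmt"
)

// readBackwards mirrors the bitReader above: cursor counts down from
// the end of the input, and each step loads the 4 bytes ending there.
func readBackwards(in []byte) []uint32 {
	var out []uint32
	cursor := len(in)
	for cursor >= 4 {
		cursor -= 4
		out = append(out, binary.LittleEndian.Uint32(in[cursor:]))
	}
	return out
}

func main() {
	in := []byte{1, 2, 3, 4, 5, 6, 7, 8}
	fmt.Printf("%#v\n", readBackwards(in)) // []uint32{0x8070605, 0x4030201}
}
```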
diff --git a/vendor/github.com/klauspost/compress/zstd/bitwriter.go b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
index 78b3c61..1952f17 100644
--- a/vendor/github.com/klauspost/compress/zstd/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
@@ -97,12 +97,11 @@
 
 // close will write the alignment bit and write the final byte(s)
 // to the output.
-func (b *bitWriter) close() error {
+func (b *bitWriter) close() {
 	// End mark
 	b.addBits16Clean(1, 1)
 	// flush until next byte.
 	b.flushAlign()
-	return nil
 }
 
 // reset and continue writing by appending to out.
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
index 7eed729..0dd742f 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -5,14 +5,10 @@
 package zstd
 
 import (
-	"bytes"
-	"encoding/binary"
 	"errors"
 	"fmt"
+	"hash/crc32"
 	"io"
-	"io/ioutil"
-	"os"
-	"path/filepath"
 	"sync"
 
 	"github.com/klauspost/compress/huff0"
@@ -83,8 +79,9 @@
 
 	err error
 
-	// Check against this crc
-	checkCRC []byte
+	// Check against this crc, if hasCRC is true.
+	checkCRC uint32
+	hasCRC   bool
 
 	// Frame to use for singlethreaded decoding.
 	// Should not be used by the decoder itself since parent may be another frame.
@@ -192,16 +189,14 @@
 	}
 
 	// Read block data.
-	if cap(b.dataStorage) < cSize {
+	if _, ok := br.(*byteBuf); !ok && cap(b.dataStorage) < cSize {
+		// byteBuf doesn't need a destination buffer.
 		if b.lowMem || cSize > maxCompressedBlockSize {
 			b.dataStorage = make([]byte, 0, cSize+compressedBlockOverAlloc)
 		} else {
 			b.dataStorage = make([]byte, 0, maxCompressedBlockSizeAlloc)
 		}
 	}
-	if cap(b.dst) <= maxSize {
-		b.dst = make([]byte, 0, maxSize+1)
-	}
 	b.data, err = br.readBig(cSize, b.dataStorage)
 	if err != nil {
 		if debugDecoder {
@@ -210,6 +205,9 @@
 		}
 		return err
 	}
+	if cap(b.dst) <= maxSize {
+		b.dst = make([]byte, 0, maxSize+1)
+	}
 	return nil
 }
 
@@ -233,7 +231,7 @@
 			if b.lowMem {
 				b.dst = make([]byte, b.RLESize)
 			} else {
-				b.dst = make([]byte, maxBlockSize)
+				b.dst = make([]byte, maxCompressedBlockSize)
 			}
 		}
 		b.dst = b.dst[:b.RLESize]
@@ -441,6 +439,9 @@
 			}
 		}
 		var err error
+		if debugDecoder {
+			println("huff table input:", len(literals), "CRC:", crc32.ChecksumIEEE(literals))
+		}
 		huff, literals, err = huff0.ReadTable(literals, huff)
 		if err != nil {
 			println("reading huffman table:", err)
@@ -549,6 +550,9 @@
 		if debugDecoder {
 			printf("Compression modes: 0b%b", compMode)
 		}
+		if compMode&3 != 0 {
+			return errors.New("corrupt block: reserved bits not zero")
+		}
 		for i := uint(0); i < 3; i++ {
 			mode := seqCompMode((compMode >> (6 - i*2)) & 3)
 			if debugDecoder {
@@ -587,10 +591,12 @@
 				}
 				seq.fse.setRLE(symb)
 				if debugDecoder {
-					printf("RLE set to %+v, code: %v", symb, v)
+					printf("RLE set to 0x%x, code: %v", symb, v)
 				}
 			case compModeFSE:
-				println("Reading table for", tableIndex(i))
+				if debugDecoder {
+					println("Reading table for", tableIndex(i))
+				}
 				if seq.fse == nil || seq.fse.preDefined {
 					seq.fse = fseDecoderPool.Get().(*fseDecoder)
 				}
@@ -638,21 +644,6 @@
 		println("initializing sequences:", err)
 		return err
 	}
-	// Extract blocks...
-	if false && hist.dict == nil {
-		fatalErr := func(err error) {
-			if err != nil {
-				panic(err)
-			}
-		}
-		fn := fmt.Sprintf("n-%d-lits-%d-prev-%d-%d-%d-win-%d.blk", hist.decoders.nSeqs, len(hist.decoders.literals), hist.recentOffsets[0], hist.recentOffsets[1], hist.recentOffsets[2], hist.windowSize)
-		var buf bytes.Buffer
-		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.litLengths.fse))
-		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.matchLengths.fse))
-		fatalErr(binary.Write(&buf, binary.LittleEndian, hist.decoders.offsets.fse))
-		buf.Write(in)
-		ioutil.WriteFile(filepath.Join("testdata", "seqs", fn), buf.Bytes(), os.ModePerm)
-	}
 
 	return nil
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go
index 12e8f6f..fd35ea1 100644
--- a/vendor/github.com/klauspost/compress/zstd/blockenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go
@@ -9,6 +9,7 @@
 	"fmt"
 	"math"
 	"math/bits"
+	"slices"
 
 	"github.com/klauspost/compress/huff0"
 )
@@ -361,14 +362,21 @@
 	if len(lits) >= 1024 {
 		// Use 4 Streams.
 		out, reUsed, err = huff0.Compress4X(lits, b.litEnc)
-	} else if len(lits) > 32 {
+	} else if len(lits) > 16 {
 		// Use 1 stream
 		single = true
 		out, reUsed, err = huff0.Compress1X(lits, b.litEnc)
 	} else {
 		err = huff0.ErrIncompressible
 	}
-
+	if err == nil && len(out)+5 > len(lits) {
+		// If we are close, we may still be worse or equal to raw.
+		var lh literalsHeader
+		lh.setSizes(len(out), len(lits), single)
+		if len(out)+lh.size() >= len(lits) {
+			err = huff0.ErrIncompressible
+		}
+	}
 	switch err {
 	case huff0.ErrIncompressible:
 		if debugEncoder {
@@ -420,6 +428,16 @@
 	return nil
 }
 
+// encodeRLE will encode an RLE block.
+func (b *blockEnc) encodeRLE(val byte, length uint32) {
+	var bh blockHeader
+	bh.setLast(b.last)
+	bh.setSize(length)
+	bh.setType(blockTypeRLE)
+	b.output = bh.appendTo(b.output)
+	b.output = append(b.output, val)
+}
+
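Reviewer note: a sketch of the wire format `encodeRLE` emits, composing the 3-byte little-endian zstd block header by hand per RFC 8878 (bit 0 = Last_Block, bits 1-2 = Block_Type, where 1 means RLE, bits 3+ = Block_Size); `rleBlock` is a name local to the example.

```go
package main

import "fmt"

// rleBlock builds the 3-byte zstd block header followed by the single
// repeated byte: an entire run costs 4 bytes on the wire.
func rleBlock(val byte, length uint32, last bool) []byte {
	bh := length << 3 // Block_Size
	bh |= 1 << 1      // Block_Type = RLE
	if last {
		bh |= 1 // Last_Block
	}
	return []byte{byte(bh), byte(bh >> 8), byte(bh >> 16), val}
}

func main() {
	fmt.Printf("% x\n", rleBlock('a', 1000, true)) // 43 1f 00 61
}
```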
 // fuzzFseEncoder can be used to fuzz the FSE encoder.
 func fuzzFseEncoder(data []byte) int {
 	if len(data) > maxSequences || len(data) < 2 {
@@ -440,16 +458,7 @@
 		// All 0
 		return 0
 	}
-	maxCount := func(a []uint32) int {
-		var max uint32
-		for _, v := range a {
-			if v > max {
-				max = v
-			}
-		}
-		return int(max)
-	}
-	cnt := maxCount(hist[:maxSym])
+	cnt := int(slices.Max(hist[:maxSym]))
 	if cnt == len(data) {
 		// RLE
 		return 0
@@ -472,8 +481,18 @@
 	if len(b.sequences) == 0 {
 		return b.encodeLits(b.literals, rawAllLits)
 	}
+	if len(b.sequences) == 1 && len(org) > 0 && len(b.literals) <= 1 {
+		// Check common RLE cases.
+		seq := b.sequences[0]
+		if seq.litLen == uint32(len(b.literals)) && seq.offset-3 == 1 {
+			// Offset == 1 and 0 or 1 literals.
+			b.encodeRLE(org[0], seq.matchLen+zstdMinMatch+seq.litLen)
+			return nil
+		}
+	}
+
 	// We want some difference to at least account for the headers.
-	saved := b.size - len(b.literals) - (b.size >> 5)
+	saved := b.size - len(b.literals) - (b.size >> 6)
 	if saved < 16 {
 		if org == nil {
 			return errIncompressible
@@ -503,7 +522,7 @@
 	if len(b.literals) >= 1024 && !raw {
 		// Use 4 Streams.
 		out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
-	} else if len(b.literals) > 32 && !raw {
+	} else if len(b.literals) > 16 && !raw {
 		// Use 1 stream
 		single = true
 		out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc)
@@ -511,6 +530,17 @@
 		err = huff0.ErrIncompressible
 	}
 
+	if err == nil && len(out)+5 > len(b.literals) {
+		// If we are close, we may still be worse or equal to raw.
+		var lh literalsHeader
+		lh.setSize(len(b.literals))
+		szRaw := lh.size()
+		lh.setSizes(len(out), len(b.literals), single)
+		szComp := lh.size()
+		if len(out)+szComp >= len(b.literals)+szRaw {
+			err = huff0.ErrIncompressible
+		}
+	}
 	switch err {
 	case huff0.ErrIncompressible:
 		lh.setType(literalsBlockRaw)
@@ -773,16 +803,16 @@
 	ml.flush(mlEnc.actualTableLog)
 	of.flush(ofEnc.actualTableLog)
 	ll.flush(llEnc.actualTableLog)
-	err = wr.close()
-	if err != nil {
-		return err
-	}
+	wr.close()
 	b.output = wr.out
 
+	// Maybe even add a bigger margin.
 	if len(b.output)-3-bhOffset >= b.size {
-		// Maybe even add a bigger margin.
+		// Discard and encode as raw block.
+		b.output = b.encodeRawTo(b.output[:bhOffset], org)
+		b.popOffsets()
 		b.litEnc.Reuse = huff0.ReusePolicyNone
-		return errIncompressible
+		return nil
 	}
 
 	// Size is output minus block header.
@@ -846,15 +876,6 @@
 			}
 		}
 	}
-	maxCount := func(a []uint32) int {
-		var max uint32
-		for _, v := range a {
-			if v > max {
-				max = v
-			}
-		}
-		return int(max)
-	}
 	if debugAsserts && mlMax > maxMatchLengthSymbol {
 		panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d)", mlMax))
 	}
@@ -865,7 +886,7 @@
 		panic(fmt.Errorf("llMax > maxLiteralLengthSymbol (%d)", llMax))
 	}
 
-	b.coders.mlEnc.HistogramFinished(mlMax, maxCount(mlH[:mlMax+1]))
-	b.coders.ofEnc.HistogramFinished(ofMax, maxCount(ofH[:ofMax+1]))
-	b.coders.llEnc.HistogramFinished(llMax, maxCount(llH[:llMax+1]))
+	b.coders.mlEnc.HistogramFinished(mlMax, int(slices.Max(mlH[:mlMax+1])))
+	b.coders.ofEnc.HistogramFinished(ofMax, int(slices.Max(ofH[:ofMax+1])))
+	b.coders.llEnc.HistogramFinished(llMax, int(slices.Max(llH[:llMax+1])))
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
index 2ad0207..55a3885 100644
--- a/vendor/github.com/klauspost/compress/zstd/bytebuf.go
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -7,7 +7,6 @@
 import (
 	"fmt"
 	"io"
-	"io/ioutil"
 )
 
 type byteBuffer interface {
@@ -55,7 +54,7 @@
 func (b *byteBuf) readByte() (byte, error) {
 	bb := *b
 	if len(bb) < 1 {
-		return 0, nil
+		return 0, io.ErrUnexpectedEOF
 	}
 	r := bb[0]
 	*b = bb[1:]
@@ -110,7 +109,7 @@
 }
 
 func (r *readerWrapper) readByte() (byte, error) {
-	n2, err := r.r.Read(r.tmp[:1])
+	n2, err := io.ReadFull(r.r, r.tmp[:1])
 	if err != nil {
 		if err == io.EOF {
 			err = io.ErrUnexpectedEOF
@@ -124,7 +123,7 @@
 }
 
 func (r *readerWrapper) skipN(n int64) error {
-	n2, err := io.CopyN(ioutil.Discard, r.r, n)
+	n2, err := io.CopyN(io.Discard, r.r, n)
 	if n2 != n {
 		err = io.ErrUnexpectedEOF
 	}
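
Reviewer note: the switch to `io.ReadFull` above matters because `io.Reader` is allowed to return `(0, nil)`; a bare `Read(tmp[:1])` would then hand back a stale byte. A minimal demonstration with a deliberately awkward reader:

```go
package main

import (
	"fmt"
	"io"
	"strings"
)

// sluggishReader returns (0, nil) on every other call, which is legal
// io.Reader behavior that a single one-byte Read would mishandle.
type sluggishReader struct {
	r    io.Reader
	skip bool
}

func (s *sluggishReader) Read(p []byte) (int, error) {
	s.skip = !s.skip
	if s.skip {
		return 0, nil
	}
	return s.r.Read(p)
}

func main() {
	var tmp [1]byte
	r := &sluggishReader{r: strings.NewReader("z")}
	n, err := io.ReadFull(r, tmp[:]) // retries until the byte arrives
	fmt.Println(n, err, string(tmp[:]))
}
```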
diff --git a/vendor/github.com/klauspost/compress/zstd/decodeheader.go b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
index 5022e71..6a5a298 100644
--- a/vendor/github.com/klauspost/compress/zstd/decodeheader.go
+++ b/vendor/github.com/klauspost/compress/zstd/decodeheader.go
@@ -4,7 +4,6 @@
 package zstd
 
 import (
-	"bytes"
 	"encoding/binary"
 	"errors"
 	"io"
@@ -96,42 +95,54 @@
 // If there isn't enough input, io.ErrUnexpectedEOF is returned.
 // The FirstBlock.OK will indicate if enough information was available to decode the first block header.
 func (h *Header) Decode(in []byte) error {
+	_, err := h.DecodeAndStrip(in)
+	return err
+}
+
+// DecodeAndStrip will decode the header from the beginning of the stream
+// and on success return the remaining bytes.
+// This will decode the frame header and the first block header if enough bytes are provided.
+// It is recommended to provide at least HeaderMaxSize bytes.
+// If the frame header cannot be read an error will be returned.
+// If there isn't enough input, io.ErrUnexpectedEOF is returned.
+// The FirstBlock.OK will indicate if enough information was available to decode the first block header.
+func (h *Header) DecodeAndStrip(in []byte) (remain []byte, err error) {
 	*h = Header{}
 	if len(in) < 4 {
-		return io.ErrUnexpectedEOF
+		return nil, io.ErrUnexpectedEOF
 	}
 	h.HeaderSize += 4
 	b, in := in[:4], in[4:]
-	if !bytes.Equal(b, frameMagic) {
-		if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
-			return ErrMagicMismatch
+	if string(b) != frameMagic {
+		if string(b[1:4]) != skippableFrameMagic || b[0]&0xf0 != 0x50 {
+			return nil, ErrMagicMismatch
 		}
 		if len(in) < 4 {
-			return io.ErrUnexpectedEOF
+			return nil, io.ErrUnexpectedEOF
 		}
 		h.HeaderSize += 4
 		h.Skippable = true
 		h.SkippableID = int(b[0] & 0xf)
 		h.SkippableSize = binary.LittleEndian.Uint32(in)
-		return nil
+		return in[4:], nil
 	}
 
 	// Read Window_Descriptor
 	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
 	if len(in) < 1 {
-		return io.ErrUnexpectedEOF
+		return nil, io.ErrUnexpectedEOF
 	}
 	fhd, in := in[0], in[1:]
 	h.HeaderSize++
 	h.SingleSegment = fhd&(1<<5) != 0
 	h.HasCheckSum = fhd&(1<<2) != 0
 	if fhd&(1<<3) != 0 {
-		return errors.New("reserved bit set on frame header")
+		return nil, errors.New("reserved bit set on frame header")
 	}
 
 	if !h.SingleSegment {
 		if len(in) < 1 {
-			return io.ErrUnexpectedEOF
+			return nil, io.ErrUnexpectedEOF
 		}
 		var wd byte
 		wd, in = in[0], in[1:]
@@ -149,11 +160,11 @@
 			size = 4
 		}
 		if len(in) < int(size) {
-			return io.ErrUnexpectedEOF
+			return nil, io.ErrUnexpectedEOF
 		}
 		b, in = in[:size], in[size:]
 		h.HeaderSize += int(size)
-		switch size {
+		switch len(b) {
 		case 1:
 			h.DictionaryID = uint32(b[0])
 		case 2:
@@ -179,11 +190,11 @@
 	if fcsSize > 0 {
 		h.HasFCS = true
 		if len(in) < fcsSize {
-			return io.ErrUnexpectedEOF
+			return nil, io.ErrUnexpectedEOF
 		}
 		b, in = in[:fcsSize], in[fcsSize:]
 		h.HeaderSize += int(fcsSize)
-		switch fcsSize {
+		switch len(b) {
 		case 1:
 			h.FrameContentSize = uint64(b[0])
 		case 2:
@@ -200,7 +211,7 @@
 
 	// Frame Header done, we will not fail from now on.
 	if len(in) < 3 {
-		return nil
+		return in, nil
 	}
 	tmp := in[:3]
 	bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16)
@@ -210,7 +221,7 @@
 	cSize := int(bh >> 3)
 	switch blockType {
 	case blockTypeReserved:
-		return nil
+		return in, nil
 	case blockTypeRLE:
 		h.FirstBlock.Compressed = true
 		h.FirstBlock.DecompressedSize = cSize
@@ -226,5 +237,25 @@
 	}
 
 	h.FirstBlock.OK = true
-	return nil
+	return in, nil
+}
+
+// AppendTo will append the encoded header to the dst slice.
+// There is no error checking performed on the header values.
+func (h *Header) AppendTo(dst []byte) ([]byte, error) {
+	if h.Skippable {
+		magic := [4]byte{0x50, 0x2a, 0x4d, 0x18}
+		magic[0] |= byte(h.SkippableID & 0xf)
+		dst = append(dst, magic[:]...)
+		f := h.SkippableSize
+		return append(dst, uint8(f), uint8(f>>8), uint8(f>>16), uint8(f>>24)), nil
+	}
+	f := frameHeader{
+		ContentSize:   h.FrameContentSize,
+		WindowSize:    uint32(h.WindowSize),
+		SingleSegment: h.SingleSegment,
+		Checksum:      h.HasCheckSum,
+		DictID:        h.DictionaryID,
+	}
+	return f.appendTo(dst), nil
 }
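
Reviewer note: a usage sketch for the new `DecodeAndStrip` API, assuming this updated vendored package is on the import path; it round-trips through the encoder so the frame bytes are real.

```go
package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	enc, _ := zstd.NewWriter(nil)
	frame := enc.EncodeAll([]byte("hello header"), nil)
	enc.Close()

	var h zstd.Header
	remain, err := h.DecodeAndStrip(frame)
	if err != nil {
		panic(err)
	}
	fmt.Println("checksum:", h.HasCheckSum, "single segment:", h.SingleSegment)
	fmt.Println("frame header bytes:", len(frame)-len(remain))
}
```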
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
index d212f47..ea2a193 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -5,7 +5,6 @@
 package zstd
 
 import (
-	"bytes"
 	"context"
 	"encoding/binary"
 	"io"
@@ -35,13 +34,13 @@
 		br           readerWrapper
 		enabled      bool
 		inFrame      bool
+		dstBuf       []byte
 	}
 
 	frame *frameDec
 
 	// Custom dictionaries.
-	// Always uses copies.
-	dicts map[uint32]dict
+	dicts map[uint32]*dict
 
 	// streamWg is the waitgroup for all streams
 	streamWg sync.WaitGroup
@@ -83,7 +82,7 @@
 // can run multiple concurrent stateless decodes. It is even possible to
 // use stateless decodes while a stream is being decoded.
 //
-// The Reset function can be used to initiate a new stream, which is will considerably
+// The Reset function can be used to initiate a new stream, which will considerably
 // reduce the allocations normally caused by NewReader.
 func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
 	initPredefined()
@@ -103,7 +102,7 @@
 	}
 
 	// Transfer option dicts.
-	d.dicts = make(map[uint32]dict, len(d.o.dicts))
+	d.dicts = make(map[uint32]*dict, len(d.o.dicts))
 	for _, dc := range d.o.dicts {
 		d.dicts[dc.id] = dc
 	}
@@ -124,7 +123,7 @@
 }
 
 // Read bytes from the decompressed stream into p.
-// Returns the number of bytes written and any error that occurred.
+// Returns the number of bytes read and any error that occurred.
 // When the stream is done, io.EOF will be returned.
 func (d *Decoder) Read(p []byte) (int, error) {
 	var n int
@@ -187,21 +186,23 @@
 	}
 
 	// If bytes buffer and < 5MB, do sync decoding anyway.
-	if bb, ok := r.(byter); ok && bb.Len() < 5<<20 {
+	if bb, ok := r.(byter); ok && bb.Len() < d.o.decodeBufsBelow && !d.o.limitToCap {
 		bb2 := bb
 		if debugDecoder {
 			println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
 		}
 		b := bb2.Bytes()
 		var dst []byte
-		if cap(d.current.b) > 0 {
-			dst = d.current.b
+		if cap(d.syncStream.dstBuf) > 0 {
+			dst = d.syncStream.dstBuf[:0]
 		}
 
-		dst, err := d.DecodeAll(b, dst[:0])
+		dst, err := d.DecodeAll(b, dst)
 		if err == nil {
 			err = io.EOF
 		}
+		// Save output buffer
+		d.syncStream.dstBuf = dst
 		d.current.b = dst
 		d.current.err = err
 		d.current.flushed = true
@@ -216,6 +217,7 @@
 	d.current.err = nil
 	d.current.flushed = false
 	d.current.d = nil
+	d.syncStream.dstBuf = nil
 
 	// Ensure no-one else is still running...
 	d.streamWg.Wait()
@@ -312,6 +314,7 @@
 	// Grab a block decoder and frame decoder.
 	block := <-d.decoders
 	frame := block.localFrame
+	initialSize := len(dst)
 	defer func() {
 		if debugDecoder {
 			printf("re-adding decoder: %p", block)
@@ -320,6 +323,7 @@
 		frame.bBuf = nil
 		if frame.history.decoders.br != nil {
 			frame.history.decoders.br.in = nil
+			frame.history.decoders.br.cursor = 0
 		}
 		d.decoders <- block
 	}()
@@ -337,15 +341,8 @@
 			}
 			return dst, err
 		}
-		if frame.DictionaryID != nil {
-			dict, ok := d.dicts[*frame.DictionaryID]
-			if !ok {
-				return nil, ErrUnknownDictionary
-			}
-			if debugDecoder {
-				println("setting dict", frame.DictionaryID)
-			}
-			frame.history.setDict(&dict)
+		if err = d.setDict(frame); err != nil {
+			return nil, err
 		}
 		if frame.WindowSize > d.o.maxWindowSize {
 			if debugDecoder {
@@ -354,7 +351,16 @@
 			return dst, ErrWindowSizeExceeded
 		}
 		if frame.FrameContentSize != fcsUnknown {
-			if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+			if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)-initialSize) {
+				if debugDecoder {
+					println("decoder size exceeded; fcs:", frame.FrameContentSize, "> mcs:", d.o.maxDecodedSize-uint64(len(dst)-initialSize), "len:", len(dst))
+				}
+				return dst, ErrDecoderSizeExceeded
+			}
+			if d.o.limitToCap && frame.FrameContentSize > uint64(cap(dst)-len(dst)) {
+				if debugDecoder {
+					println("decoder size exceeded; fcs:", frame.FrameContentSize, "> (cap-len)", cap(dst)-len(dst))
+				}
 				return dst, ErrDecoderSizeExceeded
 			}
 			if cap(dst)-len(dst) < int(frame.FrameContentSize) {
@@ -364,7 +370,7 @@
 			}
 		}
 
-		if cap(dst) == 0 {
+		if cap(dst) == 0 && !d.o.limitToCap {
 			// Allocate len(input) * 2 by default if nothing is provided
 			// and we didn't get frame content size.
 			size := len(input) * 2
@@ -382,6 +388,9 @@
 		if err != nil {
 			return dst, err
 		}
+		if uint64(len(dst)-initialSize) > d.o.maxDecodedSize {
+			return dst, ErrDecoderSizeExceeded
+		}
 		if len(frame.bBuf) == 0 {
 			if debugDecoder {
 				println("frame dbuf empty")
@@ -442,26 +451,23 @@
 		println("got", len(d.current.b), "bytes, error:", d.current.err, "data crc:", tmp)
 	}
 
-	if !d.o.ignoreChecksum && len(next.b) > 0 {
-		n, err := d.current.crc.Write(next.b)
-		if err == nil {
-			if n != len(next.b) {
-				d.current.err = io.ErrShortWrite
-			}
-		}
+	if d.o.ignoreChecksum {
+		return true
 	}
-	if next.err == nil && next.d != nil && len(next.d.checkCRC) != 0 {
-		got := d.current.crc.Sum64()
-		var tmp [4]byte
-		binary.LittleEndian.PutUint32(tmp[:], uint32(got))
-		if !d.o.ignoreChecksum && !bytes.Equal(tmp[:], next.d.checkCRC) {
+
+	if len(next.b) > 0 {
+		d.current.crc.Write(next.b)
+	}
+	if next.err == nil && next.d != nil && next.d.hasCRC {
+		got := uint32(d.current.crc.Sum64())
+		if got != next.d.checkCRC {
 			if debugDecoder {
-				println("CRC Check Failed:", tmp[:], " (got) !=", next.d.checkCRC, "(on stream)")
+				printf("CRC Check Failed: %08x (got) != %08x (on stream)\n", got, next.d.checkCRC)
 			}
 			d.current.err = ErrCRCMismatch
 		} else {
 			if debugDecoder {
-				println("CRC ok", tmp[:])
+				printf("CRC ok %08x\n", got)
 			}
 		}
 	}
@@ -477,18 +483,12 @@
 		if !d.syncStream.inFrame {
 			d.frame.history.reset()
 			d.current.err = d.frame.reset(&d.syncStream.br)
+			if d.current.err == nil {
+				d.current.err = d.setDict(d.frame)
+			}
 			if d.current.err != nil {
 				return false
 			}
-			if d.frame.DictionaryID != nil {
-				dict, ok := d.dicts[*d.frame.DictionaryID]
-				if !ok {
-					d.current.err = ErrUnknownDictionary
-					return false
-				} else {
-					d.frame.history.setDict(&dict)
-				}
-			}
 			if d.frame.WindowSize > d.o.maxDecodedSize || d.frame.WindowSize > d.o.maxWindowSize {
 				d.current.err = ErrDecoderSizeExceeded
 				return false
@@ -667,6 +667,7 @@
 				if debugDecoder {
 					println("Async 1: new history, recent:", block.async.newHist.recentOffsets)
 				}
+				hist.reset()
 				hist.decoders = block.async.newHist.decoders
 				hist.recentOffsets = block.async.newHist.recentOffsets
 				hist.windowSize = block.async.newHist.windowSize
@@ -698,6 +699,7 @@
 			seqExecute <- block
 		}
 		close(seqExecute)
+		hist.reset()
 	}()
 
 	var wg sync.WaitGroup
@@ -721,6 +723,7 @@
 				if debugDecoder {
 					println("Async 2: new history")
 				}
+				hist.reset()
 				hist.windowSize = block.async.newHist.windowSize
 				hist.allocFrameBuffer = block.async.newHist.allocFrameBuffer
 				if block.async.newHist.dict != nil {
@@ -750,7 +753,7 @@
 					if block.lowMem {
 						block.dst = make([]byte, block.RLESize)
 					} else {
-						block.dst = make([]byte, maxBlockSize)
+						block.dst = make([]byte, maxCompressedBlockSize)
 					}
 				}
 				block.dst = block.dst[:block.RLESize]
@@ -802,13 +805,14 @@
 		if debugDecoder {
 			println("decoder goroutines finished")
 		}
+		hist.reset()
 	}()
 
+	var hist history
 decodeStream:
 	for {
-		var hist history
 		var hasErr bool
-
+		hist.reset()
 		decodeBlock := func(block *blockDec) {
 			if hasErr {
 				if block != nil {
@@ -843,15 +847,14 @@
 		if debugDecoder && err != nil {
 			println("Frame decoder returned", err)
 		}
-		if err == nil && frame.DictionaryID != nil {
-			dict, ok := d.dicts[*frame.DictionaryID]
-			if !ok {
-				err = ErrUnknownDictionary
-			} else {
-				frame.history.setDict(&dict)
-			}
+		if err == nil {
+			err = d.setDict(frame)
 		}
 		if err == nil && d.frame.WindowSize > d.o.maxWindowSize {
+			if debugDecoder {
+				println("decoder size exceeded, fws:", d.frame.WindowSize, "> mws:", d.o.maxWindowSize)
+			}
+
 			err = ErrDecoderSizeExceeded
 		}
 		if err != nil {
@@ -893,18 +896,22 @@
 				println("next block returned error:", err)
 			}
 			dec.err = err
-			dec.checkCRC = nil
+			dec.hasCRC = false
 			if dec.Last && frame.HasCheckSum && err == nil {
 				crc, err := frame.rawInput.readSmall(4)
-				if err != nil {
+				if len(crc) < 4 {
+					if err == nil {
+						err = io.ErrUnexpectedEOF
+					}
 					println("CRC missing?", err)
 					dec.err = err
-				}
-				var tmp [4]byte
-				copy(tmp[:], crc)
-				dec.checkCRC = tmp[:]
-				if debugDecoder {
-					println("found crc to check:", dec.checkCRC)
+				} else {
+					dec.checkCRC = binary.LittleEndian.Uint32(crc)
+					dec.hasCRC = true
+					if debugDecoder {
+						printf("found crc to check: %08x\n", dec.checkCRC)
+					}
 				}
 			}
 			err = dec.err
@@ -920,5 +927,23 @@
 	}
 	close(seqDecode)
 	wg.Wait()
+	hist.reset()
 	d.frame.history.b = frameHistCache
 }
+
+func (d *Decoder) setDict(frame *frameDec) (err error) {
+	dict, ok := d.dicts[frame.DictionaryID]
+	if ok {
+		if debugDecoder {
+			println("setting dict", frame.DictionaryID)
+		}
+		frame.history.setDict(dict)
+	} else if frame.DictionaryID != 0 {
+		// A zero or missing dictionary id is ambiguous:
+		// either dictionary zero, or no dictionary. In particular,
+		// zstd --patch-from uses this id for the source file,
+		// so only return an error if the dictionary id is not zero.
+		err = ErrUnknownDictionary
+	}
+	return err
+}
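
Reviewer note: per the comment above, dictionary id 0 doubles as "no dictionary", which is exactly how `zstd --patch-from` tags its frames; a sketch of registering a raw source under that id with the new `WithDecoderDictRaw` option (it illustrates the registration only, not a full patch round trip).

```go
package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	src := []byte("original file contents used as the patch source")

	// Frames produced by `zstd --patch-from=src` carry dict id 0,
	// so the raw source is registered under that id.
	dec, err := zstd.NewReader(nil, zstd.WithDecoderDictRaw(0, src))
	if err != nil {
		panic(err)
	}
	defer dec.Close()
	fmt.Println("decoder ready for patch-from frames")
}
```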
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
index c70e6fa..774c5f0 100644
--- a/vendor/github.com/klauspost/compress/zstd/decoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -6,6 +6,8 @@
 
 import (
 	"errors"
+	"fmt"
+	"math/bits"
 	"runtime"
 )
 
@@ -14,20 +16,23 @@
 
 // options retains accumulated state of multiple options.
 type decoderOptions struct {
-	lowMem         bool
-	concurrent     int
-	maxDecodedSize uint64
-	maxWindowSize  uint64
-	dicts          []dict
-	ignoreChecksum bool
+	lowMem          bool
+	concurrent      int
+	maxDecodedSize  uint64
+	maxWindowSize   uint64
+	dicts           []*dict
+	ignoreChecksum  bool
+	limitToCap      bool
+	decodeBufsBelow int
 }
 
 func (o *decoderOptions) setDefault() {
 	*o = decoderOptions{
 		// use less ram: true for now, but may change.
-		lowMem:        true,
-		concurrent:    runtime.GOMAXPROCS(0),
-		maxWindowSize: MaxWindowSize,
+		lowMem:          true,
+		concurrent:      runtime.GOMAXPROCS(0),
+		maxWindowSize:   MaxWindowSize,
+		decodeBufsBelow: 128 << 10,
 	}
 	if o.concurrent > 4 {
 		o.concurrent = 4
@@ -82,7 +87,13 @@
 }
 
 // WithDecoderDicts allows to register one or more dictionaries for the decoder.
-// If several dictionaries with the same ID is provided the last one will be used.
+//
+// Each slice in dict must be in the [dictionary format] produced by
+// "zstd --train" from the Zstandard reference implementation.
+//
+// If several dictionaries with the same ID are provided, the last one will be used.
+//
+// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
 func WithDecoderDicts(dicts ...[]byte) DOption {
 	return func(o *decoderOptions) error {
 		for _, b := range dicts {
@@ -90,12 +101,24 @@
 			if err != nil {
 				return err
 			}
-			o.dicts = append(o.dicts, *d)
+			o.dicts = append(o.dicts, d)
 		}
 		return nil
 	}
 }
 
+// WithDecoderDictRaw registers a dictionary that may be used by the decoder.
+// The slice content can be arbitrary data.
+func WithDecoderDictRaw(id uint32, content []byte) DOption {
+	return func(o *decoderOptions) error {
+		if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
+			return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
+		}
+		o.dicts = append(o.dicts, &dict{id: id, content: content, offsets: [3]int{1, 4, 8}})
+		return nil
+	}
+}
+
 // WithDecoderMaxWindow allows to set a maximum window size for decodes.
 // This allows rejecting packets that will cause big memory usage.
 // The Decoder will likely allocate more memory based on the WithDecoderLowmem setting.
@@ -114,6 +137,29 @@
 	}
 }
 
+// WithDecodeAllCapLimit will limit DecodeAll to decoding cap(dst)-len(dst) bytes,
+// or any size set in WithDecoderMaxMemory.
+// This can be used to limit decoding to a specific maximum output size.
+// Disabled by default.
+func WithDecodeAllCapLimit(b bool) DOption {
+	return func(o *decoderOptions) error {
+		o.limitToCap = b
+		return nil
+	}
+}
+
+// WithDecodeBuffersBelow will fully decode readers that have a
+// `Bytes() []byte` and `Len() int` interface similar to bytes.Buffer.
+// This typically uses fewer allocations but will hold the full decompressed object in memory.
+// Note that this is disabled by WithDecodeAllCapLimit, or by passing a size of 0 or less.
+// Default is 128KiB.
+func WithDecodeBuffersBelow(size int) DOption {
+	return func(o *decoderOptions) error {
+		o.decodeBufsBelow = size
+		return nil
+	}
+}
+
 // IgnoreChecksum allows to forcibly ignore checksum checking.
 func IgnoreChecksum(b bool) DOption {
 	return func(o *decoderOptions) error {
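
Reviewer note: a sketch of the new `WithDecodeAllCapLimit` option, assuming this vendored version; the capacity of `dst` becomes the output budget, so a frame whose content size exceeds it fails up front instead of allocating.

```go
package main

import (
	"errors"
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	enc, _ := zstd.NewWriter(nil)
	comp := enc.EncodeAll(make([]byte, 1<<20), nil) // 1 MiB of zeros
	enc.Close()

	dec, _ := zstd.NewReader(nil, zstd.WithDecodeAllCapLimit(true))
	defer dec.Close()

	// 64 KiB of capacity cannot hold 1 MiB of output.
	_, err := dec.DecodeAll(comp, make([]byte, 0, 64<<10))
	fmt.Println(errors.Is(err, zstd.ErrDecoderSizeExceeded)) // true
}
```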
diff --git a/vendor/github.com/klauspost/compress/zstd/dict.go b/vendor/github.com/klauspost/compress/zstd/dict.go
index a36ae83..b7b8316 100644
--- a/vendor/github.com/klauspost/compress/zstd/dict.go
+++ b/vendor/github.com/klauspost/compress/zstd/dict.go
@@ -6,6 +6,8 @@
 	"errors"
 	"fmt"
 	"io"
+	"math"
+	"sort"
 
 	"github.com/klauspost/compress/huff0"
 )
@@ -15,12 +17,14 @@
 
 	litEnc              *huff0.Scratch
 	llDec, ofDec, mlDec sequenceDec
-	//llEnc, ofEnc, mlEnc []*fseEncoder
-	offsets [3]int
-	content []byte
+	offsets             [3]int
+	content             []byte
 }
 
-var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}
+const dictMagic = "\x37\xa4\x30\xec"
+
+// Maximum dictionary size for the reference implementation (1.5.3) is 2 GiB.
+const dictMaxLength = 1 << 31
 
 // ID returns the dictionary id or 0 if d is nil.
 func (d *dict) ID() uint32 {
@@ -30,14 +34,38 @@
 	return d.id
 }
 
-// DictContentSize returns the dictionary content size or 0 if d is nil.
-func (d *dict) DictContentSize() int {
+// ContentSize returns the dictionary content size or 0 if d is nil.
+func (d *dict) ContentSize() int {
 	if d == nil {
 		return 0
 	}
 	return len(d.content)
 }
 
+// Content returns the dictionary content.
+func (d *dict) Content() []byte {
+	if d == nil {
+		return nil
+	}
+	return d.content
+}
+
+// Offsets returns the initial offsets.
+func (d *dict) Offsets() [3]int {
+	if d == nil {
+		return [3]int{}
+	}
+	return d.offsets
+}
+
+// LitEncoder returns the literal encoder.
+func (d *dict) LitEncoder() *huff0.Scratch {
+	if d == nil {
+		return nil
+	}
+	return d.litEnc
+}
+
 // Load a dictionary as described in
 // https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
 func loadDict(b []byte) (*dict, error) {
@@ -50,7 +78,7 @@
 		ofDec: sequenceDec{fse: &fseDecoder{}},
 		mlDec: sequenceDec{fse: &fseDecoder{}},
 	}
-	if !bytes.Equal(b[:4], dictMagic[:]) {
+	if string(b[:4]) != dictMagic {
 		return nil, ErrMagicMismatch
 	}
 	d.id = binary.LittleEndian.Uint32(b[4:8])
@@ -62,7 +90,7 @@
 	var err error
 	d.litEnc, b, err = huff0.ReadTable(b[8:], nil)
 	if err != nil {
-		return nil, err
+		return nil, fmt.Errorf("loading literal table: %w", err)
 	}
 	d.litEnc.Reuse = huff0.ReusePolicyMust
 
@@ -120,3 +148,418 @@
 
 	return &d, nil
 }
+
+// InspectDictionary loads a zstd dictionary and provides functions to inspect the content.
+func InspectDictionary(b []byte) (interface {
+	ID() uint32
+	ContentSize() int
+	Content() []byte
+	Offsets() [3]int
+	LitEncoder() *huff0.Scratch
+}, error) {
+	initPredefined()
+	d, err := loadDict(b)
+	return d, err
+}
+
+type BuildDictOptions struct {
+	// Dictionary ID.
+	ID uint32
+
+	// Content to use to create dictionary tables.
+	Contents [][]byte
+
+	// History to use for all blocks.
+	History []byte
+
+	// Offsets to use.
+	Offsets [3]int
+
+	// CompatV155 will make the dictionary compatible with Zstd v1.5.5 and earlier.
+	// See https://github.com/facebook/zstd/issues/3724
+	CompatV155 bool
+
+	// Use the specified encoder level.
+	// The dictionary will be built using the specified encoder level,
+	// which will reflect speed and make the dictionary tailored for that level.
+	// If not set SpeedBestCompression will be used.
+	Level EncoderLevel
+
+	// DebugOut will write stats and other details here if set.
+	DebugOut io.Writer
+}
+
+func BuildDict(o BuildDictOptions) ([]byte, error) {
+	initPredefined()
+	hist := o.History
+	contents := o.Contents
+	debug := o.DebugOut != nil
+	println := func(args ...interface{}) {
+		if o.DebugOut != nil {
+			fmt.Fprintln(o.DebugOut, args...)
+		}
+	}
+	printf := func(s string, args ...interface{}) {
+		if o.DebugOut != nil {
+			fmt.Fprintf(o.DebugOut, s, args...)
+		}
+	}
+	print := func(args ...interface{}) {
+		if o.DebugOut != nil {
+			fmt.Fprint(o.DebugOut, args...)
+		}
+	}
+
+	if int64(len(hist)) > dictMaxLength {
+		return nil, fmt.Errorf("dictionary of size %d > %d", len(hist), int64(dictMaxLength))
+	}
+	if len(hist) < 8 {
+		return nil, fmt.Errorf("dictionary of size %d < %d", len(hist), 8)
+	}
+	if len(contents) == 0 {
+		return nil, errors.New("no content provided")
+	}
+	d := dict{
+		id:      o.ID,
+		litEnc:  nil,
+		llDec:   sequenceDec{},
+		ofDec:   sequenceDec{},
+		mlDec:   sequenceDec{},
+		offsets: o.Offsets,
+		content: hist,
+	}
+	block := blockEnc{lowMem: false}
+	block.init()
+	enc := encoder(&bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(maxMatchLen), bufferReset: math.MaxInt32 - int32(maxMatchLen*2), lowMem: false}})
+	if o.Level != 0 {
+		eOpts := encoderOptions{
+			level:      o.Level,
+			blockSize:  maxMatchLen,
+			windowSize: maxMatchLen,
+			dict:       &d,
+			lowMem:     false,
+		}
+		enc = eOpts.encoder()
+	} else {
+		o.Level = SpeedBestCompression
+	}
+	var (
+		remain [256]int
+		ll     [256]int
+		ml     [256]int
+		of     [256]int
+	)
+	addValues := func(dst *[256]int, src []byte) {
+		for _, v := range src {
+			dst[v]++
+		}
+	}
+	addHist := func(dst *[256]int, src *[256]uint32) {
+		for i, v := range src {
+			dst[i] += int(v)
+		}
+	}
+	seqs := 0
+	nUsed := 0
+	litTotal := 0
+	newOffsets := make(map[uint32]int, 1000)
+	for _, b := range contents {
+		block.reset(nil)
+		if len(b) < 8 {
+			continue
+		}
+		nUsed++
+		enc.Reset(&d, true)
+		enc.Encode(&block, b)
+		addValues(&remain, block.literals)
+		litTotal += len(block.literals)
+		if len(block.sequences) == 0 {
+			continue
+		}
+		seqs += len(block.sequences)
+		block.genCodes()
+		addHist(&ll, block.coders.llEnc.Histogram())
+		addHist(&ml, block.coders.mlEnc.Histogram())
+		addHist(&of, block.coders.ofEnc.Histogram())
+		for i, seq := range block.sequences {
+			if i > 3 {
+				break
+			}
+			offset := seq.offset
+			if offset == 0 {
+				continue
+			}
+			if int(offset) >= len(o.History) {
+				continue
+			}
+			if offset > 3 {
+				newOffsets[offset-3]++
+			} else {
+				newOffsets[uint32(o.Offsets[offset-1])]++
+			}
+		}
+	}
+	// Find most used offsets.
+	var sortedOffsets []uint32
+	for k := range newOffsets {
+		sortedOffsets = append(sortedOffsets, k)
+	}
+	sort.Slice(sortedOffsets, func(i, j int) bool {
+		a, b := sortedOffsets[i], sortedOffsets[j]
+		if newOffsets[a] == newOffsets[b] {
+			// Prefer the longer offset on equal counts.
+			return a > b
+		}
+		return newOffsets[a] > newOffsets[b]
+	})
+	if len(sortedOffsets) > 3 {
+		if debug {
+			print("Offsets:")
+			for i, v := range sortedOffsets {
+				if i > 20 {
+					break
+				}
+				printf("[%d: %d],", v, newOffsets[v])
+			}
+			println("")
+		}
+
+		sortedOffsets = sortedOffsets[:3]
+	}
+	for i, v := range sortedOffsets {
+		o.Offsets[i] = int(v)
+	}
+	if debug {
+		println("New repeat offsets", o.Offsets)
+	}
+
+	if nUsed == 0 || seqs == 0 {
+		return nil, fmt.Errorf("%d blocks, %d sequences found", nUsed, seqs)
+	}
+	if debug {
+		println("Sequences:", seqs, "Blocks:", nUsed, "Literals:", litTotal)
+	}
+	if seqs/nUsed < 512 {
+		// Use 512 as minimum.
+		nUsed = seqs / 512
+		if nUsed == 0 {
+			nUsed = 1
+		}
+	}
+	copyHist := func(dst *fseEncoder, src *[256]int) ([]byte, error) {
+		hist := dst.Histogram()
+		var maxSym uint8
+		var maxCount int
+		var fakeLength int
+		for i, v := range src {
+			if v > 0 {
+				v = v / nUsed
+				if v == 0 {
+					v = 1
+				}
+			}
+			if v > maxCount {
+				maxCount = v
+			}
+			if v != 0 {
+				maxSym = uint8(i)
+			}
+			fakeLength += v
+			hist[i] = uint32(v)
+		}
+
+		// Ensure we aren't trying to represent RLE.
+		if maxCount == fakeLength {
+			for i := range hist {
+				if uint8(i) == maxSym {
+					fakeLength++
+					maxSym++
+					hist[i+1] = 1
+					if maxSym > 1 {
+						break
+					}
+				}
+				if hist[0] == 0 {
+					fakeLength++
+					hist[i] = 1
+					if maxSym > 1 {
+						break
+					}
+				}
+			}
+		}
+
+		dst.HistogramFinished(maxSym, maxCount)
+		dst.reUsed = false
+		dst.useRLE = false
+		err := dst.normalizeCount(fakeLength)
+		if err != nil {
+			return nil, err
+		}
+		if debug {
+			println("RAW:", dst.count[:maxSym+1], "NORM:", dst.norm[:maxSym+1], "LEN:", fakeLength)
+		}
+		return dst.writeCount(nil)
+	}
+	if debug {
+		print("Literal lengths: ")
+	}
+	llTable, err := copyHist(block.coders.llEnc, &ll)
+	if err != nil {
+		return nil, err
+	}
+	if debug {
+		print("Match lengths: ")
+	}
+	mlTable, err := copyHist(block.coders.mlEnc, &ml)
+	if err != nil {
+		return nil, err
+	}
+	if debug {
+		print("Offsets: ")
+	}
+	ofTable, err := copyHist(block.coders.ofEnc, &of)
+	if err != nil {
+		return nil, err
+	}
+
+	// Literal table
+	avgSize := litTotal
+	if avgSize > huff0.BlockSizeMax/2 {
+		avgSize = huff0.BlockSizeMax / 2
+	}
+	huffBuff := make([]byte, 0, avgSize)
+	// Target size
+	div := litTotal / avgSize
+	if div < 1 {
+		div = 1
+	}
+	if debug {
+		println("Huffman weights:")
+	}
+	for i, n := range remain[:] {
+		if n > 0 {
+			n = n / div
+			// Allow all entries to be represented.
+			if n == 0 {
+				n = 1
+			}
+			huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
+			if debug {
+				printf("[%d: %d], ", i, n)
+			}
+		}
+	}
+	if o.CompatV155 && remain[255]/div == 0 {
+		huffBuff = append(huffBuff, 255)
+	}
+	scratch := &huff0.Scratch{TableLog: 11}
+	for tries := 0; tries < 255; tries++ {
+		scratch = &huff0.Scratch{TableLog: 11}
+		_, _, err = huff0.Compress1X(huffBuff, scratch)
+		if err == nil {
+			break
+		}
+		if debug {
+			printf("Try %d: Huffman error: %v\n", tries+1, err)
+		}
+		huffBuff = huffBuff[:0]
+		if tries == 250 {
+			if debug {
+				println("Huffman: Bailing out with predefined table")
+			}
+
+			// Bail out.... Just generate something
+			huffBuff = append(huffBuff, bytes.Repeat([]byte{255}, 10000)...)
+			for i := 0; i < 128; i++ {
+				huffBuff = append(huffBuff, byte(i))
+			}
+			continue
+		}
+		if errors.Is(err, huff0.ErrIncompressible) {
+			// Try truncating least common.
+			for i, n := range remain[:] {
+				if n > 0 {
+					n = n / (div * (i + 1))
+					if n > 0 {
+						huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
+					}
+				}
+			}
+			if o.CompatV155 && len(huffBuff) > 0 && huffBuff[len(huffBuff)-1] != 255 {
+				huffBuff = append(huffBuff, 255)
+			}
+			if len(huffBuff) == 0 {
+				huffBuff = append(huffBuff, 0, 255)
+			}
+		}
+		if errors.Is(err, huff0.ErrUseRLE) {
+			for i, n := range remain[:] {
+				n = n / (div * (i + 1))
+				// Allow all entries to be represented.
+				if n == 0 {
+					n = 1
+				}
+				huffBuff = append(huffBuff, bytes.Repeat([]byte{byte(i)}, n)...)
+			}
+		}
+	}
+
+	var out bytes.Buffer
+	out.Write([]byte(dictMagic))
+	out.Write(binary.LittleEndian.AppendUint32(nil, o.ID))
+	out.Write(scratch.OutTable)
+	if debug {
+		println("huff table:", len(scratch.OutTable), "bytes")
+		println("of table:", len(ofTable), "bytes")
+		println("ml table:", len(mlTable), "bytes")
+		println("ll table:", len(llTable), "bytes")
+	}
+	out.Write(ofTable)
+	out.Write(mlTable)
+	out.Write(llTable)
+	out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[0])))
+	out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[1])))
+	out.Write(binary.LittleEndian.AppendUint32(nil, uint32(o.Offsets[2])))
+	out.Write(hist)
+	if debug {
+		_, err := loadDict(out.Bytes())
+		if err != nil {
+			panic(err)
+		}
+		i, err := InspectDictionary(out.Bytes())
+		if err != nil {
+			panic(err)
+		}
+		println("ID:", i.ID())
+		println("Content size:", i.ContentSize())
+		println("Encoder:", i.LitEncoder() != nil)
+		println("Offsets:", i.Offsets())
+		var totalSize int
+		for _, b := range contents {
+			totalSize += len(b)
+		}
+
+		encWith := func(opts ...EOption) int {
+			enc, err := NewWriter(nil, opts...)
+			if err != nil {
+				panic(err)
+			}
+			defer enc.Close()
+			var dst []byte
+			var totalSize int
+			for _, b := range contents {
+				dst = enc.EncodeAll(b, dst[:0])
+				totalSize += len(dst)
+			}
+			return totalSize
+		}
+		plain := encWith(WithEncoderLevel(o.Level))
+		withDict := encWith(WithEncoderLevel(o.Level), WithEncoderDict(out.Bytes()))
+		println("Input size:", totalSize)
+		println("Plain Compressed:", plain)
+		println("Dict Compressed:", withDict)
+		println("Saved:", plain-withDict, (plain-withDict)/len(contents), "bytes per input (rounded down)")
+	}
+	return out.Bytes(), nil
+}
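
Reviewer note: a call-shape sketch for the new `BuildDict`; the training set here is toy-sized and only illustrates the option wiring ({1, 4, 8} are the standard initial repeat offsets, matching `WithDecoderDictRaw` above), so real use would feed many representative samples.

```go
package main

import (
	"fmt"
	"os"

	"github.com/klauspost/compress/zstd"
)

func main() {
	// Toy training samples sharing structure.
	var contents [][]byte
	for i := 0; i < 64; i++ {
		contents = append(contents, []byte(fmt.Sprintf(`{"id":%d,"status":"ok","payload":"abcdefgh"}`, i)))
	}
	dict, err := zstd.BuildDict(zstd.BuildDictOptions{
		ID:       0x1234,
		Contents: contents,
		History:  []byte(`{"id":0,"status":"ok","payload":"abcdefgh"}`),
		Offsets:  [3]int{1, 4, 8},
		DebugOut: os.Stderr, // print stats while building
	})
	if err != nil {
		panic(err)
	}
	fmt.Println("dictionary bytes:", len(dict))
}
```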
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_base.go b/vendor/github.com/klauspost/compress/zstd/enc_base.go
index 15ae8ee..7d250c6 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_base.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_base.go
@@ -16,6 +16,7 @@
 	cur int32
 	// maximum offset. Should be at least 2x block size.
 	maxMatchOff int32
+	bufferReset int32
 	hist        []byte
 	crc         *xxhash.Digest
 	tmp         [8]byte
@@ -56,8 +57,8 @@
 }
 
 func (e *fastBase) addBlock(src []byte) int32 {
-	if debugAsserts && e.cur > bufferReset {
-		panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset))
+	if debugAsserts && e.cur > e.bufferReset {
+		panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, e.bufferReset))
 	}
 	// check if we have space already
 	if len(e.hist)+len(src) > cap(e.hist) {
@@ -115,7 +116,7 @@
 			panic(err)
 		}
 		if t < 0 {
-			err := fmt.Sprintf("s (%d) < 0", s)
+			err := fmt.Sprintf("t (%d) < 0", t)
 			panic(err)
 		}
 		if s-t > e.maxMatchOff {
@@ -126,24 +127,7 @@
 			panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
 		}
 	}
-	a := src[s:]
-	b := src[t:]
-	b = b[:len(a)]
-	end := int32((len(a) >> 3) << 3)
-	for i := int32(0); i < end; i += 8 {
-		if diff := load6432(a, i) ^ load6432(b, i); diff != 0 {
-			return i + int32(bits.TrailingZeros64(diff)>>3)
-		}
-	}
-
-	a = a[end:]
-	b = b[end:]
-	for i := range a {
-		if a[i] != b[i] {
-			return int32(i) + end
-		}
-	}
-	return int32(len(a)) + end
+	return int32(matchLen(src[s:], src[t:]))
 }
 
 // Reset the encoding table.
@@ -160,18 +144,19 @@
 	} else {
 		e.crc.Reset()
 	}
+	e.blk.dictLitEnc = nil
 	if d != nil {
 		low := e.lowMem
 		if singleBlock {
 			e.lowMem = true
 		}
-		e.ensureHist(d.DictContentSize() + maxCompressedBlockSize)
+		e.ensureHist(d.ContentSize() + maxCompressedBlockSize)
 		e.lowMem = low
 	}
 
 	// We offset current position so everything will be out of reach.
 	// If above reset line, history will be purged.
-	if e.cur < bufferReset {
+	if e.cur < e.bufferReset {
 		e.cur += e.maxMatchOff + int32(len(e.hist))
 	}
 	e.hist = e.hist[:0]
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_best.go b/vendor/github.com/klauspost/compress/zstd/enc_best.go
index 96028ec..4613724 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_best.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_best.go
@@ -34,7 +34,7 @@
 	est    int32
 }
 
-const highScore = 25000
+const highScore = maxMatchLen * 8
 
 // estBits will estimate output bits from predefined tables.
 func (m *match) estBits(bitsPerByte int32) {
@@ -43,7 +43,7 @@
 	if m.rep < 0 {
 		ofc = ofCode(uint32(m.s-m.offset) + 3)
 	} else {
-		ofc = ofCode(uint32(m.rep))
+		ofc = ofCode(uint32(m.rep) & 3)
 	}
 	// Cost, excluding
 	ofTT, mlTT := fsePredefEnc[tableOffsets].ct.symbolTT[ofc], fsePredefEnc[tableMatchLengths].ct.symbolTT[mlc]
@@ -84,14 +84,10 @@
 	)
 
 	// Protect against e.cur wraparound.
-	for e.cur >= bufferReset {
+	for e.cur >= e.bufferReset-int32(len(e.hist)) {
 		if len(e.hist) == 0 {
-			for i := range e.table[:] {
-				e.table[i] = prevEntry{}
-			}
-			for i := range e.longTable[:] {
-				e.longTable[i] = prevEntry{}
-			}
+			e.table = [bestShortTableSize]prevEntry{}
+			e.longTable = [bestLongTableSize]prevEntry{}
 			e.cur = e.maxMatchOff
 			break
 		}
@@ -139,8 +135,20 @@
 		break
 	}
 
+	// Add block to history
 	s := e.addBlock(src)
 	blk.size = len(src)
+
+	// Check RLE first
+	if len(src) > zstdMinMatch {
+		ml := matchLen(src[1:], src)
+		if ml == len(src)-1 {
+			blk.literals = append(blk.literals, src[0])
+			blk.sequences = append(blk.sequences, seq{litLen: 1, matchLen: uint32(len(src)-1) - zstdMinMatch, offset: 1 + 3})
+			return
+		}
+	}
+
 	if len(src) < minNonLiteralBlockSize {
 		blk.extraLits = len(src)
 		blk.literals = blk.literals[:len(src)]
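
Reviewer note: the new RLE check above leans on a neat property: if `src` shifted by one byte still matches itself for `len(src)-1` bytes, every byte equals `src[0]`. A standalone sketch (`isRLE` is local to the example):

```go
package main

import (
	"bytes"
	"fmt"
)

// isRLE reports whether src is a run of one repeated byte, using the
// same shift-by-one comparison as the encoder's fast path.
func isRLE(src []byte) bool {
	return len(src) > 1 && bytes.Equal(src[1:], src[:len(src)-1])
}

func main() {
	fmt.Println(isRLE(bytes.Repeat([]byte{'x'}, 100))) // true
	fmt.Println(isRLE([]byte("xxxy")))                 // false
}
```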
@@ -163,7 +171,6 @@
 
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
-	cv := load6432(src, s)
 
 	// Relative offsets
 	offset1 := int32(blk.recentOffsets[0])
@@ -177,7 +184,6 @@
 		blk.literals = append(blk.literals, src[nextEmit:until]...)
 		s.litLen = uint32(until - nextEmit)
 	}
-	_ = addLiterals
 
 	if debugEncoder {
 		println("recent offsets:", blk.recentOffsets)
@@ -192,54 +198,104 @@
 			panic("offset0 was 0")
 		}
 
-		bestOf := func(a, b match) match {
-			if a.est+(a.s-b.s)*bitsPerByte>>10 < b.est+(b.s-a.s)*bitsPerByte>>10 {
-				return a
-			}
-			return b
-		}
-		const goodEnough = 100
+		const goodEnough = 250
+
+		cv := load6432(src, s)
 
 		nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
 		nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
 		candidateL := e.longTable[nextHashL]
 		candidateS := e.table[nextHashS]
 
-		matchAt := func(offset int32, s int32, first uint32, rep int32) match {
-			if s-offset >= e.maxMatchOff || load3232(src, offset) != first {
-				return match{s: s, est: highScore}
+		// Set m to a match at offset if it looks like that will improve compression.
+		improve := func(m *match, offset int32, s int32, first uint32, rep int32) {
+			delta := s - offset
+			if delta >= e.maxMatchOff || delta <= 0 || load3232(src, offset) != first {
+				return
 			}
-			if debugAsserts {
-				if !bytes.Equal(src[s:s+4], src[offset:offset+4]) {
-					panic(fmt.Sprintf("first match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
+			// Try to quick reject if we already have a long match.
+			if m.length > 16 {
+				left := len(src) - int(m.s+m.length)
+				// If we are too close to the end, keep as is.
+				if left <= 0 {
+					return
+				}
+				checkLen := m.length - (s - m.s) - 8
+				if left > 2 && checkLen > 4 {
+					// Check 4 bytes, 4 bytes from the end of the current match.
+					a := load3232(src, offset+checkLen)
+					b := load3232(src, s+checkLen)
+					if a != b {
+						return
+					}
 				}
 			}
-			m := match{offset: offset, s: s, length: 4 + e.matchlen(s+4, offset+4, src), rep: rep}
-			m.estBits(bitsPerByte)
-			return m
+			l := 4 + e.matchlen(s+4, offset+4, src)
+			if m.rep <= 0 {
+				// Extend candidate match backwards as far as possible.
+				// Do not extend repeats as we can assume they are optimal
+				// and offsets change if s == nextEmit.
+				tMin := s - e.maxMatchOff
+				if tMin < 0 {
+					tMin = 0
+				}
+				for offset > tMin && s > nextEmit && src[offset-1] == src[s-1] && l < maxMatchLength {
+					s--
+					offset--
+					l++
+				}
+			}
+			if debugAsserts {
+				if offset >= s {
+					panic(fmt.Sprintf("offset: %d - s:%d - rep: %d - cur :%d - max: %d", offset, s, rep, e.cur, e.maxMatchOff))
+				}
+				if !bytes.Equal(src[s:s+l], src[offset:offset+l]) {
+					panic(fmt.Sprintf("second match mismatch: %v != %v, first: %08x", src[s:s+4], src[offset:offset+4], first))
+				}
+			}
+			cand := match{offset: offset, s: s, length: l, rep: rep}
+			cand.estBits(bitsPerByte)
+			if m.est >= highScore || cand.est-m.est+(cand.s-m.s)*bitsPerByte>>10 < 0 {
+				*m = cand
+			}
 		}
 
-		best := bestOf(matchAt(candidateL.offset-e.cur, s, uint32(cv), -1), matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
-		best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
-		best = bestOf(best, matchAt(candidateS.prev-e.cur, s, uint32(cv), -1))
+		best := match{s: s, est: highScore}
+		improve(&best, candidateL.offset-e.cur, s, uint32(cv), -1)
+		improve(&best, candidateL.prev-e.cur, s, uint32(cv), -1)
+		improve(&best, candidateS.offset-e.cur, s, uint32(cv), -1)
+		improve(&best, candidateS.prev-e.cur, s, uint32(cv), -1)
 
 		if canRepeat && best.length < goodEnough {
-			cv32 := uint32(cv >> 8)
-			spp := s + 1
-			best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
-			best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
-			best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
-			if best.length > 0 {
-				cv32 = uint32(cv >> 24)
-				spp += 2
-				best = bestOf(best, matchAt(spp-offset1, spp, cv32, 1))
-				best = bestOf(best, matchAt(spp-offset2, spp, cv32, 2))
-				best = bestOf(best, matchAt(spp-offset3, spp, cv32, 3))
+			if s == nextEmit {
+				// Check repeats straight after a match.
+				improve(&best, s-offset2, s, uint32(cv), 1|4)
+				improve(&best, s-offset3, s, uint32(cv), 2|4)
+				if offset1 > 1 {
+					improve(&best, s-(offset1-1), s, uint32(cv), 3|4)
+				}
+			}
+
+			// If either no match or a non-repeat match, check at + 1
+			if best.rep <= 0 {
+				cv32 := uint32(cv >> 8)
+				spp := s + 1
+				improve(&best, spp-offset1, spp, cv32, 1)
+				improve(&best, spp-offset2, spp, cv32, 2)
+				improve(&best, spp-offset3, spp, cv32, 3)
+				if best.rep < 0 {
+					cv32 = uint32(cv >> 24)
+					spp += 2
+					improve(&best, spp-offset1, spp, cv32, 1)
+					improve(&best, spp-offset2, spp, cv32, 2)
+					improve(&best, spp-offset3, spp, cv32, 3)
+				}
 			}
 		}
 		// Load next and check...
 		e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: candidateL.offset}
 		e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: candidateS.offset}
+		index0 := s + 1
 
 		// Look far ahead, unless we have a really long match already...
 		if best.length < goodEnough {
@@ -249,97 +305,89 @@
 				if s >= sLimit {
 					break encodeLoop
 				}
-				cv = load6432(src, s)
 				continue
 			}
 
-			s++
 			candidateS = e.table[hashLen(cv>>8, bestShortTableBits, bestShortLen)]
-			cv = load6432(src, s)
-			cv2 := load6432(src, s+1)
+			cv = load6432(src, s+1)
+			cv2 := load6432(src, s+2)
 			candidateL = e.longTable[hashLen(cv, bestLongTableBits, bestLongLen)]
 			candidateL2 := e.longTable[hashLen(cv2, bestLongTableBits, bestLongLen)]
 
 			// Short at s+1
-			best = bestOf(best, matchAt(candidateS.offset-e.cur, s, uint32(cv), -1))
+			improve(&best, candidateS.offset-e.cur, s+1, uint32(cv), -1)
 			// Long at s+1, s+2
-			best = bestOf(best, matchAt(candidateL.offset-e.cur, s, uint32(cv), -1))
-			best = bestOf(best, matchAt(candidateL.prev-e.cur, s, uint32(cv), -1))
-			best = bestOf(best, matchAt(candidateL2.offset-e.cur, s+1, uint32(cv2), -1))
-			best = bestOf(best, matchAt(candidateL2.prev-e.cur, s+1, uint32(cv2), -1))
+			improve(&best, candidateL.offset-e.cur, s+1, uint32(cv), -1)
+			improve(&best, candidateL.prev-e.cur, s+1, uint32(cv), -1)
+			improve(&best, candidateL2.offset-e.cur, s+2, uint32(cv2), -1)
+			improve(&best, candidateL2.prev-e.cur, s+2, uint32(cv2), -1)
 			if false {
 				// Short at s+3.
 				// Too often worse...
-				best = bestOf(best, matchAt(e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+2, uint32(cv2>>8), -1))
+				improve(&best, e.table[hashLen(cv2>>8, bestShortTableBits, bestShortLen)].offset-e.cur, s+3, uint32(cv2>>8), -1)
 			}
-			// See if we can find a better match by checking where the current best ends.
-			// Use that offset to see if we can find a better full match.
-			if sAt := best.s + best.length; sAt < sLimit {
-				nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen)
-				candidateEnd := e.longTable[nextHashL]
-				if pos := candidateEnd.offset - e.cur - best.length; pos >= 0 {
-					bestEnd := bestOf(best, matchAt(pos, best.s, load3232(src, best.s), -1))
-					if pos := candidateEnd.prev - e.cur - best.length; pos >= 0 {
-						bestEnd = bestOf(bestEnd, matchAt(pos, best.s, load3232(src, best.s), -1))
+
+			// Start check at a fixed offset to allow for a few mismatches.
+			// For this compression level, 2 yields the best results.
+			// We cannot do this if we have already indexed this position.
+			const skipBeginning = 2
+			if best.s > s-skipBeginning {
+				// See if we can find a better match by checking where the current best ends.
+				// Use that offset to see if we can find a better full match.
+				if sAt := best.s + best.length; sAt < sLimit {
+					nextHashL := hashLen(load6432(src, sAt), bestLongTableBits, bestLongLen)
+					candidateEnd := e.longTable[nextHashL]
+
+					if off := candidateEnd.offset - e.cur - best.length + skipBeginning; off >= 0 {
+						improve(&best, off, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
+						if off := candidateEnd.prev - e.cur - best.length + skipBeginning; off >= 0 {
+							improve(&best, off, best.s+skipBeginning, load3232(src, best.s+skipBeginning), -1)
+						}
 					}
-					best = bestEnd
 				}
 			}
 		}
 
 		if debugAsserts {
+			if best.offset >= best.s {
+				panic(fmt.Sprintf("best.offset > s: %d >= %d", best.offset, best.s))
+			}
+			if best.s < nextEmit {
+				panic(fmt.Sprintf("s %d < nextEmit %d", best.s, nextEmit))
+			}
+			if best.offset < s-e.maxMatchOff {
+				panic(fmt.Sprintf("best.offset < s-e.maxMatchOff: %d < %d", best.offset, s-e.maxMatchOff))
+			}
 			if !bytes.Equal(src[best.s:best.s+best.length], src[best.offset:best.offset+best.length]) {
 				panic(fmt.Sprintf("match mismatch: %v != %v", src[best.s:best.s+best.length], src[best.offset:best.offset+best.length]))
 			}
 		}
 
 		// We have a match, we can store the forward value
+		s = best.s
 		if best.rep > 0 {
-			s = best.s
 			var seq seq
 			seq.matchLen = uint32(best.length - zstdMinMatch)
+			addLiterals(&seq, best.s)
 
-			// We might be able to match backwards.
-			// Extend as long as we can.
-			start := best.s
-			// We end the search early, so we don't risk 0 literals
-			// and have to do special offset treatment.
-			startLimit := nextEmit + 1
-
-			tMin := s - e.maxMatchOff
-			if tMin < 0 {
-				tMin = 0
-			}
-			repIndex := best.offset
-			for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
-				repIndex--
-				start--
-				seq.matchLen++
-			}
-			addLiterals(&seq, start)
-
-			// rep 0
-			seq.offset = uint32(best.rep)
+			// Repeat. If bit 4 is set, this is a non-lit repeat.
+			seq.offset = uint32(best.rep & 3)
 			if debugSequences {
-				println("repeat sequence", seq, "next s:", s)
+				println("repeat sequence", seq, "next s:", best.s, "off:", best.s-best.offset)
 			}
 			blk.sequences = append(blk.sequences, seq)
 
-			// Index match start+1 (long) -> s - 1
-			index0 := s
+			// Index old s + 1 -> s - 1
 			s = best.s + best.length
-
 			nextEmit = s
-			if s >= sLimit {
-				if debugEncoder {
-					println("repeat ended", s, best.length)
 
-				}
-				break encodeLoop
-			}
 			// Index skipped...
+			end := s
+			if s > sLimit+4 {
+				end = sLimit + 4
+			}
 			off := index0 + e.cur
-			for index0 < s-1 {
+			for index0 < end {
 				cv0 := load6432(src, index0)
 				h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
 				h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
@@ -348,19 +396,26 @@
 				off++
 				index0++
 			}
+
 			switch best.rep {
-			case 2:
+			case 2, 4 | 1:
 				offset1, offset2 = offset2, offset1
-			case 3:
+			case 3, 4 | 2:
 				offset1, offset2, offset3 = offset3, offset1, offset2
+			case 4 | 3:
+				offset1, offset2, offset3 = offset1-1, offset1, offset2
 			}
-			cv = load6432(src, s)
+			if s >= sLimit {
+				if debugEncoder {
+					println("repeat ended", s, best.length)
+				}
+				break encodeLoop
+			}
 			continue
 		}
 
 		// A 4-byte match has been found. Update recent offsets.
 		// We'll later see if more than 4 bytes.
-		s = best.s
 		t := best.offset
 		offset1, offset2, offset3 = s-t, offset1, offset2
 
@@ -372,22 +427,9 @@
 			panic("invalid offset")
 		}
 
-		// Extend the n-byte match as long as possible.
-		l := best.length
-
-		// Extend backwards
-		tMin := s - e.maxMatchOff
-		if tMin < 0 {
-			tMin = 0
-		}
-		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
-			s--
-			t--
-			l++
-		}
-
 		// Write our sequence
 		var seq seq
+		l := best.length
 		seq.litLen = uint32(s - nextEmit)
 		seq.matchLen = uint32(l - zstdMinMatch)
 		if seq.litLen > 0 {
@@ -400,65 +442,25 @@
 		}
 		blk.sequences = append(blk.sequences, seq)
 		nextEmit = s
-		if s >= sLimit {
-			break encodeLoop
+
+		// Index old s + 1 -> s - 1 or sLimit
+		end := s
+		if s > sLimit-4 {
+			end = sLimit - 4
 		}
 
-		// Index match start+1 (long) -> s - 1
-		index0 := s - l + 1
-		// every entry
-		for index0 < s-1 {
+		off := index0 + e.cur
+		for index0 < end {
 			cv0 := load6432(src, index0)
 			h0 := hashLen(cv0, bestLongTableBits, bestLongLen)
 			h1 := hashLen(cv0, bestShortTableBits, bestShortLen)
-			off := index0 + e.cur
 			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 			e.table[h1] = prevEntry{offset: off, prev: e.table[h1].offset}
 			index0++
+			off++
 		}
-
-		cv = load6432(src, s)
-		if !canRepeat {
-			continue
-		}
-
-		// Check offset 2
-		for {
-			o2 := s - offset2
-			if load3232(src, o2) != uint32(cv) {
-				// Do regular search
-				break
-			}
-
-			// Store this, since we have it.
-			nextHashS := hashLen(cv, bestShortTableBits, bestShortLen)
-			nextHashL := hashLen(cv, bestLongTableBits, bestLongLen)
-
-			// We have at least 4 byte match.
-			// No need to check backwards. We come straight from a match
-			l := 4 + e.matchlen(s+4, o2+4, src)
-
-			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
-			e.table[nextHashS] = prevEntry{offset: s + e.cur, prev: e.table[nextHashS].offset}
-			seq.matchLen = uint32(l) - zstdMinMatch
-			seq.litLen = 0
-
-			// Since litlen is always 0, this is offset 1.
-			seq.offset = 1
-			s += l
-			nextEmit = s
-			if debugSequences {
-				println("sequence", seq, "next s:", s)
-			}
-			blk.sequences = append(blk.sequences, seq)
-
-			// Swap offset 1 and 2.
-			offset1, offset2 = offset2, offset1
-			if s >= sLimit {
-				// Finished
-				break encodeLoop
-			}
-			cv = load6432(src, s)
+		if s >= sLimit {
+			break encodeLoop
 		}
 	}
 
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go
index c769f69..84a79fd 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_better.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@@ -62,14 +62,10 @@
 	)
 
 	// Protect against e.cur wraparound.
-	for e.cur >= bufferReset {
+	for e.cur >= e.bufferReset-int32(len(e.hist)) {
 		if len(e.hist) == 0 {
-			for i := range e.table[:] {
-				e.table[i] = tableEntry{}
-			}
-			for i := range e.longTable[:] {
-				e.longTable[i] = prevEntry{}
-			}
+			e.table = [betterShortTableSize]tableEntry{}
+			e.longTable = [betterLongTableSize]prevEntry{}
 			e.cur = e.maxMatchOff
 			break
 		}
@@ -106,9 +102,20 @@
 		e.cur = e.maxMatchOff
 		break
 	}
-
+	// Add block to history
 	s := e.addBlock(src)
 	blk.size = len(src)
+
+	// Check RLE first
+	if len(src) > zstdMinMatch {
+		ml := matchLen(src[1:], src)
+		if ml == len(src)-1 {
+			blk.literals = append(blk.literals, src[0])
+			blk.sequences = append(blk.sequences, seq{litLen: 1, matchLen: uint32(len(src)-1) - zstdMinMatch, offset: 1 + 3})
+			return
+		}
+	}
+
 	if len(src) < minNonLiteralBlockSize {
 		blk.extraLits = len(src)
 		blk.literals = blk.literals[:len(src)]
@@ -149,7 +156,7 @@
 		var t int32
 		// We allow the encoder to optionally turn off repeat offsets across blocks
 		canRepeat := len(blk.sequences) > 2
-		var matched int32
+		var matched, index0 int32
 
 		for {
 			if debugAsserts && canRepeat && offset1 == 0 {
@@ -166,14 +173,15 @@
 			off := s + e.cur
 			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
 			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
+			index0 = s + 1
 
 			if canRepeat {
 				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
 					// Consider history as well.
 					var seq seq
-					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
 
-					seq.matchLen = uint32(lenght - zstdMinMatch)
+					seq.matchLen = uint32(length - zstdMinMatch)
 
 					// We might be able to match backwards.
 					// Extend as long as we can.
@@ -202,12 +210,12 @@
 
 					// Index match start+1 (long) -> s - 1
 					index0 := s + repOff
-					s += lenght + repOff
+					s += length + repOff
 
 					nextEmit = s
 					if s >= sLimit {
 						if debugEncoder {
-							println("repeat ended", s, lenght)
+							println("repeat ended", s, length)
 
 						}
 						break encodeLoop
@@ -233,9 +241,9 @@
 				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
 					// Consider history as well.
 					var seq seq
-					lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
+					length := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
 
-					seq.matchLen = uint32(lenght - zstdMinMatch)
+					seq.matchLen = uint32(length - zstdMinMatch)
 
 					// We might be able to match backwards.
 					// Extend as long as we can.
@@ -262,12 +270,11 @@
 					}
 					blk.sequences = append(blk.sequences, seq)
 
-					index0 := s + repOff2
-					s += lenght + repOff2
+					s += length + repOff2
 					nextEmit = s
 					if s >= sLimit {
 						if debugEncoder {
-							println("repeat ended", s, lenght)
+							println("repeat ended", s, length)
 
 						}
 						break encodeLoop
@@ -416,15 +423,23 @@
 
 		// Try to find a better match by searching for a long match at the end of the current best match
 		if s+matched < sLimit {
+			// Allow some bytes at the beginning to mismatch.
+			// Sweet spot is around 3 bytes, but depends on input.
+			// The skipped bytes are tested in the extend-backwards step,
+			// and are still picked up as part of the match if they do match.
+			const skipBeginning = 3
+
 			nextHashL := hashLen(load6432(src, s+matched), betterLongTableBits, betterLongLen)
-			cv := load3232(src, s)
+			s2 := s + skipBeginning
+			cv := load3232(src, s2)
 			candidateL := e.longTable[nextHashL]
-			coffsetL := candidateL.offset - e.cur - matched
-			if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+			coffsetL := candidateL.offset - e.cur - matched + skipBeginning
+			if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
 				// Found a long match, at least 4 bytes.
-				matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+				matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
 				if matchedNext > matched {
 					t = coffsetL
+					s = s2
 					matched = matchedNext
 					if debugMatches {
 						println("long match at end-of-match")
@@ -434,12 +449,13 @@
 
 			// Check prev long...
 			if true {
-				coffsetL = candidateL.prev - e.cur - matched
-				if coffsetL >= 0 && coffsetL < s && s-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
+				coffsetL = candidateL.prev - e.cur - matched + skipBeginning
+				if coffsetL >= 0 && coffsetL < s2 && s2-coffsetL < e.maxMatchOff && cv == load3232(src, coffsetL) {
 					// Found a long match, at least 4 bytes.
-					matchedNext := e.matchlen(s+4, coffsetL+4, src) + 4
+					matchedNext := e.matchlen(s2+4, coffsetL+4, src) + 4
 					if matchedNext > matched {
 						t = coffsetL
+						s = s2
 						matched = matchedNext
 						if debugMatches {
 							println("prev long match at end-of-match")
@@ -493,15 +509,15 @@
 		}
 
 		// Index match start+1 (long) -> s - 1
-		index0 := s - l + 1
+		off := index0 + e.cur
 		for index0 < s-1 {
 			cv0 := load6432(src, index0)
 			cv1 := cv0 >> 8
 			h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
-			off := index0 + e.cur
 			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 			e.table[hashLen(cv1, betterShortTableBits, betterShortLen)] = tableEntry{offset: off + 1, val: uint32(cv1)}
 			index0 += 2
+			off += 2
 		}
 
 		cv = load6432(src, s)
@@ -578,7 +594,7 @@
 	)
 
 	// Protect against e.cur wraparound.
-	for e.cur >= bufferReset {
+	for e.cur >= e.bufferReset-int32(len(e.hist)) {
 		if len(e.hist) == 0 {
 			for i := range e.table[:] {
 				e.table[i] = tableEntry{}
@@ -667,7 +683,7 @@
 		var t int32
 		// We allow the encoder to optionally turn off repeat offsets across blocks
 		canRepeat := len(blk.sequences) > 2
-		var matched int32
+		var matched, index0 int32
 
 		for {
 			if debugAsserts && canRepeat && offset1 == 0 {
@@ -686,14 +702,15 @@
 			e.markLongShardDirty(nextHashL)
 			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
 			e.markShortShardDirty(nextHashS)
+			index0 = s + 1
 
 			if canRepeat {
 				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
 					// Consider history as well.
 					var seq seq
-					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
 
-					seq.matchLen = uint32(lenght - zstdMinMatch)
+					seq.matchLen = uint32(length - zstdMinMatch)
 
 					// We might be able to match backwards.
 					// Extend as long as we can.
@@ -721,13 +738,12 @@
 					blk.sequences = append(blk.sequences, seq)
 
 					// Index match start+1 (long) -> s - 1
-					index0 := s + repOff
-					s += lenght + repOff
+					s += length + repOff
 
 					nextEmit = s
 					if s >= sLimit {
 						if debugEncoder {
-							println("repeat ended", s, lenght)
+							println("repeat ended", s, length)
 
 						}
 						break encodeLoop
@@ -756,9 +772,9 @@
 				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
 					// Consider history as well.
 					var seq seq
-					lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
+					length := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
 
-					seq.matchLen = uint32(lenght - zstdMinMatch)
+					seq.matchLen = uint32(length - zstdMinMatch)
 
 					// We might be able to match backwards.
 					// Extend as long as we can.
@@ -785,12 +801,11 @@
 					}
 					blk.sequences = append(blk.sequences, seq)
 
-					index0 := s + repOff2
-					s += lenght + repOff2
+					s += length + repOff2
 					nextEmit = s
 					if s >= sLimit {
 						if debugEncoder {
-							println("repeat ended", s, lenght)
+							println("repeat ended", s, length)
 
 						}
 						break encodeLoop
@@ -1019,18 +1034,18 @@
 		}
 
 		// Index match start+1 (long) -> s - 1
-		index0 := s - l + 1
+		off := index0 + e.cur
 		for index0 < s-1 {
 			cv0 := load6432(src, index0)
 			cv1 := cv0 >> 8
 			h0 := hashLen(cv0, betterLongTableBits, betterLongLen)
-			off := index0 + e.cur
 			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 			e.markLongShardDirty(h0)
 			h1 := hashLen(cv1, betterShortTableBits, betterShortLen)
 			e.table[h1] = tableEntry{offset: off + 1, val: uint32(cv1)}
 			e.markShortShardDirty(h1)
 			index0 += 2
+			off += 2
 		}
 
 		cv = load6432(src, s)
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
index 7ff0c64..d36be7b 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -44,14 +44,10 @@
 	)
 
 	// Protect against e.cur wraparound.
-	for e.cur >= bufferReset {
+	for e.cur >= e.bufferReset-int32(len(e.hist)) {
 		if len(e.hist) == 0 {
-			for i := range e.table[:] {
-				e.table[i] = tableEntry{}
-			}
-			for i := range e.longTable[:] {
-				e.longTable[i] = tableEntry{}
-			}
+			e.table = [dFastShortTableSize]tableEntry{}
+			e.longTable = [dFastLongTableSize]tableEntry{}
 			e.cur = e.maxMatchOff
 			break
 		}
@@ -142,9 +138,9 @@
 				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
 					// Consider history as well.
 					var seq seq
-					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
 
-					seq.matchLen = uint32(lenght - zstdMinMatch)
+					seq.matchLen = uint32(length - zstdMinMatch)
 
 					// We might be able to match backwards.
 					// Extend as long as we can.
@@ -170,11 +166,11 @@
 						println("repeat sequence", seq, "next s:", s)
 					}
 					blk.sequences = append(blk.sequences, seq)
-					s += lenght + repOff
+					s += length + repOff
 					nextEmit = s
 					if s >= sLimit {
 						if debugEncoder {
-							println("repeat ended", s, lenght)
+							println("repeat ended", s, length)
 
 						}
 						break encodeLoop
@@ -388,7 +384,7 @@
 	)
 
 	// Protect against e.cur wraparound.
-	if e.cur >= bufferReset {
+	if e.cur >= e.bufferReset {
 		for i := range e.table[:] {
 			e.table[i] = tableEntry{}
 		}
@@ -685,7 +681,7 @@
 	}
 
 	// We do not store history, so we must offset e.cur to avoid false matches for next user.
-	if e.cur < bufferReset {
+	if e.cur < e.bufferReset {
 		e.cur += int32(len(src))
 	}
 }
@@ -700,7 +696,7 @@
 	)
 
 	// Protect against e.cur wraparound.
-	for e.cur >= bufferReset {
+	for e.cur >= e.bufferReset-int32(len(e.hist)) {
 		if len(e.hist) == 0 {
 			for i := range e.table[:] {
 				e.table[i] = tableEntry{}
@@ -802,9 +798,9 @@
 				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
 					// Consider history as well.
 					var seq seq
-					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+					length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
 
-					seq.matchLen = uint32(lenght - zstdMinMatch)
+					seq.matchLen = uint32(length - zstdMinMatch)
 
 					// We might be able to match backwards.
 					// Extend as long as we can.
@@ -830,11 +826,11 @@
 						println("repeat sequence", seq, "next s:", s)
 					}
 					blk.sequences = append(blk.sequences, seq)
-					s += lenght + repOff
+					s += length + repOff
 					nextEmit = s
 					if s >= sLimit {
 						if debugEncoder {
-							println("repeat ended", s, lenght)
+							println("repeat ended", s, length)
 
 						}
 						break encodeLoop
@@ -1088,7 +1084,7 @@
 			}
 		}
 		e.lastDictID = d.id
-		e.allDirty = true
+		allDirty = true
 	}
 	// Reset table to initial state
 	e.cur = e.maxMatchOff
@@ -1103,7 +1099,8 @@
 	}
 
 	if allDirty || dirtyShardCnt > dLongTableShardCnt/2 {
-		copy(e.longTable[:], e.dictLongTable)
+		//copy(e.longTable[:], e.dictLongTable)
+		e.longTable = *(*[dFastLongTableSize]tableEntry)(e.dictLongTable)
 		for i := range e.longTableShardDirty {
 			e.longTableShardDirty[i] = false
 		}
@@ -1114,7 +1111,9 @@
 			continue
 		}
 
-		copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
+		// copy(e.longTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize], e.dictLongTable[i*dLongTableShardSize:(i+1)*dLongTableShardSize])
+		*(*[dLongTableShardSize]tableEntry)(e.longTable[i*dLongTableShardSize:]) = *(*[dLongTableShardSize]tableEntry)(e.dictLongTable[i*dLongTableShardSize:])
+
 		e.longTableShardDirty[i] = false
 	}
 }
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
index f51ab52..f45a3da 100644
--- a/vendor/github.com/klauspost/compress/zstd/enc_fast.go
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -43,7 +43,7 @@
 	)
 
 	// Protect against e.cur wraparound.
-	for e.cur >= bufferReset {
+	for e.cur >= e.bufferReset-int32(len(e.hist)) {
 		if len(e.hist) == 0 {
 			for i := range e.table[:] {
 				e.table[i] = tableEntry{}
@@ -133,8 +133,7 @@
 			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
 				// Consider history as well.
 				var seq seq
-				var length int32
-				length = 4 + e.matchlen(s+6, repIndex+4, src)
+				length := 4 + e.matchlen(s+6, repIndex+4, src)
 				seq.matchLen = uint32(length - zstdMinMatch)
 
 				// We might be able to match backwards.
@@ -304,13 +303,13 @@
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
 	)
 	if debugEncoder {
-		if len(src) > maxBlockSize {
+		if len(src) > maxCompressedBlockSize {
 			panic("src too big")
 		}
 	}
 
 	// Protect against e.cur wraparound.
-	if e.cur >= bufferReset {
+	if e.cur >= e.bufferReset {
 		for i := range e.table[:] {
 			e.table[i] = tableEntry{}
 		}
@@ -538,7 +537,7 @@
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 	}
 	// We do not store history, so we must offset e.cur to avoid false matches for next user.
-	if e.cur < bufferReset {
+	if e.cur < e.bufferReset {
 		e.cur += int32(len(src))
 	}
 }
@@ -555,11 +554,9 @@
 		return
 	}
 	// Protect against e.cur wraparound.
-	for e.cur >= bufferReset {
+	for e.cur >= e.bufferReset-int32(len(e.hist)) {
 		if len(e.hist) == 0 {
-			for i := range e.table[:] {
-				e.table[i] = tableEntry{}
-			}
+			e.table = [tableSize]tableEntry{}
 			e.cur = e.maxMatchOff
 			break
 		}
@@ -647,8 +644,7 @@
 			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
 				// Consider history as well.
 				var seq seq
-				var length int32
-				length = 4 + e.matchlen(s+6, repIndex+4, src)
+				length := 4 + e.matchlen(s+6, repIndex+4, src)
 
 				seq.matchLen = uint32(length - zstdMinMatch)
 
@@ -833,13 +829,12 @@
 		}
 		if true {
 			end := e.maxMatchOff + int32(len(d.content)) - 8
-			for i := e.maxMatchOff; i < end; i += 3 {
+			for i := e.maxMatchOff; i < end; i += 2 {
 				const hashLog = tableBits
 
 				cv := load6432(d.content, i-e.maxMatchOff)
-				nextHash := hashLen(cv, hashLog, tableFastHashLen)      // 0 -> 5
-				nextHash1 := hashLen(cv>>8, hashLog, tableFastHashLen)  // 1 -> 6
-				nextHash2 := hashLen(cv>>16, hashLog, tableFastHashLen) // 2 -> 7
+				nextHash := hashLen(cv, hashLog, tableFastHashLen)     // 0 -> 6
+				nextHash1 := hashLen(cv>>8, hashLog, tableFastHashLen) // 1 -> 7
 				e.dictTable[nextHash] = tableEntry{
 					val:    uint32(cv),
 					offset: i,
@@ -848,10 +843,6 @@
 					val:    uint32(cv >> 8),
 					offset: i + 1,
 				}
-				e.dictTable[nextHash2] = tableEntry{
-					val:    uint32(cv >> 16),
-					offset: i + 2,
-				}
 			}
 		}
 		e.lastDictID = d.id
@@ -871,7 +862,8 @@
 	const shardCnt = tableShardCnt
 	const shardSize = tableShardSize
 	if e.allDirty || dirtyShardCnt > shardCnt*4/6 {
-		copy(e.table[:], e.dictTable)
+		//copy(e.table[:], e.dictTable)
+		e.table = *(*[tableSize]tableEntry)(e.dictTable)
 		for i := range e.tableShardDirty {
 			e.tableShardDirty[i] = false
 		}
@@ -883,7 +875,8 @@
 			continue
 		}
 
-		copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+		//copy(e.table[i*shardSize:(i+1)*shardSize], e.dictTable[i*shardSize:(i+1)*shardSize])
+		*(*[shardSize]tableEntry)(e.table[i*shardSize:]) = *(*[shardSize]tableEntry)(e.dictTable[i*shardSize:])
 		e.tableShardDirty[i] = false
 	}
 	e.allDirty = false
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
index 7aaaedb..8f8223c 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -6,8 +6,10 @@
 
 import (
 	"crypto/rand"
+	"errors"
 	"fmt"
 	"io"
+	"math"
 	rdebug "runtime/debug"
 	"sync"
 
@@ -148,6 +150,9 @@
 // and write CRC if requested.
 func (e *Encoder) Write(p []byte) (n int, err error) {
 	s := &e.state
+	if s.eofWritten {
+		return 0, ErrEncoderClosed
+	}
 	for len(p) > 0 {
 		if len(p)+len(s.filling) < e.o.blockSize {
 			if e.o.crc {
@@ -201,7 +206,7 @@
 			return nil
 		}
 		if final && len(s.filling) > 0 {
-			s.current = e.EncodeAll(s.filling, s.current[:0])
+			s.current = e.encodeAll(s.encoder, s.filling, s.current[:0])
 			var n2 int
 			n2, s.err = s.w.Write(s.current)
 			if s.err != nil {
@@ -226,10 +231,7 @@
 			DictID:        e.o.dict.ID(),
 		}
 
-		dst, err := fh.appendTo(tmp[:0])
-		if err != nil {
-			return err
-		}
+		dst := fh.appendTo(tmp[:0])
 		s.headerWritten = true
 		s.wWg.Wait()
 		var n2 int
@@ -276,23 +278,9 @@
 			s.eofWritten = true
 		}
 
-		err := errIncompressible
-		// If we got the exact same number of literals as input,
-		// assume the literals cannot be compressed.
-		if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
-			err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
-		}
-		switch err {
-		case errIncompressible:
-			if debugEncoder {
-				println("Storing incompressible block as raw")
-			}
-			blk.encodeRaw(src)
-			// In fast mode, we do not transfer offsets, so we don't have to deal with changing the.
-		case nil:
-		default:
-			s.err = err
-			return err
+		s.err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
+		if s.err != nil {
+			return s.err
 		}
 		_, s.err = s.w.Write(blk.output)
 		s.nWritten += int64(len(blk.output))
@@ -304,6 +292,9 @@
 	s.filling, s.current, s.previous = s.previous[:0], s.filling, s.current
 	s.nInput += int64(len(s.current))
 	s.wg.Add(1)
+	if final {
+		s.eofWritten = true
+	}
 	go func(src []byte) {
 		if debugEncoder {
 			println("Adding block,", len(src), "bytes, final:", final)
@@ -319,9 +310,6 @@
 		blk := enc.Block()
 		enc.Encode(blk, src)
 		blk.last = final
-		if final {
-			s.eofWritten = true
-		}
 		// Wait for pending writes.
 		s.wWg.Wait()
 		if s.writeErr != nil {
@@ -342,22 +330,8 @@
 				}
 				s.wWg.Done()
 			}()
-			err := errIncompressible
-			// If we got the exact same number of literals as input,
-			// assume the literals cannot be compressed.
-			if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
-				err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
-			}
-			switch err {
-			case errIncompressible:
-				if debugEncoder {
-					println("Storing incompressible block as raw")
-				}
-				blk.encodeRaw(src)
-				// In fast mode, we do not transfer offsets, so we don't have to deal with changing the.
-			case nil:
-			default:
-				s.writeErr = err
+			s.writeErr = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
+			if s.writeErr != nil {
 				return
 			}
 			_, s.writeErr = s.w.Write(blk.output)
@@ -431,12 +405,20 @@
 	if len(s.filling) > 0 {
 		err := e.nextBlock(false)
 		if err != nil {
+			// Ignore Flush after Close.
+			if errors.Is(s.err, ErrEncoderClosed) {
+				return nil
+			}
 			return err
 		}
 	}
 	s.wg.Wait()
 	s.wWg.Wait()
 	if s.err != nil {
+		// Ignore Flush after Close.
+		if errors.Is(s.err, ErrEncoderClosed) {
+			return nil
+		}
 		return s.err
 	}
 	return s.writeErr
@@ -452,6 +434,9 @@
 	}
 	err := e.nextBlock(true)
 	if err != nil {
+		if errors.Is(s.err, ErrEncoderClosed) {
+			return nil
+		}
 		return err
 	}
 	if s.frameContentSize > 0 {
@@ -489,6 +474,11 @@
 		}
 		_, s.err = s.w.Write(frame)
 	}
+	if s.err == nil {
+		s.err = ErrEncoderClosed
+		return nil
+	}
+
 	return s.err
 }
 
@@ -499,6 +489,15 @@
 // Data compressed with EncodeAll can be decoded with the Decoder,
 // using either a stream or DecodeAll.
 func (e *Encoder) EncodeAll(src, dst []byte) []byte {
+	e.init.Do(e.initialize)
+	enc := <-e.encoders
+	defer func() {
+		e.encoders <- enc
+	}()
+	return e.encodeAll(enc, src, dst)
+}
+
+func (e *Encoder) encodeAll(enc encoder, src, dst []byte) []byte {
 	if len(src) == 0 {
 		if e.o.fullZero {
 			// Add frame header.
@@ -510,7 +509,7 @@
 				Checksum: false,
 				DictID:   0,
 			}
-			dst, _ = fh.appendTo(dst)
+			dst = fh.appendTo(dst)
 
 			// Write raw block as last one only.
 			var blk blockHeader
@@ -521,13 +520,7 @@
 		}
 		return dst
 	}
-	e.init.Do(e.initialize)
-	enc := <-e.encoders
-	defer func() {
-		// Release encoder reference to last block.
-		// If a non-single block is needed the encoder will reset again.
-		e.encoders <- enc
-	}()
+
 	// Use single segments when above minimum window and below window size.
 	single := len(src) <= e.o.windowSize && len(src) > MinWindowSize
 	if e.o.single != nil {
@@ -545,10 +538,7 @@
 	if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 && !e.o.lowMem {
 		dst = make([]byte, 0, len(src))
 	}
-	dst, err := fh.appendTo(dst)
-	if err != nil {
-		panic(err)
-	}
+	dst = fh.appendTo(dst)
 
 	// If we can do everything in one block, prefer that.
 	if len(src) <= e.o.blockSize {
@@ -567,25 +557,15 @@
 
 		// If we got the exact same number of literals as input,
 		// assume the literals cannot be compressed.
-		err := errIncompressible
 		oldout := blk.output
-		if len(blk.literals) != len(src) || len(src) != e.o.blockSize {
-			// Output directly to dst
-			blk.output = dst
-			err = blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
-		}
+		// Output directly to dst
+		blk.output = dst
 
-		switch err {
-		case errIncompressible:
-			if debugEncoder {
-				println("Storing incompressible block as raw")
-			}
-			dst = blk.encodeRawTo(dst, src)
-		case nil:
-			dst = blk.output
-		default:
+		err := blk.encode(src, e.o.noEntropy, !e.o.allLitEntropy)
+		if err != nil {
 			panic(err)
 		}
+		dst = blk.output
 		blk.output = oldout
 	} else {
 		enc.Reset(e.o.dict, false)
@@ -604,25 +584,11 @@
 			if len(src) == 0 {
 				blk.last = true
 			}
-			err := errIncompressible
-			// If we got the exact same number of literals as input,
-			// assume the literals cannot be compressed.
-			if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize {
-				err = blk.encode(todo, e.o.noEntropy, !e.o.allLitEntropy)
-			}
-
-			switch err {
-			case errIncompressible:
-				if debugEncoder {
-					println("Storing incompressible block as raw")
-				}
-				dst = blk.encodeRawTo(dst, todo)
-				blk.popOffsets()
-			case nil:
-				dst = append(dst, blk.output...)
-			default:
+			err := blk.encode(todo, e.o.noEntropy, !e.o.allLitEntropy)
+			if err != nil {
 				panic(err)
 			}
+			dst = append(dst, blk.output...)
 			blk.reset(nil)
 		}
 	}
@@ -632,6 +598,7 @@
 	// Add padding with content from crypto/rand.Reader
 	if e.o.pad > 0 {
 		add := calcSkippableFrame(int64(len(dst)), int64(e.o.pad))
+		var err error
 		dst, err = skippableFrame(dst, add, rand.Reader)
 		if err != nil {
 			panic(err)
@@ -639,3 +606,37 @@
 	}
 	return dst
 }
+
+// MaxEncodedSize returns the expected maximum
+// size of an encoded block or stream.
+func (e *Encoder) MaxEncodedSize(size int) int {
+	frameHeader := 4 + 2 // magic + frame header & window descriptor
+	if e.o.dict != nil {
+		frameHeader += 4
+	}
+	// Frame content size:
+	if size < 256 {
+		frameHeader++
+	} else if size < 65536+256 {
+		frameHeader += 2
+	} else if size < math.MaxInt32 {
+		frameHeader += 4
+	} else {
+		frameHeader += 8
+	}
+	// Final crc
+	if e.o.crc {
+		frameHeader += 4
+	}
+
+	// Max overhead is 3 bytes/block.
+	// There cannot be 0 blocks.
+	blocks := (size + e.o.blockSize) / e.o.blockSize
+
+	// Combine, add padding.
+	maxSz := frameHeader + 3*blocks + size
+	if e.o.pad > 1 {
+		maxSz += calcSkippableFrame(int64(maxSz), int64(e.o.pad))
+	}
+	return maxSz
+}
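
The encoder.go changes above route `EncodeAll` through an internal `encodeAll`, make `Write` after `Close` return the new `ErrEncoderClosed` (while `Flush`/`Close` after `Close` return nil), and add `MaxEncodedSize` as a worst-case output bound. A minimal sketch of how a caller might combine `MaxEncodedSize` with `EncodeAll`; the payload and sizes are illustrative, not part of this change:

```
package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	// A nil writer is fine when the encoder is only used via EncodeAll.
	enc, err := zstd.NewWriter(nil)
	if err != nil {
		panic(err)
	}
	defer enc.Close()

	src := []byte("example payload that should compress reasonably well well well")

	// Pre-size dst with the worst-case bound so EncodeAll never reallocates.
	dst := make([]byte, 0, enc.MaxEncodedSize(len(src)))
	dst = enc.EncodeAll(src, dst)
	fmt.Println(len(src), "->", len(dst), "bytes")
}
```

Per the function body above, the bound is the frame header plus 3 bytes per block plus any configured padding, so pre-sizing the destination keeps `EncodeAll` allocation-free on hot paths.
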
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
index a7c5e1a..20671dc 100644
--- a/vendor/github.com/klauspost/compress/zstd/encoder_options.go
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -3,6 +3,8 @@
 import (
 	"errors"
 	"fmt"
+	"math"
+	"math/bits"
 	"runtime"
 	"strings"
 )
@@ -37,7 +39,7 @@
 		blockSize:     maxCompressedBlockSize,
 		windowSize:    8 << 20,
 		level:         SpeedDefault,
-		allLitEntropy: true,
+		allLitEntropy: false,
 		lowMem:        false,
 	}
 }
@@ -47,22 +49,22 @@
 	switch o.level {
 	case SpeedFastest:
 		if o.dict != nil {
-			return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+			return &fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
 		}
-		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
 
 	case SpeedDefault:
 		if o.dict != nil {
-			return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}}
+			return &doubleFastEncoderDict{fastEncoderDict: fastEncoderDict{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}}
 		}
-		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
 	case SpeedBetterCompression:
 		if o.dict != nil {
-			return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}}
+			return &betterFastEncoderDict{betterFastEncoder: betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}}
 		}
-		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
 	case SpeedBestCompression:
-		return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), lowMem: o.lowMem}}
+		return &bestFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize), bufferReset: math.MaxInt32 - int32(o.windowSize*2), lowMem: o.lowMem}}
 	}
 	panic("unknown compression level")
 }
@@ -92,7 +94,7 @@
 // The value must be a power of two between MinWindowSize and MaxWindowSize.
 // A larger value will enable better compression but allocate more memory and,
 // for above-default values, take considerably longer.
-// The default value is determined by the compression level.
+// The default value is determined by the compression level, capped at 8MB.
 func WithWindowSize(n int) EOption {
 	return func(o *encoderOptions) error {
 		switch {
@@ -127,7 +129,7 @@
 		}
 		// No need to waste our time.
 		if n == 1 {
-			o.pad = 0
+			n = 0
 		}
 		if n > 1<<30 {
 			return fmt.Errorf("padding must less than 1GB (1<<30 bytes) ")
@@ -230,13 +232,13 @@
 			case SpeedDefault:
 				o.windowSize = 8 << 20
 			case SpeedBetterCompression:
-				o.windowSize = 16 << 20
+				o.windowSize = 8 << 20
 			case SpeedBestCompression:
-				o.windowSize = 32 << 20
+				o.windowSize = 8 << 20
 			}
 		}
 		if !o.customALEntropy {
-			o.allLitEntropy = l > SpeedFastest
+			o.allLitEntropy = l > SpeedDefault
 		}
 
 		return nil
@@ -304,7 +306,13 @@
 }
 
 // WithEncoderDict allows to register a dictionary that will be used for the encode.
+//
+// The slice dict must be in the [dictionary format] produced by
+// "zstd --train" from the Zstandard reference implementation.
+//
 // The encoder *may* choose to use no dictionary instead for certain payloads.
+//
+// [dictionary format]: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
 func WithEncoderDict(dict []byte) EOption {
 	return func(o *encoderOptions) error {
 		d, err := loadDict(dict)
@@ -315,3 +323,17 @@
 		return nil
 	}
 }
+
+// WithEncoderDictRaw registers a dictionary that may be used by the encoder.
+//
+// The slice content may contain arbitrary data. It will be used as an initial
+// history.
+func WithEncoderDictRaw(id uint32, content []byte) EOption {
+	return func(o *encoderOptions) error {
+		if bits.UintSize > 32 && uint(len(content)) > dictMaxLength {
+			return fmt.Errorf("dictionary of size %d > 2GiB too large", len(content))
+		}
+		o.dict = &dict{id: id, content: content, offsets: [3]int{1, 4, 8}}
+		return nil
+	}
+}
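
A short usage sketch for the new `WithEncoderDictRaw` option above; the ID, content, window size, and level are illustrative, and it assumes the decoder is configured with the matching raw dictionary (the decoder-side `WithDecoderDictRaw` with the same ID):

```
package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	// Arbitrary bytes used as initial history, not a trained dictionary.
	raw := []byte("frequently repeated prefix shared by many small payloads")

	enc, err := zstd.NewWriter(nil,
		zstd.WithEncoderDictRaw(1, raw), // id 1 is illustrative
		zstd.WithWindowSize(1<<20),      // must be a power of two
		zstd.WithEncoderLevel(zstd.SpeedBetterCompression),
	)
	if err != nil {
		panic(err)
	}
	defer enc.Close()

	payload := []byte("frequently repeated prefix shared by many small payloads, plus a unique suffix")
	fmt.Println("compressed size:", len(enc.EncodeAll(payload, nil)))
}
```
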
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
index 9568a4b..e47af66 100644
--- a/vendor/github.com/klauspost/compress/zstd/framedec.go
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -5,7 +5,7 @@
 package zstd
 
 import (
-	"bytes"
+	"encoding/binary"
 	"encoding/hex"
 	"errors"
 	"io"
@@ -29,7 +29,7 @@
 
 	FrameContentSize uint64
 
-	DictionaryID  *uint32
+	DictionaryID  uint32
 	HasCheckSum   bool
 	SingleSegment bool
 }
@@ -43,9 +43,9 @@
 	MaxWindowSize = 1 << 29
 )
 
-var (
-	frameMagic          = []byte{0x28, 0xb5, 0x2f, 0xfd}
-	skippableFrameMagic = []byte{0x2a, 0x4d, 0x18}
+const (
+	frameMagic          = "\x28\xb5\x2f\xfd"
+	skippableFrameMagic = "\x2a\x4d\x18"
 )
 
 func newFrameDec(o decoderOptions) *frameDec {
@@ -73,25 +73,25 @@
 		switch err {
 		case io.EOF, io.ErrUnexpectedEOF:
 			return io.EOF
-		default:
-			return err
 		case nil:
 			signature[0] = b[0]
+		default:
+			return err
 		}
 		// Read the rest, don't allow io.ErrUnexpectedEOF
 		b, err = br.readSmall(3)
 		switch err {
 		case io.EOF:
 			return io.EOF
-		default:
-			return err
 		case nil:
 			copy(signature[1:], b)
+		default:
+			return err
 		}
 
-		if !bytes.Equal(signature[1:4], skippableFrameMagic) || signature[0]&0xf0 != 0x50 {
+		if string(signature[1:4]) != skippableFrameMagic || signature[0]&0xf0 != 0x50 {
 			if debugDecoder {
-				println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString(skippableFrameMagic))
+				println("Not skippable", hex.EncodeToString(signature[:]), hex.EncodeToString([]byte(skippableFrameMagic)))
 			}
 			// Break if not skippable frame.
 			break
@@ -114,9 +114,9 @@
 			return err
 		}
 	}
-	if !bytes.Equal(signature[:], frameMagic) {
+	if string(signature[:]) != frameMagic {
 		if debugDecoder {
-			println("Got magic numbers: ", signature, "want:", frameMagic)
+			println("Got magic numbers: ", signature, "want:", []byte(frameMagic))
 		}
 		return ErrMagicMismatch
 	}
@@ -146,7 +146,9 @@
 			}
 			return err
 		}
-		printf("raw: %x, mantissa: %d, exponent: %d\n", wd, wd&7, wd>>3)
+		if debugDecoder {
+			printf("raw: %x, mantissa: %d, exponent: %d\n", wd, wd&7, wd>>3)
+		}
 		windowLog := 10 + (wd >> 3)
 		windowBase := uint64(1) << windowLog
 		windowAdd := (windowBase / 8) * uint64(wd&0x7)
@@ -155,7 +157,7 @@
 
 	// Read Dictionary_ID
 	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
-	d.DictionaryID = nil
+	d.DictionaryID = 0
 	if size := fhd & 3; size != 0 {
 		if size == 3 {
 			size = 4
@@ -167,7 +169,7 @@
 			return err
 		}
 		var id uint32
-		switch size {
+		switch len(b) {
 		case 1:
 			id = uint32(b[0])
 		case 2:
@@ -178,11 +180,7 @@
 		if debugDecoder {
 			println("Dict size", size, "ID:", id)
 		}
-		if id > 0 {
-			// ID 0 means "sorry, no dictionary anyway".
-			// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
-			d.DictionaryID = &id
-		}
+		d.DictionaryID = id
 	}
 
 	// Read Frame_Content_Size
@@ -204,7 +202,7 @@
 			println("Reading Frame content", err)
 			return err
 		}
-		switch fcsSize {
+		switch len(b) {
 		case 1:
 			d.FrameContentSize = uint64(b[0])
 		case 2:
@@ -261,11 +259,16 @@
 	}
 	d.history.windowSize = int(d.WindowSize)
 	if !d.o.lowMem || d.history.windowSize < maxBlockSize {
-		// Alloc 2x window size if not low-mem, or very small window size.
+		// Alloc 2x window size if not low-mem, or window size below 2MB.
 		d.history.allocFrameBuffer = d.history.windowSize * 2
 	} else {
-		// Alloc with one additional block
-		d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
+		if d.o.lowMem {
+			// Alloc with 1MB extra.
+			d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize/2
+		} else {
+			// Alloc with 2MB extra.
+			d.history.allocFrameBuffer = d.history.windowSize + maxBlockSize
+		}
 	}
 
 	if debugDecoder {
@@ -292,58 +295,41 @@
 	return nil
 }
 
-// checkCRC will check the checksum if the frame has one.
+// checkCRC will check the checksum, assuming the frame has one.
 // Will return ErrCRCMismatch if crc check failed, otherwise nil.
 func (d *frameDec) checkCRC() error {
-	if !d.HasCheckSum {
-		return nil
-	}
-
 	// We can overwrite upper tmp now
-	want, err := d.rawInput.readSmall(4)
+	buf, err := d.rawInput.readSmall(4)
 	if err != nil {
 		println("CRC missing?", err)
 		return err
 	}
 
-	if d.o.ignoreChecksum {
-		return nil
-	}
+	want := binary.LittleEndian.Uint32(buf[:4])
+	got := uint32(d.crc.Sum64())
 
-	var tmp [4]byte
-	got := d.crc.Sum64()
-	// Flip to match file order.
-	tmp[0] = byte(got >> 0)
-	tmp[1] = byte(got >> 8)
-	tmp[2] = byte(got >> 16)
-	tmp[3] = byte(got >> 24)
-
-	if !bytes.Equal(tmp[:], want) {
+	if got != want {
 		if debugDecoder {
-			println("CRC Check Failed:", tmp[:], "!=", want)
+			printf("CRC check failed: got %08x, want %08x\n", got, want)
 		}
 		return ErrCRCMismatch
 	}
 	if debugDecoder {
-		println("CRC ok", tmp[:])
+		printf("CRC ok %08x\n", got)
 	}
 	return nil
 }
 
-// consumeCRC reads the checksum data if the frame has one.
+// consumeCRC skips over the checksum, assuming the frame has one.
 func (d *frameDec) consumeCRC() error {
-	if d.HasCheckSum {
-		_, err := d.rawInput.readSmall(4)
-		if err != nil {
-			println("CRC missing?", err)
-			return err
-		}
+	_, err := d.rawInput.readSmall(4)
+	if err != nil {
+		println("CRC missing?", err)
 	}
-
-	return nil
+	return err
 }
 
-// runDecoder will create a sync decoder that will decode a block of data.
+// runDecoder will run the decoder for the remainder of the frame.
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 	saved := d.history.b
 
@@ -353,12 +339,23 @@
 	// Store input length, so we only check new data.
 	crcStart := len(dst)
 	d.history.decoders.maxSyncLen = 0
+	if d.o.limitToCap {
+		d.history.decoders.maxSyncLen = uint64(cap(dst) - len(dst))
+	}
 	if d.FrameContentSize != fcsUnknown {
-		d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
+		if !d.o.limitToCap || d.FrameContentSize+uint64(len(dst)) < d.history.decoders.maxSyncLen {
+			d.history.decoders.maxSyncLen = d.FrameContentSize + uint64(len(dst))
+		}
 		if d.history.decoders.maxSyncLen > d.o.maxDecodedSize {
+			if debugDecoder {
+				println("maxSyncLen:", d.history.decoders.maxSyncLen, "> maxDecodedSize:", d.o.maxDecodedSize)
+			}
 			return dst, ErrDecoderSizeExceeded
 		}
-		if uint64(cap(dst)) < d.history.decoders.maxSyncLen {
+		if debugDecoder {
+			println("maxSyncLen:", d.history.decoders.maxSyncLen)
+		}
+		if !d.o.limitToCap && uint64(cap(dst)) < d.history.decoders.maxSyncLen {
 			// Alloc for output
 			dst2 := make([]byte, len(dst), d.history.decoders.maxSyncLen+compressedBlockOverAlloc)
 			copy(dst2, dst)
@@ -378,7 +375,13 @@
 		if err != nil {
 			break
 		}
-		if uint64(len(d.history.b)) > d.o.maxDecodedSize {
+		if uint64(len(d.history.b)-crcStart) > d.o.maxDecodedSize {
+			println("runDecoder: maxDecodedSize exceeded", uint64(len(d.history.b)-crcStart), ">", d.o.maxDecodedSize)
+			err = ErrDecoderSizeExceeded
+			break
+		}
+		if d.o.limitToCap && len(d.history.b) > cap(dst) {
+			println("runDecoder: cap exceeded", uint64(len(d.history.b)), ">", cap(dst))
 			err = ErrDecoderSizeExceeded
 			break
 		}
@@ -402,15 +405,8 @@
 			if d.o.ignoreChecksum {
 				err = d.consumeCRC()
 			} else {
-				var n int
-				n, err = d.crc.Write(dst[crcStart:])
-				if err == nil {
-					if n != len(dst)-crcStart {
-						err = io.ErrShortWrite
-					} else {
-						err = d.checkCRC()
-					}
-				}
+				d.crc.Write(dst[crcStart:])
+				err = d.checkCRC()
 			}
 		}
 	}
diff --git a/vendor/github.com/klauspost/compress/zstd/frameenc.go b/vendor/github.com/klauspost/compress/zstd/frameenc.go
index 4ef7f5a..667ca06 100644
--- a/vendor/github.com/klauspost/compress/zstd/frameenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/frameenc.go
@@ -22,7 +22,7 @@
 
 const maxHeaderSize = 14
 
-func (f frameHeader) appendTo(dst []byte) ([]byte, error) {
+func (f frameHeader) appendTo(dst []byte) []byte {
 	dst = append(dst, frameMagic...)
 	var fhd uint8
 	if f.Checksum {
@@ -76,7 +76,7 @@
 		if f.SingleSegment {
 			dst = append(dst, uint8(f.ContentSize))
 		}
-		// Unless SingleSegment is set, framessizes < 256 are nto stored.
+		// Unless SingleSegment is set, frame sizes < 256 are not stored.
 	case 1:
 		f.ContentSize -= 256
 		dst = append(dst, uint8(f.ContentSize), uint8(f.ContentSize>>8))
@@ -88,7 +88,7 @@
 	default:
 		panic("invalid fcs")
 	}
-	return dst, nil
+	return dst
 }
 
 const skippableFrameHeader = 4 + 4
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
index c881d28..d04a829 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.go
@@ -21,7 +21,8 @@
 
 // buildDtable_asm is an x86 assembly implementation of fseDecoder.buildDtable.
 // Function returns non-zero exit code on error.
-// go:noescape
+//
+//go:noescape
 func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
 
 // please keep in sync with _generate/gen_fse.go
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
index da32b44..bcde398 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_amd64.s
@@ -1,7 +1,6 @@
 // Code generated by command: go run gen_fse.go -out ../fse_decoder_amd64.s -pkg=zstd. DO NOT EDIT.
 
 //go:build !appengine && !noasm && gc && !noasm
-// +build !appengine,!noasm,gc,!noasm
 
 // func buildDtable_asm(s *fseDecoder, ctx *buildDtableAsmContext) int
 TEXT ·buildDtable_asm(SB), $0-24
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
index 332e51f..8adfebb 100644
--- a/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder_generic.go
@@ -20,10 +20,9 @@
 			if v == -1 {
 				s.dt[highThreshold].setAddBits(uint8(i))
 				highThreshold--
-				symbolNext[i] = 1
-			} else {
-				symbolNext[i] = uint16(v)
+				v = 1
 			}
+			symbolNext[i] = uint16(v)
 		}
 	}
 
@@ -35,10 +34,12 @@
 		for ss, v := range s.norm[:s.symbolLen] {
 			for i := 0; i < int(v); i++ {
 				s.dt[position].setAddBits(uint8(ss))
-				position = (position + step) & tableMask
-				for position > highThreshold {
+				for {
 					// lowprob area
 					position = (position + step) & tableMask
+					if position <= highThreshold {
+						break
+					}
 				}
 			}
 		}
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
index 28b4015..0916485 100644
--- a/vendor/github.com/klauspost/compress/zstd/history.go
+++ b/vendor/github.com/klauspost/compress/zstd/history.go
@@ -37,26 +37,23 @@
 	h.ignoreBuffer = 0
 	h.error = false
 	h.recentOffsets = [3]int{1, 4, 8}
-	if f := h.decoders.litLengths.fse; f != nil && !f.preDefined {
-		fseDecoderPool.Put(f)
-	}
-	if f := h.decoders.offsets.fse; f != nil && !f.preDefined {
-		fseDecoderPool.Put(f)
-	}
-	if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined {
-		fseDecoderPool.Put(f)
-	}
+	h.decoders.freeDecoders()
 	h.decoders = sequenceDecs{br: h.decoders.br}
-	if h.huffTree != nil {
-		if h.dict == nil || h.dict.litEnc != h.huffTree {
-			huffDecoderPool.Put(h.huffTree)
-		}
-	}
+	h.freeHuffDecoder()
 	h.huffTree = nil
 	h.dict = nil
 	//printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
 }
 
+func (h *history) freeHuffDecoder() {
+	if h.huffTree != nil {
+		if h.dict == nil || h.dict.litEnc != h.huffTree {
+			huffDecoderPool.Put(h.huffTree)
+			h.huffTree = nil
+		}
+	}
+}
+
 func (h *history) setDict(dict *dict) {
 	if dict == nil {
 		return
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
index 69aa3bb..777290d 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
@@ -2,12 +2,7 @@
 
 VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package.
 
-
-[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash)
-[![Build Status](https://travis-ci.org/cespare/xxhash.svg?branch=master)](https://travis-ci.org/cespare/xxhash)
-
-xxhash is a Go implementation of the 64-bit
-[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a
+xxhash is a Go implementation of the 64-bit [xxHash] algorithm, XXH64. This is a
 high-quality hashing algorithm that is much faster than anything in the Go
 standard library.
 
@@ -28,31 +23,49 @@
 func (*Digest) Sum64() uint64
 ```
 
-This implementation provides a fast pure-Go implementation and an even faster
-assembly implementation for amd64.
+The package is written with optimized pure Go and also contains even faster
+assembly implementations for amd64 and arm64. If desired, the `purego` build tag
+opts into using the Go code even on those architectures.
+
+[xxHash]: http://cyan4973.github.io/xxHash/
+
+## Compatibility
+
+This package is in a module and the latest code is in version 2 of the module.
+You need a version of Go with at least "minimal module compatibility" to use
+github.com/cespare/xxhash/v2:
+
+* 1.9.7+ for Go 1.9
+* 1.10.3+ for Go 1.10
+* Go 1.11 or later
+
+I recommend using the latest release of Go.
 
 ## Benchmarks
 
 Here are some quick benchmarks comparing the pure-Go and assembly
 implementations of Sum64.
 
-| input size | purego | asm |
-| --- | --- | --- |
-| 5 B   |  979.66 MB/s |  1291.17 MB/s  |
-| 100 B | 7475.26 MB/s | 7973.40 MB/s  |
-| 4 KB  | 17573.46 MB/s | 17602.65 MB/s |
-| 10 MB | 17131.46 MB/s | 17142.16 MB/s |
+| input size | purego    | asm       |
+| ---------- | --------- | --------- |
+| 4 B        |  1.3 GB/s |  1.2 GB/s |
+| 16 B       |  2.9 GB/s |  3.5 GB/s |
+| 100 B      |  6.9 GB/s |  8.1 GB/s |
+| 4 KB       | 11.7 GB/s | 16.7 GB/s |
+| 10 MB      | 12.0 GB/s | 17.3 GB/s |
 
-These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using
-the following commands under Go 1.11.2:
+These numbers were generated on Ubuntu 20.04 with an Intel Xeon Platinum 8252C
+CPU using the following commands under Go 1.19.2:
 
 ```
-$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes'
-$ go test -benchtime 10s -bench '/xxhash,direct,bytes'
+benchstat <(go test -tags purego -benchtime 500ms -count 15 -bench 'Sum64$')
+benchstat <(go test -benchtime 500ms -count 15 -bench 'Sum64$')
 ```
 
 ## Projects using this package
 
 - [InfluxDB](https://github.com/influxdata/influxdb)
 - [Prometheus](https://github.com/prometheus/prometheus)
+- [VictoriaMetrics](https://github.com/VictoriaMetrics/VictoriaMetrics)
 - [FreeCache](https://github.com/coocood/freecache)
+- [FastCache](https://github.com/VictoriaMetrics/fastcache)
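
A small sketch of the `Sum64`/`Digest` API listed in the README above, using the upstream `github.com/cespare/xxhash/v2` import path rather than this vendored internal package:

```
package main

import (
	"fmt"

	"github.com/cespare/xxhash/v2"
)

func main() {
	// One-shot hashing of a string.
	fmt.Printf("%016x\n", xxhash.Sum64String("hello, world"))

	// Streaming hashing; Digest implements hash.Hash64.
	d := xxhash.New()
	d.WriteString("hello, ")
	d.WriteString("world")
	fmt.Printf("%016x\n", d.Sum64())
}
```
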
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
index 2c112a0..fc40c82 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
@@ -18,19 +18,11 @@
 	prime5 uint64 = 2870177450012600261
 )
 
-// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where
-// possible in the Go code is worth a small (but measurable) performance boost
-// by avoiding some MOVQs. Vars are needed for the asm and also are useful for
-// convenience in the Go code in a few places where we need to intentionally
-// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the
-// result overflows a uint64).
-var (
-	prime1v = prime1
-	prime2v = prime2
-	prime3v = prime3
-	prime4v = prime4
-	prime5v = prime5
-)
+// Store the primes in an array as well.
+//
+// The consts are used when possible in Go code to avoid MOVs but we need a
+// contiguous array for the assembly code.
+var primes = [...]uint64{prime1, prime2, prime3, prime4, prime5}
 
 // Digest implements hash.Hash64.
 type Digest struct {
@@ -52,10 +44,10 @@
 
 // Reset clears the Digest's state so that it can be reused.
 func (d *Digest) Reset() {
-	d.v1 = prime1v + prime2
+	d.v1 = primes[0] + prime2
 	d.v2 = prime2
 	d.v3 = 0
-	d.v4 = -prime1v
+	d.v4 = -primes[0]
 	d.total = 0
 	d.n = 0
 }
@@ -71,21 +63,23 @@
 	n = len(b)
 	d.total += uint64(n)
 
+	memleft := d.mem[d.n&(len(d.mem)-1):]
+
 	if d.n+n < 32 {
 		// This new data doesn't even fill the current block.
-		copy(d.mem[d.n:], b)
+		copy(memleft, b)
 		d.n += n
 		return
 	}
 
 	if d.n > 0 {
 		// Finish off the partial block.
-		copy(d.mem[d.n:], b)
+		c := copy(memleft, b)
 		d.v1 = round(d.v1, u64(d.mem[0:8]))
 		d.v2 = round(d.v2, u64(d.mem[8:16]))
 		d.v3 = round(d.v3, u64(d.mem[16:24]))
 		d.v4 = round(d.v4, u64(d.mem[24:32]))
-		b = b[32-d.n:]
+		b = b[c:]
 		d.n = 0
 	}
 
@@ -135,21 +129,20 @@
 
 	h += d.total
 
-	i, end := 0, d.n
-	for ; i+8 <= end; i += 8 {
-		k1 := round(0, u64(d.mem[i:i+8]))
+	b := d.mem[:d.n&(len(d.mem)-1)]
+	for ; len(b) >= 8; b = b[8:] {
+		k1 := round(0, u64(b[:8]))
 		h ^= k1
 		h = rol27(h)*prime1 + prime4
 	}
-	if i+4 <= end {
-		h ^= uint64(u32(d.mem[i:i+4])) * prime1
+	if len(b) >= 4 {
+		h ^= uint64(u32(b[:4])) * prime1
 		h = rol23(h)*prime2 + prime3
-		i += 4
+		b = b[4:]
 	}
-	for i < end {
-		h ^= uint64(d.mem[i]) * prime5
+	for ; len(b) > 0; b = b[1:] {
+		h ^= uint64(b[0]) * prime5
 		h = rol11(h) * prime1
-		i++
 	}
 
 	h ^= h >> 33
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
index cea1785..ddb63aa 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
@@ -1,3 +1,4 @@
+//go:build !appengine && gc && !purego && !noasm
 // +build !appengine
 // +build gc
 // +build !purego
@@ -5,212 +6,205 @@
 
 #include "textflag.h"
 
-// Register allocation:
-// AX	h
-// SI	pointer to advance through b
-// DX	n
-// BX	loop end
-// R8	v1, k1
-// R9	v2
-// R10	v3
-// R11	v4
-// R12	tmp
-// R13	prime1v
-// R14	prime2v
-// DI	prime4v
+// Registers:
+#define h      AX
+#define d      AX
+#define p      SI // pointer to advance through b
+#define n      DX
+#define end    BX // loop end
+#define v1     R8
+#define v2     R9
+#define v3     R10
+#define v4     R11
+#define x      R12
+#define prime1 R13
+#define prime2 R14
+#define prime4 DI
 
-// round reads from and advances the buffer pointer in SI.
-// It assumes that R13 has prime1v and R14 has prime2v.
-#define round(r) \
-	MOVQ  (SI), R12 \
-	ADDQ  $8, SI    \
-	IMULQ R14, R12  \
-	ADDQ  R12, r    \
-	ROLQ  $31, r    \
-	IMULQ R13, r
+#define round(acc, x) \
+	IMULQ prime2, x   \
+	ADDQ  x, acc      \
+	ROLQ  $31, acc    \
+	IMULQ prime1, acc
 
-// mergeRound applies a merge round on the two registers acc and val.
-// It assumes that R13 has prime1v, R14 has prime2v, and DI has prime4v.
-#define mergeRound(acc, val) \
-	IMULQ R14, val \
-	ROLQ  $31, val \
-	IMULQ R13, val \
-	XORQ  val, acc \
-	IMULQ R13, acc \
-	ADDQ  DI, acc
+// round0 performs the operation x = round(0, x).
+#define round0(x) \
+	IMULQ prime2, x \
+	ROLQ  $31, x    \
+	IMULQ prime1, x
+
+// mergeRound applies a merge round on the two registers acc and x.
+// It assumes that prime1, prime2, and prime4 have been loaded.
+#define mergeRound(acc, x) \
+	round0(x)         \
+	XORQ  x, acc      \
+	IMULQ prime1, acc \
+	ADDQ  prime4, acc
+
+// blockLoop processes as many 32-byte blocks as possible,
+// updating v1, v2, v3, and v4. It assumes that there is at least one block
+// to process.
+#define blockLoop() \
+loop:  \
+	MOVQ +0(p), x  \
+	round(v1, x)   \
+	MOVQ +8(p), x  \
+	round(v2, x)   \
+	MOVQ +16(p), x \
+	round(v3, x)   \
+	MOVQ +24(p), x \
+	round(v4, x)   \
+	ADDQ $32, p    \
+	CMPQ p, end    \
+	JLE  loop
 
 // func Sum64(b []byte) uint64
-TEXT ·Sum64(SB), NOSPLIT, $0-32
+TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
 	// Load fixed primes.
-	MOVQ ·prime1v(SB), R13
-	MOVQ ·prime2v(SB), R14
-	MOVQ ·prime4v(SB), DI
+	MOVQ ·primes+0(SB), prime1
+	MOVQ ·primes+8(SB), prime2
+	MOVQ ·primes+24(SB), prime4
 
 	// Load slice.
-	MOVQ b_base+0(FP), SI
-	MOVQ b_len+8(FP), DX
-	LEAQ (SI)(DX*1), BX
+	MOVQ b_base+0(FP), p
+	MOVQ b_len+8(FP), n
+	LEAQ (p)(n*1), end
 
 	// The first loop limit will be len(b)-32.
-	SUBQ $32, BX
+	SUBQ $32, end
 
 	// Check whether we have at least one block.
-	CMPQ DX, $32
+	CMPQ n, $32
 	JLT  noBlocks
 
 	// Set up initial state (v1, v2, v3, v4).
-	MOVQ R13, R8
-	ADDQ R14, R8
-	MOVQ R14, R9
-	XORQ R10, R10
-	XORQ R11, R11
-	SUBQ R13, R11
+	MOVQ prime1, v1
+	ADDQ prime2, v1
+	MOVQ prime2, v2
+	XORQ v3, v3
+	XORQ v4, v4
+	SUBQ prime1, v4
 
-	// Loop until SI > BX.
-blockLoop:
-	round(R8)
-	round(R9)
-	round(R10)
-	round(R11)
+	blockLoop()
 
-	CMPQ SI, BX
-	JLE  blockLoop
+	MOVQ v1, h
+	ROLQ $1, h
+	MOVQ v2, x
+	ROLQ $7, x
+	ADDQ x, h
+	MOVQ v3, x
+	ROLQ $12, x
+	ADDQ x, h
+	MOVQ v4, x
+	ROLQ $18, x
+	ADDQ x, h
 
-	MOVQ R8, AX
-	ROLQ $1, AX
-	MOVQ R9, R12
-	ROLQ $7, R12
-	ADDQ R12, AX
-	MOVQ R10, R12
-	ROLQ $12, R12
-	ADDQ R12, AX
-	MOVQ R11, R12
-	ROLQ $18, R12
-	ADDQ R12, AX
-
-	mergeRound(AX, R8)
-	mergeRound(AX, R9)
-	mergeRound(AX, R10)
-	mergeRound(AX, R11)
+	mergeRound(h, v1)
+	mergeRound(h, v2)
+	mergeRound(h, v3)
+	mergeRound(h, v4)
 
 	JMP afterBlocks
 
 noBlocks:
-	MOVQ ·prime5v(SB), AX
+	MOVQ ·primes+32(SB), h
 
 afterBlocks:
-	ADDQ DX, AX
+	ADDQ n, h
 
-	// Right now BX has len(b)-32, and we want to loop until SI > len(b)-8.
-	ADDQ $24, BX
+	ADDQ $24, end
+	CMPQ p, end
+	JG   try4
 
-	CMPQ SI, BX
-	JG   fourByte
+loop8:
+	MOVQ  (p), x
+	ADDQ  $8, p
+	round0(x)
+	XORQ  x, h
+	ROLQ  $27, h
+	IMULQ prime1, h
+	ADDQ  prime4, h
 
-wordLoop:
-	// Calculate k1.
-	MOVQ  (SI), R8
-	ADDQ  $8, SI
-	IMULQ R14, R8
-	ROLQ  $31, R8
-	IMULQ R13, R8
+	CMPQ p, end
+	JLE  loop8
 
-	XORQ  R8, AX
-	ROLQ  $27, AX
-	IMULQ R13, AX
-	ADDQ  DI, AX
+try4:
+	ADDQ $4, end
+	CMPQ p, end
+	JG   try1
 
-	CMPQ SI, BX
-	JLE  wordLoop
+	MOVL  (p), x
+	ADDQ  $4, p
+	IMULQ prime1, x
+	XORQ  x, h
 
-fourByte:
-	ADDQ $4, BX
-	CMPQ SI, BX
-	JG   singles
+	ROLQ  $23, h
+	IMULQ prime2, h
+	ADDQ  ·primes+16(SB), h
 
-	MOVL  (SI), R8
-	ADDQ  $4, SI
-	IMULQ R13, R8
-	XORQ  R8, AX
-
-	ROLQ  $23, AX
-	IMULQ R14, AX
-	ADDQ  ·prime3v(SB), AX
-
-singles:
-	ADDQ $4, BX
-	CMPQ SI, BX
+try1:
+	ADDQ $4, end
+	CMPQ p, end
 	JGE  finalize
 
-singlesLoop:
-	MOVBQZX (SI), R12
-	ADDQ    $1, SI
-	IMULQ   ·prime5v(SB), R12
-	XORQ    R12, AX
+loop1:
+	MOVBQZX (p), x
+	ADDQ    $1, p
+	IMULQ   ·primes+32(SB), x
+	XORQ    x, h
+	ROLQ    $11, h
+	IMULQ   prime1, h
 
-	ROLQ  $11, AX
-	IMULQ R13, AX
-
-	CMPQ SI, BX
-	JL   singlesLoop
+	CMPQ p, end
+	JL   loop1
 
 finalize:
-	MOVQ  AX, R12
-	SHRQ  $33, R12
-	XORQ  R12, AX
-	IMULQ R14, AX
-	MOVQ  AX, R12
-	SHRQ  $29, R12
-	XORQ  R12, AX
-	IMULQ ·prime3v(SB), AX
-	MOVQ  AX, R12
-	SHRQ  $32, R12
-	XORQ  R12, AX
+	MOVQ  h, x
+	SHRQ  $33, x
+	XORQ  x, h
+	IMULQ prime2, h
+	MOVQ  h, x
+	SHRQ  $29, x
+	XORQ  x, h
+	IMULQ ·primes+16(SB), h
+	MOVQ  h, x
+	SHRQ  $32, x
+	XORQ  x, h
 
-	MOVQ AX, ret+24(FP)
+	MOVQ h, ret+24(FP)
 	RET
 
-// writeBlocks uses the same registers as above except that it uses AX to store
-// the d pointer.
-
 // func writeBlocks(d *Digest, b []byte) int
-TEXT ·writeBlocks(SB), NOSPLIT, $0-40
+TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
 	// Load fixed primes needed for round.
-	MOVQ ·prime1v(SB), R13
-	MOVQ ·prime2v(SB), R14
+	MOVQ ·primes+0(SB), prime1
+	MOVQ ·primes+8(SB), prime2
 
 	// Load slice.
-	MOVQ b_base+8(FP), SI
-	MOVQ b_len+16(FP), DX
-	LEAQ (SI)(DX*1), BX
-	SUBQ $32, BX
+	MOVQ b_base+8(FP), p
+	MOVQ b_len+16(FP), n
+	LEAQ (p)(n*1), end
+	SUBQ $32, end
 
 	// Load vN from d.
-	MOVQ d+0(FP), AX
-	MOVQ 0(AX), R8   // v1
-	MOVQ 8(AX), R9   // v2
-	MOVQ 16(AX), R10 // v3
-	MOVQ 24(AX), R11 // v4
+	MOVQ s+0(FP), d
+	MOVQ 0(d), v1
+	MOVQ 8(d), v2
+	MOVQ 16(d), v3
+	MOVQ 24(d), v4
 
 	// We don't need to check the loop condition here; this function is
 	// always called with at least one block of data to process.
-blockLoop:
-	round(R8)
-	round(R9)
-	round(R10)
-	round(R11)
-
-	CMPQ SI, BX
-	JLE  blockLoop
+	blockLoop()
 
 	// Copy vN back to d.
-	MOVQ R8, 0(AX)
-	MOVQ R9, 8(AX)
-	MOVQ R10, 16(AX)
-	MOVQ R11, 24(AX)
+	MOVQ v1, 0(d)
+	MOVQ v2, 8(d)
+	MOVQ v3, 16(d)
+	MOVQ v4, 24(d)
 
-	// The number of bytes written is SI minus the old base pointer.
-	SUBQ b_base+8(FP), SI
-	MOVQ SI, ret+32(FP)
+	// The number of bytes written is p minus the old base pointer.
+	SUBQ b_base+8(FP), p
+	MOVQ p, ret+32(FP)
 
 	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
index 4d64a17..ae7d4d3 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_arm64.s
@@ -1,13 +1,17 @@
-// +build gc,!purego,!noasm
+//go:build !appengine && gc && !purego && !noasm
+// +build !appengine
+// +build gc
+// +build !purego
+// +build !noasm
 
 #include "textflag.h"
 
-// Register allocation.
+// Registers:
 #define digest	R1
-#define h	R2 // Return value.
-#define p	R3 // Input pointer.
-#define len	R4
-#define nblocks	R5 // len / 32.
+#define h	R2 // return value
+#define p	R3 // input pointer
+#define n	R4 // input length
+#define nblocks	R5 // n / 32
 #define prime1	R7
 #define prime2	R8
 #define prime3	R9
@@ -25,60 +29,52 @@
 #define round(acc, x) \
 	MADD prime2, acc, x, acc \
 	ROR  $64-31, acc         \
-	MUL  prime1, acc         \
+	MUL  prime1, acc
 
-// x = round(0, x).
+// round0 performs the operation x = round(0, x).
 #define round0(x) \
 	MUL prime2, x \
 	ROR $64-31, x \
-	MUL prime1, x \
+	MUL prime1, x
 
-#define mergeRound(x) \
-	round0(x)                 \
-	EOR  x, h                 \
-	MADD h, prime4, prime1, h \
+#define mergeRound(acc, x) \
+	round0(x)                     \
+	EOR  x, acc                   \
+	MADD acc, prime4, prime1, acc
 
-// Update v[1-4] with 32-byte blocks. Assumes len >= 32.
-#define blocksLoop() \
-	LSR     $5, len, nblocks \
-	PCALIGN $16              \
-	loop:                    \
-	LDP.P   32(p), (x1, x2)  \
-	round(v1, x1)            \
-	LDP     -16(p), (x3, x4) \
-	round(v2, x2)            \
-	SUB     $1, nblocks      \
-	round(v3, x3)            \
-	round(v4, x4)            \
-	CBNZ    nblocks, loop    \
-
-// The primes are repeated here to ensure that they're stored
-// in a contiguous array, so we can load them with LDP.
-DATA primes<> +0(SB)/8, $11400714785074694791
-DATA primes<> +8(SB)/8, $14029467366897019727
-DATA primes<>+16(SB)/8, $1609587929392839161
-DATA primes<>+24(SB)/8, $9650029242287828579
-DATA primes<>+32(SB)/8, $2870177450012600261
-GLOBL primes<>(SB), NOPTR+RODATA, $40
+// blockLoop processes as many 32-byte blocks as possible,
+// updating v1, v2, v3, and v4. It assumes that n >= 32.
+#define blockLoop() \
+	LSR     $5, n, nblocks  \
+	PCALIGN $16             \
+	loop:                   \
+	LDP.P   16(p), (x1, x2) \
+	LDP.P   16(p), (x3, x4) \
+	round(v1, x1)           \
+	round(v2, x2)           \
+	round(v3, x3)           \
+	round(v4, x4)           \
+	SUB     $1, nblocks     \
+	CBNZ    nblocks, loop
 
 // func Sum64(b []byte) uint64
-TEXT ·Sum64(SB), NOFRAME+NOSPLIT, $0-32
-	LDP b_base+0(FP), (p, len)
+TEXT ·Sum64(SB), NOSPLIT|NOFRAME, $0-32
+	LDP b_base+0(FP), (p, n)
 
-	LDP  primes<> +0(SB), (prime1, prime2)
-	LDP  primes<>+16(SB), (prime3, prime4)
-	MOVD primes<>+32(SB), prime5
+	LDP  ·primes+0(SB), (prime1, prime2)
+	LDP  ·primes+16(SB), (prime3, prime4)
+	MOVD ·primes+32(SB), prime5
 
-	CMP  $32, len
-	CSEL LO, prime5, ZR, h // if len < 32 { h = prime5 } else { h = 0 }
-	BLO  afterLoop
+	CMP  $32, n
+	CSEL LT, prime5, ZR, h // if n < 32 { h = prime5 } else { h = 0 }
+	BLT  afterLoop
 
 	ADD  prime1, prime2, v1
 	MOVD prime2, v2
 	MOVD $0, v3
 	NEG  prime1, v4
 
-	blocksLoop()
+	blockLoop()
 
 	ROR $64-1, v1, x1
 	ROR $64-7, v2, x2
@@ -88,71 +84,75 @@
 	ADD x3, x4
 	ADD x2, x4, h
 
-	mergeRound(v1)
-	mergeRound(v2)
-	mergeRound(v3)
-	mergeRound(v4)
+	mergeRound(h, v1)
+	mergeRound(h, v2)
+	mergeRound(h, v3)
+	mergeRound(h, v4)
 
 afterLoop:
-	ADD len, h
+	ADD n, h
 
-	TBZ   $4, len, try8
+	TBZ   $4, n, try8
 	LDP.P 16(p), (x1, x2)
 
 	round0(x1)
+
+	// NOTE: here and below, sequencing the EOR after the ROR (using a
+	// rotated register) is worth a small but measurable speedup for small
+	// inputs.
 	ROR  $64-27, h
 	EOR  x1 @> 64-27, h, h
 	MADD h, prime4, prime1, h
 
 	round0(x2)
 	ROR  $64-27, h
-	EOR  x2 @> 64-27, h
+	EOR  x2 @> 64-27, h, h
 	MADD h, prime4, prime1, h
 
 try8:
-	TBZ    $3, len, try4
+	TBZ    $3, n, try4
 	MOVD.P 8(p), x1
 
 	round0(x1)
 	ROR  $64-27, h
-	EOR  x1 @> 64-27, h
+	EOR  x1 @> 64-27, h, h
 	MADD h, prime4, prime1, h
 
 try4:
-	TBZ     $2, len, try2
+	TBZ     $2, n, try2
 	MOVWU.P 4(p), x2
 
 	MUL  prime1, x2
 	ROR  $64-23, h
-	EOR  x2 @> 64-23, h
+	EOR  x2 @> 64-23, h, h
 	MADD h, prime3, prime2, h
 
 try2:
-	TBZ     $1, len, try1
+	TBZ     $1, n, try1
 	MOVHU.P 2(p), x3
 	AND     $255, x3, x1
 	LSR     $8, x3, x2
 
 	MUL prime5, x1
 	ROR $64-11, h
-	EOR x1 @> 64-11, h
+	EOR x1 @> 64-11, h, h
 	MUL prime1, h
 
 	MUL prime5, x2
 	ROR $64-11, h
-	EOR x2 @> 64-11, h
+	EOR x2 @> 64-11, h, h
 	MUL prime1, h
 
 try1:
-	TBZ   $0, len, end
+	TBZ   $0, n, finalize
 	MOVBU (p), x4
 
 	MUL prime5, x4
 	ROR $64-11, h
-	EOR x4 @> 64-11, h
+	EOR x4 @> 64-11, h, h
 	MUL prime1, h
 
-end:
+finalize:
 	EOR h >> 33, h
 	MUL prime2, h
 	EOR h >> 29, h
@@ -162,25 +162,23 @@
 	MOVD h, ret+24(FP)
 	RET
 
-// func writeBlocks(d *Digest, b []byte) int
-//
-// Assumes len(b) >= 32.
-TEXT ·writeBlocks(SB), NOFRAME+NOSPLIT, $0-40
-	LDP primes<>(SB), (prime1, prime2)
+// func writeBlocks(s *Digest, b []byte) int
+TEXT ·writeBlocks(SB), NOSPLIT|NOFRAME, $0-40
+	LDP ·primes+0(SB), (prime1, prime2)
 
 	// Load state. Assume v[1-4] are stored contiguously.
-	MOVD d+0(FP), digest
+	MOVD s+0(FP), digest
 	LDP  0(digest), (v1, v2)
 	LDP  16(digest), (v3, v4)
 
-	LDP b_base+8(FP), (p, len)
+	LDP b_base+8(FP), (p, n)
 
-	blocksLoop()
+	blockLoop()
 
 	// Store updated state.
 	STP (v1, v2), 0(digest)
 	STP (v3, v4), 16(digest)
 
-	BIC  $31, len
-	MOVD len, ret+32(FP)
+	BIC  $31, n
+	MOVD n, ret+32(FP)
 	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
index 1a1fac9..d4221ed 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_asm.go
@@ -13,4 +13,4 @@
 func Sum64(b []byte) uint64
 
 //go:noescape
-func writeBlocks(d *Digest, b []byte) int
+func writeBlocks(s *Digest, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
index 209cb4a..0be16ce 100644
--- a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
@@ -15,10 +15,10 @@
 	var h uint64
 
 	if n >= 32 {
-		v1 := prime1v + prime2
+		v1 := primes[0] + prime2
 		v2 := prime2
 		v3 := uint64(0)
-		v4 := -prime1v
+		v4 := -primes[0]
 		for len(b) >= 32 {
 			v1 = round(v1, u64(b[0:8:len(b)]))
 			v2 = round(v2, u64(b[8:16:len(b)]))
@@ -37,19 +37,18 @@
 
 	h += uint64(n)
 
-	i, end := 0, len(b)
-	for ; i+8 <= end; i += 8 {
-		k1 := round(0, u64(b[i:i+8:len(b)]))
+	for ; len(b) >= 8; b = b[8:] {
+		k1 := round(0, u64(b[:8]))
 		h ^= k1
 		h = rol27(h)*prime1 + prime4
 	}
-	if i+4 <= end {
-		h ^= uint64(u32(b[i:i+4:len(b)])) * prime1
+	if len(b) >= 4 {
+		h ^= uint64(u32(b[:4])) * prime1
 		h = rol23(h)*prime2 + prime3
-		i += 4
+		b = b[4:]
 	}
-	for ; i < end; i++ {
-		h ^= uint64(b[i]) * prime5
+	for ; len(b) > 0; b = b[1:] {
+		h ^= uint64(b[0]) * prime5
 		h = rol11(h) * prime1
 	}
 
diff --git a/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.go b/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.go
new file mode 100644
index 0000000..f41932b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.go
@@ -0,0 +1,16 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+
+package zstd
+
+// matchLen returns how many bytes match in a and b
+//
+// It assumes that:
+//
+//	len(a) <= len(b) and len(a) > 0
+//
+//go:noescape
+func matchLen(a []byte, b []byte) int
diff --git a/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.s b/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.s
new file mode 100644
index 0000000..0782b86
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/matchlen_amd64.s
@@ -0,0 +1,66 @@
+// Copied from S2 implementation.
+
+//go:build !appengine && !noasm && gc && !noasm
+
+#include "textflag.h"
+
+// func matchLen(a []byte, b []byte) int
+TEXT ·matchLen(SB), NOSPLIT, $0-56
+	MOVQ a_base+0(FP), AX
+	MOVQ b_base+24(FP), CX
+	MOVQ a_len+8(FP), DX
+
+	// matchLen
+	XORL SI, SI
+	CMPL DX, $0x08
+	JB   matchlen_match4_standalone
+
+matchlen_loopback_standalone:
+	MOVQ (AX)(SI*1), BX
+	XORQ (CX)(SI*1), BX
+	JZ   matchlen_loop_standalone
+
+#ifdef GOAMD64_v3
+	TZCNTQ BX, BX
+#else
+	BSFQ BX, BX
+#endif
+	SHRL $0x03, BX
+	LEAL (SI)(BX*1), SI
+	JMP  gen_match_len_end
+
+matchlen_loop_standalone:
+	LEAL -8(DX), DX
+	LEAL 8(SI), SI
+	CMPL DX, $0x08
+	JAE  matchlen_loopback_standalone
+
+matchlen_match4_standalone:
+	CMPL DX, $0x04
+	JB   matchlen_match2_standalone
+	MOVL (AX)(SI*1), BX
+	CMPL (CX)(SI*1), BX
+	JNE  matchlen_match2_standalone
+	LEAL -4(DX), DX
+	LEAL 4(SI), SI
+
+matchlen_match2_standalone:
+	CMPL DX, $0x02
+	JB   matchlen_match1_standalone
+	MOVW (AX)(SI*1), BX
+	CMPW (CX)(SI*1), BX
+	JNE  matchlen_match1_standalone
+	LEAL -2(DX), DX
+	LEAL 2(SI), SI
+
+matchlen_match1_standalone:
+	CMPL DX, $0x01
+	JB   gen_match_len_end
+	MOVB (AX)(SI*1), BL
+	CMPB (CX)(SI*1), BL
+	JNE  gen_match_len_end
+	INCL SI
+
+gen_match_len_end:
+	MOVQ SI, ret+48(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go b/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go
new file mode 100644
index 0000000..bea1779
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/matchlen_generic.go
@@ -0,0 +1,38 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+
+package zstd
+
+import (
+	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
+)
+
+// matchLen returns the maximum common prefix length of a and b.
+// a must be the shortest of the two.
+func matchLen(a, b []byte) (n int) {
+	left := len(a)
+	for left >= 8 {
+		diff := le.Load64(a, n) ^ le.Load64(b, n)
+		if diff != 0 {
+			return n + bits.TrailingZeros64(diff)>>3
+		}
+		n += 8
+		left -= 8
+	}
+	a = a[n:]
+	b = b[n:]
+
+	for i := range a {
+		if a[i] != b[i] {
+			break
+		}
+		n++
+	}
+	return n
+
+}
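matchLen (and this generic fallback) returns the length of the common prefix of a and b, comparing eight bytes at a time and using the trailing zero count of the XOR to locate the first differing byte. Below is a standalone sketch of the same approach; it substitutes encoding/binary for the internal le helpers and uses the made-up name commonPrefixLen, so it is an illustration rather than the package's code.

// Standalone sketch of the prefix-match technique used by matchLen above.
package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// commonPrefixLen returns the length of the common prefix of a and b.
// As with matchLen, a must not be longer than b.
func commonPrefixLen(a, b []byte) (n int) {
	left := len(a)
	for left >= 8 {
		diff := binary.LittleEndian.Uint64(a[n:]) ^ binary.LittleEndian.Uint64(b[n:])
		if diff != 0 {
			// The lowest set bit of the XOR marks the first differing byte.
			return n + bits.TrailingZeros64(diff)>>3
		}
		n += 8
		left -= 8
	}
	a, b = a[n:], b[n:]
	for i := range a {
		if a[i] != b[i] {
			break
		}
		n++
	}
	return n
}

func main() {
	fmt.Println(commonPrefixLen([]byte("zstandard"), []byte("zstd rocks"))) // prints 3
}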
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
index df04472..9a7de82 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -99,6 +99,21 @@
 	return nil
 }
 
+func (s *sequenceDecs) freeDecoders() {
+	if f := s.litLengths.fse; f != nil && !f.preDefined {
+		fseDecoderPool.Put(f)
+		s.litLengths.fse = nil
+	}
+	if f := s.offsets.fse; f != nil && !f.preDefined {
+		fseDecoderPool.Put(f)
+		s.offsets.fse = nil
+	}
+	if f := s.matchLengths.fse; f != nil && !f.preDefined {
+		fseDecoderPool.Put(f)
+		s.matchLengths.fse = nil
+	}
+}
+
 // execute will execute the decoded sequence with the provided history.
 // The sequence must be evaluated before being sent.
 func (s *sequenceDecs) execute(seqs []seqVals, hist []byte) error {
@@ -221,13 +236,16 @@
 		maxBlockSize = s.windowSize
 	}
 
+	if debugDecoder {
+		println("decodeSync: decoding", seqs, "sequences", br.remain(), "bits remain on stream")
+	}
 	for i := seqs - 1; i >= 0; i-- {
 		if br.overread() {
-			printf("reading sequence %d, exceeded available data\n", seqs-i)
+			printf("reading sequence %d, exceeded available data. Overread by %d\n", seqs-i, -br.remain())
 			return io.ErrUnexpectedEOF
 		}
 		var ll, mo, ml int
-		if br.off > 4+((maxOffsetBits+16+16)>>3) {
+		if br.cursor > 4+((maxOffsetBits+16+16)>>3) {
 			// inlined function:
 			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
 
@@ -299,7 +317,7 @@
 		}
 		size := ll + ml + len(out)
 		if size-startSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
+			return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
 		}
 		if size > cap(out) {
 			// Not enough size, which can happen under high volume block streaming conditions
@@ -409,9 +427,8 @@
 		}
 	}
 
-	// Check if space for literals
-	if size := len(s.literals) + len(s.out) - startSize; size > maxBlockSize {
-		return fmt.Errorf("output (%d) bigger than max block size (%d)", size, maxBlockSize)
+	if size := len(s.literals) + len(out) - startSize; size > maxBlockSize {
+		return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
 	}
 
 	// Add final literals
@@ -435,18 +452,13 @@
 
 	// extra bits are stored in reverse order.
 	br.fill()
-	if s.maxBits <= 32 {
-		mo += br.getBits(moB)
-		ml += br.getBits(mlB)
-		ll += br.getBits(llB)
-	} else {
-		mo += br.getBits(moB)
+	mo += br.getBits(moB)
+	if s.maxBits > 32 {
 		br.fill()
-		// matchlength+literal length, max 32 bits
-		ml += br.getBits(mlB)
-		ll += br.getBits(llB)
-
 	}
+	// matchlength+literal length, max 32 bits
+	ml += br.getBits(mlB)
+	ll += br.getBits(llB)
 	mo = s.adjustOffset(mo, ll, moB)
 	return
 }
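The new freeDecoders helper above returns the litLengths, offsets, and matchLengths FSE decoders to fseDecoderPool once a block is done, but only when they are not the shared predefined tables. Assuming fseDecoderPool is a sync.Pool (the pool type itself is not shown in this patch), the pattern looks roughly like the sketch below.

// Sketch of the pool-release pattern in freeDecoders; the types here are
// assumed for illustration, not the package's actual definitions.
package main

import (
	"fmt"
	"sync"
)

type fseDecoder struct {
	preDefined bool // shared predefined tables must never be pooled
}

var fseDecoderPool = sync.Pool{New: func() any { return &fseDecoder{} }}

type sequenceDec struct{ fse *fseDecoder }

func (s *sequenceDec) free() {
	if f := s.fse; f != nil && !f.preDefined {
		fseDecoderPool.Put(f)
		s.fse = nil
	}
}

func main() {
	d := sequenceDec{fse: fseDecoderPool.Get().(*fseDecoder)}
	d.free()
	fmt.Println(d.fse == nil) // true: the decoder went back to the pool
}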
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
index 7598c10..c59f17e 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.go
@@ -5,6 +5,7 @@
 
 import (
 	"fmt"
+	"io"
 
 	"github.com/klauspost/compress/internal/cpuinfo"
 )
@@ -32,18 +33,22 @@
 // sequenceDecs_decodeSync_amd64 implements the main loop of sequenceDecs.decodeSync in x86 asm.
 //
 // Please refer to seqdec_generic.go for the reference implementation.
+//
 //go:noescape
 func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 
 // sequenceDecs_decodeSync_bmi2 implements the main loop of sequenceDecs.decodeSync in x86 asm with BMI2 extensions.
+//
 //go:noescape
 func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 
 // sequenceDecs_decodeSync_safe_amd64 does the same as above, but does not write more than output buffer.
+//
 //go:noescape
 func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 
 // sequenceDecs_decodeSync_safe_bmi2 does the same as above, but does not write more than output buffer.
+//
 //go:noescape
 func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 
@@ -130,20 +135,23 @@
 		return true, fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available",
 			ctx.ll, ctx.litRemain+ctx.ll)
 
+	case errorOverread:
+		return true, io.ErrUnexpectedEOF
+
 	case errorNotEnoughSpace:
 		size := ctx.outPosition + ctx.ll + ctx.ml
 		if debugDecoder {
 			println("msl:", s.maxSyncLen, "cap", cap(s.out), "bef:", startSize, "sz:", size-startSize, "mbs:", maxBlockSize, "outsz:", cap(s.out)-startSize)
 		}
-		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", size-startSize, maxBlockSize)
+		return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
 
 	default:
-		return true, fmt.Errorf("sequenceDecs_decode returned erronous code %d", errCode)
+		return true, fmt.Errorf("sequenceDecs_decode returned erroneous code %d", errCode)
 	}
 
 	s.seqSize += ctx.litRemain
 	if s.seqSize > maxBlockSize {
-		return true, fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+		return true, fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
 	}
 	err := br.close()
 	if err != nil {
@@ -198,23 +206,30 @@
 // error reported when capacity of `out` is too small
 const errorNotEnoughSpace = 5
 
+// error reported when bits are overread.
+const errorOverread = 6
+
 // sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
 //
 // Please refer to seqdec_generic.go for the reference implementation.
+//
 //go:noescape
 func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 
 // sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm.
 //
 // Please refer to seqdec_generic.go for the reference implementation.
+//
 //go:noescape
 func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 
 // sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//
 //go:noescape
 func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 
 // sequenceDecs_decode implements the main loop of sequenceDecs in x86 asm with BMI2 extensions.
+//
 //go:noescape
 func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 
@@ -239,6 +254,10 @@
 		litRemain: len(s.literals),
 	}
 
+	if debugDecoder {
+		println("decode: decoding", len(seqs), "sequences", br.remain(), "bits remain on stream")
+	}
+
 	s.seqSize = 0
 	lte56bits := s.maxBits+s.offsets.fse.actualTableLog+s.matchLengths.fse.actualTableLog+s.litLengths.fse.actualTableLog <= 56
 	var errCode int
@@ -269,9 +288,11 @@
 		case errorNotEnoughLiterals:
 			ll := ctx.seqs[i].ll
 			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, ctx.litRemain+ll)
+		case errorOverread:
+			return io.ErrUnexpectedEOF
 		}
 
-		return fmt.Errorf("sequenceDecs_decode_amd64 returned erronous code %d", errCode)
+		return fmt.Errorf("sequenceDecs_decode_amd64 returned erroneous code %d", errCode)
 	}
 
 	if ctx.litRemain < 0 {
@@ -281,7 +302,10 @@
 
 	s.seqSize += ctx.litRemain
 	if s.seqSize > maxBlockSize {
-		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+		return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
+	}
+	if debugDecoder {
+		println("decode: ", br.remain(), "bits remain on stream. code:", errCode)
 	}
 	err := br.close()
 	if err != nil {
@@ -308,10 +332,12 @@
 // Returns false if a match offset is too big.
 //
 // Please refer to seqdec_generic.go for the reference implementation.
+//
 //go:noescape
 func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
 
 // Same as above, but with safe memcopies
+//
 //go:noescape
 func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
 
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
index 27e7677..a708ca6 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
@@ -1,16 +1,15 @@
 // Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
 
 //go:build !appengine && !noasm && gc && !noasm
-// +build !appengine,!noasm,gc,!noasm
 
 // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 // Requires: CMOV
 TEXT ·sequenceDecs_decode_amd64(SB), $8-32
-	MOVQ    br+8(FP), AX
-	MOVQ    32(AX), DX
-	MOVBQZX 40(AX), BX
-	MOVQ    24(AX), SI
-	MOVQ    (AX), AX
+	MOVQ    br+8(FP), CX
+	MOVQ    24(CX), DX
+	MOVBQZX 40(CX), BX
+	MOVQ    (CX), AX
+	MOVQ    32(CX), SI
 	ADDQ    SI, AX
 	MOVQ    AX, (SP)
 	MOVQ    ctx+16(FP), AX
@@ -39,7 +38,7 @@
 
 sequenceDecs_decode_amd64_fill_byte_by_byte:
 	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decode_amd64_fill_end
+	JLE     sequenceDecs_decode_amd64_fill_check_overread
 	CMPQ    BX, $0x07
 	JLE     sequenceDecs_decode_amd64_fill_end
 	SHLQ    $0x08, DX
@@ -50,6 +49,10 @@
 	ORQ     AX, DX
 	JMP     sequenceDecs_decode_amd64_fill_byte_by_byte
 
+sequenceDecs_decode_amd64_fill_check_overread:
+	CMPQ BX, $0x40
+	JA   error_overread
+
 sequenceDecs_decode_amd64_fill_end:
 	// Update offset
 	MOVQ  R9, AX
@@ -106,7 +109,7 @@
 
 sequenceDecs_decode_amd64_fill_2_byte_by_byte:
 	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decode_amd64_fill_2_end
+	JLE     sequenceDecs_decode_amd64_fill_2_check_overread
 	CMPQ    BX, $0x07
 	JLE     sequenceDecs_decode_amd64_fill_2_end
 	SHLQ    $0x08, DX
@@ -117,6 +120,10 @@
 	ORQ     AX, DX
 	JMP     sequenceDecs_decode_amd64_fill_2_byte_by_byte
 
+sequenceDecs_decode_amd64_fill_2_check_overread:
+	CMPQ BX, $0x40
+	JA   error_overread
+
 sequenceDecs_decode_amd64_fill_2_end:
 	// Update literal length
 	MOVQ  DI, AX
@@ -150,8 +157,7 @@
 
 	// Update Literal Length State
 	MOVBQZX DI, R14
-	SHRQ    $0x10, DI
-	MOVWQZX DI, DI
+	SHRL    $0x10, DI
 	LEAQ    (BX)(R14*1), CX
 	MOVQ    DX, R15
 	MOVQ    CX, BX
@@ -170,8 +176,7 @@
 
 	// Update Match Length State
 	MOVBQZX R8, R14
-	SHRQ    $0x10, R8
-	MOVWQZX R8, R8
+	SHRL    $0x10, R8
 	LEAQ    (BX)(R14*1), CX
 	MOVQ    DX, R15
 	MOVQ    CX, BX
@@ -190,8 +195,7 @@
 
 	// Update Offset State
 	MOVBQZX R9, R14
-	SHRQ    $0x10, R9
-	MOVWQZX R9, R9
+	SHRL    $0x10, R9
 	LEAQ    (BX)(R14*1), CX
 	MOVQ    DX, R15
 	MOVQ    CX, BX
@@ -294,9 +298,9 @@
 	MOVQ R12, 152(AX)
 	MOVQ R13, 160(AX)
 	MOVQ br+8(FP), AX
-	MOVQ DX, 32(AX)
+	MOVQ DX, 24(AX)
 	MOVB BL, 40(AX)
-	MOVQ SI, 24(AX)
+	MOVQ SI, 32(AX)
 
 	// Return success
 	MOVQ $0x00000000, ret+24(FP)
@@ -321,18 +325,19 @@
 	MOVQ $0x00000004, ret+24(FP)
 	RET
 
-	// Return with not enough output space error
-	MOVQ $0x00000005, ret+24(FP)
+	// Return with overread error
+error_overread:
+	MOVQ $0x00000006, ret+24(FP)
 	RET
 
 // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 // Requires: CMOV
 TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
-	MOVQ    br+8(FP), AX
-	MOVQ    32(AX), DX
-	MOVBQZX 40(AX), BX
-	MOVQ    24(AX), SI
-	MOVQ    (AX), AX
+	MOVQ    br+8(FP), CX
+	MOVQ    24(CX), DX
+	MOVBQZX 40(CX), BX
+	MOVQ    (CX), AX
+	MOVQ    32(CX), SI
 	ADDQ    SI, AX
 	MOVQ    AX, (SP)
 	MOVQ    ctx+16(FP), AX
@@ -361,7 +366,7 @@
 
 sequenceDecs_decode_56_amd64_fill_byte_by_byte:
 	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decode_56_amd64_fill_end
+	JLE     sequenceDecs_decode_56_amd64_fill_check_overread
 	CMPQ    BX, $0x07
 	JLE     sequenceDecs_decode_56_amd64_fill_end
 	SHLQ    $0x08, DX
@@ -372,6 +377,10 @@
 	ORQ     AX, DX
 	JMP     sequenceDecs_decode_56_amd64_fill_byte_by_byte
 
+sequenceDecs_decode_56_amd64_fill_check_overread:
+	CMPQ BX, $0x40
+	JA   error_overread
+
 sequenceDecs_decode_56_amd64_fill_end:
 	// Update offset
 	MOVQ  R9, AX
@@ -447,8 +456,7 @@
 
 	// Update Literal Length State
 	MOVBQZX DI, R14
-	SHRQ    $0x10, DI
-	MOVWQZX DI, DI
+	SHRL    $0x10, DI
 	LEAQ    (BX)(R14*1), CX
 	MOVQ    DX, R15
 	MOVQ    CX, BX
@@ -467,8 +475,7 @@
 
 	// Update Match Length State
 	MOVBQZX R8, R14
-	SHRQ    $0x10, R8
-	MOVWQZX R8, R8
+	SHRL    $0x10, R8
 	LEAQ    (BX)(R14*1), CX
 	MOVQ    DX, R15
 	MOVQ    CX, BX
@@ -487,8 +494,7 @@
 
 	// Update Offset State
 	MOVBQZX R9, R14
-	SHRQ    $0x10, R9
-	MOVWQZX R9, R9
+	SHRL    $0x10, R9
 	LEAQ    (BX)(R14*1), CX
 	MOVQ    DX, R15
 	MOVQ    CX, BX
@@ -591,9 +597,9 @@
 	MOVQ R12, 152(AX)
 	MOVQ R13, 160(AX)
 	MOVQ br+8(FP), AX
-	MOVQ DX, 32(AX)
+	MOVQ DX, 24(AX)
 	MOVB BL, 40(AX)
-	MOVQ SI, 24(AX)
+	MOVQ SI, 32(AX)
 
 	// Return success
 	MOVQ $0x00000000, ret+24(FP)
@@ -618,18 +624,19 @@
 	MOVQ $0x00000004, ret+24(FP)
 	RET
 
-	// Return with not enough output space error
-	MOVQ $0x00000005, ret+24(FP)
+	// Return with overread error
+error_overread:
+	MOVQ $0x00000006, ret+24(FP)
 	RET
 
 // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 // Requires: BMI, BMI2, CMOV
 TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
-	MOVQ    br+8(FP), CX
-	MOVQ    32(CX), AX
-	MOVBQZX 40(CX), DX
-	MOVQ    24(CX), BX
-	MOVQ    (CX), CX
+	MOVQ    br+8(FP), BX
+	MOVQ    24(BX), AX
+	MOVBQZX 40(BX), DX
+	MOVQ    (BX), CX
+	MOVQ    32(BX), BX
 	ADDQ    BX, CX
 	MOVQ    CX, (SP)
 	MOVQ    ctx+16(FP), CX
@@ -658,7 +665,7 @@
 
 sequenceDecs_decode_bmi2_fill_byte_by_byte:
 	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decode_bmi2_fill_end
+	JLE     sequenceDecs_decode_bmi2_fill_check_overread
 	CMPQ    DX, $0x07
 	JLE     sequenceDecs_decode_bmi2_fill_end
 	SHLQ    $0x08, AX
@@ -669,6 +676,10 @@
 	ORQ     CX, AX
 	JMP     sequenceDecs_decode_bmi2_fill_byte_by_byte
 
+sequenceDecs_decode_bmi2_fill_check_overread:
+	CMPQ DX, $0x40
+	JA   error_overread
+
 sequenceDecs_decode_bmi2_fill_end:
 	// Update offset
 	MOVQ   $0x00000808, CX
@@ -709,7 +720,7 @@
 
 sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
 	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decode_bmi2_fill_2_end
+	JLE     sequenceDecs_decode_bmi2_fill_2_check_overread
 	CMPQ    DX, $0x07
 	JLE     sequenceDecs_decode_bmi2_fill_2_end
 	SHLQ    $0x08, AX
@@ -720,6 +731,10 @@
 	ORQ     CX, AX
 	JMP     sequenceDecs_decode_bmi2_fill_2_byte_by_byte
 
+sequenceDecs_decode_bmi2_fill_2_check_overread:
+	CMPQ DX, $0x40
+	JA   error_overread
+
 sequenceDecs_decode_bmi2_fill_2_end:
 	// Update literal length
 	MOVQ   $0x00000808, CX
@@ -751,11 +766,10 @@
 	BZHIQ   R14, R15, R15
 
 	// Update Offset State
-	BZHIQ  R8, R15, CX
-	SHRXQ  R8, R15, R15
-	MOVQ   $0x00001010, R14
-	BEXTRQ R14, R8, R8
-	ADDQ   CX, R8
+	BZHIQ R8, R15, CX
+	SHRXQ R8, R15, R15
+	SHRL  $0x10, R8
+	ADDQ  CX, R8
 
 	// Load ctx.ofTable
 	MOVQ ctx+16(FP), CX
@@ -763,11 +777,10 @@
 	MOVQ (CX)(R8*8), R8
 
 	// Update Match Length State
-	BZHIQ  DI, R15, CX
-	SHRXQ  DI, R15, R15
-	MOVQ   $0x00001010, R14
-	BEXTRQ R14, DI, DI
-	ADDQ   CX, DI
+	BZHIQ DI, R15, CX
+	SHRXQ DI, R15, R15
+	SHRL  $0x10, DI
+	ADDQ  CX, DI
 
 	// Load ctx.mlTable
 	MOVQ ctx+16(FP), CX
@@ -775,10 +788,9 @@
 	MOVQ (CX)(DI*8), DI
 
 	// Update Literal Length State
-	BZHIQ  SI, R15, CX
-	MOVQ   $0x00001010, R14
-	BEXTRQ R14, SI, SI
-	ADDQ   CX, SI
+	BZHIQ SI, R15, CX
+	SHRL  $0x10, SI
+	ADDQ  CX, SI
 
 	// Load ctx.llTable
 	MOVQ ctx+16(FP), CX
@@ -871,9 +883,9 @@
 	MOVQ R11, 152(CX)
 	MOVQ R12, 160(CX)
 	MOVQ br+8(FP), CX
-	MOVQ AX, 32(CX)
+	MOVQ AX, 24(CX)
 	MOVB DL, 40(CX)
-	MOVQ BX, 24(CX)
+	MOVQ BX, 32(CX)
 
 	// Return success
 	MOVQ $0x00000000, ret+24(FP)
@@ -898,18 +910,19 @@
 	MOVQ $0x00000004, ret+24(FP)
 	RET
 
-	// Return with not enough output space error
-	MOVQ $0x00000005, ret+24(FP)
+	// Return with overread error
+error_overread:
+	MOVQ $0x00000006, ret+24(FP)
 	RET
 
 // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
 // Requires: BMI, BMI2, CMOV
 TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
-	MOVQ    br+8(FP), CX
-	MOVQ    32(CX), AX
-	MOVBQZX 40(CX), DX
-	MOVQ    24(CX), BX
-	MOVQ    (CX), CX
+	MOVQ    br+8(FP), BX
+	MOVQ    24(BX), AX
+	MOVBQZX 40(BX), DX
+	MOVQ    (BX), CX
+	MOVQ    32(BX), BX
 	ADDQ    BX, CX
 	MOVQ    CX, (SP)
 	MOVQ    ctx+16(FP), CX
@@ -938,7 +951,7 @@
 
 sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
 	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decode_56_bmi2_fill_end
+	JLE     sequenceDecs_decode_56_bmi2_fill_check_overread
 	CMPQ    DX, $0x07
 	JLE     sequenceDecs_decode_56_bmi2_fill_end
 	SHLQ    $0x08, AX
@@ -949,6 +962,10 @@
 	ORQ     CX, AX
 	JMP     sequenceDecs_decode_56_bmi2_fill_byte_by_byte
 
+sequenceDecs_decode_56_bmi2_fill_check_overread:
+	CMPQ DX, $0x40
+	JA   error_overread
+
 sequenceDecs_decode_56_bmi2_fill_end:
 	// Update offset
 	MOVQ   $0x00000808, CX
@@ -1006,11 +1023,10 @@
 	BZHIQ   R14, R15, R15
 
 	// Update Offset State
-	BZHIQ  R8, R15, CX
-	SHRXQ  R8, R15, R15
-	MOVQ   $0x00001010, R14
-	BEXTRQ R14, R8, R8
-	ADDQ   CX, R8
+	BZHIQ R8, R15, CX
+	SHRXQ R8, R15, R15
+	SHRL  $0x10, R8
+	ADDQ  CX, R8
 
 	// Load ctx.ofTable
 	MOVQ ctx+16(FP), CX
@@ -1018,11 +1034,10 @@
 	MOVQ (CX)(R8*8), R8
 
 	// Update Match Length State
-	BZHIQ  DI, R15, CX
-	SHRXQ  DI, R15, R15
-	MOVQ   $0x00001010, R14
-	BEXTRQ R14, DI, DI
-	ADDQ   CX, DI
+	BZHIQ DI, R15, CX
+	SHRXQ DI, R15, R15
+	SHRL  $0x10, DI
+	ADDQ  CX, DI
 
 	// Load ctx.mlTable
 	MOVQ ctx+16(FP), CX
@@ -1030,10 +1045,9 @@
 	MOVQ (CX)(DI*8), DI
 
 	// Update Literal Length State
-	BZHIQ  SI, R15, CX
-	MOVQ   $0x00001010, R14
-	BEXTRQ R14, SI, SI
-	ADDQ   CX, SI
+	BZHIQ SI, R15, CX
+	SHRL  $0x10, SI
+	ADDQ  CX, SI
 
 	// Load ctx.llTable
 	MOVQ ctx+16(FP), CX
@@ -1126,9 +1140,9 @@
 	MOVQ R11, 152(CX)
 	MOVQ R12, 160(CX)
 	MOVQ br+8(FP), CX
-	MOVQ AX, 32(CX)
+	MOVQ AX, 24(CX)
 	MOVB DL, 40(CX)
-	MOVQ BX, 24(CX)
+	MOVQ BX, 32(CX)
 
 	// Return success
 	MOVQ $0x00000000, ret+24(FP)
@@ -1153,8 +1167,9 @@
 	MOVQ $0x00000004, ret+24(FP)
 	RET
 
-	// Return with not enough output space error
-	MOVQ $0x00000005, ret+24(FP)
+	// Return with overread error
+error_overread:
+	MOVQ $0x00000006, ret+24(FP)
 	RET
 
 // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
@@ -1390,8 +1405,7 @@
 	MOVQ ctx+0(FP), AX
 	MOVQ DX, 24(AX)
 	MOVQ DI, 104(AX)
-	MOVQ 80(AX), CX
-	SUBQ CX, SI
+	SUBQ 80(AX), SI
 	MOVQ SI, 112(AX)
 	RET
 
@@ -1403,8 +1417,7 @@
 	MOVQ ctx+0(FP), AX
 	MOVQ DX, 24(AX)
 	MOVQ DI, 104(AX)
-	MOVQ 80(AX), CX
-	SUBQ CX, SI
+	SUBQ 80(AX), SI
 	MOVQ SI, 112(AX)
 	RET
 
@@ -1748,8 +1761,7 @@
 	MOVQ ctx+0(FP), AX
 	MOVQ DX, 24(AX)
 	MOVQ DI, 104(AX)
-	MOVQ 80(AX), CX
-	SUBQ CX, SI
+	SUBQ 80(AX), SI
 	MOVQ SI, 112(AX)
 	RET
 
@@ -1761,8 +1773,7 @@
 	MOVQ ctx+0(FP), AX
 	MOVQ DX, 24(AX)
 	MOVQ DI, 104(AX)
-	MOVQ 80(AX), CX
-	SUBQ CX, SI
+	SUBQ 80(AX), SI
 	MOVQ SI, 112(AX)
 	RET
 
@@ -1774,11 +1785,11 @@
 // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 // Requires: CMOV, SSE
 TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
-	MOVQ    br+8(FP), AX
-	MOVQ    32(AX), DX
-	MOVBQZX 40(AX), BX
-	MOVQ    24(AX), SI
-	MOVQ    (AX), AX
+	MOVQ    br+8(FP), CX
+	MOVQ    24(CX), DX
+	MOVBQZX 40(CX), BX
+	MOVQ    (CX), AX
+	MOVQ    32(CX), SI
 	ADDQ    SI, AX
 	MOVQ    AX, (SP)
 	MOVQ    ctx+16(FP), AX
@@ -1803,7 +1814,7 @@
 	MOVQ    40(SP), AX
 	ADDQ    AX, 48(SP)
 
-	// Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
 	ADDQ R10, 32(SP)
 
 	// outBase += outPosition
@@ -1825,7 +1836,7 @@
 
 sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
 	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decodeSync_amd64_fill_end
+	JLE     sequenceDecs_decodeSync_amd64_fill_check_overread
 	CMPQ    BX, $0x07
 	JLE     sequenceDecs_decodeSync_amd64_fill_end
 	SHLQ    $0x08, DX
@@ -1836,6 +1847,10 @@
 	ORQ     AX, DX
 	JMP     sequenceDecs_decodeSync_amd64_fill_byte_by_byte
 
+sequenceDecs_decodeSync_amd64_fill_check_overread:
+	CMPQ BX, $0x40
+	JA   error_overread
+
 sequenceDecs_decodeSync_amd64_fill_end:
 	// Update offset
 	MOVQ  R9, AX
@@ -1892,7 +1907,7 @@
 
 sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
 	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
+	JLE     sequenceDecs_decodeSync_amd64_fill_2_check_overread
 	CMPQ    BX, $0x07
 	JLE     sequenceDecs_decodeSync_amd64_fill_2_end
 	SHLQ    $0x08, DX
@@ -1903,6 +1918,10 @@
 	ORQ     AX, DX
 	JMP     sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
 
+sequenceDecs_decodeSync_amd64_fill_2_check_overread:
+	CMPQ BX, $0x40
+	JA   error_overread
+
 sequenceDecs_decodeSync_amd64_fill_2_end:
 	// Update literal length
 	MOVQ  DI, AX
@@ -1936,8 +1955,7 @@
 
 	// Update Literal Length State
 	MOVBQZX DI, R13
-	SHRQ    $0x10, DI
-	MOVWQZX DI, DI
+	SHRL    $0x10, DI
 	LEAQ    (BX)(R13*1), CX
 	MOVQ    DX, R14
 	MOVQ    CX, BX
@@ -1956,8 +1974,7 @@
 
 	// Update Match Length State
 	MOVBQZX R8, R13
-	SHRQ    $0x10, R8
-	MOVWQZX R8, R8
+	SHRL    $0x10, R8
 	LEAQ    (BX)(R13*1), CX
 	MOVQ    DX, R14
 	MOVQ    CX, BX
@@ -1976,8 +1993,7 @@
 
 	// Update Offset State
 	MOVBQZX R9, R13
-	SHRQ    $0x10, R9
-	MOVWQZX R9, R9
+	SHRL    $0x10, R9
 	LEAQ    (BX)(R13*1), CX
 	MOVQ    DX, R14
 	MOVQ    CX, BX
@@ -2264,9 +2280,9 @@
 
 loop_finished:
 	MOVQ br+8(FP), AX
-	MOVQ DX, 32(AX)
+	MOVQ DX, 24(AX)
 	MOVB BL, 40(AX)
-	MOVQ SI, 24(AX)
+	MOVQ SI, 32(AX)
 
 	// Update the context
 	MOVQ ctx+16(FP), AX
@@ -2312,6 +2328,11 @@
 	MOVQ $0x00000004, ret+24(FP)
 	RET
 
+	// Return with overread error
+error_overread:
+	MOVQ $0x00000006, ret+24(FP)
+	RET
+
 	// Return with not enough output space error
 error_not_enough_space:
 	MOVQ ctx+16(FP), AX
@@ -2326,11 +2347,11 @@
 // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 // Requires: BMI, BMI2, CMOV, SSE
 TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
-	MOVQ    br+8(FP), CX
-	MOVQ    32(CX), AX
-	MOVBQZX 40(CX), DX
-	MOVQ    24(CX), BX
-	MOVQ    (CX), CX
+	MOVQ    br+8(FP), BX
+	MOVQ    24(BX), AX
+	MOVBQZX 40(BX), DX
+	MOVQ    (BX), CX
+	MOVQ    32(BX), BX
 	ADDQ    BX, CX
 	MOVQ    CX, (SP)
 	MOVQ    ctx+16(FP), CX
@@ -2355,7 +2376,7 @@
 	MOVQ    40(SP), CX
 	ADDQ    CX, 48(SP)
 
-	// Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
 	ADDQ R9, 32(SP)
 
 	// outBase += outPosition
@@ -2377,7 +2398,7 @@
 
 sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
 	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decodeSync_bmi2_fill_end
+	JLE     sequenceDecs_decodeSync_bmi2_fill_check_overread
 	CMPQ    DX, $0x07
 	JLE     sequenceDecs_decodeSync_bmi2_fill_end
 	SHLQ    $0x08, AX
@@ -2388,6 +2409,10 @@
 	ORQ     CX, AX
 	JMP     sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
 
+sequenceDecs_decodeSync_bmi2_fill_check_overread:
+	CMPQ DX, $0x40
+	JA   error_overread
+
 sequenceDecs_decodeSync_bmi2_fill_end:
 	// Update offset
 	MOVQ   $0x00000808, CX
@@ -2428,7 +2453,7 @@
 
 sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
 	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
+	JLE     sequenceDecs_decodeSync_bmi2_fill_2_check_overread
 	CMPQ    DX, $0x07
 	JLE     sequenceDecs_decodeSync_bmi2_fill_2_end
 	SHLQ    $0x08, AX
@@ -2439,6 +2464,10 @@
 	ORQ     CX, AX
 	JMP     sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
 
+sequenceDecs_decodeSync_bmi2_fill_2_check_overread:
+	CMPQ DX, $0x40
+	JA   error_overread
+
 sequenceDecs_decodeSync_bmi2_fill_2_end:
 	// Update literal length
 	MOVQ   $0x00000808, CX
@@ -2470,11 +2499,10 @@
 	BZHIQ   R13, R14, R14
 
 	// Update Offset State
-	BZHIQ  R8, R14, CX
-	SHRXQ  R8, R14, R14
-	MOVQ   $0x00001010, R13
-	BEXTRQ R13, R8, R8
-	ADDQ   CX, R8
+	BZHIQ R8, R14, CX
+	SHRXQ R8, R14, R14
+	SHRL  $0x10, R8
+	ADDQ  CX, R8
 
 	// Load ctx.ofTable
 	MOVQ ctx+16(FP), CX
@@ -2482,11 +2510,10 @@
 	MOVQ (CX)(R8*8), R8
 
 	// Update Match Length State
-	BZHIQ  DI, R14, CX
-	SHRXQ  DI, R14, R14
-	MOVQ   $0x00001010, R13
-	BEXTRQ R13, DI, DI
-	ADDQ   CX, DI
+	BZHIQ DI, R14, CX
+	SHRXQ DI, R14, R14
+	SHRL  $0x10, DI
+	ADDQ  CX, DI
 
 	// Load ctx.mlTable
 	MOVQ ctx+16(FP), CX
@@ -2494,10 +2521,9 @@
 	MOVQ (CX)(DI*8), DI
 
 	// Update Literal Length State
-	BZHIQ  SI, R14, CX
-	MOVQ   $0x00001010, R13
-	BEXTRQ R13, SI, SI
-	ADDQ   CX, SI
+	BZHIQ SI, R14, CX
+	SHRL  $0x10, SI
+	ADDQ  CX, SI
 
 	// Load ctx.llTable
 	MOVQ ctx+16(FP), CX
@@ -2774,9 +2800,9 @@
 
 loop_finished:
 	MOVQ br+8(FP), CX
-	MOVQ AX, 32(CX)
+	MOVQ AX, 24(CX)
 	MOVB DL, 40(CX)
-	MOVQ BX, 24(CX)
+	MOVQ BX, 32(CX)
 
 	// Update the context
 	MOVQ ctx+16(FP), AX
@@ -2822,6 +2848,11 @@
 	MOVQ $0x00000004, ret+24(FP)
 	RET
 
+	// Return with overread error
+error_overread:
+	MOVQ $0x00000006, ret+24(FP)
+	RET
+
 	// Return with not enough output space error
 error_not_enough_space:
 	MOVQ ctx+16(FP), AX
@@ -2836,11 +2867,11 @@
 // func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 // Requires: CMOV, SSE
 TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
-	MOVQ    br+8(FP), AX
-	MOVQ    32(AX), DX
-	MOVBQZX 40(AX), BX
-	MOVQ    24(AX), SI
-	MOVQ    (AX), AX
+	MOVQ    br+8(FP), CX
+	MOVQ    24(CX), DX
+	MOVBQZX 40(CX), BX
+	MOVQ    (CX), AX
+	MOVQ    32(CX), SI
 	ADDQ    SI, AX
 	MOVQ    AX, (SP)
 	MOVQ    ctx+16(FP), AX
@@ -2865,7 +2896,7 @@
 	MOVQ    40(SP), AX
 	ADDQ    AX, 48(SP)
 
-	// Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
 	ADDQ R10, 32(SP)
 
 	// outBase += outPosition
@@ -2887,7 +2918,7 @@
 
 sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
 	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_check_overread
 	CMPQ    BX, $0x07
 	JLE     sequenceDecs_decodeSync_safe_amd64_fill_end
 	SHLQ    $0x08, DX
@@ -2898,6 +2929,10 @@
 	ORQ     AX, DX
 	JMP     sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
 
+sequenceDecs_decodeSync_safe_amd64_fill_check_overread:
+	CMPQ BX, $0x40
+	JA   error_overread
+
 sequenceDecs_decodeSync_safe_amd64_fill_end:
 	// Update offset
 	MOVQ  R9, AX
@@ -2954,7 +2989,7 @@
 
 sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
 	CMPQ    SI, $0x00
-	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
+	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread
 	CMPQ    BX, $0x07
 	JLE     sequenceDecs_decodeSync_safe_amd64_fill_2_end
 	SHLQ    $0x08, DX
@@ -2965,6 +3000,10 @@
 	ORQ     AX, DX
 	JMP     sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
 
+sequenceDecs_decodeSync_safe_amd64_fill_2_check_overread:
+	CMPQ BX, $0x40
+	JA   error_overread
+
 sequenceDecs_decodeSync_safe_amd64_fill_2_end:
 	// Update literal length
 	MOVQ  DI, AX
@@ -2998,8 +3037,7 @@
 
 	// Update Literal Length State
 	MOVBQZX DI, R13
-	SHRQ    $0x10, DI
-	MOVWQZX DI, DI
+	SHRL    $0x10, DI
 	LEAQ    (BX)(R13*1), CX
 	MOVQ    DX, R14
 	MOVQ    CX, BX
@@ -3018,8 +3056,7 @@
 
 	// Update Match Length State
 	MOVBQZX R8, R13
-	SHRQ    $0x10, R8
-	MOVWQZX R8, R8
+	SHRL    $0x10, R8
 	LEAQ    (BX)(R13*1), CX
 	MOVQ    DX, R14
 	MOVQ    CX, BX
@@ -3038,8 +3075,7 @@
 
 	// Update Offset State
 	MOVBQZX R9, R13
-	SHRQ    $0x10, R9
-	MOVWQZX R9, R9
+	SHRL    $0x10, R9
 	LEAQ    (BX)(R13*1), CX
 	MOVQ    DX, R14
 	MOVQ    CX, BX
@@ -3428,9 +3464,9 @@
 
 loop_finished:
 	MOVQ br+8(FP), AX
-	MOVQ DX, 32(AX)
+	MOVQ DX, 24(AX)
 	MOVB BL, 40(AX)
-	MOVQ SI, 24(AX)
+	MOVQ SI, 32(AX)
 
 	// Update the context
 	MOVQ ctx+16(FP), AX
@@ -3476,6 +3512,11 @@
 	MOVQ $0x00000004, ret+24(FP)
 	RET
 
+	// Return with overread error
+error_overread:
+	MOVQ $0x00000006, ret+24(FP)
+	RET
+
 	// Return with not enough output space error
 error_not_enough_space:
 	MOVQ ctx+16(FP), AX
@@ -3490,11 +3531,11 @@
 // func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
 // Requires: BMI, BMI2, CMOV, SSE
 TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
-	MOVQ    br+8(FP), CX
-	MOVQ    32(CX), AX
-	MOVBQZX 40(CX), DX
-	MOVQ    24(CX), BX
-	MOVQ    (CX), CX
+	MOVQ    br+8(FP), BX
+	MOVQ    24(BX), AX
+	MOVBQZX 40(BX), DX
+	MOVQ    (BX), CX
+	MOVQ    32(BX), BX
 	ADDQ    BX, CX
 	MOVQ    CX, (SP)
 	MOVQ    ctx+16(FP), CX
@@ -3519,7 +3560,7 @@
 	MOVQ    40(SP), CX
 	ADDQ    CX, 48(SP)
 
-	// Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
+	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
 	ADDQ R9, 32(SP)
 
 	// outBase += outPosition
@@ -3541,7 +3582,7 @@
 
 sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
 	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_check_overread
 	CMPQ    DX, $0x07
 	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_end
 	SHLQ    $0x08, AX
@@ -3552,6 +3593,10 @@
 	ORQ     CX, AX
 	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
 
+sequenceDecs_decodeSync_safe_bmi2_fill_check_overread:
+	CMPQ DX, $0x40
+	JA   error_overread
+
 sequenceDecs_decodeSync_safe_bmi2_fill_end:
 	// Update offset
 	MOVQ   $0x00000808, CX
@@ -3592,7 +3637,7 @@
 
 sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
 	CMPQ    BX, $0x00
-	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
+	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread
 	CMPQ    DX, $0x07
 	JLE     sequenceDecs_decodeSync_safe_bmi2_fill_2_end
 	SHLQ    $0x08, AX
@@ -3603,6 +3648,10 @@
 	ORQ     CX, AX
 	JMP     sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
 
+sequenceDecs_decodeSync_safe_bmi2_fill_2_check_overread:
+	CMPQ DX, $0x40
+	JA   error_overread
+
 sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
 	// Update literal length
 	MOVQ   $0x00000808, CX
@@ -3634,11 +3683,10 @@
 	BZHIQ   R13, R14, R14
 
 	// Update Offset State
-	BZHIQ  R8, R14, CX
-	SHRXQ  R8, R14, R14
-	MOVQ   $0x00001010, R13
-	BEXTRQ R13, R8, R8
-	ADDQ   CX, R8
+	BZHIQ R8, R14, CX
+	SHRXQ R8, R14, R14
+	SHRL  $0x10, R8
+	ADDQ  CX, R8
 
 	// Load ctx.ofTable
 	MOVQ ctx+16(FP), CX
@@ -3646,11 +3694,10 @@
 	MOVQ (CX)(R8*8), R8
 
 	// Update Match Length State
-	BZHIQ  DI, R14, CX
-	SHRXQ  DI, R14, R14
-	MOVQ   $0x00001010, R13
-	BEXTRQ R13, DI, DI
-	ADDQ   CX, DI
+	BZHIQ DI, R14, CX
+	SHRXQ DI, R14, R14
+	SHRL  $0x10, DI
+	ADDQ  CX, DI
 
 	// Load ctx.mlTable
 	MOVQ ctx+16(FP), CX
@@ -3658,10 +3705,9 @@
 	MOVQ (CX)(DI*8), DI
 
 	// Update Literal Length State
-	BZHIQ  SI, R14, CX
-	MOVQ   $0x00001010, R13
-	BEXTRQ R13, SI, SI
-	ADDQ   CX, SI
+	BZHIQ SI, R14, CX
+	SHRL  $0x10, SI
+	ADDQ  CX, SI
 
 	// Load ctx.llTable
 	MOVQ ctx+16(FP), CX
@@ -4040,9 +4086,9 @@
 
 loop_finished:
 	MOVQ br+8(FP), CX
-	MOVQ AX, 32(CX)
+	MOVQ AX, 24(CX)
 	MOVB DL, 40(CX)
-	MOVQ BX, 24(CX)
+	MOVQ BX, 32(CX)
 
 	// Update the context
 	MOVQ ctx+16(FP), AX
@@ -4088,6 +4134,11 @@
 	MOVQ $0x00000004, ret+24(FP)
 	RET
 
+	// Return with overread error
+error_overread:
+	MOVQ $0x00000006, ret+24(FP)
+	RET
+
 	// Return with not enough output space error
 error_not_enough_space:
 	MOVQ ctx+16(FP), AX
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
index c3452bc..7cec219 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec_generic.go
@@ -29,7 +29,7 @@
 	}
 	for i := range seqs {
 		var ll, mo, ml int
-		if br.off > 4+((maxOffsetBits+16+16)>>3) {
+		if br.cursor > 4+((maxOffsetBits+16+16)>>3) {
 			// inlined function:
 			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
 
@@ -111,7 +111,7 @@
 		}
 		s.seqSize += ll + ml
 		if s.seqSize > maxBlockSize {
-			return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+			return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
 		}
 		litRemain -= ll
 		if litRemain < 0 {
@@ -149,7 +149,7 @@
 	}
 	s.seqSize += litRemain
 	if s.seqSize > maxBlockSize {
-		return fmt.Errorf("output (%d) bigger than max block size (%d)", s.seqSize, maxBlockSize)
+		return fmt.Errorf("output bigger than max block size (%d)", maxBlockSize)
 	}
 	err := br.close()
 	if err != nil {
diff --git a/vendor/github.com/klauspost/compress/zstd/seqenc.go b/vendor/github.com/klauspost/compress/zstd/seqenc.go
index 8014174..65045ea 100644
--- a/vendor/github.com/klauspost/compress/zstd/seqenc.go
+++ b/vendor/github.com/klauspost/compress/zstd/seqenc.go
@@ -69,7 +69,6 @@
 func llCode(litLength uint32) uint8 {
 	const llDeltaCode = 19
 	if litLength <= 63 {
-		// Compiler insists on bounds check (Go 1.12)
 		return llCodeTable[litLength&63]
 	}
 	return uint8(highBit(litLength)) + llDeltaCode
@@ -102,7 +101,6 @@
 func mlCode(mlBase uint32) uint8 {
 	const mlDeltaCode = 36
 	if mlBase <= 127 {
-		// Compiler insists on bounds check (Go 1.12)
 		return mlCodeTable[mlBase&127]
 	}
 	return uint8(highBit(mlBase)) + mlDeltaCode
diff --git a/vendor/github.com/klauspost/compress/zstd/snappy.go b/vendor/github.com/klauspost/compress/zstd/snappy.go
index 9e1baad..a17381b 100644
--- a/vendor/github.com/klauspost/compress/zstd/snappy.go
+++ b/vendor/github.com/klauspost/compress/zstd/snappy.go
@@ -95,10 +95,9 @@
 	var written int64
 	var readHeader bool
 	{
-		var header []byte
-		var n int
-		header, r.err = frameHeader{WindowSize: snappyMaxBlockSize}.appendTo(r.buf[:0])
+		header := frameHeader{WindowSize: snappyMaxBlockSize}.appendTo(r.buf[:0])
 
+		var n int
 		n, r.err = w.Write(header)
 		if r.err != nil {
 			return written, r.err
@@ -198,7 +197,7 @@
 
 			n, r.err = w.Write(r.block.output)
 			if r.err != nil {
-				return written, err
+				return written, r.err
 			}
 			written += int64(n)
 			continue
@@ -240,7 +239,7 @@
 			}
 			n, r.err = w.Write(r.block.output)
 			if r.err != nil {
-				return written, err
+				return written, r.err
 			}
 			written += int64(n)
 			continue
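The two snappy.go fixes above return r.err instead of the enclosing err after n, r.err = w.Write(...), so a write failure is no longer silently dropped. A self-contained sketch of that stale-error pattern and the corrected return follows; the reader and failWriter types are invented for the example.

// Sketch of the error-return fix: the write error lives in r.err.
package main

import (
	"errors"
	"fmt"
	"io"
)

type reader struct{ err error }

func (r *reader) copyBlock(w io.Writer, block []byte, written int64) (int64, error) {
	var n int
	n, r.err = w.Write(block)
	if r.err != nil {
		return written, r.err // the bug returned a stale outer err, which was nil
	}
	return written + int64(n), nil
}

type failWriter struct{}

func (failWriter) Write([]byte) (int, error) { return 0, errors.New("disk full") }

func main() {
	r := &reader{}
	_, err := r.copyBlock(failWriter{}, []byte("data"), 0)
	fmt.Println(err) // disk full, instead of a nil error hiding the failure
}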
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
index 3eb3f1c..6252b46 100644
--- a/vendor/github.com/klauspost/compress/zstd/zstd.go
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -5,11 +5,11 @@
 
 import (
 	"bytes"
-	"encoding/binary"
 	"errors"
 	"log"
 	"math"
-	"math/bits"
+
+	"github.com/klauspost/compress/internal/le"
 )
 
 // enable debug printing
@@ -36,9 +36,6 @@
 // zstdMinMatch is the minimum zstd match length.
 const zstdMinMatch = 3
 
-// Reset the buffer offset when reaching this.
-const bufferReset = math.MaxInt32 - MaxWindowSize
-
 // fcsUnknown is used for unknown frame content size.
 const fcsUnknown = math.MaxUint64
 
@@ -75,7 +72,6 @@
 	ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit")
 
 	// ErrUnknownDictionary is returned if the dictionary ID is unknown.
-	// For the time being dictionaries are not supported.
 	ErrUnknownDictionary = errors.New("unknown dictionary")
 
 	// ErrFrameSizeExceeded is returned if the stated frame size is exceeded.
@@ -93,6 +89,10 @@
 	// Close has been called.
 	ErrDecoderClosed = errors.New("decoder used after Close")
 
+	// ErrEncoderClosed will be returned if the Encoder was used after
+	// Close has been called.
+	ErrEncoderClosed = errors.New("encoder used after Close")
+
 	// ErrDecoderNilInput is returned when a nil Reader was provided
 	// and an operation other than Reset/DecodeAll/Close was attempted.
 	ErrDecoderNilInput = errors.New("nil input provided as reader")
@@ -110,38 +110,12 @@
 	}
 }
 
-// matchLen returns the maximum length.
-// a must be the shortest of the two.
-// The function also returns whether all bytes matched.
-func matchLen(a, b []byte) int {
-	b = b[:len(a)]
-	for i := 0; i < len(a)-7; i += 8 {
-		if diff := load64(a, i) ^ load64(b, i); diff != 0 {
-			return i + (bits.TrailingZeros64(diff) >> 3)
-		}
-	}
-
-	checked := (len(a) >> 3) << 3
-	a = a[checked:]
-	b = b[checked:]
-	for i := range a {
-		if a[i] != b[i] {
-			return i + checked
-		}
-	}
-	return len(a) + checked
-}
-
 func load3232(b []byte, i int32) uint32 {
-	return binary.LittleEndian.Uint32(b[i:])
+	return le.Load32(b, i)
 }
 
 func load6432(b []byte, i int32) uint64 {
-	return binary.LittleEndian.Uint64(b[i:])
-}
-
-func load64(b []byte, i int) uint64 {
-	return binary.LittleEndian.Uint64(b[i:])
+	return le.Load64(b, i)
 }
 
 type byter interface {
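load3232 and load6432 now delegate to the internal le package instead of calling encoding/binary directly; the removed lines show the little-endian loads they are assumed to be equivalent to. A tiny sketch of that equivalence:

// Sketch of the little-endian loads behind load3232/load6432 (the le package
// is assumed to behave like encoding/binary's LittleEndian here).
package main

import (
	"encoding/binary"
	"fmt"
)

func load3232(b []byte, i int32) uint32 { return binary.LittleEndian.Uint32(b[i:]) }
func load6432(b []byte, i int32) uint64 { return binary.LittleEndian.Uint64(b[i:]) }

func main() {
	b := []byte{1, 0, 0, 0, 0, 0, 0, 0, 2}
	fmt.Println(load3232(b, 0), load6432(b, 1)) // 1 144115188075855872
}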