[VOL-5486] Upgrade library versions
Change-Id: I8b4e88699e03f44ee13e467867f45ae3f0a63c4b
Signed-off-by: Abhay Kumar <abhay.kumar@radisys.com>
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
index a4979e8..bfc7a52 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitreader.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -6,120 +6,16 @@
package huff0
import (
- "encoding/binary"
"errors"
+ "fmt"
"io"
+
+ "github.com/klauspost/compress/internal/le"
)
// bitReader reads a bitstream in reverse.
// The last set bit indicates the start of the stream and is used
// for aligning the input.
-type bitReader struct {
- in []byte
- off uint // next byte to read is at in[off - 1]
- value uint64
- bitsRead uint8
-}
-
-// init initializes and resets the bit reader.
-func (b *bitReader) init(in []byte) error {
- if len(in) < 1 {
- return errors.New("corrupt stream: too short")
- }
- b.in = in
- b.off = uint(len(in))
- // The highest bit of the last byte indicates where to start
- v := in[len(in)-1]
- if v == 0 {
- return errors.New("corrupt stream, did not find end of stream")
- }
- b.bitsRead = 64
- b.value = 0
- if len(in) >= 8 {
- b.fillFastStart()
- } else {
- b.fill()
- b.fill()
- }
- b.bitsRead += 8 - uint8(highBit32(uint32(v)))
- return nil
-}
-
-// peekBitsFast requires that at least one bit is requested every time.
-// There are no checks if the buffer is filled.
-func (b *bitReader) peekBitsFast(n uint8) uint16 {
- const regMask = 64 - 1
- v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
- return v
-}
-
-// fillFast() will make sure at least 32 bits are available.
-// There must be at least 4 bytes available.
-func (b *bitReader) fillFast() {
- if b.bitsRead < 32 {
- return
- }
-
- // 2 bounds checks.
- v := b.in[b.off-4 : b.off]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- b.value = (b.value << 32) | uint64(low)
- b.bitsRead -= 32
- b.off -= 4
-}
-
-func (b *bitReader) advance(n uint8) {
- b.bitsRead += n
-}
-
-// fillFastStart() assumes the bitreader is empty and there is at least 8 bytes to read.
-func (b *bitReader) fillFastStart() {
- // Do single re-slice to avoid bounds checks.
- b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
- b.bitsRead = 0
- b.off -= 8
-}
-
-// fill() will make sure at least 32 bits are available.
-func (b *bitReader) fill() {
- if b.bitsRead < 32 {
- return
- }
- if b.off > 4 {
- v := b.in[b.off-4:]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- b.value = (b.value << 32) | uint64(low)
- b.bitsRead -= 32
- b.off -= 4
- return
- }
- for b.off > 0 {
- b.value = (b.value << 8) | uint64(b.in[b.off-1])
- b.bitsRead -= 8
- b.off--
- }
-}
-
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReader) finished() bool {
- return b.off == 0 && b.bitsRead >= 64
-}
-
-// close the bitstream and returns an error if out-of-buffer reads occurred.
-func (b *bitReader) close() error {
- // Release reference.
- b.in = nil
- if b.bitsRead > 64 {
- return io.ErrUnexpectedEOF
- }
- return nil
-}
-
-// bitReader reads a bitstream in reverse.
-// The last set bit indicates the start of the stream and is used
-// for aligning the input.
type bitReaderBytes struct {
in []byte
off uint // next byte to read is at in[off - 1]
@@ -151,7 +47,7 @@
return nil
}
-// peekBitsFast requires that at least one bit is requested every time.
+// peekByteFast requires that at least one byte is requested every time.
// There are no checks if the buffer is filled.
func (b *bitReaderBytes) peekByteFast() uint8 {
got := uint8(b.value >> 56)
@@ -171,9 +67,7 @@
}
// 2 bounds checks.
- v := b.in[b.off-4 : b.off]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ low := le.Load32(b.in, b.off-4)
b.value |= uint64(low) << (b.bitsRead - 32)
b.bitsRead -= 32
b.off -= 4
@@ -182,7 +76,7 @@
// fillFastStart() assumes the bitReaderBytes is empty and there is at least 8 bytes to read.
func (b *bitReaderBytes) fillFastStart() {
// Do single re-slice to avoid bounds checks.
- b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+ b.value = le.Load64(b.in, b.off-8)
b.bitsRead = 0
b.off -= 8
}
@@ -192,10 +86,8 @@
if b.bitsRead < 32 {
return
}
- if b.off > 4 {
- v := b.in[b.off-4:]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ if b.off >= 4 {
+ low := le.Load32(b.in, b.off-4)
b.value |= uint64(low) << (b.bitsRead - 32)
b.bitsRead -= 32
b.off -= 4
@@ -213,10 +105,17 @@
return b.off == 0 && b.bitsRead >= 64
}
+func (b *bitReaderBytes) remaining() uint {
+ return b.off*8 + uint(64-b.bitsRead)
+}
+
// close the bitstream and returns an error if out-of-buffer reads occurred.
func (b *bitReaderBytes) close() error {
// Release reference.
b.in = nil
+ if b.remaining() > 0 {
+ return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+ }
if b.bitsRead > 64 {
return io.ErrUnexpectedEOF
}
@@ -275,10 +174,7 @@
return
}
- // 2 bounds checks.
- v := b.in[b.off-4 : b.off]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ low := le.Load32(b.in, b.off-4)
b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
b.bitsRead -= 32
b.off -= 4
@@ -286,8 +182,7 @@
// fillFastStart() assumes the bitReaderShifted is empty and there is at least 8 bytes to read.
func (b *bitReaderShifted) fillFastStart() {
- // Do single re-slice to avoid bounds checks.
- b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
+ b.value = le.Load64(b.in, b.off-8)
b.bitsRead = 0
b.off -= 8
}
@@ -298,9 +193,7 @@
return
}
if b.off > 4 {
- v := b.in[b.off-4:]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ low := le.Load32(b.in, b.off-4)
b.value |= uint64(low) << ((b.bitsRead - 32) & 63)
b.bitsRead -= 32
b.off -= 4
@@ -313,15 +206,17 @@
}
}
-// finished returns true if all bits have been read from the bit stream.
-func (b *bitReaderShifted) finished() bool {
- return b.off == 0 && b.bitsRead >= 64
+func (b *bitReaderShifted) remaining() uint {
+ return b.off*8 + uint(64-b.bitsRead)
}
// close the bitstream and returns an error if out-of-buffer reads occurred.
func (b *bitReaderShifted) close() error {
// Release reference.
b.in = nil
+ if b.remaining() > 0 {
+ return fmt.Errorf("corrupt input: %d bits remain on stream", b.remaining())
+ }
if b.bitsRead > 64 {
return io.ErrUnexpectedEOF
}
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
index 6bce4e8..0ebc9aa 100644
--- a/vendor/github.com/klauspost/compress/huff0/bitwriter.go
+++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
@@ -5,8 +5,6 @@
package huff0
-import "fmt"
-
// bitWriter will write bits.
// First bit will be LSB of the first byte of output.
type bitWriter struct {
@@ -15,22 +13,6 @@
out []byte
}
-// bitMask16 is bitmasks. Has extra to avoid bounds check.
-var bitMask16 = [32]uint16{
- 0, 1, 3, 7, 0xF, 0x1F,
- 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
- 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0xFFFF,
- 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
- 0xFFFF, 0xFFFF} /* up to 16 bits */
-
-// addBits16NC will add up to 16 bits.
-// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
- b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
- b.nBits += bits
-}
-
// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@@ -70,102 +52,20 @@
b.nBits += encA.nBits + encB.nBits
}
-// addBits16ZeroNC will add up to 16 bits.
+// encFourSymbols adds up to 32 bits from four symbols.
// It will not check if there is space for them,
-// so the caller must ensure that it has flushed recently.
-// This is fastest if bits can be zero.
-func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
- if bits == 0 {
- return
- }
- value <<= (16 - bits) & 15
- value >>= (16 - bits) & 15
- b.bitContainer |= uint64(value) << (b.nBits & 63)
- b.nBits += bits
-}
-
-// flush will flush all pending full bytes.
-// There will be at least 56 bits available for writing when this has been called.
-// Using flush32 is faster, but leaves less space for writing.
-func (b *bitWriter) flush() {
- v := b.nBits >> 3
- switch v {
- case 0:
- return
- case 1:
- b.out = append(b.out,
- byte(b.bitContainer),
- )
- b.bitContainer >>= 1 << 3
- case 2:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- )
- b.bitContainer >>= 2 << 3
- case 3:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- )
- b.bitContainer >>= 3 << 3
- case 4:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- )
- b.bitContainer >>= 4 << 3
- case 5:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- )
- b.bitContainer >>= 5 << 3
- case 6:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- )
- b.bitContainer >>= 6 << 3
- case 7:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- )
- b.bitContainer >>= 7 << 3
- case 8:
- b.out = append(b.out,
- byte(b.bitContainer),
- byte(b.bitContainer>>8),
- byte(b.bitContainer>>16),
- byte(b.bitContainer>>24),
- byte(b.bitContainer>>32),
- byte(b.bitContainer>>40),
- byte(b.bitContainer>>48),
- byte(b.bitContainer>>56),
- )
- b.bitContainer = 0
- b.nBits = 0
- return
- default:
- panic(fmt.Errorf("bits (%d) > 64", b.nBits))
- }
- b.nBits &= 7
+// so the caller must ensure that b has been flushed recently.
+func (b *bitWriter) encFourSymbols(encA, encB, encC, encD cTableEntry) {
+ bitsA := encA.nBits
+ bitsB := bitsA + encB.nBits
+ bitsC := bitsB + encC.nBits
+ bitsD := bitsC + encD.nBits
+ combined := uint64(encA.val) |
+ (uint64(encB.val) << (bitsA & 63)) |
+ (uint64(encC.val) << (bitsB & 63)) |
+ (uint64(encD.val) << (bitsC & 63))
+ b.bitContainer |= combined << (b.nBits & 63)
+ b.nBits += bitsD
}
// flush32 will flush out, so there are at least 32 bits available for writing.
@@ -194,17 +94,9 @@
// close will write the alignment bit and write the final byte(s)
// to the output.
-func (b *bitWriter) close() error {
+func (b *bitWriter) close() {
// End mark
b.addBits16Clean(1, 1)
// flush until next byte.
b.flushAlign()
- return nil
-}
-
-// reset and continue writing by appending to out.
-func (b *bitWriter) reset(out []byte) {
- b.bitContainer = 0
- b.nBits = 0
- b.out = out
}
diff --git a/vendor/github.com/klauspost/compress/huff0/bytereader.go b/vendor/github.com/klauspost/compress/huff0/bytereader.go
deleted file mode 100644
index 50bcdf6..0000000
--- a/vendor/github.com/klauspost/compress/huff0/bytereader.go
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright 2018 Klaus Post. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
-
-package huff0
-
-// byteReader provides a byte reader that reads
-// little endian values from a byte stream.
-// The input stream is manually advanced.
-// The reader performs no bounds checks.
-type byteReader struct {
- b []byte
- off int
-}
-
-// init will initialize the reader and set the input.
-func (b *byteReader) init(in []byte) {
- b.b = in
- b.off = 0
-}
-
-// advance the stream b n bytes.
-func (b *byteReader) advance(n uint) {
- b.off += int(n)
-}
-
-// Int32 returns a little endian int32 starting at current offset.
-func (b byteReader) Int32() int32 {
- v3 := int32(b.b[b.off+3])
- v2 := int32(b.b[b.off+2])
- v1 := int32(b.b[b.off+1])
- v0 := int32(b.b[b.off])
- return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
-}
-
-// Uint32 returns a little endian uint32 starting at current offset.
-func (b byteReader) Uint32() uint32 {
- v3 := uint32(b.b[b.off+3])
- v2 := uint32(b.b[b.off+2])
- v1 := uint32(b.b[b.off+1])
- v0 := uint32(b.b[b.off])
- return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
-}
-
-// unread returns the unread portion of the input.
-func (b byteReader) unread() []byte {
- return b.b[b.off:]
-}
-
-// remain will return the number of bytes remaining.
-func (b byteReader) remain() int {
- return len(b.b) - b.off
-}
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
index 0823c92..84aa3d1 100644
--- a/vendor/github.com/klauspost/compress/huff0/compress.go
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -2,6 +2,7 @@
import (
"fmt"
+ "math"
"runtime"
"sync"
)
@@ -161,11 +162,75 @@
return s.Out, false, nil
}
-func (s *Scratch) compress1X(src []byte) ([]byte, error) {
- return s.compress1xDo(s.Out, src)
+// EstimateSizes will estimate the data sizes
+func EstimateSizes(in []byte, s *Scratch) (tableSz, dataSz, reuseSz int, err error) {
+ s, err = s.prepare(in)
+ if err != nil {
+ return 0, 0, 0, err
+ }
+
+ // Create histogram, if none was provided.
+ tableSz, dataSz, reuseSz = -1, -1, -1
+ maxCount := s.maxCount
+ var canReuse = false
+ if maxCount == 0 {
+ maxCount, canReuse = s.countSimple(in)
+ } else {
+ canReuse = s.canUseTable(s.prevTable)
+ }
+
+ // We want the output size to be less than this:
+ wantSize := len(in)
+ if s.WantLogLess > 0 {
+ wantSize -= wantSize >> s.WantLogLess
+ }
+
+ // Reset for next run.
+ s.clearCount = true
+ s.maxCount = 0
+ if maxCount >= len(in) {
+ if maxCount > len(in) {
+ return 0, 0, 0, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
+ }
+ if len(in) == 1 {
+ return 0, 0, 0, ErrIncompressible
+ }
+ // One symbol, use RLE
+ return 0, 0, 0, ErrUseRLE
+ }
+ if maxCount == 1 || maxCount < (len(in)>>7) {
+ // Each symbol present maximum once or too well distributed.
+ return 0, 0, 0, ErrIncompressible
+ }
+
+ // Calculate new table.
+ err = s.buildCTable()
+ if err != nil {
+ return 0, 0, 0, err
+ }
+
+ if false && !s.canUseTable(s.cTable) {
+ panic("invalid table generated")
+ }
+
+ tableSz, err = s.cTable.estTableSize(s)
+ if err != nil {
+ return 0, 0, 0, err
+ }
+ if canReuse {
+ reuseSz = s.prevTable.estimateSize(s.count[:s.symbolLen])
+ }
+ dataSz = s.cTable.estimateSize(s.count[:s.symbolLen])
+
+ // Restore
+ return tableSz, dataSz, reuseSz, nil
}
-func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) {
+func (s *Scratch) compress1X(src []byte) ([]byte, error) {
+ return s.compress1xDo(s.Out, src), nil
+}
+
+func (s *Scratch) compress1xDo(dst, src []byte) []byte {
var bw = bitWriter{out: dst}
// N is length divisible by 4.
@@ -183,8 +248,7 @@
tmp := src[n : n+4]
// tmp should be len 4
bw.flush32()
- bw.encTwoSymbols(cTable, tmp[3], tmp[2])
- bw.encTwoSymbols(cTable, tmp[1], tmp[0])
+ bw.encFourSymbols(cTable[tmp[3]], cTable[tmp[2]], cTable[tmp[1]], cTable[tmp[0]])
}
} else {
for ; n >= 0; n -= 4 {
@@ -196,8 +260,8 @@
bw.encTwoSymbols(cTable, tmp[1], tmp[0])
}
}
- err := bw.close()
- return bw.out, err
+ bw.close()
+ return bw.out
}
var sixZeros [6]byte
@@ -219,11 +283,11 @@
}
src = src[len(toDo):]
- var err error
idx := len(s.Out)
- s.Out, err = s.compress1xDo(s.Out, toDo)
- if err != nil {
- return nil, err
+ s.Out = s.compress1xDo(s.Out, toDo)
+ if len(s.Out)-idx > math.MaxUint16 {
+ // We cannot store the size in the jump table
+ return nil, ErrIncompressible
}
// Write compressed length as little endian before block.
if i < 3 {
@@ -247,7 +311,6 @@
segmentSize := (len(src) + 3) / 4
var wg sync.WaitGroup
- var errs [4]error
wg.Add(4)
for i := 0; i < 4; i++ {
toDo := src
@@ -258,16 +321,17 @@
// Separate goroutine for each block.
go func(i int) {
- s.tmpOut[i], errs[i] = s.compress1xDo(s.tmpOut[i][:0], toDo)
+ s.tmpOut[i] = s.compress1xDo(s.tmpOut[i][:0], toDo)
wg.Done()
}(i)
}
wg.Wait()
for i := 0; i < 4; i++ {
- if errs[i] != nil {
- return nil, errs[i]
- }
o := s.tmpOut[i]
+ if len(o) > math.MaxUint16 {
+ // We cannot store the size in the jump table
+ return nil, ErrIncompressible
+ }
// Write compressed length as little endian before block.
if i < 3 {
// Last length is not written.
@@ -286,35 +350,36 @@
// Does not update s.clearCount.
func (s *Scratch) countSimple(in []byte) (max int, reuse bool) {
reuse = true
+ _ = s.count // Assert that s != nil to speed up the following loop.
for _, v := range in {
s.count[v]++
}
m := uint32(0)
if len(s.prevTable) > 0 {
for i, v := range s.count[:] {
+ if v == 0 {
+ continue
+ }
if v > m {
m = v
}
- if v > 0 {
- s.symbolLen = uint16(i) + 1
- if i >= len(s.prevTable) {
- reuse = false
- } else {
- if s.prevTable[i].nBits == 0 {
- reuse = false
- }
- }
+ s.symbolLen = uint16(i) + 1
+ if i >= len(s.prevTable) {
+ reuse = false
+ } else if s.prevTable[i].nBits == 0 {
+ reuse = false
}
}
return int(m), reuse
}
for i, v := range s.count[:] {
+ if v == 0 {
+ continue
+ }
if v > m {
m = v
}
- if v > 0 {
- s.symbolLen = uint16(i) + 1
- }
+ s.symbolLen = uint16(i) + 1
}
return int(m), false
}
@@ -331,6 +396,7 @@
return true
}
+//lint:ignore U1000 used for debugging
func (s *Scratch) validateTable(c cTable) bool {
if len(c) < int(s.symbolLen) {
return false
@@ -350,7 +416,7 @@
// minTableLog provides the minimum logSize to safely represent a distribution.
func (s *Scratch) minTableLog() uint8 {
- minBitsSrc := highBit32(uint32(s.br.remain())) + 1
+ minBitsSrc := highBit32(uint32(s.srcLen)) + 1
minBitsSymbols := highBit32(uint32(s.symbolLen-1)) + 2
if minBitsSrc < minBitsSymbols {
return uint8(minBitsSrc)
@@ -362,7 +428,7 @@
func (s *Scratch) optimalTableLog() {
tableLog := s.TableLog
minBits := s.minTableLog()
- maxBitsSrc := uint8(highBit32(uint32(s.br.remain()-1))) - 1
+ maxBitsSrc := uint8(highBit32(uint32(s.srcLen-1))) - 1
if maxBitsSrc < tableLog {
// Accuracy can be reduced
tableLog = maxBitsSrc
@@ -410,34 +476,35 @@
// Different from reference implementation.
huffNode0 := s.nodes[0 : huffNodesLen+1]
- for huffNode[nonNullRank].count == 0 {
+ for huffNode[nonNullRank].count() == 0 {
nonNullRank--
}
lowS := int16(nonNullRank)
nodeRoot := nodeNb + lowS - 1
lowN := nodeNb
- huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count
- huffNode[lowS].parent, huffNode[lowS-1].parent = uint16(nodeNb), uint16(nodeNb)
+ huffNode[nodeNb].setCount(huffNode[lowS].count() + huffNode[lowS-1].count())
+ huffNode[lowS].setParent(nodeNb)
+ huffNode[lowS-1].setParent(nodeNb)
nodeNb++
lowS -= 2
for n := nodeNb; n <= nodeRoot; n++ {
- huffNode[n].count = 1 << 30
+ huffNode[n].setCount(1 << 30)
}
// fake entry, strong barrier
- huffNode0[0].count = 1 << 31
+ huffNode0[0].setCount(1 << 31)
// create parents
for nodeNb <= nodeRoot {
var n1, n2 int16
- if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
+ if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n1 = lowS
lowS--
} else {
n1 = lowN
lowN++
}
- if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
+ if huffNode0[lowS+1].count() < huffNode0[lowN+1].count() {
n2 = lowS
lowS--
} else {
@@ -445,18 +512,19 @@
lowN++
}
- huffNode[nodeNb].count = huffNode0[n1+1].count + huffNode0[n2+1].count
- huffNode0[n1+1].parent, huffNode0[n2+1].parent = uint16(nodeNb), uint16(nodeNb)
+ huffNode[nodeNb].setCount(huffNode0[n1+1].count() + huffNode0[n2+1].count())
+ huffNode0[n1+1].setParent(nodeNb)
+ huffNode0[n2+1].setParent(nodeNb)
nodeNb++
}
// distribute weights (unlimited tree height)
- huffNode[nodeRoot].nbBits = 0
+ huffNode[nodeRoot].setNbBits(0)
for n := nodeRoot - 1; n >= startNode; n-- {
- huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
+ huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
}
for n := uint16(0); n <= nonNullRank; n++ {
- huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
+ huffNode[n].setNbBits(huffNode[huffNode[n].parent()].nbBits() + 1)
}
s.actualTableLog = s.setMaxHeight(int(nonNullRank))
maxNbBits := s.actualTableLog
@@ -468,7 +536,7 @@
var nbPerRank [tableLogMax + 1]uint16
var valPerRank [16]uint16
for _, v := range huffNode[:nonNullRank+1] {
- nbPerRank[v.nbBits]++
+ nbPerRank[v.nbBits()]++
}
// determine stating value per rank
{
@@ -483,7 +551,7 @@
// push nbBits per symbol, symbol order
for _, v := range huffNode[:nonNullRank+1] {
- s.cTable[v.symbol].nBits = v.nbBits
+ s.cTable[v.symbol()].nBits = v.nbBits()
}
// assign value within rank, symbol order
@@ -529,12 +597,12 @@
pos := rank[r].current
rank[r].current++
prev := nodes[(pos-1)&huffNodesMask]
- for pos > rank[r].base && c > prev.count {
+ for pos > rank[r].base && c > prev.count() {
nodes[pos&huffNodesMask] = prev
pos--
prev = nodes[(pos-1)&huffNodesMask]
}
- nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)}
+ nodes[pos&huffNodesMask] = makeNodeElt(c, byte(n))
}
}
@@ -543,7 +611,7 @@
huffNode := s.nodes[1 : huffNodesLen+1]
//huffNode = huffNode[: huffNodesLen]
- largestBits := huffNode[lastNonNull].nbBits
+ largestBits := huffNode[lastNonNull].nbBits()
// early exit : no elt > maxNbBits
if largestBits <= maxNbBits {
@@ -553,14 +621,14 @@
baseCost := int(1) << (largestBits - maxNbBits)
n := uint32(lastNonNull)
- for huffNode[n].nbBits > maxNbBits {
- totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits))
- huffNode[n].nbBits = maxNbBits
+ for huffNode[n].nbBits() > maxNbBits {
+ totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits()))
+ huffNode[n].setNbBits(maxNbBits)
n--
}
// n stops at huffNode[n].nbBits <= maxNbBits
- for huffNode[n].nbBits == maxNbBits {
+ for huffNode[n].nbBits() == maxNbBits {
n--
}
// n end at index of smallest symbol using < maxNbBits
@@ -581,10 +649,10 @@
{
currentNbBits := maxNbBits
for pos := int(n); pos >= 0; pos-- {
- if huffNode[pos].nbBits >= currentNbBits {
+ if huffNode[pos].nbBits() >= currentNbBits {
continue
}
- currentNbBits = huffNode[pos].nbBits // < maxNbBits
+ currentNbBits = huffNode[pos].nbBits() // < maxNbBits
rankLast[maxNbBits-currentNbBits] = uint32(pos)
}
}
@@ -601,8 +669,8 @@
if lowPos == noSymbol {
break
}
- highTotal := huffNode[highPos].count
- lowTotal := 2 * huffNode[lowPos].count
+ highTotal := huffNode[highPos].count()
+ lowTotal := 2 * huffNode[lowPos].count()
if highTotal <= lowTotal {
break
}
@@ -618,13 +686,14 @@
// this rank is no longer empty
rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]
}
- huffNode[rankLast[nBitsToDecrease]].nbBits++
+ huffNode[rankLast[nBitsToDecrease]].setNbBits(1 +
+ huffNode[rankLast[nBitsToDecrease]].nbBits())
if rankLast[nBitsToDecrease] == 0 {
/* special case, reached largest symbol */
rankLast[nBitsToDecrease] = noSymbol
} else {
rankLast[nBitsToDecrease]--
- if huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease {
+ if huffNode[rankLast[nBitsToDecrease]].nbBits() != maxNbBits-nBitsToDecrease {
rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */
}
}
@@ -632,15 +701,15 @@
for totalCost < 0 { /* Sometimes, cost correction overshoot */
if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
- for huffNode[n].nbBits == maxNbBits {
+ for huffNode[n].nbBits() == maxNbBits {
n--
}
- huffNode[n+1].nbBits--
+ huffNode[n+1].setNbBits(huffNode[n+1].nbBits() - 1)
rankLast[1] = n + 1
totalCost++
continue
}
- huffNode[rankLast[1]+1].nbBits--
+ huffNode[rankLast[1]+1].setNbBits(huffNode[rankLast[1]+1].nbBits() - 1)
rankLast[1]++
totalCost++
}
@@ -648,9 +717,26 @@
return maxNbBits
}
-type nodeElt struct {
- count uint32
- parent uint16
- symbol byte
- nbBits uint8
+// A nodeElt is the fields
+//
+// count uint32
+// parent uint16
+// symbol byte
+// nbBits uint8
+//
+// in some order, all squashed into an integer so that the compiler
+// always loads and stores entire nodeElts instead of separate fields.
+type nodeElt uint64
+
+func makeNodeElt(count uint32, symbol byte) nodeElt {
+ return nodeElt(count) | nodeElt(symbol)<<48
}
+
+func (e *nodeElt) count() uint32 { return uint32(*e) }
+func (e *nodeElt) parent() uint16 { return uint16(*e >> 32) }
+func (e *nodeElt) symbol() byte { return byte(*e >> 48) }
+func (e *nodeElt) nbBits() uint8 { return uint8(*e >> 56) }
+
+func (e *nodeElt) setCount(c uint32) { *e = (*e)&0xffffffff00000000 | nodeElt(c) }
+func (e *nodeElt) setParent(p int16) { *e = (*e)&0xffff0000ffffffff | nodeElt(uint16(p))<<32 }
+func (e *nodeElt) setNbBits(n uint8) { *e = (*e)&0x00ffffffffffffff | nodeElt(n)<<56 }
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
index 41703bb..0f56b02 100644
--- a/vendor/github.com/klauspost/compress/huff0/decompress.go
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -4,13 +4,13 @@
"errors"
"fmt"
"io"
+ "sync"
"github.com/klauspost/compress/fse"
)
type dTable struct {
single []dEntrySingle
- double []dEntryDouble
}
// single-symbols decoding
@@ -18,13 +18,6 @@
entry uint16
}
-// double-symbols decoding
-type dEntryDouble struct {
- seq uint16
- nBits uint8
- len uint8
-}
-
// Uses special code for all tables that are < 8 bits.
const use8BitTables = true
@@ -34,7 +27,7 @@
// If no Scratch is provided a new one is allocated.
// The returned Scratch can be used for encoding or decoding input using this table.
func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
- s, err = s.prepare(in)
+ s, err = s.prepare(nil)
if err != nil {
return s, nil, err
}
@@ -68,7 +61,7 @@
b, err := fse.Decompress(in[:iSize], s.fse)
s.fse.Out = nil
if err != nil {
- return s, nil, err
+ return s, nil, fmt.Errorf("fse decompress returned: %w", err)
}
if len(b) > 255 {
return s, nil, errors.New("corrupt input: output table too large")
@@ -216,6 +209,7 @@
return &Decoder{
dt: s.dt,
actualTableLog: s.actualTableLog,
+ bufs: &s.decPool,
}
}
@@ -223,103 +217,15 @@
type Decoder struct {
dt dTable
actualTableLog uint8
+ bufs *sync.Pool
}
-// Decompress1X will decompress a 1X encoded stream.
-// The cap of the output buffer will be the maximum decompressed size.
-// The length of the supplied input must match the end of a block exactly.
-func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
- if len(d.dt.single) == 0 {
- return nil, errors.New("no table loaded")
+func (d *Decoder) buffer() *[4][256]byte {
+ buf, ok := d.bufs.Get().(*[4][256]byte)
+ if ok {
+ return buf
}
- if use8BitTables && d.actualTableLog <= 8 {
- return d.decompress1X8Bit(dst, src)
- }
- var br bitReaderShifted
- err := br.init(src)
- if err != nil {
- return dst, err
- }
- maxDecodedSize := cap(dst)
- dst = dst[:0]
-
- // Avoid bounds check by always having full sized table.
- const tlSize = 1 << tableLogMax
- const tlMask = tlSize - 1
- dt := d.dt.single[:tlSize]
-
- // Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
- var off uint8
-
- for br.off >= 8 {
- br.fillFast()
- v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+0] = uint8(v.entry >> 8)
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+1] = uint8(v.entry >> 8)
-
- // Refill
- br.fillFast()
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+2] = uint8(v.entry >> 8)
-
- v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
- br.advance(uint8(v.entry))
- buf[off+3] = uint8(v.entry >> 8)
-
- off += 4
- if off == 0 {
- if len(dst)+256 > maxDecodedSize {
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- dst = append(dst, buf[:]...)
- }
- }
-
- if len(dst)+int(off) > maxDecodedSize {
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- dst = append(dst, buf[:off]...)
-
- // br < 8, so uint8 is fine
- bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
- for bitsLeft > 0 {
- br.fill()
- if false && br.bitsRead >= 32 {
- if br.off >= 4 {
- v := br.in[br.off-4:]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- br.value = (br.value << 32) | uint64(low)
- br.bitsRead -= 32
- br.off -= 4
- } else {
- for br.off > 0 {
- br.value = (br.value << 8) | uint64(br.in[br.off-1])
- br.bitsRead -= 8
- br.off--
- }
- }
- }
- if len(dst) >= maxDecodedSize {
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
- }
- v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
- nBits := uint8(v.entry)
- br.advance(nBits)
- bitsLeft -= nBits
- dst = append(dst, uint8(v.entry>>8))
- }
- return dst, br.close()
+ return &[4][256]byte{}
}
// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
@@ -341,41 +247,258 @@
dt := d.dt.single[:256]
// Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
+ bufs := d.buffer()
+ buf := &bufs[0]
var off uint8
- shift := (8 - d.actualTableLog) & 7
+ switch d.actualTableLog {
+ case 8:
+ const shift = 0
+ for br.off >= 4 {
+ br.fillFast()
+ v := dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
- //fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
- for br.off >= 4 {
- br.fillFast()
- v := dt[br.peekByteFast()>>shift]
- br.advance(uint8(v.entry))
- buf[off+0] = uint8(v.entry >> 8)
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
- v = dt[br.peekByteFast()>>shift]
- br.advance(uint8(v.entry))
- buf[off+1] = uint8(v.entry >> 8)
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
- v = dt[br.peekByteFast()>>shift]
- br.advance(uint8(v.entry))
- buf[off+2] = uint8(v.entry >> 8)
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
- v = dt[br.peekByteFast()>>shift]
- br.advance(uint8(v.entry))
- buf[off+3] = uint8(v.entry >> 8)
-
- off += 4
- if off == 0 {
- if len(dst)+256 > maxDecodedSize {
- br.close()
- return nil, ErrMaxDecodedSizeExceeded
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ br.close()
+ d.bufs.Put(bufs)
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
}
- dst = append(dst, buf[:]...)
}
+ case 7:
+ const shift = 8 - 7
+ for br.off >= 4 {
+ br.fillFast()
+ v := dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ br.close()
+ d.bufs.Put(bufs)
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+ case 6:
+ const shift = 8 - 6
+ for br.off >= 4 {
+ br.fillFast()
+ v := dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+ case 5:
+ const shift = 8 - 5
+ for br.off >= 4 {
+ br.fillFast()
+ v := dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+ case 4:
+ const shift = 8 - 4
+ for br.off >= 4 {
+ br.fillFast()
+ v := dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+ case 3:
+ const shift = 8 - 3
+ for br.off >= 4 {
+ br.fillFast()
+ v := dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+ case 2:
+ const shift = 8 - 2
+ for br.off >= 4 {
+ br.fillFast()
+ v := dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+ case 1:
+ const shift = 8 - 1
+ for br.off >= 4 {
+ br.fillFast()
+ v := dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[uint8(br.value>>(56+shift))]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+ default:
+ d.bufs.Put(bufs)
+ return nil, fmt.Errorf("invalid tablelog: %d", d.actualTableLog)
}
if len(dst)+int(off) > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -383,6 +506,8 @@
// br < 4, so uint8 is fine
bitsLeft := int8(uint8(br.off)*8 + (64 - br.bitsRead))
+ shift := (8 - d.actualTableLog) & 7
+
for bitsLeft > 0 {
if br.bitsRead >= 64-8 {
for br.off > 0 {
@@ -393,6 +518,7 @@
}
if len(dst) >= maxDecodedSize {
br.close()
+ d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
v := dt[br.peekByteFast()>>shift]
@@ -401,6 +527,7 @@
bitsLeft -= int8(nBits)
dst = append(dst, uint8(v.entry>>8))
}
+ d.bufs.Put(bufs)
return dst, br.close()
}
@@ -420,33 +547,35 @@
dt := d.dt.single[:256]
// Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
+ bufs := d.buffer()
+ buf := &bufs[0]
var off uint8
- const shift = 0
+ const shift = 56
//fmt.Printf("mask: %b, tl:%d\n", mask, d.actualTableLog)
for br.off >= 4 {
br.fillFast()
- v := dt[br.peekByteFast()>>shift]
+ v := dt[uint8(br.value>>shift)]
br.advance(uint8(v.entry))
buf[off+0] = uint8(v.entry >> 8)
- v = dt[br.peekByteFast()>>shift]
+ v = dt[uint8(br.value>>shift)]
br.advance(uint8(v.entry))
buf[off+1] = uint8(v.entry >> 8)
- v = dt[br.peekByteFast()>>shift]
+ v = dt[uint8(br.value>>shift)]
br.advance(uint8(v.entry))
buf[off+2] = uint8(v.entry >> 8)
- v = dt[br.peekByteFast()>>shift]
+ v = dt[uint8(br.value>>shift)]
br.advance(uint8(v.entry))
buf[off+3] = uint8(v.entry >> 8)
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -455,6 +584,7 @@
}
if len(dst)+int(off) > maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
@@ -471,15 +601,17 @@
}
}
if len(dst) >= maxDecodedSize {
+ d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
- v := dt[br.peekByteFast()>>shift]
+ v := dt[br.peekByteFast()]
nBits := uint8(v.entry)
br.advance(nBits)
bitsLeft -= int8(nBits)
dst = append(dst, uint8(v.entry>>8))
}
+ d.bufs.Put(bufs)
return dst, br.close()
}
@@ -487,196 +619,6 @@
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
// the uncompressed data exactly.
-func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
- if len(d.dt.single) == 0 {
- return nil, errors.New("no table loaded")
- }
- if len(src) < 6+(4*1) {
- return nil, errors.New("input too small")
- }
- if use8BitTables && d.actualTableLog <= 8 {
- return d.decompress4X8bit(dst, src)
- }
-
- var br [4]bitReaderShifted
- start := 6
- for i := 0; i < 3; i++ {
- length := int(src[i*2]) | (int(src[i*2+1]) << 8)
- if start+length >= len(src) {
- return nil, errors.New("truncated input (or invalid offset)")
- }
- err := br[i].init(src[start : start+length])
- if err != nil {
- return nil, err
- }
- start += length
- }
- err := br[3].init(src[start:])
- if err != nil {
- return nil, err
- }
-
- // destination, offset to match first output
- dstSize := cap(dst)
- dst = dst[:dstSize]
- out := dst
- dstEvery := (dstSize + 3) / 4
-
- const tlSize = 1 << tableLogMax
- const tlMask = tlSize - 1
- single := d.dt.single[:tlSize]
-
- // Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
- var off uint8
- var decoded int
-
- // Decode 2 values from each decoder/loop.
- const bufoff = 256 / 4
- for {
- if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
- break
- }
-
- {
- const stream = 0
- const stream2 = 1
- br[stream].fillFast()
- br[stream2].fillFast()
-
- val := br[stream].peekBitsFast(d.actualTableLog)
- v := single[val&tlMask]
- br[stream].advance(uint8(v.entry))
- buf[off+bufoff*stream] = uint8(v.entry >> 8)
-
- val2 := br[stream2].peekBitsFast(d.actualTableLog)
- v2 := single[val2&tlMask]
- br[stream2].advance(uint8(v2.entry))
- buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
- val = br[stream].peekBitsFast(d.actualTableLog)
- v = single[val&tlMask]
- br[stream].advance(uint8(v.entry))
- buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-
- val2 = br[stream2].peekBitsFast(d.actualTableLog)
- v2 = single[val2&tlMask]
- br[stream2].advance(uint8(v2.entry))
- buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
- }
-
- {
- const stream = 2
- const stream2 = 3
- br[stream].fillFast()
- br[stream2].fillFast()
-
- val := br[stream].peekBitsFast(d.actualTableLog)
- v := single[val&tlMask]
- br[stream].advance(uint8(v.entry))
- buf[off+bufoff*stream] = uint8(v.entry >> 8)
-
- val2 := br[stream2].peekBitsFast(d.actualTableLog)
- v2 := single[val2&tlMask]
- br[stream2].advance(uint8(v2.entry))
- buf[off+bufoff*stream2] = uint8(v2.entry >> 8)
-
- val = br[stream].peekBitsFast(d.actualTableLog)
- v = single[val&tlMask]
- br[stream].advance(uint8(v.entry))
- buf[off+bufoff*stream+1] = uint8(v.entry >> 8)
-
- val2 = br[stream2].peekBitsFast(d.actualTableLog)
- v2 = single[val2&tlMask]
- br[stream2].advance(uint8(v2.entry))
- buf[off+bufoff*stream2+1] = uint8(v2.entry >> 8)
- }
-
- off += 2
-
- if off == bufoff {
- if bufoff > dstEvery {
- return nil, errors.New("corruption detected: stream overrun 1")
- }
- copy(out, buf[:bufoff])
- copy(out[dstEvery:], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
- off = 0
- out = out[bufoff:]
- decoded += 256
- // There must at least be 3 buffers left.
- if len(out) < dstEvery*3 {
- return nil, errors.New("corruption detected: stream overrun 2")
- }
- }
- }
- if off > 0 {
- ioff := int(off)
- if len(out) < dstEvery*3+ioff {
- return nil, errors.New("corruption detected: stream overrun 3")
- }
- copy(out, buf[:off])
- copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
- decoded += int(off) * 4
- out = out[off:]
- }
-
- // Decode remaining.
- for i := range br {
- offset := dstEvery * i
- br := &br[i]
- bitsLeft := br.off*8 + uint(64-br.bitsRead)
- for bitsLeft > 0 {
- br.fill()
- if false && br.bitsRead >= 32 {
- if br.off >= 4 {
- v := br.in[br.off-4:]
- v = v[:4]
- low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
- br.value = (br.value << 32) | uint64(low)
- br.bitsRead -= 32
- br.off -= 4
- } else {
- for br.off > 0 {
- br.value = (br.value << 8) | uint64(br.in[br.off-1])
- br.bitsRead -= 8
- br.off--
- }
- }
- }
- // end inline...
- if offset >= len(out) {
- return nil, errors.New("corruption detected: stream overrun 4")
- }
-
- // Read value and increment offset.
- val := br.peekBitsFast(d.actualTableLog)
- v := single[val&tlMask].entry
- nBits := uint8(v)
- br.advance(nBits)
- bitsLeft -= uint(nBits)
- out[offset] = uint8(v >> 8)
- offset++
- }
- decoded += offset - dstEvery*i
- err = br.close()
- if err != nil {
- return nil, err
- }
- }
- if dstSize != decoded {
- return nil, errors.New("corruption detected: short output block")
- }
- return dst, nil
-}
-
-// Decompress4X will decompress a 4X encoded stream.
-// The length of the supplied input must match the end of a block exactly.
-// The *capacity* of the dst slice must match the destination size of
-// the uncompressed data exactly.
func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
if d.actualTableLog == 8 {
return d.decompress4X8bitExactly(dst, src)
@@ -706,19 +648,18 @@
out := dst
dstEvery := (dstSize + 3) / 4
- shift := (8 - d.actualTableLog) & 7
+ shift := (56 + (8 - d.actualTableLog)) & 63
const tlSize = 1 << 8
- const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
+ buf := d.buffer()
var off uint8
var decoded int
// Decode 4 values from each decoder/loop.
- const bufoff = 256 / 4
+ const bufoff = 256
for {
if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
break
@@ -728,120 +669,144 @@
// Interleave 2 decodes.
const stream = 0
const stream2 = 1
- br[stream].fillFast()
- br[stream2].fillFast()
+ br1 := &br[stream]
+ br2 := &br[stream2]
+ br1.fillFast()
+ br2.fillFast()
- v := single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream] = uint8(v >> 8)
- br[stream].advance(uint8(v))
+ v := single[uint8(br1.value>>shift)].entry
+ v2 := single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off] = uint8(v >> 8)
+ buf[stream2][off] = uint8(v2 >> 8)
- v2 := single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+1] = uint8(v >> 8)
+ buf[stream2][off+1] = uint8(v2 >> 8)
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+1] = uint8(v >> 8)
- br[stream].advance(uint8(v))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+2] = uint8(v >> 8)
+ buf[stream2][off+2] = uint8(v2 >> 8)
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
-
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+2] = uint8(v >> 8)
- br[stream].advance(uint8(v))
-
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
-
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+3] = uint8(v >> 8)
- br[stream].advance(uint8(v))
-
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+3] = uint8(v >> 8)
+ buf[stream2][off+3] = uint8(v2 >> 8)
}
{
const stream = 2
const stream2 = 3
- br[stream].fillFast()
- br[stream2].fillFast()
+ br1 := &br[stream]
+ br2 := &br[stream2]
+ br1.fillFast()
+ br2.fillFast()
- v := single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream] = uint8(v >> 8)
- br[stream].advance(uint8(v))
+ v := single[uint8(br1.value>>shift)].entry
+ v2 := single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off] = uint8(v >> 8)
+ buf[stream2][off] = uint8(v2 >> 8)
- v2 := single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+1] = uint8(v >> 8)
+ buf[stream2][off+1] = uint8(v2 >> 8)
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+1] = uint8(v >> 8)
- br[stream].advance(uint8(v))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+2] = uint8(v >> 8)
+ buf[stream2][off+2] = uint8(v2 >> 8)
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
-
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+2] = uint8(v >> 8)
- br[stream].advance(uint8(v))
-
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
-
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+3] = uint8(v >> 8)
- br[stream].advance(uint8(v))
-
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+3] = uint8(v >> 8)
+ buf[stream2][off+3] = uint8(v2 >> 8)
}
off += 4
- if off == bufoff {
+ if off == 0 {
if bufoff > dstEvery {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
- copy(out, buf[:bufoff])
- copy(out[dstEvery:], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
- off = 0
- out = out[bufoff:]
- decoded += 256
// There must at least be 3 buffers left.
- if len(out) < dstEvery*3 {
+ if len(out)-bufoff < dstEvery*3 {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
+ //copy(out, buf[0][:])
+ //copy(out[dstEvery:], buf[1][:])
+ //copy(out[dstEvery*2:], buf[2][:])
+ *(*[bufoff]byte)(out) = buf[0]
+ *(*[bufoff]byte)(out[dstEvery:]) = buf[1]
+ *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
+ *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
+ out = out[bufoff:]
+ decoded += bufoff * 4
}
}
if off > 0 {
ioff := int(off)
if len(out) < dstEvery*3+ioff {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 3")
}
- copy(out, buf[:off])
- copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+ copy(out, buf[0][:off])
+ copy(out[dstEvery:], buf[1][:off])
+ copy(out[dstEvery*2:], buf[2][:off])
+ copy(out[dstEvery*3:], buf[3][:off])
decoded += int(off) * 4
out = out[off:]
}
// Decode remaining.
+ // Decode remaining.
+ remainBytes := dstEvery - (decoded / 4)
for i := range br {
offset := dstEvery * i
+ endsAt := offset + remainBytes
+ if endsAt > len(out) {
+ endsAt = len(out)
+ }
br := &br[i]
- bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+ bitsLeft := br.remaining()
for bitsLeft > 0 {
if br.finished() {
+ d.bufs.Put(buf)
return nil, io.ErrUnexpectedEOF
}
if br.bitsRead >= 56 {
@@ -861,24 +826,31 @@
}
}
// end inline...
- if offset >= len(out) {
+ if offset >= endsAt {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
// Read value and increment offset.
- v := single[br.peekByteFast()>>shift].entry
+ v := single[uint8(br.value>>shift)].entry
nBits := uint8(v)
br.advance(nBits)
- bitsLeft -= int(nBits)
+ bitsLeft -= uint(nBits)
out[offset] = uint8(v >> 8)
offset++
}
+ if offset != endsAt {
+ d.bufs.Put(buf)
+ return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+ }
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
+ d.bufs.Put(buf)
return nil, err
}
}
+ d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
@@ -914,18 +886,17 @@
out := dst
dstEvery := (dstSize + 3) / 4
- const shift = 0
+ const shift = 56
const tlSize = 1 << 8
- const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
- var buf [256]byte
+ buf := d.buffer()
var off uint8
var decoded int
// Decode 4 values from each decoder/loop.
- const bufoff = 256 / 4
+ const bufoff = 256
for {
if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
break
@@ -935,98 +906,116 @@
// Interleave 2 decodes.
const stream = 0
const stream2 = 1
- br[stream].fillFast()
- br[stream2].fillFast()
+ br1 := &br[stream]
+ br2 := &br[stream2]
+ br1.fillFast()
+ br2.fillFast()
- v := single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream] = uint8(v >> 8)
- br[stream].advance(uint8(v))
+ v := single[uint8(br1.value>>shift)].entry
+ v2 := single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off] = uint8(v >> 8)
+ buf[stream2][off] = uint8(v2 >> 8)
- v2 := single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+1] = uint8(v >> 8)
+ buf[stream2][off+1] = uint8(v2 >> 8)
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+1] = uint8(v >> 8)
- br[stream].advance(uint8(v))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+2] = uint8(v >> 8)
+ buf[stream2][off+2] = uint8(v2 >> 8)
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
-
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+2] = uint8(v >> 8)
- br[stream].advance(uint8(v))
-
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
-
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+3] = uint8(v >> 8)
- br[stream].advance(uint8(v))
-
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+3] = uint8(v >> 8)
+ buf[stream2][off+3] = uint8(v2 >> 8)
}
{
const stream = 2
const stream2 = 3
- br[stream].fillFast()
- br[stream2].fillFast()
+ br1 := &br[stream]
+ br2 := &br[stream2]
+ br1.fillFast()
+ br2.fillFast()
- v := single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream] = uint8(v >> 8)
- br[stream].advance(uint8(v))
+ v := single[uint8(br1.value>>shift)].entry
+ v2 := single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off] = uint8(v >> 8)
+ buf[stream2][off] = uint8(v2 >> 8)
- v2 := single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+1] = uint8(v >> 8)
+ buf[stream2][off+1] = uint8(v2 >> 8)
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+1] = uint8(v >> 8)
- br[stream].advance(uint8(v))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+2] = uint8(v >> 8)
+ buf[stream2][off+2] = uint8(v2 >> 8)
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
-
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+2] = uint8(v >> 8)
- br[stream].advance(uint8(v))
-
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
-
- v = single[br[stream].peekByteFast()>>shift].entry
- buf[off+bufoff*stream+3] = uint8(v >> 8)
- br[stream].advance(uint8(v))
-
- v2 = single[br[stream2].peekByteFast()>>shift].entry
- buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
- br[stream2].advance(uint8(v2))
+ v = single[uint8(br1.value>>shift)].entry
+ v2 = single[uint8(br2.value>>shift)].entry
+ br1.bitsRead += uint8(v)
+ br1.value <<= v & 63
+ br2.bitsRead += uint8(v2)
+ br2.value <<= v2 & 63
+ buf[stream][off+3] = uint8(v >> 8)
+ buf[stream2][off+3] = uint8(v2 >> 8)
}
off += 4
- if off == bufoff {
+ if off == 0 {
if bufoff > dstEvery {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
- copy(out, buf[:bufoff])
- copy(out[dstEvery:], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:], buf[bufoff*3:bufoff*4])
- off = 0
- out = out[bufoff:]
- decoded += 256
// There must at least be 3 buffers left.
- if len(out) < dstEvery*3 {
+ if len(out)-bufoff < dstEvery*3 {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
+
+ //copy(out, buf[0][:])
+ //copy(out[dstEvery:], buf[1][:])
+ //copy(out[dstEvery*2:], buf[2][:])
+ // copy(out[dstEvery*3:], buf[3][:])
+ *(*[bufoff]byte)(out) = buf[0]
+ *(*[bufoff]byte)(out[dstEvery:]) = buf[1]
+ *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
+ *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
+ out = out[bufoff:]
+ decoded += bufoff * 4
}
}
if off > 0 {
@@ -1034,21 +1023,27 @@
if len(out) < dstEvery*3+ioff {
return nil, errors.New("corruption detected: stream overrun 3")
}
- copy(out, buf[:off])
- copy(out[dstEvery:dstEvery+ioff], buf[bufoff:bufoff*2])
- copy(out[dstEvery*2:dstEvery*2+ioff], buf[bufoff*2:bufoff*3])
- copy(out[dstEvery*3:dstEvery*3+ioff], buf[bufoff*3:bufoff*4])
+ copy(out, buf[0][:off])
+ copy(out[dstEvery:], buf[1][:off])
+ copy(out[dstEvery*2:], buf[2][:off])
+ copy(out[dstEvery*3:], buf[3][:off])
decoded += int(off) * 4
out = out[off:]
}
// Decode remaining.
+ remainBytes := dstEvery - (decoded / 4)
for i := range br {
offset := dstEvery * i
+ endsAt := offset + remainBytes
+ if endsAt > len(out) {
+ endsAt = len(out)
+ }
br := &br[i]
- bitsLeft := int(br.off*8) + int(64-br.bitsRead)
+ bitsLeft := br.remaining()
for bitsLeft > 0 {
if br.finished() {
+ d.bufs.Put(buf)
return nil, io.ErrUnexpectedEOF
}
if br.bitsRead >= 56 {
@@ -1068,24 +1063,32 @@
}
}
// end inline...
- if offset >= len(out) {
+ if offset >= endsAt {
+ d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
// Read value and increment offset.
- v := single[br.peekByteFast()>>shift].entry
+ v := single[br.peekByteFast()].entry
nBits := uint8(v)
br.advance(nBits)
- bitsLeft -= int(nBits)
+ bitsLeft -= uint(nBits)
out[offset] = uint8(v >> 8)
offset++
}
+ if offset != endsAt {
+ d.bufs.Put(buf)
+ return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+ }
+
decoded += offset - dstEvery*i
err = br.close()
if err != nil {
+ d.bufs.Put(buf)
return nil, err
}
}
+ d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
@@ -1133,7 +1136,7 @@
errs++
}
if errs > 0 {
- fmt.Fprintf(w, "%d errros in base, stopping\n", errs)
+ fmt.Fprintf(w, "%d errors in base, stopping\n", errs)
continue
}
// Ensure that all combinations are covered.
@@ -1149,7 +1152,7 @@
errs++
}
if errs > 20 {
- fmt.Fprintf(w, "%d errros, stopping\n", errs)
+ fmt.Fprintf(w, "%d errors, stopping\n", errs)
break
}
}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
new file mode 100644
index 0000000..ba7e8e6
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.go
@@ -0,0 +1,226 @@
+//go:build amd64 && !appengine && !noasm && gc
+// +build amd64,!appengine,!noasm,gc
+
+// This file contains the specialisation of Decoder.Decompress4X
+// and Decoder.Decompress1X that use an asm implementation of thir main loops.
+package huff0
+
+import (
+ "errors"
+ "fmt"
+
+ "github.com/klauspost/compress/internal/cpuinfo"
+)
+
+// decompress4x_main_loop_x86 is an x86 assembler implementation
+// of Decompress4X when tablelog > 8.
+//
+//go:noescape
+func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+
+// decompress4x_8b_loop_x86 is an x86 assembler implementation
+// of Decompress4X when tablelog <= 8 which decodes 4 entries
+// per loop.
+//
+//go:noescape
+func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+
+// fallback8BitSize is the size where using Go version is faster.
+const fallback8BitSize = 800
+
+type decompress4xContext struct {
+ pbr *[4]bitReaderShifted
+ peekBits uint8
+ out *byte
+ dstEvery int
+ tbl *dEntrySingle
+ decoded int
+ limit *byte
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ if len(src) < 6+(4*1) {
+ return nil, errors.New("input too small")
+ }
+
+ use8BitTables := d.actualTableLog <= 8
+ if cap(dst) < fallback8BitSize && use8BitTables {
+ return d.decompress4X8bit(dst, src)
+ }
+
+ var br [4]bitReaderShifted
+ // Decode "jump table"
+ start := 6
+ for i := 0; i < 3; i++ {
+ length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+ if start+length >= len(src) {
+ return nil, errors.New("truncated input (or invalid offset)")
+ }
+ err := br[i].init(src[start : start+length])
+ if err != nil {
+ return nil, err
+ }
+ start += length
+ }
+ err := br[3].init(src[start:])
+ if err != nil {
+ return nil, err
+ }
+
+ // destination, offset to match first output
+ dstSize := cap(dst)
+ dst = dst[:dstSize]
+ out := dst
+ dstEvery := (dstSize + 3) / 4
+
+ const tlSize = 1 << tableLogMax
+ const tlMask = tlSize - 1
+ single := d.dt.single[:tlSize]
+
+ var decoded int
+
+ if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
+ ctx := decompress4xContext{
+ pbr: &br,
+ peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+ out: &out[0],
+ dstEvery: dstEvery,
+ tbl: &single[0],
+ limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
+ }
+ if use8BitTables {
+ decompress4x_8b_main_loop_amd64(&ctx)
+ } else {
+ decompress4x_main_loop_amd64(&ctx)
+ }
+
+ decoded = ctx.decoded
+ out = out[decoded/4:]
+ }
+
+ // Decode remaining.
+ remainBytes := dstEvery - (decoded / 4)
+ for i := range br {
+ offset := dstEvery * i
+ endsAt := offset + remainBytes
+ if endsAt > len(out) {
+ endsAt = len(out)
+ }
+ br := &br[i]
+ bitsLeft := br.remaining()
+ for bitsLeft > 0 {
+ br.fill()
+ if offset >= endsAt {
+ return nil, errors.New("corruption detected: stream overrun 4")
+ }
+
+ // Read value and increment offset.
+ val := br.peekBitsFast(d.actualTableLog)
+ v := single[val&tlMask].entry
+ nBits := uint8(v)
+ br.advance(nBits)
+ bitsLeft -= uint(nBits)
+ out[offset] = uint8(v >> 8)
+ offset++
+ }
+ if offset != endsAt {
+ return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+ }
+ decoded += offset - dstEvery*i
+ err = br.close()
+ if err != nil {
+ return nil, err
+ }
+ }
+ if dstSize != decoded {
+ return nil, errors.New("corruption detected: short output block")
+ }
+ return dst, nil
+}
+
+// decompress4x_main_loop_x86 is an x86 assembler implementation
+// of Decompress1X when tablelog > 8.
+//
+//go:noescape
+func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+
+// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
+// of Decompress1X when tablelog > 8.
+//
+//go:noescape
+func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+
+type decompress1xContext struct {
+ pbr *bitReaderShifted
+ peekBits uint8
+ out *byte
+ outCap int
+ tbl *dEntrySingle
+ decoded int
+}
+
+// Error reported by asm implementations
+const error_max_decoded_size_exeeded = -1
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ var br bitReaderShifted
+ err := br.init(src)
+ if err != nil {
+ return dst, err
+ }
+ maxDecodedSize := cap(dst)
+ dst = dst[:maxDecodedSize]
+
+ const tlSize = 1 << tableLogMax
+ const tlMask = tlSize - 1
+
+ if maxDecodedSize >= 4 {
+ ctx := decompress1xContext{
+ pbr: &br,
+ out: &dst[0],
+ outCap: maxDecodedSize,
+ peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
+ tbl: &d.dt.single[0],
+ }
+
+ if cpuinfo.HasBMI2() {
+ decompress1x_main_loop_bmi2(&ctx)
+ } else {
+ decompress1x_main_loop_amd64(&ctx)
+ }
+ if ctx.decoded == error_max_decoded_size_exeeded {
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+
+ dst = dst[:ctx.decoded]
+ }
+
+ // br < 8, so uint8 is fine
+ bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+ for bitsLeft > 0 {
+ br.fill()
+ if len(dst) >= maxDecodedSize {
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+ nBits := uint8(v.entry)
+ br.advance(nBits)
+ bitsLeft -= nBits
+ dst = append(dst, uint8(v.entry>>8))
+ }
+ return dst, br.close()
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
new file mode 100644
index 0000000..c4c7ab2
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_amd64.s
@@ -0,0 +1,830 @@
+// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.
+
+//go:build amd64 && !appengine && !noasm && gc
+
+// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_main_loop_amd64(SB), $0-8
+ // Preload values
+ MOVQ ctx+0(FP), AX
+ MOVBQZX 8(AX), DI
+ MOVQ 16(AX), BX
+ MOVQ 48(AX), SI
+ MOVQ 24(AX), R8
+ MOVQ 32(AX), R9
+ MOVQ (AX), R10
+
+ // Main loop
+main_loop:
+ XORL DX, DX
+ CMPQ BX, SI
+ SETGE DL
+
+ // br0.fillFast32()
+ MOVQ 32(R10), R11
+ MOVBQZX 40(R10), R12
+ CMPQ R12, $0x20
+ JBE skip_fill0
+ MOVQ 24(R10), AX
+ SUBQ $0x20, R12
+ SUBQ $0x04, AX
+ MOVQ (R10), R13
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 24(R10)
+ ORQ R13, R11
+
+ // exhausted += (br0.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
+
+skip_fill0:
+ // val0 := br0.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v0 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br0.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val1 := br0.peekTopBits(peekBits)
+ MOVQ DI, CX
+ MOVQ R11, R13
+ SHRQ CL, R13
+
+ // v1 := table[val1&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br0.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // these two writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (BX)
+
+ // update the bitreader structure
+ MOVQ R11, 32(R10)
+ MOVB R12, 40(R10)
+
+ // br1.fillFast32()
+ MOVQ 80(R10), R11
+ MOVBQZX 88(R10), R12
+ CMPQ R12, $0x20
+ JBE skip_fill1
+ MOVQ 72(R10), AX
+ SUBQ $0x20, R12
+ SUBQ $0x04, AX
+ MOVQ 48(R10), R13
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 72(R10)
+ ORQ R13, R11
+
+ // exhausted += (br1.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
+
+skip_fill1:
+ // val0 := br1.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v0 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br1.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val1 := br1.peekTopBits(peekBits)
+ MOVQ DI, CX
+ MOVQ R11, R13
+ SHRQ CL, R13
+
+ // v1 := table[val1&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br1.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // these two writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (BX)(R8*1)
+
+ // update the bitreader structure
+ MOVQ R11, 80(R10)
+ MOVB R12, 88(R10)
+
+ // br2.fillFast32()
+ MOVQ 128(R10), R11
+ MOVBQZX 136(R10), R12
+ CMPQ R12, $0x20
+ JBE skip_fill2
+ MOVQ 120(R10), AX
+ SUBQ $0x20, R12
+ SUBQ $0x04, AX
+ MOVQ 96(R10), R13
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 120(R10)
+ ORQ R13, R11
+
+ // exhausted += (br2.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
+
+skip_fill2:
+ // val0 := br2.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v0 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br2.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val1 := br2.peekTopBits(peekBits)
+ MOVQ DI, CX
+ MOVQ R11, R13
+ SHRQ CL, R13
+
+ // v1 := table[val1&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br2.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // these two writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ MOVW AX, (BX)(R8*2)
+
+ // update the bitreader structure
+ MOVQ R11, 128(R10)
+ MOVB R12, 136(R10)
+
+ // br3.fillFast32()
+ MOVQ 176(R10), R11
+ MOVBQZX 184(R10), R12
+ CMPQ R12, $0x20
+ JBE skip_fill3
+ MOVQ 168(R10), AX
+ SUBQ $0x20, R12
+ SUBQ $0x04, AX
+ MOVQ 144(R10), R13
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (AX)(R13*1), R13
+ MOVQ R12, CX
+ SHLQ CL, R13
+ MOVQ AX, 168(R10)
+ ORQ R13, R11
+
+ // exhausted += (br3.off < 4)
+ CMPQ AX, $0x04
+ ADCB $+0, DL
+
+skip_fill3:
+ // val0 := br3.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v0 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br3.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ DI, CX
+ MOVQ R11, R13
+ SHRQ CL, R13
+
+ // v1 := table[val1&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br3.advance(uint8(v1.entry))
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // these two writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ LEAQ (R8)(R8*2), CX
+ MOVW AX, (BX)(CX*1)
+
+ // update the bitreader structure
+ MOVQ R11, 176(R10)
+ MOVB R12, 184(R10)
+ ADDQ $0x02, BX
+ TESTB DL, DL
+ JZ main_loop
+ MOVQ ctx+0(FP), AX
+ SUBQ 16(AX), BX
+ SHLQ $0x02, BX
+ MOVQ BX, 40(AX)
+ RET
+
+// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
+TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
+ // Preload values
+ MOVQ ctx+0(FP), CX
+ MOVBQZX 8(CX), DI
+ MOVQ 16(CX), BX
+ MOVQ 48(CX), SI
+ MOVQ 24(CX), R8
+ MOVQ 32(CX), R9
+ MOVQ (CX), R10
+
+ // Main loop
+main_loop:
+ XORL DX, DX
+ CMPQ BX, SI
+ SETGE DL
+
+ // br0.fillFast32()
+ MOVQ 32(R10), R11
+ MOVBQZX 40(R10), R12
+ CMPQ R12, $0x20
+ JBE skip_fill0
+ MOVQ 24(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ (R10), R14
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 24(R10)
+ ORQ R14, R11
+
+ // exhausted += (br0.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
+
+skip_fill0:
+ // val0 := br0.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v0 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br0.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val1 := br0.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v1 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br0.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+ BSWAPL AX
+
+ // val2 := br0.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v2 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br0.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val3 := br0.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v3 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br0.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (BX)
+
+ // update the bitreader structure
+ MOVQ R11, 32(R10)
+ MOVB R12, 40(R10)
+
+ // br1.fillFast32()
+ MOVQ 80(R10), R11
+ MOVBQZX 88(R10), R12
+ CMPQ R12, $0x20
+ JBE skip_fill1
+ MOVQ 72(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ 48(R10), R14
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 72(R10)
+ ORQ R14, R11
+
+ // exhausted += (br1.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
+
+skip_fill1:
+ // val0 := br1.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v0 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br1.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val1 := br1.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v1 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br1.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+ BSWAPL AX
+
+ // val2 := br1.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v2 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br1.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val3 := br1.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v3 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br1.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (BX)(R8*1)
+
+ // update the bitreader structure
+ MOVQ R11, 80(R10)
+ MOVB R12, 88(R10)
+
+ // br2.fillFast32()
+ MOVQ 128(R10), R11
+ MOVBQZX 136(R10), R12
+ CMPQ R12, $0x20
+ JBE skip_fill2
+ MOVQ 120(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ 96(R10), R14
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 120(R10)
+ ORQ R14, R11
+
+ // exhausted += (br2.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
+
+skip_fill2:
+ // val0 := br2.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v0 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br2.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val1 := br2.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v1 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br2.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+ BSWAPL AX
+
+ // val2 := br2.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v2 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br2.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val3 := br2.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v3 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br2.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ MOVL AX, (BX)(R8*2)
+
+ // update the bitreader structure
+ MOVQ R11, 128(R10)
+ MOVB R12, 136(R10)
+
+ // br3.fillFast32()
+ MOVQ 176(R10), R11
+ MOVBQZX 184(R10), R12
+ CMPQ R12, $0x20
+ JBE skip_fill3
+ MOVQ 168(R10), R13
+ SUBQ $0x20, R12
+ SUBQ $0x04, R13
+ MOVQ 144(R10), R14
+
+ // b.value |= uint64(low) << (b.bitsRead & 63)
+ MOVL (R13)(R14*1), R14
+ MOVQ R12, CX
+ SHLQ CL, R14
+ MOVQ R13, 168(R10)
+ ORQ R14, R11
+
+ // exhausted += (br3.off < 4)
+ CMPQ R13, $0x04
+ ADCB $+0, DL
+
+skip_fill3:
+ // val0 := br3.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v0 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br3.advance(uint8(v0.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val1 := br3.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v1 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br3.advance(uint8(v1.entry)
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+ BSWAPL AX
+
+ // val2 := br3.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v2 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br3.advance(uint8(v2.entry)
+ MOVB CH, AH
+ SHLQ CL, R11
+ ADDB CL, R12
+
+ // val3 := br3.peekTopBits(peekBits)
+ MOVQ R11, R13
+ MOVQ DI, CX
+ SHRQ CL, R13
+
+ // v3 := table[val0&mask]
+ MOVW (R9)(R13*2), CX
+
+ // br3.advance(uint8(v3.entry)
+ MOVB CH, AL
+ SHLQ CL, R11
+ ADDB CL, R12
+ BSWAPL AX
+
+ // these four writes get coalesced
+ // out[id * dstEvery + 0] = uint8(v0.entry >> 8)
+ // out[id * dstEvery + 1] = uint8(v1.entry >> 8)
+ // out[id * dstEvery + 3] = uint8(v2.entry >> 8)
+ // out[id * dstEvery + 4] = uint8(v3.entry >> 8)
+ LEAQ (R8)(R8*2), CX
+ MOVL AX, (BX)(CX*1)
+
+ // update the bitreader structure
+ MOVQ R11, 176(R10)
+ MOVB R12, 184(R10)
+ ADDQ $0x04, BX
+ TESTB DL, DL
+ JZ main_loop
+ MOVQ ctx+0(FP), AX
+ SUBQ 16(AX), BX
+ SHLQ $0x02, BX
+ MOVQ BX, 40(AX)
+ RET
+
+// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
+TEXT ·decompress1x_main_loop_amd64(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX
+ MOVQ 24(CX), BX
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exceeded
+ LEAQ (DX)(BX*1), BX
+ MOVQ (CX), SI
+ MOVQ (SI), R8
+ MOVQ 24(SI), R9
+ MOVQ 32(SI), R10
+ MOVBQZX 40(SI), R11
+ MOVQ 32(CX), SI
+ MOVBQZX 8(CX), DI
+ JMP loop_condition
+
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exceeded
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_1_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), R12
+ MOVQ R11, CX
+ SHLQ CL, R12
+ ORQ R12, R10
+
+bitReader_fillFast_2_end:
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ MOVQ DI, CX
+ MOVQ R10, R12
+ SHRQ CL, R12
+ MOVW (SI)(R12*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLQ CL, R10
+ BSWAPL AX
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ SUBQ 16(AX), DX
+ MOVQ DX, 40(AX)
+ MOVQ (AX), AX
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
+
+ // Report error
+error_max_decoded_size_exceeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET
+
+// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
+// Requires: BMI2
+TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
+ MOVQ ctx+0(FP), CX
+ MOVQ 16(CX), DX
+ MOVQ 24(CX), BX
+ CMPQ BX, $0x04
+ JB error_max_decoded_size_exceeded
+ LEAQ (DX)(BX*1), BX
+ MOVQ (CX), SI
+ MOVQ (SI), R8
+ MOVQ 24(SI), R9
+ MOVQ 32(SI), R10
+ MOVBQZX 40(SI), R11
+ MOVQ 32(CX), SI
+ MOVBQZX 8(CX), DI
+ JMP loop_condition
+
+main_loop:
+ // Check if we have room for 4 bytes in the output buffer
+ LEAQ 4(DX), CX
+ CMPQ CX, BX
+ JGE error_max_decoded_size_exceeded
+
+ // Decode 4 values
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_1_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX
+ SHLXQ R11, CX, CX
+ ORQ CX, R10
+
+bitReader_fillFast_1_end:
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX
+ CMPQ R11, $0x20
+ JL bitReader_fillFast_2_end
+ SUBQ $0x20, R11
+ SUBQ $0x04, R9
+ MOVL (R8)(R9*1), CX
+ SHLXQ R11, CX, CX
+ ORQ CX, R10
+
+bitReader_fillFast_2_end:
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AH
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ SHRXQ DI, R10, CX
+ MOVW (SI)(CX*2), CX
+ MOVB CH, AL
+ MOVBQZX CL, CX
+ ADDQ CX, R11
+ SHLXQ CX, R10, R10
+ BSWAPL AX
+
+ // Store the decoded values
+ MOVL AX, (DX)
+ ADDQ $0x04, DX
+
+loop_condition:
+ CMPQ R9, $0x08
+ JGE main_loop
+
+ // Update ctx structure
+ MOVQ ctx+0(FP), AX
+ SUBQ 16(AX), DX
+ MOVQ DX, 40(AX)
+ MOVQ (AX), AX
+ MOVQ R9, 24(AX)
+ MOVQ R10, 32(AX)
+ MOVB R11, 40(AX)
+ RET
+
+ // Report error
+error_max_decoded_size_exceeded:
+ MOVQ ctx+0(FP), AX
+ MOVQ $-1, CX
+ MOVQ CX, 40(AX)
+ RET
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress_generic.go b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
new file mode 100644
index 0000000..908c17d
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress_generic.go
@@ -0,0 +1,299 @@
+//go:build !amd64 || appengine || !gc || noasm
+// +build !amd64 appengine !gc noasm
+
+// This file contains a generic implementation of Decoder.Decompress4X.
+package huff0
+
+import (
+ "errors"
+ "fmt"
+)
+
+// Decompress4X will decompress a 4X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// The *capacity* of the dst slice must match the destination size of
+// the uncompressed data exactly.
+func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ if len(src) < 6+(4*1) {
+ return nil, errors.New("input too small")
+ }
+ if use8BitTables && d.actualTableLog <= 8 {
+ return d.decompress4X8bit(dst, src)
+ }
+
+ var br [4]bitReaderShifted
+ // Decode "jump table"
+ start := 6
+ for i := 0; i < 3; i++ {
+ length := int(src[i*2]) | (int(src[i*2+1]) << 8)
+ if start+length >= len(src) {
+ return nil, errors.New("truncated input (or invalid offset)")
+ }
+ err := br[i].init(src[start : start+length])
+ if err != nil {
+ return nil, err
+ }
+ start += length
+ }
+ err := br[3].init(src[start:])
+ if err != nil {
+ return nil, err
+ }
+
+ // destination, offset to match first output
+ dstSize := cap(dst)
+ dst = dst[:dstSize]
+ out := dst
+ dstEvery := (dstSize + 3) / 4
+
+ const tlSize = 1 << tableLogMax
+ const tlMask = tlSize - 1
+ single := d.dt.single[:tlSize]
+
+ // Use temp table to avoid bound checks/append penalty.
+ buf := d.buffer()
+ var off uint8
+ var decoded int
+
+ // Decode 2 values from each decoder/loop.
+ const bufoff = 256
+ for {
+ if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
+ break
+ }
+
+ {
+ const stream = 0
+ const stream2 = 1
+ br[stream].fillFast()
+ br[stream2].fillFast()
+
+ val := br[stream].peekBitsFast(d.actualTableLog)
+ val2 := br[stream2].peekBitsFast(d.actualTableLog)
+ v := single[val&tlMask]
+ v2 := single[val2&tlMask]
+ br[stream].advance(uint8(v.entry))
+ br[stream2].advance(uint8(v2.entry))
+ buf[stream][off] = uint8(v.entry >> 8)
+ buf[stream2][off] = uint8(v2.entry >> 8)
+
+ val = br[stream].peekBitsFast(d.actualTableLog)
+ val2 = br[stream2].peekBitsFast(d.actualTableLog)
+ v = single[val&tlMask]
+ v2 = single[val2&tlMask]
+ br[stream].advance(uint8(v.entry))
+ br[stream2].advance(uint8(v2.entry))
+ buf[stream][off+1] = uint8(v.entry >> 8)
+ buf[stream2][off+1] = uint8(v2.entry >> 8)
+ }
+
+ {
+ const stream = 2
+ const stream2 = 3
+ br[stream].fillFast()
+ br[stream2].fillFast()
+
+ val := br[stream].peekBitsFast(d.actualTableLog)
+ val2 := br[stream2].peekBitsFast(d.actualTableLog)
+ v := single[val&tlMask]
+ v2 := single[val2&tlMask]
+ br[stream].advance(uint8(v.entry))
+ br[stream2].advance(uint8(v2.entry))
+ buf[stream][off] = uint8(v.entry >> 8)
+ buf[stream2][off] = uint8(v2.entry >> 8)
+
+ val = br[stream].peekBitsFast(d.actualTableLog)
+ val2 = br[stream2].peekBitsFast(d.actualTableLog)
+ v = single[val&tlMask]
+ v2 = single[val2&tlMask]
+ br[stream].advance(uint8(v.entry))
+ br[stream2].advance(uint8(v2.entry))
+ buf[stream][off+1] = uint8(v.entry >> 8)
+ buf[stream2][off+1] = uint8(v2.entry >> 8)
+ }
+
+ off += 2
+
+ if off == 0 {
+ if bufoff > dstEvery {
+ d.bufs.Put(buf)
+ return nil, errors.New("corruption detected: stream overrun 1")
+ }
+ // There must at least be 3 buffers left.
+ if len(out)-bufoff < dstEvery*3 {
+ d.bufs.Put(buf)
+ return nil, errors.New("corruption detected: stream overrun 2")
+ }
+ //copy(out, buf[0][:])
+ //copy(out[dstEvery:], buf[1][:])
+ //copy(out[dstEvery*2:], buf[2][:])
+ //copy(out[dstEvery*3:], buf[3][:])
+ *(*[bufoff]byte)(out) = buf[0]
+ *(*[bufoff]byte)(out[dstEvery:]) = buf[1]
+ *(*[bufoff]byte)(out[dstEvery*2:]) = buf[2]
+ *(*[bufoff]byte)(out[dstEvery*3:]) = buf[3]
+ out = out[bufoff:]
+ decoded += bufoff * 4
+ }
+ }
+ if off > 0 {
+ ioff := int(off)
+ if len(out) < dstEvery*3+ioff {
+ d.bufs.Put(buf)
+ return nil, errors.New("corruption detected: stream overrun 3")
+ }
+ copy(out, buf[0][:off])
+ copy(out[dstEvery:], buf[1][:off])
+ copy(out[dstEvery*2:], buf[2][:off])
+ copy(out[dstEvery*3:], buf[3][:off])
+ decoded += int(off) * 4
+ out = out[off:]
+ }
+
+ // Decode remaining.
+ remainBytes := dstEvery - (decoded / 4)
+ for i := range br {
+ offset := dstEvery * i
+ endsAt := offset + remainBytes
+ if endsAt > len(out) {
+ endsAt = len(out)
+ }
+ br := &br[i]
+ bitsLeft := br.remaining()
+ for bitsLeft > 0 {
+ br.fill()
+ if offset >= endsAt {
+ d.bufs.Put(buf)
+ return nil, errors.New("corruption detected: stream overrun 4")
+ }
+
+ // Read value and increment offset.
+ val := br.peekBitsFast(d.actualTableLog)
+ v := single[val&tlMask].entry
+ nBits := uint8(v)
+ br.advance(nBits)
+ bitsLeft -= uint(nBits)
+ out[offset] = uint8(v >> 8)
+ offset++
+ }
+ if offset != endsAt {
+ d.bufs.Put(buf)
+ return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
+ }
+ decoded += offset - dstEvery*i
+ err = br.close()
+ if err != nil {
+ return nil, err
+ }
+ }
+ d.bufs.Put(buf)
+ if dstSize != decoded {
+ return nil, errors.New("corruption detected: short output block")
+ }
+ return dst, nil
+}
+
+// Decompress1X will decompress a 1X encoded stream.
+// The cap of the output buffer will be the maximum decompressed size.
+// The length of the supplied input must match the end of a block exactly.
+func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
+ if len(d.dt.single) == 0 {
+ return nil, errors.New("no table loaded")
+ }
+ if use8BitTables && d.actualTableLog <= 8 {
+ return d.decompress1X8Bit(dst, src)
+ }
+ var br bitReaderShifted
+ err := br.init(src)
+ if err != nil {
+ return dst, err
+ }
+ maxDecodedSize := cap(dst)
+ dst = dst[:0]
+
+ // Avoid bounds check by always having full sized table.
+ const tlSize = 1 << tableLogMax
+ const tlMask = tlSize - 1
+ dt := d.dt.single[:tlSize]
+
+ // Use temp table to avoid bound checks/append penalty.
+ bufs := d.buffer()
+ buf := &bufs[0]
+ var off uint8
+
+ for br.off >= 8 {
+ br.fillFast()
+ v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+0] = uint8(v.entry >> 8)
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+1] = uint8(v.entry >> 8)
+
+ // Refill
+ br.fillFast()
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+2] = uint8(v.entry >> 8)
+
+ v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
+ br.advance(uint8(v.entry))
+ buf[off+3] = uint8(v.entry >> 8)
+
+ off += 4
+ if off == 0 {
+ if len(dst)+256 > maxDecodedSize {
+ br.close()
+ d.bufs.Put(bufs)
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:]...)
+ }
+ }
+
+ if len(dst)+int(off) > maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ dst = append(dst, buf[:off]...)
+
+ // br < 8, so uint8 is fine
+ bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
+ for bitsLeft > 0 {
+ br.fill()
+ if false && br.bitsRead >= 32 {
+ if br.off >= 4 {
+ v := br.in[br.off-4:]
+ v = v[:4]
+ low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+ br.value = (br.value << 32) | uint64(low)
+ br.bitsRead -= 32
+ br.off -= 4
+ } else {
+ for br.off > 0 {
+ br.value = (br.value << 8) | uint64(br.in[br.off-1])
+ br.bitsRead -= 8
+ br.off--
+ }
+ }
+ }
+ if len(dst) >= maxDecodedSize {
+ d.bufs.Put(bufs)
+ br.close()
+ return nil, ErrMaxDecodedSizeExceeded
+ }
+ v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
+ nBits := uint8(v.entry)
+ br.advance(nBits)
+ bitsLeft -= nBits
+ dst = append(dst, uint8(v.entry>>8))
+ }
+ d.bufs.Put(bufs)
+ return dst, br.close()
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
index 7ec2022..77ecd68 100644
--- a/vendor/github.com/klauspost/compress/huff0/huff0.go
+++ b/vendor/github.com/klauspost/compress/huff0/huff0.go
@@ -8,6 +8,7 @@
"fmt"
"math"
"math/bits"
+ "sync"
"github.com/klauspost/compress/fse"
)
@@ -87,7 +88,7 @@
// Decoders will return ErrMaxDecodedSizeExceeded is this limit is exceeded.
MaxDecodedSize int
- br byteReader
+ srcLen int
// MaxSymbolValue will override the maximum symbol value of the next block.
MaxSymbolValue uint8
@@ -116,6 +117,7 @@
nodes []nodeElt
tmpOut [4][]byte
fse *fse.Scratch
+ decPool sync.Pool // *[4][256]byte buffers.
huffWeight [maxSymbolValue + 1]byte
}
@@ -168,7 +170,7 @@
if s.fse == nil {
s.fse = &fse.Scratch{}
}
- s.br.init(in)
+ s.srcLen = len(in)
return s, nil
}
@@ -245,6 +247,68 @@
return nil
}
+func (c cTable) estTableSize(s *Scratch) (sz int, err error) {
+ var (
+ // precomputed conversion table
+ bitsToWeight [tableLogMax + 1]byte
+ huffLog = s.actualTableLog
+ // last weight is not saved.
+ maxSymbolValue = uint8(s.symbolLen - 1)
+ huffWeight = s.huffWeight[:256]
+ )
+ const (
+ maxFSETableLog = 6
+ )
+ // convert to weight
+ bitsToWeight[0] = 0
+ for n := uint8(1); n < huffLog+1; n++ {
+ bitsToWeight[n] = huffLog + 1 - n
+ }
+
+ // Acquire histogram for FSE.
+ hist := s.fse.Histogram()
+ hist = hist[:256]
+ for i := range hist[:16] {
+ hist[i] = 0
+ }
+ for n := uint8(0); n < maxSymbolValue; n++ {
+ v := bitsToWeight[c[n].nBits] & 15
+ huffWeight[n] = v
+ hist[v]++
+ }
+
+ // FSE compress if feasible.
+ if maxSymbolValue >= 2 {
+ huffMaxCnt := uint32(0)
+ huffMax := uint8(0)
+ for i, v := range hist[:16] {
+ if v == 0 {
+ continue
+ }
+ huffMax = byte(i)
+ if v > huffMaxCnt {
+ huffMaxCnt = v
+ }
+ }
+ s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
+ s.fse.TableLog = maxFSETableLog
+ b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
+ if err == nil && len(b) < int(s.symbolLen>>1) {
+ sz += 1 + len(b)
+ return sz, nil
+ }
+ // Unable to compress (RLE/uncompressible)
+ }
+ // write raw values as 4-bits (max : 15)
+ if maxSymbolValue > (256 - 128) {
+ // should not happen : likely means source cannot be compressed
+ return 0, ErrIncompressible
+ }
+ // special case, pack weights 4 bits/weight.
+ sz += 1 + int(maxSymbolValue/2)
+ return sz, nil
+}
+
// estimateSize returns the estimated size in bytes of the input represented in the
// histogram supplied.
func (c cTable) estimateSize(hist []uint32) int {