s2/dict.go - external/github.com/klauspost/compress - Git at Google

 // Copyright (c) 2022+ Klaus Post. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package s2

 import (
 	"bytes"
 	"encoding/binary"
 	"sync"
 )

 const (
 	// MinDictSize is the minimum dictionary size when repeat has been read.
 	MinDictSize = 16

 	// MaxDictSize is the maximum dictionary size when repeat has been read.
 	MaxDictSize = 65536

 	// MaxDictSrcOffset is the maximum offset where a dictionary entry can start.
 	MaxDictSrcOffset = 65535
 )

 // Dict contains a dictionary that can be used for encoding and decoding s2
 type Dict struct {
 	dict   []byte
 	repeat int // Repeat as index of dict

 	fast, better, best sync.Once
 	fastTable          *[1 << 14]uint16

 	betterTableShort *[1 << 14]uint16
 	betterTableLong  *[1 << 17]uint16

 	bestTableShort *[1 << 16]uint32
 	bestTableLong  *[1 << 19]uint32
 }

 // NewDict will read a dictionary.
 // It will return nil if the dictionary is invalid.
 func NewDict(dict []byte) *Dict {
 	if len(dict) == 0 {
 		return nil
 	}
 	var d Dict
 	// Repeat is the first value of the dict
 	r, n := binary.Uvarint(dict)
 	if n <= 0 {
 		return nil
 	}
 	dict = dict[n:]
 	d.dict = dict
 	if cap(d.dict) < len(d.dict)+16 {
 		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
 	}
 	if len(dict) < MinDictSize || len(dict) > MaxDictSize {
 		return nil
 	}
 	d.repeat = int(r)
 	if d.repeat > len(dict) {
 		return nil
 	}
 	return &d
 }

 // Bytes will return a serialized version of the dictionary.
 // The output can be sent to NewDict.
 func (d *Dict) Bytes() []byte {
 	dst := make([]byte, binary.MaxVarintLen16+len(d.dict))
 	return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...)
 }

 // MakeDict will create a dictionary.
 // 'data' must be at least MinDictSize.
 // If data is longer than MaxDictSize only the last MaxDictSize bytes will be used.
 // If searchStart is set the start repeat value will be set to the last
 // match of this content.
 // If no matches are found, it will attempt to find shorter matches.
 // This content should match the typical start of a block.
 // If at least 4 bytes cannot be matched, repeat is set to start of block.
 func MakeDict(data []byte, searchStart []byte) *Dict {
 	if len(data) == 0 {
 		return nil
 	}
 	if len(data) > MaxDictSize {
 		data = data[len(data)-MaxDictSize:]
 	}
 	var d Dict
 	dict := data
 	d.dict = dict
 	if cap(d.dict) < len(d.dict)+16 {
 		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
 	}
 	if len(dict) < MinDictSize {
 		return nil
 	}

 	// Find the longest match possible, last entry if multiple.
 	for s := len(searchStart); s > 4; s-- {
 		if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 {
 			d.repeat = idx
 			break
 		}
 	}

 	return &d
 }

 // MakeDictManual will create a dictionary.
 // 'data' must be at least MinDictSize and less than or equal to MaxDictSize.
 // A manual first repeat index into data must be provided.
 // It must be less than len(data)-8.
 func MakeDictManual(data []byte, firstIdx uint16) *Dict {
 	if len(data) < MinDictSize || int(firstIdx) >= len(data)-8 || len(data) > MaxDictSize {
 		return nil
 	}
 	var d Dict
 	dict := data
 	d.dict = dict
 	if cap(d.dict) < len(d.dict)+16 {
 		d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
 	}

 	d.repeat = int(firstIdx)
 	return &d
 }

 // Encode returns the encoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire encoded block.
 // Otherwise, a newly allocated slice will be returned.
 //
 // The dst and src must not overlap. It is valid to pass a nil dst.
 //
 // The blocks will require the same amount of memory to decode as encoding,
 // and does not make for concurrent decoding.
 // Also note that blocks do not contain CRC information, so corruption may be undetected.
 //
 // If you need to encode larger amounts of data, consider using
 // the streaming interface which gives all of these features.
 func (d *Dict) Encode(dst, src []byte) []byte {
 	if n := MaxEncodedLen(len(src)); n < 0 {
 		panic(ErrTooLarge)
 	} else if cap(dst) < n {
 		dst = make([]byte, n)
 	} else {
 		dst = dst[:n]
 	}

 	// The block starts with the varint-encoded length of the decompressed bytes.
 	dstP := binary.PutUvarint(dst, uint64(len(src)))

 	if len(src) == 0 {
 		return dst[:dstP]
 	}
 	if len(src) < minNonLiteralBlockSize {
 		dstP += emitLiteral(dst[dstP:], src)
 		return dst[:dstP]
 	}
 	n := encodeBlockDictGo(dst[dstP:], src, d)
 	if n > 0 {
 		dstP += n
 		return dst[:dstP]
 	}
 	// Not compressible
 	dstP += emitLiteral(dst[dstP:], src)
 	return dst[:dstP]
 }

 // EncodeBetter returns the encoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire encoded block.
 // Otherwise, a newly allocated slice will be returned.
 //
 // EncodeBetter compresses better than Encode but typically with a
 // 10-40% speed decrease on both compression and decompression.
 //
 // The dst and src must not overlap. It is valid to pass a nil dst.
 //
 // The blocks will require the same amount of memory to decode as encoding,
 // and does not make for concurrent decoding.
 // Also note that blocks do not contain CRC information, so corruption may be undetected.
 //
 // If you need to encode larger amounts of data, consider using
 // the streaming interface which gives all of these features.
 func (d *Dict) EncodeBetter(dst, src []byte) []byte {
 	if n := MaxEncodedLen(len(src)); n < 0 {
 		panic(ErrTooLarge)
 	} else if len(dst) < n {
 		dst = make([]byte, n)
 	}

 	// The block starts with the varint-encoded length of the decompressed bytes.
 	dstP := binary.PutUvarint(dst, uint64(len(src)))

 	if len(src) == 0 {
 		return dst[:dstP]
 	}
 	if len(src) < minNonLiteralBlockSize {
 		dstP += emitLiteral(dst[dstP:], src)
 		return dst[:dstP]
 	}
 	n := encodeBlockBetterDict(dst[dstP:], src, d)
 	if n > 0 {
 		dstP += n
 		return dst[:dstP]
 	}
 	// Not compressible
 	dstP += emitLiteral(dst[dstP:], src)
 	return dst[:dstP]
 }

 // EncodeBest returns the encoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire encoded block.
 // Otherwise, a newly allocated slice will be returned.
 //
 // EncodeBest compresses as good as reasonably possible but with a
 // big speed decrease.
 //
 // The dst and src must not overlap. It is valid to pass a nil dst.
 //
 // The blocks will require the same amount of memory to decode as encoding,
 // and does not make for concurrent decoding.
 // Also note that blocks do not contain CRC information, so corruption may be undetected.
 //
 // If you need to encode larger amounts of data, consider using
 // the streaming interface which gives all of these features.
 func (d *Dict) EncodeBest(dst, src []byte) []byte {
 	if n := MaxEncodedLen(len(src)); n < 0 {
 		panic(ErrTooLarge)
 	} else if len(dst) < n {
 		dst = make([]byte, n)
 	}

 	// The block starts with the varint-encoded length of the decompressed bytes.
 	dstP := binary.PutUvarint(dst, uint64(len(src)))

 	if len(src) == 0 {
 		return dst[:dstP]
 	}
 	if len(src) < minNonLiteralBlockSize {
 		dstP += emitLiteral(dst[dstP:], src)
 		return dst[:dstP]
 	}
 	n := encodeBlockBest(dst[dstP:], src, d)
 	if n > 0 {
 		dstP += n
 		return dst[:dstP]
 	}
 	// Not compressible
 	dstP += emitLiteral(dst[dstP:], src)
 	return dst[:dstP]
 }

 // Decode returns the decoded form of src. The returned slice may be a sub-
 // slice of dst if dst was large enough to hold the entire decoded block.
 // Otherwise, a newly allocated slice will be returned.
 //
 // The dst and src must not overlap. It is valid to pass a nil dst.
 func (d *Dict) Decode(dst, src []byte) ([]byte, error) {
 	dLen, s, err := decodedLen(src)
 	if err != nil {
 		return nil, err
 	}
 	if dLen <= cap(dst) {
 		dst = dst[:dLen]
 	} else {
 		dst = make([]byte, dLen)
 	}
 	if s2DecodeDict(dst, src[s:], d) != 0 {
 		return nil, ErrCorrupt
 	}
 	return dst, nil
 }

 func (d *Dict) initFast() {
 	d.fast.Do(func() {
 		const (
 			tableBits    = 14
 			maxTableSize = 1 << tableBits
 		)

 		var table [maxTableSize]uint16
 		// We stop so any entry of length 8 can always be read.
 		for i := 0; i < len(d.dict)-8-2; i += 3 {
 			x0 := load64(d.dict, i)
 			h0 := hash6(x0, tableBits)
 			h1 := hash6(x0>>8, tableBits)
 			h2 := hash6(x0>>16, tableBits)
 			table[h0] = uint16(i)
 			table[h1] = uint16(i + 1)
 			table[h2] = uint16(i + 2)
 		}
 		d.fastTable = &table
 	})
 }

 func (d *Dict) initBetter() {
 	d.better.Do(func() {
 		const (
 			// Long hash matches.
 			lTableBits    = 17
 			maxLTableSize = 1 << lTableBits

 			// Short hash matches.
 			sTableBits    = 14
 			maxSTableSize = 1 << sTableBits
 		)

 		var lTable [maxLTableSize]uint16
 		var sTable [maxSTableSize]uint16

 		// We stop so any entry of length 8 can always be read.
 		for i := 0; i < len(d.dict)-8; i++ {
 			cv := load64(d.dict, i)
 			lTable[hash7(cv, lTableBits)] = uint16(i)
 			sTable[hash4(cv, sTableBits)] = uint16(i)
 		}
 		d.betterTableShort = &sTable
 		d.betterTableLong = &lTable
 	})
 }

 func (d *Dict) initBest() {
 	d.best.Do(func() {
 		const (
 			// Long hash matches.
 			lTableBits    = 19
 			maxLTableSize = 1 << lTableBits

 			// Short hash matches.
 			sTableBits    = 16
 			maxSTableSize = 1 << sTableBits
 		)

 		var lTable [maxLTableSize]uint32
 		var sTable [maxSTableSize]uint32

 		// We stop so any entry of length 8 can always be read.
 		for i := 0; i < len(d.dict)-8; i++ {
 			cv := load64(d.dict, i)
 			hashL := hash8(cv, lTableBits)
 			hashS := hash4(cv, sTableBits)
 			candidateL := lTable[hashL]
 			candidateS := sTable[hashS]
 			lTable[hashL] = uint32(i) | candidateL<<16
 			sTable[hashS] = uint32(i) | candidateS<<16
 		}
 		d.bestTableShort = &sTable
 		d.bestTableLong = &lTable
 	})
 }
	// Copyright (c) 2022+ Klaus Post. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package s2

	import (
	"bytes"
	"encoding/binary"
	"sync"
	)

	const (
	// MinDictSize is the minimum dictionary size when repeat has been read.
	MinDictSize = 16

	// MaxDictSize is the maximum dictionary size when repeat has been read.
	MaxDictSize = 65536

	// MaxDictSrcOffset is the maximum offset where a dictionary entry can start.
	MaxDictSrcOffset = 65535
	)

	// Dict contains a dictionary that can be used for encoding and decoding s2
	type Dict struct {
	dict []byte
	repeat int // Repeat as index of dict

	fast, better, best sync.Once
	fastTable *[1 << 14]uint16

	betterTableShort *[1 << 14]uint16
	betterTableLong *[1 << 17]uint16

	bestTableShort *[1 << 16]uint32
	bestTableLong *[1 << 19]uint32
	}

	// NewDict will read a dictionary.
	// It will return nil if the dictionary is invalid.
	func NewDict(dict []byte) *Dict {
	if len(dict) == 0 {
	return nil
	}
	var d Dict
	// Repeat is the first value of the dict
	r, n := binary.Uvarint(dict)
	if n <= 0 {
	return nil
	}
	dict = dict[n:]
	d.dict = dict
	if cap(d.dict) < len(d.dict)+16 {
	d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
	}
	if len(dict) < MinDictSize \|\| len(dict) > MaxDictSize {
	return nil
	}
	d.repeat = int(r)
	if d.repeat > len(dict) {
	return nil
	}
	return &d
	}

	// Bytes will return a serialized version of the dictionary.
	// The output can be sent to NewDict.
	func (d *Dict) Bytes() []byte {
	dst := make([]byte, binary.MaxVarintLen16+len(d.dict))
	return append(dst[:binary.PutUvarint(dst, uint64(d.repeat))], d.dict...)
	}

	// MakeDict will create a dictionary.
	// 'data' must be at least MinDictSize.
	// If data is longer than MaxDictSize only the last MaxDictSize bytes will be used.
	// If searchStart is set the start repeat value will be set to the last
	// match of this content.
	// If no matches are found, it will attempt to find shorter matches.
	// This content should match the typical start of a block.
	// If at least 4 bytes cannot be matched, repeat is set to start of block.
	func MakeDict(data []byte, searchStart []byte) *Dict {
	if len(data) == 0 {
	return nil
	}
	if len(data) > MaxDictSize {
	data = data[len(data)-MaxDictSize:]
	}
	var d Dict
	dict := data
	d.dict = dict
	if cap(d.dict) < len(d.dict)+16 {
	d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
	}
	if len(dict) < MinDictSize {
	return nil
	}

	// Find the longest match possible, last entry if multiple.
	for s := len(searchStart); s > 4; s-- {
	if idx := bytes.LastIndex(data, searchStart[:s]); idx >= 0 && idx <= len(data)-8 {
	d.repeat = idx
	break
	}
	}

	return &d
	}

	// MakeDictManual will create a dictionary.
	// 'data' must be at least MinDictSize and less than or equal to MaxDictSize.
	// A manual first repeat index into data must be provided.
	// It must be less than len(data)-8.
	func MakeDictManual(data []byte, firstIdx uint16) *Dict {
	if len(data) < MinDictSize \|\| int(firstIdx) >= len(data)-8 \|\| len(data) > MaxDictSize {
	return nil
	}
	var d Dict
	dict := data
	d.dict = dict
	if cap(d.dict) < len(d.dict)+16 {
	d.dict = append(make([]byte, 0, len(d.dict)+16), d.dict...)
	}

	d.repeat = int(firstIdx)
	return &d
	}

	// Encode returns the encoded form of src. The returned slice may be a sub-
	// slice of dst if dst was large enough to hold the entire encoded block.
	// Otherwise, a newly allocated slice will be returned.
	//
	// The dst and src must not overlap. It is valid to pass a nil dst.
	//
	// The blocks will require the same amount of memory to decode as encoding,
	// and does not make for concurrent decoding.
	// Also note that blocks do not contain CRC information, so corruption may be undetected.
	//
	// If you need to encode larger amounts of data, consider using
	// the streaming interface which gives all of these features.
	func (d *Dict) Encode(dst, src []byte) []byte {
	if n := MaxEncodedLen(len(src)); n < 0 {
	panic(ErrTooLarge)
	} else if cap(dst) < n {
	dst = make([]byte, n)
	} else {
	dst = dst[:n]
	}

	// The block starts with the varint-encoded length of the decompressed bytes.
	dstP := binary.PutUvarint(dst, uint64(len(src)))

	if len(src) == 0 {
	return dst[:dstP]
	}
	if len(src) < minNonLiteralBlockSize {
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
	}
	n := encodeBlockDictGo(dst[dstP:], src, d)
	if n > 0 {
	dstP += n
	return dst[:dstP]
	}
	// Not compressible
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
	}

	// EncodeBetter returns the encoded form of src. The returned slice may be a sub-
	// slice of dst if dst was large enough to hold the entire encoded block.
	// Otherwise, a newly allocated slice will be returned.
	//
	// EncodeBetter compresses better than Encode but typically with a
	// 10-40% speed decrease on both compression and decompression.
	//
	// The dst and src must not overlap. It is valid to pass a nil dst.
	//
	// The blocks will require the same amount of memory to decode as encoding,
	// and does not make for concurrent decoding.
	// Also note that blocks do not contain CRC information, so corruption may be undetected.
	//
	// If you need to encode larger amounts of data, consider using
	// the streaming interface which gives all of these features.
	func (d *Dict) EncodeBetter(dst, src []byte) []byte {
	if n := MaxEncodedLen(len(src)); n < 0 {
	panic(ErrTooLarge)
	} else if len(dst) < n {
	dst = make([]byte, n)
	}

	// The block starts with the varint-encoded length of the decompressed bytes.
	dstP := binary.PutUvarint(dst, uint64(len(src)))

	if len(src) == 0 {
	return dst[:dstP]
	}
	if len(src) < minNonLiteralBlockSize {
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
	}
	n := encodeBlockBetterDict(dst[dstP:], src, d)
	if n > 0 {
	dstP += n
	return dst[:dstP]
	}
	// Not compressible
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
	}

	// EncodeBest returns the encoded form of src. The returned slice may be a sub-
	// slice of dst if dst was large enough to hold the entire encoded block.
	// Otherwise, a newly allocated slice will be returned.
	//
	// EncodeBest compresses as good as reasonably possible but with a
	// big speed decrease.
	//
	// The dst and src must not overlap. It is valid to pass a nil dst.
	//
	// The blocks will require the same amount of memory to decode as encoding,
	// and does not make for concurrent decoding.
	// Also note that blocks do not contain CRC information, so corruption may be undetected.
	//
	// If you need to encode larger amounts of data, consider using
	// the streaming interface which gives all of these features.
	func (d *Dict) EncodeBest(dst, src []byte) []byte {
	if n := MaxEncodedLen(len(src)); n < 0 {
	panic(ErrTooLarge)
	} else if len(dst) < n {
	dst = make([]byte, n)
	}

	// The block starts with the varint-encoded length of the decompressed bytes.
	dstP := binary.PutUvarint(dst, uint64(len(src)))

	if len(src) == 0 {
	return dst[:dstP]
	}
	if len(src) < minNonLiteralBlockSize {
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
	}
	n := encodeBlockBest(dst[dstP:], src, d)
	if n > 0 {
	dstP += n
	return dst[:dstP]
	}
	// Not compressible
	dstP += emitLiteral(dst[dstP:], src)
	return dst[:dstP]
	}

	// Decode returns the decoded form of src. The returned slice may be a sub-
	// slice of dst if dst was large enough to hold the entire decoded block.
	// Otherwise, a newly allocated slice will be returned.
	//
	// The dst and src must not overlap. It is valid to pass a nil dst.
	func (d *Dict) Decode(dst, src []byte) ([]byte, error) {
	dLen, s, err := decodedLen(src)
	if err != nil {
	return nil, err
	}
	if dLen <= cap(dst) {
	dst = dst[:dLen]
	} else {
	dst = make([]byte, dLen)
	}
	if s2DecodeDict(dst, src[s:], d) != 0 {
	return nil, ErrCorrupt
	}
	return dst, nil
	}

	func (d *Dict) initFast() {
	d.fast.Do(func() {
	const (
	tableBits = 14
	maxTableSize = 1 << tableBits
	)

	var table [maxTableSize]uint16
	// We stop so any entry of length 8 can always be read.
	for i := 0; i < len(d.dict)-8-2; i += 3 {
	x0 := load64(d.dict, i)
	h0 := hash6(x0, tableBits)
	h1 := hash6(x0>>8, tableBits)
	h2 := hash6(x0>>16, tableBits)
	table[h0] = uint16(i)
	table[h1] = uint16(i + 1)
	table[h2] = uint16(i + 2)
	}
	d.fastTable = &table
	})
	}

	func (d *Dict) initBetter() {
	d.better.Do(func() {
	const (
	// Long hash matches.
	lTableBits = 17
	maxLTableSize = 1 << lTableBits

	// Short hash matches.
	sTableBits = 14
	maxSTableSize = 1 << sTableBits
	)

	var lTable [maxLTableSize]uint16
	var sTable [maxSTableSize]uint16

	// We stop so any entry of length 8 can always be read.
	for i := 0; i < len(d.dict)-8; i++ {
	cv := load64(d.dict, i)
	lTable[hash7(cv, lTableBits)] = uint16(i)
	sTable[hash4(cv, sTableBits)] = uint16(i)
	}
	d.betterTableShort = &sTable
	d.betterTableLong = &lTable
	})
	}

	func (d *Dict) initBest() {
	d.best.Do(func() {
	const (
	// Long hash matches.
	lTableBits = 19
	maxLTableSize = 1 << lTableBits

	// Short hash matches.
	sTableBits = 16
	maxSTableSize = 1 << sTableBits
	)

	var lTable [maxLTableSize]uint32
	var sTable [maxSTableSize]uint32

	// We stop so any entry of length 8 can always be read.
	for i := 0; i < len(d.dict)-8; i++ {
	cv := load64(d.dict, i)
	hashL := hash8(cv, lTableBits)
	hashS := hash4(cv, sTableBits)
	candidateL := lTable[hashL]
	candidateS := sTable[hashS]
	lTable[hashL] = uint32(i) \| candidateL<<16
	sTable[hashS] = uint32(i) \| candidateS<<16
	}
	d.bestTableShort = &sTable
	d.bestTableLong = &lTable
	})
	}