
commit 69f9277bd7
379 changed files with 94246 additions and 0 deletions

@@ -0,0 +1 @@
*.bin -text -diff

@@ -0,0 +1,24 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so

# Folders
_obj
_test

# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out

*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*

_testmain.go

*.exe
*.test
*.prof

@@ -0,0 +1,98 @@
# This is an example goreleaser.yaml file with some sane defaults.
# Make sure to check the documentation at http://goreleaser.com
before:
  hooks:
builds:
  -
    id: "s2c"
    binary: s2c
    main: ./s2/cmd/s2c/main.go
    env:
      - CGO_ENABLED=0
    goos:
      - linux
      - freebsd
      - netbsd
      - windows
    goarch:
      - 386
      - amd64
      - arm
      - arm64
      - ppc64
      - ppc64le
      - mips64
      - mips64le
    goarm:
      - 7
  -
    id: "s2d"
    binary: s2d
    main: ./s2/cmd/s2d/main.go
    env:
      - CGO_ENABLED=0
    goos:
      - linux
      - freebsd
      - netbsd
      - windows
    goarch:
      - 386
      - amd64
      - arm
      - arm64
      - ppc64
      - ppc64le
      - mips64
      - mips64le
    goarm:
      - 7

archives:
  -
    id: s2-binaries
    name_template: "s2-cmds_{{ .Version }}_{{ .Os }}_{{ .Arch }}"
    replacements:
      darwin: Darwin
      linux: Linux
      windows: Windows
      386: i386
      amd64: x86_64
      freebsd: FreeBSD
      netbsd: NetBSD
    format_overrides:
      - goos: windows
        format: zip
    files:
      - s2/LICENSE
      - s2/README.md
checksum:
  name_template: 'checksums.txt'
snapshot:
  name_template: "s2-cmds_{{ .Tag }}-next"
changelog:
  sort: asc
  filters:
    exclude:
      - '^doc:'
      - '^docs:'
      - '^test:'
      - '^tests:'
      - '^Update\sREADME.md'

nfpms:
  -
    name_template: "s2-cmds_{{ .Version }}_{{ .Os }}_{{ .Arch }}"
    vendor: Klaus Post
    homepage: https://github.com/klauspost/compress
    maintainer: Klaus Post <klauspost@gmail.com>
    description: S2 Compression Tool
    license: BSD 3-Clause
    formats:
      - deb
      - rpm
    replacements:
      darwin: Darwin
      linux: Linux
      freebsd: FreeBSD
      amd64: x86_64

@@ -0,0 +1,45 @@
language: go

os:
  - linux
  - osx

go:
  - 1.12.x
  - 1.13.x
  - 1.14.x
  - master

env:
  - GO111MODULE=off

install:
  - go get ./...
  - go get github.com/klauspost/compress-fuzz

script:
  - diff <(gofmt -d .) <(printf "")
  - IS_GO112=`go version | cut -d ' ' -f3 | grep 1.12`; if [ ! -z "$IS_GO112" ]; then echo 'Skipping vet on Go 1.12...'; else go vet ./...; fi
  - go test -cpu=2 ./...
  - go test -cpu=2 -tags=noasm ./...
  - go test -cpu=1,4 -short -race ./...
  - go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d && s2c s2c && s2d s2c.s2 && rm s2c && rm s2d && rm s2c.s2

jobs:
  allow_failures:
    - go: 'master'
  fast_finish: true
  include:
    - stage: 386 linux test
      go: 1.14.x
      script:
        - GOOS=linux GOARCH=386 go test -short ./...

deploy:
  - provider: script
    skip_cleanup: true
    script: curl -sL https://git.io/goreleaser | VERSION=v0.127.0 bash || true
    on:
      tags: true
      condition: $TRAVIS_OS_NAME = linux
      go: 1.14.x

@@ -0,0 +1,28 @@
Copyright (c) 2012 The Go Authors. All rights reserved.
Copyright (c) 2019 Klaus Post. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

   * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
   * Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

@@ -0,0 +1,300 @@
# compress

This package provides various compression algorithms.

* [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and decompression in pure Go.
* [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) is a high performance replacement for Snappy.
* Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a drop-in replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib).
* [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding.
* [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation.
* [fuzz package](https://github.com/klauspost/compress-fuzz) for fuzz testing all compressors/decompressors here.

[Documentation](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories)
[Build Status](https://travis-ci.org/klauspost/compress)
[Sourcegraph](https://sourcegraph.com/github.com/klauspost/compress?badge)

# changelog

* June 23, 2020 (v1.10.10) zstd: Skip entropy compression in fastest mode when no matches. [#270](https://github.com/klauspost/compress/pull/270)
* June 16, 2020 (v1.10.9): zstd: API change for specifying dictionaries. See [#268](https://github.com/klauspost/compress/pull/268)
* June 16, 2020: zip: update CreateHeaderRaw to handle zip64 fields. [#266](https://github.com/klauspost/compress/pull/266)
* June 16, 2020: Fuzzit tests removed. The service has been purchased and is no longer available.
* June 5, 2020 (v1.10.8): 1.15x faster zstd block decompression. [#265](https://github.com/klauspost/compress/pull/265)
* June 1, 2020 (v1.10.7): Added zstd decompression [dictionary support](https://github.com/klauspost/compress/tree/master/zstd#dictionaries)
* June 1, 2020: Increase zstd decompression speed up to 1.19x. [#259](https://github.com/klauspost/compress/pull/259)
* June 1, 2020: Remove internal reset call in zstd compression and reduce allocations. [#263](https://github.com/klauspost/compress/pull/263)
* May 21, 2020: (v1.10.6) zstd: Reduce allocations while decoding. [#258](https://github.com/klauspost/compress/pull/258), [#252](https://github.com/klauspost/compress/pull/252)
* May 21, 2020: zstd: Stricter decompression checks.
* April 12, 2020: (v1.10.5) s2-commands: Flush output when receiving SIGINT. [#239](https://github.com/klauspost/compress/pull/239)
* Apr 8, 2020: (v1.10.4) zstd: Minor/special case optimizations. [#251](https://github.com/klauspost/compress/pull/251), [#250](https://github.com/klauspost/compress/pull/250), [#249](https://github.com/klauspost/compress/pull/249), [#247](https://github.com/klauspost/compress/pull/247)
* Mar 11, 2020: (v1.10.3) s2: Use S2 encoder in pure Go mode for Snappy output as well. [#245](https://github.com/klauspost/compress/pull/245)
* Mar 10, 2020: s2: Fix pure Go block encoder. [#244](https://github.com/klauspost/compress/pull/244)
* Mar 9, 2020: zstd: Added "better compression" mode. [#240](https://github.com/klauspost/compress/pull/240)
* Mar 9, 2020: zstd: Improve speed of fastest compression mode by 5-10%. [#241](https://github.com/klauspost/compress/pull/241)
* Feb 28, 2020: zstd: Skip creating encoders when not needed. [#238](https://github.com/klauspost/compress/pull/238)
* Feb 27, 2020: (v1.10.2) Close to 50% speedup in inflate (gzip/zip decompression). [#236](https://github.com/klauspost/compress/pull/236) [#234](https://github.com/klauspost/compress/pull/234) [#232](https://github.com/klauspost/compress/pull/232)
* Feb 23, 2020: Reduce deflate level 1-6 memory usage up to 59%. [#227](https://github.com/klauspost/compress/pull/227)
* Feb 18, 2020: (v1.10.1) Fix zstd crash when resetting multiple times without sending data. [#226](https://github.com/klauspost/compress/pull/226)
* Feb 16, 2020: deflate: Fix dictionary use on level 1-6. [#224](https://github.com/klauspost/compress/pull/224)
* Feb 16, 2020: Remove deflate writer reference when closing. [#224](https://github.com/klauspost/compress/pull/224)
* Feb 4, 2020: (v1.10.0) Add optional dictionary to [stateless deflate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc#StatelessDeflate). Breaking change, send `nil` for previous behaviour. [#216](https://github.com/klauspost/compress/pull/216)
* Feb 3, 2020: Fix buffer overflow on repeated small block deflate. [#218](https://github.com/klauspost/compress/pull/218)
* Jan 31, 2020: Allow copying content from an existing ZIP file without decompressing+compressing. [#214](https://github.com/klauspost/compress/pull/214)
* Jan 28, 2020: Added [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) AMD64 assembler and various optimizations. Stream speed >10GB/s. [#186](https://github.com/klauspost/compress/pull/186)

<details>
	<summary>See changes prior to v1.10.0</summary>

* Jan 20, 2020 (v1.9.8) Optimize gzip/deflate with better size estimates and faster table generation. [#207](https://github.com/klauspost/compress/pull/207) by [luyu6056](https://github.com/luyu6056), [#206](https://github.com/klauspost/compress/pull/206).
* Jan 11, 2020: S2 Encode/Decode will use provided buffer if capacity is big enough. [#204](https://github.com/klauspost/compress/pull/204)
* Jan 5, 2020: (v1.9.7) Fix another zstd regression in v1.9.5 - v1.9.6 removed.
* Jan 4, 2020: (v1.9.6) Regression in v1.9.5 fixed causing corrupt zstd encodes in rare cases.
* Jan 4, 2020: Faster IO in [s2c + s2d commandline tools](https://github.com/klauspost/compress/tree/master/s2#commandline-tools) compression/decompression. [#192](https://github.com/klauspost/compress/pull/192)
* Dec 29, 2019: Removed v1.9.5 since fuzz tests showed a compatibility problem with the reference zstandard decoder.
* Dec 29, 2019: (v1.9.5) zstd: 10-20% faster block compression. [#199](https://github.com/klauspost/compress/pull/199)
* Dec 29, 2019: [zip](https://godoc.org/github.com/klauspost/compress/zip) package updated with latest Go features.
* Dec 29, 2019: zstd: Single segment flag conditions tweaked. [#197](https://github.com/klauspost/compress/pull/197)
* Dec 18, 2019: s2: Faster compression when ReadFrom is used. [#198](https://github.com/klauspost/compress/pull/198)
* Dec 10, 2019: s2: Fix repeat length output when just above the 16MB limit.
* Dec 10, 2019: zstd: Add function to get decoder as io.ReadCloser. [#191](https://github.com/klauspost/compress/pull/191)
* Dec 3, 2019: (v1.9.4) S2: limit max repeat length. [#188](https://github.com/klauspost/compress/pull/188)
* Dec 3, 2019: Add [WithNoEntropyCompression](https://godoc.org/github.com/klauspost/compress/zstd#WithNoEntropyCompression) to zstd. [#187](https://github.com/klauspost/compress/pull/187)
* Dec 3, 2019: Reduce memory use for tests. Check for leaked goroutines.
* Nov 28, 2019 (v1.9.3) Fewer allocations in stateless deflate.
* Nov 28, 2019: 5-20% faster huff0 decode. Impacts zstd as well. [#184](https://github.com/klauspost/compress/pull/184)
* Nov 12, 2019 (v1.9.2) Added [Stateless Compression](#stateless-compression) for gzip/deflate.
* Nov 12, 2019: Fixed zstd decompression of large single blocks. [#180](https://github.com/klauspost/compress/pull/180)
* Nov 11, 2019: Set default [s2c](https://github.com/klauspost/compress/tree/master/s2#commandline-tools) block size to 4MB.
* Nov 11, 2019: Reduce inflate memory use by 1KB.
* Nov 10, 2019: Fewer allocations in deflate bit writer.
* Nov 10, 2019: Fix inconsistent error returned by zstd decoder.
* Oct 28, 2019 (v1.9.1) zstd: Fix crash when compressing blocks. [#174](https://github.com/klauspost/compress/pull/174)
* Oct 24, 2019 (v1.9.0) zstd: Fix rare data corruption. [#173](https://github.com/klauspost/compress/pull/173)
* Oct 24, 2019 zstd: Fix huff0 out of buffer write [#171](https://github.com/klauspost/compress/pull/171) and always return errors [#172](https://github.com/klauspost/compress/pull/172)
* Oct 10, 2019: Big deflate rewrite, 30-40% faster with better compression. [#105](https://github.com/klauspost/compress/pull/105)

</details>

<details>
	<summary>See changes prior to v1.9.0</summary>

* Oct 10, 2019: (v1.8.6) zstd: Allow partial reads to get flushed data. [#169](https://github.com/klauspost/compress/pull/169)
* Oct 3, 2019: Fix inconsistent results on broken zstd streams.
* Sep 25, 2019: Added `-rm` (remove source files) and `-q` (no output except errors) to `s2c` and `s2d` [commands](https://github.com/klauspost/compress/tree/master/s2#commandline-tools)
* Sep 16, 2019: (v1.8.4) Add `s2c` and `s2d` [commandline tools](https://github.com/klauspost/compress/tree/master/s2#commandline-tools).
* Sep 10, 2019: (v1.8.3) Fix s2 decoder [Skip](https://godoc.org/github.com/klauspost/compress/s2#Reader.Skip).
* Sep 7, 2019: zstd: Added [WithWindowSize](https://godoc.org/github.com/klauspost/compress/zstd#WithWindowSize), contributed by [ianwilkes](https://github.com/ianwilkes).
* Sep 5, 2019: (v1.8.2) Add [WithZeroFrames](https://godoc.org/github.com/klauspost/compress/zstd#WithZeroFrames) which adds full zero payload block encoding option.
* Sep 5, 2019: Lazy initialization of zstandard predefined en/decoder tables.
* Aug 26, 2019: (v1.8.1) S2: 1-2% compression increase in "better" compression mode.
* Aug 26, 2019: zstd: Check maximum size of Huffman 1X compressed literals while decoding.
* Aug 24, 2019: (v1.8.0) Added [S2 compression](https://github.com/klauspost/compress/tree/master/s2#s2-compression), a high performance replacement for Snappy.
* Aug 21, 2019: (v1.7.6) Fixed minor issues found by fuzzer. One could lead to zstd not decompressing.
* Aug 18, 2019: Add [fuzzit](https://fuzzit.dev/) continuous fuzzing.
* Aug 14, 2019: zstd: Skip incompressible data 2x faster. [#147](https://github.com/klauspost/compress/pull/147)
* Aug 4, 2019 (v1.7.5): Better literal compression. [#146](https://github.com/klauspost/compress/pull/146)
* Aug 4, 2019: Faster zstd compression. [#143](https://github.com/klauspost/compress/pull/143) [#144](https://github.com/klauspost/compress/pull/144)
* Aug 4, 2019: Faster zstd decompression. [#145](https://github.com/klauspost/compress/pull/145) [#143](https://github.com/klauspost/compress/pull/143) [#142](https://github.com/klauspost/compress/pull/142)
* July 15, 2019 (v1.7.4): Fix double EOF block in rare cases on zstd encoder.
* July 15, 2019 (v1.7.3): Minor speedup/compression increase in default zstd encoder.
* July 14, 2019: zstd decoder: Fix decompression error on multiple uses with mixed content.
* July 7, 2019 (v1.7.2): Snappy update, zstd decoder potential race fix.
* June 17, 2019: zstd decompression bugfix.
* June 17, 2019: Fix 32 bit builds.
* June 17, 2019: Easier use in modules (fewer dependencies).
* June 9, 2019: New stronger "default" [zstd](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression mode. Matches zstd default compression ratio.
* June 5, 2019: 20-40% higher throughput in [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and better compression.
* June 5, 2019: deflate/gzip compression: Reduce memory usage of lower compression levels.
* June 2, 2019: Added [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression!
* May 25, 2019: deflate/gzip: 10% faster bit writer, mostly visible in lower levels.
* Apr 22, 2019: [zstd](https://github.com/klauspost/compress/tree/master/zstd#zstd) decompression added.
* Aug 1, 2018: Added [huff0 README](https://github.com/klauspost/compress/tree/master/huff0#huff0-entropy-compression).
* Jul 8, 2018: Added [Performance Update 2018](#performance-update-2018) below.
* Jun 23, 2018: Merged [Go 1.11 inflate optimizations](https://go-review.googlesource.com/c/go/+/102235). Go 1.9 is now required. Backwards compatible version tagged with [v1.3.0](https://github.com/klauspost/compress/releases/tag/v1.3.0).
* Apr 2, 2018: Added [huff0](https://godoc.org/github.com/klauspost/compress/huff0) en/decoder. Experimental for now, API may change.
* Mar 4, 2018: Added [FSE Entropy](https://godoc.org/github.com/klauspost/compress/fse) en/decoder. Experimental for now, API may change.
* Nov 3, 2017: Add compression [Estimate](https://godoc.org/github.com/klauspost/compress#Estimate) function.
* May 28, 2017: Reduce allocations when resetting decoder.
* Apr 02, 2017: Change back to official crc32, since changes were merged in Go 1.7.
* Jan 14, 2017: Reduce stack pressure due to array copies. See [Issue #18625](https://github.com/golang/go/issues/18625).
* Oct 25, 2016: Levels 2-4 have been rewritten and now offer significantly better performance than before.
* Oct 20, 2016: Port zlib changes from Go 1.7 to fix zlib writer issue. Please update.
* Oct 16, 2016: Go 1.7 changes merged. Apples to apples, this package is a few percent faster, but has a significantly better balance between speed and compression per level.
* Mar 24, 2016: Always attempt Huffman encoding on levels 4-7. This improves base-64 encoded data compression.
* Mar 24, 2016: Small speedup for levels 1-3.
* Feb 19, 2016: Faster bit writer, level -2 is 15% faster, level 1 is 4% faster.
* Feb 19, 2016: Handle small payloads faster in levels 1-3.
* Feb 19, 2016: Added faster level 2 + 3 compression modes.
* Feb 19, 2016: [Rebalanced compression levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/), so there is a more even progression in terms of compression. New default level is 5.
* Feb 14, 2016: Snappy: Merge upstream changes.
* Feb 14, 2016: Snappy: Fix aggressive skipping.
* Feb 14, 2016: Snappy: Update benchmark.
* Feb 13, 2016: Deflate: Fixed assembler problem that could lead to sub-optimal compression.
* Feb 12, 2016: Snappy: Added AMD64 SSE 4.2 optimizations to matching, which makes easy-to-compress material run faster. Typical speedup is around 25%.
* Feb 9, 2016: Added Snappy package fork. This version is 5-7% faster, much more on hard-to-compress content.
* Jan 30, 2016: Optimize levels 1 to 3 by not considering static dictionary or storing uncompressed. ~4-5% speedup.
* Jan 16, 2016: Optimization on deflate level 1, 2, 3 compression.
* Jan 8, 2016: Merge [CL 18317](https://go-review.googlesource.com/#/c/18317): fix reading, writing of zip64 archives.
* Dec 8, 2015: Make levels 1 and -2 deterministic even if write size differs.
* Dec 8, 2015: Split encoding functions, so hashing and matching can potentially be inlined. 1-3% faster on AMD64. 5% faster on other platforms.
* Dec 8, 2015: Fixed rare [one byte out-of-bounds read](https://github.com/klauspost/compress/issues/20). Please update!
* Nov 23, 2015: Optimization on token writer. ~2-4% faster. Contributed by [@dsnet](https://github.com/dsnet).
* Nov 20, 2015: Small optimization to bit writer on 64 bit systems.
* Nov 17, 2015: Fixed out-of-bound errors if the underlying Writer returned an error. See [#15](https://github.com/klauspost/compress/issues/15).
* Nov 12, 2015: Added [io.WriterTo](https://golang.org/pkg/io/#WriterTo) support to gzip/inflate.
* Nov 11, 2015: Merged [CL 16669](https://go-review.googlesource.com/#/c/16669/4): archive/zip: enable overriding (de)compressors per file.
* Oct 15, 2015: Added skipping on uncompressible data. Random data speedup >5x.

</details>

# deflate usage

* [High Throughput Benchmark](http://blog.klauspost.com/go-gzipdeflate-benchmarks/).
* [Small Payload/Webserver Benchmarks](http://blog.klauspost.com/gzip-performance-for-go-webservers/).
* [Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/).
* [Re-balancing Deflate Compression Levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/)

The packages are drop-in replacements for the standard libraries. Simply replace the import path to use them:

| old import       | new import                             | Documentation |
|------------------|----------------------------------------|---------------|
| `compress/gzip`  | `github.com/klauspost/compress/gzip`   | [gzip](https://pkg.go.dev/github.com/klauspost/compress/gzip?tab=doc) |
| `compress/zlib`  | `github.com/klauspost/compress/zlib`   | [zlib](https://pkg.go.dev/github.com/klauspost/compress/zlib?tab=doc) |
| `archive/zip`    | `github.com/klauspost/compress/zip`    | [zip](https://pkg.go.dev/github.com/klauspost/compress/zip?tab=doc) |
| `compress/flate` | `github.com/klauspost/compress/flate`  | [flate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc) |
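
Since the API is identical to the standard library's, switching really is just the import change. A minimal gzip round-trip using this package (an illustrative sketch, not from the original README):

```
package main

import (
	"bytes"
	"io"
	"log"
	"os"

	gzip "github.com/klauspost/compress/gzip" // was: "compress/gzip"
)

func main() {
	var buf bytes.Buffer
	w := gzip.NewWriter(&buf) // same API as compress/gzip
	if _, err := w.Write([]byte("hello, compression")); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}

	r, err := gzip.NewReader(&buf)
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()
	if _, err := io.Copy(os.Stdout, r); err != nil { // prints the original text
		log.Fatal(err)
	}
}
```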

You may also be interested in [pgzip](https://github.com/klauspost/pgzip), a drop-in replacement for gzip that supports multithreaded compression of big files, and the optimized [crc32](https://github.com/klauspost/crc32) package used by these packages.

The packages contain the same functionality as the standard library, so you can use its godoc for reference: [gzip](http://golang.org/pkg/compress/gzip/), [zip](http://golang.org/pkg/archive/zip/), [zlib](http://golang.org/pkg/compress/zlib/), [flate](http://golang.org/pkg/compress/flate/).

Currently there is only a minor speedup on decompression (mostly CRC32 calculation).

# Stateless compression

This package offers stateless compression as a special option for gzip/deflate.
It will do compression without maintaining any state between Write calls.

This means there will be no memory kept between Write calls, but compression ratio and speed will be suboptimal.

This is only relevant in cases where you expect to run many thousands of compressors concurrently,
but with very little activity. It is *not* intended for regular web servers serving individual requests.

Because of this, the size of actual Write calls will affect output size.

In gzip, specify level `-3` / `gzip.StatelessCompression` to enable.

For direct deflate use, `NewStatelessWriter` and `StatelessDeflate` are available. See [documentation](https://godoc.org/github.com/klauspost/compress/flate#NewStatelessWriter)
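
A minimal sketch of both entry points (the linked documentation is authoritative for the exact signatures; `data` stands in for your input):

```
// Stateless writer: no state is kept between Write calls, so each
// Write is emitted as an independent deflate block.
w := flate.NewStatelessWriter(ioutil.Discard) // replace 'ioutil.Discard' with your output
defer w.Close()
w.Write(data) // each Write allocates a little and flushes a block

// One-shot form; pass nil as the dictionary for the pre-v1.10 behaviour.
err := flate.StatelessDeflate(ioutil.Discard, data, true, nil)
```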

A `bufio.Writer` can of course be used to control write sizes. For example, to use a 4KB buffer:

```
// replace 'ioutil.Discard' with your output.
gzw, err := gzip.NewWriterLevel(ioutil.Discard, gzip.StatelessCompression)
if err != nil {
	return err
}
defer gzw.Close()

w := bufio.NewWriterSize(gzw, 4096)
defer w.Flush()

// Write to 'w'
```

This will only use up to 4KB in memory when the writer is idle.

Compression is almost always worse than the fastest compression level
and each write will allocate (a little) memory.

# Performance Update 2018

It has been a while since we last compared the speed of this package to the standard library, so I thought I would re-do my tests and give some overall recommendations based on the current state. All benchmarks have been performed with Go 1.10 on my Desktop Intel(R) Core(TM) i7-2600 CPU @3.40GHz. Since I last ran the tests, I have gotten more RAM, which means tests with big files are no longer limited by my SSD.

The raw results are in my [updated spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing). Due to cgo changes and upstream updates I could not get the cgo version of gzip to compile. Instead I included the [zstd](https://github.com/datadog/zstd) cgo implementation. If I get cgo gzip to work again, I might replace the results in the sheet.

The columns to take note of are: *MB/s* - the throughput. *Reduction* - the data size reduction in percent of the original. *Rel Speed* - the relative speed compared to the standard library at the same level. *Smaller* - how many percent smaller the compressed output is compared to stdlib. Negative means the output was bigger. *Loss* - the loss (or gain) in compression as a percentage difference of the input.

`gzstd` (standard library gzip) and `gzkp` (this package's gzip) use only one CPU core. [`pgzip`](https://github.com/klauspost/pgzip) and [`bgzf`](https://github.com/biogo/hts/tree/master/bgzf) use all 4 cores. [`zstd`](https://github.com/DataDog/zstd) uses one core, and is a beast (but not Go, yet).


## Overall differences

There appears to be a roughly 5-10% speed advantage over the standard library when comparing at similar compression levels.

The biggest difference you will see is the result of [re-balancing](https://blog.klauspost.com/rebalancing-deflate-compression-levels/) the compression levels. I wanted my library to give a smoother transition between the compression levels than the standard library.

This package attempts to provide a smoother transition, where "1" takes a lot of shortcuts, "5" is the reasonable trade-off, "9" means "give me the best compression", and the values in between give something reasonable in between. The standard library has big differences between levels 1-4, while levels 5-9 show no significant gains - often spending a lot more time than can be justified by the achieved compression.

There are links to all the test data in the [spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing) in the top left field on each tab.

## Web Content

This test set aims to emulate typical use in a web server. The test set is 4GB of data in 53k files, and is a mixture of (mostly) HTML, JS and CSS.

Since levels 1 and 9 are close to being the same code in both packages, their results are quite close. But looking at the levels in between, the differences are quite big.

Looking at level 6, this package is 88% faster, but will output about 6% more data. For a web server, this means you can serve 88% more data, but have to pay for 6% more bandwidth. You can draw your own conclusions on what would be the most expensive for your case.

## Object files

This test is for typical data files stored on a server. In this case it is a collection of Go precompiled objects. They are very compressible.

The picture is similar to the web content, but with small differences since this data is very compressible. Levels 2-3 offer good speed, but sacrifice quite a bit of compression.

The standard library seems suboptimal on levels 3 and 4 - offering both worse compression and speed than levels 6 and 7 of this package respectively.

## Highly Compressible File

This is a JSON file with very high redundancy. The reduction starts at 95% on level 1, so in real-life terms we are dealing with something like a highly redundant stream of data, etc.

It is definitely visible that we are dealing with specialized content here, so the results are very scattered. This package does not do very well at levels 1-4, but picks up significantly at level 5, with levels 7 and 8 offering great speed for the achieved compression.

So if you know your content is extremely compressible you might want to go slightly higher than the defaults. The standard library has a huge gap between levels 3 and 4 in terms of speed (2.75x slowdown), so it offers little "middle ground".

## Medium-High Compressible

This is a pretty common test corpus: [enwik9](http://mattmahoney.net/dc/textdata.html). It contains the first 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006. This is a very good test of typical text-based compression and more data-heavy streams.

We see a similar picture here as in "Web Content". On equal levels some compression is sacrificed for more speed. Level 5 seems to be the best trade-off between speed and size, beating stdlib level 3 in both.

## Medium Compressible

I will combine two test sets, one [10GB file set](http://mattmahoney.net/dc/10gb.html) and a VM disk image (~8GB). Both contain different data types and represent a typical backup scenario.

The most notable thing is how quickly the standard library drops to very low compression speeds around level 5-6 without any big gains in compression. Since this type of data is fairly common, this does not seem like good behavior.


## Un-compressible Content

This is mainly a test of how good the algorithms are at detecting un-compressible input. The standard library only offers this feature with very conservative settings at level 1. Obviously there is no reason for the algorithms to try to compress input that cannot be compressed. The only downside is that it might skip some compressible data on false detections.

# linear time compression (huffman only)

This compression library adds a special compression level, named `HuffmanOnly`, which allows near linear time compression. This is done by completely disabling matching of previous data, and only reducing the number of bits used to represent each character.

This means that often-used characters, like 'e' and ' ' (space) in text, use the fewest bits to represent, and rare characters like '¤' take more bits to represent. For more information see [wikipedia](https://en.wikipedia.org/wiki/Huffman_coding) or this nice [video](https://youtu.be/ZdooBTdW5bM).

Since this type of compression has much less variance, the compression speed is mostly unaffected by the input data, and is usually more than *180MB/s* for a single core.

The downside is that the compression ratio is usually considerably worse than even the fastest conventional compression. The compression ratio can never be better than 8:1 (12.5%).

The linear time compression can be used as a "better than nothing" mode, where you cannot risk the encoder slowing down on some content. For comparison, the size of the "Twain" text is *233460 bytes* (+29% vs. level 1) and the encode speed is 144MB/s (4.5x level 1). So in this case you trade a 30% size increase for a 4x speedup.

For more information see my blog post on [Fast Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/).

This is implemented in Go 1.7 as "Huffman Only" mode, though it is not exposed for gzip there.
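
A minimal sketch of enabling the mode with this package's flate writer (same snippet conventions as the stateless example above):

```
// HuffmanOnly (-2) disables match searching entirely, giving near
// linear-time compression at a lower compression ratio.
// replace 'ioutil.Discard' with your output.
w, err := flate.NewWriter(ioutil.Discard, flate.HuffmanOnly)
if err != nil {
	return err
}
defer w.Close()

// Write to 'w'
```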

# snappy package

The standard snappy package has now been improved. This repo contains a copy of the snappy repo.

I would advise using the standard package: https://github.com/golang/snappy


# license

This code is licensed under the same conditions as the original Go code. See LICENSE file.

@@ -0,0 +1,85 @@
package compress

import "math"

// Estimate returns a normalized compressibility estimate of block b.
// Values close to zero are likely uncompressible.
// Values above 0.1 are likely to be compressible.
// Values above 0.5 are very compressible.
// Very small lengths will return 0.
func Estimate(b []byte) float64 {
	if len(b) < 16 {
		return 0
	}

	// Correctly predicted order 1
	hits := 0
	lastMatch := false
	var o1 [256]byte
	var hist [256]int
	c1 := byte(0)
	for _, c := range b {
		if c == o1[c1] {
			// We only count a hit if there were two correct predictions in a row.
			if lastMatch {
				hits++
			}
			lastMatch = true
		} else {
			lastMatch = false
		}
		o1[c1] = c
		c1 = c
		hist[c]++
	}

	// Use x^0.6 to give better spread
	prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)

	// Calculate histogram distribution
	variance := float64(0)
	avg := float64(len(b)) / 256

	for _, v := range hist {
		Δ := float64(v) - avg
		variance += Δ * Δ
	}

	stddev := math.Sqrt(float64(variance)) / float64(len(b))
	exp := math.Sqrt(1 / float64(len(b)))

	// Subtract expected stddev
	stddev -= exp
	if stddev < 0 {
		stddev = 0
	}
	stddev *= 1 + exp

	// Use x^0.4 to give better spread
	entropy := math.Pow(stddev, 0.4)

	// 50/50 weight between prediction and histogram distribution
	return math.Pow((prediction+entropy)/2, 0.9)
}
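
// Example (editor's sketch, not part of the original commit): a caller could
// use Estimate to decide whether compressing a payload is worthwhile at all.
// storeRaw and storeCompressed are hypothetical helpers.
//
//	if compress.Estimate(payload) < 0.1 {
//	    storeRaw(payload)
//	} else {
//	    storeCompressed(payload)
//	}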

// ShannonEntropyBits returns the minimum number of bits required to represent
// an entropy encoding of the input bytes.
// https://en.wiktionary.org/wiki/Shannon_entropy
func ShannonEntropyBits(b []byte) int {
	if len(b) == 0 {
		return 0
	}
	var hist [256]int
	for _, c := range b {
		hist[c]++
	}
	shannon := float64(0)
	invTotal := 1.0 / float64(len(b))
	for _, v := range hist[:] {
		if v > 0 {
			n := float64(v)
			shannon += math.Ceil(-math.Log2(n*invTotal) * n)
		}
	}
	return int(math.Ceil(shannon))
}

@@ -0,0 +1,302 @@
package compress

import (
	"crypto/rand"
	"encoding/base32"
	"io/ioutil"
	"strconv"
	"strings"
	"testing"

	"github.com/klauspost/compress/flate"
	"github.com/klauspost/compress/gzip"
)

func BenchmarkEstimate(b *testing.B) {
	b.ReportAllocs()
	// (predictable, low entropy distribution)
	b.Run("zeroes-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (predictable, high entropy distribution)
	b.Run("predictable-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		for i := range testData {
			testData[i] = byte(float64(i) / float64(len(testData)) * 256)
		}
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-500b", func(b *testing.B) {
		var testData = make([]byte, 500)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-50k", func(b *testing.B) {
		var testData = make([]byte, 50000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-500k", func(b *testing.B) {
		var testData = make([]byte, 500000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})

	// (not predictable, medium entropy distribution)
	b.Run("base-32-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		rand.Read(testData)
		s := base32.StdEncoding.EncodeToString(testData)
		testData = []byte(s)
		testData = testData[:5000]
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})
	// (medium predictable, medium entropy distribution)
	b.Run("text", func(b *testing.B) {
		var testData = []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks.
This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process:
With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths.
As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic.
With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?).
Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks.
The attacker then compresses that chunk using the compression algorithm.
The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets.
IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk.
It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup.
AS always, a paranoid and highly unscientific stream of consciousness.
Thoughts?`)
		testData = append(testData, testData...)
		testData = append(testData, testData...)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			Estimate(testData)
		}
		b.Log(Estimate(testData))
	})
}

func BenchmarkShannonEntropyBits(b *testing.B) {
	b.ReportAllocs()
	// (predictable, low entropy distribution)
	b.Run("zeroes-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (predictable, high entropy distribution)
	b.Run("predictable-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		for i := range testData {
			testData[i] = byte(float64(i) / float64(len(testData)) * 256)
		}
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-500b", func(b *testing.B) {
		var testData = make([]byte, 500)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-50k", func(b *testing.B) {
		var testData = make([]byte, 50000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (not predictable, high entropy distribution)
	b.Run("random-500k", func(b *testing.B) {
		var testData = make([]byte, 500000)
		rand.Read(testData)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})

	// (not predictable, medium entropy distribution)
	b.Run("base-32-5k", func(b *testing.B) {
		var testData = make([]byte, 5000)
		rand.Read(testData)
		s := base32.StdEncoding.EncodeToString(testData)
		testData = []byte(s)
		testData = testData[:5000]
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})
	// (medium predictable, medium entropy distribution)
	b.Run("text", func(b *testing.B) {
		var testData = []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks.
This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process:
With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths.
As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic.
With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?).
Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks.
The attacker then compresses that chunk using the compression algorithm.
The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets.
IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk.
It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup.
AS always, a paranoid and highly unscientific stream of consciousness.
Thoughts?`)
		testData = append(testData, testData...)
		testData = append(testData, testData...)
		b.SetBytes(int64(len(testData)))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			ShannonEntropyBits(testData)
		}
		b.Log(ShannonEntropyBits(testData))
	})
}

func BenchmarkCompressAllocations(b *testing.B) {
	payload := []byte(strings.Repeat("Tiny payload", 20))
	for j := -2; j <= 9; j++ {
		b.Run("level("+strconv.Itoa(j)+")", func(b *testing.B) {
			b.Run("flate", func(b *testing.B) {
				b.ReportAllocs()

				for i := 0; i < b.N; i++ {
					w, err := flate.NewWriter(ioutil.Discard, j)
					if err != nil {
						b.Fatal(err)
					}
					w.Write(payload)
					w.Close()
				}
			})
			b.Run("gzip", func(b *testing.B) {
				b.ReportAllocs()

				for i := 0; i < b.N; i++ {
					w, err := gzip.NewWriterLevel(ioutil.Discard, j)
					if err != nil {
						b.Fatal(err)
					}
					w.Write(payload)
					w.Close()
				}
			})
		})
	}
}

func BenchmarkCompressAllocationsSingle(b *testing.B) {
	payload := []byte(strings.Repeat("Tiny payload", 20))
	const level = 2
	b.Run("flate", func(b *testing.B) {
		b.ReportAllocs()

		for i := 0; i < b.N; i++ {
			w, err := flate.NewWriter(ioutil.Discard, level)
			if err != nil {
				b.Fatal(err)
			}
			w.Write(payload)
			w.Close()
		}
	})
	b.Run("gzip", func(b *testing.B) {
		b.ReportAllocs()

		for i := 0; i < b.N; i++ {
			w, err := gzip.NewWriterLevel(ioutil.Discard, level)
			if err != nil {
				b.Fatal(err)
			}
			w.Write(payload)
			w.Close()
		}
	})
}
@@ -0,0 +1,821 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Copyright (c) 2015 Klaus Post
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package flate

import (
	"fmt"
	"io"
	"math"
)

const (
	NoCompression      = 0
	BestSpeed          = 1
	BestCompression    = 9
	DefaultCompression = -1

	// HuffmanOnly disables Lempel-Ziv match searching and only performs Huffman
	// entropy encoding. This mode is useful in compressing data that has
	// already been compressed with an LZ style algorithm (e.g. Snappy or LZ4)
	// that lacks an entropy encoder. Compression gains are achieved when
	// certain bytes in the input stream occur more frequently than others.
	//
	// Note that HuffmanOnly produces a compressed output that is
	// RFC 1951 compliant. That is, any valid DEFLATE decompressor will
	// continue to be able to decompress this output.
	HuffmanOnly         = -2
	ConstantCompression = HuffmanOnly // compatibility alias.

	logWindowSize    = 15
	windowSize       = 1 << logWindowSize
	windowMask       = windowSize - 1
	logMaxOffsetSize = 15  // Standard DEFLATE
	minMatchLength   = 4   // The smallest match that the compressor looks for
	maxMatchLength   = 258 // The longest match for the compressor
	minOffsetSize    = 1   // The shortest offset that makes any sense

	// The maximum number of tokens we put into a single flate block, just to
	// stop things from getting too large.
	maxFlateBlockTokens = 1 << 14
	maxStoreBlockSize   = 65535
	hashBits            = 17 // After 17 performance degrades
	hashSize            = 1 << hashBits
	hashMask            = (1 << hashBits) - 1
	hashShift           = (hashBits + minMatchLength - 1) / minMatchLength
	maxHashOffset       = 1 << 24

	skipNever = math.MaxInt32

	debugDeflate = false
)
|
|||
type compressionLevel struct { |
|||
good, lazy, nice, chain, fastSkipHashing, level int |
|||
} |
|||
|
|||
// Compression levels have been rebalanced from zlib deflate defaults
|
|||
// to give a bigger spread in speed and compression.
|
|||
// See https://blog.klauspost.com/rebalancing-deflate-compression-levels/
|
|||
var levels = []compressionLevel{ |
|||
{}, // 0
|
|||
// Level 1-6 uses specialized algorithm - values not used
|
|||
{0, 0, 0, 0, 0, 1}, |
|||
{0, 0, 0, 0, 0, 2}, |
|||
{0, 0, 0, 0, 0, 3}, |
|||
{0, 0, 0, 0, 0, 4}, |
|||
{0, 0, 0, 0, 0, 5}, |
|||
{0, 0, 0, 0, 0, 6}, |
|||
// Levels 7-9 use increasingly more lazy matching
|
|||
// and increasingly stringent conditions for "good enough".
|
|||
{8, 8, 24, 16, skipNever, 7}, |
|||
{10, 16, 24, 64, skipNever, 8}, |
|||
{32, 258, 258, 4096, skipNever, 9}, |
|||
} |
|||
|
|||
// advancedState contains state for the advanced levels, with bigger hash tables, etc.
|
|||
type advancedState struct { |
|||
// deflate state
|
|||
length int |
|||
offset int |
|||
maxInsertIndex int |
|||
|
|||
// Input hash chains
|
|||
// hashHead[hashValue] contains the largest inputIndex with the specified hash value
|
|||
// If hashHead[hashValue] is within the current window, then
|
|||
// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
|
|||
// with the same hash value.
|
|||
chainHead int |
|||
hashHead [hashSize]uint32 |
|||
hashPrev [windowSize]uint32 |
|||
hashOffset int |
|||
|
|||
// input window: unprocessed data is window[index:windowEnd]
|
|||
index int |
|||
hashMatch [maxMatchLength + minMatchLength]uint32 |
|||
|
|||
hash uint32 |
|||
ii uint16 // position of last match, intended to overflow to reset.
|
|||
} |
|||
|
|||
type compressor struct { |
|||
compressionLevel |
|||
|
|||
w *huffmanBitWriter |
|||
|
|||
// compression algorithm
|
|||
fill func(*compressor, []byte) int // copy data to window
|
|||
step func(*compressor) // process window
|
|||
|
|||
window []byte |
|||
windowEnd int |
|||
blockStart int // window index where current tokens start
|
|||
err error |
|||
|
|||
// queued output tokens
|
|||
tokens tokens |
|||
fast fastEnc |
|||
state *advancedState |
|||
|
|||
sync bool // requesting flush
|
|||
byteAvailable bool // if true, still need to process window[index-1].
|
|||
} |
|||
|
|||
func (d *compressor) fillDeflate(b []byte) int { |
|||
s := d.state |
|||
if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) { |
|||
// shift the window by windowSize
|
|||
copy(d.window[:], d.window[windowSize:2*windowSize]) |
|||
s.index -= windowSize |
|||
d.windowEnd -= windowSize |
|||
if d.blockStart >= windowSize { |
|||
d.blockStart -= windowSize |
|||
} else { |
|||
d.blockStart = math.MaxInt32 |
|||
} |
|||
s.hashOffset += windowSize |
|||
if s.hashOffset > maxHashOffset { |
|||
delta := s.hashOffset - 1 |
|||
s.hashOffset -= delta |
|||
s.chainHead -= delta |
|||
// Iterate over slices instead of arrays to avoid copying
|
|||
// the entire table onto the stack (Issue #18625).
|
|||
for i, v := range s.hashPrev[:] { |
|||
if int(v) > delta { |
|||
s.hashPrev[i] = uint32(int(v) - delta) |
|||
} else { |
|||
s.hashPrev[i] = 0 |
|||
} |
|||
} |
|||
for i, v := range s.hashHead[:] { |
|||
if int(v) > delta { |
|||
s.hashHead[i] = uint32(int(v) - delta) |
|||
} else { |
|||
s.hashHead[i] = 0 |
|||
} |
|||
} |
|||
} |
|||
} |
|||
n := copy(d.window[d.windowEnd:], b) |
|||
d.windowEnd += n |
|||
return n |
|||
} |
|||
|
|||
func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error { |
|||
if index > 0 || eof { |
|||
var window []byte |
|||
if d.blockStart <= index { |
|||
window = d.window[d.blockStart:index] |
|||
} |
|||
d.blockStart = index |
|||
d.w.writeBlock(tok, eof, window) |
|||
return d.w.err |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
// writeBlockSkip writes the current block and uses the number of tokens
|
|||
// to determine if the block should be stored on no matches, or
|
|||
// only huffman encoded.
|
|||
func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error { |
|||
if index > 0 || eof { |
|||
if d.blockStart <= index { |
|||
window := d.window[d.blockStart:index] |
|||
// If we removed less than a 64th of all literals
|
|||
// we huffman compress the block.
|
|||
if int(tok.n) > len(window)-int(tok.n>>6) { |
|||
d.w.writeBlockHuff(eof, window, d.sync) |
|||
} else { |
|||
// Write a dynamic huffman block.
|
|||
d.w.writeBlockDynamic(tok, eof, window, d.sync) |
|||
} |
|||
} else { |
|||
d.w.writeBlock(tok, eof, nil) |
|||
} |
|||
d.blockStart = index |
|||
return d.w.err |
|||
} |
|||
return nil |
|||
} |
|||
|
|||
// fillWindow will fill the current window with the supplied
|
|||
// dictionary and calculate all hashes.
|
|||
// This is much faster than doing a full encode.
|
|||
// Should only be used after a start/reset.
|
|||
func (d *compressor) fillWindow(b []byte) { |
|||
// Do not fill window if we are in store-only or huffman mode.
|
|||
if d.level <= 0 { |
|||
return |
|||
} |
|||
if d.fast != nil { |
|||
// encode the last data, but discard the result
|
|||
if len(b) > maxMatchOffset { |
|||
b = b[len(b)-maxMatchOffset:] |
|||
} |
|||
d.fast.Encode(&d.tokens, b) |
|||
d.tokens.Reset() |
|||
return |
|||
} |
|||
s := d.state |
|||
// If we are given too much, cut it.
|
|||
if len(b) > windowSize { |
|||
b = b[len(b)-windowSize:] |
|||
} |
|||
// Add all to window.
|
|||
n := copy(d.window[d.windowEnd:], b) |
|||
|
|||
// Calculate 256 hashes at the time (more L1 cache hits)
|
|||
loops := (n + 256 - minMatchLength) / 256 |
|||
for j := 0; j < loops; j++ { |
|||
startindex := j * 256 |
|||
end := startindex + 256 + minMatchLength - 1 |
|||
if end > n { |
|||
end = n |
|||
} |
|||
tocheck := d.window[startindex:end] |
|||
dstSize := len(tocheck) - minMatchLength + 1 |
|||
|
|||
if dstSize <= 0 { |
|||
continue |
|||
} |
|||
|
|||
dst := s.hashMatch[:dstSize] |
|||
bulkHash4(tocheck, dst) |
|||
var newH uint32 |
|||
for i, val := range dst { |
|||
di := i + startindex |
|||
newH = val & hashMask |
|||
// Get previous value with the same hash.
|
|||
// Our chain should point to the previous value.
|
|||
s.hashPrev[di&windowMask] = s.hashHead[newH] |
|||
// Set the head of the hash chain to us.
|
|||
s.hashHead[newH] = uint32(di + s.hashOffset) |
|||
} |
|||
s.hash = newH |
|||
} |
|||
// Update window information.
|
|||
d.windowEnd += n |
|||
s.index = n |
|||
} |
|||
|
|||

// Try to find a match starting at index whose length is greater than prevSize.
// We only look at chainCount possibilities before giving up.
// pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead
func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
	minMatchLook := maxMatchLength
	if lookahead < minMatchLook {
		minMatchLook = lookahead
	}

	win := d.window[0 : pos+minMatchLook]

	// We quit when we get a match that's at least nice long
	nice := len(win) - pos
	if d.nice < nice {
		nice = d.nice
	}

	// If we've got a match that's good enough, only look in 1/4 the chain.
	tries := d.chain
	length = prevLength
	if length >= d.good {
		tries >>= 2
	}

	wEnd := win[pos+length]
	wPos := win[pos:]
	minIndex := pos - windowSize

	for i := prevHead; tries > 0; tries-- {
		if wEnd == win[i+length] {
			n := matchLen(win[i:i+minMatchLook], wPos)

			if n > length && (n > minMatchLength || pos-i <= 4096) {
				length = n
				offset = pos - i
				ok = true
				if n >= nice {
					// The match is good enough that we don't try to find a better one.
					break
				}
				wEnd = win[pos+n]
			}
		}
		if i == minIndex {
			// hashPrev[i & windowMask] has already been overwritten, so stop now.
			break
		}
		i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
		if i < minIndex || i < 0 {
			break
		}
	}
	return
}
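
// Note on the wEnd test above (an added explanatory note): before paying for
// a full matchLen, the loop compares the one byte a candidate would need at
// offset 'length' to beat the current best. Any candidate failing that
// single-byte check cannot be longer than the best match so far, so most
// chain entries are rejected after one comparison.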

func (d *compressor) writeStoredBlock(buf []byte) error {
	if d.w.writeStoredHeader(len(buf), false); d.w.err != nil {
		return d.w.err
	}
	d.w.writeBytes(buf)
	return d.w.err
}

// hash4 returns a hash representation of the first 4 bytes
// of the supplied slice.
// The caller must ensure that len(b) >= 4.
func hash4(b []byte) uint32 {
	b = b[:4]
	return hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits)
}

// bulkHash4 will compute hashes using the same
// algorithm as hash4
func bulkHash4(b []byte, dst []uint32) {
	if len(b) < 4 {
		return
	}
	hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
	dst[0] = hash4u(hb, hashBits)
	end := len(b) - 4 + 1
	for i := 1; i < end; i++ {
		hb = (hb << 8) | uint32(b[i+3])
		dst[i] = hash4u(hb, hashBits)
	}
}
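
// Rolling update above (an added explanatory note): hb packs the current 4
// input bytes big-endian, so advancing one position shifts the oldest byte
// out of the top and ORs the next byte into the bottom. For input "abcde",
// hb starts as 'a'<<24|'b'<<16|'c'<<8|'d' and one step later holds
// 'b'<<24|'c'<<16|'d'<<8|'e'.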

func (d *compressor) initDeflate() {
	d.window = make([]byte, 2*windowSize)
	d.byteAvailable = false
	d.err = nil
	if d.state == nil {
		return
	}
	s := d.state
	s.index = 0
	s.hashOffset = 1
	s.length = minMatchLength - 1
	s.offset = 0
	s.hash = 0
	s.chainHead = -1
}
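
// Why hashOffset starts at 1 (an added explanatory note): positions are
// stored in hashHead/hashPrev with hashOffset added, so the zero value of a
// freshly reset table can never be mistaken for a valid position; lookups
// subtract hashOffset to recover the real index.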

// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever,
// meaning it always has lazy matching on.
func (d *compressor) deflateLazy() {
	s := d.state
	// Sanity enables additional runtime tests.
	// It's intended to be used during development
	// to supplement the currently ad-hoc unit tests.
	const sanity = debugDeflate

	if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync {
		return
	}

	s.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
	if s.index < s.maxInsertIndex {
		s.hash = hash4(d.window[s.index : s.index+minMatchLength])
	}

	for {
		if sanity && s.index > d.windowEnd {
			panic("index > windowEnd")
		}
		lookahead := d.windowEnd - s.index
		if lookahead < minMatchLength+maxMatchLength {
			if !d.sync {
				return
			}
			if sanity && s.index > d.windowEnd {
				panic("index > windowEnd")
			}
			if lookahead == 0 {
				// Flush current output block if any.
				if d.byteAvailable {
					// There is still one pending token that needs to be flushed
					d.tokens.AddLiteral(d.window[s.index-1])
					d.byteAvailable = false
				}
				if d.tokens.n > 0 {
					if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
						return
					}
					d.tokens.Reset()
				}
				return
			}
		}
		if s.index < s.maxInsertIndex {
			// Update the hash
			s.hash = hash4(d.window[s.index : s.index+minMatchLength])
			ch := s.hashHead[s.hash&hashMask]
			s.chainHead = int(ch)
			s.hashPrev[s.index&windowMask] = ch
			s.hashHead[s.hash&hashMask] = uint32(s.index + s.hashOffset)
		}
		prevLength := s.length
		prevOffset := s.offset
		s.length = minMatchLength - 1
		s.offset = 0
		minIndex := s.index - windowSize
		if minIndex < 0 {
			minIndex = 0
		}

		if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
			if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok {
				s.length = newLength
				s.offset = newOffset
			}
		}
		if prevLength >= minMatchLength && s.length <= prevLength {
			// There was a match at the previous step, and the current match is
			// not better. Output the previous match.
			d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))

			// Insert in the hash table all strings up to the end of the match.
			// index and index-1 are already inserted. If there is not enough
			// lookahead, the last two strings are not inserted into the hash
			// table.
			newIndex := s.index + prevLength - 1
			// Calculate missing hashes
			end := newIndex
			if end > s.maxInsertIndex {
				end = s.maxInsertIndex
			}
			end += minMatchLength - 1
			startindex := s.index + 1
			if startindex > s.maxInsertIndex {
				startindex = s.maxInsertIndex
			}
			tocheck := d.window[startindex:end]
			dstSize := len(tocheck) - minMatchLength + 1
			if dstSize > 0 {
				dst := s.hashMatch[:dstSize]
				bulkHash4(tocheck, dst)
				var newH uint32
				for i, val := range dst {
					di := i + startindex
					newH = val & hashMask
					// Get previous value with the same hash.
					// Our chain should point to the previous value.
					s.hashPrev[di&windowMask] = s.hashHead[newH]
					// Set the head of the hash chain to us.
					s.hashHead[newH] = uint32(di + s.hashOffset)
				}
				s.hash = newH
			}

			s.index = newIndex
			d.byteAvailable = false
			s.length = minMatchLength - 1
			if d.tokens.n == maxFlateBlockTokens {
				// The block includes the current character
				if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
					return
				}
				d.tokens.Reset()
			}
		} else {
			// Reset, if we got a match this run.
			if s.length >= minMatchLength {
				s.ii = 0
			}
			// We have a byte waiting. Emit it.
			if d.byteAvailable {
				s.ii++
				d.tokens.AddLiteral(d.window[s.index-1])
				if d.tokens.n == maxFlateBlockTokens {
					if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
						return
					}
					d.tokens.Reset()
				}
				s.index++

				// If we have a long run of no matches, skip additional bytes
				// Resets when s.ii overflows after 64KB.
				if s.ii > 31 {
					n := int(s.ii >> 5)
					for j := 0; j < n; j++ {
						if s.index >= d.windowEnd-1 {
							break
						}

						d.tokens.AddLiteral(d.window[s.index-1])
						if d.tokens.n == maxFlateBlockTokens {
							if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
								return
							}
							d.tokens.Reset()
						}
						s.index++
					}
					// Flush last byte
					d.tokens.AddLiteral(d.window[s.index-1])
					d.byteAvailable = false
					// s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength
					if d.tokens.n == maxFlateBlockTokens {
						if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
							return
						}
						d.tokens.Reset()
					}
				}
			} else {
				s.index++
				d.byteAvailable = true
			}
		}
	}
}
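
// Lazy matching recap (an added explanatory note): each iteration finds a
// match at the current position and compares it with the match found one
// byte earlier. The earlier match is emitted only if the newer one is no
// longer; otherwise the earlier byte becomes a literal and the newer match
// gets the same deferred treatment. d.lazy bounds how good a previous match
// must be before we stop looking for a better one, and s.ii counts
// consecutive literal-only steps so incompressible runs are skipped faster.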

func (d *compressor) store() {
	if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) {
		d.err = d.writeStoredBlock(d.window[:d.windowEnd])
		d.windowEnd = 0
	}
}

// fillBlock will fill the buffer with data for huffman-only compression.
// The number of bytes copied is returned.
func (d *compressor) fillBlock(b []byte) int {
	n := copy(d.window[d.windowEnd:], b)
	d.windowEnd += n
	return n
}

// storeHuff will compress and store the currently added data,
// if enough has been accumulated or we are at the end of the stream.
// Any error that occurred will be in d.err
func (d *compressor) storeHuff() {
	if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 {
		return
	}
	d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync)
	d.err = d.w.err
	d.windowEnd = 0
}
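
// (An added explanatory note): storeHuff deliberately waits for a completely
// full window unless a flush or close is pending (d.sync), and never emits
// an empty block.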

// storeFast will compress and store the currently added data,
// if enough has been accumulated or we are at the end of the stream.
// Any error that occurred will be in d.err
func (d *compressor) storeFast() {
	// We only compress if we have maxStoreBlockSize.
	if d.windowEnd < len(d.window) {
		if !d.sync {
			return
		}
		// Handle extremely small sizes.
		if d.windowEnd < 128 {
			if d.windowEnd == 0 {
				return
			}
			if d.windowEnd <= 32 {
				d.err = d.writeStoredBlock(d.window[:d.windowEnd])
			} else {
				d.w.writeBlockHuff(false, d.window[:d.windowEnd], true)
				d.err = d.w.err
			}
			d.tokens.Reset()
			d.windowEnd = 0
			d.fast.Reset()
			return
		}
	}

	d.fast.Encode(&d.tokens, d.window[:d.windowEnd])
	// If we made zero matches, store the block as is.
	if d.tokens.n == 0 {
		d.err = d.writeStoredBlock(d.window[:d.windowEnd])
		// If we removed less than 1/16th, huffman compress the block.
	} else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) {
		d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync)
		d.err = d.w.err
	} else {
		d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync)
		d.err = d.w.err
	}
	d.tokens.Reset()
	d.windowEnd = 0
}
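
// Output selection above (an added explanatory note): flushes of 32 bytes or
// less are always stored, since block headers would dominate; below 128
// bytes only a literal huffman block is attempted; beyond that the token
// count decides between stored (no matches at all), huffman-only (matching
// removed less than 1/16th of the input) and a full dynamic block.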

// write will add input bytes to the stream.
// Unless an error occurs all bytes will be consumed.
func (d *compressor) write(b []byte) (n int, err error) {
	if d.err != nil {
		return 0, d.err
	}
	n = len(b)
	for len(b) > 0 {
		d.step(d)
		b = b[d.fill(d, b):]
		if d.err != nil {
			return 0, d.err
		}
	}
	return n, d.err
}
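
// Write loop above (an added explanatory note): d.fill copies as much of b
// as fits into the window and d.step runs the level-dependent compression
// step (for example store, storeHuff, storeFast or deflateLazy, per the
// init switch below) over whatever the window holds, so arbitrarily large
// writes are consumed in window-sized chunks.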

func (d *compressor) syncFlush() error {
	d.sync = true
	if d.err != nil {
		return d.err
	}
	d.step(d)
	if d.err == nil {
		d.w.writeStoredHeader(0, false)
		d.w.flush()
		d.err = d.w.err
	}
	d.sync = false
	return d.err
}
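
// Flush marker above (an added explanatory note): a stored-block header with
// length zero is the standard deflate sync marker; it byte-aligns the output
// so everything buffered so far can be decoded before more input arrives.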

func (d *compressor) init(w io.Writer, level int) (err error) {
	d.w = newHuffmanBitWriter(w)

	switch {
	case level == NoCompression:
		d.window = make([]byte, maxStoreBlockSize)
		d.fill = (*compressor).fillBlock
		d.step = (*compressor).store
	case level == ConstantCompression:
		d.w.logNewTablePenalty = 4
		d.window = make([]byte, maxStoreBlockSize)
		d.fill = (*compressor).fillBlock
		d.step = (*compressor).storeHuff
	case level == DefaultCompression: