Browse Source

archium - go/src (20201113)

master
Marko Seidel 2 years ago
commit
69f9277bd7
  1. 1
      github.com/klauspost/compress/.gitattributes
  2. 24
      github.com/klauspost/compress/.gitignore
  3. 98
      github.com/klauspost/compress/.goreleaser.yml
  4. 45
      github.com/klauspost/compress/.travis.yml
  5. 28
      github.com/klauspost/compress/LICENSE
  6. 300
      github.com/klauspost/compress/README.md
  7. 85
      github.com/klauspost/compress/compressible.go
  8. 302
      github.com/klauspost/compress/compressible_test.go
  9. 821
      github.com/klauspost/compress/flate/deflate.go
  10. 658
      github.com/klauspost/compress/flate/deflate_test.go
  11. 184
      github.com/klauspost/compress/flate/dict_decoder.go
  12. 139
      github.com/klauspost/compress/flate/dict_decoder_test.go
  13. 254
      github.com/klauspost/compress/flate/fast_encoder.go
  14. 360
      github.com/klauspost/compress/flate/flate_test.go
  15. 265
      github.com/klauspost/compress/flate/gen.go
  16. 274
      github.com/klauspost/compress/flate/gen_inflate.go
  17. 911
      github.com/klauspost/compress/flate/huffman_bit_writer.go
  18. 382
      github.com/klauspost/compress/flate/huffman_bit_writer_test.go
  19. 363
      github.com/klauspost/compress/flate/huffman_code.go
  20. 178
      github.com/klauspost/compress/flate/huffman_sortByFreq.go
  21. 201
      github.com/klauspost/compress/flate/huffman_sortByLiteral.go
  22. 1001
      github.com/klauspost/compress/flate/inflate.go
  23. 922
      github.com/klauspost/compress/flate/inflate_gen.go
  24. 282
      github.com/klauspost/compress/flate/inflate_test.go
  25. 179
      github.com/klauspost/compress/flate/level1.go
  26. 205
      github.com/klauspost/compress/flate/level2.go
  27. 229
      github.com/klauspost/compress/flate/level3.go
  28. 212
      github.com/klauspost/compress/flate/level4.go
  29. 279
      github.com/klauspost/compress/flate/level5.go
  30. 282
      github.com/klauspost/compress/flate/level6.go
  31. 106
      github.com/klauspost/compress/flate/reader_test.go
  32. 297
      github.com/klauspost/compress/flate/stateless.go
  33. BIN
      github.com/klauspost/compress/flate/testdata/huffman-null-max.dyn.expect
  34. BIN
      github.com/klauspost/compress/flate/testdata/huffman-null-max.dyn.expect-noinput
  35. BIN
      github.com/klauspost/compress/flate/testdata/huffman-null-max.golden
  36. BIN
      github.com/klauspost/compress/flate/testdata/huffman-null-max.in
  37. BIN
      github.com/klauspost/compress/flate/testdata/huffman-null-max.sync.expect
  38. BIN
      github.com/klauspost/compress/flate/testdata/huffman-null-max.sync.expect-noinput
  39. BIN
      github.com/klauspost/compress/flate/testdata/huffman-null-max.wb.expect
  40. BIN
      github.com/klauspost/compress/flate/testdata/huffman-null-max.wb.expect-noinput
  41. BIN
      github.com/klauspost/compress/flate/testdata/huffman-pi.dyn.expect
  42. BIN
      github.com/klauspost/compress/flate/testdata/huffman-pi.dyn.expect-noinput
  43. BIN
      github.com/klauspost/compress/flate/testdata/huffman-pi.golden
  44. 1
      github.com/klauspost/compress/flate/testdata/huffman-pi.in
  45. BIN
      github.com/klauspost/compress/flate/testdata/huffman-pi.sync.expect
  46. BIN
      github.com/klauspost/compress/flate/testdata/huffman-pi.sync.expect-noinput
  47. BIN
      github.com/klauspost/compress/flate/testdata/huffman-pi.wb.expect
  48. BIN
      github.com/klauspost/compress/flate/testdata/huffman-pi.wb.expect-noinput
  49. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-1k.dyn.expect
  50. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-1k.dyn.expect-noinput
  51. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-1k.golden
  52. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-1k.in
  53. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-1k.sync.expect
  54. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-1k.sync.expect-noinput
  55. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-1k.wb.expect
  56. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-1k.wb.expect-noinput
  57. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-limit.dyn.expect
  58. 1
      github.com/klauspost/compress/flate/testdata/huffman-rand-limit.dyn.expect-noinput
  59. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-limit.golden
  60. 4
      github.com/klauspost/compress/flate/testdata/huffman-rand-limit.in
  61. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-limit.sync.expect
  62. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-limit.sync.expect-noinput
  63. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-limit.wb.expect
  64. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-limit.wb.expect-noinput
  65. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-max.golden
  66. BIN
      github.com/klauspost/compress/flate/testdata/huffman-rand-max.in
  67. BIN
      github.com/klauspost/compress/flate/testdata/huffman-shifts.dyn.expect
  68. BIN
      github.com/klauspost/compress/flate/testdata/huffman-shifts.dyn.expect-noinput
  69. BIN
      github.com/klauspost/compress/flate/testdata/huffman-shifts.golden
  70. 2
      github.com/klauspost/compress/flate/testdata/huffman-shifts.in
  71. BIN
      github.com/klauspost/compress/flate/testdata/huffman-shifts.sync.expect
  72. BIN
      github.com/klauspost/compress/flate/testdata/huffman-shifts.sync.expect-noinput
  73. BIN
      github.com/klauspost/compress/flate/testdata/huffman-shifts.wb.expect
  74. BIN
      github.com/klauspost/compress/flate/testdata/huffman-shifts.wb.expect-noinput
  75. BIN
      github.com/klauspost/compress/flate/testdata/huffman-text-shift.dyn.expect
  76. 2
      github.com/klauspost/compress/flate/testdata/huffman-text-shift.dyn.expect-noinput
  77. BIN
      github.com/klauspost/compress/flate/testdata/huffman-text-shift.golden
  78. 14
      github.com/klauspost/compress/flate/testdata/huffman-text-shift.in
  79. BIN
      github.com/klauspost/compress/flate/testdata/huffman-text-shift.sync.expect
  80. BIN
      github.com/klauspost/compress/flate/testdata/huffman-text-shift.sync.expect-noinput
  81. BIN
      github.com/klauspost/compress/flate/testdata/huffman-text-shift.wb.expect
  82. BIN
      github.com/klauspost/compress/flate/testdata/huffman-text-shift.wb.expect-noinput
  83. BIN
      github.com/klauspost/compress/flate/testdata/huffman-text.dyn.expect
  84. 3
      github.com/klauspost/compress/flate/testdata/huffman-text.dyn.expect-noinput
  85. BIN
      github.com/klauspost/compress/flate/testdata/huffman-text.golden
  86. 13
      github.com/klauspost/compress/flate/testdata/huffman-text.in
  87. 1
      github.com/klauspost/compress/flate/testdata/huffman-text.sync.expect
  88. 1
      github.com/klauspost/compress/flate/testdata/huffman-text.sync.expect-noinput
  89. 1
      github.com/klauspost/compress/flate/testdata/huffman-text.wb.expect
  90. 1
      github.com/klauspost/compress/flate/testdata/huffman-text.wb.expect-noinput
  91. BIN
      github.com/klauspost/compress/flate/testdata/huffman-zero.dyn.expect
  92. 1
      github.com/klauspost/compress/flate/testdata/huffman-zero.dyn.expect-noinput
  93. BIN
      github.com/klauspost/compress/flate/testdata/huffman-zero.golden
  94. 1
      github.com/klauspost/compress/flate/testdata/huffman-zero.in
  95. BIN
      github.com/klauspost/compress/flate/testdata/huffman-zero.sync.expect
  96. BIN
      github.com/klauspost/compress/flate/testdata/huffman-zero.sync.expect-noinput
  97. BIN
      github.com/klauspost/compress/flate/testdata/huffman-zero.wb.expect
  98. BIN
      github.com/klauspost/compress/flate/testdata/huffman-zero.wb.expect-noinput
  99. BIN
      github.com/klauspost/compress/flate/testdata/null-long-match.dyn.expect-noinput
  100. BIN
      github.com/klauspost/compress/flate/testdata/null-long-match.sync.expect-noinput

1
github.com/klauspost/compress/.gitattributes

@ -0,0 +1 @@
*.bin -text -diff

24
github.com/klauspost/compress/.gitignore

@ -0,0 +1,24 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof

98
github.com/klauspost/compress/.goreleaser.yml

@ -0,0 +1,98 @@
# This is an example goreleaser.yaml file with some sane defaults.
# Make sure to check the documentation at http://goreleaser.com
before:
hooks:
builds:
-
id: "s2c"
binary: s2c
main: ./s2/cmd/s2c/main.go
env:
- CGO_ENABLED=0
goos:
- linux
- freebsd
- netbsd
- windows
goarch:
- 386
- amd64
- arm
- arm64
- ppc64
- ppc64le
- mips64
- mips64le
goarm:
- 7
-
id: "s2d"
binary: s2d
main: ./s2/cmd/s2d/main.go
env:
- CGO_ENABLED=0
goos:
- linux
- freebsd
- netbsd
- windows
goarch:
- 386
- amd64
- arm
- arm64
- ppc64
- ppc64le
- mips64
- mips64le
goarm:
- 7
archives:
-
id: s2-binaries
name_template: "s2-cmds_{{ .Version }}_{{ .Os }}_{{ .Arch }}"
replacements:
darwin: Darwin
linux: Linux
windows: Windows
386: i386
amd64: x86_64
freebsd: FreeBSD
netbsd: NetBSD
format_overrides:
- goos: windows
format: zip
files:
- s2/LICENSE
- s2/README.md
checksum:
name_template: 'checksums.txt'
snapshot:
name_template: "s2-cmds_{{ .Tag }}-next"
changelog:
sort: asc
filters:
exclude:
- '^doc:'
- '^docs:'
- '^test:'
- '^tests:'
- '^Update\sREADME.md'
nfpms:
-
name_template: "s2-cmds_{{ .Version }}_{{ .Os }}_{{ .Arch }}"
vendor: Klaus Post
homepage: https://github.com/klauspost/compress
maintainer: Klaus Post <klauspost@gmail.com>
description: S2 Compression Tool
license: BSD 3-Clause
formats:
- deb
- rpm
replacements:
darwin: Darwin
linux: Linux
freebsd: FreeBSD
amd64: x86_64

45
github.com/klauspost/compress/.travis.yml

@ -0,0 +1,45 @@
language: go
os:
- linux
- osx
go:
- 1.12.x
- 1.13.x
- 1.14.x
- master
env:
- GO111MODULE=off
install:
- go get ./...
- go get github.com/klauspost/compress-fuzz
script:
- diff <(gofmt -d .) <(printf "")
- IS_GO112=`go version | cut -d ' ' -f3 | grep 1.12`; if [ ! -z "$IS_GO112" ]; then echo 'Skipping vet on Go 1.12...'; else go vet ./...; fi
- go test -cpu=2 ./...
- go test -cpu=2 -tags=noasm ./...
- go test -cpu=1,4 -short -race ./...
- go build github.com/klauspost/compress/s2/cmd/s2c && go build github.com/klauspost/compress/s2/cmd/s2d && s2c s2c && s2d s2c.s2 && rm s2c && rm s2d && rm s2c.s2
jobs:
allow_failures:
- go: 'master'
fast_finish: true
include:
- stage: 386 linux test
go: 1.14.x
script:
- GOOS=linux GOARCH=386 go test -short ./...
deploy:
- provider: script
skip_cleanup: true
script: curl -sL https://git.io/goreleaser | VERSION=v0.127.0 bash || true
on:
tags: true
condition: $TRAVIS_OS_NAME = linux
go: 1.14.x

28
github.com/klauspost/compress/LICENSE

@ -0,0 +1,28 @@
Copyright (c) 2012 The Go Authors. All rights reserved.
Copyright (c) 2019 Klaus Post. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

300
github.com/klauspost/compress/README.md

@ -0,0 +1,300 @@
# compress
This package provides various compression algorithms.
* [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and decompression in pure Go.
* [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) is a high performance replacement for Snappy.
* Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a dropin replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib).
* [huff0](https://github.com/klauspost/compress/tree/master/huff0) and [FSE](https://github.com/klauspost/compress/tree/master/fse) implementations for raw entropy encoding.
* [pgzip](https://github.com/klauspost/pgzip) is a separate package that provides a very fast parallel gzip implementation.
* [fuzz package](https://github.com/klauspost/compress-fuzz) for fuzz testing all compressors/decompressors here.
[![Documentation](https://godoc.org/github.com/klauspost/compress?status.svg)](https://pkg.go.dev/github.com/klauspost/compress?tab=subdirectories)
[![Build Status](https://travis-ci.org/klauspost/compress.svg?branch=master)](https://travis-ci.org/klauspost/compress)
[![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge)
# changelog
* June 23, 2020 (v1.10.10) zstd: Skip entropy compression in fastest mode when no matches. [#270](https://github.com/klauspost/compress/pull/270)
* June 16, 2020 (v1.10.9): zstd: API change for specifying dictionaries. See [#268](https://github.com/klauspost/compress/pull/268)
* June 16, 2020: zip: update CreateHeaderRaw to handle zip64 fields. [#266](https://github.com/klauspost/compress/pull/266)
* June 16, 2020: Fuzzit tests removed. The service has been purchased and is no longer available.
* June 5, 2020 (v1.10.8): 1.15x faster zstd block decompression. [#265](https://github.com/klauspost/compress/pull/265)
* June 1, 2020 (v1.10.7): Added zstd decompression [dictionary support](https://github.com/klauspost/compress/tree/master/zstd#dictionaries)
* June 1, 2020: Increase zstd decompression speed up to 1.19x. [#259](https://github.com/klauspost/compress/pull/259)
* June 1, 2020: Remove internal reset call in zstd compression and reduce allocations. [#263](https://github.com/klauspost/compress/pull/263)
* May 21, 2020: (v1.10.6) zstd: Reduce allocations while decoding. [#258](https://github.com/klauspost/compress/pull/258), [#252](https://github.com/klauspost/compress/pull/252)
* May 21, 2020: zstd: Stricter decompression checks.
* April 12, 2020: (v1.10.5) s2-commands: Flush output when receiving SIGINT. [#239](https://github.com/klauspost/compress/pull/239)
* Apr 8, 2020: (v1.10.4) zstd: Minor/special case optimizations. [#251](https://github.com/klauspost/compress/pull/251), [#250](https://github.com/klauspost/compress/pull/250), [#249](https://github.com/klauspost/compress/pull/249), [#247](https://github.com/klauspost/compress/pull/247)
* Mar 11, 2020: (v1.10.3) s2: Use S2 encoder in pure Go mode for Snappy output as well. [#245](https://github.com/klauspost/compress/pull/245)
* Mar 10, 2020: s2: Fix pure Go block encoder. [#244](https://github.com/klauspost/compress/pull/244)
* Mar 9, 2020: zstd: Added "better compression" mode. [#240](https://github.com/klauspost/compress/pull/240)
* Mar 9, 2020: zstd: Improve speed of fastest compression mode by 5-10% [#241](https://github.com/klauspost/compress/pull/241)
* Feb 28, 2020: zstd: Skip creating encoders when not needed. [#238](https://github.com/klauspost/compress/pull/238)
* Feb 27, 2020: (v1.10.2) Close to 50% speedup in inflate (gzip/zip decompression). [#236](https://github.com/klauspost/compress/pull/236) [#234](https://github.com/klauspost/compress/pull/234) [#232](https://github.com/klauspost/compress/pull/232)
* Feb 23, 2020: Reduce deflate level 1-6 memory usage up to 59%. [#227](https://github.com/klauspost/compress/pull/227)
* Feb 18, 2020: (v1.10.1) Fix zstd crash when resetting multiple times without sending data. [#226](https://github.com/klauspost/compress/pull/226)
* Feb 16, 2020: deflate: Fix dictionary use on level 1-6. [#224](https://github.com/klauspost/compress/pull/224)
* Feb 16, 2020: Remove deflate writer reference when closing. [#224](https://github.com/klauspost/compress/pull/224)
* Feb 4, 2020: (v1.10.0) Add optional dictionary to [stateless deflate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc#StatelessDeflate). Breaking change, send `nil` for previous behaviour. [#216](https://github.com/klauspost/compress/pull/216)
* Feb 3, 2020: Fix buffer overflow on repeated small block deflate. [#218](https://github.com/klauspost/compress/pull/218)
* Jan 31, 2020: Allow copying content from an existing ZIP file without decompressing+compressing. [#214](https://github.com/klauspost/compress/pull/214)
* Jan 28, 2020: Added [S2](https://github.com/klauspost/compress/tree/master/s2#s2-compression) AMD64 assembler and various optimizations. Stream speed >10GB/s. [#186](https://github.com/klauspost/compress/pull/186)
<details>
<summary>See changes prior to v1.10.0</summary>
* Jan 20,2020 (v1.9.8) Optimize gzip/deflate with better size estimates and faster table generation. [#207](https://github.com/klauspost/compress/pull/207) by [luyu6056](https://github.com/luyu6056), [#206](https://github.com/klauspost/compress/pull/206).
* Jan 11, 2020: S2 Encode/Decode will use provided buffer if capacity is big enough. [#204](https://github.com/klauspost/compress/pull/204)
* Jan 5, 2020: (v1.9.7) Fix another zstd regression in v1.9.5 - v1.9.6 removed.
* Jan 4, 2020: (v1.9.6) Regression in v1.9.5 fixed causing corrupt zstd encodes in rare cases.
* Jan 4, 2020: Faster IO in [s2c + s2d commandline tools](https://github.com/klauspost/compress/tree/master/s2#commandline-tools) compression/decompression. [#192](https://github.com/klauspost/compress/pull/192)
* Dec 29, 2019: Removed v1.9.5 since fuzz tests showed a compatibility problem with the reference zstandard decoder.
* Dec 29, 2019: (v1.9.5) zstd: 10-20% faster block compression. [#199](https://github.com/klauspost/compress/pull/199)
* Dec 29, 2019: [zip](https://godoc.org/github.com/klauspost/compress/zip) package updated with latest Go features
* Dec 29, 2019: zstd: Single segment flag condintions tweaked. [#197](https://github.com/klauspost/compress/pull/197)
* Dec 18, 2019: s2: Faster compression when ReadFrom is used. [#198](https://github.com/klauspost/compress/pull/198)
* Dec 10, 2019: s2: Fix repeat length output when just above at 16MB limit.
* Dec 10, 2019: zstd: Add function to get decoder as io.ReadCloser. [#191](https://github.com/klauspost/compress/pull/191)
* Dec 3, 2019: (v1.9.4) S2: limit max repeat length. [#188](https://github.com/klauspost/compress/pull/188)
* Dec 3, 2019: Add [WithNoEntropyCompression](https://godoc.org/github.com/klauspost/compress/zstd#WithNoEntropyCompression) to zstd [#187](https://github.com/klauspost/compress/pull/187)
* Dec 3, 2019: Reduce memory use for tests. Check for leaked goroutines.
* Nov 28, 2019 (v1.9.3) Less allocations in stateless deflate.
* Nov 28, 2019: 5-20% Faster huff0 decode. Impacts zstd as well. [#184](https://github.com/klauspost/compress/pull/184)
* Nov 12, 2019 (v1.9.2) Added [Stateless Compression](#stateless-compression) for gzip/deflate.
* Nov 12, 2019: Fixed zstd decompression of large single blocks. [#180](https://github.com/klauspost/compress/pull/180)
* Nov 11, 2019: Set default [s2c](https://github.com/klauspost/compress/tree/master/s2#commandline-tools) block size to 4MB.
* Nov 11, 2019: Reduce inflate memory use by 1KB.
* Nov 10, 2019: Less allocations in deflate bit writer.
* Nov 10, 2019: Fix inconsistent error returned by zstd decoder.
* Oct 28, 2019 (v1.9.1) ztsd: Fix crash when compressing blocks. [#174](https://github.com/klauspost/compress/pull/174)
* Oct 24, 2019 (v1.9.0) zstd: Fix rare data corruption [#173](https://github.com/klauspost/compress/pull/173)
* Oct 24, 2019 zstd: Fix huff0 out of buffer write [#171](https://github.com/klauspost/compress/pull/171) and always return errors [#172](https://github.com/klauspost/compress/pull/172)
* Oct 10, 2019: Big deflate rewrite, 30-40% faster with better compression [#105](https://github.com/klauspost/compress/pull/105)
</details>
<details>
<summary>See changes prior to v1.9.0</summary>
* Oct 10, 2019: (v1.8.6) zstd: Allow partial reads to get flushed data. [#169](https://github.com/klauspost/compress/pull/169)
* Oct 3, 2019: Fix inconsistent results on broken zstd streams.
* Sep 25, 2019: Added `-rm` (remove source files) and `-q` (no output except errors) to `s2c` and `s2d` [commands](https://github.com/klauspost/compress/tree/master/s2#commandline-tools)
* Sep 16, 2019: (v1.8.4) Add `s2c` and `s2d` [commandline tools](https://github.com/klauspost/compress/tree/master/s2#commandline-tools).
* Sep 10, 2019: (v1.8.3) Fix s2 decoder [Skip](https://godoc.org/github.com/klauspost/compress/s2#Reader.Skip).
* Sep 7, 2019: zstd: Added [WithWindowSize](https://godoc.org/github.com/klauspost/compress/zstd#WithWindowSize), contributed by [ianwilkes](https://github.com/ianwilkes).
* Sep 5, 2019: (v1.8.2) Add [WithZeroFrames](https://godoc.org/github.com/klauspost/compress/zstd#WithZeroFrames) which adds full zero payload block encoding option.
* Sep 5, 2019: Lazy initialization of zstandard predefined en/decoder tables.
* Aug 26, 2019: (v1.8.1) S2: 1-2% compression increase in "better" compression mode.
* Aug 26, 2019: zstd: Check maximum size of Huffman 1X compressed literals while decoding.
* Aug 24, 2019: (v1.8.0) Added [S2 compression](https://github.com/klauspost/compress/tree/master/s2#s2-compression), a high performance replacement for Snappy.
* Aug 21, 2019: (v1.7.6) Fixed minor issues found by fuzzer. One could lead to zstd not decompressing.
* Aug 18, 2019: Add [fuzzit](https://fuzzit.dev/) continuous fuzzing.
* Aug 14, 2019: zstd: Skip incompressible data 2x faster. [#147](https://github.com/klauspost/compress/pull/147)
* Aug 4, 2019 (v1.7.5): Better literal compression. [#146](https://github.com/klauspost/compress/pull/146)
* Aug 4, 2019: Faster zstd compression. [#143](https://github.com/klauspost/compress/pull/143) [#144](https://github.com/klauspost/compress/pull/144)
* Aug 4, 2019: Faster zstd decompression. [#145](https://github.com/klauspost/compress/pull/145) [#143](https://github.com/klauspost/compress/pull/143) [#142](https://github.com/klauspost/compress/pull/142)
* July 15, 2019 (v1.7.4): Fix double EOF block in rare cases on zstd encoder.
* July 15, 2019 (v1.7.3): Minor speedup/compression increase in default zstd encoder.
* July 14, 2019: zstd decoder: Fix decompression error on multiple uses with mixed content.
* July 7, 2019 (v1.7.2): Snappy update, zstd decoder potential race fix.
* June 17, 2019: zstd decompression bugfix.
* June 17, 2019: fix 32 bit builds.
* June 17, 2019: Easier use in modules (less dependencies).
* June 9, 2019: New stronger "default" [zstd](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression mode. Matches zstd default compression ratio.
* June 5, 2019: 20-40% throughput in [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression and better compression.
* June 5, 2019: deflate/gzip compression: Reduce memory usage of lower compression levels.
* June 2, 2019: Added [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression!
* May 25, 2019: deflate/gzip: 10% faster bit writer, mostly visible in lower levels.
* Apr 22, 2019: [zstd](https://github.com/klauspost/compress/tree/master/zstd#zstd) decompression added.
* Aug 1, 2018: Added [huff0 README](https://github.com/klauspost/compress/tree/master/huff0#huff0-entropy-compression).
* Jul 8, 2018: Added [Performance Update 2018](#performance-update-2018) below.
* Jun 23, 2018: Merged [Go 1.11 inflate optimizations](https://go-review.googlesource.com/c/go/+/102235). Go 1.9 is now required. Backwards compatible version tagged with [v1.3.0](https://github.com/klauspost/compress/releases/tag/v1.3.0).
* Apr 2, 2018: Added [huff0](https://godoc.org/github.com/klauspost/compress/huff0) en/decoder. Experimental for now, API may change.
* Mar 4, 2018: Added [FSE Entropy](https://godoc.org/github.com/klauspost/compress/fse) en/decoder. Experimental for now, API may change.
* Nov 3, 2017: Add compression [Estimate](https://godoc.org/github.com/klauspost/compress#Estimate) function.
* May 28, 2017: Reduce allocations when resetting decoder.
* Apr 02, 2017: Change back to official crc32, since changes were merged in Go 1.7.
* Jan 14, 2017: Reduce stack pressure due to array copies. See [Issue #18625](https://github.com/golang/go/issues/18625).
* Oct 25, 2016: Level 2-4 have been rewritten and now offers significantly better performance than before.
* Oct 20, 2016: Port zlib changes from Go 1.7 to fix zlib writer issue. Please update.
* Oct 16, 2016: Go 1.7 changes merged. Apples to apples this package is a few percent faster, but has a significantly better balance between speed and compression per level.
* Mar 24, 2016: Always attempt Huffman encoding on level 4-7. This improves base 64 encoded data compression.
* Mar 24, 2016: Small speedup for level 1-3.
* Feb 19, 2016: Faster bit writer, level -2 is 15% faster, level 1 is 4% faster.
* Feb 19, 2016: Handle small payloads faster in level 1-3.
* Feb 19, 2016: Added faster level 2 + 3 compression modes.
* Feb 19, 2016: [Rebalanced compression levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/), so there is a more even progresssion in terms of compression. New default level is 5.
* Feb 14, 2016: Snappy: Merge upstream changes.
* Feb 14, 2016: Snappy: Fix aggressive skipping.
* Feb 14, 2016: Snappy: Update benchmark.
* Feb 13, 2016: Deflate: Fixed assembler problem that could lead to sub-optimal compression.
* Feb 12, 2016: Snappy: Added AMD64 SSE 4.2 optimizations to matching, which makes easy to compress material run faster. Typical speedup is around 25%.
* Feb 9, 2016: Added Snappy package fork. This version is 5-7% faster, much more on hard to compress content.
* Jan 30, 2016: Optimize level 1 to 3 by not considering static dictionary or storing uncompressed. ~4-5% speedup.
* Jan 16, 2016: Optimization on deflate level 1,2,3 compression.
* Jan 8 2016: Merge [CL 18317](https://go-review.googlesource.com/#/c/18317): fix reading, writing of zip64 archives.
* Dec 8 2015: Make level 1 and -2 deterministic even if write size differs.
* Dec 8 2015: Split encoding functions, so hashing and matching can potentially be inlined. 1-3% faster on AMD64. 5% faster on other platforms.
* Dec 8 2015: Fixed rare [one byte out-of bounds read](https://github.com/klauspost/compress/issues/20). Please update!
* Nov 23 2015: Optimization on token writer. ~2-4% faster. Contributed by [@dsnet](https://github.com/dsnet).
* Nov 20 2015: Small optimization to bit writer on 64 bit systems.
* Nov 17 2015: Fixed out-of-bound errors if the underlying Writer returned an error. See [#15](https://github.com/klauspost/compress/issues/15).
* Nov 12 2015: Added [io.WriterTo](https://golang.org/pkg/io/#WriterTo) support to gzip/inflate.
* Nov 11 2015: Merged [CL 16669](https://go-review.googlesource.com/#/c/16669/4): archive/zip: enable overriding (de)compressors per file
* Oct 15 2015: Added skipping on uncompressible data. Random data speed up >5x.
</details>
# deflate usage
* [High Throughput Benchmark](http://blog.klauspost.com/go-gzipdeflate-benchmarks/).
* [Small Payload/Webserver Benchmarks](http://blog.klauspost.com/gzip-performance-for-go-webservers/).
* [Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/).
* [Re-balancing Deflate Compression Levels](https://blog.klauspost.com/rebalancing-deflate-compression-levels/)
The packages are drop-in replacements for standard libraries. Simply replace the import path to use them:
| old import | new import | Documentation
|--------------------|-----------------------------------------|--------------------|
| `compress/gzip` | `github.com/klauspost/compress/gzip` | [gzip](https://pkg.go.dev/github.com/klauspost/compress/gzip?tab=doc)
| `compress/zlib` | `github.com/klauspost/compress/zlib` | [zlib](https://pkg.go.dev/github.com/klauspost/compress/zlib?tab=doc)
| `archive/zip` | `github.com/klauspost/compress/zip` | [zip](https://pkg.go.dev/github.com/klauspost/compress/zip?tab=doc)
| `compress/flate` | `github.com/klauspost/compress/flate` | [flate](https://pkg.go.dev/github.com/klauspost/compress/flate?tab=doc)
* Optimized [deflate](https://godoc.org/github.com/klauspost/compress/flate) packages which can be used as a dropin replacement for [gzip](https://godoc.org/github.com/klauspost/compress/gzip), [zip](https://godoc.org/github.com/klauspost/compress/zip) and [zlib](https://godoc.org/github.com/klauspost/compress/zlib).
You may also be interested in [pgzip](https://github.com/klauspost/pgzip), which is a drop in replacement for gzip, which support multithreaded compression on big files and the optimized [crc32](https://github.com/klauspost/crc32) package used by these packages.
The packages contains the same as the standard library, so you can use the godoc for that: [gzip](http://golang.org/pkg/compress/gzip/), [zip](http://golang.org/pkg/archive/zip/), [zlib](http://golang.org/pkg/compress/zlib/), [flate](http://golang.org/pkg/compress/flate/).
Currently there is only minor speedup on decompression (mostly CRC32 calculation).
# Stateless compression
This package offers stateless compression as a special option for gzip/deflate.
It will do compression but without maintaining any state between Write calls.
This means there will be no memory kept between Write calls, but compression and speed will be suboptimal.
This is only relevant in cases where you expect to run many thousands of compressors concurrently,
but with very little activity. This is *not* intended for regular web servers serving individual requests.
Because of this, the size of actual Write calls will affect output size.
In gzip, specify level `-3` / `gzip.StatelessCompression` to enable.
For direct deflate use, NewStatelessWriter and StatelessDeflate are available. See [documentation](https://godoc.org/github.com/klauspost/compress/flate#NewStatelessWriter)
A `bufio.Writer` can of course be used to control write sizes. For example, to use a 4KB buffer:
```
// replace 'ioutil.Discard' with your output.
gzw, err := gzip.NewWriterLevel(ioutil.Discard, gzip.StatelessCompression)
if err != nil {
return err
}
defer gzw.Close()
w := bufio.NewWriterSize(gzw, 4096)
defer w.Flush()
// Write to 'w'
```
This will only use up to 4KB in memory when the writer is idle.
Compression is almost always worse than the fastest compression level
and each write will allocate (a little) memory.
# Performance Update 2018
It has been a while since we have been looking at the speed of this package compared to the standard library, so I thought I would re-do my tests and give some overall recommendations based on the current state. All benchmarks have been performed with Go 1.10 on my Desktop Intel(R) Core(TM) i7-2600 CPU @3.40GHz. Since I last ran the tests, I have gotten more RAM, which means tests with big files are no longer limited by my SSD.
The raw results are in my [updated spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing). Due to cgo changes and upstream updates i could not get the cgo version of gzip to compile. Instead I included the [zstd](https://github.com/datadog/zstd) cgo implementation. If I get cgo gzip to work again, I might replace the results in the sheet.
The columns to take note of are: *MB/s* - the throughput. *Reduction* - the data size reduction in percent of the original. *Rel Speed* relative speed compared to the standard library at the same level. *Smaller* - how many percent smaller is the compressed output compared to stdlib. Negative means the output was bigger. *Loss* means the loss (or gain) in compression as a percentage difference of the input.
The `gzstd` (standard library gzip) and `gzkp` (this package gzip) only uses one CPU core. [`pgzip`](https://github.com/klauspost/pgzip), [`bgzf`](https://github.com/biogo/hts/tree/master/bgzf) uses all 4 cores. [`zstd`](https://github.com/DataDog/zstd) uses one core, and is a beast (but not Go, yet).
## Overall differences.
There appears to be a roughly 5-10% speed advantage over the standard library when comparing at similar compression levels.
The biggest difference you will see is the result of [re-balancing](https://blog.klauspost.com/rebalancing-deflate-compression-levels/) the compression levels. I wanted by library to give a smoother transition between the compression levels than the standard library.
This package attempts to provide a more smooth transition, where "1" is taking a lot of shortcuts, "5" is the reasonable trade-off and "9" is the "give me the best compression", and the values in between gives something reasonable in between. The standard library has big differences in levels 1-4, but levels 5-9 having no significant gains - often spending a lot more time than can be justified by the achieved compression.
There are links to all the test data in the [spreadsheet](https://docs.google.com/spreadsheets/d/1nuNE2nPfuINCZJRMt6wFWhKpToF95I47XjSsc-1rbPQ/edit?usp=sharing) in the top left field on each tab.
## Web Content
This test set aims to emulate typical use in a web server. The test-set is 4GB data in 53k files, and is a mixture of (mostly) HTML, JS, CSS.
Since level 1 and 9 are close to being the same code, they are quite close. But looking at the levels in-between the differences are quite big.
Looking at level 6, this package is 88% faster, but will output about 6% more data. For a web server, this means you can serve 88% more data, but have to pay for 6% more bandwidth. You can draw your own conclusions on what would be the most expensive for your case.
## Object files
This test is for typical data files stored on a server. In this case it is a collection of Go precompiled objects. They are very compressible.
The picture is similar to the web content, but with small differences since this is very compressible. Levels 2-3 offer good speed, but is sacrificing quite a bit of compression.
The standard library seems suboptimal on level 3 and 4 - offering both worse compression and speed than level 6 & 7 of this package respectively.
## Highly Compressible File
This is a JSON file with very high redundancy. The reduction starts at 95% on level 1, so in real life terms we are dealing with something like a highly redundant stream of data, etc.
It is definitely visible that we are dealing with specialized content here, so the results are very scattered. This package does not do very well at levels 1-4, but picks up significantly at level 5 and levels 7 and 8 offering great speed for the achieved compression.
So if you know you content is extremely compressible you might want to go slightly higher than the defaults. The standard library has a huge gap between levels 3 and 4 in terms of speed (2.75x slowdown), so it offers little "middle ground".
## Medium-High Compressible
This is a pretty common test corpus: [enwik9](http://mattmahoney.net/dc/textdata.html). It contains the first 10^9 bytes of the English Wikipedia dump on Mar. 3, 2006. This is a very good test of typical text based compression and more data heavy streams.
We see a similar picture here as in "Web Content". On equal levels some compression is sacrificed for more speed. Level 5 seems to be the best trade-off between speed and size, beating stdlib level 3 in both.
## Medium Compressible
I will combine two test sets, one [10GB file set](http://mattmahoney.net/dc/10gb.html) and a VM disk image (~8GB). Both contain different data types and represent a typical backup scenario.
The most notable thing is how quickly the standard library drops to very low compression speeds around level 5-6 without any big gains in compression. Since this type of data is fairly common, this does not seem like good behavior.
## Un-compressible Content
This is mainly a test of how good the algorithms are at detecting un-compressible input. The standard library only offers this feature with very conservative settings at level 1. Obviously there is no reason for the algorithms to try to compress input that cannot be compressed. The only downside is that it might skip some compressible data on false detections.
# linear time compression (huffman only)
This compression library adds a special compression level, named `HuffmanOnly`, which allows near linear time compression. This is done by completely disabling matching of previous data, and only reduce the number of bits to represent each character.
This means that often used characters, like 'e' and ' ' (space) in text use the fewest bits to represent, and rare characters like '¤' takes more bits to represent. For more information see [wikipedia](https://en.wikipedia.org/wiki/Huffman_coding) or this nice [video](https://youtu.be/ZdooBTdW5bM).
Since this type of compression has much less variance, the compression speed is mostly unaffected by the input data, and is usually more than *180MB/s* for a single core.
The downside is that the compression ratio is usually considerably worse than even the fastest conventional compression. The compression ratio can never be better than 8:1 (12.5%).
The linear time compression can be used as a "better than nothing" mode, where you cannot risk the encoder to slow down on some content. For comparison, the size of the "Twain" text is *233460 bytes* (+29% vs. level 1) and encode speed is 144MB/s (4.5x level 1). So in this case you trade a 30% size increase for a 4 times speedup.
For more information see my blog post on [Fast Linear Time Compression](http://blog.klauspost.com/constant-time-gzipzip-compression/).
This is implemented on Go 1.7 as "Huffman Only" mode, though not exposed for gzip.
# snappy package
The standard snappy package has now been improved. This repo contains a copy of the snappy repo.
I would advise to use the standard package: https://github.com/golang/snappy
# license
This code is licensed under the same conditions as the original Go code. See LICENSE file.

85
github.com/klauspost/compress/compressible.go

@ -0,0 +1,85 @@
package compress
import "math"
// Estimate returns a normalized compressibility estimate of block b.
// Values close to zero are likely uncompressible.
// Values above 0.1 are likely to be compressible.
// Values above 0.5 are very compressible.
// Very small lengths will return 0.
func Estimate(b []byte) float64 {
if len(b) < 16 {
return 0
}
// Correctly predicted order 1
hits := 0
lastMatch := false
var o1 [256]byte
var hist [256]int
c1 := byte(0)
for _, c := range b {
if c == o1[c1] {
// We only count a hit if there was two correct predictions in a row.
if lastMatch {
hits++
}
lastMatch = true
} else {
lastMatch = false
}
o1[c1] = c
c1 = c
hist[c]++
}
// Use x^0.6 to give better spread
prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)
// Calculate histogram distribution
variance := float64(0)
avg := float64(len(b)) / 256
for _, v := range hist {
Δ := float64(v) - avg
variance += Δ * Δ
}
stddev := math.Sqrt(float64(variance)) / float64(len(b))
exp := math.Sqrt(1 / float64(len(b)))
// Subtract expected stddev
stddev -= exp
if stddev < 0 {
stddev = 0
}
stddev *= 1 + exp
// Use x^0.4 to give better spread
entropy := math.Pow(stddev, 0.4)
// 50/50 weight between prediction and histogram distribution
return math.Pow((prediction+entropy)/2, 0.9)
}
// ShannonEntropyBits returns the number of bits minimum required to represent
// an entropy encoding of the input bytes.
// https://en.wiktionary.org/wiki/Shannon_entropy
func ShannonEntropyBits(b []byte) int {
if len(b) == 0 {
return 0
}
var hist [256]int
for _, c := range b {
hist[c]++
}
shannon := float64(0)
invTotal := 1.0 / float64(len(b))
for _, v := range hist[:] {
if v > 0 {
n := float64(v)
shannon += math.Ceil(-math.Log2(n*invTotal) * n)
}
}
return int(math.Ceil(shannon))
}

302
github.com/klauspost/compress/compressible_test.go

@ -0,0 +1,302 @@
package compress
import (
"crypto/rand"
"encoding/base32"
"io/ioutil"
"strconv"
"strings"
"testing"
"github.com/klauspost/compress/flate"
"github.com/klauspost/compress/gzip"
)
func BenchmarkEstimate(b *testing.B) {
b.ReportAllocs()
// (predictable, low entropy distibution)
b.Run("zeroes-5k", func(b *testing.B) {
var testData = make([]byte, 5000)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
Estimate(testData)
}
b.Log(Estimate(testData))
})
// (predictable, high entropy distibution)
b.Run("predictable-5k", func(b *testing.B) {
var testData = make([]byte, 5000)
for i := range testData {
testData[i] = byte(float64(i) / float64(len(testData)) * 256)
}
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
Estimate(testData)
}
b.Log(Estimate(testData))
})
// (not predictable, high entropy distibution)
b.Run("random-500b", func(b *testing.B) {
var testData = make([]byte, 500)
rand.Read(testData)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
Estimate(testData)
}
b.Log(Estimate(testData))
})
// (not predictable, high entropy distibution)
b.Run("random-5k", func(b *testing.B) {
var testData = make([]byte, 5000)
rand.Read(testData)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
Estimate(testData)
}
b.Log(Estimate(testData))
})
// (not predictable, high entropy distibution)
b.Run("random-50k", func(b *testing.B) {
var testData = make([]byte, 50000)
rand.Read(testData)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
Estimate(testData)
}
b.Log(Estimate(testData))
})
// (not predictable, high entropy distibution)
b.Run("random-500k", func(b *testing.B) {
var testData = make([]byte, 500000)
rand.Read(testData)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
Estimate(testData)
}
b.Log(Estimate(testData))
})
// (not predictable, medium entropy distibution)
b.Run("base-32-5k", func(b *testing.B) {
var testData = make([]byte, 5000)
rand.Read(testData)
s := base32.StdEncoding.EncodeToString(testData)
testData = []byte(s)
testData = testData[:5000]
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
Estimate(testData)
}
b.Log(Estimate(testData))
})
// (medium predictable, medium entropy distibution)
b.Run("text", func(b *testing.B) {
var testData = []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks.
This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process:
With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths.
As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic.
With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?).
Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks.
The attacker then compresses that chunk using the compression algorithm.
The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets.
IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk.
It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup.
AS always, a paranoid and highly unscientific stream of consciousness.
Thoughts?`)
testData = append(testData, testData...)
testData = append(testData, testData...)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
Estimate(testData)
}
b.Log(Estimate(testData))
})
}
func BenchmarkSnannonEntropyBits(b *testing.B) {
b.ReportAllocs()
// (predictable, low entropy distibution)
b.Run("zeroes-5k", func(b *testing.B) {
var testData = make([]byte, 5000)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
ShannonEntropyBits(testData)
}
b.Log(ShannonEntropyBits(testData))
})
// (predictable, high entropy distibution)
b.Run("predictable-5k", func(b *testing.B) {
var testData = make([]byte, 5000)
for i := range testData {
testData[i] = byte(float64(i) / float64(len(testData)) * 256)
}
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
ShannonEntropyBits(testData)
}
b.Log(ShannonEntropyBits(testData))
})
// (not predictable, high entropy distibution)
b.Run("random-500b", func(b *testing.B) {
var testData = make([]byte, 500)
rand.Read(testData)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
ShannonEntropyBits(testData)
}
b.Log(ShannonEntropyBits(testData))
})
// (not predictable, high entropy distibution)
b.Run("random-5k", func(b *testing.B) {
var testData = make([]byte, 5000)
rand.Read(testData)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
ShannonEntropyBits(testData)
}
b.Log(ShannonEntropyBits(testData))
})
// (not predictable, high entropy distibution)
b.Run("random-50k", func(b *testing.B) {
var testData = make([]byte, 50000)
rand.Read(testData)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
ShannonEntropyBits(testData)
}
b.Log(ShannonEntropyBits(testData))
})
// (not predictable, high entropy distibution)
b.Run("random-500k", func(b *testing.B) {
var testData = make([]byte, 500000)
rand.Read(testData)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
ShannonEntropyBits(testData)
}
b.Log(ShannonEntropyBits(testData))
})
// (not predictable, medium entropy distibution)
b.Run("base-32-5k", func(b *testing.B) {
var testData = make([]byte, 5000)
rand.Read(testData)
s := base32.StdEncoding.EncodeToString(testData)
testData = []byte(s)
testData = testData[:5000]
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
ShannonEntropyBits(testData)
}
b.Log(ShannonEntropyBits(testData))
})
// (medium predictable, medium entropy distibution)
b.Run("text", func(b *testing.B) {
var testData = []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks.
This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process:
With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths.
As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic.
With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?).
Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks.
The attacker then compresses that chunk using the compression algorithm.
The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets.
IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk.
It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup.
AS always, a paranoid and highly unscientific stream of consciousness.
Thoughts?`)
testData = append(testData, testData...)
testData = append(testData, testData...)
b.SetBytes(int64(len(testData)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
ShannonEntropyBits(testData)
}
b.Log(ShannonEntropyBits(testData))
})
}
func BenchmarkCompressAllocations(b *testing.B) {
payload := []byte(strings.Repeat("Tiny payload", 20))
for j := -2; j <= 9; j++ {
b.Run("level("+strconv.Itoa(j)+")", func(b *testing.B) {
b.Run("flate", func(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
w, err := flate.NewWriter(ioutil.Discard, j)
if err != nil {
b.Fatal(err)
}
w.Write(payload)
w.Close()
}
})
b.Run("gzip", func(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
w, err := gzip.NewWriterLevel(ioutil.Discard, j)
if err != nil {
b.Fatal(err)
}
w.Write(payload)
w.Close()
}
})
})
}
}
func BenchmarkCompressAllocationsSingle(b *testing.B) {
payload := []byte(strings.Repeat("Tiny payload", 20))
const level = 2
b.Run("flate", func(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
w, err := flate.NewWriter(ioutil.Discard, level)
if err != nil {
b.Fatal(err)
}
w.Write(payload)
w.Close()
}
})
b.Run("gzip", func(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
w, err := gzip.NewWriterLevel(ioutil.Discard, level)
if err != nil {
b.Fatal(err)
}
w.Write(payload)
w.Close()
}
})
}

821
github.com/klauspost/compress/flate/deflate.go

@ -0,0 +1,821 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Copyright (c) 2015 Klaus Post
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package flate
import (
"fmt"
"io"
"math"
)
const (
NoCompression = 0
BestSpeed = 1
BestCompression = 9
DefaultCompression = -1
// HuffmanOnly disables Lempel-Ziv match searching and only performs Huffman
// entropy encoding. This mode is useful in compressing data that has
// already been compressed with an LZ style algorithm (e.g. Snappy or LZ4)
// that lacks an entropy encoder. Compression gains are achieved when
// certain bytes in the input stream occur more frequently than others.
//
// Note that HuffmanOnly produces a compressed output that is
// RFC 1951 compliant. That is, any valid DEFLATE decompressor will
// continue to be able to decompress this output.
HuffmanOnly = -2
ConstantCompression = HuffmanOnly // compatibility alias.
logWindowSize = 15
windowSize = 1 << logWindowSize
windowMask = windowSize - 1
logMaxOffsetSize = 15 // Standard DEFLATE
minMatchLength = 4 // The smallest match that the compressor looks for
maxMatchLength = 258 // The longest match for the compressor
minOffsetSize = 1 // The shortest offset that makes any sense
// The maximum number of tokens we put into a single flat block, just too
// stop things from getting too large.
maxFlateBlockTokens = 1 << 14
maxStoreBlockSize = 65535
hashBits = 17 // After 17 performance degrades
hashSize = 1 << hashBits
hashMask = (1 << hashBits) - 1
hashShift = (hashBits + minMatchLength - 1) / minMatchLength
maxHashOffset = 1 << 24
skipNever = math.MaxInt32
debugDeflate = false
)
type compressionLevel struct {
good, lazy, nice, chain, fastSkipHashing, level int
}
// Compression levels have been rebalanced from zlib deflate defaults
// to give a bigger spread in speed and compression.
// See https://blog.klauspost.com/rebalancing-deflate-compression-levels/
var levels = []compressionLevel{
{}, // 0
// Level 1-6 uses specialized algorithm - values not used
{0, 0, 0, 0, 0, 1},
{0, 0, 0, 0, 0, 2},
{0, 0, 0, 0, 0, 3},
{0, 0, 0, 0, 0, 4},
{0, 0, 0, 0, 0, 5},
{0, 0, 0, 0, 0, 6},
// Levels 7-9 use increasingly more lazy matching
// and increasingly stringent conditions for "good enough".
{8, 8, 24, 16, skipNever, 7},
{10, 16, 24, 64, skipNever, 8},
{32, 258, 258, 4096, skipNever, 9},
}
// advancedState contains state for the advanced levels, with bigger hash tables, etc.
type advancedState struct {
// deflate state
length int
offset int
maxInsertIndex int
// Input hash chains
// hashHead[hashValue] contains the largest inputIndex with the specified hash value
// If hashHead[hashValue] is within the current window, then
// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
// with the same hash value.
chainHead int
hashHead [hashSize]uint32
hashPrev [windowSize]uint32
hashOffset int
// input window: unprocessed data is window[index:windowEnd]
index int
hashMatch [maxMatchLength + minMatchLength]uint32
hash uint32
ii uint16 // position of last match, intended to overflow to reset.
}
type compressor struct {
compressionLevel
w *huffmanBitWriter
// compression algorithm
fill func(*compressor, []byte) int // copy data to window
step func(*compressor) // process window
window []byte
windowEnd int
blockStart int // window index where current tokens start
err error
// queued output tokens
tokens tokens
fast fastEnc
state *advancedState
sync bool // requesting flush
byteAvailable bool // if true, still need to process window[index-1].
}
func (d *compressor) fillDeflate(b []byte) int {
s := d.state
if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) {
// shift the window by windowSize
copy(d.window[:], d.window[windowSize:2*windowSize])
s.index -= windowSize
d.windowEnd -= windowSize
if d.blockStart >= windowSize {
d.blockStart -= windowSize
} else {
d.blockStart = math.MaxInt32
}
s.hashOffset += windowSize
if s.hashOffset > maxHashOffset {
delta := s.hashOffset - 1
s.hashOffset -= delta
s.chainHead -= delta
// Iterate over slices instead of arrays to avoid copying
// the entire table onto the stack (Issue #18625).
for i, v := range s.hashPrev[:] {
if int(v) > delta {
s.hashPrev[i] = uint32(int(v) - delta)
} else {
s.hashPrev[i] = 0
}
}
for i, v := range s.hashHead[:] {
if int(v) > delta {
s.hashHead[i] = uint32(int(v) - delta)
} else {
s.hashHead[i] = 0
}
}
}
}
n := copy(d.window[d.windowEnd:], b)
d.windowEnd += n
return n
}
func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error {
if index > 0 || eof {
var window []byte
if d.blockStart <= index {
window = d.window[d.blockStart:index]
}
d.blockStart = index
d.w.writeBlock(tok, eof, window)
return d.w.err
}
return nil
}
// writeBlockSkip writes the current block and uses the number of tokens
// to determine if the block should be stored on no matches, or
// only huffman encoded.
func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error {
if index > 0 || eof {
if d.blockStart <= index {
window := d.window[d.blockStart:index]
// If we removed less than a 64th of all literals
// we huffman compress the block.
if int(tok.n) > len(window)-int(tok.n>>6) {
d.w.writeBlockHuff(eof, window, d.sync)
} else {
// Write a dynamic huffman block.
d.w.writeBlockDynamic(tok, eof, window, d.sync)
}
} else {
d.w.writeBlock(tok, eof, nil)
}
d.blockStart = index
return d.w.err
}
return nil
}
// fillWindow will fill the current window with the supplied
// dictionary and calculate all hashes.
// This is much faster than doing a full encode.
// Should only be used after a start/reset.
func (d *compressor) fillWindow(b []byte) {
// Do not fill window if we are in store-only or huffman mode.
if d.level <= 0 {
return
}
if d.fast != nil {
// encode the last data, but discard the result
if len(b) > maxMatchOffset {
b = b[len(b)-maxMatchOffset:]
}
d.fast.Encode(&d.tokens, b)
d.tokens.Reset()
return
}
s := d.state
// If we are given too much, cut it.
if len(b) > windowSize {
b = b[len(b)-windowSize:]
}
// Add all to window.
n := copy(d.window[d.windowEnd:], b)
// Calculate 256 hashes at the time (more L1 cache hits)
loops := (n + 256 - minMatchLength) / 256
for j := 0; j < loops; j++ {
startindex := j * 256
end := startindex + 256 + minMatchLength - 1
if end > n {
end = n
}
tocheck := d.window[startindex:end]
dstSize := len(tocheck) - minMatchLength + 1
if dstSize <= 0 {
continue
}
dst := s.hashMatch[:dstSize]
bulkHash4(tocheck, dst)
var newH uint32
for i, val := range dst {
di := i + startindex
newH = val & hashMask
// Get previous value with the same hash.
// Our chain should point to the previous value.
s.hashPrev[di&windowMask] = s.hashHead[newH]
// Set the head of the hash chain to us.
s.hashHead[newH] = uint32(di + s.hashOffset)
}
s.hash = newH
}
// Update window information.
d.windowEnd += n
s.index = n
}
// Try to find a match starting at index whose length is greater than prevSize.
// We only look at chainCount possibilities before giving up.
// pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength=minMatchLength-1, lookahead
func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
minMatchLook := maxMatchLength
if lookahead < minMatchLook {
minMatchLook = lookahead
}
win := d.window[0 : pos+minMatchLook]
// We quit when we get a match that's at least nice long
nice := len(win) - pos
if d.nice < nice {
nice = d.nice
}
// If we've got a match that's good enough, only look in 1/4 the chain.
tries := d.chain
length = prevLength
if length >= d.good {
tries >>= 2
}
wEnd := win[pos+length]
wPos := win[pos:]
minIndex := pos - windowSize
for i := prevHead; tries > 0; tries-- {
if wEnd == win[i+length] {
n := matchLen(win[i:i+minMatchLook], wPos)
if n > length && (n > minMatchLength || pos-i <= 4096) {
length = n
offset = pos - i
ok = true
if n >= nice {
// The match is good enough that we don't try to find a better one.
break
}
wEnd = win[pos+n]
}
}
if i == minIndex {
// hashPrev[i & windowMask] has already been overwritten, so stop now.
break
}
i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
if i < minIndex || i < 0 {
break
}
}
return
}
func (d *compressor) writeStoredBlock(buf []byte) error {
if d.w.writeStoredHeader(len(buf), false); d.w.err != nil {
return d.w.err
}
d.w.writeBytes(buf)
return d.w.err
}
// hash4 returns a hash representation of the first 4 bytes
// of the supplied slice.
// The caller must ensure that len(b) >= 4.
func hash4(b []byte) uint32 {
b = b[:4]
return hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits)
}
// bulkHash4 will compute hashes using the same
// algorithm as hash4
func bulkHash4(b []byte, dst []uint32) {
if len(b) < 4 {
return
}
hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
dst[0] = hash4u(hb, hashBits)
end := len(b) - 4 + 1
for i := 1; i < end; i++ {
hb = (hb << 8) | uint32(b[i+3])
dst[i] = hash4u(hb, hashBits)
}
}
func (d *compressor) initDeflate() {
d.window = make([]byte, 2*windowSize)
d.byteAvailable = false
d.err = nil
if d.state == nil {
return
}
s := d.state
s.index = 0
s.hashOffset = 1
s.length = minMatchLength - 1
s.offset = 0
s.hash = 0
s.chainHead = -1
}
// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever,
// meaning it always has lazy matching on.
func (d *compressor) deflateLazy() {
s := d.state
// Sanity enables additional runtime tests.
// It's intended to be used during development
// to supplement the currently ad-hoc unit tests.
const sanity = debugDeflate
if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync {
return
}
s.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
if s.index < s.maxInsertIndex {
s.hash = hash4(d.window[s.index : s.index+minMatchLength])
}
for {
if sanity && s.index > d.windowEnd {
panic("index > windowEnd")
}
lookahead := d.windowEnd - s.index
if lookahead < minMatchLength+maxMatchLength {
if !d.sync {
return
}
if sanity && s.index > d.windowEnd {
panic("index > windowEnd")
}
if lookahead == 0 {
// Flush current output block if any.
if d.byteAvailable {
// There is still one pending token that needs to be flushed
d.tokens.AddLiteral(d.window[s.index-1])
d.byteAvailable = false
}
if d.tokens.n > 0 {
if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
return
}
d.tokens.Reset()
}
return
}
}
if s.index < s.maxInsertIndex {
// Update the hash
s.hash = hash4(d.window[s.index : s.index+minMatchLength])
ch := s.hashHead[s.hash&hashMask]
s.chainHead = int(ch)
s.hashPrev[s.index&windowMask] = ch
s.hashHead[s.hash&hashMask] = uint32(s.index + s.hashOffset)
}
prevLength := s.length
prevOffset := s.offset
s.length = minMatchLength - 1
s.offset = 0
minIndex := s.index - windowSize
if minIndex < 0 {
minIndex = 0
}
if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok {
s.length = newLength
s.offset = newOffset
}
}
if prevLength >= minMatchLength && s.length <= prevLength {
// There was a match at the previous step, and the current match is
// not better. Output the previous match.
d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
// Insert in the hash table all strings up to the end of the match.
// index and index-1 are already inserted. If there is not enough
// lookahead, the last two strings are not inserted into the hash
// table.
var newIndex int
newIndex = s.index + prevLength - 1
// Calculate missing hashes
end := newIndex
if end > s.maxInsertIndex {
end = s.maxInsertIndex
}
end += minMatchLength - 1
startindex := s.index + 1
if startindex > s.maxInsertIndex {
startindex = s.maxInsertIndex
}
tocheck := d.window[startindex:end]
dstSize := len(tocheck) - minMatchLength + 1
if dstSize > 0 {
dst := s.hashMatch[:dstSize]
bulkHash4(tocheck, dst)
var newH uint32
for i, val := range dst {
di := i + startindex
newH = val & hashMask
// Get previous value with the same hash.
// Our chain should point to the previous value.
s.hashPrev[di&windowMask] = s.hashHead[newH]
// Set the head of the hash chain to us.
s.hashHead[newH] = uint32(di + s.hashOffset)
}
s.hash = newH
}
s.index = newIndex
d.byteAvailable = false
s.length = minMatchLength - 1
if d.tokens.n == maxFlateBlockTokens {
// The block includes the current character
if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
return
}
d.tokens.Reset()
}
} else {
// Reset, if we got a match this run.
if s.length >= minMatchLength {
s.ii = 0
}
// We have a byte waiting. Emit it.
if d.byteAvailable {
s.ii++
d.tokens.AddLiteral(d.window[s.index-1])
if d.tokens.n == maxFlateBlockTokens {
if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
return
}
d.tokens.Reset()
}
s.index++
// If we have a long run of no matches, skip additional bytes
// Resets when s.ii overflows after 64KB.
if s.ii > 31 {
n := int(s.ii >> 5)
for j := 0; j < n; j++ {
if s.index >= d.windowEnd-1 {
break
}
d.tokens.AddLiteral(d.window[s.index-1])
if d.tokens.n == maxFlateBlockTokens {
if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
return
}
d.tokens.Reset()
}
s.index++
}
// Flush last byte
d.tokens.AddLiteral(d.window[s.index-1])
d.byteAvailable = false
// s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength
if d.tokens.n == maxFlateBlockTokens {
if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
return
}
d.tokens.Reset()
}
}
} else {
s.index++
d.byteAvailable = true
}
}
}
}
func (d *compressor) store() {
if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) {
d.err = d.writeStoredBlock(d.window[:d.windowEnd])
d.windowEnd = 0
}
}
// fillWindow will fill the buffer with data for huffman-only compression.
// The number of bytes copied is returned.
func (d *compressor) fillBlock(b []byte) int {
n := copy(d.window[d.windowEnd:], b)
d.windowEnd += n
return n
}
// storeHuff will compress and store the currently added data,
// if enough has been accumulated or we at the end of the stream.
// Any error that occurred will be in d.err
func (d *compressor) storeHuff() {
if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 {
return
}
d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync)
d.err = d.w.err
d.windowEnd = 0
}
// storeFast will compress and store the currently added data,
// if enough has been accumulated or we at the end of the stream.
// Any error that occurred will be in d.err
func (d *compressor) storeFast() {
// We only compress if we have maxStoreBlockSize.
if d.windowEnd < len(d.window) {
if !d.sync {
return
}
// Handle extremely small sizes.
if d.windowEnd < 128 {
if d.windowEnd == 0 {
return
}
if d.windowEnd <= 32 {
d.err = d.writeStoredBlock(d.window[:d.windowEnd])
} else {
d.w.writeBlockHuff(false, d.window[:d.windowEnd], true)
d.err = d.w.err
}
d.tokens.Reset()
d.windowEnd = 0
d.fast.Reset()
return
}
}
d.fast.Encode(&d.tokens, d.window[:d.windowEnd])
// If we made zero matches, store the block as is.
if d.tokens.n == 0 {
d.err = d.writeStoredBlock(d.window[:d.windowEnd])
// If we removed less than 1/16th, huffman compress the block.
} else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) {
d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync)
d.err = d.w.err
} else {
d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync)
d.err = d.w.err
}
d.tokens.Reset()
d.windowEnd = 0
}
// write will add input byte to the stream.
// Unless an error occurs all bytes will be consumed.
func (d *compressor) write(b []byte) (n int, err error) {
if d.err != nil {
return 0, d.err
}
n = len(b)
for len(b) > 0 {
d.step(d)
b = b[d.fill(d, b):]
if d.err != nil {
return 0, d.err
}
}
return n, d.err
}
func (d *compressor) syncFlush() error {
d.sync = true
if d.err != nil {
return d.err
}
d.step(d)
if d.err == nil {
d.w.writeStoredHeader(0, false)
d.w.flush()
d.err = d.w.err
}
d.sync = false
return d.err
}
func (d *compressor) init(w io.Writer, level int) (err error) {
d.w = newHuffmanBitWriter(w)
switch {
case level == NoCompression:
d.window = make([]byte, maxStoreBlockSize)
d.fill = (*compressor).fillBlock
d.step = (*compressor).store
case level == ConstantCompression:
d.w.logNewTablePenalty = 4
d.window = make([]byte, maxStoreBlockSize)
d.fill = (*compressor).fillBlock
d.step = (*compressor).storeHuff
case level == DefaultCompression: