Vendor dependencies for 0.3.0 release

This commit is contained in:
2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

1
vendor/ruzstd/.cargo-checksum.json vendored Normal file

File diff suppressed because one or more lines are too long

759
vendor/ruzstd/Cargo.lock generated vendored Normal file
View File

@@ -0,0 +1,759 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "anes"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
[[package]]
name = "anstyle"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc"
[[package]]
name = "autocfg"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80"
[[package]]
name = "bumpalo"
version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
[[package]]
name = "cast"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
[[package]]
name = "cc"
version = "1.1.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945"
dependencies = [
"jobserver",
"libc",
"shlex",
]
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "ciborium"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
dependencies = [
"ciborium-io",
"ciborium-ll",
"serde",
]
[[package]]
name = "ciborium-io"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
[[package]]
name = "ciborium-ll"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
dependencies = [
"ciborium-io",
"half",
]
[[package]]
name = "clap"
version = "4.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0"
dependencies = [
"clap_builder",
]
[[package]]
name = "clap_builder"
version = "4.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4"
dependencies = [
"anstyle",
"clap_lex",
]
[[package]]
name = "clap_lex"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce"
[[package]]
name = "compiler_builtins"
version = "0.1.126"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "758019257ad46e191b587d8f711022a6ac1d1fb6745d75e1d76c587fdcbca770"
[[package]]
name = "criterion"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
dependencies = [
"anes",
"cast",
"ciborium",
"clap",
"criterion-plot",
"is-terminal",
"itertools",
"num-traits",
"once_cell",
"oorandom",
"plotters",
"rayon",
"regex",
"serde",
"serde_derive",
"serde_json",
"tinytemplate",
"walkdir",
]
[[package]]
name = "criterion-plot"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
dependencies = [
"cast",
"itertools",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
[[package]]
name = "crunchy"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
[[package]]
name = "either"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2"
[[package]]
name = "getrandom"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "half"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
dependencies = [
"cfg-if",
"crunchy",
]
[[package]]
name = "hermit-abi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024"
[[package]]
name = "is-terminal"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b"
dependencies = [
"hermit-abi",
"libc",
"windows-sys",
]
[[package]]
name = "itertools"
version = "0.10.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "jobserver"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0"
dependencies = [
"libc",
]
[[package]]
name = "js-sys"
version = "0.3.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d"
dependencies = [
"wasm-bindgen",
]
[[package]]
name = "libc"
version = "0.2.153"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd"
[[package]]
name = "log"
version = "0.4.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"
[[package]]
name = "memchr"
version = "2.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d"
[[package]]
name = "num-traits"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "oorandom"
version = "11.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575"
[[package]]
name = "pkg-config"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
[[package]]
name = "plotters"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2c224ba00d7cadd4d5c660deaf2098e5e80e07846537c51f9cfa4be50c1fd45"
dependencies = [
"num-traits",
"plotters-backend",
"plotters-svg",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "plotters-backend"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e76628b4d3a7581389a35d5b6e2139607ad7c75b17aed325f210aa91f4a9609"
[[package]]
name = "plotters-svg"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38f6d39893cca0701371e3c27294f09797214b86f1fb951b89ade8ec04e2abab"
dependencies = [
"plotters-backend",
]
[[package]]
name = "ppv-lite86"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
[[package]]
name = "proc-macro2"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
]
[[package]]
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "regex"
version = "1.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56"
[[package]]
name = "rustc-std-workspace-alloc"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff66d57013a5686e1917ed6a025d54dd591fcda71a41fe07edf4d16726aefa86"
[[package]]
name = "rustc-std-workspace-core"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1956f5517128a2b6f23ab2dadf1a976f4f5b27962e7724c2bf3d45e539ec098c"
[[package]]
name = "ruzstd"
version = "0.8.1"
dependencies = [
"compiler_builtins",
"criterion",
"rand",
"rustc-std-workspace-alloc",
"rustc-std-workspace-core",
"twox-hash",
"zstd",
]
[[package]]
name = "ryu"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]]
name = "serde"
version = "1.0.197"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.197"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.115"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd"
dependencies = [
"itoa",
"ryu",
"serde",
]
[[package]]
name = "shlex"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "syn"
version = "2.0.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tinytemplate"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "twox-hash"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908"
[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "walkdir"
version = "2.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
dependencies = [
"same-file",
"winapi-util",
]
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasm-bindgen"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8"
dependencies = [
"cfg-if",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da"
dependencies = [
"bumpalo",
"log",
"once_cell",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96"
[[package]]
name = "web-sys"
version = "0.3.69"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596"
dependencies = [
"winapi",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6"
[[package]]
name = "windows_i686_gnu"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9"
[[package]]
name = "windows_i686_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0"
[[package]]
name = "zstd"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9"
dependencies = [
"zstd-safe",
]
[[package]]
name = "zstd-safe"
version = "7.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059"
dependencies = [
"zstd-sys",
]
[[package]]
name = "zstd-sys"
version = "2.0.13+zstd.1.5.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa"
dependencies = [
"cc",
"pkg-config",
]

100
vendor/ruzstd/Cargo.toml vendored Normal file
View File

@@ -0,0 +1,100 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "ruzstd"
version = "0.8.1"
authors = ["Moritz Borcherding <moritz.borcherding@web.de>"]
build = false
exclude = [
"decodecorpus_files/*",
"dict_tests/*",
"fuzz_decodecorpus/*",
]
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "A decoder for the zstd compression format"
homepage = "https://github.com/KillingSpark/zstd-rs"
readme = "Readme.md"
keywords = [
"zstd",
"zstandard",
"decompression",
]
categories = ["compression"]
license = "MIT"
repository = "https://github.com/KillingSpark/zstd-rs"
[lib]
name = "ruzstd"
path = "src/lib.rs"
[[bin]]
name = "zstd"
path = "src/bin/zstd.rs"
required-features = ["std"]
[[bin]]
name = "zstd_stream"
path = "src/bin/zstd_stream.rs"
required-features = ["std"]
[[bench]]
name = "decode_all"
path = "benches/decode_all.rs"
harness = false
[dependencies.alloc]
version = "1.0.0"
optional = true
package = "rustc-std-workspace-alloc"
[dependencies.compiler_builtins]
version = "0.1.2"
optional = true
[dependencies.core]
version = "1.0.0"
optional = true
package = "rustc-std-workspace-core"
[dependencies.twox-hash]
version = "2.0"
features = ["xxhash64"]
optional = true
default-features = false
[dev-dependencies.criterion]
version = "0.5"
[dev-dependencies.rand]
version = "0.8.5"
features = ["small_rng"]
[dev-dependencies.zstd]
version = "0.13.2"
[features]
default = [
"hash",
"std",
]
fuzz_exports = []
hash = ["dep:twox-hash"]
rustc-dep-of-std = [
"dep:compiler_builtins",
"dep:core",
"dep:alloc",
]
std = []

40
vendor/ruzstd/Changelog.md vendored Normal file
View File

@@ -0,0 +1,40 @@
# Changelog
This document records the changes made between versions, starting with version 0.5.0
# After 0.8.0 (Current)
* The compressor now includes a `content_checksum` when the `hash` feature is enabled
# After 0.7.3
* Add initial compression support
* **Breaking** Refactor modules to reflect that this is now also a compression library
# After 0.7.2
* Soundness fix in decoding::RingBuffer. The lengths of the different regions were sometimes calculated wrongly, resulting in reads of heap memory not belonging to that ringbuffer
* Fixed by https://github.com/paolobarbolini
* Affected versions: 0.7.0 up to and including 0.7.2
* Added convenience functions to FrameDecoder to decode multiple frames from a buffer (https://github.com/philipc)
# After 0.7.1
* Remove byteorder dependency (https://github.com/workingjubilee)
* Preparations to become a std dependency (https://github.com/workingjubilee)
# After 0.7.0
* Fix for drain_to functions into limited targets (https://github.com/michaelkirk)
# After 0.6.0
* Small fix in the zstd binary, progress tracking was slightly off for skippable frames, resulting in an error only when the last frame in a file was skippable
* Small performance improvement by reorganizing code with `#[cold]` annotations
* Documentation for `StreamDecoder` mentioning the limitations around multiple frames (https://github.com/Sorseg)
* Documentation around skippable frames (https://github.com/Sorseg)
* **Breaking** `StreamDecoder` API changes to get access to the inner parts (https://github.com/ifd3f)
* Big internal documentation contribution (https://github.com/zleyyij)
* Dropped derive_more as a dependency (https://github.com/xd009642)
* Small improvement by removing the error cases from the reverse bitreader (and making sure invalid requests can't even happen)
# After 0.5.0
* Make the hashing checksum optional (thanks to [@tamird](https://github.com/tamird))
* breaking change as the public API changes based on features
* The FrameDecoder is now Send + Sync (RingBuffer impls these traits now)

21
vendor/ruzstd/LICENSE vendored Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 Moritz Borcherding
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

124
vendor/ruzstd/Readme.md vendored Normal file
View File

@@ -0,0 +1,124 @@
# Ruzstd (a pure rust zstd format implementation)
[![Released API docs](https://docs.rs/ruzstd/badge.svg)](https://docs.rs/ruzstd)
[![CI](https://github.com/killingspark/zstd-rs/workflows/CI/badge.svg)](https://github.com/killingspark/zstd-rs/actions?query=workflow%3ACI)
# What is this
A pure Rust implementation of the Zstandard compression format, as defined in [RFC8878](https://www.rfc-editor.org/rfc/rfc8878.pdf).
This crate contains a fully operational implementation of the decompression portion of the standard.
It also provides a compressor which is usable, but it does not yet reach the speed, ratio or configurability of the original zstd library.
This crate is currently actively maintained.
# Current Status
Feature complete on the decoder side.
On the compression side:
- Support for generating compressed blocks at any compression level
- [x] Uncompressed
- [x] Fastest (roughly level 1)
- [ ] Default (roughly level 3)
- [ ] Better (roughly level 7)
- [ ] Best (roughly level 11)
- [ ] Checksums
- [ ] Dictionaries
## Speed
In terms of speed this library is behind the original C implementation which has a rust binding located [here](https://github.com/gyscos/zstd-rs).
Measuring with the 'time' utility the original zstd and my decoder both decoding the same enwik9.zst file from a ramfs, my decoder is about 3.5 times slower. Enwik9 is highly compressible, for less compressible data (like a ubuntu installation .iso) my decoder comes close to only being 1.4 times slower.
# How can you use it?
## Compression
The easiest is to use the provided `compress`/`compress_to_vec` functions
```rust, no_run
use ruzstd::encoding::{compress, compress_to_vec, CompressionLevel};
let data: &[u8] = todo!();
// Either
let mut compressed = Vec::new();
compress(data, &mut compressed, CompressionLevel::Fastest);
// or
let compressed = compress_to_vec(data, CompressionLevel::Fastest);
```
Or you can use the `FrameCompressor` manually to compress data. This allows you to process encoded data while it is being encoded instead of collecting into a big vector.
## Decompression
Additionally to the descriptions and the docs you can have a look at the zstd / zstd_streaming binaries. They showcase how this library can be used.
### Easy
The easiest is to wrap the io::Read into a StreamingDecoder which itself implements io::Read. It will decode blocks as necessary to fulfill the read requests
```rust, no_run
use ruzstd::decoding::StreamingDecoder;
use ruzstd::io::Read;
let mut source: &[u8] = todo!("Get a reader from a File or any other source");
let mut decoder = StreamingDecoder::new(&mut source).unwrap();
let mut result = Vec::new();
decoder.read_to_end(&mut result).unwrap();
```
This might be a problem if you are accepting user provided data. Frames can be REALLY big when decoded. If this is the case you should either check how big the frame
actually is or use the memory efficient approach described below.
### Memory efficient
If memory is a concern you can decode frames partially. There are two ways to do this:
#### Streaming decoder
Use the StreamingDecoder and use a while loop to fill your buffer (see src/bin/zstd_stream.rs for an example). This is the
recommended approach.
#### Use the lower level FrameDecoder
For an example see the src/bin/zstd.rs file. Basically you can decode the frame until either a
given block count has been decoded or the decodebuffer has reached a certain size. Then you can collect no longer needed bytes from the buffer and do something with them, discard them and resume decoding the frame in a loop until the frame has been decoded completely.
## Roadmap
1. More Performance optimizations
1. sequence_decoding and reverse_bitreader::get_bits. Those account for about 50% of the whole time used in decoding
2. Matching suffixes. This accounts for >60% of the whole time used in encoding
2. Implement encoder features
1. More levels
2. Dictionaries
3. Checksums
## Testing
Tests take two forms.
1. Tests using well-formed files that have to decode correctly and are checked against their originals
1. Tests using malformed input that have been generated by the fuzzer. These don't have to decode (they are garbage) but they must not make the decoder panic
## Fuzzing
Fuzzing has been done on
1. Random input with no initial corpus
2. The \*.zst in /fuzz_decodecorpus
### You want to help fuzz?
Use `cargo +nightly fuzz run decode` or some other fuzz target to run the fuzzer. It is seeded with files created with decodecorpus.
If the fuzzer finds a crash it will be saved to the artifacts dir by the fuzzer. Run `cargo test artifacts` to run the artifacts tests.
This will tell you where the decoder panics exactly. If you are able to fix the issue please feel free to do a pull request. If not please still submit the offending input and I will see how to fix it myself.
# Contributing
Contributions will be published under the same MIT license as this project. Please make an entry in the Changelog.md file when you make a PR.

17
vendor/ruzstd/benches/decode_all.rs vendored Normal file
View File

@@ -0,0 +1,17 @@
use criterion::{criterion_group, criterion_main, Criterion};
use ruzstd::decoding::FrameDecoder;

/// Benchmarks `FrameDecoder::decode_all` on a corpus frame held entirely in
/// memory, writing into a preallocated output buffer so the measurement
/// reflects decoding work rather than allocation.
fn criterion_benchmark(c: &mut Criterion) {
    let mut fr = FrameDecoder::new();
    // Own the 200 MiB output buffer directly instead of binding a `&mut` to a
    // temporary (`let mut x = &mut vec![...]`), which clippy flags and which
    // only works via temporary-lifetime extension.
    let mut target = vec![0u8; 1024 * 1024 * 200];
    let src = include_bytes!("../decodecorpus_files/z000033.zst");
    c.bench_function("decode_all_slice", |b| {
        b.iter(|| {
            // Reborrow the buffer each iteration; decode_all overwrites it.
            fr.decode_all(src, &mut target).unwrap();
        })
    });
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

34
vendor/ruzstd/optimizations.md vendored Normal file
View File

@@ -0,0 +1,34 @@
# Optimizations
This document tracks which optimizations have been done after the initial implementation passed corpus tests and a good amount of fuzzing.
## Introducing more unsafe code:
These optimizations introduced more unsafe code. These should yield significant improvements, or else they are not really worth it.
### Optimizing bitreader with byteorder which uses ptr::copy_nonoverlapping
* Reverse bitreader_reversed::get_bits was identified by linux perf tool using about 36% of the whole time
* Benchmark: decode enwik9
* Before: about 14.7 seconds
* After: about 12.2 seconds with about 25% of the time used for get_bits()
### Optimizing decodebuffer::repeat with ptr::copy_nonoverlapping
* decodebuffer::repeat was identified by linux perf tool using about 28% of the whole time
* Benchmark: decode enwik9
* Before: about 9.9 seconds
* After: about 9.4 seconds
### Use custom ringbuffer in the decodebuffer
The decode buffer must be able to do two things efficiently
* Collect bytes from the front
* Copy bytes from the contents to the end
The stdlib's VecDeque and Vec can each do one but not the other efficiently. So a custom implementation of a ringbuffer was written.
## Introducing NO additional unsafe code
These are just nice to have
### Even better bitreaders
Studying this material led to a big improvement in bitreader speed
* https://fgiesen.wordpress.com/2018/02/19/reading-bits-in-far-too-many-ways-part-1/
* https://fgiesen.wordpress.com/2018/02/20/reading-bits-in-far-too-many-ways-part-2/

221
vendor/ruzstd/src/bin/zstd.rs vendored Normal file
View File

@@ -0,0 +1,221 @@
extern crate ruzstd;
use std::fs::File;
use std::io::BufReader;
use std::io::Read;
use std::io::Seek;
use std::io::SeekFrom;
use std::io::Write;
use std::time::Instant;
use ruzstd::decoding::errors::FrameDecoderError;
use ruzstd::decoding::errors::ReadFrameHeaderError;
use ruzstd::encoding::CompressionLevel;
use ruzstd::encoding::FrameCompressor;
/// Aggregated progress and statistics for one decompression run of a single file.
struct StateTracker {
    /// Total number of decompressed bytes emitted so far.
    bytes_used: u64,
    /// Number of (non-skippable) frames decoded so far.
    frames_used: usize,
    /// Frames whose stored checksum matched the one calculated while decoding.
    valid_checksums: usize,
    /// Frames whose stored checksum did NOT match the calculated one.
    invalid_checksums: usize,
    /// Current read position in the source file, in bytes.
    file_pos: u64,
    /// Total size of the source file, in bytes.
    file_size: u64,
    /// Last whole-number percentage printed; starts at -1 so 0% prints too.
    old_percentage: i8,
}
/// Decompress each file in `file_paths` to stdout.
///
/// Only the flag combination `-d -c` is accepted; any other combination
/// prints a usage hint to stderr and returns without decoding. Skippable
/// frames are seeked over; per-file statistics go to stderr.
fn decompress(flags: &[String], file_paths: &[String]) {
    if !flags.contains(&"-d".to_owned()) {
        eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag");
        return;
    }
    if !flags.contains(&"-c".to_owned()) {
        eprintln!("This zstd implementation only supports output on the stdout. Please add a \"-c\" flag and pipe the output into a file");
        return;
    }
    if flags.len() != 2 {
        eprintln!(
            "No flags other than -d and -c are currently implemented. Flags used: {:?}",
            flags
        );
        return;
    }
    // The decoder is reused across frames and files.
    let mut frame_dec = ruzstd::decoding::FrameDecoder::new();
    for path in file_paths {
        eprintln!("File: {}", path);
        let mut f = File::open(path).unwrap();
        let mut tracker = StateTracker {
            bytes_used: 0,
            frames_used: 0,
            valid_checksums: 0,
            invalid_checksums: 0,
            file_size: f.metadata().unwrap().len(),
            file_pos: 0,
            old_percentage: -1,
        };
        // Decode and flush output in chunks of at most 10 MiB.
        let batch_size = 1024 * 1024 * 10;
        let mut result = vec![0; batch_size];
        // A file may contain multiple concatenated frames; keep resetting the
        // decoder until the whole file has been consumed.
        while tracker.file_pos < tracker.file_size {
            match frame_dec.reset(&mut f) {
                Err(FrameDecoderError::ReadFrameHeaderError(ReadFrameHeaderError::SkipFrame {
                    magic_number: magic_num,
                    length: skip_size,
                })) => {
                    eprintln!("Found a skippable frame with magic number: {magic_num} and size: {skip_size}");
                    tracker.file_pos = f.stream_position().unwrap();
                    tracker.file_pos += skip_size as u64;
                    // Skippable frames carry no decodable data; jump over the payload.
                    f.seek(SeekFrom::Current(skip_size as i64)).unwrap();
                    continue;
                }
                // Any other error is fatal; a successful reset yields Ok.
                other => other.unwrap(),
            }
            tracker.frames_used += 1;
            while !frame_dec.is_finished() {
                frame_dec
                    .decode_blocks(
                        &mut f,
                        ruzstd::decoding::BlockDecodingStrategy::UptoBytes(batch_size),
                    )
                    .unwrap();
                // Only flush once a full batch is buffered, to amortize writes.
                if frame_dec.can_collect() > batch_size {
                    let x = frame_dec.read(result.as_mut_slice()).unwrap();
                    tracker.file_pos = f.stream_position().unwrap();
                    do_something(&result[..x], &mut tracker);
                }
            }
            // handle the last chunk of data
            while frame_dec.can_collect() > 0 {
                let x = frame_dec.read(result.as_mut_slice()).unwrap();
                tracker.file_pos = f.stream_position().unwrap();
                do_something(&result[..x], &mut tracker);
            }
            // Verify the frame checksum if the frame carried one (only when the
            // "hash" feature compiled the checksum machinery in).
            #[cfg(feature = "hash")]
            if let Some(chksum) = frame_dec.get_checksum_from_data() {
                if frame_dec.get_calculated_checksum().unwrap() != chksum {
                    tracker.invalid_checksums += 1;
                    eprintln!(
                        "Checksum did not match in frame {}! From data: {}, calculated while decoding: {}",
                        tracker.frames_used,
                        chksum,
                        frame_dec.get_calculated_checksum().unwrap()
                    );
                } else {
                    tracker.valid_checksums += 1;
                }
            }
        }
        eprintln!(
            "\nDecoded frames: {} bytes: {}",
            tracker.frames_used, tracker.bytes_used
        );
        if tracker.valid_checksums == 0 && tracker.invalid_checksums == 0 {
            eprintln!("No checksums to test");
        } else {
            eprintln!(
                "{} of {} checksums are ok!",
                tracker.valid_checksums,
                tracker.valid_checksums + tracker.invalid_checksums,
            );
        }
    }
}
/// A [`Read`] adapter that forwards reads to an inner reader while printing
/// whole-number progress percentages (bytes read out of `total`) to stderr.
struct PercentPrintReader<R: Read> {
    /// Expected total number of bytes in the source; may be 0 for empty files.
    total: usize,
    /// Number of bytes read so far.
    counter: usize,
    /// Last percentage printed, so each value is printed only once.
    last_percent: usize,
    /// The wrapped reader every read is delegated to.
    reader: R,
}
impl<R: Read> Read for PercentPrintReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let new_bytes = self.reader.read(buf)?;
        self.counter += new_bytes;
        // Guard against division by zero: `total` is 0 when the source file is
        // empty, and the original `counter * 100 / total` panicked in that case.
        if self.total > 0 {
            let progress = self.counter * 100 / self.total;
            if progress > self.last_percent {
                self.last_percent = progress;
                eprint!("\r");
                eprint!("{} % done", progress);
            }
        }
        Ok(new_bytes)
    }
}
/// Entry point. With no flags: compress each given file at the fastest level,
/// then verify the result by decoding it again and comparing against the
/// original bytes. With flags: delegate to [decompress] (requires `-d -c`).
fn main() {
    let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
    let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
    // The first non-flag argument is the program name itself, not an input.
    file_paths.remove(0);
    if flags.is_empty() {
        let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
        encoder.set_drain(Vec::new());
        for path in file_paths {
            let start_instant = Instant::now();
            let file = std::fs::File::open(&path).unwrap();
            let input_len = file.metadata().unwrap().len() as usize;
            // Wrap the input so progress is printed to stderr while reading.
            let file = PercentPrintReader {
                reader: BufReader::new(file),
                total: input_len,
                counter: 0,
                last_percent: 0,
            };
            encoder.set_source(file);
            encoder.compress();
            let mut output: Vec<_> = encoder.take_drain().unwrap();
            println!(
                "Compressed {path:} from {} to {} ({}%) took {}ms",
                input_len,
                output.len(),
                if input_len == 0 {
                    0
                } else {
                    output.len() * 100 / input_len
                },
                start_instant.elapsed().as_millis()
            );
            // Round-trip check: decode the compressed output and compare it
            // byte-for-byte against the source file.
            println!("Check against source file. Decoding...");
            let mut decoded = Vec::with_capacity(input_len);
            ruzstd::decoding::FrameDecoder::new()
                .decode_all_to_vec(&output, &mut decoded)
                .unwrap();
            println!("Decoded without error");
            assert_eq!(decoded.len(), input_len);
            println!("Decoded length is correct, now check against file contents file");
            let input = std::fs::read(&path).unwrap();
            assert_eq!(decoded.len(), input.len());
            assert!(decoded == input);
            println!("Checks completed");
            // Reuse the output buffer's allocation for the next file.
            output.clear();
            encoder.set_drain(output);
        }
    } else {
        decompress(&flags, &file_paths);
    }
}
/// Sink for freshly decoded bytes: forward the chunk to stdout and keep the
/// statistics in `s` up to date, reprinting the progress line whenever the
/// whole-number percentage changes.
fn do_something(data: &[u8], s: &mut StateTracker) {
    // Emit the decoded chunk on stdout so it can be piped into a file.
    std::io::stdout().write_all(data).unwrap();
    s.bytes_used += data.len() as u64;
    let percentage = (s.file_pos * 100) / s.file_size;
    let current = percentage as i8;
    if current != s.old_percentage {
        s.old_percentage = current;
        eprint!("\r");
        eprint!("{} % done", percentage);
    }
}

41
vendor/ruzstd/src/bin/zstd_stream.rs vendored Normal file
View File

@@ -0,0 +1,41 @@
extern crate ruzstd;
use std::fs::File;
use std::io::{Read, Write};
/// Entry point: split argv into flags and file paths, insist on `-d -c`,
/// then stream-decode every given file to stdout.
fn main() {
    // Everything starting with '-' is a flag, everything else a file path.
    let (flags, mut file_paths): (Vec<String>, Vec<String>) =
        std::env::args().partition(|arg| arg.starts_with('-'));
    // The first non-flag argument is the program name itself.
    file_paths.remove(0);
    if !flags.iter().any(|f| f == "-d") {
        eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag");
        return;
    }
    if !flags.iter().any(|f| f == "-c") {
        eprintln!("This zstd implementation only supports output on the stdout. Please add a \"-c\" flag and pipe the output into a file");
        return;
    }
    if flags.len() != 2 {
        eprintln!(
            "No flags other than -d and -c are currently implemented. Flags used: {:?}",
            flags
        );
        return;
    }
    // Reuse one scratch buffer and one stdout handle for all files.
    let mut stdout = std::io::stdout();
    let mut chunk = [0u8; 1024 * 1024];
    for path in file_paths {
        eprintln!("File: {}", path);
        let file = File::open(path).unwrap();
        let mut reader = std::io::BufReader::new(file);
        let mut decoder = ruzstd::decoding::StreamingDecoder::new(&mut reader).unwrap();
        // Drain the decoder until the frame is finished and fully collected.
        while !decoder.decoder.is_finished() || decoder.decoder.can_collect() > 0 {
            let bytes = decoder.read(&mut chunk[..]).unwrap();
            stdout.write_all(&chunk[..bytes]).unwrap();
        }
    }
}

135
vendor/ruzstd/src/bit_io/bit_reader.rs vendored Normal file
View File

@@ -0,0 +1,135 @@
/// Wraps a slice and enables reading arbitrary amounts of bits from that
/// slice, least-significant bit first within each byte.
pub struct BitReader<'s> {
    idx: usize, //index counts bits already read
    source: &'s [u8],
}

impl<'s> BitReader<'s> {
    /// Create a reader positioned at the first bit of `source`.
    pub fn new(source: &'s [u8]) -> BitReader<'s> {
        BitReader { idx: 0, source }
    }

    /// How many bits are still available to read.
    pub fn bits_left(&self) -> usize {
        self.source.len() * 8 - self.idx
    }

    /// How many bits have been consumed so far.
    pub fn bits_read(&self) -> usize {
        self.idx
    }

    /// Rewind the reader by `n` bits. Panics if that would move before the
    /// start of the source.
    pub fn return_bits(&mut self, n: usize) {
        if n > self.idx {
            panic!("Cant return this many bits");
        }
        self.idx -= n;
    }

    /// Read the next `n` bits (`n <= 64`) and return them in the low bits of
    /// the result.
    pub fn get_bits(&mut self, n: usize) -> Result<u64, GetBitsError> {
        if n > 64 {
            return Err(GetBitsError::TooManyBits {
                num_requested_bits: n,
                limit: 64,
            });
        }
        let available = self.bits_left();
        if available < n {
            return Err(GetBitsError::NotEnoughRemainingBits {
                requested: n,
                remaining: available,
            });
        }

        let start_idx = self.idx;
        let offset_in_byte = self.idx % 8;
        let usable_in_first_byte = 8 - offset_in_byte;

        // Take whatever the currently pointed-to byte still has to offer.
        let mut acc = u64::from(self.source[self.idx / 8] >> offset_in_byte);

        if n <= usable_in_first_byte {
            // The request is satisfied by the current byte alone; just mask
            // off everything above the requested bit count.
            acc &= (1 << n) - 1;
            self.idx += n;
        } else {
            // The request spans multiple bytes. Consuming the rest of the
            // current byte leaves the reader byte aligned.
            self.idx += usable_in_first_byte;
            let whole_bytes = (n - usable_in_first_byte) / 8;
            let trailing_bits = n - usable_in_first_byte - whole_bytes * 8;
            assert!(usable_in_first_byte + whole_bytes * 8 + trailing_bits == n);
            let mut shift = usable_in_first_byte; //this many bits are already set in acc
            assert!(self.idx % 8 == 0);
            // Splice in full bytes, lowest first.
            for _ in 0..whole_bytes {
                acc |= u64::from(self.source[self.idx / 8]) << shift;
                self.idx += 8;
                shift += 8;
            }
            assert!(n - shift == trailing_bits);
            // Finally the leftover bits from the last, partially used byte.
            if trailing_bits > 0 {
                let last = u64::from(self.source[self.idx / 8]) & ((1 << trailing_bits) - 1);
                acc |= last << shift;
                self.idx += trailing_bits;
            }
        }
        assert!(self.idx == start_idx + n);
        Ok(acc)
    }
}

/// Errors a [BitReader] can report.
#[derive(Debug)]
#[non_exhaustive]
pub enum GetBitsError {
    TooManyBits {
        num_requested_bits: usize,
        limit: u8,
    },
    NotEnoughRemainingBits {
        requested: usize,
        remaining: usize,
    },
}

#[cfg(feature = "std")]
impl std::error::Error for GetBitsError {}

impl core::fmt::Display for GetBitsError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            Self::TooManyBits {
                num_requested_bits,
                limit,
            } => write!(
                f,
                "Cant serve this request. The reader is limited to {} bits, requested {} bits",
                limit, num_requested_bits,
            ),
            Self::NotEnoughRemainingBits {
                requested,
                remaining,
            } => write!(
                f,
                "Can\'t read {} bits, only have {} bits left",
                requested, remaining,
            ),
        }
    }
}

View File

@@ -0,0 +1,184 @@
use core::convert::TryInto;
/// Zstandard encodes some types of data in a way that the data must be read
/// back to front to decode it properly. `BitReaderReversed` provides a
/// convenient interface to do that.
pub struct BitReaderReversed<'s> {
    /// The index of the last read byte in the source.
    index: usize,
    /// How many bits have been consumed from `bit_container`.
    bits_consumed: u8,
    /// How many bits have been consumed past the end of the input. Will be zero until all the input
    /// has been read.
    extra_bits: usize,
    /// The source data to read from.
    source: &'s [u8],
    /// The reader doesn't read directly from the source, it reads bits from here, and the container
    /// is "refilled" as it's emptied.
    bit_container: u64,
}
impl<'s> BitReaderReversed<'s> {
    /// How many bits are left to read by the reader.
    /// Goes negative once the reader starts handing out fill-in zero bits.
    pub fn bits_remaining(&self) -> isize {
        self.index as isize * 8 + (64 - self.bits_consumed as isize) - self.extra_bits as isize
    }
    /// Create a reader positioned at the end of `source`.
    pub fn new(source: &'s [u8]) -> BitReaderReversed<'s> {
        BitReaderReversed {
            index: source.len(),
            // The container starts empty; the first read triggers a refill.
            bits_consumed: 64,
            source,
            bit_container: 0,
            extra_bits: 0,
        }
    }
    /// We refill the container in full bytes, shifting the still unread portion to the left, and filling the lower bits with new data
    #[cold]
    fn refill(&mut self) {
        let bytes_consumed = self.bits_consumed as usize / 8;
        if bytes_consumed == 0 {
            // Less than one whole byte consumed; nothing to refill yet.
            return;
        }
        if self.index >= bytes_consumed {
            // We can safely move the window contained in `bit_container` down by `bytes_consumed`
            // If the reader wasn't byte aligned, the byte that was partially read is now in the highest order bits in the `bit_container`
            self.index -= bytes_consumed;
            // Some bits of the `bits_container` might have been consumed already because we read the window byte aligned
            self.bits_consumed &= 7;
            self.bit_container =
                u64::from_le_bytes((&self.source[self.index..][..8]).try_into().unwrap());
        } else if self.index > 0 {
            // Read the last portion of source into the `bit_container`
            if self.source.len() >= 8 {
                self.bit_container = u64::from_le_bytes((&self.source[..8]).try_into().unwrap());
            } else {
                // Source is shorter than 8 bytes; zero-pad the container.
                let mut value = [0; 8];
                value[..self.source.len()].copy_from_slice(self.source);
                self.bit_container = u64::from_le_bytes(value);
            }
            self.bits_consumed -= 8 * self.index as u8;
            self.index = 0;
            // Shift out the already-consumed bits; the zeroes shifted in at
            // the bottom are past-the-end bits, tracked in `extra_bits`.
            self.bit_container <<= self.bits_consumed;
            self.extra_bits += self.bits_consumed as usize;
            self.bits_consumed = 0;
        } else if self.bits_consumed < 64 {
            // Shift out already used bits and fill up with zeroes
            self.bit_container <<= self.bits_consumed;
            self.extra_bits += self.bits_consumed as usize;
            self.bits_consumed = 0;
        } else {
            // All useful bits have already been read and more than 64 bits have been consumed, all we now do is return zeroes
            self.extra_bits += self.bits_consumed as usize;
            self.bits_consumed = 0;
            self.bit_container = 0;
        }
        // Assert that at least `56 = 64 - 8` bits are available to read.
        debug_assert!(self.bits_consumed < 8);
    }
    /// Read `n` number of bits from the source. Will read at most 56 bits.
    /// If there are no more bits to be read from the source zero bits will be returned instead.
    #[inline(always)]
    pub fn get_bits(&mut self, n: u8) -> u64 {
        // Refill only when the container can't satisfy the request.
        if self.bits_consumed + n > 64 {
            self.refill();
        }
        let value = self.peek_bits(n);
        self.consume(n);
        value
    }
    /// Get the next `n` bits from the source without consuming them.
    /// Caller is responsible for making sure that `n` many bits have been refilled.
    #[inline(always)]
    pub fn peek_bits(&mut self, n: u8) -> u64 {
        if n == 0 {
            return 0;
        }
        let mask = (1u64 << n) - 1u64;
        // The unread bits sit directly below the consumed ones at the top.
        let shift_by = 64 - self.bits_consumed - n;
        (self.bit_container >> shift_by) & mask
    }
    /// Get the next `n1` `n2` and `n3` bits from the source without consuming them.
    /// Caller is responsible for making sure that `sum` many bits have been refilled.
    #[inline(always)]
    pub fn peek_bits_triple(&mut self, sum: u8, n1: u8, n2: u8, n3: u8) -> (u64, u64, u64) {
        if sum == 0 {
            return (0, 0, 0);
        }
        // all_three contains bits like this: |XXXX..XXX111122223333|
        // Where XXX are already consumed bytes, 1/2/3 are bits of the respective value
        // Lower bits are to the right
        let all_three = self.bit_container >> (64 - self.bits_consumed - sum);
        let mask1 = (1u64 << n1) - 1u64;
        let shift_by1 = n3 + n2;
        let val1 = (all_three >> shift_by1) & mask1;
        let mask2 = (1u64 << n2) - 1u64;
        let shift_by2 = n3;
        let val2 = (all_three >> shift_by2) & mask2;
        let mask3 = (1u64 << n3) - 1u64;
        let val3 = all_three & mask3;
        (val1, val2, val3)
    }
    /// Consume `n` bits from the source.
    #[inline(always)]
    pub fn consume(&mut self, n: u8) {
        self.bits_consumed += n;
        debug_assert!(self.bits_consumed <= 64);
    }
    /// Same as calling get_bits three times but slightly more performant
    #[inline(always)]
    pub fn get_bits_triple(&mut self, n1: u8, n2: u8, n3: u8) -> (u64, u64, u64) {
        let sum = n1 + n2 + n3;
        // 56 is the most a single refill guarantees, so one refill covers all
        // three reads; larger requests fall back to three separate reads.
        if sum <= 56 {
            self.refill();
            let triple = self.peek_bits_triple(sum, n1, n2, n3);
            self.consume(sum);
            return triple;
        }
        (self.get_bits(n1), self.get_bits(n2), self.get_bits(n3))
    }
}
#[cfg(test)]
mod test {
    #[test]
    fn it_works() {
        // Bits come out back-to-front: the last byte of `data` is read first.
        let data = [0b10101010, 0b01010101];
        let mut br = super::BitReaderReversed::new(&data);
        assert_eq!(br.get_bits(1), 0);
        assert_eq!(br.get_bits(1), 1);
        assert_eq!(br.get_bits(1), 0);
        assert_eq!(br.get_bits(4), 0b1010);
        assert_eq!(br.get_bits(4), 0b1101);
        assert_eq!(br.get_bits(4), 0b0101);
        // Last 0 from source, three zeroes filled in
        assert_eq!(br.get_bits(4), 0b0000);
        // All zeroes filled in
        assert_eq!(br.get_bits(4), 0b0000);
        // 16 real bits, 23 consumed -> the reader is 7 bits "overdrawn".
        assert_eq!(br.bits_remaining(), -7);
    }
}

367
vendor/ruzstd/src/bit_io/bit_writer.rs vendored Normal file
View File

@@ -0,0 +1,367 @@
//! Use [BitWriter] to write an arbitrary amount of bits into a buffer.
use alloc::vec::Vec;
/// An interface for writing an arbitrary number of bits into a buffer. Write new bits into the buffer with `write_bits`, and
/// obtain the output using `dump`.
#[derive(Debug)]
pub(crate) struct BitWriter<V: AsMut<Vec<u8>>> {
    /// The buffer that's filled with bits
    output: V,
    /// Holds partially filled bytes which get put in `output` once filled by a `write_bits` call
    partial: u64,
    /// How many of the low bits in `partial` are valid.
    bits_in_partial: usize,
    /// The index pointing to the next unoccupied bit. Counts only the bits
    /// already committed to `output`, not those still held in `partial`.
    bit_idx: usize,
}
impl BitWriter<Vec<u8>> {
/// Initialize a new writer.
pub fn new() -> Self {
Self {
output: Vec::new(),
partial: 0,
bits_in_partial: 0,
bit_idx: 0,
}
}
}
impl<V: AsMut<Vec<u8>>> BitWriter<V> {
    /// Initialize a new writer that appends after any bytes already in `output`.
    pub fn from(mut output: V) -> BitWriter<V> {
        BitWriter {
            bit_idx: output.as_mut().len() * 8,
            output,
            partial: 0,
            bits_in_partial: 0,
        }
    }
    /// Get the current index. Can be used to reset to this index or to later change the bits at this index
    pub fn index(&self) -> usize {
        self.bit_idx + self.bits_in_partial
    }
    /// Reset to an index. Currently only supports resetting to a byte aligned index
    pub fn reset_to(&mut self, index: usize) {
        assert!(index % 8 == 0);
        self.partial = 0;
        self.bits_in_partial = 0;
        self.bit_idx = index;
        // Truncate (or zero-extend) the output to exactly `index` bits.
        self.output.as_mut().resize(index / 8, 0);
    }
    /// Change the bits at the index. `bits` contains the `num_bits` new bits that should be written
    /// Instead of the current content. `bits` *MUST* only contain zeroes in the upper bits outside of the `0..num_bits` range.
    pub fn change_bits(&mut self, idx: usize, bits: impl Into<u64>, num_bits: usize) {
        self.change_bits_64(idx, bits.into(), num_bits);
    }
    /// Monomorphized version of `change_bits`
    pub fn change_bits_64(&mut self, mut idx: usize, mut bits: u64, mut num_bits: usize) {
        // The changed range must already live fully in `output`, not `partial`.
        self.flush();
        assert!(idx + num_bits < self.index());
        assert!(self.index() - (idx + num_bits) > self.bits_in_partial);
        // We might be changing bits unaligned to byte borders.
        // This means the lower bits of the first byte we are touching must stay the same
        if idx % 8 != 0 {
            // How many (upper) bits will change in the first byte?
            let bits_in_first_byte = 8 - (idx % 8);
            // We don't support only changing a few bits in the middle of a byte
            assert!(bits_in_first_byte <= num_bits);
            // Zero out the upper bits that will be changed while keeping the lower bits intact
            self.output.as_mut()[idx / 8] &= 0xFFu8 >> bits_in_first_byte;
            // Shift the bits up and put them in the now zeroed out bits
            let new_bits = (bits << (8 - bits_in_first_byte)) as u8;
            self.output.as_mut()[idx / 8] |= new_bits;
            // Update the state. Note that we are now definitely working byte aligned
            num_bits -= bits_in_first_byte;
            bits >>= bits_in_first_byte;
            idx += bits_in_first_byte;
        }
        assert!(idx % 8 == 0);
        // We are now byte aligned, change idx to byte resolution
        let mut idx = idx / 8;
        // Update full bytes by just shifting and extracting bytes from the bits
        while num_bits >= 8 {
            self.output.as_mut()[idx] = bits as u8;
            num_bits -= 8;
            bits >>= 8;
            idx += 1;
        }
        // Deal with leftover bits that wont fill a full byte, keeping the upper bits of the original byte intact
        if num_bits > 0 {
            self.output.as_mut()[idx] &= 0xFFu8 << num_bits;
            self.output.as_mut()[idx] |= bits as u8;
        }
    }
    /// Simply append bytes to the buffer. Only works if the buffer was already byte aligned
    pub fn append_bytes(&mut self, data: &[u8]) {
        if self.misaligned() != 0 {
            panic!("Don't append bytes when writer is misaligned")
        }
        self.flush();
        self.output.as_mut().extend_from_slice(data);
        self.bit_idx += data.len() * 8;
    }
    /// Flush temporary internal buffers to the output buffer. Only works if this is currently byte aligned
    pub fn flush(&mut self) {
        assert!(self.bits_in_partial % 8 == 0);
        let full_bytes = self.bits_in_partial / 8;
        // `partial` is little-endian: the oldest bits are the lowest bytes.
        self.output
            .as_mut()
            .extend_from_slice(&self.partial.to_le_bytes()[..full_bytes]);
        self.partial >>= full_bytes * 8;
        self.bits_in_partial -= full_bytes * 8;
        self.bit_idx += full_bytes * 8;
    }
    /// Write the lower `num_bits` from `bits` into the writer. `bits` *MUST* only contain zeroes in the upper bits outside of the `0..num_bits` range.
    pub fn write_bits(&mut self, bits: impl Into<u64>, num_bits: usize) {
        self.write_bits_64(bits.into(), num_bits);
    }
    /// This is the special case where we need to flush the partial buffer to the output.
    /// Marked as cold and in a separate function so the optimizer has more information.
    ///
    /// NOTE(review): if this is entered with an empty partial buffer and
    /// `num_bits == 64`, `bits_free_in_partial` is 64 and the
    /// `bits >> bits_free_in_partial` below is a 64-bit shift on a `u64`
    /// (panics in debug builds) — TODO confirm callers never pass 64 here.
    #[cold]
    fn write_bits_64_cold(&mut self, bits: u64, num_bits: usize) {
        assert!(self.bits_in_partial + num_bits >= 64);
        // Fill the partial buffer so it contains 64 bits
        let bits_free_in_partial = 64 - self.bits_in_partial;
        let part = bits << (64 - bits_free_in_partial);
        let merged = self.partial | part;
        // Put the 8 bytes into the output buffer
        self.output
            .as_mut()
            .extend_from_slice(&merged.to_le_bytes());
        self.bit_idx += 64;
        self.partial = 0;
        self.bits_in_partial = 0;
        let mut num_bits = num_bits - bits_free_in_partial;
        let mut bits = bits >> bits_free_in_partial;
        // While we are at it push full bytes into the output buffer instead of polluting the partial buffer
        while num_bits / 8 > 0 {
            let byte = bits as u8;
            self.output.as_mut().push(byte);
            num_bits -= 8;
            self.bit_idx += 8;
            bits >>= 8;
        }
        // The last few bits belong into the partial buffer
        assert!(num_bits < 8);
        if num_bits > 0 {
            let mask = (1 << num_bits) - 1;
            self.partial = bits & mask;
            self.bits_in_partial = num_bits;
        }
    }
    /// Monomorphized version of `write_bits`
    pub fn write_bits_64(&mut self, bits: u64, num_bits: usize) {
        if num_bits == 0 {
            return;
        }
        // Catch callers that left garbage in the upper bits (debug only).
        if bits > 0 {
            debug_assert!(bits.ilog2() <= num_bits as u32);
        }
        // fill partial byte first
        if num_bits + self.bits_in_partial < 64 {
            let part = bits << self.bits_in_partial;
            let merged = self.partial | part;
            self.partial = merged;
            self.bits_in_partial += num_bits;
        } else {
            // If the partial buffer can't hold the num_bits we need to make space
            self.write_bits_64_cold(bits, num_bits);
        }
    }
    /// Returns the populated buffer that you've been writing bits into.
    ///
    /// This function consumes the writer, so it cannot be used after
    /// dumping
    pub fn dump(mut self) -> V {
        if self.misaligned() != 0 {
            panic!("`dump` was called on a bit writer but an even number of bytes weren't written into the buffer. Was: {}", self.index())
        }
        self.flush();
        debug_assert_eq!(self.partial, 0);
        self.output
    }
    /// Returns how many bits are missing for an even byte
    pub fn misaligned(&self) -> usize {
        let idx = self.index();
        if idx % 8 == 0 {
            0
        } else {
            8 - (idx % 8)
        }
    }
}
#[cfg(test)]
mod tests {
    use super::BitWriter;
    use alloc::vec;
    // These tests drive the writer through aligned and unaligned write
    // patterns; bits are packed LSB-first within each output byte.
    #[test]
    fn from_existing() {
        // Define an existing vec, write some bits into it
        let mut existing_vec = vec![255_u8];
        let mut bw = BitWriter::from(&mut existing_vec);
        bw.write_bits(0u8, 8);
        bw.flush();
        assert_eq!(vec![255, 0], existing_vec);
    }
    #[test]
    fn change_bits() {
        let mut writer = BitWriter::new();
        writer.write_bits(0u32, 24);
        writer.change_bits(8, 0xFFu8, 8);
        assert_eq!(vec![0, 0xFF, 0], writer.dump());
        // Unaligned change: crosses the byte borders at bits 8 and 16.
        let mut writer = BitWriter::new();
        writer.write_bits(0u32, 24);
        writer.change_bits(6, 0x0FFFu16, 12);
        assert_eq!(vec![0b11000000, 0xFF, 0b00000011], writer.dump());
    }
    #[test]
    fn single_byte_written_4_4() {
        // Write the first 4 bits as 1s and the last 4 bits as 0s
        // 1010 is used where values should never be read from.
        let mut bw = BitWriter::new();
        bw.write_bits(0b1111u8, 4);
        bw.write_bits(0b0000u8, 4);
        let output = bw.dump();
        assert!(output.len() == 1, "Single byte written into writer returned a vec that wasn't one byte, vec was {} elements long", output.len());
        assert_eq!(
            0b0000_1111, output[0],
            "4 bits and 4 bits written into buffer"
        );
    }
    #[test]
    fn single_byte_written_3_5() {
        // Write the first 3 bits as 1s and the last 5 bits as 0s
        let mut bw = BitWriter::new();
        bw.write_bits(0b111u8, 3);
        bw.write_bits(0b0_0000u8, 5);
        let output = bw.dump();
        assert!(output.len() == 1, "Single byte written into writer return a vec that wasn't one byte, vec was {} elements long", output.len());
        assert_eq!(0b0000_0111, output[0], "3 and 5 bits written into buffer");
    }
    #[test]
    fn single_byte_written_1_7() {
        // Write the first bit as a 1 and the last 7 bits as 0s
        let mut bw = BitWriter::new();
        bw.write_bits(0b1u8, 1);
        bw.write_bits(0u8, 7);
        let output = bw.dump();
        assert!(output.len() == 1, "Single byte written into writer return a vec that wasn't one byte, vec was {} elements long", output.len());
        assert_eq!(0b0000_0001, output[0], "1 and 7 bits written into buffer");
    }
    #[test]
    fn single_byte_written_8() {
        // Write an entire byte
        let mut bw = BitWriter::new();
        bw.write_bits(1u8, 8);
        let output = bw.dump();
        assert!(output.len() == 1, "Single byte written into writer return a vec that wasn't one byte, vec was {} elements long", output.len());
        assert_eq!(1, output[0], "1 and 7 bits written into buffer");
    }
    #[test]
    fn multi_byte_clean_boundary_4_4_4_4() {
        // Writing 4 bits at a time for 2 bytes
        let mut bw = BitWriter::new();
        bw.write_bits(0u8, 4);
        bw.write_bits(0b1111u8, 4);
        bw.write_bits(0b1111u8, 4);
        bw.write_bits(0u8, 4);
        assert_eq!(vec![0b1111_0000, 0b0000_1111], bw.dump());
    }
    #[test]
    fn multi_byte_clean_boundary_16_8() {
        // Writing 16 bits at once
        let mut bw = BitWriter::new();
        bw.write_bits(0x0100u16, 16);
        bw.write_bits(69u8, 8);
        assert_eq!(vec![0, 1, 69], bw.dump())
    }
    #[test]
    fn multi_byte_boundary_crossed_4_12() {
        // Writing 4 1s and then 12 zeros
        let mut bw = BitWriter::new();
        bw.write_bits(0b1111u8, 4);
        bw.write_bits(0b0000_0011_0100_0010u16, 12);
        assert_eq!(vec![0b0010_1111, 0b0011_0100], bw.dump());
    }
    #[test]
    fn multi_byte_boundary_crossed_4_5_7() {
        // Writing 4 1s and then 5 zeros then 7 1s
        let mut bw = BitWriter::new();
        bw.write_bits(0b1111u8, 4);
        bw.write_bits(0b0_0000u8, 5);
        bw.write_bits(0b111_1111u8, 7);
        assert_eq!(vec![0b0000_1111, 0b1111_1110], bw.dump());
    }
    #[test]
    fn multi_byte_boundary_crossed_1_9_6() {
        // Writing 1 1 and then 9 zeros then 6 1s
        let mut bw = BitWriter::new();
        bw.write_bits(0b1u8, 1);
        bw.write_bits(0b0_0000_0000u16, 9);
        bw.write_bits(0b11_1111u8, 6);
        assert_eq!(vec![0b0000_0001, 0b1111_1100], bw.dump());
    }
    #[test]
    #[should_panic]
    fn catches_unaligned_dump() {
        // Write a single bit in then dump it, making sure
        // the correct error is returned
        let mut bw = BitWriter::new();
        bw.write_bits(0u8, 1);
        bw.dump();
    }
    #[test]
    #[should_panic]
    fn catches_dirty_upper_bits() {
        // 10 doesn't fit into one bit; the debug_assert must fire.
        let mut bw = BitWriter::new();
        bw.write_bits(10u8, 1);
    }
    #[test]
    fn add_multiple_aligned() {
        let mut bw = BitWriter::new();
        bw.write_bits(0x00_0F_F0_FFu32, 32);
        assert_eq!(vec![0xFF, 0xF0, 0x0F, 0x00], bw.dump());
    }
    // #[test]
    // fn catches_more_than_in_buf() {
    //     todo!();
    // }
}

9
vendor/ruzstd/src/bit_io/mod.rs vendored Normal file
View File

@@ -0,0 +1,9 @@
//! Encoding agnostic ways to read and write binary data
mod bit_reader;
mod bit_reader_reverse;
mod bit_writer;
pub(crate) use bit_reader::*;
pub(crate) use bit_reader_reverse::*;
pub(crate) use bit_writer::*;

43
vendor/ruzstd/src/blocks/block.rs vendored Normal file
View File

@@ -0,0 +1,43 @@
//! Block header definitions.
/// There are 4 different kinds of blocks, and the type of block influences the meaning of `Block_Size`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockType {
    /// An uncompressed block.
    Raw,
    /// A single byte, repeated `Block_Size` times (Run Length Encoding).
    #[allow(clippy::upper_case_acronyms)]
    RLE,
    /// A Zstandard compressed block. `Block_Size` is the length of the compressed data.
    Compressed,
    /// This is not a valid block, and this value should not be used.
    /// If this value is present, it should be considered corrupted data.
    Reserved,
}

impl core::fmt::Display for BlockType {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
        match self {
            BlockType::Compressed => write!(f, "Compressed"),
            BlockType::Raw => write!(f, "Raw"),
            BlockType::RLE => write!(f, "RLE"),
            // Fixed: previously printed the misspelling "Reserverd".
            BlockType::Reserved => write!(f, "Reserved"),
        }
    }
}
/// A representation of a single block header. As well as containing a frame header,
/// each Zstandard frame contains one or more blocks.
pub struct BlockHeader {
    /// Whether this block is the last block in the frame.
    /// It may be followed by an optional `Content_Checksum` if it is.
    pub last_block: bool,
    /// The kind of block this header describes (Raw/RLE/Compressed/Reserved).
    pub block_type: BlockType,
    /// The size of the decompressed data. If the block type
    /// is [BlockType::Reserved] or [BlockType::Compressed],
    /// this value is set to zero and should not be referenced.
    pub decompressed_size: u32,
    /// The size of the block. If the block is [BlockType::RLE],
    /// this value will be 1.
    pub content_size: u32,
}

View File

@@ -0,0 +1,236 @@
//! Utilities and representations for the first half of a block, the literals section.
//! It contains data that is then copied from by the sequences section.
use crate::bit_io::BitReader;
use crate::decoding::errors::LiteralsSectionParseError;
/// A compressed block consists of two sections, a literals section, and a sequences section.
///
/// This is the first of those two sections. A literal is just any arbitrary data, and it is copied by the sequences section
pub struct LiteralsSection {
    /// - If this block is of type [LiteralsSectionType::Raw], then the data is `regenerated_size`
    ///   bytes long, and it contains the raw literals data to be used during the second section,
    ///   the sequences section.
    /// - If this block is of type [LiteralsSectionType::RLE],
    ///   then the literal consists of a single byte repeated `regenerated_size` times.
    /// - For types [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless],
    ///   then this is the size of the decompressed data.
    pub regenerated_size: u32,
    /// - For types [LiteralsSectionType::Raw] and [LiteralsSectionType::RLE], this value is not present.
    /// - For types [LiteralsSectionType::Compressed] and [LiteralsSectionType::Treeless], this value will
    ///   be set to the size of the compressed data.
    pub compressed_size: Option<u32>,
    /// This value will be either 1 stream or 4 streams if the literal is of type
    /// [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless], and it
    /// is not used for RLE or uncompressed literals.
    pub num_streams: Option<u8>,
    /// The type of the literal section.
    pub ls_type: LiteralsSectionType,
}
/// The way which a literal section is encoded.
/// Transmitted in the two lowest bits of the first header byte.
pub enum LiteralsSectionType {
    /// Literals are stored uncompressed.
    Raw,
    /// Literals consist of a single byte value repeated [LiteralsSection::regenerated_size] times.
    #[allow(clippy::upper_case_acronyms)]
    RLE,
    /// This is a standard Huffman-compressed block, starting with a Huffman tree description.
    /// In this mode, there are at least *2* different literals represented in the Huffman tree
    /// description.
    Compressed,
    /// This is a Huffman-compressed block,
    /// using the Huffman tree from the previous [LiteralsSectionType::Compressed] block
    /// in the sequence. If this mode is triggered without any previous Huffman-tables in the
    /// frame (or dictionary), it should be treated as data corruption.
    Treeless,
}
impl Default for LiteralsSection {
fn default() -> Self {
Self::new()
}
}
impl LiteralsSection {
    /// Create a new [LiteralsSection].
    ///
    /// The returned value is an empty placeholder (`Raw` type, zero size)
    /// that is meant to be filled in via [`LiteralsSection::parse_from_header`].
    pub fn new() -> LiteralsSection {
        LiteralsSection {
            regenerated_size: 0,
            compressed_size: None,
            num_streams: None,
            ls_type: LiteralsSectionType::Raw,
        }
    }

    /// Given the first byte of a header, determine the size of the whole header, from 1 to 5 bytes.
    ///
    /// Layout of `first_byte`: bits 0-1 hold the section type, bits 2-3 hold
    /// the `size_format`; together they determine how many further header
    /// bytes carry the size fields.
    pub fn header_bytes_needed(&self, first_byte: u8) -> Result<u8, LiteralsSectionParseError> {
        let ls_type: LiteralsSectionType = Self::section_type(first_byte)?;
        let size_format = (first_byte >> 2) & 0x3;
        match ls_type {
            LiteralsSectionType::RLE | LiteralsSectionType::Raw => {
                match size_format {
                    0 | 2 => {
                        // size_format actually only uses one bit
                        // regenerated_size uses 5 bits
                        Ok(1)
                    }
                    1 => {
                        // size_format uses 2 bit
                        // regenerated_size uses 12 bits
                        Ok(2)
                    }
                    3 => {
                        // size_format uses 2 bit
                        // regenerated_size uses 20 bits
                        Ok(3)
                    }
                    _ => panic!(
                        "This is a bug in the program. There should only be values between 0..3"
                    ),
                }
            }
            LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
                match size_format {
                    0 | 1 => {
                        // Only differ in num_streams
                        // both regenerated and compressed sizes use 10 bit
                        Ok(3)
                    }
                    2 => {
                        // both regenerated and compressed sizes use 14 bit
                        Ok(4)
                    }
                    3 => {
                        // both regenerated and compressed sizes use 18 bit
                        Ok(5)
                    }
                    _ => panic!(
                        "This is a bug in the program. There should only be values between 0..3"
                    ),
                }
            }
        }
    }

    /// Parse the header into `self`, and returns the number of bytes read.
    ///
    /// `raw` must contain at least as many bytes as
    /// [`LiteralsSection::header_bytes_needed`] reports for its first byte,
    /// otherwise [LiteralsSectionParseError::NotEnoughBytes] is returned.
    pub fn parse_from_header(&mut self, raw: &[u8]) -> Result<u8, LiteralsSectionParseError> {
        let mut br: BitReader<'_> = BitReader::new(raw);
        // Bits 0-1 of the first byte: the section type.
        let block_type = br.get_bits(2)? as u8;
        self.ls_type = Self::section_type(block_type)?;
        // Bits 2-3 of the first byte: how the size fields below are encoded.
        let size_format = br.get_bits(2)? as u8;
        let byte_needed = self.header_bytes_needed(raw[0])?;
        if raw.len() < byte_needed as usize {
            return Err(LiteralsSectionParseError::NotEnoughBytes {
                have: raw.len(),
                need: byte_needed,
            });
        }
        match self.ls_type {
            LiteralsSectionType::RLE | LiteralsSectionType::Raw => {
                self.compressed_size = None;
                match size_format {
                    0 | 2 => {
                        // size_format actually only uses one bit
                        // regenerated_size uses 5 bits
                        self.regenerated_size = u32::from(raw[0]) >> 3;
                        Ok(1)
                    }
                    1 => {
                        // size_format uses 2 bit
                        // regenerated_size uses 12 bits
                        self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4);
                        Ok(2)
                    }
                    3 => {
                        // size_format uses 2 bit
                        // regenerated_size uses 20 bits
                        self.regenerated_size = (u32::from(raw[0]) >> 4)
                            + (u32::from(raw[1]) << 4)
                            + (u32::from(raw[2]) << 12);
                        Ok(3)
                    }
                    _ => panic!(
                        "This is a bug in the program. There should only be values between 0..3"
                    ),
                }
            }
            LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
                // First, the number of Huffman-coded streams (1 or 4).
                match size_format {
                    0 => {
                        self.num_streams = Some(1);
                    }
                    1..=3 => {
                        self.num_streams = Some(4);
                    }
                    _ => panic!(
                        "This is a bug in the program. There should only be values between 0..3"
                    ),
                };
                // Then the regenerated/compressed size pair, packed across 3-5 bytes.
                match size_format {
                    0 | 1 => {
                        // Differ in num_streams see above
                        // both regenerated and compressed sizes use 10 bit
                        // 4 from the first, six from the second byte
                        self.regenerated_size =
                            (u32::from(raw[0]) >> 4) + ((u32::from(raw[1]) & 0x3f) << 4);
                        // 2 from the second, full last byte
                        self.compressed_size =
                            Some(u32::from(raw[1] >> 6) + (u32::from(raw[2]) << 2));
                        Ok(3)
                    }
                    2 => {
                        // both regenerated and compressed sizes use 14 bit
                        // 4 from first, full second, 2 from the third byte
                        self.regenerated_size = (u32::from(raw[0]) >> 4)
                            + (u32::from(raw[1]) << 4)
                            + ((u32::from(raw[2]) & 0x3) << 12);
                        // 6 from the third, full last byte
                        self.compressed_size =
                            Some((u32::from(raw[2]) >> 2) + (u32::from(raw[3]) << 6));
                        Ok(4)
                    }
                    3 => {
                        // both regenerated and compressed sizes use 18 bit
                        // 4 from first, full second, six from third byte
                        self.regenerated_size = (u32::from(raw[0]) >> 4)
                            + (u32::from(raw[1]) << 4)
                            + ((u32::from(raw[2]) & 0x3F) << 12);
                        // 2 from third, full fourth, full fifth byte
                        self.compressed_size = Some(
                            (u32::from(raw[2]) >> 6)
                                + (u32::from(raw[3]) << 2)
                                + (u32::from(raw[4]) << 10),
                        );
                        Ok(5)
                    }
                    _ => panic!(
                        "This is a bug in the program. There should only be values between 0..3"
                    ),
                }
            }
        }
    }

    /// Given the first two bits of a header, determine the type of a header.
    fn section_type(raw: u8) -> Result<LiteralsSectionType, LiteralsSectionParseError> {
        let t = raw & 0x3;
        match t {
            0 => Ok(LiteralsSectionType::Raw),
            1 => Ok(LiteralsSectionType::RLE),
            2 => Ok(LiteralsSectionType::Compressed),
            3 => Ok(LiteralsSectionType::Treeless),
            other => Err(LiteralsSectionParseError::IllegalLiteralSectionType { got: other }),
        }
    }
}

10
vendor/ruzstd/src/blocks/mod.rs vendored Normal file
View File

@@ -0,0 +1,10 @@
//! In a Zstandard frame, there's a frame header, followed by one or more *blocks*.
//!
//! A block contains data, and a header describing how that data is encoded, as well
//! as other misc metadata.
//!
//! <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks>
/// Block headers and block-level metadata.
pub mod block;
/// The first half of a compressed block: the literals section.
pub mod literals_section;
/// The second half of a compressed block: the sequences section.
pub mod sequence_section;

View File

@@ -0,0 +1,168 @@
//! Utilities and representations for the second half of a block, the sequence section.
//! This section copies literals from the literals section into the decompressed output.
use crate::decoding::errors::SequencesHeaderParseError;
/// The largest valid literal length code (per the zstd literal length code table).
pub(crate) const MAX_LITERAL_LENGTH_CODE: u8 = 35;
/// The largest valid match length code (per the zstd match length code table).
pub(crate) const MAX_MATCH_LENGTH_CODE: u8 = 52;
/// The largest offset code accepted by this implementation.
pub(crate) const MAX_OFFSET_CODE: u8 = 31;
/// The header at the start of a block's sequences section.
pub struct SequencesHeader {
    /// How many sequences follow in this section; zero means the block
    /// consists of literals only.
    pub num_sequences: u32,
    /// The compression-modes byte for the three symbol types; only present
    /// when at least one sequence follows (see `parse_from_header`).
    pub modes: Option<CompressionModes>,
}
/// A sequence represents potentially redundant data, and it can be broken up into 2 steps:
/// - A copy step, where data is copied from the literals section to the decompressed output
/// - A *match* copy step that copies data from within the previously decompressed output.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequence-execution>
#[derive(Clone, Copy)]
pub struct Sequence {
    /// Literal length, or the number of bytes to be copied from the literals section
    /// in the copy step.
    pub ll: u32,
    /// The length of the match to make during the match copy step.
    pub ml: u32,
    /// How far back to go in the decompressed data to read from the match copy step.
    /// If this value is greater than 3, then the offset is `of -3`. If `of` is from 1-3,
    /// then it has special handling:
    ///
    /// The first 3 values define 3 different repeated offsets, with 1 referring to the most
    /// recent, 2 the second recent, and so on. When the current sequence has a literal length of 0,
    /// then the repeated offsets are shifted by 1. So an offset value of 1 refers to 2, 2 refers to 3,
    /// and 3 refers to the most recent offset minus one. If that value is equal to zero, the data
    /// is considered corrupted.
    ///
    /// See also: <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#repeat-offsets>
    pub of: u32,
}
impl core::fmt::Display for Sequence {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
write!(f, "LL: {}, ML: {}, OF: {}", self.ll, self.ml, self.of)
}
}
/// This byte defines the compression mode of each symbol type
///
/// Bit layout (high to low): literal lengths in bits 6-7, offsets in bits 4-5,
/// match lengths in bits 2-3 (see the accessors on this type); the lowest two
/// bits are not read here.
#[derive(Copy, Clone)]
pub struct CompressionModes(u8);
/// The compression mode used for symbol compression
///
/// Decoded from the two-bit fields of a [CompressionModes] byte.
pub enum ModeType {
    /// A predefined FSE distribution table is used, and no distribution table
    /// will be present.
    Predefined,
    /// The table consists of a single byte, which contains the symbol's value.
    #[allow(clippy::upper_case_acronyms)]
    RLE,
    /// Standard FSE compression, a distribution table will be present. This
    /// mode should not be used when only one symbol is present.
    FSECompressed,
    /// The table used in the previous compressed block with at least one sequence
    /// will be used again. If this is the first block, the table in the dictionary will
    /// be used.
    Repeat,
}
impl CompressionModes {
    /// Deserialize a two bit mode value into a [ModeType]
    pub fn decode_mode(m: u8) -> ModeType {
        match m {
            0 => ModeType::Predefined,
            1 => ModeType::RLE,
            2 => ModeType::FSECompressed,
            3 => ModeType::Repeat,
            _ => panic!("This can never happen"),
        }
    }

    /// Decode the two-bit field starting at `shift` within the modes byte.
    fn mode_at(self, shift: u8) -> ModeType {
        Self::decode_mode((self.0 >> shift) & 0x3)
    }

    /// Read the compression mode of the literal lengths field (bits 6-7).
    pub fn ll_mode(self) -> ModeType {
        self.mode_at(6)
    }

    /// Read the compression mode of the offset value field (bits 4-5).
    pub fn of_mode(self) -> ModeType {
        self.mode_at(4)
    }

    /// Read the compression mode of the match lengths field (bits 2-3).
    pub fn ml_mode(self) -> ModeType {
        self.mode_at(2)
    }
}
impl Default for SequencesHeader {
fn default() -> Self {
Self::new()
}
}
impl SequencesHeader {
    /// Create a new [SequencesHeader].
    ///
    /// Starts out with zero sequences and no modes byte; fill it in via
    /// [`SequencesHeader::parse_from_header`].
    pub fn new() -> SequencesHeader {
        SequencesHeader {
            num_sequences: 0,
            modes: None,
        }
    }

    /// Attempt to deserialize the provided buffer into `self`, returning the number of bytes read.
    ///
    /// The first byte selects the encoding of `num_sequences`:
    /// - `0`: no sequences, the header is 1 byte and no modes byte follows.
    /// - `1..=127`: the byte itself is the count, followed by the modes byte.
    /// - `128..=254`: count is `((byte0 - 128) << 8) + byte1`, followed by the
    ///   modes byte when the count is non-zero.
    /// - `255`: count is `byte1 + (byte2 << 8) + 0x7F00`, followed by the modes byte.
    pub fn parse_from_header(&mut self, source: &[u8]) -> Result<u8, SequencesHeaderParseError> {
        let mut bytes_read = 0;
        if source.is_empty() {
            return Err(SequencesHeaderParseError::NotEnoughBytes {
                need_at_least: 1,
                got: 0,
            });
        }
        match source[0] {
            0 => {
                self.num_sequences = 0;
                bytes_read += 1;
            }
            1..=127 => {
                if source.len() < 2 {
                    return Err(SequencesHeaderParseError::NotEnoughBytes {
                        need_at_least: 2,
                        got: source.len(),
                    });
                }
                self.num_sequences = u32::from(source[0]);
                self.modes = Some(CompressionModes(source[1]));
                bytes_read += 2;
            }
            128..=254 => {
                if source.len() < 2 {
                    return Err(SequencesHeaderParseError::NotEnoughBytes {
                        need_at_least: 2,
                        got: source.len(),
                    });
                }
                self.num_sequences = ((u32::from(source[0]) - 128) << 8) + u32::from(source[1]);
                bytes_read += 2;
                // The modes byte is only present when there is at least one sequence.
                if self.num_sequences != 0 {
                    if source.len() < 3 {
                        return Err(SequencesHeaderParseError::NotEnoughBytes {
                            need_at_least: 3,
                            got: source.len(),
                        });
                    }
                    self.modes = Some(CompressionModes(source[2]));
                    bytes_read += 1;
                }
            }
            255 => {
                if source.len() < 4 {
                    return Err(SequencesHeaderParseError::NotEnoughBytes {
                        need_at_least: 4,
                        got: source.len(),
                    });
                }
                self.num_sequences = u32::from(source[1]) + (u32::from(source[2]) << 8) + 0x7F00;
                self.modes = Some(CompressionModes(source[3]));
                bytes_read += 4;
            }
        }
        Ok(bytes_read)
    }
}

21
vendor/ruzstd/src/common/mod.rs vendored Normal file
View File

@@ -0,0 +1,21 @@
//! Values and interfaces shared between the encoding side
//! and the decoding side.
// --- FRAMES ---
/// This magic number is included at the start of a single Zstandard frame
pub const MAGIC_NUM: u32 = 0xFD2F_B528;
/// Window size refers to the minimum amount of memory needed to decode any given frame.
///
/// The minimum window size is defined as 1 KB
pub const MIN_WINDOW_SIZE: u64 = 1024;
/// Window size refers to the minimum amount of memory needed to decode any given frame.
///
/// The maximum window size allowed by the spec is 3.75TB
pub const MAX_WINDOW_SIZE: u64 = (1 << 41) + 7 * (1 << 38);
// --- BLOCKS ---
/// While the spec speaks of a 128KB block size limit, the implementation uses
/// 128 KiB (128 * 1024 bytes), matching the reference educational decoder
///
/// <https://github.com/facebook/zstd/blob/eca205fc7849a61ab287492931a04960ac58e031/doc/educational_decoder/zstd_decompress.c#L28-L29>
pub const MAX_BLOCK_SIZE: u32 = 128 * 1024;

View File

@@ -0,0 +1,310 @@
use super::super::blocks::block::BlockHeader;
use super::super::blocks::block::BlockType;
use super::super::blocks::literals_section::LiteralsSection;
use super::super::blocks::literals_section::LiteralsSectionType;
use super::super::blocks::sequence_section::SequencesHeader;
use super::literals_section_decoder::decode_literals;
use super::sequence_section_decoder::decode_sequences;
use crate::common::MAX_BLOCK_SIZE;
use crate::decoding::errors::DecodeSequenceError;
use crate::decoding::errors::{
BlockHeaderReadError, BlockSizeError, BlockTypeError, DecodeBlockContentError,
DecompressBlockError,
};
use crate::decoding::scratch::DecoderScratch;
use crate::decoding::sequence_execution::execute_sequences;
use crate::io::Read;
/// Decodes the blocks of a Zstandard frame: first the 3 byte block header is
/// read, then the block body is decoded into the provided scratch space.
pub struct BlockDecoder {
    /// Scratch space the 3 byte block header is read into.
    header_buffer: [u8; 3],
    /// Tracks whether a header or a body is expected next.
    internal_state: DecoderState,
}
/// Which step of the header/body decode cycle the [BlockDecoder] is in.
enum DecoderState {
    /// A block header must be read next (via `read_block_header`).
    ReadyToDecodeNextHeader,
    /// A block body must be decoded next (via `decode_block_content`).
    ReadyToDecodeNextBody,
    #[allow(dead_code)]
    Failed, //TODO put "self.internal_state = DecoderState::Failed;" everywhere an unresolvable error occurs
}
/// Create a new [BlockDecoder].
///
/// The decoder starts out expecting to read a block header.
pub fn new() -> BlockDecoder {
    let header_buffer = [0u8; 3];
    BlockDecoder {
        header_buffer,
        internal_state: DecoderState::ReadyToDecodeNextHeader,
    }
}
impl BlockDecoder {
    /// Decode the body of a block whose header was previously read with
    /// [`BlockDecoder::read_block_header`], pushing the decoded bytes into
    /// `workspace.buffer`. Returns the number of bytes consumed from `source`.
    pub fn decode_block_content(
        &mut self,
        header: &BlockHeader,
        workspace: &mut DecoderScratch, //reuse this as often as possible. Not only if the trees are reused but also reuse the allocations when building new trees
        mut source: impl Read,
    ) -> Result<u64, DecodeBlockContentError> {
        // Enforce the header -> body call order.
        match self.internal_state {
            DecoderState::ReadyToDecodeNextBody => { /* Happy :) */ }
            DecoderState::Failed => return Err(DecodeBlockContentError::DecoderStateIsFailed),
            DecoderState::ReadyToDecodeNextHeader => {
                return Err(DecodeBlockContentError::ExpectedHeaderOfPreviousBlock)
            }
        }
        let block_type = header.block_type;
        match block_type {
            BlockType::RLE => {
                // A single source byte repeated `decompressed_size` times:
                // replicate it into a small batch buffer and push that repeatedly.
                const BATCH_SIZE: usize = 512;
                let mut buf = [0u8; BATCH_SIZE];
                let full_reads = header.decompressed_size / BATCH_SIZE as u32;
                let single_read_size = header.decompressed_size % BATCH_SIZE as u32;
                source.read_exact(&mut buf[0..1]).map_err(|err| {
                    DecodeBlockContentError::ReadError {
                        step: block_type,
                        source: err,
                    }
                })?;
                self.internal_state = DecoderState::ReadyToDecodeNextHeader;
                for i in 1..BATCH_SIZE {
                    buf[i] = buf[0];
                }
                for _ in 0..full_reads {
                    workspace.buffer.push(&buf[..]);
                }
                let smaller = &mut buf[..single_read_size as usize];
                workspace.buffer.push(smaller);
                // Exactly one byte was consumed from the source.
                Ok(1)
            }
            BlockType::Raw => {
                // Copy the payload through unchanged, in at most 128 KiB batches.
                const BATCH_SIZE: usize = 128 * 1024;
                let mut buf = [0u8; BATCH_SIZE];
                let full_reads = header.decompressed_size / BATCH_SIZE as u32;
                let single_read_size = header.decompressed_size % BATCH_SIZE as u32;
                for _ in 0..full_reads {
                    source.read_exact(&mut buf[..]).map_err(|err| {
                        DecodeBlockContentError::ReadError {
                            step: block_type,
                            source: err,
                        }
                    })?;
                    workspace.buffer.push(&buf[..]);
                }
                let smaller = &mut buf[..single_read_size as usize];
                source
                    .read_exact(smaller)
                    .map_err(|err| DecodeBlockContentError::ReadError {
                        step: block_type,
                        source: err,
                    })?;
                workspace.buffer.push(smaller);
                self.internal_state = DecoderState::ReadyToDecodeNextHeader;
                Ok(u64::from(header.decompressed_size))
            }
            BlockType::Reserved => {
                panic!("How did you even get this. The decoder should error out if it detects a reserved-type block");
            }
            BlockType::Compressed => {
                self.decompress_block(header, workspace, source)?;
                self.internal_state = DecoderState::ReadyToDecodeNextHeader;
                Ok(u64::from(header.content_size))
            }
        }
    }

    /// Decode a [BlockType::Compressed] block body: read `content_size` bytes,
    /// decode the literals section, then decode and execute the sequences
    /// section (or emit the literals directly when there are no sequences).
    fn decompress_block(
        &mut self,
        header: &BlockHeader,
        workspace: &mut DecoderScratch, //reuse this as often as possible. Not only if the trees are reused but also reuse the allocations when building new trees
        mut source: impl Read,
    ) -> Result<(), DecompressBlockError> {
        workspace
            .block_content_buffer
            .resize(header.content_size as usize, 0);
        source.read_exact(workspace.block_content_buffer.as_mut_slice())?;
        let raw = workspace.block_content_buffer.as_slice();
        let mut section = LiteralsSection::new();
        let bytes_in_literals_header = section.parse_from_header(raw)?;
        let raw = &raw[bytes_in_literals_header as usize..];
        vprintln!(
            "Found {} literalssection with regenerated size: {}, and compressed size: {:?}",
            section.ls_type,
            section.regenerated_size,
            section.compressed_size
        );
        // How many of the remaining bytes belong to the literals payload.
        let upper_limit_for_literals = match section.compressed_size {
            Some(x) => x as usize,
            None => match section.ls_type {
                LiteralsSectionType::RLE => 1,
                LiteralsSectionType::Raw => section.regenerated_size as usize,
                _ => panic!("Bug in this library"),
            },
        };
        if raw.len() < upper_limit_for_literals {
            return Err(DecompressBlockError::MalformedSectionHeader {
                expected_len: upper_limit_for_literals,
                remaining_bytes: raw.len(),
            });
        }
        let raw_literals = &raw[..upper_limit_for_literals];
        vprintln!("Slice for literals: {}", raw_literals.len());
        workspace.literals_buffer.clear(); //all literals of the previous block must have been used in the sequence execution anyways. just be defensive here
        let bytes_used_in_literals_section = decode_literals(
            &section,
            &mut workspace.huf,
            raw_literals,
            &mut workspace.literals_buffer,
        )?;
        assert!(
            section.regenerated_size == workspace.literals_buffer.len() as u32,
            "Wrong number of literals: {}, Should have been: {}",
            workspace.literals_buffer.len(),
            section.regenerated_size
        );
        assert!(bytes_used_in_literals_section == upper_limit_for_literals as u32);
        let raw = &raw[upper_limit_for_literals..];
        vprintln!("Slice for sequences with headers: {}", raw.len());
        let mut seq_section = SequencesHeader::new();
        let bytes_in_sequence_header = seq_section.parse_from_header(raw)?;
        let raw = &raw[bytes_in_sequence_header as usize..];
        vprintln!(
            "Found sequencessection with sequences: {} and size: {}",
            seq_section.num_sequences,
            raw.len()
        );
        // Sanity check: headers + literals payload + remaining sequence bytes
        // must account for the whole block body.
        assert!(
            u32::from(bytes_in_literals_header)
                + bytes_used_in_literals_section
                + u32::from(bytes_in_sequence_header)
                + raw.len() as u32
                == header.content_size
        );
        vprintln!("Slice for sequences: {}", raw.len());
        if seq_section.num_sequences != 0 {
            decode_sequences(
                &seq_section,
                raw,
                &mut workspace.fse,
                &mut workspace.sequences,
            )?;
            vprintln!("Executing sequences");
            execute_sequences(workspace)?;
        } else {
            // No sequences: no bytes may remain, and the literals form the
            // entire decoded output of this block.
            if !raw.is_empty() {
                return Err(DecompressBlockError::DecodeSequenceError(
                    DecodeSequenceError::ExtraBits {
                        bits_remaining: raw.len() as isize * 8,
                    },
                ));
            }
            workspace.buffer.push(&workspace.literals_buffer);
            workspace.sequences.clear();
        }
        Ok(())
    }

    /// Reads 3 bytes from the provided reader and returns
    /// the deserialized header and the number of bytes read.
    pub fn read_block_header(
        &mut self,
        mut r: impl Read,
    ) -> Result<(BlockHeader, u8), BlockHeaderReadError> {
        //match self.internal_state {
        //    DecoderState::ReadyToDecodeNextHeader => {/* Happy :) */},
        //    DecoderState::Failed => return Err(format!("Cant decode next block if failed along the way. Results will be nonsense")),
        //    DecoderState::ReadyToDecodeNextBody => return Err(format!("Cant decode next block header, while expecting to decode the body of the previous block. Results will be nonsense")),
        //}
        r.read_exact(&mut self.header_buffer[0..3])?;
        let btype = self.block_type()?;
        if let BlockType::Reserved = btype {
            return Err(BlockHeaderReadError::FoundReservedBlock);
        }
        let block_size = self.block_content_size()?;
        // For Raw and RLE blocks the size field *is* the decompressed size; a
        // compressed block's decompressed size is only known after decoding.
        let decompressed_size = match btype {
            BlockType::Raw => block_size,
            BlockType::RLE => block_size,
            BlockType::Reserved => 0, //should be caught above, this is an error state
            BlockType::Compressed => 0, //unknown but will be smaller than 128kb (or window_size if that is smaller than 128kb)
        };
        // How many body bytes follow the header in the bitstream.
        let content_size = match btype {
            BlockType::Raw => block_size,
            BlockType::Compressed => block_size,
            BlockType::RLE => 1,
            BlockType::Reserved => 0, //should be caught above, this is an error state
        };
        let last_block = self.is_last();
        self.reset_buffer();
        self.internal_state = DecoderState::ReadyToDecodeNextBody;
        //just return 3. Blockheaders always take 3 bytes
        Ok((
            BlockHeader {
                last_block,
                block_type: btype,
                decompressed_size,
                content_size,
            },
            3,
        ))
    }

    /// Zero out the header scratch buffer.
    fn reset_buffer(&mut self) {
        self.header_buffer[0] = 0;
        self.header_buffer[1] = 0;
        self.header_buffer[2] = 0;
    }

    /// Bit 0 of the header marks the frame's last block.
    fn is_last(&self) -> bool {
        self.header_buffer[0] & 0x1 == 1
    }

    /// Bits 1-2 of the header encode the block type.
    fn block_type(&self) -> Result<BlockType, BlockTypeError> {
        let t = (self.header_buffer[0] >> 1) & 0x3;
        match t {
            0 => Ok(BlockType::Raw),
            1 => Ok(BlockType::RLE),
            2 => Ok(BlockType::Compressed),
            3 => Ok(BlockType::Reserved),
            other => Err(BlockTypeError::InvalidBlocktypeNumber { num: other }),
        }
    }

    /// The block size field, validated against [MAX_BLOCK_SIZE].
    fn block_content_size(&self) -> Result<u32, BlockSizeError> {
        let val = self.block_content_size_unchecked();
        if val > MAX_BLOCK_SIZE {
            Err(BlockSizeError::BlockSizeTooLarge { size: val })
        } else {
            Ok(val)
        }
    }

    /// The raw 21 bit block size field (bits 3-23 of the 3 byte header).
    fn block_content_size_unchecked(&self) -> u32 {
        u32::from(self.header_buffer[0] >> 3) //push out type and last_block flags. Retain 5 bit
            | (u32::from(self.header_buffer[1]) << 5)
            | (u32::from(self.header_buffer[2]) << 13)
    }
}

View File

@@ -0,0 +1,451 @@
use crate::io::{Error, Read, Write};
use alloc::vec::Vec;
#[cfg(feature = "hash")]
use core::hash::Hasher;
use super::ringbuffer::RingBuffer;
use crate::decoding::errors::DecodeBufferError;
/// Buffer for decoded output. A ring buffer is used because sequences may
/// reference already-decoded data up to `window_size` bytes back.
pub struct DecodeBuffer {
    /// Decoded bytes that have not been drained yet.
    buffer: RingBuffer,
    /// Content of an (optional) dictionary; back-references may reach into it
    /// while the total output is still within the window (see `repeat_from_dict`).
    pub dict_content: Vec<u8>,
    /// How many trailing bytes must be retained for back-references.
    pub window_size: usize,
    /// Total number of bytes ever pushed into the buffer; not reduced by draining.
    total_output_counter: u64,
    /// Running xxhash over all drained bytes (updated in `drain`/`drain_to`).
    #[cfg(feature = "hash")]
    pub hash: twox_hash::XxHash64,
}
impl Read for DecodeBuffer {
    /// Read only the bytes that can safely leave the buffer (those beyond the
    /// window) into `target`, returning how many were read.
    fn read(&mut self, target: &mut [u8]) -> Result<usize, Error> {
        // Draining more than this would break future back-references.
        let max_amount = self.can_drain_to_window_size().unwrap_or(0);
        let amount = max_amount.min(target.len());
        let mut written = 0;
        self.drain_to(amount, |buf| {
            target[written..][..buf.len()].copy_from_slice(buf);
            written += buf.len();
            (buf.len(), Ok(()))
        })?;
        Ok(amount)
    }
}
impl DecodeBuffer {
    /// Create a new buffer that retains at least `window_size` bytes for back-references.
    pub fn new(window_size: usize) -> DecodeBuffer {
        DecodeBuffer {
            buffer: RingBuffer::new(),
            dict_content: Vec::new(),
            window_size,
            total_output_counter: 0,
            #[cfg(feature = "hash")]
            hash: twox_hash::XxHash64::with_seed(0),
        }
    }

    /// Reset all state for decoding a new frame with the given window size.
    pub fn reset(&mut self, window_size: usize) {
        self.window_size = window_size;
        self.buffer.clear();
        self.buffer.reserve(self.window_size);
        self.dict_content.clear();
        self.total_output_counter = 0;
        #[cfg(feature = "hash")]
        {
            self.hash = twox_hash::XxHash64::with_seed(0);
        }
    }

    /// Number of bytes currently held (decoded but not yet drained).
    pub fn len(&self) -> usize {
        self.buffer.len()
    }

    /// Append literal bytes to the decoded output.
    pub fn push(&mut self, data: &[u8]) {
        self.buffer.extend(data);
        self.total_output_counter += data.len() as u64;
    }

    /// Execute a match copy: append `match_length` bytes read starting
    /// `offset` bytes back from the end of the buffer, possibly reaching into
    /// the dictionary content.
    pub fn repeat(&mut self, offset: usize, match_length: usize) -> Result<(), DecodeBufferError> {
        if offset > self.buffer.len() {
            // Offset reaches behind the decoded data, into the dictionary.
            self.repeat_from_dict(offset, match_length)
        } else {
            let buf_len = self.buffer.len();
            let start_idx = buf_len - offset;
            let end_idx = start_idx + match_length;
            self.buffer.reserve(match_length);
            if end_idx > buf_len {
                // We need to copy in chunks.
                self.repeat_in_chunks(offset, match_length, start_idx);
            } else {
                // can just copy parts of the existing buffer
                // SAFETY: Requirements checked:
                // 1. start_idx + match_length must be <= self.buffer.len()
                // We know that:
                // 1. start_idx = self.buffer.len() - offset
                // 2. end_idx = start_idx + match_length
                // 3. end_idx <= self.buffer.len()
                // Thus follows: start_idx + match_length <= self.buffer.len()
                //
                // 2. explicitly reserved enough memory for the whole match_length
                unsafe {
                    self.buffer
                        .extend_from_within_unchecked(start_idx, match_length)
                };
            }
            self.total_output_counter += match_length as u64;
            Ok(())
        }
    }

    /// Overlapping-copy path for matches whose length exceeds their offset:
    /// copy `offset`-sized chunks (the last one may be shorter) until
    /// `match_length` bytes have been appended.
    fn repeat_in_chunks(&mut self, offset: usize, match_length: usize, start_idx: usize) {
        // We have at max offset bytes in one chunk, the last one can be smaller
        let mut start_idx = start_idx;
        let mut copied_counter_left = match_length;
        // TODO this can be optimized further I think.
        // Each time we copy a chunk we have a repetiton of length 'offset', so we can copy offset * iteration many bytes from start_idx
        while copied_counter_left > 0 {
            let chunksize = usize::min(offset, copied_counter_left);
            // SAFETY: Requirements checked:
            // 1. start_idx + chunksize must be <= self.buffer.len()
            // We know that:
            // 1. start_idx starts at buffer.len() - offset
            // 2. chunksize <= offset (== offset for each iteration but the last, and match_length modulo offset in the last iteration)
            // 3. the buffer grows by offset many bytes each iteration but the last
            // 4. start_idx is increased by the same amount as the buffer grows each iteration
            //
            // Thus follows: start_idx + chunksize == self.buffer.len() in each iteration but the last, where match_length modulo offset == chunksize < offset
            // Meaning: start_idx + chunksize <= self.buffer.len()
            //
            // 2. explicitly reserved enough memory for the whole match_length
            unsafe {
                self.buffer
                    .extend_from_within_unchecked(start_idx, chunksize)
            };
            copied_counter_left -= chunksize;
            start_idx += chunksize;
        }
    }

    /// Handle a [`Self::repeat`] whose offset reaches into the dictionary
    /// content. Only legal while the total output is still within the window.
    #[cold]
    fn repeat_from_dict(
        &mut self,
        offset: usize,
        match_length: usize,
    ) -> Result<(), DecodeBufferError> {
        if self.total_output_counter <= self.window_size as u64 {
            // at least part of that repeat is from the dictionary content
            let bytes_from_dict = offset - self.buffer.len();
            if bytes_from_dict > self.dict_content.len() {
                return Err(DecodeBufferError::NotEnoughBytesInDictionary {
                    got: self.dict_content.len(),
                    need: bytes_from_dict,
                });
            }
            if bytes_from_dict < match_length {
                // Match starts in the dictionary but continues into the
                // buffer: copy the dictionary part, then recurse for the rest.
                let dict_slice = &self.dict_content[self.dict_content.len() - bytes_from_dict..];
                self.buffer.extend(dict_slice);
                self.total_output_counter += bytes_from_dict as u64;
                return self.repeat(self.buffer.len(), match_length - bytes_from_dict);
            } else {
                // The whole match lies within the dictionary content.
                // NOTE(review): total_output_counter is not advanced in this
                // branch, unlike every other append path — confirm intentional.
                let low = self.dict_content.len() - bytes_from_dict;
                let high = low + match_length;
                let dict_slice = &self.dict_content[low..high];
                self.buffer.extend(dict_slice);
            }
            Ok(())
        } else {
            Err(DecodeBufferError::OffsetTooBig {
                offset,
                buf_len: self.buffer.len(),
            })
        }
    }

    /// Check if and how many bytes can currently be drawn from the buffer
    pub fn can_drain_to_window_size(&self) -> Option<usize> {
        if self.buffer.len() > self.window_size {
            Some(self.buffer.len() - self.window_size)
        } else {
            None
        }
    }

    //How many bytes can be drained if the window_size does not have to be maintained
    pub fn can_drain(&self) -> usize {
        self.buffer.len()
    }

    /// Drain as much as possible while retaining enough so that decoding si still possible with the required window_size
    /// At best call only if can_drain_to_window_size reports a 'high' number of bytes to reduce allocations
    pub fn drain_to_window_size(&mut self) -> Option<Vec<u8>> {
        //TODO investigate if it is possible to return the std::vec::Drain iterator directly without collecting here
        match self.can_drain_to_window_size() {
            None => None,
            Some(can_drain) => {
                let mut vec = Vec::with_capacity(can_drain);
                self.drain_to(can_drain, |buf| {
                    vec.extend_from_slice(buf);
                    (buf.len(), Ok(()))
                })
                .ok()?;
                Some(vec)
            }
        }
    }

    /// Like [`Self::drain_to_window_size`], but write the drained bytes into
    /// `sink` instead of allocating; returns how many bytes were written.
    pub fn drain_to_window_size_writer(&mut self, mut sink: impl Write) -> Result<usize, Error> {
        match self.can_drain_to_window_size() {
            None => Ok(0),
            Some(can_drain) => self.drain_to(can_drain, |buf| write_all_bytes(&mut sink, buf)),
        }
    }

    /// drain the buffer completely
    pub fn drain(&mut self) -> Vec<u8> {
        let (slice1, slice2) = self.buffer.as_slices();
        #[cfg(feature = "hash")]
        {
            self.hash.write(slice1);
            self.hash.write(slice2);
        }
        let mut vec = Vec::with_capacity(slice1.len() + slice2.len());
        vec.extend_from_slice(slice1);
        vec.extend_from_slice(slice2);
        self.buffer.clear();
        vec
    }

    /// Drain the buffer completely into `sink`; returns how many bytes were written.
    pub fn drain_to_writer(&mut self, mut sink: impl Write) -> Result<usize, Error> {
        let write_limit = self.buffer.len();
        self.drain_to(write_limit, |buf| write_all_bytes(&mut sink, buf))
    }

    /// Copy as much buffered data as fits into `target`, draining what was copied.
    pub fn read_all(&mut self, target: &mut [u8]) -> Result<usize, Error> {
        let amount = self.buffer.len().min(target.len());
        let mut written = 0;
        self.drain_to(amount, |buf| {
            target[written..][..buf.len()].copy_from_slice(buf);
            written += buf.len();
            (buf.len(), Ok(()))
        })?;
        Ok(amount)
    }

    /// Semantics of write_bytes:
    /// Should dump as many of the provided bytes as possible to whatever sink until no bytes are left or an error is encountered
    /// Return how many bytes have actually been dumped to the sink.
    fn drain_to(
        &mut self,
        amount: usize,
        mut write_bytes: impl FnMut(&[u8]) -> (usize, Result<(), Error>),
    ) -> Result<usize, Error> {
        if amount == 0 {
            return Ok(0);
        }

        // Removes the successfully-written bytes from the ring buffer on drop,
        // so partially-drained state stays consistent even on early return.
        struct DrainGuard<'a> {
            buffer: &'a mut RingBuffer,
            amount: usize,
        }

        impl Drop for DrainGuard<'_> {
            fn drop(&mut self) {
                if self.amount != 0 {
                    self.buffer.drop_first_n(self.amount);
                }
            }
        }

        let mut drain_guard = DrainGuard {
            buffer: &mut self.buffer,
            amount: 0,
        };

        let (slice1, slice2) = drain_guard.buffer.as_slices();
        let n1 = slice1.len().min(amount);
        let n2 = slice2.len().min(amount - n1);

        if n1 != 0 {
            let (written1, res1) = write_bytes(&slice1[..n1]);
            #[cfg(feature = "hash")]
            self.hash.write(&slice1[..written1]);
            drain_guard.amount += written1;

            // Apparently this is what clippy thinks is the best way of expressing this
            res1?;

            // Only if the first call to write_bytes was not a partial write we can continue with slice2
            // Partial writes SHOULD never happen without res1 being an error, but lets just protect against it anyways.
            if written1 == n1 && n2 != 0 {
                let (written2, res2) = write_bytes(&slice2[..n2]);
                #[cfg(feature = "hash")]
                self.hash.write(&slice2[..written2]);
                drain_guard.amount += written2;

                // Apparently this is what clippy thinks is the best way of expressing this
                res2?;
            }
        }

        let amount_written = drain_guard.amount;
        // Make sure we don't accidentally drop `DrainGuard` earlier.
        drop(drain_guard);
        Ok(amount_written)
    }
}
/// Like Write::write_all but returns partial write length even on error
///
/// A sink that reports `Ok(0)` stops the loop without an error, so the caller
/// can observe the partial write through the returned count.
fn write_all_bytes(mut sink: impl Write, buf: &[u8]) -> (usize, Result<(), Error>) {
    let mut done = 0;
    loop {
        if done == buf.len() {
            return (done, Ok(()));
        }
        match sink.write(&buf[done..]) {
            Ok(0) => return (done, Ok(())),
            Ok(n) => done += n,
            Err(e) => return (done, Err(e)),
        }
    }
}
#[cfg(test)]
mod tests {
    use super::DecodeBuffer;
    use crate::io::{Error, ErrorKind, Write};
    extern crate std;
    use alloc::vec;
    use alloc::vec::Vec;

    /// Draining must cope with sinks that only accept a bounded number of
    /// bytes per write call (partial writes).
    #[test]
    fn short_writer() {
        // Accepts at most `write_len` bytes per call.
        struct ShortWriter {
            buf: Vec<u8>,
            write_len: usize,
        }
        impl Write for ShortWriter {
            fn write(&mut self, buf: &[u8]) -> std::result::Result<usize, Error> {
                if buf.len() > self.write_len {
                    self.buf.extend_from_slice(&buf[..self.write_len]);
                    Ok(self.write_len)
                } else {
                    self.buf.extend_from_slice(buf);
                    Ok(buf.len())
                }
            }
            fn flush(&mut self) -> std::result::Result<(), Error> {
                Ok(())
            }
        }

        let mut short_writer = ShortWriter {
            buf: vec![],
            write_len: 10,
        };

        let mut decode_buf = DecodeBuffer::new(100);
        decode_buf.push(b"0123456789");
        decode_buf.repeat(10, 90).unwrap();
        let repeats = 1000;
        for _ in 0..repeats {
            assert_eq!(decode_buf.len(), 100);
            decode_buf.repeat(10, 50).unwrap();
            assert_eq!(decode_buf.len(), 150);
            // Only the 50 bytes beyond the window may be drained.
            decode_buf
                .drain_to_window_size_writer(&mut short_writer)
                .unwrap();
            assert_eq!(decode_buf.len(), 100);
        }

        assert_eq!(short_writer.buf.len(), repeats * 50);
        decode_buf.drain_to_writer(&mut short_writer).unwrap();
        assert_eq!(short_writer.buf.len(), repeats * 50 + 100);
    }

    /// Draining must make progress across sinks that intermittently return
    /// `WouldBlock` errors.
    #[test]
    fn wouldblock_writer() {
        // Fails with WouldBlock after every `block_every` successful writes.
        struct WouldblockWriter {
            buf: Vec<u8>,
            last_blocked: usize,
            block_every: usize,
        }
        impl Write for WouldblockWriter {
            fn write(&mut self, buf: &[u8]) -> std::result::Result<usize, Error> {
                if self.last_blocked < self.block_every {
                    self.buf.extend_from_slice(buf);
                    self.last_blocked += 1;
                    Ok(buf.len())
                } else {
                    self.last_blocked = 0;
                    Err(Error::from(ErrorKind::WouldBlock))
                }
            }
            fn flush(&mut self) -> std::result::Result<(), Error> {
                Ok(())
            }
        }

        let mut short_writer = WouldblockWriter {
            buf: vec![],
            last_blocked: 0,
            block_every: 5,
        };

        let mut decode_buf = DecodeBuffer::new(100);
        decode_buf.push(b"0123456789");
        decode_buf.repeat(10, 90).unwrap();
        let repeats = 1000;
        for _ in 0..repeats {
            assert_eq!(decode_buf.len(), 100);
            decode_buf.repeat(10, 50).unwrap();
            assert_eq!(decode_buf.len(), 150);
            // Retry on WouldBlock until everything beyond the window is drained.
            loop {
                match decode_buf.drain_to_window_size_writer(&mut short_writer) {
                    Ok(written) => {
                        if written == 0 {
                            break;
                        }
                    }
                    Err(e) => {
                        if e.kind() == ErrorKind::WouldBlock {
                            continue;
                        } else {
                            panic!("Unexpected error {:?}", e);
                        }
                    }
                }
            }
            assert_eq!(decode_buf.len(), 100);
        }

        assert_eq!(short_writer.buf.len(), repeats * 50);
        loop {
            match decode_buf.drain_to_writer(&mut short_writer) {
                Ok(written) => {
                    if written == 0 {
                        break;
                    }
                }
                Err(e) => {
                    if e.kind() == ErrorKind::WouldBlock {
                        continue;
                    } else {
                        panic!("Unexpected error {:?}", e);
                    }
                }
            }
        }
        assert_eq!(short_writer.buf.len(), repeats * 50 + 100);
    }
}

104
vendor/ruzstd/src/decoding/dictionary.rs vendored Normal file
View File

@@ -0,0 +1,104 @@
use alloc::vec::Vec;
use core::convert::TryInto;
use crate::decoding::errors::DictionaryDecodeError;
use crate::decoding::scratch::FSEScratch;
use crate::decoding::scratch::HuffmanScratch;
/// Zstandard includes support for "raw content" dictionaries, that store bytes optionally used
/// during sequence execution.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format>
pub struct Dictionary {
    /// A 4 byte value used by decoders to check if they can use
    /// the correct dictionary. This value must not be zero.
    pub id: u32,
    /// A dictionary can contain an entropy table, either FSE or
    /// Huffman. These are the FSE tables (offsets, match lengths, literal lengths).
    pub fse: FSEScratch,
    /// A dictionary can contain an entropy table, either FSE or
    /// Huffman. This is the Huffman table.
    pub huf: HuffmanScratch,
    /// The content of a dictionary acts as a "past" in front of data
    /// to compress or decompress,
    /// so it can be referenced in sequence commands.
    /// As long as the amount of data decoded from this frame is less than or
    /// equal to Window_Size, sequence commands may specify offsets longer than
    /// the total length of decoded output so far to reference back to the
    /// dictionary, even parts of the dictionary with offsets larger than Window_Size.
    /// After the total output has surpassed Window_Size however,
    /// this is no longer allowed and the dictionary is no longer accessible
    pub dict_content: Vec<u8>,
    /// The 3 most recent offsets are stored so that they can be used
    /// during sequence execution, see
    /// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#repeat-offsets>
    /// for more.
    pub offset_hist: [u32; 3],
}
/// This 4 byte (little endian) magic number refers to the start of a dictionary
/// (reads as the u32 value 0xEC30A437).
pub const MAGIC_NUM: [u8; 4] = [0x37, 0xA4, 0x30, 0xEC];
impl Dictionary {
    /// Parses the dictionary from `raw` and set the tables
    /// it returns the dict_id for checking with the frame's `dict_id`
    ///
    /// Serialized layout: magic (4 bytes) | dict id (4 bytes, LE) | entropy tables
    /// (Huffman, then FSE offsets/match-lengths/literal-lengths) | 3 repeat offsets
    /// (4 bytes LE each) | raw dictionary content.
    ///
    /// # Panics
    /// NOTE(review): `raw` is indexed directly (`raw[..4]`, `raw[4..8]`, and 12 bytes
    /// of offsets after the tables), so this panics on a truncated input — callers
    /// must pass a complete serialized dictionary. Confirm whether an error variant
    /// should cover short inputs instead.
    pub fn decode_dict(raw: &[u8]) -> Result<Dictionary, DictionaryDecodeError> {
        let mut new_dict = Dictionary {
            id: 0,
            fse: FSEScratch::new(),
            huf: HuffmanScratch::new(),
            dict_content: Vec::new(),
            // Placeholder values only; always overwritten by the three offsets
            // parsed from the dictionary below.
            offset_hist: [2, 4, 8],
        };
        let magic_num: [u8; 4] = raw[..4].try_into().expect("optimized away");
        if magic_num != MAGIC_NUM {
            return Err(DictionaryDecodeError::BadMagicNum { got: magic_num });
        }
        let dict_id = raw[4..8].try_into().expect("optimized away");
        let dict_id = u32::from_le_bytes(dict_id);
        new_dict.id = dict_id;
        let raw_tables = &raw[8..];
        // Each build_decoder call reports how many bytes its table description
        // consumed, letting us advance through the concatenated tables.
        let huf_size = new_dict.huf.table.build_decoder(raw_tables)?;
        let raw_tables = &raw_tables[huf_size as usize..];
        let of_size = new_dict.fse.offsets.build_decoder(
            raw_tables,
            crate::decoding::sequence_section_decoder::OF_MAX_LOG,
        )?;
        let raw_tables = &raw_tables[of_size..];
        let ml_size = new_dict.fse.match_lengths.build_decoder(
            raw_tables,
            crate::decoding::sequence_section_decoder::ML_MAX_LOG,
        )?;
        let raw_tables = &raw_tables[ml_size..];
        let ll_size = new_dict.fse.literal_lengths.build_decoder(
            raw_tables,
            crate::decoding::sequence_section_decoder::LL_MAX_LOG,
        )?;
        let raw_tables = &raw_tables[ll_size..];
        // The initial repeat offsets for frames that use this dictionary.
        let offset1 = raw_tables[0..4].try_into().expect("optimized away");
        let offset1 = u32::from_le_bytes(offset1);
        let offset2 = raw_tables[4..8].try_into().expect("optimized away");
        let offset2 = u32::from_le_bytes(offset2);
        let offset3 = raw_tables[8..12].try_into().expect("optimized away");
        let offset3 = u32::from_le_bytes(offset3);
        new_dict.offset_hist[0] = offset1;
        new_dict.offset_hist[1] = offset2;
        new_dict.offset_hist[2] = offset3;
        // Everything after the tables and offsets is the raw dictionary content.
        let raw_content = &raw_tables[12..];
        new_dict.dict_content.extend(raw_content);
        Ok(new_dict)
    }
}

1187
vendor/ruzstd/src/decoding/errors.rs vendored Normal file

File diff suppressed because it is too large Load Diff

241
vendor/ruzstd/src/decoding/frame.rs vendored Normal file
View File

@@ -0,0 +1,241 @@
use crate::common::{MAGIC_NUM, MAX_WINDOW_SIZE, MIN_WINDOW_SIZE};
use crate::decoding::errors::{FrameDescriptorError, FrameHeaderError, ReadFrameHeaderError};
use crate::io::Read;
/// Read a single serialized frame from the reader and return a tuple containing the parsed frame and the number of bytes read.
pub fn read_frame_header(mut r: impl Read) -> Result<(FrameHeader, u8), ReadFrameHeaderError> {
    use ReadFrameHeaderError as err;
    let mut buf = [0u8; 4];
    r.read_exact(&mut buf).map_err(err::MagicNumberReadError)?;
    let mut bytes_read = 4;
    let magic_num = u32::from_le_bytes(buf);
    // Skippable frames have a magic number in this interval
    if (0x184D2A50..=0x184D2A5F).contains(&magic_num) {
        // The next 4 bytes give the size of the skippable content; report it
        // through the error so the caller can seek past the frame.
        r.read_exact(&mut buf)
            .map_err(err::FrameDescriptorReadError)?;
        let skip_size = u32::from_le_bytes(buf);
        return Err(ReadFrameHeaderError::SkipFrame {
            magic_number: magic_num,
            length: skip_size,
        });
    }
    if magic_num != MAGIC_NUM {
        return Err(ReadFrameHeaderError::BadMagicNumber(magic_num));
    }
    // The one-byte frame header descriptor decides which optional fields follow.
    r.read_exact(&mut buf[0..1])
        .map_err(err::FrameDescriptorReadError)?;
    let desc = FrameDescriptor(buf[0]);
    bytes_read += 1;
    let mut frame_header = FrameHeader {
        descriptor: FrameDescriptor(desc.0),
        dict_id: None,
        frame_content_size: 0,
        window_descriptor: 0,
    };
    // The window descriptor byte is omitted when the single segment flag is set.
    if !desc.single_segment_flag() {
        r.read_exact(&mut buf[0..1])
            .map_err(err::WindowDescriptorReadError)?;
        frame_header.window_descriptor = buf[0];
        bytes_read += 1;
    }
    let dict_id_len = desc.dictionary_id_bytes()? as usize;
    if dict_id_len != 0 {
        let buf = &mut buf[..dict_id_len];
        r.read_exact(buf).map_err(err::DictionaryIdReadError)?;
        bytes_read += dict_id_len;
        // Assemble the little-endian dictionary id byte by byte.
        let mut dict_id = 0u32;
        #[allow(clippy::needless_range_loop)]
        for i in 0..dict_id_len {
            dict_id += (buf[i] as u32) << (8 * i);
        }
        // An id of 0 means "no dictionary"; keep dict_id as None in that case.
        if dict_id != 0 {
            frame_header.dict_id = Some(dict_id);
        }
    }
    let fcs_len = desc.frame_content_size_bytes()? as usize;
    if fcs_len != 0 {
        let mut fcs_buf = [0u8; 8];
        let fcs_buf = &mut fcs_buf[..fcs_len];
        r.read_exact(fcs_buf)
            .map_err(err::FrameContentSizeReadError)?;
        bytes_read += fcs_len;
        // Assemble the little-endian frame content size byte by byte.
        let mut fcs = 0u64;
        #[allow(clippy::needless_range_loop)]
        for i in 0..fcs_len {
            fcs += (fcs_buf[i] as u64) << (8 * i);
        }
        // The 2-byte field encodes `value - 256` per the zstd format spec.
        if fcs_len == 2 {
            fcs += 256;
        }
        frame_header.frame_content_size = fcs;
    }
    // The header is at most 14 bytes, so the byte count always fits in a u8.
    Ok((frame_header, bytes_read as u8))
}
/// A frame header has a variable size, with a minimum of 2 bytes, and a maximum of 14 bytes.
pub struct FrameHeader {
    /// The frame header descriptor byte; it encodes which of the optional
    /// header fields below are present in the serialized frame.
    pub descriptor: FrameDescriptor,
    /// The `Window_Descriptor` field contains the minimum size of a memory buffer needed to
    /// decompress the entire frame.
    ///
    /// This byte is not included in the frame header when the `Single_Segment_flag` is set.
    ///
    /// Bits 7-3 refer to the `Exponent`, where bits 2-0 refer to the `Mantissa`.
    ///
    /// To determine the size of a window, the following formula can be used:
    /// ```text
    /// windowLog = 10 + Exponent;
    /// windowBase = 1 << windowLog;
    /// windowAdd = (windowBase / 8) * Mantissa;
    /// Window_Size = windowBase + windowAdd;
    /// ```
    /// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor>
    window_descriptor: u8,
    /// The `Dictionary_ID` field contains the ID of the dictionary to be used to decode the frame.
    /// When this value is not present, it's up to the decoder to know which dictionary to use.
    dict_id: Option<u32>,
    /// The size of the original/uncompressed content.
    frame_content_size: u64,
}
impl FrameHeader {
    /// Determine the size (in bytes) of the window needed to decode this frame.
    ///
    /// With the single segment flag set this is simply the frame content size;
    /// otherwise it is derived from the exponent/mantissa packed into the window
    /// descriptor byte and validated against the allowed bounds.
    pub fn window_size(&self) -> Result<u64, FrameHeaderError> {
        if self.descriptor.single_segment_flag() {
            return Ok(self.frame_content_size());
        }
        // Bits 7-3 hold the exponent, bits 2-0 the mantissa.
        let exponent = u64::from(self.window_descriptor >> 3);
        let mantissa = u64::from(self.window_descriptor & 0x07);
        let window_base: u64 = 1 << (10 + exponent);
        let window_size = window_base + (window_base / 8) * mantissa;
        if window_size < MIN_WINDOW_SIZE {
            Err(FrameHeaderError::WindowTooSmall { got: window_size })
        } else if window_size >= MAX_WINDOW_SIZE {
            Err(FrameHeaderError::WindowTooBig { got: window_size })
        } else {
            Ok(window_size)
        }
    }
    /// The ID (if provided) of the dictionary required to decode this frame.
    pub fn dictionary_id(&self) -> Option<u32> {
        self.dict_id
    }
    /// Obtain the uncompressed size (in bytes) of the frame contents.
    pub fn frame_content_size(&self) -> u64 {
        self.frame_content_size
    }
}
/// The first byte is called the `Frame Header Descriptor`, and it describes what other fields
/// are present.
///
/// The individual flags are exposed through the accessor methods on this type.
pub struct FrameDescriptor(pub u8);
impl FrameDescriptor {
    /// `Frame_Content_Size_flag`: the two most significant bits of the descriptor.
    ///
    /// Encodes how many bytes the `Frame_Content_Size` header field occupies:
    ///
    /// | Flag Value (decimal) | Size of the `Frame_Content_Size` field in bytes |
    /// | -- | -- |
    /// | 0 | 0 or 1 (1 when the `Single_Segment_flag` is set, else 0) |
    /// | 1 | 2 |
    /// | 2 | 4 |
    /// | 3 | 8 |
    pub fn frame_content_size_flag(&self) -> u8 {
        (self.0 & 0b1100_0000) >> 6
    }
    /// Reserved bit (bit 3); a compliant decoder **must ensure** it is zero.
    #[expect(dead_code)]
    pub fn reserved_flag(&self) -> bool {
        self.0 & 0b0000_1000 != 0
    }
    /// `Single_Segment_flag` (bit 5): when set, data must be regenerated within a single
    /// continuous memory segment. The `Window_Descriptor` byte is then skipped, but
    /// `Frame_Content_Size` is present, and the decoder must allocate at least that much memory.
    pub fn single_segment_flag(&self) -> bool {
        self.0 & 0b0010_0000 != 0
    }
    /// `Content_Checksum_flag` (bit 2): when set, a 32 bit `Content_Checksum`
    /// trails the frame.
    pub fn content_checksum_flag(&self) -> bool {
        self.0 & 0b0000_0100 != 0
    }
    /// `Dictionary_ID_flag`: the two least significant bits of the descriptor.
    ///
    /// Encodes the width of the `Dictionary_ID` header field:
    ///
    /// | Value (Decimal) | `DID_Field_Size` (bytes) |
    /// | -- | -- |
    /// | 0 | 0 |
    /// | 1 | 1 |
    /// | 2 | 2 |
    /// | 3 | 4 |
    pub fn dict_id_flag(&self) -> u8 {
        self.0 & 0b0000_0011
    }
    /// Width in bytes of the `Frame_Content_Size` header field.
    /// A value of zero means the field is not present within the header.
    pub fn frame_content_size_bytes(&self) -> Result<u8, FrameDescriptorError> {
        match self.frame_content_size_flag() {
            0 if self.single_segment_flag() => Ok(1),
            0 => Ok(0),
            1 => Ok(2),
            2 => Ok(4),
            3 => Ok(8),
            other => Err(FrameDescriptorError::InvalidFrameContentSizeFlag { got: other }),
        }
    }
    /// Width in bytes of the `Dictionary_ID` header field. A value of zero means no id is
    /// stored, and "It's up to the decoder to know which dictionary to use."
    pub fn dictionary_id_bytes(&self) -> Result<u8, FrameDescriptorError> {
        // The flag is masked to two bits, so the error arm is unreachable in
        // practice; it is kept so the match stays total over u8.
        match self.dict_id_flag() {
            0 => Ok(0),
            1 => Ok(1),
            2 => Ok(2),
            3 => Ok(4),
            other => Err(FrameDescriptorError::InvalidFrameContentSizeFlag { got: other }),
        }
    }
}

View File

@@ -0,0 +1,597 @@
//! Framedecoder is the main low-level struct users interact with to decode zstd frames
//!
//! Zstandard compressed data is made of one or more frames. Each frame is independent and can be
//! decompressed independently of other frames. This module contains structures
//! and utilities that can be used to decode a frame.
use super::frame;
use crate::decoding;
use crate::decoding::dictionary::Dictionary;
use crate::decoding::errors::FrameDecoderError;
use crate::decoding::scratch::DecoderScratch;
use crate::io::{Error, Read, Write};
use alloc::collections::BTreeMap;
use alloc::vec::Vec;
use core::convert::TryInto;
/// While the maximum window size allowed by the spec is significantly larger,
/// our implementation limits it to 100mb (1024 * 1024 * 100 bytes) to protect
/// against malformed frames that would otherwise trigger huge allocations.
const MAXIMUM_ALLOWED_WINDOW_SIZE: u64 = 1024 * 1024 * 100;
/// Low level Zstandard decoder that can be used to decompress frames with fine control over when and how many bytes are decoded.
///
/// This decoder is able to decode frames only partially and gives control
/// over how many bytes/blocks will be decoded at a time (so you don't have to decode a 10GB file into memory all at once).
/// It reads bytes as needed from a provided source and can be read from to collect partial results.
///
/// If you want to just read the whole frame with an `io::Read` without having to deal with manually calling [FrameDecoder::decode_blocks]
/// you can use the provided [crate::decoding::StreamingDecoder] which wraps this FrameDecoder.
///
/// Workflow is as follows:
/// ```
/// use ruzstd::decoding::BlockDecodingStrategy;
///
/// # #[cfg(feature = "std")]
/// use std::io::{Read, Write};
///
/// // no_std environments can use the crate's own Read traits
/// # #[cfg(not(feature = "std"))]
/// use ruzstd::io::{Read, Write};
///
/// fn decode_this(mut file: impl Read) {
///     //Create a new decoder
///     let mut frame_dec = ruzstd::decoding::FrameDecoder::new();
///     let mut result = Vec::new();
///
///     // Use reset or init to make the decoder ready to decode the frame from the io::Read
///     frame_dec.reset(&mut file).unwrap();
///
///     // Loop until the frame has been decoded completely
///     while !frame_dec.is_finished() {
///         // decode (roughly) batch_size many bytes
///         frame_dec.decode_blocks(&mut file, BlockDecodingStrategy::UptoBytes(1024)).unwrap();
///
///         // read from the decoder to collect bytes from the internal buffer
///         let bytes_read = frame_dec.read(result.as_mut_slice()).unwrap();
///
///         // then do something with it
///         do_something(&result[0..bytes_read]);
///     }
///
///     // handle the last chunk of data
///     while frame_dec.can_collect() > 0 {
///         let x = frame_dec.read(result.as_mut_slice()).unwrap();
///
///         do_something(&result[0..x]);
///     }
/// }
///
/// fn do_something(data: &[u8]) {
///     # #[cfg(feature = "std")]
///     std::io::stdout().write_all(data).unwrap();
/// }
/// ```
pub struct FrameDecoder {
    /// `None` until init()/reset() has successfully parsed a frame header.
    state: Option<FrameDecoderState>,
    /// Dictionaries registered via add_dict(), keyed by their dictionary id.
    dicts: BTreeMap<u32, Dictionary>,
}
/// Mutable per-frame decoding state; created or reset whenever a new frame header is parsed.
struct FrameDecoderState {
    pub frame_header: frame::FrameHeader,
    /// Scratch buffers (decode buffer, entropy tables) sized to the frame's window.
    decoder_scratch: DecoderScratch,
    /// Set once the block marked `last_block` has been decoded.
    frame_finished: bool,
    /// Number of blocks decoded so far in this frame.
    block_counter: usize,
    /// Total bytes consumed from the source, including the frame header.
    bytes_read_counter: u64,
    /// Checksum read from the end of the frame, once available.
    check_sum: Option<u32>,
    /// Id of the dictionary currently loaded into the scratch, if any.
    using_dict: Option<u32>,
}
/// Controls how much work [FrameDecoder::decode_blocks] does before returning.
pub enum BlockDecodingStrategy {
    /// Decode every remaining block of the frame.
    All,
    /// Decode at most this many blocks before returning.
    UptoBlocks(usize),
    /// Decode until at least this many new bytes are in the decode buffer
    /// (or the frame ends).
    UptoBytes(usize),
}
impl FrameDecoderState {
    /// Parse a frame header from `source` and build a fresh state for decoding that frame.
    ///
    /// Rejects frames whose declared window exceeds [MAXIMUM_ALLOWED_WINDOW_SIZE],
    /// mirroring the guard in [Self::reset], so that a malformed frame cannot force
    /// an enormous buffer allocation on the very first init.
    pub fn new(source: impl Read) -> Result<FrameDecoderState, FrameDecoderError> {
        let (frame, header_size) = frame::read_frame_header(source)?;
        let window_size = frame.window_size()?;
        // Consistency fix: this bound was previously only enforced in reset(),
        // letting the first initialization bypass the allocation cap.
        if window_size > MAXIMUM_ALLOWED_WINDOW_SIZE {
            return Err(FrameDecoderError::WindowSizeTooBig {
                requested: window_size,
            });
        }
        Ok(FrameDecoderState {
            frame_header: frame,
            frame_finished: false,
            block_counter: 0,
            decoder_scratch: DecoderScratch::new(window_size as usize),
            bytes_read_counter: u64::from(header_size),
            check_sum: None,
            using_dict: None,
        })
    }
    /// Re-initialize this state for a new frame read from `source`, reusing the
    /// existing scratch buffers where possible.
    pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> {
        let (frame_header, header_size) = frame::read_frame_header(source)?;
        let window_size = frame_header.window_size()?;
        if window_size > MAXIMUM_ALLOWED_WINDOW_SIZE {
            return Err(FrameDecoderError::WindowSizeTooBig {
                requested: window_size,
            });
        }
        self.frame_header = frame_header;
        self.frame_finished = false;
        self.block_counter = 0;
        self.decoder_scratch.reset(window_size as usize);
        self.bytes_read_counter = u64::from(header_size);
        self.check_sum = None;
        self.using_dict = None;
        Ok(())
    }
}
impl Default for FrameDecoder {
fn default() -> Self {
Self::new()
}
}
impl FrameDecoder {
    /// This will create a new decoder without allocating anything yet.
    /// init()/reset() will allocate all needed buffers if it is the first time this decoder is used
    /// else they just reset these buffers with no further allocations
    pub fn new() -> FrameDecoder {
        FrameDecoder {
            state: None,
            dicts: BTreeMap::new(),
        }
    }
    /// init() will allocate all needed buffers if it is the first time this decoder is used
    /// else they just reset these buffers with no further allocations
    ///
    /// Note that all bytes currently in the decodebuffer from any previous frame will be lost. Collect them with collect()/collect_to_writer()
    ///
    /// equivalent to reset()
    pub fn init(&mut self, source: impl Read) -> Result<(), FrameDecoderError> {
        self.reset(source)
    }
    /// reset() will allocate all needed buffers if it is the first time this decoder is used
    /// else they just reset these buffers with no further allocations
    ///
    /// Note that all bytes currently in the decodebuffer from any previous frame will be lost. Collect them with collect()/collect_to_writer()
    ///
    /// equivalent to init()
    pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> {
        use FrameDecoderError as err;
        let state = match &mut self.state {
            Some(s) => {
                s.reset(source)?;
                s
            }
            None => {
                self.state = Some(FrameDecoderState::new(source)?);
                self.state.as_mut().unwrap()
            }
        };
        // If the frame names a dictionary it must have been registered via add_dict().
        if let Some(dict_id) = state.frame_header.dictionary_id() {
            let dict = self
                .dicts
                .get(&dict_id)
                .ok_or(err::DictNotProvided { dict_id })?;
            state.decoder_scratch.init_from_dict(dict);
            state.using_dict = Some(dict_id);
        }
        Ok(())
    }
    /// Add a dict to the FrameDecoder that can be used when needed. The FrameDecoder uses the appropriate one dynamically
    pub fn add_dict(&mut self, dict: Dictionary) -> Result<(), FrameDecoderError> {
        self.dicts.insert(dict.id, dict);
        Ok(())
    }
    /// Load the registered dictionary with id `dict_id` into the decoder state,
    /// regardless of what the current frame header requests.
    ///
    /// Errors if the decoder has not been initialized yet or the id is unknown.
    pub fn force_dict(&mut self, dict_id: u32) -> Result<(), FrameDecoderError> {
        use FrameDecoderError as err;
        let Some(state) = self.state.as_mut() else {
            return Err(err::NotYetInitialized);
        };
        let dict = self
            .dicts
            .get(&dict_id)
            .ok_or(err::DictNotProvided { dict_id })?;
        state.decoder_scratch.init_from_dict(dict);
        state.using_dict = Some(dict_id);
        Ok(())
    }
    /// Returns how many bytes the frame contains after decompression
    pub fn content_size(&self) -> u64 {
        match &self.state {
            None => 0,
            Some(s) => s.frame_header.frame_content_size(),
        }
    }
    /// Returns the checksum that was read from the data. Only available after all bytes have been read. It is the last 4 bytes of a zstd-frame
    pub fn get_checksum_from_data(&self) -> Option<u32> {
        let state = match &self.state {
            None => return None,
            Some(s) => s,
        };
        state.check_sum
    }
    /// Returns the checksum that was calculated while decoding.
    /// Only a sensible value after all decoded bytes have been collected/read from the FrameDecoder
    #[cfg(feature = "hash")]
    pub fn get_calculated_checksum(&self) -> Option<u32> {
        use core::hash::Hasher;
        let state = match &self.state {
            None => return None,
            Some(s) => s,
        };
        let cksum_64bit = state.decoder_scratch.buffer.hash.finish();
        //truncate to lower 32bit because reasons...
        Some(cksum_64bit as u32)
    }
    /// Counter for how many bytes have been consumed while decoding the frame
    pub fn bytes_read_from_source(&self) -> u64 {
        let state = match &self.state {
            None => return 0,
            Some(s) => s,
        };
        state.bytes_read_counter
    }
    /// Whether the current frames last block has been decoded yet
    /// If this returns true you can call the drain* functions to get all content
    /// (the read() function will drain automatically if this returns true)
    pub fn is_finished(&self) -> bool {
        let state = match &self.state {
            None => return true,
            Some(s) => s,
        };
        // With the checksum flag set, the frame is only complete once the
        // trailing checksum itself has also been read from the source.
        if state.frame_header.descriptor.content_checksum_flag() {
            state.frame_finished && state.check_sum.is_some()
        } else {
            state.frame_finished
        }
    }
    /// Counter for how many blocks have already been decoded
    pub fn blocks_decoded(&self) -> usize {
        let state = match &self.state {
            None => return 0,
            Some(s) => s,
        };
        state.block_counter
    }
    /// Decodes blocks from a reader. It requires that the framedecoder has been initialized first.
    /// The Strategy influences how many blocks will be decoded before the function returns
    /// This is important if you want to manage memory consumption carefully. If you don't care
    /// about that you can just choose the strategy "All" and have all blocks of the frame decoded into the buffer
    pub fn decode_blocks(
        &mut self,
        mut source: impl Read,
        strat: BlockDecodingStrategy,
    ) -> Result<bool, FrameDecoderError> {
        use FrameDecoderError as err;
        let state = self.state.as_mut().ok_or(err::NotYetInitialized)?;
        let mut block_dec = decoding::block_decoder::new();
        // Snapshot for the UptoBytes/UptoBlocks strategies below.
        let buffer_size_before = state.decoder_scratch.buffer.len();
        let block_counter_before = state.block_counter;
        loop {
            vprintln!("################");
            vprintln!("Next Block: {}", state.block_counter);
            vprintln!("################");
            let (block_header, block_header_size) = block_dec
                .read_block_header(&mut source)
                .map_err(err::FailedToReadBlockHeader)?;
            state.bytes_read_counter += u64::from(block_header_size);
            vprintln!();
            vprintln!(
                "Found {} block with size: {}, which will be of size: {}",
                block_header.block_type,
                block_header.content_size,
                block_header.decompressed_size
            );
            let bytes_read_in_block_body = block_dec
                .decode_block_content(&block_header, &mut state.decoder_scratch, &mut source)
                .map_err(err::FailedToReadBlockBody)?;
            state.bytes_read_counter += bytes_read_in_block_body;
            state.block_counter += 1;
            vprintln!("Output: {}", state.decoder_scratch.buffer.len());
            if block_header.last_block {
                state.frame_finished = true;
                // The optional content checksum directly follows the last block.
                if state.frame_header.descriptor.content_checksum_flag() {
                    let mut chksum = [0u8; 4];
                    source
                        .read_exact(&mut chksum)
                        .map_err(err::FailedToReadChecksum)?;
                    state.bytes_read_counter += 4;
                    let chksum = u32::from_le_bytes(chksum);
                    state.check_sum = Some(chksum);
                }
                break;
            }
            match strat {
                BlockDecodingStrategy::All => { /* keep going */ }
                BlockDecodingStrategy::UptoBlocks(n) => {
                    if state.block_counter - block_counter_before >= n {
                        break;
                    }
                }
                BlockDecodingStrategy::UptoBytes(n) => {
                    if state.decoder_scratch.buffer.len() - buffer_size_before >= n {
                        break;
                    }
                }
            }
        }
        Ok(state.frame_finished)
    }
    /// Collect bytes and retain window_size bytes while decoding is still going on.
    /// After decoding of the frame (is_finished() == true) has finished it will collect all remaining bytes
    pub fn collect(&mut self) -> Option<Vec<u8>> {
        let finished = self.is_finished();
        let state = self.state.as_mut()?;
        if finished {
            Some(state.decoder_scratch.buffer.drain())
        } else {
            state.decoder_scratch.buffer.drain_to_window_size()
        }
    }
    /// Collect bytes and retain window_size bytes while decoding is still going on.
    /// After decoding of the frame (is_finished() == true) has finished it will collect all remaining bytes
    pub fn collect_to_writer(&mut self, w: impl Write) -> Result<usize, Error> {
        let finished = self.is_finished();
        let state = match &mut self.state {
            None => return Ok(0),
            Some(s) => s,
        };
        if finished {
            state.decoder_scratch.buffer.drain_to_writer(w)
        } else {
            state.decoder_scratch.buffer.drain_to_window_size_writer(w)
        }
    }
    /// How many bytes can currently be collected from the decodebuffer, while decoding is going on this will be lower than the actual decodbuffer size
    /// because window_size bytes need to be retained for decoding.
    /// After decoding of the frame (is_finished() == true) has finished it will report all remaining bytes
    pub fn can_collect(&self) -> usize {
        let finished = self.is_finished();
        let state = match &self.state {
            None => return 0,
            Some(s) => s,
        };
        if finished {
            state.decoder_scratch.buffer.can_drain()
        } else {
            state
                .decoder_scratch
                .buffer
                .can_drain_to_window_size()
                .unwrap_or(0)
        }
    }
    /// Decodes as many blocks as possible from the source slice and reads from the decodebuffer into the target slice
    /// The source slice may contain only parts of a frame but must contain at least one full block to make progress
    ///
    /// By all means use decode_blocks if you have a io.Reader available. This is just for compatibility with other decompressors
    /// which try to serve an old-style c api
    ///
    /// Returns (read, written), if read == 0 then the source did not contain a full block and further calls with the same
    /// input will not make any progress!
    ///
    /// Note that no kind of block can be bigger than 128kb.
    /// So to be safe use at least 128*1024 (max block content size) + 3 (block_header size) + 18 (max frame_header size) bytes as your source buffer
    ///
    /// You may call this function with an empty source after all bytes have been decoded. This is equivalent to just call decoder.read(&mut target)
    pub fn decode_from_to(
        &mut self,
        source: &[u8],
        target: &mut [u8],
    ) -> Result<(usize, usize), FrameDecoderError> {
        use FrameDecoderError as err;
        let bytes_read_at_start = match &self.state {
            Some(s) => s.bytes_read_counter,
            None => 0,
        };
        if !self.is_finished() || self.state.is_none() {
            let mut mt_source = source;
            if self.state.is_none() {
                self.init(&mut mt_source)?;
            }
            //pseudo block to scope "state" so we can borrow self again after the block
            {
                let state = match &mut self.state {
                    Some(s) => s,
                    None => panic!("Bug in library"),
                };
                let mut block_dec = decoding::block_decoder::new();
                if state.frame_header.descriptor.content_checksum_flag()
                    && state.frame_finished
                    && state.check_sum.is_none()
                {
                    //this block is needed if the checksum were the only 4 bytes that were not included in the last decode_from_to call for a frame
                    if mt_source.len() >= 4 {
                        let chksum = mt_source[..4].try_into().expect("optimized away");
                        state.bytes_read_counter += 4;
                        let chksum = u32::from_le_bytes(chksum);
                        state.check_sum = Some(chksum);
                    }
                    // NOTE(review): this reports 4 bytes consumed even when fewer than
                    // 4 bytes were available and nothing was read — confirm whether
                    // callers rely on this or it should report 0 in that case.
                    return Ok((4, 0));
                }
                loop {
                    //check if there are enough bytes for the next header
                    if mt_source.len() < 3 {
                        break;
                    }
                    let (block_header, block_header_size) = block_dec
                        .read_block_header(&mut mt_source)
                        .map_err(err::FailedToReadBlockHeader)?;
                    // check the needed size for the block before updating counters.
                    // If not enough bytes are in the source, the header will have to be read again, so act like we never read it in the first place
                    if mt_source.len() < block_header.content_size as usize {
                        break;
                    }
                    state.bytes_read_counter += u64::from(block_header_size);
                    let bytes_read_in_block_body = block_dec
                        .decode_block_content(
                            &block_header,
                            &mut state.decoder_scratch,
                            &mut mt_source,
                        )
                        .map_err(err::FailedToReadBlockBody)?;
                    state.bytes_read_counter += bytes_read_in_block_body;
                    state.block_counter += 1;
                    if block_header.last_block {
                        state.frame_finished = true;
                        if state.frame_header.descriptor.content_checksum_flag() {
                            //if there are enough bytes handle this here. Else the block at the start of this function will handle it at the next call
                            if mt_source.len() >= 4 {
                                let chksum = mt_source[..4].try_into().expect("optimized away");
                                state.bytes_read_counter += 4;
                                let chksum = u32::from_le_bytes(chksum);
                                state.check_sum = Some(chksum);
                            }
                        }
                        break;
                    }
                }
            }
        }
        let result_len = self.read(target).map_err(err::FailedToDrainDecodebuffer)?;
        let bytes_read_at_end = match &mut self.state {
            Some(s) => s.bytes_read_counter,
            None => panic!("Bug in library"),
        };
        let read_len = bytes_read_at_end - bytes_read_at_start;
        Ok((read_len as usize, result_len))
    }
    /// Decode multiple frames into the output slice.
    ///
    /// `input` must contain an exact number of frames.
    ///
    /// `output` must be large enough to hold the decompressed data. If you don't know
    /// how large the output will be, use [`FrameDecoder::decode_blocks`] instead.
    ///
    /// This calls [`FrameDecoder::init`], and all bytes currently in the decoder will be lost.
    ///
    /// Returns the number of bytes written to `output`.
    pub fn decode_all(
        &mut self,
        mut input: &[u8],
        mut output: &mut [u8],
    ) -> Result<usize, FrameDecoderError> {
        let mut total_bytes_written = 0;
        while !input.is_empty() {
            match self.init(&mut input) {
                Ok(_) => {}
                // Skippable frames are not an error here; just jump over them.
                Err(FrameDecoderError::ReadFrameHeaderError(
                    crate::decoding::errors::ReadFrameHeaderError::SkipFrame { length, .. },
                )) => {
                    input = input
                        .get(length as usize..)
                        .ok_or(FrameDecoderError::FailedToSkipFrame)?;
                    continue;
                }
                Err(e) => return Err(e),
            };
            loop {
                self.decode_blocks(&mut input, BlockDecodingStrategy::UptoBytes(1024 * 1024))?;
                let bytes_written = self
                    .read(output)
                    .map_err(FrameDecoderError::FailedToDrainDecodebuffer)?;
                output = &mut output[bytes_written..];
                total_bytes_written += bytes_written;
                // If the buffer still holds collectable bytes after a full read,
                // the caller-provided output slice was too small.
                if self.can_collect() != 0 {
                    return Err(FrameDecoderError::TargetTooSmall);
                }
                if self.is_finished() {
                    break;
                }
            }
        }
        Ok(total_bytes_written)
    }
    /// Decode multiple frames into the extra capacity of the output vector.
    ///
    /// `input` must contain an exact number of frames.
    ///
    /// `output` must have enough extra capacity to hold the decompressed data.
    /// This function will not reallocate or grow the vector. If you don't know
    /// how large the output will be, use [`FrameDecoder::decode_blocks`] instead.
    ///
    /// This calls [`FrameDecoder::init`], and all bytes currently in the decoder will be lost.
    ///
    /// The length of the output vector is updated to include the decompressed data.
    /// The length is not changed if an error occurs.
    pub fn decode_all_to_vec(
        &mut self,
        input: &[u8],
        output: &mut Vec<u8>,
    ) -> Result<(), FrameDecoderError> {
        let len = output.len();
        let cap = output.capacity();
        // Temporarily expose the spare capacity as a zeroed slice to decode into.
        output.resize(cap, 0);
        match self.decode_all(input, &mut output[len..]) {
            Ok(bytes_written) => {
                let new_len = core::cmp::min(len + bytes_written, cap); // Sanitizes `bytes_written`.
                output.resize(new_len, 0);
                Ok(())
            }
            Err(e) => {
                // Roll the length back so the vector is unchanged on error.
                output.resize(len, 0);
                Err(e)
            }
        }
    }
}
/// Read bytes from the decode_buffer that are no longer needed. While the frame is not yet finished
/// this will retain window_size bytes, else it will drain it completely
impl Read for FrameDecoder {
    fn read(&mut self, target: &mut [u8]) -> Result<usize, Error> {
        // Before init()/reset() there is nothing to hand out.
        let Some(state) = self.state.as_mut() else {
            return Ok(0);
        };
        let buffer = &mut state.decoder_scratch.buffer;
        if state.frame_finished {
            // Frame complete: every buffered byte may be handed out.
            buffer.read_all(target)
        } else {
            // Mid-frame: the buffer keeps window_size bytes for back-references.
            buffer.read(target)
        }
    }
}

View File

@@ -0,0 +1,159 @@
//! This module contains the decompress_literals function, used to take a
//! parsed literals header and a source and decompress it.
use super::super::blocks::literals_section::{LiteralsSection, LiteralsSectionType};
use super::scratch::HuffmanScratch;
use crate::bit_io::BitReaderReversed;
use crate::decoding::errors::DecompressLiteralsError;
use crate::huff0::HuffmanDecoder;
use alloc::vec::Vec;
/// Decode and decompress the provided literals section into `target`, returning the number of bytes read.
pub fn decode_literals(
    section: &LiteralsSection,
    scratch: &mut HuffmanScratch,
    source: &[u8],
    target: &mut Vec<u8>,
) -> Result<u32, DecompressLiteralsError> {
    match section.ls_type {
        LiteralsSectionType::Raw => {
            // Raw literals are copied through verbatim.
            let len = section.regenerated_size as usize;
            target.extend(&source[0..len]);
            Ok(section.regenerated_size)
        }
        LiteralsSectionType::RLE => {
            // A single byte, repeated regenerated_size times.
            let fill_byte = source[0];
            let new_len = target.len() + section.regenerated_size as usize;
            target.resize(new_len, fill_byte);
            Ok(1)
        }
        LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
            // Huffman-coded literals; returns the number of source bytes consumed.
            decompress_literals(section, scratch, source, target)
        }
    }
}
/// Decompress the provided literals section and source into the provided `target`.
/// This function is used when the literals section is `Compressed` or `Treeless`
///
/// Returns the number of bytes read.
fn decompress_literals(
    section: &LiteralsSection,
    scratch: &mut HuffmanScratch,
    source: &[u8],
    target: &mut Vec<u8>,
) -> Result<u32, DecompressLiteralsError> {
    use DecompressLiteralsError as err;
    let compressed_size = section.compressed_size.ok_or(err::MissingCompressedSize)? as usize;
    let num_streams = section.num_streams.ok_or(err::MissingNumStreams)?;
    target.reserve(section.regenerated_size as usize);
    let source = &source[0..compressed_size];
    let mut bytes_read = 0;
    match section.ls_type {
        LiteralsSectionType::Compressed => {
            //read Huffman tree description
            bytes_read += scratch.table.build_decoder(source)?;
            vprintln!("Built huffman table using {} bytes", bytes_read);
        }
        LiteralsSectionType::Treeless => {
            // Treeless sections reuse the table from an earlier block; it must exist.
            if scratch.table.max_num_bits == 0 {
                return Err(err::UninitializedHuffmanTable);
            }
        }
        _ => { /* nothing to do, huffman tree has been provided by previous block */ }
    }
    let source = &source[bytes_read as usize..];
    if num_streams == 4 {
        //build jumptable
        // The 6-byte jump table stores the sizes of the first three streams as
        // u16 little-endian values; the fourth stream takes whatever remains.
        if source.len() < 6 {
            return Err(err::MissingBytesForJumpHeader { got: source.len() });
        }
        let jump1 = source[0] as usize + ((source[1] as usize) << 8);
        let jump2 = jump1 + source[2] as usize + ((source[3] as usize) << 8);
        let jump3 = jump2 + source[4] as usize + ((source[5] as usize) << 8);
        bytes_read += 6;
        let source = &source[6..];
        if source.len() < jump3 {
            return Err(err::MissingBytesForLiterals {
                got: source.len(),
                needed: jump3,
            });
        }
        //decode 4 streams
        let stream1 = &source[..jump1];
        let stream2 = &source[jump1..jump2];
        let stream3 = &source[jump2..jump3];
        let stream4 = &source[jump3..];
        for stream in &[stream1, stream2, stream3, stream4] {
            let mut decoder = HuffmanDecoder::new(&scratch.table);
            let mut br = BitReaderReversed::new(stream);
            //skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found
            let mut skipped_bits = 0;
            loop {
                let val = br.get_bits(1);
                skipped_bits += 1;
                if val == 1 || skipped_bits > 8 {
                    break;
                }
            }
            if skipped_bits > 8 {
                //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
                return Err(DecompressLiteralsError::ExtraPadding { skipped_bits });
            }
            decoder.init_state(&mut br);
            // Decode symbols until the reader has consumed the stream exactly; the
            // decoder reads max_num_bits past the end, hence the negative target.
            while br.bits_remaining() > -(scratch.table.max_num_bits as isize) {
                target.push(decoder.decode_symbol());
                decoder.next_state(&mut br);
            }
            if br.bits_remaining() != -(scratch.table.max_num_bits as isize) {
                return Err(DecompressLiteralsError::BitstreamReadMismatch {
                    read_til: br.bits_remaining(),
                    expected: -(scratch.table.max_num_bits as isize),
                });
            }
        }
        bytes_read += source.len() as u32;
    } else {
        //just decode the one stream
        assert!(num_streams == 1);
        let mut decoder = HuffmanDecoder::new(&scratch.table);
        let mut br = BitReaderReversed::new(source);
        let mut skipped_bits = 0;
        loop {
            let val = br.get_bits(1);
            skipped_bits += 1;
            if val == 1 || skipped_bits > 8 {
                break;
            }
        }
        if skipped_bits > 8 {
            //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
            return Err(DecompressLiteralsError::ExtraPadding { skipped_bits });
        }
        decoder.init_state(&mut br);
        // NOTE(review): unlike the 4-stream path above, this branch performs no
        // BitstreamReadMismatch check after the loop — confirm whether that is
        // intentional (the final length check below still catches most corruption).
        while br.bits_remaining() > -(scratch.table.max_num_bits as isize) {
            target.push(decoder.decode_symbol());
            decoder.next_state(&mut br);
        }
        bytes_read += source.len() as u32;
    }
    // The section header declared the regenerated size; anything else is corruption.
    if target.len() != section.regenerated_size as usize {
        return Err(DecompressLiteralsError::DecodedLiteralCountMismatch {
            decoded: target.len(),
            expected: section.regenerated_size as usize,
        });
    }
    Ok(bytes_read)
}

19
vendor/ruzstd/src/decoding/mod.rs vendored Normal file
View File

@@ -0,0 +1,19 @@
//! Structures and utilities used for decoding zstd formatted data
pub mod errors;
mod frame_decoder;
mod streaming_decoder;
pub use frame_decoder::{BlockDecodingStrategy, FrameDecoder};
pub use streaming_decoder::StreamingDecoder;
pub(crate) mod block_decoder;
pub(crate) mod decode_buffer;
pub(crate) mod dictionary;
pub(crate) mod frame;
pub(crate) mod literals_section_decoder;
mod ringbuffer;
#[allow(dead_code)]
pub(crate) mod scratch;
pub(crate) mod sequence_execution;
pub(crate) mod sequence_section_decoder;

887
vendor/ruzstd/src/decoding/ringbuffer.rs vendored Normal file
View File

@@ -0,0 +1,887 @@
use alloc::alloc::{alloc, dealloc};
use core::{alloc::Layout, ptr::NonNull, slice};
/// A growable circular byte buffer. One slot is always kept unused as the
/// full/empty sentinel, so usable capacity is `cap - 1`.
pub struct RingBuffer {
    // Safety invariants:
    //
    // 1.
    //    a. `buf` must be a valid allocation of capacity `cap`
    //    b. ...unless `cap=0`, in which case it is dangling
    // 2. If tail≥head
    //    a. `head..tail` must contain initialized memory.
    //    b. Else, `head..` and `..tail` must be initialized
    // 3. `head` and `tail` are in bounds (≥ 0 and < cap)
    // 4. `tail` is never `cap` except for a full buffer, and instead uses the value `0`. In other words, `tail` always points to the place
    //    where the next element would go (if there is space)
    buf: NonNull<u8>,
    cap: usize,
    head: usize,
    tail: usize,
}
// SAFETY: RingBuffer does not hold any thread specific values -> it can be sent to another thread -> RingBuffer is Send
unsafe impl Send for RingBuffer {}
// SAFETY: Ringbuffer does not provide unsyncronized interior mutability which makes &RingBuffer Send -> RingBuffer is Sync
unsafe impl Sync for RingBuffer {}
impl RingBuffer {
/// Create a new, empty ring buffer. No allocation happens until data is added.
pub fn new() -> Self {
    RingBuffer {
        // SAFETY: Upholds invariant 1a as stated (cap == 0, so dangling is fine)
        buf: NonNull::dangling(),
        cap: 0,
        // SAFETY: Upholds invariant 2-4
        head: 0,
        tail: 0,
    }
}
/// Return the number of bytes currently stored in the buffer.
pub fn len(&self) -> usize {
    // The data may be split in two contiguous runs; the length is their sum.
    let (front, back) = self.data_slice_lengths();
    front + back
}
/// Return how many bytes can be appended before a reallocation is needed.
pub fn free(&self) -> usize {
    // One slot is always reserved as the full/empty sentinel, hence the -1.
    let (front, back) = self.free_slice_lengths();
    (front + back).saturating_sub(1)
}
/// Empty the buffer and reset the head and tail. The allocation is kept.
pub fn clear(&mut self) {
    // SAFETY: Upholds invariant 2, trivially (no initialized range claimed)
    // SAFETY: Upholds invariant 3; 0 is always valid
    self.head = 0;
    self.tail = 0;
}
/// Ensure that there's space for `amount` elements in the buffer.
pub fn reserve(&mut self, amount: usize) {
    // Only grow by what is actually missing; a zero shortfall means the
    // current free space already covers the request.
    let missing = amount.saturating_sub(self.free());
    if missing > 0 {
        self.reserve_amortized(missing);
    }
}
// Grow the allocation so at least `amount` more bytes fit. Cold path: growth
// is amortized by always jumping to the next power of two (+1 sentinel slot).
#[inline(never)]
#[cold]
fn reserve_amortized(&mut self, amount: usize) {
    // SAFETY: if we were successfully able to construct this layout when we allocated then it's also valid to do so now
    let current_layout = unsafe { Layout::array::<u8>(self.cap).unwrap_unchecked() };
    // Always have at least 1 unused element as the sentinel.
    let new_cap = usize::max(
        self.cap.next_power_of_two(),
        (self.cap + amount).next_power_of_two(),
    ) + 1;
    // Check that the capacity isn't bigger than isize::MAX, which is the max allowed by LLVM, or that
    // we are on a >= 64 bit system which will never allow that much memory to be allocated
    #[allow(clippy::assertions_on_constants)]
    {
        debug_assert!(usize::BITS >= 64 || new_cap < isize::MAX as usize);
    }
    let new_layout = Layout::array::<u8>(new_cap)
        .unwrap_or_else(|_| panic!("Could not create layout for u8 array of size {}", new_cap));
    // alloc the new memory region and panic if alloc fails
    // TODO maybe rework this to generate an error?
    let new_buf = unsafe {
        let new_buf = alloc(new_layout);
        NonNull::new(new_buf).expect("Allocating new space for the ringbuffer failed")
    };
    // If we had data before, copy it over to the newly alloced memory region,
    // linearizing the two halves into one contiguous run starting at 0.
    if self.cap > 0 {
        let ((s1_ptr, s1_len), (s2_ptr, s2_len)) = self.data_slice_parts();
        unsafe {
            // SAFETY: Upholds invariant 2, we end up populating (0..(len₁ + len₂))
            new_buf.as_ptr().copy_from_nonoverlapping(s1_ptr, s1_len);
            new_buf
                .as_ptr()
                .add(s1_len)
                .copy_from_nonoverlapping(s2_ptr, s2_len);
            dealloc(self.buf.as_ptr(), current_layout);
        }
        // SAFETY: Upholds invariant 3, head is 0 and in bounds, tail is only ever `cap` if the buffer
        // is entirely full
        self.tail = s1_len + s2_len;
        self.head = 0;
    }
    // SAFETY: Upholds invariant 1: the buffer was just allocated correctly
    self.buf = new_buf;
    self.cap = new_cap;
}
/// Append a single byte to the end of the buffer, growing if needed.
#[allow(dead_code)]
pub fn push_back(&mut self, byte: u8) {
    // After reserve(1), cap > 0 is guaranteed, so the modulo below is safe.
    self.reserve(1);
    // SAFETY: Upholds invariant 2 by writing initialized memory
    unsafe { self.buf.as_ptr().add(self.tail).write(byte) };
    // SAFETY: Upholds invariant 3 by wrapping `tail` around
    self.tail = (self.tail + 1) % self.cap;
}
/// Fetch the byte stored at the selected index from the buffer, returning it, or
/// `None` if the index is out of bounds.
#[allow(dead_code)]
pub fn get(&self, idx: usize) -> Option<u8> {
    if idx < self.len() {
        // SAFETY: Establishes invariants on memory being initialized and the range being in-bounds
        // (Invariants 2 & 3). `idx < len` implies `cap > 0`, so the modulo is safe.
        let idx = (self.head + idx) % self.cap;
        Some(unsafe { self.buf.as_ptr().add(idx).read() })
    } else {
        None
    }
}
/// Append the provided data to the end of `self`, growing the buffer if needed.
pub fn extend(&mut self, data: &[u8]) {
    let len = data.len();
    let ptr = data.as_ptr();
    if len == 0 {
        return;
    }
    self.reserve(len);
    debug_assert!(self.len() + len <= self.cap - 1);
    debug_assert!(self.free() >= len, "free: {} len: {}", self.free(), len);
    // The free space may be split in two runs (after tail, and before head);
    // split the incoming data across them accordingly.
    let ((f1_ptr, f1_len), (f2_ptr, f2_len)) = self.free_slice_parts();
    debug_assert!(f1_len + f2_len >= len, "{} + {} < {}", f1_len, f2_len, len);
    let in_f1 = usize::min(len, f1_len);
    let in_f2 = len - in_f1;
    debug_assert!(in_f1 + in_f2 == len);
    unsafe {
        // SAFETY: `in_f₁ + in_f₂ = len`, so this writes `len` bytes total
        // upholding invariant 2
        if in_f1 > 0 {
            f1_ptr.copy_from_nonoverlapping(ptr, in_f1);
        }
        if in_f2 > 0 {
            f2_ptr.copy_from_nonoverlapping(ptr.add(in_f1), in_f2);
        }
    }
    // SAFETY: Upholds invariant 3 by wrapping `tail` around.
    self.tail = (self.tail + len) % self.cap;
}
/// Advance head past `amount` elements, effectively removing
/// them from the buffer.
///
/// `amount` is clamped to `self.len()` in release builds (debug builds
/// assert that it is in range).
pub fn drop_first_n(&mut self, amount: usize) {
    debug_assert!(amount <= self.len());
    let amount = usize::min(amount, self.len());
    // Nothing to drop. This also guards against `% self.cap` dividing by
    // zero when the buffer has never allocated (cap == 0, len == 0) —
    // previously `drop_first_n(0)` on a fresh buffer would panic.
    if amount == 0 {
        return;
    }
    // SAFETY: we maintain invariant 2 here since this will always lead to a smaller buffer
    // for amount≤len
    self.head = (self.head + amount) % self.cap;
}
/// Return the size of the two contiguous occupied sections of memory used
/// by the buffer.
// SAFETY: other code relies on this pointing to initialized halves of the buffer only
fn data_slice_lengths(&self) -> (usize, usize) {
    if self.tail >= self.head {
        // Data is one contiguous run from head to tail.
        (self.tail - self.head, 0)
    } else {
        // Data wraps: head..cap, then 0..tail.
        (self.cap - self.head, self.tail)
    }
}
// SAFETY: other code relies on this pointing to initialized halves of the buffer only
/// Return pointers to the head and tail, and the length of each section.
fn data_slice_parts(&self) -> ((*const u8, usize), (*const u8, usize)) {
    let (len_after_head, len_to_tail) = self.data_slice_lengths();
    (
        // First run starts at `head`; second (wrapped) run starts at index 0.
        (unsafe { self.buf.as_ptr().add(self.head) }, len_after_head),
        (self.buf.as_ptr(), len_to_tail),
    )
}
/// Return references to each part of the ring buffer. The second slice is
/// empty when the data does not wrap around.
pub fn as_slices(&self) -> (&[u8], &[u8]) {
    let (s1, s2) = self.data_slice_parts();
    unsafe {
        // SAFETY: relies on the behavior of data_slice_parts for producing initialized memory
        let s1 = slice::from_raw_parts(s1.0, s1.1);
        let s2 = slice::from_raw_parts(s2.0, s2.1);
        (s1, s2)
    }
}
// SAFETY: other code relies on this producing the lengths of free zones
// at the beginning/end of the buffer. Everything else must be initialized
/// Returns the size of the two unoccupied sections of memory used by the buffer.
/// The tuple is `(len_to_head, len_after_tail)`.
fn free_slice_lengths(&self) -> (usize, usize) {
    if self.tail < self.head {
        // Free space is one contiguous run between tail and head.
        (0, self.head - self.tail)
    } else {
        // Free space wraps: tail..cap, then 0..head.
        (self.head, self.cap - self.tail)
    }
}
/// Returns mutable references to the available space and the size of that available space,
/// for the two sections in the buffer.
// SAFETY: Other code relies on this pointing to the free zones, data after the first and before the second must
// be valid
fn free_slice_parts(&self) -> ((*mut u8, usize), (*mut u8, usize)) {
    let (len_to_head, len_after_tail) = self.free_slice_lengths();
    (
        // First free run starts at `tail`; second (wrapped) run starts at 0.
        (unsafe { self.buf.as_ptr().add(self.tail) }, len_after_tail),
        (self.buf.as_ptr(), len_to_head),
    )
}
/// Copies elements from the provided range to the end of the buffer.
///
/// Panics if `start + len` exceeds the current length.
#[allow(dead_code)]
pub fn extend_from_within(&mut self, start: usize, len: usize) {
    if start + len > self.len() {
        panic!(
            "Calls to this functions must respect start ({}) + len ({}) <= self.len() ({})!",
            start,
            len,
            self.len()
        );
    }
    self.reserve(len);
    // SAFETY: Requirements checked:
    // 1. explicitly checked above, resulting in a panic if it does not hold
    // 2. explicitly reserved enough memory
    unsafe { self.extend_from_within_unchecked(start, len) }
}
/// Copies data from the provided range to the end of the buffer, without
/// first verifying that the unoccupied capacity is available.
///
/// SAFETY:
/// For this to be safe two requirements need to hold:
/// 1. start + len <= self.len() so we do not copy uninitialised memory
/// 2. More than len reserved space so we do not write out-of-bounds
#[warn(unsafe_op_in_unsafe_fn)]
pub unsafe fn extend_from_within_unchecked(&mut self, start: usize, len: usize) {
    debug_assert!(start + len <= self.len());
    debug_assert!(self.free() >= len);
    // Four cases are distinguished below, depending on whether the readable
    // (source) region and the writable (destination) region wrap around the
    // end of the allocation.
    if self.head < self.tail {
        // Continuous source section and possibly non continuous write section:
        //
        //            H           T
        // Read:  ____XXXXSSSSXXXX________
        // Write: ________________DDDD____
        //
        // H: Head position (first readable byte)
        // T: Tail position (first writable byte)
        // X: Uninvolved bytes in the readable section
        // S: Source bytes, to be copied to D bytes
        // D: Destination bytes, going to be copied from S bytes
        // _: Uninvolved bytes in the writable section
        let after_tail = usize::min(len, self.cap - self.tail);
        let src = (
            // SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
            unsafe { self.buf.as_ptr().add(self.head + start) }.cast_const(),
            // Src length (see above diagram)
            self.tail - self.head - start,
        );
        let dst = (
            // SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
            unsafe { self.buf.as_ptr().add(self.tail) },
            // Dst length (see above diagram)
            self.cap - self.tail,
        );
        // SAFETY: `src` points at initialized data, `dst` points to writable memory
        // and does not overlap `src`.
        unsafe { copy_bytes_overshooting(src, dst, after_tail) }
        if after_tail < len {
            // The write section was not continuous:
            //
            //            H           T
            // Read:  ____XXXXSSSSXXXX__
            // Write: DD______________DD
            //
            // H: Head position (first readable byte)
            // T: Tail position (first writable byte)
            // X: Uninvolved bytes in the readable section
            // S: Source bytes, to be copied to D bytes
            // D: Destination bytes, going to be copied from S bytes
            // _: Uninvolved bytes in the writable section
            let src = (
                // SAFETY: we are still within the memory range of `buf`
                unsafe { src.0.add(after_tail) },
                // Src length (see above diagram)
                src.1 - after_tail,
            );
            let dst = (
                self.buf.as_ptr(),
                // Dst length overflowing (see above diagram)
                self.head,
            );
            // SAFETY: `src` points at initialized data, `dst` points to writable memory
            // and does not overlap `src`.
            unsafe { copy_bytes_overshooting(src, dst, len - after_tail) }
        }
    } else {
        #[allow(clippy::collapsible_else_if)]
        if self.head + start > self.cap {
            // Continuous read section and destination section:
            //
            //        T           H
            // Read:  XXSSSSXXXX____________XX
            // Write: __________DDDD__________
            //
            // H: Head position (first readable byte)
            // T: Tail position (first writable byte)
            // X: Uninvolved bytes in the readable section
            // S: Source bytes, to be copied to D bytes
            // D: Destination bytes, going to be copied from S bytes
            // _: Uninvolved bytes in the writable section
            let start = (self.head + start) % self.cap;
            let src = (
                // SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
                unsafe { self.buf.as_ptr().add(start) }.cast_const(),
                // Src length (see above diagram)
                self.tail - start,
            );
            let dst = (
                // SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
                unsafe { self.buf.as_ptr().add(self.tail) },
                // Dst length (see above diagram)
                self.head - self.tail,
            );
            // SAFETY: `src` points at initialized data, `dst` points to writable memory
            // and does not overlap `src`.
            unsafe { copy_bytes_overshooting(src, dst, len) }
        } else {
            // Possibly non continuous read section and continuous destination section:
            //
            //        T           H
            // Read:  XXXX____________XXSSSSXX
            // Write: ____DDDD________________
            //
            // H: Head position (first readable byte)
            // T: Tail position (first writable byte)
            // X: Uninvolved bytes in the readable section
            // S: Source bytes, to be copied to D bytes
            // D: Destination bytes, going to be copied from S bytes
            // _: Uninvolved bytes in the writable section
            let after_start = usize::min(len, self.cap - self.head - start);
            let src = (
                // SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
                unsafe { self.buf.as_ptr().add(self.head + start) }.cast_const(),
                // Src length - chunk 1 (see above diagram on the right)
                self.cap - self.head - start,
            );
            let dst = (
                // SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
                unsafe { self.buf.as_ptr().add(self.tail) },
                // Dst length (see above diagram)
                self.head - self.tail,
            );
            // SAFETY: `src` points at initialized data, `dst` points to writable memory
            // and does not overlap `src`.
            unsafe { copy_bytes_overshooting(src, dst, after_start) }
            if after_start < len {
                // The read section was not continuous:
                //
                //        T           H
                // Read:  SSXXXXXX____________XXSS
                // Write: ________DDDD____________
                //
                // H: Head position (first readable byte)
                // T: Tail position (first writable byte)
                // X: Uninvolved bytes in the readable section
                // S: Source bytes, to be copied to D bytes
                // D: Destination bytes, going to be copied from S bytes
                // _: Uninvolved bytes in the writable section
                let src = (
                    self.buf.as_ptr().cast_const(),
                    // Src length - chunk 2 (see above diagram on the left)
                    self.tail,
                );
                let dst = (
                    // SAFETY: we are still within the memory range of `buf`
                    unsafe { dst.0.add(after_start) },
                    // Dst length (see above diagram)
                    dst.1 - after_start,
                );
                // SAFETY: `src` points at initialized data, `dst` points to writable memory
                // and does not overlap `src`.
                unsafe { copy_bytes_overshooting(src, dst, len - after_start) }
            }
        }
    }
    // SAFETY: Upholds invariant 3 by wrapping `tail` around.
    self.tail = (self.tail + len) % self.cap;
}
#[allow(dead_code)]
/// This function is functionally the same as [RingBuffer::extend_from_within_unchecked],
/// but it does not contain any branching operations.
///
/// SAFETY:
/// Needs start + len <= self.len()
/// And more than len reserved space
pub unsafe fn extend_from_within_unchecked_branchless(&mut self, start: usize, len: usize) {
    // data slices in raw parts
    let ((s1_ptr, s1_len), (s2_ptr, s2_len)) = self.data_slice_parts();
    debug_assert!(len <= s1_len + s2_len, "{} > {} + {}", len, s1_len, s2_len);
    // calc the actually wanted slices in raw parts:
    // the source range `start..start+len` may straddle the two data slices,
    // producing up to two source chunks m1 (from s1) and m2 (from s2).
    let start_in_s1 = usize::min(s1_len, start);
    let end_in_s1 = usize::min(s1_len, start + len);
    let m1_ptr = s1_ptr.add(start_in_s1);
    let m1_len = end_in_s1 - start_in_s1;
    debug_assert!(end_in_s1 <= s1_len);
    debug_assert!(start_in_s1 <= s1_len);
    let start_in_s2 = start.saturating_sub(s1_len);
    let end_in_s2 = start_in_s2 + (len - m1_len);
    let m2_ptr = s2_ptr.add(start_in_s2);
    let m2_len = end_in_s2 - start_in_s2;
    debug_assert!(start_in_s2 <= s2_len);
    debug_assert!(end_in_s2 <= s2_len);
    debug_assert_eq!(len, m1_len + m2_len);
    // the free slices, must hold: f1_len + f2_len >= m1_len + m2_len
    let ((f1_ptr, f1_len), (f2_ptr, f2_len)) = self.free_slice_parts();
    debug_assert!(f1_len + f2_len >= m1_len + m2_len);
    // calc how many bytes from which source chunk go into which free chunk
    let m1_in_f1 = usize::min(m1_len, f1_len);
    let m1_in_f2 = m1_len - m1_in_f1;
    let m2_in_f1 = usize::min(f1_len - m1_in_f1, m2_len);
    let m2_in_f2 = m2_len - m2_in_f1;
    debug_assert_eq!(m1_len, m1_in_f1 + m1_in_f2);
    debug_assert_eq!(m2_len, m2_in_f1 + m2_in_f2);
    debug_assert!(f1_len >= m1_in_f1 + m2_in_f1);
    debug_assert!(f2_len >= m1_in_f2 + m2_in_f2);
    debug_assert_eq!(len, m1_in_f1 + m2_in_f1 + m1_in_f2 + m2_in_f2);
    debug_assert!(self.buf.as_ptr().add(self.cap) > f1_ptr.add(m1_in_f1 + m2_in_f1));
    debug_assert!(self.buf.as_ptr().add(self.cap) > f2_ptr.add(m1_in_f2 + m2_in_f2));
    // m1 spilling into f2 and m2 landing in f1 are mutually exclusive
    debug_assert!((m1_in_f2 > 0) ^ (m2_in_f1 > 0) || (m1_in_f2 == 0 && m2_in_f1 == 0));
    copy_with_checks(
        m1_ptr, m2_ptr, f1_ptr, f2_ptr, m1_in_f1, m2_in_f1, m1_in_f2, m2_in_f2,
    );
    // SAFETY: Upholds invariant 3 by wrapping `tail` around.
    self.tail = (self.tail + len) % self.cap;
}
}
impl Drop for RingBuffer {
    /// Deallocate the backing storage, if any was ever allocated.
    fn drop(&mut self) {
        // cap == 0 means `buf` is dangling (invariant 1b) and must not be freed.
        if self.cap == 0 {
            return;
        }
        // SAFETY: if we were successfully able to construct this layout when we allocated then it's also valid to do so now
        // Relies on / establishes invariant 1
        let current_layout = unsafe { Layout::array::<u8>(self.cap).unwrap_unchecked() };
        unsafe {
            dealloc(self.buf.as_ptr(), current_layout);
        }
    }
}
/// Similar to ptr::copy_nonoverlapping
///
/// But it might overshoot the desired copy length if deemed useful
///
/// src and dst specify the entire length they are eligible for reading/writing respectively
/// in addition to the desired copy length.
///
/// This function will then copy in chunks and might copy up to chunk size - 1 more bytes from src to dst
/// if that operation does not read/write memory that does not belong to src/dst.
///
/// The chunk size is not part of the contract and may change depending on the target platform.
///
/// If that isn't possible we just fall back to ptr::copy_nonoverlapping
#[inline(always)]
unsafe fn copy_bytes_overshooting(
    src: (*const u8, usize),
    dst: (*mut u8, usize),
    copy_at_least: usize,
) {
    // By default use usize as the copy size
    #[cfg(all(not(target_feature = "sse2"), not(target_feature = "neon")))]
    type CopyType = usize;
    // Use u128 if we detect a simd feature
    #[cfg(target_feature = "neon")]
    type CopyType = u128;
    #[cfg(target_feature = "sse2")]
    type CopyType = u128;
    const COPY_AT_ONCE_SIZE: usize = core::mem::size_of::<CopyType>();
    let min_buffer_size = usize::min(src.1, dst.1);
    // Can copy in just one read+write, very common case
    if min_buffer_size >= COPY_AT_ONCE_SIZE && copy_at_least <= COPY_AT_ONCE_SIZE {
        dst.0
            .cast::<CopyType>()
            .write_unaligned(src.0.cast::<CopyType>().read_unaligned())
    } else {
        // Round the copy length up to a whole number of chunks (the overshoot).
        let copy_multiple = copy_at_least.next_multiple_of(COPY_AT_ONCE_SIZE);
        // Can copy in multiple simple instructions
        if min_buffer_size >= copy_multiple {
            let mut src_ptr = src.0.cast::<CopyType>();
            let src_ptr_end = src.0.add(copy_multiple).cast::<CopyType>();
            let mut dst_ptr = dst.0.cast::<CopyType>();
            while src_ptr < src_ptr_end {
                dst_ptr.write_unaligned(src_ptr.read_unaligned());
                src_ptr = src_ptr.add(1);
                dst_ptr = dst_ptr.add(1);
            }
        } else {
            // Fall back to standard memcopy
            dst.0.copy_from_nonoverlapping(src.0, copy_at_least);
        }
    }
    // Sanity check: the first `copy_at_least` bytes must now match exactly.
    debug_assert_eq!(
        slice::from_raw_parts(src.0, copy_at_least),
        slice::from_raw_parts(dst.0, copy_at_least)
    );
}
/// Scatter the two source chunks (`m1`, `m2`) into the two free chunks
/// (`f1`, `f2`) unconditionally: `f1` receives the first `m1_in_f1` bytes of
/// `m1` followed by the first `m2_in_f1` bytes of `m2`; `f2` receives the
/// remainders of `m1` and `m2`, in that order.
///
/// SAFETY: all pointers must be valid for the given lengths and the
/// source/destination ranges must not overlap.
#[allow(dead_code)]
#[inline(always)]
#[allow(clippy::too_many_arguments)]
unsafe fn copy_without_checks(
    m1_ptr: *const u8,
    m2_ptr: *const u8,
    f1_ptr: *mut u8,
    f2_ptr: *mut u8,
    m1_in_f1: usize,
    m2_in_f1: usize,
    m1_in_f2: usize,
    m2_in_f2: usize,
) {
    // The second halves of each source chunk start where the first halves end.
    let m1_rest = m1_ptr.add(m1_in_f1);
    let m2_rest = m2_ptr.add(m2_in_f1);
    // First free region: head of m1, then head of m2.
    f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
    f1_ptr.add(m1_in_f1).copy_from_nonoverlapping(m2_ptr, m2_in_f1);
    // Second free region: tail of m1, then tail of m2.
    f2_ptr.copy_from_nonoverlapping(m1_rest, m1_in_f2);
    f2_ptr.add(m1_in_f2).copy_from_nonoverlapping(m2_rest, m2_in_f2);
}
/// Same scatter as [`copy_without_checks`], but each of the four copies is
/// skipped when its length is zero, so pointers for empty chunks are never
/// dereferenced.
///
/// SAFETY: every pointer must be valid for its (non-zero) length and the
/// source/destination ranges must not overlap.
#[allow(dead_code)]
#[inline(always)]
#[allow(clippy::too_many_arguments)]
unsafe fn copy_with_checks(
    m1_ptr: *const u8,
    m2_ptr: *const u8,
    f1_ptr: *mut u8,
    f2_ptr: *mut u8,
    m1_in_f1: usize,
    m2_in_f1: usize,
    m1_in_f2: usize,
    m2_in_f2: usize,
) {
    // First free region: head of m1, then head of m2.
    if m1_in_f1 > 0 {
        f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
    }
    if m2_in_f1 > 0 {
        f1_ptr.add(m1_in_f1).copy_from_nonoverlapping(m2_ptr, m2_in_f1);
    }
    // Second free region: tail of m1, then tail of m2.
    if m1_in_f2 > 0 {
        f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2);
    }
    if m2_in_f2 > 0 {
        f2_ptr
            .add(m1_in_f2)
            .copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2);
    }
}
/// Same scatter as [copy_without_checks] but dispatched through a single
/// `match` on a 4-bit mask of which chunk lengths are non-zero, avoiding the
/// four separate data-dependent branches.
///
/// SAFETY: pointers must be valid for their lengths, ranges must not overlap,
/// and the caller must guarantee that `m1_in_f2` and `m2_in_f1` are never both
/// non-zero (cases 6, 7, 14, 15 are `unreachable_unchecked`).
#[allow(dead_code)]
#[inline(always)]
#[allow(clippy::too_many_arguments)]
unsafe fn copy_with_nobranch_check(
    m1_ptr: *const u8,
    m2_ptr: *const u8,
    f1_ptr: *mut u8,
    f2_ptr: *mut u8,
    m1_in_f1: usize,
    m2_in_f1: usize,
    m1_in_f2: usize,
    m2_in_f2: usize,
) {
    // Bit 0: m1→f1, bit 1: m2→f1, bit 2: m1→f2, bit 3: m2→f2.
    let case = (m1_in_f1 > 0) as usize
        | (((m2_in_f1 > 0) as usize) << 1)
        | (((m1_in_f2 > 0) as usize) << 2)
        | (((m2_in_f2 > 0) as usize) << 3);
    match case {
        0 => {}
        // one bit set
        1 => {
            f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
        }
        2 => {
            f1_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f1);
        }
        4 => {
            f2_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f2);
        }
        8 => {
            f2_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f2);
        }
        // two bit set
        3 => {
            f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
            f1_ptr
                .add(m1_in_f1)
                .copy_from_nonoverlapping(m2_ptr, m2_in_f1);
        }
        5 => {
            f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
            f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2);
        }
        // 6/7 would require m2→f1 and m1→f2 at once — excluded by contract
        6 => core::hint::unreachable_unchecked(),
        7 => core::hint::unreachable_unchecked(),
        9 => {
            f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
            f2_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f2);
        }
        10 => {
            f1_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f1);
            f2_ptr.copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2);
        }
        12 => {
            f2_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f2);
            f2_ptr
                .add(m1_in_f2)
                .copy_from_nonoverlapping(m2_ptr, m2_in_f2);
        }
        // three bit set
        11 => {
            f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
            f1_ptr
                .add(m1_in_f1)
                .copy_from_nonoverlapping(m2_ptr, m2_in_f1);
            f2_ptr.copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2);
        }
        13 => {
            f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
            f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2);
            f2_ptr
                .add(m1_in_f2)
                .copy_from_nonoverlapping(m2_ptr, m2_in_f2);
        }
        14 => core::hint::unreachable_unchecked(),
        15 => core::hint::unreachable_unchecked(),
        // `case` is a 4-bit value; no other pattern is possible
        _ => core::hint::unreachable_unchecked(),
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for [RingBuffer]: basic operation, exact-fill boundaries,
    //! and wrap-around behavior of `extend_from_within`.
    use super::RingBuffer;
    #[test]
    fn smoke() {
        let mut rb = RingBuffer::new();
        // Capacity is next_power_of_two(15) + 1 sentinel slot = 17.
        rb.reserve(15);
        assert_eq!(17, rb.cap);
        rb.extend(b"0123456789");
        assert_eq!(rb.len(), 10);
        assert_eq!(rb.as_slices().0, b"0123456789");
        assert_eq!(rb.as_slices().1, b"");
        rb.drop_first_n(5);
        assert_eq!(rb.len(), 5);
        assert_eq!(rb.as_slices().0, b"56789");
        assert_eq!(rb.as_slices().1, b"");
        rb.extend_from_within(2, 3);
        assert_eq!(rb.len(), 8);
        assert_eq!(rb.as_slices().0, b"56789789");
        assert_eq!(rb.as_slices().1, b"");
        rb.extend_from_within(0, 3);
        assert_eq!(rb.len(), 11);
        assert_eq!(rb.as_slices().0, b"56789789567");
        assert_eq!(rb.as_slices().1, b"");
        // This copy wraps around the end of the allocation.
        rb.extend_from_within(0, 2);
        assert_eq!(rb.len(), 13);
        assert_eq!(rb.as_slices().0, b"567897895675");
        assert_eq!(rb.as_slices().1, b"6");
        rb.drop_first_n(11);
        assert_eq!(rb.len(), 2);
        assert_eq!(rb.as_slices().0, b"5");
        assert_eq!(rb.as_slices().1, b"6");
        rb.extend(b"0123456789");
        assert_eq!(rb.len(), 12);
        assert_eq!(rb.as_slices().0, b"5");
        assert_eq!(rb.as_slices().1, b"60123456789");
        rb.drop_first_n(11);
        assert_eq!(rb.len(), 1);
        assert_eq!(rb.as_slices().0, b"9");
        assert_eq!(rb.as_slices().1, b"");
        rb.extend(b"0123456789");
        assert_eq!(rb.len(), 11);
        assert_eq!(rb.as_slices().0, b"9012345");
        assert_eq!(rb.as_slices().1, b"6789");
    }
    #[test]
    fn edge_cases() {
        // Fill exactly, then empty then fill again
        let mut rb = RingBuffer::new();
        rb.reserve(16);
        assert_eq!(17, rb.cap);
        rb.extend(b"0123456789012345");
        assert_eq!(17, rb.cap);
        assert_eq!(16, rb.len());
        assert_eq!(0, rb.free());
        rb.drop_first_n(16);
        assert_eq!(0, rb.len());
        assert_eq!(16, rb.free());
        rb.extend(b"0123456789012345");
        assert_eq!(16, rb.len());
        assert_eq!(0, rb.free());
        assert_eq!(17, rb.cap);
        assert_eq!(1, rb.as_slices().0.len());
        assert_eq!(15, rb.as_slices().1.len());
        rb.clear();
        // data in both slices and then reserve: growth must linearize the data
        rb.extend(b"0123456789012345");
        rb.drop_first_n(8);
        rb.extend(b"67890123");
        assert_eq!(16, rb.len());
        assert_eq!(0, rb.free());
        assert_eq!(17, rb.cap);
        assert_eq!(9, rb.as_slices().0.len());
        assert_eq!(7, rb.as_slices().1.len());
        rb.reserve(1);
        assert_eq!(16, rb.len());
        assert_eq!(16, rb.free());
        assert_eq!(33, rb.cap);
        assert_eq!(16, rb.as_slices().0.len());
        assert_eq!(0, rb.as_slices().1.len());
        rb.clear();
        // fill exactly, then extend from within
        rb.extend(b"0123456789012345");
        rb.extend_from_within(0, 16);
        assert_eq!(32, rb.len());
        assert_eq!(0, rb.free());
        assert_eq!(33, rb.cap);
        assert_eq!(32, rb.as_slices().0.len());
        assert_eq!(0, rb.as_slices().1.len());
        // extend from within cases: exercise wrapped source/destination paths
        let mut rb = RingBuffer::new();
        rb.reserve(8);
        rb.extend(b"01234567");
        rb.drop_first_n(5);
        rb.extend_from_within(0, 3);
        assert_eq!(4, rb.as_slices().0.len());
        assert_eq!(2, rb.as_slices().1.len());
        rb.drop_first_n(2);
        assert_eq!(2, rb.as_slices().0.len());
        assert_eq!(2, rb.as_slices().1.len());
        rb.extend_from_within(0, 4);
        assert_eq!(2, rb.as_slices().0.len());
        assert_eq!(6, rb.as_slices().1.len());
        rb.drop_first_n(2);
        assert_eq!(6, rb.as_slices().0.len());
        assert_eq!(0, rb.as_slices().1.len());
        rb.drop_first_n(2);
        assert_eq!(4, rb.as_slices().0.len());
        assert_eq!(0, rb.as_slices().1.len());
        rb.extend_from_within(0, 4);
        assert_eq!(7, rb.as_slices().0.len());
        assert_eq!(1, rb.as_slices().1.len());
        let mut rb = RingBuffer::new();
        rb.reserve(8);
        rb.extend(b"11111111");
        rb.drop_first_n(7);
        rb.extend(b"111");
        assert_eq!(2, rb.as_slices().0.len());
        assert_eq!(2, rb.as_slices().1.len());
        rb.extend_from_within(0, 4);
        assert_eq!(b"11", rb.as_slices().0);
        assert_eq!(b"111111", rb.as_slices().1);
    }
}

134
vendor/ruzstd/src/decoding/scratch.rs vendored Normal file
View File

@@ -0,0 +1,134 @@
//! Structures that wrap around various decoders to make decoding easier.
use super::super::blocks::sequence_section::Sequence;
use super::decode_buffer::DecodeBuffer;
use crate::decoding::dictionary::Dictionary;
use crate::fse::FSETable;
use crate::huff0::HuffmanTable;
use alloc::vec::Vec;
use crate::blocks::sequence_section::{
MAX_LITERAL_LENGTH_CODE, MAX_MATCH_LENGTH_CODE, MAX_OFFSET_CODE,
};
/// A block level decoding buffer.
pub struct DecoderScratch {
    /// The decoder used for Huffman blocks.
    pub huf: HuffmanScratch,
    /// The decoder used for FSE blocks.
    pub fse: FSEScratch,
    /// Output buffer for decoded data, sized by the frame's window size.
    pub buffer: DecodeBuffer,
    /// The three most recently used offsets, initialized to [1, 4, 8].
    pub offset_hist: [u32; 3],
    /// Literals decoded from the current block's literals section.
    pub literals_buffer: Vec<u8>,
    /// Sequences decoded from the current block's sequences section.
    pub sequences: Vec<Sequence>,
    /// Raw content of the block currently being decoded.
    pub block_content_buffer: Vec<u8>,
}
impl DecoderScratch {
    /// Create a fresh scratch area for a frame with the given window size.
    pub fn new(window_size: usize) -> DecoderScratch {
        DecoderScratch {
            huf: HuffmanScratch {
                table: HuffmanTable::new(),
            },
            fse: FSEScratch {
                offsets: FSETable::new(MAX_OFFSET_CODE),
                of_rle: None,
                literal_lengths: FSETable::new(MAX_LITERAL_LENGTH_CODE),
                ll_rle: None,
                match_lengths: FSETable::new(MAX_MATCH_LENGTH_CODE),
                ml_rle: None,
            },
            buffer: DecodeBuffer::new(window_size),
            // Initial repeat offsets as used at the start of a frame.
            offset_hist: [1, 4, 8],
            block_content_buffer: Vec::new(),
            literals_buffer: Vec::new(),
            sequences: Vec::new(),
        }
    }
    /// Reset all state so the scratch can decode a new frame with the given
    /// window size. Allocations are kept and reused.
    pub fn reset(&mut self, window_size: usize) {
        self.offset_hist = [1, 4, 8];
        self.literals_buffer.clear();
        self.sequences.clear();
        self.block_content_buffer.clear();
        self.buffer.reset(window_size);
        self.fse.literal_lengths.reset();
        self.fse.match_lengths.reset();
        self.fse.offsets.reset();
        self.fse.ll_rle = None;
        self.fse.ml_rle = None;
        self.fse.of_rle = None;
        self.huf.table.reset();
    }
    /// Load the tables, offset history and content of a dictionary into this
    /// scratch, as a starting state for decoding dictionary-compressed frames.
    pub fn init_from_dict(&mut self, dict: &Dictionary) {
        self.fse.reinit_from(&dict.fse);
        self.huf.table.reinit_from(&dict.huf.table);
        self.offset_hist = dict.offset_hist;
        self.buffer.dict_content.clear();
        self.buffer
            .dict_content
            .extend_from_slice(&dict.dict_content);
    }
}
/// Huffman decoding state that can be carried over between blocks
/// (needed for `Treeless` literals sections).
pub struct HuffmanScratch {
    /// The Huffman table built from the most recent table description.
    pub table: HuffmanTable,
}
impl HuffmanScratch {
    /// Create a scratch with an empty (uninitialized) Huffman table.
    pub fn new() -> HuffmanScratch {
        HuffmanScratch {
            table: HuffmanTable::new(),
        }
    }
}
impl Default for HuffmanScratch {
    fn default() -> Self {
        Self::new()
    }
}
/// FSE decoding state for the three sequence fields (offsets, literal lengths,
/// match lengths). Each field is either a full FSE table or an RLE byte.
pub struct FSEScratch {
    /// FSE table for offset codes.
    pub offsets: FSETable,
    /// Set when the offsets are RLE-encoded with this single value.
    pub of_rle: Option<u8>,
    /// FSE table for literal length codes.
    pub literal_lengths: FSETable,
    /// Set when the literal lengths are RLE-encoded with this single value.
    pub ll_rle: Option<u8>,
    /// FSE table for match length codes.
    pub match_lengths: FSETable,
    /// Set when the match lengths are RLE-encoded with this single value.
    pub ml_rle: Option<u8>,
}
impl FSEScratch {
    /// Create a scratch with empty tables and no RLE modes set.
    pub fn new() -> FSEScratch {
        FSEScratch {
            offsets: FSETable::new(MAX_OFFSET_CODE),
            of_rle: None,
            literal_lengths: FSETable::new(MAX_LITERAL_LENGTH_CODE),
            ll_rle: None,
            match_lengths: FSETable::new(MAX_MATCH_LENGTH_CODE),
            ml_rle: None,
        }
    }
    /// Copy the complete FSE state from `other` (used to load dictionary tables).
    pub fn reinit_from(&mut self, other: &Self) {
        self.offsets.reinit_from(&other.offsets);
        self.literal_lengths.reinit_from(&other.literal_lengths);
        self.match_lengths.reinit_from(&other.match_lengths);
        self.of_rle = other.of_rle;
        self.ll_rle = other.ll_rle;
        self.ml_rle = other.ml_rle;
    }
}
impl Default for FSEScratch {
    fn default() -> Self {
        Self::new()
    }
}

View File

@@ -0,0 +1,115 @@
use super::scratch::DecoderScratch;
use crate::decoding::errors::ExecuteSequencesError;
/// Take the provided decoder and execute the sequences stored within
///
/// Each sequence copies `ll` literals from the literals buffer into the decode
/// buffer and then repeats `ml` bytes from `actual_offset` back. Literals left
/// over after the last sequence are appended at the end.
pub fn execute_sequences(scratch: &mut DecoderScratch) -> Result<(), ExecuteSequencesError> {
    let mut literals_copy_counter = 0;
    let old_buffer_size = scratch.buffer.len();
    // Running total of bytes this function appends, checked at the end.
    let mut seq_sum = 0;
    for idx in 0..scratch.sequences.len() {
        let seq = scratch.sequences[idx];
        if seq.ll > 0 {
            let high = literals_copy_counter + seq.ll as usize;
            if high > scratch.literals_buffer.len() {
                return Err(ExecuteSequencesError::NotEnoughBytesForSequence {
                    wanted: high,
                    have: scratch.literals_buffer.len(),
                });
            }
            let literals = &scratch.literals_buffer[literals_copy_counter..high];
            literals_copy_counter += seq.ll as usize;
            scratch.buffer.push(literals);
        }
        // Translate the raw offset value through the repeat-offset history.
        let actual_offset = do_offset_history(seq.of, seq.ll, &mut scratch.offset_hist);
        if actual_offset == 0 {
            return Err(ExecuteSequencesError::ZeroOffset);
        }
        if seq.ml > 0 {
            scratch
                .buffer
                .repeat(actual_offset as usize, seq.ml as usize)?;
        }
        seq_sum += seq.ml;
        seq_sum += seq.ll;
    }
    // Any literals not consumed by the sequences are appended verbatim.
    if literals_copy_counter < scratch.literals_buffer.len() {
        let rest_literals = &scratch.literals_buffer[literals_copy_counter..];
        scratch.buffer.push(rest_literals);
        seq_sum += rest_literals.len() as u32;
    }
    // Sanity check: the buffer must have grown by exactly the declared amount.
    let diff = scratch.buffer.len() - old_buffer_size;
    assert!(
        seq_sum as usize == diff,
        "Seq_sum: {} is different from the difference in buffersize: {}",
        seq_sum,
        diff
    );
    Ok(())
}
/// Update the most recently used offsets to reflect the provided offset value, and return the
/// "actual" offset needed because offsets are not stored in a raw way, some transformations are needed
/// before you get a functional number.
///
/// `scratch` holds the three most recently used offsets. Offset values 1-3
/// select from that history (with shifted meaning when `lit_len == 0`), and
/// anything above 3 encodes a brand-new offset as `offset_value - 3`.
fn do_offset_history(offset_value: u32, lit_len: u32, scratch: &mut [u32; 3]) -> u32 {
    // When the literal length is 0 the repeat-offset codes shift by one.
    let shifted = lit_len == 0;
    // Resolve the encoded value into the real offset.
    let actual_offset = match (shifted, offset_value) {
        (false, 1..=3) => scratch[offset_value as usize - 1],
        (true, 1..=2) => scratch[offset_value as usize],
        (true, 3) => scratch[0] - 1,
        // Values above the repeat codes encode a fresh offset.
        _ => offset_value - 3,
    };
    // How many history slots participate in moving the used offset to the front.
    let slots_to_shift = match (shifted, offset_value) {
        // The most recent offset was reused: history stays as-is.
        (false, 1) => 0,
        // The second slot gets promoted; the third slot is untouched.
        (false, 2) | (true, 1) => 2,
        // Full rotation of all three slots.
        _ => 3,
    };
    if slots_to_shift == 3 {
        scratch[2] = scratch[1];
    }
    if slots_to_shift >= 2 {
        scratch[1] = scratch[0];
        scratch[0] = actual_offset;
    }
    actual_offset
}

View File

@@ -0,0 +1,487 @@
use super::super::blocks::sequence_section::ModeType;
use super::super::blocks::sequence_section::Sequence;
use super::super::blocks::sequence_section::SequencesHeader;
use super::scratch::FSEScratch;
use crate::bit_io::BitReaderReversed;
use crate::blocks::sequence_section::{
MAX_LITERAL_LENGTH_CODE, MAX_MATCH_LENGTH_CODE, MAX_OFFSET_CODE,
};
use crate::decoding::errors::DecodeSequenceError;
use crate::fse::FSEDecoder;
use alloc::vec::Vec;
/// Decode the provided source as a series of sequences into the supplied `target`.
///
/// First (re)builds the FSE tables as dictated by the section header, then
/// reads the sequences from the remaining bytes as a reversed bit stream.
pub fn decode_sequences(
    section: &SequencesHeader,
    source: &[u8],
    scratch: &mut FSEScratch,
    target: &mut Vec<Sequence>,
) -> Result<(), DecodeSequenceError> {
    let bytes_read = maybe_update_fse_tables(section, source, scratch)?;
    vprintln!("Updating tables used {} bytes", bytes_read);
    let bit_stream = &source[bytes_read..];
    let mut br = BitReaderReversed::new(bit_stream);
    //skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found
    let mut skipped_bits = 0;
    loop {
        let val = br.get_bits(1);
        skipped_bits += 1;
        if val == 1 || skipped_bits > 8 {
            break;
        }
    }
    if skipped_bits > 8 {
        //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
        return Err(DecodeSequenceError::ExtraPadding { skipped_bits });
    }
    // RLE-mode tables need a dedicated decode path; use the simpler one when
    // no table is in RLE mode.
    if scratch.ll_rle.is_some() || scratch.ml_rle.is_some() || scratch.of_rle.is_some() {
        decode_sequences_with_rle(section, &mut br, scratch, target)
    } else {
        decode_sequences_without_rle(section, &mut br, scratch, target)
    }
}
/// Decode `num_sequences` sequences when at least one of the three symbol
/// types is in RLE mode (its code is a single repeated byte instead of an FSE
/// stream). FSE decoders are only initialized/updated for non-RLE tables.
fn decode_sequences_with_rle(
    section: &SequencesHeader,
    br: &mut BitReaderReversed<'_>,
    scratch: &FSEScratch,
    target: &mut Vec<Sequence>,
) -> Result<(), DecodeSequenceError> {
    let mut ll_dec = FSEDecoder::new(&scratch.literal_lengths);
    let mut ml_dec = FSEDecoder::new(&scratch.match_lengths);
    let mut of_dec = FSEDecoder::new(&scratch.offsets);
    // Only tables that are not in RLE mode read their initial state from the stream.
    if scratch.ll_rle.is_none() {
        ll_dec.init_state(br)?;
    }
    if scratch.of_rle.is_none() {
        of_dec.init_state(br)?;
    }
    if scratch.ml_rle.is_none() {
        ml_dec.init_state(br)?;
    }
    target.clear();
    target.reserve(section.num_sequences as usize);
    for _seq_idx in 0..section.num_sequences {
        //get the codes from either the RLE byte or from the decoder
        let ll_code = if scratch.ll_rle.is_some() {
            scratch.ll_rle.unwrap()
        } else {
            ll_dec.decode_symbol()
        };
        let ml_code = if scratch.ml_rle.is_some() {
            scratch.ml_rle.unwrap()
        } else {
            ml_dec.decode_symbol()
        };
        let of_code = if scratch.of_rle.is_some() {
            scratch.of_rle.unwrap()
        } else {
            of_dec.decode_symbol()
        };
        // Translate codes into baseline values plus extra-bit counts.
        let (ll_value, ll_num_bits) = lookup_ll_code(ll_code);
        let (ml_value, ml_num_bits) = lookup_ml_code(ml_code);
        //println!("Sequence: {}", i);
        //println!("of stat: {}", of_dec.state);
        //println!("of Code: {}", of_code);
        //println!("ll stat: {}", ll_dec.state);
        //println!("ll bits: {}", ll_num_bits);
        //println!("ll Code: {}", ll_value);
        //println!("ml stat: {}", ml_dec.state);
        //println!("ml bits: {}", ml_num_bits);
        //println!("ml Code: {}", ml_value);
        //println!("");
        if of_code > MAX_OFFSET_CODE {
            return Err(DecodeSequenceError::UnsupportedOffset {
                offset_code: of_code,
            });
        }
        // Extra bits are read for all three values in one call; the offset
        // code itself is the number of offset extra bits.
        let (obits, ml_add, ll_add) = br.get_bits_triple(of_code, ml_num_bits, ll_num_bits);
        let offset = obits as u32 + (1u32 << of_code);
        if offset == 0 {
            return Err(DecodeSequenceError::ZeroOffset);
        }
        target.push(Sequence {
            ll: ll_value + ll_add as u32,
            ml: ml_value + ml_add as u32,
            of: offset,
        });
        // The state update is skipped after the last sequence.
        if target.len() < section.num_sequences as usize {
            //println!(
            //    "Bits left: {} ({} bytes)",
            //    br.bits_remaining(),
            //    br.bits_remaining() / 8,
            //);
            if scratch.ll_rle.is_none() {
                ll_dec.update_state(br);
            }
            if scratch.ml_rle.is_none() {
                ml_dec.update_state(br);
            }
            if scratch.of_rle.is_none() {
                of_dec.update_state(br);
            }
        }
        // A negative remainder means we read past the end of the bit stream.
        if br.bits_remaining() < 0 {
            return Err(DecodeSequenceError::NotEnoughBytesForNumSequences);
        }
    }
    // All bits must be consumed exactly; leftovers indicate corruption.
    if br.bits_remaining() > 0 {
        Err(DecodeSequenceError::ExtraBits {
            bits_remaining: br.bits_remaining(),
        })
    } else {
        Ok(())
    }
}
/// Decode `num_sequences` sequences when all three symbol types use FSE
/// tables (no RLE mode). This is the fast path without per-symbol RLE checks.
fn decode_sequences_without_rle(
    section: &SequencesHeader,
    br: &mut BitReaderReversed<'_>,
    scratch: &FSEScratch,
    target: &mut Vec<Sequence>,
) -> Result<(), DecodeSequenceError> {
    let mut ll_dec = FSEDecoder::new(&scratch.literal_lengths);
    let mut ml_dec = FSEDecoder::new(&scratch.match_lengths);
    let mut of_dec = FSEDecoder::new(&scratch.offsets);
    // Initial states are read in ll, of, ml order as mandated by the format.
    ll_dec.init_state(br)?;
    of_dec.init_state(br)?;
    ml_dec.init_state(br)?;
    target.clear();
    target.reserve(section.num_sequences as usize);
    for _seq_idx in 0..section.num_sequences {
        let ll_code = ll_dec.decode_symbol();
        let ml_code = ml_dec.decode_symbol();
        let of_code = of_dec.decode_symbol();
        // Translate codes into baseline values plus extra-bit counts.
        let (ll_value, ll_num_bits) = lookup_ll_code(ll_code);
        let (ml_value, ml_num_bits) = lookup_ml_code(ml_code);
        if of_code > MAX_OFFSET_CODE {
            return Err(DecodeSequenceError::UnsupportedOffset {
                offset_code: of_code,
            });
        }
        // Extra bits for all three values in one call; the offset code itself
        // is the number of offset extra bits.
        let (obits, ml_add, ll_add) = br.get_bits_triple(of_code, ml_num_bits, ll_num_bits);
        let offset = obits as u32 + (1u32 << of_code);
        if offset == 0 {
            return Err(DecodeSequenceError::ZeroOffset);
        }
        target.push(Sequence {
            ll: ll_value + ll_add as u32,
            ml: ml_value + ml_add as u32,
            of: offset,
        });
        // The state update is skipped after the last sequence.
        if target.len() < section.num_sequences as usize {
            //println!(
            //    "Bits left: {} ({} bytes)",
            //    br.bits_remaining(),
            //    br.bits_remaining() / 8,
            //);
            ll_dec.update_state(br);
            ml_dec.update_state(br);
            of_dec.update_state(br);
        }
        // A negative remainder means we read past the end of the bit stream.
        if br.bits_remaining() < 0 {
            return Err(DecodeSequenceError::NotEnoughBytesForNumSequences);
        }
    }
    // All bits must be consumed exactly; leftovers indicate corruption.
    if br.bits_remaining() > 0 {
        Err(DecodeSequenceError::ExtraBits {
            bits_remaining: br.bits_remaining(),
        })
    } else {
        Ok(())
    }
}
/// Look up the provided state value from a literal length table predefined
/// by the Zstandard reference document. Returns a tuple of (value, number of bits).
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#appendix-a---decoding-tables-for-predefined-codes>
fn lookup_ll_code(code: u8) -> (u32, u8) {
    // Baseline value and number of extra bits for codes 16..=35, straight
    // from the literals length code table of the reference document.
    const EXTENDED: [(u32, u8); 20] = [
        (16, 1),
        (18, 1),
        (20, 1),
        (22, 1),
        (24, 2),
        (28, 2),
        (32, 3),
        (40, 3),
        (48, 4),
        (64, 6),
        (128, 7),
        (256, 8),
        (512, 9),
        (1024, 10),
        (2048, 11),
        (4096, 12),
        (8192, 13),
        (16384, 14),
        (32768, 15),
        (65536, 16),
    ];
    match code {
        // Codes up to 15 encode their own value with no extra bits.
        0..=15 => (u32::from(code), 0),
        16..=35 => EXTENDED[usize::from(code) - 16],
        _ => unreachable!("Illegal literal length code was: {}", code),
    }
}
/// Look up the provided state value from a match length table predefined
/// by the Zstandard reference document. Returns a tuple of (value, number of bits).
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#appendix-a---decoding-tables-for-predefined-codes>
fn lookup_ml_code(code: u8) -> (u32, u8) {
    // Baseline value and number of extra bits for codes 32..=52, straight
    // from the match length code table of the reference document.
    const EXTENDED: [(u32, u8); 21] = [
        (35, 1),
        (37, 1),
        (39, 1),
        (41, 1),
        (43, 2),
        (47, 2),
        (51, 3),
        (59, 3),
        (67, 4),
        (83, 4),
        (99, 5),
        (131, 7),
        (259, 8),
        (515, 9),
        (1027, 10),
        (2051, 11),
        (4099, 12),
        (8195, 13),
        (16387, 14),
        (32771, 15),
        (65539, 16),
    ];
    match code {
        // Codes up to 31 map directly to a match length of code + 3.
        0..=31 => (u32::from(code) + 3, 0),
        32..=52 => EXTENDED[usize::from(code) - 32],
        _ => unreachable!("Illegal match length code was: {}", code),
    }
}
// This info is buried in the symbol compression mode table
/// "The maximum allowed accuracy log for literals length and match length tables is 9"
pub const LL_MAX_LOG: u8 = 9;
/// "The maximum allowed accuracy log for literals length and match length tables is 9"
pub const ML_MAX_LOG: u8 = 9;
/// "The maximum accuracy log for the offset table is 8."
pub const OF_MAX_LOG: u8 = 8;
/// Rebuild the FSE scratch tables according to the mode flags in the section
/// header: FSE-compressed tables are parsed from `source`, RLE tables record
/// their single byte, predefined tables come from the spec's distributions,
/// and `Repeat` keeps whatever was in the scratch. Returns the number of
/// bytes consumed from `source` for the table descriptions.
fn maybe_update_fse_tables(
    section: &SequencesHeader,
    source: &[u8],
    scratch: &mut FSEScratch,
) -> Result<usize, DecodeSequenceError> {
    let modes = section
        .modes
        .ok_or(DecodeSequenceError::MissingCompressionMode)?;
    let mut bytes_read = 0;
    match modes.ll_mode() {
        ModeType::FSECompressed => {
            let bytes = scratch.literal_lengths.build_decoder(source, LL_MAX_LOG)?;
            bytes_read += bytes;
            vprintln!("Updating ll table");
            vprintln!("Used bytes: {}", bytes);
            scratch.ll_rle = None;
        }
        ModeType::RLE => {
            vprintln!("Use RLE ll table");
            if source.is_empty() {
                return Err(DecodeSequenceError::MissingByteForRleLlTable);
            }
            bytes_read += 1;
            if source[0] > MAX_LITERAL_LENGTH_CODE {
                // NOTE(review): this is a range error on the ll RLE byte, but
                // the Ml error variant is returned — looks copy-pasted;
                // confirm intended variant in DecodeSequenceError.
                return Err(DecodeSequenceError::MissingByteForRleMlTable);
            }
            scratch.ll_rle = Some(source[0]);
        }
        ModeType::Predefined => {
            vprintln!("Use predefined ll table");
            scratch.literal_lengths.build_from_probabilities(
                LL_DEFAULT_ACC_LOG,
                &Vec::from(&LITERALS_LENGTH_DEFAULT_DISTRIBUTION[..]),
            )?;
            scratch.ll_rle = None;
        }
        ModeType::Repeat => {
            vprintln!("Repeat ll table");
            /* Nothing to do */
        }
    };
    let of_source = &source[bytes_read..];
    match modes.of_mode() {
        ModeType::FSECompressed => {
            let bytes = scratch.offsets.build_decoder(of_source, OF_MAX_LOG)?;
            vprintln!("Updating of table");
            vprintln!("Used bytes: {}", bytes);
            bytes_read += bytes;
            scratch.of_rle = None;
        }
        ModeType::RLE => {
            vprintln!("Use RLE of table");
            if of_source.is_empty() {
                return Err(DecodeSequenceError::MissingByteForRleOfTable);
            }
            bytes_read += 1;
            if of_source[0] > MAX_OFFSET_CODE {
                // NOTE(review): range error on the of RLE byte returns the Ml
                // variant as well — confirm intended variant.
                return Err(DecodeSequenceError::MissingByteForRleMlTable);
            }
            scratch.of_rle = Some(of_source[0]);
        }
        ModeType::Predefined => {
            vprintln!("Use predefined of table");
            scratch.offsets.build_from_probabilities(
                OF_DEFAULT_ACC_LOG,
                &Vec::from(&OFFSET_DEFAULT_DISTRIBUTION[..]),
            )?;
            scratch.of_rle = None;
        }
        ModeType::Repeat => {
            vprintln!("Repeat of table");
            /* Nothing to do */
        }
    };
    let ml_source = &source[bytes_read..];
    match modes.ml_mode() {
        ModeType::FSECompressed => {
            let bytes = scratch.match_lengths.build_decoder(ml_source, ML_MAX_LOG)?;
            bytes_read += bytes;
            vprintln!("Updating ml table");
            vprintln!("Used bytes: {}", bytes);
            scratch.ml_rle = None;
        }
        ModeType::RLE => {
            vprintln!("Use RLE ml table");
            if ml_source.is_empty() {
                return Err(DecodeSequenceError::MissingByteForRleMlTable);
            }
            bytes_read += 1;
            if ml_source[0] > MAX_MATCH_LENGTH_CODE {
                return Err(DecodeSequenceError::MissingByteForRleMlTable);
            }
            scratch.ml_rle = Some(ml_source[0]);
        }
        ModeType::Predefined => {
            vprintln!("Use predefined ml table");
            scratch.match_lengths.build_from_probabilities(
                ML_DEFAULT_ACC_LOG,
                &Vec::from(&MATCH_LENGTH_DEFAULT_DISTRIBUTION[..]),
            )?;
            scratch.ml_rle = None;
        }
        ModeType::Repeat => {
            vprintln!("Repeat ml table");
            /* Nothing to do */
        }
    };
    Ok(bytes_read)
}
// The default Literal Length decoding table uses an accuracy logarithm of 6 bits.
const LL_DEFAULT_ACC_LOG: u8 = 6;
/// If [ModeType::Predefined] is selected for a symbol type, its FSE decoding
/// table is generated using a predefined distribution table.
///
/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals-length
const LITERALS_LENGTH_DEFAULT_DISTRIBUTION: [i32; 36] = [
    4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
    -1, -1, -1, -1,
];
// The default Match Length decoding table uses an accuracy logarithm of 6 bits.
const ML_DEFAULT_ACC_LOG: u8 = 6;
/// If [ModeType::Predefined] is selected for a symbol type, its FSE decoding
/// table is generated using a predefined distribution table.
///
/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#match-length
const MATCH_LENGTH_DEFAULT_DISTRIBUTION: [i32; 53] = [
    1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1,
];
// The default Offset decoding table uses an accuracy logarithm of 5 bits.
const OF_DEFAULT_ACC_LOG: u8 = 5;
/// If [ModeType::Predefined] is selected for a symbol type, its FSE decoding
/// table is generated using a predefined distribution table.
///
/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#offset-codes
const OFFSET_DEFAULT_DISTRIBUTION: [i32; 29] = [
    1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1,
];
/// Builds the predefined literals length table and spot-checks a handful of
/// decode entries against the values from the reference document.
#[test]
fn test_ll_default() {
    let mut table = crate::fse::FSETable::new(MAX_LITERAL_LENGTH_CODE);
    table
        .build_from_probabilities(
            LL_DEFAULT_ACC_LOG,
            &Vec::from(&LITERALS_LENGTH_DEFAULT_DISTRIBUTION[..]),
        )
        .unwrap();
    // Dump the whole table for manual inspection when run with --nocapture.
    #[cfg(feature = "std")]
    for idx in 0..table.decode.len() {
        std::println!(
            "{:3}: {:3} {:3} {:3}",
            idx,
            table.decode[idx].symbol,
            table.decode[idx].num_bits,
            table.decode[idx].base_line
        );
    }
    // An accuracy log of 6 yields a table with 2^6 entries.
    assert!(table.decode.len() == 64);
    //just test a few values. TODO test all values
    assert!(table.decode[0].symbol == 0);
    assert!(table.decode[0].num_bits == 4);
    assert!(table.decode[0].base_line == 0);
    assert!(table.decode[19].symbol == 27);
    assert!(table.decode[19].num_bits == 6);
    assert!(table.decode[19].base_line == 0);
    assert!(table.decode[39].symbol == 25);
    assert!(table.decode[39].num_bits == 4);
    assert!(table.decode[39].base_line == 16);
    assert!(table.decode[60].symbol == 35);
    assert!(table.decode[60].num_bits == 6);
    assert!(table.decode[60].base_line == 0);
    assert!(table.decode[59].symbol == 24);
    assert!(table.decode[59].num_bits == 5);
    assert!(table.decode[59].base_line == 32);
}

View File

@@ -0,0 +1,143 @@
//! The [StreamingDecoder] wraps a [FrameDecoder] and provides a Read impl that decodes data when necessary
use core::borrow::BorrowMut;
use crate::decoding::errors::FrameDecoderError;
use crate::decoding::{BlockDecodingStrategy, FrameDecoder};
#[cfg(not(feature = "std"))]
use crate::io::ErrorKind;
use crate::io::{Error, Read};
/// High level Zstandard frame decoder that can be used to decompress a given Zstandard frame.
///
/// This decoder implements `io::Read`, so you can interact with it by calling
/// `io::Read::read_to_end` / `io::Read::read_exact` or passing this to another library / module as a source for the decoded content
///
/// If you need more control over how decompression takes place, you can use
/// the lower level [FrameDecoder], which allows for greater control over how
/// decompression takes place but the implementor must call
/// [FrameDecoder::decode_blocks] repeatedly to decode the entire frame.
///
/// ## Caveat
/// [StreamingDecoder] expects the underlying stream to only contain a single frame,
/// yet the specification states that a single archive may contain multiple frames.
///
/// To decode all the frames in a finite stream, the calling code needs to recreate
/// the instance of the decoder and handle
/// [crate::decoding::errors::ReadFrameHeaderError::SkipFrame]
/// errors by skipping forward the `length` amount of bytes, see <https://github.com/KillingSpark/zstd-rs/issues/57>
///
/// ```no_run
/// // `read_to_end` is not implemented by the no_std implementation.
/// #[cfg(feature = "std")]
/// {
///     use std::fs::File;
///     use std::io::Read;
///     use ruzstd::decoding::StreamingDecoder;
///
///     // Read a Zstandard archive from the filesystem then decompress it into a vec.
///     let mut f: File = todo!("Read a .zstd archive from somewhere");
///     let mut decoder = StreamingDecoder::new(f).unwrap();
///     let mut result = Vec::new();
///     Read::read_to_end(&mut decoder, &mut result).unwrap();
/// }
/// ```
pub struct StreamingDecoder<READ: Read, DEC: BorrowMut<FrameDecoder>> {
    /// The lower level decoder that does the actual work.
    pub decoder: DEC,
    /// The compressed stream the decoder pulls data from.
    source: READ,
}
impl<READ: Read, DEC: BorrowMut<FrameDecoder>> StreamingDecoder<READ, DEC> {
    /// Create a [StreamingDecoder] that reuses the provided [FrameDecoder]
    /// (or anything that can borrow one mutably).
    ///
    /// Initializes the wrapped decoder with data read from `source`; fails if
    /// that initialization fails.
    pub fn new_with_decoder(
        mut source: READ,
        mut decoder: DEC,
    ) -> Result<StreamingDecoder<READ, DEC>, FrameDecoderError> {
        decoder.borrow_mut().init(&mut source)?;
        Ok(StreamingDecoder { decoder, source })
    }
}
impl<READ: Read> StreamingDecoder<READ, FrameDecoder> {
    /// Create a [StreamingDecoder] with a fresh [FrameDecoder].
    ///
    /// Initializes the decoder with data read from `source`; fails if that
    /// initialization fails.
    pub fn new(
        mut source: READ,
    ) -> Result<StreamingDecoder<READ, FrameDecoder>, FrameDecoderError> {
        let mut decoder = FrameDecoder::new();
        decoder.init(&mut source)?;
        Ok(StreamingDecoder { decoder, source })
    }
}
impl<READ: Read, DEC: BorrowMut<FrameDecoder>> StreamingDecoder<READ, DEC> {
    /// Gets a reference to the underlying reader.
    pub fn get_ref(&self) -> &READ {
        &self.source
    }
    /// Gets a mutable reference to the underlying reader.
    ///
    /// It is inadvisable to directly read from the underlying reader.
    pub fn get_mut(&mut self) -> &mut READ {
        &mut self.source
    }
    /// Destructures this object into the inner reader, dropping the decoder.
    pub fn into_inner(self) -> READ
    where
        READ: Sized,
    {
        self.source
    }
    /// Destructures this object into both the inner reader and [FrameDecoder].
    pub fn into_parts(self) -> (READ, DEC)
    where
        READ: Sized,
    {
        (self.source, self.decoder)
    }
    /// Destructures this object into the inner [FrameDecoder], dropping the reader.
    pub fn into_frame_decoder(self) -> DEC {
        self.decoder
    }
}
impl<READ: Read, DEC: BorrowMut<FrameDecoder>> Read for StreamingDecoder<READ, DEC> {
    /// Decode as many blocks as needed to serve `buf`, then hand the
    /// collected bytes out of the wrapped [FrameDecoder].
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
        let decoder = self.decoder.borrow_mut();
        if decoder.is_finished() && decoder.can_collect() == 0 {
            //No more bytes can ever be decoded
            return Ok(0);
        }
        // need to loop. The UpToBytes strategy doesn't take any effort to actually reach that limit.
        // The first few calls can result in just filling the decode buffer but these bytes can not be collected.
        // So we need to call this until we can actually collect enough bytes
        // TODO add BlockDecodingStrategy::UntilCollectable(usize) that pushes this logic into the decode_blocks function
        while decoder.can_collect() < buf.len() && !decoder.is_finished() {
            //More bytes can be decoded
            let additional_bytes_needed = buf.len() - decoder.can_collect();
            match decoder.decode_blocks(
                &mut self.source,
                BlockDecodingStrategy::UptoBytes(additional_bytes_needed),
            ) {
                Ok(_) => { /*Nothing to do*/ }
                Err(e) => {
                    // Wrap the decoder error into an io::Error; the no_std io
                    // shim lacks Error::other, hence the cfg split.
                    let err;
                    #[cfg(feature = "std")]
                    {
                        err = Error::other(e);
                    }
                    #[cfg(not(feature = "std"))]
                    {
                        err = Error::new(ErrorKind::Other, alloc::boxed::Box::new(e));
                    }
                    return Err(err);
                }
            }
        }
        decoder.read(buf)
    }
}

View File

@@ -0,0 +1,64 @@
use crate::blocks::block::BlockType;
use alloc::vec::Vec;
/// Describes one block of a frame: its kind, its size and whether it is the
/// final block. Serialized as a 3 byte header in front of the block data.
#[derive(Debug)]
pub struct BlockHeader {
    /// Signals if this block is the last one.
    /// The frame will end after this block.
    pub last_block: bool,
    /// Influences the meaning of `block_size`.
    pub block_type: BlockType,
    /// - For `Raw` blocks, this is the size of the block's
    ///   content in bytes.
    /// - For `RLE` blocks, there will be a single byte following
    ///   the header, repeated `block_size` times.
    /// - For `Compressed` blocks, this is the length of
    ///   the compressed data.
    ///
    /// **This value must not be greater than 21 bits in length.**
    pub block_size: u32,
}
impl BlockHeader {
    /// Write encoded binary representation of this header into the provided buffer.
    ///
    /// The header is 3 little-endian bytes: bit 0 is the last-block flag,
    /// bits 1-2 the block type and bits 3-23 the block size.
    ///
    /// # Panics
    /// Panics when the block type is [BlockType::Reserved], and (in debug
    /// builds) when `block_size` exceeds the documented 21 bit limit.
    pub fn serialize(self, output: &mut Vec<u8>) {
        vprintln!("Serializing block with the header: {self:?}");
        let encoded_block_type = match self.block_type {
            BlockType::Raw => 0,
            BlockType::RLE => 1,
            BlockType::Compressed => 2,
            BlockType::Reserved => panic!("You cannot use a reserved block type"),
        };
        // Enforce the documented invariant: the size must fit the 21 bits left
        // after the flag and type fields, otherwise it would silently be
        // truncated by the 3-byte slice below.
        debug_assert!(
            self.block_size < (1 << 21),
            "block_size must fit into 21 bits, got {}",
            self.block_size
        );
        let mut block_header = self.block_size << 3;
        block_header |= encoded_block_type << 1;
        block_header |= self.last_block as u32;
        // Only the low three bytes belong to the header.
        output.extend_from_slice(&block_header.to_le_bytes()[0..3]);
    }
}
#[cfg(test)]
mod tests {
    use super::BlockHeader;
    use crate::{blocks::block::BlockType, decoding::block_decoder};
    use alloc::vec::Vec;
    /// Round-trips a header through serialization and the block decoder's
    /// header parser, checking all three fields survive.
    #[test]
    fn block_header_serialize() {
        let header = BlockHeader {
            last_block: true,
            block_type: super::BlockType::Compressed,
            block_size: 69,
        };
        let mut serialized_header = Vec::new();
        header.serialize(&mut serialized_header);
        let mut decoder = block_decoder::new();
        let parsed_header = decoder
            .read_block_header(serialized_header.as_slice())
            .unwrap()
            .0;
        assert!(parsed_header.last_block);
        assert_eq!(parsed_header.block_type, BlockType::Compressed);
        assert_eq!(parsed_header.content_size, 69);
    }
}

View File

@@ -0,0 +1,376 @@
use alloc::vec::Vec;
use crate::{
bit_io::BitWriter,
encoding::frame_compressor::CompressState,
encoding::{Matcher, Sequence},
fse::fse_encoder::{build_table_from_data, FSETable, State},
huff0::huff0_encoder,
};
/// Compress one block: drain the matcher for literals and matches, then write
/// the literals section followed by the sequences section into `output`.
pub fn compress_block<M: Matcher>(state: &mut CompressState<M>, output: &mut Vec<u8>) {
    // Collect all literal bytes and back-reference sequences for this block.
    let mut literals_vec = Vec::new();
    let mut sequences = Vec::new();
    state.matcher.start_matching(|seq| {
        match seq {
            Sequence::Literals { literals } => literals_vec.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                literals_vec.extend_from_slice(literals);
                sequences.push(crate::blocks::sequence_section::Sequence {
                    ll: literals.len() as u32,
                    ml: match_len as u32,
                    of: (offset + 3) as u32, // TODO make use of the offset history
                });
            }
        }
    });
    // literals section
    let mut writer = BitWriter::from(output);
    // Huffman compression is only worthwhile for larger literal runs;
    // small blocks are emitted raw.
    if literals_vec.len() > 1024 {
        if let Some(table) =
            compress_literals(&literals_vec, state.last_huff_table.as_ref(), &mut writer)
        {
            state.last_huff_table.replace(table);
        }
    } else {
        raw_literals(&literals_vec, &mut writer);
    }
    // sequences section
    if sequences.is_empty() {
        // A zero byte signals "no sequences" in the section header.
        writer.write_bits(0u8, 8);
    } else {
        encode_seqnum(sequences.len(), &mut writer);
        // Choose the tables
        // TODO store previously used tables
        let ll_mode = choose_table(
            state.fse_tables.ll_previous.as_ref(),
            &state.fse_tables.ll_default,
            sequences.iter().map(|seq| encode_literal_length(seq.ll).0),
            9,
        );
        let ml_mode = choose_table(
            state.fse_tables.ml_previous.as_ref(),
            &state.fse_tables.ml_default,
            sequences.iter().map(|seq| encode_match_len(seq.ml).0),
            9,
        );
        let of_mode = choose_table(
            state.fse_tables.of_previous.as_ref(),
            &state.fse_tables.of_default,
            sequences.iter().map(|seq| encode_offset(seq.of).0),
            8,
        );
        writer.write_bits(encode_fse_table_modes(&ll_mode, &ml_mode, &of_mode), 8);
        // Table descriptions go out in ll, of, ml order.
        encode_table(&ll_mode, &mut writer);
        encode_table(&of_mode, &mut writer);
        encode_table(&ml_mode, &mut writer);
        encode_sequences(
            &sequences,
            &mut writer,
            ll_mode.as_ref(),
            ml_mode.as_ref(),
            of_mode.as_ref(),
        );
        // Remember freshly encoded tables so later blocks could repeat them.
        if let FseTableMode::Encoded(table) = ll_mode {
            state.fse_tables.ll_previous = Some(table)
        }
        if let FseTableMode::Encoded(table) = ml_mode {
            state.fse_tables.ml_previous = Some(table)
        }
        if let FseTableMode::Encoded(table) = of_mode {
            state.fse_tables.of_previous = Some(table)
        }
    }
    writer.flush();
}
/// How the FSE table for one symbol type is communicated to the decoder.
#[derive(Clone)]
#[allow(clippy::large_enum_variant)]
enum FseTableMode<'a> {
    /// Use the predefined table from the spec; nothing is written to the stream.
    Predefined(&'a FSETable),
    /// Write a freshly built table description into the stream.
    Encoded(FSETable),
    /// Reuse the table from a previous block; nothing is written to the stream.
    RepeateLast(&'a FSETable),
}
impl FseTableMode<'_> {
    /// Borrow the underlying table regardless of how it is communicated.
    pub fn as_ref(&self) -> &FSETable {
        match self {
            Self::Predefined(t) => t,
            Self::RepeateLast(t) => t,
            Self::Encoded(t) => t,
        }
    }
}
/// Decide which table mode to use for one symbol type.
///
/// Currently always builds and encodes a fresh table from `data`; the
/// heuristics for preferring the previous or predefined table are still TODO
/// (note `use_previous_table` is hard-wired to `false`).
fn choose_table<'a>(
    previous: Option<&'a FSETable>,
    default_table: &'a FSETable,
    data: impl Iterator<Item = u8>,
    max_log: u8,
) -> FseTableMode<'a> {
    // TODO check if the new table is better than the predefined and previous table
    let use_new_table = true;
    let use_previous_table = false;
    if use_previous_table {
        FseTableMode::RepeateLast(previous.unwrap())
    } else if use_new_table {
        FseTableMode::Encoded(build_table_from_data(data, max_log, true))
    } else {
        FseTableMode::Predefined(default_table)
    }
}
/// Write the table description into the stream where one is required.
///
/// Predefined and repeated tables carry no description; only freshly encoded
/// tables are written out.
fn encode_table(mode: &FseTableMode<'_>, writer: &mut BitWriter<&mut Vec<u8>>) {
    if let FseTableMode::Encoded(table) = mode {
        table.write_table(writer);
    }
}
/// Pack the three table modes into the sequences section's compression-modes
/// byte.
///
/// Bit layout: literal lengths in bits 6-7, offsets in bits 4-5, match
/// lengths in bits 2-3; the low two bits stay zero.
fn encode_fse_table_modes(
    ll_mode: &FseTableMode<'_>,
    ml_mode: &FseTableMode<'_>,
    of_mode: &FseTableMode<'_>,
) -> u8 {
    let bits_for = |mode: &FseTableMode<'_>| -> u8 {
        match mode {
            FseTableMode::Predefined(_) => 0,
            FseTableMode::Encoded(_) => 2,
            FseTableMode::RepeateLast(_) => 3,
        }
    };
    (bits_for(ll_mode) << 6) | (bits_for(of_mode) << 4) | (bits_for(ml_mode) << 2)
}
/// Write all sequences as an interleaved FSE bit stream.
///
/// The decoder reads the stream backwards, so the last sequence's extra bits
/// go out first and the remaining sequences are emitted in reverse order;
/// the initial decoder states are written at the very end.
fn encode_sequences(
    sequences: &[crate::blocks::sequence_section::Sequence],
    writer: &mut BitWriter<&mut Vec<u8>>,
    ll_table: &FSETable,
    ml_table: &FSETable,
    of_table: &FSETable,
) {
    // Start from the final sequence; its codes seed the three FSE states.
    let sequence = sequences[sequences.len() - 1];
    let (ll_code, ll_add_bits, ll_num_bits) = encode_literal_length(sequence.ll);
    let (of_code, of_add_bits, of_num_bits) = encode_offset(sequence.of);
    let (ml_code, ml_add_bits, ml_num_bits) = encode_match_len(sequence.ml);
    let mut ll_state: &State = ll_table.start_state(ll_code);
    let mut ml_state: &State = ml_table.start_state(ml_code);
    let mut of_state: &State = of_table.start_state(of_code);
    writer.write_bits(ll_add_bits, ll_num_bits);
    writer.write_bits(ml_add_bits, ml_num_bits);
    writer.write_bits(of_add_bits, of_num_bits);
    // encode backwards so the decoder reads the first sequence first
    if sequences.len() > 1 {
        for sequence in (0..=sequences.len() - 2).rev() {
            let sequence = sequences[sequence];
            let (ll_code, ll_add_bits, ll_num_bits) = encode_literal_length(sequence.ll);
            let (of_code, of_add_bits, of_num_bits) = encode_offset(sequence.of);
            let (ml_code, ml_add_bits, ml_num_bits) = encode_match_len(sequence.ml);
            // Per symbol: advance the state and write the bits that let the
            // decoder reverse this transition (of, then ml, then ll).
            {
                let next = of_table.next_state(of_code, of_state.index);
                let diff = of_state.index - next.baseline;
                writer.write_bits(diff as u64, next.num_bits as usize);
                of_state = next;
            }
            {
                let next = ml_table.next_state(ml_code, ml_state.index);
                let diff = ml_state.index - next.baseline;
                writer.write_bits(diff as u64, next.num_bits as usize);
                ml_state = next;
            }
            {
                let next = ll_table.next_state(ll_code, ll_state.index);
                let diff = ll_state.index - next.baseline;
                writer.write_bits(diff as u64, next.num_bits as usize);
                ll_state = next;
            }
            writer.write_bits(ll_add_bits, ll_num_bits);
            writer.write_bits(ml_add_bits, ml_num_bits);
            writer.write_bits(of_add_bits, of_num_bits);
        }
    }
    // The decoder's initial states, each accuracy_log bits wide.
    writer.write_bits(ml_state.index as u64, ml_table.table_size.ilog2() as usize);
    writer.write_bits(of_state.index as u64, of_table.table_size.ilog2() as usize);
    writer.write_bits(ll_state.index as u64, ll_table.table_size.ilog2() as usize);
    // Terminate the stream with a single 1 bit padded up to the byte boundary.
    let bits_to_fill = writer.misaligned();
    if bits_to_fill == 0 {
        writer.write_bits(1u32, 8);
    } else {
        writer.write_bits(1u32, bits_to_fill);
    }
}
/// Write the sequence-count field of the sequences section header in its
/// 1, 2 or 3 byte encoding.
///
/// # Panics
/// Panics for 0 (callers emit a plain zero byte for that case) and for counts
/// above the representable maximum.
fn encode_seqnum(seqnum: usize, writer: &mut BitWriter<impl AsMut<Vec<u8>>>) {
    // Largest count representable by the 3-byte form.
    const UPPER_LIMIT: usize = 0xFFFF + 0x7F00;
    match seqnum {
        // One byte, high bit clear.
        1..=127 => writer.write_bits(seqnum as u32, 8),
        // Two bytes, high bit of the first byte set.
        128..=0x7FFF => {
            let upper = ((seqnum >> 8) | 0x80) as u8;
            let lower = seqnum as u8;
            writer.write_bits(upper, 8);
            writer.write_bits(lower, 8);
        }
        // Three bytes: a 0xFF marker then (seqnum - 0x7F00), high byte first.
        0x8000..=UPPER_LIMIT => {
            let encode = seqnum - 0x7F00;
            let upper = (encode >> 8) as u8;
            let lower = encode as u8;
            writer.write_bits(255u8, 8);
            writer.write_bits(upper, 8);
            writer.write_bits(lower, 8);
        }
        _ => unreachable!(),
    }
}
/// Map a literal length onto (code, extra-bit payload, number of extra bits)
/// per the literals length code table of the Zstandard format.
fn encode_literal_length(len: u32) -> (u8, u32, usize) {
    // Codes 0..=15 encode the length directly with no extra bits.
    if len < 16 {
        return (len as u8, 0, 0);
    }
    // 131071 is the largest representable literal length.
    if len >= 131072 {
        unreachable!()
    }
    // (code, baseline, extra bits) rows for codes 16..=35, ascending baselines.
    const ROWS: [(u8, u32, usize); 20] = [
        (16, 16, 1),
        (17, 18, 1),
        (18, 20, 1),
        (19, 22, 1),
        (20, 24, 2),
        (21, 28, 2),
        (22, 32, 3),
        (23, 40, 3),
        (24, 48, 4),
        (25, 64, 6),
        (26, 128, 7),
        (27, 256, 8),
        (28, 512, 9),
        (29, 1024, 10),
        (30, 2048, 11),
        (31, 4096, 12),
        (32, 8192, 13),
        (33, 16384, 14),
        (34, 32768, 15),
        (35, 65536, 16),
    ];
    // Pick the row with the largest baseline that is still <= len.
    let &(code, baseline, num_bits) = ROWS
        .iter()
        .rev()
        .find(|&&(_, baseline, _)| len >= baseline)
        .unwrap();
    (code, len - baseline, num_bits)
}
/// Map a match length onto (code, extra-bit payload, number of extra bits)
/// per the match length code table of the Zstandard format.
///
/// # Panics
/// Match lengths below 3 and above 131074 are not representable.
fn encode_match_len(len: u32) -> (u8, u32, usize) {
    match len {
        0..=2 => unreachable!(),
        // Codes 0..=31 encode lengths 3..=34 directly with no extra bits.
        3..=34 => (len as u8 - 3, 0, 0),
        35..=36 => (32, len - 35, 1),
        37..=38 => (33, len - 37, 1),
        39..=40 => (34, len - 39, 1),
        41..=42 => (35, len - 41, 1),
        43..=46 => (36, len - 43, 2),
        47..=50 => (37, len - 47, 2),
        51..=58 => (38, len - 51, 3),
        59..=66 => (39, len - 59, 3),
        67..=82 => (40, len - 67, 4),
        83..=98 => (41, len - 83, 4),
        99..=130 => (42, len - 99, 5),
        131..=258 => (43, len - 131, 7),
        259..=514 => (44, len - 259, 8),
        515..=1026 => (45, len - 515, 9),
        1027..=2050 => (46, len - 1027, 10),
        2051..=4098 => (47, len - 2051, 11),
        4099..=8194 => (48, len - 4099, 12),
        8195..=16386 => (49, len - 8195, 13),
        16387..=32770 => (50, len - 16387, 14),
        32771..=65538 => (51, len - 32771, 15),
        // Bug fix: the baseline for code 52 is 65539 (see the decoder's
        // lookup_ml_code); this previously subtracted 32771 and produced
        // wrong extra bits for lengths in this range.
        65539..=131074 => (52, len - 65539, 16),
        131075.. => unreachable!(),
    }
}
/// Split an offset value into its code (the position of its highest set bit)
/// and the remaining lower bits, which become the extra-bit payload. The
/// number of extra bits equals the code.
fn encode_offset(len: u32) -> (u8, u32, usize) {
    let code = len.ilog2();
    // Everything below the top bit is carried in the extra bits.
    let extra = len - (1u32 << code);
    (code as u8, extra, code as usize)
}
/// Write the literals section in raw (uncompressed) form: 2 bits literals
/// block type (0 = raw), 2 bits size format (0b11 selects a 20 bit length
/// field), the length, then the literal bytes themselves.
fn raw_literals(literals: &[u8], writer: &mut BitWriter<&mut Vec<u8>>) {
    writer.write_bits(0u8, 2);
    writer.write_bits(0b11u8, 2);
    writer.write_bits(literals.len() as u32, 20);
    writer.append_bytes(literals);
}
/// Huffman-compress the literals section, reusing `last_table` when it can
/// encode the data and a fresh table would not gain enough.
///
/// Returns the newly built table when one was written into the stream, `None`
/// when the previous table was reused or the output fell back to raw
/// literals (compression did not shrink the data).
fn compress_literals(
    literals: &[u8],
    last_table: Option<&huff0_encoder::HuffmanTable>,
    writer: &mut BitWriter<&mut Vec<u8>>,
) -> Option<huff0_encoder::HuffmanTable> {
    // Remember where we started so we can roll back to raw literals below.
    let reset_idx = writer.index();
    let new_encoder_table = huff0_encoder::HuffmanTable::build_from_data(literals);
    // Decide between reusing the previous table (treeless block) and writing
    // the freshly built one.
    let (encoder_table, new_table) = if let Some(_table) = last_table {
        if let Some(diff) = _table.can_encode(&new_encoder_table) {
            // TODO this is a very simple heuristic, maybe we should try to do better
            if diff > 5 {
                (&new_encoder_table, true)
            } else {
                (_table, false)
            }
        } else {
            (&new_encoder_table, true)
        }
    } else {
        (&new_encoder_table, true)
    };
    if new_table {
        writer.write_bits(2u8, 2); // compressed literals type
    } else {
        writer.write_bits(3u8, 2); // treeless compressed literals type
    }
    // Pick the header size format based on how many literals there are;
    // size_format 0 means a single Huffman stream, the others use 4 streams.
    let (size_format, size_bits) = match literals.len() {
        0..6 => (0b00u8, 10),
        6..1024 => (0b01, 10),
        1024..16384 => (0b10, 14),
        16384..262144 => (0b11, 18),
        _ => unimplemented!("too many literals"),
    };
    writer.write_bits(size_format, 2);
    writer.write_bits(literals.len() as u32, size_bits);
    // Reserve space for the compressed size; it is patched in afterwards.
    let size_index = writer.index();
    writer.write_bits(0u32, size_bits);
    let index_before = writer.index();
    let mut encoder = huff0_encoder::HuffmanEncoder::new(encoder_table, writer);
    if size_format == 0 {
        encoder.encode(literals, new_table)
    } else {
        encoder.encode4x(literals, new_table)
    };
    let encoded_len = (writer.index() - index_before) / 8;
    writer.change_bits(size_index, encoded_len as u64, size_bits);
    let total_len = (writer.index() - reset_idx) / 8;
    // If encoded len is bigger than the raw literals we are better off just writing the raw literals here
    if total_len >= literals.len() {
        writer.reset_to(reset_idx);
        raw_literals(literals, writer);
        None
    } else if new_table {
        Some(new_encoder_table)
    } else {
        None
    }
}

View File

@@ -0,0 +1,8 @@
//! After Magic_Number and Frame_Header, there are some number of blocks. Each frame must have at least one block,
//! but there is no upper limit on the number of blocks per frame.
//!
//! There are a few different kinds of blocks, and implementations for those kinds are
//! in this module.
mod compressed;
pub(super) use compressed::*;

View File

@@ -0,0 +1,461 @@
//! Utilities and interfaces for encoding an entire frame. Allows reusing resources
use alloc::vec::Vec;
use core::convert::TryInto;
#[cfg(feature = "hash")]
use twox_hash::XxHash64;
#[cfg(feature = "hash")]
use core::hash::Hasher;
use super::{
block_header::BlockHeader, frame_header::FrameHeader, levels::*,
match_generator::MatchGeneratorDriver, CompressionLevel, Matcher,
};
use crate::fse::fse_encoder::{default_ll_table, default_ml_table, default_of_table, FSETable};
use crate::io::{Read, Write};
/// An interface for compressing arbitrary data with the ZStandard compression algorithm.
///
/// `FrameCompressor` will generally be used by:
/// 1. Initializing a compressor by providing a buffer of data using `FrameCompressor::new()`
/// 2. Starting compression and writing that compression into a vec using `FrameCompressor::begin`
///
/// # Examples
/// ```
/// use ruzstd::encoding::{FrameCompressor, CompressionLevel};
/// let mock_data: &[_] = &[0x1, 0x2, 0x3, 0x4];
/// let mut output = std::vec::Vec::new();
/// // Initialize a compressor.
/// let mut compressor = FrameCompressor::new(CompressionLevel::Uncompressed);
/// compressor.set_source(mock_data);
/// compressor.set_drain(&mut output);
///
/// // `compress` writes the compressed output into the provided buffer.
/// compressor.compress();
/// ```
pub struct FrameCompressor<R: Read, W: Write, M: Matcher> {
    /// Source of the data to compress (set via `set_source`).
    uncompressed_data: Option<R>,
    /// Sink for the compressed output (set via `set_drain`).
    compressed_data: Option<W>,
    /// How much effort is spent compressing.
    compression_level: CompressionLevel,
    /// Reusable per-frame state: matcher and entropy tables.
    state: CompressState<M>,
    // NOTE(review): presumably feeds the optional frame checksum — confirm
    // where the hash is finalized/written.
    #[cfg(feature = "hash")]
    hasher: XxHash64,
}
/// FSE tables used while encoding sequences: the spec-defined default table
/// for each code type (literal-length, match-length, offset), plus an optional
/// "previous" table slot per type (`None` until one is recorded).
pub(crate) struct FseTables {
    // Default literal-length table.
    pub(crate) ll_default: FSETable,
    // Previously used literal-length table, if any.
    pub(crate) ll_previous: Option<FSETable>,
    // Default match-length table.
    pub(crate) ml_default: FSETable,
    // Previously used match-length table, if any.
    pub(crate) ml_previous: Option<FSETable>,
    // Default offset table.
    pub(crate) of_default: FSETable,
    // Previously used offset table, if any.
    pub(crate) of_previous: Option<FSETable>,
}
impl FseTables {
    /// Build the table set from the spec-defined default distributions,
    /// with no "previous" tables recorded yet.
    pub fn new() -> Self {
        let ll_default = default_ll_table();
        let ml_default = default_ml_table();
        let of_default = default_of_table();
        Self {
            ll_default,
            ml_default,
            of_default,
            ll_previous: None,
            ml_previous: None,
            of_previous: None,
        }
    }
}
/// State that persists across the blocks of a frame while it is being compressed.
pub(crate) struct CompressState<M: Matcher> {
    // Finds repeated byte sequences in the input window.
    pub(crate) matcher: M,
    // Huffman table from a previous block's literals, if any — presumably kept
    // so later blocks can reuse it; confirm against the block encoder.
    pub(crate) last_huff_table: Option<crate::huff0::huff0_encoder::HuffmanTable>,
    // FSE tables used for sequence encoding.
    pub(crate) fse_tables: FseTables,
}
impl<R: Read, W: Write> FrameCompressor<R, W, MatchGeneratorDriver> {
    /// Create a new `FrameCompressor` that uses the built-in match generator.
    pub fn new(compression_level: CompressionLevel) -> Self {
        // 128 KiB work slices, one slice held in the match window.
        let matcher = MatchGeneratorDriver::new(1024 * 128, 1);
        let state = CompressState {
            matcher,
            last_huff_table: None,
            fse_tables: FseTables::new(),
        };
        Self {
            uncompressed_data: None,
            compressed_data: None,
            compression_level,
            state,
            #[cfg(feature = "hash")]
            hasher: XxHash64::with_seed(0),
        }
    }
}
impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
    /// Create a new `FrameCompressor` with a custom matching algorithm implementation
    pub fn new_with_matcher(matcher: M, compression_level: CompressionLevel) -> Self {
        Self {
            uncompressed_data: None,
            compressed_data: None,
            state: CompressState {
                matcher,
                last_huff_table: None,
                fse_tables: FseTables::new(),
            },
            compression_level,
            #[cfg(feature = "hash")]
            hasher: XxHash64::with_seed(0),
        }
    }
    /// Before calling [FrameCompressor::compress] you need to set the source.
    ///
    /// This is the data that is compressed and written into the drain.
    /// Returns the source that was previously set, if any.
    pub fn set_source(&mut self, uncompressed_data: R) -> Option<R> {
        self.uncompressed_data.replace(uncompressed_data)
    }
    /// Before calling [FrameCompressor::compress] you need to set the drain.
    ///
    /// As the compressor compresses data, the drain serves as a place for the output to be written.
    /// Returns the drain that was previously set, if any.
    pub fn set_drain(&mut self, compressed_data: W) -> Option<W> {
        self.compressed_data.replace(compressed_data)
    }
    /// Compress the uncompressed data from the provided source as one Zstd frame and write it to the provided drain
    ///
    /// This will repeatedly call [Read::read] on the source to fill up blocks until the source returns 0 on the read call.
    /// Also [Write::write_all] will be called on the drain after each block has been encoded.
    ///
    /// To avoid endlessly encoding from a potentially endless source (like a network socket) you can use the
    /// [Read::take] function
    ///
    /// # Panics
    /// Panics if the source or the drain has not been set, or if reading from
    /// the source / writing to the drain fails.
    pub fn compress(&mut self) {
        // Clearing buffers to allow re-using of the compressor
        self.state.matcher.reset(self.compression_level);
        self.state.last_huff_table = None;
        // The running checksum must be reset too. Without this, re-using the
        // compressor folded the previous frame's bytes into the new frame's
        // content checksum, producing a frame that fails verification.
        #[cfg(feature = "hash")]
        {
            self.hasher = XxHash64::with_seed(0);
        }
        // NOTE(review): `state.fse_tables.*_previous` are not cleared here —
        // confirm block encoding never reuses a "previous" table across frames.
        let source = self.uncompressed_data.as_mut().unwrap();
        let drain = self.compressed_data.as_mut().unwrap();
        // As the frame is compressed, it's staged here before being written to the drain
        let output: &mut Vec<u8> = &mut Vec::with_capacity(1024 * 130);
        // First write the frame header
        let header = FrameHeader {
            frame_content_size: None,
            single_segment: false,
            content_checksum: cfg!(feature = "hash"),
            dictionary_id: None,
            window_size: Some(self.state.matcher.window_size()),
        };
        header.serialize(output);
        // Now compress block by block
        loop {
            // Read a single block's worth of uncompressed data from the input
            let mut uncompressed_data = self.state.matcher.get_next_space();
            let mut read_bytes = 0;
            let last_block;
            'read_loop: loop {
                let new_bytes = source.read(&mut uncompressed_data[read_bytes..]).unwrap();
                if new_bytes == 0 {
                    // The source is exhausted, this will be the frame's final block
                    last_block = true;
                    break 'read_loop;
                }
                read_bytes += new_bytes;
                if read_bytes == uncompressed_data.len() {
                    last_block = false;
                    break 'read_loop;
                }
            }
            uncompressed_data.resize(read_bytes, 0);
            // As we read, hash that data too
            #[cfg(feature = "hash")]
            self.hasher.write(&uncompressed_data);
            // Special handling is needed for compression of a totally empty file (why you'd want to do that, I don't know)
            if uncompressed_data.is_empty() {
                let header = BlockHeader {
                    last_block: true,
                    block_type: crate::blocks::block::BlockType::Raw,
                    block_size: 0,
                };
                // Write the header, then the block
                header.serialize(output);
                drain.write_all(output).unwrap();
                output.clear();
                break;
            }
            match self.compression_level {
                CompressionLevel::Uncompressed => {
                    let header = BlockHeader {
                        last_block,
                        block_type: crate::blocks::block::BlockType::Raw,
                        block_size: read_bytes.try_into().unwrap(),
                    };
                    // Write the header, then the block
                    header.serialize(output);
                    output.extend_from_slice(&uncompressed_data);
                }
                CompressionLevel::Fastest => {
                    compress_fastest(&mut self.state, last_block, uncompressed_data, output)
                }
                _ => {
                    unimplemented!();
                }
            }
            drain.write_all(output).unwrap();
            output.clear();
            if last_block {
                break;
            }
        }
        // If the `hash` feature is enabled, then `content_checksum` is set to true in the header
        // and the lower 32 bits of the XxHash64 digest are appended after the last block.
        #[cfg(feature = "hash")]
        {
            // The input was already fed into the hasher while it was being read
            // above, so the digest can be finalized directly here.
            let content_checksum = self.hasher.finish();
            drain
                .write_all(&(content_checksum as u32).to_le_bytes())
                .unwrap();
        }
    }
    /// Get a mutable reference to the source
    pub fn source_mut(&mut self) -> Option<&mut R> {
        self.uncompressed_data.as_mut()
    }
    /// Get a mutable reference to the drain
    pub fn drain_mut(&mut self) -> Option<&mut W> {
        self.compressed_data.as_mut()
    }
    /// Get a reference to the source
    pub fn source(&self) -> Option<&R> {
        self.uncompressed_data.as_ref()
    }
    /// Get a reference to the drain
    pub fn drain(&self) -> Option<&W> {
        self.compressed_data.as_ref()
    }
    /// Retrieve the source, leaving the compressor without one
    pub fn take_source(&mut self) -> Option<R> {
        self.uncompressed_data.take()
    }
    /// Retrieve the drain, leaving the compressor without one
    pub fn take_drain(&mut self) -> Option<W> {
        self.compressed_data.take()
    }
    /// Before calling [FrameCompressor::compress] you can replace the matcher,
    /// receiving the previously installed one in return
    pub fn replace_matcher(&mut self, mut match_generator: M) -> M {
        core::mem::swap(&mut match_generator, &mut self.state.matcher);
        match_generator
    }
    /// Before calling [FrameCompressor::compress] you can replace the compression level,
    /// receiving the previously set level in return
    pub fn set_compression_level(
        &mut self,
        compression_level: CompressionLevel,
    ) -> CompressionLevel {
        let old = self.compression_level;
        self.compression_level = compression_level;
        old
    }
    /// Get the current compression level
    pub fn compression_level(&self) -> CompressionLevel {
        self.compression_level
    }
}
#[cfg(test)]
mod tests {
    use alloc::vec;
    use super::FrameCompressor;
    use crate::common::MAGIC_NUM;
    use crate::decoding::FrameDecoder;
    use alloc::vec::Vec;
    // Every zstd frame must begin with the little-endian magic number.
    #[test]
    fn frame_starts_with_magic_num() {
        let mock_data = [1_u8, 2, 3].as_slice();
        let mut output: Vec<u8> = Vec::new();
        let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
        compressor.set_source(mock_data);
        compressor.set_drain(&mut output);
        compressor.compress();
        assert!(output.starts_with(&MAGIC_NUM.to_le_bytes()));
    }
    // Smoke test: raw (uncompressed) encoding of a tiny input must not panic.
    #[test]
    fn very_simple_raw_compress() {
        let mock_data = [1_u8, 2, 3].as_slice();
        let mut output: Vec<u8> = Vec::new();
        let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
        compressor.set_source(mock_data);
        compressor.set_drain(&mut output);
        compressor.compress();
    }
    // Multi-block input must round-trip through both our decoder and the
    // reference zstd decoder.
    #[test]
    fn very_simple_compress() {
        let mut mock_data = vec![0; 1 << 17];
        mock_data.extend(vec![1; (1 << 17) - 1]);
        mock_data.extend(vec![2; (1 << 18) - 1]);
        mock_data.extend(vec![2; 1 << 17]);
        mock_data.extend(vec![3; (1 << 17) - 1]);
        let mut output: Vec<u8> = Vec::new();
        let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
        compressor.set_source(mock_data.as_slice());
        compressor.set_drain(&mut output);
        compressor.compress();
        let mut decoder = FrameDecoder::new();
        let mut decoded = Vec::with_capacity(mock_data.len());
        decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
        assert_eq!(mock_data, decoded);
        let mut decoded = Vec::new();
        zstd::stream::copy_decode(output.as_slice(), &mut decoded).unwrap();
        assert_eq!(mock_data, decoded);
    }
    // A long run of a single byte value must round-trip through our decoder.
    #[test]
    fn rle_compress() {
        let mock_data = vec![0; 1 << 19];
        let mut output: Vec<u8> = Vec::new();
        let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
        compressor.set_source(mock_data.as_slice());
        compressor.set_drain(&mut output);
        compressor.compress();
        let mut decoder = FrameDecoder::new();
        let mut decoded = Vec::with_capacity(mock_data.len());
        decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
        assert_eq!(mock_data, decoded);
    }
    // A tiny non-repetitive input must round-trip through both decoders.
    #[test]
    fn aaa_compress() {
        let mock_data = vec![0, 1, 3, 4, 5];
        let mut output: Vec<u8> = Vec::new();
        let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
        compressor.set_source(mock_data.as_slice());
        compressor.set_drain(&mut output);
        compressor.compress();
        let mut decoder = FrameDecoder::new();
        let mut decoded = Vec::with_capacity(mock_data.len());
        decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
        assert_eq!(mock_data, decoded);
        let mut decoded = Vec::new();
        zstd::stream::copy_decode(output.as_slice(), &mut decoded).unwrap();
        assert_eq!(mock_data, decoded);
    }
    // Replays saved fuzzer artifacts (if present on disk) through the encoder
    // and decoders, cross-checking against the reference zstd crate.
    #[cfg(feature = "std")]
    #[test]
    fn fuzz_targets() {
        use std::io::Read;
        // Decode with the streaming decoder API.
        fn decode_ruzstd(data: &mut dyn std::io::Read) -> Vec<u8> {
            let mut decoder = crate::decoding::StreamingDecoder::new(data).unwrap();
            let mut result: Vec<u8> = Vec::new();
            decoder.read_to_end(&mut result).expect("Decoding failed");
            result
        }
        // Decode with the lower-level block-by-block API.
        fn decode_ruzstd_writer(mut data: impl Read) -> Vec<u8> {
            let mut decoder = crate::decoding::FrameDecoder::new();
            decoder.reset(&mut data).unwrap();
            let mut result = vec![];
            while !decoder.is_finished() || decoder.can_collect() > 0 {
                decoder
                    .decode_blocks(
                        &mut data,
                        crate::decoding::BlockDecodingStrategy::UptoBytes(1024 * 1024),
                    )
                    .unwrap();
                decoder.collect_to_writer(&mut result).unwrap();
            }
            result
        }
        fn encode_zstd(data: &[u8]) -> Result<Vec<u8>, std::io::Error> {
            zstd::stream::encode_all(std::io::Cursor::new(data), 3)
        }
        fn encode_ruzstd_uncompressed(data: &mut dyn std::io::Read) -> Vec<u8> {
            let mut input = Vec::new();
            data.read_to_end(&mut input).unwrap();
            crate::encoding::compress_to_vec(
                input.as_slice(),
                crate::encoding::CompressionLevel::Uncompressed,
            )
        }
        fn encode_ruzstd_compressed(data: &mut dyn std::io::Read) -> Vec<u8> {
            let mut input = Vec::new();
            data.read_to_end(&mut input).unwrap();
            crate::encoding::compress_to_vec(
                input.as_slice(),
                crate::encoding::CompressionLevel::Fastest,
            )
        }
        fn decode_zstd(data: &[u8]) -> Result<Vec<u8>, std::io::Error> {
            let mut output = Vec::new();
            zstd::stream::copy_decode(data, &mut output)?;
            Ok(output)
        }
        if std::fs::exists("fuzz/artifacts/interop").unwrap_or(false) {
            for file in std::fs::read_dir("fuzz/artifacts/interop").unwrap() {
                if file.as_ref().unwrap().file_type().unwrap().is_file() {
                    let data = std::fs::read(file.unwrap().path()).unwrap();
                    let data = data.as_slice();
                    // Decoding
                    let compressed = encode_zstd(data).unwrap();
                    let decoded = decode_ruzstd(&mut compressed.as_slice());
                    let decoded2 = decode_ruzstd_writer(&mut compressed.as_slice());
                    assert!(
                        decoded == data,
                        "Decoded data did not match the original input during decompression"
                    );
                    assert_eq!(
                        decoded2, data,
                        "Decoded data did not match the original input during decompression"
                    );
                    // Encoding
                    // Uncompressed encoding
                    let mut input = data;
                    let compressed = encode_ruzstd_uncompressed(&mut input);
                    let decoded = decode_zstd(&compressed).unwrap();
                    assert_eq!(
                        decoded, data,
                        "Decoded data did not match the original input during compression"
                    );
                    // Compressed encoding
                    let mut input = data;
                    let compressed = encode_ruzstd_compressed(&mut input);
                    let decoded = decode_zstd(&compressed).unwrap();
                    assert_eq!(
                        decoded, data,
                        "Decoded data did not match the original input during compression"
                    );
                }
            }
        }
    }
}

View File

@@ -0,0 +1,231 @@
//! Utilities and representations for a frame header.
use crate::bit_io::BitWriter;
use crate::common::MAGIC_NUM;
use crate::encoding::util::{find_min_size, minify_val};
use alloc::vec::Vec;
/// A header for a single Zstandard frame.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header>
#[derive(Debug)]
pub struct FrameHeader {
    /// Optionally, the original (uncompressed) size of the data within the frame in bytes.
    /// If not present, `window_size` must be set instead.
    pub frame_content_size: Option<u64>,
    /// If set to true, data must be regenerated within a single
    /// continuous memory segment; `frame_content_size` must then be present.
    pub single_segment: bool,
    /// If set to true, a 32 bit content checksum will be present
    /// at the end of the frame.
    pub content_checksum: bool,
    /// If a dictionary ID is provided, the ID of that dictionary.
    pub dictionary_id: Option<u64>,
    /// The minimum memory buffer required to process the frame. Must be present
    /// when `single_segment` is false. Per the spec this value must be greater
    /// than 1 KB and less than 3.75 TB, and encoders should not generate frames
    /// that require a window size larger than 8 MB.
    pub window_size: Option<u64>,
}
impl FrameHeader {
    /// Writes the serialized frame header into the provided buffer.
    ///
    /// The serialized header *does include* the frame header descriptor byte.
    pub fn serialize(self, output: &mut Vec<u8>) {
        vprintln!("Serializing frame with header: {self:?}");
        // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header
        // Magic Number:
        output.extend_from_slice(&MAGIC_NUM.to_le_bytes());
        // `Frame_Header_Descriptor`:
        output.push(self.descriptor());
        // `Window_Descriptor` (only present when the single segment flag is unset):
        // TODO: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
        if !self.single_segment {
            if let Some(window_size) = self.window_size {
                // The window size is rounded up to a power of two and encoded as
                // an exponent relative to 2^10; the 3-bit mantissa is left at 0.
                let log = window_size.next_power_of_two().ilog2();
                // NOTE(review): window sizes <= 1 KB are forced to exponent 1
                // (2 KB) instead of 0 (1 KB) — confirm this floor is intentional.
                let exponent = if log > 10 { log - 10 } else { 1 } as u8;
                output.push(exponent << 3);
            }
        }
        // `Dictionary_ID` (optional):
        if let Some(id) = self.dictionary_id {
            output.extend(minify_val(id));
        }
        // `Frame_Content_Size` (optional):
        if let Some(frame_content_size) = self.frame_content_size {
            output.extend(minify_val_fcs(frame_content_size));
        }
    }
    /// Generate a serialized frame header descriptor for the frame header.
    ///
    /// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header_descriptor
    fn descriptor(&self) -> u8 {
        let mut bw = BitWriter::new();
        // A frame header starts with a frame header descriptor byte describing
        // which other fields are present. The fields below are written in order:
        // `Dictionary_ID_flag` (2 bits), `Content_Checksum_flag` (1),
        // `Reserved_bit` (1), `Unused_bit` (1), `Single_Segment_flag` (1),
        // `Frame_Content_Size_flag` (2) — presumably packed from the low bits
        // upward to match the spec's bit numbering; confirm BitWriter semantics.
        //
        // `Dictionary_ID_flag`: encodes the size of the `Dictionary_ID` field.
        // | Flag | Size of field (Bytes)
        // |  0   | 0
        // |  1   | 1
        // |  2   | 2
        // |  3   | 4
        if let Some(id) = self.dictionary_id {
            let flag_value: u8 = match find_min_size(id) {
                0 => 0,
                1 => 1,
                2 => 2,
                4 => 3,
                _ => panic!(),
            };
            bw.write_bits(flag_value, 2);
        } else {
            // A `Dictionary_ID` was not provided
            bw.write_bits(0u8, 2);
        }
        // `Content_Checksum_flag`: whether a 32 bit checksum follows the frame data.
        if self.content_checksum {
            bw.write_bits(1u8, 1);
        } else {
            bw.write_bits(0u8, 1);
        }
        // `Reserved_bit`:
        // This value must be zero
        bw.write_bits(0u8, 1);
        // `Unused_bit`:
        // An encoder compliant with this spec must set this bit to zero
        bw.write_bits(0u8, 1);
        // `Single_Segment_flag`:
        // If this flag is set, data must be regenerated within a single continuous memory segment,
        // and the `Frame_Content_Size` field must be present in the header.
        // If this flag is not set, the `Window_Descriptor` field must be present in the frame header.
        if self.single_segment {
            assert!(self.frame_content_size.is_some(), "if the `single_segment` flag is set to true, then a frame content size must be provided");
            bw.write_bits(1u8, 1);
        } else {
            assert!(
                self.window_size.is_some(),
                "if the `single_segment` flag is set to false, then a window size must be provided"
            );
            bw.write_bits(0u8, 1);
        }
        // `Frame_Content_Size_flag`: encodes the size of the FCS field.
        // | Flag | Size of field (Bytes)
        // |  0   | 0 or 1
        // |  1   | 2
        // |  2   | 4
        // |  3   | 8
        if let Some(frame_content_size) = self.frame_content_size {
            let field_size = find_min_size(frame_content_size);
            let flag_value: u8 = match field_size {
                1 => 0,
                2 => 1,
                4 => 2,
                // An 8 byte FCS field is signalled by flag value 3. (This arm
                // was previously transposed as `3 => 8`, which panicked for
                // any content size requiring 8 bytes.)
                8 => 3,
                _ => panic!(),
            };
            // NOTE(review): with `single_segment` unset, flag 0 means the FCS
            // field is *omitted*, yet `serialize` still writes one byte in that
            // case — confirm callers always set `single_segment` for 1-byte FCS.
            bw.write_bits(flag_value, 2);
        } else {
            // `Frame_Content_Size` was not provided
            bw.write_bits(0u8, 2);
        }
        bw.dump()[0]
    }
}
/// Identical to [`minify_val`], but it implements the following edge case:
///
/// > When FCS_Field_Size is 1, 4 or 8 bytes, the value is read directly. When FCS_Field_Size is 2, the offset of 256 is added.
///
/// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_content_size
fn minify_val_fcs(val: u64) -> Vec<u8> {
    let field_size = find_min_size(val);
    // A two-byte FCS field stores `value - 256`; every other size stores the value as-is.
    let stored = if field_size == 2 { val - 256 } else { val };
    stored.to_le_bytes()[..field_size].to_vec()
}
#[cfg(test)]
mod tests {
    use super::FrameHeader;
    use crate::decoding::frame::{read_frame_header, FrameDescriptor};
    use alloc::vec::Vec;
    // The descriptor byte must round-trip through the decoder's FrameDescriptor.
    #[test]
    fn frame_header_descriptor_decode() {
        let header = FrameHeader {
            frame_content_size: Some(1),
            single_segment: true,
            content_checksum: false,
            dictionary_id: None,
            window_size: None,
        };
        let descriptor = header.descriptor();
        let decoded_descriptor = FrameDescriptor(descriptor);
        assert_eq!(decoded_descriptor.frame_content_size_bytes().unwrap(), 1);
        assert!(!decoded_descriptor.content_checksum_flag());
        assert_eq!(decoded_descriptor.dictionary_id_bytes().unwrap(), 0);
    }
    // A fully serialized header must be parseable by the decoder.
    #[test]
    fn frame_header_decode() {
        let header = FrameHeader {
            frame_content_size: Some(1),
            single_segment: true,
            content_checksum: false,
            dictionary_id: None,
            window_size: None,
        };
        let mut serialized_header = Vec::new();
        header.serialize(&mut serialized_header);
        let parsed_header = read_frame_header(serialized_header.as_slice()).unwrap().0;
        assert!(parsed_header.dictionary_id().is_none());
        assert_eq!(parsed_header.frame_content_size(), 1);
    }
    // Setting `single_segment` without a frame content size must be rejected.
    #[test]
    #[should_panic]
    fn catches_single_segment_no_fcs() {
        let header = FrameHeader {
            frame_content_size: None,
            single_segment: true,
            content_checksum: false,
            dictionary_id: None,
            window_size: Some(1),
        };
        let mut serialized_header = Vec::new();
        header.serialize(&mut serialized_header);
    }
    // Leaving `single_segment` unset without a window size must be rejected.
    #[test]
    #[should_panic]
    fn catches_single_segment_no_winsize() {
        let header = FrameHeader {
            frame_content_size: Some(7),
            single_segment: false,
            content_checksum: false,
            dictionary_id: None,
            window_size: None,
        };
        let mut serialized_header = Vec::new();
        header.serialize(&mut serialized_header);
    }
}

View File

@@ -0,0 +1,67 @@
use crate::{
common::MAX_BLOCK_SIZE,
encoding::{
block_header::BlockHeader, blocks::compress_block, frame_compressor::CompressState, Matcher,
},
};
use alloc::vec::Vec;
/// Compresses a single block at [`crate::encoding::CompressionLevel::Fastest`].
///
/// # Parameters
/// - `state`: [`CompressState`] so the compressor can refer to data before
///   the start of this block
/// - `last_block`: whether this is the final block of the frame
///   (the flag is recorded in the block header)
/// - `uncompressed_data`: one block's worth of uncompressed input data
/// - `output`: the encoded block (header followed by payload) is appended here
#[inline]
pub fn compress_fastest<M: Matcher>(
    state: &mut CompressState<M>,
    last_block: bool,
    uncompressed_data: Vec<u8>,
    output: &mut Vec<u8>,
) {
    let block_size = uncompressed_data.len() as u32;
    // A block made up of a single repeated byte can be stored as an RLE block.
    let is_single_byte_run = uncompressed_data.windows(2).all(|pair| pair[0] == pair[1]);
    if is_single_byte_run {
        let rle_byte = uncompressed_data[0];
        state.matcher.commit_space(uncompressed_data);
        state.matcher.skip_matching();
        // Write the header, then the single repeated byte.
        BlockHeader {
            last_block,
            block_type: crate::blocks::block::BlockType::RLE,
            block_size,
        }
        .serialize(output);
        output.push(rle_byte);
        return;
    }
    // Otherwise encode a regular compressed block into a scratch buffer first.
    let mut compressed = Vec::new();
    state.matcher.commit_space(uncompressed_data);
    compress_block(state, &mut compressed);
    if compressed.len() >= MAX_BLOCK_SIZE as usize {
        // The "compressed" payload would exceed the maximum block size,
        // so store the original bytes as a raw block instead.
        BlockHeader {
            last_block,
            block_type: crate::blocks::block::BlockType::Raw,
            block_size,
        }
        .serialize(output);
        output.extend_from_slice(state.matcher.get_last_space());
    } else {
        // Write the header, then the compressed payload.
        BlockHeader {
            last_block,
            block_type: crate::blocks::block::BlockType::Compressed,
            block_size: compressed.len() as u32,
        }
        .serialize(output);
        output.extend(compressed);
    }
}

View File

@@ -0,0 +1,2 @@
mod fastest;
pub use fastest::compress_fastest;

View File

@@ -0,0 +1,619 @@
//! Matching algorithm used find repeated parts in the original data
//!
//! The Zstd format relies on finding repeated sequences of data and compressing these sequences as instructions to the decoder.
//! A sequence basically tells the decoder "Go back X bytes and copy Y bytes to the end of your decode buffer".
//!
//! The task here is to efficiently find matches in the already encoded data for the current suffix of the not yet encoded data.
use alloc::vec::Vec;
use core::num::NonZeroUsize;
use super::CompressionLevel;
use super::Matcher;
use super::Sequence;
/// Minimum number of bytes a match must cover to be emitted as a sequence;
/// also the number of bytes hashed into the suffix store.
const MIN_MATCH_LEN: usize = 5;
/// Takes care of allocating and reusing vecs
///
/// Buffers and suffix stores evicted from the match window are kept in pools
/// so later blocks can reuse their allocations.
pub struct MatchGeneratorDriver {
    // Reusable data buffers recovered from evicted window entries.
    vec_pool: Vec<Vec<u8>>,
    // Reusable suffix stores recovered from evicted window entries.
    suffix_pool: Vec<SuffixStore>,
    // The actual match-finding state machine.
    match_generator: MatchGenerator,
    // Size of each buffer handed out by `get_next_space`.
    slice_size: usize,
}
impl MatchGeneratorDriver {
    /// `slice_size` is the size of each work buffer handed out by the driver;
    /// `max_slices_in_window` caps how many such buffers the match window may
    /// hold at once while looking for matches.
    pub(crate) fn new(slice_size: usize, max_slices_in_window: usize) -> Self {
        let window_budget = max_slices_in_window * slice_size;
        Self {
            vec_pool: Vec::new(),
            suffix_pool: Vec::new(),
            match_generator: MatchGenerator::new(window_budget),
            slice_size,
        }
    }
}
impl Matcher for MatchGeneratorDriver {
    /// Clears the generator's window, recycling every evicted entry's buffers
    /// into the pools. The compression level is currently unused here.
    fn reset(&mut self, _level: CompressionLevel) {
        let vec_pool = &mut self.vec_pool;
        let suffix_pool = &mut self.suffix_pool;
        self.match_generator.reset(|mut data, mut suffixes| {
            // Restore full length / clear contents before pooling so the
            // buffers are immediately reusable.
            data.resize(data.capacity(), 0);
            vec_pool.push(data);
            suffixes.slots.clear();
            suffixes.slots.resize(suffixes.slots.capacity(), None);
            suffix_pool.push(suffixes);
        });
    }
    /// Window size advertised in the frame header.
    fn window_size(&self) -> u64 {
        self.match_generator.max_window_size as u64
    }
    /// Hands out a zeroed buffer, reusing a pooled one when available.
    fn get_next_space(&mut self) -> Vec<u8> {
        self.vec_pool.pop().unwrap_or_else(|| {
            let mut space = alloc::vec![0; self.slice_size];
            // Use the whole allocation, which may exceed the requested length.
            space.resize(space.capacity(), 0);
            space
        })
    }
    /// The data most recently committed via `commit_space`.
    fn get_last_space(&mut self) -> &[u8] {
        self.match_generator.window.last().unwrap().data.as_slice()
    }
    /// Pushes `space` into the match window, reusing a pooled suffix store
    /// when one is available.
    fn commit_space(&mut self, space: Vec<u8>) {
        let vec_pool = &mut self.vec_pool;
        let suffixes = self
            .suffix_pool
            .pop()
            .unwrap_or_else(|| SuffixStore::with_capacity(space.len()));
        let suffix_pool = &mut self.suffix_pool;
        self.match_generator
            .add_data(space, suffixes, |mut data, mut suffixes| {
                // Entries evicted to make room are recycled into the pools.
                data.resize(data.capacity(), 0);
                vec_pool.push(data);
                suffixes.slots.clear();
                suffixes.slots.resize(suffixes.slots.capacity(), None);
                suffix_pool.push(suffixes);
            });
    }
    /// Drives the generator, invoking `handle_sequence` for every sequence
    /// produced until the current entry is fully processed.
    fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
        while self.match_generator.next_sequence(&mut handle_sequence) {}
    }
    /// Registers the current entry's suffixes without emitting any sequences.
    fn skip_matching(&mut self) {
        self.match_generator.skip_matching();
    }
}
/// Maps a suffix (hashed over its first five bytes) to the index where that
/// suffix starts. Collisions simply overwrite the previous entry, so a `get`
/// only yields a *candidate* that callers must re-verify against the data.
struct SuffixStore {
    // We use NonZeroUsize to enable niche optimization here.
    // On store we do +1 and on get -1
    // This is ok since usize::MAX is never a valid offset
    slots: Vec<Option<NonZeroUsize>>,
    len_log: u32,
}
impl SuffixStore {
    /// Allocate a store with `capacity` hash slots (must be non-zero).
    fn with_capacity(capacity: usize) -> Self {
        Self {
            slots: alloc::vec![None; capacity],
            len_log: capacity.ilog2(),
        }
    }
    /// Record that a suffix equal to `suffix` starts at `idx`.
    #[inline(always)]
    fn insert(&mut self, suffix: &[u8], idx: usize) {
        let slot = self.key(suffix);
        // +1 so the stored value is never zero (see the field comment).
        self.slots[slot] = Some(NonZeroUsize::new(idx + 1).unwrap());
    }
    /// Whether the slot for this suffix is occupied (possibly by a collision).
    #[inline(always)]
    fn contains_key(&self, suffix: &[u8]) -> bool {
        self.slots[self.key(suffix)].is_some()
    }
    /// Candidate start index for this suffix, undoing the +1 bias.
    #[inline(always)]
    fn get(&self, suffix: &[u8]) -> Option<usize> {
        let slot = self.key(suffix);
        self.slots[slot].map(|stored| usize::from(stored) - 1)
    }
    /// Hash the first five bytes of `suffix` into a slot index.
    #[inline(always)]
    fn key(&self, suffix: &[u8]) -> usize {
        const POLY: u64 = 0xCF3BCCDCABu64;
        // Spread each byte into a distinct bit range, mix with the polynomial,
        // and fold everything together with XOR.
        let mut mixed = 0u64;
        for (i, &byte) in suffix[..5].iter().enumerate() {
            mixed ^= ((byte as u64) << (24 + 8 * i)).wrapping_mul(POLY);
        }
        // Keep the top `len_log` bits, which always fit within `slots`.
        let index = mixed >> (64 - self.len_log);
        index as usize % self.slots.len()
    }
}
/// We keep a window of a few of these entries
/// All of these are valid targets for a match to be generated for
struct WindowEntry {
    // The raw bytes of this window slice.
    data: Vec<u8>,
    /// Stores indexes into data
    suffixes: SuffixStore,
    /// Makes offset calculations efficient
    /// (distance from the start of this entry to the start of the newest entry;
    /// 0 for the newest entry itself — see `MatchGenerator::add_data`).
    base_offset: usize,
}
/// The stateful core of the match finder: holds the match window and the
/// cursor into the newest (not yet fully processed) entry.
pub(crate) struct MatchGenerator {
    // Upper bound, in bytes, on the combined size of all window entries.
    max_window_size: usize,
    /// Data window we are operating on to find matches
    /// The data we want to find matches for is in the last slice
    window: Vec<WindowEntry>,
    // Combined size, in bytes, of all entries currently in `window`.
    window_size: usize,
    // Debug-only flat copy of the whole window, used to verify computed offsets.
    #[cfg(debug_assertions)]
    concat_window: Vec<u8>,
    /// Index in the last slice that we already processed
    suffix_idx: usize,
    /// Gets updated when a new sequence is returned to point right behind that sequence
    last_idx_in_sequence: usize,
}
impl MatchGenerator {
/// max_size defines how many bytes will be used at most in the window used for matching
fn new(max_size: usize) -> Self {
Self {
max_window_size: max_size,
window: Vec::new(),
window_size: 0,
#[cfg(debug_assertions)]
concat_window: Vec::new(),
suffix_idx: 0,
last_idx_in_sequence: 0,
}
}
fn reset(&mut self, mut reuse_space: impl FnMut(Vec<u8>, SuffixStore)) {
self.window_size = 0;
#[cfg(debug_assertions)]
self.concat_window.clear();
self.suffix_idx = 0;
self.last_idx_in_sequence = 0;
self.window.drain(..).for_each(|entry| {
reuse_space(entry.data, entry.suffixes);
});
}
/// Processes bytes in the current window until either a match is found or no more matches can be found
/// * If a match is found handle_sequence is called with the Triple variant
/// * If no more matches can be found but there are bytes still left handle_sequence is called with the Literals variant
/// * If no more matches can be found and no more bytes are left this returns false
fn next_sequence(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) -> bool {
loop {
let last_entry = self.window.last().unwrap();
let data_slice = &last_entry.data;
// We already reached the end of the window, check if we need to return a Literals{}
if self.suffix_idx >= data_slice.len() {
if self.last_idx_in_sequence != self.suffix_idx {
let literals = &data_slice[self.last_idx_in_sequence..];
self.last_idx_in_sequence = self.suffix_idx;
handle_sequence(Sequence::Literals { literals });
return true;
} else {
return false;
}
}
// If the remaining data is smaller than the minimum match length we can stop and return a Literals{}
let data_slice = &data_slice[self.suffix_idx..];
if data_slice.len() < MIN_MATCH_LEN {
let last_idx_in_sequence = self.last_idx_in_sequence;
self.last_idx_in_sequence = last_entry.data.len();
self.suffix_idx = last_entry.data.len();
handle_sequence(Sequence::Literals {
literals: &last_entry.data[last_idx_in_sequence..],
});
return true;
}
// This is the key we are looking to find a match for
let key = &data_slice[..MIN_MATCH_LEN];
// Look in each window entry
let mut candidate = None;
for (match_entry_idx, match_entry) in self.window.iter().enumerate() {
let is_last = match_entry_idx == self.window.len() - 1;
if let Some(match_index) = match_entry.suffixes.get(key) {
let match_slice = if is_last {
&match_entry.data[match_index..self.suffix_idx]
} else {
&match_entry.data[match_index..]
};
// Check how long the common prefix actually is
let match_len = Self::common_prefix_len(match_slice, data_slice);
// Collisions in the suffix store might make this check fail
if match_len >= MIN_MATCH_LEN {
let offset = match_entry.base_offset + self.suffix_idx - match_index;
// If we are in debug/tests make sure the match we found is actually at the offset we calculated
#[cfg(debug_assertions)]
{
let unprocessed = last_entry.data.len() - self.suffix_idx;
let start = self.concat_window.len() - unprocessed - offset;
let end = start + match_len;
let check_slice = &self.concat_window[start..end];
debug_assert_eq!(check_slice, &match_slice[..match_len]);
}
if let Some((old_offset, old_match_len)) = candidate {
if match_len > old_match_len
|| (match_len == old_match_len && offset < old_offset)
{
candidate = Some((offset, match_len));
}
} else {
candidate = Some((offset, match_len));
}
}
}
}
if let Some((offset, match_len)) = candidate {
// For each index in the match we found we do not need to look for another match
// But we still want them registered in the suffix store
self.add_suffixes_till(self.suffix_idx + match_len);
// All literals that were not included between this match and the last are now included here
let last_entry = self.window.last().unwrap();
let literals = &last_entry.data[self.last_idx_in_sequence..self.suffix_idx];
// Update the indexes, all indexes upto and including the current index have been included in a sequence now
self.suffix_idx += match_len;
self.last_idx_in_sequence = self.suffix_idx;
handle_sequence(Sequence::Triple {
literals,
offset,
match_len,
});
return true;
}
let last_entry = self.window.last_mut().unwrap();
let key = &last_entry.data[self.suffix_idx..self.suffix_idx + MIN_MATCH_LEN];
if !last_entry.suffixes.contains_key(key) {
last_entry.suffixes.insert(key, self.suffix_idx);
}
self.suffix_idx += 1;
}
}
/// Find the common prefix length between two byte slices
#[inline(always)]
fn common_prefix_len(a: &[u8], b: &[u8]) -> usize {
Self::mismatch_chunks::<8>(a, b)
}
/// Find the common prefix length between two byte slices with a configurable chunk length
/// This enables vectorization optimizations
fn mismatch_chunks<const N: usize>(xs: &[u8], ys: &[u8]) -> usize {
let off = core::iter::zip(xs.chunks_exact(N), ys.chunks_exact(N))
.take_while(|(x, y)| x == y)
.count()
* N;
off + core::iter::zip(&xs[off..], &ys[off..])
.take_while(|(x, y)| x == y)
.count()
}
/// Process bytes and add the suffixes to the suffix store up to a specific index
#[inline(always)]
fn add_suffixes_till(&mut self, idx: usize) {
let last_entry = self.window.last_mut().unwrap();
if last_entry.data.len() < MIN_MATCH_LEN {
return;
}
let slice = &last_entry.data[self.suffix_idx..idx];
for (key_index, key) in slice.windows(MIN_MATCH_LEN).enumerate() {
if !last_entry.suffixes.contains_key(key) {
last_entry.suffixes.insert(key, self.suffix_idx + key_index);
}
}
}
/// Consume the whole current window entry without emitting any sequences.
/// The data is still indexed so later entries can still match against it.
fn skip_matching(&mut self) {
    let data_len = self.window.last().unwrap().data.len();
    self.add_suffixes_till(data_len);
    self.suffix_idx = data_len;
    self.last_idx_in_sequence = data_len;
}
/// Add a new window entry. Will panic if the last window entry hasn't been processed properly.
/// If any resources are released by pushing the new entry they are returned via the callback
fn add_data(
    &mut self,
    data: Vec<u8>,
    suffixes: SuffixStore,
    reuse_space: impl FnMut(Vec<u8>, SuffixStore),
) {
    // The previous entry must have been fully consumed (or the window is empty)
    assert!(
        self.window.is_empty() || self.suffix_idx == self.window.last().unwrap().data.len()
    );
    // Evict old entries until the new data fits into the window budget
    self.reserve(data.len(), reuse_space);
    #[cfg(debug_assertions)]
    self.concat_window.extend_from_slice(&data);
    // base_offset is measured from the start of the newest entry, so every
    // older entry moves back by the length of the (previously) newest entry
    if let Some(last_len) = self.window.last().map(|last| last.data.len()) {
        for entry in self.window.iter_mut() {
            entry.base_offset += last_len;
        }
    }
    let len = data.len();
    self.window.push(WindowEntry {
        data,
        suffixes,
        base_offset: 0,
    });
    self.window_size += len;
    // Matching restarts at the beginning of the new entry
    self.suffix_idx = 0;
    self.last_idx_in_sequence = 0;
}
/// Reserve space for a new window entry
/// If any resources are released by pushing the new entry they are returned via the callback
fn reserve(&mut self, amount: usize, mut reuse_space: impl FnMut(Vec<u8>, SuffixStore)) {
    // A single entry may never exceed the whole window budget
    assert!(self.max_window_size >= amount);
    // Evict the oldest entries until the new data fits
    while self.window_size + amount > self.max_window_size {
        // NOTE(review): Vec::remove(0) shifts all remaining entries (O(n) in
        // entry count); fine while the window holds few entries, a VecDeque
        // would avoid the shift
        let removed = self.window.remove(0);
        self.window_size -= removed.data.len();
        #[cfg(debug_assertions)]
        self.concat_window.drain(0..removed.data.len());
        let WindowEntry {
            suffixes,
            data: leaked_vec,
            base_offset: _,
        } = removed;
        // Hand the freed allocations back to the caller for reuse
        reuse_space(leaked_vec, suffixes);
    }
}
}
/// End-to-end check of the match generator: feed several window entries,
/// assert the exact sequences produced, and verify that replaying those
/// sequences reconstructs the original input byte-for-byte.
#[test]
fn matches() {
    let mut matcher = MatchGenerator::new(1000);
    let mut original_data = Vec::new();
    let mut reconstructed = Vec::new();
    // Assert the produced sequence equals the expected one, then execute it
    // into `reconstructed` the same way a decoder would
    let assert_seq_equal = |seq1: Sequence<'_>, seq2: Sequence<'_>, reconstructed: &mut Vec<u8>| {
        assert_eq!(seq1, seq2);
        match seq2 {
            Sequence::Literals { literals } => reconstructed.extend_from_slice(literals),
            Sequence::Triple {
                literals,
                offset,
                match_len,
            } => {
                reconstructed.extend_from_slice(literals);
                let start = reconstructed.len() - offset;
                let end = start + match_len;
                reconstructed.extend_from_within(start..end);
            }
        }
    };
    // Ten zeros: the second half matches the first half
    matcher.add_data(
        alloc::vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[0, 0, 0, 0, 0],
                offset: 5,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));
    // A repeating 1..=6 pattern followed by zeros that match the previous entry
    matcher.add_data(
        alloc::vec![1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[
        1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,
    ]);
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[1, 2, 3, 4, 5, 6],
                offset: 6,
                match_len: 6,
            },
            &mut reconstructed,
        )
    });
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 12,
                match_len: 6,
            },
            &mut reconstructed,
        )
    });
    // The trailing zeros match across the entry boundary (offset 28 reaches
    // back into the first entry)
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 28,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));
    // The 1..=6 prefix matches the previous entry, the rest are new literals
    matcher.add_data(
        alloc::vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0]);
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 23,
                match_len: 6,
            },
            &mut reconstructed,
        )
    });
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[7, 8, 9, 10, 11],
                offset: 16,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));
    // A whole entry of zeros matching the previous entry's trailing zeros
    matcher.add_data(
        alloc::vec![0, 0, 0, 0, 0],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 0, 0, 0]);
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 5,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));
    // A whole entry matching data two entries back
    matcher.add_data(
        alloc::vec![7, 8, 9, 10, 11],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[7, 8, 9, 10, 11]);
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 15,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));
    // skip_matching consumes the entry without sequences, but the data is
    // still indexed for later matches
    matcher.add_data(
        alloc::vec![1, 3, 5, 7, 9],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    matcher.skip_matching();
    original_data.extend_from_slice(&[1, 3, 5, 7, 9]);
    reconstructed.extend_from_slice(&[1, 3, 5, 7, 9]);
    assert!(!matcher.next_sequence(|_| {}));
    // ...which this identical entry now matches against
    matcher.add_data(
        alloc::vec![1, 3, 5, 7, 9],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[1, 3, 5, 7, 9]);
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[],
                offset: 5,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));
    // A repeat within the entry plus trailing literals that never match;
    // those are emitted as a final Literals sequence
    matcher.add_data(
        alloc::vec![0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23],
        SuffixStore::with_capacity(100),
        |_, _| {},
    );
    original_data.extend_from_slice(&[0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23]);
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Triple {
                literals: &[0, 0, 11, 13, 15, 17, 20],
                offset: 5,
                match_len: 5,
            },
            &mut reconstructed,
        )
    });
    matcher.next_sequence(|seq| {
        assert_seq_equal(
            seq,
            Sequence::Literals {
                literals: &[21, 23],
            },
            &mut reconstructed,
        )
    });
    assert!(!matcher.next_sequence(|_| {}));
    // Executing all emitted sequences must reproduce the input exactly
    assert_eq!(reconstructed, original_data);
}

118
vendor/ruzstd/src/encoding/mod.rs vendored Normal file
View File

@@ -0,0 +1,118 @@
//! Structures and utilities used for compressing/encoding data into the Zstd format.
pub(crate) mod block_header;
pub(crate) mod blocks;
pub(crate) mod frame_header;
pub(crate) mod match_generator;
pub(crate) mod util;
mod frame_compressor;
mod levels;
pub use frame_compressor::FrameCompressor;
use crate::io::{Read, Write};
use alloc::vec::Vec;
/// Convenience function to compress some source into a target without reusing any resources of the compressor
/// ```rust
/// use ruzstd::encoding::{compress, CompressionLevel};
/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
/// let mut target = Vec::new();
/// compress(data, &mut target, CompressionLevel::Fastest);
/// ```
pub fn compress<R: Read, W: Write>(source: R, target: W, level: CompressionLevel) {
    // One-shot compressor: created, wired up, run, and dropped
    let mut compressor = FrameCompressor::new(level);
    compressor.set_source(source);
    compressor.set_drain(target);
    compressor.compress();
}
/// Convenience function to compress some source into a Vec without reusing any resources of the compressor
/// ```rust
/// use ruzstd::encoding::{compress_to_vec, CompressionLevel};
/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
/// let compressed = compress_to_vec(data, CompressionLevel::Fastest);
/// ```
pub fn compress_to_vec<R: Read>(source: R, level: CompressionLevel) -> Vec<u8> {
    let mut buffer = Vec::new();
    compress(source, &mut buffer, level);
    buffer
}
/// The compression mode used impacts the speed of compression,
/// and resulting compression ratios. Faster compression will result
/// in worse compression ratios, and vice versa.
///
/// Note: only [`CompressionLevel::Uncompressed`] and
/// [`CompressionLevel::Fastest`] are currently implemented; the other
/// variants are marked UNIMPLEMENTED below.
#[derive(Copy, Clone)]
pub enum CompressionLevel {
    /// This level does not compress the data at all, and simply wraps
    /// it in a Zstandard frame.
    Uncompressed,
    /// This level is roughly equivalent to Zstd compression level 1
    Fastest,
    /// This level is roughly equivalent to Zstd level 3,
    /// or the one used by the official compressor when no level
    /// is specified.
    ///
    /// UNIMPLEMENTED
    Default,
    /// This level is roughly equivalent to Zstd level 7.
    ///
    /// UNIMPLEMENTED
    Better,
    /// This level is roughly equivalent to Zstd level 11.
    ///
    /// UNIMPLEMENTED
    Best,
}
/// Trait used by the encoder that users can use to extend the matching facilities with their own algorithm
/// making their own tradeoffs between runtime, memory usage and compression ratio
///
/// This trait operates on buffers that represent the chunks of data the matching algorithm wants to work on.
/// Each one of these buffers is referred to as a *space*. One or more of these buffers represent the window
/// the decoder will need to decode the data again.
///
/// This library asks the Matcher for a new buffer using `get_next_space` to allow reusing of allocated buffers when they are no longer part of the
/// window of data that is being used for matching.
///
/// The library fills the buffer with data that is to be compressed and commits them back to the matcher using `commit_space`.
///
/// Then it will either call `start_matching` or, if the space is deemed not worth compressing, `skip_matching` is called.
///
/// This is repeated until no more data is left to be compressed.
pub trait Matcher {
    /// Get a space where we can put data to be matched on. Will be encoded as one block. The maximum allowed size is 128 kB.
    fn get_next_space(&mut self) -> alloc::vec::Vec<u8>;
    /// Get a reference to the last committed space
    fn get_last_space(&mut self) -> &[u8];
    /// Commit a space to the matcher so it can be matched against
    fn commit_space(&mut self, space: alloc::vec::Vec<u8>);
    /// Just process the data in the last committed space for future matching
    fn skip_matching(&mut self);
    /// Process the data in the last committed space for future matching AND generate matches for the data
    fn start_matching(&mut self, handle_sequence: impl for<'a> FnMut(Sequence<'a>));
    /// Reset this matcher so it can be used for the next new frame
    fn reset(&mut self, level: CompressionLevel);
    /// The size of the window the decoder will need to execute all sequences produced by this matcher
    ///
    /// May change after a call to reset with a different compression level
    fn window_size(&self) -> u64;
}
#[derive(PartialEq, Eq, Debug)]
/// Sequences that a [`Matcher`] can produce
pub enum Sequence<'data> {
/// Is encoded as a sequence for the decoder sequence execution.
///
/// First the literals will be copied to the decoded data,
/// then `match_len` bytes are copied from `offset` bytes back in the buffer
Triple {
literals: &'data [u8],
offset: usize,
match_len: usize,
},
/// This is returned as the last sequence in a block
///
/// These literals will just be copied at the end of the sequence execution by the decoder
Literals { literals: &'data [u8] },
}

60
vendor/ruzstd/src/encoding/util.rs vendored Normal file
View File

@@ -0,0 +1,60 @@
use alloc::vec::Vec;
/// Returns the minimum number of bytes needed to represent this value, as
/// either 1, 2, 4, or 8 bytes. A value of 0 will still return one byte.
///
/// Used for variable length fields like `Dictionary_ID` or `Frame_Content_Size`.
pub fn find_min_size(val: u64) -> usize {
    // Pick the smallest of the four supported widths that can hold `val`
    match val {
        0..=0xFF => 1,
        0x100..=0xFFFF => 2,
        0x1_0000..=0xFFFF_FFFF => 4,
        _ => 8,
    }
}
/// Returns the same value, but represented using the smallest number of bytes needed.
/// Returned vector will be 1, 2, 4, or 8 bytes in length. Zero is represented as 1 byte.
///
/// Operates in **little-endian**.
pub fn minify_val(val: u64) -> Vec<u8> {
    let le_bytes = val.to_le_bytes();
    le_bytes[..find_min_size(val)].to_vec()
}
#[cfg(test)]
mod tests {
    use super::find_min_size;
    use super::minify_val;
    use alloc::vec;
    /// `find_min_size` picks the smallest of 1/2/4/8 bytes at each boundary.
    #[test]
    fn min_size_detection() {
        assert_eq!(find_min_size(0), 1);
        assert_eq!(find_min_size(0xff), 1);
        assert_eq!(find_min_size(0xff_ff), 2);
        assert_eq!(find_min_size(0x00_ff_ff_ff), 4);
        assert_eq!(find_min_size(0xff_ff_ff_ff), 4);
        assert_eq!(find_min_size(0x00ff_ffff_ffff_ffff), 8);
        assert_eq!(find_min_size(0xffff_ffff_ffff_ffff), 8);
    }
    /// `minify_val` emits exactly the little-endian prefix chosen by `find_min_size`.
    #[test]
    fn bytes_minified() {
        assert_eq!(minify_val(0), vec![0]);
        assert_eq!(minify_val(0xff), vec![0xff]);
        assert_eq!(minify_val(0xff_ff), vec![0xff, 0xff]);
        assert_eq!(minify_val(0xff_ff_ff_ff), vec![0xff, 0xff, 0xff, 0xff]);
        assert_eq!(
            minify_val(0xffff_ffff_ffff_ffff),
            vec![0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff]
        );
    }
}

366
vendor/ruzstd/src/fse/fse_decoder.rs vendored Normal file
View File

@@ -0,0 +1,366 @@
use crate::bit_io::{BitReader, BitReaderReversed};
use crate::decoding::errors::{FSEDecoderError, FSETableError};
use alloc::vec::Vec;
/// Decoder for a single FSE bitstream, driven by a prebuilt [`FSETable`].
pub struct FSEDecoder<'table> {
    /// An FSE state value represents an index in the FSE table.
    pub state: Entry,
    /// A reference to the table used for decoding.
    table: &'table FSETable,
}
impl<'t> FSEDecoder<'t> {
    /// Initialize a new Finite State Entropy decoder backed by `table`.
    ///
    /// Falls back to an all-zero entry while the table is still empty.
    pub fn new(table: &'t FSETable) -> FSEDecoder<'t> {
        let initial_state = table.decode.first().copied().unwrap_or(Entry {
            base_line: 0,
            num_bits: 0,
            symbol: 0,
        });
        FSEDecoder {
            state: initial_state,
            table,
        }
    }
    /// Returns the byte associated with the symbol the internal cursor is pointing at.
    pub fn decode_symbol(&self) -> u8 {
        self.state.symbol
    }
    /// Initialize internal state and prepare for decoding. After this, `decode_symbol` can be
    /// called to read the first symbol and `update_state` can be called to prepare the next one.
    pub fn init_state(&mut self, bits: &mut BitReaderReversed<'_>) -> Result<(), FSEDecoderError> {
        match self.table.accuracy_log {
            // An accuracy log of zero means the table was never built
            0 => Err(FSEDecoderError::TableIsUninitialized),
            accuracy_log => {
                let first_state = bits.get_bits(accuracy_log);
                self.state = self.table.decode[first_state as usize];
                Ok(())
            }
        }
    }
    /// Advance the internal state to decode the next symbol in the bitstream.
    pub fn update_state(&mut self, bits: &mut BitReaderReversed<'_>) {
        // The next state is the current baseline plus the bits just read
        let offset = bits.get_bits(self.state.num_bits);
        self.state = self.table.decode[(self.state.base_line + offset as u32) as usize];
    }
}
/// FSE decoding involves a decoding table that describes the probabilities of
/// all literals from 0 to the highest present one
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#fse-table-description>
#[derive(Debug, Clone)]
pub struct FSETable {
    /// The maximum symbol in the table (inclusive). Limits the probabilities length to max_symbol + 1.
    max_symbol: u8,
    /// The actual table containing the decoded symbol and the compression data
    /// connected to that symbol. Holds `1 << accuracy_log` entries once built.
    pub decode: Vec<Entry>, //used to decode symbols, and calculate the next state
    /// The size of the table is stored in logarithm base 2 format,
    /// with the **size of the table** being equal to `(1 << accuracy_log)`.
    /// This value is used so that the decoder knows how many bits to read from the bitstream.
    pub accuracy_log: u8,
    /// In this context, probability refers to the likelihood that a symbol occurs in the given data.
    /// Given this info, the encoder can assign shorter codes to symbols that appear more often,
    /// and longer codes that appear less often, then the decoder can use the probability
    /// to determine what code was assigned to what symbol.
    ///
    /// The probability of a single symbol is a value representing the proportion of times the symbol
    /// would fall within the data.
    ///
    /// If a symbol probability is set to `-1`, it means that the probability of a symbol
    /// occurring in the data is less than one.
    pub symbol_probabilities: Vec<i32>, //used while building the decode Vector
    /// The number of times each symbol occurs (The first entry being 0x0, the second being 0x1) and so on
    /// up until the highest possible symbol (255).
    symbol_counter: Vec<u32>,
}
impl FSETable {
    /// Initialize a new empty Finite State Entropy decoding table.
    pub fn new(max_symbol: u8) -> FSETable {
        FSETable {
            max_symbol,
            symbol_probabilities: Vec::with_capacity(256), //will never be more than 256 symbols because u8
            symbol_counter: Vec::with_capacity(256), //will never be more than 256 symbols because u8
            decode: Vec::new(), //depending on acc_log.
            accuracy_log: 0,
        }
    }
    /// Reset `self` and update `self`'s state to mirror the provided table.
    pub fn reinit_from(&mut self, other: &Self) {
        self.reset();
        self.symbol_counter.extend_from_slice(&other.symbol_counter);
        self.symbol_probabilities
            .extend_from_slice(&other.symbol_probabilities);
        self.decode.extend_from_slice(&other.decode);
        self.accuracy_log = other.accuracy_log;
    }
    /// Empty the table and clear all internal state.
    pub fn reset(&mut self) {
        self.symbol_counter.clear();
        self.symbol_probabilities.clear();
        self.decode.clear();
        self.accuracy_log = 0;
    }
    /// returns how many BYTEs (not bits) were read while building the decoder
    pub fn build_decoder(&mut self, source: &[u8], max_log: u8) -> Result<usize, FSETableError> {
        self.accuracy_log = 0;
        let bytes_read = self.read_probabilities(source, max_log)?;
        self.build_decoding_table()?;
        Ok(bytes_read)
    }
    /// Given the provided accuracy log, build a decoding table from that log.
    pub fn build_from_probabilities(
        &mut self,
        acc_log: u8,
        probs: &[i32],
    ) -> Result<(), FSETableError> {
        if acc_log == 0 {
            return Err(FSETableError::AccLogIsZero);
        }
        self.symbol_probabilities = probs.to_vec();
        self.accuracy_log = acc_log;
        self.build_decoding_table()
    }
    /// Build the actual decoding table after probabilities have been read into the table.
    /// After this function is called, the decoding process can begin.
    fn build_decoding_table(&mut self) -> Result<(), FSETableError> {
        if self.symbol_probabilities.len() > self.max_symbol as usize + 1 {
            return Err(FSETableError::TooManySymbols {
                got: self.symbol_probabilities.len(),
            });
        }
        self.decode.clear();
        let table_size = 1 << self.accuracy_log;
        if self.decode.len() < table_size {
            self.decode.reserve(table_size - self.decode.len());
        }
        //fill with dummy entries
        self.decode.resize(
            table_size,
            Entry {
                base_line: 0,
                num_bits: 0,
                symbol: 0,
            },
        );
        let mut negative_idx = table_size; //will point to the highest index that is already occupied by a negative-probability-symbol
        //first scan for all -1 probabilities and place them at the top of the table
        for symbol in 0..self.symbol_probabilities.len() {
            if self.symbol_probabilities[symbol] == -1 {
                negative_idx -= 1;
                let entry = &mut self.decode[negative_idx];
                entry.symbol = symbol as u8;
                entry.base_line = 0;
                // Less-than-one symbols always consume a full accuracy_log bits
                entry.num_bits = self.accuracy_log;
            }
        }
        //then place in a semi-random order all of the other symbols
        let mut position = 0;
        for idx in 0..self.symbol_probabilities.len() {
            let symbol = idx as u8;
            if self.symbol_probabilities[idx] <= 0 {
                continue;
            }
            //for each probability point the symbol gets one slot
            let prob = self.symbol_probabilities[idx];
            for _ in 0..prob {
                let entry = &mut self.decode[position];
                entry.symbol = symbol;
                position = next_position(position, table_size);
                while position >= negative_idx {
                    position = next_position(position, table_size);
                    //everything above negative_idx is already taken
                }
            }
        }
        // baselines and num_bits can only be calculated when all symbols have been spread
        self.symbol_counter.clear();
        self.symbol_counter
            .resize(self.symbol_probabilities.len(), 0);
        // Walk the table in order; symbol_counter tracks how many states of
        // each symbol have been seen so far, which determines the baseline
        for idx in 0..negative_idx {
            let entry = &mut self.decode[idx];
            let symbol = entry.symbol;
            let prob = self.symbol_probabilities[symbol as usize];
            let symbol_count = self.symbol_counter[symbol as usize];
            let (bl, nb) = calc_baseline_and_numbits(table_size as u32, prob as u32, symbol_count);
            //println!("symbol: {:2}, table: {}, prob: {:3}, count: {:3}, bl: {:3}, nb: {:2}", symbol, table_size, prob, symbol_count, bl, nb);
            assert!(nb <= self.accuracy_log);
            self.symbol_counter[symbol as usize] += 1;
            entry.base_line = bl;
            entry.num_bits = nb;
        }
        Ok(())
    }
    /// Read the accuracy log and the probability table from the source and return the number of bytes
    /// read. If the size of the table is larger than the provided `max_log`, return an error.
    fn read_probabilities(&mut self, source: &[u8], max_log: u8) -> Result<usize, FSETableError> {
        self.symbol_probabilities.clear(); //just clear, we will fill a probability for each entry anyways. No need to force new allocs here
        let mut br = BitReader::new(source);
        self.accuracy_log = ACC_LOG_OFFSET + (br.get_bits(4)? as u8);
        if self.accuracy_log > max_log {
            return Err(FSETableError::AccLogTooBig {
                got: self.accuracy_log,
                max: max_log,
            });
        }
        if self.accuracy_log == 0 {
            return Err(FSETableError::AccLogIsZero);
        }
        // Probabilities must sum to exactly the table size
        let probability_sum = 1 << self.accuracy_log;
        let mut probability_counter = 0;
        while probability_counter < probability_sum {
            let max_remaining_value = probability_sum - probability_counter + 1;
            let bits_to_read = highest_bit_set(max_remaining_value);
            // Probabilities use a variable-width code: small values may be
            // stored with one bit less, detected via low_threshold
            let unchecked_value = br.get_bits(bits_to_read as usize)? as u32;
            let low_threshold = ((1 << bits_to_read) - 1) - (max_remaining_value);
            let mask = (1 << (bits_to_read - 1)) - 1;
            let small_value = unchecked_value & mask;
            let value = if small_value < low_threshold {
                // The value fit into one bit less; give the extra bit back
                br.return_bits(1);
                small_value
            } else if unchecked_value > mask {
                unchecked_value - low_threshold
            } else {
                unchecked_value
            };
            //println!("{}, {}, {}", self.symbol_probablilities.len(), unchecked_value, value);
            // The stored value is offset by one so that -1 (less-than-one
            // probability) is representable
            let prob = (value as i32) - 1;
            self.symbol_probabilities.push(prob);
            if prob != 0 {
                if prob > 0 {
                    probability_counter += prob as u32;
                } else {
                    // probability -1 counts as 1
                    assert!(prob == -1);
                    probability_counter += 1;
                }
            } else {
                //fast skip further zero probabilities
                loop {
                    let skip_amount = br.get_bits(2)? as usize;
                    self.symbol_probabilities
                        .resize(self.symbol_probabilities.len() + skip_amount, 0);
                    // A 2-bit value of 3 means "three zeros, more follow"
                    if skip_amount != 3 {
                        break;
                    }
                }
            }
        }
        if probability_counter != probability_sum {
            return Err(FSETableError::ProbabilityCounterMismatch {
                got: probability_counter,
                expected_sum: probability_sum,
                symbol_probabilities: self.symbol_probabilities.clone(),
            });
        }
        if self.symbol_probabilities.len() > self.max_symbol as usize + 1 {
            return Err(FSETableError::TooManySymbols {
                got: self.symbol_probabilities.len(),
            });
        }
        // Round bits consumed up to whole bytes
        let bytes_read = if br.bits_read() % 8 == 0 {
            br.bits_read() / 8
        } else {
            (br.bits_read() / 8) + 1
        };
        Ok(bytes_read)
    }
}
/// A single entry in an FSE table, describing one decoder state.
#[derive(Copy, Clone, Debug)]
pub struct Entry {
    /// This value is used as an offset value, and it is added
    /// to a value read from the stream to determine the next state value.
    pub base_line: u32,
    /// How many bits should be read from the stream when decoding this entry.
    pub num_bits: u8,
    /// The byte that should be put in the decode output when encountering this state.
    pub symbol: u8,
}
/// This value is added to the first 4 bits of the stream to determine the
/// `Accuracy_Log` (the stream stores `Accuracy_Log - 5` in those 4 bits).
const ACC_LOG_OFFSET: u8 = 5;
/// Position (1-based) of the highest set bit of `x`.
/// Panics if `x` is zero.
fn highest_bit_set(x: u32) -> u32 {
    assert!(x > 0);
    x.ilog2() + 1
}
//utility functions for building the decoding table from probabilities
/// Calculate the position of the next entry of the table given the current
/// position and size of the table. `table_size` must be a power of two so
/// the AND acts as a wrap-around.
fn next_position(p: usize, table_size: usize) -> usize {
    let step = (table_size >> 1) + (table_size >> 3) + 3;
    (p + step) & (table_size - 1)
}
/// Compute the baseline and bit count for one state: given the total table
/// size, how many states the symbol owns, and which of those states (in
/// table order) this one is.
fn calc_baseline_and_numbits(
    num_states_total: u32,
    num_states_symbol: u32,
    state_number: u32,
) -> (u32, u8) {
    if num_states_symbol == 0 {
        return (0, 0);
    }
    // Round the symbol's state count up to a power of two; the shortfall is
    // covered by states that span twice as many indexes (one extra bit each)
    let num_state_slices = num_states_symbol.next_power_of_two();
    let num_double_width_state_slices = num_state_slices - num_states_symbol;
    let num_single_width_state_slices = num_states_symbol - num_double_width_state_slices;
    let slice_width = num_states_total / num_state_slices;
    let num_bits = slice_width.ilog2();
    if state_number < num_double_width_state_slices {
        // Double-width slices sit after all single-width slices
        let baseline = num_single_width_state_slices * slice_width + state_number * slice_width * 2;
        (baseline, num_bits as u8 + 1)
    } else {
        let index_shifted = state_number - num_double_width_state_slices;
        (index_shifted * slice_width, num_bits as u8)
    }
}

445
vendor/ruzstd/src/fse/fse_encoder.rs vendored Normal file
View File

@@ -0,0 +1,445 @@
use crate::bit_io::BitWriter;
use alloc::vec::Vec;
/// Encoder for a single FSE stream: pairs an encoding table with the
/// bit-level output it writes into.
pub(crate) struct FSEEncoder<'output, V: AsMut<Vec<u8>>> {
    /// Encoding table mapping each symbol to its set of states
    pub(super) table: FSETable,
    /// Destination for the encoded bits
    writer: &'output mut BitWriter<V>,
}
impl<V: AsMut<Vec<u8>>> FSEEncoder<'_, V> {
    /// Create an encoder that writes bits produced with `table` into `writer`.
    pub fn new(table: FSETable, writer: &mut BitWriter<V>) -> FSEEncoder<'_, V> {
        FSEEncoder { table, writer }
    }
    /// Consume the encoder and recover its table.
    #[cfg(any(test, feature = "fuzz_exports"))]
    pub fn into_table(self) -> FSETable {
        self.table
    }
    /// Encodes the data using the provided table
    /// Writes
    /// * Table description
    /// * Encoded data
    /// * Last state index
    /// * Padding bits to fill up last byte
    #[cfg(any(test, feature = "fuzz_exports"))]
    pub fn encode(&mut self, data: &[u8]) {
        self.write_table();
        // FSE encodes back-to-front: start from the last symbol and walk
        // towards the first, emitting one state transition per symbol
        let mut state = self.table.start_state(data[data.len() - 1]);
        for x in data[0..data.len() - 1].iter().rev().copied() {
            let next = self.table.next_state(x, state.index);
            let diff = state.index - next.baseline;
            self.writer.write_bits(diff as u64, next.num_bits as usize);
            state = next;
        }
        self.writer
            .write_bits(state.index as u64, self.acc_log() as usize);
        // Write a final 1 bit and pad to the next byte boundary
        let bits_to_fill = self.writer.misaligned();
        if bits_to_fill == 0 {
            self.writer.write_bits(1u32, 8);
        } else {
            self.writer.write_bits(1u32, bits_to_fill);
        }
    }
    /// Encodes the data using the provided table but with two interleaved streams
    /// Writes
    /// * Table description
    /// * Encoded data with two interleaved states
    /// * Both Last state indexes
    /// * Padding bits to fill up last byte
    ///
    /// NOTE(review): `data.len() - 4` below assumes at least 4 input bytes —
    /// confirm all callers guarantee this, otherwise the subtraction underflows.
    pub fn encode_interleaved(&mut self, data: &[u8]) {
        self.write_table();
        let mut state_1 = self.table.start_state(data[data.len() - 1]);
        let mut state_2 = self.table.start_state(data[data.len() - 2]);
        // The first two symbols are represented by the start states
        // Then encode the state transitions for two symbols at a time
        let mut idx = data.len() - 4;
        loop {
            {
                let state = state_1;
                let x = data[idx + 1];
                let next = self.table.next_state(x, state.index);
                let diff = state.index - next.baseline;
                self.writer.write_bits(diff as u64, next.num_bits as usize);
                state_1 = next;
            }
            {
                let state = state_2;
                let x = data[idx];
                let next = self.table.next_state(x, state.index);
                let diff = state.index - next.baseline;
                self.writer.write_bits(diff as u64, next.num_bits as usize);
                state_2 = next;
            }
            if idx < 2 {
                break;
            }
            idx -= 2;
        }
        // Determine if we have an even or odd number of symbols to encode
        // If odd we need to encode the last states transition and encode the final states in the flipped order
        if idx == 1 {
            let state = state_1;
            let x = data[0];
            let next = self.table.next_state(x, state.index);
            let diff = state.index - next.baseline;
            self.writer.write_bits(diff as u64, next.num_bits as usize);
            state_1 = next;
            self.writer
                .write_bits(state_2.index as u64, self.acc_log() as usize);
            self.writer
                .write_bits(state_1.index as u64, self.acc_log() as usize);
        } else {
            self.writer
                .write_bits(state_1.index as u64, self.acc_log() as usize);
            self.writer
                .write_bits(state_2.index as u64, self.acc_log() as usize);
        }
        // Write a final 1 bit and pad to the next byte boundary
        let bits_to_fill = self.writer.misaligned();
        if bits_to_fill == 0 {
            self.writer.write_bits(1u32, 8);
        } else {
            self.writer.write_bits(1u32, bits_to_fill);
        }
    }
    /// Write the FSE table description to the output.
    fn write_table(&mut self) {
        self.table.write_table(self.writer);
    }
    /// Accuracy log (log2 of the table size) of the underlying table.
    pub(super) fn acc_log(&self) -> u8 {
        self.table.acc_log()
    }
}
/// Encoding-side FSE table: for every possible symbol, the list of states
/// assigned to it.
#[derive(Debug, Clone)]
pub struct FSETable {
    /// Indexed by symbol
    pub(super) states: [SymbolStates; 256],
    /// Sum of all states.states.len()
    pub(crate) table_size: usize,
}
impl FSETable {
    /// The state to transition to when emitting `symbol` while the current
    /// state targets table index `idx`.
    pub(crate) fn next_state(&self, symbol: u8, idx: usize) -> &State {
        let states = &self.states[symbol as usize];
        states.get(idx, self.table_size)
    }
    /// The first state assigned to `symbol`; used to start an encoding stream.
    pub(crate) fn start_state(&self, symbol: u8) -> &State {
        let states = &self.states[symbol as usize];
        &states.states[0]
    }
    /// Accuracy log, i.e. log2 of the table size.
    pub fn acc_log(&self) -> u8 {
        self.table_size.ilog2() as u8
    }
    /// Write the FSE table description (accuracy log + probabilities) to `writer`.
    pub fn write_table<V: AsMut<Vec<u8>>>(&self, writer: &mut BitWriter<V>) {
        // The stream stores acc_log - 5 in 4 bits
        writer.write_bits(self.acc_log() - 5, 4);
        let mut probability_counter = 0usize;
        let probability_sum = 1 << self.acc_log();
        let mut prob_idx = 0;
        while probability_counter < probability_sum {
            let max_remaining_value = probability_sum - probability_counter + 1;
            let bits_to_write = max_remaining_value.ilog2() + 1;
            // Values below low_threshold fit into one bit less
            let low_threshold = ((1 << bits_to_write) - 1) - (max_remaining_value);
            let mask = (1 << (bits_to_write - 1)) - 1;
            let prob = self.states[prob_idx].probability;
            prob_idx += 1;
            // Probabilities are stored shifted by one so that -1 is representable
            let value = (prob + 1) as u32;
            if value < low_threshold as u32 {
                writer.write_bits(value, bits_to_write as usize - 1);
            } else if value > mask {
                writer.write_bits(value + low_threshold as u32, bits_to_write as usize);
            } else {
                writer.write_bits(value, bits_to_write as usize);
            }
            if prob == -1 {
                probability_counter += 1;
            } else if prob > 0 {
                probability_counter += prob as usize;
            } else {
                // Runs of zero probabilities are encoded in 2-bit groups;
                // the value 3 means "three zeros, more follow"
                let mut zeros = 0u8;
                while self.states[prob_idx].probability == 0 {
                    zeros += 1;
                    prob_idx += 1;
                    if zeros == 3 {
                        writer.write_bits(3u8, 2);
                        zeros = 0;
                    }
                }
                writer.write_bits(zeros, 2);
            }
        }
        // Zero-pad the table description to a byte boundary
        writer.write_bits(0u8, writer.misaligned());
    }
}
/// All states assigned to one symbol, plus its normalized probability.
#[derive(Debug, Clone)]
pub(super) struct SymbolStates {
    /// Sorted by baseline to allow easy lookup using an index
    pub(super) states: Vec<State>,
    /// Normalized probability of this symbol (-1 marks a less-than-one probability)
    pub(super) probability: i32,
}
impl SymbolStates {
    /// Find the state that covers table index `idx`.
    ///
    /// `states` is sorted by baseline, so the scan can start at a
    /// proportional guess instead of the beginning of the list.
    fn get(&self, idx: usize, max_idx: usize) -> &State {
        let guess = (idx * self.states.len()) / max_idx;
        let candidates = &self.states[guess..];
        candidates.iter().find(|state| state.contains(idx)).unwrap()
    }
}
/// One state owned by a symbol in the encoding table.
#[derive(Debug, Clone)]
pub(crate) struct State {
    /// How many bits the range of this state needs to be encoded as
    pub(crate) num_bits: u8,
    /// The first index targeted by this state
    pub(crate) baseline: usize,
    /// The last index targeted by this state (baseline + the maximum number with numbits bits allows)
    pub(crate) last_index: usize,
    /// Index of this state in the decoding table
    pub(crate) index: usize,
}
impl State {
    /// Whether table index `idx` lies inside this state's covered range.
    fn contains(&self, idx: usize) -> bool {
        (self.baseline..=self.last_index).contains(&idx)
    }
}
/// Count symbol frequencies in `data` and build an encoding table limited to
/// `max_log` accuracy from those counts. `avoid_0_numbit` is forwarded to the
/// normalization step.
pub fn build_table_from_data(
    data: impl Iterator<Item = u8>,
    max_log: u8,
    avoid_0_numbit: bool,
) -> FSETable {
    // Histogram over all 256 possible byte values
    let mut counts = [0; 256];
    for byte in data {
        counts[byte as usize] += 1;
    }
    // Highest symbol that actually occurs (0 if nothing occurred)
    let max_symbol = counts.iter().rposition(|&count| count > 0).unwrap_or(0);
    build_table_from_counts(&counts[..=max_symbol], max_log, avoid_0_numbit)
}
/// Normalize a raw histogram into a probability distribution whose sum is a
/// power of two (`1 << acc_log`), then build the encoding table from it.
fn build_table_from_counts(counts: &[usize], max_log: u8, avoid_0_numbit: bool) -> FSETable {
    let mut probs = [0; 256];
    let probs = &mut probs[..counts.len()];
    let mut min_count = 0;
    for (idx, count) in counts.iter().copied().enumerate() {
        probs[idx] = count as i32;
        if count > 0 && (count < min_count || min_count == 0) {
            min_count = count;
        }
    }
    // shift all probabilities down so that the lowest are 1
    // NOTE(review): assumes at least one nonzero count; an all-zero histogram
    // leaves min_count at 0 and this subtraction would underflow — confirm
    // callers never pass empty data
    min_count -= 1;
    let mut max_prob = 0i32;
    for prob in probs.iter_mut() {
        if *prob > 0 {
            *prob -= min_count as i32;
        }
        max_prob = max_prob.max(*prob);
    }
    // Scale everything down when the biggest probability dwarfs the symbol count
    if max_prob > 0 && max_prob as usize > probs.len() {
        let divisor = max_prob / (probs.len() as i32);
        for prob in probs.iter_mut() {
            if *prob > 0 {
                *prob = (*prob / divisor).max(1)
            }
        }
    }
    // normalize probabilities to a 2^x
    let sum = probs.iter().sum::<i32>();
    assert!(sum > 0);
    let sum = sum as usize;
    let acc_log = (sum.ilog2() as u8 + 1).max(5);
    let acc_log = u8::min(acc_log, max_log);
    if sum < 1 << acc_log {
        // just raise the maximum probability as much as possible
        // TODO is this optimal?
        let diff = (1 << acc_log) - sum;
        let max = probs.iter_mut().max().unwrap();
        *max += diff as i32;
    } else {
        // decrease the smallest ones to 1 first
        let mut diff = sum - (1 << acc_log);
        while diff > 0 {
            let min = probs.iter_mut().filter(|prob| **prob > 1).min().unwrap();
            let decrease = usize::min(*min as usize - 1, diff);
            diff -= decrease;
            *min -= decrease as i32;
        }
    }
    // Optionally cap the largest probability at half the table, moving the
    // excess onto the second-largest symbol (see `avoid_0_numbit`)
    let max = probs.iter_mut().max().unwrap();
    if avoid_0_numbit && *max > 1 << (acc_log - 1) {
        let redistribute = *max - (1 << (acc_log - 1));
        *max -= redistribute;
        let max = *max;
        // find first occurence of the second_max to avoid lifting the last zero
        let second_max = *probs.iter_mut().filter(|x| **x != max).max().unwrap();
        let second_max = probs.iter_mut().find(|x| **x == second_max).unwrap();
        *second_max += redistribute;
        assert!(*second_max <= max);
    }
    build_table_from_probabilities(probs, acc_log)
}
/// Build an encoder `FSETable` from normalized probabilities.
///
/// `probs` must sum to `1 << acc_log`. A probability of `-1` marks a
/// "less than 1" symbol which occupies exactly one state at the top of the
/// table. The spreading and baseline assignment mirror the decoder's table
/// construction so both sides agree on every state's baseline and bit count.
pub(super) fn build_table_from_probabilities(probs: &[i32], acc_log: u8) -> FSETable {
    let mut states = core::array::from_fn::<SymbolStates, 256, _>(|_| SymbolStates {
        states: Vec::new(),
        probability: 0,
    });
    // distribute -1 symbols
    // They are placed at the very end of the table, walking downwards.
    let mut negative_idx = (1 << acc_log) - 1;
    for (symbol, _prob) in probs
        .iter()
        .copied()
        .enumerate()
        .filter(|prob| prob.1 == -1)
    {
        states[symbol].states.push(State {
            num_bits: acc_log,
            baseline: 0,
            last_index: (1 << acc_log) - 1,
            index: negative_idx,
        });
        states[symbol].probability = -1;
        negative_idx -= 1;
    }
    // distribute other symbols
    // Setup all needed states per symbol with their respective index
    let mut idx = 0;
    for (symbol, prob) in probs.iter().copied().enumerate() {
        if prob <= 0 {
            continue;
        }
        states[symbol].probability = prob;
        let states = &mut states[symbol].states;
        for _ in 0..prob {
            states.push(State {
                num_bits: 0,
                baseline: 0,
                last_index: 0,
                index: idx,
            });
            // Skip over the slots already taken by the -1 symbols at the top.
            idx = next_position(idx, 1 << acc_log);
            while idx > negative_idx {
                idx = next_position(idx, 1 << acc_log);
            }
        }
        assert_eq!(states.len(), prob as usize);
    }
    // After all states know their index we can determine the numbits and baselines
    for (symbol, prob) in probs.iter().copied().enumerate() {
        if prob <= 0 {
            continue;
        }
        let prob = prob as u32;
        let state = &mut states[symbol];
        // We process the states in their order in the table
        state.states.sort_by(|l, r| l.index.cmp(&r.index));
        // prob_log = ceil(log2(prob))
        let prob_log = if prob.is_power_of_two() {
            prob.ilog2()
        } else {
            prob.ilog2() + 1
        };
        let rounded_up = 1u32 << prob_log;
        // The lower states target double the amount of indexes -> numbits + 1
        let double_states = rounded_up - prob;
        let single_states = prob - double_states;
        let num_bits = acc_log - prob_log as u8;
        let mut baseline = (single_states as usize * (1 << (num_bits))) % (1 << acc_log);
        for (idx, state) in state.states.iter_mut().enumerate() {
            if (idx as u32) < double_states {
                let num_bits = num_bits + 1;
                state.baseline = baseline;
                state.num_bits = num_bits;
                state.last_index = baseline + ((1 << num_bits) - 1);
                baseline += 1 << num_bits;
                baseline %= 1 << acc_log;
            } else {
                state.baseline = baseline;
                state.num_bits = num_bits;
                state.last_index = baseline + ((1 << num_bits) - 1);
                baseline += 1 << num_bits;
            }
        }
        // For encoding we use the states ordered by the indexes they target
        state.states.sort_by(|l, r| l.baseline.cmp(&r.baseline));
    }
    FSETable {
        table_size: 1 << acc_log,
        states,
    }
}
/// Calculate the position of the next entry of the table given the current
/// position and size of the table.
fn next_position(p: usize, table_size: usize) -> usize {
    // Classic FSE spreading step: advance by (5/8 * table_size + 3) and wrap
    // with a mask — table_size is always a power of two here.
    let step = (table_size >> 1) + (table_size >> 3) + 3;
    (p + step) & (table_size - 1)
}
/// Predefined probability distribution for the match-length codes
/// (the zstd format's default FSE table, accuracy log 6). -1 marks
/// "less than 1" probabilities.
const ML_DIST: &[i32] = &[
    1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1,
];
/// Predefined probability distribution for the literal-length codes
/// (the zstd format's default FSE table, accuracy log 6).
const LL_DIST: &[i32] = &[
    4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
    -1, -1, -1, -1,
];
/// Predefined probability distribution for the offset codes
/// (the zstd format's default FSE table, accuracy log 5).
const OF_DIST: &[i32] = &[
    1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1,
];
/// Build the predefined match-length code table (accuracy log 6).
pub(crate) fn default_ml_table() -> FSETable {
    build_table_from_probabilities(ML_DIST, 6)
}
/// Build the predefined literal-length code table (accuracy log 6).
pub(crate) fn default_ll_table() -> FSETable {
    build_table_from_probabilities(LL_DIST, 6)
}
/// Build the predefined offset code table (accuracy log 5).
pub(crate) fn default_of_table() -> FSETable {
    build_table_from_probabilities(OF_DIST, 5)
}

139
vendor/ruzstd/src/fse/mod.rs vendored Normal file
View File

@@ -0,0 +1,139 @@
//! FSE, short for Finite State Entropy, is an encoding technique
//! that assigns shorter codes to symbols that appear more frequently in data,
//! and longer codes to less frequent symbols.
//!
//! FSE works by mutating a state and using that state to index into a table.
//!
//! Zstandard uses two different kinds of entropy encoding: FSE, and Huffman coding.
//! Huffman is used to compress literals,
//! while FSE is used for all other symbols (literal length code, match length code, offset code).
//!
//! <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#fse>
//!
//! <https://arxiv.org/pdf/1311.2540>
mod fse_decoder;
pub use fse_decoder::*;
pub mod fse_encoder;
#[test]
fn tables_equal() {
    // Encoder and decoder tables built from the same normalized distribution
    // must agree on every state. The probabilities sum to 64 == 1 << 6
    // (the -1 entry occupies exactly one slot).
    let probs = &[0, 0, -1, 3, 2, 2, (1 << 6) - 8];
    let mut dec_table = FSETable::new(255);
    dec_table.build_from_probabilities(6, probs).unwrap();
    let enc_table = fse_encoder::build_table_from_probabilities(probs, 6);
    check_tables(&dec_table, &enc_table);
}
/// Assert that the encoder table is the exact inverse of the decoder table:
/// for every decoder entry there must be an encoder state with the same
/// index, and its baseline and bit count must match.
#[cfg(any(test, feature = "fuzz_exports"))]
fn check_tables(dec_table: &fse_decoder::FSETable, enc_table: &fse_encoder::FSETable) {
    for (idx, dec_state) in dec_table.decode.iter().enumerate() {
        let enc_states = &enc_table.states[dec_state.symbol as usize];
        let enc_state = enc_states
            .states
            .iter()
            .find(|state| state.index == idx)
            .unwrap();
        assert_eq!(enc_state.baseline, dec_state.base_line as usize);
        assert_eq!(enc_state.num_bits, dec_state.num_bits);
    }
}
#[test]
fn roundtrip() {
    // Dense alphabet: every symbol 0..64 exactly once.
    round_trip(&(0..64).collect::<alloc::vec::Vec<_>>());
    // Skewed distribution built from overlapping ranges.
    let mut data = alloc::vec![];
    data.extend(0..32);
    data.extend(0..32);
    data.extend(0..32);
    data.extend(0..32);
    data.extend(0..32);
    data.extend(20..32);
    data.extend(20..32);
    data.extend(0..32);
    data.extend(20..32);
    data.extend(100..255);
    data.extend(20..32);
    data.extend(20..32);
    round_trip(&data);
    // Replay any locally checked-out fuzzer findings as regression inputs.
    #[cfg(feature = "std")]
    if std::fs::exists("fuzz/artifacts/fse").unwrap_or(false) {
        for file in std::fs::read_dir("fuzz/artifacts/fse").unwrap() {
            if file.as_ref().unwrap().file_type().unwrap().is_file() {
                let data = std::fs::read(file.unwrap().path()).unwrap();
                round_trip(&data);
            }
        }
    }
}
/// Only needed for testing.
///
/// Encodes the data with a table built from that data
/// Decodes the result again by first decoding the table and then the data
/// Asserts that the decoded data equals the input
///
/// Inputs that are too short, too uniform, or smaller than 64 bytes are
/// skipped — they cannot produce a meaningful FSE table.
#[cfg(any(test, feature = "fuzz_exports"))]
pub fn round_trip(data: &[u8]) {
    use crate::bit_io::{BitReaderReversed, BitWriter};
    use fse_encoder::FSEEncoder;
    if data.len() < 2 {
        return;
    }
    if data.iter().all(|x| *x == data[0]) {
        return;
    }
    if data.len() < 64 {
        return;
    }
    let mut writer = BitWriter::new();
    let mut encoder = FSEEncoder::new(
        fse_encoder::build_table_from_data(data.iter().copied(), 22, false),
        &mut writer,
    );
    let mut dec_table = FSETable::new(255);
    encoder.encode(data);
    let acc_log = encoder.acc_log();
    let enc_table = encoder.into_table();
    let encoded = writer.dump();
    // First decode the table description, then the payload that follows it.
    let table_bytes = dec_table.build_decoder(&encoded, acc_log).unwrap();
    let encoded = &encoded[table_bytes..];
    let mut decoder = FSEDecoder::new(&dec_table);
    check_tables(&dec_table, &enc_table);
    let mut br = BitReaderReversed::new(encoded);
    // The stream is read back-to-front; skip the zero padding and the single
    // 1-bit end marker at the end of the last byte.
    let mut skipped_bits = 0;
    loop {
        let val = br.get_bits(1);
        skipped_bits += 1;
        if val == 1 || skipped_bits > 8 {
            break;
        }
    }
    if skipped_bits > 8 {
        //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
        panic!("Corrupted end marker");
    }
    decoder.init_state(&mut br).unwrap();
    let mut decoded = alloc::vec::Vec::new();
    for x in data {
        let w = decoder.decode_symbol();
        assert_eq!(w, *x);
        decoded.push(w);
        // The last symbol's state update would read past the stream end.
        if decoded.len() < data.len() {
            decoder.update_state(&mut br);
        }
    }
    assert_eq!(&decoded, data);
    assert_eq!(br.bits_remaining(), 0);
}

401
vendor/ruzstd/src/huff0/huff0_decoder.rs vendored Normal file
View File

@@ -0,0 +1,401 @@
//! Utilities for decoding Huff0 encoded huffman data.
use crate::bit_io::BitReaderReversed;
use crate::decoding::errors::HuffmanTableError;
use crate::fse::{FSEDecoder, FSETable};
use alloc::vec::Vec;
/// The Zstandard specification limits the maximum length of a code to 11 bits.
pub(crate) const MAX_MAX_NUM_BITS: u8 = 11;
/// Decodes Huff0-encoded literals by walking a prebuilt [`HuffmanTable`].
pub struct HuffmanDecoder<'table> {
    table: &'table HuffmanTable,
    /// State is used to index into the table. Holds the current
    /// `max_num_bits`-wide window of the bitstream.
    pub state: u64,
}
impl<'t> HuffmanDecoder<'t> {
    /// Create a new decoder with the provided table
    pub fn new(table: &'t HuffmanTable) -> HuffmanDecoder<'t> {
        HuffmanDecoder { table, state: 0 }
    }
    /// Decode the symbol the internal state (cursor) is pointed at and return the
    /// decoded literal.
    pub fn decode_symbol(&mut self) -> u8 {
        self.table.decode[self.state as usize].symbol
    }
    /// Initialize internal state and prepare to decode data. Then, `decode_symbol` can be called
    /// to read the byte the internal cursor is pointing at, and `next_state` can be called to advance
    /// the cursor until the max number of bits has been read.
    ///
    /// Returns the number of bits read from the stream.
    pub fn init_state(&mut self, br: &mut BitReaderReversed<'_>) -> u8 {
        let num_bits = self.table.max_num_bits;
        let new_bits = br.get_bits(num_bits);
        self.state = new_bits;
        num_bits
    }
    /// Advance the internal cursor to the next symbol. After this, you can call `decode_symbol`
    /// to read from the new position.
    ///
    /// Returns the number of bits consumed for the current symbol.
    pub fn next_state(&mut self, br: &mut BitReaderReversed<'_>) -> u8 {
        // self.state stores a small section, or a window of the bit stream. The table can be indexed via this state,
        // telling you how many bits identify the current symbol.
        let num_bits = self.table.decode[self.state as usize].num_bits;
        // New bits are read from the stream
        let new_bits = br.get_bits(num_bits);
        // Shift and mask out the bits that identify the current symbol
        // (decode.len() is a power of two, so this is a clean mask).
        self.state <<= num_bits;
        self.state &= self.table.decode.len() as u64 - 1;
        // The new bits are appended at the end of the current state.
        self.state |= new_bits;
        num_bits
    }
}
/// A Huffman decoding table contains a list of Huffman prefix codes and their associated values
pub struct HuffmanTable {
    /// Lookup from a `max_num_bits`-wide window of the bitstream to the
    /// decoded symbol and the length of its prefix code.
    decode: Vec<Entry>,
    /// The weight of a symbol is the number of occurences in a table.
    /// This value is used in constructing a binary tree referred to as
    /// a Huffman tree. Once this tree is constructed, it can be used to build the
    /// lookup table
    weights: Vec<u8>,
    /// The maximum size in bits a prefix code in the encoded data can be.
    /// This value is used so that the decoder knows how many bits
    /// to read from the bitstream before checking the table. This
    /// value must be 11 or lower.
    pub max_num_bits: u8,
    /// Per symbol: the length in bits of its prefix code (0 = unused symbol).
    bits: Vec<u8>,
    /// Number of symbols per code length; indexed by bit count.
    bit_ranks: Vec<u32>,
    /// Next free slot in `decode` for each code length, consumed while the
    /// table is being filled.
    rank_indexes: Vec<usize>,
    /// In some cases, the list of weights is compressed using FSE compression.
    fse_table: FSETable,
}
impl HuffmanTable {
    /// Create a new, empty table.
    pub fn new() -> HuffmanTable {
        HuffmanTable {
            decode: Vec::new(),
            weights: Vec::with_capacity(256),
            max_num_bits: 0,
            bits: Vec::with_capacity(256),
            bit_ranks: Vec::with_capacity(11),
            rank_indexes: Vec::with_capacity(11),
            fse_table: FSETable::new(255),
        }
    }
    /// Completely empty the table then repopulate as a replica
    /// of `other`.
    pub fn reinit_from(&mut self, other: &Self) {
        self.reset();
        self.decode.extend_from_slice(&other.decode);
        self.weights.extend_from_slice(&other.weights);
        self.max_num_bits = other.max_num_bits;
        self.bits.extend_from_slice(&other.bits);
        // Fix: `bit_ranks` was previously not copied, so the result was not a
        // faithful replica of `other`. Decoding itself was unaffected (the
        // field is rebuilt in `build_table_from_weights`), but the copied
        // table's state was inconsistent.
        self.bit_ranks.extend_from_slice(&other.bit_ranks);
        self.rank_indexes.extend_from_slice(&other.rank_indexes);
        self.fse_table.reinit_from(&other.fse_table);
    }
    /// Completely empty the table of all data.
    pub fn reset(&mut self) {
        self.decode.clear();
        self.weights.clear();
        self.max_num_bits = 0;
        self.bits.clear();
        self.bit_ranks.clear();
        self.rank_indexes.clear();
        self.fse_table.reset();
    }
    /// Read from `source` and decode the input, populating the huffman decoding table.
    ///
    /// Returns the number of bytes read.
    pub fn build_decoder(&mut self, source: &[u8]) -> Result<u32, HuffmanTableError> {
        self.decode.clear();
        let bytes_used = self.read_weights(source)?;
        self.build_table_from_weights()?;
        Ok(bytes_used)
    }
    /// Read weights from the provided source.
    ///
    /// The huffman table is represented in the input data as a list of weights.
    /// After the header, weights are read, then a Huffman decoding table
    /// can be constructed using that list of weights.
    ///
    /// Returns the number of bytes read.
    fn read_weights(&mut self, source: &[u8]) -> Result<u32, HuffmanTableError> {
        use HuffmanTableError as err;
        if source.is_empty() {
            return Err(err::SourceIsEmpty);
        }
        let header = source[0];
        let mut bits_read = 8;
        match header {
            // If the header byte is less than 128, the series of weights
            // is compressed using two interleaved FSE streams that share
            // a distribution table. The header value is the total size in
            // bytes of the FSE description plus the compressed weights.
            0..=127 => {
                let fse_stream = &source[1..];
                if header as usize > fse_stream.len() {
                    return Err(err::NotEnoughBytesForWeights {
                        got_bytes: fse_stream.len(),
                        expected_bytes: header,
                    });
                }
                //fse decompress weights
                let bytes_used_by_fse_header = self.fse_table.build_decoder(fse_stream, 6)?;
                if bytes_used_by_fse_header > header as usize {
                    return Err(err::FSETableUsedTooManyBytes {
                        used: bytes_used_by_fse_header,
                        available_bytes: header,
                    });
                }
                vprintln!(
                    "Building fse table for huffman weights used: {}",
                    bytes_used_by_fse_header
                );
                // Huffman headers are compressed using two interleaved
                // FSE bitstreams, where the first state (decoder) handles
                // even symbols, and the second handles odd symbols.
                let mut dec1 = FSEDecoder::new(&self.fse_table);
                let mut dec2 = FSEDecoder::new(&self.fse_table);
                let compressed_start = bytes_used_by_fse_header;
                let compressed_length = header as usize - bytes_used_by_fse_header;
                let compressed_weights = &fse_stream[compressed_start..];
                if compressed_weights.len() < compressed_length {
                    return Err(err::NotEnoughBytesToDecompressWeights {
                        have: compressed_weights.len(),
                        need: compressed_length,
                    });
                }
                let compressed_weights = &compressed_weights[..compressed_length];
                let mut br = BitReaderReversed::new(compressed_weights);
                bits_read += (bytes_used_by_fse_header + compressed_length) * 8;
                //skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found
                let mut skipped_bits = 0;
                loop {
                    let val = br.get_bits(1);
                    skipped_bits += 1;
                    if val == 1 || skipped_bits > 8 {
                        break;
                    }
                }
                if skipped_bits > 8 {
                    //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
                    return Err(err::ExtraPadding { skipped_bits });
                }
                dec1.init_state(&mut br)?;
                dec2.init_state(&mut br)?;
                self.weights.clear();
                // The two decoders take turns decoding a single symbol and updating their state.
                loop {
                    let w = dec1.decode_symbol();
                    self.weights.push(w);
                    dec1.update_state(&mut br);
                    if br.bits_remaining() <= -1 {
                        //collect final states
                        self.weights.push(dec2.decode_symbol());
                        break;
                    }
                    let w = dec2.decode_symbol();
                    self.weights.push(w);
                    dec2.update_state(&mut br);
                    if br.bits_remaining() <= -1 {
                        //collect final states
                        self.weights.push(dec1.decode_symbol());
                        break;
                    }
                    //maximum number of weights is 255 because we use u8 symbols and the last weight is inferred from the sum of all others
                    if self.weights.len() > 255 {
                        return Err(err::TooManyWeights {
                            got: self.weights.len(),
                        });
                    }
                }
            }
            // If the header byte is greater than or equal to 128,
            // weights are directly represented, where each weight is
            // encoded directly as a 4 bit field. The weights will
            // always be encoded with full bytes, meaning if there's
            // an odd number of weights, the last weight will still
            // occupy a full byte.
            _ => {
                // weights are directly encoded
                let weights_raw = &source[1..];
                let num_weights = header - 127;
                self.weights.resize(num_weights as usize, 0);
                let bytes_needed = if num_weights % 2 == 0 {
                    num_weights as usize / 2
                } else {
                    (num_weights as usize / 2) + 1
                };
                if weights_raw.len() < bytes_needed {
                    return Err(err::NotEnoughBytesInSource {
                        got: weights_raw.len(),
                        need: bytes_needed,
                    });
                }
                // Even indexes sit in the high nibble, odd ones in the low nibble.
                for idx in 0..num_weights {
                    if idx % 2 == 0 {
                        self.weights[idx as usize] = weights_raw[idx as usize / 2] >> 4;
                    } else {
                        self.weights[idx as usize] = weights_raw[idx as usize / 2] & 0xF;
                    }
                    bits_read += 4;
                }
            }
        }
        // Round the consumed bit count up to whole bytes.
        let bytes_read = if bits_read % 8 == 0 {
            bits_read / 8
        } else {
            (bits_read / 8) + 1
        };
        Ok(bytes_read as u32)
    }
    /// Once the weights have been read from the data, you can decode the weights
    /// into a table, and use that table to decode the actual compressed data.
    ///
    /// This function populates the rest of the table from the series of weights.
    fn build_table_from_weights(&mut self) -> Result<(), HuffmanTableError> {
        use HuffmanTableError as err;
        self.bits.clear();
        // One extra slot: the last symbol's weight is implicit and inferred below.
        self.bits.resize(self.weights.len() + 1, 0);
        let mut weight_sum: u32 = 0;
        for w in &self.weights {
            if *w > MAX_MAX_NUM_BITS {
                return Err(err::WeightBiggerThanMaxNumBits { got: *w });
            }
            weight_sum += if *w > 0 { 1_u32 << (*w - 1) } else { 0 };
        }
        if weight_sum == 0 {
            return Err(err::MissingWeights);
        }
        let max_bits = highest_bit_set(weight_sum) as u8;
        let left_over = (1 << max_bits) - weight_sum;
        //left_over must be power of two
        if !left_over.is_power_of_two() {
            return Err(err::LeftoverIsNotAPowerOf2 { got: left_over });
        }
        // The implicit last weight is whatever brings the sum to a power of two.
        let last_weight = highest_bit_set(left_over) as u8;
        for symbol in 0..self.weights.len() {
            let bits = if self.weights[symbol] > 0 {
                max_bits + 1 - self.weights[symbol]
            } else {
                0
            };
            self.bits[symbol] = bits;
        }
        self.bits[self.weights.len()] = max_bits + 1 - last_weight;
        self.max_num_bits = max_bits;
        if max_bits > MAX_MAX_NUM_BITS {
            return Err(err::MaxBitsTooHigh { got: max_bits });
        }
        self.bit_ranks.clear();
        self.bit_ranks.resize((max_bits + 1) as usize, 0);
        for num_bits in &self.bits {
            self.bit_ranks[(*num_bits) as usize] += 1;
        }
        //fill with dummy symbols
        self.decode.resize(
            1 << self.max_num_bits,
            Entry {
                symbol: 0,
                num_bits: 0,
            },
        );
        //starting codes for each rank
        self.rank_indexes.clear();
        self.rank_indexes.resize((max_bits + 1) as usize, 0);
        self.rank_indexes[max_bits as usize] = 0;
        for bits in (1..self.rank_indexes.len() as u8).rev() {
            self.rank_indexes[bits as usize - 1] = self.rank_indexes[bits as usize]
                + self.bit_ranks[bits as usize] as usize * (1 << (max_bits - bits));
        }
        assert!(
            self.rank_indexes[0] == self.decode.len(),
            "rank_idx[0]: {} should be: {}",
            self.rank_indexes[0],
            self.decode.len()
        );
        for symbol in 0..self.bits.len() {
            let bits_for_symbol = self.bits[symbol];
            if bits_for_symbol != 0 {
                // allocate code for the symbol and set in the table
                // a code ignores all max_bits - bits[symbol] bits, so it gets
                // a range that spans all of those in the decoding table
                let base_idx = self.rank_indexes[bits_for_symbol as usize];
                let len = 1 << (max_bits - bits_for_symbol);
                self.rank_indexes[bits_for_symbol as usize] += len;
                for idx in 0..len {
                    self.decode[base_idx + idx].symbol = symbol as u8;
                    self.decode[base_idx + idx].num_bits = bits_for_symbol;
                }
            }
        }
        Ok(())
    }
}
impl Default for HuffmanTable {
fn default() -> Self {
Self::new()
}
}
/// A single entry in the table contains the decoded symbol/literal and the
/// size of the prefix code.
///
/// Two `u8` fields, so entries are cheap to `Copy` when the table is filled.
#[derive(Copy, Clone, Debug)]
pub struct Entry {
    /// The byte that the prefix code replaces during encoding.
    symbol: u8,
    /// The number of bits the prefix code occupies.
    num_bits: u8,
}
/// Assert that the provided value is greater than zero, and returns the
/// 32 - the number of leading zeros
fn highest_bit_set(x: u32) -> u32 {
    assert!(x > 0);
    // For x > 0: ilog2(x) + 1 == 32 - leading_zeros(x)
    x.ilog2() + 1
}

484
vendor/ruzstd/src/huff0/huff0_encoder.rs vendored Normal file
View File

@@ -0,0 +1,484 @@
use alloc::vec::Vec;
use core::cmp::Ordering;
use crate::{
bit_io::BitWriter,
fse::fse_encoder::{self, FSEEncoder},
};
/// Writes Huff0-encoded data through a [`BitWriter`] using a prebuilt
/// [`HuffmanTable`].
pub(crate) struct HuffmanEncoder<'output, 'table, V: AsMut<Vec<u8>>> {
    table: &'table HuffmanTable,
    writer: &'output mut BitWriter<V>,
}
impl<V: AsMut<Vec<u8>>> HuffmanEncoder<'_, '_, V> {
    /// Create a new encoder that writes through `writer` using `table`.
    pub fn new<'o, 't>(
        table: &'t HuffmanTable,
        writer: &'o mut BitWriter<V>,
    ) -> HuffmanEncoder<'o, 't, V> {
        HuffmanEncoder { table, writer }
    }
    /// Encodes the data using the provided table
    /// Writes
    /// * Table description
    /// * Encoded data
    /// * Padding bits to fill up last byte
    pub fn encode(&mut self, data: &[u8], with_table: bool) {
        if with_table {
            self.write_table();
        }
        Self::encode_stream(self.table, self.writer, data);
    }
    /// Encodes the data using the provided table in 4 concatenated streams
    /// Writes
    /// * Table description
    /// * Jumptable
    /// * Encoded data in 4 streams, each padded to fill the last byte
    pub fn encode4x(&mut self, data: &[u8], with_table: bool) {
        assert!(data.len() >= 4);
        // Split data in 4 equally sized parts (the last one might be a bit smaller than the rest)
        let split_size = data.len().div_ceil(4);
        let src1 = &data[..split_size];
        let src2 = &data[split_size..split_size * 2];
        let src3 = &data[split_size * 2..split_size * 3];
        let src4 = &data[split_size * 3..];
        // Write table description
        if with_table {
            self.write_table();
        }
        // Reserve space for the jump table, will be changed later.
        // Only the first three sizes are stored; the fourth stream runs to the end.
        let size_idx = self.writer.index();
        self.writer.write_bits(0u16, 16);
        self.writer.write_bits(0u16, 16);
        self.writer.write_bits(0u16, 16);
        // Write the 4 streams, noting the sizes of the encoded streams
        let index_before = self.writer.index();
        Self::encode_stream(self.table, self.writer, src1);
        let size1 = (self.writer.index() - index_before) / 8;
        let index_before = self.writer.index();
        Self::encode_stream(self.table, self.writer, src2);
        let size2 = (self.writer.index() - index_before) / 8;
        let index_before = self.writer.index();
        Self::encode_stream(self.table, self.writer, src3);
        let size3 = (self.writer.index() - index_before) / 8;
        Self::encode_stream(self.table, self.writer, src4);
        // Sanity check, if this doesn't hold we produce a broken stream
        assert!(size1 <= u16::MAX as usize);
        assert!(size2 <= u16::MAX as usize);
        assert!(size3 <= u16::MAX as usize);
        // Update the jumptable with the real sizes
        self.writer.change_bits(size_idx, size1 as u16, 16);
        self.writer.change_bits(size_idx + 16, size2 as u16, 16);
        self.writer.change_bits(size_idx + 32, size3 as u16, 16);
    }
    /// Encode one stream and pad it to fill the last byte
    fn encode_stream<VV: AsMut<Vec<u8>>>(
        table: &HuffmanTable,
        writer: &mut BitWriter<VV>,
        data: &[u8],
    ) {
        // Symbols are emitted back-to-front: the decoder consumes the
        // bitstream in reverse (see `BitReaderReversed`).
        for symbol in data.iter().rev() {
            let (code, num_bits) = table.codes[*symbol as usize];
            debug_assert!(num_bits > 0);
            writer.write_bits(code, num_bits as usize);
        }
        // A single 1 bit marks the end of the stream; the rest of the byte
        // (or a whole extra byte if already aligned) is zero padding.
        let bits_to_fill = writer.misaligned();
        if bits_to_fill == 0 {
            writer.write_bits(1u32, 8);
        } else {
            writer.write_bits(1u32, bits_to_fill);
        }
    }
    /// Derive the weight list from the table's code lengths:
    /// weight = max_num_bits - num_bits + 1, with 0 for unused symbols.
    pub(super) fn weights(&self) -> Vec<u8> {
        let max = self.table.codes.iter().map(|(_, nb)| nb).max().unwrap();
        let weights = self
            .table
            .codes
            .iter()
            .copied()
            .map(|(_, nb)| if nb == 0 { 0 } else { max - nb + 1 })
            .collect::<Vec<u8>>();
        weights
    }
    /// Write the table description: either FSE-compressed weights (header
    /// byte < 128 holds the compressed size) or direct 4-bit weights
    /// (header byte = number of weights + 127).
    fn write_table(&mut self) {
        // TODO strategy for determining this?
        let weights = self.weights();
        let weights = &weights[..weights.len() - 1]; // dont encode last weight
        if weights.len() > 16 {
            // Reserve the size byte, then patch it after encoding.
            let size_idx = self.writer.index();
            self.writer.write_bits(0u8, 8);
            let idx_before = self.writer.index();
            let mut encoder = FSEEncoder::new(
                fse_encoder::build_table_from_data(weights.iter().copied(), 6, true),
                self.writer,
            );
            encoder.encode_interleaved(weights);
            let encoded_len = (self.writer.index() - idx_before) / 8;
            assert!(encoded_len < 128);
            self.writer.change_bits(size_idx, encoded_len as u8, 8);
        } else {
            self.writer.write_bits(weights.len() as u8 + 127, 8);
            // Two weights per byte, first of the pair in the high nibble.
            let pairs = weights.chunks_exact(2);
            let remainder = pairs.remainder();
            for pair in pairs.into_iter() {
                let weight1 = pair[0];
                let weight2 = pair[1];
                assert!(weight1 < 16);
                assert!(weight2 < 16);
                self.writer.write_bits(weight2, 4);
                self.writer.write_bits(weight1, 4);
            }
            if !remainder.is_empty() {
                let weight = remainder[0];
                assert!(weight < 16);
                self.writer.write_bits(weight << 4, 8);
            }
        }
    }
}
/// Encoder-side Huffman table mapping each symbol to its prefix code.
pub struct HuffmanTable {
    /// Index is the symbol, values are the bitstring in the lower bits of the u32 and the amount of bits in the u8
    codes: Vec<(u32, u8)>,
}
impl HuffmanTable {
    /// Build a table from the symbol frequencies found in `data`.
    pub fn build_from_data(data: &[u8]) -> Self {
        let mut counts = [0; 256];
        let mut max = 0;
        for x in data {
            counts[*x as usize] += 1;
            max = max.max(*x);
        }
        Self::build_from_counts(&counts[..=max as usize])
    }
    /// Build a table from symbol counts (index = symbol). Symbols with a
    /// count of 0 get no code; rarer symbols get smaller weights.
    pub fn build_from_counts(counts: &[usize]) -> Self {
        assert!(counts.len() <= 256);
        let zeros = counts.iter().filter(|x| **x == 0).count();
        let mut weights = distribute_weights(counts.len() - zeros);
        let limit = weights.len().ilog2() as usize + 2;
        redistribute_weights(&mut weights, limit);
        // weights is ascending; reversed so pop() yields the smallest first,
        // which is handed to the least frequent symbol below.
        weights.reverse();
        let mut counts_sorted = counts.iter().enumerate().collect::<Vec<_>>();
        counts_sorted.sort_by(|(_, c1), (_, c2)| c1.cmp(c2));
        let mut weights_distributed = alloc::vec![0; counts.len()];
        for (idx, count) in counts_sorted {
            if *count == 0 {
                weights_distributed[idx] = 0;
            } else {
                weights_distributed[idx] = weights.pop().unwrap();
            }
        }
        Self::build_from_weights(&weights_distributed)
    }
    /// Build a table from explicit weights (index = symbol, 0 = unused).
    /// The weights must sum to a power of two when mapped through 2^(w-1).
    pub fn build_from_weights(weights: &[usize]) -> Self {
        let mut sorted = Vec::with_capacity(weights.len());
        struct SortEntry {
            symbol: u8,
            weight: usize,
        }
        // TODO this doesn't need to be a temporary Vec, it could be done in a [_; 264]
        // only non-zero weights are interesting here
        for (symbol, weight) in weights.iter().copied().enumerate() {
            if weight > 0 {
                sorted.push(SortEntry {
                    symbol: symbol as u8,
                    weight,
                });
            }
        }
        // We process symbols ordered by weight and then ordered by symbol
        sorted.sort_by(|left, right| match left.weight.cmp(&right.weight) {
            Ordering::Equal => left.symbol.cmp(&right.symbol),
            other => other,
        });
        // Prepare huffman table with placeholders
        let mut table = HuffmanTable {
            codes: Vec::with_capacity(weights.len()),
        };
        for _ in 0..weights.len() {
            table.codes.push((0, 0));
        }
        // Determine the number of bits needed for codes with the lowest weight
        let weight_sum = sorted.iter().map(|e| 1 << (e.weight - 1)).sum::<usize>();
        if !weight_sum.is_power_of_two() {
            panic!("This is an internal error");
        }
        let max_num_bits = highest_bit_set(weight_sum) - 1; // this is a log_2 of a clean power of two
        // Starting at the symbols with the lowest weight we update the placeholders in the table
        let mut current_code = 0;
        let mut current_weight = 0;
        let mut current_num_bits = 0;
        for entry in sorted.iter() {
            // If the entry isn't the same weight as the last one we need to change a few things
            if current_weight != entry.weight {
                // The code shifts by the difference of the weights to allow for enough unique values
                current_code >>= entry.weight - current_weight;
                // Encoding a symbol of this weight will take less bits than the previous weight
                current_num_bits = max_num_bits - entry.weight + 1;
                // Run the next update when the weight changes again
                current_weight = entry.weight;
            }
            table.codes[entry.symbol as usize] = (current_code as u32, current_num_bits as u8);
            current_code += 1;
        }
        table
    }
    /// Check whether this table can encode everything `other` encodes.
    /// Returns a cost estimate (sum of absolute code-length differences),
    /// or `None` if some symbol of `other` has no code in `self`.
    pub fn can_encode(&self, other: &Self) -> Option<usize> {
        if other.codes.len() > self.codes.len() {
            return None;
        }
        let mut sum = 0;
        for ((_, other_num_bits), (_, self_num_bits)) in other.codes.iter().zip(self.codes.iter()) {
            if *other_num_bits != 0 && *self_num_bits == 0 {
                return None;
            }
            sum += other_num_bits.abs_diff(*self_num_bits) as usize;
        }
        Some(sum)
    }
}
/// Assert that the provided value is greater than zero, and returns index of the first set bit
fn highest_bit_set(x: usize) -> usize {
    assert!(x > 0);
    // For x > 0: ilog2(x) + 1 == usize::BITS - leading_zeros(x)
    x.ilog2() as usize + 1
}
#[test]
fn huffman() {
    // Expected codes follow from the canonical assignment in
    // build_from_weights: lowest weight first, codes incrementing within a
    // weight and shifting right when the weight steps up.
    let table = HuffmanTable::build_from_weights(&[2, 2, 2, 1, 1]);
    assert_eq!(table.codes[0], (1, 2));
    assert_eq!(table.codes[1], (2, 2));
    assert_eq!(table.codes[2], (3, 2));
    assert_eq!(table.codes[3], (0, 3));
    assert_eq!(table.codes[4], (1, 3));
    // Weight 0 (symbol 3) must get no code at all.
    let table = HuffmanTable::build_from_weights(&[4, 3, 2, 0, 1, 1]);
    assert_eq!(table.codes[0], (1, 1));
    assert_eq!(table.codes[1], (1, 2));
    assert_eq!(table.codes[2], (1, 3));
    assert_eq!(table.codes[3], (0, 0));
    assert_eq!(table.codes[4], (0, 4));
    assert_eq!(table.codes[5], (1, 4));
}
/// Distributes weights that add up to a clean power of two
fn distribute_weights(amount: usize) -> Vec<usize> {
    assert!(amount >= 2);
    assert!(amount <= 256);
    // Start from the trivial power of two: 2^1 + 2^1 == 2^2
    let mut weights = Vec::with_capacity(amount);
    weights.push(1);
    weights.push(1);
    // Weight value assigned to the entries added in the current round.
    let mut target_weight = 1;
    // Number of rounds of additions performed so far.
    let mut round = 2;
    // Each round doubles the current sum, either by appending a full batch of
    // `target_weight` entries (matching the sum already in the vec), or --
    // when that batch would overshoot `amount` -- by appending one single
    // entry whose weight alone equals the whole sum so far.
    //
    // Example: [1, 1] -> [1,1,2] (2^1 + 2^1 == 2^2)
    //
    // Example: [1, 1] -> [1,1,1,1] (2^1 + 2^1 == 2^1 + 2^1)
    //          [1,1,1,1] -> [1,1,1,1,3] (2^1 + 2^1 + 2^1 + 2^1 == 2^3)
    while weights.len() < amount {
        let room = amount - weights.len();
        let mut batch = 1 << (round - target_weight);
        if batch > room {
            // TODO we could maybe instead do this until batch <= room?
            // target_weight += 1
            // batch /= 2
            target_weight = round;
            batch = 1;
        }
        weights.resize(weights.len() + batch, target_weight);
        round += 1;
    }
    assert_eq!(amount, weights.len());
    weights
}
/// Sometimes distribute_weights generates weights that require too many bits to encode
/// This redistributes the weights to have less variance by raising the lower weights while still maintaining the
/// required attributes of the weight distribution
///
/// `weights` must be sorted ascending (as produced by `distribute_weights`);
/// the sum of 2^w stays a power of two throughout.
fn redistribute_weights(weights: &mut [usize], max_num_bits: usize) {
    let weight_sum_log = weights
        .iter()
        .copied()
        .map(|x| 1 << x)
        .sum::<usize>()
        .ilog2() as usize;
    // Nothing needs to be done, this is already fine
    if weight_sum_log < max_num_bits {
        return;
    }
    // We need to decrease the weight difference by the difference between weight_sum_log and max_num_bits
    let decrease_weights_by = weight_sum_log - max_num_bits + 1;
    // To do that we raise the lower weights up by that difference, recording how much weight we added in the process
    let mut added_weights = 0;
    for weight in weights.iter_mut() {
        if *weight < decrease_weights_by {
            // Sum of 2^w for each step from the old weight up to the new one.
            for add in *weight..decrease_weights_by {
                added_weights += 1 << add;
            }
            *weight = decrease_weights_by;
        }
    }
    // Then we reduce weights until the added weights are equaled out
    while added_weights > 0 {
        // Find the highest weight that is still lower or equal to the added weight
        let mut current_idx = 0;
        let mut current_weight = 0;
        for (idx, weight) in weights.iter().copied().enumerate() {
            // weights are ascending, so everything after this is too big too
            if 1 << (weight - 1) > added_weights {
                break;
            }
            if weight > current_weight {
                current_weight = weight;
                current_idx = idx;
            }
        }
        // Reduce that weight by 1
        added_weights -= 1 << (current_weight - 1);
        weights[current_idx] -= 1;
    }
    // At the end we normalize the weights so that they start at 1 again
    if weights[0] > 1 {
        let offset = weights[0] - 1;
        for weight in weights.iter_mut() {
            *weight -= offset;
        }
    }
}
#[test]
fn weights() {
    // For every alphabet size and bit limit, check the weight invariants
    // (correct length, power-of-two sum, max bits <= 11) and that the derived
    // codes are prefix free.
    // assert_eq!(distribute_weights(5).as_slice(), &[1, 1, 2, 3, 4]);
    for amount in 2..=256 {
        let mut weights = distribute_weights(amount);
        assert_eq!(weights.len(), amount);
        let sum = weights
            .iter()
            .copied()
            .map(|weight| 1 << weight)
            .sum::<usize>();
        assert!(sum.is_power_of_two());
        for num_bit_limit in (amount.ilog2() as usize + 1)..=11 {
            redistribute_weights(&mut weights, num_bit_limit);
            let sum = weights
                .iter()
                .copied()
                .map(|weight| 1 << weight)
                .sum::<usize>();
            assert!(sum.is_power_of_two());
            assert!(
                sum.ilog2() <= 11,
                "Max bits too big: sum: {} {weights:?}",
                sum
            );
            // Prefix-free check: no code may equal the prefix of a longer code.
            let codes = HuffmanTable::build_from_weights(&weights).codes;
            for (code, num_bits) in codes.iter().copied() {
                for (code2, num_bits2) in codes.iter().copied() {
                    if num_bits == 0 || num_bits2 == 0 || (code, num_bits) == (code2, num_bits2) {
                        continue;
                    }
                    if num_bits <= num_bits2 {
                        let code2_shifted = code2 >> (num_bits2 - num_bits);
                        assert_ne!(
                            code, code2_shifted,
                            "{:b},{num_bits:} is prefix of {:b},{num_bits2:}",
                            code, code2
                        );
                    }
                }
            }
        }
    }
}
#[test]
fn counts() {
    // More frequent symbols must never get longer codes than less frequent
    // ones, and zero-count symbols must get no code at all.
    let counts = &[3, 0, 4, 1, 5];
    let table = HuffmanTable::build_from_counts(counts).codes;
    assert_eq!(table[1].1, 0);
    assert!(table[3].1 >= table[0].1);
    assert!(table[0].1 >= table[2].1);
    assert!(table[2].1 >= table[4].1);
    let counts = &[3, 0, 4, 0, 7, 2, 2, 2, 0, 2, 2, 1, 5];
    let table = HuffmanTable::build_from_counts(counts).codes;
    assert_eq!(table[1].1, 0);
    assert_eq!(table[3].1, 0);
    assert_eq!(table[8].1, 0);
    assert!(table[11].1 >= table[5].1);
    assert!(table[5].1 >= table[6].1);
    assert!(table[6].1 >= table[7].1);
    assert!(table[7].1 >= table[9].1);
    assert!(table[9].1 >= table[10].1);
    assert!(table[10].1 >= table[0].1);
    assert!(table[0].1 >= table[2].1);
    assert!(table[2].1 >= table[12].1);
    assert!(table[12].1 >= table[4].1);
}
#[test]
fn from_data() {
    // A table built from explicit counts must equal one derived from raw
    // data containing the same symbol frequencies.
    let from_counts = HuffmanTable::build_from_counts(&[3, 0, 4, 1, 5]).codes;
    let raw = [0, 2, 4, 4, 0, 3, 2, 2, 0, 2];
    let from_data = HuffmanTable::build_from_data(&raw).codes;
    assert_eq!(from_counts, from_data);
}

84
vendor/ruzstd/src/huff0/mod.rs vendored Normal file
View File

@@ -0,0 +1,84 @@
/// Huffman coding is a method of encoding where symbols are assigned a code,
/// and more commonly used symbols get shorter codes, and less commonly
/// used symbols get longer codes. Codes are prefix free, meaning no two codes
/// will start with the same sequence of bits.
mod huff0_decoder;
pub use huff0_decoder::*;
pub mod huff0_encoder;
/// Only needed for testing.
///
/// Encodes the data with a table built from that data
/// Decodes the result again by first decoding the table and then the data
/// Asserts that the decoded data equals the input
#[cfg(any(test, feature = "fuzz_exports"))]
pub fn round_trip(data: &[u8]) {
    use crate::bit_io::{BitReaderReversed, BitWriter};
    use alloc::vec::Vec;
    // Inputs too small or with only one distinct symbol are not encodable
    // with a huffman table; skip them.
    if data.len() < 2 {
        return;
    }
    if data.iter().all(|x| *x == data[0]) {
        return;
    }
    // Encode: table derived from the data itself, then the payload.
    let mut writer = BitWriter::new();
    let encoder_table = huff0_encoder::HuffmanTable::build_from_data(data);
    let mut encoder = huff0_encoder::HuffmanEncoder::new(&encoder_table, &mut writer);
    encoder.encode(data, true);
    let encoded = writer.dump();
    // Decode: first rebuild the table from the stream, then read the
    // payload bits in reverse.
    let mut decoder_table = HuffmanTable::new();
    let table_bytes = decoder_table.build_decoder(&encoded).unwrap();
    let mut decoder = HuffmanDecoder::new(&decoder_table);
    let mut br = BitReaderReversed::new(&encoded[table_bytes as usize..]);
    // The bitstream is terminated by a 1-bit end marker followed only by
    // zero padding; skip the padding until the marker is found.
    let mut skipped_bits = 0;
    loop {
        let val = br.get_bits(1);
        skipped_bits += 1;
        if val == 1 || skipped_bits > 8 {
            break;
        }
    }
    if skipped_bits > 8 {
        //if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
        panic!("Corrupted end marker");
    }
    decoder.init_state(&mut br);
    let mut decoded = Vec::new();
    // Keep decoding while enough bits remain for a full state transition.
    while br.bits_remaining() > -(decoder_table.max_num_bits as isize) {
        decoded.push(decoder.decode_symbol());
        decoder.next_state(&mut br);
    }
    assert_eq!(&decoded, data);
}
#[test]
fn roundtrip() {
    use alloc::vec;
    use alloc::vec::Vec;
    // A few fixed small samples first.
    round_trip(&[1, 1, 1, 1, 2, 3]);
    round_trip(&[1, 1, 1, 1, 2, 3, 5, 45, 12, 90]);
    // Then constant and ascending buffers of every length from 2 to 511.
    for size in 2..512 {
        round_trip(&vec![123; size]);
        let ascending: Vec<u8> = (0..size).map(|x| x as u8).collect();
        round_trip(&ascending);
    }
    // Finally replay any locally collected fuzzing artifacts, if present.
    #[cfg(feature = "std")]
    if std::fs::exists("fuzz/artifacts/huff0").unwrap_or(false) {
        for entry in std::fs::read_dir("fuzz/artifacts/huff0").unwrap() {
            let entry = entry.unwrap();
            if entry.file_type().unwrap().is_file() {
                round_trip(&std::fs::read(entry.path()).unwrap());
            }
        }
    }
}

260
vendor/ruzstd/src/io_nostd.rs vendored Normal file
View File

@@ -0,0 +1,260 @@
//! Manual implementations of representations for `#![no_std]`
use alloc::boxed::Box;
/// Broad error categories, mirroring a subset of `std::io::ErrorKind`.
/// Note: the derived `Ord`/`PartialOrd` depend on variant declaration order.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)]
pub enum ErrorKind {
    /// The operation was interrupted and can typically be retried.
    Interrupted,
    /// The source ran out of data before the operation could complete.
    UnexpectedEof,
    /// The operation would have to block to make progress.
    WouldBlock,
    /// Catch-all for errors that fit no other variant.
    Other,
    /// `Write::write_all` could not finish because the writer accepted 0 bytes.
    WriteAllEof,
}
impl ErrorKind {
fn as_str(&self) -> &'static str {
use ErrorKind::*;
match *self {
Interrupted => "operation interrupted",
UnexpectedEof => "unexpected end of file",
WouldBlock => "operation would block",
Other => "other error",
WriteAllEof => "write_all hit EOF",
}
}
}
impl core::fmt::Display for ErrorKind {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Reuse the static description; no allocation needed.
        write!(f, "{}", self.as_str())
    }
}
/// I/O error type for `no_std` builds, mirroring `std::io::Error`.
pub struct Error {
    // Broad category of the failure.
    kind: ErrorKind,
    // Optional boxed payload carrying more detail about the failure.
    err: Option<Box<dyn core::fmt::Display + Send + Sync + 'static>>,
}
impl alloc::fmt::Debug for Error {
    fn fmt(&self, f: &mut alloc::fmt::Formatter<'_>) -> Result<(), alloc::fmt::Error> {
        // Only include the `err` field when a payload is attached.
        let mut builder = f.debug_struct("Error");
        builder.field("kind", &self.kind);
        match self.err.as_ref() {
            Some(inner) => builder.field("err", &alloc::format!("{inner}")).finish(),
            None => builder.finish(),
        }
    }
}
impl Error {
    /// Creates an error of `kind` carrying a boxed payload.
    pub fn new(kind: ErrorKind, err: Box<dyn core::fmt::Display + Send + Sync + 'static>) -> Self {
        Self {
            kind,
            err: Some(err),
        }
    }
    /// Creates a payload-free error of `kind`.
    pub fn from(kind: ErrorKind) -> Self {
        Self { kind, err: None }
    }
    /// The broad category this error belongs to.
    pub fn kind(&self) -> ErrorKind {
        self.kind
    }
    /// Whether the failed operation was merely interrupted (retryable).
    pub fn is_interrupted(&self) -> bool {
        self.kind == ErrorKind::Interrupted
    }
    /// Borrows the attached payload, if any.
    pub fn get_ref(&self) -> Option<&(dyn core::fmt::Display + Send + Sync)> {
        self.err.as_deref()
    }
    /// Consumes the error, yielding its payload if present.
    pub fn into_inner(self) -> Option<Box<dyn core::fmt::Display + Send + Sync + 'static>> {
        self.err
    }
}
impl core::fmt::Display for Error {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
f.write_str(self.kind.as_str())?;
if let Some(ref e) = self.err {
e.fmt(f)?;
}
Ok(())
}
}
impl From<ErrorKind> for Error {
    fn from(value: ErrorKind) -> Self {
        // Resolves to the inherent `Error::from` defined above (inherent
        // associated functions take precedence over trait methods in path
        // resolution), so this is NOT infinite recursion.
        Self::from(value)
    }
}
/// Minimal stand-in for `std::io::Read`, usable without `std`.
pub trait Read {
    /// Reads some bytes into `buf`, returning how many bytes were read.
    /// A return of `Ok(0)` signals end of stream.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error>;
    /// Fills `buf` completely, retrying on `Interrupted`.
    ///
    /// Returns `UnexpectedEof` if the source reports end of stream
    /// (a `read` of 0 bytes) before the buffer is full.
    fn read_exact(&mut self, mut buf: &mut [u8]) -> Result<(), Error> {
        while !buf.is_empty() {
            match self.read(buf) {
                // EOF: stop early; the check below turns this into an error.
                Ok(0) => break,
                Ok(n) => {
                    // Advance the window past the bytes just read.
                    let tmp = buf;
                    buf = &mut tmp[n..];
                }
                // Interrupted reads are simply retried.
                Err(ref e) if e.kind() == ErrorKind::Interrupted => {}
                Err(e) => return Err(e),
            }
        }
        if !buf.is_empty() {
            Err(Error::from(ErrorKind::UnexpectedEof))
        } else {
            Ok(())
        }
    }
    /// Reads until end of stream, appending to `output` in 16 KiB chunks.
    /// Note: unlike `read_exact`, this does not retry on `Interrupted`.
    fn read_to_end(&mut self, output: &mut alloc::vec::Vec<u8>) -> Result<(), Error> {
        let mut buf = [0u8; 1024 * 16];
        loop {
            let bytes = self.read(&mut buf)?;
            if bytes == 0 {
                break;
            }
            output.extend_from_slice(&buf[..bytes]);
        }
        Ok(())
    }
    /// Wraps `self` so that at most `limit` bytes can be read through it.
    fn take(self, limit: u64) -> Take<Self>
    where
        Self: Sized,
    {
        Take { inner: self, limit }
    }
}
impl Read for &[u8] {
    /// Copies from the front of the slice and advances it past the
    /// bytes that were read.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
        let amount = self.len().min(buf.len());
        let (head, tail) = self.split_at(amount);
        // Single-byte reads are common; skip the memcpy machinery for them.
        if amount == 1 {
            buf[0] = head[0];
        } else {
            buf[..amount].copy_from_slice(head);
        }
        *self = tail;
        Ok(amount)
    }
}
// Blanket impl: a mutable reference to a reader is itself a reader.
impl<T> Read for &mut T
where
    T: Read,
{
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
        // Forward to the underlying reader.
        (*self).read(buf)
    }
}
/// Reader adapter that limits how many bytes can be read from the
/// wrapped reader, mirroring `std::io::Take`.
pub struct Take<R: Read> {
    // The wrapped reader.
    inner: R,
    // Number of bytes still allowed to be read.
    limit: u64,
}
impl<R: Read> Take<R> {
    /// Remaining number of bytes this adapter will allow to be read.
    pub fn limit(&self) -> u64 {
        self.limit
    }
    /// Resets the remaining read allowance to `limit`.
    pub fn set_limit(&mut self, limit: u64) {
        self.limit = limit;
    }
    /// Shared access to the wrapped reader.
    pub fn get_ref(&self) -> &R {
        &self.inner
    }
    /// Mutable access to the wrapped reader.
    /// Reading from it directly bypasses the limit accounting.
    pub fn get_mut(&mut self) -> &mut R {
        &mut self.inner
    }
    /// Unwraps the adapter, returning the inner reader.
    pub fn into_inner(self) -> R {
        self.inner
    }
}
impl<R: Read> Read for Take<R> {
    /// Reads from the inner reader, never handing out more than the
    /// remaining limit allows.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
        if self.limit == 0 {
            return Ok(0);
        }
        let cap = buf.len().min(self.limit as usize);
        let n = self.inner.read(&mut buf[..cap])?;
        self.limit -= n as u64;
        Ok(n)
    }
}
/// Minimal stand-in for `std::io::Write`, usable without `std`.
pub trait Write {
    /// Writes some bytes from `buf`, returning how many were accepted.
    fn write(&mut self, buf: &[u8]) -> Result<usize, Error>;
    /// Flushes any buffered data to the underlying sink.
    fn flush(&mut self) -> Result<(), Error>;
    /// Writes the entire buffer, retrying on `Interrupted`.
    ///
    /// Returns `WriteAllEof` if the sink stops accepting bytes
    /// (a `write` of 0 bytes) before everything is written.
    fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> {
        while !buf.is_empty() {
            match self.write(buf) {
                Ok(0) => {
                    return Err(Error::from(ErrorKind::WriteAllEof));
                }
                // Advance past the bytes the sink accepted.
                Ok(n) => buf = &buf[n..],
                // Interrupted writes are simply retried.
                Err(ref e) if e.is_interrupted() => {}
                Err(e) => return Err(e),
            }
        }
        Ok(())
    }
}
// Blanket impl: a mutable reference to a writer is itself a writer.
impl<T> Write for &mut T
where
    T: Write,
{
    fn write(&mut self, buf: &[u8]) -> Result<usize, Error> {
        // Forward to the underlying writer.
        (*self).write(buf)
    }
    fn flush(&mut self) -> Result<(), Error> {
        (*self).flush()
    }
}
impl Write for &mut [u8] {
    /// Copies into the front of the slice and advances it past the
    /// written bytes; excess input is silently truncated.
    #[inline]
    fn write(&mut self, data: &[u8]) -> Result<usize, Error> {
        let n = self.len().min(data.len());
        // `take` detaches the slice from `self` so it can be split mutably.
        let (written, rest) = core::mem::take(self).split_at_mut(n);
        written.copy_from_slice(&data[..n]);
        *self = rest;
        Ok(n)
    }
    fn flush(&mut self) -> Result<(), Error> {
        Ok(())
    }
}
impl Write for alloc::vec::Vec<u8> {
    #[inline]
    fn write(&mut self, data: &[u8]) -> Result<usize, Error> {
        // A Vec can always absorb the whole buffer.
        self.extend_from_slice(data);
        Ok(data.len())
    }
    fn flush(&mut self) -> Result<(), Error> {
        // Nothing is buffered beyond the Vec itself.
        Ok(())
    }
}

3
vendor/ruzstd/src/io_std.rs vendored Normal file
View File

@@ -0,0 +1,3 @@
//! Re-exports of std traits or local reimplementations if std is not available
#[cfg(feature = "std")]
pub use std::io::{Error, ErrorKind, Read, Write};

64
vendor/ruzstd/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,64 @@
//! A pure Rust implementation of the [Zstandard compression format](https://www.rfc-editor.org/rfc/rfc8878.pdf).
//!
//! ## Decompression
//! The [decoding] module contains the code for decompression.
//! Decompression can be achieved by using the [`decoding::StreamingDecoder`]
//! or the more low-level [`decoding::FrameDecoder`]
//!
//! ## Compression
//! The [encoding] module contains the code for compression.
//! Decompression can be achieved by using the [`encoding::compress`]/[`encoding::compress_to_vec`]
//! functions or the [`encoding::FrameCompressor`]
//!
#![doc = include_str!("../Readme.md")]
#![no_std]
#![deny(trivial_casts, trivial_numeric_casts, rust_2018_idioms)]
#[cfg(feature = "std")]
extern crate std;
#[cfg(not(feature = "rustc-dep-of-std"))]
extern crate alloc;
#[cfg(feature = "std")]
pub(crate) const VERBOSE: bool = false;
// Debug logging helper: prints via `std::println!` only when the `std`
// feature is enabled AND the compile-time `VERBOSE` flag is set; expands
// to nothing visible otherwise.
macro_rules! vprintln {
    ($($x:expr),*) => {
        #[cfg(feature = "std")]
        if crate::VERBOSE {
            std::println!($($x),*);
        }
    }
}
mod bit_io;
mod common;
pub mod decoding;
pub mod encoding;
pub(crate) mod blocks;
#[cfg(feature = "fuzz_exports")]
pub mod fse;
#[cfg(feature = "fuzz_exports")]
pub mod huff0;
#[cfg(not(feature = "fuzz_exports"))]
pub(crate) mod fse;
#[cfg(not(feature = "fuzz_exports"))]
pub(crate) mod huff0;
mod tests;
#[cfg(feature = "std")]
pub mod io_std;
#[cfg(feature = "std")]
pub use io_std as io;
#[cfg(not(feature = "std"))]
pub mod io_nostd;
#[cfg(not(feature = "std"))]
pub use io_nostd as io;

79
vendor/ruzstd/src/tests/bit_reader.rs vendored Normal file
View File

@@ -0,0 +1,79 @@
#[test]
fn test_bitreader_reversed() {
    use crate::bit_io::BitReaderReversed;
    let encoded: [u8; 16] = [
        0xC1, 0x41, 0x08, 0x00, 0x00, 0xEC, 0xC8, 0x96, 0x42, 0x79, 0xD4, 0xBC, 0xF7, 0x2C, 0xD5,
        0x48,
    ];
    //just the u128 in encoded
    let num_rev: u128 = 0x48_D5_2C_F7_BC_D4_79_42_96_C8_EC_00_00_08_41_C1;
    let mut br = BitReaderReversed::new(&encoded[..]);
    // Read the whole 128-bit value in variable-sized chunks and rebuild it;
    // the reversed reader delivers bits starting from the highest ones, so
    // each chunk is shifted into place from the top down.
    let mut accumulator = 0;
    let mut bits_read = 0;
    let mut x = 0;
    loop {
        x += 3;
        //semi random access pattern
        let mut num_bits = x % 16;
        // Clamp the final chunk so exactly 128 bits are consumed in total.
        if bits_read > 128 - num_bits {
            num_bits = 128 - bits_read;
        }
        let bits = br.get_bits(num_bits);
        bits_read += num_bits;
        accumulator |= u128::from(bits) << (128 - bits_read);
        if bits_read >= 128 {
            break;
        }
    }
    // The reassembled value must equal the original bytes read as a u128.
    if accumulator != num_rev {
        panic!(
            "Bitreader failed somewhere. Accumulated bits: {:?}, Should be: {:?}",
            accumulator, num_rev
        );
    }
}
#[test]
fn test_bitreader_normal() {
    use crate::bit_io::BitReader;
    let encoded: [u8; 16] = [
        0xC1, 0x41, 0x08, 0x00, 0x00, 0xEC, 0xC8, 0x96, 0x42, 0x79, 0xD4, 0xBC, 0xF7, 0x2C, 0xD5,
        0x48,
    ];
    //just the u128 in encoded
    let num: u128 = 0x48_D5_2C_F7_BC_D4_79_42_96_C8_EC_00_00_08_41_C1;
    let mut br = BitReader::new(&encoded[..]);
    // Read the whole 128-bit value in variable-sized chunks and rebuild it;
    // the forward reader delivers the lowest bits first, so each chunk is
    // shifted into place from the bottom up.
    let mut accumulator = 0;
    let mut bits_read = 0;
    let mut x = 0;
    loop {
        x += 3;
        //semi random access pattern
        let mut num_bits = x % 16;
        // Clamp the final chunk so exactly 128 bits are consumed in total.
        if bits_read > 128 - num_bits {
            num_bits = 128 - bits_read;
        }
        let bits = br.get_bits(num_bits).unwrap();
        accumulator |= u128::from(bits) << bits_read;
        bits_read += num_bits;
        if bits_read >= 128 {
            break;
        }
    }
    // The reassembled value must equal the original bytes read as a u128.
    if accumulator != num {
        panic!(
            "Bitreader failed somewhere. Accumulated bits: {:?}, Should be: {:?}",
            accumulator, num
        );
    }
}

194
vendor/ruzstd/src/tests/decode_corpus.rs vendored Normal file
View File

@@ -0,0 +1,194 @@
/// Decodes every `*.zst` file in `./decodecorpus_files` (and, if present,
/// `./local_corpus_files`), compares the output against the uncompressed
/// sibling file, and prints aggregate statistics. Fails if any file
/// mismatches in size, content, consumed byte count or (with the `hash`
/// feature) checksum.
#[test]
fn test_decode_corpus_files() {
    extern crate std;
    use crate::decoding::BlockDecodingStrategy;
    use crate::decoding::FrameDecoder;
    use alloc::borrow::ToOwned;
    use alloc::string::{String, ToString};
    use alloc::vec::Vec;
    use std::fs;
    use std::io::Read;
    use std::println;
    let mut success_counter = 0;
    let mut fail_counter_diff = 0;
    let mut fail_counter_size = 0;
    let mut fail_counter_bytes_read = 0;
    #[cfg_attr(not(feature = "hash"), allow(unused_mut))]
    let mut fail_counter_chksum = 0;
    let mut total_counter = 0;
    let mut failed: Vec<String> = Vec::new();
    let mut speeds = Vec::new();
    let mut speeds_read = Vec::new();
    let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect();
    if fs::read_dir("./local_corpus_files").is_ok() {
        files.extend(fs::read_dir("./local_corpus_files").unwrap());
    }
    // Sort for a deterministic processing order.
    files.sort_by_key(|x| match x {
        Err(_) => "".to_owned(),
        Ok(entry) => entry.path().to_str().unwrap().to_owned(),
    });
    let mut frame_dec = FrameDecoder::new();
    for file in files {
        let f = file.unwrap();
        let metadata = f.metadata().unwrap();
        let file_size = metadata.len();
        let p = String::from(f.path().to_str().unwrap());
        if !p.ends_with(".zst") {
            continue;
        }
        println!("Trying file: {}", p);
        let mut content = fs::File::open(f.path()).unwrap();
        frame_dec.reset(&mut content).unwrap();
        let start_time = std::time::Instant::now();
        /////DECODING
        frame_dec
            .decode_blocks(&mut content, BlockDecodingStrategy::All)
            .unwrap();
        let result = frame_dec.collect().unwrap();
        let end_time = start_time.elapsed();
        // Verify the frame checksum when the decoder can compute one.
        match frame_dec.get_checksum_from_data() {
            Some(chksum) => {
                #[cfg(feature = "hash")]
                if frame_dec.get_calculated_checksum().unwrap() != chksum {
                    println!(
                        "Checksum did not match! From data: {}, calculated while decoding: {}\n",
                        chksum,
                        frame_dec.get_calculated_checksum().unwrap()
                    );
                    fail_counter_chksum += 1;
                    failed.push(p.clone().to_string());
                } else {
                    println!("Checksums are ok!\n");
                }
                #[cfg(not(feature = "hash"))]
                println!(
                    "Checksum feature not enabled, skipping. From data: {}\n",
                    chksum
                );
            }
            None => println!("No checksums to test\n"),
        }
        // The expected output is the sibling file without the ".zst" suffix.
        let mut original_p = p.clone();
        original_p.truncate(original_p.len() - 4);
        let original_f = fs::File::open(original_p).unwrap();
        let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
        println!("Results for file: {}", p.clone());
        let mut success = true;
        if original.len() != result.len() {
            println!(
                "Result has wrong length: {}, should be: {}",
                result.len(),
                original.len()
            );
            success = false;
            fail_counter_size += 1;
        }
        if frame_dec.bytes_read_from_source() != file_size {
            println!(
                "Framedecoder counted wrong amount of bytes: {}, should be: {}",
                frame_dec.bytes_read_from_source(),
                file_size
            );
            success = false;
            fail_counter_bytes_read += 1;
        }
        // Count differing bytes over the common prefix.
        let mut counter = 0;
        let min = if original.len() < result.len() {
            original.len()
        } else {
            result.len()
        };
        for idx in 0..min {
            if original[idx] != result[idx] {
                counter += 1;
            }
        }
        if counter > 0 {
            println!("Result differs in at least {} bytes from original", counter);
            success = false;
            fail_counter_diff += 1;
        }
        if success {
            success_counter += 1;
        } else {
            failed.push(p.clone().to_string());
        }
        total_counter += 1;
        // Throughput in bytes per microsecond (guard against a 0 duration).
        let dur = end_time.as_micros() as usize;
        let speed = result.len() / if dur == 0 { 1 } else { dur };
        let speed_read = file_size as usize / if dur == 0 { 1 } else { dur };
        println!("SPEED: {}", speed);
        println!("SPEED_read: {}", speed_read);
        speeds.push(speed);
        speeds_read.push(speed_read);
    }
    println!("###################");
    println!("Summary:");
    println!("###################");
    println!(
        "Total: {}, Success: {}, WrongSize: {}, WrongBytecount: {}, WrongChecksum: {}, Diffs: {}",
        total_counter,
        success_counter,
        fail_counter_size,
        fail_counter_bytes_read,
        fail_counter_chksum,
        fail_counter_diff
    );
    println!("Failed files: ");
    for f in &failed {
        println!("{}", f);
    }
    let speed_len = speeds.len();
    let sum_speed: usize = speeds.into_iter().sum();
    let avg_speed = sum_speed / speed_len;
    let avg_speed_bps = avg_speed * 1_000_000;
    if avg_speed_bps < 1000 {
        println!("Average speed: {} B/s", avg_speed_bps);
    } else if avg_speed_bps < 1_000_000 {
        println!("Average speed: {} KB/s", avg_speed_bps / 1000);
    } else {
        println!("Average speed: {} MB/s", avg_speed_bps / 1_000_000);
    }
    let speed_read_len = speeds_read.len();
    let sum_speed_read: usize = speeds_read.into_iter().sum();
    let avg_speed_read = sum_speed_read / speed_read_len;
    let avg_speed_read_bps = avg_speed_read * 1_000_000;
    if avg_speed_read_bps < 1000 {
        println!("Average speed reading: {} B/s", avg_speed_read_bps);
    } else if avg_speed_read_bps < 1_000_000 {
        // BUGFIX: this branch previously tested `avg_speed_bps`, so the
        // read-speed summary could be printed with the wrong unit.
        println!("Average speed reading: {} KB/s", avg_speed_read_bps / 1000);
    } else {
        println!(
            "Average speed reading: {} MB/s",
            avg_speed_read_bps / 1_000_000
        );
    }
    assert!(failed.is_empty());
}

266
vendor/ruzstd/src/tests/dict_test.rs vendored Normal file
View File

@@ -0,0 +1,266 @@
#[test]
fn test_dict_parsing() {
use crate::decoding::dictionary::Dictionary;
use alloc::vec;
let mut raw = vec![0u8; 8];
// correct magic num
raw[0] = 0x37;
raw[1] = 0xA4;
raw[2] = 0x30;
raw[3] = 0xEC;
//dict-id
let dict_id = 0x47232101;
raw[4] = 0x01;
raw[5] = 0x21;
raw[6] = 0x23;
raw[7] = 0x47;
// tables copied from ./dict_tests/dictionary
let raw_tables = &[
54, 16, 192, 155, 4, 0, 207, 59, 239, 121, 158, 116, 220, 93, 114, 229, 110, 41, 249, 95,
165, 255, 83, 202, 254, 68, 74, 159, 63, 161, 100, 151, 137, 21, 184, 183, 189, 100, 235,
209, 251, 174, 91, 75, 91, 185, 19, 39, 75, 146, 98, 177, 249, 14, 4, 35, 0, 0, 0, 40, 40,
20, 10, 12, 204, 37, 196, 1, 173, 122, 0, 4, 0, 128, 1, 2, 2, 25, 32, 27, 27, 22, 24, 26,
18, 12, 12, 15, 16, 11, 69, 37, 225, 48, 20, 12, 6, 2, 161, 80, 40, 20, 44, 137, 145, 204,
46, 0, 0, 0, 0, 0, 116, 253, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
raw.extend(&raw_tables[..]);
//offset history 3,10,0x00ABCDEF
raw.extend(vec![3, 0, 0, 0]);
raw.extend(vec![10, 0, 0, 0]);
raw.extend(vec![0xEF, 0xCD, 0xAB, 0]);
//just some random bytes
let raw_content = vec![
1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 123, 3, 234, 23, 234, 34, 23, 234, 34, 34, 234, 234,
];
raw.extend(&raw_content);
let dict = Dictionary::decode_dict(&raw).unwrap();
if dict.id != dict_id {
panic!(
"Dict-id did not get parsed correctly. Is: {}, Should be: {}",
dict.id, dict_id
);
}
if !dict.dict_content.eq(&raw_content) {
panic!(
"dict content did not get parsed correctly. Is: {:?}, Should be: {:?}",
dict.dict_content, raw_content
);
}
if !dict.offset_hist.eq(&[3, 10, 0x00ABCDEF]) {
panic!(
"offset history did not get parsed correctly. Is: {:?}, Should be: {:?}",
dict.offset_hist,
[3, 10, 0x00ABCDEF]
);
}
// test magic num checking
raw[0] = 1;
raw[1] = 1;
raw[2] = 1;
raw[3] = 1;
match Dictionary::decode_dict(&raw) {
Ok(_) => panic!("The dict got decoded but the magic num was incorrect!"),
Err(_) => { /* This is what should happen*/ }
}
}
/// Decodes every `*.zst` file in `./dict_tests/files` using the dictionary
/// at `./dict_tests/dictionary`, compares the output against the
/// uncompressed sibling file, and prints aggregate statistics.
#[test]
fn test_dict_decoding() {
    extern crate std;
    use crate::decoding::BlockDecodingStrategy;
    use crate::decoding::FrameDecoder;
    use alloc::borrow::ToOwned;
    use alloc::string::{String, ToString};
    use alloc::vec::Vec;
    use std::fs;
    use std::io::Read;
    use std::println;
    let mut success_counter = 0;
    let mut fail_counter_diff = 0;
    let mut fail_counter_size = 0;
    let mut fail_counter_bytes_read = 0;
    let mut total_counter = 0;
    let mut failed: Vec<String> = Vec::new();
    let mut speeds = Vec::new();
    let mut speeds_read = Vec::new();
    let mut files: Vec<_> = fs::read_dir("./dict_tests/files").unwrap().collect();
    let dict = fs::File::open("./dict_tests/dictionary").unwrap();
    let dict: Vec<u8> = dict.bytes().map(|x| x.unwrap()).collect();
    // Sort for a deterministic processing order.
    files.sort_by_key(|x| match x {
        Err(_) => "".to_owned(),
        Ok(entry) => entry.path().to_str().unwrap().to_owned(),
    });
    let mut frame_dec = FrameDecoder::new();
    let dict = crate::decoding::dictionary::Dictionary::decode_dict(&dict).unwrap();
    frame_dec.add_dict(dict).unwrap();
    for file in files {
        let f = file.unwrap();
        let metadata = f.metadata().unwrap();
        let file_size = metadata.len();
        let p = String::from(f.path().to_str().unwrap());
        if !p.ends_with(".zst") {
            continue;
        }
        println!("Trying file: {}", p);
        let mut content = fs::File::open(f.path()).unwrap();
        frame_dec.reset(&mut content).unwrap();
        let start_time = std::time::Instant::now();
        /////DECODING
        frame_dec
            .decode_blocks(&mut content, BlockDecodingStrategy::All)
            .unwrap();
        let result = frame_dec.collect().unwrap();
        let end_time = start_time.elapsed();
        // Checksum mismatches are only reported here, not counted as failures.
        match frame_dec.get_checksum_from_data() {
            Some(chksum) => {
                #[cfg(feature = "hash")]
                if frame_dec.get_calculated_checksum().unwrap() != chksum {
                    println!(
                        "Checksum did not match! From data: {}, calculated while decoding: {}\n",
                        chksum,
                        frame_dec.get_calculated_checksum().unwrap()
                    );
                } else {
                    println!("Checksums are ok!\n");
                }
                #[cfg(not(feature = "hash"))]
                println!(
                    "Checksum feature not enabled, skipping. From data: {}\n",
                    chksum
                );
            }
            None => println!("No checksums to test\n"),
        }
        // The expected output is the sibling file without the ".zst" suffix.
        let mut original_p = p.clone();
        original_p.truncate(original_p.len() - 4);
        let original_f = fs::File::open(original_p).unwrap();
        let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
        println!("Results for file: {}", p.clone());
        let mut success = true;
        if original.len() != result.len() {
            println!(
                "Result has wrong length: {}, should be: {}",
                result.len(),
                original.len()
            );
            success = false;
            fail_counter_size += 1;
        }
        if frame_dec.bytes_read_from_source() != file_size {
            println!(
                "Framedecoder counted wrong amount of bytes: {}, should be: {}",
                frame_dec.bytes_read_from_source(),
                file_size
            );
            success = false;
            fail_counter_bytes_read += 1;
        }
        // Count differing bytes over the common prefix.
        let mut counter = 0;
        let min = if original.len() < result.len() {
            original.len()
        } else {
            result.len()
        };
        for idx in 0..min {
            if original[idx] != result[idx] {
                counter += 1;
            }
        }
        if counter > 0 {
            println!("Result differs in at least {} bytes from original", counter);
            success = false;
            fail_counter_diff += 1;
        }
        if success {
            success_counter += 1;
        } else {
            failed.push(p.clone().to_string());
        }
        total_counter += 1;
        // Throughput in bytes per microsecond (guard against a 0 duration).
        let dur = end_time.as_micros() as usize;
        let speed = result.len() / if dur == 0 { 1 } else { dur };
        let speed_read = file_size as usize / if dur == 0 { 1 } else { dur };
        println!("SPEED: {}", speed);
        println!("SPEED_read: {}", speed_read);
        speeds.push(speed);
        speeds_read.push(speed_read);
    }
    println!("###################");
    println!("Summary:");
    println!("###################");
    println!(
        "Total: {}, Success: {}, WrongSize: {}, WrongBytecount: {}, Diffs: {}",
        total_counter,
        success_counter,
        fail_counter_size,
        fail_counter_bytes_read,
        fail_counter_diff
    );
    println!("Failed files: ");
    for f in &failed {
        println!("{}", f);
    }
    let speed_len = speeds.len();
    let sum_speed: usize = speeds.into_iter().sum();
    let avg_speed = sum_speed / speed_len;
    let avg_speed_bps = avg_speed * 1_000_000;
    if avg_speed_bps < 1000 {
        println!("Average speed: {} B/s", avg_speed_bps);
    } else if avg_speed_bps < 1_000_000 {
        println!("Average speed: {} KB/s", avg_speed_bps / 1000);
    } else {
        println!("Average speed: {} MB/s", avg_speed_bps / 1_000_000);
    }
    let speed_read_len = speeds_read.len();
    let sum_speed_read: usize = speeds_read.into_iter().sum();
    let avg_speed_read = sum_speed_read / speed_read_len;
    let avg_speed_read_bps = avg_speed_read * 1_000_000;
    if avg_speed_read_bps < 1000 {
        println!("Average speed reading: {} B/s", avg_speed_read_bps);
    } else if avg_speed_read_bps < 1_000_000 {
        // BUGFIX: this branch previously tested `avg_speed_bps`, so the
        // read-speed summary could be printed with the wrong unit.
        println!("Average speed reading: {} KB/s", avg_speed_read_bps / 1000);
    } else {
        println!(
            "Average speed reading: {} MB/s",
            avg_speed_read_bps / 1_000_000
        );
    }
    assert!(failed.is_empty());
}

233
vendor/ruzstd/src/tests/encode_corpus.rs vendored Normal file
View File

@@ -0,0 +1,233 @@
#[test]
fn test_encode_corpus_files_uncompressed_our_decompressor() {
    extern crate std;
    use crate::encoding::FrameCompressor;
    use alloc::borrow::ToOwned;
    use alloc::vec::Vec;
    use std::ffi::OsStr;
    use std::fs;
    use std::io::Read;
    use std::path::PathBuf;
    use std::println;
    // Round-trip every non-.zst corpus file through our encoder and our
    // decoder; the decoded bytes must equal the original input.
    let mut failures: Vec<PathBuf> = Vec::new();
    let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect();
    if fs::read_dir("./local_corpus_files").is_ok() {
        files.extend(fs::read_dir("./local_corpus_files").unwrap());
    }
    // Deterministic order.
    files.sort_by_key(|x| match x {
        Err(_) => "".to_owned(),
        Ok(entry) => entry.path().to_str().unwrap().to_owned(),
    });
    for entry in files.iter().map(|f| f.as_ref().unwrap()) {
        let path = entry.path();
        if path.extension() == Some(OsStr::new("zst")) {
            continue;
        }
        println!("Trying file: {:?}", path);
        let original = fs::read(entry.path()).unwrap();
        let mut compressed: Vec<u8> = Vec::new();
        let mut compressor = FrameCompressor::new(crate::encoding::CompressionLevel::Fastest);
        compressor.set_source(original.as_slice());
        compressor.set_drain(&mut compressed);
        compressor.compress();
        let mut roundtripped = Vec::new();
        let mut decoder =
            crate::decoding::StreamingDecoder::new(compressed.as_slice()).unwrap();
        decoder.read_to_end(&mut roundtripped).unwrap();
        if original != roundtripped {
            failures.push(path);
        }
    }
    if !failures.is_empty() {
        panic!(
            "Decompression of compressed file failed on the following files: {:?}",
            failures
        );
    }
}
/// Compresses every non-.zst corpus file with our encoder and checks that
/// the reference `zstd` implementation decompresses it back to the input.
#[test]
fn test_encode_corpus_files_uncompressed_original_decompressor() {
    extern crate std;
    use crate::encoding::FrameCompressor;
    use alloc::borrow::ToOwned;
    use alloc::format;
    use alloc::vec::Vec;
    use std::ffi::OsStr;
    use std::fs;
    use std::path::PathBuf;
    use std::println;
    use std::string::String;
    let mut failures: Vec<(PathBuf, String)> = Vec::new();
    let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect();
    if fs::read_dir("./local_corpus_files").is_ok() {
        files.extend(fs::read_dir("./local_corpus_files").unwrap());
    }
    // Deterministic order.
    files.sort_by_key(|x| match x {
        Err(_) => "".to_owned(),
        Ok(entry) => entry.path().to_str().unwrap().to_owned(),
    });
    for entry in files.iter().map(|f| f.as_ref().unwrap()) {
        let path = entry.path();
        // Only the uncompressed originals are inputs here.
        if path.extension() == Some(OsStr::new("zst")) {
            continue;
        }
        println!("Trying file: {:?}", path);
        let input = fs::read(entry.path()).unwrap();
        let mut compressed_file: Vec<u8> = Vec::new();
        let mut compressor = FrameCompressor::new(crate::encoding::CompressionLevel::Fastest);
        compressor.set_source(input.as_slice());
        compressor.set_drain(&mut compressed_file);
        compressor.compress();
        let mut decompressed_output = Vec::new();
        match zstd::stream::copy_decode(compressed_file.as_slice(), &mut decompressed_output) {
            Ok(()) => {
                if input != decompressed_output {
                    failures.push((path.to_owned(), "Input didn't equal output".to_owned()));
                }
            }
            Err(e) => {
                failures.push((
                    path.to_owned(),
                    format!("Decompressor threw an error: {e:?}"),
                ));
            }
        };
    }
    // BUGFIX: this check previously sat inside the loop, so the test aborted
    // on the first failing file and the `failures` Vec never collected more
    // than one entry. Report all failing files at once, like the sibling
    // tests do.
    if !failures.is_empty() {
        panic!(
            "Decompression of the compressed file fails on the following files: {:?}",
            failures
        );
    }
}
#[test]
fn test_encode_corpus_files_compressed_our_decompressor() {
    extern crate std;
    use crate::encoding::FrameCompressor;
    use alloc::borrow::ToOwned;
    use alloc::vec::Vec;
    use std::ffi::OsStr;
    use std::fs;
    use std::io::Read;
    use std::path::PathBuf;
    use std::println;
    // Round-trip every non-.zst corpus file through our encoder and our
    // decoder; the decoded bytes must equal the original input.
    let mut failures: Vec<PathBuf> = Vec::new();
    let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect();
    if fs::read_dir("./local_corpus_files").is_ok() {
        files.extend(fs::read_dir("./local_corpus_files").unwrap());
    }
    // Deterministic order.
    files.sort_by_key(|x| match x {
        Err(_) => "".to_owned(),
        Ok(entry) => entry.path().to_str().unwrap().to_owned(),
    });
    for entry in files.iter().map(|f| f.as_ref().unwrap()) {
        let path = entry.path();
        if path.extension() == Some(OsStr::new("zst")) {
            continue;
        }
        println!("Trying file: {:?}", path);
        let original = fs::read(entry.path()).unwrap();
        let mut compressed: Vec<u8> = Vec::new();
        let mut compressor = FrameCompressor::new(crate::encoding::CompressionLevel::Fastest);
        compressor.set_source(original.as_slice());
        compressor.set_drain(&mut compressed);
        compressor.compress();
        let mut roundtripped = Vec::new();
        let mut decoder =
            crate::decoding::StreamingDecoder::new(compressed.as_slice()).unwrap();
        decoder.read_to_end(&mut roundtripped).unwrap();
        if original != roundtripped {
            failures.push(path);
        }
    }
    if !failures.is_empty() {
        panic!(
            "Decompression of compressed file failed on the following files: {:?}",
            failures
        );
    }
}
/// Compresses every non-.zst corpus file with our encoder and checks that
/// the reference `zstd` implementation decompresses it back to the input.
#[test]
fn test_encode_corpus_files_compressed_original_decompressor() {
    extern crate std;
    use crate::encoding::FrameCompressor;
    use alloc::borrow::ToOwned;
    use alloc::format;
    use alloc::vec::Vec;
    use std::ffi::OsStr;
    use std::fs;
    use std::path::PathBuf;
    use std::println;
    use std::string::String;
    let mut failures: Vec<(PathBuf, String)> = Vec::new();
    let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect();
    if fs::read_dir("./local_corpus_files").is_ok() {
        files.extend(fs::read_dir("./local_corpus_files").unwrap());
    }
    // Deterministic order.
    files.sort_by_key(|x| match x {
        Err(_) => "".to_owned(),
        Ok(entry) => entry.path().to_str().unwrap().to_owned(),
    });
    for entry in files.iter().map(|f| f.as_ref().unwrap()) {
        let path = entry.path();
        // Only the uncompressed originals are inputs here.
        if path.extension() == Some(OsStr::new("zst")) {
            continue;
        }
        println!("Trying file: {:?}", path);
        let input = fs::read(entry.path()).unwrap();
        let mut compressed_file: Vec<u8> = Vec::new();
        let mut compressor = FrameCompressor::new(crate::encoding::CompressionLevel::Fastest);
        compressor.set_source(input.as_slice());
        compressor.set_drain(&mut compressed_file);
        compressor.compress();
        let mut decompressed_output = Vec::new();
        match zstd::stream::copy_decode(compressed_file.as_slice(), &mut decompressed_output) {
            Ok(()) => {
                if input != decompressed_output {
                    failures.push((path.to_owned(), "Input didn't equal output".to_owned()));
                }
            }
            Err(e) => {
                failures.push((
                    path.to_owned(),
                    format!("Decompressor threw an error: {e:?}"),
                ));
            }
        };
    }
    // BUGFIX: this check previously sat inside the loop, so the test aborted
    // on the first failing file and the `failures` Vec never collected more
    // than one entry. Report all failing files at once, like the sibling
    // tests do.
    if !failures.is_empty() {
        panic!(
            "Decompression of the compressed file fails on the following files: {:?}",
            failures
        );
    }
}

View File

@@ -0,0 +1,27 @@
#[test]
fn test_all_artifacts() {
    extern crate std;
    use crate::decoding::BlockDecodingStrategy;
    use crate::decoding::FrameDecoder;
    use std::borrow::ToOwned;
    use std::fs;
    use std::fs::File;
    // Every collected fuzzing crash artifact must decode without panicking;
    // returning an error is acceptable.
    let mut frame_dec = FrameDecoder::new();
    for entry in fs::read_dir("./fuzz/artifacts/decode").unwrap() {
        let path = entry.unwrap().path();
        let name = path.to_str().unwrap().to_owned();
        if !name.contains("/crash-") {
            continue;
        }
        let mut f = File::open(&path).unwrap();
        /* ignore errors. It just should never panic on invalid input */
        let _: Result<_, _> = frame_dec
            .reset(&mut f)
            .and_then(|()| frame_dec.decode_blocks(&mut f, BlockDecodingStrategy::All));
    }
}

578
vendor/ruzstd/src/tests/mod.rs vendored Normal file
View File

@@ -0,0 +1,578 @@
#[cfg(test)]
use alloc::vec;
#[cfg(test)]
use alloc::vec::Vec;
#[cfg(test)]
extern crate std;
// In no_std test builds, adapt `std::fs::File` to our local `Read` trait so
// the corpus tests can still open files.
#[cfg(all(test, not(feature = "std")))]
impl crate::io_nostd::Read for std::fs::File {
    fn read(&mut self, buf: &mut [u8]) -> Result<usize, crate::io_nostd::Error> {
        std::io::Read::read(self, buf).map_err(|e| {
            // Preserve the inner payload when the std error carries one;
            // otherwise fall back to a bare `Other`.
            let has_payload = e.get_ref().is_some();
            if has_payload {
                crate::io_nostd::Error::new(
                    crate::io_nostd::ErrorKind::Other,
                    alloc::boxed::Box::new(e.into_inner().unwrap()),
                )
            } else {
                crate::io_nostd::Error::from(crate::io_nostd::ErrorKind::Other)
            }
        })
    }
}
#[cfg(all(test, feature = "std"))]
#[allow(dead_code)]
fn assure_error_impl() {
    // Not a real test: this fails to compile if `FrameDecoderError` ever
    // stops implementing `std::error::Error`.
    use crate::decoding::errors::FrameDecoderError;
    let _err: &dyn std::error::Error = &FrameDecoderError::NotYetInitialized;
}
#[cfg(all(test, feature = "std"))]
#[allow(dead_code)]
fn assure_decoder_send_sync() {
    // Not a real test: this fails to compile if `FrameDecoder` ever stops
    // satisfying the bounds `std::thread::spawn` requires (`Send + 'static`).
    use crate::decoding::FrameDecoder;
    let decoder = FrameDecoder::new();
    std::thread::spawn(move || {
        drop(decoder);
    });
}
#[test]
fn skippable_frame() {
    use crate::decoding::errors;
    use crate::decoding::frame;
    // Builds an 8-byte skippable-frame header: magic number + declared length,
    // both little endian.
    fn header(magic: u32, length: u32) -> Vec<u8> {
        let mut buf = Vec::new();
        buf.extend_from_slice(&magic.to_le_bytes());
        buf.extend_from_slice(&length.to_le_bytes());
        assert_eq!(8, buf.len());
        buf
    }
    // Lowest skippable magic number.
    let content = header(0x184D2A50, 300);
    let err = frame::read_frame_header(content.as_slice());
    assert!(matches!(
        err,
        Err(errors::ReadFrameHeaderError::SkipFrame {
            magic_number: 0x184D2A50u32,
            length: 300
        })
    ));
    // Highest skippable magic number with the maximum length.
    let content = header(0x184D2A5F, 0xFFFFFFFF);
    let err = frame::read_frame_header(content.as_slice());
    assert!(matches!(
        err,
        Err(errors::ReadFrameHeaderError::SkipFrame {
            magic_number: 0x184D2A5Fu32,
            length: 0xFFFFFFFF
        })
    ));
}
#[cfg(test)]
#[test]
fn test_frame_header_reading() {
    // Smoke test: the frame header of a known-good corpus file must parse
    // without error.
    use crate::decoding::frame;
    use std::fs;
    let mut compressed = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
    let (_frame, _bytes_consumed) = frame::read_frame_header(&mut compressed).unwrap();
}
#[test]
fn test_block_header_reading() {
    // After the frame header, the first block header of a known-good corpus
    // file must parse without error.
    use crate::decoding;
    use crate::decoding::frame;
    use std::fs;

    let mut compressed = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
    let (_frame, _) = frame::read_frame_header(&mut compressed).unwrap();

    let mut block_dec = decoding::block_decoder::new();
    //TODO validate the parsed block header in a smart way
    let _block_header = block_dec.read_block_header(&mut compressed).unwrap();
}
#[test]
fn test_frame_decoder() {
    // End-to-end smoke test: decode every block of a known-good corpus file;
    // the test passes as long as no step returns an error.
    //
    // The unused `NullWriter` helper and `_null_target` binding that used to
    // live here were dead code and have been removed.
    use crate::decoding::BlockDecodingStrategy;
    use crate::decoding::FrameDecoder;
    use std::fs;

    let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
    let mut frame_dec = FrameDecoder::new();
    frame_dec.reset(&mut content).unwrap();
    frame_dec
        .decode_blocks(&mut content, BlockDecodingStrategy::All)
        .unwrap();
}
#[test]
fn test_decode_from_to() {
    // Decodes a corpus file with the push-based `decode_from_to` API in three
    // chunks (first 50 KiB, the rest of the body without the checksum, then
    // the bare 4-byte checksum) and verifies the byte counters, checksum
    // handling, and that the output matches the uncompressed original.
    use crate::decoding::FrameDecoder;
    use std::fs::File;
    use std::io::Read;
    let f = File::open("./decodecorpus_files/z000088.zst").unwrap();
    let mut frame_dec = FrameDecoder::new();
    let content: Vec<u8> = f.bytes().map(|x| x.unwrap()).collect();
    // 1 MiB output buffer; assumed large enough for the decoded corpus file.
    let mut target = vec![0u8; 1024 * 1024];
    // first part
    let source1 = &content[..50 * 1024];
    let (read1, written1) = frame_dec
        .decode_from_to(source1, target.as_mut_slice())
        .unwrap();
    //second part explicitly without checksum
    let source2 = &content[read1..content.len() - 4];
    let (read2, written2) = frame_dec
        .decode_from_to(source2, &mut target[written1..])
        .unwrap();
    //must have decoded until checksum
    assert!(read1 + read2 == content.len() - 4);
    //insert checksum separately to test that this is handled correctly
    let chksum_source = &content[read1 + read2..];
    let (read3, written3) = frame_dec
        .decode_from_to(chksum_source, &mut target[written1 + written2..])
        .unwrap();
    //this must result in these values because just the checksum was processed
    assert!(read3 == 4);
    assert!(written3 == 0);
    let read = read1 + read2 + read3;
    let written = written1 + written2;
    let result = &target.as_slice()[..written];
    if read != content.len() {
        panic!(
            "Byte counter: {} was wrong. Should be: {}",
            read,
            content.len()
        );
    }
    // Checksum verification needs the "hash" feature; without it only the
    // checksum read from the frame can be reported.
    match frame_dec.get_checksum_from_data() {
        Some(chksum) => {
            #[cfg(feature = "hash")]
            if frame_dec.get_calculated_checksum().unwrap() != chksum {
                std::println!(
                    "Checksum did not match! From data: {}, calculated while decoding: {}\n",
                    chksum,
                    frame_dec.get_calculated_checksum().unwrap()
                );
            } else {
                std::println!("Checksums are ok!\n");
            }
            #[cfg(not(feature = "hash"))]
            std::println!(
                "Checksum feature not enabled, skipping. From data: {}\n",
                chksum
            );
        }
        None => std::println!("No checksums to test\n"),
    }
    // Compare the decoded bytes against the uncompressed original.
    let original_f = File::open("./decodecorpus_files/z000088").unwrap();
    let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
    if original.len() != result.len() {
        panic!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }
    // Count differing bytes over the shared prefix so a failure reports how
    // badly the outputs diverge instead of stopping at the first mismatch.
    let mut counter = 0;
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
            //std::println!(
            //    "Original {:3} not equal to result {:3} at byte: {}",
            //    original[idx], result[idx], idx,
            //);
        }
    }
    if counter > 0 {
        panic!("Result differs in at least {} bytes from original", counter);
    }
}
#[test]
fn test_specific_file() {
    // Decodes one specific corpus file and diffs the output against its
    // uncompressed counterpart. Mismatches are only reported via println!
    // (never panics), so this doubles as a debugging aid for a single file.
    //
    // Fixes: the comparison previously opened "z000088" — a different corpus
    // entry — while decoding "z000068.zst", which made the diff meaningless.
    // It now compares against the matching original, "z000068". The unused
    // `NullWriter` helper was also removed as dead code.
    use crate::decoding::BlockDecodingStrategy;
    use crate::decoding::FrameDecoder;
    use std::fs;
    use std::io::Read;

    let path = "./decodecorpus_files/z000068.zst";
    let mut content = fs::File::open(path).unwrap();
    let mut frame_dec = FrameDecoder::new();
    frame_dec.reset(&mut content).unwrap();
    frame_dec
        .decode_blocks(&mut content, BlockDecodingStrategy::All)
        .unwrap();
    let result = frame_dec.collect().unwrap();

    // Uncompressed counterpart of the *same* file that was just decoded.
    let original_f = fs::File::open("./decodecorpus_files/z000068").unwrap();
    let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();

    std::println!("Results for file: {}", path);
    if original.len() != result.len() {
        std::println!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }
    // Count differing bytes over the shared prefix so the report shows how
    // badly the outputs diverge.
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    let mut counter = 0;
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
        }
    }
    if counter > 0 {
        std::println!("Result differs in at least {} bytes from original", counter);
    }
}
#[test]
#[cfg(feature = "std")]
fn test_streaming() {
    // Exercises the std `Read`-based `StreamingDecoder` on one corpus file,
    // then reuses its inner `FrameDecoder` for a second file via
    // `new_with_decoder`, comparing both outputs to the uncompressed
    // originals.
    use std::fs;
    use std::io::Read;
    let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
    let mut stream = crate::decoding::StreamingDecoder::new(&mut content).unwrap();
    let mut result = Vec::new();
    Read::read_to_end(&mut stream, &mut result).unwrap();
    let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap();
    let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
    if original.len() != result.len() {
        panic!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }
    // Count differing bytes over the shared prefix so a failure reports how
    // badly the outputs diverge instead of stopping at the first mismatch.
    let mut counter = 0;
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
            //std::println!(
            //    "Original {:3} not equal to result {:3} at byte: {}",
            //    original[idx], result[idx], idx,
            //);
        }
    }
    if counter > 0 {
        panic!("Result differs in at least {} bytes from original", counter);
    }
    // Test resetting to a new file while keeping the old decoder
    let mut content = fs::File::open("./decodecorpus_files/z000068.zst").unwrap();
    let mut stream = crate::decoding::StreamingDecoder::new_with_decoder(
        &mut content,
        stream.into_frame_decoder(),
    )
    .unwrap();
    let mut result = Vec::new();
    Read::read_to_end(&mut stream, &mut result).unwrap();
    let original_f = fs::File::open("./decodecorpus_files/z000068").unwrap();
    let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
    std::println!("Results for file:");
    if original.len() != result.len() {
        panic!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }
    // Same byte-level comparison for the second file.
    let mut counter = 0;
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
            //std::println!(
            //    "Original {:3} not equal to result {:3} at byte: {}",
            //    original[idx], result[idx], idx,
            //);
        }
    }
    if counter > 0 {
        panic!("Result differs in at least {} bytes from original", counter);
    }
}
#[test]
fn test_incremental_read() {
    // Decodes a tiny frame ("abcdef") in two steps: pull exactly three bytes
    // with `decode_from_to`, then drain the buffered remainder with
    // `collect_to_writer`.
    use crate::decoding::FrameDecoder;

    let mut compressed = include_bytes!("../../decodecorpus_files/abc.txt.zst").as_slice();
    let mut decoder = FrameDecoder::new();
    decoder.reset(&mut compressed).unwrap();

    let mut out = [0u8; 3];
    let (_consumed, produced) = decoder.decode_from_to(compressed, &mut out).unwrap();
    assert_eq!(3, produced);
    assert_eq!(['a', 'b', 'c'], out.map(char::from));
    // The whole (small) frame was consumed even though only part of the
    // decoded data fit into `out`.
    assert!(decoder.is_finished());

    // The bytes that did not fit are still buffered and retrievable.
    let produced = decoder.collect_to_writer(&mut &mut out[..]).unwrap();
    assert_eq!(3, produced);
    assert_eq!(['d', 'e', 'f'], out.map(char::from));
}
#[test]
#[cfg(not(feature = "std"))]
fn test_streaming_no_std() {
    // no-std counterpart of `test_streaming`: decodes from in-memory byte
    // slices through the crate's own `Read` trait instead of `std::io::Read`.
    // std macros (println!, panic!) remain usable because the test module
    // declares `extern crate std`.
    use crate::io::Read;
    let content = include_bytes!("../../decodecorpus_files/z000088.zst");
    let mut content = content.as_slice();
    let mut stream = crate::decoding::StreamingDecoder::new(&mut content).unwrap();
    let original = include_bytes!("../../decodecorpus_files/z000088");
    let mut result = vec![0; original.len()];
    Read::read_exact(&mut stream, &mut result).unwrap();
    if original.len() != result.len() {
        panic!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }
    // Count differing bytes over the shared prefix so a failure reports how
    // badly the outputs diverge instead of stopping at the first mismatch.
    let mut counter = 0;
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
            //std::println!(
            //    "Original {:3} not equal to result {:3} at byte: {}",
            //    original[idx], result[idx], idx,
            //);
        }
    }
    if counter > 0 {
        panic!("Result differs in at least {} bytes from original", counter);
    }
    // Test resetting to a new file while keeping the old decoder
    let content = include_bytes!("../../decodecorpus_files/z000068.zst");
    let mut content = content.as_slice();
    let mut stream = crate::decoding::StreamingDecoder::new_with_decoder(
        &mut content,
        stream.into_frame_decoder(),
    )
    .unwrap();
    let original = include_bytes!("../../decodecorpus_files/z000068");
    let mut result = vec![0; original.len()];
    Read::read_exact(&mut stream, &mut result).unwrap();
    std::println!("Results for file:");
    if original.len() != result.len() {
        panic!(
            "Result has wrong length: {}, should be: {}",
            result.len(),
            original.len()
        );
    }
    // Same byte-level comparison for the second file.
    let mut counter = 0;
    let min = if original.len() < result.len() {
        original.len()
    } else {
        result.len()
    };
    for idx in 0..min {
        if original[idx] != result[idx] {
            counter += 1;
            //std::println!(
            //    "Original {:3} not equal to result {:3} at byte: {}",
            //    original[idx], result[idx], idx,
            //);
        }
    }
    if counter > 0 {
        panic!("Result differs in at least {} bytes from original", counter);
    }
}
#[test]
fn test_decode_all() {
    // Covers `decode_all` and `decode_all_to_vec` over a multi-frame input
    // that interleaves skippable frames with two real zstd frames, including
    // the error paths for undersized outputs and truncated inputs.
    use crate::decoding::errors::FrameDecoderError;
    use crate::decoding::FrameDecoder;
    // Appends a skippable frame to `input`: magic number and little-endian
    // length header followed by `length` zero payload bytes.
    let skip_frame = |input: &mut Vec<u8>, length: usize| {
        input.extend_from_slice(&0x184D2A50u32.to_le_bytes());
        input.extend_from_slice(&(length as u32).to_le_bytes());
        input.resize(input.len() + length, 0);
    };
    // Layout: skip(300), z000089, skip(400), z000090, skip(500).
    // `original` holds the expected concatenated uncompressed output.
    let mut original = Vec::new();
    let mut input = Vec::new();
    skip_frame(&mut input, 300);
    input.extend_from_slice(include_bytes!("../../decodecorpus_files/z000089.zst"));
    original.extend_from_slice(include_bytes!("../../decodecorpus_files/z000089"));
    skip_frame(&mut input, 400);
    input.extend_from_slice(include_bytes!("../../decodecorpus_files/z000090.zst"));
    original.extend_from_slice(include_bytes!("../../decodecorpus_files/z000090"));
    skip_frame(&mut input, 500);
    let mut decoder = FrameDecoder::new();
    // decode_all with correct buffers.
    let mut output = vec![0; original.len()];
    let result = decoder.decode_all(&input, &mut output).unwrap();
    assert_eq!(result, original.len());
    assert_eq!(output, original);
    // decode_all with smaller output length.
    let mut output = vec![0; original.len() - 1];
    let result = decoder.decode_all(&input, &mut output);
    assert!(
        matches!(result, Err(FrameDecoderError::TargetTooSmall)),
        "{:?}",
        result
    );
    // decode_all with larger output length.
    let mut output = vec![0; original.len() + 1];
    let result = decoder.decode_all(&input, &mut output).unwrap();
    assert_eq!(result, original.len());
    assert_eq!(&output[..result], original);
    // decode_all with truncated regular frame.
    // (Cutting 600 bytes removes the whole trailing skip frame plus part of
    // the z000090 frame body.)
    let mut output = vec![0; original.len()];
    let result = decoder.decode_all(&input[..input.len() - 600], &mut output);
    assert!(
        matches!(result, Err(FrameDecoderError::FailedToReadBlockBody(_))),
        "{:?}",
        result
    );
    // decode_all with truncated skip frame.
    let mut output = vec![0; original.len()];
    let result = decoder.decode_all(&input[..input.len() - 1], &mut output);
    assert!(
        matches!(result, Err(FrameDecoderError::FailedToSkipFrame)),
        "{:?}",
        result
    );
    // decode_all_to_vec with correct output capacity.
    let mut output = Vec::new();
    output.reserve_exact(original.len());
    decoder.decode_all_to_vec(&input, &mut output).unwrap();
    assert_eq!(output, original);
    // decode_all_to_vec with smaller output capacity.
    let mut output = Vec::new();
    output.reserve_exact(original.len() - 1);
    let result = decoder.decode_all_to_vec(&input, &mut output);
    assert!(
        matches!(result, Err(FrameDecoderError::TargetTooSmall)),
        "{:?}",
        result
    );
    // decode_all_to_vec with larger output capacity.
    let mut output = Vec::new();
    output.reserve_exact(original.len() + 1);
    decoder.decode_all_to_vec(&input, &mut output).unwrap();
    assert_eq!(output, original);
}
pub mod bit_reader;
pub mod decode_corpus;
pub mod dict_test;
#[cfg(feature = "std")]
pub mod encode_corpus;
pub mod fuzz_regressions;

BIN
vendor/ruzstd/test_fixtures/abc.txt.zst vendored Normal file

Binary file not shown.