Vendor dependencies for 0.3.0 release

2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

vendor/blake3/.cargo-checksum.json vendored Normal file (+1)

File diff suppressed because one or more lines are too long

vendor/blake3/CONTRIBUTING.md vendored Normal file (+31)

@@ -0,0 +1,31 @@
# Contributing
We welcome and encourage third-party contributions to BLAKE3, whether that's reporting issues encountered while using the software or proposing patches.
## Bug reports
Bugs and other problems should be reported on [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues).
If you report a bug, please:
* Check that it's not already reported in the [GitHub Issues](https://github.com/BLAKE3/BLAKE3/issues).
* Provide information to help us diagnose and ideally reproduce the bug.
## Patches
We encourage you to fix a bug via a [GitHub Pull request](https://github.com/BLAKE3/BLAKE3/pulls), preferably after creating a related issue and referring to it in the PR.
If you contribute code and submit a patch, please note the following:
* We use Rust's stable branch for developing BLAKE3.
* Pull requests should target the `master` branch.
* Try to follow the established Rust [style guidelines](https://doc.rust-lang.org/1.0.0/style/).
Please also make sure to create new unit tests covering your code additions. You can execute the tests by running:
```bash
cargo test
```
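If your change touches feature-gated code, it can also be worth running the suite with the relevant features enabled. A sketch using standard Cargo flags; the feature names below are taken from this crate's Cargo.toml:
```bash
# Exercise optional, feature-gated code paths as well.
cargo test --features "rayon,mmap,serde,zeroize"
```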
All third-party contributions will be recognized in the list of contributors.

vendor/blake3/Cargo.lock generated vendored Normal file (+585)

@@ -0,0 +1,585 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "arrayref"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
[[package]]
name = "arrayvec"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
dependencies = [
"zeroize",
]
[[package]]
name = "bitflags"
version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd"
[[package]]
name = "blake3"
version = "1.8.2"
dependencies = [
"arrayref",
"arrayvec",
"cc",
"cfg-if",
"ciborium",
"constant_time_eq",
"digest",
"hex",
"hmac",
"memmap2",
"page_size",
"rand",
"rand_chacha",
"rayon-core",
"serde",
"serde_json",
"tempfile",
"zeroize",
]
[[package]]
name = "block-buffer"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
dependencies = [
"generic-array",
]
[[package]]
name = "cc"
version = "1.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e3a13707ac958681c13b39b458c073d0d9bc8a22cb1b2f4c8e55eb72c13f362"
dependencies = [
"shlex",
]
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "ciborium"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
dependencies = [
"ciborium-io",
"ciborium-ll",
"serde",
]
[[package]]
name = "ciborium-io"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
[[package]]
name = "ciborium-ll"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
dependencies = [
"ciborium-io",
"half",
]
[[package]]
name = "constant_time_eq"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6"
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crunchy"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929"
[[package]]
name = "crypto-common"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
dependencies = [
"generic-array",
"typenum",
]
[[package]]
name = "digest"
version = "0.10.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"crypto-common",
"subtle",
]
[[package]]
name = "errno"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "976dd42dc7e85965fe702eb8164f21f450704bdde31faefd6471dba214cb594e"
dependencies = [
"libc",
"windows-sys",
]
[[package]]
name = "fastrand"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "generic-array"
version = "0.14.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
"typenum",
"version_check",
]
[[package]]
name = "getrandom"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73fea8450eea4bac3940448fb7ae50d91f034f941199fcd9d909a5a07aa455f0"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasi",
]
[[package]]
name = "half"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "459196ed295495a68f7d7fe1d84f6c4b7ff0e21fe3017b2f283c6fac3ad803c9"
dependencies = [
"cfg-if",
"crunchy",
]
[[package]]
name = "hex"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
[[package]]
name = "hmac"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e"
dependencies = [
"digest",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "libc"
version = "0.2.172"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
[[package]]
name = "linux-raw-sys"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "memmap2"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f"
dependencies = [
"libc",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "page_size"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "ppv-lite86"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
dependencies = [
"zerocopy",
]
[[package]]
name = "proc-macro2"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "5.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
[[package]]
name = "rand"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
dependencies = [
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
"getrandom",
]
[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "rustix"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d97817398dd4bb2e6da002002db259209759911da105da92bec29ccb12cf58bf"
dependencies = [
"bitflags",
"errno",
"libc",
"linux-raw-sys",
"windows-sys",
]
[[package]]
name = "ryu"
version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]]
name = "serde"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.140"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
[[package]]
name = "shlex"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "subtle"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "syn"
version = "2.0.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tempfile"
version = "3.19.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7437ac7763b9b123ccf33c338a5cc1bac6f69b45a136c19bdd8a65e3916435bf"
dependencies = [
"fastrand",
"getrandom",
"once_cell",
"rustix",
"windows-sys",
]
[[package]]
name = "typenum"
version = "1.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "version_check"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "wasi"
version = "0.14.2+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
dependencies = [
"wit-bindgen-rt",
]
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "wit-bindgen-rt"
version = "0.39.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
dependencies = [
"bitflags",
]
[[package]]
name = "zerocopy"
version = "0.8.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "zeroize"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"

vendor/blake3/Cargo.toml vendored Normal file (+135)

@@ -0,0 +1,135 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
name = "blake3"
version = "1.8.2"
authors = [
"Jack O'Connor <oconnor663@gmail.com>",
"Samuel Neves",
]
build = "build.rs"
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "the BLAKE3 hash function"
documentation = "https://docs.rs/blake3"
readme = "README.md"
license = "CC0-1.0 OR Apache-2.0 OR Apache-2.0 WITH LLVM-exception"
repository = "https://github.com/BLAKE3-team/BLAKE3"
[package.metadata.docs.rs]
features = [
"mmap",
"rayon",
"serde",
"zeroize",
]
[features]
default = ["std"]
digest = ["dep:digest"]
mmap = [
"std",
"dep:memmap2",
]
neon = []
no_avx2 = []
no_avx512 = []
no_neon = []
no_sse2 = []
no_sse41 = []
prefer_intrinsics = []
pure = []
rayon = ["dep:rayon-core"]
std = []
traits-preview = ["dep:digest"]
wasm32_simd = []
zeroize = [
"dep:zeroize",
"arrayvec/zeroize",
]
[lib]
name = "blake3"
path = "src/lib.rs"
[[bench]]
name = "bench"
path = "benches/bench.rs"
[dependencies.arrayref]
version = "0.3.5"
[dependencies.arrayvec]
version = "0.7.4"
default-features = false
[dependencies.cfg-if]
version = "1.0.0"
[dependencies.constant_time_eq]
version = "0.3.1"
default-features = false
[dependencies.digest]
version = "0.10.1"
features = ["mac"]
optional = true
[dependencies.memmap2]
version = "0.9"
optional = true
[dependencies.rayon-core]
version = "1.12.1"
optional = true
[dependencies.serde]
version = "1.0"
features = ["derive"]
optional = true
default-features = false
[dependencies.zeroize]
version = "1"
optional = true
default-features = false
[dev-dependencies.ciborium]
version = "0.2.2"
[dev-dependencies.hex]
version = "0.4.2"
[dev-dependencies.hmac]
version = "0.12.0"
[dev-dependencies.page_size]
version = "0.6.0"
[dev-dependencies.rand]
version = "0.9.0"
[dev-dependencies.rand_chacha]
version = "0.9.0"
[dev-dependencies.serde_json]
version = "1.0.107"
[dev-dependencies.tempfile]
version = "3.8.0"
[build-dependencies.cc]
version = "1.1.12"

vendor/blake3/LICENSE_A2 vendored Normal file (+202)

@@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 Jack O'Connor and Samuel Neves
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

vendor/blake3/LICENSE_A2LLVM vendored Normal file (+219)

@@ -0,0 +1,219 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 Jack O'Connor and Samuel Neves
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
---- LLVM Exceptions to the Apache 2.0 License ----
As an exception, if, as a result of your compiling your source code, portions
of this Software are embedded into an Object form of such source code, you
may redistribute such embedded portions in such Object form without complying
with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
In addition, if you combine or link compiled forms of this Software with
software that is licensed under the GPLv2 ("Combined Software") and if a
court of competent jurisdiction determines that the patent provision (Section
3), the indemnity provision (Section 9) or other Section of the License
conflicts with the conditions of the GPLv2, you may retroactively and
prospectively choose to deem waived or otherwise exclude such Section(s) of
the License, but only in their entirety and only with respect to the Combined
Software.

vendor/blake3/LICENSE_CC0 vendored Normal file (+121)

@@ -0,0 +1,121 @@
Creative Commons Legal Code
CC0 1.0 Universal
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
HEREUNDER.
Statement of Purpose
The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator
and subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").
Certain owners wish to permanently relinquish those rights to a Work for
the purpose of contributing to a commons of creative, cultural and
scientific works ("Commons") that the public can reliably and without fear
of later claims of infringement build upon, modify, incorporate in other
works, reuse and redistribute as freely as possible in any form whatsoever
and for any purposes, including without limitation commercial purposes.
These owners may contribute to the Commons to promote the ideal of a free
culture and the further production of creative, cultural and scientific
works, or to gain reputation or greater distribution for their Work in
part through the use and efforts of others.
For these and/or other purposes and motivations, and without any
expectation of additional consideration or compensation, the person
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
is an owner of Copyright and Related Rights in the Work, voluntarily
elects to apply CC0 to the Work and publicly distribute the Work under its
terms, with knowledge of his or her Copyright and Related Rights in the
Work and the meaning and intended legal effect of CC0 on those rights.
1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not
limited to, the following:
i. the right to reproduce, adapt, distribute, perform, display,
communicate, and translate a Work;
ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
likeness depicted in a Work;
iv. rights protecting against unfair competition in regards to a Work,
subject to the limitations in paragraph 4(a), below;
v. rights protecting the extraction, dissemination, use and reuse of data
in a Work;
vi. database rights (such as those arising under Directive 96/9/EC of the
European Parliament and of the Council of 11 March 1996 on the legal
protection of databases, and under any national implementation
thereof, including any amended or successor version of such
directive); and
vii. other similar, equivalent or corresponding rights throughout the
world based on applicable law or treaty, and any national
implementations thereof.
2. Waiver. To the greatest extent permitted by, but not in contravention
of, applicable law, Affirmer hereby overtly, fully, permanently,
irrevocably and unconditionally waives, abandons, and surrenders all of
Affirmer's Copyright and Related Rights and associated claims and causes
of action, whether now known or unknown (including existing as well as
future claims and causes of action), in the Work (i) in all territories
worldwide, (ii) for the maximum duration provided by applicable law or
treaty (including future time extensions), (iii) in any current or future
medium and for any number of copies, and (iv) for any purpose whatsoever,
including without limitation commercial, advertising or promotional
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
member of the public at large and to the detriment of Affirmer's heirs and
successors, fully intending that such Waiver shall not be subject to
revocation, rescission, cancellation, termination, or any other legal or
equitable action to disrupt the quiet enjoyment of the Work by the public
as contemplated by Affirmer's express Statement of Purpose.
3. Public License Fallback. Should any part of the Waiver for any reason
be judged legally invalid or ineffective under applicable law, then the
Waiver shall be preserved to the maximum extent permitted taking into
account Affirmer's express Statement of Purpose. In addition, to the
extent the Waiver is so judged Affirmer hereby grants to each affected
person a royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future
time extensions), (iii) in any current or future medium and for any number
of copies, and (iv) for any purpose whatsoever, including without
limitation commercial, advertising or promotional purposes (the
"License"). The License shall be deemed effective as of the date CC0 was
applied by Affirmer to the Work. Should any part of the License for any
reason be judged legally invalid or ineffective under applicable law, such
partial invalidity or ineffectiveness shall not invalidate the remainder
of the License, and in such case Affirmer hereby affirms that he or she
will not (i) exercise any of his or her remaining Copyright and Related
Rights in the Work or (ii) assert any associated claims and causes of
action with respect to the Work, in either case contrary to Affirmer's
express Statement of Purpose.
4. Limitations and Disclaimers.
a. No trademark or patent rights held by Affirmer are waived, abandoned,
surrendered, licensed or otherwise affected by this document.
b. Affirmer offers the Work as-is and makes no representations or
warranties of any kind concerning the Work, express, implied,
statutory or otherwise, including without limitation warranties of
title, merchantability, fitness for a particular purpose, non
infringement, or the absence of latent or other defects, accuracy, or
the present or absence of errors, whether or not discoverable, all to
the greatest extent permissible under applicable law.
c. Affirmer disclaims responsibility for clearing rights of other persons
that may apply to the Work or any use thereof, including without
limitation any person's Copyright and Related Rights in the Work.
Further, Affirmer disclaims responsibility for obtaining any necessary
consents, permissions or other rights required for any use of the
Work.
d. Affirmer understands and acknowledges that Creative Commons is not a
party to this document and has no duty or obligation with respect to
this CC0 or use of the Work.

vendor/blake3/README.md vendored Normal file (+229)

@@ -0,0 +1,229 @@
# BLAKE3
BLAKE3 is a cryptographic hash function that is:
- **Much faster** than MD5, SHA-1, SHA-2, SHA-3, and BLAKE2.
- **Secure**, unlike MD5 and SHA-1. And secure against length extension,
unlike SHA-2.
- **Highly parallelizable** across any number of threads and SIMD lanes,
because it's a Merkle tree on the inside.
- Capable of **verified streaming** and **incremental updates**, again
because it's a Merkle tree.
- A **PRF**, **MAC**, **KDF**, and **XOF**, as well as a regular hash.
- **One algorithm with no variants**, which is fast on x86-64 and also
on smaller architectures.
The [chart below](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/benchmarks/bar_chart.py)
is an example benchmark of 16 KiB inputs on a Cascade Lake-SP 8275CL server CPU
from 2019. For more detailed benchmarks, see the
[BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
<p align="center">
<img src="media/speed.svg" alt="performance graph">
</p>
BLAKE3 is based on an optimized instance of the established hash
function [BLAKE2](https://blake2.net) and on the [original Bao tree
mode](https://github.com/oconnor663/bao/blob/master/docs/spec_0.9.1.md).
The specifications and design rationale are available in the [BLAKE3
paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
The default output size is 256 bits. The current version of
[Bao](https://github.com/oconnor663/bao) implements verified streaming
with BLAKE3.
This repository is the official implementation of BLAKE3. It includes:
* The [`blake3`](https://crates.io/crates/blake3) Rust crate, which
includes optimized implementations for SSE2, SSE4.1, AVX2, AVX-512,
NEON, and WASM, with automatic runtime CPU feature detection on x86.
The `rayon` feature provides multithreading.
* The [`b3sum`](https://crates.io/crates/b3sum) Rust crate, which
provides a command line interface. It uses multithreading by default,
making it an order of magnitude faster than e.g. `sha256sum` on
typical desktop hardware.
* The [C implementation](c), which like the Rust implementation includes SIMD
optimizations (all except WASM), CPU feature detection on x86, and optional
multithreading. See [`c/README.md`](c/README.md).
* The [Rust reference implementation](reference_impl/reference_impl.rs),
which is discussed in Section 5.1 of the [BLAKE3
paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf).
This implementation is much smaller and simpler than the optimized
ones above. If you want to see how BLAKE3 works, or you're writing a
port that doesn't need multithreading or SIMD optimizations, start
here. Ports of the reference implementation to other languages are
hosted in separate repositories
([C](https://github.com/oconnor663/blake3_reference_impl_c),
[Python](https://github.com/oconnor663/pure_python_blake3)).
* A [set of test
vectors](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json)
that covers extended outputs, all three modes, and a variety of input
lengths.
* [![Actions Status](https://github.com/BLAKE3-team/BLAKE3/workflows/tests/badge.svg)](https://github.com/BLAKE3-team/BLAKE3/actions)
BLAKE3 was designed by:
* [@oconnor663] (Jack O'Connor)
* [@sneves] (Samuel Neves)
* [@veorq] (Jean-Philippe Aumasson)
* [@zookozcash] (Zooko)
The development of BLAKE3 was sponsored by [Electric Coin Company](https://electriccoin.co).
BLAKE3 is also [specified](https://c2sp.org/BLAKE3) in the [Community
Cryptography Specification Project (C2SP)](https://c2sp.org).
*NOTE: BLAKE3 is not a password hashing algorithm, because it's
designed to be fast, whereas password hashing should not be fast. If you
hash passwords to store the hashes or if you derive keys from passwords,
we recommend [Argon2](https://github.com/P-H-C/phc-winner-argon2).*
## Usage
### The `b3sum` utility
The `b3sum` command line utility prints the BLAKE3 hashes of files or of
standard input. Prebuilt binaries are available for Linux, Windows, and
macOS (requiring the [unidentified developer
workaround](https://support.apple.com/guide/mac-help/open-a-mac-app-from-an-unidentified-developer-mh40616/mac))
on the [releases page](https://github.com/BLAKE3-team/BLAKE3/releases).
If you've [installed Rust and
Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html),
you can also build `b3sum` yourself with:
```bash
cargo install b3sum
```
If `rustup` didn't configure your `PATH` for you, you might need to go
looking for the installed binary in e.g. `~/.cargo/bin`. You can test
out how fast BLAKE3 is on your machine by creating a big file and
hashing it, for example:
```bash
# Create a 1 GB file.
head -c 1000000000 /dev/zero > /tmp/bigfile
# Hash it with SHA-256.
time openssl sha256 /tmp/bigfile
# Hash it with BLAKE3.
time b3sum /tmp/bigfile
```
### The `blake3` crate [![docs.rs](https://docs.rs/blake3/badge.svg)](https://docs.rs/blake3)
To use BLAKE3 from Rust code, add a dependency on the `blake3` crate to
your `Cargo.toml`. Here's an example of hashing some input bytes:
```rust
// Hash an input all at once.
let hash1 = blake3::hash(b"foobarbaz");
// Hash an input incrementally.
let mut hasher = blake3::Hasher::new();
hasher.update(b"foo");
hasher.update(b"bar");
hasher.update(b"baz");
let hash2 = hasher.finalize();
assert_eq!(hash1, hash2);
// Extended output. OutputReader also implements Read and Seek.
let mut output = [0; 1000];
let mut output_reader = hasher.finalize_xof();
output_reader.fill(&mut output);
assert_eq!(hash1, output[..32]);
// Print a hash as hex.
println!("{}", hash1);
```
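The optional `mmap` and `rayon` features add convenience methods for hashing files with memory mapping and multithreading. A minimal sketch, assuming both features are enabled in Cargo.toml; `update_mmap_rayon` is the crate's documented convenience method for this combination, but treat the exact signature here as an assumption:
```rust
// Assumes: blake3 = { version = "1", features = ["mmap", "rayon"] }
use std::io;
use std::path::Path;

fn hash_big_file(path: &Path) -> io::Result<blake3::Hash> {
    let mut hasher = blake3::Hasher::new();
    // Memory-map the file and hash it in parallel across cores.
    hasher.update_mmap_rayon(path)?;
    Ok(hasher.finalize())
}
```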
Besides `hash`, BLAKE3 provides two other modes, `keyed_hash` and
`derive_key`. The `keyed_hash` mode takes a 256-bit key:
```rust
// MAC an input all at once.
let example_key = [42u8; 32];
let mac1 = blake3::keyed_hash(&example_key, b"example input");
// MAC incrementally.
let mut hasher = blake3::Hasher::new_keyed(&example_key);
hasher.update(b"example input");
let mac2 = hasher.finalize();
assert_eq!(mac1, mac2);
```
The `derive_key` mode takes a context string and some key material (not a
password). The context string should be hardcoded, globally unique, and
application-specific. A good default format for the context string is
`"[application] [commit timestamp] [purpose]"`:
```rust
// Derive a couple of subkeys for different purposes.
const EMAIL_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:10:44 email key";
const API_CONTEXT: &str = "BLAKE3 example 2020-01-07 17:11:21 API key";
let input_key_material = b"usually at least 32 random bytes, not a password";
let email_key = blake3::derive_key(EMAIL_CONTEXT, input_key_material);
let api_key = blake3::derive_key(API_CONTEXT, input_key_material);
assert_ne!(email_key, api_key);
```
### The C implementation
See [`c/README.md`](c/README.md).
### Other implementations
We post links to third-party bindings and implementations on the
[@BLAKE3team Twitter account](https://twitter.com/BLAKE3team) whenever
we hear about them. Some highlights include [an optimized Go
implementation](https://github.com/zeebo/blake3), [Wasm bindings for
Node.js and browsers](https://github.com/connor4312/blake3), [binary
wheels for Python](https://github.com/oconnor663/blake3-py), [.NET
bindings](https://github.com/xoofx/Blake3.NET), and [JNI
bindings](https://github.com/sken77/BLAKE3jni).
## Contributing
Please see [CONTRIBUTING.md](CONTRIBUTING.md).
## Licenses
This work is released into the public domain with [CC0 1.0](./LICENSE_CC0).
Alternatively, it is licensed under any of the following:
* [Apache 2.0](./LICENSE_A2)
* [Apache 2.0 with LLVM exceptions](./LICENSE_A2LLVM)
## Adoption & deployment
* [Bazel](https://github.com/bazelbuild/bazel/releases/tag/6.4.0)
* [Cargo](https://github.com/rust-lang/cargo/pull/14137)
* [Ccache](https://github.com/ccache/ccache/pull/519)
* [Chia](https://github.com/Chia-Network/chia-blockchain/blob/main/CHANGELOG.md#10beta8-aka-beta-18---2020-07-16)
* [Clickhouse](https://github.com/ClickHouse/ClickHouse/blob/master/rust/chcache/Cargo.toml#L7)
* [Farcaster](https://www.farcaster.xyz/)
* [IPFS](https://github.com/ipfs/go-verifcid/issues/13)
* [Iroh](https://www.iroh.computer/blog/blake3-hazmat-api)
* [LLVM](https://reviews.llvm.org/D121510)
* [Nix](https://github.com/NixOS/nix/pull/12379)
* [Nym](https://github.com/nymtech/nym/blob/59056a22c5e6b01a38da2124662bd1fa3c8abef2/common/nymsphinx/params/src/lib.rs#L5)
* [OpenZFS](https://github.com/openzfs/zfs/)
* [Redox](https://www.redox-os.org/news/pkgar-introduction/)
* [Solana](https://docs.rs/solana-program/1.9.5/solana_program/blake3/index.html)
* [Tekken 8](https://x.com/rodarmor/status/1751567502050771189)
* [Wasmer](https://github.com/wasmerio/wasmer/blob/4f935a8c162bf604df223003e434e4f7ca253688/lib/cache/src/hash.rs#L21)
## Miscellany
- [@veorq] and [@oconnor663] did [an interview with Cryptography FM](https://www.cryptography.fm/3).
- [@oconnor663] did [an interview with Saito](https://www.youtube.com/watch?v=cJkmIt7yN_E).
[@oconnor663]: https://github.com/oconnor663
[@sneves]: https://github.com/sneves
[@veorq]: https://github.com/veorq
[@zookozcash]: https://github.com/zookozcash

623
vendor/blake3/benches/bench.rs vendored Normal file
View File

@@ -0,0 +1,623 @@
#![feature(test)]
extern crate test;
use arrayref::array_ref;
use arrayvec::ArrayVec;
use blake3::platform::{Platform, MAX_SIMD_DEGREE};
use blake3::OUT_LEN;
use blake3::{BLOCK_LEN, CHUNK_LEN};
use rand::prelude::*;
use test::Bencher;
const KIB: usize = 1024;
// This struct randomizes two things:
// 1. The actual bytes of input.
// 2. The page offset the input starts at.
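// Randomizing the page offset prevents the benchmark from repeatedly
// measuring a single fixed memory alignment, which could otherwise skew
// the SIMD results.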
pub struct RandomInput {
buf: Vec<u8>,
len: usize,
offsets: Vec<usize>,
offset_index: usize,
}
impl RandomInput {
pub fn new(b: &mut Bencher, len: usize) -> Self {
b.bytes += len as u64;
let page_size: usize = page_size::get();
let mut buf = vec![0u8; len + page_size];
let mut rng = rand::rng();
rng.fill_bytes(&mut buf);
let mut offsets: Vec<usize> = (0..page_size).collect();
offsets.shuffle(&mut rng);
Self {
buf,
len,
offsets,
offset_index: 0,
}
}
pub fn get(&mut self) -> &[u8] {
let offset = self.offsets[self.offset_index];
self.offset_index += 1;
if self.offset_index >= self.offsets.len() {
self.offset_index = 0;
}
&self.buf[offset..][..self.len]
}
}
fn bench_single_compression_fn(b: &mut Bencher, platform: Platform) {
let mut state = [1u32; 8];
let mut r = RandomInput::new(b, 64);
let input = array_ref!(r.get(), 0, 64);
b.iter(|| platform.compress_in_place(&mut state, input, 64 as u8, 0, 0));
}
#[bench]
fn bench_single_compression_portable(b: &mut Bencher) {
bench_single_compression_fn(b, Platform::portable());
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_single_compression_sse2(b: &mut Bencher) {
if let Some(platform) = Platform::sse2() {
bench_single_compression_fn(b, platform);
}
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_single_compression_sse41(b: &mut Bencher) {
if let Some(platform) = Platform::sse41() {
bench_single_compression_fn(b, platform);
}
}
#[bench]
#[cfg(blake3_avx512_ffi)]
fn bench_single_compression_avx512(b: &mut Bencher) {
if let Some(platform) = Platform::avx512() {
bench_single_compression_fn(b, platform);
}
}
fn bench_many_chunks_fn(b: &mut Bencher, platform: Platform) {
let degree = platform.simd_degree();
let mut inputs = Vec::new();
for _ in 0..degree {
inputs.push(RandomInput::new(b, CHUNK_LEN));
}
b.iter(|| {
let input_arrays: ArrayVec<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE> = inputs
.iter_mut()
.take(degree)
.map(|i| array_ref!(i.get(), 0, CHUNK_LEN))
.collect();
let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
platform.hash_many(
&input_arrays[..],
&[0; 8],
0,
blake3::IncrementCounter::Yes,
0,
0,
0,
&mut out,
);
});
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_sse2(b: &mut Bencher) {
if let Some(platform) = Platform::sse2() {
bench_many_chunks_fn(b, platform);
}
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_sse41(b: &mut Bencher) {
if let Some(platform) = Platform::sse41() {
bench_many_chunks_fn(b, platform);
}
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_avx2(b: &mut Bencher) {
if let Some(platform) = Platform::avx2() {
bench_many_chunks_fn(b, platform);
}
}
#[bench]
#[cfg(blake3_avx512_ffi)]
fn bench_many_chunks_avx512(b: &mut Bencher) {
if let Some(platform) = Platform::avx512() {
bench_many_chunks_fn(b, platform);
}
}
#[bench]
#[cfg(blake3_neon)]
fn bench_many_chunks_neon(b: &mut Bencher) {
bench_many_chunks_fn(b, Platform::neon().unwrap());
}
#[bench]
#[cfg(blake3_wasm32_simd)]
fn bench_many_chunks_wasm(b: &mut Bencher) {
bench_many_chunks_fn(b, Platform::wasm32_simd().unwrap());
}
// TODO: When we get const generics we can unify this with the chunks code.
fn bench_many_parents_fn(b: &mut Bencher, platform: Platform) {
let degree = platform.simd_degree();
let mut inputs = Vec::new();
for _ in 0..degree {
inputs.push(RandomInput::new(b, BLOCK_LEN));
}
b.iter(|| {
let input_arrays: ArrayVec<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE> = inputs
.iter_mut()
.take(degree)
.map(|i| array_ref!(i.get(), 0, BLOCK_LEN))
.collect();
let mut out = [0; MAX_SIMD_DEGREE * OUT_LEN];
platform.hash_many(
&input_arrays[..],
&[0; 8],
0,
blake3::IncrementCounter::No,
0,
0,
0,
&mut out,
);
});
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_sse2(b: &mut Bencher) {
if let Some(platform) = Platform::sse2() {
bench_many_parents_fn(b, platform);
}
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_sse41(b: &mut Bencher) {
if let Some(platform) = Platform::sse41() {
bench_many_parents_fn(b, platform);
}
}
#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_avx2(b: &mut Bencher) {
if let Some(platform) = Platform::avx2() {
bench_many_parents_fn(b, platform);
}
}
#[bench]
#[cfg(blake3_avx512_ffi)]
fn bench_many_parents_avx512(b: &mut Bencher) {
if let Some(platform) = Platform::avx512() {
bench_many_parents_fn(b, platform);
}
}
#[bench]
#[cfg(blake3_neon)]
fn bench_many_parents_neon(b: &mut Bencher) {
bench_many_parents_fn(b, Platform::neon().unwrap());
}
#[bench]
#[cfg(blake3_wasm32_simd)]
fn bench_many_parents_wasm(b: &mut Bencher) {
bench_many_parents_fn(b, Platform::wasm32_simd().unwrap());
}
fn bench_atonce(b: &mut Bencher, len: usize) {
let mut input = RandomInput::new(b, len);
b.iter(|| blake3::hash(input.get()));
}
#[bench]
fn bench_atonce_0001_block(b: &mut Bencher) {
bench_atonce(b, BLOCK_LEN);
}
#[bench]
fn bench_atonce_0001_kib(b: &mut Bencher) {
bench_atonce(b, 1 * KIB);
}
#[bench]
fn bench_atonce_0002_kib(b: &mut Bencher) {
bench_atonce(b, 2 * KIB);
}
#[bench]
fn bench_atonce_0004_kib(b: &mut Bencher) {
bench_atonce(b, 4 * KIB);
}
#[bench]
fn bench_atonce_0008_kib(b: &mut Bencher) {
bench_atonce(b, 8 * KIB);
}
#[bench]
fn bench_atonce_0016_kib(b: &mut Bencher) {
bench_atonce(b, 16 * KIB);
}
#[bench]
fn bench_atonce_0032_kib(b: &mut Bencher) {
bench_atonce(b, 32 * KIB);
}
#[bench]
fn bench_atonce_0064_kib(b: &mut Bencher) {
bench_atonce(b, 64 * KIB);
}
#[bench]
fn bench_atonce_0128_kib(b: &mut Bencher) {
bench_atonce(b, 128 * KIB);
}
#[bench]
fn bench_atonce_0256_kib(b: &mut Bencher) {
bench_atonce(b, 256 * KIB);
}
#[bench]
fn bench_atonce_0512_kib(b: &mut Bencher) {
bench_atonce(b, 512 * KIB);
}
#[bench]
fn bench_atonce_1024_kib(b: &mut Bencher) {
bench_atonce(b, 1024 * KIB);
}
fn bench_incremental(b: &mut Bencher, len: usize) {
let mut input = RandomInput::new(b, len);
b.iter(|| blake3::Hasher::new().update(input.get()).finalize());
}
#[bench]
fn bench_incremental_0001_block(b: &mut Bencher) {
bench_incremental(b, BLOCK_LEN);
}
#[bench]
fn bench_incremental_0001_kib(b: &mut Bencher) {
bench_incremental(b, 1 * KIB);
}
#[bench]
fn bench_incremental_0002_kib(b: &mut Bencher) {
bench_incremental(b, 2 * KIB);
}
#[bench]
fn bench_incremental_0004_kib(b: &mut Bencher) {
bench_incremental(b, 4 * KIB);
}
#[bench]
fn bench_incremental_0008_kib(b: &mut Bencher) {
bench_incremental(b, 8 * KIB);
}
#[bench]
fn bench_incremental_0016_kib(b: &mut Bencher) {
bench_incremental(b, 16 * KIB);
}
#[bench]
fn bench_incremental_0032_kib(b: &mut Bencher) {
bench_incremental(b, 32 * KIB);
}
#[bench]
fn bench_incremental_0064_kib(b: &mut Bencher) {
bench_incremental(b, 64 * KIB);
}
#[bench]
fn bench_incremental_0128_kib(b: &mut Bencher) {
bench_incremental(b, 128 * KIB);
}
#[bench]
fn bench_incremental_0256_kib(b: &mut Bencher) {
bench_incremental(b, 256 * KIB);
}
#[bench]
fn bench_incremental_0512_kib(b: &mut Bencher) {
bench_incremental(b, 512 * KIB);
}
#[bench]
fn bench_incremental_1024_kib(b: &mut Bencher) {
bench_incremental(b, 1024 * KIB);
}
fn bench_reference(b: &mut Bencher, len: usize) {
let mut input = RandomInput::new(b, len);
b.iter(|| {
let mut hasher = reference_impl::Hasher::new();
hasher.update(input.get());
let mut out = [0; 32];
hasher.finalize(&mut out);
out
});
}
#[bench]
fn bench_reference_0001_block(b: &mut Bencher) {
bench_reference(b, BLOCK_LEN);
}
#[bench]
fn bench_reference_0001_kib(b: &mut Bencher) {
bench_reference(b, 1 * KIB);
}
#[bench]
fn bench_reference_0002_kib(b: &mut Bencher) {
bench_reference(b, 2 * KIB);
}
#[bench]
fn bench_reference_0004_kib(b: &mut Bencher) {
bench_reference(b, 4 * KIB);
}
#[bench]
fn bench_reference_0008_kib(b: &mut Bencher) {
bench_reference(b, 8 * KIB);
}
#[bench]
fn bench_reference_0016_kib(b: &mut Bencher) {
bench_reference(b, 16 * KIB);
}
#[bench]
fn bench_reference_0032_kib(b: &mut Bencher) {
bench_reference(b, 32 * KIB);
}
#[bench]
fn bench_reference_0064_kib(b: &mut Bencher) {
bench_reference(b, 64 * KIB);
}
#[bench]
fn bench_reference_0128_kib(b: &mut Bencher) {
bench_reference(b, 128 * KIB);
}
#[bench]
fn bench_reference_0256_kib(b: &mut Bencher) {
bench_reference(b, 256 * KIB);
}
#[bench]
fn bench_reference_0512_kib(b: &mut Bencher) {
bench_reference(b, 512 * KIB);
}
#[bench]
fn bench_reference_1024_kib(b: &mut Bencher) {
bench_reference(b, 1024 * KIB);
}
#[cfg(feature = "rayon")]
fn bench_rayon(b: &mut Bencher, len: usize) {
let mut input = RandomInput::new(b, len);
b.iter(|| blake3::Hasher::new().update_rayon(input.get()).finalize());
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_0001_block(b: &mut Bencher) {
bench_rayon(b, BLOCK_LEN);
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_0001_kib(b: &mut Bencher) {
bench_rayon(b, 1 * KIB);
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_0002_kib(b: &mut Bencher) {
bench_rayon(b, 2 * KIB);
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_0004_kib(b: &mut Bencher) {
bench_rayon(b, 4 * KIB);
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_0008_kib(b: &mut Bencher) {
bench_rayon(b, 8 * KIB);
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_0016_kib(b: &mut Bencher) {
bench_rayon(b, 16 * KIB);
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_0032_kib(b: &mut Bencher) {
bench_rayon(b, 32 * KIB);
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_0064_kib(b: &mut Bencher) {
bench_rayon(b, 64 * KIB);
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_0128_kib(b: &mut Bencher) {
bench_rayon(b, 128 * KIB);
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_0256_kib(b: &mut Bencher) {
bench_rayon(b, 256 * KIB);
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_0512_kib(b: &mut Bencher) {
bench_rayon(b, 512 * KIB);
}
#[bench]
#[cfg(feature = "rayon")]
fn bench_rayon_1024_kib(b: &mut Bencher) {
bench_rayon(b, 1024 * KIB);
}
// This checks that update() splits up its input in increasing powers of 2, so
// that it can recover a high degree of parallelism when the number of bytes
// hashed so far is uneven. The performance of this benchmark should be
// reasonably close to bench_incremental_0064_kib (at least 80% of it or so). When we
// had a bug in this logic (https://github.com/BLAKE3-team/BLAKE3/issues/69),
// performance was less than half.
#[bench]
fn bench_two_updates(b: &mut Bencher) {
let len = 65536;
let mut input = RandomInput::new(b, len);
b.iter(|| {
let mut hasher = blake3::Hasher::new();
let input = input.get();
hasher.update(&input[..1]);
hasher.update(&input[1..]);
hasher.finalize()
});
}
fn bench_xof(b: &mut Bencher, len: usize) {
b.bytes = len as u64;
let mut output = [0u8; 64 * BLOCK_LEN];
let output_slice = &mut output[..len];
let mut xof = blake3::Hasher::new().finalize_xof();
b.iter(|| xof.fill(output_slice));
}
#[bench]
fn bench_xof_01_block(b: &mut Bencher) {
bench_xof(b, 1 * BLOCK_LEN);
}
#[bench]
fn bench_xof_02_blocks(b: &mut Bencher) {
bench_xof(b, 2 * BLOCK_LEN);
}
#[bench]
fn bench_xof_03_blocks(b: &mut Bencher) {
bench_xof(b, 3 * BLOCK_LEN);
}
#[bench]
fn bench_xof_04_blocks(b: &mut Bencher) {
bench_xof(b, 4 * BLOCK_LEN);
}
#[bench]
fn bench_xof_05_blocks(b: &mut Bencher) {
bench_xof(b, 5 * BLOCK_LEN);
}
#[bench]
fn bench_xof_06_blocks(b: &mut Bencher) {
bench_xof(b, 6 * BLOCK_LEN);
}
#[bench]
fn bench_xof_07_blocks(b: &mut Bencher) {
bench_xof(b, 7 * BLOCK_LEN);
}
#[bench]
fn bench_xof_08_blocks(b: &mut Bencher) {
bench_xof(b, 8 * BLOCK_LEN);
}
#[bench]
fn bench_xof_09_blocks(b: &mut Bencher) {
bench_xof(b, 9 * BLOCK_LEN);
}
#[bench]
fn bench_xof_10_blocks(b: &mut Bencher) {
bench_xof(b, 10 * BLOCK_LEN);
}
#[bench]
fn bench_xof_11_blocks(b: &mut Bencher) {
bench_xof(b, 11 * BLOCK_LEN);
}
#[bench]
fn bench_xof_12_blocks(b: &mut Bencher) {
bench_xof(b, 12 * BLOCK_LEN);
}
#[bench]
fn bench_xof_13_blocks(b: &mut Bencher) {
bench_xof(b, 13 * BLOCK_LEN);
}
#[bench]
fn bench_xof_14_blocks(b: &mut Bencher) {
bench_xof(b, 14 * BLOCK_LEN);
}
#[bench]
fn bench_xof_15_blocks(b: &mut Bencher) {
bench_xof(b, 15 * BLOCK_LEN);
}
#[bench]
fn bench_xof_16_blocks(b: &mut Bencher) {
bench_xof(b, 16 * BLOCK_LEN);
}
#[bench]
fn bench_xof_32_blocks(b: &mut Bencher) {
bench_xof(b, 32 * BLOCK_LEN);
}
#[bench]
fn bench_xof_64_blocks(b: &mut Bencher) {
bench_xof(b, 64 * BLOCK_LEN);
}

389
vendor/blake3/build.rs vendored Normal file
View File

@@ -0,0 +1,389 @@
use std::env;
fn defined(var: &str) -> bool {
println!("cargo:rerun-if-env-changed={}", var);
env::var_os(var).is_some()
}
fn is_pure() -> bool {
defined("CARGO_FEATURE_PURE")
}
fn should_prefer_intrinsics() -> bool {
defined("CARGO_FEATURE_PREFER_INTRINSICS")
}
fn is_neon() -> bool {
defined("CARGO_FEATURE_NEON")
}
fn is_no_neon() -> bool {
defined("CARGO_FEATURE_NO_NEON")
}
fn is_wasm32_simd() -> bool {
defined("CARGO_FEATURE_WASM32_SIMD")
}
fn is_ci() -> bool {
defined("BLAKE3_CI")
}
fn warn(warning: &str) {
assert!(!warning.contains("\n"));
println!("cargo:warning={}", warning);
if is_ci() {
println!("cargo:warning=Warnings in CI are treated as errors. Build failed.");
std::process::exit(1);
}
}
fn target_components() -> Vec<String> {
let target = env::var("TARGET").unwrap();
target.split("-").map(|s| s.to_string()).collect()
}
fn is_x86_64() -> bool {
target_components()[0] == "x86_64"
}
fn is_windows_target() -> bool {
env::var("CARGO_CFG_TARGET_OS").unwrap() == "windows"
}
fn use_msvc_asm() -> bool {
const MSVC_NAMES: &[&str] = &["", "cl", "cl.exe"];
let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
let target_env = env::var("CARGO_CFG_TARGET_ENV").unwrap_or_default();
let target_windows_msvc = target_os == "windows" && target_env == "msvc";
let host_triple = env::var("HOST").unwrap_or_default();
let target_triple = env::var("TARGET").unwrap_or_default();
let cross_compiling = host_triple != target_triple;
let cc = env::var("CC").unwrap_or_default().to_ascii_lowercase();
if !target_windows_msvc {
// We are not building for Windows with the MSVC toolchain.
false
} else if !cross_compiling && MSVC_NAMES.contains(&&*cc) {
// We are building on Windows with the MSVC toolchain (and not cross-compiling for another architecture or target).
true
} else {
// We are cross-compiling to Windows with the MSVC toolchain.
let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default();
let target_vendor = env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default();
let cc = env::var(format!("CC_{target_arch}_{target_vendor}_windows_msvc"))
.unwrap_or_default()
.to_ascii_lowercase();
// Check if we are using the MSVC compiler.
MSVC_NAMES.contains(&&*cc)
}
}
fn is_x86_32() -> bool {
let arch = &target_components()[0];
arch == "i386" || arch == "i586" || arch == "i686"
}
fn is_arm() -> bool {
is_armv7() || is_aarch64() || target_components()[0] == "arm"
}
fn is_aarch64() -> bool {
target_components()[0] == "aarch64"
}
fn is_armv7() -> bool {
target_components()[0] == "armv7"
}
fn is_wasm32() -> bool {
target_components()[0] == "wasm32"
}
fn endianness() -> String {
let endianness = env::var("CARGO_CFG_TARGET_ENDIAN").unwrap();
assert!(endianness == "little" || endianness == "big");
endianness
}
fn is_little_endian() -> bool {
endianness() == "little"
}
fn is_big_endian() -> bool {
endianness() == "big"
}
// Windows targets may be using the MSVC toolchain or the MinGW toolchain. The
// right compiler flags to use depend on the toolchain. (And we don't want to
// use flag_if_supported, because we don't want features to be silently
// disabled by old compilers.)
fn is_windows_msvc() -> bool {
// Some targets are only two components long, so check in steps.
let second_component = &target_components()[1];
(second_component == "pc" || second_component == "win7")
&& target_components()[2] == "windows"
&& target_components()[3] == "msvc"
}
// MinGW toolchain uses 2 different targets depending on the main compiler.
// Target for a general MinGW toolchain ends with `-gnu` (GCC is used as C
// compiler). Target for a LLVM-MinGW toolchain (Clang is used as C compiler)
// ends with `-gnullvm`.
fn is_windows_gnu() -> bool {
// Some targets are only two components long, so check in steps.
let second_component = &target_components()[1];
(second_component == "pc" || second_component == "win7")
&& target_components()[2] == "windows"
&& target_components()[3] != "msvc"
}
fn new_build() -> cc::Build {
let mut build = cc::Build::new();
if !is_windows_msvc() {
build.flag("-std=c11");
}
// Do NOT trigger a rebuild any time the env changes (e.g. $PATH).
// This prevents all downstream crates from being rebuilt when `cargo check`
// or `cargo build` are run in different environments, like Rust Analyzer
// vs. in the terminal vs. in a Git pre-commit hook.
build.emit_rerun_if_env_changed(false);
build
}
#[derive(PartialEq)]
enum CCompilerSupport {
NoCompiler,
NoAVX512,
YesAVX512,
}
use CCompilerSupport::*;
fn c_compiler_support() -> CCompilerSupport {
let build = new_build();
let flags_checked;
let support_result: Result<bool, _> = if is_windows_msvc() {
flags_checked = "/arch:AVX512";
build.is_flag_supported("/arch:AVX512")
} else {
// Check for both of the flags we use. If -mavx512f works, then -mavx512vl
// will probably always work too, but we might as well be thorough.
flags_checked = "-mavx512f and -mavx512vl";
match build.is_flag_supported("-mavx512f") {
Ok(true) => build.is_flag_supported("-mavx512vl"),
false_or_error => false_or_error,
}
};
match support_result {
Ok(true) => YesAVX512,
Ok(false) => {
warn(&format!(
"The C compiler {:?} does not support {}.",
build.get_compiler().path(),
flags_checked,
));
NoAVX512
}
Err(e) => {
println!("{:?}", e);
warn(&format!(
"No C compiler {:?} detected.",
build.get_compiler().path()
));
NoCompiler
}
}
}
fn build_sse2_sse41_avx2_rust_intrinsics() {
// No C code to compile here. Set the cfg flags that enable the Rust SSE2,
// SSE4.1, and AVX2 intrinsics modules. The regular Cargo build will compile
// them.
println!("cargo:rustc-cfg=blake3_sse2_rust");
println!("cargo:rustc-cfg=blake3_sse41_rust");
println!("cargo:rustc-cfg=blake3_avx2_rust");
}
fn build_sse2_sse41_avx2_assembly() {
// Build the assembly implementations for SSE4.1 and AVX2. This is
// preferred, but it only supports x86_64.
assert!(is_x86_64());
println!("cargo:rustc-cfg=blake3_sse2_ffi");
println!("cargo:rustc-cfg=blake3_sse41_ffi");
println!("cargo:rustc-cfg=blake3_avx2_ffi");
let mut build = new_build();
if is_windows_target() {
if use_msvc_asm() {
build.file("c/blake3_sse2_x86-64_windows_msvc.asm");
build.file("c/blake3_sse41_x86-64_windows_msvc.asm");
build.file("c/blake3_avx2_x86-64_windows_msvc.asm");
} else {
build.file("c/blake3_sse2_x86-64_windows_gnu.S");
build.file("c/blake3_sse41_x86-64_windows_gnu.S");
build.file("c/blake3_avx2_x86-64_windows_gnu.S");
}
} else {
// All non-Windows targets are assumed to support Linux-style
// assembly. These files also contain a small explicit workaround
// for macOS.
build.file("c/blake3_sse2_x86-64_unix.S");
build.file("c/blake3_sse41_x86-64_unix.S");
build.file("c/blake3_avx2_x86-64_unix.S");
}
build.compile("blake3_sse2_sse41_avx2_assembly");
}
fn build_avx512_c_intrinsics() {
// This is required on 32-bit x86 targets, since the assembly
// implementation doesn't support those.
println!("cargo:rustc-cfg=blake3_avx512_ffi");
let mut build = new_build();
build.file("c/blake3_avx512.c");
if is_windows_msvc() {
build.flag("/arch:AVX512");
} else {
build.flag("-mavx512f");
build.flag("-mavx512vl");
}
if is_windows_gnu() {
// Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782.
build.flag("-fno-asynchronous-unwind-tables");
}
build.compile("blake3_avx512_intrinsics");
}
fn build_avx512_assembly() {
// Build the assembly implementation for AVX-512. This is preferred, but it
// only supports x86_64.
assert!(is_x86_64());
println!("cargo:rustc-cfg=blake3_avx512_ffi");
let mut build = new_build();
let mut is_msvc = false;
if is_windows_target() {
if use_msvc_asm() {
build.file("c/blake3_avx512_x86-64_windows_msvc.asm");
is_msvc = true;
} else {
build.file("c/blake3_avx512_x86-64_windows_gnu.S");
}
} else {
build.file("c/blake3_avx512_x86-64_unix.S");
}
// Older versions of Clang require these flags, even for assembly. See
// https://github.com/BLAKE3-team/BLAKE3/issues/79.
if !is_msvc {
build.flag("-mavx512f");
build.flag("-mavx512vl");
}
build.compile("blake3_avx512_assembly");
}
fn build_neon_c_intrinsics() {
let mut build = new_build();
// Note that blake3_neon.c normally depends on the blake3_portable.c
// for the single-instance compression function, but we expose
// portable.rs over FFI instead. See ffi_neon.rs.
build.file("c/blake3_neon.c");
// ARMv7 platforms that support NEON generally need the following
// flags. AArch64 supports NEON by default and does not support -mfpu.
if is_armv7() {
build.flag("-mfpu=neon-vfpv4");
build.flag("-mfloat-abi=hard");
}
build.compile("blake3_neon");
}
fn build_wasm32_simd() {
assert!(is_wasm32());
// No C code to compile here. Set the cfg flags that enable the Wasm SIMD.
// The regular Cargo build will compile it.
println!("cargo:rustc-cfg=blake3_wasm32_simd");
}
fn main() -> Result<(), Box<dyn std::error::Error>> {
// As of Rust 1.80, unrecognized config names are warnings. Give Cargo all of our config names.
let all_cfgs = [
"blake3_sse2_ffi",
"blake3_sse2_rust",
"blake3_sse41_ffi",
"blake3_sse41_rust",
"blake3_avx2_ffi",
"blake3_avx2_rust",
"blake3_avx512_ffi",
"blake3_neon",
"blake3_wasm32_simd",
];
for cfg_name in all_cfgs {
// TODO: Switch this whole file to the new :: syntax when our MSRV reaches 1.77.
// https://doc.rust-lang.org/cargo/reference/build-scripts.html#outputs-of-the-build-script
println!("cargo:rustc-check-cfg=cfg({cfg_name}, values(none()))");
}
if is_pure() && is_neon() {
panic!("It doesn't make sense to enable both \"pure\" and \"neon\".");
}
if is_no_neon() && is_neon() {
panic!("It doesn't make sense to enable both \"no_neon\" and \"neon\".");
}
if is_x86_64() || is_x86_32() {
let support = c_compiler_support();
if is_x86_32() || should_prefer_intrinsics() || is_pure() || support == NoCompiler {
build_sse2_sse41_avx2_rust_intrinsics();
} else {
// We assume that all C compilers can assemble SSE4.1 and AVX2. We
// don't explicitly check for support.
build_sse2_sse41_avx2_assembly();
}
if is_pure() || support == NoCompiler || support == NoAVX512 {
// The binary will not include any AVX-512 code.
} else if is_x86_32() || should_prefer_intrinsics() {
build_avx512_c_intrinsics();
} else {
build_avx512_assembly();
}
}
if is_neon() && is_big_endian() {
panic!("The NEON implementation doesn't support big-endian ARM.")
}
if (is_arm() && is_neon())
|| (!is_no_neon() && !is_pure() && is_aarch64() && is_little_endian())
{
println!("cargo:rustc-cfg=blake3_neon");
build_neon_c_intrinsics();
}
if is_wasm32() && is_wasm32_simd() {
build_wasm32_simd();
}
// The `cc` crate doesn't automatically emit rerun-if directives for the
// environment variables it supports, in particular for $CC. We expect to
// do a lot of benchmarking across different compilers, so we explicitly
// add the variables that we're likely to need.
println!("cargo:rerun-if-env-changed=CC");
println!("cargo:rerun-if-env-changed=CFLAGS");
// Ditto for source files, though these shouldn't change as often.
for file in std::fs::read_dir("c")? {
println!(
"cargo:rerun-if-changed={}",
file?.path().to_str().expect("utf-8")
);
}
// When compiling with clang-cl for Windows, it adds .asm files to the root
// which we need to delete so cargo doesn't get angry.
if is_windows_target() && !use_msvc_asm() {
let _ = std::fs::remove_file("blake3_avx2_x86-64_windows_gnu.asm");
let _ = std::fs::remove_file("blake3_avx512_x86-64_windows_gnu.asm");
let _ = std::fs::remove_file("blake3_sse2_x86-64_windows_gnu.asm");
let _ = std::fs::remove_file("blake3_sse41_x86-64_windows_gnu.asm");
}
Ok(())
}

383
vendor/blake3/c/CMakeLists.txt vendored Normal file
View File

@@ -0,0 +1,383 @@
cmake_minimum_required(VERSION 3.9 FATAL_ERROR)
# respect C_EXTENSIONS OFF without explicitly setting C_STANDARD
if (POLICY CMP0128)
cmake_policy(SET CMP0128 NEW)
endif()
# mark_as_advanced does not implicitly create UNINITIALIZED cache entries
if (POLICY CMP0102)
cmake_policy(SET CMP0102 NEW)
endif()
project(libblake3
VERSION 1.8.2
DESCRIPTION "BLAKE3 C implementation"
LANGUAGES C CXX ASM
)
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
option(BLAKE3_USE_TBB "Enable oneTBB parallelism" OFF)
option(BLAKE3_FETCH_TBB "Allow fetching oneTBB from GitHub if not found on system" OFF)
include(CTest)
include(FeatureSummary)
include(GNUInstallDirs)
add_subdirectory(dependencies)
# architecture lists for which to enable assembly / SIMD sources
set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64)
set(BLAKE3_X86_NAMES i686 x86 X86)
set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a)
# default SIMD compiler flag configuration (can be overridden by toolchains or CLI)
if(MSVC)
set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2")
# MSVC has no dedicated sse4.1 flag (see https://learn.microsoft.com/en-us/cpp/build/reference/arch-x86?view=msvc-170)
set(BLAKE3_CFLAGS_SSE4.1 "/arch:AVX" CACHE STRING "the compiler flags to enable SSE4.1")
set(BLAKE3_CFLAGS_AVX2 "/arch:AVX2" CACHE STRING "the compiler flags to enable AVX2")
set(BLAKE3_CFLAGS_AVX512 "/arch:AVX512" CACHE STRING "the compiler flags to enable AVX512")
set(BLAKE3_AMD64_ASM_SOURCES
blake3_avx2_x86-64_windows_msvc.asm
blake3_avx512_x86-64_windows_msvc.asm
blake3_sse2_x86-64_windows_msvc.asm
blake3_sse41_x86-64_windows_msvc.asm
)
elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU"
OR CMAKE_C_COMPILER_ID STREQUAL "Clang"
OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang")
set(BLAKE3_CFLAGS_SSE2 "-msse2" CACHE STRING "the compiler flags to enable SSE2")
set(BLAKE3_CFLAGS_SSE4.1 "-msse4.1" CACHE STRING "the compiler flags to enable SSE4.1")
set(BLAKE3_CFLAGS_AVX2 "-mavx2" CACHE STRING "the compiler flags to enable AVX2")
set(BLAKE3_CFLAGS_AVX512 "-mavx512f -mavx512vl" CACHE STRING "the compiler flags to enable AVX512")
if (WIN32 OR CYGWIN)
set(BLAKE3_AMD64_ASM_SOURCES
blake3_avx2_x86-64_windows_gnu.S
blake3_avx512_x86-64_windows_gnu.S
blake3_sse2_x86-64_windows_gnu.S
blake3_sse41_x86-64_windows_gnu.S
)
elseif(UNIX)
set(BLAKE3_AMD64_ASM_SOURCES
blake3_avx2_x86-64_unix.S
blake3_avx512_x86-64_unix.S
blake3_sse2_x86-64_unix.S
blake3_sse41_x86-64_unix.S
)
endif()
if (CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
# 32-bit ARMv8 needs NEON to be enabled explicitly
set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON")
endif()
endif()
mark_as_advanced(BLAKE3_CFLAGS_SSE2 BLAKE3_CFLAGS_SSE4.1 BLAKE3_CFLAGS_AVX2 BLAKE3_CFLAGS_AVX512 BLAKE3_CFLAGS_NEON)
mark_as_advanced(BLAKE3_AMD64_ASM_SOURCES)
message(STATUS "BLAKE3 SIMD configuration: ${CMAKE_C_COMPILER_ARCHITECTURE_ID}")
if(MSVC AND DEFINED CMAKE_C_COMPILER_ARCHITECTURE_ID)
if(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]86")
set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the SIMD acceleration type to use")
elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Xx]64")
set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use")
elseif(CMAKE_C_COMPILER_ARCHITECTURE_ID MATCHES "[Aa][Rr][Mm]64")
set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use")
else()
set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_AMD64_NAMES)
set(BLAKE3_SIMD_TYPE "amd64-asm" CACHE STRING "the SIMD acceleration type to use")
elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_X86_NAMES
AND DEFINED BLAKE3_CFLAGS_SSE2
AND DEFINED BLAKE3_CFLAGS_SSE4.1
AND DEFINED BLAKE3_CFLAGS_AVX2
AND DEFINED BLAKE3_CFLAGS_AVX512)
set(BLAKE3_SIMD_TYPE "x86-intrinsics" CACHE STRING "the SIMD acceleration type to use")
elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
OR ANDROID_ABI STREQUAL "armeabi-v7a"
OR BLAKE3_USE_NEON_INTRINSICS)
AND (DEFINED BLAKE3_CFLAGS_NEON
OR CMAKE_SIZEOF_VOID_P EQUAL 8))
set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use")
else()
set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use")
endif()
mark_as_advanced(BLAKE3_SIMD_TYPE)
# library target
add_library(blake3
blake3.c
blake3_dispatch.c
blake3_portable.c
)
add_library(BLAKE3::blake3 ALIAS blake3)
# library configuration
set(PKG_CONFIG_CFLAGS)
if (BUILD_SHARED_LIBS)
target_compile_definitions(blake3
PUBLIC BLAKE3_DLL
PRIVATE BLAKE3_DLL_EXPORTS
)
list(APPEND PKG_CONFIG_CFLAGS -DBLAKE3_DLL)
endif()
target_include_directories(blake3 PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
)
set_target_properties(blake3 PROPERTIES
VERSION ${PROJECT_VERSION}
SOVERSION 0
C_VISIBILITY_PRESET hidden
C_EXTENSIONS OFF
)
target_compile_features(blake3 PUBLIC c_std_99)
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.12)
target_compile_features(blake3 PUBLIC cxx_std_20)
# else: add it further below through `BLAKE3_CMAKE_CXXFLAGS_*`
endif()
# ensure C_EXTENSIONS OFF is respected without overriding CMAKE_C_STANDARD
# which may be set by the user or toolchain file
if (NOT POLICY CMP0128 AND NOT DEFINED CMAKE_C_STANDARD)
set_target_properties(blake3 PROPERTIES C_STANDARD 99)
endif()
# optional SIMD sources
if(BLAKE3_SIMD_TYPE STREQUAL "amd64-asm")
if (NOT DEFINED BLAKE3_AMD64_ASM_SOURCES)
message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'amd64-asm' but no assembly sources are available for the target architecture.")
endif()
set(BLAKE3_SIMD_AMD64_ASM ON)
if(MSVC)
enable_language(ASM_MASM)
endif()
target_sources(blake3 PRIVATE ${BLAKE3_AMD64_ASM_SOURCES})
elseif(BLAKE3_SIMD_TYPE STREQUAL "x86-intrinsics")
if (NOT DEFINED BLAKE3_CFLAGS_SSE2
OR NOT DEFINED BLAKE3_CFLAGS_SSE4.1
OR NOT DEFINED BLAKE3_CFLAGS_AVX2
OR NOT DEFINED BLAKE3_CFLAGS_AVX512)
message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to 'x86-intrinsics' but no compiler flags are available for the target architecture.")
endif()
set(BLAKE3_SIMD_X86_INTRINSICS ON)
target_sources(blake3 PRIVATE
blake3_avx2.c
blake3_avx512.c
blake3_sse2.c
blake3_sse41.c
)
set_source_files_properties(blake3_avx2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX2}")
set_source_files_properties(blake3_avx512.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX512}")
set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}")
set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}")
elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics")
set(BLAKE3_SIMD_NEON_INTRINSICS ON)
target_sources(blake3 PRIVATE
blake3_neon.c
)
target_compile_definitions(blake3 PRIVATE
BLAKE3_USE_NEON=1
)
if (DEFINED BLAKE3_CFLAGS_NEON)
set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")
endif()
elseif(BLAKE3_SIMD_TYPE STREQUAL "none")
target_compile_definitions(blake3 PRIVATE
BLAKE3_USE_NEON=0
BLAKE3_NO_SSE2
BLAKE3_NO_SSE41
BLAKE3_NO_AVX2
BLAKE3_NO_AVX512
)
else()
message(FATAL_ERROR "BLAKE3_SIMD_TYPE is set to an unknown value: '${BLAKE3_SIMD_TYPE}'")
endif()
if(BLAKE3_USE_TBB)
find_package(TBB 2021.11.0 QUIET)
if(NOT TBB_FOUND AND NOT TARGET TBB::tbb)
message(WARNING
"oneTBB not found; disabling BLAKE3_USE_TBB\n"
"Enable BLAKE3_FETCH_TBB to automatically fetch and build oneTBB"
)
set(BLAKE3_USE_TBB OFF)
else()
target_sources(blake3
PRIVATE
blake3_tbb.cpp)
target_link_libraries(blake3
PUBLIC
# Make shared TBB a transitive dependency. The consuming program is technically not required
# to link TBB in order for libblake3 to function but we do this in order to prevent the
# possibility of multiple separate TBB runtimes being linked into a final program in case
# the consuming program also happens to already use TBB.
TBB::tbb)
target_compile_definitions(blake3
PUBLIC
BLAKE3_USE_TBB)
endif()
list(APPEND PKG_CONFIG_REQUIRES "tbb >= ${TBB_VERSION}")
list(APPEND PKG_CONFIG_CFLAGS -DBLAKE3_USE_TBB)
include(CheckCXXSymbolExists)
check_cxx_symbol_exists(_LIBCPP_VERSION "version" BLAKE3_HAVE_LIBCPP)
check_cxx_symbol_exists(__GLIBCXX__ "version" BLAKE3_HAVE_GLIBCXX)
if(BLAKE3_HAVE_GLIBCXX)
list(APPEND PKG_CONFIG_LIBS -lstdc++)
elseif(BLAKE3_HAVE_LIBCPP)
list(APPEND PKG_CONFIG_LIBS -lc++)
endif()
endif()
if(BLAKE3_USE_TBB)
# Define some scratch variables for building appropriate flags per compiler
if(CMAKE_VERSION VERSION_LESS 3.12)
list(APPEND BLAKE3_CXX_STANDARD_FLAGS_GNU -std=c++20)
list(APPEND BLAKE3_CXX_STANDARD_FLAGS_MSVC /std:c++20)
endif()
set(BLAKE3_CXXFLAGS_GNU "-fno-exceptions;-fno-rtti;${BLAKE3_CXX_STANDARD_FLAGS_GNU}" CACHE STRING "C++ flags used for compiling private BLAKE3 library components with GNU-like compiler frontends.")
set(BLAKE3_CXXFLAGS_MSVC "/EHs-c-;/GR-;${BLAKE3_CXX_STANDARD_FLAGS_MSVC}" CACHE STRING "C++ flags used for compiling private BLAKE3 library components with MSVC-like compiler frontends.")
# Get the C++ compiler name without extension
get_filename_component(BLAKE3_CMAKE_CXX_COMPILER_NAME "${CMAKE_CXX_COMPILER}" NAME_WE)
# Strip any trailing versioning from the C++ compiler name
string(REGEX MATCH "^(clang\\+\\+|clang-cl)" BLAKE3_CMAKE_CXX_COMPILER_NAME "${BLAKE3_CMAKE_CXX_COMPILER_NAME}")
# TODO: Simplify with CMAKE_CXX_COMPILER_FRONTEND_VARIANT once min CMake version is 3.14.
if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_GNU}>)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
if(BLAKE3_CMAKE_CXX_COMPILER_NAME STREQUAL "clang++")
target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_GNU}>)
elseif(BLAKE3_CMAKE_CXX_COMPILER_NAME STREQUAL "clang-cl")
target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_MSVC}>)
endif()
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_GNU}>)
elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
target_compile_options(blake3 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:${BLAKE3_CXXFLAGS_MSVC}>)
endif()
# Undefine scratch variables
unset(BLAKE3_CXX_STANDARD_FLAGS_GNU)
unset(BLAKE3_CXX_STANDARD_FLAGS_MSVC)
unset(BLAKE3_CMAKE_CXX_COMPILER_NAME)
unset(BLAKE3_CXXFLAGS_GNU)
unset(BLAKE3_CXXFLAGS_MSVC)
endif()
# cmake install support
install(FILES blake3.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
install(TARGETS blake3 EXPORT blake3-targets
ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}"
LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
)
install(EXPORT blake3-targets
NAMESPACE BLAKE3::
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3"
)
include(CMakePackageConfigHelpers)
configure_package_config_file(blake3-config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/blake3-config.cmake"
INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3"
)
write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/blake3-config-version.cmake"
VERSION ${libblake3_VERSION}
COMPATIBILITY SameMajorVersion
)
install(FILES
"${CMAKE_CURRENT_BINARY_DIR}/blake3-config.cmake"
"${CMAKE_CURRENT_BINARY_DIR}/blake3-config-version.cmake"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/blake3"
)
# Function for joining paths known from most languages
#
# SPDX-License-Identifier: (MIT OR CC0-1.0)
# Copyright 2020 Jan Tojnar
# https://github.com/jtojnar/cmake-snips
#
# Modelled after Pythons os.path.join
# https://docs.python.org/3.7/library/os.path.html#os.path.join
# Windows not supported
function(join_paths joined_path first_path_segment)
set(temp_path "${first_path_segment}")
foreach(current_segment IN LISTS ARGN)
if(NOT ("${current_segment}" STREQUAL ""))
if(IS_ABSOLUTE "${current_segment}")
set(temp_path "${current_segment}")
else()
set(temp_path "${temp_path}/${current_segment}")
endif()
endif()
endforeach()
set(${joined_path} "${temp_path}" PARENT_SCOPE)
endfunction()
# In-place rewrite a list and join its elements by `sep`.
#
# TODO: Replace function with list(JOIN) when updating to CMake 3.12
function(join_pkg_config_field sep requires)
set(_requires "${${requires}}") # avoid shadowing issues, e.g. "${requires}"=len
list(LENGTH "${requires}" len)
set(idx 1)
foreach(req IN LISTS _requires)
string(APPEND acc "${req}")
if(idx LESS len)
string(APPEND acc "${sep}")
endif()
math(EXPR idx "${idx} + 1")
endforeach()
set("${requires}" "${acc}" PARENT_SCOPE)
endfunction()
# pkg-config support
join_pkg_config_field(", " PKG_CONFIG_REQUIRES)
join_pkg_config_field(" " PKG_CONFIG_LIBS)
join_pkg_config_field(" " PKG_CONFIG_CFLAGS)
join_paths(PKG_CONFIG_INSTALL_LIBDIR "\${prefix}" "${CMAKE_INSTALL_LIBDIR}")
join_paths(PKG_CONFIG_INSTALL_INCLUDEDIR "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}")
configure_file(libblake3.pc.in libblake3.pc @ONLY)
install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig")
# print feature summary
# add_feature_info cannot directly use the BLAKE3_SIMD_TYPE :(
add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.")
add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.")
add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.")
add_feature_info("oneTBB parallelism" BLAKE3_USE_TBB "The library uses oneTBB parallelism.")
feature_summary(WHAT ENABLED_FEATURES)
if(BLAKE3_EXAMPLES)
include(BLAKE3/Examples)
endif()
if(BLAKE3_TESTING)
include(BLAKE3/Testing)
endif()

73
vendor/blake3/c/CMakePresets.json vendored Normal file
View File

@@ -0,0 +1,73 @@
{
"version": 3,
"cmakeMinimumRequired": {
"major": 3,
"minor": 22,
"patch": 0
},
"configurePresets": [
{
"name": "base",
"hidden": true,
"binaryDir": "${sourceDir}/build/${presetName}"
},
{
"name": "msvc",
"hidden": true,
"generator": "Visual Studio 17 2022",
"vendor": {
"microsoft.com/VisualStudioSettings/CMake/1.0": {
"hostOS": [
"Windows"
]
}
}
},
{
"name": "x64-windows-msvc",
"inherits": [
"msvc",
"base"
],
"architecture": "x64"
},
{
"name": "x86-windows-msvc",
"inherits": [
"msvc",
"base"
],
"architecture": "Win32"
},
{
"name": "arm64-windows-msvc",
"inherits": [
"msvc",
"base"
],
"architecture": "ARM64"
}
],
"buildPresets": [
{
"name": "x64-windows-msvc-debug",
"configurePreset": "x64-windows-msvc",
"configuration": "Debug"
},
{
"name": "x64-windows-msvc-release",
"configurePreset": "x64-windows-msvc",
"configuration": "RelWithDebInfo"
},
{
"name": "x86-windows-msvc-debug",
"configurePreset": "x86-windows-msvc",
"configuration": "Debug"
},
{
"name": "x86-windows-msvc-release",
"configurePreset": "x86-windows-msvc",
"configuration": "RelWithDebInfo"
}
]
}

82
vendor/blake3/c/Makefile.testing vendored Normal file
View File

@@ -0,0 +1,82 @@
# This Makefile is only for testing. C callers should follow the instructions
# in ./README.md to incorporate these C files into their existing build.
NAME=blake3
CC=gcc
CFLAGS=-O3 -Wall -Wextra -std=c11 -pedantic -fstack-protector-strong -D_FORTIFY_SOURCE=2 -fPIE -fvisibility=hidden
LDFLAGS=-pie -Wl,-z,relro,-z,now
TARGETS=
ASM_TARGETS=
EXTRAFLAGS=-Wa,--noexecstack
ifdef BLAKE3_NO_SSE2
EXTRAFLAGS += -DBLAKE3_NO_SSE2
else
TARGETS += blake3_sse2.o
ASM_TARGETS += blake3_sse2_x86-64_unix.S
endif
ifdef BLAKE3_NO_SSE41
EXTRAFLAGS += -DBLAKE3_NO_SSE41
else
TARGETS += blake3_sse41.o
ASM_TARGETS += blake3_sse41_x86-64_unix.S
endif
ifdef BLAKE3_NO_AVX2
EXTRAFLAGS += -DBLAKE3_NO_AVX2
else
TARGETS += blake3_avx2.o
ASM_TARGETS += blake3_avx2_x86-64_unix.S
endif
ifdef BLAKE3_NO_AVX512
EXTRAFLAGS += -DBLAKE3_NO_AVX512
else
TARGETS += blake3_avx512.o
ASM_TARGETS += blake3_avx512_x86-64_unix.S
endif
ifdef BLAKE3_USE_NEON
EXTRAFLAGS += -DBLAKE3_USE_NEON=1
TARGETS += blake3_neon.o
endif
ifdef BLAKE3_NO_NEON
EXTRAFLAGS += -DBLAKE3_USE_NEON=0
endif
all: blake3.c blake3_dispatch.c blake3_portable.c main.c $(TARGETS)
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS)
blake3_sse2.o: blake3_sse2.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse2
blake3_sse41.o: blake3_sse41.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -msse4.1
blake3_avx2.o: blake3_avx2.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx2
blake3_avx512.o: blake3_avx512.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@ -mavx512f -mavx512vl
blake3_neon.o: blake3_neon.c
$(CC) $(CFLAGS) $(EXTRAFLAGS) -c $^ -o $@
test: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined
test: all
./test.py
asm: blake3.c blake3_dispatch.c blake3_portable.c main.c $(ASM_TARGETS)
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $(NAME) $(LDFLAGS)
test_asm: CFLAGS += -DBLAKE3_TESTING -fsanitize=address,undefined
test_asm: asm
./test.py
example: example.c blake3.c blake3_dispatch.c blake3_portable.c $(ASM_TARGETS)
$(CC) $(CFLAGS) $(EXTRAFLAGS) $^ -o $@ $(LDFLAGS)
clean:
rm -f $(NAME) *.o

403
vendor/blake3/c/README.md vendored Normal file
View File

@@ -0,0 +1,403 @@
The official C implementation of BLAKE3.
# Example
An example program that hashes bytes from standard input and prints the
result:
```c
#include "blake3.h"
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
int main(void) {
// Initialize the hasher.
blake3_hasher hasher;
blake3_hasher_init(&hasher);
// Read input bytes from stdin.
unsigned char buf[65536];
while (1) {
ssize_t n = read(STDIN_FILENO, buf, sizeof(buf));
if (n > 0) {
blake3_hasher_update(&hasher, buf, n);
} else if (n == 0) {
break; // end of file
} else {
fprintf(stderr, "read failed: %s\n", strerror(errno));
return 1;
}
}
// Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
uint8_t output[BLAKE3_OUT_LEN];
blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
// Print the hash as hexadecimal.
for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
printf("%02x", output[i]);
}
printf("\n");
return 0;
}
```
The code above is included in this directory as `example.c`. If you're
on x86\_64 with a Unix-like OS, you can compile a working binary like
this:
```bash
gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \
blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
blake3_avx512_x86-64_unix.S
```
# API
## The Struct
```c
typedef struct {
// private fields
} blake3_hasher;
```
An incremental BLAKE3 hashing state, which can accept any number of
updates. This implementation doesn't allocate any heap memory, but
`sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes
on x86-64. This size can be reduced by restricting the maximum input
length, as described in Section 5.4 of [the BLAKE3
spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf),
but this implementation doesn't currently support that strategy.
## Common API Functions
```c
void blake3_hasher_init(
blake3_hasher *self);
```
Initialize a `blake3_hasher` in the default hashing mode.
---
```c
void blake3_hasher_update(
blake3_hasher *self,
const void *input,
size_t input_len);
```
Add input to the hasher. This can be called any number of times. This function
is always single-threaded; for multithreading see `blake3_hasher_update_tbb`
below.
---
```c
void blake3_hasher_finalize(
const blake3_hasher *self,
uint8_t *out,
size_t out_len);
```
Finalize the hasher and return an output of any length, given in bytes.
This doesn't modify the hasher itself, and it's possible to finalize
again after adding more input. The constant `BLAKE3_OUT_LEN` provides
the default output length, 32 bytes, which is recommended for most
callers. See the [Security Notes](#security-notes) below.
## Less Common API Functions
```c
void blake3_hasher_init_keyed(
blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
```
Initialize a `blake3_hasher` in the keyed hashing mode. The key must be
exactly 32 bytes.
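For example, a minimal keyed hashing (MAC) computation. The zeroed key below
is a placeholder; real callers need 32 uniformly random secret bytes:
```c
uint8_t key[BLAKE3_KEY_LEN] = {0}; // placeholder for 32 random secret bytes
blake3_hasher hasher;
blake3_hasher_init_keyed(&hasher, key);
blake3_hasher_update(&hasher, "example input", 13);
uint8_t mac[BLAKE3_OUT_LEN];
blake3_hasher_finalize(&hasher, mac, BLAKE3_OUT_LEN);
```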
---
```c
void blake3_hasher_init_derive_key(
blake3_hasher *self,
const char *context);
```
Initialize a `blake3_hasher` in the key derivation mode. The context
string is given as an initialization parameter, and afterwards input key
material should be given with `blake3_hasher_update`. The context string
is a null-terminated C string which should be **hardcoded, globally
unique, and application-specific**. The context string should not
include any dynamic input like salts, nonces, or identifiers read from a
database at runtime. A good default format for the context string is
`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com
2019-12-25 16:18:03 session tokens v1"`.
This function is intended for application code written in C. For
language bindings, see `blake3_hasher_init_derive_key_raw` below.
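As a sketch, here's key derivation using the example context string above,
assuming `key_material` and `key_material_len` point to at least 32 random
bytes of input key material:
```c
blake3_hasher hasher;
blake3_hasher_init_derive_key(&hasher,
                              "example.com 2019-12-25 16:18:03 session tokens v1");
// key_material/key_material_len are assumed to be defined by the caller.
blake3_hasher_update(&hasher, key_material, key_material_len);
uint8_t derived_key[BLAKE3_OUT_LEN];
blake3_hasher_finalize(&hasher, derived_key, BLAKE3_OUT_LEN);
```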
---
```c
void blake3_hasher_init_derive_key_raw(
blake3_hasher *self,
const void *context,
size_t context_len);
```
As `blake3_hasher_init_derive_key` above, except that the context string
is given as a pointer to an array of arbitrary bytes with a provided
length. This is intended for writing language bindings, where C string
conversion would add unnecessary overhead and new error cases. Unicode
strings should be encoded as UTF-8.
Application code in C should prefer `blake3_hasher_init_derive_key`,
which takes the context as a C string. If you need to use arbitrary
bytes as a context string in application code, consider whether you're
violating the requirement that context strings should be hardcoded.
---
```c
void blake3_hasher_update_tbb(
blake3_hasher *self,
const void *input,
size_t input_len);
```
Add input to the hasher, using [oneTBB] to process large inputs with multiple
threads. This can be called any number of times, and it gives the same result
as `blake3_hasher_update` above.
[oneTBB]: https://uxlfoundation.github.io/oneTBB/
NOTE: This function is only enabled when the library is compiled with CMake option `BLAKE3_USE_TBB`
and when the oneTBB library is detected on the host system. See the building instructions for
further details.
To get any performance benefit from multithreading, the input buffer needs to
be large. As a rule of thumb on x86_64, `blake3_hasher_update_tbb` is _slower_
than `blake3_hasher_update` for inputs under 128 KiB. That threshold varies
quite a lot across different processors, and it's important to benchmark your
specific use case.
Hashing large files with this function usually requires
[memory-mapping](https://en.wikipedia.org/wiki/Memory-mapped_file), since
reading a file into memory in a single-threaded loop takes longer than hashing
the resulting buffer. Note that hashing a memory-mapped file with this function
produces a "random" pattern of disk reads, which can be slow on spinning disks.
Again it's important to benchmark your specific use case.
This implementation doesn't require configuration of thread resources and will
use as many cores as possible by default. More fine-grained control of
resources is possible using the [oneTBB] API.
---
```c
void blake3_hasher_finalize_seek(
const blake3_hasher *self,
uint64_t seek,
uint8_t *out,
size_t out_len);
```
The same as `blake3_hasher_finalize`, but with an additional `seek`
parameter for the starting byte position in the output stream. To
efficiently stream a large output without allocating memory, call this
function in a loop, incrementing `seek` by the output length each time.
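For example, a minimal sketch that streams a 1 MiB extended output in
64-byte steps:
```c
blake3_hasher hasher;
blake3_hasher_init(&hasher);
blake3_hasher_update(&hasher, "example input", 13);
uint8_t block[64];
for (uint64_t seek = 0; seek < (1 << 20); seek += sizeof(block)) {
  blake3_hasher_finalize_seek(&hasher, seek, block, sizeof(block));
  // ...consume the 64 bytes in block...
}
```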
---
```c
void blake3_hasher_reset(
blake3_hasher *self);
```
Reset the hasher to its initial state, prior to any calls to
`blake3_hasher_update`. Currently this is no different from calling
`blake3_hasher_init` or similar again.
# Security Notes
Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit
BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2
bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional
security.
Avoid relying on the secrecy of the output offset, that is, the `seek` argument of
`blake3_hasher_finalize_seek`. [_Block-Cipher-Based Tree Hashing_ by Aldo
Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows both the message
and the key (if any) can easily determine the offset of an extended output. For comparison,
AES-CTR has a similar property: if you know the key, you can decrypt a block from an unknown
position in the output stream to recover its block index. Callers with strong secret keys
aren't affected in practice, but secret offsets are a [design
smell](https://en.wikipedia.org/wiki/Design_smell) in any case.
# Building
The easiest and most complete method of compiling this library is with CMake.
This is the method described in the next section. Toward the end of the
building section there are more in-depth notes about compiling manually and
things that are useful to understand if you need to integrate this library with
another build system.
## CMake
The minimum version of CMake is 3.9. The following invocations will compile and
install `libblake3`. With recent CMake:
```bash
cmake -S c -B c/build "-DCMAKE_INSTALL_PREFIX=/usr/local"
cmake --build c/build --target install
```
With an older CMake:
```bash
cd c
mkdir build
cd build
cmake .. "-DCMAKE_INSTALL_PREFIX=/usr/local"
cmake --build . --target install
```
The following options are available when compiling with CMake:
- `BLAKE3_USE_TBB`: Enable oneTBB parallelism (Requires a C++20 capable compiler)
- `BLAKE3_FETCH_TBB`: Allow fetching oneTBB from GitHub (only if not found on system)
- `BLAKE3_EXAMPLES`: Compile and install example programs
Options can be enabled like this:
```bash
cmake -S c -B c/build "-DCMAKE_INSTALL_PREFIX=/usr/local" -DBLAKE3_USE_TBB=1 -DBLAKE3_FETCH_TBB=1
```
## Building manually
We try to keep the build simple enough that you can compile this library "by
hand", and it's expected that many callers will integrate it with their
pre-existing build systems. See the `gcc` one-liner in the "Example" section
above.
### x86
Dynamic dispatch is enabled by default on x86. The implementation will
query the CPU at runtime to detect SIMD support, and it will use the
widest instruction set available. By default, `blake3_dispatch.c`
expects to be linked with code for five different instruction sets:
portable C, SSE2, SSE4.1, AVX2, and AVX-512.
For each of the x86 SIMD instruction sets, four versions are available:
three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one
version using C intrinsics. The assembly versions are generally
preferred. They perform better, they perform more consistently across
different compilers, and they build more quickly. On the other hand, the
assembly versions are x86\_64-only, and you need to select the right
flavor for your target platform.
Here's an example of building a shared library on x86\_64 Linux using
the assembly implementations:
```bash
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
blake3_avx512_x86-64_unix.S
```
When building the intrinsics-based implementations, you need to build
each implementation separately, with the corresponding instruction set
explicitly enabled in the compiler. Here's the same shared library using
the intrinsics-based implementations:
```bash
gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o
gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o
gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o
gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o
```
Note above that building `blake3_avx512.c` requires both `-mavx512f` and
`-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512`
flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`.
MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a
corresponding flag.
If you want to omit SIMD code entirely, you need to explicitly disable
each instruction set. Here's an example of building a shared library on
x86 with only portable code:
```bash
gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \
-DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c
```
### ARM NEON
The NEON implementation is enabled by default on AArch64, but not on
other ARM targets, since not all of them support it. To enable it, set
`BLAKE3_USE_NEON=1`. Here's an example of building a shared library on
ARM Linux with NEON support:
```bash
gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=1 blake3.c blake3_dispatch.c \
blake3_portable.c blake3_neon.c
```
To explicitly disable NEON instructions on AArch64, set
`BLAKE3_USE_NEON=0`.
```bash
gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=0 blake3.c blake3_dispatch.c \
blake3_portable.c
```
Note that on some targets (ARMv7 in particular), extra flags may be
required to activate NEON support in the compiler. If you see an error
like...
```
/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed
in call to always_inline vaddq_u32: target specific option mismatch
```
...then you may need to add something like `-mfpu=neon-vfpv4
-mfloat-abi=hard`.
### Other Platforms
The portable implementation should work on most other architectures. For
example:
```bash
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c
```
### Multithreading
Multithreading is available using [oneTBB], by compiling the optional C++
support file [`blake3_tbb.cpp`](./blake3_tbb.cpp). For an example of using
`mmap` (non-Windows) and `blake3_hasher_update_tbb` to get large-file
performance on par with [`b3sum`](../b3sum), see
[`example_tbb.c`](./example_tbb.c). You can build it like this:
```bash
g++ -c -O3 -fno-exceptions -fno-rtti -DBLAKE3_USE_TBB -o blake3_tbb.o blake3_tbb.cpp
gcc -O3 -o example_tbb -lstdc++ -ltbb -DBLAKE3_USE_TBB blake3_tbb.o example_tbb.c blake3.c \
blake3_dispatch.c blake3_portable.c blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S \
blake3_avx2_x86-64_unix.S blake3_avx512_x86-64_unix.S
```
NOTE: `-fno-exceptions` or equivalent is required to compile `blake3_tbb.cpp`,
and public API methods with external C linkage are marked `noexcept`. Compiling
that file with exceptions enabled will fail. Compiling with RTTI disabled isn't
required but is recommended for code size.

14
vendor/blake3/c/blake3-config.cmake.in vendored Normal file
View File

@@ -0,0 +1,14 @@
@PACKAGE_INIT@
include(CMakeFindDependencyMacro)
# Remember TBB option state
set(BLAKE3_USE_TBB @BLAKE3_USE_TBB@)
if(BLAKE3_USE_TBB)
find_dependency(TBB @TBB_VERSION@)
endif()
include("${CMAKE_CURRENT_LIST_DIR}/blake3-targets.cmake")
check_required_components(blake3)

650
vendor/blake3/c/blake3.c vendored Normal file
View File

@@ -0,0 +1,650 @@
#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include "blake3.h"
#include "blake3_impl.h"
const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }
INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
uint8_t flags) {
memcpy(self->cv, key, BLAKE3_KEY_LEN);
self->chunk_counter = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
self->buf_len = 0;
self->blocks_compressed = 0;
self->flags = flags;
}
INLINE void chunk_state_reset(blake3_chunk_state *self, const uint32_t key[8],
uint64_t chunk_counter) {
memcpy(self->cv, key, BLAKE3_KEY_LEN);
self->chunk_counter = chunk_counter;
self->blocks_compressed = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
self->buf_len = 0;
}
INLINE size_t chunk_state_len(const blake3_chunk_state *self) {
return (BLAKE3_BLOCK_LEN * (size_t)self->blocks_compressed) +
((size_t)self->buf_len);
}
INLINE size_t chunk_state_fill_buf(blake3_chunk_state *self,
const uint8_t *input, size_t input_len) {
size_t take = BLAKE3_BLOCK_LEN - ((size_t)self->buf_len);
if (take > input_len) {
take = input_len;
}
uint8_t *dest = self->buf + ((size_t)self->buf_len);
memcpy(dest, input, take);
self->buf_len += (uint8_t)take;
return take;
}
INLINE uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state *self) {
if (self->blocks_compressed == 0) {
return CHUNK_START;
} else {
return 0;
}
}
typedef struct {
uint32_t input_cv[8];
uint64_t counter;
uint8_t block[BLAKE3_BLOCK_LEN];
uint8_t block_len;
uint8_t flags;
} output_t;
INLINE output_t make_output(const uint32_t input_cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
output_t ret;
memcpy(ret.input_cv, input_cv, 32);
memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
ret.block_len = block_len;
ret.counter = counter;
ret.flags = flags;
return ret;
}
// Chaining values within a given chunk (specifically the compress_in_place
// interface) are represented as words. This avoids unnecessary bytes<->words
// conversion overhead in the portable implementation. However, the hash_many
// interface handles both user input and parent node blocks, so it accepts
// bytes. For that reason, chaining values in the CV stack are represented as
// bytes.
INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
uint32_t cv_words[8];
memcpy(cv_words, self->input_cv, 32);
blake3_compress_in_place(cv_words, self->block, self->block_len,
self->counter, self->flags);
store_cv_words(cv, cv_words);
}
INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
size_t out_len) {
if (out_len == 0) {
return;
}
uint64_t output_block_counter = seek / 64;
size_t offset_within_block = seek % 64;
uint8_t wide_buf[64];
if(offset_within_block) {
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
const size_t available_bytes = 64 - offset_within_block;
const size_t bytes = out_len > available_bytes ? available_bytes : out_len;
memcpy(out, wide_buf + offset_within_block, bytes);
out += bytes;
out_len -= bytes;
output_block_counter += 1;
}
if(out_len / 64) {
blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64);
}
output_block_counter += out_len / 64;
out += out_len & -64;
out_len -= out_len & -64;
if(out_len) {
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
memcpy(out, wide_buf, out_len);
}
}
INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
size_t input_len) {
if (self->buf_len > 0) {
size_t take = chunk_state_fill_buf(self, input, input_len);
input += take;
input_len -= take;
if (input_len > 0) {
blake3_compress_in_place(
self->cv, self->buf, BLAKE3_BLOCK_LEN, self->chunk_counter,
self->flags | chunk_state_maybe_start_flag(self));
self->blocks_compressed += 1;
self->buf_len = 0;
memset(self->buf, 0, BLAKE3_BLOCK_LEN);
}
}
while (input_len > BLAKE3_BLOCK_LEN) {
blake3_compress_in_place(self->cv, input, BLAKE3_BLOCK_LEN,
self->chunk_counter,
self->flags | chunk_state_maybe_start_flag(self));
self->blocks_compressed += 1;
input += BLAKE3_BLOCK_LEN;
input_len -= BLAKE3_BLOCK_LEN;
}
chunk_state_fill_buf(self, input, input_len);
}
INLINE output_t chunk_state_output(const blake3_chunk_state *self) {
uint8_t block_flags =
self->flags | chunk_state_maybe_start_flag(self) | CHUNK_END;
return make_output(self->cv, self->buf, self->buf_len, self->chunk_counter,
block_flags);
}
INLINE output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
const uint32_t key[8], uint8_t flags) {
return make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT);
}
// Given some input larger than one chunk, return the number of bytes that
// should go in the left subtree. This is the largest power-of-2 number of
// chunks that leaves at least 1 byte for the right subtree.
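// For example, with exactly 4 chunks of input, full_chunks is 3 and the left
// subtree gets 2 chunks; with 4 chunks plus 1 byte, full_chunks is 4 and the
// left subtree gets all 4 chunks, leaving just that extra byte on the right.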
INLINE size_t left_subtree_len(size_t input_len) {
// Subtract 1 to reserve at least one byte for the right side. input_len
// should always be greater than BLAKE3_CHUNK_LEN.
size_t full_chunks = (input_len - 1) / BLAKE3_CHUNK_LEN;
return round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN;
}
// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
// on a single thread. Write out the chunk chaining values and return the
// number of chunks hashed. These chunks are never the root and never empty;
// those cases use a different codepath.
INLINE size_t compress_chunks_parallel(const uint8_t *input, size_t input_len,
const uint32_t key[8],
uint64_t chunk_counter, uint8_t flags,
uint8_t *out) {
#if defined(BLAKE3_TESTING)
assert(0 < input_len);
assert(input_len <= MAX_SIMD_DEGREE * BLAKE3_CHUNK_LEN);
#endif
const uint8_t *chunks_array[MAX_SIMD_DEGREE];
size_t input_position = 0;
size_t chunks_array_len = 0;
while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
chunks_array[chunks_array_len] = &input[input_position];
input_position += BLAKE3_CHUNK_LEN;
chunks_array_len += 1;
}
blake3_hash_many(chunks_array, chunks_array_len,
BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter,
true, flags, CHUNK_START, CHUNK_END, out);
// Hash the remaining partial chunk, if there is one. Note that the empty
// chunk (meaning the empty message) is a different codepath.
if (input_len > input_position) {
uint64_t counter = chunk_counter + (uint64_t)chunks_array_len;
blake3_chunk_state chunk_state;
chunk_state_init(&chunk_state, key, flags);
chunk_state.chunk_counter = counter;
chunk_state_update(&chunk_state, &input[input_position],
input_len - input_position);
output_t output = chunk_state_output(&chunk_state);
output_chaining_value(&output, &out[chunks_array_len * BLAKE3_OUT_LEN]);
return chunks_array_len + 1;
} else {
return chunks_array_len;
}
}
// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
// on a single thread. Write out the parent chaining values and return the
// number of parents hashed. (If there's an odd input chaining value left over,
// return it as an additional output.) These parents are never the root and
// never empty; those cases use a different codepath.
INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
size_t num_chaining_values,
const uint32_t key[8], uint8_t flags,
uint8_t *out) {
#if defined(BLAKE3_TESTING)
assert(2 <= num_chaining_values);
assert(num_chaining_values <= 2 * MAX_SIMD_DEGREE_OR_2);
#endif
const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
size_t parents_array_len = 0;
while (num_chaining_values - (2 * parents_array_len) >= 2) {
parents_array[parents_array_len] =
&child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN];
parents_array_len += 1;
}
blake3_hash_many(parents_array, parents_array_len, 1, key,
0, // Parents always use counter 0.
false, flags | PARENT,
0, // Parents have no start flags.
0, // Parents have no end flags.
out);
// If there's an odd child left over, it becomes an output.
if (num_chaining_values > 2 * parents_array_len) {
memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
&child_chaining_values[2 * parents_array_len * BLAKE3_OUT_LEN],
BLAKE3_OUT_LEN);
return parents_array_len + 1;
} else {
return parents_array_len;
}
}
// The wide helper function returns (writes out) an array of chaining values
// and returns the length of that array. The number of chaining values returned
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
// if the input is shorter than that many chunks. The reason for maintaining a
// wide array of chaining values going back up the tree, is to allow the
// implementation to hash as many parents in parallel as possible.
//
// As a special case when the SIMD degree is 1, this function will still return
// at least 2 outputs. This guarantees that this function doesn't perform the
// root compression. (If it did, it would use the wrong flags, and also we
// wouldn't be able to implement extendable output.) Note that this function is
// not used when the whole input is only 1 chunk long; that's a different
// codepath.
//
// Why not just have the caller split the input on the first update(), instead
// of implementing this special rule? Because we don't want to limit SIMD or
// multi-threading parallelism for that update().
size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
const uint32_t key[8],
uint64_t chunk_counter, uint8_t flags,
uint8_t *out, bool use_tbb) {
// Note that the single chunk case does *not* bump the SIMD degree up to 2
// when it is 1. If this implementation adds multi-threading in the future,
// this gives us the option of multi-threading even the 2-chunk case, which
// can help performance on smaller platforms.
if (input_len <= blake3_simd_degree() * BLAKE3_CHUNK_LEN) {
return compress_chunks_parallel(input, input_len, key, chunk_counter, flags,
out);
}
// With more than simd_degree chunks, we need to recurse. Start by dividing
// the input into left and right subtrees. (Note that this is only optimal
// as long as the SIMD degree is a power of 2. If we ever get a SIMD degree
// of 3 or something, we'll need a more complicated strategy.)
size_t left_input_len = left_subtree_len(input_len);
size_t right_input_len = input_len - left_input_len;
const uint8_t *right_input = &input[left_input_len];
uint64_t right_chunk_counter =
chunk_counter + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);
// Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to
// account for the special case of returning 2 outputs when the SIMD degree
// is 1.
uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
size_t degree = blake3_simd_degree();
if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
// The special case: We always use a degree of at least two, to make
// sure there are two outputs. Except, as noted above, at the chunk
// level, where we allow degree=1. (Note that the 1-chunk-input case is
// a different codepath.)
degree = 2;
}
uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
// Recurse!
size_t left_n = -1;
size_t right_n = -1;
#if defined(BLAKE3_USE_TBB)
blake3_compress_subtree_wide_join_tbb(
key, flags, use_tbb,
// left-hand side
input, left_input_len, chunk_counter, cv_array, &left_n,
// right-hand side
right_input, right_input_len, right_chunk_counter, right_cvs, &right_n);
#else
left_n = blake3_compress_subtree_wide(
input, left_input_len, key, chunk_counter, flags, cv_array, use_tbb);
right_n = blake3_compress_subtree_wide(right_input, right_input_len, key,
right_chunk_counter, flags, right_cvs,
use_tbb);
#endif // BLAKE3_USE_TBB
// The special case again. If simd_degree=1, then we'll have left_n=1 and
// right_n=1. Rather than compressing them into a single output, return
// them directly, to make sure we always have at least two outputs.
if (left_n == 1) {
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
return 2;
}
// Otherwise, do one layer of parent node compression.
size_t num_chaining_values = left_n + right_n;
return compress_parents_parallel(cv_array, num_chaining_values, key, flags,
out);
}
// Hash a subtree with compress_subtree_wide(), and then condense the resulting
// list of chaining values down to a single parent node. Don't compress that
// last parent node, however. Instead, return its message bytes (the
// concatenated chaining values of its children). This is necessary when the
// first call to update() supplies a complete subtree, because the topmost
// parent node of that subtree could end up being the root. It's also necessary
// for extended output in the general case.
//
// As with compress_subtree_wide(), this function is not used on inputs of 1
// chunk or less. That's a different codepath.
INLINE void
compress_subtree_to_parent_node(const uint8_t *input, size_t input_len,
const uint32_t key[8], uint64_t chunk_counter,
uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN],
bool use_tbb) {
#if defined(BLAKE3_TESTING)
assert(input_len > BLAKE3_CHUNK_LEN);
#endif
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
chunk_counter, flags, cv_array, use_tbb);
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
// The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
// as we just asserted, num_cvs will always be <=2 in that case. But GCC
// (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
// set then it emits incorrect warnings here. We tried a few different
// hacks to silence these, but in the end our hacks just produced different
// warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
// desperation, we ifdef out this entire loop when we know it's not needed.
#if MAX_SIMD_DEGREE_OR_2 > 2
// If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
while (num_cvs > 2) {
num_cvs =
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
}
#endif
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}
INLINE void hasher_init_base(blake3_hasher *self, const uint32_t key[8],
uint8_t flags) {
memcpy(self->key, key, BLAKE3_KEY_LEN);
chunk_state_init(&self->chunk, key, flags);
self->cv_stack_len = 0;
}
void blake3_hasher_init(blake3_hasher *self) { hasher_init_base(self, IV, 0); }
void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]) {
uint32_t key_words[8];
load_key_words(key, key_words);
hasher_init_base(self, key_words, KEYED_HASH);
}
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
size_t context_len) {
blake3_hasher context_hasher;
hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
blake3_hasher_update(&context_hasher, context, context_len);
uint8_t context_key[BLAKE3_KEY_LEN];
blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
uint32_t context_key_words[8];
load_key_words(context_key, context_key_words);
hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
}
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
blake3_hasher_init_derive_key_raw(self, context, strlen(context));
}
// As described in hasher_push_cv() below, we do "lazy merging", delaying
// merges until right before the next CV is about to be added. This is
// different from the reference implementation. Another difference is that we
// aren't always merging 1 chunk at a time. Instead, each CV might represent
// any power-of-two number of chunks, as long as the smaller-above-larger stack
// order is maintained. Instead of the "count the trailing 0-bits" algorithm
// described in the spec, we use a "count the total number of 1-bits" variant
// that doesn't require us to retain the subtree size of the CV on top of the
// stack. The principle is the same: each CV that should remain in the stack is
// represented by a 1-bit in the total number of chunks (or bytes) so far.
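// For example, when the CV of chunk index 7 (the 8th chunk) is about to be
// pushed, total_len is 7 (binary 111), so popcnt is 3 and the three CVs
// already in the stack stay put; the new CV becomes a 4th entry. The merge
// down to one CV happens on the next push, when total_len is 8 (binary 1000)
// and popcnt is 1.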
INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
size_t post_merge_stack_len = (size_t)popcnt(total_len);
while (self->cv_stack_len > post_merge_stack_len) {
uint8_t *parent_node =
&self->cv_stack[(self->cv_stack_len - 2) * BLAKE3_OUT_LEN];
output_t output = parent_output(parent_node, self->key, self->chunk.flags);
output_chaining_value(&output, parent_node);
self->cv_stack_len -= 1;
}
}
// In reference_impl.rs, we merge the new CV with existing CVs from the stack
// before pushing it. We can do that because we know more input is coming, so
// we know none of the merges are root.
//
// This setting is different. We want to feed as much input as possible to
// compress_subtree_wide(), without setting aside anything for the chunk_state.
// If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
// as a single subtree, if at all possible.
//
// This leads to two problems:
// 1) This 64 KiB input might be the only call that ever gets made to update.
// In this case, the root node of the 64 KiB subtree would be the root node
// of the whole tree, and it would need to be ROOT finalized. We can't
// compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
// similarly going to be the root of the whole tree. For example, maybe
// we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
// node at the root of the 256 KiB subtree until we know how to finalize it.
//
// The second problem is solved with "lazy merging". That is, when we're about
// to add a CV to the stack, we don't merge it with anything first, as the
// reference impl does. Instead we do merges using the *previous* CV that was
// added, which is sitting on top of the stack, and we put the new CV
// (unmerged) on top of the stack afterwards. This guarantees that we never
// merge the root node until finalize().
//
// Solving the first problem requires an additional tool,
// compress_subtree_to_parent_node(). That function always returns the top
// *two* chaining values of the subtree it's compressing. We then do lazy
// merging with each of them separately, so that the second CV will always
// remain unmerged. (That also helps us support extendable output when we're
// hashing an input all-at-once.)
INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
uint64_t chunk_counter) {
hasher_merge_cv_stack(self, chunk_counter);
memcpy(&self->cv_stack[self->cv_stack_len * BLAKE3_OUT_LEN], new_cv,
BLAKE3_OUT_LEN);
self->cv_stack_len += 1;
}
INLINE void blake3_hasher_update_base(blake3_hasher *self, const void *input,
size_t input_len, bool use_tbb) {
// Explicitly checking for zero avoids causing UB by passing a null pointer
// to memcpy. This comes up in practice with things like:
// std::vector<uint8_t> v;
// blake3_hasher_update(&hasher, v.data(), v.size());
if (input_len == 0) {
return;
}
const uint8_t *input_bytes = (const uint8_t *)input;
// If we have some partial chunk bytes in the internal chunk_state, we need
// to finish that chunk first.
if (chunk_state_len(&self->chunk) > 0) {
size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&self->chunk);
if (take > input_len) {
take = input_len;
}
chunk_state_update(&self->chunk, input_bytes, take);
input_bytes += take;
input_len -= take;
// If we've filled the current chunk and there's more coming, finalize this
// chunk and proceed. In this case we know it's not the root.
if (input_len > 0) {
output_t output = chunk_state_output(&self->chunk);
uint8_t chunk_cv[32];
output_chaining_value(&output, chunk_cv);
hasher_push_cv(self, chunk_cv, self->chunk.chunk_counter);
chunk_state_reset(&self->chunk, self->key, self->chunk.chunk_counter + 1);
} else {
return;
}
}
// Now the chunk_state is clear, and we have more input. If there's more than
// a single chunk (so, definitely not the root chunk), hash the largest whole
// subtree we can, with the full benefits of SIMD (and maybe in the future,
// multi-threading) parallelism. Two restrictions:
// - The subtree has to be a power-of-2 number of chunks. Only subtrees along
// the right edge can be incomplete, and we don't know where the right edge
// is going to be until we get to finalize().
// - The subtree must evenly divide the total number of chunks up until this
// point (if total is not 0). If the current incomplete subtree is only
// waiting for 1 more chunk, we can't hash a subtree of 4 chunks. We have
// to complete the current subtree first.
// Because we might need to break up the input to form powers of 2, or to
// evenly divide what we already have, this part runs in a loop.
while (input_len > BLAKE3_CHUNK_LEN) {
size_t subtree_len = round_down_to_power_of_2(input_len);
uint64_t count_so_far = self->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
// Shrink the subtree_len until it evenly divides the count so far. We know
// that subtree_len itself is a power of 2, so we can use a bitmasking
// trick instead of an actual remainder operation. (Note that if the caller
// consistently passes power-of-2 inputs of the same size, as is hopefully
// typical, this loop condition will always fail, and subtree_len will
// always be the full length of the input.)
//
// An aside: We don't have to shrink subtree_len quite this much. For
// example, if count_so_far is 1, we could pass 2 chunks to
// compress_subtree_to_parent_node. Since we'll get 2 CVs back, we'll still
// get the right answer in the end, and we might get to use 2-way SIMD
// parallelism. The problem with this optimization, is that it gets us
// stuck always hashing 2 chunks. The total number of chunks will remain
// odd, and we'll never graduate to higher degrees of parallelism. See
// https://github.com/BLAKE3-team/BLAKE3/issues/69.
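//
// Worked example: if count_so_far is 1024 (one chunk already hashed) and 8
// chunks arrive at once, subtree_len starts at 8192 and shrinks 8192 -> 4096
// -> 2048 -> 1024, because (subtree_len - 1) & 1024 stays nonzero until
// subtree_len is a single chunk.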
while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
subtree_len /= 2;
}
// The shrunken subtree_len might now be 1 chunk long. If so, hash that one
// chunk by itself. Otherwise, compress the subtree into a pair of CVs.
uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
if (subtree_len <= BLAKE3_CHUNK_LEN) {
blake3_chunk_state chunk_state;
chunk_state_init(&chunk_state, self->key, self->chunk.flags);
chunk_state.chunk_counter = self->chunk.chunk_counter;
chunk_state_update(&chunk_state, input_bytes, subtree_len);
output_t output = chunk_state_output(&chunk_state);
uint8_t cv[BLAKE3_OUT_LEN];
output_chaining_value(&output, cv);
hasher_push_cv(self, cv, chunk_state.chunk_counter);
} else {
// This is the high-performance happy path, though getting here depends
// on the caller giving us a long enough input.
uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
compress_subtree_to_parent_node(input_bytes, subtree_len, self->key,
self->chunk.chunk_counter,
self->chunk.flags, cv_pair, use_tbb);
hasher_push_cv(self, cv_pair, self->chunk.chunk_counter);
hasher_push_cv(self, &cv_pair[BLAKE3_OUT_LEN],
self->chunk.chunk_counter + (subtree_chunks / 2));
}
self->chunk.chunk_counter += subtree_chunks;
input_bytes += subtree_len;
input_len -= subtree_len;
}
// If there's any remaining input less than a full chunk, add it to the chunk
// state. In that case, also do a final merge loop to make sure the subtree
// stack doesn't contain any unmerged pairs. The remaining input means we
// know these merges are non-root. This merge loop isn't strictly necessary
// here, because hasher_push_cv already does its own merge loop, but it
// simplifies blake3_hasher_finalize below.
if (input_len > 0) {
chunk_state_update(&self->chunk, input_bytes, input_len);
hasher_merge_cv_stack(self, self->chunk.chunk_counter);
}
}
void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len) {
bool use_tbb = false;
blake3_hasher_update_base(self, input, input_len, use_tbb);
}
#if defined(BLAKE3_USE_TBB)
void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
size_t input_len) {
bool use_tbb = true;
blake3_hasher_update_base(self, input, input_len, use_tbb);
}
#endif // BLAKE3_USE_TBB
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len) {
blake3_hasher_finalize_seek(self, 0, out, out_len);
}
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
uint8_t *out, size_t out_len) {
// Explicitly checking for zero avoids causing UB by passing a null pointer
// to memcpy. This comes up in practice with things like:
// std::vector<uint8_t> v;
// blake3_hasher_finalize(&hasher, v.data(), v.size());
if (out_len == 0) {
return;
}
// If the subtree stack is empty, then the current chunk is the root.
if (self->cv_stack_len == 0) {
output_t output = chunk_state_output(&self->chunk);
output_root_bytes(&output, seek, out, out_len);
return;
}
// If there are any bytes in the chunk state, finalize that chunk and do a
// roll-up merge between that chunk hash and every subtree in the stack. In
// this case, the extra merge loop at the end of blake3_hasher_update
// guarantees that none of the subtrees in the stack need to be merged with
// each other first. Otherwise, if there are no bytes in the chunk state,
// then the top of the stack is a chunk hash, and we start the merge from
// that.
output_t output;
size_t cvs_remaining;
if (chunk_state_len(&self->chunk) > 0) {
cvs_remaining = self->cv_stack_len;
output = chunk_state_output(&self->chunk);
} else {
// There are always at least 2 CVs in the stack in this case.
cvs_remaining = self->cv_stack_len - 2;
output = parent_output(&self->cv_stack[cvs_remaining * 32], self->key,
self->chunk.flags);
}
while (cvs_remaining > 0) {
cvs_remaining -= 1;
uint8_t parent_block[BLAKE3_BLOCK_LEN];
memcpy(parent_block, &self->cv_stack[cvs_remaining * 32], 32);
output_chaining_value(&output, &parent_block[32]);
output = parent_output(parent_block, self->key, self->chunk.flags);
}
output_root_bytes(&output, seek, out, out_len);
}
void blake3_hasher_reset(blake3_hasher *self) {
chunk_state_reset(&self->chunk, self->key, 0);
self->cv_stack_len = 0;
}

86
vendor/blake3/c/blake3.h vendored Normal file

@@ -0,0 +1,86 @@
#ifndef BLAKE3_H
#define BLAKE3_H
#include <stddef.h>
#include <stdint.h>
#if !defined(BLAKE3_API)
# if defined(_WIN32) || defined(__CYGWIN__)
# if defined(BLAKE3_DLL)
# if defined(BLAKE3_DLL_EXPORTS)
# define BLAKE3_API __declspec(dllexport)
# else
# define BLAKE3_API __declspec(dllimport)
# endif
# define BLAKE3_PRIVATE
# else
# define BLAKE3_API
# define BLAKE3_PRIVATE
# endif
# elif __GNUC__ >= 4
# define BLAKE3_API __attribute__((visibility("default")))
# define BLAKE3_PRIVATE __attribute__((visibility("hidden")))
# else
# define BLAKE3_API
# define BLAKE3_PRIVATE
# endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
#define BLAKE3_VERSION_STRING "1.8.2"
#define BLAKE3_KEY_LEN 32
#define BLAKE3_OUT_LEN 32
#define BLAKE3_BLOCK_LEN 64
#define BLAKE3_CHUNK_LEN 1024
#define BLAKE3_MAX_DEPTH 54
// This struct is a private implementation detail. It has to be here because
// it's part of blake3_hasher below.
typedef struct {
uint32_t cv[8];
uint64_t chunk_counter;
uint8_t buf[BLAKE3_BLOCK_LEN];
uint8_t buf_len;
uint8_t blocks_compressed;
uint8_t flags;
} blake3_chunk_state;
typedef struct {
uint32_t key[8];
blake3_chunk_state chunk;
uint8_t cv_stack_len;
// The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
// with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
// requires a 4th entry, rather than merging everything down to 1, because we
// don't know whether more input is coming. This is different from how the
// reference implementation does things.
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
} blake3_hasher;
BLAKE3_API const char *blake3_version(void);
BLAKE3_API void blake3_hasher_init(blake3_hasher *self);
BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
size_t context_len);
BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len);
#if defined(BLAKE3_USE_TBB)
BLAKE3_API void blake3_hasher_update_tbb(blake3_hasher *self, const void *input,
size_t input_len);
#endif // BLAKE3_USE_TBB
BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len);
BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
uint8_t *out, size_t out_len);
BLAKE3_API void blake3_hasher_reset(blake3_hasher *self);
#ifdef __cplusplus
}
#endif
#endif /* BLAKE3_H */

326
vendor/blake3/c/blake3_avx2.c vendored Normal file

@@ -0,0 +1,326 @@
#include "blake3_impl.h"
#include <immintrin.h>
#define DEGREE 8
INLINE __m256i loadu(const uint8_t src[32]) {
return _mm256_loadu_si256((const __m256i *)src);
}
INLINE void storeu(__m256i src, uint8_t dest[32]) {
_mm256_storeu_si256((__m256i *)dest, src);
}
INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
INLINE __m256i rot16(__m256i x) {
return _mm256_shuffle_epi8(
x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
}
INLINE __m256i rot12(__m256i x) {
return _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12));
}
INLINE __m256i rot8(__m256i x) {
return _mm256_shuffle_epi8(
x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
}
INLINE __m256i rot7(__m256i x) {
return _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7));
}
INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) {
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
// is 22/33/66/77.
__m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
__m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
__m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
__m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
__m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
__m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
// Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
// 11/33.
__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
__m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
__m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
__m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
__m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
__m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
__m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
// Interleave 128-bit lanes.
vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
}
INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
size_t block_offset, __m256i out[16]) {
out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
for (size_t i = 0; i < 8; ++i) {
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
}
transpose_vecs(&out[0]);
transpose_vecs(&out[8]);
}
INLINE void load_counters(uint64_t counter, bool increment_counter,
__m256i *out_lo, __m256i *out_hi) {
const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
const __m256i add1 = _mm256_and_si256(mask, add0);
__m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
__m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
_mm256_xor_si256( l, _mm256_set1_epi32(0x80000000)));
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
*out_lo = l;
*out_hi = h;
}
static
void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
__m256i h_vecs[8] = {
set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
};
__m256i counter_low_vec, counter_high_vec;
load_counters(counter, increment_counter, &counter_low_vec,
&counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
if (block + 1 == blocks) {
block_flags |= flags_end;
}
__m256i block_len_vec = set1(BLAKE3_BLOCK_LEN);
__m256i block_flags_vec = set1(block_flags);
__m256i msg_vecs[16];
transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
__m256i v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn(v, msg_vecs, 0);
round_fn(v, msg_vecs, 1);
round_fn(v, msg_vecs, 2);
round_fn(v, msg_vecs, 3);
round_fn(v, msg_vecs, 4);
round_fn(v, msg_vecs, 5);
round_fn(v, msg_vecs, 6);
h_vecs[0] = xorv(v[0], v[8]);
h_vecs[1] = xorv(v[1], v[9]);
h_vecs[2] = xorv(v[2], v[10]);
h_vecs[3] = xorv(v[3], v[11]);
h_vecs[4] = xorv(v[4], v[12]);
h_vecs[5] = xorv(v[5], v[13]);
h_vecs[6] = xorv(v[6], v[14]);
h_vecs[7] = xorv(v[7], v[15]);
block_flags = flags;
}
transpose_vecs(h_vecs);
storeu(h_vecs[0], &out[0 * sizeof(__m256i)]);
storeu(h_vecs[1], &out[1 * sizeof(__m256i)]);
storeu(h_vecs[2], &out[2 * sizeof(__m256i)]);
storeu(h_vecs[3], &out[3 * sizeof(__m256i)]);
storeu(h_vecs[4], &out[4 * sizeof(__m256i)]);
storeu(h_vecs[5], &out[5 * sizeof(__m256i)]);
storeu(h_vecs[6], &out[6 * sizeof(__m256i)]);
storeu(h_vecs[7], &out[7 * sizeof(__m256i)]);
}
#if !defined(BLAKE3_NO_SSE41)
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#else
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs >= DEGREE) {
blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
if (increment_counter) {
counter += DEGREE;
}
inputs += DEGREE;
num_inputs -= DEGREE;
out = &out[DEGREE * BLAKE3_OUT_LEN];
}
#if !defined(BLAKE3_NO_SSE41)
blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end, out);
#else
blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
#endif
}

1815
vendor/blake3/c/blake3_avx2_x86-64_unix.S vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

1388
vendor/blake3/c/blake3_avx512.c vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

332
vendor/blake3/c/blake3_dispatch.c vendored Normal file

@@ -0,0 +1,332 @@
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "blake3_impl.h"
#if defined(_MSC_VER)
#include <Windows.h>
#endif
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <immintrin.h>
#else
#undef IS_X86 /* Unimplemented! */
#endif
#endif
#if !defined(BLAKE3_ATOMICS)
#if defined(__has_include)
#if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
#define BLAKE3_ATOMICS 1
#else
#define BLAKE3_ATOMICS 0
#endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
#else
#define BLAKE3_ATOMICS 0
#endif /* defined(__has_include) */
#endif /* BLAKE3_ATOMICS */
#if BLAKE3_ATOMICS
#define ATOMIC_INT _Atomic int
#define ATOMIC_LOAD(x) x
#define ATOMIC_STORE(x, y) x = y
#elif defined(_MSC_VER)
#define ATOMIC_INT LONG
#define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
#define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
#else
#define ATOMIC_INT int
#define ATOMIC_LOAD(x) x
#define ATOMIC_STORE(x, y) x = y
#endif
#define MAYBE_UNUSED(x) (void)((x))
#if defined(IS_X86)
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
return _xgetbv(0);
#else
uint32_t eax = 0, edx = 0;
__asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
return ((uint64_t)edx << 32) | eax;
#endif
}
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
__cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id));
#endif
}
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
__cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
__asm__ __volatile__("movl %%ebx, %1\n"
"cpuid\n"
"xchgl %1, %%ebx\n"
: "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#else
__asm__ __volatile__("cpuid\n"
: "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
: "a"(id), "c"(sid));
#endif
}
#endif
enum cpu_feature {
SSE2 = 1 << 0,
SSSE3 = 1 << 1,
SSE41 = 1 << 2,
AVX = 1 << 3,
AVX2 = 1 << 4,
AVX512F = 1 << 5,
AVX512VL = 1 << 6,
/* ... */
UNDEFINED = 1 << 30
};
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
ATOMIC_INT g_cpu_features = UNDEFINED;
#if !defined(BLAKE3_TESTING)
static
#endif
enum cpu_feature
get_cpu_features(void) {
/* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
if (features != UNDEFINED) {
return features;
} else {
#if defined(IS_X86)
uint32_t regs[4] = {0};
uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
(void)edx;
features = 0;
cpuid(regs, 0);
const int max_id = *eax;
cpuid(regs, 1);
#if defined(__amd64__) || defined(_M_X64)
features |= SSE2;
#else
if (*edx & (1UL << 26))
features |= SSE2;
#endif
if (*ecx & (1UL << 9))
features |= SSSE3;
if (*ecx & (1UL << 19))
features |= SSE41;
if (*ecx & (1UL << 27)) { // OSXSAVE
const uint64_t mask = xgetbv();
if ((mask & 6) == 6) { // SSE and AVX states
if (*ecx & (1UL << 28))
features |= AVX;
if (max_id >= 7) {
cpuidex(regs, 7, 0);
if (*ebx & (1UL << 5))
features |= AVX2;
if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
if (*ebx & (1UL << 31))
features |= AVX512VL;
if (*ebx & (1UL << 16))
features |= AVX512F;
}
}
}
}
ATOMIC_STORE(g_cpu_features, features);
return features;
#else
/* How to detect NEON? */
return 0;
#endif
}
}
void blake3_compress_in_place(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
return;
}
#endif
#endif
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
void blake3_compress_xof(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64]) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
return;
}
#endif
#endif
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
void blake3_xof_many(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64], size_t outblocks) {
if (outblocks == 0) {
// The current assembly implementation always outputs at least 1 block.
return;
}
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
if (features & AVX512VL) {
blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
return;
}
#endif
#endif
for(size_t i = 0; i < outblocks; ++i) {
blake3_compress_xof(cv, block, block_len, counter + i, flags, out + 64*i);
}
}
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_AVX2)
if (features & AVX2) {
blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#endif
#if BLAKE3_USE_NEON == 1
blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end, out);
return;
#endif
blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
}
// The dynamically detected SIMD degree of the current platform.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
return 16;
}
#endif
#if !defined(BLAKE3_NO_AVX2)
if (features & AVX2) {
return 8;
}
#endif
#if !defined(BLAKE3_NO_SSE41)
if (features & SSE41) {
return 4;
}
#endif
#if !defined(BLAKE3_NO_SSE2)
if (features & SSE2) {
return 4;
}
#endif
#endif
#if BLAKE3_USE_NEON == 1
return 4;
#endif
return 1;
}

333
vendor/blake3/c/blake3_impl.h vendored Normal file

@@ -0,0 +1,333 @@
#ifndef BLAKE3_IMPL_H
#define BLAKE3_IMPL_H
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "blake3.h"
#ifdef __cplusplus
extern "C" {
#endif
// internal flags
enum blake3_flags {
CHUNK_START = 1 << 0,
CHUNK_END = 1 << 1,
PARENT = 1 << 2,
ROOT = 1 << 3,
KEYED_HASH = 1 << 4,
DERIVE_KEY_CONTEXT = 1 << 5,
DERIVE_KEY_MATERIAL = 1 << 6,
};
// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif
#ifdef __cplusplus
#define NOEXCEPT noexcept
#else
#define NOEXCEPT
#endif
#if (defined(__x86_64__) || defined(_M_X64)) && !defined(_M_ARM64EC)
#define IS_X86
#define IS_X86_64
#endif
#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif
#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
#define IS_AARCH64
#endif
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#endif
#if !defined(BLAKE3_USE_NEON)
// If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
#if defined(IS_AARCH64)
#if defined(__ARM_BIG_ENDIAN)
#define BLAKE3_USE_NEON 0
#else
#define BLAKE3_USE_NEON 1
#endif
#else
#define BLAKE3_USE_NEON 0
#endif
#endif
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif BLAKE3_USE_NEON == 1
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif
// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
static const uint32_t IV[8] = {0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL,
0xA54FF53AUL, 0x510E527FUL, 0x9B05688CUL,
0x1F83D9ABUL, 0x5BE0CD19UL};
static const uint8_t MSG_SCHEDULE[7][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
{3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
{10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
{12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
{9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
{11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
/* Find index of the highest set bit */
/* x is assumed to be nonzero. */
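/* For example, highest_one(1) == 0 and highest_one(1ULL << 63) == 63. In the
   GCC/Clang branch below, 63 ^ clz equals 63 - clz, since clz is in [0, 63]
   for nonzero x. */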
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
return 63 ^ (unsigned int)__builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
unsigned long index;
_BitScanReverse64(&index, x);
return index;
#elif defined(_MSC_VER) && defined(IS_X86_32)
if(x >> 32) {
unsigned long index;
_BitScanReverse(&index, (unsigned long)(x >> 32));
return 32 + index;
} else {
unsigned long index;
_BitScanReverse(&index, (unsigned long)x);
return index;
}
#else
unsigned int c = 0;
if(x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
if(x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
if(x & 0x000000000000ff00ULL) { x >>= 8; c += 8; }
if(x & 0x00000000000000f0ULL) { x >>= 4; c += 4; }
if(x & 0x000000000000000cULL) { x >>= 2; c += 2; }
if(x & 0x0000000000000002ULL) { c += 1; }
return c;
#endif
}
// Count the number of 1 bits.
INLINE unsigned int popcnt(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
return (unsigned int)__builtin_popcountll(x);
#else
unsigned int count = 0;
while (x != 0) {
count += 1;
x &= x - 1;
}
return count;
#endif
}
// Largest power of two less than or equal to x. As a special case, returns 1
// when x is 0.
INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
return 1ULL << highest_one(x | 1);
}
INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
INLINE uint32_t counter_high(uint64_t counter) {
return (uint32_t)(counter >> 32);
}
INLINE uint32_t load32(const void *src) {
const uint8_t *p = (const uint8_t *)src;
return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
}
INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
uint32_t key_words[8]) {
key_words[0] = load32(&key[0 * 4]);
key_words[1] = load32(&key[1 * 4]);
key_words[2] = load32(&key[2 * 4]);
key_words[3] = load32(&key[3 * 4]);
key_words[4] = load32(&key[4 * 4]);
key_words[5] = load32(&key[5 * 4]);
key_words[6] = load32(&key[6 * 4]);
key_words[7] = load32(&key[7 * 4]);
}
INLINE void load_block_words(const uint8_t block[BLAKE3_BLOCK_LEN],
uint32_t block_words[16]) {
for (size_t i = 0; i < 16; i++) {
block_words[i] = load32(&block[i * 4]);
}
}
INLINE void store32(void *dst, uint32_t w) {
uint8_t *p = (uint8_t *)dst;
p[0] = (uint8_t)(w >> 0);
p[1] = (uint8_t)(w >> 8);
p[2] = (uint8_t)(w >> 16);
p[3] = (uint8_t)(w >> 24);
}
INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
store32(&bytes_out[0 * 4], cv_words[0]);
store32(&bytes_out[1 * 4], cv_words[1]);
store32(&bytes_out[2 * 4], cv_words[2]);
store32(&bytes_out[3 * 4], cv_words[3]);
store32(&bytes_out[4 * 4], cv_words[4]);
store32(&bytes_out[5 * 4], cv_words[5]);
store32(&bytes_out[6 * 4], cv_words[6]);
store32(&bytes_out[7 * 4], cv_words[7]);
}
void blake3_compress_in_place(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64]);
void blake3_xof_many(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[64], size_t outblocks);
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out);
size_t blake3_simd_degree(void);
BLAKE3_PRIVATE size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
const uint32_t key[8],
uint64_t chunk_counter, uint8_t flags,
uint8_t *out, bool use_tbb);
#if defined(BLAKE3_USE_TBB)
BLAKE3_PRIVATE void blake3_compress_subtree_wide_join_tbb(
// shared params
const uint32_t key[8], uint8_t flags, bool use_tbb,
// left-hand side params
const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter,
uint8_t *l_cvs, size_t *l_n,
// right-hand side params
const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter,
uint8_t *r_cvs, size_t *r_n) NOEXCEPT;
#endif
// Declarations for implementation-specific functions.
void blake3_compress_in_place_portable(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_portable(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#if defined(IS_X86)
#if !defined(BLAKE3_NO_SSE2)
void blake3_compress_in_place_sse2(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_SSE41)
void blake3_compress_in_place_sse41(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_sse41(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX2)
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX512)
void blake3_compress_in_place_avx512(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
void blake3_compress_xof_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]);
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#if !defined(_WIN32)
void blake3_xof_many_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t* out, size_t outblocks);
#endif
#endif
#endif
#if BLAKE3_USE_NEON == 1
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
#ifdef __cplusplus
}
#endif
#endif /* BLAKE3_IMPL_H */

366
vendor/blake3/c/blake3_neon.c vendored Normal file

@@ -0,0 +1,366 @@
#include "blake3_impl.h"
#include <arm_neon.h>
#ifdef __ARM_BIG_ENDIAN
#error "This implementation only supports little-endian ARM."
// It might be that all we need for big-endian support here is to get the loads
// and stores right, but step zero would be finding a way to test it in CI.
#endif
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
// vld1q_u32 has alignment requirements. Don't use it.
return vreinterpretq_u32_u8(vld1q_u8(src));
}
INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
// vst1q_u32 has alignment requirements. Don't use it.
vst1q_u8(dest, vreinterpretq_u8_u32(src));
}
INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
return vaddq_u32(a, b);
}
INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) {
return veorq_u32(a, b);
}
INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); }
INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
uint32_t array[4] = {a, b, c, d};
return vld1q_u32(array);
}
INLINE uint32x4_t rot16_128(uint32x4_t x) {
// The straightforward implementation would be two shifts and an or, but that's
// slower on microarchitectures we've tested. See
// https://github.com/BLAKE3-team/BLAKE3/pull/319.
// return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
}
INLINE uint32x4_t rot12_128(uint32x4_t x) {
// See comment in rot16_128.
// return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
}
INLINE uint32x4_t rot8_128(uint32x4_t x) {
// See comment in rot16_128.
// return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
#if defined(__clang__)
return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >= 40700
static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
#else
return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
#endif
}
INLINE uint32x4_t rot7_128(uint32x4_t x) {
// See comment in rot16_128.
// return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
}
// TODO: compress_neon
// TODO: hash2_neon
/*
* ----------------------------------------------------------------------------
* hash4_neon
* ----------------------------------------------------------------------------
*/
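// One full round for four states at once. The first two mixing passes below
// are the column step and the last two are the diagonal step, with two
// message words folded into each G.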
INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
v[0] = add_128(v[0], v[4]);
v[1] = add_128(v[1], v[5]);
v[2] = add_128(v[2], v[6]);
v[3] = add_128(v[3], v[7]);
v[12] = xor_128(v[12], v[0]);
v[13] = xor_128(v[13], v[1]);
v[14] = xor_128(v[14], v[2]);
v[15] = xor_128(v[15], v[3]);
v[12] = rot16_128(v[12]);
v[13] = rot16_128(v[13]);
v[14] = rot16_128(v[14]);
v[15] = rot16_128(v[15]);
v[8] = add_128(v[8], v[12]);
v[9] = add_128(v[9], v[13]);
v[10] = add_128(v[10], v[14]);
v[11] = add_128(v[11], v[15]);
v[4] = xor_128(v[4], v[8]);
v[5] = xor_128(v[5], v[9]);
v[6] = xor_128(v[6], v[10]);
v[7] = xor_128(v[7], v[11]);
v[4] = rot12_128(v[4]);
v[5] = rot12_128(v[5]);
v[6] = rot12_128(v[6]);
v[7] = rot12_128(v[7]);
v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
v[0] = add_128(v[0], v[4]);
v[1] = add_128(v[1], v[5]);
v[2] = add_128(v[2], v[6]);
v[3] = add_128(v[3], v[7]);
v[12] = xor_128(v[12], v[0]);
v[13] = xor_128(v[13], v[1]);
v[14] = xor_128(v[14], v[2]);
v[15] = xor_128(v[15], v[3]);
v[12] = rot8_128(v[12]);
v[13] = rot8_128(v[13]);
v[14] = rot8_128(v[14]);
v[15] = rot8_128(v[15]);
v[8] = add_128(v[8], v[12]);
v[9] = add_128(v[9], v[13]);
v[10] = add_128(v[10], v[14]);
v[11] = add_128(v[11], v[15]);
v[4] = xor_128(v[4], v[8]);
v[5] = xor_128(v[5], v[9]);
v[6] = xor_128(v[6], v[10]);
v[7] = xor_128(v[7], v[11]);
v[4] = rot7_128(v[4]);
v[5] = rot7_128(v[5]);
v[6] = rot7_128(v[6]);
v[7] = rot7_128(v[7]);
v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
v[0] = add_128(v[0], v[5]);
v[1] = add_128(v[1], v[6]);
v[2] = add_128(v[2], v[7]);
v[3] = add_128(v[3], v[4]);
v[15] = xor_128(v[15], v[0]);
v[12] = xor_128(v[12], v[1]);
v[13] = xor_128(v[13], v[2]);
v[14] = xor_128(v[14], v[3]);
v[15] = rot16_128(v[15]);
v[12] = rot16_128(v[12]);
v[13] = rot16_128(v[13]);
v[14] = rot16_128(v[14]);
v[10] = add_128(v[10], v[15]);
v[11] = add_128(v[11], v[12]);
v[8] = add_128(v[8], v[13]);
v[9] = add_128(v[9], v[14]);
v[5] = xor_128(v[5], v[10]);
v[6] = xor_128(v[6], v[11]);
v[7] = xor_128(v[7], v[8]);
v[4] = xor_128(v[4], v[9]);
v[5] = rot12_128(v[5]);
v[6] = rot12_128(v[6]);
v[7] = rot12_128(v[7]);
v[4] = rot12_128(v[4]);
v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
v[0] = add_128(v[0], v[5]);
v[1] = add_128(v[1], v[6]);
v[2] = add_128(v[2], v[7]);
v[3] = add_128(v[3], v[4]);
v[15] = xor_128(v[15], v[0]);
v[12] = xor_128(v[12], v[1]);
v[13] = xor_128(v[13], v[2]);
v[14] = xor_128(v[14], v[3]);
v[15] = rot8_128(v[15]);
v[12] = rot8_128(v[12]);
v[13] = rot8_128(v[13]);
v[14] = rot8_128(v[14]);
v[10] = add_128(v[10], v[15]);
v[11] = add_128(v[11], v[12]);
v[8] = add_128(v[8], v[13]);
v[9] = add_128(v[9], v[14]);
v[5] = xor_128(v[5], v[10]);
v[6] = xor_128(v[6], v[11]);
v[7] = xor_128(v[7], v[8]);
v[4] = xor_128(v[4], v[9]);
v[5] = rot7_128(v[5]);
v[6] = rot7_128(v[6]);
v[7] = rot7_128(v[7]);
v[4] = rot7_128(v[4]);
}
INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
// Individually transpose the four 2x2 sub-matrices in each corner.
uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
// Swap the top-right and bottom-left 2x2s (which just got transposed).
vecs[0] =
vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
vecs[1] =
vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
vecs[2] =
vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
vecs[3] =
vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
}
INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
size_t block_offset, uint32x4_t out[16]) {
out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
transpose_vecs_128(&out[0]);
transpose_vecs_128(&out[4]);
transpose_vecs_128(&out[8]);
transpose_vecs_128(&out[12]);
}
INLINE void load_counters4(uint64_t counter, bool increment_counter,
uint32x4_t *out_low, uint32x4_t *out_high) {
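// When increment_counter is false, mask is zero and every lane receives the
// same counter; when it's all ones, (mask & i) adds the lane index i.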
uint64_t mask = (increment_counter ? ~0 : 0);
*out_low = set4(
counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
*out_high = set4(
counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
}
void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
uint32x4_t h_vecs[8] = {
set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
};
uint32x4_t counter_low_vec, counter_high_vec;
load_counters4(counter, increment_counter, &counter_low_vec,
&counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
if (block + 1 == blocks) {
block_flags |= flags_end;
}
uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
uint32x4_t block_flags_vec = set1_128(block_flags);
uint32x4_t msg_vecs[16];
transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
uint32x4_t v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn4(v, msg_vecs, 0);
round_fn4(v, msg_vecs, 1);
round_fn4(v, msg_vecs, 2);
round_fn4(v, msg_vecs, 3);
round_fn4(v, msg_vecs, 4);
round_fn4(v, msg_vecs, 5);
round_fn4(v, msg_vecs, 6);
h_vecs[0] = xor_128(v[0], v[8]);
h_vecs[1] = xor_128(v[1], v[9]);
h_vecs[2] = xor_128(v[2], v[10]);
h_vecs[3] = xor_128(v[3], v[11]);
h_vecs[4] = xor_128(v[4], v[12]);
h_vecs[5] = xor_128(v[5], v[13]);
h_vecs[6] = xor_128(v[6], v[14]);
h_vecs[7] = xor_128(v[7], v[15]);
block_flags = flags;
}
transpose_vecs_128(&h_vecs[0]);
transpose_vecs_128(&h_vecs[4]);
// The first four vecs now contain the first half of each output, and the
// second four vecs contain the second half of each output.
storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
}
/*
* ----------------------------------------------------------------------------
* hash_many_neon
* ----------------------------------------------------------------------------
*/
void blake3_compress_in_place_portable(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags);
INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start, uint8_t flags_end,
uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
if (blocks == 1) {
block_flags |= flags_end;
}
// TODO: Implement compress_neon. However note that according to
// https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
// compress_neon might not be any faster than compress_portable.
blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
}
memcpy(out, cv, BLAKE3_OUT_LEN);
}
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs >= 4) {
blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
if (increment_counter) {
counter += 4;
}
inputs += 4;
num_inputs -= 4;
out = &out[4 * BLAKE3_OUT_LEN];
}
while (num_inputs > 0) {
hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
flags_end, out);
if (increment_counter) {
counter += 1;
}
inputs += 1;
num_inputs -= 1;
out = &out[BLAKE3_OUT_LEN];
}
}
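For reference, the batched interface above works on whole chunks: each entry of `inputs` points at `blocks` contiguous 64-byte blocks, and each input produces one 32-byte chaining value. Here is a minimal sketch, assuming the published BLAKE3 IV and chunk-flag constants (defined in blake3_impl.h in the vendored sources); note it yields interior chunk CVs, not finished hashes.

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);

/* Published BLAKE3 constants, restated to keep the sketch standalone. */
static const uint32_t IV_[8] = {0x6A09E667, 0xBB67AE85, 0x3C6EF372,
                                0xA54FF53A, 0x510E527F, 0x9B05688C,
                                0x1F83D9AB, 0x5BE0CD19};
enum { CHUNK_START_ = 1, CHUNK_END_ = 2 };

/* Chaining values of four single-block (64-byte) chunks in one call.
 * The counter increments per input, numbering the chunks 0..3. */
static void four_chunk_cvs(const uint8_t chunks[4][64], uint8_t out[4 * 32]) {
  const uint8_t *inputs[4] = {chunks[0], chunks[1], chunks[2], chunks[3]};
  blake3_hash_many_neon(inputs, /*num_inputs=*/4, /*blocks=*/1, IV_,
                        /*counter=*/0, /*increment_counter=*/true,
                        /*flags=*/0, CHUNK_START_, CHUNK_END_, out);
}
```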

160
vendor/blake3/c/blake3_portable.c vendored Normal file
View File

@@ -0,0 +1,160 @@
#include "blake3_impl.h"
#include <string.h>
INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
return (w >> c) | (w << (32 - c));
}
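// The BLAKE3 G function: a ChaCha-style quarter-round with rotation
// constants 16, 12, 8, and 7, folding in two message words x and y.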
INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
uint32_t x, uint32_t y) {
state[a] = state[a] + state[b] + x;
state[d] = rotr32(state[d] ^ state[a], 16);
state[c] = state[c] + state[d];
state[b] = rotr32(state[b] ^ state[c], 12);
state[a] = state[a] + state[b] + y;
state[d] = rotr32(state[d] ^ state[a], 8);
state[c] = state[c] + state[d];
state[b] = rotr32(state[b] ^ state[c], 7);
}
INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
// Select the message schedule based on the round.
const uint8_t *schedule = MSG_SCHEDULE[round];
// Mix the columns.
g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
// Mix the rows.
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}
INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags) {
uint32_t block_words[16];
block_words[0] = load32(block + 4 * 0);
block_words[1] = load32(block + 4 * 1);
block_words[2] = load32(block + 4 * 2);
block_words[3] = load32(block + 4 * 3);
block_words[4] = load32(block + 4 * 4);
block_words[5] = load32(block + 4 * 5);
block_words[6] = load32(block + 4 * 6);
block_words[7] = load32(block + 4 * 7);
block_words[8] = load32(block + 4 * 8);
block_words[9] = load32(block + 4 * 9);
block_words[10] = load32(block + 4 * 10);
block_words[11] = load32(block + 4 * 11);
block_words[12] = load32(block + 4 * 12);
block_words[13] = load32(block + 4 * 13);
block_words[14] = load32(block + 4 * 14);
block_words[15] = load32(block + 4 * 15);
state[0] = cv[0];
state[1] = cv[1];
state[2] = cv[2];
state[3] = cv[3];
state[4] = cv[4];
state[5] = cv[5];
state[6] = cv[6];
state[7] = cv[7];
state[8] = IV[0];
state[9] = IV[1];
state[10] = IV[2];
state[11] = IV[3];
state[12] = counter_low(counter);
state[13] = counter_high(counter);
state[14] = (uint32_t)block_len;
state[15] = (uint32_t)flags;
round_fn(state, &block_words[0], 0);
round_fn(state, &block_words[0], 1);
round_fn(state, &block_words[0], 2);
round_fn(state, &block_words[0], 3);
round_fn(state, &block_words[0], 4);
round_fn(state, &block_words[0], 5);
round_fn(state, &block_words[0], 6);
}
void blake3_compress_in_place_portable(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
uint32_t state[16];
compress_pre(state, cv, block, block_len, counter, flags);
cv[0] = state[0] ^ state[8];
cv[1] = state[1] ^ state[9];
cv[2] = state[2] ^ state[10];
cv[3] = state[3] ^ state[11];
cv[4] = state[4] ^ state[12];
cv[5] = state[5] ^ state[13];
cv[6] = state[6] ^ state[14];
cv[7] = state[7] ^ state[15];
}
void blake3_compress_xof_portable(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]) {
uint32_t state[16];
compress_pre(state, cv, block, block_len, counter, flags);
store32(&out[0 * 4], state[0] ^ state[8]);
store32(&out[1 * 4], state[1] ^ state[9]);
store32(&out[2 * 4], state[2] ^ state[10]);
store32(&out[3 * 4], state[3] ^ state[11]);
store32(&out[4 * 4], state[4] ^ state[12]);
store32(&out[5 * 4], state[5] ^ state[13]);
store32(&out[6 * 4], state[6] ^ state[14]);
store32(&out[7 * 4], state[7] ^ state[15]);
store32(&out[8 * 4], state[8] ^ cv[0]);
store32(&out[9 * 4], state[9] ^ cv[1]);
store32(&out[10 * 4], state[10] ^ cv[2]);
store32(&out[11 * 4], state[11] ^ cv[3]);
store32(&out[12 * 4], state[12] ^ cv[4]);
store32(&out[13 * 4], state[13] ^ cv[5]);
store32(&out[14 * 4], state[14] ^ cv[6]);
store32(&out[15 * 4], state[15] ^ cv[7]);
}
INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
if (blocks == 1) {
block_flags |= flags_end;
}
blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
}
store_cv_words(out, cv);
}
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs > 0) {
hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
flags_end, out);
if (increment_counter) {
counter += 1;
}
inputs += 1;
num_inputs -= 1;
out = &out[BLAKE3_OUT_LEN];
}
}
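Because the portable compressor is the complete primitive, hashing a short input needs nothing else. The following is a minimal sketch, assuming the published BLAKE3 IV and flag constants: for inputs of at most 64 bytes, the whole hash is a single chunk containing a single block, and the first 32 bytes of root output equal the compressed chaining value.

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Declared in blake3_impl.h; BLAKE3_BLOCK_LEN is 64. */
void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[64],
                                       uint8_t block_len, uint64_t counter,
                                       uint8_t flags);

/* Published BLAKE3 constants, restated here to keep the sketch standalone. */
static const uint32_t BLAKE3_IV_[8] = {
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
    0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19};
enum { CHUNK_START_ = 1, CHUNK_END_ = 2, ROOT_ = 8 };

/* Hash an input of at most 64 bytes: one chunk containing one block. */
static void blake3_short_hash(const uint8_t *in, size_t len, uint8_t out[32]) {
  uint32_t cv[8];
  uint8_t block[64] = {0}; /* unused tail bytes must be zero */
  memcpy(cv, BLAKE3_IV_, sizeof(cv));
  memcpy(block, in, len);
  blake3_compress_in_place_portable(cv, block, (uint8_t)len, 0,
                                    CHUNK_START_ | CHUNK_END_ | ROOT_);
  for (size_t i = 0; i < 8; i++) { /* serialize words little-endian */
    out[4 * i + 0] = (uint8_t)(cv[i] >> 0);
    out[4 * i + 1] = (uint8_t)(cv[i] >> 8);
    out[4 * i + 2] = (uint8_t)(cv[i] >> 16);
    out[4 * i + 3] = (uint8_t)(cv[i] >> 24);
  }
}
```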

566
vendor/blake3/c/blake3_sse2.c vendored Normal file
View File

@@ -0,0 +1,566 @@
#include "blake3_impl.h"
#include <immintrin.h>
#define DEGREE 4
#define _mm_shuffle_ps2(a, b, c) \
(_mm_castps_si128( \
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
INLINE __m128i loadu(const uint8_t src[16]) {
return _mm_loadu_si128((const __m128i *)src);
}
INLINE void storeu(__m128i src, uint8_t dest[16]) {
_mm_storeu_si128((__m128i *)dest, src);
}
INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
}
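// 0xB1 is _MM_SHUFFLE(2, 3, 0, 1): swapping the two 16-bit halves of each
// 32-bit lane is exactly a rotation by 16, with no shift/or sequence needed.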
INLINE __m128i rot16(__m128i x) {
return _mm_shufflehi_epi16(_mm_shufflelo_epi16(x, 0xB1), 0xB1);
}
INLINE __m128i rot12(__m128i x) {
return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
}
INLINE __m128i rot8(__m128i x) {
return xorv(_mm_srli_epi32(x, 8), _mm_slli_epi32(x, 32 - 8));
}
INLINE __m128i rot7(__m128i x) {
return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
}
INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
*row0 = addv(addv(*row0, m), *row1);
*row3 = xorv(*row3, *row0);
*row3 = rot16(*row3);
*row2 = addv(*row2, *row3);
*row1 = xorv(*row1, *row2);
*row1 = rot12(*row1);
}
INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
*row0 = addv(addv(*row0, m), *row1);
*row3 = xorv(*row3, *row0);
*row3 = rot8(*row3);
*row2 = addv(*row2, *row3);
*row1 = xorv(*row1, *row2);
*row1 = rot7(*row1);
}
// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}
INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}
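// SSE2 lacks _mm_blend_epi16 (an SSE4.1 instruction), so emulate it: expand
// the bits of imm8 into a per-16-bit-lane mask, then select b where the mask
// is set and a elsewhere.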
INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
__m128i mask = _mm_set1_epi16(imm8);
mask = _mm_and_si128(mask, bits);
mask = _mm_cmpeq_epi16(mask, bits);
return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}
INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags) {
rows[0] = loadu((uint8_t *)&cv[0]);
rows[1] = loadu((uint8_t *)&cv[4]);
rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
rows[3] = set4(counter_low(counter), counter_high(counter),
(uint32_t)block_len, (uint32_t)flags);
__m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
__m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
__m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
__m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
__m128i t0, t1, t2, t3, tt;
// Round 1. The first round permutes the message words from the original
// input order, into the groups that get mixed in parallel.
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 2. This round and all following rounds apply a fixed permutation
// to the message words from the round before.
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 3
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 4
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 5
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 6
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 7
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
}
void blake3_compress_in_place_sse2(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
__m128i rows[4];
compress_pre(rows, cv, block, block_len, counter, flags);
storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
}
void blake3_compress_xof_sse2(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]) {
__m128i rows[4];
compress_pre(rows, cv, block, block_len, counter, flags);
storeu(xorv(rows[0], rows[2]), &out[0]);
storeu(xorv(rows[1], rows[3]), &out[16]);
storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
}
INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
__m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
__m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
__m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
// Interleave 64-bit lanes.
__m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
__m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
__m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
__m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
vecs[0] = abcd_0;
vecs[1] = abcd_1;
vecs[2] = abcd_2;
vecs[3] = abcd_3;
}
INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
size_t block_offset, __m128i out[16]) {
out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
for (size_t i = 0; i < 4; ++i) {
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
}
transpose_vecs(&out[0]);
transpose_vecs(&out[4]);
transpose_vecs(&out[8]);
transpose_vecs(&out[12]);
}
INLINE void load_counters(uint64_t counter, bool increment_counter,
__m128i *out_lo, __m128i *out_hi) {
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
const __m128i add1 = _mm_and_si128(mask, add0);
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
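// Detect unsigned overflow of the 32-bit additions with signed compares by
// flipping the sign bits: lanes where add1 compares greater than l wrapped.
// carry is 0 or -1 per lane, so subtracting it bumps the high word by one.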
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
*out_lo = l;
*out_hi = h;
}
static
void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
__m128i h_vecs[8] = {
set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
};
__m128i counter_low_vec, counter_high_vec;
load_counters(counter, increment_counter, &counter_low_vec,
&counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
if (block + 1 == blocks) {
block_flags |= flags_end;
}
__m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
__m128i block_flags_vec = set1(block_flags);
__m128i msg_vecs[16];
transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
__m128i v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn(v, msg_vecs, 0);
round_fn(v, msg_vecs, 1);
round_fn(v, msg_vecs, 2);
round_fn(v, msg_vecs, 3);
round_fn(v, msg_vecs, 4);
round_fn(v, msg_vecs, 5);
round_fn(v, msg_vecs, 6);
h_vecs[0] = xorv(v[0], v[8]);
h_vecs[1] = xorv(v[1], v[9]);
h_vecs[2] = xorv(v[2], v[10]);
h_vecs[3] = xorv(v[3], v[11]);
h_vecs[4] = xorv(v[4], v[12]);
h_vecs[5] = xorv(v[5], v[13]);
h_vecs[6] = xorv(v[6], v[14]);
h_vecs[7] = xorv(v[7], v[15]);
block_flags = flags;
}
transpose_vecs(&h_vecs[0]);
transpose_vecs(&h_vecs[4]);
// The first four vecs now contain the first half of each output, and the
// second four vecs contain the second half of each output.
storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
}
INLINE void hash_one_sse2(const uint8_t *input, size_t blocks,
const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
if (blocks == 1) {
block_flags |= flags_end;
}
blake3_compress_in_place_sse2(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
}
memcpy(out, cv, BLAKE3_OUT_LEN);
}
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs >= DEGREE) {
blake3_hash4_sse2(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
if (increment_counter) {
counter += DEGREE;
}
inputs += DEGREE;
num_inputs -= DEGREE;
out = &out[DEGREE * BLAKE3_OUT_LEN];
}
while (num_inputs > 0) {
hash_one_sse2(inputs[0], blocks, key, counter, flags, flags_start,
flags_end, out);
if (increment_counter) {
counter += 1;
}
inputs += 1;
num_inputs -= 1;
out = &out[BLAKE3_OUT_LEN];
}
}

2291
vendor/blake3/c/blake3_sse2_x86-64_unix.S vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

560
vendor/blake3/c/blake3_sse41.c vendored Normal file
View File

@@ -0,0 +1,560 @@
#include "blake3_impl.h"
#include <immintrin.h>
#define DEGREE 4
#define _mm_shuffle_ps2(a, b, c) \
(_mm_castps_si128( \
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))
INLINE __m128i loadu(const uint8_t src[16]) {
return _mm_loadu_si128((const __m128i *)src);
}
INLINE void storeu(__m128i src, uint8_t dest[16]) {
_mm_storeu_si128((__m128i *)dest, src);
}
INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
}
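// Compared with the SSE2 version, this file can assume SSSE3's pshufb
// (_mm_shuffle_epi8), so the 16- and 8-bit rotations below are single byte
// shuffles rather than shift/xor pairs.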
INLINE __m128i rot16(__m128i x) {
return _mm_shuffle_epi8(
x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
}
INLINE __m128i rot12(__m128i x) {
return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
}
INLINE __m128i rot8(__m128i x) {
return _mm_shuffle_epi8(
x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
}
INLINE __m128i rot7(__m128i x) {
return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
}
INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
*row0 = addv(addv(*row0, m), *row1);
*row3 = xorv(*row3, *row0);
*row3 = rot16(*row3);
*row2 = addv(*row2, *row3);
*row1 = xorv(*row1, *row2);
*row1 = rot12(*row1);
}
INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
__m128i m) {
*row0 = addv(addv(*row0, m), *row1);
*row3 = xorv(*row3, *row0);
*row3 = rot8(*row3);
*row2 = addv(*row2, *row3);
*row1 = xorv(*row1, *row2);
*row1 = rot7(*row1);
}
// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}
INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}
INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags) {
rows[0] = loadu((uint8_t *)&cv[0]);
rows[1] = loadu((uint8_t *)&cv[4]);
rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
rows[3] = set4(counter_low(counter), counter_high(counter),
(uint32_t)block_len, (uint32_t)flags);
__m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
__m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
__m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
__m128i m3 = loadu(&block[sizeof(__m128i) * 3]);
__m128i t0, t1, t2, t3, tt;
// Round 1. The first round permutes the message words from the original
// input order, into the groups that get mixed in parallel.
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); // 6 4 2 0
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); // 7 5 3 1
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10 8
t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3)); // 12 10 8 14
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11 9
t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3)); // 13 11 9 15
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 2. This round and all following rounds apply a fixed permutation
// to the message words from the round before.
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 3
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 4
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 5
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 6
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 7
t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
diagonalize(&rows[0], &rows[2], &rows[3]);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
undiagonalize(&rows[0], &rows[2], &rows[3]);
}
void blake3_compress_in_place_sse41(uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags) {
__m128i rows[4];
compress_pre(rows, cv, block, block_len, counter, flags);
storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
}
void blake3_compress_xof_sse41(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter,
uint8_t flags, uint8_t out[64]) {
__m128i rows[4];
compress_pre(rows, cv, block, block_len, counter, flags);
storeu(xorv(rows[0], rows[2]), &out[0]);
storeu(xorv(rows[1], rows[3]), &out[16]);
storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
}
INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
v[0] = addv(v[0], v[4]);
v[1] = addv(v[1], v[5]);
v[2] = addv(v[2], v[6]);
v[3] = addv(v[3], v[7]);
v[12] = xorv(v[12], v[0]);
v[13] = xorv(v[13], v[1]);
v[14] = xorv(v[14], v[2]);
v[15] = xorv(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = addv(v[8], v[12]);
v[9] = addv(v[9], v[13]);
v[10] = addv(v[10], v[14]);
v[11] = addv(v[11], v[15]);
v[4] = xorv(v[4], v[8]);
v[5] = xorv(v[5], v[9]);
v[6] = xorv(v[6], v[10]);
v[7] = xorv(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
v[0] = addv(v[0], v[5]);
v[1] = addv(v[1], v[6]);
v[2] = addv(v[2], v[7]);
v[3] = addv(v[3], v[4]);
v[15] = xorv(v[15], v[0]);
v[12] = xorv(v[12], v[1]);
v[13] = xorv(v[13], v[2]);
v[14] = xorv(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = addv(v[10], v[15]);
v[11] = addv(v[11], v[12]);
v[8] = addv(v[8], v[13]);
v[9] = addv(v[9], v[14]);
v[5] = xorv(v[5], v[10]);
v[6] = xorv(v[6], v[11]);
v[7] = xorv(v[7], v[8]);
v[4] = xorv(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
__m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
__m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
__m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
// Interleave 64-bit lanes.
__m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
__m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
__m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
__m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
vecs[0] = abcd_0;
vecs[1] = abcd_1;
vecs[2] = abcd_2;
vecs[3] = abcd_3;
}
INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
size_t block_offset, __m128i out[16]) {
out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
for (size_t i = 0; i < 4; ++i) {
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
}
transpose_vecs(&out[0]);
transpose_vecs(&out[4]);
transpose_vecs(&out[8]);
transpose_vecs(&out[12]);
}
INLINE void load_counters(uint64_t counter, bool increment_counter,
__m128i *out_lo, __m128i *out_hi) {
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
const __m128i add1 = _mm_and_si128(mask, add0);
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
*out_lo = l;
*out_hi = h;
}
static
void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
const uint32_t key[8], uint64_t counter,
bool increment_counter, uint8_t flags,
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
__m128i h_vecs[8] = {
set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
};
__m128i counter_low_vec, counter_high_vec;
load_counters(counter, increment_counter, &counter_low_vec,
&counter_high_vec);
uint8_t block_flags = flags | flags_start;
for (size_t block = 0; block < blocks; block++) {
if (block + 1 == blocks) {
block_flags |= flags_end;
}
__m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
__m128i block_flags_vec = set1(block_flags);
__m128i msg_vecs[16];
transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
__m128i v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn(v, msg_vecs, 0);
round_fn(v, msg_vecs, 1);
round_fn(v, msg_vecs, 2);
round_fn(v, msg_vecs, 3);
round_fn(v, msg_vecs, 4);
round_fn(v, msg_vecs, 5);
round_fn(v, msg_vecs, 6);
h_vecs[0] = xorv(v[0], v[8]);
h_vecs[1] = xorv(v[1], v[9]);
h_vecs[2] = xorv(v[2], v[10]);
h_vecs[3] = xorv(v[3], v[11]);
h_vecs[4] = xorv(v[4], v[12]);
h_vecs[5] = xorv(v[5], v[13]);
h_vecs[6] = xorv(v[6], v[14]);
h_vecs[7] = xorv(v[7], v[15]);
block_flags = flags;
}
transpose_vecs(&h_vecs[0]);
transpose_vecs(&h_vecs[4]);
// The first four vecs now contain the first half of each output, and the
// second four vecs contain the second half of each output.
storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
}
INLINE void hash_one_sse41(const uint8_t *input, size_t blocks,
const uint32_t key[8], uint64_t counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
uint32_t cv[8];
memcpy(cv, key, BLAKE3_KEY_LEN);
uint8_t block_flags = flags | flags_start;
while (blocks > 0) {
if (blocks == 1) {
block_flags |= flags_end;
}
blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter,
block_flags);
input = &input[BLAKE3_BLOCK_LEN];
blocks -= 1;
block_flags = flags;
}
memcpy(out, cv, BLAKE3_OUT_LEN);
}
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out) {
while (num_inputs >= DEGREE) {
blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags,
flags_start, flags_end, out);
if (increment_counter) {
counter += DEGREE;
}
inputs += DEGREE;
num_inputs -= DEGREE;
out = &out[DEGREE * BLAKE3_OUT_LEN];
}
while (num_inputs > 0) {
hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start,
flags_end, out);
if (increment_counter) {
counter += 1;
}
inputs += 1;
num_inputs -= 1;
out = &out[BLAKE3_OUT_LEN];
}
}

2028
vendor/blake3/c/blake3_sse41_x86-64_unix.S vendored Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

37
vendor/blake3/c/blake3_tbb.cpp vendored Normal file
View File

@@ -0,0 +1,37 @@
#include <cstddef>
#include <cstdint>
#include <oneapi/tbb/parallel_invoke.h>
#include "blake3_impl.h"
static_assert(TBB_USE_EXCEPTIONS == 0,
"This file should be compiled with C++ exceptions disabled.");
extern "C" void blake3_compress_subtree_wide_join_tbb(
// shared params
const uint32_t key[8], uint8_t flags, bool use_tbb,
// left-hand side params
const uint8_t *l_input, size_t l_input_len, uint64_t l_chunk_counter,
uint8_t *l_cvs, size_t *l_n,
// right-hand side params
const uint8_t *r_input, size_t r_input_len, uint64_t r_chunk_counter,
uint8_t *r_cvs, size_t *r_n) noexcept {
if (!use_tbb) {
*l_n = blake3_compress_subtree_wide(l_input, l_input_len, key,
l_chunk_counter, flags, l_cvs, use_tbb);
*r_n = blake3_compress_subtree_wide(r_input, r_input_len, key,
r_chunk_counter, flags, r_cvs, use_tbb);
return;
}
oneapi::tbb::parallel_invoke(
[=]() {
*l_n = blake3_compress_subtree_wide(
l_input, l_input_len, key, l_chunk_counter, flags, l_cvs, use_tbb);
},
[=]() {
*r_n = blake3_compress_subtree_wide(
r_input, r_input_len, key, r_chunk_counter, flags, r_cvs, use_tbb);
});
}


@@ -0,0 +1,235 @@
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
if(BUILD_SHARED_LIBS)
message(FATAL_ERROR "BUILD_SHARED_LIBS is incompatible with BLAKE3_TESTING_CI")
endif()
include(CTest)
# Declare a testing specific variant of the `blake3` library target.
#
# We use a separate library target in order to be able to perform compilation with various
# combinations of features which are too noisy to specify in the main CMake config as options for
# the normal `blake3` target.
#
# Initially this target has no properties but eventually we will populate them by copying all of the
# relevant properties from the normal `blake3` target.
add_library(blake3-testing
blake3.c
blake3_dispatch.c
blake3_portable.c
)
if(BLAKE3_USE_TBB AND TBB_FOUND)
target_sources(blake3-testing
PRIVATE
blake3_tbb.cpp)
endif()
if(BLAKE3_SIMD_TYPE STREQUAL "amd64-asm")
# Conditionally add amd64 asm files to `blake3-testing` sources
if(MSVC)
if(NOT BLAKE3_NO_AVX2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_msvc.asm)
endif()
if(NOT BLAKE3_NO_AVX512)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_windows_msvc.asm)
endif()
if(NOT BLAKE3_NO_SSE2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_windows_msvc.asm)
endif()
if(NOT BLAKE3_NO_SSE41)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_windows_msvc.asm)
endif()
elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU"
OR CMAKE_C_COMPILER_ID STREQUAL "Clang"
OR CMAKE_C_COMPILER_ID STREQUAL "AppleClang")
if (WIN32)
if(NOT BLAKE3_NO_AVX2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_windows_gnu.S)
endif()
if(NOT BLAKE3_NO_AVX512)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_windows_gnu.S)
endif()
if(NOT BLAKE3_NO_SSE2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_windows_gnu.S)
endif()
if(NOT BLAKE3_NO_SSE41)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_windows_gnu.S)
endif()
elseif(UNIX)
if(NOT BLAKE3_NO_AVX2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx2_x86-64_unix.S)
endif()
if(NOT BLAKE3_NO_AVX512)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_avx512_x86-64_unix.S)
endif()
if(NOT BLAKE3_NO_SSE2)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse2_x86-64_unix.S)
endif()
if(NOT BLAKE3_NO_SSE41)
list(APPEND BLAKE3_TESTING_AMD64_ASM_SOURCES blake3_sse41_x86-64_unix.S)
endif()
endif()
endif()
target_sources(blake3-testing PRIVATE ${BLAKE3_TESTING_AMD64_ASM_SOURCES})
elseif(BLAKE3_SIMD_TYPE STREQUAL "x86-intrinsics")
# Conditionally add amd64 C files to `blake3-testing` sources
if (NOT DEFINED BLAKE3_CFLAGS_SSE2
OR NOT DEFINED BLAKE3_CFLAGS_SSE4.1
OR NOT DEFINED BLAKE3_CFLAGS_AVX2
OR NOT DEFINED BLAKE3_CFLAGS_AVX512)
message(WARNING "BLAKE3_SIMD_TYPE is set to 'x86-intrinsics' but no compiler flags are available for the target architecture.")
else()
set(BLAKE3_SIMD_X86_INTRINSICS ON)
endif()
if(NOT BLAKE3_NO_AVX2)
target_sources(blake3-testing PRIVATE blake3_avx2.c)
set_source_files_properties(blake3_avx2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX2}")
endif()
if(NOT BLAKE3_NO_AVX512)
target_sources(blake3-testing PRIVATE blake3_avx512.c)
set_source_files_properties(blake3_avx512.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_AVX512}")
endif()
if(NOT BLAKE3_NO_SSE2)
target_sources(blake3-testing PRIVATE blake3_sse2.c)
set_source_files_properties(blake3_sse2.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE2}")
endif()
if(NOT BLAKE3_NO_SSE41)
target_sources(blake3-testing PRIVATE blake3_sse41.c)
set_source_files_properties(blake3_sse41.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_SSE4.1}")
endif()
elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics")
# Conditionally add neon C files to `blake3-testing` sources
target_sources(blake3-testing PRIVATE
blake3_neon.c
)
target_compile_definitions(blake3-testing PRIVATE
BLAKE3_USE_NEON=1
)
if (DEFINED BLAKE3_CFLAGS_NEON)
set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")
endif()
elseif(BLAKE3_SIMD_TYPE STREQUAL "none")
# Disable neon if simd type is "none". We check for individual amd64 features further below.
target_compile_definitions(blake3-testing PRIVATE
BLAKE3_USE_NEON=0
)
endif()
if(BLAKE3_NO_AVX2)
target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_AVX2)
endif()
if(BLAKE3_NO_AVX512)
target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_AVX512)
endif()
if(BLAKE3_NO_SSE2)
target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_SSE2)
endif()
if(BLAKE3_NO_SSE41)
target_compile_definitions(blake3-testing PRIVATE BLAKE3_NO_SSE41)
endif()
target_compile_definitions(blake3-testing PUBLIC BLAKE3_TESTING)
get_target_property(BLAKE3_COMPILE_DEFINITIONS blake3 COMPILE_DEFINITIONS)
if(BLAKE3_COMPILE_DEFINITIONS)
target_compile_definitions(blake3-testing PUBLIC
${BLAKE3_COMPILE_DEFINITIONS})
endif()
get_target_property(BLAKE3_COMPILE_OPTIONS blake3 COMPILE_OPTIONS)
if(BLAKE3_COMPILE_OPTIONS)
target_compile_options(blake3-testing PRIVATE
${BLAKE3_COMPILE_OPTIONS}
-O3
-Wall
-Wextra
-pedantic
-fstack-protector-strong
-D_FORTIFY_SOURCE=2
-fPIE
-fvisibility=hidden
-fsanitize=address,undefined
)
endif()
get_target_property(BLAKE3_INCLUDE_DIRECTORIES blake3 INCLUDE_DIRECTORIES)
if(BLAKE3_INCLUDE_DIRECTORIES)
target_include_directories(blake3-testing PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
)
endif()
get_target_property(BLAKE3_LINK_LIBRARIES blake3 LINK_LIBRARIES)
if(BLAKE3_LINK_LIBRARIES)
target_link_libraries(blake3-testing PRIVATE ${BLAKE3_LINK_LIBRARIES})
endif()
get_target_property(BLAKE3_LINK_OPTIONS blake3 LINK_OPTIONS)
if(BLAKE3_LINK_OPTIONS)
target_link_options(blake3-testing PRIVATE
${BLAKE3_LINK_OPTIONS}
-fsanitize=address,undefined
-pie
-Wl,-z,relro,-z,now
)
endif()
# test asm target
add_executable(blake3-asm-test
main.c
)
set_target_properties(blake3-asm-test PROPERTIES
OUTPUT_NAME blake3
RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR})
target_link_libraries(blake3-asm-test PRIVATE blake3-testing)
target_compile_definitions(blake3-asm-test PRIVATE BLAKE3_TESTING)
target_compile_options(blake3-asm-test PRIVATE
-O3
-Wall
-Wextra
-pedantic
-fstack-protector-strong
-D_FORTIFY_SOURCE=2
-fPIE
-fvisibility=hidden
-fsanitize=address,undefined
)
target_link_options(blake3-asm-test PRIVATE
-fsanitize=address,undefined
-pie
-Wl,-z,relro,-z,now
)
add_test(NAME blake3-testing
COMMAND "${CMAKE_CTEST_COMMAND}"
--verbose
--extra-verbose
--build-and-test "${CMAKE_SOURCE_DIR}" "${CMAKE_BINARY_DIR}"
--build-generator "${CMAKE_GENERATOR}"
--build-makeprogram "${CMAKE_MAKE_PROGRAM}"
--build-project libblake3
--build-target blake3-asm-test
--build-options
--fresh
"-DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}"
"-DBLAKE3_TESTING=${BLAKE3_TESTING}"
"-DBLAKE3_TESTING_CI=${BLAKE3_TESTING_CI}"
"-DBLAKE3_USE_TBB=${BLAKE3_USE_TBB}"
"-DBLAKE3_SIMD_TYPE=${BLAKE3_SIMD_TYPE}"
"-DBLAKE3_NO_SSE2=${BLAKE3_NO_SSE2}"
"-DBLAKE3_NO_SSE41=${BLAKE3_NO_SSE41}"
"-DBLAKE3_NO_AVX2=${BLAKE3_NO_AVX2}"
"-DBLAKE3_NO_AVX512=${BLAKE3_NO_AVX512}"
--test-command
"${CMAKE_SOURCE_DIR}/test.py"
)


@@ -0,0 +1,13 @@
if(NOT WIN32)
add_executable(blake3-example
example.c)
target_link_libraries(blake3-example PRIVATE blake3)
install(TARGETS blake3-example)
if(BLAKE3_USE_TBB)
add_executable(blake3-example-tbb
example_tbb.c)
target_link_libraries(blake3-example-tbb PRIVATE blake3)
install(TARGETS blake3-example-tbb)
endif()
endif()


@@ -0,0 +1,3 @@
if(BLAKE3_TESTING_CI)
include(BLAKE3/ContinuousIntegration)
endif()


@@ -0,0 +1,3 @@
if(BLAKE3_USE_TBB)
add_subdirectory(tbb)
endif()


@@ -0,0 +1,28 @@
find_package(TBB 2021.11.0 QUIET)
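# If no system TBB (>= 2021.11) was found above, optionally fetch and build
# oneTBB from source via FetchContent; this path requires CMake >= 3.11 and
# BLAKE3_FETCH_TBB=ON.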
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.11)
include(FetchContent)
if(NOT TBB_FOUND AND BLAKE3_FETCH_TBB)
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_EXTENSIONS ON)
option(TBB_TEST "" OFF)
option(TBBMALLOC_BUILD "" OFF)
mark_as_advanced(TBB_TEST)
mark_as_advanced(TBBMALLOC_BUILD)
FetchContent_Declare(
TBB
GIT_REPOSITORY https://github.com/uxlfoundation/oneTBB
GIT_TAG 0c0ff192a2304e114bc9e6557582dfba101360ff # v2022.0.0
GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(TBB)
endif()
endif()

36
vendor/blake3/c/example.c vendored Normal file

@@ -0,0 +1,36 @@
#include "blake3.h"
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
int main(void) {
// Initialize the hasher.
blake3_hasher hasher;
blake3_hasher_init(&hasher);
// Read input bytes from stdin.
unsigned char buf[65536];
while (1) {
ssize_t n = read(STDIN_FILENO, buf, sizeof(buf));
if (n > 0) {
blake3_hasher_update(&hasher, buf, n);
} else if (n == 0) {
break; // end of file
} else {
fprintf(stderr, "read failed: %s\n", strerror(errno));
return 1;
}
}
// Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
uint8_t output[BLAKE3_OUT_LEN];
blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
// Print the hash as hexadecimal.
for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
printf("%02x", output[i]);
}
printf("\n");
return 0;
}

57
vendor/blake3/c/example_tbb.c vendored Normal file

@@ -0,0 +1,57 @@
#include "blake3.h"
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
int main(int argc, char **argv) {
// For each filepath argument, memory map it and hash it.
for (int i = 1; i < argc; i++) {
// Open and memory map the file.
int fd = open(argv[i], O_RDONLY);
if (fd == -1) {
fprintf(stderr, "open failed: %s\n", strerror(errno));
return 1;
}
struct stat statbuf;
if (fstat(fd, &statbuf) == -1) {
fprintf(stderr, "stat failed: %s\n", strerror(errno));
return 1;
}
void *mapped = mmap(NULL, statbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (mapped == MAP_FAILED) {
fprintf(stderr, "mmap failed: %s\n", strerror(errno));
return 1;
}
// Initialize the hasher.
blake3_hasher hasher;
blake3_hasher_init(&hasher);
// Hash the mapped file using multiple threads.
blake3_hasher_update_tbb(&hasher, mapped, statbuf.st_size);
// Unmap and close the file.
if (munmap(mapped, statbuf.st_size) == -1) {
fprintf(stderr, "munmap failed: %s\n", strerror(errno));
return 1;
}
if (close(fd) == -1) {
fprintf(stderr, "close failed: %s\n", strerror(errno));
return 1;
}
// Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
uint8_t output[BLAKE3_OUT_LEN];
blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
// Print the hash as hexadecimal.
for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
printf("%02x", output[i]);
}
printf("\n");
}
}

12
vendor/blake3/c/libblake3.pc.in vendored Normal file

@@ -0,0 +1,12 @@
prefix="@CMAKE_INSTALL_PREFIX@"
exec_prefix="${prefix}"
libdir="@PKG_CONFIG_INSTALL_LIBDIR@"
includedir="@PKG_CONFIG_INSTALL_INCLUDEDIR@"
Name: @PROJECT_NAME@
Description: @PROJECT_DESCRIPTION@
Version: @PROJECT_VERSION@
Requires: @PKG_CONFIG_REQUIRES@
Libs: -L"${libdir}" -lblake3 @PKG_CONFIG_LIBS@
Cflags: -I"${includedir}" @PKG_CONFIG_CFLAGS@
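# Typical usage, assuming the generated libblake3.pc is installed on the
# pkg-config search path:
#   cc example.c $(pkg-config --cflags --libs libblake3)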

166
vendor/blake3/c/main.c vendored Normal file

@@ -0,0 +1,166 @@
/*
* This main file is intended for testing via `make test`. It does not build in
* other settings. See README.md in this directory for examples of how to build
* C code.
*/
#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "blake3.h"
#include "blake3_impl.h"
#define HASH_MODE 0
#define KEYED_HASH_MODE 1
#define DERIVE_KEY_MODE 2
static void hex_char_value(uint8_t c, uint8_t *value, bool *valid) {
if ('0' <= c && c <= '9') {
*value = c - '0';
*valid = true;
} else if ('a' <= c && c <= 'f') {
*value = 10 + c - 'a';
*valid = true;
} else {
*valid = false;
}
}
static int parse_key(char *hex_key, uint8_t out[BLAKE3_KEY_LEN]) {
size_t hex_len = strlen(hex_key);
if (hex_len != 64) {
fprintf(stderr, "Expected a 64-char hexadecimal key, got %zu chars.\n",
hex_len);
return 1;
}
for (size_t i = 0; i < 64; i++) {
uint8_t value;
bool valid;
hex_char_value(hex_key[i], &value, &valid);
if (!valid) {
fprintf(stderr, "Invalid hex char.\n");
return 1;
}
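    /* Even-indexed hex digits supply the high nibble of each output byte;
     * odd-indexed digits add the low nibble. */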
if (i % 2 == 0) {
out[i / 2] = 0;
value <<= 4;
}
out[i / 2] += value;
}
return 0;
}
/* A little repetition here */
enum cpu_feature {
SSE2 = 1 << 0,
SSSE3 = 1 << 1,
SSE41 = 1 << 2,
AVX = 1 << 3,
AVX2 = 1 << 4,
AVX512F = 1 << 5,
AVX512VL = 1 << 6,
/* ... */
UNDEFINED = 1 << 30
};
extern enum cpu_feature g_cpu_features;
enum cpu_feature get_cpu_features(void);
int main(int argc, char **argv) {
size_t out_len = BLAKE3_OUT_LEN;
uint8_t key[BLAKE3_KEY_LEN];
char *context = "";
uint8_t mode = HASH_MODE;
while (argc > 1) {
if (argc <= 2) {
fprintf(stderr, "Odd number of arguments.\n");
return 1;
}
if (strcmp("--length", argv[1]) == 0) {
char *endptr = NULL;
errno = 0;
unsigned long long out_len_ll = strtoull(argv[2], &endptr, 10);
if (errno != 0 || out_len_ll > SIZE_MAX || endptr == argv[2] ||
*endptr != 0) {
fprintf(stderr, "Bad length argument.\n");
return 1;
}
out_len = (size_t)out_len_ll;
} else if (strcmp("--keyed", argv[1]) == 0) {
mode = KEYED_HASH_MODE;
int ret = parse_key(argv[2], key);
if (ret != 0) {
return ret;
}
} else if (strcmp("--derive-key", argv[1]) == 0) {
mode = DERIVE_KEY_MODE;
context = argv[2];
} else {
fprintf(stderr, "Unknown flag.\n");
return 1;
}
argc -= 2;
argv += 2;
}
/*
* We're going to hash the input multiple times, so we need to buffer it all.
* This is just for test cases, so go ahead and assume that the input is less
* than 1 MiB.
*/
size_t buf_capacity = 1 << 20;
uint8_t *buf = malloc(buf_capacity);
assert(buf != NULL);
size_t buf_len = 0;
while (1) {
size_t n = fread(&buf[buf_len], 1, buf_capacity - buf_len, stdin);
if (n == 0) {
break;
}
buf_len += n;
assert(buf_len < buf_capacity);
}
const int mask = get_cpu_features();
int feature = 0;
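  /*
   * The update step `feature = (feature - mask) & mask` enumerates every
   * subset of the detected CPU feature bits, so the input is hashed once for
   * each combination of SIMD implementations before the sequence returns to 0.
   */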
do {
fprintf(stderr, "Testing 0x%08X\n", feature);
g_cpu_features = feature;
blake3_hasher hasher;
switch (mode) {
case HASH_MODE:
blake3_hasher_init(&hasher);
break;
case KEYED_HASH_MODE:
blake3_hasher_init_keyed(&hasher, key);
break;
case DERIVE_KEY_MODE:
blake3_hasher_init_derive_key(&hasher, context);
break;
default:
abort();
}
blake3_hasher_update(&hasher, buf, buf_len);
/* TODO: An incremental output reader API to avoid this allocation. */
uint8_t *out = malloc(out_len);
if (out_len > 0 && out == NULL) {
fprintf(stderr, "malloc() failed.\n");
return 1;
}
blake3_hasher_finalize(&hasher, out, out_len);
for (size_t i = 0; i < out_len; i++) {
printf("%02x", out[i]);
}
printf("\n");
free(out);
feature = (feature - mask) & mask;
} while (feature != 0);
free(buf);
return 0;
}

97
vendor/blake3/c/test.py vendored Executable file

@@ -0,0 +1,97 @@
#! /usr/bin/env python3
from binascii import hexlify
import json
from os import path
import subprocess
HERE = path.dirname(__file__)
TEST_VECTORS_PATH = path.join(HERE, "..", "test_vectors", "test_vectors.json")
TEST_VECTORS = json.load(open(TEST_VECTORS_PATH))
def run_blake3(args, input):
output = subprocess.run([path.join(HERE, "blake3")] + args,
input=input,
stdout=subprocess.PIPE,
check=True)
return output.stdout.decode().strip()
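# The test binary prints one hash per CPU-feature subset it exercises (see
# main.c above), so the assertions in main() below check every line of the
# output.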
# Fill the input with a repeating byte pattern. We use a cycle length of 251,
# because that's the largest prime number less than 256. This makes it unlikely
# that swapping any two adjacent input blocks or chunks will give the same
# answer.
def make_test_input(length):
i = 0
buf = bytearray()
while len(buf) < length:
buf.append(i)
i = (i + 1) % 251
return buf
def main():
for case in TEST_VECTORS["cases"]:
input_len = case["input_len"]
input = make_test_input(input_len)
hex_key = hexlify(TEST_VECTORS["key"].encode())
context_string = TEST_VECTORS["context_string"]
expected_hash_xof = case["hash"]
expected_hash = expected_hash_xof[:64]
expected_keyed_hash_xof = case["keyed_hash"]
expected_keyed_hash = expected_keyed_hash_xof[:64]
expected_derive_key_xof = case["derive_key"]
expected_derive_key = expected_derive_key_xof[:64]
# Test the default hash.
test_hash = run_blake3([], input)
for line in test_hash.splitlines():
assert expected_hash == line, \
"hash({}): {} != {}".format(input_len, expected_hash, line)
# Test the extended hash.
xof_len = len(expected_hash_xof) // 2
test_hash_xof = run_blake3(["--length", str(xof_len)], input)
for line in test_hash_xof.splitlines():
assert expected_hash_xof == line, \
"hash_xof({}): {} != {}".format(
input_len, expected_hash_xof, line)
# Test the default keyed hash.
test_keyed_hash = run_blake3(["--keyed", hex_key], input)
for line in test_keyed_hash.splitlines():
assert expected_keyed_hash == line, \
"keyed_hash({}): {} != {}".format(
input_len, expected_keyed_hash, line)
# Test the extended keyed hash.
xof_len = len(expected_keyed_hash_xof) // 2
test_keyed_hash_xof = run_blake3(
["--keyed", hex_key, "--length",
str(xof_len)], input)
for line in test_keyed_hash_xof.splitlines():
assert expected_keyed_hash_xof == line, \
"keyed_hash_xof({}): {} != {}".format(
input_len, expected_keyed_hash_xof, line)
# Test the default derive key.
test_derive_key = run_blake3(["--derive-key", context_string], input)
for line in test_derive_key.splitlines():
assert expected_derive_key == line, \
"derive_key({}): {} != {}".format(
input_len, expected_derive_key, line)
# Test the extended derive key.
xof_len = len(expected_derive_key_xof) // 2
test_derive_key_xof = run_blake3(
["--derive-key", context_string, "--length",
str(xof_len)], input)
for line in test_derive_key_xof.splitlines():
assert expected_derive_key_xof == line, \
"derive_key_xof({}): {} != {}".format(
input_len, expected_derive_key_xof, line)
if __name__ == "__main__":
main()

70
vendor/blake3/media/B3.svg vendored Normal file

@@ -0,0 +1,70 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="13.356165mm"
height="7.1437497mm"
viewBox="0 0 13.356165 7.1437497"
version="1.1"
id="svg8"
sodipodi:docname="B3.svg"
inkscape:version="0.92.4 5da689c313, 2019-01-14">
<defs
id="defs2" />
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="4"
inkscape:cx="72.73328"
inkscape:cy="-34.835127"
inkscape:document-units="mm"
inkscape:current-layer="layer1"
showgrid="false"
inkscape:window-width="1920"
inkscape:window-height="1016"
inkscape:window-x="0"
inkscape:window-y="27"
inkscape:window-maximized="1" />
<metadata
id="metadata5">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
<dc:title />
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-24.441005,-113.52518)">
<g
aria-label="B3"
style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
id="text868">
<path
d="m 28.176921,113.52518 q 0.635,0 1.0795,0.14817 0.455084,0.13758 0.740834,0.40216 0.296333,0.254 0.433916,0.61384 0.137584,0.35983 0.137584,0.79375 0,0.62441 -0.264584,1.00541 -0.254,0.381 -0.762,0.58209 0.508,0.21166 0.783167,0.61383 0.275167,0.39158 0.275167,1.016 0,0.43392 -0.137584,0.79375 -0.137583,0.35983 -0.433916,0.62442 -0.28575,0.254 -0.740834,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.174999 q -0.592667,0 -0.592667,-0.58208 v -5.97959 q 0,-0.58208 0.592667,-0.58208 z m -2.508249,5.78908 q 0,0.11642 0.137583,0.11642 h 2.434166 q 0.5715,0 0.836084,-0.24342 0.264583,-0.24341 0.264583,-0.68791 0,-0.92075 -1.100667,-0.92075 h -2.571749 z m 0,-2.77283 h 2.539999 q 1.100667,0 1.100667,-0.85725 0,-0.42333 -0.264583,-0.67733 -0.254,-0.254 -0.8255,-0.254 h -2.413 q -0.137583,0 -0.137583,0.127 z"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
id="path814" />
<path
d="m 35.38417,113.52518 q 0.635,0 1.0795,0.14817 0.455083,0.13758 0.740833,0.40216 0.296333,0.254 0.433917,0.60325 0.137583,0.34925 0.137583,0.762 0,0.635 -0.264583,1.03717 -0.254,0.39158 -0.751417,0.60325 0.508,0.21167 0.772583,0.62442 0.264584,0.40216 0.264584,1.04775 0,0.40216 -0.137584,0.75141 -0.137583,0.34925 -0.423333,0.61384 -0.28575,0.254 -0.740833,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.608917 v -1.24883 h 3.608917 q 0.550333,0 0.814917,-0.23284 0.264583,-0.24341 0.264583,-0.67733 0,-0.85725 -1.090083,-0.85725 h -2.201334 v -1.13242 h 2.169584 q 0.550333,0 0.814916,-0.20108 0.275167,-0.21167 0.275167,-0.65617 0,-0.40216 -0.254,-0.64558 -0.254,-0.24342 -0.8255,-0.24342 h -3.566583 v -1.24883 z"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
id="path816" />
</g>
</g>
</svg>


85
vendor/blake3/media/BLAKE3.svg vendored Normal file

@@ -0,0 +1,85 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:cc="http://creativecommons.org/ns#"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
width="43.063534mm"
height="7.2707496mm"
viewBox="0 0 43.063534 7.2707496"
version="1.1"
id="svg8"
sodipodi:docname="BLAKE3.svg"
inkscape:version="0.92.4 5da689c313, 2019-01-14">
<defs
id="defs2" />
<sodipodi:namedview
id="base"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageopacity="0.0"
inkscape:pageshadow="2"
inkscape:zoom="4"
inkscape:cx="72.73328"
inkscape:cy="-34.835127"
inkscape:document-units="mm"
inkscape:current-layer="layer1"
showgrid="false"
inkscape:window-width="1920"
inkscape:window-height="1016"
inkscape:window-x="0"
inkscape:window-y="27"
inkscape:window-maximized="1" />
<metadata
id="metadata5">
<rdf:RDF>
<cc:Work
rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
</cc:Work>
</rdf:RDF>
</metadata>
<g
inkscape:label="Layer 1"
inkscape:groupmode="layer"
id="layer1"
transform="translate(-24.441005,-113.39818)">
<g
aria-label="BLAKE3"
style="font-style:normal;font-weight:normal;font-size:10.58333302px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
id="text868">
<path
d="m 28.176921,113.52518 q 0.635,0 1.0795,0.14817 0.455084,0.13758 0.740834,0.40216 0.296333,0.254 0.433916,0.61384 0.137584,0.35983 0.137584,0.79375 0,0.62441 -0.264584,1.00541 -0.254,0.381 -0.762,0.58209 0.508,0.21166 0.783167,0.61383 0.275167,0.39158 0.275167,1.016 0,0.43392 -0.137584,0.79375 -0.137583,0.35983 -0.433916,0.62442 -0.28575,0.254 -0.740834,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.174999 q -0.592667,0 -0.592667,-0.58208 v -5.97959 q 0,-0.58208 0.592667,-0.58208 z m -2.508249,5.78908 q 0,0.11642 0.137583,0.11642 h 2.434166 q 0.5715,0 0.836084,-0.24342 0.264583,-0.24341 0.264583,-0.68791 0,-0.92075 -1.100667,-0.92075 h -2.571749 z m 0,-2.77283 h 2.539999 q 1.100667,0 1.100667,-0.85725 0,-0.42333 -0.264583,-0.67733 -0.254,-0.254 -0.8255,-0.254 h -2.413 q -0.137583,0 -0.137583,0.127 z"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
id="path814" />
<path
d="m 33.22517,113.52518 v 4.66725 q 0,0.254 0.0635,0.48683 0.07408,0.22225 0.243417,0.39159 0.169333,0.15875 0.4445,0.254 0.28575,0.0953 0.709083,0.0953 h 2.772833 v 1.24883 h -2.846916 q -0.709084,0 -1.217084,-0.17992 -0.497416,-0.1905 -0.814916,-0.51858 -0.3175,-0.32808 -0.465667,-0.77258 -0.137583,-0.45509 -0.137583,-0.99484 v -4.67783 z"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
id="path816" />
<path
d="M 39.342334,120.66893 H 37.9665 l 2.50825,-6.35 q 0.201084,-0.508 0.560917,-0.70908 0.370417,-0.21167 0.941917,-0.21167 0.560916,0 0.92075,0.21167 0.370416,0.20108 0.560916,0.70908 l 2.413,6.35 h -1.386416 l -2.169584,-5.74675 q -0.09525,-0.24342 -0.34925,-0.24342 -0.254,0 -0.359833,0.24342 z"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
id="path818" />
<path
d="m 48.179401,113.52518 v 3.02683 h 0.687917 q 0.455083,0 0.740833,-0.0212 0.296333,-0.0318 0.486833,-0.127 0.1905,-0.0953 0.3175,-0.26459 0.137584,-0.17991 0.28575,-0.47625 l 1.090084,-2.13783 h 1.344083 l -1.121833,2.2225 q -0.243417,0.47625 -0.518584,0.79375 -0.275166,0.3175 -0.719666,0.508 0.254,0.0635 0.4445,0.17992 0.1905,0.10583 0.34925,0.27516 0.169333,0.15875 0.3175,0.39159 0.148166,0.22225 0.306916,0.52916 l 1.153584,2.24367 h -1.397 l -1.090084,-2.11667 q -0.148166,-0.28575 -0.28575,-0.45508 -0.137583,-0.16933 -0.34925,-0.26458 -0.211666,-0.0952 -0.529166,-0.11642 -0.3175,-0.0317 -0.8255,-0.0317 h -0.687917 v 2.9845 h -1.248833 v -7.14375 z"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
id="path820" />
<path
d="m 60.127965,113.52518 v 1.24883 h -3.577166 q -0.5715,0 -0.8255,0.24342 -0.254,0.24342 -0.254,0.65617 0,0.84666 1.090083,0.84666 h 3.513667 v 1.13242 h -3.545417 q -1.090083,0 -1.090083,0.86783 0,0.42334 0.264583,0.66675 0.264583,0.23284 0.814917,0.23284 h 3.6195 v 1.24883 h -3.6195 q -0.635,0 -1.090083,-0.14817 -0.4445,-0.14816 -0.740834,-0.40216 -0.28575,-0.26459 -0.423333,-0.62442 -0.127,-0.35983 -0.127,-0.77258 0,-0.61384 0.264583,-1.016 0.264584,-0.41275 0.762,-0.62442 -1.005416,-0.41275 -1.005416,-1.60867 0,-0.42333 0.137583,-0.78316 0.137583,-0.35984 0.423333,-0.61384 0.296334,-0.26458 0.740834,-0.40216 0.455083,-0.14817 1.090083,-0.14817 z"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
id="path822" />
<path
d="m 65.091539,113.52518 q 0.635,0 1.0795,0.14817 0.455083,0.13758 0.740833,0.40216 0.296333,0.254 0.433917,0.60325 0.137583,0.34925 0.137583,0.762 0,0.635 -0.264583,1.03717 -0.254,0.39158 -0.751417,0.60325 0.508,0.21167 0.772583,0.62442 0.264584,0.40216 0.264584,1.04775 0,0.40216 -0.137584,0.75141 -0.137583,0.34925 -0.423333,0.61384 -0.28575,0.254 -0.740833,0.40216 -0.4445,0.14817 -1.0795,0.14817 h -3.608917 v -1.24883 h 3.608917 q 0.550333,0 0.814916,-0.23284 0.264584,-0.24341 0.264584,-0.67733 0,-0.85725 -1.090084,-0.85725 h -2.201333 v -1.13242 h 2.169583 q 0.550334,0 0.814917,-0.20108 0.275167,-0.21167 0.275167,-0.65617 0,-0.40216 -0.254,-0.64558 -0.254,-0.24342 -0.8255,-0.24342 h -3.566583 v -1.24883 z"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-family:Nasalization;-inkscape-font-specification:Nasalization;stroke-width:0.26458332"
id="path824" />
</g>
</g>
</svg>


1474
vendor/blake3/media/speed.svg vendored Normal file

File diff suppressed because it is too large

65
vendor/blake3/src/ffi_avx2.rs vendored Normal file

@@ -0,0 +1,65 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
// Note that there is no AVX2 implementation of compress_in_place or
// compress_xof.
// Unsafe because this may only be called on platforms supporting AVX2.
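// Callers are expected to gate this behind a runtime check such as
// crate::platform::avx2_detected(), as the test module below does.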
pub unsafe fn hash_many<const N: usize>(
inputs: &[&[u8; N]],
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8],
) {
unsafe {
// The Rust hash_many implementations do bounds checking on the `out`
// array, but the C implementations don't. Even though this is an unsafe
// function, assert the bounds here.
assert!(out.len() >= inputs.len() * OUT_LEN);
ffi::blake3_hash_many_avx2(
inputs.as_ptr() as *const *const u8,
inputs.len(),
N / BLOCK_LEN,
key.as_ptr(),
counter,
increment_counter.yes(),
flags,
flags_start,
flags_end,
out.as_mut_ptr(),
)
}
}
pub mod ffi {
extern "C" {
pub fn blake3_hash_many_avx2(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_hash_many() {
if !crate::platform::avx2_detected() {
return;
}
crate::test::test_hash_many_fn(hash_many, hash_many);
}
}

169
vendor/blake3/src/ffi_avx512.rs vendored Normal file

@@ -0,0 +1,169 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
// Unsafe because this may only be called on platforms supporting AVX-512.
pub unsafe fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) {
unsafe {
ffi::blake3_compress_in_place_avx512(
cv.as_mut_ptr(),
block.as_ptr(),
block_len,
counter,
flags,
)
}
}
// Unsafe because this may only be called on platforms supporting AVX-512.
pub unsafe fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u8; 64] {
unsafe {
let mut out = [0u8; 64];
ffi::blake3_compress_xof_avx512(
cv.as_ptr(),
block.as_ptr(),
block_len,
counter,
flags,
out.as_mut_ptr(),
);
out
}
}
// Unsafe because this may only be called on platforms supporting AVX-512.
pub unsafe fn hash_many<const N: usize>(
inputs: &[&[u8; N]],
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8],
) {
unsafe {
// The Rust hash_many implementations do bounds checking on the `out`
// array, but the C implementations don't. Even though this is an unsafe
// function, assert the bounds here.
assert!(out.len() >= inputs.len() * OUT_LEN);
ffi::blake3_hash_many_avx512(
inputs.as_ptr() as *const *const u8,
inputs.len(),
N / BLOCK_LEN,
key.as_ptr(),
counter,
increment_counter.yes(),
flags,
flags_start,
flags_end,
out.as_mut_ptr(),
)
}
}
// Unsafe because this may only be called on platforms supporting AVX-512.
#[cfg(unix)]
pub unsafe fn xof_many(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
out: &mut [u8],
) {
unsafe {
debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only");
ffi::blake3_xof_many_avx512(
cv.as_ptr(),
block.as_ptr(),
block_len,
counter,
flags,
out.as_mut_ptr(),
out.len() / BLOCK_LEN,
);
}
}
pub mod ffi {
extern "C" {
pub fn blake3_compress_in_place_avx512(
cv: *mut u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
);
pub fn blake3_compress_xof_avx512(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
);
pub fn blake3_hash_many_avx512(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
#[cfg(unix)]
pub fn blake3_xof_many_avx512(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
outblocks: usize,
);
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_compress() {
if !crate::platform::avx512_detected() {
return;
}
crate::test::test_compress_fn(compress_in_place, compress_xof);
}
#[test]
fn test_hash_many() {
if !crate::platform::avx512_detected() {
return;
}
crate::test::test_hash_many_fn(hash_many, hash_many);
}
#[cfg(unix)]
#[test]
fn test_xof_many() {
if !crate::platform::avx512_detected() {
return;
}
crate::test::test_xof_many_fn(xof_many);
}
}

82
vendor/blake3/src/ffi_neon.rs vendored Normal file

@@ -0,0 +1,82 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
// Unsafe because this may only be called on platforms supporting NEON.
pub unsafe fn hash_many<const N: usize>(
inputs: &[&[u8; N]],
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8],
) {
// The Rust hash_many implementations do bounds checking on the `out`
// array, but the C implementations don't. Even though this is an unsafe
// function, assert the bounds here.
assert!(out.len() >= inputs.len() * OUT_LEN);
ffi::blake3_hash_many_neon(
inputs.as_ptr() as *const *const u8,
inputs.len(),
N / BLOCK_LEN,
key.as_ptr(),
counter,
increment_counter.yes(),
flags,
flags_start,
flags_end,
out.as_mut_ptr(),
)
}
// blake3_neon.c normally depends on blake3_portable.c, because the NEON
// implementation only provides 4x compression, and it relies on the portable
// implementation for 1x compression. However, we expose the portable Rust
// implementation here instead, to avoid linking in unnecessary code.
#[no_mangle]
pub extern "C" fn blake3_compress_in_place_portable(
cv: *mut u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
) {
unsafe {
crate::portable::compress_in_place(
&mut *(cv as *mut [u32; 8]),
&*(block as *const [u8; 64]),
block_len,
counter,
flags,
)
}
}
pub mod ffi {
extern "C" {
pub fn blake3_hash_many_neon(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_hash_many() {
// This entire file is gated on feature="neon", so NEON support is
// assumed here.
crate::test::test_hash_many_fn(hash_many, hash_many);
}
}

126
vendor/blake3/src/ffi_sse2.rs vendored Normal file

@@ -0,0 +1,126 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
// Unsafe because this may only be called on platforms supporting SSE2.
pub unsafe fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) {
unsafe {
ffi::blake3_compress_in_place_sse2(
cv.as_mut_ptr(),
block.as_ptr(),
block_len,
counter,
flags,
)
}
}
// Unsafe because this may only be called on platforms supporting SSE2.
pub unsafe fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u8; 64] {
unsafe {
let mut out = [0u8; 64];
ffi::blake3_compress_xof_sse2(
cv.as_ptr(),
block.as_ptr(),
block_len,
counter,
flags,
out.as_mut_ptr(),
);
out
}
}
// Unsafe because this may only be called on platforms supporting SSE2.
pub unsafe fn hash_many<const N: usize>(
inputs: &[&[u8; N]],
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8],
) {
unsafe {
// The Rust hash_many implementations do bounds checking on the `out`
// array, but the C implementations don't. Even though this is an unsafe
// function, assert the bounds here.
assert!(out.len() >= inputs.len() * OUT_LEN);
ffi::blake3_hash_many_sse2(
inputs.as_ptr() as *const *const u8,
inputs.len(),
N / BLOCK_LEN,
key.as_ptr(),
counter,
increment_counter.yes(),
flags,
flags_start,
flags_end,
out.as_mut_ptr(),
)
}
}
pub mod ffi {
extern "C" {
pub fn blake3_compress_in_place_sse2(
cv: *mut u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
);
pub fn blake3_compress_xof_sse2(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
);
pub fn blake3_hash_many_sse2(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_compress() {
if !crate::platform::sse2_detected() {
return;
}
crate::test::test_compress_fn(compress_in_place, compress_xof);
}
#[test]
fn test_hash_many() {
if !crate::platform::sse2_detected() {
return;
}
crate::test::test_hash_many_fn(hash_many, hash_many);
}
}

126
vendor/blake3/src/ffi_sse41.rs vendored Normal file

@@ -0,0 +1,126 @@
use crate::{CVWords, IncrementCounter, BLOCK_LEN, OUT_LEN};
// Unsafe because this may only be called on platforms supporting SSE4.1.
pub unsafe fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) {
unsafe {
ffi::blake3_compress_in_place_sse41(
cv.as_mut_ptr(),
block.as_ptr(),
block_len,
counter,
flags,
)
}
}
// Unsafe because this may only be called on platforms supporting SSE4.1.
pub unsafe fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u8; 64] {
unsafe {
let mut out = [0u8; 64];
ffi::blake3_compress_xof_sse41(
cv.as_ptr(),
block.as_ptr(),
block_len,
counter,
flags,
out.as_mut_ptr(),
);
out
}
}
// Unsafe because this may only be called on platforms supporting SSE4.1.
pub unsafe fn hash_many<const N: usize>(
inputs: &[&[u8; N]],
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8],
) {
unsafe {
// The Rust hash_many implementations do bounds checking on the `out`
// array, but the C implementations don't. Even though this is an unsafe
// function, assert the bounds here.
assert!(out.len() >= inputs.len() * OUT_LEN);
ffi::blake3_hash_many_sse41(
inputs.as_ptr() as *const *const u8,
inputs.len(),
N / BLOCK_LEN,
key.as_ptr(),
counter,
increment_counter.yes(),
flags,
flags_start,
flags_end,
out.as_mut_ptr(),
)
}
}
pub mod ffi {
extern "C" {
pub fn blake3_compress_in_place_sse41(
cv: *mut u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
);
pub fn blake3_compress_xof_sse41(
cv: *const u32,
block: *const u8,
block_len: u8,
counter: u64,
flags: u8,
out: *mut u8,
);
pub fn blake3_hash_many_sse41(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_compress() {
if !crate::platform::sse41_detected() {
return;
}
crate::test::test_compress_fn(compress_in_place, compress_xof);
}
#[test]
fn test_hash_many() {
if !crate::platform::sse41_detected() {
return;
}
crate::test::test_hash_many_fn(hash_many, hash_many);
}
}

60
vendor/blake3/src/guts.rs vendored Normal file

@@ -0,0 +1,60 @@
//! Deprecated in favor of [`hazmat`](crate::hazmat)
pub use crate::{BLOCK_LEN, CHUNK_LEN};
#[derive(Clone, Debug)]
pub struct ChunkState(crate::ChunkState);
impl ChunkState {
// Currently this type only supports the regular hash mode. If an
// incremental user needs keyed_hash or derive_key, we can add that.
pub fn new(chunk_counter: u64) -> Self {
Self(crate::ChunkState::new(
crate::IV,
chunk_counter,
0,
crate::platform::Platform::detect(),
))
}
#[inline]
pub fn len(&self) -> usize {
self.0.count()
}
#[inline]
pub fn update(&mut self, input: &[u8]) -> &mut Self {
self.0.update(input);
self
}
pub fn finalize(&self, is_root: bool) -> crate::Hash {
let output = self.0.output();
if is_root {
output.root_hash()
} else {
output.chaining_value().into()
}
}
}
// As above, this currently assumes the regular hash mode. If an incremental
// user needs keyed_hash or derive_key, we can add that.
pub fn parent_cv(
left_child: &crate::Hash,
right_child: &crate::Hash,
is_root: bool,
) -> crate::Hash {
let output = crate::parent_node_output(
left_child.as_bytes(),
right_child.as_bytes(),
crate::IV,
0,
crate::platform::Platform::detect(),
);
if is_root {
output.root_hash()
} else {
output.chaining_value().into()
}
}

704
vendor/blake3/src/hazmat.rs vendored Normal file

@@ -0,0 +1,704 @@
//! Low-level tree manipulations and other sharp tools
//!
//! The target audience for this module is projects like [Bao](https://github.com/oconnor663/bao),
//! which work directly with the interior hashes ("chaining values") of BLAKE3 chunks and subtrees.
//! For example, you could use these functions to implement a BitTorrent-like protocol using the
//! BLAKE3 tree structure, or to hash an input that's distributed across different machines. These
//! use cases are advanced, and most applications don't need this module. Also:
//!
//! <div class="warning">
//!
//! **Warning:** This module is *hazardous material*. If you've heard folks say *don't roll your
//! own crypto,* this is the sort of thing they're talking about. These functions have complicated
//! requirements, and any mistakes will give you garbage output and/or break the security
//! properties that BLAKE3 is supposed to have. Read section 2.1 of [the BLAKE3
//! paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) to understand the
//! tree structure you need to maintain. Test your code against [`blake3::hash`](../fn.hash.html)
//! and make sure you can get the same outputs for [lots of different
//! inputs](https://github.com/BLAKE3-team/BLAKE3/blob/master/test_vectors/test_vectors.json).
//!
//! </div>
//!
//! On the other hand:
//!
//! <div class="warning">
//!
//! **Encouragement:** Playing with these functions is a great way to learn how BLAKE3 works on the
//! inside. Have fun!
//!
//! </div>
//!
//! The main entrypoint for this module is the [`HasherExt`] trait, particularly the
//! [`set_input_offset`](HasherExt::set_input_offset) and
//! [`finalize_non_root`](HasherExt::finalize_non_root) methods. These let you compute the chaining
//! values of individual chunks or subtrees. You then combine these chaining values into larger
//! subtrees using [`merge_subtrees_non_root`] and finally (once at the very top)
//! [`merge_subtrees_root`] or [`merge_subtrees_root_xof`].
//!
//! # Examples
//!
//! Here's an example of computing all the interior hashes in a 3-chunk tree:
//!
//! ```text
//! root
//! / \
//! parent \
//! / \ \
//! chunk0 chunk1 chunk2
//! ```
//!
//! ```
//! # fn main() {
//! use blake3::{Hasher, CHUNK_LEN};
//! use blake3::hazmat::{merge_subtrees_non_root, merge_subtrees_root, Mode};
//! use blake3::hazmat::HasherExt; // an extension trait for Hasher
//!
//! let chunk0 = [b'a'; CHUNK_LEN];
//! let chunk1 = [b'b'; CHUNK_LEN];
//! let chunk2 = [b'c'; 42]; // The final chunk can be short.
//!
//! // Compute the non-root hashes ("chaining values") of all three chunks. Chunks or subtrees
//! // that don't begin at the start of the input use `set_input_offset` to say where they begin.
//! let chunk0_cv = Hasher::new()
//! // .set_input_offset(0) is the default.
//! .update(&chunk0)
//! .finalize_non_root();
//! let chunk1_cv = Hasher::new()
//! .set_input_offset(CHUNK_LEN as u64)
//! .update(&chunk1)
//! .finalize_non_root();
//! let chunk2_cv = Hasher::new()
//! .set_input_offset(2 * CHUNK_LEN as u64)
//! .update(&chunk2)
//! .finalize_non_root();
//!
//! // Join the first two chunks with a non-root parent node and compute its chaining value.
//! let parent_cv = merge_subtrees_non_root(&chunk0_cv, &chunk1_cv, Mode::Hash);
//!
//! // Join that parent node and the third chunk with a root parent node and compute the hash.
//! let root_hash = merge_subtrees_root(&parent_cv, &chunk2_cv, Mode::Hash);
//!
//! // Double check that we got the right answer.
//! let mut combined_input = Vec::new();
//! combined_input.extend_from_slice(&chunk0);
//! combined_input.extend_from_slice(&chunk1);
//! combined_input.extend_from_slice(&chunk2);
//! assert_eq!(root_hash, blake3::hash(&combined_input));
//! # }
//! ```
//!
//! Hashing many chunks together is important for performance, because it allows the implementation
//! to use SIMD parallelism internally. ([AVX-512](https://en.wikipedia.org/wiki/AVX-512) for
//! example needs 16 chunks to really get going.) We can reproduce `parent_cv` by hashing `chunk0`
//! and `chunk1` at the same time:
//!
//! ```
//! # fn main() {
//! # use blake3::{Hasher, CHUNK_LEN};
//! # use blake3::hazmat::{Mode, HasherExt, merge_subtrees_non_root, merge_subtrees_root};
//! # let chunk0 = [b'a'; CHUNK_LEN];
//! # let chunk1 = [b'b'; CHUNK_LEN];
//! # let chunk0_cv = Hasher::new().update(&chunk0).finalize_non_root();
//! # let chunk1_cv = Hasher::new().set_input_offset(CHUNK_LEN as u64).update(&chunk1).finalize_non_root();
//! # let parent_cv = merge_subtrees_non_root(&chunk0_cv, &chunk1_cv, Mode::Hash);
//! # let mut combined_input = Vec::new();
//! # combined_input.extend_from_slice(&chunk0);
//! # combined_input.extend_from_slice(&chunk1);
//! let left_subtree_cv = Hasher::new()
//! // .set_input_offset(0) is the default.
//! .update(&combined_input[..2 * CHUNK_LEN])
//! .finalize_non_root();
//! assert_eq!(left_subtree_cv, parent_cv);
//!
//! // Using multiple updates gives the same answer, though it's not as efficient.
//! let mut subtree_hasher = Hasher::new();
//! // Again, .set_input_offset(0) is the default.
//! subtree_hasher.update(&chunk0);
//! subtree_hasher.update(&chunk1);
//! assert_eq!(left_subtree_cv, subtree_hasher.finalize_non_root());
//! # }
//! ```
//!
//! However, hashing multiple chunks together **must** respect the overall tree structure. Hashing
//! `chunk0` and `chunk1` together is valid, but hashing `chunk1` and `chunk2` together is
//! incorrect and gives a garbage result that will never match a standard BLAKE3 hash. The
//! implementation includes a few best-effort asserts to catch some of these mistakes, but these
//! checks aren't guaranteed. For example, this second call to `update` currently panics:
//!
//! ```should_panic
//! # fn main() {
//! # use blake3::{Hasher, CHUNK_LEN};
//! # use blake3::hazmat::HasherExt;
//! # let chunk0 = [b'a'; CHUNK_LEN];
//! # let chunk1 = [b'b'; CHUNK_LEN];
//! # let chunk2 = [b'c'; 42];
//! let oops = Hasher::new()
//! .set_input_offset(CHUNK_LEN as u64)
//! .update(&chunk1)
//! // PANIC: "the subtree starting at 1024 contains at most 1024 bytes"
//! .update(&chunk2)
//! .finalize_non_root();
//! # }
//! ```
//!
//! For more on valid tree structures, see the docs for [`left_subtree_len`] and
//! [`max_subtree_len`], and see section 2.1 of [the BLAKE3
//! paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). Note that the
//! merging functions ([`merge_subtrees_root`] and friends) don't know the shape of the left and
//! right subtrees you're giving them, and they can't help you catch mistakes. The best way to
//! catch mistakes with these is to compare your root output to the [`blake3::hash`](crate::hash)
//! of the same input.
use crate::platform::Platform;
use crate::{CVWords, Hasher, CHUNK_LEN, IV, KEY_LEN, OUT_LEN};
/// Extension methods for [`Hasher`]. This is the main entrypoint to the `hazmat` module.
pub trait HasherExt {
/// Similar to [`Hasher::new_derive_key`] but using a pre-hashed [`ContextKey`] from
/// [`hash_derive_key_context`].
///
/// The [`hash_derive_key_context`] function is the _only_ valid source of the [`ContextKey`].
///
/// # Example
///
/// ```
/// use blake3::Hasher;
/// use blake3::hazmat::HasherExt;
///
/// let context_key = blake3::hazmat::hash_derive_key_context("foo");
/// let mut hasher = Hasher::new_from_context_key(&context_key);
/// hasher.update(b"bar");
/// let derived_key = *hasher.finalize().as_bytes();
///
/// assert_eq!(derived_key, blake3::derive_key("foo", b"bar"));
/// ```
fn new_from_context_key(context_key: &ContextKey) -> Self;
/// Configure the `Hasher` to process a chunk or subtree starting at `offset` bytes into the
/// whole input.
///
/// You must call this function before processing any input with [`update`](Hasher::update) or
/// similar. This step isn't required for the first chunk, or for a subtree that includes the
/// first chunk (i.e. when the `offset` is zero), but it's required for all other chunks and
/// subtrees.
///
/// The starting input offset of a subtree implies a maximum possible length for that subtree.
/// See [`max_subtree_len`] and section 2.1 of [the BLAKE3
/// paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). Note that only
/// subtrees along the right edge of the whole tree can have a length less than their maximum
/// possible length.
///
/// See the [module level examples](index.html#examples).
///
/// # Panics
///
/// This function panics if the `Hasher` has already accepted any input with
/// [`update`](Hasher::update) or similar.
///
/// This should always be paired with [`finalize_non_root`](HasherExt::finalize_non_root). It's
/// never correct to use a non-zero input offset with [`finalize`](Hasher::finalize) or
/// [`finalize_xof`](Hasher::finalize_xof). The `offset` must also be a multiple of
/// `CHUNK_LEN`. Violating either of these rules will currently fail an assertion and panic,
/// but this is not guaranteed.
fn set_input_offset(&mut self, offset: u64) -> &mut Self;
/// Finalize the non-root hash ("chaining value") of the current chunk or subtree.
///
/// Afterwards you can merge subtree chaining values into parent nodes using
/// [`merge_subtrees_non_root`] and ultimately into the root node with either
/// [`merge_subtrees_root`] (similar to [`Hasher::finalize`]) or [`merge_subtrees_root_xof`]
/// (similar to [`Hasher::finalize_xof`]).
///
/// See the [module level examples](index.html#examples), particularly the discussion of valid
/// tree structures.
fn finalize_non_root(&self) -> ChainingValue;
}
impl HasherExt for Hasher {
fn new_from_context_key(context_key: &[u8; KEY_LEN]) -> Hasher {
let context_key_words = crate::platform::words_from_le_bytes_32(context_key);
Hasher::new_internal(&context_key_words, crate::DERIVE_KEY_MATERIAL)
}
fn set_input_offset(&mut self, offset: u64) -> &mut Hasher {
assert_eq!(self.count(), 0, "hasher has already accepted input");
assert_eq!(
offset % CHUNK_LEN as u64,
0,
"offset ({offset}) must be a chunk boundary (divisible by {CHUNK_LEN})",
);
let counter = offset / CHUNK_LEN as u64;
self.chunk_state.chunk_counter = counter;
self.initial_chunk_counter = counter;
self
}
fn finalize_non_root(&self) -> ChainingValue {
assert_ne!(self.count(), 0, "empty subtrees are never valid");
self.final_output().chaining_value()
}
}
/// The maximum length of a subtree in bytes, given its starting offset in bytes
///
/// If you try to hash more than this many bytes as one subtree, you'll end up merging parent nodes
/// that shouldn't be merged, and your output will be garbage. [`Hasher::update`] will currently
/// panic in this case, but this is not guaranteed.
///
/// For input offset zero (the default), there is no maximum length, and this function returns
/// `None`. For all other offsets it returns `Some`. Note that valid offsets must be a multiple of
/// [`CHUNK_LEN`] (1024); it's not possible to start hashing a chunk in the middle.
///
/// In the example tree below, chunks are numbered by their _0-based index_. The subtree that
/// _starts_ with chunk 3, i.e. `input_offset = 3 * CHUNK_LEN`, includes only that one chunk, so
/// its max length is `Some(CHUNK_LEN)`. The subtree that starts with chunk 6 includes chunk 7 but
/// not chunk 8, so its max length is `Some(2 * CHUNK_LEN)`. The subtree that starts with chunk 12
/// includes chunks 13, 14, and 15, but if the tree were bigger it would not include chunk 16, so
/// its max length is `Some(4 * CHUNK_LEN)`. One way to think about the rule here is that, if you
/// go beyond the max subtree length from a given starting offset, you start dealing with subtrees
/// that include chunks _to the left_ of where you started.
///
/// ```text
/// root
/// / \
/// . .
/// / \ / \
/// . . . .
/// / \ / \ / \ / \
/// . . . . . . . .
/// / \ / \ / \ / \ / \ / \ / \ / \
/// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/// ```
///
/// The general rule turns out to be that for a subtree starting at a 0-based chunk index N greater
/// than zero, the maximum number of chunks in that subtree is the largest power-of-two that
/// divides N, which is given by `1 << N.trailing_zeros()`.
///
/// This function can be useful for writing tests or debug assertions, but it's actually rare to
/// use this for real control flow. Callers who split their input recursively using
/// [`left_subtree_len`] will automatically satisfy the `max_subtree_len` bound and don't
/// necessarily need to check. It's also common to choose some fixed power-of-two subtree size, say
/// 64 chunks, and divide your input up into slices of that fixed length (with the final slice
/// possibly short). This approach also automatically satisfies the `max_subtree_len` bound and
/// doesn't need to check. Proving that this is true can be an interesting exercise. Note that
/// chunks 0, 4, 8, and 12 all begin subtrees of at least 4 chunks in the example tree above.
///
/// # Panics
///
/// This function currently panics if `input_offset` is not a multiple of `CHUNK_LEN`. This is not
/// guaranteed.
#[inline(always)]
pub fn max_subtree_len(input_offset: u64) -> Option<u64> {
if input_offset == 0 {
return None;
}
assert_eq!(input_offset % CHUNK_LEN as u64, 0);
let counter = input_offset / CHUNK_LEN as u64;
let max_chunks = 1 << counter.trailing_zeros();
Some(max_chunks * CHUNK_LEN as u64)
}
#[test]
fn test_max_subtree_len() {
assert_eq!(max_subtree_len(0), None);
// (chunk index, max chunks)
let cases = [
(1, 1),
(2, 2),
(3, 1),
(4, 4),
(5, 1),
(6, 2),
(7, 1),
(8, 8),
];
for (chunk_index, max_chunks) in cases {
let input_offset = chunk_index * CHUNK_LEN as u64;
assert_eq!(
max_subtree_len(input_offset),
Some(max_chunks * CHUNK_LEN as u64),
);
}
}
/// Given the length in bytes of either a complete input or a subtree input, return the number of
/// bytes that belong to its left child subtree. The rest belong to its right child subtree.
///
/// Concretely, this function returns the largest power-of-two number of bytes that's strictly less
/// than `input_len`. This leads to a tree where all left subtrees are "complete" and at least as
/// large as their sibling right subtrees, as specified in section 2.1 of [the BLAKE3
/// paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). For example, if an
/// input is exactly two chunks, its left and right subtrees both get one chunk. But if an input is
/// two chunks plus one more byte, then its left subtree gets two chunks, and its right subtree
/// only gets one byte.
///
/// This function isn't meaningful for one chunk of input, because chunks don't have children. It
/// currently panics in debug mode if `input_len <= CHUNK_LEN`.
///
/// # Example
///
/// Hash an input of random length as two subtrees:
///
/// ```
/// # #[cfg(feature = "std")] {
/// use blake3::hazmat::{left_subtree_len, merge_subtrees_root, HasherExt, Mode};
/// use blake3::{Hasher, CHUNK_LEN};
///
/// // Generate a random-length input. Note that to be split into two subtrees, the input length
/// // must be greater than CHUNK_LEN.
/// let input_len = rand::random_range(CHUNK_LEN + 1..1_000_000);
/// let mut input = vec![0; input_len];
/// rand::fill(&mut input[..]);
///
/// // Compute the left and right subtree hashes and then the root hash. left_subtree_len() tells
/// // us exactly where to split the input. Any other split would either panic (if we're lucky) or
/// // lead to an incorrect root hash.
/// let left_len = left_subtree_len(input_len as u64) as usize;
/// let left_subtree_cv = Hasher::new()
/// .update(&input[..left_len])
/// .finalize_non_root();
/// let right_subtree_cv = Hasher::new()
/// .set_input_offset(left_len as u64)
/// .update(&input[left_len..])
/// .finalize_non_root();
/// let root_hash = merge_subtrees_root(&left_subtree_cv, &right_subtree_cv, Mode::Hash);
///
/// // Double check the answer.
/// assert_eq!(root_hash, blake3::hash(&input));
/// # }
/// ```
#[inline(always)]
pub fn left_subtree_len(input_len: u64) -> u64 {
debug_assert!(input_len > CHUNK_LEN as u64);
// Note that .next_power_of_two() is greater than *or equal*.
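    // For example, 2049 gives (2050 / 2) = 1025, which rounds up to 2048,
    // while an exact power of two like 2048 gives (2049 / 2) = 1024, i.e. its
    // left half.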
((input_len + 1) / 2).next_power_of_two()
}
#[test]
fn test_left_subtree_len() {
assert_eq!(left_subtree_len(1025), 1024);
for boundary_case in [2, 4, 8, 16, 32, 64] {
let input_len = boundary_case * CHUNK_LEN as u64;
assert_eq!(left_subtree_len(input_len - 1), input_len / 2);
assert_eq!(left_subtree_len(input_len), input_len / 2);
assert_eq!(left_subtree_len(input_len + 1), input_len);
}
}
/// The `mode` argument to [`merge_subtrees_root`] and friends
///
/// See the [module level examples](index.html#examples).
#[derive(Copy, Clone, Debug)]
pub enum Mode<'a> {
/// Corresponding to [`hash`](crate::hash)
Hash,
/// Corresponding to [`keyed_hash`](crate::keyed_hash)
KeyedHash(&'a [u8; KEY_LEN]),
/// Corresponding to [`derive_key`](crate::derive_key)
///
/// The [`ContextKey`] comes from [`hash_derive_key_context`].
DeriveKeyMaterial(&'a ContextKey),
}
impl<'a> Mode<'a> {
fn key_words(&self) -> CVWords {
match self {
Mode::Hash => *IV,
Mode::KeyedHash(key) => crate::platform::words_from_le_bytes_32(key),
Mode::DeriveKeyMaterial(cx_key) => crate::platform::words_from_le_bytes_32(cx_key),
}
}
fn flags_byte(&self) -> u8 {
match self {
Mode::Hash => 0,
Mode::KeyedHash(_) => crate::KEYED_HASH,
Mode::DeriveKeyMaterial(_) => crate::DERIVE_KEY_MATERIAL,
}
}
}
/// "Chaining value" is the academic term for a non-root or non-final hash.
///
/// Besides just sounding fancy, it turns out there are [security
/// reasons](https://jacko.io/tree_hashing.html) to be careful about the difference between
/// (root/final) hashes and (non-root/non-final) chaining values.
pub type ChainingValue = [u8; OUT_LEN];
fn merge_subtrees_inner(
left_child: &ChainingValue,
right_child: &ChainingValue,
mode: Mode,
) -> crate::Output {
crate::parent_node_output(
&left_child,
&right_child,
&mode.key_words(),
mode.flags_byte(),
Platform::detect(),
)
}
/// Compute a non-root parent node chaining value from two child chaining values.
///
/// See the [module level examples](index.html#examples), particularly the discussion of valid tree
/// structures. The left and right child chaining values can come from either
/// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or other calls to
/// `merge_subtrees_non_root`. "Chaining value" is the academic term for a non-root or non-final
/// hash.
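///
/// # Example
///
/// A minimal sketch of one valid merge order: hash four chunks as four subtrees, merge the
/// sibling pairs with `merge_subtrees_non_root`, and finish with [`merge_subtrees_root`].
///
/// ```
/// # #[cfg(feature = "std")] {
/// use blake3::hazmat::{merge_subtrees_non_root, merge_subtrees_root, HasherExt, Mode};
/// use blake3::{Hasher, CHUNK_LEN};
///
/// let input = vec![0xab; 4 * CHUNK_LEN];
/// let mut chunk_cvs = Vec::new();
/// for chunk_index in 0..4 {
///     let offset = chunk_index * CHUNK_LEN;
///     let cv = Hasher::new()
///         .set_input_offset(offset as u64)
///         .update(&input[offset..][..CHUNK_LEN])
///         .finalize_non_root();
///     chunk_cvs.push(cv);
/// }
/// let left = merge_subtrees_non_root(&chunk_cvs[0], &chunk_cvs[1], Mode::Hash);
/// let right = merge_subtrees_non_root(&chunk_cvs[2], &chunk_cvs[3], Mode::Hash);
/// let root_hash = merge_subtrees_root(&left, &right, Mode::Hash);
/// assert_eq!(root_hash, blake3::hash(&input));
/// # }
/// ```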
pub fn merge_subtrees_non_root(
left_child: &ChainingValue,
right_child: &ChainingValue,
mode: Mode,
) -> ChainingValue {
merge_subtrees_inner(left_child, right_child, mode).chaining_value()
}
/// Compute a root hash from two child chaining values.
///
/// See the [module level examples](index.html#examples), particularly the discussion of valid tree
/// structures. The left and right child chaining values can come from either
/// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or [`merge_subtrees_non_root`].
/// "Chaining value" is the academic term for a non-root or non-final hash.
///
/// Note that inputs of [`CHUNK_LEN`] or less don't produce any parent nodes and can't be hashed
/// using this function. In that case you must get the root hash from [`Hasher::finalize`] (or just
/// [`blake3::hash`](crate::hash)).
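///
/// # Example
///
/// A minimal sketch: hash exactly two chunks as two subtrees and merge them at the root.
///
/// ```
/// use blake3::hazmat::{merge_subtrees_root, HasherExt, Mode};
/// use blake3::{Hasher, CHUNK_LEN};
///
/// let input = [0x42; 2 * CHUNK_LEN];
/// let left = Hasher::new().update(&input[..CHUNK_LEN]).finalize_non_root();
/// let right = Hasher::new()
///     .set_input_offset(CHUNK_LEN as u64)
///     .update(&input[CHUNK_LEN..])
///     .finalize_non_root();
/// let root_hash = merge_subtrees_root(&left, &right, Mode::Hash);
/// assert_eq!(root_hash, blake3::hash(&input));
/// ```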
pub fn merge_subtrees_root(
left_child: &ChainingValue,
right_child: &ChainingValue,
mode: Mode,
) -> crate::Hash {
merge_subtrees_inner(left_child, right_child, mode).root_hash()
}
/// Build a root [`OutputReader`](crate::OutputReader) from two child chaining values.
///
/// See also the [module level examples](index.html#examples), particularly the discussion of valid
/// tree structures. The left and right child chaining values can come from either
/// [`Hasher::finalize_non_root`](HasherExt::finalize_non_root) or [`merge_subtrees_non_root`].
/// "Chaining value" is the academic term for a non-root or non-final hash.
///
/// Note that inputs of [`CHUNK_LEN`] or less don't produce any parent nodes and can't be hashed
/// using this function. In that case you must get the `OutputReader` from
/// [`Hasher::finalize_xof`].
///
/// # Example
///
/// ```
/// use blake3::hazmat::{merge_subtrees_root_xof, HasherExt, Mode};
/// use blake3::{Hasher, CHUNK_LEN};
///
/// // Hash a 2-chunk subtree in steps. Note that only
/// // the final chunk can be shorter than CHUNK_LEN.
/// let chunk0 = &[42; CHUNK_LEN];
/// let chunk1 = b"hello world";
/// let chunk0_cv = Hasher::new()
/// .update(chunk0)
/// .finalize_non_root();
/// let chunk1_cv = Hasher::new()
/// .set_input_offset(CHUNK_LEN as u64)
/// .update(chunk1)
/// .finalize_non_root();
///
/// // Obtain a blake3::OutputReader at the root and extract 1000 bytes.
/// let mut output_reader = merge_subtrees_root_xof(&chunk0_cv, &chunk1_cv, Mode::Hash);
/// let mut output_bytes = [0; 1_000];
/// output_reader.fill(&mut output_bytes);
///
/// // Double check the answer.
/// let mut hasher = Hasher::new();
/// hasher.update(chunk0);
/// hasher.update(chunk1);
/// let mut expected = [0; 1_000];
/// hasher.finalize_xof().fill(&mut expected);
/// assert_eq!(output_bytes, expected);
/// ```
pub fn merge_subtrees_root_xof(
left_child: &ChainingValue,
right_child: &ChainingValue,
mode: Mode,
) -> crate::OutputReader {
crate::OutputReader::new(merge_subtrees_inner(left_child, right_child, mode))
}
/// An alias to distinguish [`hash_derive_key_context`] outputs from other keys.
pub type ContextKey = [u8; KEY_LEN];
/// Hash a [`derive_key`](crate::derive_key) context string and return a [`ContextKey`].
///
/// The _only_ valid uses for the returned [`ContextKey`] are [`Hasher::new_from_context_key`] and
/// [`Mode::DeriveKeyMaterial`] (together with the merge subtree functions).
///
/// # Example
///
/// ```
/// use blake3::Hasher;
/// use blake3::hazmat::HasherExt;
///
/// let context_key = blake3::hazmat::hash_derive_key_context("foo");
/// let mut hasher = Hasher::new_from_context_key(&context_key);
/// hasher.update(b"bar");
/// let derived_key = *hasher.finalize().as_bytes();
///
/// assert_eq!(derived_key, blake3::derive_key("foo", b"bar"));
/// ```
pub fn hash_derive_key_context(context: &str) -> ContextKey {
crate::hash_all_at_once::<crate::join::SerialJoin>(
context.as_bytes(),
IV,
crate::DERIVE_KEY_CONTEXT,
)
.root_hash()
.0
}
#[cfg(test)]
mod test {
use super::*;
#[test]
#[should_panic]
fn test_empty_subtree_should_panic() {
Hasher::new().finalize_non_root();
}
#[test]
#[should_panic]
fn test_unaligned_offset_should_panic() {
Hasher::new().set_input_offset(1);
}
#[test]
#[should_panic]
fn test_hasher_already_accepted_input_should_panic() {
Hasher::new().update(b"x").set_input_offset(0);
}
#[test]
#[should_panic]
fn test_too_much_input_should_panic() {
Hasher::new()
.set_input_offset(CHUNK_LEN as u64)
.update(&[0; CHUNK_LEN + 1]);
}
#[test]
#[should_panic]
fn test_set_input_offset_cant_finalize() {
Hasher::new().set_input_offset(CHUNK_LEN as u64).finalize();
}
#[test]
#[should_panic]
fn test_set_input_offset_cant_finalize_xof() {
Hasher::new()
.set_input_offset(CHUNK_LEN as u64)
.finalize_xof();
}
#[test]
fn test_grouped_hash() {
const MAX_CHUNKS: usize = (crate::test::TEST_CASES_MAX + 1) / CHUNK_LEN;
let mut input_buf = [0; crate::test::TEST_CASES_MAX];
crate::test::paint_test_input(&mut input_buf);
for subtree_chunks in [1, 2, 4, 8, 16, 32] {
#[cfg(feature = "std")]
dbg!(subtree_chunks);
let subtree_len = subtree_chunks * CHUNK_LEN;
for &case in crate::test::TEST_CASES {
if case <= subtree_len {
continue;
}
#[cfg(feature = "std")]
dbg!(case);
let input = &input_buf[..case];
let expected_hash = crate::hash(input);
// Collect all the group chaining values.
let mut chaining_values = arrayvec::ArrayVec::<ChainingValue, MAX_CHUNKS>::new();
let mut subtree_offset = 0;
while subtree_offset < input.len() {
let take = core::cmp::min(subtree_len, input.len() - subtree_offset);
let subtree_input = &input[subtree_offset..][..take];
let subtree_cv = Hasher::new()
.set_input_offset(subtree_offset as u64)
.update(subtree_input)
.finalize_non_root();
chaining_values.push(subtree_cv);
subtree_offset += take;
}
// Compress all the chaining_values together, layer by layer.
assert!(chaining_values.len() >= 2);
while chaining_values.len() > 2 {
let n = chaining_values.len();
// Merge each side-by-side pair in place, overwriting the front half of the
// array with the merged results. This moves us "up one level" in the tree.
for i in 0..(n / 2) {
chaining_values[i] = merge_subtrees_non_root(
&chaining_values[2 * i],
&chaining_values[2 * i + 1],
Mode::Hash,
);
}
// If there's an odd CV out, it moves up.
if n % 2 == 1 {
chaining_values[n / 2] = chaining_values[n - 1];
}
chaining_values.truncate(n / 2 + n % 2);
}
assert_eq!(chaining_values.len(), 2);
let root_hash =
merge_subtrees_root(&chaining_values[0], &chaining_values[1], Mode::Hash);
assert_eq!(expected_hash, root_hash);
}
}
}
#[test]
fn test_keyed_hash_xof() {
let group0 = &[42; 4096];
let group1 = &[43; 4095];
let mut input = [0; 8191];
input[..4096].copy_from_slice(group0);
input[4096..].copy_from_slice(group1);
let key = &[44; 32];
let mut expected_output = [0; 100];
Hasher::new_keyed(&key)
.update(&input)
.finalize_xof()
.fill(&mut expected_output);
let mut hazmat_output = [0; 100];
let left = Hasher::new_keyed(key).update(group0).finalize_non_root();
let right = Hasher::new_keyed(key)
.set_input_offset(group0.len() as u64)
.update(group1)
.finalize_non_root();
merge_subtrees_root_xof(&left, &right, Mode::KeyedHash(&key)).fill(&mut hazmat_output);
assert_eq!(expected_output, hazmat_output);
}
#[test]
fn test_derive_key() {
let context = "foo";
let mut input = [0; 1025];
crate::test::paint_test_input(&mut input);
let expected = crate::derive_key(context, &input);
let cx_key = hash_derive_key_context(context);
let left = Hasher::new_from_context_key(&cx_key)
.update(&input[..1024])
.finalize_non_root();
let right = Hasher::new_from_context_key(&cx_key)
.set_input_offset(1024)
.update(&input[1024..])
.finalize_non_root();
let derived_key = merge_subtrees_root(&left, &right, Mode::DeriveKeyMaterial(&cx_key)).0;
assert_eq!(expected, derived_key);
}
}

64
vendor/blake3/src/io.rs vendored Normal file

@@ -0,0 +1,64 @@
//! Helper functions for efficient IO.
#[cfg(feature = "std")]
pub(crate) fn copy_wide(
mut reader: impl std::io::Read,
hasher: &mut crate::Hasher,
) -> std::io::Result<u64> {
let mut buffer = [0; 65536];
let mut total = 0;
loop {
match reader.read(&mut buffer) {
Ok(0) => return Ok(total),
Ok(n) => {
hasher.update(&buffer[..n]);
total += n as u64;
}
// see test_update_reader_interrupted
Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
Err(e) => return Err(e),
}
}
}
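// A usage sketch (hypothetical caller; in this crate the real callers are the Hasher
// reader helpers in lib.rs):
//
//     let mut hasher = crate::Hasher::new();
//     let file = std::fs::File::open("data.bin")?;
//     let bytes_hashed = copy_wide(&file, &mut hasher)?;
//     let hash = hasher.finalize();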
// Mmap a file, if it looks like a good idea. Return None in cases where we know mmap will fail, or
// if the file is short enough that mmapping isn't worth it. However, if we do try to mmap and it
// fails, return the error.
//
// SAFETY: Mmaps are fundamentally unsafe, because you can call invariant-checking functions like
// str::from_utf8 on them and then have them change out from under you. Letting a safe caller get
// their hands on an mmap, or even a &[u8] that's backed by an mmap, is unsound. However, because
// this function is crate-private, we can guarantee that all that can ever happen in the event of a race
// condition is that we either hash nonsense bytes or crash with SIGBUS or similar, neither of
// which should risk memory corruption in a safe caller.
//
// PARANOIA: But a data race...is a data race...is a data race...right? Even if we know that no
// platform in the "real world" is ever going to do anything other than compute the "wrong answer"
// if we race on this mmap while we hash it, aren't we still supposed to feel bad about doing this?
// Well, maybe. This is IO, and IO gets special carve-outs in the memory model. Consider a
// memory-mapped register that returns random 32-bit words. (This is actually realistic if you have
// a hardware RNG.) It's probably sound to construct a *const i32 pointing to that register and do
// some raw pointer reads from it. Those reads should be volatile if you don't want the compiler to
// coalesce them, but either way the compiler isn't allowed to just _go nuts_ and insert
// should-never-happen branches to wipe your hard drive if two adjacent reads happen to give
// different values. As far as I'm aware, there's no such thing as a read that's allowed if it's
// volatile but prohibited if it's not (unlike atomics). As mentioned above, it's not ok to
// construct a safe &i32 to the register if you're going to leak that reference to unknown callers.
// But if you "know what you're doing," I don't think *const i32 and &i32 are fundamentally
// different here. Feedback needed.
#[cfg(feature = "mmap")]
pub(crate) fn maybe_mmap_file(file: &std::fs::File) -> std::io::Result<Option<memmap2::Mmap>> {
let metadata = file.metadata()?;
let file_size = metadata.len();
if !metadata.is_file() {
// Not a real file.
Ok(None)
} else if file_size < 16 * 1024 {
// Mapping small files is not worth it, and some special files that can't be mapped report
// a size of zero.
Ok(None)
} else {
let map = unsafe { memmap2::Mmap::map(file)? };
Ok(Some(map))
}
}
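// A usage sketch (hypothetical caller; the real mmap-backed Hasher helpers live in
// lib.rs): try the map first, then fall back to buffered reads.
//
//     let file = std::fs::File::open("data.bin")?;
//     let mut hasher = crate::Hasher::new();
//     match maybe_mmap_file(&file)? {
//         Some(map) => { hasher.update(&map); }
//         None => { copy_wide(&file, &mut hasher)?; }
//     }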

92
vendor/blake3/src/join.rs vendored Normal file

@@ -0,0 +1,92 @@
//! The multi-threading abstractions used by `Hasher::update_with_join`.
//!
//! Different implementations of the `Join` trait determine whether
//! `Hasher::update_with_join` performs multi-threading on sufficiently large
//! inputs. The `SerialJoin` implementation is single-threaded, and the
//! `RayonJoin` implementation (gated by the `rayon` feature) is multi-threaded.
//! Interfaces other than `Hasher::update_with_join`, like [`hash`](crate::hash)
//! and [`Hasher::update`](crate::Hasher::update), always use `SerialJoin`
//! internally.
//!
//! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and
//! `RayonJoin` is the only non-trivial implementation. Previously this trait
//! was public, but it has since been re-privatized, as it's both 1) of no
//! value to most callers and 2) a pretty big implementation detail to commit
//! to.
//!
//! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html
/// The trait that abstracts over single-threaded and multi-threaded recursion.
///
/// See the [`join` module docs](index.html) for more details.
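///
/// A sketch (not part of this crate's API) of how a divide-and-conquer helper can be made
/// generic over `Join`, so the same code runs either serially or on a thread pool:
///
/// ```ignore
/// fn sum<J: Join>(xs: &[u64]) -> u64 {
///     if xs.len() <= 1 {
///         return xs.first().copied().unwrap_or(0);
///     }
///     let (left, right) = xs.split_at(xs.len() / 2);
///     // With SerialJoin this runs left then right; with RayonJoin the two halves may
///     // run in parallel on the Rayon thread pool.
///     let (left_sum, right_sum) = J::join(|| sum::<J>(left), || sum::<J>(right));
///     left_sum + right_sum
/// }
/// ```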
pub trait Join {
fn join<A, B, RA, RB>(oper_a: A, oper_b: B) -> (RA, RB)
where
A: FnOnce() -> RA + Send,
B: FnOnce() -> RB + Send,
RA: Send,
RB: Send;
}
/// The trivial, serial implementation of `Join`. The left and right sides are
/// executed one after the other, on the calling thread. The standalone hashing
/// functions and the `Hasher::update` method use this implementation
/// internally.
///
/// See the [`join` module docs](index.html) for more details.
pub enum SerialJoin {}
impl Join for SerialJoin {
#[inline]
fn join<A, B, RA, RB>(oper_a: A, oper_b: B) -> (RA, RB)
where
A: FnOnce() -> RA + Send,
B: FnOnce() -> RB + Send,
RA: Send,
RB: Send,
{
(oper_a(), oper_b())
}
}
/// The Rayon-based implementation of `Join`. The left and right sides are
/// executed on the Rayon thread pool, potentially in parallel. This
/// implementation is gated by the `rayon` feature, which is off by default.
///
/// See the [`join` module docs](index.html) for more details.
#[cfg(feature = "rayon")]
pub enum RayonJoin {}
#[cfg(feature = "rayon")]
impl Join for RayonJoin {
#[inline]
fn join<A, B, RA, RB>(oper_a: A, oper_b: B) -> (RA, RB)
where
A: FnOnce() -> RA + Send,
B: FnOnce() -> RB + Send,
RA: Send,
RB: Send,
{
rayon_core::join(oper_a, oper_b)
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_serial_join() {
let oper_a = || 1 + 1;
let oper_b = || 2 + 2;
assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b));
}
#[test]
#[cfg(feature = "rayon")]
fn test_rayon_join() {
let oper_a = || 1 + 1;
let oper_b = || 2 + 2;
assert_eq!((2, 4), RayonJoin::join(oper_a, oper_b));
}
}

1835
vendor/blake3/src/lib.rs vendored Normal file

File diff suppressed because it is too large

587
vendor/blake3/src/platform.rs vendored Normal file

@@ -0,0 +1,587 @@
use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
use arrayref::{array_mut_ref, array_ref};
cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
cfg_if::cfg_if! {
if #[cfg(blake3_avx512_ffi)] {
pub const MAX_SIMD_DEGREE: usize = 16;
} else {
pub const MAX_SIMD_DEGREE: usize = 8;
}
}
} else if #[cfg(blake3_neon)] {
pub const MAX_SIMD_DEGREE: usize = 4;
} else if #[cfg(blake3_wasm32_simd)] {
pub const MAX_SIMD_DEGREE: usize = 4;
} else {
pub const MAX_SIMD_DEGREE: usize = 1;
}
}
// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently
// allowed to use cmp::max, so we have to hardcode this additional constant
// value. Get rid of this once cmp::max is a const fn.
cfg_if::cfg_if! {
if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
cfg_if::cfg_if! {
if #[cfg(blake3_avx512_ffi)] {
pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
} else {
pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
}
}
} else if #[cfg(blake3_neon)] {
pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
} else if #[cfg(blake3_wasm32_simd)] {
pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
} else {
pub const MAX_SIMD_DEGREE_OR_2: usize = 2;
}
}
#[derive(Clone, Copy, Debug)]
pub enum Platform {
Portable,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
SSE2,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
SSE41,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
AVX2,
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
AVX512,
#[cfg(blake3_neon)]
NEON,
#[cfg(blake3_wasm32_simd)]
#[allow(non_camel_case_types)]
WASM32_SIMD,
}
impl Platform {
#[allow(unreachable_code)]
pub fn detect() -> Self {
#[cfg(miri)]
{
return Platform::Portable;
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
#[cfg(blake3_avx512_ffi)]
{
if avx512_detected() {
return Platform::AVX512;
}
}
if avx2_detected() {
return Platform::AVX2;
}
if sse41_detected() {
return Platform::SSE41;
}
if sse2_detected() {
return Platform::SSE2;
}
}
// We don't use dynamic feature detection for NEON. If the "neon"
// feature is on, NEON is assumed to be supported.
#[cfg(blake3_neon)]
{
return Platform::NEON;
}
#[cfg(blake3_wasm32_simd)]
{
return Platform::WASM32_SIMD;
}
Platform::Portable
}
pub fn simd_degree(&self) -> usize {
let degree = match self {
Platform::Portable => 1,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => 4,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 => 4,
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX2 => 8,
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => 16,
#[cfg(blake3_neon)]
Platform::NEON => 4,
#[cfg(blake3_wasm32_simd)]
Platform::WASM32_SIMD => 4,
};
debug_assert!(degree <= MAX_SIMD_DEGREE);
degree
}
pub fn compress_in_place(
&self,
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) {
match self {
Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => unsafe {
crate::sse2::compress_in_place(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe {
crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
},
// No NEON compress_in_place() implementation yet.
#[cfg(blake3_neon)]
Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
#[cfg(blake3_wasm32_simd)]
Platform::WASM32_SIMD => {
crate::wasm32_simd::compress_in_place(cv, block, block_len, counter, flags)
}
}
}
pub fn compress_xof(
&self,
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u8; 64] {
match self {
Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => unsafe {
crate::sse2::compress_xof(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 | Platform::AVX2 => unsafe {
crate::sse41::compress_xof(cv, block, block_len, counter, flags)
},
// Safe because detect() checked for platform support.
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
crate::avx512::compress_xof(cv, block, block_len, counter, flags)
},
// No NEON compress_xof() implementation yet.
#[cfg(blake3_neon)]
Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
#[cfg(blake3_wasm32_simd)]
Platform::WASM32_SIMD => {
crate::wasm32_simd::compress_xof(cv, block, block_len, counter, flags)
}
}
}
// IMPLEMENTATION NOTE
// ===================
// hash_many() applies two optimizations. The critically important
// optimization is the high-performance parallel SIMD hashing mode,
// described in detail in the spec. This more than doubles throughput per
// thread. Another optimization is keeping the state vectors transposed
// from block to block within a chunk. When state vectors are transposed
// after every block, there's a small but measurable performance loss.
// Compressing chunks with a dedicated loop avoids this.
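// A calling sketch (hypothetical; the real callers live in lib.rs). To hash a batch of
// whole chunks, give each input its own counter and mark each input's first and last
// blocks with the chunk flags from lib.rs:
//
//     let platform = Platform::detect();
//     platform.hash_many(
//         chunks,                 // &[&[u8; CHUNK_LEN]], whole chunks only
//         key_words,              // &CVWords
//         first_chunk_counter,    // the chunk counter of chunks[0]
//         IncrementCounter::Yes,  // later inputs get counter + 1, counter + 2, ...
//         0,                      // flags applied to every block (e.g. KEYED_HASH)
//         crate::CHUNK_START,     // extra flag for each input's first block
//         crate::CHUNK_END,       // extra flag for each input's last block
//         &mut out,               // receives OUT_LEN bytes per input
//     );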
pub fn hash_many<const N: usize>(
&self,
inputs: &[&[u8; N]],
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8],
) {
match self {
Platform::Portable => portable::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
),
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE2 => unsafe {
crate::sse2::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::SSE41 => unsafe {
crate::sse41::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
// Safe because detect() checked for platform support.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX2 => unsafe {
crate::avx2::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
// Safe because detect() checked for platform support.
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
crate::avx512::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
// Assumed to be safe if the "neon" feature is on.
#[cfg(blake3_neon)]
Platform::NEON => unsafe {
crate::neon::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
// Assumed to be safe if the "wasm32_simd" feature is on.
#[cfg(blake3_wasm32_simd)]
Platform::WASM32_SIMD => unsafe {
crate::wasm32_simd::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
)
},
}
}
pub fn xof_many(
&self,
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
mut counter: u64,
flags: u8,
out: &mut [u8],
) {
debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only");
if out.is_empty() {
// The current assembly implementation always outputs at least 1 block.
return;
}
match self {
// Safe because detect() checked for platform support.
#[cfg(blake3_avx512_ffi)]
#[cfg(unix)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
Platform::AVX512 => unsafe {
crate::avx512::xof_many(cv, block, block_len, counter, flags, out)
},
_ => {
// For platforms without an optimized xof_many, fall back to a loop over
// compress_xof. This is still faster than portable code.
for out_block in out.chunks_exact_mut(BLOCK_LEN) {
// TODO: Use array_chunks_mut here once that's stable.
let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap();
*out_array = self.compress_xof(cv, block, block_len, counter, flags);
counter += 1;
}
}
}
}
// Explicit platform constructors, for benchmarks.
pub fn portable() -> Self {
Self::Portable
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn sse2() -> Option<Self> {
if sse2_detected() {
Some(Self::SSE2)
} else {
None
}
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn sse41() -> Option<Self> {
if sse41_detected() {
Some(Self::SSE41)
} else {
None
}
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx2() -> Option<Self> {
if avx2_detected() {
Some(Self::AVX2)
} else {
None
}
}
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub fn avx512() -> Option<Self> {
if avx512_detected() {
Some(Self::AVX512)
} else {
None
}
}
#[cfg(blake3_neon)]
pub fn neon() -> Option<Self> {
// Assumed to be safe if the "neon" feature is on.
Some(Self::NEON)
}
#[cfg(blake3_wasm32_simd)]
pub fn wasm32_simd() -> Option<Self> {
// Assumed to be safe if the "wasm32_simd" feature is on.
Some(Self::WASM32_SIMD)
}
}
// Note that AVX-512 is divided into multiple featuresets, and we use two of
// them, F and VL.
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx512_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_avx512") {
return false;
}
// Static check, e.g. for building with target-cpu=native.
#[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))]
{
return true;
}
// Dynamic check, if std is enabled.
#[cfg(feature = "std")]
{
if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
return true;
}
}
false
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx2_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_avx2") {
return false;
}
// Static check, e.g. for building with target-cpu=native.
#[cfg(target_feature = "avx2")]
{
return true;
}
// Dynamic check, if std is enabled.
#[cfg(feature = "std")]
{
if is_x86_feature_detected!("avx2") {
return true;
}
}
false
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse41_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_sse41") {
return false;
}
// Static check, e.g. for building with target-cpu=native.
#[cfg(target_feature = "sse4.1")]
{
return true;
}
// Dynamic check, if std is enabled.
#[cfg(feature = "std")]
{
if is_x86_feature_detected!("sse4.1") {
return true;
}
}
false
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse2_detected() -> bool {
if cfg!(miri) {
return false;
}
// A testing-only short-circuit.
if cfg!(feature = "no_sse2") {
return false;
}
// Static check, e.g. for building with target-cpu=native.
#[cfg(target_feature = "sse2")]
{
return true;
}
// Dynamic check, if std is enabled.
#[cfg(feature = "std")]
{
if is_x86_feature_detected!("sse2") {
return true;
}
}
false
}
#[inline(always)]
pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
let mut out = [0; 8];
out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
out
}
#[inline(always)]
pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] {
let mut out = [0; 16];
out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4));
out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4));
out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4));
out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4));
out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4));
out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4));
out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4));
out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4));
out
}
#[inline(always)]
pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
let mut out = [0; 32];
*array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
*array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
*array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
*array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
*array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
*array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
*array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
*array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
out
}
#[inline(always)]
pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] {
let mut out = [0; 64];
*array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
*array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
*array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
*array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
*array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
*array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
*array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
*array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
*array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes();
*array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes();
*array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes();
*array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes();
*array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes();
*array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes();
*array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes();
*array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes();
out
}

198
vendor/blake3/src/portable.rs vendored Normal file

@@ -0,0 +1,198 @@
use crate::{
counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref};
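// The g function is BLAKE3's quarter-round (a tweaked ChaCha quarter-round): two
// add-xor-rotate passes mixing one column or diagonal of the state, with rotation
// distances 16, 12, 8, and 7.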
#[inline(always)]
fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) {
state[a] = state[a].wrapping_add(state[b]).wrapping_add(x);
state[d] = (state[d] ^ state[a]).rotate_right(16);
state[c] = state[c].wrapping_add(state[d]);
state[b] = (state[b] ^ state[c]).rotate_right(12);
state[a] = state[a].wrapping_add(state[b]).wrapping_add(y);
state[d] = (state[d] ^ state[a]).rotate_right(8);
state[c] = state[c].wrapping_add(state[d]);
state[b] = (state[b] ^ state[c]).rotate_right(7);
}
#[inline(always)]
fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) {
// Select the message schedule based on the round.
let schedule = MSG_SCHEDULE[round];
// Mix the columns.
g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
// Mix the diagonals.
g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}
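// Run all 7 rounds on a single block and return the raw 16-word state, leaving the final
// feed-forward xor to the caller. compress_in_place and compress_xof below differ only in
// how they fold this state.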
#[inline(always)]
fn compress_pre(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u32; 16] {
let block_words = crate::platform::words_from_le_bytes_64(block);
let mut state = [
cv[0],
cv[1],
cv[2],
cv[3],
cv[4],
cv[5],
cv[6],
cv[7],
IV[0],
IV[1],
IV[2],
IV[3],
counter_low(counter),
counter_high(counter),
block_len as u32,
flags as u32,
];
round(&mut state, &block_words, 0);
round(&mut state, &block_words, 1);
round(&mut state, &block_words, 2);
round(&mut state, &block_words, 3);
round(&mut state, &block_words, 4);
round(&mut state, &block_words, 5);
round(&mut state, &block_words, 6);
state
}
pub fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) {
let state = compress_pre(cv, block, block_len, counter, flags);
cv[0] = state[0] ^ state[8];
cv[1] = state[1] ^ state[9];
cv[2] = state[2] ^ state[10];
cv[3] = state[3] ^ state[11];
cv[4] = state[4] ^ state[12];
cv[5] = state[5] ^ state[13];
cv[6] = state[6] ^ state[14];
cv[7] = state[7] ^ state[15];
}
pub fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u8; 64] {
let mut state = compress_pre(cv, block, block_len, counter, flags);
state[0] ^= state[8];
state[1] ^= state[9];
state[2] ^= state[10];
state[3] ^= state[11];
state[4] ^= state[12];
state[5] ^= state[13];
state[6] ^= state[14];
state[7] ^= state[15];
state[8] ^= cv[0];
state[9] ^= cv[1];
state[10] ^= cv[2];
state[11] ^= cv[3];
state[12] ^= cv[4];
state[13] ^= cv[5];
state[14] ^= cv[6];
state[15] ^= cv[7];
crate::platform::le_bytes_from_words_64(&state)
}
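// Hash a whole number of blocks from a single input into one chaining value, adding
// flags_start to the first block's flags and flags_end to the last block's.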
pub fn hash1<const N: usize>(
input: &[u8; N],
key: &CVWords,
counter: u64,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut CVBytes,
) {
debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
let mut cv = *key;
let mut block_flags = flags | flags_start;
let mut slice = &input[..];
while slice.len() >= BLOCK_LEN {
if slice.len() == BLOCK_LEN {
block_flags |= flags_end;
}
compress_in_place(
&mut cv,
array_ref!(slice, 0, BLOCK_LEN),
BLOCK_LEN as u8,
counter,
block_flags,
);
block_flags = flags;
slice = &slice[BLOCK_LEN..];
}
*out = crate::platform::le_bytes_from_words_32(&cv);
}
pub fn hash_many<const N: usize>(
inputs: &[&[u8; N]],
key: &CVWords,
mut counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8],
) {
debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
hash1(
input,
key,
counter,
flags,
flags_start,
flags_end,
array_mut_ref!(output, 0, OUT_LEN),
);
if increment_counter.yes() {
counter += 1;
}
}
}
#[cfg(test)]
pub mod test {
use super::*;
// This is basically testing the portable implementation against itself,
// but it also checks that compress_in_place and compress_xof are
// consistent. And there are tests against the reference implementation and
// against hardcoded test vectors elsewhere.
#[test]
fn test_compress() {
crate::test::test_compress_fn(compress_in_place, compress_xof);
}
// Ditto.
#[test]
fn test_hash_many() {
crate::test::test_hash_many_fn(hash_many, hash_many);
}
}

474
vendor/blake3/src/rust_avx2.rs vendored Normal file

@@ -0,0 +1,474 @@
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use crate::{
counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN,
};
use arrayref::{array_mut_ref, mut_array_refs};
pub const DEGREE: usize = 8;
#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m256i {
// This is an unaligned load, so the pointer cast is allowed.
_mm256_loadu_si256(src as *const __m256i)
}
#[inline(always)]
unsafe fn storeu(src: __m256i, dest: *mut u8) {
// This is an unaligned store, so the pointer cast is allowed.
_mm256_storeu_si256(dest as *mut __m256i, src)
}
#[inline(always)]
unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
_mm256_add_epi32(a, b)
}
#[inline(always)]
unsafe fn xor(a: __m256i, b: __m256i) -> __m256i {
_mm256_xor_si256(a, b)
}
#[inline(always)]
unsafe fn set1(x: u32) -> __m256i {
_mm256_set1_epi32(x as i32)
}
#[inline(always)]
unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i {
_mm256_setr_epi32(
a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32,
)
}
// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.
#[inline(always)]
unsafe fn rot16(x: __m256i) -> __m256i {
_mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16))
}
#[inline(always)]
unsafe fn rot12(x: __m256i) -> __m256i {
_mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12))
}
#[inline(always)]
unsafe fn rot8(x: __m256i) -> __m256i {
_mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8))
}
#[inline(always)]
unsafe fn rot7(x: __m256i) -> __m256i {
_mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7))
}
#[inline(always)]
unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) {
v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
v[0] = add(v[0], v[4]);
v[1] = add(v[1], v[5]);
v[2] = add(v[2], v[6]);
v[3] = add(v[3], v[7]);
v[12] = xor(v[12], v[0]);
v[13] = xor(v[13], v[1]);
v[14] = xor(v[14], v[2]);
v[15] = xor(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = add(v[8], v[12]);
v[9] = add(v[9], v[13]);
v[10] = add(v[10], v[14]);
v[11] = add(v[11], v[15]);
v[4] = xor(v[4], v[8]);
v[5] = xor(v[5], v[9]);
v[6] = xor(v[6], v[10]);
v[7] = xor(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
v[0] = add(v[0], v[4]);
v[1] = add(v[1], v[5]);
v[2] = add(v[2], v[6]);
v[3] = add(v[3], v[7]);
v[12] = xor(v[12], v[0]);
v[13] = xor(v[13], v[1]);
v[14] = xor(v[14], v[2]);
v[15] = xor(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = add(v[8], v[12]);
v[9] = add(v[9], v[13]);
v[10] = add(v[10], v[14]);
v[11] = add(v[11], v[15]);
v[4] = xor(v[4], v[8]);
v[5] = xor(v[5], v[9]);
v[6] = xor(v[6], v[10]);
v[7] = xor(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
v[0] = add(v[0], v[5]);
v[1] = add(v[1], v[6]);
v[2] = add(v[2], v[7]);
v[3] = add(v[3], v[4]);
v[15] = xor(v[15], v[0]);
v[12] = xor(v[12], v[1]);
v[13] = xor(v[13], v[2]);
v[14] = xor(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = add(v[10], v[15]);
v[11] = add(v[11], v[12]);
v[8] = add(v[8], v[13]);
v[9] = add(v[9], v[14]);
v[5] = xor(v[5], v[10]);
v[6] = xor(v[6], v[11]);
v[7] = xor(v[7], v[8]);
v[4] = xor(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
v[0] = add(v[0], v[5]);
v[1] = add(v[1], v[6]);
v[2] = add(v[2], v[7]);
v[3] = add(v[3], v[4]);
v[15] = xor(v[15], v[0]);
v[12] = xor(v[12], v[1]);
v[13] = xor(v[13], v[2]);
v[14] = xor(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = add(v[10], v[15]);
v[11] = add(v[11], v[12]);
v[8] = add(v[8], v[13]);
v[9] = add(v[9], v[14]);
v[5] = xor(v[5], v[10]);
v[6] = xor(v[6], v[11]);
v[7] = xor(v[7], v[8]);
v[4] = xor(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
#[inline(always)]
unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) {
(
_mm256_permute2x128_si256(a, b, 0x20),
_mm256_permute2x128_si256(a, b, 0x31),
)
}
// There are several ways to do a transposition. We could do it naively, with 8 separate
// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy
// the vecs into contiguous storage and then use gather instructions. A third approach, used here, is to use
// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the
// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the
// https://github.com/oconnor663/bao_experiments repo.
#[inline(always)]
unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77.
let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
// Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is 11/33.
let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
// Interleave 128-bit lanes.
let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04);
let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15);
let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26);
let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37);
vecs[0] = abcdefgh_0;
vecs[1] = abcdefgh_1;
vecs[2] = abcdefgh_2;
vecs[3] = abcdefgh_3;
vecs[4] = abcdefgh_4;
vecs[5] = abcdefgh_5;
vecs[6] = abcdefgh_6;
vecs[7] = abcdefgh_7;
}
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] {
let mut vecs = [
loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)),
];
for i in 0..DEGREE {
_mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
}
let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE);
transpose_vecs(squares.0);
transpose_vecs(squares.1);
vecs
}
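// Load the per-lane chunk counters, split into low and high 32-bit halves. When
// increment_counter is No, the all-zeros mask makes every lane use the same counter.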
#[inline(always)]
unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) {
let mask = if increment_counter.yes() { !0 } else { 0 };
(
set8(
counter_low(counter + (mask & 0)),
counter_low(counter + (mask & 1)),
counter_low(counter + (mask & 2)),
counter_low(counter + (mask & 3)),
counter_low(counter + (mask & 4)),
counter_low(counter + (mask & 5)),
counter_low(counter + (mask & 6)),
counter_low(counter + (mask & 7)),
),
set8(
counter_high(counter + (mask & 0)),
counter_high(counter + (mask & 1)),
counter_high(counter + (mask & 2)),
counter_high(counter + (mask & 3)),
counter_high(counter + (mask & 4)),
counter_high(counter + (mask & 5)),
counter_high(counter + (mask & 6)),
counter_high(counter + (mask & 7)),
),
)
}
#[target_feature(enable = "avx2")]
pub unsafe fn hash8(
inputs: &[*const u8; DEGREE],
blocks: usize,
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8; DEGREE * OUT_LEN],
) {
let mut h_vecs = [
set1(key[0]),
set1(key[1]),
set1(key[2]),
set1(key[3]),
set1(key[4]),
set1(key[5]),
set1(key[6]),
set1(key[7]),
];
let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
let mut block_flags = flags | flags_start;
for block in 0..blocks {
if block + 1 == blocks {
block_flags |= flags_end;
}
let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
let block_flags_vec = set1(block_flags as u32);
let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);
// The transposed compression function. Note that inlining this
// manually here improves compile times by a lot, compared to factoring
// it out into its own function and making it #[inline(always)]. Just
// guessing, it might have something to do with loop unrolling.
let mut v = [
h_vecs[0],
h_vecs[1],
h_vecs[2],
h_vecs[3],
h_vecs[4],
h_vecs[5],
h_vecs[6],
h_vecs[7],
set1(IV[0]),
set1(IV[1]),
set1(IV[2]),
set1(IV[3]),
counter_low_vec,
counter_high_vec,
block_len_vec,
block_flags_vec,
];
round(&mut v, &msg_vecs, 0);
round(&mut v, &msg_vecs, 1);
round(&mut v, &msg_vecs, 2);
round(&mut v, &msg_vecs, 3);
round(&mut v, &msg_vecs, 4);
round(&mut v, &msg_vecs, 5);
round(&mut v, &msg_vecs, 6);
h_vecs[0] = xor(v[0], v[8]);
h_vecs[1] = xor(v[1], v[9]);
h_vecs[2] = xor(v[2], v[10]);
h_vecs[3] = xor(v[3], v[11]);
h_vecs[4] = xor(v[4], v[12]);
h_vecs[5] = xor(v[5], v[13]);
h_vecs[6] = xor(v[6], v[14]);
h_vecs[7] = xor(v[7], v[15]);
block_flags = flags;
}
transpose_vecs(&mut h_vecs);
storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE));
storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE));
storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE));
storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE));
storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE));
storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE));
storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}
#[target_feature(enable = "avx2")]
pub unsafe fn hash_many<const N: usize>(
mut inputs: &[&[u8; N]],
key: &CVWords,
mut counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
mut out: &mut [u8],
) {
debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
// Safe because the layout of arrays is guaranteed, and because the
// `blocks` count is determined statically from the argument type.
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
let blocks = N / BLOCK_LEN;
hash8(
input_ptrs,
blocks,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
array_mut_ref!(out, 0, DEGREE * OUT_LEN),
);
if increment_counter.yes() {
counter += DEGREE as u64;
}
inputs = &inputs[DEGREE..];
out = &mut out[DEGREE * OUT_LEN..];
}
crate::sse41::hash_many(
inputs,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
out,
);
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_transpose() {
if !crate::platform::avx2_detected() {
return;
}
#[target_feature(enable = "avx2")]
unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) {
transpose_vecs(vecs);
}
let mut matrix = [[0 as u32; DEGREE]; DEGREE];
for i in 0..DEGREE {
for j in 0..DEGREE {
matrix[i][j] = (i * DEGREE + j) as u32;
}
}
unsafe {
let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix);
transpose_wrapper(&mut vecs);
matrix = core::mem::transmute(vecs);
}
for i in 0..DEGREE {
for j in 0..DEGREE {
// Reversed indexes from above.
assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
}
}
}
#[test]
fn test_hash_many() {
if !crate::platform::avx2_detected() {
return;
}
crate::test::test_hash_many_fn(hash_many, hash_many);
}
}

775
vendor/blake3/src/rust_sse2.rs vendored Normal file

@@ -0,0 +1,775 @@
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use crate::{
counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref, mut_array_refs};
pub const DEGREE: usize = 4;
#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m128i {
// This is an unaligned load, so the pointer cast is allowed.
_mm_loadu_si128(src as *const __m128i)
}
#[inline(always)]
unsafe fn storeu(src: __m128i, dest: *mut u8) {
// This is an unaligned store, so the pointer cast is allowed.
_mm_storeu_si128(dest as *mut __m128i, src)
}
#[inline(always)]
unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
_mm_add_epi32(a, b)
}
#[inline(always)]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
_mm_xor_si128(a, b)
}
#[inline(always)]
unsafe fn set1(x: u32) -> __m128i {
_mm_set1_epi32(x as i32)
}
#[inline(always)]
unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
_mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
}
// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.
#[inline(always)]
unsafe fn rot16(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16))
}
#[inline(always)]
unsafe fn rot12(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12))
}
#[inline(always)]
unsafe fn rot8(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8))
}
#[inline(always)]
unsafe fn rot7(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7))
}
#[inline(always)]
unsafe fn g1(
row0: &mut __m128i,
row1: &mut __m128i,
row2: &mut __m128i,
row3: &mut __m128i,
m: __m128i,
) {
*row0 = add(add(*row0, m), *row1);
*row3 = xor(*row3, *row0);
*row3 = rot16(*row3);
*row2 = add(*row2, *row3);
*row1 = xor(*row1, *row2);
*row1 = rot12(*row1);
}
#[inline(always)]
unsafe fn g2(
row0: &mut __m128i,
row1: &mut __m128i,
row2: &mut __m128i,
row3: &mut __m128i,
m: __m128i,
) {
*row0 = add(add(*row0, m), *row1);
*row3 = xor(*row3, *row0);
*row3 = rot8(*row3);
*row2 = add(*row2, *row3);
*row1 = xor(*row1, *row2);
*row1 = rot7(*row1);
}
// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
macro_rules! _MM_SHUFFLE {
($z:expr, $y:expr, $x:expr, $w:expr) => {
($z << 6) | ($y << 4) | ($x << 2) | $w
};
}
macro_rules! shuffle2 {
($a:expr, $b:expr, $c:expr) => {
_mm_castps_si128(_mm_shuffle_ps(
_mm_castsi128_ps($a),
_mm_castsi128_ps($b),
$c,
))
};
}
// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
}
#[inline(always)]
unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
}
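// An SSE2-only emulation of the SSE4.1 _mm_blend_epi16 instruction: build a 16-bit lane
// mask from imm8, then select lanes from b where the mask is set and from a elsewhere.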
#[inline(always)]
unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i {
let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
let mut mask = _mm_set1_epi16(imm8 as i16);
mask = _mm_and_si128(mask, bits);
mask = _mm_cmpeq_epi16(mask, bits);
_mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a))
}
#[inline(always)]
unsafe fn compress_pre(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [__m128i; 4] {
let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
let row3 = &mut set4(
counter_low(counter),
counter_high(counter),
block_len as u32,
flags as u32,
);
let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));
let mut t0;
let mut t1;
let mut t2;
let mut t3;
let mut tt;
// Round 1. The first round permutes the message words from the original
// input order, into the groups that get mixed in parallel.
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8
t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14
g1(row0, row1, row2, row3, t2);
t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9
t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 2. This round and all following rounds apply a fixed permutation
// to the message words from the round before.
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 3
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 4
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 5
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 6
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 7
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
[*row0, *row1, *row2, *row3]
}
#[target_feature(enable = "sse2")]
pub unsafe fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) {
let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
}
#[target_feature(enable = "sse2")]
pub unsafe fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u8; 64] {
let [mut row0, mut row1, mut row2, mut row3] =
compress_pre(cv, block, block_len, counter, flags);
row0 = xor(row0, row2);
row1 = xor(row1, row3);
row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
core::mem::transmute([row0, row1, row2, row3])
}
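// The transposed round function used by hash4 below. Each of the 16 state
// vectors holds one state word across DEGREE parallel inputs, so the scalar
// G function becomes plain lane-wise adds, xors, and rotates.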
#[inline(always)]
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
v[0] = add(v[0], v[4]);
v[1] = add(v[1], v[5]);
v[2] = add(v[2], v[6]);
v[3] = add(v[3], v[7]);
v[12] = xor(v[12], v[0]);
v[13] = xor(v[13], v[1]);
v[14] = xor(v[14], v[2]);
v[15] = xor(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = add(v[8], v[12]);
v[9] = add(v[9], v[13]);
v[10] = add(v[10], v[14]);
v[11] = add(v[11], v[15]);
v[4] = xor(v[4], v[8]);
v[5] = xor(v[5], v[9]);
v[6] = xor(v[6], v[10]);
v[7] = xor(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
v[0] = add(v[0], v[4]);
v[1] = add(v[1], v[5]);
v[2] = add(v[2], v[6]);
v[3] = add(v[3], v[7]);
v[12] = xor(v[12], v[0]);
v[13] = xor(v[13], v[1]);
v[14] = xor(v[14], v[2]);
v[15] = xor(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = add(v[8], v[12]);
v[9] = add(v[9], v[13]);
v[10] = add(v[10], v[14]);
v[11] = add(v[11], v[15]);
v[4] = xor(v[4], v[8]);
v[5] = xor(v[5], v[9]);
v[6] = xor(v[6], v[10]);
v[7] = xor(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
v[0] = add(v[0], v[5]);
v[1] = add(v[1], v[6]);
v[2] = add(v[2], v[7]);
v[3] = add(v[3], v[4]);
v[15] = xor(v[15], v[0]);
v[12] = xor(v[12], v[1]);
v[13] = xor(v[13], v[2]);
v[14] = xor(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = add(v[10], v[15]);
v[11] = add(v[11], v[12]);
v[8] = add(v[8], v[13]);
v[9] = add(v[9], v[14]);
v[5] = xor(v[5], v[10]);
v[6] = xor(v[6], v[11]);
v[7] = xor(v[7], v[8]);
v[4] = xor(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
v[0] = add(v[0], v[5]);
v[1] = add(v[1], v[6]);
v[2] = add(v[2], v[7]);
v[3] = add(v[3], v[4]);
v[15] = xor(v[15], v[0]);
v[12] = xor(v[12], v[1]);
v[13] = xor(v[13], v[2]);
v[14] = xor(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = add(v[10], v[15]);
v[11] = add(v[11], v[12]);
v[8] = add(v[8], v[13]);
v[9] = add(v[9], v[14]);
v[5] = xor(v[5], v[10]);
v[6] = xor(v[6], v[11]);
v[7] = xor(v[7], v[8]);
v[4] = xor(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
#[inline(always)]
unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
// Interleave 64-bit lanes.
let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
vecs[0] = abcd_0;
vecs[1] = abcd_1;
vecs[2] = abcd_2;
vecs[3] = abcd_3;
}
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] {
let mut vecs = [
loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
];
for i in 0..DEGREE {
_mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
}
let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
transpose_vecs(squares.0);
transpose_vecs(squares.1);
transpose_vecs(squares.2);
transpose_vecs(squares.3);
vecs
}
#[inline(always)]
unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
let mask = if increment_counter.yes() { !0 } else { 0 };
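// When incrementing, lane i gets counter + i; otherwise the mask zeroes the
// offsets and every lane shares the same counter value.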
(
set4(
counter_low(counter + (mask & 0)),
counter_low(counter + (mask & 1)),
counter_low(counter + (mask & 2)),
counter_low(counter + (mask & 3)),
),
set4(
counter_high(counter + (mask & 0)),
counter_high(counter + (mask & 1)),
counter_high(counter + (mask & 2)),
counter_high(counter + (mask & 3)),
),
)
}
#[target_feature(enable = "sse2")]
pub unsafe fn hash4(
inputs: &[*const u8; DEGREE],
blocks: usize,
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8; DEGREE * OUT_LEN],
) {
let mut h_vecs = [
set1(key[0]),
set1(key[1]),
set1(key[2]),
set1(key[3]),
set1(key[4]),
set1(key[5]),
set1(key[6]),
set1(key[7]),
];
let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
let mut block_flags = flags | flags_start;
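// Only the first block of each input carries flags_start, and only the last
// carries flags_end; the loop below resets to the base flags in between.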
for block in 0..blocks {
if block + 1 == blocks {
block_flags |= flags_end;
}
let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
let block_flags_vec = set1(block_flags as u32);
let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);
// The transposed compression function. Note that inlining this
// manually here improves compile times by a lot, compared to factoring
// it out into its own function and making it #[inline(always)]. Just
// guessing, it might have something to do with loop unrolling.
let mut v = [
h_vecs[0],
h_vecs[1],
h_vecs[2],
h_vecs[3],
h_vecs[4],
h_vecs[5],
h_vecs[6],
h_vecs[7],
set1(IV[0]),
set1(IV[1]),
set1(IV[2]),
set1(IV[3]),
counter_low_vec,
counter_high_vec,
block_len_vec,
block_flags_vec,
];
round(&mut v, &msg_vecs, 0);
round(&mut v, &msg_vecs, 1);
round(&mut v, &msg_vecs, 2);
round(&mut v, &msg_vecs, 3);
round(&mut v, &msg_vecs, 4);
round(&mut v, &msg_vecs, 5);
round(&mut v, &msg_vecs, 6);
h_vecs[0] = xor(v[0], v[8]);
h_vecs[1] = xor(v[1], v[9]);
h_vecs[2] = xor(v[2], v[10]);
h_vecs[3] = xor(v[3], v[11]);
h_vecs[4] = xor(v[4], v[12]);
h_vecs[5] = xor(v[5], v[13]);
h_vecs[6] = xor(v[6], v[14]);
h_vecs[7] = xor(v[7], v[15]);
block_flags = flags;
}
let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
transpose_vecs(squares.0);
transpose_vecs(squares.1);
// The first four vecs now contain the first half of each output, and the
// second four vecs contain the second half of each output.
storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}
#[target_feature(enable = "sse2")]
unsafe fn hash1<const N: usize>(
input: &[u8; N],
key: &CVWords,
counter: u64,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut CVBytes,
) {
debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
let mut cv = *key;
let mut block_flags = flags | flags_start;
let mut slice = &input[..];
while slice.len() >= BLOCK_LEN {
if slice.len() == BLOCK_LEN {
block_flags |= flags_end;
}
compress_in_place(
&mut cv,
array_ref!(slice, 0, BLOCK_LEN),
BLOCK_LEN as u8,
counter,
block_flags,
);
block_flags = flags;
slice = &slice[BLOCK_LEN..];
}
*out = core::mem::transmute(cv); // x86 is little-endian
}
#[target_feature(enable = "sse2")]
pub unsafe fn hash_many<const N: usize>(
mut inputs: &[&[u8; N]],
key: &CVWords,
mut counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
mut out: &mut [u8],
) {
debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
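// Hash DEGREE inputs at a time with hash4, then fall back to one-at-a-time
// hash1 calls for any remainder.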
while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
// Safe because the layout of arrays is guaranteed, and because the
// `blocks` count is determined statically from the argument type.
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
let blocks = N / BLOCK_LEN;
hash4(
input_ptrs,
blocks,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
array_mut_ref!(out, 0, DEGREE * OUT_LEN),
);
if increment_counter.yes() {
counter += DEGREE as u64;
}
inputs = &inputs[DEGREE..];
out = &mut out[DEGREE * OUT_LEN..];
}
for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
hash1(
input,
key,
counter,
flags,
flags_start,
flags_end,
array_mut_ref!(output, 0, OUT_LEN),
);
if increment_counter.yes() {
counter += 1;
}
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_transpose() {
if !crate::platform::sse2_detected() {
return;
}
#[target_feature(enable = "sse2")]
unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) {
transpose_vecs(vecs);
}
let mut matrix = [[0 as u32; DEGREE]; DEGREE];
for i in 0..DEGREE {
for j in 0..DEGREE {
matrix[i][j] = (i * DEGREE + j) as u32;
}
}
unsafe {
let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix);
transpose_wrapper(&mut vecs);
matrix = core::mem::transmute(vecs);
}
for i in 0..DEGREE {
for j in 0..DEGREE {
// Reversed indexes from above.
assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
}
}
}
#[test]
fn test_compress() {
if !crate::platform::sse2_detected() {
return;
}
crate::test::test_compress_fn(compress_in_place, compress_xof);
}
#[test]
fn test_hash_many() {
if !crate::platform::sse2_detected() {
return;
}
crate::test::test_hash_many_fn(hash_many, hash_many);
}
}

vendor/blake3/src/rust_sse41.rs vendored Normal file


@@ -0,0 +1,766 @@
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use crate::{
counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref, mut_array_refs};
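// This implementation processes four inputs in parallel: one 32-bit lane of
// each __m128i per input.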
pub const DEGREE: usize = 4;
#[inline(always)]
unsafe fn loadu(src: *const u8) -> __m128i {
// This is an unaligned load, so the pointer cast is allowed.
_mm_loadu_si128(src as *const __m128i)
}
#[inline(always)]
unsafe fn storeu(src: __m128i, dest: *mut u8) {
// This is an unaligned store, so the pointer cast is allowed.
_mm_storeu_si128(dest as *mut __m128i, src)
}
#[inline(always)]
unsafe fn add(a: __m128i, b: __m128i) -> __m128i {
_mm_add_epi32(a, b)
}
#[inline(always)]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
_mm_xor_si128(a, b)
}
#[inline(always)]
unsafe fn set1(x: u32) -> __m128i {
_mm_set1_epi32(x as i32)
}
#[inline(always)]
unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i {
_mm_setr_epi32(a as i32, b as i32, c as i32, d as i32)
}
// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.
#[inline(always)]
unsafe fn rot16(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16))
}
#[inline(always)]
unsafe fn rot12(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12))
}
#[inline(always)]
unsafe fn rot8(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8))
}
#[inline(always)]
unsafe fn rot7(a: __m128i) -> __m128i {
_mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7))
}
#[inline(always)]
unsafe fn g1(
row0: &mut __m128i,
row1: &mut __m128i,
row2: &mut __m128i,
row3: &mut __m128i,
m: __m128i,
) {
*row0 = add(add(*row0, m), *row1);
*row3 = xor(*row3, *row0);
*row3 = rot16(*row3);
*row2 = add(*row2, *row3);
*row1 = xor(*row1, *row2);
*row1 = rot12(*row1);
}
#[inline(always)]
unsafe fn g2(
row0: &mut __m128i,
row1: &mut __m128i,
row2: &mut __m128i,
row3: &mut __m128i,
m: __m128i,
) {
*row0 = add(add(*row0, m), *row1);
*row3 = xor(*row3, *row0);
*row3 = rot8(*row3);
*row2 = add(*row2, *row3);
*row1 = xor(*row1, *row2);
*row1 = rot7(*row1);
}
// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479.
macro_rules! _MM_SHUFFLE {
($z:expr, $y:expr, $x:expr, $w:expr) => {
($z << 6) | ($y << 4) | ($x << 2) | $w
};
}
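// For example, _MM_SHUFFLE!(2, 1, 0, 3) evaluates to 0b10_01_00_11: the
// destination lanes 0..3 take source lanes 3, 0, 1, 2 respectively.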
macro_rules! shuffle2 {
($a:expr, $b:expr, $c:expr) => {
_mm_castps_si128(_mm_shuffle_ps(
_mm_castsi128_ps($a),
_mm_castsi128_ps($b),
$c,
))
};
}
// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
}
#[inline(always)]
unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
}
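// Run all seven rounds over the four row vectors and return them before the
// final feed-forward, so that compress_in_place and compress_xof below can
// each apply their own finalization.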
#[inline(always)]
unsafe fn compress_pre(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [__m128i; 4] {
let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8);
let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8);
let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
let row3 = &mut set4(
counter_low(counter),
counter_high(counter),
block_len as u32,
flags as u32,
);
let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE));
let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE));
let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE));
let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE));
let mut t0;
let mut t1;
let mut t2;
let mut t3;
let mut tt;
// Round 1. The first round permutes the message words from the original
// input order, into the groups that get mixed in parallel.
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8
t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14
g1(row0, row1, row2, row3, t2);
t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9
t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 2. This round and all following rounds apply a fixed permutation
// to the message words from the round before.
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 3
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 4
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 5
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 6
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 7
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1(row0, row1, row2, row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1(row0, row1, row2, row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
[*row0, *row1, *row2, *row3]
}
#[target_feature(enable = "sse4.1")]
pub unsafe fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) {
let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
}
#[target_feature(enable = "sse4.1")]
pub unsafe fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u8; 64] {
let [mut row0, mut row1, mut row2, mut row3] =
compress_pre(cv, block, block_len, counter, flags);
row0 = xor(row0, row2);
row1 = xor(row1, row3);
row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8));
row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8));
core::mem::transmute([row0, row1, row2, row3])
}
#[inline(always)]
unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) {
v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
v[0] = add(v[0], v[4]);
v[1] = add(v[1], v[5]);
v[2] = add(v[2], v[6]);
v[3] = add(v[3], v[7]);
v[12] = xor(v[12], v[0]);
v[13] = xor(v[13], v[1]);
v[14] = xor(v[14], v[2]);
v[15] = xor(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = add(v[8], v[12]);
v[9] = add(v[9], v[13]);
v[10] = add(v[10], v[14]);
v[11] = add(v[11], v[15]);
v[4] = xor(v[4], v[8]);
v[5] = xor(v[5], v[9]);
v[6] = xor(v[6], v[10]);
v[7] = xor(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
v[0] = add(v[0], v[4]);
v[1] = add(v[1], v[5]);
v[2] = add(v[2], v[6]);
v[3] = add(v[3], v[7]);
v[12] = xor(v[12], v[0]);
v[13] = xor(v[13], v[1]);
v[14] = xor(v[14], v[2]);
v[15] = xor(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = add(v[8], v[12]);
v[9] = add(v[9], v[13]);
v[10] = add(v[10], v[14]);
v[11] = add(v[11], v[15]);
v[4] = xor(v[4], v[8]);
v[5] = xor(v[5], v[9]);
v[6] = xor(v[6], v[10]);
v[7] = xor(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
v[0] = add(v[0], v[5]);
v[1] = add(v[1], v[6]);
v[2] = add(v[2], v[7]);
v[3] = add(v[3], v[4]);
v[15] = xor(v[15], v[0]);
v[12] = xor(v[12], v[1]);
v[13] = xor(v[13], v[2]);
v[14] = xor(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = add(v[10], v[15]);
v[11] = add(v[11], v[12]);
v[8] = add(v[8], v[13]);
v[9] = add(v[9], v[14]);
v[5] = xor(v[5], v[10]);
v[6] = xor(v[6], v[11]);
v[7] = xor(v[7], v[8]);
v[4] = xor(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
v[0] = add(v[0], v[5]);
v[1] = add(v[1], v[6]);
v[2] = add(v[2], v[7]);
v[3] = add(v[3], v[4]);
v[15] = xor(v[15], v[0]);
v[12] = xor(v[12], v[1]);
v[13] = xor(v[13], v[2]);
v[14] = xor(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = add(v[10], v[15]);
v[11] = add(v[11], v[12]);
v[8] = add(v[8], v[13]);
v[9] = add(v[9], v[14]);
v[5] = xor(v[5], v[10]);
v[6] = xor(v[6], v[11]);
v[7] = xor(v[7], v[8]);
v[4] = xor(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
#[inline(always)]
unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
// Interleave 64-bit lanes.
let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
vecs[0] = abcd_0;
vecs[1] = abcd_1;
vecs[2] = abcd_2;
vecs[3] = abcd_3;
}
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] {
let mut vecs = [
loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
];
for i in 0..DEGREE {
_mm_prefetch(inputs[i].add(block_offset + 256) as *const i8, _MM_HINT_T0);
}
let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
transpose_vecs(squares.0);
transpose_vecs(squares.1);
transpose_vecs(squares.2);
transpose_vecs(squares.3);
vecs
}
#[inline(always)]
unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) {
let mask = if increment_counter.yes() { !0 } else { 0 };
(
set4(
counter_low(counter + (mask & 0)),
counter_low(counter + (mask & 1)),
counter_low(counter + (mask & 2)),
counter_low(counter + (mask & 3)),
),
set4(
counter_high(counter + (mask & 0)),
counter_high(counter + (mask & 1)),
counter_high(counter + (mask & 2)),
counter_high(counter + (mask & 3)),
),
)
}
#[target_feature(enable = "sse4.1")]
pub unsafe fn hash4(
inputs: &[*const u8; DEGREE],
blocks: usize,
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8; DEGREE * OUT_LEN],
) {
let mut h_vecs = [
set1(key[0]),
set1(key[1]),
set1(key[2]),
set1(key[3]),
set1(key[4]),
set1(key[5]),
set1(key[6]),
set1(key[7]),
];
let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
let mut block_flags = flags | flags_start;
for block in 0..blocks {
if block + 1 == blocks {
block_flags |= flags_end;
}
let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
let block_flags_vec = set1(block_flags as u32);
let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);
// The transposed compression function. Note that inlining this
// manually here improves compile times by a lot, compared to factoring
// it out into its own function and making it #[inline(always)]. Just
// guessing, it might have something to do with loop unrolling.
let mut v = [
h_vecs[0],
h_vecs[1],
h_vecs[2],
h_vecs[3],
h_vecs[4],
h_vecs[5],
h_vecs[6],
h_vecs[7],
set1(IV[0]),
set1(IV[1]),
set1(IV[2]),
set1(IV[3]),
counter_low_vec,
counter_high_vec,
block_len_vec,
block_flags_vec,
];
round(&mut v, &msg_vecs, 0);
round(&mut v, &msg_vecs, 1);
round(&mut v, &msg_vecs, 2);
round(&mut v, &msg_vecs, 3);
round(&mut v, &msg_vecs, 4);
round(&mut v, &msg_vecs, 5);
round(&mut v, &msg_vecs, 6);
h_vecs[0] = xor(v[0], v[8]);
h_vecs[1] = xor(v[1], v[9]);
h_vecs[2] = xor(v[2], v[10]);
h_vecs[3] = xor(v[3], v[11]);
h_vecs[4] = xor(v[4], v[12]);
h_vecs[5] = xor(v[5], v[13]);
h_vecs[6] = xor(v[6], v[14]);
h_vecs[7] = xor(v[7], v[15]);
block_flags = flags;
}
let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
transpose_vecs(squares.0);
transpose_vecs(squares.1);
// The first four vecs now contain the first half of each output, and the
// second four vecs contain the second half of each output.
storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}
#[target_feature(enable = "sse4.1")]
unsafe fn hash1<const N: usize>(
input: &[u8; N],
key: &CVWords,
counter: u64,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut CVBytes,
) {
debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
let mut cv = *key;
let mut block_flags = flags | flags_start;
let mut slice = &input[..];
while slice.len() >= BLOCK_LEN {
if slice.len() == BLOCK_LEN {
block_flags |= flags_end;
}
compress_in_place(
&mut cv,
array_ref!(slice, 0, BLOCK_LEN),
BLOCK_LEN as u8,
counter,
block_flags,
);
block_flags = flags;
slice = &slice[BLOCK_LEN..];
}
*out = core::mem::transmute(cv); // x86 is little-endian
}
#[target_feature(enable = "sse4.1")]
pub unsafe fn hash_many<const N: usize>(
mut inputs: &[&[u8; N]],
key: &CVWords,
mut counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
mut out: &mut [u8],
) {
debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
// Safe because the layout of arrays is guaranteed, and because the
// `blocks` count is determined statically from the argument type.
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
let blocks = N / BLOCK_LEN;
hash4(
input_ptrs,
blocks,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
array_mut_ref!(out, 0, DEGREE * OUT_LEN),
);
if increment_counter.yes() {
counter += DEGREE as u64;
}
inputs = &inputs[DEGREE..];
out = &mut out[DEGREE * OUT_LEN..];
}
for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
hash1(
input,
key,
counter,
flags,
flags_start,
flags_end,
array_mut_ref!(output, 0, OUT_LEN),
);
if increment_counter.yes() {
counter += 1;
}
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_transpose() {
if !crate::platform::sse41_detected() {
return;
}
#[target_feature(enable = "sse4.1")]
unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) {
transpose_vecs(vecs);
}
let mut matrix = [[0 as u32; DEGREE]; DEGREE];
for i in 0..DEGREE {
for j in 0..DEGREE {
matrix[i][j] = (i * DEGREE + j) as u32;
}
}
unsafe {
let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix);
transpose_wrapper(&mut vecs);
matrix = core::mem::transmute(vecs);
}
for i in 0..DEGREE {
for j in 0..DEGREE {
// Reversed indexes from above.
assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
}
}
}
#[test]
fn test_compress() {
if !crate::platform::sse41_detected() {
return;
}
crate::test::test_compress_fn(compress_in_place, compress_xof);
}
#[test]
fn test_hash_many() {
if !crate::platform::sse41_detected() {
return;
}
crate::test::test_hash_many_fn(hash_many, hash_many);
}
}

vendor/blake3/src/test.rs vendored Normal file

File diff suppressed because it is too large

vendor/blake3/src/traits.rs vendored Normal file

@@ -0,0 +1,227 @@
//! Implementations of commonly used traits like `Digest` and `Mac` from the
//! [`digest`](https://crates.io/crates/digest) crate.
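//!
//! A minimal usage sketch (assumptions: `digest` is a direct dependency, and
//! these impls are enabled via the relevant cargo feature of the published
//! crate):
//!
//! ```ignore
//! use blake3::Hasher;
//! use digest::Digest;
//!
//! let mut hasher: Hasher = Digest::new();
//! Digest::update(&mut hasher, b"hello world");
//! let output = Digest::finalize(hasher); // 32-byte GenericArray
//! ```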
pub use digest;
use crate::{Hasher, OutputReader};
use digest::crypto_common;
use digest::generic_array::{typenum::U32, typenum::U64, GenericArray};
impl digest::HashMarker for Hasher {}
impl digest::Update for Hasher {
#[inline]
fn update(&mut self, data: &[u8]) {
self.update(data);
}
}
impl digest::Reset for Hasher {
#[inline]
fn reset(&mut self) {
self.reset(); // the inherent method
}
}
impl digest::OutputSizeUser for Hasher {
type OutputSize = U32;
}
impl digest::FixedOutput for Hasher {
#[inline]
fn finalize_into(self, out: &mut GenericArray<u8, Self::OutputSize>) {
out.copy_from_slice(self.finalize().as_bytes());
}
}
impl digest::FixedOutputReset for Hasher {
#[inline]
fn finalize_into_reset(&mut self, out: &mut GenericArray<u8, Self::OutputSize>) {
out.copy_from_slice(self.finalize().as_bytes());
self.reset();
}
}
impl digest::ExtendableOutput for Hasher {
type Reader = OutputReader;
#[inline]
fn finalize_xof(self) -> Self::Reader {
Hasher::finalize_xof(&self)
}
}
impl digest::ExtendableOutputReset for Hasher {
#[inline]
fn finalize_xof_reset(&mut self) -> Self::Reader {
let reader = Hasher::finalize_xof(self);
self.reset();
reader
}
}
impl digest::XofReader for OutputReader {
#[inline]
fn read(&mut self, buffer: &mut [u8]) {
self.fill(buffer);
}
}
impl crypto_common::KeySizeUser for Hasher {
type KeySize = U32;
}
impl crypto_common::BlockSizeUser for Hasher {
type BlockSize = U64;
}
impl digest::MacMarker for Hasher {}
impl digest::KeyInit for Hasher {
#[inline]
fn new(key: &digest::Key<Self>) -> Self {
let key_bytes: [u8; 32] = (*key).into();
Hasher::new_keyed(&key_bytes)
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_digest_traits() {
// Inherent methods.
let mut hasher1 = crate::Hasher::new();
hasher1.update(b"foo");
hasher1.update(b"bar");
hasher1.update(b"baz");
let out1 = hasher1.finalize();
let mut xof1 = [0; 301];
hasher1.finalize_xof().fill(&mut xof1);
assert_eq!(out1.as_bytes(), &xof1[..32]);
// Trait implementations.
let mut hasher2: crate::Hasher = digest::Digest::new();
digest::Digest::update(&mut hasher2, b"xxx");
digest::Digest::reset(&mut hasher2);
digest::Digest::update(&mut hasher2, b"foo");
digest::Digest::update(&mut hasher2, b"bar");
digest::Digest::update(&mut hasher2, b"baz");
let out2 = digest::Digest::finalize(hasher2.clone());
let mut xof2 = [0; 301];
digest::XofReader::read(
&mut digest::ExtendableOutput::finalize_xof(hasher2.clone()),
&mut xof2,
);
assert_eq!(out1.as_bytes(), &out2[..]);
assert_eq!(xof1[..], xof2[..]);
// Again with the resetting variants.
let mut hasher3: crate::Hasher = digest::Digest::new();
digest::Digest::update(&mut hasher3, b"foobarbaz");
let mut out3 = [0; 32];
digest::FixedOutputReset::finalize_into_reset(
&mut hasher3,
GenericArray::from_mut_slice(&mut out3),
);
digest::Digest::update(&mut hasher3, b"foobarbaz");
let mut out4 = [0; 32];
digest::FixedOutputReset::finalize_into_reset(
&mut hasher3,
GenericArray::from_mut_slice(&mut out4),
);
digest::Digest::update(&mut hasher3, b"foobarbaz");
let mut xof3 = [0; 301];
digest::XofReader::read(
&mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3),
&mut xof3,
);
digest::Digest::update(&mut hasher3, b"foobarbaz");
let mut xof4 = [0; 301];
digest::XofReader::read(
&mut digest::ExtendableOutputReset::finalize_xof_reset(&mut hasher3),
&mut xof4,
);
assert_eq!(out1.as_bytes(), &out3[..]);
assert_eq!(out1.as_bytes(), &out4[..]);
assert_eq!(xof1[..], xof3[..]);
assert_eq!(xof1[..], xof4[..]);
}
#[test]
fn test_mac_trait() {
// Inherent methods.
let key = b"some super secret key bytes fooo";
let mut hasher1 = crate::Hasher::new_keyed(key);
hasher1.update(b"foo");
hasher1.update(b"bar");
hasher1.update(b"baz");
let out1 = hasher1.finalize();
// Trait implementation.
let generic_key = (*key).into();
let mut hasher2: crate::Hasher = digest::Mac::new(&generic_key);
digest::Mac::update(&mut hasher2, b"xxx");
digest::Mac::reset(&mut hasher2);
digest::Mac::update(&mut hasher2, b"foo");
digest::Mac::update(&mut hasher2, b"bar");
digest::Mac::update(&mut hasher2, b"baz");
let out2 = digest::Mac::finalize(hasher2);
assert_eq!(out1.as_bytes(), out2.into_bytes().as_slice());
}
fn expected_hmac_blake3(key: &[u8], input: &[u8]) -> [u8; 32] {
// See https://en.wikipedia.org/wiki/HMAC.
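// Keys longer than the 64-byte block size are first hashed down to 32 bytes;
// shorter keys are effectively zero-padded, since the untouched ipad/opad
// bytes below keep their constants.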
let key_hash;
let key_prime = if key.len() <= 64 {
key
} else {
key_hash = *crate::hash(key).as_bytes();
&key_hash
};
let mut ipad = [0x36; 64];
let mut opad = [0x5c; 64];
for i in 0..key_prime.len() {
ipad[i] ^= key_prime[i];
opad[i] ^= key_prime[i];
}
let mut inner_state = crate::Hasher::new();
inner_state.update(&ipad);
inner_state.update(input);
let mut outer_state = crate::Hasher::new();
outer_state.update(&opad);
outer_state.update(inner_state.finalize().as_bytes());
outer_state.finalize().into()
}
#[test]
fn test_hmac_compatibility() {
use hmac::{Mac, SimpleHmac};
// Test a short key.
let mut x = SimpleHmac::<Hasher>::new_from_slice(b"key").unwrap();
hmac::digest::Update::update(&mut x, b"data");
let output = x.finalize().into_bytes();
assert_ne!(output.len(), 0);
let expected = expected_hmac_blake3(b"key", b"data");
assert_eq!(expected, output.as_ref());
// Test a range of key and data lengths, particularly to exercise the long-key logic.
let mut input_bytes = [0; crate::test::TEST_CASES_MAX];
crate::test::paint_test_input(&mut input_bytes);
for &input_len in crate::test::TEST_CASES {
#[cfg(feature = "std")]
dbg!(input_len);
let input = &input_bytes[..input_len];
let mut x = SimpleHmac::<Hasher>::new_from_slice(input).unwrap();
hmac::digest::Update::update(&mut x, input);
let output = x.finalize().into_bytes();
assert_ne!(output.len(), 0);
let expected = expected_hmac_blake3(input, input);
assert_eq!(expected, output.as_ref());
}
}
}

vendor/blake3/src/wasm32_simd.rs vendored Normal file

@@ -0,0 +1,794 @@
/*
 * This code is based on rust_sse2.rs from the same distribution and is
 * subject to further improvement. Some comments have been kept even where
 * their applicability to the Wasm port is questionable.
 *
 * Performance measurements from a primitive benchmark over ~16 KiB of data:
*
* | M1 native | 11,610 ns |
* | M1 Wasm SIMD | 13,355 ns |
* | M1 Wasm | 22,037 ns |
* | x64 native | 6,713 ns |
* | x64 Wasm SIMD | 11,985 ns |
* | x64 Wasm | 25,978 ns |
*
* wasmtime v12.0.1 was used on both platforms.
*/
use core::arch::wasm32::*;
use crate::{
counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE,
OUT_LEN,
};
use arrayref::{array_mut_ref, array_ref, mut_array_refs};
pub const DEGREE: usize = 4;
#[inline(always)]
unsafe fn loadu(src: *const u8) -> v128 {
// This is an unaligned load, so the pointer cast is allowed.
v128_load(src as *const v128)
}
#[inline(always)]
unsafe fn storeu(src: v128, dest: *mut u8) {
// This is an unaligned store, so the pointer cast is allowed.
v128_store(dest as *mut v128, src)
}
#[inline(always)]
fn add(a: v128, b: v128) -> v128 {
i32x4_add(a, b)
}
#[inline(always)]
fn xor(a: v128, b: v128) -> v128 {
v128_xor(a, b)
}
#[inline(always)]
fn set1(x: u32) -> v128 {
i32x4_splat(x as i32)
}
#[inline(always)]
fn set4(a: u32, b: u32, c: u32, d: u32) -> v128 {
i32x4(a as i32, b as i32, c as i32, d as i32)
}
// These rotations are the "simple/shifts version". For the
// "complicated/shuffles version", see
// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66.
// For a discussion of the tradeoffs, see
// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug
// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better
// on recent x86 chips.
#[inline(always)]
fn rot16(a: v128) -> v128 {
v128_or(u32x4_shr(a, 16), u32x4_shl(a, 32 - 16))
}
#[inline(always)]
fn rot12(a: v128) -> v128 {
v128_or(u32x4_shr(a, 12), u32x4_shl(a, 32 - 12))
}
#[inline(always)]
fn rot8(a: v128) -> v128 {
v128_or(u32x4_shr(a, 8), u32x4_shl(a, 32 - 8))
}
#[inline(always)]
fn rot7(a: v128) -> v128 {
v128_or(u32x4_shr(a, 7), u32x4_shl(a, 32 - 7))
}
#[inline(always)]
fn g1(row0: &mut v128, row1: &mut v128, row2: &mut v128, row3: &mut v128, m: v128) {
*row0 = add(add(*row0, m), *row1);
*row3 = xor(*row3, *row0);
*row3 = rot16(*row3);
*row2 = add(*row2, *row3);
*row1 = xor(*row1, *row2);
*row1 = rot12(*row1);
}
#[inline(always)]
fn g2(row0: &mut v128, row1: &mut v128, row2: &mut v128, row3: &mut v128, m: v128) {
*row0 = add(add(*row0, m), *row1);
*row3 = xor(*row3, *row0);
*row3 = rot8(*row3);
*row2 = add(*row2, *row3);
*row1 = xor(*row1, *row2);
*row1 = rot7(*row1);
}
// This could be a function, but arithmetic in const generics is still too limited.
macro_rules! shuffle {
($a: expr, $b: expr, $z:expr, $y:expr, $x:expr, $w:expr) => {
i32x4_shuffle::<{ $w }, { $x }, { $y + 4 }, { $z + 4 }>($a, $b)
};
}
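// For example, shuffle!(a, b, 2, 0, 2, 0) yields [a[0], a[2], b[0], b[2]],
// matching the _mm_shuffle_ps semantics that the x86 backends rely on.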
#[inline(always)]
fn unpacklo_epi64(a: v128, b: v128) -> v128 {
i64x2_shuffle::<0, 2>(a, b)
}
#[inline(always)]
fn unpackhi_epi64(a: v128, b: v128) -> v128 {
i64x2_shuffle::<1, 3>(a, b)
}
#[inline(always)]
fn unpacklo_epi32(a: v128, b: v128) -> v128 {
i32x4_shuffle::<0, 4, 1, 5>(a, b)
}
#[inline(always)]
fn unpackhi_epi32(a: v128, b: v128) -> v128 {
i32x4_shuffle::<2, 6, 3, 7>(a, b)
}
#[inline(always)]
fn shuffle_epi32<const I3: usize, const I2: usize, const I1: usize, const I0: usize>(
a: v128,
) -> v128 {
// Note that the generic arguments appear in opposite orders in the
// declaration and in the i32x4_shuffle call below. The second operand is
// effectively ignored, since all four indices select lanes from the first.
i32x4_shuffle::<I0, I1, I2, I3>(a, a)
}
#[inline(always)]
fn blend_epi16(a: v128, b: v128, imm8: i32) -> v128 {
// imm8 is always a constant, which would allow implementing this function
// with i16x8_shuffle; however, that turns out to be marginally slower on x64.
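// For example, imm8 = 0xCC selects 16-bit lanes 2, 3, 6, 7 (32-bit words 1
// and 3) from b and the rest from a, matching _mm_blend_epi16.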
let bits = i16x8(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80);
let mut mask = i16x8_splat(imm8 as i16);
mask = v128_and(mask, bits);
mask = i16x8_eq(mask, bits);
// The swapped argument order is equivalent to mask negation.
v128_bitselect(b, a, mask)
}
// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
#[inline(always)]
fn diagonalize(row0: &mut v128, row2: &mut v128, row3: &mut v128) {
*row0 = shuffle_epi32::<2, 1, 0, 3>(*row0);
*row3 = shuffle_epi32::<1, 0, 3, 2>(*row3);
*row2 = shuffle_epi32::<0, 3, 2, 1>(*row2);
}
#[inline(always)]
fn undiagonalize(row0: &mut v128, row2: &mut v128, row3: &mut v128) {
*row0 = shuffle_epi32::<0, 3, 2, 1>(*row0);
*row3 = shuffle_epi32::<1, 0, 3, 2>(*row3);
*row2 = shuffle_epi32::<2, 1, 0, 3>(*row2);
}
#[inline(always)]
fn compress_pre(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [v128; 4] {
// safe because CVWords is [u32; 8]
let row0 = &mut unsafe { loadu(cv.as_ptr().add(0) as *const u8) };
let row1 = &mut unsafe { loadu(cv.as_ptr().add(4) as *const u8) };
let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]);
let row3 = &mut set4(
counter_low(counter),
counter_high(counter),
block_len as u32,
flags as u32,
);
// safe because block is &[u8; 64]
let mut m0 = unsafe { loadu(block.as_ptr().add(0 * 4 * DEGREE)) };
let mut m1 = unsafe { loadu(block.as_ptr().add(1 * 4 * DEGREE)) };
let mut m2 = unsafe { loadu(block.as_ptr().add(2 * 4 * DEGREE)) };
let mut m3 = unsafe { loadu(block.as_ptr().add(3 * 4 * DEGREE)) };
let mut t0;
let mut t1;
let mut t2;
let mut t3;
let mut tt;
// Round 1. The first round permutes the message words from the original
// input order, into the groups that get mixed in parallel.
t0 = shuffle!(m0, m1, 2, 0, 2, 0); // 6 4 2 0
g1(row0, row1, row2, row3, t0);
t1 = shuffle!(m0, m1, 3, 1, 3, 1); // 7 5 3 1
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = shuffle!(m2, m3, 2, 0, 2, 0); // 14 12 10 8
t2 = shuffle_epi32::<2, 1, 0, 3>(t2); // 12 10 8 14
g1(row0, row1, row2, row3, t2);
t3 = shuffle!(m2, m3, 3, 1, 3, 1); // 15 13 11 9
t3 = shuffle_epi32::<2, 1, 0, 3>(t3); // 13 11 9 15
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 2. This round and all following rounds apply a fixed permutation
// to the message words from the round before.
t0 = shuffle!(m0, m1, 3, 1, 1, 2);
t0 = shuffle_epi32::<0, 3, 2, 1>(t0);
g1(row0, row1, row2, row3, t0);
t1 = shuffle!(m2, m3, 3, 3, 2, 2);
tt = shuffle_epi32::<0, 0, 3, 3>(m0);
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = shuffle_epi32::<1, 3, 2, 0>(tt);
g1(row0, row1, row2, row3, t2);
t3 = unpackhi_epi32(m1, m3);
tt = unpacklo_epi32(m2, t3);
t3 = shuffle_epi32::<0, 1, 3, 2>(tt);
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 3
t0 = shuffle!(m0, m1, 3, 1, 1, 2);
t0 = shuffle_epi32::<0, 3, 2, 1>(t0);
g1(row0, row1, row2, row3, t0);
t1 = shuffle!(m2, m3, 3, 3, 2, 2);
tt = shuffle_epi32::<0, 0, 3, 3>(m0);
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = shuffle_epi32::<1, 3, 2, 0>(tt);
g1(row0, row1, row2, row3, t2);
t3 = unpackhi_epi32(m1, m3);
tt = unpacklo_epi32(m2, t3);
t3 = shuffle_epi32::<0, 1, 3, 2>(tt);
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 4
t0 = shuffle!(m0, m1, 3, 1, 1, 2);
t0 = shuffle_epi32::<0, 3, 2, 1>(t0);
g1(row0, row1, row2, row3, t0);
t1 = shuffle!(m2, m3, 3, 3, 2, 2);
tt = shuffle_epi32::<0, 0, 3, 3>(m0);
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = shuffle_epi32::<1, 3, 2, 0>(tt);
g1(row0, row1, row2, row3, t2);
t3 = unpackhi_epi32(m1, m3);
tt = unpacklo_epi32(m2, t3);
t3 = shuffle_epi32::<0, 1, 3, 2>(tt);
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 5
t0 = shuffle!(m0, m1, 3, 1, 1, 2);
t0 = shuffle_epi32::<0, 3, 2, 1>(t0);
g1(row0, row1, row2, row3, t0);
t1 = shuffle!(m2, m3, 3, 3, 2, 2);
tt = shuffle_epi32::<0, 0, 3, 3>(m0);
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = shuffle_epi32::<1, 3, 2, 0>(tt);
g1(row0, row1, row2, row3, t2);
t3 = unpackhi_epi32(m1, m3);
tt = unpacklo_epi32(m2, t3);
t3 = shuffle_epi32::<0, 1, 3, 2>(tt);
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 6
t0 = shuffle!(m0, m1, 3, 1, 1, 2);
t0 = shuffle_epi32::<0, 3, 2, 1>(t0);
g1(row0, row1, row2, row3, t0);
t1 = shuffle!(m2, m3, 3, 3, 2, 2);
tt = shuffle_epi32::<0, 0, 3, 3>(m0);
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = shuffle_epi32::<1, 3, 2, 0>(tt);
g1(row0, row1, row2, row3, t2);
t3 = unpackhi_epi32(m1, m3);
tt = unpacklo_epi32(m2, t3);
t3 = shuffle_epi32::<0, 1, 3, 2>(tt);
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
// Round 7
t0 = shuffle!(m0, m1, 3, 1, 1, 2);
t0 = shuffle_epi32::<0, 3, 2, 1>(t0);
g1(row0, row1, row2, row3, t0);
t1 = shuffle!(m2, m3, 3, 3, 2, 2);
tt = shuffle_epi32::<0, 0, 3, 3>(m0);
t1 = blend_epi16(tt, t1, 0xCC);
g2(row0, row1, row2, row3, t1);
diagonalize(row0, row2, row3);
t2 = unpacklo_epi64(m3, m1);
tt = blend_epi16(t2, m2, 0xC0);
t2 = shuffle_epi32::<1, 3, 2, 0>(tt);
g1(row0, row1, row2, row3, t2);
t3 = unpackhi_epi32(m1, m3);
tt = unpacklo_epi32(m2, t3);
t3 = shuffle_epi32::<0, 1, 3, 2>(tt);
g2(row0, row1, row2, row3, t3);
undiagonalize(row0, row2, row3);
[*row0, *row1, *row2, *row3]
}
#[target_feature(enable = "simd128")]
pub fn compress_in_place(
cv: &mut CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) {
let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags);
// it stores in reversed order...
// safe because CVWords is [u32; 8]
unsafe {
storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8);
storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8);
}
}
#[target_feature(enable = "simd128")]
pub fn compress_xof(
cv: &CVWords,
block: &[u8; BLOCK_LEN],
block_len: u8,
counter: u64,
flags: u8,
) -> [u8; 64] {
let [mut row0, mut row1, mut row2, mut row3] =
compress_pre(cv, block, block_len, counter, flags);
row0 = xor(row0, row2);
row1 = xor(row1, row3);
// safe because CVWords is [u32; 8]
row2 = xor(row2, unsafe { loadu(cv.as_ptr().add(0) as *const u8) });
row3 = xor(row3, unsafe { loadu(cv.as_ptr().add(4) as *const u8) });
// Safe because the sizes match and every bit pattern is a valid u8. Wasm
// memory is little-endian, so the bytes land in the intended output order.
unsafe { core::mem::transmute([row0, row1, row2, row3]) }
}
#[inline(always)]
fn round(v: &mut [v128; 16], m: &[v128; 16], r: usize) {
v[0] = add(v[0], m[MSG_SCHEDULE[r][0] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][2] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][4] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][6] as usize]);
v[0] = add(v[0], v[4]);
v[1] = add(v[1], v[5]);
v[2] = add(v[2], v[6]);
v[3] = add(v[3], v[7]);
v[12] = xor(v[12], v[0]);
v[13] = xor(v[13], v[1]);
v[14] = xor(v[14], v[2]);
v[15] = xor(v[15], v[3]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[15] = rot16(v[15]);
v[8] = add(v[8], v[12]);
v[9] = add(v[9], v[13]);
v[10] = add(v[10], v[14]);
v[11] = add(v[11], v[15]);
v[4] = xor(v[4], v[8]);
v[5] = xor(v[5], v[9]);
v[6] = xor(v[6], v[10]);
v[7] = xor(v[7], v[11]);
v[4] = rot12(v[4]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][1] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][3] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][5] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][7] as usize]);
v[0] = add(v[0], v[4]);
v[1] = add(v[1], v[5]);
v[2] = add(v[2], v[6]);
v[3] = add(v[3], v[7]);
v[12] = xor(v[12], v[0]);
v[13] = xor(v[13], v[1]);
v[14] = xor(v[14], v[2]);
v[15] = xor(v[15], v[3]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[15] = rot8(v[15]);
v[8] = add(v[8], v[12]);
v[9] = add(v[9], v[13]);
v[10] = add(v[10], v[14]);
v[11] = add(v[11], v[15]);
v[4] = xor(v[4], v[8]);
v[5] = xor(v[5], v[9]);
v[6] = xor(v[6], v[10]);
v[7] = xor(v[7], v[11]);
v[4] = rot7(v[4]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][8] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][10] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][12] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][14] as usize]);
v[0] = add(v[0], v[5]);
v[1] = add(v[1], v[6]);
v[2] = add(v[2], v[7]);
v[3] = add(v[3], v[4]);
v[15] = xor(v[15], v[0]);
v[12] = xor(v[12], v[1]);
v[13] = xor(v[13], v[2]);
v[14] = xor(v[14], v[3]);
v[15] = rot16(v[15]);
v[12] = rot16(v[12]);
v[13] = rot16(v[13]);
v[14] = rot16(v[14]);
v[10] = add(v[10], v[15]);
v[11] = add(v[11], v[12]);
v[8] = add(v[8], v[13]);
v[9] = add(v[9], v[14]);
v[5] = xor(v[5], v[10]);
v[6] = xor(v[6], v[11]);
v[7] = xor(v[7], v[8]);
v[4] = xor(v[4], v[9]);
v[5] = rot12(v[5]);
v[6] = rot12(v[6]);
v[7] = rot12(v[7]);
v[4] = rot12(v[4]);
v[0] = add(v[0], m[MSG_SCHEDULE[r][9] as usize]);
v[1] = add(v[1], m[MSG_SCHEDULE[r][11] as usize]);
v[2] = add(v[2], m[MSG_SCHEDULE[r][13] as usize]);
v[3] = add(v[3], m[MSG_SCHEDULE[r][15] as usize]);
v[0] = add(v[0], v[5]);
v[1] = add(v[1], v[6]);
v[2] = add(v[2], v[7]);
v[3] = add(v[3], v[4]);
v[15] = xor(v[15], v[0]);
v[12] = xor(v[12], v[1]);
v[13] = xor(v[13], v[2]);
v[14] = xor(v[14], v[3]);
v[15] = rot8(v[15]);
v[12] = rot8(v[12]);
v[13] = rot8(v[13]);
v[14] = rot8(v[14]);
v[10] = add(v[10], v[15]);
v[11] = add(v[11], v[12]);
v[8] = add(v[8], v[13]);
v[9] = add(v[9], v[14]);
v[5] = xor(v[5], v[10]);
v[6] = xor(v[6], v[11]);
v[7] = xor(v[7], v[8]);
v[4] = xor(v[4], v[9]);
v[5] = rot7(v[5]);
v[6] = rot7(v[6]);
v[7] = rot7(v[7]);
v[4] = rot7(v[4]);
}
#[inline(always)]
fn transpose_vecs(vecs: &mut [v128; DEGREE]) {
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
// 22/33. Note that this doesn't split the vector into two lanes, as the
// AVX2 counterparts do.
let ab_01 = unpacklo_epi32(vecs[0], vecs[1]);
let ab_23 = unpackhi_epi32(vecs[0], vecs[1]);
let cd_01 = unpacklo_epi32(vecs[2], vecs[3]);
let cd_23 = unpackhi_epi32(vecs[2], vecs[3]);
// Interleave 64-bit lanes.
let abcd_0 = unpacklo_epi64(ab_01, cd_01);
let abcd_1 = unpackhi_epi64(ab_01, cd_01);
let abcd_2 = unpacklo_epi64(ab_23, cd_23);
let abcd_3 = unpackhi_epi64(ab_23, cd_23);
vecs[0] = abcd_0;
vecs[1] = abcd_1;
vecs[2] = abcd_2;
vecs[3] = abcd_3;
}
#[inline(always)]
unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [v128; 16] {
let mut vecs = [
loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)),
loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)),
loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)),
];
let squares = mut_array_refs!(&mut vecs, DEGREE, DEGREE, DEGREE, DEGREE);
transpose_vecs(squares.0);
transpose_vecs(squares.1);
transpose_vecs(squares.2);
transpose_vecs(squares.3);
vecs
}
#[inline(always)]
fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (v128, v128) {
let mask = if increment_counter.yes() { !0 } else { 0 };
(
set4(
counter_low(counter + (mask & 0)),
counter_low(counter + (mask & 1)),
counter_low(counter + (mask & 2)),
counter_low(counter + (mask & 3)),
),
set4(
counter_high(counter + (mask & 0)),
counter_high(counter + (mask & 1)),
counter_high(counter + (mask & 2)),
counter_high(counter + (mask & 3)),
),
)
}
#[target_feature(enable = "simd128")]
pub unsafe fn hash4(
inputs: &[*const u8; DEGREE],
blocks: usize,
key: &CVWords,
counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut [u8; DEGREE * OUT_LEN],
) {
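// Broadcast each key word across its own vector. Throughout the loop below,
// h_vecs[i] holds chaining-value word i for all four inputs at once.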
let mut h_vecs = [
set1(key[0]),
set1(key[1]),
set1(key[2]),
set1(key[3]),
set1(key[4]),
set1(key[5]),
set1(key[6]),
set1(key[7]),
];
let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter);
let mut block_flags = flags | flags_start;
for block in 0..blocks {
if block + 1 == blocks {
block_flags |= flags_end;
}
let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only
let block_flags_vec = set1(block_flags as u32);
let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN);
// The transposed compression function. Note that inlining this
// manually here improves compile times by a lot, compared to factoring
// it out into its own function and making it #[inline(always)]. Just
// guessing, it might have something to do with loop unrolling.
let mut v = [
h_vecs[0],
h_vecs[1],
h_vecs[2],
h_vecs[3],
h_vecs[4],
h_vecs[5],
h_vecs[6],
h_vecs[7],
set1(IV[0]),
set1(IV[1]),
set1(IV[2]),
set1(IV[3]),
counter_low_vec,
counter_high_vec,
block_len_vec,
block_flags_vec,
];
round(&mut v, &msg_vecs, 0);
round(&mut v, &msg_vecs, 1);
round(&mut v, &msg_vecs, 2);
round(&mut v, &msg_vecs, 3);
round(&mut v, &msg_vecs, 4);
round(&mut v, &msg_vecs, 5);
round(&mut v, &msg_vecs, 6);
h_vecs[0] = xor(v[0], v[8]);
h_vecs[1] = xor(v[1], v[9]);
h_vecs[2] = xor(v[2], v[10]);
h_vecs[3] = xor(v[3], v[11]);
h_vecs[4] = xor(v[4], v[12]);
h_vecs[5] = xor(v[5], v[13]);
h_vecs[6] = xor(v[6], v[14]);
h_vecs[7] = xor(v[7], v[15]);
block_flags = flags;
}
let squares = mut_array_refs!(&mut h_vecs, DEGREE, DEGREE);
transpose_vecs(squares.0);
transpose_vecs(squares.1);
// The first four vecs now contain the first half of each output, and the
// second four vecs contain the second half of each output.
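// The interleaved stores below write each output's two halves adjacently, so
// `out` ends up holding DEGREE contiguous 32-byte chaining values.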
storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE));
storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE));
storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE));
storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE));
storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE));
storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE));
storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE));
storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE));
}
#[target_feature(enable = "simd128")]
unsafe fn hash1<const N: usize>(
input: &[u8; N],
key: &CVWords,
counter: u64,
flags: u8,
flags_start: u8,
flags_end: u8,
out: &mut CVBytes,
) {
debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks");
let mut cv = *key;
let mut block_flags = flags | flags_start;
let mut slice = &input[..];
while slice.len() >= BLOCK_LEN {
if slice.len() == BLOCK_LEN {
block_flags |= flags_end;
}
compress_in_place(
&mut cv,
array_ref!(slice, 0, BLOCK_LEN),
BLOCK_LEN as u8,
counter,
block_flags,
);
block_flags = flags;
slice = &slice[BLOCK_LEN..];
}
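// wasm is little-endian, so transmuting the [u32; 8] chaining value to
// [u8; 32] yields the output bytes in the right order.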
*out = core::mem::transmute(cv);
}
#[target_feature(enable = "simd128")]
pub unsafe fn hash_many<const N: usize>(
mut inputs: &[&[u8; N]],
key: &CVWords,
mut counter: u64,
increment_counter: IncrementCounter,
flags: u8,
flags_start: u8,
flags_end: u8,
mut out: &mut [u8],
) {
debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short");
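// Process inputs four at a time while at least DEGREE remain, then fall
// back to hashing any leftovers one at a time.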
while inputs.len() >= DEGREE && out.len() >= DEGREE * OUT_LEN {
// Safe because the layout of arrays is guaranteed, and because the
// `blocks` count is determined statically from the argument type.
let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]);
let blocks = N / BLOCK_LEN;
hash4(
input_ptrs,
blocks,
key,
counter,
increment_counter,
flags,
flags_start,
flags_end,
array_mut_ref!(out, 0, DEGREE * OUT_LEN),
);
if increment_counter.yes() {
counter += DEGREE as u64;
}
inputs = &inputs[DEGREE..];
out = &mut out[DEGREE * OUT_LEN..];
}
for (&input, output) in inputs.iter().zip(out.chunks_exact_mut(OUT_LEN)) {
hash1(
input,
key,
counter,
flags,
flags_start,
flags_end,
array_mut_ref!(output, 0, OUT_LEN),
);
if increment_counter.yes() {
counter += 1;
}
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_transpose() {
#[target_feature(enable = "simd128")]
fn transpose_wrapper(vecs: &mut [v128; DEGREE]) {
transpose_vecs(vecs);
}
let mut matrix = [[0u32; DEGREE]; DEGREE];
for i in 0..DEGREE {
for j in 0..DEGREE {
matrix[i][j] = (i * DEGREE + j) as u32;
}
}
unsafe {
let mut vecs: [v128; DEGREE] = core::mem::transmute(matrix);
transpose_wrapper(&mut vecs);
matrix = core::mem::transmute(vecs);
}
for i in 0..DEGREE {
for j in 0..DEGREE {
// Reversed indexes from above.
assert_eq!(matrix[j][i], (i * DEGREE + j) as u32);
}
}
}
#[test]
fn test_compress() {
crate::test::test_compress_fn(compress_in_place, compress_xof);
}
#[test]
fn test_hash_many() {
crate::test::test_hash_many_fn(hash_many, hash_many);
}
}

16
vendor/blake3/tools/release.md vendored Normal file
View File

@@ -0,0 +1,16 @@
# Release checklist
- Make sure `cargo outdated -R` is clean in the root and in b3sum/.
- Bump the version in the root Cargo.toml.
- Bump the version in b3sum/Cargo.toml.
  - Bump the `blake3` dependency version too, if new features are used.
  - Delete b3sum/Cargo.lock and recreate it with `cargo build` or similar.
- Update the `-h` output in b3sum/README.md if it's changed.
- Bump `BLAKE3_VERSION_STRING` in c/blake3.h.
- Bump `VERSION` in c/CMakeLists.txt.
- Make a version bump commit with change notes.
- `git push` and make sure CI is green.
- `git tag` the version bump commit with the new version number.
- `git push --tags`
- `cargo publish` in the root.
- `cargo publish` in b3sum/.
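
The tag-and-publish tail of this checklist might look like the following shell session. This is a minimal sketch, not part of the checklist itself; the version number is a placeholder, and it assumes the version bump commit is already pushed and CI is green:

```bash
# Placeholder version number; substitute the real one.
git tag 1.8.2
git push --tags

# Publish the root blake3 crate first, since b3sum depends on it.
cargo publish
cd b3sum && cargo publish
```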