Vendor dependencies for 0.3.0 release

2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

vendor/ruzstd/src/bin/zstd.rs vendored Normal file

@@ -0,0 +1,221 @@
extern crate ruzstd;
use std::fs::File;
use std::io::BufReader;
use std::io::Read;
use std::io::Seek;
use std::io::SeekFrom;
use std::io::Write;
use std::time::Instant;
use ruzstd::decoding::errors::FrameDecoderError;
use ruzstd::decoding::errors::ReadFrameHeaderError;
use ruzstd::encoding::CompressionLevel;
use ruzstd::encoding::FrameCompressor;
struct StateTracker {
bytes_used: u64,
frames_used: usize,
valid_checksums: usize,
invalid_checksums: usize,
file_pos: u64,
file_size: u64,
old_percentage: i8,
}
fn decompress(flags: &[String], file_paths: &[String]) {
if !flags.contains(&"-d".to_owned()) {
eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag");
return;
}
if !flags.contains(&"-c".to_owned()) {
eprintln!("This zstd implementation only supports output on the stdout. Please add a \"-c\" flag and pipe the output into a file");
return;
}
if flags.len() != 2 {
eprintln!(
"No flags other than -d and -c are currently implemented. Flags used: {:?}",
flags
);
return;
}
let mut frame_dec = ruzstd::decoding::FrameDecoder::new();
for path in file_paths {
eprintln!("File: {}", path);
let mut f = File::open(path).unwrap();
let mut tracker = StateTracker {
bytes_used: 0,
frames_used: 0,
valid_checksums: 0,
invalid_checksums: 0,
file_size: f.metadata().unwrap().len(),
file_pos: 0,
old_percentage: -1,
};
let batch_size = 1024 * 1024 * 10;
let mut result = vec![0; batch_size];
while tracker.file_pos < tracker.file_size {
match frame_dec.reset(&mut f) {
Err(FrameDecoderError::ReadFrameHeaderError(ReadFrameHeaderError::SkipFrame {
magic_number: magic_num,
length: skip_size,
})) => {
eprintln!("Found a skippable frame with magic number: {magic_num} and size: {skip_size}");
tracker.file_pos = f.stream_position().unwrap();
tracker.file_pos += skip_size as u64;
f.seek(SeekFrom::Current(skip_size as i64)).unwrap();
continue;
}
other => other.unwrap(),
}
tracker.frames_used += 1;
while !frame_dec.is_finished() {
frame_dec
.decode_blocks(
&mut f,
ruzstd::decoding::BlockDecodingStrategy::UptoBytes(batch_size),
)
.unwrap();
if frame_dec.can_collect() > batch_size {
let x = frame_dec.read(result.as_mut_slice()).unwrap();
tracker.file_pos = f.stream_position().unwrap();
do_something(&result[..x], &mut tracker);
}
}
// handle the last chunk of data
while frame_dec.can_collect() > 0 {
let x = frame_dec.read(result.as_mut_slice()).unwrap();
tracker.file_pos = f.stream_position().unwrap();
do_something(&result[..x], &mut tracker);
}
#[cfg(feature = "hash")]
if let Some(chksum) = frame_dec.get_checksum_from_data() {
if frame_dec.get_calculated_checksum().unwrap() != chksum {
tracker.invalid_checksums += 1;
eprintln!(
"Checksum did not match in frame {}! From data: {}, calculated while decoding: {}",
tracker.frames_used,
chksum,
frame_dec.get_calculated_checksum().unwrap()
);
} else {
tracker.valid_checksums += 1;
}
}
}
eprintln!(
"\nDecoded frames: {} bytes: {}",
tracker.frames_used, tracker.bytes_used
);
if tracker.valid_checksums == 0 && tracker.invalid_checksums == 0 {
eprintln!("No checksums to test");
} else {
eprintln!(
"{} of {} checksums are ok!",
tracker.valid_checksums,
tracker.valid_checksums + tracker.invalid_checksums,
);
}
}
}
struct PercentPrintReader<R: Read> {
total: usize,
counter: usize,
last_percent: usize,
reader: R,
}
impl<R: Read> Read for PercentPrintReader<R> {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
let new_bytes = self.reader.read(buf)?;
self.counter += new_bytes;
let progress = self.counter * 100 / self.total;
if progress > self.last_percent {
self.last_percent = progress;
eprint!("\r");
eprint!("{} % done", progress);
}
Ok(new_bytes)
}
}
fn main() {
let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
file_paths.remove(0);
if flags.is_empty() {
let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
encoder.set_drain(Vec::new());
for path in file_paths {
let start_instant = Instant::now();
let file = std::fs::File::open(&path).unwrap();
let input_len = file.metadata().unwrap().len() as usize;
let file = PercentPrintReader {
reader: BufReader::new(file),
total: input_len,
counter: 0,
last_percent: 0,
};
encoder.set_source(file);
encoder.compress();
let mut output: Vec<_> = encoder.take_drain().unwrap();
println!(
"Compressed {path:} from {} to {} ({}%) took {}ms",
input_len,
output.len(),
if input_len == 0 {
0
} else {
output.len() * 100 / input_len
},
start_instant.elapsed().as_millis()
);
println!("Check against source file. Decoding...");
let mut decoded = Vec::with_capacity(input_len);
ruzstd::decoding::FrameDecoder::new()
.decode_all_to_vec(&output, &mut decoded)
.unwrap();
println!("Decoded without error");
assert_eq!(decoded.len(), input_len);
println!("Decoded length is correct, now check against file contents file");
let input = std::fs::read(&path).unwrap();
assert_eq!(decoded.len(), input.len());
assert!(decoded == input);
println!("Checks completed");
output.clear();
encoder.set_drain(output);
}
} else {
decompress(&flags, &file_paths);
}
}
fn do_something(data: &[u8], s: &mut StateTracker) {
//Do something. Like writing it to a file or to stdout...
std::io::stdout().write_all(data).unwrap();
s.bytes_used += data.len() as u64;
let percentage = (s.file_pos * 100) / s.file_size;
if percentage as i8 != s.old_percentage {
eprint!("\r");
eprint!("{} % done", percentage);
s.old_percentage = percentage as i8;
}
}
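The round-trip check above (compress, decode with `decode_all_to_vec`, compare) is the heart of this tool. A minimal sketch of the same flow, using only calls that appear in `main` above; the `roundtrip` helper name is ours:

// Sketch only; mirrors the API usage in `main` above.
use ruzstd::decoding::FrameDecoder;
use ruzstd::encoding::{CompressionLevel, FrameCompressor};

fn roundtrip(input: &[u8]) -> bool {
    let mut encoder = FrameCompressor::new(CompressionLevel::Fastest);
    encoder.set_drain(Vec::new());
    encoder.set_source(input); // &[u8] implements Read
    encoder.compress();
    let compressed = encoder.take_drain().unwrap();
    let mut decoded = Vec::with_capacity(input.len());
    FrameDecoder::new()
        .decode_all_to_vec(&compressed, &mut decoded)
        .unwrap();
    decoded == input
}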

vendor/ruzstd/src/bin/zstd_stream.rs vendored Normal file

@@ -0,0 +1,41 @@
extern crate ruzstd;
use std::fs::File;
use std::io::{Read, Write};
fn main() {
let mut file_paths: Vec<_> = std::env::args().filter(|f| !f.starts_with('-')).collect();
let flags: Vec<_> = std::env::args().filter(|f| f.starts_with('-')).collect();
file_paths.remove(0);
if !flags.contains(&"-d".to_owned()) {
eprintln!("This zstd implementation only supports decompression. Please add a \"-d\" flag");
return;
}
if !flags.contains(&"-c".to_owned()) {
eprintln!("This zstd implementation only supports output on the stdout. Please add a \"-c\" flag and pipe the output into a file");
return;
}
if flags.len() != 2 {
eprintln!(
"No flags other than -d and -c are currently implemented. Flags used: {:?}",
flags
);
return;
}
for path in file_paths {
eprintln!("File: {}", path);
let f = File::open(path).unwrap();
let mut buf_read = std::io::BufReader::new(f);
let mut decoder = ruzstd::decoding::StreamingDecoder::new(&mut buf_read).unwrap();
let mut buf = [0u8; 1024 * 1024];
let mut stdout = std::io::stdout();
while !decoder.decoder.is_finished() || decoder.decoder.can_collect() > 0 {
let bytes = decoder.read(&mut buf[..]).unwrap();
stdout.write_all(&buf[..bytes]).unwrap();
}
}
}
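Since `StreamingDecoder` implements `Read`, the drain loop above can also be written with `std::io::copy`, assuming `read` returns `Ok(0)` only once the frame is fully decoded; the `decode_to_stdout` helper name is ours:

// Sketch: equivalent of the loop above, for a single file.
fn decode_to_stdout(path: &str) -> std::io::Result<u64> {
    let file = std::fs::File::open(path)?;
    let mut reader = std::io::BufReader::new(file);
    let mut decoder =
        ruzstd::decoding::StreamingDecoder::new(&mut reader).expect("invalid frame header");
    // io::copy drives `read` until it returns Ok(0).
    std::io::copy(&mut decoder, &mut std::io::stdout())
}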

vendor/ruzstd/src/bit_io/bit_reader.rs vendored Normal file

@@ -0,0 +1,135 @@
/// Wraps a slice and enables reading arbitrary amounts of bits
/// from that slice.
pub struct BitReader<'s> {
idx: usize, //index counts bits already read
source: &'s [u8],
}
impl<'s> BitReader<'s> {
pub fn new(source: &'s [u8]) -> BitReader<'s> {
BitReader { idx: 0, source }
}
pub fn bits_left(&self) -> usize {
self.source.len() * 8 - self.idx
}
pub fn bits_read(&self) -> usize {
self.idx
}
pub fn return_bits(&mut self, n: usize) {
if n > self.idx {
panic!("Cant return this many bits");
}
self.idx -= n;
}
pub fn get_bits(&mut self, n: usize) -> Result<u64, GetBitsError> {
if n > 64 {
return Err(GetBitsError::TooManyBits {
num_requested_bits: n,
limit: 64,
});
}
if self.bits_left() < n {
return Err(GetBitsError::NotEnoughRemainingBits {
requested: n,
remaining: self.bits_left(),
});
}
let old_idx = self.idx;
let bits_left_in_current_byte = 8 - (self.idx % 8);
let bits_not_needed_in_current_byte = 8 - bits_left_in_current_byte;
//collect bits from the currently pointed to byte
let mut value = u64::from(self.source[self.idx / 8] >> bits_not_needed_in_current_byte);
if bits_left_in_current_byte >= n {
//no need for fancy stuff
//just mask all but the needed n bit
value &= (1 << n) - 1;
self.idx += n;
} else {
self.idx += bits_left_in_current_byte;
//n spans over multiple bytes
let full_bytes_needed = (n - bits_left_in_current_byte) / 8;
let bits_in_last_byte_needed = n - bits_left_in_current_byte - full_bytes_needed * 8;
assert!(
bits_left_in_current_byte + full_bytes_needed * 8 + bits_in_last_byte_needed == n
);
let mut bit_shift = bits_left_in_current_byte; //this many bits are already set in value
assert!(self.idx % 8 == 0);
//collect full bytes
for _ in 0..full_bytes_needed {
value |= u64::from(self.source[self.idx / 8]) << bit_shift;
self.idx += 8;
bit_shift += 8;
}
assert!(n - bit_shift == bits_in_last_byte_needed);
if bits_in_last_byte_needed > 0 {
let val_last_byte =
u64::from(self.source[self.idx / 8]) & ((1 << bits_in_last_byte_needed) - 1);
value |= val_last_byte << bit_shift;
self.idx += bits_in_last_byte_needed;
}
}
assert!(self.idx == old_idx + n);
Ok(value)
}
}
#[derive(Debug)]
#[non_exhaustive]
pub enum GetBitsError {
TooManyBits {
num_requested_bits: usize,
limit: u8,
},
NotEnoughRemainingBits {
requested: usize,
remaining: usize,
},
}
#[cfg(feature = "std")]
impl std::error::Error for GetBitsError {}
impl core::fmt::Display for GetBitsError {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
match self {
GetBitsError::TooManyBits {
num_requested_bits,
limit,
} => {
write!(
f,
"Cant serve this request. The reader is limited to {} bits, requested {} bits",
limit, num_requested_bits,
)
}
GetBitsError::NotEnoughRemainingBits {
requested,
remaining,
} => {
write!(
f,
"Can\'t read {} bits, only have {} bits left",
requested, remaining,
)
}
}
}
}
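For orientation, `BitReader` consumes each byte starting at its least significant bit. An illustrative use; note the type is only exported `pub(crate)` through `bit_io`, so this compiles inside the crate only:

// Bits come out of each byte LSB-first.
let data = [0b1010_1010u8, 0b0101_0101];
let mut br = BitReader::new(&data);
assert_eq!(br.get_bits(3).unwrap(), 0b010); // low three bits of byte 0
assert_eq!(br.get_bits(5).unwrap(), 0b1_0101); // remaining five bits of byte 0
assert_eq!(br.bits_left(), 8);
br.return_bits(5); // rewind within byte 0
assert_eq!(br.bits_left(), 13);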

vendor/ruzstd/src/bit_io/bit_reader_reverse.rs vendored Normal file

@@ -0,0 +1,184 @@
use core::convert::TryInto;
/// Zstandard encodes some types of data in a way that the data must be read
/// back to front to decode it properly. `BitReaderReversed` provides a
/// convenient interface to do that.
pub struct BitReaderReversed<'s> {
/// The index of the last read byte in the source.
index: usize,
/// How many bits have been consumed from `bit_container`.
bits_consumed: u8,
/// How many bits have been consumed past the end of the input. Will be zero until all the input
/// has been read.
extra_bits: usize,
/// The source data to read from.
source: &'s [u8],
/// The reader doesn't read directly from the source, it reads bits from here, and the container
/// is "refilled" as it's emptied.
bit_container: u64,
}
impl<'s> BitReaderReversed<'s> {
/// How many bits are left to read by the reader.
pub fn bits_remaining(&self) -> isize {
self.index as isize * 8 + (64 - self.bits_consumed as isize) - self.extra_bits as isize
}
pub fn new(source: &'s [u8]) -> BitReaderReversed<'s> {
BitReaderReversed {
index: source.len(),
bits_consumed: 64,
source,
bit_container: 0,
extra_bits: 0,
}
}
/// We refill the container in full bytes, shifting the still unread portion to the left, and filling the lower bits with new data
#[cold]
fn refill(&mut self) {
let bytes_consumed = self.bits_consumed as usize / 8;
if bytes_consumed == 0 {
return;
}
if self.index >= bytes_consumed {
// We can safely move the window contained in `bit_container` down by `bytes_consumed`
// If the reader wasn't byte aligned, the byte that was partially read is now in the highest order bits in the `bit_container`
self.index -= bytes_consumed;
// Some bits of the `bits_container` might have been consumed already because we read the window byte aligned
self.bits_consumed &= 7;
self.bit_container =
u64::from_le_bytes((&self.source[self.index..][..8]).try_into().unwrap());
} else if self.index > 0 {
// Read the last portion of source into the `bit_container`
if self.source.len() >= 8 {
self.bit_container = u64::from_le_bytes((&self.source[..8]).try_into().unwrap());
} else {
let mut value = [0; 8];
value[..self.source.len()].copy_from_slice(self.source);
self.bit_container = u64::from_le_bytes(value);
}
self.bits_consumed -= 8 * self.index as u8;
self.index = 0;
self.bit_container <<= self.bits_consumed;
self.extra_bits += self.bits_consumed as usize;
self.bits_consumed = 0;
} else if self.bits_consumed < 64 {
// Shift out already used bits and fill up with zeroes
self.bit_container <<= self.bits_consumed;
self.extra_bits += self.bits_consumed as usize;
self.bits_consumed = 0;
} else {
// All useful bits have already been read and more than 64 bits have been consumed, all we now do is return zeroes
self.extra_bits += self.bits_consumed as usize;
self.bits_consumed = 0;
self.bit_container = 0;
}
// Assert that at least `56 = 64 - 8` bits are available to read.
debug_assert!(self.bits_consumed < 8);
}
/// Read `n` bits from the source. Will read at most 56 bits.
/// If there are no more bits to be read from the source, zero bits will be returned instead.
#[inline(always)]
pub fn get_bits(&mut self, n: u8) -> u64 {
if self.bits_consumed + n > 64 {
self.refill();
}
let value = self.peek_bits(n);
self.consume(n);
value
}
/// Get the next `n` bits from the source without consuming them.
/// Caller is responsible for making sure that `n` many bits have been refilled.
#[inline(always)]
pub fn peek_bits(&mut self, n: u8) -> u64 {
if n == 0 {
return 0;
}
let mask = (1u64 << n) - 1u64;
let shift_by = 64 - self.bits_consumed - n;
(self.bit_container >> shift_by) & mask
}
/// Get the next `n1`, `n2`, and `n3` bits from the source without consuming them.
/// Caller is responsible for making sure that `sum` many bits have been refilled.
#[inline(always)]
pub fn peek_bits_triple(&mut self, sum: u8, n1: u8, n2: u8, n3: u8) -> (u64, u64, u64) {
if sum == 0 {
return (0, 0, 0);
}
// all_three contains bits like this: |XXXX..XXX111122223333|
// Where XXX are already consumed bytes, 1/2/3 are bits of the respective value
// Lower bits are to the right
let all_three = self.bit_container >> (64 - self.bits_consumed - sum);
let mask1 = (1u64 << n1) - 1u64;
let shift_by1 = n3 + n2;
let val1 = (all_three >> shift_by1) & mask1;
let mask2 = (1u64 << n2) - 1u64;
let shift_by2 = n3;
let val2 = (all_three >> shift_by2) & mask2;
let mask3 = (1u64 << n3) - 1u64;
let val3 = all_three & mask3;
(val1, val2, val3)
}
/// Consume `n` bits from the source.
#[inline(always)]
pub fn consume(&mut self, n: u8) {
self.bits_consumed += n;
debug_assert!(self.bits_consumed <= 64);
}
/// Same as calling get_bits three times but slightly more performant
#[inline(always)]
pub fn get_bits_triple(&mut self, n1: u8, n2: u8, n3: u8) -> (u64, u64, u64) {
let sum = n1 + n2 + n3;
if sum <= 56 {
self.refill();
let triple = self.peek_bits_triple(sum, n1, n2, n3);
self.consume(sum);
return triple;
}
(self.get_bits(n1), self.get_bits(n2), self.get_bits(n3))
}
}
#[cfg(test)]
mod test {
#[test]
fn it_works() {
let data = [0b10101010, 0b01010101];
let mut br = super::BitReaderReversed::new(&data);
assert_eq!(br.get_bits(1), 0);
assert_eq!(br.get_bits(1), 1);
assert_eq!(br.get_bits(1), 0);
assert_eq!(br.get_bits(4), 0b1010);
assert_eq!(br.get_bits(4), 0b1101);
assert_eq!(br.get_bits(4), 0b0101);
// Last 0 from source, three zeroes filled in
assert_eq!(br.get_bits(4), 0b0000);
// All zeroes filled in
assert_eq!(br.get_bits(4), 0b0000);
assert_eq!(br.bits_remaining(), -7);
}
}
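A second worked example of the read order: the stream starts at the most significant bit of the last byte and walks backwards, and as the test above shows, once the input is exhausted the reader hands out zero bits and `bits_remaining` can go negative. Crate-internal, illustrative only:

let data = [0b1110_0000u8];
let mut br = BitReaderReversed::new(&data);
assert_eq!(br.get_bits(3), 0b111); // MSBs of the last (and only) byte
assert_eq!(br.get_bits(5), 0b0_0000); // rest of that byte
assert_eq!(br.bits_remaining(), 0);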

vendor/ruzstd/src/bit_io/bit_writer.rs vendored Normal file

@@ -0,0 +1,367 @@
//! Use [BitWriter] to write an arbitrary number of bits into a buffer.
use alloc::vec::Vec;
/// An interface for writing an arbitrary number of bits into a buffer. Write new bits into the buffer with `write_bits`, and
/// obtain the output using `dump`.
#[derive(Debug)]
pub(crate) struct BitWriter<V: AsMut<Vec<u8>>> {
/// The buffer that's filled with bits
output: V,
/// Holds a partially filled byte which gets put in `output` once it is filled by a `write_bits` call
partial: u64,
bits_in_partial: usize,
/// The index pointing to the next unoccupied bit. Effectively just
/// the number of bits that have been written into the buffer so far.
bit_idx: usize,
}
impl BitWriter<Vec<u8>> {
/// Initialize a new writer.
pub fn new() -> Self {
Self {
output: Vec::new(),
partial: 0,
bits_in_partial: 0,
bit_idx: 0,
}
}
}
impl<V: AsMut<Vec<u8>>> BitWriter<V> {
/// Initialize a new writer.
pub fn from(mut output: V) -> BitWriter<V> {
BitWriter {
bit_idx: output.as_mut().len() * 8,
output,
partial: 0,
bits_in_partial: 0,
}
}
/// Get the current index. Can be used to reset to this index or to later change the bits at this index
pub fn index(&self) -> usize {
self.bit_idx + self.bits_in_partial
}
/// Reset to an index. Currently only supports resetting to a byte aligned index
pub fn reset_to(&mut self, index: usize) {
assert!(index % 8 == 0);
self.partial = 0;
self.bits_in_partial = 0;
self.bit_idx = index;
self.output.as_mut().resize(index / 8, 0);
}
/// Change the bits at the index. `bits` contains the `num_bits` new bits that should be written
/// instead of the current content. `bits` *MUST* only contain zeroes in the upper bits outside of the `0..num_bits` range.
pub fn change_bits(&mut self, idx: usize, bits: impl Into<u64>, num_bits: usize) {
self.change_bits_64(idx, bits.into(), num_bits);
}
/// Monomorphized version of `change_bits`
pub fn change_bits_64(&mut self, mut idx: usize, mut bits: u64, mut num_bits: usize) {
self.flush();
assert!(idx + num_bits < self.index());
assert!(self.index() - (idx + num_bits) > self.bits_in_partial);
// We might be changing bits unaligned to byte borders.
// This means the lower bits of the first byte we are touching must stay the same
if idx % 8 != 0 {
// How many (upper) bits will change in the first byte?
let bits_in_first_byte = 8 - (idx % 8);
// We don't support only changing a few bits in the middle of a byte
assert!(bits_in_first_byte <= num_bits);
// Zero out the upper bits that will be changed while keeping the lower bits intact
self.output.as_mut()[idx / 8] &= 0xFFu8 >> bits_in_first_byte;
// Shift the bits up and put them in the now zeroed out bits
let new_bits = (bits << (8 - bits_in_first_byte)) as u8;
self.output.as_mut()[idx / 8] |= new_bits;
// Update the state. Note that we are now definitely working byte aligned
num_bits -= bits_in_first_byte;
bits >>= bits_in_first_byte;
idx += bits_in_first_byte;
}
assert!(idx % 8 == 0);
// We are now byte aligned, change idx to byte resolution
let mut idx = idx / 8;
// Update full bytes by just shifting and extracting bytes from the bits
while num_bits >= 8 {
self.output.as_mut()[idx] = bits as u8;
num_bits -= 8;
bits >>= 8;
idx += 1;
}
// Deal with leftover bits that wont fill a full byte, keeping the upper bits of the original byte intact
if num_bits > 0 {
self.output.as_mut()[idx] &= 0xFFu8 << num_bits;
self.output.as_mut()[idx] |= bits as u8;
}
}
/// Simply append bytes to the buffer. Only works if the buffer was already byte aligned
pub fn append_bytes(&mut self, data: &[u8]) {
if self.misaligned() != 0 {
panic!("Don't append bytes when writer is misaligned")
}
self.flush();
self.output.as_mut().extend_from_slice(data);
self.bit_idx += data.len() * 8;
}
/// Flush temporary internal buffers to the output buffer. Only works if this is currently byte aligned
pub fn flush(&mut self) {
assert!(self.bits_in_partial % 8 == 0);
let full_bytes = self.bits_in_partial / 8;
self.output
.as_mut()
.extend_from_slice(&self.partial.to_le_bytes()[..full_bytes]);
self.partial >>= full_bytes * 8;
self.bits_in_partial -= full_bytes * 8;
self.bit_idx += full_bytes * 8;
}
/// Write the lower `num_bits` from `bits` into the writer. `bits` *MUST* only contain zeroes in the upper bits outside of the `0..num_bits` range.
pub fn write_bits(&mut self, bits: impl Into<u64>, num_bits: usize) {
self.write_bits_64(bits.into(), num_bits);
}
/// This is the special case where we need to flush the partial buffer to the output.
/// Marked as cold and in a separate function so the optimizer has more information.
#[cold]
fn write_bits_64_cold(&mut self, bits: u64, num_bits: usize) {
assert!(self.bits_in_partial + num_bits >= 64);
// Fill the partial buffer so it contains 64 bits
let bits_free_in_partial = 64 - self.bits_in_partial;
let part = bits << (64 - bits_free_in_partial);
let merged = self.partial | part;
// Put the 8 bytes into the output buffer
self.output
.as_mut()
.extend_from_slice(&merged.to_le_bytes());
self.bit_idx += 64;
self.partial = 0;
self.bits_in_partial = 0;
let mut num_bits = num_bits - bits_free_in_partial;
let mut bits = bits >> bits_free_in_partial;
// While we are at it push full bytes into the output buffer instead of polluting the partial buffer
while num_bits / 8 > 0 {
let byte = bits as u8;
self.output.as_mut().push(byte);
num_bits -= 8;
self.bit_idx += 8;
bits >>= 8;
}
// The last few bits belong into the partial buffer
assert!(num_bits < 8);
if num_bits > 0 {
let mask = (1 << num_bits) - 1;
self.partial = bits & mask;
self.bits_in_partial = num_bits;
}
}
/// Monomorphized version of `write_bits`
pub fn write_bits_64(&mut self, bits: u64, num_bits: usize) {
if num_bits == 0 {
return;
}
if bits > 0 {
debug_assert!(bits.ilog2() <= num_bits as u32);
}
// fill partial byte first
if num_bits + self.bits_in_partial < 64 {
let part = bits << self.bits_in_partial;
let merged = self.partial | part;
self.partial = merged;
self.bits_in_partial += num_bits;
} else {
// If the partial buffer can't hold the num_bits we need to make space
self.write_bits_64_cold(bits, num_bits);
}
}
/// Returns the populated buffer that you've been writing bits into.
///
/// This function consumes the writer, so it cannot be used after
/// dumping
pub fn dump(mut self) -> V {
if self.misaligned() != 0 {
panic!("`dump` was called on a bit writer but an even number of bytes weren't written into the buffer. Was: {}", self.index())
}
self.flush();
debug_assert_eq!(self.partial, 0);
self.output
}
/// Returns how many bits are missing for an even byte
pub fn misaligned(&self) -> usize {
let idx = self.index();
if idx % 8 == 0 {
0
} else {
8 - (idx % 8)
}
}
}
#[cfg(test)]
mod tests {
use super::BitWriter;
use alloc::vec;
#[test]
fn from_existing() {
// Define an existing vec, write some bits into it
let mut existing_vec = vec![255_u8];
let mut bw = BitWriter::from(&mut existing_vec);
bw.write_bits(0u8, 8);
bw.flush();
assert_eq!(vec![255, 0], existing_vec);
}
#[test]
fn change_bits() {
let mut writer = BitWriter::new();
writer.write_bits(0u32, 24);
writer.change_bits(8, 0xFFu8, 8);
assert_eq!(vec![0, 0xFF, 0], writer.dump());
let mut writer = BitWriter::new();
writer.write_bits(0u32, 24);
writer.change_bits(6, 0x0FFFu16, 12);
assert_eq!(vec![0b11000000, 0xFF, 0b00000011], writer.dump());
}
#[test]
fn single_byte_written_4_4() {
// Write the first 4 bits as 1s and the last 4 bits as 0s
// 1010 is used where values should never be read from.
let mut bw = BitWriter::new();
bw.write_bits(0b1111u8, 4);
bw.write_bits(0b0000u8, 4);
let output = bw.dump();
assert!(output.len() == 1, "Single byte written into writer returned a vec that wasn't one byte, vec was {} elements long", output.len());
assert_eq!(
0b0000_1111, output[0],
"4 bits and 4 bits written into buffer"
);
}
#[test]
fn single_byte_written_3_5() {
// Write the first 3 bits as 1s and the last 5 bits as 0s
let mut bw = BitWriter::new();
bw.write_bits(0b111u8, 3);
bw.write_bits(0b0_0000u8, 5);
let output = bw.dump();
assert!(output.len() == 1, "Single byte written into writer return a vec that wasn't one byte, vec was {} elements long", output.len());
assert_eq!(0b0000_0111, output[0], "3 and 5 bits written into buffer");
}
#[test]
fn single_byte_written_1_7() {
// Write the first bit as a 1 and the last 7 bits as 0s
let mut bw = BitWriter::new();
bw.write_bits(0b1u8, 1);
bw.write_bits(0u8, 7);
let output = bw.dump();
assert!(output.len() == 1, "Single byte written into writer return a vec that wasn't one byte, vec was {} elements long", output.len());
assert_eq!(0b0000_0001, output[0], "1 and 7 bits written into buffer");
}
#[test]
fn single_byte_written_8() {
// Write an entire byte
let mut bw = BitWriter::new();
bw.write_bits(1u8, 8);
let output = bw.dump();
assert!(output.len() == 1, "Single byte written into writer return a vec that wasn't one byte, vec was {} elements long", output.len());
assert_eq!(1, output[0], "Full byte written into buffer");
}
#[test]
fn multi_byte_clean_boundary_4_4_4_4() {
// Writing 4 bits at a time for 2 bytes
let mut bw = BitWriter::new();
bw.write_bits(0u8, 4);
bw.write_bits(0b1111u8, 4);
bw.write_bits(0b1111u8, 4);
bw.write_bits(0u8, 4);
assert_eq!(vec![0b1111_0000, 0b0000_1111], bw.dump());
}
#[test]
fn multi_byte_clean_boundary_16_8() {
// Writing 16 bits at once
let mut bw = BitWriter::new();
bw.write_bits(0x0100u16, 16);
bw.write_bits(69u8, 8);
assert_eq!(vec![0, 1, 69], bw.dump())
}
#[test]
fn multi_byte_boundary_crossed_4_12() {
// Writing 4 1s and then 12 zeros
let mut bw = BitWriter::new();
bw.write_bits(0b1111u8, 4);
bw.write_bits(0b0000_0011_0100_0010u16, 12);
assert_eq!(vec![0b0010_1111, 0b0011_0100], bw.dump());
}
#[test]
fn multi_byte_boundary_crossed_4_5_7() {
// Writing 4 1s and then 5 zeros then 7 1s
let mut bw = BitWriter::new();
bw.write_bits(0b1111u8, 4);
bw.write_bits(0b0_0000u8, 5);
bw.write_bits(0b111_1111u8, 7);
assert_eq!(vec![0b0000_1111, 0b1111_1110], bw.dump());
}
#[test]
fn multi_byte_boundary_crossed_1_9_6() {
// Writing 1 1 and then 9 zeros then 6 1s
let mut bw = BitWriter::new();
bw.write_bits(0b1u8, 1);
bw.write_bits(0b0_0000_0000u16, 9);
bw.write_bits(0b11_1111u8, 6);
assert_eq!(vec![0b0000_0001, 0b1111_1100], bw.dump());
}
#[test]
#[should_panic]
fn catches_unaligned_dump() {
// Write a single bit in then dump it, making sure
// the correct error is returned
let mut bw = BitWriter::new();
bw.write_bits(0u8, 1);
bw.dump();
}
#[test]
#[should_panic]
fn catches_dirty_upper_bits() {
let mut bw = BitWriter::new();
bw.write_bits(10u8, 1);
}
#[test]
fn add_multiple_aligned() {
let mut bw = BitWriter::new();
bw.write_bits(0x00_0F_F0_FFu32, 32);
assert_eq!(vec![0xFF, 0xF0, 0x0F, 0x00], bw.dump());
}
// #[test]
// fn catches_more_than_in_buf() {
// todo!();
// }
}
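One pattern the tests above don't show is what `index`/`change_bits` are for: an encoder can reserve a byte-aligned slot, write the payload, and backpatch the slot once its value is known. A crate-internal sketch:

// Reserve a placeholder byte, then backpatch it.
let mut bw = BitWriter::new();
let slot = bw.index(); // bit index of the placeholder
bw.write_bits(0u8, 8); // placeholder
bw.write_bits(0xABu8, 8); // payload written after it
bw.change_bits(slot, 0x42u8, 8); // fill the placeholder in
assert_eq!(vec![0x42, 0xAB], bw.dump());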

vendor/ruzstd/src/bit_io/mod.rs vendored Normal file

@@ -0,0 +1,9 @@
//! Encoding-agnostic ways to read and write binary data
mod bit_reader;
mod bit_reader_reverse;
mod bit_writer;
pub(crate) use bit_reader::*;
pub(crate) use bit_reader_reverse::*;
pub(crate) use bit_writer::*;

vendor/ruzstd/src/blocks/block.rs vendored Normal file

@@ -0,0 +1,43 @@
//! Block header definitions.
/// There are 4 different kinds of blocks, and the type of block influences the meaning of `Block_Size`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockType {
/// An uncompressed block.
Raw,
/// A single byte, repeated `Block_Size` times (Run Length Encoding).
#[allow(clippy::upper_case_acronyms)]
RLE,
/// A Zstandard compressed block. `Block_Size` is the length of the compressed data.
Compressed,
/// This is not a valid block, and this value should not be used.
/// If this value is present, it should be considered corrupted data.
Reserved,
}
impl core::fmt::Display for BlockType {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
match self {
BlockType::Compressed => write!(f, "Compressed"),
BlockType::Raw => write!(f, "Raw"),
BlockType::RLE => write!(f, "RLE"),
BlockType::Reserved => write!(f, "Reserverd"),
}
}
}
/// A representation of a single block header. In addition to a frame header,
/// each Zstandard frame contains one or more blocks.
pub struct BlockHeader {
/// Whether this block is the last block in the frame.
/// It may be followed by an optional `Content_Checksum` if it is.
pub last_block: bool,
pub block_type: BlockType,
/// The size of the decompressed data. If the block type
/// is [BlockType::Reserved] or [BlockType::Compressed],
/// this value is set to zero and should not be referenced.
pub decompressed_size: u32,
/// The size of the block. If the block is [BlockType::RLE],
/// this value will be 1.
pub content_size: u32,
}
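These fields come from a fixed 3-byte layout that `BlockDecoder::read_block_header` (later in this commit) decodes: bit 0 is the last-block flag, bits 1-2 the block type, and the remaining 21 bits the block size. As a standalone sketch (the `parse_raw_block_header` helper name is ours):

fn parse_raw_block_header(raw: [u8; 3]) -> (bool, u8, u32) {
    let last_block = raw[0] & 0x1 == 1;
    let block_type = (raw[0] >> 1) & 0x3; // 0 Raw, 1 RLE, 2 Compressed, 3 Reserved
    let block_size = u32::from(raw[0] >> 3) // low 5 bits of the size
        | (u32::from(raw[1]) << 5)
        | (u32::from(raw[2]) << 13);
    (last_block, block_type, block_size)
}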

vendor/ruzstd/src/blocks/literals_section.rs vendored Normal file

@@ -0,0 +1,236 @@
//! Utilities and representations for the first half of a block, the literals section.
//! It contains data that the sequences section then copies from.
use crate::bit_io::BitReader;
use crate::decoding::errors::LiteralsSectionParseError;
/// A compressed block consists of two sections, a literals section, and a sequences section.
///
/// This is the first of those two sections. A literal is just any arbitrary data, and it is copied by the sequences section
pub struct LiteralsSection {
/// - If this block is of type [LiteralsSectionType::Raw], then the data is `regenerated_bytes`
/// bytes long, and it contains the raw literals data to be used during the second section,
/// the sequences section.
/// - If this block is of type [LiteralsSectionType::RLE],
/// then the literal consists of a single byte repeated `regenerated_size` times.
/// - For types [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless],
/// then this is the size of the decompressed data.
pub regenerated_size: u32,
/// - For types [LiteralsSectionType::Raw] and [LiteralsSectionType::RLE], this value is not present.
/// - For types [LiteralsSectionType::Compressed] and [LiteralsSectionType::Treeless], this value will
/// be set to the size of the compressed data.
pub compressed_size: Option<u32>,
/// This value will be either 1 stream or 4 streams if the literal is of type
/// [LiteralsSectionType::Compressed] or [LiteralsSectionType::Treeless], and it
/// is not used for RLE or uncompressed literals.
pub num_streams: Option<u8>,
/// The type of the literal section.
pub ls_type: LiteralsSectionType,
}
/// The way which a literal section is encoded.
pub enum LiteralsSectionType {
/// Literals are stored uncompressed.
Raw,
/// Literals consist of a single byte value repeated [LiteralsSection::regenerated_size] times.
#[allow(clippy::upper_case_acronyms)]
RLE,
/// This is a standard Huffman-compressed block, starting with a Huffman tree description.
/// In this mode, there are at least *2* different literals represented in the Huffman tree
/// description.
Compressed,
/// This is a Huffman-compressed block,
/// using the Huffman tree from the previous [LiteralsSectionType::Compressed] block
/// in the sequence. If this mode is triggered without any previous Huffman-tables in the
/// frame (or dictionary), it should be treated as data corruption.
Treeless,
}
impl Default for LiteralsSection {
fn default() -> Self {
Self::new()
}
}
impl LiteralsSection {
/// Create a new [LiteralsSection].
pub fn new() -> LiteralsSection {
LiteralsSection {
regenerated_size: 0,
compressed_size: None,
num_streams: None,
ls_type: LiteralsSectionType::Raw,
}
}
/// Given the first byte of a header, determine the size of the whole header, from 1 to 5 bytes.
pub fn header_bytes_needed(&self, first_byte: u8) -> Result<u8, LiteralsSectionParseError> {
let ls_type: LiteralsSectionType = Self::section_type(first_byte)?;
let size_format = (first_byte >> 2) & 0x3;
match ls_type {
LiteralsSectionType::RLE | LiteralsSectionType::Raw => {
match size_format {
0 | 2 => {
// size_format actually only uses one bit
// regenerated_size uses 5 bits
Ok(1)
}
1 => {
// size_format uses 2 bits
// regenerated_size uses 12 bits
Ok(2)
}
3 => {
// size_format uses 2 bits
// regenerated_size uses 20 bits
Ok(3)
}
_ => panic!(
"This is a bug in the program. There should only be values between 0..3"
),
}
}
LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
match size_format {
0 | 1 => {
// Only differ in num_streams
// both regenerated and compressed sizes use 10 bits
Ok(3)
}
2 => {
// both regenerated and compressed sizes use 14 bits
Ok(4)
}
3 => {
// both regenerated and compressed sizes use 18 bits
Ok(5)
}
_ => panic!(
"This is a bug in the program. There should only be values between 0..3"
),
}
}
}
}
/// Parse the header into `self`, and return the number of bytes read.
pub fn parse_from_header(&mut self, raw: &[u8]) -> Result<u8, LiteralsSectionParseError> {
let mut br: BitReader<'_> = BitReader::new(raw);
let block_type = br.get_bits(2)? as u8;
self.ls_type = Self::section_type(block_type)?;
let size_format = br.get_bits(2)? as u8;
let byte_needed = self.header_bytes_needed(raw[0])?;
if raw.len() < byte_needed as usize {
return Err(LiteralsSectionParseError::NotEnoughBytes {
have: raw.len(),
need: byte_needed,
});
}
match self.ls_type {
LiteralsSectionType::RLE | LiteralsSectionType::Raw => {
self.compressed_size = None;
match size_format {
0 | 2 => {
// size_format actually only uses one bit
// regenerated_size uses 5 bits
self.regenerated_size = u32::from(raw[0]) >> 3;
Ok(1)
}
1 => {
// size_format uses 2 bits
// regenerated_size uses 12 bits
self.regenerated_size = (u32::from(raw[0]) >> 4) + (u32::from(raw[1]) << 4);
Ok(2)
}
3 => {
// size_format uses 2 bits
// regenerated_size uses 20 bits
self.regenerated_size = (u32::from(raw[0]) >> 4)
+ (u32::from(raw[1]) << 4)
+ (u32::from(raw[2]) << 12);
Ok(3)
}
_ => panic!(
"This is a bug in the program. There should only be values between 0..3"
),
}
}
LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
match size_format {
0 => {
self.num_streams = Some(1);
}
1..=3 => {
self.num_streams = Some(4);
}
_ => panic!(
"This is a bug in the program. There should only be values between 0..3"
),
};
match size_format {
0 | 1 => {
// Differ in num_streams see above
// both regenerated and compressed sizes use 10 bits
// 4 from the first, six from the second byte
self.regenerated_size =
(u32::from(raw[0]) >> 4) + ((u32::from(raw[1]) & 0x3f) << 4);
// 2 from the second, full last byte
self.compressed_size =
Some(u32::from(raw[1] >> 6) + (u32::from(raw[2]) << 2));
Ok(3)
}
2 => {
// both regenerated and compressed sizes use 14 bits
// 4 from first, full second, 2 from the third byte
self.regenerated_size = (u32::from(raw[0]) >> 4)
+ (u32::from(raw[1]) << 4)
+ ((u32::from(raw[2]) & 0x3) << 12);
// 6 from the third, full last byte
self.compressed_size =
Some((u32::from(raw[2]) >> 2) + (u32::from(raw[3]) << 6));
Ok(4)
}
3 => {
// both regenerated and compressed sizes use 18 bits
// 4 from first, full second, six from third byte
self.regenerated_size = (u32::from(raw[0]) >> 4)
+ (u32::from(raw[1]) << 4)
+ ((u32::from(raw[2]) & 0x3F) << 12);
// 2 from third, full fourth, full fifth byte
self.compressed_size = Some(
(u32::from(raw[2]) >> 6)
+ (u32::from(raw[3]) << 2)
+ (u32::from(raw[4]) << 10),
);
Ok(5)
}
_ => panic!(
"This is a bug in the program. There should only be values between 0..3"
),
}
}
}
}
/// Given the first two bits of a header, determine the type of a header.
fn section_type(raw: u8) -> Result<LiteralsSectionType, LiteralsSectionParseError> {
let t = raw & 0x3;
match t {
0 => Ok(LiteralsSectionType::Raw),
1 => Ok(LiteralsSectionType::RLE),
2 => Ok(LiteralsSectionType::Compressed),
3 => Ok(LiteralsSectionType::Treeless),
other => Err(LiteralsSectionParseError::IllegalLiteralSectionType { got: other }),
}
}
}
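A worked example of the one-byte Raw case: for a first byte of 0b0010_1000, the type bits (lowest two) are 0b00 (Raw), the size format is 0b10 (one-byte header), and the regenerated size is the remaining five bits, 0b00101 = 5:

let mut section = LiteralsSection::new();
let header_len = section.parse_from_header(&[0b0010_1000u8]).unwrap();
assert_eq!(header_len, 1);
assert_eq!(section.regenerated_size, 5);
assert!(matches!(section.ls_type, LiteralsSectionType::Raw));
assert!(section.compressed_size.is_none());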

vendor/ruzstd/src/blocks/mod.rs vendored Normal file

@@ -0,0 +1,10 @@
//! In a Zstandard frame, there's a frame header, followed by one or more *blocks*.
//!
//! A block contains data, and a header describing how that data is encoded, as well
//! as other misc metadata.
//!
//! <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#blocks>
pub mod block;
pub mod literals_section;
pub mod sequence_section;

vendor/ruzstd/src/blocks/sequence_section.rs vendored Normal file

@@ -0,0 +1,168 @@
//! Utilities and representations for the second half of a block, the sequence section.
//! This section copies literals from the literals section into the decompressed output.
use crate::decoding::errors::SequencesHeaderParseError;
pub(crate) const MAX_LITERAL_LENGTH_CODE: u8 = 35;
pub(crate) const MAX_MATCH_LENGTH_CODE: u8 = 52;
pub(crate) const MAX_OFFSET_CODE: u8 = 31;
pub struct SequencesHeader {
pub num_sequences: u32,
pub modes: Option<CompressionModes>,
}
/// A sequence represents potentially redundant data, and it can be broken up into 2 steps:
/// - A copy step, where data is copied from the literals section to the decompressed output
/// - A *match* copy step that copies data from within the previously decompressed output.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequence-execution>
#[derive(Clone, Copy)]
pub struct Sequence {
/// Literal length, or the number of bytes to be copied from the literals section
/// in the copy step.
pub ll: u32,
/// The length of the match to make during the match copy step.
pub ml: u32,
/// How far back to go in the decompressed data to read from the match copy step.
/// If this value is greater than 3, then the offset is `of - 3`. If `of` is from 1-3,
/// then it has special handling:
///
/// The first 3 values define 3 different repeated offsets, with 1 referring to the most
/// recent, 2 the second recent, and so on. When the current sequence has a literal length of 0,
/// then the repeated offsets are shifted by 1. So an offset value of 1 refers to 2, 2 refers to 3,
/// and 3 refers to the most recent offset minus one. If that value is equal to zero, the data
/// is considered corrupted.
pub of: u32,
}
impl core::fmt::Display for Sequence {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
write!(f, "LL: {}, ML: {}, OF: {}", self.ll, self.ml, self.of)
}
}
/// This byte defines the compression mode of each symbol type
#[derive(Copy, Clone)]
pub struct CompressionModes(u8);
/// The compression mode used for symbol compression
pub enum ModeType {
/// A predefined FSE distribution table is used, and no distribution table
/// will be present.
Predefined,
/// The table consists of a single byte, which contains the symbol's value.
#[allow(clippy::upper_case_acronyms)]
RLE,
/// Standard FSE compression, a distribution table will be present. This
/// mode should not be used when only one symbol is present.
FSECompressed,
/// The table used in the previous compressed block with at least one sequence
/// will be used again. If this is the first block, the table in the dictionary will
/// be used.
Repeat,
}
impl CompressionModes {
/// Deserialize a two bit mode value into a [ModeType]
pub fn decode_mode(m: u8) -> ModeType {
match m {
0 => ModeType::Predefined,
1 => ModeType::RLE,
2 => ModeType::FSECompressed,
3 => ModeType::Repeat,
_ => panic!("This can never happen"),
}
}
/// Read the compression mode of the literal lengths field.
pub fn ll_mode(self) -> ModeType {
Self::decode_mode(self.0 >> 6)
}
/// Read the compression mode of the offset value field.
pub fn of_mode(self) -> ModeType {
Self::decode_mode((self.0 >> 4) & 0x3)
}
/// Read the compression mode of the match lengths field.
pub fn ml_mode(self) -> ModeType {
Self::decode_mode((self.0 >> 2) & 0x3)
}
}
impl Default for SequencesHeader {
fn default() -> Self {
Self::new()
}
}
impl SequencesHeader {
/// Create a new [SequencesHeader].
pub fn new() -> SequencesHeader {
SequencesHeader {
num_sequences: 0,
modes: None,
}
}
/// Attempt to deserialize the provided buffer into `self`, returning the number of bytes read.
pub fn parse_from_header(&mut self, source: &[u8]) -> Result<u8, SequencesHeaderParseError> {
let mut bytes_read = 0;
if source.is_empty() {
return Err(SequencesHeaderParseError::NotEnoughBytes {
need_at_least: 1,
got: 0,
});
}
match source[0] {
0 => {
self.num_sequences = 0;
bytes_read += 1;
}
1..=127 => {
if source.len() < 2 {
return Err(SequencesHeaderParseError::NotEnoughBytes {
need_at_least: 2,
got: source.len(),
});
}
self.num_sequences = u32::from(source[0]);
self.modes = Some(CompressionModes(source[1]));
bytes_read += 2;
}
128..=254 => {
if source.len() < 2 {
return Err(SequencesHeaderParseError::NotEnoughBytes {
need_at_least: 2,
got: source.len(),
});
}
self.num_sequences = ((u32::from(source[0]) - 128) << 8) + u32::from(source[1]);
bytes_read += 2;
if self.num_sequences != 0 {
if source.len() < 3 {
return Err(SequencesHeaderParseError::NotEnoughBytes {
need_at_least: 3,
got: source.len(),
});
}
self.modes = Some(CompressionModes(source[2]));
bytes_read += 1;
}
}
255 => {
if source.len() < 4 {
return Err(SequencesHeaderParseError::NotEnoughBytes {
need_at_least: 4,
got: source.len(),
});
}
self.num_sequences = u32::from(source[1]) + (u32::from(source[2]) << 8) + 0x7F00;
self.modes = Some(CompressionModes(source[3]));
bytes_read += 4;
}
}
Ok(bytes_read)
}
}
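The repeat-offset rule documented on `Sequence::of` is easy to get wrong, so here it is as a standalone sketch; `resolve_offset` and the `rep` parameter (the three most recent offsets, most recent first) are our names, and `None` marks the corrupt case:

fn resolve_offset(of: u32, ll: u32, rep: [u32; 3]) -> Option<u32> {
    match of {
        0 => None, // not produced by well-formed data
        1 | 2 | 3 => {
            // With a literal length of 0, the repeat offsets shift by one.
            let idx = if ll == 0 { of as usize } else { of as usize - 1 };
            if idx < 3 {
                Some(rep[idx])
            } else {
                // "the most recent offset minus one"; zero means corruption
                rep[0].checked_sub(1).filter(|&v| v != 0)
            }
        }
        other => Some(other - 3), // offsets above 3 encode the real offset directly
    }
}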

vendor/ruzstd/src/common/mod.rs vendored Normal file

@@ -0,0 +1,21 @@
//! Values and interfaces shared between the encoding side
//! and the decoding side.
// --- FRAMES ---
/// This magic number is included at the start of a single Zstandard frame
pub const MAGIC_NUM: u32 = 0xFD2F_B528;
/// Window size refers to the minimum amount of memory needed to decode any given frame.
///
/// The minimum window size is defined as 1 KB
pub const MIN_WINDOW_SIZE: u64 = 1024;
/// Window size refers to the minimum amount of memory needed to decode any given frame.
///
/// The maximum window size allowed by the spec is 3.75 TiB (15 * 2^38 bytes)
pub const MAX_WINDOW_SIZE: u64 = (1 << 41) + 7 * (1 << 38);
// --- BLOCKS ---
/// While the spec states the block size limit as 128 KB, the implementation uses
/// 128 KiB (128 * 1024 bytes)
///
/// <https://github.com/facebook/zstd/blob/eca205fc7849a61ab287492931a04960ac58e031/doc/educational_decoder/zstd_decompress.c#L28-L29>
pub const MAX_BLOCK_SIZE: u32 = 128 * 1024;
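As a sanity check on the window-size arithmetic: (1 << 41) + 7 * (1 << 38) factors to 15 * 2^38, i.e. 3.75 * 2^40 bytes. An illustrative compile-time check:

const _: () = assert!(MAX_WINDOW_SIZE == 15 * (1 << 38));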

vendor/ruzstd/src/decoding/block_decoder.rs vendored Normal file

@@ -0,0 +1,310 @@
use super::super::blocks::block::BlockHeader;
use super::super::blocks::block::BlockType;
use super::super::blocks::literals_section::LiteralsSection;
use super::super::blocks::literals_section::LiteralsSectionType;
use super::super::blocks::sequence_section::SequencesHeader;
use super::literals_section_decoder::decode_literals;
use super::sequence_section_decoder::decode_sequences;
use crate::common::MAX_BLOCK_SIZE;
use crate::decoding::errors::DecodeSequenceError;
use crate::decoding::errors::{
BlockHeaderReadError, BlockSizeError, BlockTypeError, DecodeBlockContentError,
DecompressBlockError,
};
use crate::decoding::scratch::DecoderScratch;
use crate::decoding::sequence_execution::execute_sequences;
use crate::io::Read;
pub struct BlockDecoder {
header_buffer: [u8; 3],
internal_state: DecoderState,
}
enum DecoderState {
ReadyToDecodeNextHeader,
ReadyToDecodeNextBody,
#[allow(dead_code)]
Failed, //TODO put "self.internal_state = DecoderState::Failed;" everywhere an unresolvable error occurs
}
/// Create a new [BlockDecoder].
pub fn new() -> BlockDecoder {
BlockDecoder {
internal_state: DecoderState::ReadyToDecodeNextHeader,
header_buffer: [0u8; 3],
}
}
impl BlockDecoder {
pub fn decode_block_content(
&mut self,
header: &BlockHeader,
workspace: &mut DecoderScratch, //reuse this as often as possible. Not only if the trees are reused but also reuse the allocations when building new trees
mut source: impl Read,
) -> Result<u64, DecodeBlockContentError> {
match self.internal_state {
DecoderState::ReadyToDecodeNextBody => { /* Happy :) */ }
DecoderState::Failed => return Err(DecodeBlockContentError::DecoderStateIsFailed),
DecoderState::ReadyToDecodeNextHeader => {
return Err(DecodeBlockContentError::ExpectedHeaderOfPreviousBlock)
}
}
let block_type = header.block_type;
match block_type {
BlockType::RLE => {
const BATCH_SIZE: usize = 512;
let mut buf = [0u8; BATCH_SIZE];
let full_reads = header.decompressed_size / BATCH_SIZE as u32;
let single_read_size = header.decompressed_size % BATCH_SIZE as u32;
source.read_exact(&mut buf[0..1]).map_err(|err| {
DecodeBlockContentError::ReadError {
step: block_type,
source: err,
}
})?;
self.internal_state = DecoderState::ReadyToDecodeNextHeader;
for i in 1..BATCH_SIZE {
buf[i] = buf[0];
}
for _ in 0..full_reads {
workspace.buffer.push(&buf[..]);
}
let smaller = &mut buf[..single_read_size as usize];
workspace.buffer.push(smaller);
Ok(1)
}
BlockType::Raw => {
const BATCH_SIZE: usize = 128 * 1024;
let mut buf = [0u8; BATCH_SIZE];
let full_reads = header.decompressed_size / BATCH_SIZE as u32;
let single_read_size = header.decompressed_size % BATCH_SIZE as u32;
for _ in 0..full_reads {
source.read_exact(&mut buf[..]).map_err(|err| {
DecodeBlockContentError::ReadError {
step: block_type,
source: err,
}
})?;
workspace.buffer.push(&buf[..]);
}
let smaller = &mut buf[..single_read_size as usize];
source
.read_exact(smaller)
.map_err(|err| DecodeBlockContentError::ReadError {
step: block_type,
source: err,
})?;
workspace.buffer.push(smaller);
self.internal_state = DecoderState::ReadyToDecodeNextHeader;
Ok(u64::from(header.decompressed_size))
}
BlockType::Reserved => {
panic!("How did you even get this. The decoder should error out if it detects a reserved-type block");
}
BlockType::Compressed => {
self.decompress_block(header, workspace, source)?;
self.internal_state = DecoderState::ReadyToDecodeNextHeader;
Ok(u64::from(header.content_size))
}
}
}
fn decompress_block(
&mut self,
header: &BlockHeader,
workspace: &mut DecoderScratch, //reuse this as often as possible. Not only if the trees are reused but also reuse the allocations when building new trees
mut source: impl Read,
) -> Result<(), DecompressBlockError> {
workspace
.block_content_buffer
.resize(header.content_size as usize, 0);
source.read_exact(workspace.block_content_buffer.as_mut_slice())?;
let raw = workspace.block_content_buffer.as_slice();
let mut section = LiteralsSection::new();
let bytes_in_literals_header = section.parse_from_header(raw)?;
let raw = &raw[bytes_in_literals_header as usize..];
vprintln!(
"Found {} literalssection with regenerated size: {}, and compressed size: {:?}",
section.ls_type,
section.regenerated_size,
section.compressed_size
);
let upper_limit_for_literals = match section.compressed_size {
Some(x) => x as usize,
None => match section.ls_type {
LiteralsSectionType::RLE => 1,
LiteralsSectionType::Raw => section.regenerated_size as usize,
_ => panic!("Bug in this library"),
},
};
if raw.len() < upper_limit_for_literals {
return Err(DecompressBlockError::MalformedSectionHeader {
expected_len: upper_limit_for_literals,
remaining_bytes: raw.len(),
});
}
let raw_literals = &raw[..upper_limit_for_literals];
vprintln!("Slice for literals: {}", raw_literals.len());
workspace.literals_buffer.clear(); //all literals of the previous block must have been used in the sequence execution anyways. just be defensive here
let bytes_used_in_literals_section = decode_literals(
&section,
&mut workspace.huf,
raw_literals,
&mut workspace.literals_buffer,
)?;
assert!(
section.regenerated_size == workspace.literals_buffer.len() as u32,
"Wrong number of literals: {}, Should have been: {}",
workspace.literals_buffer.len(),
section.regenerated_size
);
assert!(bytes_used_in_literals_section == upper_limit_for_literals as u32);
let raw = &raw[upper_limit_for_literals..];
vprintln!("Slice for sequences with headers: {}", raw.len());
let mut seq_section = SequencesHeader::new();
let bytes_in_sequence_header = seq_section.parse_from_header(raw)?;
let raw = &raw[bytes_in_sequence_header as usize..];
vprintln!(
"Found sequencessection with sequences: {} and size: {}",
seq_section.num_sequences,
raw.len()
);
assert!(
u32::from(bytes_in_literals_header)
+ bytes_used_in_literals_section
+ u32::from(bytes_in_sequence_header)
+ raw.len() as u32
== header.content_size
);
vprintln!("Slice for sequences: {}", raw.len());
if seq_section.num_sequences != 0 {
decode_sequences(
&seq_section,
raw,
&mut workspace.fse,
&mut workspace.sequences,
)?;
vprintln!("Executing sequences");
execute_sequences(workspace)?;
} else {
if !raw.is_empty() {
return Err(DecompressBlockError::DecodeSequenceError(
DecodeSequenceError::ExtraBits {
bits_remaining: raw.len() as isize * 8,
},
));
}
workspace.buffer.push(&workspace.literals_buffer);
workspace.sequences.clear();
}
Ok(())
}
/// Reads 3 bytes from the provided reader and returns
/// the deserialized header and the number of bytes read.
pub fn read_block_header(
&mut self,
mut r: impl Read,
) -> Result<(BlockHeader, u8), BlockHeaderReadError> {
//match self.internal_state {
// DecoderState::ReadyToDecodeNextHeader => {/* Happy :) */},
// DecoderState::Failed => return Err(format!("Cant decode next block if failed along the way. Results will be nonsense")),
// DecoderState::ReadyToDecodeNextBody => return Err(format!("Cant decode next block header, while expecting to decode the body of the previous block. Results will be nonsense")),
//}
r.read_exact(&mut self.header_buffer[0..3])?;
let btype = self.block_type()?;
if let BlockType::Reserved = btype {
return Err(BlockHeaderReadError::FoundReservedBlock);
}
let block_size = self.block_content_size()?;
let decompressed_size = match btype {
BlockType::Raw => block_size,
BlockType::RLE => block_size,
BlockType::Reserved => 0, //should be caught above, this is an error state
BlockType::Compressed => 0, //unknown but will be smaller than 128kb (or window_size if that is smaller than 128kb)
};
let content_size = match btype {
BlockType::Raw => block_size,
BlockType::Compressed => block_size,
BlockType::RLE => 1,
BlockType::Reserved => 0, //should be caught above, this is an error state
};
let last_block = self.is_last();
self.reset_buffer();
self.internal_state = DecoderState::ReadyToDecodeNextBody;
//just return 3. Block headers always take 3 bytes
Ok((
BlockHeader {
last_block,
block_type: btype,
decompressed_size,
content_size,
},
3,
))
}
fn reset_buffer(&mut self) {
self.header_buffer[0] = 0;
self.header_buffer[1] = 0;
self.header_buffer[2] = 0;
}
fn is_last(&self) -> bool {
self.header_buffer[0] & 0x1 == 1
}
fn block_type(&self) -> Result<BlockType, BlockTypeError> {
let t = (self.header_buffer[0] >> 1) & 0x3;
match t {
0 => Ok(BlockType::Raw),
1 => Ok(BlockType::RLE),
2 => Ok(BlockType::Compressed),
3 => Ok(BlockType::Reserved),
other => Err(BlockTypeError::InvalidBlocktypeNumber { num: other }),
}
}
fn block_content_size(&self) -> Result<u32, BlockSizeError> {
let val = self.block_content_size_unchecked();
if val > MAX_BLOCK_SIZE {
Err(BlockSizeError::BlockSizeTooLarge { size: val })
} else {
Ok(val)
}
}
fn block_content_size_unchecked(&self) -> u32 {
u32::from(self.header_buffer[0] >> 3) //push out type and last_block flags. Retain 5 bit
| (u32::from(self.header_buffer[1]) << 5)
| (u32::from(self.header_buffer[2]) << 13)
}
}
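To make the bit arithmetic at the bottom concrete: a minimal last-block RLE header, encoded as 0b0000_1011 followed by two zero bytes, decodes as below (sketch, using the module-level `new()` above):

let mut decoder = new();
let raw = [0b0000_1011u8, 0x00, 0x00]; // last_block = 1, type = RLE, block_size = 1
let (header, bytes_read) = decoder.read_block_header(&raw[..]).unwrap();
assert_eq!(bytes_read, 3);
assert!(header.last_block);
assert_eq!(header.block_type, BlockType::RLE);
assert_eq!(header.content_size, 1); // an RLE block always regenerates from 1 source byte
assert_eq!(header.decompressed_size, 1);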

vendor/ruzstd/src/decoding/decodebuffer.rs vendored Normal file

@@ -0,0 +1,451 @@
use crate::io::{Error, Read, Write};
use alloc::vec::Vec;
#[cfg(feature = "hash")]
use core::hash::Hasher;
use super::ringbuffer::RingBuffer;
use crate::decoding::errors::DecodeBufferError;
pub struct DecodeBuffer {
buffer: RingBuffer,
pub dict_content: Vec<u8>,
pub window_size: usize,
total_output_counter: u64,
#[cfg(feature = "hash")]
pub hash: twox_hash::XxHash64,
}
impl Read for DecodeBuffer {
fn read(&mut self, target: &mut [u8]) -> Result<usize, Error> {
let max_amount = self.can_drain_to_window_size().unwrap_or(0);
let amount = max_amount.min(target.len());
let mut written = 0;
self.drain_to(amount, |buf| {
target[written..][..buf.len()].copy_from_slice(buf);
written += buf.len();
(buf.len(), Ok(()))
})?;
Ok(amount)
}
}
impl DecodeBuffer {
pub fn new(window_size: usize) -> DecodeBuffer {
DecodeBuffer {
buffer: RingBuffer::new(),
dict_content: Vec::new(),
window_size,
total_output_counter: 0,
#[cfg(feature = "hash")]
hash: twox_hash::XxHash64::with_seed(0),
}
}
pub fn reset(&mut self, window_size: usize) {
self.window_size = window_size;
self.buffer.clear();
self.buffer.reserve(self.window_size);
self.dict_content.clear();
self.total_output_counter = 0;
#[cfg(feature = "hash")]
{
self.hash = twox_hash::XxHash64::with_seed(0);
}
}
pub fn len(&self) -> usize {
self.buffer.len()
}
pub fn push(&mut self, data: &[u8]) {
self.buffer.extend(data);
self.total_output_counter += data.len() as u64;
}
pub fn repeat(&mut self, offset: usize, match_length: usize) -> Result<(), DecodeBufferError> {
if offset > self.buffer.len() {
self.repeat_from_dict(offset, match_length)
} else {
let buf_len = self.buffer.len();
let start_idx = buf_len - offset;
let end_idx = start_idx + match_length;
self.buffer.reserve(match_length);
if end_idx > buf_len {
// We need to copy in chunks.
self.repeat_in_chunks(offset, match_length, start_idx);
} else {
// can just copy parts of the existing buffer
// SAFETY: Requirements checked:
// 1. start_idx + match_length must be <= self.buffer.len()
// We know that:
// 1. start_idx = self.buffer.len() - offset
// 2. end_idx = start_idx + match_length
// 3. end_idx <= self.buffer.len()
// Thus follows: start_idx + match_length <= self.buffer.len()
//
// 2. explicitly reserved enough memory for the whole match_length
unsafe {
self.buffer
.extend_from_within_unchecked(start_idx, match_length)
};
}
self.total_output_counter += match_length as u64;
Ok(())
}
}
fn repeat_in_chunks(&mut self, offset: usize, match_length: usize, start_idx: usize) {
// We have at max offset bytes in one chunk, the last one can be smaller
let mut start_idx = start_idx;
let mut copied_counter_left = match_length;
// TODO this can be optimized further I think.
// Each time we copy a chunk we have a repetition of length 'offset', so we can copy offset * iteration many bytes from start_idx
while copied_counter_left > 0 {
let chunksize = usize::min(offset, copied_counter_left);
// SAFETY: Requirements checked:
// 1. start_idx + chunksize must be <= self.buffer.len()
// We know that:
// 1. start_idx starts at buffer.len() - offset
// 2. chunksize <= offset (== offset for each iteration but the last, and match_length modulo offset in the last iteration)
// 3. the buffer grows by offset many bytes each iteration but the last
// 4. start_idx is increased by the same amount as the buffer grows each iteration
//
// Thus follows: start_idx + chunksize == self.buffer.len() in each iteration but the last, where match_length modulo offset == chunksize < offset
// Meaning: start_idx + chunksize <= self.buffer.len()
//
// 2. explicitly reserved enough memory for the whole match_length
unsafe {
self.buffer
.extend_from_within_unchecked(start_idx, chunksize)
};
copied_counter_left -= chunksize;
start_idx += chunksize;
}
}
#[cold]
fn repeat_from_dict(
&mut self,
offset: usize,
match_length: usize,
) -> Result<(), DecodeBufferError> {
if self.total_output_counter <= self.window_size as u64 {
// at least part of that repeat is from the dictionary content
let bytes_from_dict = offset - self.buffer.len();
if bytes_from_dict > self.dict_content.len() {
return Err(DecodeBufferError::NotEnoughBytesInDictionary {
got: self.dict_content.len(),
need: bytes_from_dict,
});
}
if bytes_from_dict < match_length {
let dict_slice = &self.dict_content[self.dict_content.len() - bytes_from_dict..];
self.buffer.extend(dict_slice);
self.total_output_counter += bytes_from_dict as u64;
return self.repeat(self.buffer.len(), match_length - bytes_from_dict);
} else {
let low = self.dict_content.len() - bytes_from_dict;
let high = low + match_length;
let dict_slice = &self.dict_content[low..high];
self.buffer.extend(dict_slice);
}
Ok(())
} else {
Err(DecodeBufferError::OffsetTooBig {
offset,
buf_len: self.buffer.len(),
})
}
}
/// Check if and how many bytes can currently be drained from the buffer
pub fn can_drain_to_window_size(&self) -> Option<usize> {
if self.buffer.len() > self.window_size {
Some(self.buffer.len() - self.window_size)
} else {
None
}
}
//How many bytes can be drained if the window_size does not have to be maintained
pub fn can_drain(&self) -> usize {
self.buffer.len()
}
/// Drain as much as possible while retaining enough so that decoding is still possible with the required window_size
/// Ideally, call this only if can_drain_to_window_size reports a 'high' number of bytes, to reduce allocations
pub fn drain_to_window_size(&mut self) -> Option<Vec<u8>> {
//TODO investigate if it is possible to return the std::vec::Drain iterator directly without collecting here
match self.can_drain_to_window_size() {
None => None,
Some(can_drain) => {
let mut vec = Vec::with_capacity(can_drain);
self.drain_to(can_drain, |buf| {
vec.extend_from_slice(buf);
(buf.len(), Ok(()))
})
.ok()?;
Some(vec)
}
}
}
pub fn drain_to_window_size_writer(&mut self, mut sink: impl Write) -> Result<usize, Error> {
match self.can_drain_to_window_size() {
None => Ok(0),
Some(can_drain) => self.drain_to(can_drain, |buf| write_all_bytes(&mut sink, buf)),
}
}
/// drain the buffer completely
pub fn drain(&mut self) -> Vec<u8> {
let (slice1, slice2) = self.buffer.as_slices();
#[cfg(feature = "hash")]
{
self.hash.write(slice1);
self.hash.write(slice2);
}
let mut vec = Vec::with_capacity(slice1.len() + slice2.len());
vec.extend_from_slice(slice1);
vec.extend_from_slice(slice2);
self.buffer.clear();
vec
}
pub fn drain_to_writer(&mut self, mut sink: impl Write) -> Result<usize, Error> {
let write_limit = self.buffer.len();
self.drain_to(write_limit, |buf| write_all_bytes(&mut sink, buf))
}
pub fn read_all(&mut self, target: &mut [u8]) -> Result<usize, Error> {
let amount = self.buffer.len().min(target.len());
let mut written = 0;
self.drain_to(amount, |buf| {
target[written..][..buf.len()].copy_from_slice(buf);
written += buf.len();
(buf.len(), Ok(()))
})?;
Ok(amount)
}
/// Semantics of the `write_bytes` callback:
/// it should write as many of the provided bytes as possible to its sink until no bytes are left or an error is encountered,
/// and it must return how many bytes have actually been written to the sink.
fn drain_to(
&mut self,
amount: usize,
mut write_bytes: impl FnMut(&[u8]) -> (usize, Result<(), Error>),
) -> Result<usize, Error> {
if amount == 0 {
return Ok(0);
}
struct DrainGuard<'a> {
buffer: &'a mut RingBuffer,
amount: usize,
}
impl Drop for DrainGuard<'_> {
fn drop(&mut self) {
if self.amount != 0 {
self.buffer.drop_first_n(self.amount);
}
}
}
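// Note: the guard makes drain_to early-return safe. No matter how this
// function is left (including via the `?` operators below), the bytes already
// accepted by write_bytes are removed from the ring buffer exactly once, in Drop.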
let mut drain_guard = DrainGuard {
buffer: &mut self.buffer,
amount: 0,
};
let (slice1, slice2) = drain_guard.buffer.as_slices();
let n1 = slice1.len().min(amount);
let n2 = slice2.len().min(amount - n1);
if n1 != 0 {
let (written1, res1) = write_bytes(&slice1[..n1]);
#[cfg(feature = "hash")]
self.hash.write(&slice1[..written1]);
drain_guard.amount += written1;
// Apparently this is what clippy thinks is the best way of expressing this
res1?;
// Only if the first call to write_bytes was not a partial write can we continue with slice2.
// Partial writes SHOULD never happen without res1 being an error, but let's protect against it anyway.
if written1 == n1 && n2 != 0 {
let (written2, res2) = write_bytes(&slice2[..n2]);
#[cfg(feature = "hash")]
self.hash.write(&slice2[..written2]);
drain_guard.amount += written2;
// Apparently this is what clippy thinks is the best way of expressing this
res2?;
}
}
let amount_written = drain_guard.amount;
// Make sure we don't accidentally drop `DrainGuard` earlier.
drop(drain_guard);
Ok(amount_written)
}
}
/// Like Write::write_all but returns partial write length even on error
fn write_all_bytes(mut sink: impl Write, buf: &[u8]) -> (usize, Result<(), Error>) {
let mut written = 0;
while written < buf.len() {
match sink.write(&buf[written..]) {
Ok(0) => return (written, Ok(())),
Ok(w) => written += w,
Err(e) => return (written, Err(e)),
}
}
(written, Ok(()))
}
#[cfg(test)]
mod tests {
use super::DecodeBuffer;
use crate::io::{Error, ErrorKind, Write};
extern crate std;
use alloc::vec;
use alloc::vec::Vec;
#[test]
fn short_writer() {
struct ShortWriter {
buf: Vec<u8>,
write_len: usize,
}
impl Write for ShortWriter {
fn write(&mut self, buf: &[u8]) -> std::result::Result<usize, Error> {
if buf.len() > self.write_len {
self.buf.extend_from_slice(&buf[..self.write_len]);
Ok(self.write_len)
} else {
self.buf.extend_from_slice(buf);
Ok(buf.len())
}
}
fn flush(&mut self) -> std::result::Result<(), Error> {
Ok(())
}
}
let mut short_writer = ShortWriter {
buf: vec![],
write_len: 10,
};
let mut decode_buf = DecodeBuffer::new(100);
decode_buf.push(b"0123456789");
decode_buf.repeat(10, 90).unwrap();
let repeats = 1000;
for _ in 0..repeats {
assert_eq!(decode_buf.len(), 100);
decode_buf.repeat(10, 50).unwrap();
assert_eq!(decode_buf.len(), 150);
decode_buf
.drain_to_window_size_writer(&mut short_writer)
.unwrap();
assert_eq!(decode_buf.len(), 100);
}
assert_eq!(short_writer.buf.len(), repeats * 50);
decode_buf.drain_to_writer(&mut short_writer).unwrap();
assert_eq!(short_writer.buf.len(), repeats * 50 + 100);
}
#[test]
fn wouldblock_writer() {
struct WouldblockWriter {
buf: Vec<u8>,
last_blocked: usize,
block_every: usize,
}
impl Write for WouldblockWriter {
fn write(&mut self, buf: &[u8]) -> std::result::Result<usize, Error> {
if self.last_blocked < self.block_every {
self.buf.extend_from_slice(buf);
self.last_blocked += 1;
Ok(buf.len())
} else {
self.last_blocked = 0;
Err(Error::from(ErrorKind::WouldBlock))
}
}
fn flush(&mut self) -> std::result::Result<(), Error> {
Ok(())
}
}
let mut short_writer = WouldblockWriter {
buf: vec![],
last_blocked: 0,
block_every: 5,
};
let mut decode_buf = DecodeBuffer::new(100);
decode_buf.push(b"0123456789");
decode_buf.repeat(10, 90).unwrap();
let repeats = 1000;
for _ in 0..repeats {
assert_eq!(decode_buf.len(), 100);
decode_buf.repeat(10, 50).unwrap();
assert_eq!(decode_buf.len(), 150);
loop {
match decode_buf.drain_to_window_size_writer(&mut short_writer) {
Ok(written) => {
if written == 0 {
break;
}
}
Err(e) => {
if e.kind() == ErrorKind::WouldBlock {
continue;
} else {
panic!("Unexpected error {:?}", e);
}
}
}
}
assert_eq!(decode_buf.len(), 100);
}
assert_eq!(short_writer.buf.len(), repeats * 50);
loop {
match decode_buf.drain_to_writer(&mut short_writer) {
Ok(written) => {
if written == 0 {
break;
}
}
Err(e) => {
if e.kind() == ErrorKind::WouldBlock {
continue;
} else {
panic!("Unexpected error {:?}", e);
}
}
}
}
assert_eq!(short_writer.buf.len(), repeats * 50 + 100);
}
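// A minimal extra test (sketch, not part of the vendored suite): write_all_bytes
// reports the partial write length when the sink stops accepting bytes via Ok(0),
// instead of discarding it the way Write::write_all would.
#[test]
fn partial_write_length_is_reported() {
struct LimitedWriter {
left: usize,
}
impl Write for LimitedWriter {
fn write(&mut self, buf: &[u8]) -> std::result::Result<usize, Error> {
// Accept at most `left` bytes, then report Ok(0) on later calls.
let take = usize::min(self.left, buf.len());
self.left -= take;
Ok(take)
}
fn flush(&mut self) -> std::result::Result<(), Error> {
Ok(())
}
}
let mut sink = LimitedWriter { left: 4 };
let (written, res) = super::write_all_bytes(&mut sink, b"0123456789");
assert_eq!(written, 4);
assert!(res.is_ok());
}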
}

104
vendor/ruzstd/src/decoding/dictionary.rs vendored Normal file
View File

@@ -0,0 +1,104 @@
use alloc::vec::Vec;
use core::convert::TryInto;
use crate::decoding::errors::DictionaryDecodeError;
use crate::decoding::scratch::FSEScratch;
use crate::decoding::scratch::HuffmanScratch;
/// Zstandard includes support for "raw content" dictionaries, that store bytes optionally used
/// during sequence execution.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format>
pub struct Dictionary {
/// A 4 byte value used by decoders to check if they can use
/// the correct dictionary. This value must not be zero.
pub id: u32,
/// A dictionary can contain an entropy table, either FSE or
/// Huffman.
pub fse: FSEScratch,
/// A dictionary can contain an entropy table, either FSE or
/// Huffman.
pub huf: HuffmanScratch,
/// The content of a dictionary acts as a "past" in front of data
/// to compress or decompress,
/// so it can be referenced in sequence commands.
/// As long as the amount of data decoded from this frame is less than or
/// equal to Window_Size, sequence commands may specify offsets longer than
/// the total length of decoded output so far to reference back to the
/// dictionary, even parts of the dictionary with offsets larger than Window_Size.
/// After the total output has surpassed Window_Size however,
/// this is no longer allowed and the dictionary is no longer accessible
pub dict_content: Vec<u8>,
/// The 3 most recent offsets are stored so that they can be used
/// during sequence execution, see
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#repeat-offsets>
/// for more.
pub offset_hist: [u32; 3],
}
/// This 4 byte (little endian) magic number refers to the start of a dictionary
pub const MAGIC_NUM: [u8; 4] = [0x37, 0xA4, 0x30, 0xEC];
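// Layout sketch of a serialized dictionary, as consumed by decode_dict below
// (all multi-byte integers little endian):
//   magic_number     4 bytes   must equal MAGIC_NUM
//   dictionary_id    4 bytes   non-zero id checked against the frame header
//   entropy_tables   variable  Huffman table, then OF, ML and LL FSE tables
//   repeat_offsets   12 bytes  three u32 values seeding offset_hist
//   content          rest      raw bytes referenced during sequence execution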
impl Dictionary {
/// Parses the dictionary from `raw` and sets the tables.
/// It returns the parsed dictionary, whose `id` can be checked against the frame's `dict_id`
pub fn decode_dict(raw: &[u8]) -> Result<Dictionary, DictionaryDecodeError> {
let mut new_dict = Dictionary {
id: 0,
fse: FSEScratch::new(),
huf: HuffmanScratch::new(),
dict_content: Vec::new(),
offset_hist: [2, 4, 8],
};
let magic_num: [u8; 4] = raw[..4].try_into().expect("optimized away");
if magic_num != MAGIC_NUM {
return Err(DictionaryDecodeError::BadMagicNum { got: magic_num });
}
let dict_id = raw[4..8].try_into().expect("optimized away");
let dict_id = u32::from_le_bytes(dict_id);
new_dict.id = dict_id;
let raw_tables = &raw[8..];
let huf_size = new_dict.huf.table.build_decoder(raw_tables)?;
let raw_tables = &raw_tables[huf_size as usize..];
let of_size = new_dict.fse.offsets.build_decoder(
raw_tables,
crate::decoding::sequence_section_decoder::OF_MAX_LOG,
)?;
let raw_tables = &raw_tables[of_size..];
let ml_size = new_dict.fse.match_lengths.build_decoder(
raw_tables,
crate::decoding::sequence_section_decoder::ML_MAX_LOG,
)?;
let raw_tables = &raw_tables[ml_size..];
let ll_size = new_dict.fse.literal_lengths.build_decoder(
raw_tables,
crate::decoding::sequence_section_decoder::LL_MAX_LOG,
)?;
let raw_tables = &raw_tables[ll_size..];
let offset1 = raw_tables[0..4].try_into().expect("optimized away");
let offset1 = u32::from_le_bytes(offset1);
let offset2 = raw_tables[4..8].try_into().expect("optimized away");
let offset2 = u32::from_le_bytes(offset2);
let offset3 = raw_tables[8..12].try_into().expect("optimized away");
let offset3 = u32::from_le_bytes(offset3);
new_dict.offset_hist[0] = offset1;
new_dict.offset_hist[1] = offset2;
new_dict.offset_hist[2] = offset3;
let raw_content = &raw_tables[12..];
new_dict.dict_content.extend(raw_content);
Ok(new_dict)
}
}

1187
vendor/ruzstd/src/decoding/errors.rs vendored Normal file

File diff suppressed because it is too large

241
vendor/ruzstd/src/decoding/frame.rs vendored Normal file
View File

@@ -0,0 +1,241 @@
use crate::common::{MAGIC_NUM, MAX_WINDOW_SIZE, MIN_WINDOW_SIZE};
use crate::decoding::errors::{FrameDescriptorError, FrameHeaderError, ReadFrameHeaderError};
use crate::io::Read;
/// Read a single serialized frame from the reader and return a tuple containing the parsed frame and the number of bytes read.
pub fn read_frame_header(mut r: impl Read) -> Result<(FrameHeader, u8), ReadFrameHeaderError> {
use ReadFrameHeaderError as err;
let mut buf = [0u8; 4];
r.read_exact(&mut buf).map_err(err::MagicNumberReadError)?;
let mut bytes_read = 4;
let magic_num = u32::from_le_bytes(buf);
// Skippable frames have a magic number in this interval
if (0x184D2A50..=0x184D2A5F).contains(&magic_num) {
r.read_exact(&mut buf)
.map_err(err::FrameDescriptorReadError)?;
let skip_size = u32::from_le_bytes(buf);
return Err(ReadFrameHeaderError::SkipFrame {
magic_number: magic_num,
length: skip_size,
});
}
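// Example: a skippable frame starting with magic 0x184D2A50 and length 16
// carries 16 bytes of user data after the 8 header bytes (4 magic + 4 length);
// callers are expected to seek past `length` bytes before reading the next frame.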
if magic_num != MAGIC_NUM {
return Err(ReadFrameHeaderError::BadMagicNumber(magic_num));
}
r.read_exact(&mut buf[0..1])
.map_err(err::FrameDescriptorReadError)?;
let desc = FrameDescriptor(buf[0]);
bytes_read += 1;
let mut frame_header = FrameHeader {
descriptor: FrameDescriptor(desc.0),
dict_id: None,
frame_content_size: 0,
window_descriptor: 0,
};
if !desc.single_segment_flag() {
r.read_exact(&mut buf[0..1])
.map_err(err::WindowDescriptorReadError)?;
frame_header.window_descriptor = buf[0];
bytes_read += 1;
}
let dict_id_len = desc.dictionary_id_bytes()? as usize;
if dict_id_len != 0 {
let buf = &mut buf[..dict_id_len];
r.read_exact(buf).map_err(err::DictionaryIdReadError)?;
bytes_read += dict_id_len;
let mut dict_id = 0u32;
#[allow(clippy::needless_range_loop)]
for i in 0..dict_id_len {
dict_id += (buf[i] as u32) << (8 * i);
}
if dict_id != 0 {
frame_header.dict_id = Some(dict_id);
}
}
let fcs_len = desc.frame_content_size_bytes()? as usize;
if fcs_len != 0 {
let mut fcs_buf = [0u8; 8];
let fcs_buf = &mut fcs_buf[..fcs_len];
r.read_exact(fcs_buf)
.map_err(err::FrameContentSizeReadError)?;
bytes_read += fcs_len;
let mut fcs = 0u64;
#[allow(clippy::needless_range_loop)]
for i in 0..fcs_len {
fcs += (fcs_buf[i] as u64) << (8 * i);
}
if fcs_len == 2 {
fcs += 256;
}
frame_header.frame_content_size = fcs;
}
Ok((frame_header, bytes_read as u8))
}
/// A frame header has a variable size, with a minimum of 2 bytes, and a maximum of 14 bytes.
pub struct FrameHeader {
pub descriptor: FrameDescriptor,
/// The `Window_Descriptor` field contains the minimum size of a memory buffer needed to
/// decompress the entire frame.
///
/// This byte is not included in the frame header when the `Single_Segment_flag` is set.
///
/// Bits 7-3 refer to the `Exponent`, where bits 2-0 refer to the `Mantissa`.
///
/// To determine the size of a window, the following formula can be used:
/// ```text
/// windowLog = 10 + Exponent;
/// windowBase = 1 << windowLog;
/// windowAdd = (windowBase / 8) * Mantissa;
/// Window_Size = windowBase + windowAdd;
/// ```
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor>
window_descriptor: u8,
/// The `Dictionary_ID` field contains the ID of the dictionary to be used to decode the frame.
/// When this value is not present, it's up to the decoder to know which dictionary to use.
dict_id: Option<u32>,
/// The size of the original/uncompressed content.
frame_content_size: u64,
}
impl FrameHeader {
/// Read the size of the window from the header or the total frame content size,
/// whichever is defined, returning the size in bytes.
pub fn window_size(&self) -> Result<u64, FrameHeaderError> {
if self.descriptor.single_segment_flag() {
Ok(self.frame_content_size())
} else {
let exp = self.window_descriptor >> 3;
let mantissa = self.window_descriptor & 0x7;
let window_log = 10 + u64::from(exp);
let window_base = 1 << window_log;
let window_add = (window_base / 8) * u64::from(mantissa);
let window_size = window_base + window_add;
if window_size >= MIN_WINDOW_SIZE {
if window_size < MAX_WINDOW_SIZE {
Ok(window_size)
} else {
Err(FrameHeaderError::WindowTooBig { got: window_size })
}
} else {
Err(FrameHeaderError::WindowTooSmall { got: window_size })
}
}
}
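// Worked example for the formula above (illustrative): Window_Descriptor = 0x09
// gives Exponent = 1 and Mantissa = 1, so
//   windowLog  = 10 + 1         = 11
//   windowBase = 1 << 11        = 2048
//   windowAdd  = (2048 / 8) * 1 = 256
//   Window_Size                 = 2304 bytes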
/// The ID (if provided) of the dictionary required to decode this frame.
pub fn dictionary_id(&self) -> Option<u32> {
self.dict_id
}
/// Obtain the uncompressed size (in bytes) of the frame contents.
pub fn frame_content_size(&self) -> u64 {
self.frame_content_size
}
}
/// The first byte is called the `Frame Header Descriptor`, and it describes what other fields
/// are present.
pub struct FrameDescriptor(pub u8);
impl FrameDescriptor {
/// Read the `Frame_Content_Size_flag` from the frame header descriptor.
///
/// This is a 2 bit flag, specifying if the `Frame_Content_Size` field is present
/// within the header. It denotes the number of bytes used by `Frame_Content_Size`
///
/// When this value is 0, `FCS_Field_Size` depends on Single_Segment_flag.
/// If the `Single_Segment_flag` field is set in the frame header descriptor,
/// the size of the `Frame_Content_Size` field of the header is 1 byte.
/// Otherwise, `FCS_Field_Size` is 0, and the `Frame_Content_Size` is not provided.
///
/// | Flag Value (decimal) | Size of the `Frame_Content_Size` field in bytes |
/// | -- | -- |
/// | 0 | 0 or 1 (see above) |
/// | 1 | 2 |
/// | 2 | 4 |
/// | 3 | 8 |
pub fn frame_content_size_flag(&self) -> u8 {
self.0 >> 6
}
/// This bit is reserved for some future feature, a compliant decoder **must ensure**
/// that this value is set to zero.
#[expect(dead_code)]
pub fn reserved_flag(&self) -> bool {
((self.0 >> 3) & 0x1) == 1
}
/// If this flag is set, data must be regenerated within a single continuous memory segment.
///
/// In this case, the `Window_Descriptor` byte is skipped, but `Frame_Content_Size` is present.
/// The decoder must allocate a memory segment equal to or larger than `Frame_Content_Size`.
pub fn single_segment_flag(&self) -> bool {
((self.0 >> 5) & 0x1) == 1
}
/// If this flag is set, a 32 bit `Content_Checksum` will be present at the end of the frame.
pub fn content_checksum_flag(&self) -> bool {
((self.0 >> 2) & 0x1) == 1
}
/// This is a two bit flag telling whether a dictionary ID is provided within the header. It also
/// specifies the size of this field
///
/// | Value (Decimal) | `DID_Field_Size` (bytes) |
/// | -- | -- |
/// | 0 | 0 |
/// | 1 | 1 |
/// | 2 | 2 |
/// | 3 | 4 |
pub fn dict_id_flag(&self) -> u8 {
self.0 & 0x3
}
/// Read the size of the `Frame_Content_size` field from the frame header descriptor, returning
/// the size in bytes.
/// If this value is zero, then the `Frame_Content_Size` field is not present within the header.
pub fn frame_content_size_bytes(&self) -> Result<u8, FrameDescriptorError> {
match self.frame_content_size_flag() {
0 => {
if self.single_segment_flag() {
Ok(1)
} else {
Ok(0)
}
}
1 => Ok(2),
2 => Ok(4),
3 => Ok(8),
other => Err(FrameDescriptorError::InvalidFrameContentSizeFlag { got: other }),
}
}
/// Read the size of the `Dictionary_ID` field from the frame header descriptor, returning the size in bytes.
/// If this value is zero, then the dictionary id is not present within the header,
/// and "It's up to the decoder to know which dictionary to use."
pub fn dictionary_id_bytes(&self) -> Result<u8, FrameDescriptorError> {
match self.dict_id_flag() {
0 => Ok(0),
1 => Ok(1),
2 => Ok(2),
3 => Ok(4),
other => Err(FrameDescriptorError::InvalidFrameContentSizeFlag { got: other }),
}
}
}
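// Illustrative sketch of how the accessors above decompose one descriptor
// byte, e.g. 0b1110_0101:
//   frame_content_size_flag() == 3  -> frame_content_size_bytes() == Ok(8)
//   single_segment_flag()     == true
//   content_checksum_flag()   == true
//   dict_id_flag()            == 1  -> dictionary_id_bytes()      == Ok(1)
//   reserved_flag()           == false (must stay zero per the spec)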

597
vendor/ruzstd/src/decoding/frame_decoder.rs vendored Normal file
View File

@@ -0,0 +1,597 @@
//! FrameDecoder is the main low-level struct users interact with to decode zstd frames
//!
//! Zstandard compressed data is made of one or more frames. Each frame is independent and can be
//! decompressed independently of other frames. This module contains structures
//! and utilities that can be used to decode a frame.
use super::frame;
use crate::decoding;
use crate::decoding::dictionary::Dictionary;
use crate::decoding::errors::FrameDecoderError;
use crate::decoding::scratch::DecoderScratch;
use crate::io::{Error, Read, Write};
use alloc::collections::BTreeMap;
use alloc::vec::Vec;
use core::convert::TryInto;
/// While the maximum window size allowed by the spec is significantly larger,
/// our implementation limits it to 100 MB to protect against malformed frames.
const MAXIMUM_ALLOWED_WINDOW_SIZE: u64 = 1024 * 1024 * 100;
/// Low level Zstandard decoder that can be used to decompress frames with fine control over when and how many bytes are decoded.
///
/// This decoder is able to decode frames only partially and gives control
/// over how many bytes/blocks will be decoded at a time (so you don't have to decode a 10GB file into memory all at once).
/// It reads bytes as needed from a provided source and can be read from to collect partial results.
///
/// If you want to just read the whole frame with an `io::Read` without having to deal with manually calling [FrameDecoder::decode_blocks]
/// you can use the provided [crate::decoding::StreamingDecoder] which wraps this FrameDecoder.
///
/// Workflow is as follows:
/// ```
/// use ruzstd::decoding::BlockDecodingStrategy;
///
/// # #[cfg(feature = "std")]
/// use std::io::{Read, Write};
///
/// // no_std environments can use the crate's own Read traits
/// # #[cfg(not(feature = "std"))]
/// use ruzstd::io::{Read, Write};
///
/// fn decode_this(mut file: impl Read) {
/// //Create a new decoder
/// let mut frame_dec = ruzstd::decoding::FrameDecoder::new();
/// let mut result = Vec::new();
///
/// // Use reset or init to make the decoder ready to decode the frame from the io::Read
/// frame_dec.reset(&mut file).unwrap();
///
/// // Loop until the frame has been decoded completely
/// while !frame_dec.is_finished() {
/// // decode (roughly) batch_size many bytes
/// frame_dec.decode_blocks(&mut file, BlockDecodingStrategy::UptoBytes(1024)).unwrap();
///
/// // read from the decoder to collect bytes from the internal buffer
/// let bytes_read = frame_dec.read(result.as_mut_slice()).unwrap();
///
/// // then do something with it
/// do_something(&result[0..bytes_read]);
/// }
///
/// // handle the last chunk of data
/// while frame_dec.can_collect() > 0 {
/// let x = frame_dec.read(result.as_mut_slice()).unwrap();
///
/// do_something(&result[0..x]);
/// }
/// }
///
/// fn do_something(data: &[u8]) {
/// # #[cfg(feature = "std")]
/// std::io::stdout().write_all(data).unwrap();
/// }
/// ```
pub struct FrameDecoder {
state: Option<FrameDecoderState>,
dicts: BTreeMap<u32, Dictionary>,
}
struct FrameDecoderState {
pub frame_header: frame::FrameHeader,
decoder_scratch: DecoderScratch,
frame_finished: bool,
block_counter: usize,
bytes_read_counter: u64,
check_sum: Option<u32>,
using_dict: Option<u32>,
}
pub enum BlockDecodingStrategy {
All,
UptoBlocks(usize),
UptoBytes(usize),
}
impl FrameDecoderState {
pub fn new(source: impl Read) -> Result<FrameDecoderState, FrameDecoderError> {
let (frame, header_size) = frame::read_frame_header(source)?;
let window_size = frame.window_size()?;
Ok(FrameDecoderState {
frame_header: frame,
frame_finished: false,
block_counter: 0,
decoder_scratch: DecoderScratch::new(window_size as usize),
bytes_read_counter: u64::from(header_size),
check_sum: None,
using_dict: None,
})
}
pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> {
let (frame_header, header_size) = frame::read_frame_header(source)?;
let window_size = frame_header.window_size()?;
if window_size > MAXIMUM_ALLOWED_WINDOW_SIZE {
return Err(FrameDecoderError::WindowSizeTooBig {
requested: window_size,
});
}
self.frame_header = frame_header;
self.frame_finished = false;
self.block_counter = 0;
self.decoder_scratch.reset(window_size as usize);
self.bytes_read_counter = u64::from(header_size);
self.check_sum = None;
self.using_dict = None;
Ok(())
}
}
impl Default for FrameDecoder {
fn default() -> Self {
Self::new()
}
}
impl FrameDecoder {
/// This will create a new decoder without allocating anything yet.
/// init()/reset() will allocate all needed buffers if it is the first time this decoder is used,
/// otherwise they just reset these buffers with no further allocations
pub fn new() -> FrameDecoder {
FrameDecoder {
state: None,
dicts: BTreeMap::new(),
}
}
/// init() will allocate all needed buffers if it is the first time this decoder is used,
/// otherwise it just resets these buffers with no further allocations
///
/// Note that all bytes currently in the decodebuffer from any previous frame will be lost. Collect them with collect()/collect_to_writer()
///
/// equivalent to reset()
pub fn init(&mut self, source: impl Read) -> Result<(), FrameDecoderError> {
self.reset(source)
}
/// reset() will allocate all needed buffers if it is the first time this decoder is used,
/// otherwise it just resets these buffers with no further allocations
///
/// Note that all bytes currently in the decodebuffer from any previous frame will be lost. Collect them with collect()/collect_to_writer()
///
/// equivalent to init()
pub fn reset(&mut self, source: impl Read) -> Result<(), FrameDecoderError> {
use FrameDecoderError as err;
let state = match &mut self.state {
Some(s) => {
s.reset(source)?;
s
}
None => {
self.state = Some(FrameDecoderState::new(source)?);
self.state.as_mut().unwrap()
}
};
if let Some(dict_id) = state.frame_header.dictionary_id() {
let dict = self
.dicts
.get(&dict_id)
.ok_or(err::DictNotProvided { dict_id })?;
state.decoder_scratch.init_from_dict(dict);
state.using_dict = Some(dict_id);
}
Ok(())
}
/// Add a dict to the FrameDecoder that can be used when needed. The FrameDecoder uses the appropriate one dynamically
pub fn add_dict(&mut self, dict: Dictionary) -> Result<(), FrameDecoderError> {
self.dicts.insert(dict.id, dict);
Ok(())
}
pub fn force_dict(&mut self, dict_id: u32) -> Result<(), FrameDecoderError> {
use FrameDecoderError as err;
let Some(state) = self.state.as_mut() else {
return Err(err::NotYetInitialized);
};
let dict = self
.dicts
.get(&dict_id)
.ok_or(err::DictNotProvided { dict_id })?;
state.decoder_scratch.init_from_dict(dict);
state.using_dict = Some(dict_id);
Ok(())
}
/// Returns how many bytes the frame contains after decompression
pub fn content_size(&self) -> u64 {
match &self.state {
None => 0,
Some(s) => s.frame_header.frame_content_size(),
}
}
/// Returns the checksum that was read from the data. Only available after all bytes have been read. It is the last 4 bytes of a zstd-frame
pub fn get_checksum_from_data(&self) -> Option<u32> {
let state = match &self.state {
None => return None,
Some(s) => s,
};
state.check_sum
}
/// Returns the checksum that was calculated while decoding.
/// This is only a sensible value after all decoded bytes have been collected/read from the FrameDecoder
#[cfg(feature = "hash")]
pub fn get_calculated_checksum(&self) -> Option<u32> {
use core::hash::Hasher;
let state = match &self.state {
None => return None,
Some(s) => s,
};
let cksum_64bit = state.decoder_scratch.buffer.hash.finish();
//the content checksum is defined as the lower 32 bits of the XXH64 digest
Some(cksum_64bit as u32)
}
/// Counter for how many bytes have been consumed while decoding the frame
pub fn bytes_read_from_source(&self) -> u64 {
let state = match &self.state {
None => return 0,
Some(s) => s,
};
state.bytes_read_counter
}
/// Whether the current frame's last block has been decoded yet
/// If this returns true you can call the drain* functions to get all content
/// (the read() function will drain automatically if this returns true)
pub fn is_finished(&self) -> bool {
let state = match &self.state {
None => return true,
Some(s) => s,
};
if state.frame_header.descriptor.content_checksum_flag() {
state.frame_finished && state.check_sum.is_some()
} else {
state.frame_finished
}
}
/// Counter for how many blocks have already been decoded
pub fn blocks_decoded(&self) -> usize {
let state = match &self.state {
None => return 0,
Some(s) => s,
};
state.block_counter
}
/// Decodes blocks from a reader. It requires that the FrameDecoder has been initialized first.
/// The strategy influences how many blocks will be decoded before the function returns.
/// This is important if you want to manage memory consumption carefully. If you don't care
/// about that you can just choose the strategy "All" and have all blocks of the frame decoded into the buffer
pub fn decode_blocks(
&mut self,
mut source: impl Read,
strat: BlockDecodingStrategy,
) -> Result<bool, FrameDecoderError> {
use FrameDecoderError as err;
let state = self.state.as_mut().ok_or(err::NotYetInitialized)?;
let mut block_dec = decoding::block_decoder::new();
let buffer_size_before = state.decoder_scratch.buffer.len();
let block_counter_before = state.block_counter;
loop {
vprintln!("################");
vprintln!("Next Block: {}", state.block_counter);
vprintln!("################");
let (block_header, block_header_size) = block_dec
.read_block_header(&mut source)
.map_err(err::FailedToReadBlockHeader)?;
state.bytes_read_counter += u64::from(block_header_size);
vprintln!();
vprintln!(
"Found {} block with size: {}, which will be of size: {}",
block_header.block_type,
block_header.content_size,
block_header.decompressed_size
);
let bytes_read_in_block_body = block_dec
.decode_block_content(&block_header, &mut state.decoder_scratch, &mut source)
.map_err(err::FailedToReadBlockBody)?;
state.bytes_read_counter += bytes_read_in_block_body;
state.block_counter += 1;
vprintln!("Output: {}", state.decoder_scratch.buffer.len());
if block_header.last_block {
state.frame_finished = true;
if state.frame_header.descriptor.content_checksum_flag() {
let mut chksum = [0u8; 4];
source
.read_exact(&mut chksum)
.map_err(err::FailedToReadChecksum)?;
state.bytes_read_counter += 4;
let chksum = u32::from_le_bytes(chksum);
state.check_sum = Some(chksum);
}
break;
}
match strat {
BlockDecodingStrategy::All => { /* keep going */ }
BlockDecodingStrategy::UptoBlocks(n) => {
if state.block_counter - block_counter_before >= n {
break;
}
}
BlockDecodingStrategy::UptoBytes(n) => {
if state.decoder_scratch.buffer.len() - buffer_size_before >= n {
break;
}
}
}
}
Ok(state.frame_finished)
}
/// Collect bytes and retain window_size bytes while decoding is still going on.
/// Once decoding of the frame has finished (is_finished() == true) it will collect all remaining bytes
pub fn collect(&mut self) -> Option<Vec<u8>> {
let finished = self.is_finished();
let state = self.state.as_mut()?;
if finished {
Some(state.decoder_scratch.buffer.drain())
} else {
state.decoder_scratch.buffer.drain_to_window_size()
}
}
/// Collect bytes and retain window_size bytes while decoding is still going on.
/// Once decoding of the frame has finished (is_finished() == true) it will collect all remaining bytes
pub fn collect_to_writer(&mut self, w: impl Write) -> Result<usize, Error> {
let finished = self.is_finished();
let state = match &mut self.state {
None => return Ok(0),
Some(s) => s,
};
if finished {
state.decoder_scratch.buffer.drain_to_writer(w)
} else {
state.decoder_scratch.buffer.drain_to_window_size_writer(w)
}
}
/// How many bytes can currently be collected from the decodebuffer. While decoding is going on this will be lower than the actual decodebuffer size
/// because window_size bytes need to be retained for decoding.
/// Once decoding of the frame has finished (is_finished() == true) it will report all remaining bytes
pub fn can_collect(&self) -> usize {
let finished = self.is_finished();
let state = match &self.state {
None => return 0,
Some(s) => s,
};
if finished {
state.decoder_scratch.buffer.can_drain()
} else {
state
.decoder_scratch
.buffer
.can_drain_to_window_size()
.unwrap_or(0)
}
}
/// Decodes as many blocks as possible from the source slice and reads from the decodebuffer into the target slice
/// The source slice may contain only parts of a frame but must contain at least one full block to make progress
///
/// By all means use decode_blocks if you have an io::Read available. This is just for compatibility with other decompressors
/// which try to serve an old-style C API
///
/// Returns (read, written), if read == 0 then the source did not contain a full block and further calls with the same
/// input will not make any progress!
///
/// Note that no kind of block can be bigger than 128kb.
/// So to be safe use at least 128*1024 (max block content size) + 3 (block_header size) + 18 (max frame_header size) bytes as your source buffer
///
/// You may call this function with an empty source after all bytes have been decoded. This is equivalent to just calling decoder.read(&mut target)
pub fn decode_from_to(
&mut self,
source: &[u8],
target: &mut [u8],
) -> Result<(usize, usize), FrameDecoderError> {
use FrameDecoderError as err;
let bytes_read_at_start = match &self.state {
Some(s) => s.bytes_read_counter,
None => 0,
};
if !self.is_finished() || self.state.is_none() {
let mut mt_source = source;
if self.state.is_none() {
self.init(&mut mt_source)?;
}
//pseudo block to scope "state" so we can borrow self again after the block
{
let state = match &mut self.state {
Some(s) => s,
None => panic!("Bug in library"),
};
let mut block_dec = decoding::block_decoder::new();
if state.frame_header.descriptor.content_checksum_flag()
&& state.frame_finished
&& state.check_sum.is_none()
{
//this block is needed if the checksum was the only 4 bytes that were not included in the last decode_from_to call for a frame
if mt_source.len() >= 4 {
let chksum = mt_source[..4].try_into().expect("optimized away");
state.bytes_read_counter += 4;
let chksum = u32::from_le_bytes(chksum);
state.check_sum = Some(chksum);
}
return Ok((4, 0));
}
loop {
//check if there are enough bytes for the next header
if mt_source.len() < 3 {
break;
}
let (block_header, block_header_size) = block_dec
.read_block_header(&mut mt_source)
.map_err(err::FailedToReadBlockHeader)?;
// check the needed size for the block before updating counters.
// If not enough bytes are in the source, the header will have to be read again, so act like we never read it in the first place
if mt_source.len() < block_header.content_size as usize {
break;
}
state.bytes_read_counter += u64::from(block_header_size);
let bytes_read_in_block_body = block_dec
.decode_block_content(
&block_header,
&mut state.decoder_scratch,
&mut mt_source,
)
.map_err(err::FailedToReadBlockBody)?;
state.bytes_read_counter += bytes_read_in_block_body;
state.block_counter += 1;
if block_header.last_block {
state.frame_finished = true;
if state.frame_header.descriptor.content_checksum_flag() {
//if there are enough bytes handle this here. Else the block at the start of this function will handle it at the next call
if mt_source.len() >= 4 {
let chksum = mt_source[..4].try_into().expect("optimized away");
state.bytes_read_counter += 4;
let chksum = u32::from_le_bytes(chksum);
state.check_sum = Some(chksum);
}
}
break;
}
}
}
}
let result_len = self.read(target).map_err(err::FailedToDrainDecodebuffer)?;
let bytes_read_at_end = match &mut self.state {
Some(s) => s.bytes_read_counter,
None => panic!("Bug in library"),
};
let read_len = bytes_read_at_end - bytes_read_at_start;
Ok((read_len as usize, result_len))
}
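// Sizing sketch for the slice-based path above (derived from the doc comment;
// the constant name is illustrative):
//
//     const SRC_BUF_SIZE: usize = 128 * 1024 + 3 + 18;
//
// Feeding decode_from_to chunks of at least this size guarantees the source
// always holds a complete block, so every call makes progress.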
/// Decode multiple frames into the output slice.
///
/// `input` must contain an exact number of frames.
///
/// `output` must be large enough to hold the decompressed data. If you don't know
/// how large the output will be, use [`FrameDecoder::decode_blocks`] instead.
///
/// This calls [`FrameDecoder::init`], and all bytes currently in the decoder will be lost.
///
/// Returns the number of bytes written to `output`.
pub fn decode_all(
&mut self,
mut input: &[u8],
mut output: &mut [u8],
) -> Result<usize, FrameDecoderError> {
let mut total_bytes_written = 0;
while !input.is_empty() {
match self.init(&mut input) {
Ok(_) => {}
Err(FrameDecoderError::ReadFrameHeaderError(
crate::decoding::errors::ReadFrameHeaderError::SkipFrame { length, .. },
)) => {
input = input
.get(length as usize..)
.ok_or(FrameDecoderError::FailedToSkipFrame)?;
continue;
}
Err(e) => return Err(e),
};
loop {
self.decode_blocks(&mut input, BlockDecodingStrategy::UptoBytes(1024 * 1024))?;
let bytes_written = self
.read(output)
.map_err(FrameDecoderError::FailedToDrainDecodebuffer)?;
output = &mut output[bytes_written..];
total_bytes_written += bytes_written;
if self.can_collect() != 0 {
return Err(FrameDecoderError::TargetTooSmall);
}
if self.is_finished() {
break;
}
}
}
Ok(total_bytes_written)
}
/// Decode multiple frames into the extra capacity of the output vector.
///
/// `input` must contain an exact number of frames.
///
/// `output` must have enough extra capacity to hold the decompressed data.
/// This function will not reallocate or grow the vector. If you don't know
/// how large the output will be, use [`FrameDecoder::decode_blocks`] instead.
///
/// This calls [`FrameDecoder::init`], and all bytes currently in the decoder will be lost.
///
/// The length of the output vector is updated to include the decompressed data.
/// The length is not changed if an error occurs.
pub fn decode_all_to_vec(
&mut self,
input: &[u8],
output: &mut Vec<u8>,
) -> Result<(), FrameDecoderError> {
let len = output.len();
let cap = output.capacity();
output.resize(cap, 0);
match self.decode_all(input, &mut output[len..]) {
Ok(bytes_written) => {
let new_len = core::cmp::min(len + bytes_written, cap); // Sanitizes `bytes_written`.
output.resize(new_len, 0);
Ok(())
}
Err(e) => {
output.resize(len, 0);
Err(e)
}
}
}
}
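// Usage sketch for the convenience path above (variable names illustrative):
//
//     let mut decoder = FrameDecoder::new();
//     let mut out: Vec<u8> = Vec::with_capacity(expected_size);
//     decoder.decode_all_to_vec(&compressed, &mut out)?;
//
// Note that decode_all_to_vec only fills the vector's spare capacity; if
// `expected_size` is too small it fails with TargetTooSmall instead of growing.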
/// Read bytes from the decode_buffer that are no longer needed. While the frame is not yet finished
/// this will retain window_size bytes, otherwise it will drain the buffer completely
impl Read for FrameDecoder {
fn read(&mut self, target: &mut [u8]) -> Result<usize, Error> {
let state = match &mut self.state {
None => return Ok(0),
Some(s) => s,
};
if state.frame_finished {
state.decoder_scratch.buffer.read_all(target)
} else {
state.decoder_scratch.buffer.read(target)
}
}
}

159
vendor/ruzstd/src/decoding/literals_section_decoder.rs vendored Normal file
View File

@@ -0,0 +1,159 @@
//! This module contains the decompress_literals function, used to take a
//! parsed literals header and a source and decompress it.
use super::super::blocks::literals_section::{LiteralsSection, LiteralsSectionType};
use super::scratch::HuffmanScratch;
use crate::bit_io::BitReaderReversed;
use crate::decoding::errors::DecompressLiteralsError;
use crate::huff0::HuffmanDecoder;
use alloc::vec::Vec;
/// Decode and decompress the provided literals section into `target`, returning the number of bytes read.
pub fn decode_literals(
section: &LiteralsSection,
scratch: &mut HuffmanScratch,
source: &[u8],
target: &mut Vec<u8>,
) -> Result<u32, DecompressLiteralsError> {
match section.ls_type {
LiteralsSectionType::Raw => {
target.extend(&source[0..section.regenerated_size as usize]);
Ok(section.regenerated_size)
}
LiteralsSectionType::RLE => {
target.resize(target.len() + section.regenerated_size as usize, source[0]);
Ok(1)
}
LiteralsSectionType::Compressed | LiteralsSectionType::Treeless => {
let bytes_read = decompress_literals(section, scratch, source, target)?;
//return sum of used bytes
Ok(bytes_read)
}
}
}
/// Decompress the provided literals section and source into the provided `target`.
/// This function is used when the literals section is `Compressed` or `Treeless`
///
/// Returns the number of bytes read.
fn decompress_literals(
section: &LiteralsSection,
scratch: &mut HuffmanScratch,
source: &[u8],
target: &mut Vec<u8>,
) -> Result<u32, DecompressLiteralsError> {
use DecompressLiteralsError as err;
let compressed_size = section.compressed_size.ok_or(err::MissingCompressedSize)? as usize;
let num_streams = section.num_streams.ok_or(err::MissingNumStreams)?;
target.reserve(section.regenerated_size as usize);
let source = &source[0..compressed_size];
let mut bytes_read = 0;
match section.ls_type {
LiteralsSectionType::Compressed => {
//read Huffman tree description
bytes_read += scratch.table.build_decoder(source)?;
vprintln!("Built huffman table using {} bytes", bytes_read);
}
LiteralsSectionType::Treeless => {
if scratch.table.max_num_bits == 0 {
return Err(err::UninitializedHuffmanTable);
}
}
_ => { /* nothing to do, huffman tree has been provided by previous block */ }
}
let source = &source[bytes_read as usize..];
if num_streams == 4 {
//build jumptable
if source.len() < 6 {
return Err(err::MissingBytesForJumpHeader { got: source.len() });
}
let jump1 = source[0] as usize + ((source[1] as usize) << 8);
let jump2 = jump1 + source[2] as usize + ((source[3] as usize) << 8);
let jump3 = jump2 + source[4] as usize + ((source[5] as usize) << 8);
bytes_read += 6;
let source = &source[6..];
if source.len() < jump3 {
return Err(err::MissingBytesForLiterals {
got: source.len(),
needed: jump3,
});
}
//decode 4 streams
let stream1 = &source[..jump1];
let stream2 = &source[jump1..jump2];
let stream3 = &source[jump2..jump3];
let stream4 = &source[jump3..];
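// Worked example for the jump table above (illustrative numbers): header bytes
// [0x10, 0x00, 0x20, 0x00, 0x30, 0x00] yield
//   jump1 = 16, jump2 = 16 + 32 = 48, jump3 = 48 + 48 = 96
// so stream1 = source[..16], stream2 = source[16..48],
// stream3 = source[48..96] and stream4 = source[96..].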
for stream in &[stream1, stream2, stream3, stream4] {
let mut decoder = HuffmanDecoder::new(&scratch.table);
let mut br = BitReaderReversed::new(stream);
//skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found
let mut skipped_bits = 0;
loop {
let val = br.get_bits(1);
skipped_bits += 1;
if val == 1 || skipped_bits > 8 {
break;
}
}
if skipped_bits > 8 {
//if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
return Err(DecompressLiteralsError::ExtraPadding { skipped_bits });
}
decoder.init_state(&mut br);
while br.bits_remaining() > -(scratch.table.max_num_bits as isize) {
target.push(decoder.decode_symbol());
decoder.next_state(&mut br);
}
if br.bits_remaining() != -(scratch.table.max_num_bits as isize) {
return Err(DecompressLiteralsError::BitstreamReadMismatch {
read_til: br.bits_remaining(),
expected: -(scratch.table.max_num_bits as isize),
});
}
}
bytes_read += source.len() as u32;
} else {
//just decode the one stream
assert!(num_streams == 1);
let mut decoder = HuffmanDecoder::new(&scratch.table);
let mut br = BitReaderReversed::new(source);
let mut skipped_bits = 0;
loop {
let val = br.get_bits(1);
skipped_bits += 1;
if val == 1 || skipped_bits > 8 {
break;
}
}
if skipped_bits > 8 {
//if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
return Err(DecompressLiteralsError::ExtraPadding { skipped_bits });
}
decoder.init_state(&mut br);
while br.bits_remaining() > -(scratch.table.max_num_bits as isize) {
target.push(decoder.decode_symbol());
decoder.next_state(&mut br);
}
bytes_read += source.len() as u32;
}
if target.len() != section.regenerated_size as usize {
return Err(DecompressLiteralsError::DecodedLiteralCountMismatch {
decoded: target.len(),
expected: section.regenerated_size as usize,
});
}
Ok(bytes_read)
}

19
vendor/ruzstd/src/decoding/mod.rs vendored Normal file
View File

@@ -0,0 +1,19 @@
//! Structures and utilities used for decoding zstd formatted data
pub mod errors;
mod frame_decoder;
mod streaming_decoder;
pub use frame_decoder::{BlockDecodingStrategy, FrameDecoder};
pub use streaming_decoder::StreamingDecoder;
pub(crate) mod block_decoder;
pub(crate) mod decode_buffer;
pub(crate) mod dictionary;
pub(crate) mod frame;
pub(crate) mod literals_section_decoder;
mod ringbuffer;
#[allow(dead_code)]
pub(crate) mod scratch;
pub(crate) mod sequence_execution;
pub(crate) mod sequence_section_decoder;

887
vendor/ruzstd/src/decoding/ringbuffer.rs vendored Normal file
View File

@@ -0,0 +1,887 @@
use alloc::alloc::{alloc, dealloc};
use core::{alloc::Layout, ptr::NonNull, slice};
pub struct RingBuffer {
// Safety invariants:
//
// 1.
// a.`buf` must be a valid allocation of capacity `cap`
// b. ...unless `cap=0`, in which case it is dangling
// 2. If tail≥head
// a. `head..tail` must contain initialized memory.
// b. Else, `head..` and `..tail` must be initialized
// 3. `head` and `tail` are in bounds (≥ 0 and < cap)
// 4. `tail` is never `cap` except for a full buffer, and instead uses the value `0`. In other words, `tail` always points to the place
// where the next element would go (if there is space)
buf: NonNull<u8>,
cap: usize,
head: usize,
tail: usize,
}
// SAFETY: RingBuffer does not hold any thread specific values -> it can be sent to another thread -> RingBuffer is Send
unsafe impl Send for RingBuffer {}
// SAFETY: RingBuffer does not provide unsynchronized interior mutability which makes &RingBuffer Send -> RingBuffer is Sync
unsafe impl Sync for RingBuffer {}
impl RingBuffer {
pub fn new() -> Self {
RingBuffer {
// SAFETY: Upholds invariant 1a as stated
buf: NonNull::dangling(),
cap: 0,
// SAFETY: Upholds invariant 2-4
head: 0,
tail: 0,
}
}
/// Return the number of bytes in the buffer.
pub fn len(&self) -> usize {
let (x, y) = self.data_slice_lengths();
x + y
}
/// Return the amount of available space (in bytes) of the buffer.
pub fn free(&self) -> usize {
let (x, y) = self.free_slice_lengths();
(x + y).saturating_sub(1)
}
/// Empty the buffer and reset the head and tail.
pub fn clear(&mut self) {
// SAFETY: Upholds invariant 2, trivially
// SAFETY: Upholds invariant 3; 0 is always valid
self.head = 0;
self.tail = 0;
}
/// Ensure that there's space for `amount` elements in the buffer.
pub fn reserve(&mut self, amount: usize) {
let free = self.free();
if free >= amount {
return;
}
self.reserve_amortized(amount - free);
}
#[inline(never)]
#[cold]
fn reserve_amortized(&mut self, amount: usize) {
// SAFETY: if we were successfully able to construct this layout when we allocated then it's also valid to do so now
let current_layout = unsafe { Layout::array::<u8>(self.cap).unwrap_unchecked() };
// Always have at least 1 unused element as the sentinel.
let new_cap = usize::max(
self.cap.next_power_of_two(),
(self.cap + amount).next_power_of_two(),
) + 1;
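// e.g. growing an empty buffer by amount = 10:
//   max(0usize.next_power_of_two(), 10usize.next_power_of_two()) + 1
//   = max(1, 16) + 1 = 17 (the extra byte is the sentinel slot)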
// Check that the capacity isn't bigger than isize::MAX, which is the max allowed by LLVM, or that
// we are on a >= 64 bit system which will never allow that much memory to be allocated
#[allow(clippy::assertions_on_constants)]
{
debug_assert!(usize::BITS >= 64 || new_cap < isize::MAX as usize);
}
let new_layout = Layout::array::<u8>(new_cap)
.unwrap_or_else(|_| panic!("Could not create layout for u8 array of size {}", new_cap));
// alloc the new memory region and panic if alloc fails
// TODO maybe rework this to generate an error?
let new_buf = unsafe {
let new_buf = alloc(new_layout);
NonNull::new(new_buf).expect("Allocating new space for the ringbuffer failed")
};
// If we had data before, copy it over to the newly alloced memory region
if self.cap > 0 {
let ((s1_ptr, s1_len), (s2_ptr, s2_len)) = self.data_slice_parts();
unsafe {
// SAFETY: Upholds invariant 2, we end up populating (0..(len₁ + len₂))
new_buf.as_ptr().copy_from_nonoverlapping(s1_ptr, s1_len);
new_buf
.as_ptr()
.add(s1_len)
.copy_from_nonoverlapping(s2_ptr, s2_len);
dealloc(self.buf.as_ptr(), current_layout);
}
// SAFETY: Upholds invariant 3, head is 0 and in bounds, tail is only ever `cap` if the buffer
// is entirely full
self.tail = s1_len + s2_len;
self.head = 0;
}
// SAFETY: Upholds invariant 1: the buffer was just allocated correctly
self.buf = new_buf;
self.cap = new_cap;
}
#[allow(dead_code)]
pub fn push_back(&mut self, byte: u8) {
self.reserve(1);
// SAFETY: Upholds invariant 2 by writing initialized memory
unsafe { self.buf.as_ptr().add(self.tail).write(byte) };
// SAFETY: Upholds invariant 3 by wrapping `tail` around
self.tail = (self.tail + 1) % self.cap;
}
/// Fetch the byte stored at the selected index from the buffer, returning it, or
/// `None` if the index is out of bounds.
#[allow(dead_code)]
pub fn get(&self, idx: usize) -> Option<u8> {
if idx < self.len() {
// SAFETY: Establishes invariants on memory being initialized and the range being in-bounds
// (Invariants 2 & 3)
let idx = (self.head + idx) % self.cap;
Some(unsafe { self.buf.as_ptr().add(idx).read() })
} else {
None
}
}
/// Append the provided data to the end of `self`.
pub fn extend(&mut self, data: &[u8]) {
let len = data.len();
let ptr = data.as_ptr();
if len == 0 {
return;
}
self.reserve(len);
debug_assert!(self.len() + len <= self.cap - 1);
debug_assert!(self.free() >= len, "free: {} len: {}", self.free(), len);
let ((f1_ptr, f1_len), (f2_ptr, f2_len)) = self.free_slice_parts();
debug_assert!(f1_len + f2_len >= len, "{} + {} < {}", f1_len, f2_len, len);
let in_f1 = usize::min(len, f1_len);
let in_f2 = len - in_f1;
debug_assert!(in_f1 + in_f2 == len);
unsafe {
// SAFETY: `in_f₁ + in_f₂ = len`, so this writes `len` bytes total
// upholding invariant 2
if in_f1 > 0 {
f1_ptr.copy_from_nonoverlapping(ptr, in_f1);
}
if in_f2 > 0 {
f2_ptr.copy_from_nonoverlapping(ptr.add(in_f1), in_f2);
}
}
// SAFETY: Upholds invariant 3 by wrapping `tail` around.
self.tail = (self.tail + len) % self.cap;
}
/// Advance head past `amount` elements, effectively removing
/// them from the buffer.
pub fn drop_first_n(&mut self, amount: usize) {
debug_assert!(amount <= self.len());
let amount = usize::min(amount, self.len());
// SAFETY: we maintain invariant 2 here since this will always lead to a smaller buffer
// for amount≤len
self.head = (self.head + amount) % self.cap;
}
/// Return the size of the two contiguous occupied sections of memory used
/// by the buffer.
// SAFETY: other code relies on this pointing to initialized halves of the buffer only
fn data_slice_lengths(&self) -> (usize, usize) {
let len_after_head;
let len_to_tail;
// TODO can we do this branchless?
if self.tail >= self.head {
len_after_head = self.tail - self.head;
len_to_tail = 0;
} else {
len_after_head = self.cap - self.head;
len_to_tail = self.tail;
}
(len_after_head, len_to_tail)
}
// SAFETY: other code relies on this pointing to initialized halves of the buffer only
/// Return pointers to the head and tail, and the length of each section.
fn data_slice_parts(&self) -> ((*const u8, usize), (*const u8, usize)) {
let (len_after_head, len_to_tail) = self.data_slice_lengths();
(
(unsafe { self.buf.as_ptr().add(self.head) }, len_after_head),
(self.buf.as_ptr(), len_to_tail),
)
}
/// Return references to each part of the ring buffer.
pub fn as_slices(&self) -> (&[u8], &[u8]) {
let (s1, s2) = self.data_slice_parts();
unsafe {
// SAFETY: relies on the behavior of data_slice_parts for producing initialized memory
let s1 = slice::from_raw_parts(s1.0, s1.1);
let s2 = slice::from_raw_parts(s2.0, s2.1);
(s1, s2)
}
}
// SAFETY: other code relies on this producing the lengths of free zones
// at the beginning/end of the buffer. Everything else must be initialized
/// Returns the size of the two unoccupied sections of memory used by the buffer.
fn free_slice_lengths(&self) -> (usize, usize) {
let len_to_head;
let len_after_tail;
// TODO can we do this branchless?
if self.tail < self.head {
len_after_tail = self.head - self.tail;
len_to_head = 0;
} else {
len_after_tail = self.cap - self.tail;
len_to_head = self.head;
}
(len_to_head, len_after_tail)
}
/// Returns mutable references to the available space and the size of that available space,
/// for the two sections in the buffer.
// SAFETY: Other code relies on this pointing to the free zones, data after the first and before the second must
// be valid
fn free_slice_parts(&self) -> ((*mut u8, usize), (*mut u8, usize)) {
let (len_to_head, len_after_tail) = self.free_slice_lengths();
(
(unsafe { self.buf.as_ptr().add(self.tail) }, len_after_tail),
(self.buf.as_ptr(), len_to_head),
)
}
/// Copies elements from the provided range to the end of the buffer.
#[allow(dead_code)]
pub fn extend_from_within(&mut self, start: usize, len: usize) {
if start + len > self.len() {
panic!(
"Calls to this functions must respect start ({}) + len ({}) <= self.len() ({})!",
start,
len,
self.len()
);
}
self.reserve(len);
// SAFETY: Requirements checked:
// 1. explicitly checked above, resulting in a panic if it does not hold
// 2. explicitly reserved enough memory
unsafe { self.extend_from_within_unchecked(start, len) }
}
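// Usage sketch: with readable contents [a, b, c, d], extend_from_within(1, 2)
// appends the bytes at logical indices 1..3, yielding [a, b, c, d, b, c].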
/// Copies data from the provided range to the end of the buffer, without
/// first verifying that the unoccupied capacity is available.
///
/// SAFETY:
/// For this to be safe two requirements need to hold:
/// 1. start + len <= self.len() so we do not copy uninitialised memory
/// 2. More than len reserved space so we do not write out-of-bounds
#[warn(unsafe_op_in_unsafe_fn)]
pub unsafe fn extend_from_within_unchecked(&mut self, start: usize, len: usize) {
debug_assert!(start + len <= self.len());
debug_assert!(self.free() >= len);
if self.head < self.tail {
// Continuous source section and possibly non continuous write section:
//
// H T
// Read: ____XXXXSSSSXXXX________
// Write: ________________DDDD____
//
// H: Head position (first readable byte)
// T: Tail position (first writable byte)
// X: Uninvolved bytes in the readable section
// S: Source bytes, to be copied to D bytes
// D: Destination bytes, going to be copied from S bytes
// _: Uninvolved bytes in the writable section
let after_tail = usize::min(len, self.cap - self.tail);
let src = (
// SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
unsafe { self.buf.as_ptr().add(self.head + start) }.cast_const(),
// Src length (see above diagram)
self.tail - self.head - start,
);
let dst = (
// SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
unsafe { self.buf.as_ptr().add(self.tail) },
// Dst length (see above diagram)
self.cap - self.tail,
);
// SAFETY: `src` points at initialized data, `dst` points to writable memory
// and does not overlap `src`.
unsafe { copy_bytes_overshooting(src, dst, after_tail) }
if after_tail < len {
// The write section was not continuous:
//
// H T
// Read: ____XXXXSSSSXXXX__
// Write: DD______________DD
//
// H: Head position (first readable byte)
// T: Tail position (first writable byte)
// X: Uninvolved bytes in the readable section
// S: Source bytes, to be copied to D bytes
// D: Destination bytes, going to be copied from S bytes
// _: Uninvolved bytes in the writable section
let src = (
// SAFETY: we are still within the memory range of `buf`
unsafe { src.0.add(after_tail) },
// Src length (see above diagram)
src.1 - after_tail,
);
let dst = (
self.buf.as_ptr(),
// Dst length overflowing (see above diagram)
self.head,
);
// SAFETY: `src` points at initialized data, `dst` points to writable memory
// and does not overlap `src`.
unsafe { copy_bytes_overshooting(src, dst, len - after_tail) }
}
} else {
#[allow(clippy::collapsible_else_if)]
if self.head + start > self.cap {
// Continuous read section and destination section:
//
// T H
// Read: XXSSSSXXXX____________XX
// Write: __________DDDD__________
//
// H: Head position (first readable byte)
// T: Tail position (first writable byte)
// X: Uninvolved bytes in the readable section
// S: Source bytes, to be copied to D bytes
// D: Destination bytes, going to be copied from S bytes
// _: Uninvolved bytes in the writable section
let start = (self.head + start) % self.cap;
let src = (
// SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
unsafe { self.buf.as_ptr().add(start) }.cast_const(),
// Src length (see above diagram)
self.tail - start,
);
let dst = (
// SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
unsafe { self.buf.as_ptr().add(self.tail) },
// Dst length (see above diagram)
self.head - self.tail,
);
// SAFETY: `src` points at initialized data, `dst` points to writable memory
// and does not overlap `src`.
unsafe { copy_bytes_overshooting(src, dst, len) }
} else {
// Possibly non continuous read section and continuous destination section:
//
// T H
// Read: XXXX____________XXSSSSXX
// Write: ____DDDD________________
//
// H: Head position (first readable byte)
// T: Tail position (first writable byte)
// X: Uninvolved bytes in the readable section
// S: Source bytes, to be copied to D bytes
// D: Destination bytes, going to be copied from S bytes
// _: Uninvolved bytes in the writable section
let after_start = usize::min(len, self.cap - self.head - start);
let src = (
// SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
unsafe { self.buf.as_ptr().add(self.head + start) }.cast_const(),
// Src length - chunk 1 (see above diagram on the right)
self.cap - self.head - start,
);
let dst = (
// SAFETY: `len <= isize::MAX` and fits the memory range of `buf`
unsafe { self.buf.as_ptr().add(self.tail) },
// Dst length (see above diagram)
self.head - self.tail,
);
// SAFETY: `src` points at initialized data, `dst` points to writable memory
// and does not overlap `src`.
unsafe { copy_bytes_overshooting(src, dst, after_start) }
if after_start < len {
// The read section was not continuous:
//
// T H
// Read: SSXXXXXX____________XXSS
// Write: ________DDDD____________
//
// H: Head position (first readable byte)
// T: Tail position (first writable byte)
// X: Uninvolved bytes in the readable section
// S: Source bytes, to be copied to D bytes
// D: Destination bytes, going to be copied from S bytes
// _: Uninvolved bytes in the writable section
let src = (
self.buf.as_ptr().cast_const(),
// Src length - chunk 2 (see above diagram on the left)
self.tail,
);
let dst = (
// SAFETY: we are still within the memory range of `buf`
unsafe { dst.0.add(after_start) },
// Dst length (see above diagram)
dst.1 - after_start,
);
// SAFETY: `src` points at initialized data, `dst` points to writable memory
// and does not overlap `src`.
unsafe { copy_bytes_overshooting(src, dst, len - after_start) }
}
}
}
self.tail = (self.tail + len) % self.cap;
}
#[allow(dead_code)]
/// This function is functionally the same as [RingBuffer::extend_from_within_unchecked],
/// but it does not contain any branching operations.
///
/// SAFETY:
/// Needs start + len <= self.len()
/// And more than len reserved space
pub unsafe fn extend_from_within_unchecked_branchless(&mut self, start: usize, len: usize) {
// data slices in raw parts
let ((s1_ptr, s1_len), (s2_ptr, s2_len)) = self.data_slice_parts();
debug_assert!(len <= s1_len + s2_len, "{} > {} + {}", len, s1_len, s2_len);
// calc the actually wanted slices in raw parts
let start_in_s1 = usize::min(s1_len, start);
let end_in_s1 = usize::min(s1_len, start + len);
let m1_ptr = s1_ptr.add(start_in_s1);
let m1_len = end_in_s1 - start_in_s1;
debug_assert!(end_in_s1 <= s1_len);
debug_assert!(start_in_s1 <= s1_len);
let start_in_s2 = start.saturating_sub(s1_len);
let end_in_s2 = start_in_s2 + (len - m1_len);
let m2_ptr = s2_ptr.add(start_in_s2);
let m2_len = end_in_s2 - start_in_s2;
debug_assert!(start_in_s2 <= s2_len);
debug_assert!(end_in_s2 <= s2_len);
debug_assert_eq!(len, m1_len + m2_len);
// the free slices, must hold: f1_len + f2_len >= m1_len + m2_len
let ((f1_ptr, f1_len), (f2_ptr, f2_len)) = self.free_slice_parts();
debug_assert!(f1_len + f2_len >= m1_len + m2_len);
// calc how many from where bytes go where
let m1_in_f1 = usize::min(m1_len, f1_len);
let m1_in_f2 = m1_len - m1_in_f1;
let m2_in_f1 = usize::min(f1_len - m1_in_f1, m2_len);
let m2_in_f2 = m2_len - m2_in_f1;
debug_assert_eq!(m1_len, m1_in_f1 + m1_in_f2);
debug_assert_eq!(m2_len, m2_in_f1 + m2_in_f2);
debug_assert!(f1_len >= m1_in_f1 + m2_in_f1);
debug_assert!(f2_len >= m1_in_f2 + m2_in_f2);
debug_assert_eq!(len, m1_in_f1 + m2_in_f1 + m1_in_f2 + m2_in_f2);
debug_assert!(self.buf.as_ptr().add(self.cap) > f1_ptr.add(m1_in_f1 + m2_in_f1));
debug_assert!(self.buf.as_ptr().add(self.cap) > f2_ptr.add(m1_in_f2 + m2_in_f2));
debug_assert!((m1_in_f2 > 0) ^ (m2_in_f1 > 0) || (m1_in_f2 == 0 && m2_in_f1 == 0));
copy_with_checks(
m1_ptr, m2_ptr, f1_ptr, f2_ptr, m1_in_f1, m2_in_f1, m1_in_f2, m2_in_f2,
);
self.tail = (self.tail + len) % self.cap;
}
}
impl Drop for RingBuffer {
fn drop(&mut self) {
if self.cap == 0 {
return;
}
// SAFETY: if we were successfully able to construct this layout when we allocated then it's also valid to do so now
// Relies on / establishes invariant 1
let current_layout = unsafe { Layout::array::<u8>(self.cap).unwrap_unchecked() };
unsafe {
dealloc(self.buf.as_ptr(), current_layout);
}
}
}
/// Similar to ptr::copy_nonoverlapping
///
/// But it might overshoot the desired copy length if deemed useful
///
/// src and dst specify the entire length they are eligible for reading/writing respectively
/// in addition to the desired copy length.
///
/// This function will then copy in chunks and may copy up to chunk size - 1 more bytes from src to dst,
/// provided that doing so never reads or writes memory outside of src/dst.
///
/// The chunk size is not part of the contract and may change depending on the target platform.
///
/// If that isn't possible we just fall back to ptr::copy_nonoverlapping
#[inline(always)]
unsafe fn copy_bytes_overshooting(
src: (*const u8, usize),
dst: (*mut u8, usize),
copy_at_least: usize,
) {
// By default use usize as the copy size
#[cfg(all(not(target_feature = "sse2"), not(target_feature = "neon")))]
type CopyType = usize;
// Use u128 if we detect a simd feature
#[cfg(target_feature = "neon")]
type CopyType = u128;
#[cfg(target_feature = "sse2")]
type CopyType = u128;
const COPY_AT_ONCE_SIZE: usize = core::mem::size_of::<CopyType>();
let min_buffer_size = usize::min(src.1, dst.1);
// Can copy in just one read+write, very common case
if min_buffer_size >= COPY_AT_ONCE_SIZE && copy_at_least <= COPY_AT_ONCE_SIZE {
dst.0
.cast::<CopyType>()
.write_unaligned(src.0.cast::<CopyType>().read_unaligned())
} else {
let copy_multiple = copy_at_least.next_multiple_of(COPY_AT_ONCE_SIZE);
// Can copy in multiple simple instructions
if min_buffer_size >= copy_multiple {
let mut src_ptr = src.0.cast::<CopyType>();
let src_ptr_end = src.0.add(copy_multiple).cast::<CopyType>();
let mut dst_ptr = dst.0.cast::<CopyType>();
while src_ptr < src_ptr_end {
dst_ptr.write_unaligned(src_ptr.read_unaligned());
src_ptr = src_ptr.add(1);
dst_ptr = dst_ptr.add(1);
}
} else {
// Fall back to standard memcopy
dst.0.copy_from_nonoverlapping(src.0, copy_at_least);
}
}
debug_assert_eq!(
slice::from_raw_parts(src.0, copy_at_least),
slice::from_raw_parts(dst.0, copy_at_least)
);
}
#[allow(dead_code)]
#[inline(always)]
#[allow(clippy::too_many_arguments)]
unsafe fn copy_without_checks(
m1_ptr: *const u8,
m2_ptr: *const u8,
f1_ptr: *mut u8,
f2_ptr: *mut u8,
m1_in_f1: usize,
m2_in_f1: usize,
m1_in_f2: usize,
m2_in_f2: usize,
) {
f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
f1_ptr
.add(m1_in_f1)
.copy_from_nonoverlapping(m2_ptr, m2_in_f1);
f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2);
f2_ptr
.add(m1_in_f2)
.copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2);
}
#[allow(dead_code)]
#[inline(always)]
#[allow(clippy::too_many_arguments)]
unsafe fn copy_with_checks(
m1_ptr: *const u8,
m2_ptr: *const u8,
f1_ptr: *mut u8,
f2_ptr: *mut u8,
m1_in_f1: usize,
m2_in_f1: usize,
m1_in_f2: usize,
m2_in_f2: usize,
) {
if m1_in_f1 != 0 {
f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
}
if m2_in_f1 != 0 {
f1_ptr
.add(m1_in_f1)
.copy_from_nonoverlapping(m2_ptr, m2_in_f1);
}
if m1_in_f2 != 0 {
f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2);
}
if m2_in_f2 != 0 {
f2_ptr
.add(m1_in_f2)
.copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2);
}
}
#[allow(dead_code)]
#[inline(always)]
#[allow(clippy::too_many_arguments)]
unsafe fn copy_with_nobranch_check(
m1_ptr: *const u8,
m2_ptr: *const u8,
f1_ptr: *mut u8,
f2_ptr: *mut u8,
m1_in_f1: usize,
m2_in_f1: usize,
m1_in_f2: usize,
m2_in_f2: usize,
) {
let case = (m1_in_f1 > 0) as usize
| (((m2_in_f1 > 0) as usize) << 1)
| (((m1_in_f2 > 0) as usize) << 2)
| (((m2_in_f2 > 0) as usize) << 3);
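    // Note (added): bits 1 (m2_in_f1) and 2 (m1_in_f2) are mutually exclusive, as
    // asserted by the caller, which is why cases 6, 7, 14 and 15 below are unreachable.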
match case {
0 => {}
// one bit set
1 => {
f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
}
2 => {
f1_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f1);
}
4 => {
f2_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f2);
}
8 => {
f2_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f2);
}
// two bit set
3 => {
f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
f1_ptr
.add(m1_in_f1)
.copy_from_nonoverlapping(m2_ptr, m2_in_f1);
}
5 => {
f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2);
}
6 => core::hint::unreachable_unchecked(),
7 => core::hint::unreachable_unchecked(),
9 => {
f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
f2_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f2);
}
10 => {
f1_ptr.copy_from_nonoverlapping(m2_ptr, m2_in_f1);
f2_ptr.copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2);
}
12 => {
f2_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f2);
f2_ptr
.add(m1_in_f2)
.copy_from_nonoverlapping(m2_ptr, m2_in_f2);
}
// three bit set
11 => {
f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
f1_ptr
.add(m1_in_f1)
.copy_from_nonoverlapping(m2_ptr, m2_in_f1);
f2_ptr.copy_from_nonoverlapping(m2_ptr.add(m2_in_f1), m2_in_f2);
}
13 => {
f1_ptr.copy_from_nonoverlapping(m1_ptr, m1_in_f1);
f2_ptr.copy_from_nonoverlapping(m1_ptr.add(m1_in_f1), m1_in_f2);
f2_ptr
.add(m1_in_f2)
.copy_from_nonoverlapping(m2_ptr, m2_in_f2);
}
14 => core::hint::unreachable_unchecked(),
15 => core::hint::unreachable_unchecked(),
_ => core::hint::unreachable_unchecked(),
}
}
#[cfg(test)]
mod tests {
use super::RingBuffer;
#[test]
fn smoke() {
let mut rb = RingBuffer::new();
rb.reserve(15);
assert_eq!(17, rb.cap);
rb.extend(b"0123456789");
assert_eq!(rb.len(), 10);
assert_eq!(rb.as_slices().0, b"0123456789");
assert_eq!(rb.as_slices().1, b"");
rb.drop_first_n(5);
assert_eq!(rb.len(), 5);
assert_eq!(rb.as_slices().0, b"56789");
assert_eq!(rb.as_slices().1, b"");
rb.extend_from_within(2, 3);
assert_eq!(rb.len(), 8);
assert_eq!(rb.as_slices().0, b"56789789");
assert_eq!(rb.as_slices().1, b"");
rb.extend_from_within(0, 3);
assert_eq!(rb.len(), 11);
assert_eq!(rb.as_slices().0, b"56789789567");
assert_eq!(rb.as_slices().1, b"");
rb.extend_from_within(0, 2);
assert_eq!(rb.len(), 13);
assert_eq!(rb.as_slices().0, b"567897895675");
assert_eq!(rb.as_slices().1, b"6");
rb.drop_first_n(11);
assert_eq!(rb.len(), 2);
assert_eq!(rb.as_slices().0, b"5");
assert_eq!(rb.as_slices().1, b"6");
rb.extend(b"0123456789");
assert_eq!(rb.len(), 12);
assert_eq!(rb.as_slices().0, b"5");
assert_eq!(rb.as_slices().1, b"60123456789");
rb.drop_first_n(11);
assert_eq!(rb.len(), 1);
assert_eq!(rb.as_slices().0, b"9");
assert_eq!(rb.as_slices().1, b"");
rb.extend(b"0123456789");
assert_eq!(rb.len(), 11);
assert_eq!(rb.as_slices().0, b"9012345");
assert_eq!(rb.as_slices().1, b"6789");
}
#[test]
fn edge_cases() {
// Fill exactly, then empty then fill again
let mut rb = RingBuffer::new();
rb.reserve(16);
assert_eq!(17, rb.cap);
rb.extend(b"0123456789012345");
assert_eq!(17, rb.cap);
assert_eq!(16, rb.len());
assert_eq!(0, rb.free());
rb.drop_first_n(16);
assert_eq!(0, rb.len());
assert_eq!(16, rb.free());
rb.extend(b"0123456789012345");
assert_eq!(16, rb.len());
assert_eq!(0, rb.free());
assert_eq!(17, rb.cap);
assert_eq!(1, rb.as_slices().0.len());
assert_eq!(15, rb.as_slices().1.len());
rb.clear();
// data in both slices and then reserve
rb.extend(b"0123456789012345");
rb.drop_first_n(8);
rb.extend(b"67890123");
assert_eq!(16, rb.len());
assert_eq!(0, rb.free());
assert_eq!(17, rb.cap);
assert_eq!(9, rb.as_slices().0.len());
assert_eq!(7, rb.as_slices().1.len());
rb.reserve(1);
assert_eq!(16, rb.len());
assert_eq!(16, rb.free());
assert_eq!(33, rb.cap);
assert_eq!(16, rb.as_slices().0.len());
assert_eq!(0, rb.as_slices().1.len());
rb.clear();
// fill exactly, then extend from within
rb.extend(b"0123456789012345");
rb.extend_from_within(0, 16);
assert_eq!(32, rb.len());
assert_eq!(0, rb.free());
assert_eq!(33, rb.cap);
assert_eq!(32, rb.as_slices().0.len());
assert_eq!(0, rb.as_slices().1.len());
// extend from within cases
let mut rb = RingBuffer::new();
rb.reserve(8);
rb.extend(b"01234567");
rb.drop_first_n(5);
rb.extend_from_within(0, 3);
assert_eq!(4, rb.as_slices().0.len());
assert_eq!(2, rb.as_slices().1.len());
rb.drop_first_n(2);
assert_eq!(2, rb.as_slices().0.len());
assert_eq!(2, rb.as_slices().1.len());
rb.extend_from_within(0, 4);
assert_eq!(2, rb.as_slices().0.len());
assert_eq!(6, rb.as_slices().1.len());
rb.drop_first_n(2);
assert_eq!(6, rb.as_slices().0.len());
assert_eq!(0, rb.as_slices().1.len());
rb.drop_first_n(2);
assert_eq!(4, rb.as_slices().0.len());
assert_eq!(0, rb.as_slices().1.len());
rb.extend_from_within(0, 4);
assert_eq!(7, rb.as_slices().0.len());
assert_eq!(1, rb.as_slices().1.len());
let mut rb = RingBuffer::new();
rb.reserve(8);
rb.extend(b"11111111");
rb.drop_first_n(7);
rb.extend(b"111");
assert_eq!(2, rb.as_slices().0.len());
assert_eq!(2, rb.as_slices().1.len());
rb.extend_from_within(0, 4);
assert_eq!(b"11", rb.as_slices().0);
assert_eq!(b"111111", rb.as_slices().1);
}
}

134
vendor/ruzstd/src/decoding/scratch.rs vendored Normal file
View File

@@ -0,0 +1,134 @@
//! Structures that wrap around various decoders to make decoding easier.
use super::super::blocks::sequence_section::Sequence;
use super::decode_buffer::DecodeBuffer;
use crate::decoding::dictionary::Dictionary;
use crate::fse::FSETable;
use crate::huff0::HuffmanTable;
use alloc::vec::Vec;
use crate::blocks::sequence_section::{
MAX_LITERAL_LENGTH_CODE, MAX_MATCH_LENGTH_CODE, MAX_OFFSET_CODE,
};
/// A block level decoding buffer.
pub struct DecoderScratch {
/// The decoder used for Huffman blocks.
pub huf: HuffmanScratch,
/// The decoder used for FSE blocks.
pub fse: FSEScratch,
pub buffer: DecodeBuffer,
pub offset_hist: [u32; 3],
pub literals_buffer: Vec<u8>,
pub sequences: Vec<Sequence>,
pub block_content_buffer: Vec<u8>,
}
impl DecoderScratch {
pub fn new(window_size: usize) -> DecoderScratch {
DecoderScratch {
huf: HuffmanScratch {
table: HuffmanTable::new(),
},
fse: FSEScratch {
offsets: FSETable::new(MAX_OFFSET_CODE),
of_rle: None,
literal_lengths: FSETable::new(MAX_LITERAL_LENGTH_CODE),
ll_rle: None,
match_lengths: FSETable::new(MAX_MATCH_LENGTH_CODE),
ml_rle: None,
},
buffer: DecodeBuffer::new(window_size),
offset_hist: [1, 4, 8],
block_content_buffer: Vec::new(),
literals_buffer: Vec::new(),
sequences: Vec::new(),
}
}
pub fn reset(&mut self, window_size: usize) {
self.offset_hist = [1, 4, 8];
self.literals_buffer.clear();
self.sequences.clear();
self.block_content_buffer.clear();
self.buffer.reset(window_size);
self.fse.literal_lengths.reset();
self.fse.match_lengths.reset();
self.fse.offsets.reset();
self.fse.ll_rle = None;
self.fse.ml_rle = None;
self.fse.of_rle = None;
self.huf.table.reset();
}
pub fn init_from_dict(&mut self, dict: &Dictionary) {
self.fse.reinit_from(&dict.fse);
self.huf.table.reinit_from(&dict.huf.table);
self.offset_hist = dict.offset_hist;
self.buffer.dict_content.clear();
self.buffer
.dict_content
.extend_from_slice(&dict.dict_content);
}
}
pub struct HuffmanScratch {
pub table: HuffmanTable,
}
impl HuffmanScratch {
pub fn new() -> HuffmanScratch {
HuffmanScratch {
table: HuffmanTable::new(),
}
}
}
impl Default for HuffmanScratch {
fn default() -> Self {
Self::new()
}
}
pub struct FSEScratch {
pub offsets: FSETable,
pub of_rle: Option<u8>,
pub literal_lengths: FSETable,
pub ll_rle: Option<u8>,
pub match_lengths: FSETable,
pub ml_rle: Option<u8>,
}
impl FSEScratch {
pub fn new() -> FSEScratch {
FSEScratch {
offsets: FSETable::new(MAX_OFFSET_CODE),
of_rle: None,
literal_lengths: FSETable::new(MAX_LITERAL_LENGTH_CODE),
ll_rle: None,
match_lengths: FSETable::new(MAX_MATCH_LENGTH_CODE),
ml_rle: None,
}
}
pub fn reinit_from(&mut self, other: &Self) {
self.offsets.reinit_from(&other.offsets);
self.literal_lengths.reinit_from(&other.literal_lengths);
self.match_lengths.reinit_from(&other.match_lengths);
self.of_rle = other.of_rle;
self.ll_rle = other.ll_rle;
self.ml_rle = other.ml_rle;
}
}
impl Default for FSEScratch {
fn default() -> Self {
Self::new()
}
}

View File

@@ -0,0 +1,115 @@
use super::scratch::DecoderScratch;
use crate::decoding::errors::ExecuteSequencesError;
/// Take the provided decoder and execute the sequences stored within
pub fn execute_sequences(scratch: &mut DecoderScratch) -> Result<(), ExecuteSequencesError> {
let mut literals_copy_counter = 0;
let old_buffer_size = scratch.buffer.len();
let mut seq_sum = 0;
for idx in 0..scratch.sequences.len() {
let seq = scratch.sequences[idx];
if seq.ll > 0 {
let high = literals_copy_counter + seq.ll as usize;
if high > scratch.literals_buffer.len() {
return Err(ExecuteSequencesError::NotEnoughBytesForSequence {
wanted: high,
have: scratch.literals_buffer.len(),
});
}
let literals = &scratch.literals_buffer[literals_copy_counter..high];
literals_copy_counter += seq.ll as usize;
scratch.buffer.push(literals);
}
let actual_offset = do_offset_history(seq.of, seq.ll, &mut scratch.offset_hist);
if actual_offset == 0 {
return Err(ExecuteSequencesError::ZeroOffset);
}
if seq.ml > 0 {
scratch
.buffer
.repeat(actual_offset as usize, seq.ml as usize)?;
}
seq_sum += seq.ml;
seq_sum += seq.ll;
}
if literals_copy_counter < scratch.literals_buffer.len() {
let rest_literals = &scratch.literals_buffer[literals_copy_counter..];
scratch.buffer.push(rest_literals);
seq_sum += rest_literals.len() as u32;
}
let diff = scratch.buffer.len() - old_buffer_size;
assert!(
seq_sum as usize == diff,
"Seq_sum: {} is different from the difference in buffersize: {}",
seq_sum,
diff
);
Ok(())
}
/// Update the most recently used offsets to reflect the provided offset value, and return the
/// "actual" offset. Offsets are not stored raw; some transformations are needed
/// before you get a usable value.
fn do_offset_history(offset_value: u32, lit_len: u32, scratch: &mut [u32; 3]) -> u32 {
let actual_offset = if lit_len > 0 {
match offset_value {
1..=3 => scratch[offset_value as usize - 1],
_ => {
//new offset
offset_value - 3
}
}
} else {
match offset_value {
1..=2 => scratch[offset_value as usize],
3 => scratch[0] - 1,
_ => {
//new offset
offset_value - 3
}
}
};
//update history
if lit_len > 0 {
match offset_value {
1 => {
//nothing
}
2 => {
scratch[1] = scratch[0];
scratch[0] = actual_offset;
}
_ => {
scratch[2] = scratch[1];
scratch[1] = scratch[0];
scratch[0] = actual_offset;
}
}
} else {
match offset_value {
1 => {
scratch[1] = scratch[0];
scratch[0] = actual_offset;
}
2 => {
scratch[2] = scratch[1];
scratch[1] = scratch[0];
scratch[0] = actual_offset;
}
_ => {
scratch[2] = scratch[1];
scratch[1] = scratch[0];
scratch[0] = actual_offset;
}
}
}
actual_offset
}
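// Added illustrative test: with the initial history [1, 4, 8], an offset_value of 1
// with a nonzero literal length selects the most recent offset and leaves the history
// untouched, while a "new" offset_value of 10 decodes to 10 - 3 = 7 and pushes the
// history down.
#[test]
fn offset_history_sketch() {
    let mut hist = [1u32, 4, 8];
    assert_eq!(do_offset_history(1, 5, &mut hist), 1);
    assert_eq!(hist, [1, 4, 8]);
    assert_eq!(do_offset_history(10, 5, &mut hist), 7);
    assert_eq!(hist, [7, 1, 4]);
}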

View File

@@ -0,0 +1,487 @@
use super::super::blocks::sequence_section::ModeType;
use super::super::blocks::sequence_section::Sequence;
use super::super::blocks::sequence_section::SequencesHeader;
use super::scratch::FSEScratch;
use crate::bit_io::BitReaderReversed;
use crate::blocks::sequence_section::{
MAX_LITERAL_LENGTH_CODE, MAX_MATCH_LENGTH_CODE, MAX_OFFSET_CODE,
};
use crate::decoding::errors::DecodeSequenceError;
use crate::fse::FSEDecoder;
use alloc::vec::Vec;
/// Decode the provided source as a series of sequences into the supplied `target`.
pub fn decode_sequences(
section: &SequencesHeader,
source: &[u8],
scratch: &mut FSEScratch,
target: &mut Vec<Sequence>,
) -> Result<(), DecodeSequenceError> {
let bytes_read = maybe_update_fse_tables(section, source, scratch)?;
vprintln!("Updating tables used {} bytes", bytes_read);
let bit_stream = &source[bytes_read..];
let mut br = BitReaderReversed::new(bit_stream);
//skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found
let mut skipped_bits = 0;
loop {
let val = br.get_bits(1);
skipped_bits += 1;
if val == 1 || skipped_bits > 8 {
break;
}
}
if skipped_bits > 8 {
//if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
return Err(DecodeSequenceError::ExtraPadding { skipped_bits });
}
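    // For example, if the final byte of the stream is 0b0010_0000, the reversed reader
    // sees two 0 bits followed by the 1 marker, so skipped_bits is 3 and the payload
    // bits begin directly after the marker.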
if scratch.ll_rle.is_some() || scratch.ml_rle.is_some() || scratch.of_rle.is_some() {
decode_sequences_with_rle(section, &mut br, scratch, target)
} else {
decode_sequences_without_rle(section, &mut br, scratch, target)
}
}
fn decode_sequences_with_rle(
section: &SequencesHeader,
br: &mut BitReaderReversed<'_>,
scratch: &FSEScratch,
target: &mut Vec<Sequence>,
) -> Result<(), DecodeSequenceError> {
let mut ll_dec = FSEDecoder::new(&scratch.literal_lengths);
let mut ml_dec = FSEDecoder::new(&scratch.match_lengths);
let mut of_dec = FSEDecoder::new(&scratch.offsets);
if scratch.ll_rle.is_none() {
ll_dec.init_state(br)?;
}
if scratch.of_rle.is_none() {
of_dec.init_state(br)?;
}
if scratch.ml_rle.is_none() {
ml_dec.init_state(br)?;
}
target.clear();
target.reserve(section.num_sequences as usize);
for _seq_idx in 0..section.num_sequences {
//get the codes from either the RLE byte or from the decoder
let ll_code = if scratch.ll_rle.is_some() {
scratch.ll_rle.unwrap()
} else {
ll_dec.decode_symbol()
};
let ml_code = if scratch.ml_rle.is_some() {
scratch.ml_rle.unwrap()
} else {
ml_dec.decode_symbol()
};
let of_code = if scratch.of_rle.is_some() {
scratch.of_rle.unwrap()
} else {
of_dec.decode_symbol()
};
let (ll_value, ll_num_bits) = lookup_ll_code(ll_code);
let (ml_value, ml_num_bits) = lookup_ml_code(ml_code);
//println!("Sequence: {}", i);
//println!("of stat: {}", of_dec.state);
//println!("of Code: {}", of_code);
//println!("ll stat: {}", ll_dec.state);
//println!("ll bits: {}", ll_num_bits);
//println!("ll Code: {}", ll_value);
//println!("ml stat: {}", ml_dec.state);
//println!("ml bits: {}", ml_num_bits);
//println!("ml Code: {}", ml_value);
//println!("");
if of_code > MAX_OFFSET_CODE {
return Err(DecodeSequenceError::UnsupportedOffset {
offset_code: of_code,
});
}
let (obits, ml_add, ll_add) = br.get_bits_triple(of_code, ml_num_bits, ll_num_bits);
let offset = obits as u32 + (1u32 << of_code);
if offset == 0 {
return Err(DecodeSequenceError::ZeroOffset);
}
target.push(Sequence {
ll: ll_value + ll_add as u32,
ml: ml_value + ml_add as u32,
of: offset,
});
if target.len() < section.num_sequences as usize {
//println!(
// "Bits left: {} ({} bytes)",
// br.bits_remaining(),
// br.bits_remaining() / 8,
//);
if scratch.ll_rle.is_none() {
ll_dec.update_state(br);
}
if scratch.ml_rle.is_none() {
ml_dec.update_state(br);
}
if scratch.of_rle.is_none() {
of_dec.update_state(br);
}
}
if br.bits_remaining() < 0 {
return Err(DecodeSequenceError::NotEnoughBytesForNumSequences);
}
}
if br.bits_remaining() > 0 {
Err(DecodeSequenceError::ExtraBits {
bits_remaining: br.bits_remaining(),
})
} else {
Ok(())
}
}
fn decode_sequences_without_rle(
section: &SequencesHeader,
br: &mut BitReaderReversed<'_>,
scratch: &FSEScratch,
target: &mut Vec<Sequence>,
) -> Result<(), DecodeSequenceError> {
let mut ll_dec = FSEDecoder::new(&scratch.literal_lengths);
let mut ml_dec = FSEDecoder::new(&scratch.match_lengths);
let mut of_dec = FSEDecoder::new(&scratch.offsets);
ll_dec.init_state(br)?;
of_dec.init_state(br)?;
ml_dec.init_state(br)?;
target.clear();
target.reserve(section.num_sequences as usize);
for _seq_idx in 0..section.num_sequences {
let ll_code = ll_dec.decode_symbol();
let ml_code = ml_dec.decode_symbol();
let of_code = of_dec.decode_symbol();
let (ll_value, ll_num_bits) = lookup_ll_code(ll_code);
let (ml_value, ml_num_bits) = lookup_ml_code(ml_code);
if of_code > MAX_OFFSET_CODE {
return Err(DecodeSequenceError::UnsupportedOffset {
offset_code: of_code,
});
}
let (obits, ml_add, ll_add) = br.get_bits_triple(of_code, ml_num_bits, ll_num_bits);
let offset = obits as u32 + (1u32 << of_code);
if offset == 0 {
return Err(DecodeSequenceError::ZeroOffset);
}
target.push(Sequence {
ll: ll_value + ll_add as u32,
ml: ml_value + ml_add as u32,
of: offset,
});
if target.len() < section.num_sequences as usize {
//println!(
// "Bits left: {} ({} bytes)",
// br.bits_remaining(),
// br.bits_remaining() / 8,
//);
ll_dec.update_state(br);
ml_dec.update_state(br);
of_dec.update_state(br);
}
if br.bits_remaining() < 0 {
return Err(DecodeSequenceError::NotEnoughBytesForNumSequences);
}
}
if br.bits_remaining() > 0 {
Err(DecodeSequenceError::ExtraBits {
bits_remaining: br.bits_remaining(),
})
} else {
Ok(())
}
}
/// Look up the provided state value from a literal length table predefined
/// by the Zstandard reference document. Returns a tuple of (value, number of bits).
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#appendix-a---decoding-tables-for-predefined-codes>
fn lookup_ll_code(code: u8) -> (u32, u8) {
match code {
0..=15 => (u32::from(code), 0),
16 => (16, 1),
17 => (18, 1),
18 => (20, 1),
19 => (22, 1),
20 => (24, 2),
21 => (28, 2),
22 => (32, 3),
23 => (40, 3),
24 => (48, 4),
25 => (64, 6),
26 => (128, 7),
27 => (256, 8),
28 => (512, 9),
29 => (1024, 10),
30 => (2048, 11),
31 => (4096, 12),
32 => (8192, 13),
33 => (16384, 14),
34 => (32768, 15),
35 => (65536, 16),
_ => unreachable!("Illegal literal length code was: {}", code),
}
}
/// Look up the provided state value from a match length table predefined
/// by the Zstandard reference document. Returns a tuple of (value, number of bits).
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#appendix-a---decoding-tables-for-predefined-codes>
fn lookup_ml_code(code: u8) -> (u32, u8) {
match code {
0..=31 => (u32::from(code) + 3, 0),
32 => (35, 1),
33 => (37, 1),
34 => (39, 1),
35 => (41, 1),
36 => (43, 2),
37 => (47, 2),
38 => (51, 3),
39 => (59, 3),
40 => (67, 4),
41 => (83, 4),
42 => (99, 5),
43 => (131, 7),
44 => (259, 8),
45 => (515, 9),
46 => (1027, 10),
47 => (2051, 11),
48 => (4099, 12),
49 => (8195, 13),
50 => (16387, 14),
51 => (32771, 15),
52 => (65539, 16),
_ => unreachable!("Illegal match length code was: {}", code),
}
}
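// Added sanity sketch: a decoded length is `baseline + extra_bits`, e.g. ll code 25
// covers 64..=127 with 6 extra bits and ml code 43 covers 131..=258 with 7 extra bits.
#[test]
fn lookup_code_sketch() {
    assert_eq!(lookup_ll_code(25), (64, 6));
    assert_eq!(lookup_ml_code(43), (131, 7));
    // Codes below the variable-length region map directly (match lengths add 3).
    assert_eq!(lookup_ll_code(7), (7, 0));
    assert_eq!(lookup_ml_code(0), (3, 0));
}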
// This info is buried in the symbol compression mode table
/// "The maximum allowed accuracy log for literals length and match length tables is 9"
pub const LL_MAX_LOG: u8 = 9;
/// "The maximum allowed accuracy log for literals length and match length tables is 9"
pub const ML_MAX_LOG: u8 = 9;
/// "The maximum accuracy log for the offset table is 8."
pub const OF_MAX_LOG: u8 = 8;
fn maybe_update_fse_tables(
section: &SequencesHeader,
source: &[u8],
scratch: &mut FSEScratch,
) -> Result<usize, DecodeSequenceError> {
let modes = section
.modes
.ok_or(DecodeSequenceError::MissingCompressionMode)?;
let mut bytes_read = 0;
match modes.ll_mode() {
ModeType::FSECompressed => {
let bytes = scratch.literal_lengths.build_decoder(source, LL_MAX_LOG)?;
bytes_read += bytes;
vprintln!("Updating ll table");
vprintln!("Used bytes: {}", bytes);
scratch.ll_rle = None;
}
ModeType::RLE => {
vprintln!("Use RLE ll table");
if source.is_empty() {
return Err(DecodeSequenceError::MissingByteForRleLlTable);
}
bytes_read += 1;
if source[0] > MAX_LITERAL_LENGTH_CODE {
return Err(DecodeSequenceError::MissingByteForRleLlTable);
}
scratch.ll_rle = Some(source[0]);
}
ModeType::Predefined => {
vprintln!("Use predefined ll table");
scratch.literal_lengths.build_from_probabilities(
LL_DEFAULT_ACC_LOG,
&Vec::from(&LITERALS_LENGTH_DEFAULT_DISTRIBUTION[..]),
)?;
scratch.ll_rle = None;
}
ModeType::Repeat => {
vprintln!("Repeat ll table");
/* Nothing to do */
}
};
let of_source = &source[bytes_read..];
match modes.of_mode() {
ModeType::FSECompressed => {
let bytes = scratch.offsets.build_decoder(of_source, OF_MAX_LOG)?;
vprintln!("Updating of table");
vprintln!("Used bytes: {}", bytes);
bytes_read += bytes;
scratch.of_rle = None;
}
ModeType::RLE => {
vprintln!("Use RLE of table");
if of_source.is_empty() {
return Err(DecodeSequenceError::MissingByteForRleOfTable);
}
bytes_read += 1;
if of_source[0] > MAX_OFFSET_CODE {
return Err(DecodeSequenceError::MissingByteForRleOfTable);
}
scratch.of_rle = Some(of_source[0]);
}
ModeType::Predefined => {
vprintln!("Use predefined of table");
scratch.offsets.build_from_probabilities(
OF_DEFAULT_ACC_LOG,
&Vec::from(&OFFSET_DEFAULT_DISTRIBUTION[..]),
)?;
scratch.of_rle = None;
}
ModeType::Repeat => {
vprintln!("Repeat of table");
/* Nothing to do */
}
};
let ml_source = &source[bytes_read..];
match modes.ml_mode() {
ModeType::FSECompressed => {
let bytes = scratch.match_lengths.build_decoder(ml_source, ML_MAX_LOG)?;
bytes_read += bytes;
vprintln!("Updating ml table");
vprintln!("Used bytes: {}", bytes);
scratch.ml_rle = None;
}
ModeType::RLE => {
vprintln!("Use RLE ml table");
if ml_source.is_empty() {
return Err(DecodeSequenceError::MissingByteForRleMlTable);
}
bytes_read += 1;
if ml_source[0] > MAX_MATCH_LENGTH_CODE {
return Err(DecodeSequenceError::MissingByteForRleMlTable);
}
scratch.ml_rle = Some(ml_source[0]);
}
ModeType::Predefined => {
vprintln!("Use predefined ml table");
scratch.match_lengths.build_from_probabilities(
ML_DEFAULT_ACC_LOG,
&Vec::from(&MATCH_LENGTH_DEFAULT_DISTRIBUTION[..]),
)?;
scratch.ml_rle = None;
}
ModeType::Repeat => {
vprintln!("Repeat ml table");
/* Nothing to do */
}
};
Ok(bytes_read)
}
// The default Literal Length decoding table uses an accuracy logarithm of 6 bits.
const LL_DEFAULT_ACC_LOG: u8 = 6;
/// If [ModeType::Predefined] is selected for a symbol type, its FSE decoding
/// table is generated using a predefined distribution table.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals-length>
const LITERALS_LENGTH_DEFAULT_DISTRIBUTION: [i32; 36] = [
4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
-1, -1, -1, -1,
];
// The default Match Length decoding table uses an accuracy logarithm of 6 bits.
const ML_DEFAULT_ACC_LOG: u8 = 6;
/// If [ModeType::Predefined] is selected for a symbol type, its FSE decoding
/// table is generated using a predefined distribution table.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#match-length>
const MATCH_LENGTH_DEFAULT_DISTRIBUTION: [i32; 53] = [
1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1,
];
// The default Offset decoding table uses an accuracy logarithm of 5 bits.
const OF_DEFAULT_ACC_LOG: u8 = 5;
/// If [ModeType::Predefined] is selected for a symbol type, its FSE decoding
/// table is generated using a predefined distribution table.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#offset-codes>
const OFFSET_DEFAULT_DISTRIBUTION: [i32; 29] = [
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1,
];
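// Added consistency sketch: the normalized probabilities must fill the whole table,
// where a -1 ("less than one") probability occupies exactly one slot.
#[test]
fn default_distributions_fill_table() {
    fn slots(dist: &[i32]) -> i32 {
        dist.iter().map(|&p| if p == -1 { 1 } else { p }).sum()
    }
    assert_eq!(slots(&LITERALS_LENGTH_DEFAULT_DISTRIBUTION), 1 << LL_DEFAULT_ACC_LOG);
    assert_eq!(slots(&MATCH_LENGTH_DEFAULT_DISTRIBUTION), 1 << ML_DEFAULT_ACC_LOG);
    assert_eq!(slots(&OFFSET_DEFAULT_DISTRIBUTION), 1 << OF_DEFAULT_ACC_LOG);
}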
#[test]
fn test_ll_default() {
let mut table = crate::fse::FSETable::new(MAX_LITERAL_LENGTH_CODE);
table
.build_from_probabilities(
LL_DEFAULT_ACC_LOG,
&Vec::from(&LITERALS_LENGTH_DEFAULT_DISTRIBUTION[..]),
)
.unwrap();
#[cfg(feature = "std")]
for idx in 0..table.decode.len() {
std::println!(
"{:3}: {:3} {:3} {:3}",
idx,
table.decode[idx].symbol,
table.decode[idx].num_bits,
table.decode[idx].base_line
);
}
assert!(table.decode.len() == 64);
//just test a few values. TODO test all values
assert!(table.decode[0].symbol == 0);
assert!(table.decode[0].num_bits == 4);
assert!(table.decode[0].base_line == 0);
assert!(table.decode[19].symbol == 27);
assert!(table.decode[19].num_bits == 6);
assert!(table.decode[19].base_line == 0);
assert!(table.decode[39].symbol == 25);
assert!(table.decode[39].num_bits == 4);
assert!(table.decode[39].base_line == 16);
assert!(table.decode[60].symbol == 35);
assert!(table.decode[60].num_bits == 6);
assert!(table.decode[60].base_line == 0);
assert!(table.decode[59].symbol == 24);
assert!(table.decode[59].num_bits == 5);
assert!(table.decode[59].base_line == 32);
}

View File

@@ -0,0 +1,143 @@
//! The [StreamingDecoder] wraps a [FrameDecoder] and provides a Read impl that decodes data when necessary
use core::borrow::BorrowMut;
use crate::decoding::errors::FrameDecoderError;
use crate::decoding::{BlockDecodingStrategy, FrameDecoder};
#[cfg(not(feature = "std"))]
use crate::io::ErrorKind;
use crate::io::{Error, Read};
/// High level Zstandard frame decoder that can be used to decompress a given Zstandard frame.
///
/// This decoder implements `io::Read`, so you can interact with it by calling
/// `io::Read::read_to_end` / `io::Read::read_exact` or passing this to another library / module as a source for the decoded content
///
/// If you need more control over how decompression takes place, you can use
/// the lower level [FrameDecoder], which allows for greater control over how
/// decompression takes place but the implementor must call
/// [FrameDecoder::decode_blocks] repeatedly to decode the entire frame.
///
/// ## Caveat
/// [StreamingDecoder] expects the underlying stream to only contain a single frame,
/// yet the specification states that a single archive may contain multiple frames.
///
/// To decode all the frames in a finite stream, the calling code needs to recreate
/// the instance of the decoder and handle
/// [crate::decoding::errors::ReadFrameHeaderError::SkipFrame]
/// errors by skipping forward the `length` amount of bytes, see <https://github.com/KillingSpark/zstd-rs/issues/57>
///
/// ```no_run
/// // `read_to_end` is not implemented by the no_std implementation.
/// #[cfg(feature = "std")]
/// {
/// use std::fs::File;
/// use std::io::Read;
/// use ruzstd::decoding::StreamingDecoder;
///
/// // Read a Zstandard archive from the filesystem then decompress it into a vec.
/// let mut f: File = todo!("Read a .zstd archive from somewhere");
/// let mut decoder = StreamingDecoder::new(f).unwrap();
/// let mut result = Vec::new();
/// Read::read_to_end(&mut decoder, &mut result).unwrap();
/// }
/// ```
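///
/// A rough sketch of that multi-frame loop might look like this (illustrative only;
/// it assumes the underlying reader is seekable and treats any later header error as
/// the end of the input):
///
/// ```no_run
/// #[cfg(feature = "std")]
/// {
/// use std::io::{Read, Seek, SeekFrom};
/// use ruzstd::decoding::StreamingDecoder;
/// use ruzstd::decoding::errors::{FrameDecoderError, ReadFrameHeaderError};
///
/// let mut f: std::fs::File = todo!("Read a multi-frame .zstd archive from somewhere");
/// let mut result = Vec::new();
/// loop {
///     match StreamingDecoder::new(&mut f) {
///         Ok(mut decoder) => {
///             decoder.read_to_end(&mut result).unwrap();
///         }
///         Err(FrameDecoderError::ReadFrameHeaderError(ReadFrameHeaderError::SkipFrame {
///             length,
///             ..
///         })) => {
///             // A skippable frame: jump over its content and try the next frame
///             f.seek(SeekFrom::Current(length as i64)).unwrap();
///         }
///         Err(_) => break,
///     }
/// }
/// }
/// ```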
pub struct StreamingDecoder<READ: Read, DEC: BorrowMut<FrameDecoder>> {
pub decoder: DEC,
source: READ,
}
impl<READ: Read, DEC: BorrowMut<FrameDecoder>> StreamingDecoder<READ, DEC> {
pub fn new_with_decoder(
mut source: READ,
mut decoder: DEC,
) -> Result<StreamingDecoder<READ, DEC>, FrameDecoderError> {
decoder.borrow_mut().init(&mut source)?;
Ok(StreamingDecoder { decoder, source })
}
}
impl<READ: Read> StreamingDecoder<READ, FrameDecoder> {
pub fn new(
mut source: READ,
) -> Result<StreamingDecoder<READ, FrameDecoder>, FrameDecoderError> {
let mut decoder = FrameDecoder::new();
decoder.init(&mut source)?;
Ok(StreamingDecoder { decoder, source })
}
}
impl<READ: Read, DEC: BorrowMut<FrameDecoder>> StreamingDecoder<READ, DEC> {
/// Gets a reference to the underlying reader.
pub fn get_ref(&self) -> &READ {
&self.source
}
/// Gets a mutable reference to the underlying reader.
///
/// It is inadvisable to directly read from the underlying reader.
pub fn get_mut(&mut self) -> &mut READ {
&mut self.source
}
/// Destructures this object into the inner reader.
pub fn into_inner(self) -> READ
where
READ: Sized,
{
self.source
}
/// Destructures this object into both the inner reader and [FrameDecoder].
pub fn into_parts(self) -> (READ, DEC)
where
READ: Sized,
{
(self.source, self.decoder)
}
/// Destructures this object into the inner [FrameDecoder].
pub fn into_frame_decoder(self) -> DEC {
self.decoder
}
}
impl<READ: Read, DEC: BorrowMut<FrameDecoder>> Read for StreamingDecoder<READ, DEC> {
fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
let decoder = self.decoder.borrow_mut();
if decoder.is_finished() && decoder.can_collect() == 0 {
//No more bytes can ever be decoded
return Ok(0);
}
        // We need to loop here: the UptoBytes strategy makes no guarantee of actually reaching that limit.
        // The first few calls may just fill the decode buffer without those bytes being collectable yet,
        // so we keep calling until enough bytes can actually be collected.
// TODO add BlockDecodingStrategy::UntilCollectable(usize) that pushes this logic into the decode_blocks function
while decoder.can_collect() < buf.len() && !decoder.is_finished() {
//More bytes can be decoded
let additional_bytes_needed = buf.len() - decoder.can_collect();
match decoder.decode_blocks(
&mut self.source,
BlockDecodingStrategy::UptoBytes(additional_bytes_needed),
) {
Ok(_) => { /*Nothing to do*/ }
Err(e) => {
let err;
#[cfg(feature = "std")]
{
err = Error::other(e);
}
#[cfg(not(feature = "std"))]
{
err = Error::new(ErrorKind::Other, alloc::boxed::Box::new(e));
}
return Err(err);
}
}
}
decoder.read(buf)
}
}

View File

@@ -0,0 +1,64 @@
use crate::blocks::block::BlockType;
use alloc::vec::Vec;
#[derive(Debug)]
pub struct BlockHeader {
/// Signals if this block is the last one.
/// The frame will end after this block.
pub last_block: bool,
/// Influences the meaning of `block_size`.
pub block_type: BlockType,
/// - For `Raw` blocks, this is the size of the block's
/// content in bytes.
/// - For `RLE` blocks, there will be a single byte following
/// the header, repeated `block_size` times.
/// - For `Compressed` blocks, this is the length of
/// the compressed data.
///
/// **This value must fit within 21 bits.**
pub block_size: u32,
}
impl BlockHeader {
/// Write encoded binary representation of this header into the provided buffer.
pub fn serialize(self, output: &mut Vec<u8>) {
vprintln!("Serializing block with the header: {self:?}");
let encoded_block_type = match self.block_type {
BlockType::Raw => 0,
BlockType::RLE => 1,
BlockType::Compressed => 2,
BlockType::Reserved => panic!("You cannot use a reserved block type"),
};
let mut block_header = self.block_size << 3;
block_header |= encoded_block_type << 1;
block_header |= self.last_block as u32;
output.extend_from_slice(&block_header.to_le_bytes()[0..3]);
}
}
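// Worked example (added): `last_block: true`, `Compressed` (2) and `block_size: 69`
// give (69 << 3) | (2 << 1) | 1 = 0x22D, which is emitted as the little-endian bytes
// 2D 02 00. The round-trip test below parses exactly these bytes back.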
#[cfg(test)]
mod tests {
use super::BlockHeader;
use crate::{blocks::block::BlockType, decoding::block_decoder};
use alloc::vec::Vec;
#[test]
fn block_header_serialize() {
let header = BlockHeader {
last_block: true,
block_type: super::BlockType::Compressed,
block_size: 69,
};
let mut serialized_header = Vec::new();
header.serialize(&mut serialized_header);
let mut decoder = block_decoder::new();
let parsed_header = decoder
.read_block_header(serialized_header.as_slice())
.unwrap()
.0;
assert!(parsed_header.last_block);
assert_eq!(parsed_header.block_type, BlockType::Compressed);
assert_eq!(parsed_header.content_size, 69);
}
}

View File

@@ -0,0 +1,376 @@
use alloc::vec::Vec;
use crate::{
bit_io::BitWriter,
encoding::frame_compressor::CompressState,
encoding::{Matcher, Sequence},
fse::fse_encoder::{build_table_from_data, FSETable, State},
huff0::huff0_encoder,
};
pub fn compress_block<M: Matcher>(state: &mut CompressState<M>, output: &mut Vec<u8>) {
let mut literals_vec = Vec::new();
let mut sequences = Vec::new();
state.matcher.start_matching(|seq| {
match seq {
Sequence::Literals { literals } => literals_vec.extend_from_slice(literals),
Sequence::Triple {
literals,
offset,
match_len,
} => {
literals_vec.extend_from_slice(literals);
sequences.push(crate::blocks::sequence_section::Sequence {
ll: literals.len() as u32,
ml: match_len as u32,
of: (offset + 3) as u32, // TODO make use of the offset history
});
}
}
});
// literals section
let mut writer = BitWriter::from(output);
if literals_vec.len() > 1024 {
if let Some(table) =
compress_literals(&literals_vec, state.last_huff_table.as_ref(), &mut writer)
{
state.last_huff_table.replace(table);
}
} else {
raw_literals(&literals_vec, &mut writer);
}
// sequences section
if sequences.is_empty() {
writer.write_bits(0u8, 8);
} else {
encode_seqnum(sequences.len(), &mut writer);
// Choose the tables
// TODO store previously used tables
let ll_mode = choose_table(
state.fse_tables.ll_previous.as_ref(),
&state.fse_tables.ll_default,
sequences.iter().map(|seq| encode_literal_length(seq.ll).0),
9,
);
let ml_mode = choose_table(
state.fse_tables.ml_previous.as_ref(),
&state.fse_tables.ml_default,
sequences.iter().map(|seq| encode_match_len(seq.ml).0),
9,
);
let of_mode = choose_table(
state.fse_tables.of_previous.as_ref(),
&state.fse_tables.of_default,
sequences.iter().map(|seq| encode_offset(seq.of).0),
8,
);
writer.write_bits(encode_fse_table_modes(&ll_mode, &ml_mode, &of_mode), 8);
encode_table(&ll_mode, &mut writer);
encode_table(&of_mode, &mut writer);
encode_table(&ml_mode, &mut writer);
encode_sequences(
&sequences,
&mut writer,
ll_mode.as_ref(),
ml_mode.as_ref(),
of_mode.as_ref(),
);
if let FseTableMode::Encoded(table) = ll_mode {
state.fse_tables.ll_previous = Some(table)
}
if let FseTableMode::Encoded(table) = ml_mode {
state.fse_tables.ml_previous = Some(table)
}
if let FseTableMode::Encoded(table) = of_mode {
state.fse_tables.of_previous = Some(table)
}
}
writer.flush();
}
#[derive(Clone)]
#[allow(clippy::large_enum_variant)]
enum FseTableMode<'a> {
Predefined(&'a FSETable),
Encoded(FSETable),
RepeateLast(&'a FSETable),
}
impl FseTableMode<'_> {
pub fn as_ref(&self) -> &FSETable {
match self {
Self::Predefined(t) => t,
Self::RepeateLast(t) => t,
Self::Encoded(t) => t,
}
}
}
fn choose_table<'a>(
previous: Option<&'a FSETable>,
default_table: &'a FSETable,
data: impl Iterator<Item = u8>,
max_log: u8,
) -> FseTableMode<'a> {
// TODO check if the new table is better than the predefined and previous table
let use_new_table = true;
let use_previous_table = false;
if use_previous_table {
FseTableMode::RepeateLast(previous.unwrap())
} else if use_new_table {
FseTableMode::Encoded(build_table_from_data(data, max_log, true))
} else {
FseTableMode::Predefined(default_table)
}
}
fn encode_table(mode: &FseTableMode<'_>, writer: &mut BitWriter<&mut Vec<u8>>) {
match mode {
FseTableMode::Predefined(_) => {}
FseTableMode::RepeateLast(_) => {}
FseTableMode::Encoded(table) => table.write_table(writer),
}
}
fn encode_fse_table_modes(
ll_mode: &FseTableMode<'_>,
ml_mode: &FseTableMode<'_>,
of_mode: &FseTableMode<'_>,
) -> u8 {
fn mode_to_bits(mode: &FseTableMode<'_>) -> u8 {
match mode {
FseTableMode::Predefined(_) => 0,
FseTableMode::Encoded(_) => 2,
FseTableMode::RepeateLast(_) => 3,
}
}
mode_to_bits(ll_mode) << 6 | mode_to_bits(of_mode) << 4 | mode_to_bits(ml_mode) << 2
}
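// Added example of the resulting byte layout: ll = Encoded (0b10), of = Predefined
// (0b00) and ml = RepeateLast (0b11) pack to 0b10_00_11_00; the two lowest bits are
// the reserved field and always stay zero.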
fn encode_sequences(
sequences: &[crate::blocks::sequence_section::Sequence],
writer: &mut BitWriter<&mut Vec<u8>>,
ll_table: &FSETable,
ml_table: &FSETable,
of_table: &FSETable,
) {
let sequence = sequences[sequences.len() - 1];
let (ll_code, ll_add_bits, ll_num_bits) = encode_literal_length(sequence.ll);
let (of_code, of_add_bits, of_num_bits) = encode_offset(sequence.of);
let (ml_code, ml_add_bits, ml_num_bits) = encode_match_len(sequence.ml);
let mut ll_state: &State = ll_table.start_state(ll_code);
let mut ml_state: &State = ml_table.start_state(ml_code);
let mut of_state: &State = of_table.start_state(of_code);
writer.write_bits(ll_add_bits, ll_num_bits);
writer.write_bits(ml_add_bits, ml_num_bits);
writer.write_bits(of_add_bits, of_num_bits);
// encode backwards so the decoder reads the first sequence first
if sequences.len() > 1 {
for sequence in (0..=sequences.len() - 2).rev() {
let sequence = sequences[sequence];
let (ll_code, ll_add_bits, ll_num_bits) = encode_literal_length(sequence.ll);
let (of_code, of_add_bits, of_num_bits) = encode_offset(sequence.of);
let (ml_code, ml_add_bits, ml_num_bits) = encode_match_len(sequence.ml);
{
let next = of_table.next_state(of_code, of_state.index);
let diff = of_state.index - next.baseline;
writer.write_bits(diff as u64, next.num_bits as usize);
of_state = next;
}
{
let next = ml_table.next_state(ml_code, ml_state.index);
let diff = ml_state.index - next.baseline;
writer.write_bits(diff as u64, next.num_bits as usize);
ml_state = next;
}
{
let next = ll_table.next_state(ll_code, ll_state.index);
let diff = ll_state.index - next.baseline;
writer.write_bits(diff as u64, next.num_bits as usize);
ll_state = next;
}
writer.write_bits(ll_add_bits, ll_num_bits);
writer.write_bits(ml_add_bits, ml_num_bits);
writer.write_bits(of_add_bits, of_num_bits);
}
}
writer.write_bits(ml_state.index as u64, ml_table.table_size.ilog2() as usize);
writer.write_bits(of_state.index as u64, of_table.table_size.ilog2() as usize);
writer.write_bits(ll_state.index as u64, ll_table.table_size.ilog2() as usize);
let bits_to_fill = writer.misaligned();
if bits_to_fill == 0 {
writer.write_bits(1u32, 8);
} else {
writer.write_bits(1u32, bits_to_fill);
}
}
fn encode_seqnum(seqnum: usize, writer: &mut BitWriter<impl AsMut<Vec<u8>>>) {
const UPPER_LIMIT: usize = 0xFFFF + 0x7F00;
match seqnum {
1..=127 => writer.write_bits(seqnum as u32, 8),
128..=0x7FFF => {
let upper = ((seqnum >> 8) | 0x80) as u8;
let lower = seqnum as u8;
writer.write_bits(upper, 8);
writer.write_bits(lower, 8);
}
0x8000..=UPPER_LIMIT => {
let encode = seqnum - 0x7F00;
let upper = (encode >> 8) as u8;
let lower = encode as u8;
writer.write_bits(255u8, 8);
writer.write_bits(upper, 8);
writer.write_bits(lower, 8);
}
_ => unreachable!(),
}
}
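// Added sanity sketch: 300 needs the two-byte form, where the high byte carries
// (seqnum >> 8) | 0x80 and the low byte the remaining 8 bits.
#[test]
fn seqnum_encoding_sketch() {
    let mut buf: Vec<u8> = Vec::new();
    let mut writer = BitWriter::from(&mut buf);
    encode_seqnum(300, &mut writer);
    writer.flush();
    assert_eq!(buf, [0x81, 0x2C]);
}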
fn encode_literal_length(len: u32) -> (u8, u32, usize) {
match len {
0..=15 => (len as u8, 0, 0),
16..=17 => (16, len - 16, 1),
18..=19 => (17, len - 18, 1),
20..=21 => (18, len - 20, 1),
22..=23 => (19, len - 22, 1),
24..=27 => (20, len - 24, 2),
28..=31 => (21, len - 28, 2),
32..=39 => (22, len - 32, 3),
40..=47 => (23, len - 40, 3),
48..=63 => (24, len - 48, 4),
64..=127 => (25, len - 64, 6),
128..=255 => (26, len - 128, 7),
256..=511 => (27, len - 256, 8),
512..=1023 => (28, len - 512, 9),
1024..=2047 => (29, len - 1024, 10),
2048..=4095 => (30, len - 2048, 11),
4096..=8191 => (31, len - 4096, 12),
8192..=16383 => (32, len - 8192, 13),
16384..=32767 => (33, len - 16384, 14),
32768..=65535 => (34, len - 32768, 15),
65536..=131071 => (35, len - 65536, 16),
131072.. => unreachable!(),
}
}
fn encode_match_len(len: u32) -> (u8, u32, usize) {
match len {
0..=2 => unreachable!(),
3..=34 => (len as u8 - 3, 0, 0),
35..=36 => (32, len - 35, 1),
37..=38 => (33, len - 37, 1),
39..=40 => (34, len - 39, 1),
41..=42 => (35, len - 41, 1),
43..=46 => (36, len - 43, 2),
47..=50 => (37, len - 47, 2),
51..=58 => (38, len - 51, 3),
59..=66 => (39, len - 59, 3),
67..=82 => (40, len - 67, 4),
83..=98 => (41, len - 83, 4),
99..=130 => (42, len - 99, 5),
131..=258 => (43, len - 131, 7),
259..=514 => (44, len - 259, 8),
515..=1026 => (45, len - 515, 9),
1027..=2050 => (46, len - 1027, 10),
2051..=4098 => (47, len - 2051, 11),
4099..=8194 => (48, len - 4099, 12),
8195..=16386 => (49, len - 8195, 13),
16387..=32770 => (50, len - 16387, 14),
32771..=65538 => (51, len - 32771, 15),
65539..=131074 => (52, len - 65539, 16),
131075.. => unreachable!(),
}
}
fn encode_offset(len: u32) -> (u8, u32, usize) {
let log = len.ilog2();
let lower = len & ((1 << log) - 1);
(log as u8, lower, log as usize)
}
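// Added sanity sketch: a decoder reconstructs the offset as extra_bits + (1 << code),
// so the encoder puts the highest set bit into the code and the remainder into the
// extra bits.
#[test]
fn offset_code_sketch() {
    let (code, extra, num_bits) = encode_offset(1000);
    assert_eq!((code, extra, num_bits), (9, 488, 9));
    assert_eq!((1u32 << code) + extra, 1000);
}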
fn raw_literals(literals: &[u8], writer: &mut BitWriter<&mut Vec<u8>>) {
writer.write_bits(0u8, 2);
writer.write_bits(0b11u8, 2);
writer.write_bits(literals.len() as u32, 20);
writer.append_bytes(literals);
}
fn compress_literals(
literals: &[u8],
last_table: Option<&huff0_encoder::HuffmanTable>,
writer: &mut BitWriter<&mut Vec<u8>>,
) -> Option<huff0_encoder::HuffmanTable> {
let reset_idx = writer.index();
let new_encoder_table = huff0_encoder::HuffmanTable::build_from_data(literals);
let (encoder_table, new_table) = if let Some(_table) = last_table {
if let Some(diff) = _table.can_encode(&new_encoder_table) {
// TODO this is a very simple heuristic, maybe we should try to do better
if diff > 5 {
(&new_encoder_table, true)
} else {
(_table, false)
}
} else {
(&new_encoder_table, true)
}
} else {
(&new_encoder_table, true)
};
if new_table {
writer.write_bits(2u8, 2); // compressed literals type
} else {
writer.write_bits(3u8, 2); // treeless compressed literals type
}
let (size_format, size_bits) = match literals.len() {
0..6 => (0b00u8, 10),
6..1024 => (0b01, 10),
1024..16384 => (0b10, 14),
16384..262144 => (0b11, 18),
_ => unimplemented!("too many literals"),
};
writer.write_bits(size_format, 2);
writer.write_bits(literals.len() as u32, size_bits);
let size_index = writer.index();
writer.write_bits(0u32, size_bits);
let index_before = writer.index();
let mut encoder = huff0_encoder::HuffmanEncoder::new(encoder_table, writer);
if size_format == 0 {
encoder.encode(literals, new_table)
} else {
encoder.encode4x(literals, new_table)
};
let encoded_len = (writer.index() - index_before) / 8;
writer.change_bits(size_index, encoded_len as u64, size_bits);
let total_len = (writer.index() - reset_idx) / 8;
// If encoded len is bigger than the raw literals we are better off just writing the raw literals here
if total_len >= literals.len() {
writer.reset_to(reset_idx);
raw_literals(literals, writer);
None
} else if new_table {
Some(new_encoder_table)
} else {
None
}
}

View File

@@ -0,0 +1,8 @@
//! After the Magic_Number and Frame_Header there comes a sequence of blocks. Each frame must have at least one block,
//! but there is no upper limit on the number of blocks per frame.
//!
//! There are a few different kinds of blocks, and implementations for those kinds are
//! in this module.
mod compressed;
pub(super) use compressed::*;

View File

@@ -0,0 +1,461 @@
//! Utilities and interfaces for encoding an entire frame. Allows resources to be reused across frames.
use alloc::vec::Vec;
use core::convert::TryInto;
#[cfg(feature = "hash")]
use twox_hash::XxHash64;
#[cfg(feature = "hash")]
use core::hash::Hasher;
use super::{
block_header::BlockHeader, frame_header::FrameHeader, levels::*,
match_generator::MatchGeneratorDriver, CompressionLevel, Matcher,
};
use crate::fse::fse_encoder::{default_ll_table, default_ml_table, default_of_table, FSETable};
use crate::io::{Read, Write};
/// An interface for compressing arbitrary data with the Zstandard compression algorithm.
///
/// `FrameCompressor` will generally be used by:
/// 1. Initializing a compressor with `FrameCompressor::new()`, then attaching a source and a drain
/// 2. Starting compression and writing the output into the drain using `FrameCompressor::compress`
///
/// # Examples
/// ```
/// use ruzstd::encoding::{FrameCompressor, CompressionLevel};
/// let mock_data: &[_] = &[0x1, 0x2, 0x3, 0x4];
/// let mut output = std::vec::Vec::new();
/// // Initialize a compressor.
/// let mut compressor = FrameCompressor::new(CompressionLevel::Uncompressed);
/// compressor.set_source(mock_data);
/// compressor.set_drain(&mut output);
///
/// // `compress` writes the compressed output into the provided buffer.
/// compressor.compress();
/// ```
pub struct FrameCompressor<R: Read, W: Write, M: Matcher> {
uncompressed_data: Option<R>,
compressed_data: Option<W>,
compression_level: CompressionLevel,
state: CompressState<M>,
#[cfg(feature = "hash")]
hasher: XxHash64,
}
pub(crate) struct FseTables {
pub(crate) ll_default: FSETable,
pub(crate) ll_previous: Option<FSETable>,
pub(crate) ml_default: FSETable,
pub(crate) ml_previous: Option<FSETable>,
pub(crate) of_default: FSETable,
pub(crate) of_previous: Option<FSETable>,
}
impl FseTables {
pub fn new() -> Self {
Self {
ll_default: default_ll_table(),
ll_previous: None,
ml_default: default_ml_table(),
ml_previous: None,
of_default: default_of_table(),
of_previous: None,
}
}
}
pub(crate) struct CompressState<M: Matcher> {
pub(crate) matcher: M,
pub(crate) last_huff_table: Option<crate::huff0::huff0_encoder::HuffmanTable>,
pub(crate) fse_tables: FseTables,
}
impl<R: Read, W: Write> FrameCompressor<R, W, MatchGeneratorDriver> {
/// Create a new `FrameCompressor`
pub fn new(compression_level: CompressionLevel) -> Self {
Self {
uncompressed_data: None,
compressed_data: None,
compression_level,
state: CompressState {
matcher: MatchGeneratorDriver::new(1024 * 128, 1),
last_huff_table: None,
fse_tables: FseTables::new(),
},
#[cfg(feature = "hash")]
hasher: XxHash64::with_seed(0),
}
}
}
impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
/// Create a new `FrameCompressor` with a custom matching algorithm implementation
pub fn new_with_matcher(matcher: M, compression_level: CompressionLevel) -> Self {
Self {
uncompressed_data: None,
compressed_data: None,
state: CompressState {
matcher,
last_huff_table: None,
fse_tables: FseTables::new(),
},
compression_level,
#[cfg(feature = "hash")]
hasher: XxHash64::with_seed(0),
}
}
/// Before calling [FrameCompressor::compress] you need to set the source.
///
/// This is the data that is compressed and written into the drain.
pub fn set_source(&mut self, uncompressed_data: R) -> Option<R> {
self.uncompressed_data.replace(uncompressed_data)
}
/// Before calling [FrameCompressor::compress] you need to set the drain.
///
/// As the compressor compresses data, the drain serves as a place for the output to be written.
pub fn set_drain(&mut self, compressed_data: W) -> Option<W> {
self.compressed_data.replace(compressed_data)
}
/// Compress the uncompressed data from the provided source as one Zstd frame and write it to the provided drain
///
/// This will repeatedly call [Read::read] on the source to fill up blocks until the source returns 0 on the read call.
/// Also [Write::write_all] will be called on the drain after each block has been encoded.
///
/// To avoid endlessly encoding from a potentially endless source (like a network socket) you can use the
/// [Read::take] function
pub fn compress(&mut self) {
// Clearing buffers to allow reuse of the compressor
self.state.matcher.reset(self.compression_level);
self.state.last_huff_table = None;
let source = self.uncompressed_data.as_mut().unwrap();
let drain = self.compressed_data.as_mut().unwrap();
// As the frame is compressed, it's stored here
let output: &mut Vec<u8> = &mut Vec::with_capacity(1024 * 130);
// First write the frame header
let header = FrameHeader {
frame_content_size: None,
single_segment: false,
content_checksum: cfg!(feature = "hash"),
dictionary_id: None,
window_size: Some(self.state.matcher.window_size()),
};
header.serialize(output);
// Now compress block by block
loop {
// Read a single block's worth of uncompressed data from the input
let mut uncompressed_data = self.state.matcher.get_next_space();
let mut read_bytes = 0;
let last_block;
'read_loop: loop {
let new_bytes = source.read(&mut uncompressed_data[read_bytes..]).unwrap();
if new_bytes == 0 {
last_block = true;
break 'read_loop;
}
read_bytes += new_bytes;
if read_bytes == uncompressed_data.len() {
last_block = false;
break 'read_loop;
}
}
uncompressed_data.resize(read_bytes, 0);
// As we read, hash that data too
#[cfg(feature = "hash")]
self.hasher.write(&uncompressed_data);
// Special handling is needed for compression of a totally empty file (why you'd want to do that, I don't know)
if uncompressed_data.is_empty() {
let header = BlockHeader {
last_block: true,
block_type: crate::blocks::block::BlockType::Raw,
block_size: 0,
};
// Write the header, then the block
header.serialize(output);
drain.write_all(output).unwrap();
output.clear();
break;
}
match self.compression_level {
CompressionLevel::Uncompressed => {
let header = BlockHeader {
last_block,
block_type: crate::blocks::block::BlockType::Raw,
block_size: read_bytes.try_into().unwrap(),
};
// Write the header, then the block
header.serialize(output);
output.extend_from_slice(&uncompressed_data);
}
CompressionLevel::Fastest => {
compress_fastest(&mut self.state, last_block, uncompressed_data, output)
}
_ => {
unimplemented!();
}
}
drain.write_all(output).unwrap();
output.clear();
if last_block {
break;
}
}
// If the `hash` feature is enabled, then `content_checksum` is set to true in the header
// and a 32 bit hash is written at the end of the data.
#[cfg(feature = "hash")]
{
            // The data was already hashed block by block as it was read from the source,
            // so all that is left is to finalize the checksum and append it to the drain.
let content_checksum = self.hasher.finish();
drain
.write_all(&(content_checksum as u32).to_le_bytes())
.unwrap();
}
}
/// Get a mutable reference to the source
pub fn source_mut(&mut self) -> Option<&mut R> {
self.uncompressed_data.as_mut()
}
/// Get a mutable reference to the drain
pub fn drain_mut(&mut self) -> Option<&mut W> {
self.compressed_data.as_mut()
}
/// Get a reference to the source
pub fn source(&self) -> Option<&R> {
self.uncompressed_data.as_ref()
}
/// Get a reference to the drain
pub fn drain(&self) -> Option<&W> {
self.compressed_data.as_ref()
}
/// Retrieve the source
pub fn take_source(&mut self) -> Option<R> {
self.uncompressed_data.take()
}
/// Retrieve the drain
pub fn take_drain(&mut self) -> Option<W> {
self.compressed_data.take()
}
/// Before calling [FrameCompressor::compress] you can replace the matcher
pub fn replace_matcher(&mut self, mut match_generator: M) -> M {
core::mem::swap(&mut match_generator, &mut self.state.matcher);
match_generator
}
/// Before calling [FrameCompressor::compress] you can replace the compression level
pub fn set_compression_level(
&mut self,
compression_level: CompressionLevel,
) -> CompressionLevel {
let old = self.compression_level;
self.compression_level = compression_level;
old
}
/// Get the current compression level
pub fn compression_level(&self) -> CompressionLevel {
self.compression_level
}
}
#[cfg(test)]
mod tests {
use alloc::vec;
use super::FrameCompressor;
use crate::common::MAGIC_NUM;
use crate::decoding::FrameDecoder;
use alloc::vec::Vec;
#[test]
fn frame_starts_with_magic_num() {
let mock_data = [1_u8, 2, 3].as_slice();
let mut output: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
compressor.set_source(mock_data);
compressor.set_drain(&mut output);
compressor.compress();
assert!(output.starts_with(&MAGIC_NUM.to_le_bytes()));
}
#[test]
fn very_simple_raw_compress() {
let mock_data = [1_u8, 2, 3].as_slice();
let mut output: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
compressor.set_source(mock_data);
compressor.set_drain(&mut output);
compressor.compress();
}
#[test]
fn very_simple_compress() {
let mut mock_data = vec![0; 1 << 17];
mock_data.extend(vec![1; (1 << 17) - 1]);
mock_data.extend(vec![2; (1 << 18) - 1]);
mock_data.extend(vec![2; 1 << 17]);
mock_data.extend(vec![3; (1 << 17) - 1]);
let mut output: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
compressor.set_source(mock_data.as_slice());
compressor.set_drain(&mut output);
compressor.compress();
let mut decoder = FrameDecoder::new();
let mut decoded = Vec::with_capacity(mock_data.len());
decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
assert_eq!(mock_data, decoded);
let mut decoded = Vec::new();
zstd::stream::copy_decode(output.as_slice(), &mut decoded).unwrap();
assert_eq!(mock_data, decoded);
}
#[test]
fn rle_compress() {
let mock_data = vec![0; 1 << 19];
let mut output: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
compressor.set_source(mock_data.as_slice());
compressor.set_drain(&mut output);
compressor.compress();
let mut decoder = FrameDecoder::new();
let mut decoded = Vec::with_capacity(mock_data.len());
decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
assert_eq!(mock_data, decoded);
}
#[test]
fn aaa_compress() {
let mock_data = vec![0, 1, 3, 4, 5];
let mut output: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
compressor.set_source(mock_data.as_slice());
compressor.set_drain(&mut output);
compressor.compress();
let mut decoder = FrameDecoder::new();
let mut decoded = Vec::with_capacity(mock_data.len());
decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
assert_eq!(mock_data, decoded);
let mut decoded = Vec::new();
zstd::stream::copy_decode(output.as_slice(), &mut decoded).unwrap();
assert_eq!(mock_data, decoded);
}
#[cfg(feature = "std")]
#[test]
fn fuzz_targets() {
use std::io::Read;
fn decode_ruzstd(data: &mut dyn std::io::Read) -> Vec<u8> {
let mut decoder = crate::decoding::StreamingDecoder::new(data).unwrap();
let mut result: Vec<u8> = Vec::new();
decoder.read_to_end(&mut result).expect("Decoding failed");
result
}
fn decode_ruzstd_writer(mut data: impl Read) -> Vec<u8> {
let mut decoder = crate::decoding::FrameDecoder::new();
decoder.reset(&mut data).unwrap();
let mut result = vec![];
while !decoder.is_finished() || decoder.can_collect() > 0 {
decoder
.decode_blocks(
&mut data,
crate::decoding::BlockDecodingStrategy::UptoBytes(1024 * 1024),
)
.unwrap();
decoder.collect_to_writer(&mut result).unwrap();
}
result
}
fn encode_zstd(data: &[u8]) -> Result<Vec<u8>, std::io::Error> {
zstd::stream::encode_all(std::io::Cursor::new(data), 3)
}
fn encode_ruzstd_uncompressed(data: &mut dyn std::io::Read) -> Vec<u8> {
let mut input = Vec::new();
data.read_to_end(&mut input).unwrap();
crate::encoding::compress_to_vec(
input.as_slice(),
crate::encoding::CompressionLevel::Uncompressed,
)
}
fn encode_ruzstd_compressed(data: &mut dyn std::io::Read) -> Vec<u8> {
let mut input = Vec::new();
data.read_to_end(&mut input).unwrap();
crate::encoding::compress_to_vec(
input.as_slice(),
crate::encoding::CompressionLevel::Fastest,
)
}
fn decode_zstd(data: &[u8]) -> Result<Vec<u8>, std::io::Error> {
let mut output = Vec::new();
zstd::stream::copy_decode(data, &mut output)?;
Ok(output)
}
if std::fs::exists("fuzz/artifacts/interop").unwrap_or(false) {
for file in std::fs::read_dir("fuzz/artifacts/interop").unwrap() {
if file.as_ref().unwrap().file_type().unwrap().is_file() {
let data = std::fs::read(file.unwrap().path()).unwrap();
let data = data.as_slice();
// Decoding
let compressed = encode_zstd(data).unwrap();
let decoded = decode_ruzstd(&mut compressed.as_slice());
let decoded2 = decode_ruzstd_writer(&mut compressed.as_slice());
assert!(
decoded == data,
"Decoded data did not match the original input during decompression"
);
assert_eq!(
decoded2, data,
"Decoded data did not match the original input during decompression"
);
// Encoding
// Uncompressed encoding
let mut input = data;
let compressed = encode_ruzstd_uncompressed(&mut input);
let decoded = decode_zstd(&compressed).unwrap();
assert_eq!(
decoded, data,
"Decoded data did not match the original input during compression"
);
// Compressed encoding
let mut input = data;
let compressed = encode_ruzstd_compressed(&mut input);
let decoded = decode_zstd(&compressed).unwrap();
assert_eq!(
decoded, data,
"Decoded data did not match the original input during compression"
);
}
}
}
}
}

View File

@@ -0,0 +1,231 @@
//! Utilities and representations for a frame header.
use crate::bit_io::BitWriter;
use crate::common::MAGIC_NUM;
use crate::encoding::util::{find_min_size, minify_val};
use alloc::vec::Vec;
/// A header for a single Zstandard frame.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header>
#[derive(Debug)]
pub struct FrameHeader {
/// Optionally, the original (uncompressed) size of the data within the frame in bytes.
/// If not present, `window_size` must be set.
pub frame_content_size: Option<u64>,
/// If set to true, data must be regenerated within a single
/// continuous memory segment.
pub single_segment: bool,
/// If set to true, a 32 bit content checksum will be present
/// at the end of the frame.
pub content_checksum: bool,
/// If a dictionary ID is provided, the ID of that dictionary.
pub dictionary_id: Option<u64>,
/// The minimum memory buffer required to decompress a frame. If not present,
/// `single_segment` will be set to true. If present, this value must be at least 1 KB
/// and less than 3.75 TB. Encoders should not generate a frame that requires a window size
/// larger than 8 MB.
pub window_size: Option<u64>,
}
impl FrameHeader {
/// Writes the serialized frame header into the provided buffer.
///
/// The serialized header *does include* the frame header descriptor.
pub fn serialize(self, output: &mut Vec<u8>) {
vprintln!("Serializing frame with header: {self:?}");
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header
// Magic Number:
output.extend_from_slice(&MAGIC_NUM.to_le_bytes());
// `Frame_Header_Descriptor`:
output.push(self.descriptor());
// `Window_Descriptor`:
// TODO: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
if !self.single_segment {
if let Some(window_size) = self.window_size {
let log = window_size.next_power_of_two().ilog2();
let exponent = if log > 10 { log - 10 } else { 1 } as u8;
output.push(exponent << 3);
}
}
if let Some(id) = self.dictionary_id {
output.extend(minify_val(id));
}
if let Some(frame_content_size) = self.frame_content_size {
output.extend(minify_val_fcs(frame_content_size));
}
}
/// Generate a serialized frame header descriptor for the frame header.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header_descriptor>
fn descriptor(&self) -> u8 {
let mut bw = BitWriter::new();
// A frame header starts with a frame header descriptor.
// It describes what other fields are present
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header_descriptor
// Writing the frame header descriptor:
// `Frame_Content_Size_flag`:
// The Frame_Content_Size_flag specifies if
// the Frame_Content_Size field is provided within the header.
// TODO: The Frame_Content_Size field isn't always set; we should prefer to always include it.
// If the `Single_Segment_flag` is set and this value is zero,
// the size of the FCS field is 1 byte.
// Otherwise, the FCS field is omitted.
// | Value | Size of field (Bytes)
// | 0 | 0 or 1
// | 1 | 2
// | 2 | 4
// | 3 | 8
// `Dictionary_ID_flag`:
if let Some(id) = self.dictionary_id {
let flag_value: u8 = match find_min_size(id) {
0 => 0,
1 => 1,
2 => 2,
4 => 3,
_ => panic!(),
};
bw.write_bits(flag_value, 2);
} else {
// A `Dictionary_ID` was not provided
bw.write_bits(0u8, 2);
}
// `Content_Checksum_flag`:
if self.content_checksum {
bw.write_bits(1u8, 1);
} else {
bw.write_bits(0u8, 1);
}
// `Reserved_bit`:
// This value must be zero
bw.write_bits(0u8, 1);
// `Unused_bit`:
// An encoder compliant with this spec must set this bit to zero
bw.write_bits(0u8, 1);
// `Single_Segment_flag`:
// If this flag is set, data must be regenerated within a single continuous memory segment,
// and the `Frame_Content_Size` field must be present in the header.
// If this flag is not set, the `Window_Descriptor` field must be present in the frame header.
if self.single_segment {
assert!(self.frame_content_size.is_some(), "if the `single_segment` flag is set to true, then a frame content size must be provided");
bw.write_bits(1u8, 1);
} else {
assert!(
self.window_size.is_some(),
"if the `single_segment` flag is set to false, then a window size must be provided"
);
bw.write_bits(0u8, 1);
}
if let Some(frame_content_size) = self.frame_content_size {
let field_size = find_min_size(frame_content_size);
let flag_value: u8 = match field_size {
1 => 0,
2 => 1,
4 => 2,
8 => 3,
_ => panic!(),
};
bw.write_bits(flag_value, 2);
} else {
// `Frame_Content_Size` was not provided
bw.write_bits(0u8, 2);
}
bw.dump()[0]
}
}
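// Editorial worked example (not from the vendored source): for a header with
// frame_content_size = Some(1), single_segment = true, no checksum and no
// dictionary, the bits above are written LSB-first:
//   Dictionary_ID_flag = 0 (bits 0-1), Content_Checksum_flag = 0 (bit 2),
//   Reserved_bit = 0 (bit 3), Unused_bit = 0 (bit 4),
//   Single_Segment_flag = 1 (bit 5), Frame_Content_Size_flag = 0 (bits 6-7),
// yielding the descriptor byte 0b0010_0000 (0x20).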
/// Identical to [`minify_val`], but it implements the following edge case:
///
/// > When FCS_Field_Size is 1, 4 or 8 bytes, the value is read directly. When FCS_Field_Size is 2, the offset of 256 is added.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_content_size>
fn minify_val_fcs(val: u64) -> Vec<u8> {
let new_size = find_min_size(val);
let mut val = val;
if new_size == 2 {
val -= 256;
}
val.to_le_bytes()[0..new_size].to_vec()
}
#[cfg(test)]
mod tests {
use super::FrameHeader;
use crate::decoding::frame::{read_frame_header, FrameDescriptor};
use alloc::vec::Vec;
#[test]
fn frame_header_descriptor_decode() {
let header = FrameHeader {
frame_content_size: Some(1),
single_segment: true,
content_checksum: false,
dictionary_id: None,
window_size: None,
};
let descriptor = header.descriptor();
let decoded_descriptor = FrameDescriptor(descriptor);
assert_eq!(decoded_descriptor.frame_content_size_bytes().unwrap(), 1);
assert!(!decoded_descriptor.content_checksum_flag());
assert_eq!(decoded_descriptor.dictionary_id_bytes().unwrap(), 0);
}
#[test]
fn frame_header_decode() {
let header = FrameHeader {
frame_content_size: Some(1),
single_segment: true,
content_checksum: false,
dictionary_id: None,
window_size: None,
};
let mut serialized_header = Vec::new();
header.serialize(&mut serialized_header);
let parsed_header = read_frame_header(serialized_header.as_slice()).unwrap().0;
assert!(parsed_header.dictionary_id().is_none());
assert_eq!(parsed_header.frame_content_size(), 1);
}
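// Editorial sketch (not part of the vendored test suite): exercises the
// FCS_Field_Size == 2 offset rule implemented by `minify_val_fcs`. A frame
// content size of 300 serializes as 300 - 256 = 44 in two little-endian bytes.
#[test]
fn fcs_two_byte_offset() {
assert_eq!(super::minify_val_fcs(300), alloc::vec![44, 0]);
}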
#[test]
#[should_panic]
fn catches_single_segment_no_fcs() {
let header = FrameHeader {
frame_content_size: None,
single_segment: true,
content_checksum: false,
dictionary_id: None,
window_size: Some(1),
};
let mut serialized_header = Vec::new();
header.serialize(&mut serialized_header);
}
#[test]
#[should_panic]
fn catches_no_single_segment_no_winsize() {
let header = FrameHeader {
frame_content_size: Some(7),
single_segment: false,
content_checksum: false,
dictionary_id: None,
window_size: None,
};
let mut serialized_header = Vec::new();
header.serialize(&mut serialized_header);
}
}

View File

@@ -0,0 +1,67 @@
use crate::{
common::MAX_BLOCK_SIZE,
encoding::{
block_header::BlockHeader, blocks::compress_block, frame_compressor::CompressState, Matcher,
},
};
use alloc::vec::Vec;
/// Compresses a single block at [`crate::encoding::CompressionLevel::Fastest`].
///
/// # Parameters
/// - `state`: [`CompressState`] so the compressor can refer to data before
/// the start of this block
/// - `last_block`: Whether or not this block is going to be the last block in the frame
/// (needed because this info is written into the block header)
/// - `uncompressed_data`: A block's worth of uncompressed data, taken from the
/// larger input
/// - `output`: As `uncompressed_data` is compressed, it's appended to `output`.
#[inline]
pub fn compress_fastest<M: Matcher>(
state: &mut CompressState<M>,
last_block: bool,
uncompressed_data: Vec<u8>,
output: &mut Vec<u8>,
) {
let block_size = uncompressed_data.len() as u32;
// First check to see if run length encoding can be used for the entire block
if uncompressed_data.iter().all(|x| uncompressed_data[0].eq(x)) {
let rle_byte = uncompressed_data[0];
state.matcher.commit_space(uncompressed_data);
state.matcher.skip_matching();
let header = BlockHeader {
last_block,
block_type: crate::blocks::block::BlockType::RLE,
block_size,
};
// Write the header, then the block
header.serialize(output);
output.push(rle_byte);
} else {
// Compress as a standard compressed block
let mut compressed = Vec::new();
state.matcher.commit_space(uncompressed_data);
compress_block(state, &mut compressed);
// If the compressed data is larger than the maximum
// allowable block size, instead store uncompressed
if compressed.len() >= MAX_BLOCK_SIZE as usize {
let header = BlockHeader {
last_block,
block_type: crate::blocks::block::BlockType::Raw,
block_size,
};
// Write the header, then the block
header.serialize(output);
output.extend_from_slice(state.matcher.get_last_space());
} else {
let header = BlockHeader {
last_block,
block_type: crate::blocks::block::BlockType::Compressed,
block_size: compressed.len() as u32,
};
// Write the header, then the block
header.serialize(output);
output.extend(compressed);
}
}
}
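// Editorial note (illustrative, not from the vendored source): with this scheme
// a run of 4096 identical bytes costs 4 bytes on the wire -- the 3-byte block
// header (last_block, BlockType::RLE, block_size = 4096) followed by the single
// repeated byte.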

View File

@@ -0,0 +1,2 @@
mod fastest;
pub use fastest::compress_fastest;

View File

@@ -0,0 +1,619 @@
//! Matching algorithm used to find repeated parts in the original data
//!
//! The Zstd format relies on finding repeated sequences of data and compressing these sequences as instructions to the decoder.
//! A sequence basically tells the decoder "Go back X bytes and copy Y bytes to the end of your decode buffer".
//!
//! The task here is to efficiently find matches in the already encoded data for the current suffix of the not yet encoded data.
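//!
//! For example, for the input `abcabcabc` a matcher might emit
//! `Triple { literals: b"abc", offset: 3, match_len: 6 }`: the decoder copies the
//! literals first, then copies 6 bytes starting 3 bytes back in its own output
//! (an overlapping copy), reconstructing the full input.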
use alloc::vec::Vec;
use core::num::NonZeroUsize;
use super::CompressionLevel;
use super::Matcher;
use super::Sequence;
const MIN_MATCH_LEN: usize = 5;
/// Takes care of allocating and reusing vecs
pub struct MatchGeneratorDriver {
vec_pool: Vec<Vec<u8>>,
suffix_pool: Vec<SuffixStore>,
match_generator: MatchGenerator,
slice_size: usize,
}
impl MatchGeneratorDriver {
/// `slice_size` sets how big the slices are that get allocated to work with
/// `max_slices_in_window` caps how many slices are used at once while looking for matches
pub(crate) fn new(slice_size: usize, max_slices_in_window: usize) -> Self {
Self {
vec_pool: Vec::new(),
suffix_pool: Vec::new(),
match_generator: MatchGenerator::new(max_slices_in_window * slice_size),
slice_size,
}
}
}
impl Matcher for MatchGeneratorDriver {
fn reset(&mut self, _level: CompressionLevel) {
let vec_pool = &mut self.vec_pool;
let suffix_pool = &mut self.suffix_pool;
self.match_generator.reset(|mut data, mut suffixes| {
data.resize(data.capacity(), 0);
vec_pool.push(data);
suffixes.slots.clear();
suffixes.slots.resize(suffixes.slots.capacity(), None);
suffix_pool.push(suffixes);
});
}
fn window_size(&self) -> u64 {
self.match_generator.max_window_size as u64
}
fn get_next_space(&mut self) -> Vec<u8> {
self.vec_pool.pop().unwrap_or_else(|| {
let mut space = alloc::vec![0; self.slice_size];
space.resize(space.capacity(), 0);
space
})
}
fn get_last_space(&mut self) -> &[u8] {
self.match_generator.window.last().unwrap().data.as_slice()
}
fn commit_space(&mut self, space: Vec<u8>) {
let vec_pool = &mut self.vec_pool;
let suffixes = self
.suffix_pool
.pop()
.unwrap_or_else(|| SuffixStore::with_capacity(space.len()));
let suffix_pool = &mut self.suffix_pool;
self.match_generator
.add_data(space, suffixes, |mut data, mut suffixes| {
data.resize(data.capacity(), 0);
vec_pool.push(data);
suffixes.slots.clear();
suffixes.slots.resize(suffixes.slots.capacity(), None);
suffix_pool.push(suffixes);
});
}
fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
while self.match_generator.next_sequence(&mut handle_sequence) {}
}
fn skip_matching(&mut self) {
self.match_generator.skip_matching();
}
}
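// Editorial note: the driver recycles buffers through `vec_pool` and
// `suffix_pool`, so once the pools are warm, committing new spaces and evicting
// old window entries does not allocate; `reset` returns every window entry to
// the pools for the next frame.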
/// Stores the index of a suffix by hashing the first few bytes of that suffix.
/// Collisions simply overwrite, so a match's validity must be checked after a `get`.
struct SuffixStore {
// We use NonZeroUsize to enable niche optimization here.
// On store we do +1 and on get -1
// This is ok since usize::MAX is never a valid offset
slots: Vec<Option<NonZeroUsize>>,
len_log: u32,
}
impl SuffixStore {
fn with_capacity(capacity: usize) -> Self {
Self {
slots: alloc::vec![None; capacity],
len_log: capacity.ilog2(),
}
}
#[inline(always)]
fn insert(&mut self, suffix: &[u8], idx: usize) {
let key = self.key(suffix);
self.slots[key] = Some(NonZeroUsize::new(idx + 1).unwrap());
}
#[inline(always)]
fn contains_key(&self, suffix: &[u8]) -> bool {
let key = self.key(suffix);
self.slots[key].is_some()
}
#[inline(always)]
fn get(&self, suffix: &[u8]) -> Option<usize> {
let key = self.key(suffix);
self.slots[key].map(|x| <NonZeroUsize as Into<usize>>::into(x) - 1)
}
#[inline(always)]
fn key(&self, suffix: &[u8]) -> usize {
let s0 = suffix[0] as u64;
let s1 = suffix[1] as u64;
let s2 = suffix[2] as u64;
let s3 = suffix[3] as u64;
let s4 = suffix[4] as u64;
const POLY: u64 = 0xCF3BCCDCABu64;
let s0 = (s0 << 24).wrapping_mul(POLY);
let s1 = (s1 << 32).wrapping_mul(POLY);
let s2 = (s2 << 40).wrapping_mul(POLY);
let s3 = (s3 << 48).wrapping_mul(POLY);
let s4 = (s4 << 56).wrapping_mul(POLY);
let index = s0 ^ s1 ^ s2 ^ s3 ^ s4;
let index = index >> (64 - self.len_log);
index as usize % self.slots.len()
}
}
/// We keep a window of a few of these entries
/// All of these are valid targets for a match to be generated for
struct WindowEntry {
data: Vec<u8>,
/// Stores indexes into data
suffixes: SuffixStore,
/// Makes offset calculations efficient
base_offset: usize,
}
pub(crate) struct MatchGenerator {
max_window_size: usize,
/// Data window we are operating on to find matches
/// The data we want to find matches for is in the last slice
window: Vec<WindowEntry>,
window_size: usize,
#[cfg(debug_assertions)]
concat_window: Vec<u8>,
/// Index in the last slice that we already processed
suffix_idx: usize,
/// Gets updated when a new sequence is returned to point right behind that sequence
last_idx_in_sequence: usize,
}
impl MatchGenerator {
/// max_size defines how many bytes will be used at most in the window used for matching
fn new(max_size: usize) -> Self {
Self {
max_window_size: max_size,
window: Vec::new(),
window_size: 0,
#[cfg(debug_assertions)]
concat_window: Vec::new(),
suffix_idx: 0,
last_idx_in_sequence: 0,
}
}
fn reset(&mut self, mut reuse_space: impl FnMut(Vec<u8>, SuffixStore)) {
self.window_size = 0;
#[cfg(debug_assertions)]
self.concat_window.clear();
self.suffix_idx = 0;
self.last_idx_in_sequence = 0;
self.window.drain(..).for_each(|entry| {
reuse_space(entry.data, entry.suffixes);
});
}
/// Processes bytes in the current window until either a match is found or no more matches can be found
/// * If a match is found, `handle_sequence` is called with the `Triple` variant
/// * If no more matches can be found but there are bytes still left, `handle_sequence` is called with the `Literals` variant
/// * If no more matches can be found and no more bytes are left, this returns false
fn next_sequence(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) -> bool {
loop {
let last_entry = self.window.last().unwrap();
let data_slice = &last_entry.data;
// We already reached the end of the window, check if we need to return a Literals{}
if self.suffix_idx >= data_slice.len() {
if self.last_idx_in_sequence != self.suffix_idx {
let literals = &data_slice[self.last_idx_in_sequence..];
self.last_idx_in_sequence = self.suffix_idx;
handle_sequence(Sequence::Literals { literals });
return true;
} else {
return false;
}
}
// If the remaining data is smaller than the minimum match length we can stop and return a Literals{}
let data_slice = &data_slice[self.suffix_idx..];
if data_slice.len() < MIN_MATCH_LEN {
let last_idx_in_sequence = self.last_idx_in_sequence;
self.last_idx_in_sequence = last_entry.data.len();
self.suffix_idx = last_entry.data.len();
handle_sequence(Sequence::Literals {
literals: &last_entry.data[last_idx_in_sequence..],
});
return true;
}
// This is the key we are looking to find a match for
let key = &data_slice[..MIN_MATCH_LEN];
// Look in each window entry
let mut candidate = None;
for (match_entry_idx, match_entry) in self.window.iter().enumerate() {
let is_last = match_entry_idx == self.window.len() - 1;
if let Some(match_index) = match_entry.suffixes.get(key) {
let match_slice = if is_last {
&match_entry.data[match_index..self.suffix_idx]
} else {
&match_entry.data[match_index..]
};
// Check how long the common prefix actually is
let match_len = Self::common_prefix_len(match_slice, data_slice);
// Collisions in the suffix store might make this check fail
if match_len >= MIN_MATCH_LEN {
let offset = match_entry.base_offset + self.suffix_idx - match_index;
// If we are in debug/tests make sure the match we found is actually at the offset we calculated
#[cfg(debug_assertions)]
{
let unprocessed = last_entry.data.len() - self.suffix_idx;
let start = self.concat_window.len() - unprocessed - offset;
let end = start + match_len;
let check_slice = &self.concat_window[start..end];
debug_assert_eq!(check_slice, &match_slice[..match_len]);
}
if let Some((old_offset, old_match_len)) = candidate {
if match_len > old_match_len
|| (match_len == old_match_len && offset < old_offset)
{
candidate = Some((offset, match_len));
}
} else {
candidate = Some((offset, match_len));
}
}
}
}
if let Some((offset, match_len)) = candidate {
// For each index in the match we found we do not need to look for another match
// But we still want them registered in the suffix store
self.add_suffixes_till(self.suffix_idx + match_len);
// All literals that were not included between this match and the last are now included here
let last_entry = self.window.last().unwrap();
let literals = &last_entry.data[self.last_idx_in_sequence..self.suffix_idx];
// Update the indexes; all indexes up to and including the current one are now covered by a sequence
self.suffix_idx += match_len;
self.last_idx_in_sequence = self.suffix_idx;
handle_sequence(Sequence::Triple {
literals,
offset,
match_len,
});
return true;
}
let last_entry = self.window.last_mut().unwrap();
let key = &last_entry.data[self.suffix_idx..self.suffix_idx + MIN_MATCH_LEN];
if !last_entry.suffixes.contains_key(key) {
last_entry.suffixes.insert(key, self.suffix_idx);
}
self.suffix_idx += 1;
}
}
/// Find the common prefix length between two byte slices
#[inline(always)]
fn common_prefix_len(a: &[u8], b: &[u8]) -> usize {
Self::mismatch_chunks::<8>(a, b)
}
/// Find the common prefix length between two byte slices with a configurable chunk length
/// This enables vectorization optimizations
fn mismatch_chunks<const N: usize>(xs: &[u8], ys: &[u8]) -> usize {
let off = core::iter::zip(xs.chunks_exact(N), ys.chunks_exact(N))
.take_while(|(x, y)| x == y)
.count()
* N;
off + core::iter::zip(&xs[off..], &ys[off..])
.take_while(|(x, y)| x == y)
.count()
}
/// Process bytes and add the suffixes to the suffix store up to a specific index
#[inline(always)]
fn add_suffixes_till(&mut self, idx: usize) {
let last_entry = self.window.last_mut().unwrap();
if last_entry.data.len() < MIN_MATCH_LEN {
return;
}
let slice = &last_entry.data[self.suffix_idx..idx];
for (key_index, key) in slice.windows(MIN_MATCH_LEN).enumerate() {
if !last_entry.suffixes.contains_key(key) {
last_entry.suffixes.insert(key, self.suffix_idx + key_index);
}
}
}
/// Skip matching for the whole current window entry
fn skip_matching(&mut self) {
let len = self.window.last().unwrap().data.len();
self.add_suffixes_till(len);
self.suffix_idx = len;
self.last_idx_in_sequence = len;
}
/// Add a new window entry. Will panic if the last window entry hasn't been processed properly.
/// If any resources are released by pushing the new entry they are returned via the callback
fn add_data(
&mut self,
data: Vec<u8>,
suffixes: SuffixStore,
reuse_space: impl FnMut(Vec<u8>, SuffixStore),
) {
assert!(
self.window.is_empty() || self.suffix_idx == self.window.last().unwrap().data.len()
);
self.reserve(data.len(), reuse_space);
#[cfg(debug_assertions)]
self.concat_window.extend_from_slice(&data);
if let Some(last_len) = self.window.last().map(|last| last.data.len()) {
for entry in self.window.iter_mut() {
entry.base_offset += last_len;
}
}
let len = data.len();
self.window.push(WindowEntry {
data,
suffixes,
base_offset: 0,
});
self.window_size += len;
self.suffix_idx = 0;
self.last_idx_in_sequence = 0;
}
/// Reserve space for a new window entry
/// If any resources are released by pushing the new entry they are returned via the callback
fn reserve(&mut self, amount: usize, mut reuse_space: impl FnMut(Vec<u8>, SuffixStore)) {
assert!(self.max_window_size >= amount);
while self.window_size + amount > self.max_window_size {
let removed = self.window.remove(0);
self.window_size -= removed.data.len();
#[cfg(debug_assertions)]
self.concat_window.drain(0..removed.data.len());
let WindowEntry {
suffixes,
data: leaked_vec,
base_offset: _,
} = removed;
reuse_space(leaked_vec, suffixes);
}
}
}
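// Editorial sketch (not part of the vendored test suite): demonstrates the
// chunked prefix scan performed by `common_prefix_len` / `mismatch_chunks`.
#[test]
fn common_prefix_len_example() {
// one full 8-byte chunk matches, then the per-byte tail stops at b'1' vs b'2'
assert_eq!(MatchGenerator::common_prefix_len(b"abcdefgh1", b"abcdefgh2"), 8);
// no shared prefix at all
assert_eq!(MatchGenerator::common_prefix_len(b"abc", b"xyz"), 0);
}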
#[test]
fn matches() {
let mut matcher = MatchGenerator::new(1000);
let mut original_data = Vec::new();
let mut reconstructed = Vec::new();
let assert_seq_equal = |seq1: Sequence<'_>, seq2: Sequence<'_>, reconstructed: &mut Vec<u8>| {
assert_eq!(seq1, seq2);
match seq2 {
Sequence::Literals { literals } => reconstructed.extend_from_slice(literals),
Sequence::Triple {
literals,
offset,
match_len,
} => {
reconstructed.extend_from_slice(literals);
let start = reconstructed.len() - offset;
let end = start + match_len;
reconstructed.extend_from_within(start..end);
}
}
};
matcher.add_data(
alloc::vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[0, 0, 0, 0, 0],
offset: 5,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[
1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,
]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[1, 2, 3, 4, 5, 6],
offset: 6,
match_len: 6,
},
&mut reconstructed,
)
});
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 12,
match_len: 6,
},
&mut reconstructed,
)
});
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 28,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 23,
match_len: 6,
},
&mut reconstructed,
)
});
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[7, 8, 9, 10, 11],
offset: 16,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![0, 0, 0, 0, 0],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[0, 0, 0, 0, 0]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 5,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![7, 8, 9, 10, 11],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[7, 8, 9, 10, 11]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 15,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![1, 3, 5, 7, 9],
SuffixStore::with_capacity(100),
|_, _| {},
);
matcher.skip_matching();
original_data.extend_from_slice(&[1, 3, 5, 7, 9]);
reconstructed.extend_from_slice(&[1, 3, 5, 7, 9]);
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![1, 3, 5, 7, 9],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[1, 3, 5, 7, 9]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 5,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[0, 0, 11, 13, 15, 17, 20],
offset: 5,
match_len: 5,
},
&mut reconstructed,
)
});
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Literals {
literals: &[21, 23],
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
assert_eq!(reconstructed, original_data);
}

118
vendor/ruzstd/src/encoding/mod.rs vendored Normal file
View File

@@ -0,0 +1,118 @@
//! Structures and utilities used for compressing/encoding data into the Zstd format.
pub(crate) mod block_header;
pub(crate) mod blocks;
pub(crate) mod frame_header;
pub(crate) mod match_generator;
pub(crate) mod util;
mod frame_compressor;
mod levels;
pub use frame_compressor::FrameCompressor;
use crate::io::{Read, Write};
use alloc::vec::Vec;
/// Convenience function to compress some source into a target without reusing any resources of the compressor
/// ```rust
/// use ruzstd::encoding::{compress, CompressionLevel};
/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
/// let mut target = Vec::new();
/// compress(data, &mut target, CompressionLevel::Fastest);
/// ```
pub fn compress<R: Read, W: Write>(source: R, target: W, level: CompressionLevel) {
let mut frame_enc = FrameCompressor::new(level);
frame_enc.set_source(source);
frame_enc.set_drain(target);
frame_enc.compress();
}
/// Convenience function to compress some source into a Vec without reusing any resources of the compressor
/// ```rust
/// use ruzstd::encoding::{compress_to_vec, CompressionLevel};
/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
/// let compressed = compress_to_vec(data, CompressionLevel::Fastest);
/// ```
pub fn compress_to_vec<R: Read>(source: R, level: CompressionLevel) -> Vec<u8> {
let mut vec = Vec::new();
compress(source, &mut vec, level);
vec
}
/// The compression level trades speed against compression ratio:
/// faster levels result in worse ratios, slower levels in better ones.
#[derive(Copy, Clone)]
pub enum CompressionLevel {
/// This level does not compress the data at all, and simply wraps
/// it in a Zstandard frame.
Uncompressed,
/// This level is roughly equivalent to Zstd compression level 1
Fastest,
/// This level is roughly equivalent to Zstd level 3,
/// or the one used by the official compressor when no level
/// is specified.
///
/// UNIMPLEMENTED
Default,
/// This level is roughly equivalent to Zstd level 7.
///
/// UNIMPLEMENTED
Better,
/// This level is roughly equivalent to Zstd level 11.
///
/// UNIMPLEMENTED
Best,
}
/// Trait used by the encoder that users can use to extend the matching facilities with their own algorithm
/// making their own tradeoffs between runtime, memory usage and compression ratio
///
/// This trait operates on buffers that represent the chunks of data the matching algorithm wants to work on.
/// Each one of these buffers is referred to as a *space*. One or more of these buffers represent the window
/// the decoder will need to decode the data again.
///
/// This library asks the Matcher for a new buffer using `get_next_space` to allow reuse of allocated buffers once they are no longer part of the
/// window of data that is being used for matching.
///
/// The library fills the buffer with data that is to be compressed and commits them back to the matcher using `commit_space`.
///
/// Then it will either call `start_matching` or, if the space is deemed not worth compressing, `skip_matching` is called.
///
/// This is repeated until no more data is left to be compressed.
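/// A minimal sketch of that driving loop (hypothetical and simplified; the real
/// frame compressor adds block splitting and error handling):
/// ```no_run
/// use ruzstd::encoding::Matcher;
/// fn drive(matcher: &mut impl Matcher, input: &[u8]) {
///     let mut space = matcher.get_next_space();
///     let len = input.len().min(space.len());
///     space.truncate(len);
///     space.copy_from_slice(&input[..len]);
///     matcher.commit_space(space);
///     matcher.start_matching(|seq| {
///         let _ = seq; // a block encoder would serialize each `Sequence` here
///     });
/// }
/// ```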
pub trait Matcher {
/// Get a space where we can put data to be matched on. Will be encoded as one block. The maximum allowed size is 128 kB.
fn get_next_space(&mut self) -> alloc::vec::Vec<u8>;
/// Get a reference to the last committed space
fn get_last_space(&mut self) -> &[u8];
/// Commit a space to the matcher so it can be matched against
fn commit_space(&mut self, space: alloc::vec::Vec<u8>);
/// Just process the data in the last committed space for future matching
fn skip_matching(&mut self);
/// Process the data in the last committed space for future matching AND generate matches for the data
fn start_matching(&mut self, handle_sequence: impl for<'a> FnMut(Sequence<'a>));
/// Reset this matcher so it can be used for the next new frame
fn reset(&mut self, level: CompressionLevel);
/// The size of the window the decoder will need to execute all sequences produced by this matcher
///
/// May change after a call to reset with a different compression level
fn window_size(&self) -> u64;
}
#[derive(PartialEq, Eq, Debug)]
/// Sequences that a [`Matcher`] can produce
pub enum Sequence<'data> {
/// Is encoded as a sequence for the decoder sequence execution.
///
/// First the literals will be copied to the decoded data,
/// then `match_len` bytes are copied from `offset` bytes back in the buffer
Triple {
literals: &'data [u8],
offset: usize,
match_len: usize,
},
/// This is returned as the last sequence in a block
///
/// These literals will just be copied at the end of the sequence execution by the decoder
Literals { literals: &'data [u8] },
}

60
vendor/ruzstd/src/encoding/util.rs vendored Normal file
View File

@@ -0,0 +1,60 @@
use alloc::vec::Vec;
/// Returns the minimum number of bytes needed to represent this value, as
/// either 1, 2, 4, or 8 bytes. A value of 0 will still return one byte.
///
/// Used for variable length fields like `Dictionary_ID` or `Frame_Content_Size`.
pub fn find_min_size(val: u64) -> usize {
if val == 0 {
return 1;
}
if val >> 8 == 0 {
return 1;
}
if val >> 16 == 0 {
return 2;
}
if val >> 32 == 0 {
return 4;
}
8
}
/// Returns the same value, but represented using the smallest number of bytes needed.
/// Returned vector will be 1, 2, 4, or 8 bytes in length. Zero is represented as 1 byte.
///
/// Operates in **little-endian**.
pub fn minify_val(val: u64) -> Vec<u8> {
let new_size = find_min_size(val);
val.to_le_bytes()[0..new_size].to_vec()
}
#[cfg(test)]
mod tests {
use super::find_min_size;
use super::minify_val;
use alloc::vec;
#[test]
fn min_size_detection() {
assert_eq!(find_min_size(0), 1);
assert_eq!(find_min_size(0xff), 1);
assert_eq!(find_min_size(0xff_ff), 2);
assert_eq!(find_min_size(0x00_ff_ff_ff), 4);
assert_eq!(find_min_size(0xff_ff_ff_ff), 4);
assert_eq!(find_min_size(0x00ff_ffff_ffff_ffff), 8);
assert_eq!(find_min_size(0xffff_ffff_ffff_ffff), 8);
}
#[test]
fn bytes_minified() {
assert_eq!(minify_val(0), vec![0]);
assert_eq!(minify_val(0xff), vec![0xff]);
assert_eq!(minify_val(0xff_ff), vec![0xff, 0xff]);
assert_eq!(minify_val(0xff_ff_ff_ff), vec![0xff, 0xff, 0xff, 0xff]);
assert_eq!(
minify_val(0xffff_ffff_ffff_ffff),
vec![0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff]
);
}
}

366
vendor/ruzstd/src/fse/fse_decoder.rs vendored Normal file
View File

@@ -0,0 +1,366 @@
use crate::bit_io::{BitReader, BitReaderReversed};
use crate::decoding::errors::{FSEDecoderError, FSETableError};
use alloc::vec::Vec;
pub struct FSEDecoder<'table> {
/// An FSE state value represents an index in the FSE table.
pub state: Entry,
/// A reference to the table used for decoding.
table: &'table FSETable,
}
impl<'t> FSEDecoder<'t> {
/// Initialize a new Finite State Entropy decoder.
pub fn new(table: &'t FSETable) -> FSEDecoder<'t> {
FSEDecoder {
state: table.decode.first().copied().unwrap_or(Entry {
base_line: 0,
num_bits: 0,
symbol: 0,
}),
table,
}
}
/// Returns the byte associated with the symbol the internal cursor is pointing at.
pub fn decode_symbol(&self) -> u8 {
self.state.symbol
}
/// Initialize internal state and prepare for decoding. After this, `decode_symbol` can be called
/// to read the first symbol and `update_state` can be called to prepare to read the next symbol.
pub fn init_state(&mut self, bits: &mut BitReaderReversed<'_>) -> Result<(), FSEDecoderError> {
if self.table.accuracy_log == 0 {
return Err(FSEDecoderError::TableIsUninitialized);
}
let new_state = bits.get_bits(self.table.accuracy_log);
self.state = self.table.decode[new_state as usize];
Ok(())
}
/// Advance the internal state to decode the next symbol in the bitstream.
pub fn update_state(&mut self, bits: &mut BitReaderReversed<'_>) {
let num_bits = self.state.num_bits;
let add = bits.get_bits(num_bits);
let base_line = self.state.base_line;
let new_state = base_line + add as u32;
self.state = self.table.decode[new_state as usize];
//println!("Update: {}, {} -> {}", base_line, add, self.state);
}
}
/// FSE decoding involves a decoding table that describes the probabilities of
/// all literals from 0 to the highest present one
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#fse-table-description>
#[derive(Debug, Clone)]
pub struct FSETable {
/// The maximum symbol in the table (inclusive). Limits the probabilities length to max_symbol + 1.
max_symbol: u8,
/// The actual table containing the decoded symbol and the compression data
/// connected to that symbol.
pub decode: Vec<Entry>, //used to decode symbols, and calculate the next state
/// The size of the table is stored in logarithm base 2 format,
/// with the **size of the table** being equal to `(1 << accuracy_log)`.
/// This value is used so that the decoder knows how many bits to read from the bitstream.
pub accuracy_log: u8,
/// In this context, probability refers to the likelihood that a symbol occurs in the given data.
/// Given this info, the encoder can assign shorter codes to symbols that appear more often,
/// and longer codes to symbols that appear less often; the decoder can then use the same
/// probabilities to determine which code was assigned to which symbol.
///
/// The probability of a single symbol is a value representing the proportion of times the symbol
/// would fall within the data.
///
/// If a symbol probability is set to `-1`, it means that the probability of a symbol
/// occurring in the data is less than one.
pub symbol_probabilities: Vec<i32>, //used while building the decode Vector
/// The number of times each symbol occurs, the first entry corresponding to symbol 0x00,
/// the second to 0x01, and so on up to the highest possible symbol (255).
symbol_counter: Vec<u32>,
}
impl FSETable {
/// Initialize a new empty Finite State Entropy decoding table.
pub fn new(max_symbol: u8) -> FSETable {
FSETable {
max_symbol,
symbol_probabilities: Vec::with_capacity(256), //will never be more than 256 symbols because u8
symbol_counter: Vec::with_capacity(256), //will never be more than 256 symbols because u8
decode: Vec::new(), //depending on acc_log.
accuracy_log: 0,
}
}
/// Reset `self` and update `self`'s state to mirror the provided table.
pub fn reinit_from(&mut self, other: &Self) {
self.reset();
self.symbol_counter.extend_from_slice(&other.symbol_counter);
self.symbol_probabilities
.extend_from_slice(&other.symbol_probabilities);
self.decode.extend_from_slice(&other.decode);
self.accuracy_log = other.accuracy_log;
}
/// Empty the table and clear all internal state.
pub fn reset(&mut self) {
self.symbol_counter.clear();
self.symbol_probabilities.clear();
self.decode.clear();
self.accuracy_log = 0;
}
/// returns how many BYTEs (not bits) were read while building the decoder
pub fn build_decoder(&mut self, source: &[u8], max_log: u8) -> Result<usize, FSETableError> {
self.accuracy_log = 0;
let bytes_read = self.read_probabilities(source, max_log)?;
self.build_decoding_table()?;
Ok(bytes_read)
}
/// Given the provided accuracy log, build a decoding table from that log.
pub fn build_from_probabilities(
&mut self,
acc_log: u8,
probs: &[i32],
) -> Result<(), FSETableError> {
if acc_log == 0 {
return Err(FSETableError::AccLogIsZero);
}
self.symbol_probabilities = probs.to_vec();
self.accuracy_log = acc_log;
self.build_decoding_table()
}
/// Build the actual decoding table after probabilities have been read into the table.
/// After this function is called, the decoding process can begin.
fn build_decoding_table(&mut self) -> Result<(), FSETableError> {
if self.symbol_probabilities.len() > self.max_symbol as usize + 1 {
return Err(FSETableError::TooManySymbols {
got: self.symbol_probabilities.len(),
});
}
self.decode.clear();
let table_size = 1 << self.accuracy_log;
if self.decode.len() < table_size {
self.decode.reserve(table_size - self.decode.len());
}
//fill with dummy entries
self.decode.resize(
table_size,
Entry {
base_line: 0,
num_bits: 0,
symbol: 0,
},
);
let mut negative_idx = table_size; //tracks the lowest index already occupied by a negative-probability symbol; everything at or above it is taken
//first scan for all -1 probabilities and place them at the top of the table
for symbol in 0..self.symbol_probabilities.len() {
if self.symbol_probabilities[symbol] == -1 {
negative_idx -= 1;
let entry = &mut self.decode[negative_idx];
entry.symbol = symbol as u8;
entry.base_line = 0;
entry.num_bits = self.accuracy_log;
}
}
//then place in a semi-random order all of the other symbols
let mut position = 0;
for idx in 0..self.symbol_probabilities.len() {
let symbol = idx as u8;
if self.symbol_probabilities[idx] <= 0 {
continue;
}
//for each probability point the symbol gets one slot
let prob = self.symbol_probabilities[idx];
for _ in 0..prob {
let entry = &mut self.decode[position];
entry.symbol = symbol;
position = next_position(position, table_size);
while position >= negative_idx {
position = next_position(position, table_size);
//everything above negative_idx is already taken
}
}
}
// baselines and num_bits can only be calculated when all symbols have been spread
self.symbol_counter.clear();
self.symbol_counter
.resize(self.symbol_probabilities.len(), 0);
for idx in 0..negative_idx {
let entry = &mut self.decode[idx];
let symbol = entry.symbol;
let prob = self.symbol_probabilities[symbol as usize];
let symbol_count = self.symbol_counter[symbol as usize];
let (bl, nb) = calc_baseline_and_numbits(table_size as u32, prob as u32, symbol_count);
//println!("symbol: {:2}, table: {}, prob: {:3}, count: {:3}, bl: {:3}, nb: {:2}", symbol, table_size, prob, symbol_count, bl, nb);
assert!(nb <= self.accuracy_log);
self.symbol_counter[symbol as usize] += 1;
entry.base_line = bl;
entry.num_bits = nb;
}
Ok(())
}
/// Read the accuracy log and the probability table from the source and return the number of bytes
/// read. If the size of the table is larger than the provided `max_log`, return an error.
fn read_probabilities(&mut self, source: &[u8], max_log: u8) -> Result<usize, FSETableError> {
self.symbol_probabilities.clear(); //just clear, we will fill a probability for each entry anyways. No need to force new allocs here
let mut br = BitReader::new(source);
self.accuracy_log = ACC_LOG_OFFSET + (br.get_bits(4)? as u8);
if self.accuracy_log > max_log {
return Err(FSETableError::AccLogTooBig {
got: self.accuracy_log,
max: max_log,
});
}
if self.accuracy_log == 0 {
return Err(FSETableError::AccLogIsZero);
}
let probability_sum = 1 << self.accuracy_log;
let mut probability_counter = 0;
while probability_counter < probability_sum {
let max_remaining_value = probability_sum - probability_counter + 1;
let bits_to_read = highest_bit_set(max_remaining_value);
let unchecked_value = br.get_bits(bits_to_read as usize)? as u32;
let low_threshold = ((1 << bits_to_read) - 1) - (max_remaining_value);
let mask = (1 << (bits_to_read - 1)) - 1;
let small_value = unchecked_value & mask;
let value = if small_value < low_threshold {
br.return_bits(1);
small_value
} else if unchecked_value > mask {
unchecked_value - low_threshold
} else {
unchecked_value
};
//println!("{}, {}, {}", self.symbol_probablilities.len(), unchecked_value, value);
let prob = (value as i32) - 1;
self.symbol_probabilities.push(prob);
if prob != 0 {
if prob > 0 {
probability_counter += prob as u32;
} else {
// probability -1 counts as 1
assert!(prob == -1);
probability_counter += 1;
}
} else {
//fast skip further zero probabilities
loop {
let skip_amount = br.get_bits(2)? as usize;
self.symbol_probabilities
.resize(self.symbol_probabilities.len() + skip_amount, 0);
if skip_amount != 3 {
break;
}
}
}
}
if probability_counter != probability_sum {
return Err(FSETableError::ProbabilityCounterMismatch {
got: probability_counter,
expected_sum: probability_sum,
symbol_probabilities: self.symbol_probabilities.clone(),
});
}
if self.symbol_probabilities.len() > self.max_symbol as usize + 1 {
return Err(FSETableError::TooManySymbols {
got: self.symbol_probabilities.len(),
});
}
let bytes_read = if br.bits_read() % 8 == 0 {
br.bits_read() / 8
} else {
(br.bits_read() / 8) + 1
};
Ok(bytes_read)
}
}
/// A single entry in an FSE table.
#[derive(Copy, Clone, Debug)]
pub struct Entry {
/// This value is used as an offset value, and it is added
/// to a value read from the stream to determine the next state value.
pub base_line: u32,
/// How many bits should be read from the stream when decoding this entry.
pub num_bits: u8,
/// The byte that should be put in the decode output when encountering this state.
pub symbol: u8,
}
/// This value is added to the first 4 bits of the stream to determine the
/// `Accuracy_Log`
const ACC_LOG_OFFSET: u8 = 5;
fn highest_bit_set(x: u32) -> u32 {
assert!(x > 0);
u32::BITS - x.leading_zeros()
}
//utility functions for building the decoding table from probabilities
/// Calculate the position of the next entry of the table given the current
/// position and size of the table.
fn next_position(mut p: usize, table_size: usize) -> usize {
p += (table_size >> 1) + (table_size >> 3) + 3;
p &= table_size - 1;
p
}
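// Editorial worked example: for a 64-entry table the step is 32 + 8 + 3 = 43,
// giving the visit order 0 -> 43 -> 22 -> 1 -> 44 -> ... Because the step is odd
// and the table size a power of two, every slot is visited exactly once before
// the walk cycles.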
fn calc_baseline_and_numbits(
num_states_total: u32,
num_states_symbol: u32,
state_number: u32,
) -> (u32, u8) {
if num_states_symbol == 0 {
return (0, 0);
}
let num_state_slices = if 1 << (highest_bit_set(num_states_symbol) - 1) == num_states_symbol {
num_states_symbol
} else {
1 << (highest_bit_set(num_states_symbol))
}; //always power of two
let num_double_width_state_slices = num_state_slices - num_states_symbol; //leftovers to the power of two need to be distributed
let num_single_width_state_slices = num_states_symbol - num_double_width_state_slices; //these will not receive a double width slice of states
let slice_width = num_states_total / num_state_slices; //size of a single width slice of states
let num_bits = highest_bit_set(slice_width) - 1; //number of bits needed to read for one slice
if state_number < num_double_width_state_slices {
let baseline = num_single_width_state_slices * slice_width + state_number * slice_width * 2;
(baseline, num_bits as u8 + 1)
} else {
let index_shifted = state_number - num_double_width_state_slices;
((index_shifted * slice_width), num_bits as u8)
}
}
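// Editorial worked example: with a 64-entry table and a symbol holding 10 states,
// num_state_slices rounds up to 16, so 6 states are double width (they read
// num_bits + 1 = 3 bits, baselines 16, 24, .., 56) and the remaining 4 are single
// width (2 bits, baselines 0, 4, 8, 12). Together they cover 6 * 8 + 4 * 4 = 64
// slots, i.e. the whole table.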

445
vendor/ruzstd/src/fse/fse_encoder.rs vendored Normal file
View File

@@ -0,0 +1,445 @@
use crate::bit_io::BitWriter;
use alloc::vec::Vec;
pub(crate) struct FSEEncoder<'output, V: AsMut<Vec<u8>>> {
pub(super) table: FSETable,
writer: &'output mut BitWriter<V>,
}
impl<V: AsMut<Vec<u8>>> FSEEncoder<'_, V> {
pub fn new(table: FSETable, writer: &mut BitWriter<V>) -> FSEEncoder<'_, V> {
FSEEncoder { table, writer }
}
#[cfg(any(test, feature = "fuzz_exports"))]
pub fn into_table(self) -> FSETable {
self.table
}
/// Encodes the data using the provided table
/// Writes
/// * Table description
/// * Encoded data
/// * Last state index
/// * Padding bits to fill up last byte
#[cfg(any(test, feature = "fuzz_exports"))]
pub fn encode(&mut self, data: &[u8]) {
self.write_table();
let mut state = self.table.start_state(data[data.len() - 1]);
for x in data[0..data.len() - 1].iter().rev().copied() {
let next = self.table.next_state(x, state.index);
let diff = state.index - next.baseline;
self.writer.write_bits(diff as u64, next.num_bits as usize);
state = next;
}
self.writer
.write_bits(state.index as u64, self.acc_log() as usize);
let bits_to_fill = self.writer.misaligned();
if bits_to_fill == 0 {
self.writer.write_bits(1u32, 8);
} else {
self.writer.write_bits(1u32, bits_to_fill);
}
}
/// Encodes the data using the provided table but with two interleaved streams
/// Writes
/// * Table description
/// * Encoded data with two interleaved states
/// * Both Last state indexes
/// * Padding bits to fill up last byte
pub fn encode_interleaved(&mut self, data: &[u8]) {
self.write_table();
let mut state_1 = self.table.start_state(data[data.len() - 1]);
let mut state_2 = self.table.start_state(data[data.len() - 2]);
// The first two symbols are represented by the start states
// Then encode the state transitions for two symbols at a time
let mut idx = data.len() - 4;
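// (Editorial note: this indexing assumes data.len() >= 4; a shorter input
// would underflow here.)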
loop {
{
let state = state_1;
let x = data[idx + 1];
let next = self.table.next_state(x, state.index);
let diff = state.index - next.baseline;
self.writer.write_bits(diff as u64, next.num_bits as usize);
state_1 = next;
}
{
let state = state_2;
let x = data[idx];
let next = self.table.next_state(x, state.index);
let diff = state.index - next.baseline;
self.writer.write_bits(diff as u64, next.num_bits as usize);
state_2 = next;
}
if idx < 2 {
break;
}
idx -= 2;
}
// Determine if we have an even or odd number of symbols to encode
// If odd, we need to encode one more state transition and write the final states in flipped order
if idx == 1 {
let state = state_1;
let x = data[0];
let next = self.table.next_state(x, state.index);
let diff = state.index - next.baseline;
self.writer.write_bits(diff as u64, next.num_bits as usize);
state_1 = next;
self.writer
.write_bits(state_2.index as u64, self.acc_log() as usize);
self.writer
.write_bits(state_1.index as u64, self.acc_log() as usize);
} else {
self.writer
.write_bits(state_1.index as u64, self.acc_log() as usize);
self.writer
.write_bits(state_2.index as u64, self.acc_log() as usize);
}
let bits_to_fill = self.writer.misaligned();
if bits_to_fill == 0 {
self.writer.write_bits(1u32, 8);
} else {
self.writer.write_bits(1u32, bits_to_fill);
}
}
fn write_table(&mut self) {
self.table.write_table(self.writer);
}
pub(super) fn acc_log(&self) -> u8 {
self.table.acc_log()
}
}
#[derive(Debug, Clone)]
pub struct FSETable {
/// Indexed by symbol
pub(super) states: [SymbolStates; 256],
/// Sum of all states.states.len()
pub(crate) table_size: usize,
}
impl FSETable {
pub(crate) fn next_state(&self, symbol: u8, idx: usize) -> &State {
let states = &self.states[symbol as usize];
states.get(idx, self.table_size)
}
pub(crate) fn start_state(&self, symbol: u8) -> &State {
let states = &self.states[symbol as usize];
&states.states[0]
}
pub fn acc_log(&self) -> u8 {
self.table_size.ilog2() as u8
}
pub fn write_table<V: AsMut<Vec<u8>>>(&self, writer: &mut BitWriter<V>) {
writer.write_bits(self.acc_log() - 5, 4);
let mut probability_counter = 0usize;
let probability_sum = 1 << self.acc_log();
let mut prob_idx = 0;
while probability_counter < probability_sum {
let max_remaining_value = probability_sum - probability_counter + 1;
let bits_to_write = max_remaining_value.ilog2() + 1;
let low_threshold = ((1 << bits_to_write) - 1) - (max_remaining_value);
let mask = (1 << (bits_to_write - 1)) - 1;
let prob = self.states[prob_idx].probability;
prob_idx += 1;
let value = (prob + 1) as u32;
if value < low_threshold as u32 {
writer.write_bits(value, bits_to_write as usize - 1);
} else if value > mask {
writer.write_bits(value + low_threshold as u32, bits_to_write as usize);
} else {
writer.write_bits(value, bits_to_write as usize);
}
if prob == -1 {
probability_counter += 1;
} else if prob > 0 {
probability_counter += prob as usize;
} else {
let mut zeros = 0u8;
while self.states[prob_idx].probability == 0 {
zeros += 1;
prob_idx += 1;
if zeros == 3 {
writer.write_bits(3u8, 2);
zeros = 0;
}
}
writer.write_bits(zeros, 2);
}
}
writer.write_bits(0u8, writer.misaligned());
}
}
#[derive(Debug, Clone)]
pub(super) struct SymbolStates {
/// Sorted by baseline to allow easy lookup using an index
pub(super) states: Vec<State>,
pub(super) probability: i32,
}
impl SymbolStates {
fn get(&self, idx: usize, max_idx: usize) -> &State {
let start_search_at = (idx * self.states.len()) / max_idx;
self.states[start_search_at..]
.iter()
.find(|state| state.contains(idx))
.unwrap()
}
}
#[derive(Debug, Clone)]
pub(crate) struct State {
/// How many bits are needed to encode an index within this state's range
pub(crate) num_bits: u8,
/// The first index targeted by this state
pub(crate) baseline: usize,
/// The last index targeted by this state (baseline plus the maximum value `num_bits` bits can hold)
pub(crate) last_index: usize,
/// Index of this state in the decoding table
pub(crate) index: usize,
}
impl State {
fn contains(&self, idx: usize) -> bool {
self.baseline <= idx && self.last_index >= idx
}
}
pub fn build_table_from_data(
data: impl Iterator<Item = u8>,
max_log: u8,
avoid_0_numbit: bool,
) -> FSETable {
let mut counts = [0; 256];
let mut max_symbol = 0;
for x in data {
counts[x as usize] += 1;
}
for (idx, count) in counts.iter().copied().enumerate() {
if count > 0 {
max_symbol = idx;
}
}
build_table_from_counts(&counts[..=max_symbol], max_log, avoid_0_numbit)
}
fn build_table_from_counts(counts: &[usize], max_log: u8, avoid_0_numbit: bool) -> FSETable {
let mut probs = [0; 256];
let probs = &mut probs[..counts.len()];
let mut min_count = 0;
for (idx, count) in counts.iter().copied().enumerate() {
probs[idx] = count as i32;
if count > 0 && (count < min_count || min_count == 0) {
min_count = count;
}
}
// shift all probabilities down so that the lowest are 1
min_count -= 1;
let mut max_prob = 0i32;
for prob in probs.iter_mut() {
if *prob > 0 {
*prob -= min_count as i32;
}
max_prob = max_prob.max(*prob);
}
if max_prob > 0 && max_prob as usize > probs.len() {
let divisor = max_prob / (probs.len() as i32);
for prob in probs.iter_mut() {
if *prob > 0 {
*prob = (*prob / divisor).max(1)
}
}
}
// normalize probabilities to a 2^x
let sum = probs.iter().sum::<i32>();
assert!(sum > 0);
let sum = sum as usize;
let acc_log = (sum.ilog2() as u8 + 1).max(5);
let acc_log = u8::min(acc_log, max_log);
if sum < 1 << acc_log {
// just raise the maximum probability as much as possible
// TODO is this optimal?
let diff = (1 << acc_log) - sum;
let max = probs.iter_mut().max().unwrap();
*max += diff as i32;
} else {
// decrease the smallest ones to 1 first
let mut diff = sum - (1 << acc_log);
while diff > 0 {
let min = probs.iter_mut().filter(|prob| **prob > 1).min().unwrap();
let decrease = usize::min(*min as usize - 1, diff);
diff -= decrease;
*min -= decrease as i32;
}
}
let max = probs.iter_mut().max().unwrap();
if avoid_0_numbit && *max > 1 << (acc_log - 1) {
let redistribute = *max - (1 << (acc_log - 1));
*max -= redistribute;
let max = *max;
// find first occurrence of the second_max to avoid lifting the last zero
let second_max = *probs.iter_mut().filter(|x| **x != max).max().unwrap();
let second_max = probs.iter_mut().find(|x| **x == second_max).unwrap();
*second_max += redistribute;
assert!(*second_max <= max);
}
build_table_from_probabilities(probs, acc_log)
}
pub(super) fn build_table_from_probabilities(probs: &[i32], acc_log: u8) -> FSETable {
let mut states = core::array::from_fn::<SymbolStates, 256, _>(|_| SymbolStates {
states: Vec::new(),
probability: 0,
});
// distribute -1 symbols
let mut negative_idx = (1 << acc_log) - 1;
for (symbol, _prob) in probs
.iter()
.copied()
.enumerate()
.filter(|prob| prob.1 == -1)
{
states[symbol].states.push(State {
num_bits: acc_log,
baseline: 0,
last_index: (1 << acc_log) - 1,
index: negative_idx,
});
states[symbol].probability = -1;
negative_idx -= 1;
}
// distribute other symbols
// Setup all needed states per symbol with their respective index
let mut idx = 0;
for (symbol, prob) in probs.iter().copied().enumerate() {
if prob <= 0 {
continue;
}
states[symbol].probability = prob;
let states = &mut states[symbol].states;
for _ in 0..prob {
states.push(State {
num_bits: 0,
baseline: 0,
last_index: 0,
index: idx,
});
idx = next_position(idx, 1 << acc_log);
while idx > negative_idx {
idx = next_position(idx, 1 << acc_log);
}
}
assert_eq!(states.len(), prob as usize);
}
// After all states know their index we can determine the numbits and baselines
for (symbol, prob) in probs.iter().copied().enumerate() {
if prob <= 0 {
continue;
}
let prob = prob as u32;
let state = &mut states[symbol];
// We process the states in their order in the table
state.states.sort_by(|l, r| l.index.cmp(&r.index));
let prob_log = if prob.is_power_of_two() {
prob.ilog2()
} else {
prob.ilog2() + 1
};
let rounded_up = 1u32 << prob_log;
// The lower states target double the amount of indexes -> numbits + 1
let double_states = rounded_up - prob;
let single_states = prob - double_states;
let num_bits = acc_log - prob_log as u8;
let mut baseline = (single_states as usize * (1 << (num_bits))) % (1 << acc_log);
for (idx, state) in state.states.iter_mut().enumerate() {
if (idx as u32) < double_states {
let num_bits = num_bits + 1;
state.baseline = baseline;
state.num_bits = num_bits;
state.last_index = baseline + ((1 << num_bits) - 1);
baseline += 1 << num_bits;
baseline %= 1 << acc_log;
} else {
state.baseline = baseline;
state.num_bits = num_bits;
state.last_index = baseline + ((1 << num_bits) - 1);
baseline += 1 << num_bits;
}
}
// For encoding we use the states ordered by the indexes they target
state.states.sort_by(|l, r| l.baseline.cmp(&r.baseline));
}
FSETable {
table_size: 1 << acc_log,
states,
}
}
/// Calculate the position of the next entry of the table given the current
/// position and size of the table.
fn next_position(mut p: usize, table_size: usize) -> usize {
p += (table_size >> 1) + (table_size >> 3) + 3;
p &= table_size - 1;
p
}
const ML_DIST: &[i32] = &[
1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1,
];
const LL_DIST: &[i32] = &[
4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
-1, -1, -1, -1,
];
const OF_DIST: &[i32] = &[
1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1,
];
pub(crate) fn default_ml_table() -> FSETable {
build_table_from_probabilities(ML_DIST, 6)
}
pub(crate) fn default_ll_table() -> FSETable {
build_table_from_probabilities(LL_DIST, 6)
}
pub(crate) fn default_of_table() -> FSETable {
build_table_from_probabilities(OF_DIST, 5)
}

139
vendor/ruzstd/src/fse/mod.rs vendored Normal file
View File

@@ -0,0 +1,139 @@
//! FSE, short for Finite State Entropy, is an encoding technique
//! that assigns shorter codes to symbols that appear more frequently in data,
//! and longer codes to less frequent symbols.
//!
//! FSE works by mutating a state and using that state to index into a table.
//!
//! Zstandard uses two different kinds of entropy encoding: FSE, and Huffman coding.
//! Huffman is used to compress literals,
//! while FSE is used for all other symbols (literal length code, match length code, offset code).
//!
//! <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#fse>
//!
//! <https://arxiv.org/pdf/1311.2540>
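//!
//! For example, with an `Accuracy_Log` of 6 the decoder keeps a 6-bit state that
//! indexes a 64-entry table; each decoded symbol then costs only the `num_bits`
//! stored in its entry, so symbols that own many table slots are emitted for
//! fewer than 6 bits apiece.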
mod fse_decoder;
pub use fse_decoder::*;
pub mod fse_encoder;
#[test]
fn tables_equal() {
let probs = &[0, 0, -1, 3, 2, 2, (1 << 6) - 8];
let mut dec_table = FSETable::new(255);
dec_table.build_from_probabilities(6, probs).unwrap();
let enc_table = fse_encoder::build_table_from_probabilities(probs, 6);
check_tables(&dec_table, &enc_table);
}
#[cfg(any(test, feature = "fuzz_exports"))]
fn check_tables(dec_table: &fse_decoder::FSETable, enc_table: &fse_encoder::FSETable) {
for (idx, dec_state) in dec_table.decode.iter().enumerate() {
let enc_states = &enc_table.states[dec_state.symbol as usize];
let enc_state = enc_states
.states
.iter()
.find(|state| state.index == idx)
.unwrap();
assert_eq!(enc_state.baseline, dec_state.base_line as usize);
assert_eq!(enc_state.num_bits, dec_state.num_bits);
}
}
#[test]
fn roundtrip() {
round_trip(&(0..64).collect::<alloc::vec::Vec<_>>());
let mut data = alloc::vec![];
data.extend(0..32);
data.extend(0..32);
data.extend(0..32);
data.extend(0..32);
data.extend(0..32);
data.extend(20..32);
data.extend(20..32);
data.extend(0..32);
data.extend(20..32);
data.extend(100..255);
data.extend(20..32);
data.extend(20..32);
round_trip(&data);
#[cfg(feature = "std")]
if std::fs::exists("fuzz/artifacts/fse").unwrap_or(false) {
for file in std::fs::read_dir("fuzz/artifacts/fse").unwrap() {
if file.as_ref().unwrap().file_type().unwrap().is_file() {
let data = std::fs::read(file.unwrap().path()).unwrap();
round_trip(&data);
}
}
}
}
/// Only needed for testing.
///
/// Encodes the data with a table built from that data
/// Decodes the result again by first decoding the table and then the data
/// Asserts that the decoded data equals the input
#[cfg(any(test, feature = "fuzz_exports"))]
pub fn round_trip(data: &[u8]) {
use crate::bit_io::{BitReaderReversed, BitWriter};
use fse_encoder::FSEEncoder;
if data.len() < 2 {
return;
}
if data.iter().all(|x| *x == data[0]) {
return;
}
if data.len() < 64 {
return;
}
let mut writer = BitWriter::new();
let mut encoder = FSEEncoder::new(
fse_encoder::build_table_from_data(data.iter().copied(), 22, false),
&mut writer,
);
let mut dec_table = FSETable::new(255);
encoder.encode(data);
let acc_log = encoder.acc_log();
let enc_table = encoder.into_table();
let encoded = writer.dump();
let table_bytes = dec_table.build_decoder(&encoded, acc_log).unwrap();
let encoded = &encoded[table_bytes..];
let mut decoder = FSEDecoder::new(&dec_table);
check_tables(&dec_table, &enc_table);
let mut br = BitReaderReversed::new(encoded);
let mut skipped_bits = 0;
loop {
let val = br.get_bits(1);
skipped_bits += 1;
if val == 1 || skipped_bits > 8 {
break;
}
}
if skipped_bits > 8 {
//if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
panic!("Corrupted end marker");
}
decoder.init_state(&mut br).unwrap();
let mut decoded = alloc::vec::Vec::new();
for x in data {
let w = decoder.decode_symbol();
assert_eq!(w, *x);
decoded.push(w);
if decoded.len() < data.len() {
decoder.update_state(&mut br);
}
}
assert_eq!(&decoded, data);
assert_eq!(br.bits_remaining(), 0);
}

401
vendor/ruzstd/src/huff0/huff0_decoder.rs vendored Normal file
View File

@@ -0,0 +1,401 @@
//! Utilities for decoding Huff0 encoded huffman data.
use crate::bit_io::BitReaderReversed;
use crate::decoding::errors::HuffmanTableError;
use crate::fse::{FSEDecoder, FSETable};
use alloc::vec::Vec;
/// The Zstandard specification limits the maximum length of a code to 11 bits.
pub(crate) const MAX_MAX_NUM_BITS: u8 = 11;
pub struct HuffmanDecoder<'table> {
table: &'table HuffmanTable,
/// State is used to index into the table.
pub state: u64,
}
impl<'t> HuffmanDecoder<'t> {
/// Create a new decoder with the provided table
pub fn new(table: &'t HuffmanTable) -> HuffmanDecoder<'t> {
HuffmanDecoder { table, state: 0 }
}
/// Decode the symbol the internal state (cursor) points at and return the
/// decoded literal.
pub fn decode_symbol(&mut self) -> u8 {
self.table.decode[self.state as usize].symbol
}
/// Initialize internal state and prepare to decode data. Then, `decode_symbol` can be called
/// to read the byte the internal cursor is pointing at, and `next_state` can be called to advance
/// the cursor until the max number of bits has been read.
pub fn init_state(&mut self, br: &mut BitReaderReversed<'_>) -> u8 {
let num_bits = self.table.max_num_bits;
let new_bits = br.get_bits(num_bits);
self.state = new_bits;
num_bits
}
/// Advance the internal cursor to the next symbol. After this, you can call `decode_symbol`
/// to read from the new position.
pub fn next_state(&mut self, br: &mut BitReaderReversed<'_>) -> u8 {
// self.state stores a small section, or a window of the bit stream. The table can be indexed via this state,
// telling you how many bits identify the current symbol.
let num_bits = self.table.decode[self.state as usize].num_bits;
// New bits are read from the stream
let new_bits = br.get_bits(num_bits);
// Shift and mask out the bits that identify the current symbol
self.state <<= num_bits;
self.state &= self.table.decode.len() as u64 - 1;
// The new bits are appended at the end of the current state.
self.state |= new_bits;
num_bits
}
}
/// A Huffman decoding table contains a list of Huffman prefix codes and their associated values
pub struct HuffmanTable {
decode: Vec<Entry>,
/// The weight of a symbol is the number of occurrences in a table.
/// This value is used in constructing a binary tree referred to as
/// a Huffman tree. Once this tree is constructed, it can be used to build the
/// lookup table.
weights: Vec<u8>,
/// The maximum size in bits a prefix code in the encoded data can be.
/// This value is used so that the decoder knows how many bits
/// to read from the bitstream before checking the table. This
/// value must be 11 or lower.
pub max_num_bits: u8,
bits: Vec<u8>,
bit_ranks: Vec<u32>,
rank_indexes: Vec<usize>,
/// In some cases, the list of weights is compressed using FSE compression.
fse_table: FSETable,
}
impl HuffmanTable {
/// Create a new, empty table.
pub fn new() -> HuffmanTable {
HuffmanTable {
decode: Vec::new(),
weights: Vec::with_capacity(256),
max_num_bits: 0,
bits: Vec::with_capacity(256),
bit_ranks: Vec::with_capacity(11),
rank_indexes: Vec::with_capacity(11),
fse_table: FSETable::new(255),
}
}
/// Completely empty the table then repopulate as a replica
/// of `other`.
pub fn reinit_from(&mut self, other: &Self) {
self.reset();
self.decode.extend_from_slice(&other.decode);
self.weights.extend_from_slice(&other.weights);
self.max_num_bits = other.max_num_bits;
self.bits.extend_from_slice(&other.bits);
self.rank_indexes.extend_from_slice(&other.rank_indexes);
self.fse_table.reinit_from(&other.fse_table);
}
/// Completely empty the table of all data.
pub fn reset(&mut self) {
self.decode.clear();
self.weights.clear();
self.max_num_bits = 0;
self.bits.clear();
self.bit_ranks.clear();
self.rank_indexes.clear();
self.fse_table.reset();
}
/// Read from `source` and decode the input, populating the huffman decoding table.
///
/// Returns the number of bytes read.
pub fn build_decoder(&mut self, source: &[u8]) -> Result<u32, HuffmanTableError> {
self.decode.clear();
let bytes_used = self.read_weights(source)?;
self.build_table_from_weights()?;
Ok(bytes_used)
}
/// Read weights from the provided source.
///
/// The huffman table is represented in the input data as a list of weights.
/// After the header, weights are read, then a Huffman decoding table
/// can be constructed using that list of weights.
///
/// Returns the number of bytes read.
fn read_weights(&mut self, source: &[u8]) -> Result<u32, HuffmanTableError> {
use HuffmanTableError as err;
if source.is_empty() {
return Err(err::SourceIsEmpty);
}
let header = source[0];
let mut bits_read = 8;
match header {
// If the header byte is less than 128, the series of weights
// is compressed using two interleaved FSE streams that share
// a distribution table.
0..=127 => {
let fse_stream = &source[1..];
if header as usize > fse_stream.len() {
return Err(err::NotEnoughBytesForWeights {
got_bytes: fse_stream.len(),
expected_bytes: header,
});
}
//fse decompress weights
let bytes_used_by_fse_header = self.fse_table.build_decoder(fse_stream, 6)?;
if bytes_used_by_fse_header > header as usize {
return Err(err::FSETableUsedTooManyBytes {
used: bytes_used_by_fse_header,
available_bytes: header,
});
}
vprintln!(
"Building fse table for huffman weights used: {}",
bytes_used_by_fse_header
);
// Huffman headers are compressed using two interleaved
// FSE bitstreams, where the first state (decoder) handles
// even symbols, and the second handles odd symbols.
let mut dec1 = FSEDecoder::new(&self.fse_table);
let mut dec2 = FSEDecoder::new(&self.fse_table);
let compressed_start = bytes_used_by_fse_header;
let compressed_length = header as usize - bytes_used_by_fse_header;
let compressed_weights = &fse_stream[compressed_start..];
if compressed_weights.len() < compressed_length {
return Err(err::NotEnoughBytesToDecompressWeights {
have: compressed_weights.len(),
need: compressed_length,
});
}
let compressed_weights = &compressed_weights[..compressed_length];
let mut br = BitReaderReversed::new(compressed_weights);
bits_read += (bytes_used_by_fse_header + compressed_length) * 8;
//skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found
let mut skipped_bits = 0;
loop {
let val = br.get_bits(1);
skipped_bits += 1;
if val == 1 || skipped_bits > 8 {
break;
}
}
if skipped_bits > 8 {
//if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
return Err(err::ExtraPadding { skipped_bits });
}
dec1.init_state(&mut br)?;
dec2.init_state(&mut br)?;
self.weights.clear();
// The two decoders take turns decoding a single symbol and updating their state.
loop {
let w = dec1.decode_symbol();
self.weights.push(w);
dec1.update_state(&mut br);
if br.bits_remaining() <= -1 {
//collect final states
self.weights.push(dec2.decode_symbol());
break;
}
let w = dec2.decode_symbol();
self.weights.push(w);
dec2.update_state(&mut br);
if br.bits_remaining() <= -1 {
//collect final states
self.weights.push(dec1.decode_symbol());
break;
}
//maximum number of weights is 255 because we use u8 symbols and the last weight is inferred from the sum of all others
if self.weights.len() > 255 {
return Err(err::TooManyWeights {
got: self.weights.len(),
});
}
}
}
// If the header byte is greater than or equal to 128,
// weights are directly represented, where each weight is
// encoded directly as a 4 bit field. The weights will
// always be encoded with full bytes, meaning if there's
// an odd number of weights, the last weight will still
// occupy a full byte.
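            // Example: a header byte of 130 encodes 130 - 127 = 3 weights,
            // which occupy ceil(3 / 2) = 2 bytes; the low nibble of the
            // last byte is padding.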
_ => {
// weights are directly encoded
let weights_raw = &source[1..];
let num_weights = header - 127;
self.weights.resize(num_weights as usize, 0);
let bytes_needed = if num_weights % 2 == 0 {
num_weights as usize / 2
} else {
(num_weights as usize / 2) + 1
};
if weights_raw.len() < bytes_needed {
return Err(err::NotEnoughBytesInSource {
got: weights_raw.len(),
need: bytes_needed,
});
}
for idx in 0..num_weights {
if idx % 2 == 0 {
self.weights[idx as usize] = weights_raw[idx as usize / 2] >> 4;
} else {
self.weights[idx as usize] = weights_raw[idx as usize / 2] & 0xF;
}
bits_read += 4;
}
}
}
let bytes_read = if bits_read % 8 == 0 {
bits_read / 8
} else {
(bits_read / 8) + 1
};
Ok(bytes_read as u32)
}
/// Once the weights have been read from the data, you can decode the weights
/// into a table, and use that table to decode the actual compressed data.
///
/// This function populates the rest of the table from the series of weights.
fn build_table_from_weights(&mut self) -> Result<(), HuffmanTableError> {
use HuffmanTableError as err;
self.bits.clear();
self.bits.resize(self.weights.len() + 1, 0);
let mut weight_sum: u32 = 0;
for w in &self.weights {
if *w > MAX_MAX_NUM_BITS {
return Err(err::WeightBiggerThanMaxNumBits { got: *w });
}
weight_sum += if *w > 0 { 1_u32 << (*w - 1) } else { 0 };
}
if weight_sum == 0 {
return Err(err::MissingWeights);
}
let max_bits = highest_bit_set(weight_sum) as u8;
let left_over = (1 << max_bits) - weight_sum;
//left_over must be power of two
if !left_over.is_power_of_two() {
return Err(err::LeftoverIsNotAPowerOf2 { got: left_over });
}
let last_weight = highest_bit_set(left_over) as u8;
for symbol in 0..self.weights.len() {
let bits = if self.weights[symbol] > 0 {
max_bits + 1 - self.weights[symbol]
} else {
0
};
self.bits[symbol] = bits;
}
self.bits[self.weights.len()] = max_bits + 1 - last_weight;
self.max_num_bits = max_bits;
if max_bits > MAX_MAX_NUM_BITS {
return Err(err::MaxBitsTooHigh { got: max_bits });
}
self.bit_ranks.clear();
self.bit_ranks.resize((max_bits + 1) as usize, 0);
for num_bits in &self.bits {
self.bit_ranks[(*num_bits) as usize] += 1;
}
//fill with dummy symbols
self.decode.resize(
1 << self.max_num_bits,
Entry {
symbol: 0,
num_bits: 0,
},
);
//starting codes for each rank
self.rank_indexes.clear();
self.rank_indexes.resize((max_bits + 1) as usize, 0);
self.rank_indexes[max_bits as usize] = 0;
for bits in (1..self.rank_indexes.len() as u8).rev() {
self.rank_indexes[bits as usize - 1] = self.rank_indexes[bits as usize]
+ self.bit_ranks[bits as usize] as usize * (1 << (max_bits - bits));
}
assert!(
self.rank_indexes[0] == self.decode.len(),
"rank_idx[0]: {} should be: {}",
self.rank_indexes[0],
self.decode.len()
);
for symbol in 0..self.bits.len() {
let bits_for_symbol = self.bits[symbol];
if bits_for_symbol != 0 {
// allocate code for the symbol and set in the table
// a code ignores all max_bits - bits[symbol] bits, so it gets
// a range that spans all of those in the decoding table
let base_idx = self.rank_indexes[bits_for_symbol as usize];
let len = 1 << (max_bits - bits_for_symbol);
self.rank_indexes[bits_for_symbol as usize] += len;
for idx in 0..len {
self.decode[base_idx + idx].symbol = symbol as u8;
self.decode[base_idx + idx].num_bits = bits_for_symbol;
}
}
}
Ok(())
}
}
impl Default for HuffmanTable {
fn default() -> Self {
Self::new()
}
}
/// A single entry in the table contains the decoded symbol/literal and the
/// size of the prefix code.
#[derive(Copy, Clone, Debug)]
pub struct Entry {
/// The byte that the prefix code replaces during encoding.
symbol: u8,
/// The number of bits the prefix code occupies.
num_bits: u8,
}
/// Asserts that the provided value is greater than zero, and returns
/// `32 - leading_zeros`, i.e. the 1-based position of the highest set bit
fn highest_bit_set(x: u32) -> u32 {
assert!(x > 0);
u32::BITS - x.leading_zeros()
}
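/// Illustrative sanity check: `highest_bit_set` returns the 1-based position
/// of the most significant set bit.
#[test]
fn highest_bit_set_examples() {
    assert_eq!(highest_bit_set(1), 1);
    assert_eq!(highest_bit_set(2), 2);
    assert_eq!(highest_bit_set(3), 2);
    assert_eq!(highest_bit_set(8), 4);
    assert_eq!(highest_bit_set(u32::MAX), 32);
}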

484
vendor/ruzstd/src/huff0/huff0_encoder.rs vendored Normal file
View File

@@ -0,0 +1,484 @@
use alloc::vec::Vec;
use core::cmp::Ordering;
use crate::{
bit_io::BitWriter,
fse::fse_encoder::{self, FSEEncoder},
};
pub(crate) struct HuffmanEncoder<'output, 'table, V: AsMut<Vec<u8>>> {
table: &'table HuffmanTable,
writer: &'output mut BitWriter<V>,
}
impl<V: AsMut<Vec<u8>>> HuffmanEncoder<'_, '_, V> {
pub fn new<'o, 't>(
table: &'t HuffmanTable,
writer: &'o mut BitWriter<V>,
) -> HuffmanEncoder<'o, 't, V> {
HuffmanEncoder { table, writer }
}
/// Encodes the data using the provided table
/// Writes
/// * Table description
/// * Encoded data
/// * Padding bits to fill up last byte
pub fn encode(&mut self, data: &[u8], with_table: bool) {
if with_table {
self.write_table();
}
Self::encode_stream(self.table, self.writer, data);
}
/// Encodes the data using the provided table in 4 concatenated streams
/// Writes
/// * Table description
/// * Jumptable
/// * Encoded data in 4 streams, each padded to fill the last byte
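    ///
    /// The jumptable consists of three little-endian `u16` values holding the
    /// byte sizes of the first three encoded streams; the fourth stream simply
    /// runs to the end of the data.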
pub fn encode4x(&mut self, data: &[u8], with_table: bool) {
assert!(data.len() >= 4);
// Split data in 4 equally sized parts (the last one might be a bit smaller than the rest)
let split_size = data.len().div_ceil(4);
let src1 = &data[..split_size];
let src2 = &data[split_size..split_size * 2];
let src3 = &data[split_size * 2..split_size * 3];
let src4 = &data[split_size * 3..];
// Write table description
if with_table {
self.write_table();
}
// Reserve space for the jump table, will be changed later
let size_idx = self.writer.index();
self.writer.write_bits(0u16, 16);
self.writer.write_bits(0u16, 16);
self.writer.write_bits(0u16, 16);
// Write the 4 streams, noting the sizes of the encoded streams
let index_before = self.writer.index();
Self::encode_stream(self.table, self.writer, src1);
let size1 = (self.writer.index() - index_before) / 8;
let index_before = self.writer.index();
Self::encode_stream(self.table, self.writer, src2);
let size2 = (self.writer.index() - index_before) / 8;
let index_before = self.writer.index();
Self::encode_stream(self.table, self.writer, src3);
let size3 = (self.writer.index() - index_before) / 8;
Self::encode_stream(self.table, self.writer, src4);
// Sanity check, if this doesn't hold we produce a broken stream
assert!(size1 <= u16::MAX as usize);
assert!(size2 <= u16::MAX as usize);
assert!(size3 <= u16::MAX as usize);
// Update the jumptable with the real sizes
self.writer.change_bits(size_idx, size1 as u16, 16);
self.writer.change_bits(size_idx + 16, size2 as u16, 16);
self.writer.change_bits(size_idx + 32, size3 as u16, 16);
}
/// Encode one stream and pad it to fill the last byte
fn encode_stream<VV: AsMut<Vec<u8>>>(
table: &HuffmanTable,
writer: &mut BitWriter<VV>,
data: &[u8],
) {
for symbol in data.iter().rev() {
let (code, num_bits) = table.codes[*symbol as usize];
debug_assert!(num_bits > 0);
writer.write_bits(code, num_bits as usize);
}
let bits_to_fill = writer.misaligned();
if bits_to_fill == 0 {
writer.write_bits(1u32, 8);
} else {
writer.write_bits(1u32, bits_to_fill);
}
}
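    // Illustrative example: codes with bit lengths [2, 2, 2, 3, 3] have
    // max = 3, so the derived weights are [2, 2, 2, 1, 1] (weight = max - nb + 1,
    // unused symbols stay 0), matching the `huffman` test below.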
pub(super) fn weights(&self) -> Vec<u8> {
let max = self.table.codes.iter().map(|(_, nb)| nb).max().unwrap();
let weights = self
.table
.codes
.iter()
.copied()
.map(|(_, nb)| if nb == 0 { 0 } else { max - nb + 1 })
.collect::<Vec<u8>>();
weights
}
fn write_table(&mut self) {
// TODO strategy for determining this?
let weights = self.weights();
let weights = &weights[..weights.len() - 1]; // don't encode the last weight, the decoder infers it from the others
if weights.len() > 16 {
let size_idx = self.writer.index();
self.writer.write_bits(0u8, 8);
let idx_before = self.writer.index();
let mut encoder = FSEEncoder::new(
fse_encoder::build_table_from_data(weights.iter().copied(), 6, true),
self.writer,
);
encoder.encode_interleaved(weights);
let encoded_len = (self.writer.index() - idx_before) / 8;
assert!(encoded_len < 128);
self.writer.change_bits(size_idx, encoded_len as u8, 8);
} else {
self.writer.write_bits(weights.len() as u8 + 127, 8);
let pairs = weights.chunks_exact(2);
let remainder = pairs.remainder();
for pair in pairs.into_iter() {
let weight1 = pair[0];
let weight2 = pair[1];
assert!(weight1 < 16);
assert!(weight2 < 16);
self.writer.write_bits(weight2, 4);
self.writer.write_bits(weight1, 4);
}
if !remainder.is_empty() {
let weight = remainder[0];
assert!(weight < 16);
self.writer.write_bits(weight << 4, 8);
}
}
}
}
pub struct HuffmanTable {
/// The index is the symbol; each value holds the code in the lower bits of the u32 and the number of bits in the u8
codes: Vec<(u32, u8)>,
}
impl HuffmanTable {
pub fn build_from_data(data: &[u8]) -> Self {
let mut counts = [0; 256];
let mut max = 0;
for x in data {
counts[*x as usize] += 1;
max = max.max(*x);
}
Self::build_from_counts(&counts[..=max as usize])
}
pub fn build_from_counts(counts: &[usize]) -> Self {
assert!(counts.len() <= 256);
let zeros = counts.iter().filter(|x| **x == 0).count();
let mut weights = distribute_weights(counts.len() - zeros);
let limit = weights.len().ilog2() as usize + 2;
redistribute_weights(&mut weights, limit);
weights.reverse();
let mut counts_sorted = counts.iter().enumerate().collect::<Vec<_>>();
counts_sorted.sort_by(|(_, c1), (_, c2)| c1.cmp(c2));
let mut weights_distributed = alloc::vec![0; counts.len()];
for (idx, count) in counts_sorted {
if *count == 0 {
weights_distributed[idx] = 0;
} else {
weights_distributed[idx] = weights.pop().unwrap();
}
}
Self::build_from_weights(&weights_distributed)
}
pub fn build_from_weights(weights: &[usize]) -> Self {
let mut sorted = Vec::with_capacity(weights.len());
struct SortEntry {
symbol: u8,
weight: usize,
}
// TODO this doesn't need to be a temporary Vec, it could be done in a [_; 264]
// only non-zero weights are interesting here
for (symbol, weight) in weights.iter().copied().enumerate() {
if weight > 0 {
sorted.push(SortEntry {
symbol: symbol as u8,
weight,
});
}
}
// We process symbols ordered by weight and then ordered by symbol
sorted.sort_by(|left, right| match left.weight.cmp(&right.weight) {
Ordering::Equal => left.symbol.cmp(&right.symbol),
other => other,
});
// Prepare huffman table with placeholders
let mut table = HuffmanTable {
codes: Vec::with_capacity(weights.len()),
};
for _ in 0..weights.len() {
table.codes.push((0, 0));
}
// Determine the number of bits needed for codes with the lowest weight
let weight_sum = sorted.iter().map(|e| 1 << (e.weight - 1)).sum::<usize>();
if !weight_sum.is_power_of_two() {
panic!("This is an internal error");
}
let max_num_bits = highest_bit_set(weight_sum) - 1; // this is a log_2 of a clean power of two
// Starting at the symbols with the lowest weight we update the placeholders in the table
let mut current_code = 0;
let mut current_weight = 0;
let mut current_num_bits = 0;
for entry in sorted.iter() {
// If the entry isn't the same weight as the last one we need to change a few things
if current_weight != entry.weight {
// The code shifts by the difference of the weights to allow for enough unique values
current_code >>= entry.weight - current_weight;
// Encoding a symbol of this weight will take less bits than the previous weight
current_num_bits = max_num_bits - entry.weight + 1;
// Run the next update when the weight changes again
current_weight = entry.weight;
}
table.codes[entry.symbol as usize] = (current_code as u32, current_num_bits as u8);
current_code += 1;
}
table
}
pub fn can_encode(&self, other: &Self) -> Option<usize> {
if other.codes.len() > self.codes.len() {
return None;
}
let mut sum = 0;
for ((_, other_num_bits), (_, self_num_bits)) in other.codes.iter().zip(self.codes.iter()) {
if *other_num_bits != 0 && *self_num_bits == 0 {
return None;
}
sum += other_num_bits.abs_diff(*self_num_bits) as usize;
}
Some(sum)
}
}
/// Asserts that the provided value is greater than zero, and returns the 1-based index of the highest set bit
fn highest_bit_set(x: usize) -> usize {
assert!(x > 0);
usize::BITS as usize - x.leading_zeros() as usize
}
#[test]
fn huffman() {
let table = HuffmanTable::build_from_weights(&[2, 2, 2, 1, 1]);
assert_eq!(table.codes[0], (1, 2));
assert_eq!(table.codes[1], (2, 2));
assert_eq!(table.codes[2], (3, 2));
assert_eq!(table.codes[3], (0, 3));
assert_eq!(table.codes[4], (1, 3));
let table = HuffmanTable::build_from_weights(&[4, 3, 2, 0, 1, 1]);
assert_eq!(table.codes[0], (1, 1));
assert_eq!(table.codes[1], (1, 2));
assert_eq!(table.codes[2], (1, 3));
assert_eq!(table.codes[3], (0, 0));
assert_eq!(table.codes[4], (0, 4));
assert_eq!(table.codes[5], (1, 4));
}
/// Distributes weights such that the sum of their power-of-two values (2^weight) adds up to a clean power of two
fn distribute_weights(amount: usize) -> Vec<usize> {
assert!(amount >= 2);
assert!(amount <= 256);
let mut weights = Vec::new();
// This is the trivial power of two we always need
weights.push(1);
weights.push(1);
// This is the weight we are adding right now
let mut target_weight = 1;
// Counts how many times we have added weights
let mut weight_counter = 2;
// We always add a power-of-two number of new weights, chosen so that the sum
// they contribute equals the sum of the weights already in the vec.
// This doubles the total sum in the vec -> results in a new power of two
//
// Example: [1, 1] -> [1,1,2] (2^1 + 2^1 == 2^2)
//
// Example: [1, 1] -> [1,1,1,1] (2^1 + 2^1 == 2^1 + 2^1)
// [1,1,1,1] -> [1,1,1,1,3] (2^1 + 2^1 + 2^1 + 2^1 == 2^3)
while weights.len() < amount {
let mut add_new = 1 << (weight_counter - target_weight);
let available_space = amount - weights.len();
// If the amount of new weights needed to get to the next power of two would exceed `amount`,
// we instead add a single weight of a bigger value and start the cycle again
if add_new > available_space {
// TODO we could maybe instead do this until add_new <= available_space?
// target_weight += 1
// add_new /= 2
target_weight = weight_counter;
add_new = 1;
}
for _ in 0..add_new {
weights.push(target_weight);
}
weight_counter += 1;
}
assert_eq!(amount, weights.len());
weights
}
/// Sometimes distribute_weights generates weights that require too many bits to encode.
/// This redistributes the weights to have less variance by raising the lower weights while still maintaining the
/// required attributes of the weight distribution
fn redistribute_weights(weights: &mut [usize], max_num_bits: usize) {
let weight_sum_log = weights
.iter()
.copied()
.map(|x| 1 << x)
.sum::<usize>()
.ilog2() as usize;
// Nothing needs to be done, this is already fine
if weight_sum_log < max_num_bits {
return;
}
// We need to decrease the weight difference by the difference between weight_sum_log and max_num_bits
let decrease_weights_by = weight_sum_log - max_num_bits + 1;
// To do that we raise the lower weights up by that difference, recording how much weight we added in the process
let mut added_weights = 0;
for weight in weights.iter_mut() {
if *weight < decrease_weights_by {
for add in *weight..decrease_weights_by {
added_weights += 1 << add;
}
*weight = decrease_weights_by;
}
}
// Then we reduce weights until the added weights are equaled out
while added_weights > 0 {
// Find the highest weight that is still lower or equal to the added weight
let mut current_idx = 0;
let mut current_weight = 0;
for (idx, weight) in weights.iter().copied().enumerate() {
if 1 << (weight - 1) > added_weights {
break;
}
if weight > current_weight {
current_weight = weight;
current_idx = idx;
}
}
// Reduce that weight by 1
added_weights -= 1 << (current_weight - 1);
weights[current_idx] -= 1;
}
// At the end we normalize the weights so that they start at 1 again
if weights[0] > 1 {
let offset = weights[0] - 1;
for weight in weights.iter_mut() {
*weight -= offset;
}
}
}
#[test]
fn weights() {
// assert_eq!(distribute_weights(5).as_slice(), &[1, 1, 2, 3, 4]);
for amount in 2..=256 {
let mut weights = distribute_weights(amount);
assert_eq!(weights.len(), amount);
let sum = weights
.iter()
.copied()
.map(|weight| 1 << weight)
.sum::<usize>();
assert!(sum.is_power_of_two());
for num_bit_limit in (amount.ilog2() as usize + 1)..=11 {
redistribute_weights(&mut weights, num_bit_limit);
let sum = weights
.iter()
.copied()
.map(|weight| 1 << weight)
.sum::<usize>();
assert!(sum.is_power_of_two());
assert!(
sum.ilog2() <= 11,
"Max bits too big: sum: {} {weights:?}",
sum
);
let codes = HuffmanTable::build_from_weights(&weights).codes;
for (code, num_bits) in codes.iter().copied() {
for (code2, num_bits2) in codes.iter().copied() {
if num_bits == 0 || num_bits2 == 0 || (code, num_bits) == (code2, num_bits2) {
continue;
}
if num_bits <= num_bits2 {
let code2_shifted = code2 >> (num_bits2 - num_bits);
assert_ne!(
code, code2_shifted,
"{:b},{num_bits:} is prefix of {:b},{num_bits2:}",
code, code2
);
}
}
}
}
}
}
#[test]
fn counts() {
let counts = &[3, 0, 4, 1, 5];
let table = HuffmanTable::build_from_counts(counts).codes;
assert_eq!(table[1].1, 0);
assert!(table[3].1 >= table[0].1);
assert!(table[0].1 >= table[2].1);
assert!(table[2].1 >= table[4].1);
let counts = &[3, 0, 4, 0, 7, 2, 2, 2, 0, 2, 2, 1, 5];
let table = HuffmanTable::build_from_counts(counts).codes;
assert_eq!(table[1].1, 0);
assert_eq!(table[3].1, 0);
assert_eq!(table[8].1, 0);
assert!(table[11].1 >= table[5].1);
assert!(table[5].1 >= table[6].1);
assert!(table[6].1 >= table[7].1);
assert!(table[7].1 >= table[9].1);
assert!(table[9].1 >= table[10].1);
assert!(table[10].1 >= table[0].1);
assert!(table[0].1 >= table[2].1);
assert!(table[2].1 >= table[12].1);
assert!(table[12].1 >= table[4].1);
}
#[test]
fn from_data() {
let counts = &[3, 0, 4, 1, 5];
let table = HuffmanTable::build_from_counts(counts).codes;
let data = &[0, 2, 4, 4, 0, 3, 2, 2, 0, 2];
let table2 = HuffmanTable::build_from_data(data).codes;
assert_eq!(table, table2);
}

84
vendor/ruzstd/src/huff0/mod.rs vendored Normal file
View File

@@ -0,0 +1,84 @@
//! Huffman coding is a method of encoding where symbols are assigned a code,
//! and more commonly used symbols get shorter codes, and less commonly
//! used symbols get longer codes. Codes are prefix free, meaning no code
//! is a prefix of any other code.
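//!
//! For example (illustrative only), the assignment `a -> 0`, `b -> 10`,
//! `c -> 11` is prefix free, so a bit stream like `10 11 0 0` decodes
//! unambiguously to `b c a a`.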
mod huff0_decoder;
pub use huff0_decoder::*;
pub mod huff0_encoder;
/// Only needed for testing.
///
/// Encodes the data with a table built from that data
/// Decodes the result again by first decoding the table and then the data
/// Asserts that the decoded data equals the input
#[cfg(any(test, feature = "fuzz_exports"))]
pub fn round_trip(data: &[u8]) {
use crate::bit_io::{BitReaderReversed, BitWriter};
use alloc::vec::Vec;
if data.len() < 2 {
return;
}
if data.iter().all(|x| *x == data[0]) {
return;
}
let mut writer = BitWriter::new();
let encoder_table = huff0_encoder::HuffmanTable::build_from_data(data);
let mut encoder = huff0_encoder::HuffmanEncoder::new(&encoder_table, &mut writer);
encoder.encode(data, true);
let encoded = writer.dump();
let mut decoder_table = HuffmanTable::new();
let table_bytes = decoder_table.build_decoder(&encoded).unwrap();
let mut decoder = HuffmanDecoder::new(&decoder_table);
let mut br = BitReaderReversed::new(&encoded[table_bytes as usize..]);
let mut skipped_bits = 0;
loop {
let val = br.get_bits(1);
skipped_bits += 1;
if val == 1 || skipped_bits > 8 {
break;
}
}
if skipped_bits > 8 {
//if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
panic!("Corrupted end marker");
}
decoder.init_state(&mut br);
let mut decoded = Vec::new();
while br.bits_remaining() > -(decoder_table.max_num_bits as isize) {
decoded.push(decoder.decode_symbol());
decoder.next_state(&mut br);
}
assert_eq!(&decoded, data);
}
#[test]
fn roundtrip() {
use alloc::vec::Vec;
round_trip(&[1, 1, 1, 1, 2, 3]);
round_trip(&[1, 1, 1, 1, 2, 3, 5, 45, 12, 90]);
for size in 2..512 {
use alloc::vec;
let data = vec![123; size];
round_trip(&data);
let mut data = Vec::new();
for x in 0..size {
data.push(x as u8);
}
round_trip(&data);
}
#[cfg(feature = "std")]
if std::fs::exists("fuzz/artifacts/huff0").unwrap_or(false) {
for file in std::fs::read_dir("fuzz/artifacts/huff0").unwrap() {
if file.as_ref().unwrap().file_type().unwrap().is_file() {
let data = std::fs::read(file.unwrap().path()).unwrap();
round_trip(&data);
}
}
}
}

260
vendor/ruzstd/src/io_nostd.rs vendored Normal file
View File

@@ -0,0 +1,260 @@
//! Manual implementations of io traits and types needed for `#![no_std]` builds
use alloc::boxed::Box;
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)]
pub enum ErrorKind {
Interrupted,
UnexpectedEof,
WouldBlock,
Other,
WriteAllEof,
}
impl ErrorKind {
fn as_str(&self) -> &'static str {
use ErrorKind::*;
match *self {
Interrupted => "operation interrupted",
UnexpectedEof => "unexpected end of file",
WouldBlock => "operation would block",
Other => "other error",
WriteAllEof => "write_all hit EOF",
}
}
}
impl core::fmt::Display for ErrorKind {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
f.write_str(self.as_str())
}
}
pub struct Error {
kind: ErrorKind,
err: Option<Box<dyn core::fmt::Display + Send + Sync + 'static>>,
}
impl alloc::fmt::Debug for Error {
fn fmt(&self, f: &mut alloc::fmt::Formatter<'_>) -> Result<(), alloc::fmt::Error> {
let mut s = f.debug_struct("Error");
s.field("kind", &self.kind);
if let Some(err) = self.err.as_ref() {
s.field("err", &alloc::format!("{err}"));
}
s.finish()
}
}
impl Error {
pub fn new(kind: ErrorKind, err: Box<dyn core::fmt::Display + Send + Sync + 'static>) -> Self {
Self {
kind,
err: Some(err),
}
}
pub fn from(kind: ErrorKind) -> Self {
Self { kind, err: None }
}
pub fn kind(&self) -> ErrorKind {
self.kind
}
pub fn is_interrupted(&self) -> bool {
matches!(self.kind, ErrorKind::Interrupted)
}
pub fn get_ref(&self) -> Option<&(dyn core::fmt::Display + Send + Sync)> {
self.err.as_ref().map(|e| e.as_ref())
}
pub fn into_inner(self) -> Option<Box<dyn core::fmt::Display + Send + Sync + 'static>> {
self.err
}
}
impl core::fmt::Display for Error {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
f.write_str(self.kind.as_str())?;
if let Some(ref e) = self.err {
e.fmt(f)?;
}
Ok(())
}
}
impl From<ErrorKind> for Error {
fn from(value: ErrorKind) -> Self {
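        // Resolves to the inherent `Error::from` constructor above (inherent
        // associated functions take precedence), so this is not a recursive call.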
Self::from(value)
}
}
pub trait Read {
fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error>;
fn read_exact(&mut self, mut buf: &mut [u8]) -> Result<(), Error> {
while !buf.is_empty() {
match self.read(buf) {
Ok(0) => break,
Ok(n) => {
let tmp = buf;
buf = &mut tmp[n..];
}
Err(ref e) if e.kind() == ErrorKind::Interrupted => {}
Err(e) => return Err(e),
}
}
if !buf.is_empty() {
Err(Error::from(ErrorKind::UnexpectedEof))
} else {
Ok(())
}
}
fn read_to_end(&mut self, output: &mut alloc::vec::Vec<u8>) -> Result<(), Error> {
let mut buf = [0u8; 1024 * 16];
loop {
let bytes = self.read(&mut buf)?;
if bytes == 0 {
break;
}
output.extend_from_slice(&buf[..bytes]);
}
Ok(())
}
fn take(self, limit: u64) -> Take<Self>
where
Self: Sized,
{
Take { inner: self, limit }
}
}
impl Read for &[u8] {
fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
let size = core::cmp::min(self.len(), buf.len());
let (to_copy, rest) = self.split_at(size);
if size == 1 {
buf[0] = to_copy[0];
} else {
buf[..size].copy_from_slice(to_copy);
}
*self = rest;
Ok(size)
}
}
impl<T> Read for &mut T
where
T: Read,
{
fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
(*self).read(buf)
}
}
pub struct Take<R: Read> {
inner: R,
limit: u64,
}
impl<R: Read> Take<R> {
pub fn limit(&self) -> u64 {
self.limit
}
pub fn set_limit(&mut self, limit: u64) {
self.limit = limit;
}
pub fn get_ref(&self) -> &R {
&self.inner
}
pub fn get_mut(&mut self) -> &mut R {
&mut self.inner
}
pub fn into_inner(self) -> R {
self.inner
}
}
impl<R: Read> Read for Take<R> {
fn read(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
if self.limit == 0 {
return Ok(0);
}
let at_most = (self.limit as usize).min(buf.len());
let bytes = self.inner.read(&mut buf[..at_most])?;
self.limit -= bytes as u64;
Ok(bytes)
}
}
pub trait Write {
fn write(&mut self, buf: &[u8]) -> Result<usize, Error>;
fn flush(&mut self) -> Result<(), Error>;
fn write_all(&mut self, mut buf: &[u8]) -> Result<(), Error> {
while !buf.is_empty() {
match self.write(buf) {
Ok(0) => {
return Err(Error::from(ErrorKind::WriteAllEof));
}
Ok(n) => buf = &buf[n..],
Err(ref e) if e.is_interrupted() => {}
Err(e) => return Err(e),
}
}
Ok(())
}
}
impl<T> Write for &mut T
where
T: Write,
{
fn write(&mut self, buf: &[u8]) -> Result<usize, Error> {
(*self).write(buf)
}
fn flush(&mut self) -> Result<(), Error> {
(*self).flush()
}
}
impl Write for &mut [u8] {
#[inline]
fn write(&mut self, data: &[u8]) -> Result<usize, Error> {
let amt = core::cmp::min(data.len(), self.len());
let (a, b) = core::mem::take(self).split_at_mut(amt);
a.copy_from_slice(&data[..amt]);
*self = b;
Ok(amt)
}
fn flush(&mut self) -> Result<(), Error> {
Ok(())
}
}
impl Write for alloc::vec::Vec<u8> {
#[inline]
fn write(&mut self, data: &[u8]) -> Result<usize, Error> {
self.extend_from_slice(data);
Ok(data.len())
}
fn flush(&mut self) -> Result<(), Error> {
Ok(())
}
}
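// Illustrative check of the no_std `Read`/`Take` reimplementations on a byte
// slice, mirroring the behavior of their `std::io` counterparts.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn take_limits_reads() {
        let data: &[u8] = &[1, 2, 3, 4, 5];
        let mut taken = data.take(3);
        let mut buf = [0u8; 8];
        let n = taken.read(&mut buf).unwrap();
        assert_eq!(n, 3);
        assert_eq!(&buf[..n], &[1, 2, 3]);
        // The limit is exhausted, so further reads return 0 bytes.
        assert_eq!(taken.read(&mut buf).unwrap(), 0);
    }
}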

3
vendor/ruzstd/src/io_std.rs vendored Normal file
View File

@@ -0,0 +1,3 @@
//! Re-exports of std traits or local reimplementations if std is not available
#[cfg(feature = "std")]
pub use std::io::{Error, ErrorKind, Read, Write};

64
vendor/ruzstd/src/lib.rs vendored Normal file
View File

@@ -0,0 +1,64 @@
//! A pure Rust implementation of the [Zstandard compression format](https://www.rfc-editor.org/rfc/rfc8878.pdf).
//!
//! ## Decompression
//! The [decoding] module contains the code for decompression.
//! Decompression can be achieved by using the [`decoding::StreamingDecoder`]
//! or the more low-level [`decoding::FrameDecoder`]
//!
//! ## Compression
//! The [encoding] module contains the code for compression.
//! Compression can be achieved by using the [`encoding::compress`]/[`encoding::compress_to_vec`]
//! functions or the [`encoding::FrameCompressor`]
//!
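//! ## Example
//! A minimal compress/decompress round trip, sketched with the same calls the
//! crate's own tests use:
//! ```no_run
//! use ruzstd::encoding::{CompressionLevel, FrameCompressor};
//! use ruzstd::decoding::StreamingDecoder;
//! use std::io::Read;
//!
//! let input = b"some bytes worth compressing".to_vec();
//! let mut compressed = Vec::new();
//! let mut compressor = FrameCompressor::new(CompressionLevel::Fastest);
//! compressor.set_source(input.as_slice());
//! compressor.set_drain(&mut compressed);
//! compressor.compress();
//!
//! let mut decoder = StreamingDecoder::new(compressed.as_slice()).unwrap();
//! let mut decompressed = Vec::new();
//! decoder.read_to_end(&mut decompressed).unwrap();
//! assert_eq!(input, decompressed);
//! ```
//!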
#![doc = include_str!("../Readme.md")]
#![no_std]
#![deny(trivial_casts, trivial_numeric_casts, rust_2018_idioms)]
#[cfg(feature = "std")]
extern crate std;
#[cfg(not(feature = "rustc-dep-of-std"))]
extern crate alloc;
#[cfg(feature = "std")]
pub(crate) const VERBOSE: bool = false;
macro_rules! vprintln {
($($x:expr),*) => {
#[cfg(feature = "std")]
if crate::VERBOSE {
std::println!($($x),*);
}
}
}
mod bit_io;
mod common;
pub mod decoding;
pub mod encoding;
pub(crate) mod blocks;
#[cfg(feature = "fuzz_exports")]
pub mod fse;
#[cfg(feature = "fuzz_exports")]
pub mod huff0;
#[cfg(not(feature = "fuzz_exports"))]
pub(crate) mod fse;
#[cfg(not(feature = "fuzz_exports"))]
pub(crate) mod huff0;
mod tests;
#[cfg(feature = "std")]
pub mod io_std;
#[cfg(feature = "std")]
pub use io_std as io;
#[cfg(not(feature = "std"))]
pub mod io_nostd;
#[cfg(not(feature = "std"))]
pub use io_nostd as io;

79
vendor/ruzstd/src/tests/bit_reader.rs vendored Normal file
View File

@@ -0,0 +1,79 @@
#[test]
fn test_bitreader_reversed() {
use crate::bit_io::BitReaderReversed;
let encoded: [u8; 16] = [
0xC1, 0x41, 0x08, 0x00, 0x00, 0xEC, 0xC8, 0x96, 0x42, 0x79, 0xD4, 0xBC, 0xF7, 0x2C, 0xD5,
0x48,
];
//just the u128 in encoded
let num_rev: u128 = 0x48_D5_2C_F7_BC_D4_79_42_96_C8_EC_00_00_08_41_C1;
let mut br = BitReaderReversed::new(&encoded[..]);
let mut accumulator = 0;
let mut bits_read = 0;
let mut x = 0;
loop {
x += 3;
//semi random access pattern
let mut num_bits = x % 16;
if bits_read > 128 - num_bits {
num_bits = 128 - bits_read;
}
let bits = br.get_bits(num_bits);
bits_read += num_bits;
accumulator |= u128::from(bits) << (128 - bits_read);
if bits_read >= 128 {
break;
}
}
if accumulator != num_rev {
panic!(
"Bitreader failed somewhere. Accumulated bits: {:?}, Should be: {:?}",
accumulator, num_rev
);
}
}
#[test]
fn test_bitreader_normal() {
use crate::bit_io::BitReader;
let encoded: [u8; 16] = [
0xC1, 0x41, 0x08, 0x00, 0x00, 0xEC, 0xC8, 0x96, 0x42, 0x79, 0xD4, 0xBC, 0xF7, 0x2C, 0xD5,
0x48,
];
//just the u128 in encoded
let num: u128 = 0x48_D5_2C_F7_BC_D4_79_42_96_C8_EC_00_00_08_41_C1;
let mut br = BitReader::new(&encoded[..]);
let mut accumulator = 0;
let mut bits_read = 0;
let mut x = 0;
loop {
x += 3;
//semi random access pattern
let mut num_bits = x % 16;
if bits_read > 128 - num_bits {
num_bits = 128 - bits_read;
}
let bits = br.get_bits(num_bits).unwrap();
accumulator |= u128::from(bits) << bits_read;
bits_read += num_bits;
if bits_read >= 128 {
break;
}
}
if accumulator != num {
panic!(
"Bitreader failed somewhere. Accumulated bits: {:?}, Should be: {:?}",
accumulator, num
);
}
}

194
vendor/ruzstd/src/tests/decode_corpus.rs vendored Normal file
View File

@@ -0,0 +1,194 @@
#[test]
fn test_decode_corpus_files() {
extern crate std;
use crate::decoding::BlockDecodingStrategy;
use crate::decoding::FrameDecoder;
use alloc::borrow::ToOwned;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use std::fs;
use std::io::Read;
use std::println;
let mut success_counter = 0;
let mut fail_counter_diff = 0;
let mut fail_counter_size = 0;
let mut fail_counter_bytes_read = 0;
#[cfg_attr(not(feature = "hash"), allow(unused_mut))]
let mut fail_counter_chksum = 0;
let mut total_counter = 0;
let mut failed: Vec<String> = Vec::new();
let mut speeds = Vec::new();
let mut speeds_read = Vec::new();
let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect();
if fs::read_dir("./local_corpus_files").is_ok() {
files.extend(fs::read_dir("./local_corpus_files").unwrap());
}
files.sort_by_key(|x| match x {
Err(_) => "".to_owned(),
Ok(entry) => entry.path().to_str().unwrap().to_owned(),
});
let mut frame_dec = FrameDecoder::new();
for file in files {
let f = file.unwrap();
let metadata = f.metadata().unwrap();
let file_size = metadata.len();
let p = String::from(f.path().to_str().unwrap());
if !p.ends_with(".zst") {
continue;
}
println!("Trying file: {}", p);
let mut content = fs::File::open(f.path()).unwrap();
frame_dec.reset(&mut content).unwrap();
let start_time = std::time::Instant::now();
/////DECODING
frame_dec
.decode_blocks(&mut content, BlockDecodingStrategy::All)
.unwrap();
let result = frame_dec.collect().unwrap();
let end_time = start_time.elapsed();
match frame_dec.get_checksum_from_data() {
Some(chksum) => {
#[cfg(feature = "hash")]
if frame_dec.get_calculated_checksum().unwrap() != chksum {
println!(
"Checksum did not match! From data: {}, calculated while decoding: {}\n",
chksum,
frame_dec.get_calculated_checksum().unwrap()
);
fail_counter_chksum += 1;
failed.push(p.clone().to_string());
} else {
println!("Checksums are ok!\n");
}
#[cfg(not(feature = "hash"))]
println!(
"Checksum feature not enabled, skipping. From data: {}\n",
chksum
);
}
None => println!("No checksums to test\n"),
}
let mut original_p = p.clone();
original_p.truncate(original_p.len() - 4);
let original_f = fs::File::open(original_p).unwrap();
let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
println!("Results for file: {}", p.clone());
let mut success = true;
if original.len() != result.len() {
println!(
"Result has wrong length: {}, should be: {}",
result.len(),
original.len()
);
success = false;
fail_counter_size += 1;
}
if frame_dec.bytes_read_from_source() != file_size {
println!(
"Framedecoder counted wrong amount of bytes: {}, should be: {}",
frame_dec.bytes_read_from_source(),
file_size
);
success = false;
fail_counter_bytes_read += 1;
}
let mut counter = 0;
let min = if original.len() < result.len() {
original.len()
} else {
result.len()
};
for idx in 0..min {
if original[idx] != result[idx] {
counter += 1;
//println!(
// "Original {} not equal to result {} at byte: {}",
// original[idx], result[idx], idx,
//);
}
}
if counter > 0 {
println!("Result differs in at least {} bytes from original", counter);
success = false;
fail_counter_diff += 1;
}
if success {
success_counter += 1;
} else {
failed.push(p.clone().to_string());
}
total_counter += 1;
let dur = end_time.as_micros() as usize;
let speed = result.len() / if dur == 0 { 1 } else { dur };
let speed_read = file_size as usize / if dur == 0 { 1 } else { dur };
println!("SPEED: {}", speed);
println!("SPEED_read: {}", speed_read);
speeds.push(speed);
speeds_read.push(speed_read);
}
println!("###################");
println!("Summary:");
println!("###################");
println!(
"Total: {}, Success: {}, WrongSize: {}, WrongBytecount: {}, WrongChecksum: {}, Diffs: {}",
total_counter,
success_counter,
fail_counter_size,
fail_counter_bytes_read,
fail_counter_chksum,
fail_counter_diff
);
println!("Failed files: ");
for f in &failed {
println!("{}", f);
}
let speed_len = speeds.len();
let sum_speed: usize = speeds.into_iter().sum();
let avg_speed = sum_speed / speed_len;
let avg_speed_bps = avg_speed * 1_000_000;
if avg_speed_bps < 1000 {
println!("Average speed: {} B/s", avg_speed_bps);
} else if avg_speed_bps < 1_000_000 {
println!("Average speed: {} KB/s", avg_speed_bps / 1000);
} else {
println!("Average speed: {} MB/s", avg_speed_bps / 1_000_000);
}
let speed_read_len = speeds_read.len();
let sum_speed_read: usize = speeds_read.into_iter().sum();
let avg_speed_read = sum_speed_read / speed_read_len;
let avg_speed_read_bps = avg_speed_read * 1_000_000;
if avg_speed_read_bps < 1000 {
println!("Average speed reading: {} B/s", avg_speed_read_bps);
} else if avg_speed_read_bps < 1_000_000 {
println!("Average speed reading: {} KB/s", avg_speed_read_bps / 1000);
} else {
println!(
"Average speed reading: {} MB/s",
avg_speed_read_bps / 1_000_000
);
}
assert!(failed.is_empty());
}

266
vendor/ruzstd/src/tests/dict_test.rs vendored Normal file
View File

@@ -0,0 +1,266 @@
#[test]
fn test_dict_parsing() {
use crate::decoding::dictionary::Dictionary;
use alloc::vec;
let mut raw = vec![0u8; 8];
// correct magic num
raw[0] = 0x37;
raw[1] = 0xA4;
raw[2] = 0x30;
raw[3] = 0xEC;
//dict-id
let dict_id = 0x47232101;
raw[4] = 0x01;
raw[5] = 0x21;
raw[6] = 0x23;
raw[7] = 0x47;
// tables copied from ./dict_tests/dictionary
let raw_tables = &[
54, 16, 192, 155, 4, 0, 207, 59, 239, 121, 158, 116, 220, 93, 114, 229, 110, 41, 249, 95,
165, 255, 83, 202, 254, 68, 74, 159, 63, 161, 100, 151, 137, 21, 184, 183, 189, 100, 235,
209, 251, 174, 91, 75, 91, 185, 19, 39, 75, 146, 98, 177, 249, 14, 4, 35, 0, 0, 0, 40, 40,
20, 10, 12, 204, 37, 196, 1, 173, 122, 0, 4, 0, 128, 1, 2, 2, 25, 32, 27, 27, 22, 24, 26,
18, 12, 12, 15, 16, 11, 69, 37, 225, 48, 20, 12, 6, 2, 161, 80, 40, 20, 44, 137, 145, 204,
46, 0, 0, 0, 0, 0, 116, 253, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
raw.extend(&raw_tables[..]);
//offset history 3,10,0x00ABCDEF
raw.extend(vec![3, 0, 0, 0]);
raw.extend(vec![10, 0, 0, 0]);
raw.extend(vec![0xEF, 0xCD, 0xAB, 0]);
//just some random bytes
let raw_content = vec![
1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 123, 3, 234, 23, 234, 34, 23, 234, 34, 34, 234, 234,
];
raw.extend(&raw_content);
let dict = Dictionary::decode_dict(&raw).unwrap();
if dict.id != dict_id {
panic!(
"Dict-id did not get parsed correctly. Is: {}, Should be: {}",
dict.id, dict_id
);
}
if !dict.dict_content.eq(&raw_content) {
panic!(
"dict content did not get parsed correctly. Is: {:?}, Should be: {:?}",
dict.dict_content, raw_content
);
}
if !dict.offset_hist.eq(&[3, 10, 0x00ABCDEF]) {
panic!(
"offset history did not get parsed correctly. Is: {:?}, Should be: {:?}",
dict.offset_hist,
[3, 10, 0x00ABCDEF]
);
}
// test magic num checking
raw[0] = 1;
raw[1] = 1;
raw[2] = 1;
raw[3] = 1;
match Dictionary::decode_dict(&raw) {
Ok(_) => panic!("The dict got decoded but the magic num was incorrect!"),
Err(_) => { /* This is what should happen*/ }
}
}
#[test]
fn test_dict_decoding() {
extern crate std;
use crate::decoding::BlockDecodingStrategy;
use crate::decoding::FrameDecoder;
use alloc::borrow::ToOwned;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use std::fs;
use std::io::Read;
use std::println;
let mut success_counter = 0;
let mut fail_counter_diff = 0;
let mut fail_counter_size = 0;
let mut fail_counter_bytes_read = 0;
let mut total_counter = 0;
let mut failed: Vec<String> = Vec::new();
let mut speeds = Vec::new();
let mut speeds_read = Vec::new();
let mut files: Vec<_> = fs::read_dir("./dict_tests/files").unwrap().collect();
let dict = fs::File::open("./dict_tests/dictionary").unwrap();
let dict: Vec<u8> = dict.bytes().map(|x| x.unwrap()).collect();
files.sort_by_key(|x| match x {
Err(_) => "".to_owned(),
Ok(entry) => entry.path().to_str().unwrap().to_owned(),
});
let mut frame_dec = FrameDecoder::new();
let dict = crate::decoding::dictionary::Dictionary::decode_dict(&dict).unwrap();
frame_dec.add_dict(dict).unwrap();
for file in files {
let f = file.unwrap();
let metadata = f.metadata().unwrap();
let file_size = metadata.len();
let p = String::from(f.path().to_str().unwrap());
if !p.ends_with(".zst") {
continue;
}
println!("Trying file: {}", p);
let mut content = fs::File::open(f.path()).unwrap();
frame_dec.reset(&mut content).unwrap();
let start_time = std::time::Instant::now();
/////DECODING
frame_dec
.decode_blocks(&mut content, BlockDecodingStrategy::All)
.unwrap();
let result = frame_dec.collect().unwrap();
let end_time = start_time.elapsed();
match frame_dec.get_checksum_from_data() {
Some(chksum) => {
#[cfg(feature = "hash")]
if frame_dec.get_calculated_checksum().unwrap() != chksum {
println!(
"Checksum did not match! From data: {}, calculated while decoding: {}\n",
chksum,
frame_dec.get_calculated_checksum().unwrap()
);
} else {
println!("Checksums are ok!\n");
}
#[cfg(not(feature = "hash"))]
println!(
"Checksum feature not enabled, skipping. From data: {}\n",
chksum
);
}
None => println!("No checksums to test\n"),
}
let mut original_p = p.clone();
original_p.truncate(original_p.len() - 4);
let original_f = fs::File::open(original_p).unwrap();
let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
println!("Results for file: {}", p.clone());
let mut success = true;
if original.len() != result.len() {
println!(
"Result has wrong length: {}, should be: {}",
result.len(),
original.len()
);
success = false;
fail_counter_size += 1;
}
if frame_dec.bytes_read_from_source() != file_size {
println!(
"Framedecoder counted wrong amount of bytes: {}, should be: {}",
frame_dec.bytes_read_from_source(),
file_size
);
success = false;
fail_counter_bytes_read += 1;
}
let mut counter = 0;
let min = if original.len() < result.len() {
original.len()
} else {
result.len()
};
for idx in 0..min {
if original[idx] != result[idx] {
counter += 1;
//println!(
// "Original {} not equal to result {} at byte: {}",
// original[idx], result[idx], idx,
//);
}
}
if counter > 0 {
println!("Result differs in at least {} bytes from original", counter);
success = false;
fail_counter_diff += 1;
}
if success {
success_counter += 1;
} else {
failed.push(p.clone().to_string());
}
total_counter += 1;
let dur = end_time.as_micros() as usize;
let speed = result.len() / if dur == 0 { 1 } else { dur };
let speed_read = file_size as usize / if dur == 0 { 1 } else { dur };
println!("SPEED: {}", speed);
println!("SPEED_read: {}", speed_read);
speeds.push(speed);
speeds_read.push(speed_read);
}
println!("###################");
println!("Summary:");
println!("###################");
println!(
"Total: {}, Success: {}, WrongSize: {}, WrongBytecount: {}, Diffs: {}",
total_counter,
success_counter,
fail_counter_size,
fail_counter_bytes_read,
fail_counter_diff
);
println!("Failed files: ");
for f in &failed {
println!("{}", f);
}
let speed_len = speeds.len();
let sum_speed: usize = speeds.into_iter().sum();
let avg_speed = sum_speed / speed_len;
let avg_speed_bps = avg_speed * 1_000_000;
if avg_speed_bps < 1000 {
println!("Average speed: {} B/s", avg_speed_bps);
} else if avg_speed_bps < 1_000_000 {
println!("Average speed: {} KB/s", avg_speed_bps / 1000);
} else {
println!("Average speed: {} MB/s", avg_speed_bps / 1_000_000);
}
let speed_read_len = speeds_read.len();
let sum_speed_read: usize = speeds_read.into_iter().sum();
let avg_speed_read = sum_speed_read / speed_read_len;
let avg_speed_read_bps = avg_speed_read * 1_000_000;
if avg_speed_read_bps < 1000 {
println!("Average speed reading: {} B/s", avg_speed_read_bps);
} else if avg_speed_read_bps < 1_000_000 {
println!("Average speed reading: {} KB/s", avg_speed_read_bps / 1000);
} else {
println!(
"Average speed reading: {} MB/s",
avg_speed_read_bps / 1_000_000
);
}
assert!(failed.is_empty());
}

233
vendor/ruzstd/src/tests/encode_corpus.rs vendored Normal file
View File

@@ -0,0 +1,233 @@
#[test]
fn test_encode_corpus_files_uncompressed_our_decompressor() {
extern crate std;
use crate::encoding::FrameCompressor;
use alloc::borrow::ToOwned;
use alloc::vec::Vec;
use std::ffi::OsStr;
use std::fs;
use std::io::Read;
use std::path::PathBuf;
use std::println;
let mut failures: Vec<PathBuf> = Vec::new();
let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect();
if fs::read_dir("./local_corpus_files").is_ok() {
files.extend(fs::read_dir("./local_corpus_files").unwrap());
}
files.sort_by_key(|x| match x {
Err(_) => "".to_owned(),
Ok(entry) => entry.path().to_str().unwrap().to_owned(),
});
for entry in files.iter().map(|f| f.as_ref().unwrap()) {
let path = entry.path();
if path.extension() == Some(OsStr::new("zst")) {
continue;
}
println!("Trying file: {:?}", path);
let input = fs::read(entry.path()).unwrap();
let mut compressed_file: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(crate::encoding::CompressionLevel::Fastest);
compressor.set_source(input.as_slice());
compressor.set_drain(&mut compressed_file);
compressor.compress();
let mut decompressed_output = Vec::new();
let mut decoder =
crate::decoding::StreamingDecoder::new(compressed_file.as_slice()).unwrap();
decoder.read_to_end(&mut decompressed_output).unwrap();
if input != decompressed_output {
failures.push(path);
}
}
if !failures.is_empty() {
panic!(
"Decompression of compressed file failed on the following files: {:?}",
failures
);
}
}
#[test]
fn test_encode_corpus_files_uncompressed_original_decompressor() {
extern crate std;
use crate::encoding::FrameCompressor;
use alloc::borrow::ToOwned;
use alloc::format;
use alloc::vec::Vec;
use std::ffi::OsStr;
use std::fs;
use std::path::PathBuf;
use std::println;
use std::string::String;
let mut failures: Vec<(PathBuf, String)> = Vec::new();
let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect();
if fs::read_dir("./local_corpus_files").is_ok() {
files.extend(fs::read_dir("./local_corpus_files").unwrap());
}
files.sort_by_key(|x| match x {
Err(_) => "".to_owned(),
Ok(entry) => entry.path().to_str().unwrap().to_owned(),
});
for entry in files.iter().map(|f| f.as_ref().unwrap()) {
let path = entry.path();
if path.extension() == Some(OsStr::new("zst")) {
continue;
}
println!("Trying file: {:?}", path);
let input = fs::read(entry.path()).unwrap();
let mut compressed_file: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(crate::encoding::CompressionLevel::Fastest);
compressor.set_source(input.as_slice());
compressor.set_drain(&mut compressed_file);
compressor.compress();
let mut decompressed_output = Vec::new();
// zstd::stream::copy_decode(compressed_file.as_slice(), &mut decompressed_output).unwrap();
match zstd::stream::copy_decode(compressed_file.as_slice(), &mut decompressed_output) {
Ok(()) => {
if input != decompressed_output {
failures.push((path.to_owned(), "Input didn't equal output".to_owned()));
}
}
Err(e) => {
failures.push((
path.to_owned(),
format!("Decompressor threw an error: {e:?}"),
));
}
};
if !failures.is_empty() {
panic!(
"Decompression of the compressed file fails on the following files: {:?}",
failures
);
}
}
}
#[test]
fn test_encode_corpus_files_compressed_our_decompressor() {
extern crate std;
use crate::encoding::FrameCompressor;
use alloc::borrow::ToOwned;
use alloc::vec::Vec;
use std::ffi::OsStr;
use std::fs;
use std::io::Read;
use std::path::PathBuf;
use std::println;
let mut failures: Vec<PathBuf> = Vec::new();
let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect();
if fs::read_dir("./local_corpus_files").is_ok() {
files.extend(fs::read_dir("./local_corpus_files").unwrap());
}
files.sort_by_key(|x| match x {
Err(_) => "".to_owned(),
Ok(entry) => entry.path().to_str().unwrap().to_owned(),
});
for entry in files.iter().map(|f| f.as_ref().unwrap()) {
let path = entry.path();
if path.extension() == Some(OsStr::new("zst")) {
continue;
}
println!("Trying file: {:?}", path);
let input = fs::read(entry.path()).unwrap();
let mut compressed_file: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(crate::encoding::CompressionLevel::Fastest);
compressor.set_source(input.as_slice());
compressor.set_drain(&mut compressed_file);
compressor.compress();
let mut decompressed_output = Vec::new();
let mut decoder =
crate::decoding::StreamingDecoder::new(compressed_file.as_slice()).unwrap();
decoder.read_to_end(&mut decompressed_output).unwrap();
if input != decompressed_output {
failures.push(path);
}
}
if !failures.is_empty() {
panic!(
"Decompression of compressed file failed on the following files: {:?}",
failures
);
}
}
#[test]
fn test_encode_corpus_files_compressed_original_decompressor() {
extern crate std;
use crate::encoding::FrameCompressor;
use alloc::borrow::ToOwned;
use alloc::format;
use alloc::vec::Vec;
use std::ffi::OsStr;
use std::fs;
use std::path::PathBuf;
use std::println;
use std::string::String;
let mut failures: Vec<(PathBuf, String)> = Vec::new();
let mut files: Vec<_> = fs::read_dir("./decodecorpus_files").unwrap().collect();
if fs::read_dir("./local_corpus_files").is_ok() {
files.extend(fs::read_dir("./local_corpus_files").unwrap());
}
files.sort_by_key(|x| match x {
Err(_) => "".to_owned(),
Ok(entry) => entry.path().to_str().unwrap().to_owned(),
});
for entry in files.iter().map(|f| f.as_ref().unwrap()) {
let path = entry.path();
if path.extension() == Some(OsStr::new("zst")) {
continue;
}
println!("Trying file: {:?}", path);
let input = fs::read(entry.path()).unwrap();
let mut compressed_file: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(crate::encoding::CompressionLevel::Fastest);
compressor.set_source(input.as_slice());
compressor.set_drain(&mut compressed_file);
compressor.compress();
let mut decompressed_output = Vec::new();
// zstd::stream::copy_decode(compressed_file.as_slice(), &mut decompressed_output).unwrap();
match zstd::stream::copy_decode(compressed_file.as_slice(), &mut decompressed_output) {
Ok(()) => {
if input != decompressed_output {
failures.push((path.to_owned(), "Input didn't equal output".to_owned()));
}
}
Err(e) => {
failures.push((
path.to_owned(),
format!("Decompressor threw an error: {e:?}"),
));
}
};
if !failures.is_empty() {
panic!(
"Decompression of the compressed file fails on the following files: {:?}",
failures
);
}
}
}

View File

@@ -0,0 +1,27 @@
#[test]
fn test_all_artifacts() {
extern crate std;
use crate::decoding::BlockDecodingStrategy;
use crate::decoding::FrameDecoder;
use std::borrow::ToOwned;
use std::fs;
use std::fs::File;
let mut frame_dec = FrameDecoder::new();
for file in fs::read_dir("./fuzz/artifacts/decode").unwrap() {
let file_name = file.unwrap().path();
let fnstr = file_name.to_str().unwrap().to_owned();
if !fnstr.contains("/crash-") {
continue;
}
let mut f = File::open(file_name.clone()).unwrap();
/* Ignore errors here; the decoder just must never panic on invalid input. */
let _: Result<_, _> = frame_dec
.reset(&mut f)
.and_then(|()| frame_dec.decode_blocks(&mut f, BlockDecodingStrategy::All));
}
}

578
vendor/ruzstd/src/tests/mod.rs vendored Normal file
View File

@@ -0,0 +1,578 @@
#[cfg(test)]
use alloc::vec;
#[cfg(test)]
use alloc::vec::Vec;
#[cfg(test)]
extern crate std;
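// In no_std test builds, std::fs::File does not implement the crate's own
// Read trait, so bridge it here by forwarding to std::io::Read and mapping
// the error type.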
#[cfg(all(test, not(feature = "std")))]
impl crate::io_nostd::Read for std::fs::File {
fn read(&mut self, buf: &mut [u8]) -> Result<usize, crate::io_nostd::Error> {
std::io::Read::read(self, buf).map_err(|e| {
if e.get_ref().is_none() {
crate::io_nostd::Error::from(crate::io_nostd::ErrorKind::Other)
} else {
crate::io_nostd::Error::new(
crate::io_nostd::ErrorKind::Other,
alloc::boxed::Box::new(e.into_inner().unwrap()),
)
}
})
}
}
#[cfg(all(test, feature = "std"))]
#[allow(dead_code)]
fn assure_error_impl() {
// Not a real test, just here to cause a compiler error if std::error::Error is not implemented correctly
use crate::decoding::errors::FrameDecoderError;
let _err: &dyn std::error::Error = &FrameDecoderError::NotYetInitialized;
}
#[cfg(all(test, feature = "std"))]
#[allow(dead_code)]
fn assure_decoder_send_sync() {
// Not a real test, just here to cause a compiler error if FrameDecoder is not Send (std::thread::spawn requires it)
use crate::decoding::FrameDecoder;
let decoder = FrameDecoder::new();
std::thread::spawn(move || {
drop(decoder);
});
}
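// A skippable frame is a magic number in 0x184D2A50..=0x184D2A5F followed by
// a 4-byte little-endian length; the header parser must report it as
// SkipFrame instead of trying to decode it.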
#[test]
fn skippable_frame() {
use crate::decoding::errors;
use crate::decoding::frame;
let mut content = vec![];
content.extend_from_slice(&0x184D2A50u32.to_le_bytes());
content.extend_from_slice(&300u32.to_le_bytes());
assert_eq!(8, content.len());
let err = frame::read_frame_header(content.as_slice());
assert!(matches!(
err,
Err(errors::ReadFrameHeaderError::SkipFrame {
magic_number: 0x184D2A50u32,
length: 300
})
));
content.clear();
content.extend_from_slice(&0x184D2A5Fu32.to_le_bytes());
content.extend_from_slice(&0xFFFFFFFFu32.to_le_bytes());
assert_eq!(8, content.len());
let err = frame::read_frame_header(content.as_slice());
assert!(matches!(
err,
Err(errors::ReadFrameHeaderError::SkipFrame {
magic_number: 0x184D2A5Fu32,
length: 0xFFFFFFFF
})
));
}
#[cfg(test)]
#[test]
fn test_frame_header_reading() {
use crate::decoding::frame;
use std::fs;
let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
let (_frame, _) = frame::read_frame_header(&mut content).unwrap();
}
#[test]
fn test_block_header_reading() {
use crate::decoding;
use crate::decoding::frame;
use std::fs;
let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
let (_frame, _) = frame::read_frame_header(&mut content).unwrap();
let mut block_dec = decoding::block_decoder::new();
let block_header = block_dec.read_block_header(&mut content).unwrap();
let _ = block_header; //TODO: validate the block header in a smart way
}
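// Smoke test: a whole frame must decode in one go with
// BlockDecodingStrategy::All.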
#[test]
fn test_frame_decoder() {
use crate::decoding::BlockDecodingStrategy;
use crate::decoding::FrameDecoder;
use std::fs;
let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
struct NullWriter(());
impl std::io::Write for NullWriter {
fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
Ok(buf.len())
}
fn flush(&mut self) -> Result<(), std::io::Error> {
Ok(())
}
}
let mut _null_target = NullWriter(());
let mut frame_dec = FrameDecoder::new();
frame_dec.reset(&mut content).unwrap();
frame_dec
.decode_blocks(&mut content, BlockDecodingStrategy::All)
.unwrap();
}
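// Drives decode_from_to() with deliberately split input: each call returns
// (bytes_read, bytes_written), and the trailing 4-byte checksum must be
// consumed on its own without producing any output.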
#[test]
fn test_decode_from_to() {
use crate::decoding::FrameDecoder;
use std::fs::File;
use std::io::Read;
let f = File::open("./decodecorpus_files/z000088.zst").unwrap();
let mut frame_dec = FrameDecoder::new();
let content: Vec<u8> = f.bytes().map(|x| x.unwrap()).collect();
let mut target = vec![0u8; 1024 * 1024];
// first part
let source1 = &content[..50 * 1024];
let (read1, written1) = frame_dec
.decode_from_to(source1, target.as_mut_slice())
.unwrap();
//second part, explicitly without the checksum
let source2 = &content[read1..content.len() - 4];
let (read2, written2) = frame_dec
.decode_from_to(source2, &mut target[written1..])
.unwrap();
//must have decoded everything up to the checksum
assert!(read1 + read2 == content.len() - 4);
//feed the checksum separately to test that this is handled correctly
let chksum_source = &content[read1 + read2..];
let (read3, written3) = frame_dec
.decode_from_to(chksum_source, &mut target[written1 + written2..])
.unwrap();
//this must result in exactly these values because only the checksum was processed
assert!(read3 == 4);
assert!(written3 == 0);
let read = read1 + read2 + read3;
let written = written1 + written2;
let result = &target.as_slice()[..written];
if read != content.len() {
panic!(
"Byte counter: {} was wrong. Should be: {}",
read,
content.len()
);
}
match frame_dec.get_checksum_from_data() {
Some(chksum) => {
#[cfg(feature = "hash")]
if frame_dec.get_calculated_checksum().unwrap() != chksum {
std::println!(
"Checksum did not match! From data: {}, calculated while decoding: {}\n",
chksum,
frame_dec.get_calculated_checksum().unwrap()
);
} else {
std::println!("Checksums are ok!\n");
}
#[cfg(not(feature = "hash"))]
std::println!(
"Checksum feature not enabled, skipping. From data: {}\n",
chksum
);
}
None => std::println!("No checksums to test\n"),
}
let original_f = File::open("./decodecorpus_files/z000088").unwrap();
let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
if original.len() != result.len() {
panic!(
"Result has wrong length: {}, should be: {}",
result.len(),
original.len()
);
}
let mut counter = 0;
let min = if original.len() < result.len() {
original.len()
} else {
result.len()
};
for idx in 0..min {
if original[idx] != result[idx] {
counter += 1;
}
}
if counter > 0 {
panic!("Result differs in at least {} bytes from original", counter);
}
}
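// Debugging aid for a single file: differences from the original are only
// printed, not treated as a test failure.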
#[test]
fn test_specific_file() {
use crate::decoding::BlockDecodingStrategy;
use crate::decoding::FrameDecoder;
use std::fs;
use std::io::Read;
let path = "./decodecorpus_files/z000068.zst";
let mut content = fs::File::open(path).unwrap();
struct NullWriter(());
impl std::io::Write for NullWriter {
fn write(&mut self, buf: &[u8]) -> Result<usize, std::io::Error> {
Ok(buf.len())
}
fn flush(&mut self) -> Result<(), std::io::Error> {
Ok(())
}
}
let mut _null_target = NullWriter(());
let mut frame_dec = FrameDecoder::new();
frame_dec.reset(&mut content).unwrap();
frame_dec
.decode_blocks(&mut content, BlockDecodingStrategy::All)
.unwrap();
let result = frame_dec.collect().unwrap();
let original_f = fs::File::open("./decodecorpus_files/z000068").unwrap();
let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
std::println!("Results for file: {}", path);
if original.len() != result.len() {
std::println!(
"Result has wrong length: {}, should be: {}",
result.len(),
original.len()
);
}
let mut counter = 0;
let min = if original.len() < result.len() {
original.len()
} else {
result.len()
};
for idx in 0..min {
if original[idx] != result[idx] {
counter += 1;
}
}
if counter > 0 {
std::println!("Result differs in at least {} bytes from original", counter);
}
}
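// Decode through the io::Read-based StreamingDecoder, then reuse the inner
// FrameDecoder for a second file via new_with_decoder().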
#[test]
#[cfg(feature = "std")]
fn test_streaming() {
use std::fs;
use std::io::Read;
let mut content = fs::File::open("./decodecorpus_files/z000088.zst").unwrap();
let mut stream = crate::decoding::StreamingDecoder::new(&mut content).unwrap();
let mut result = Vec::new();
Read::read_to_end(&mut stream, &mut result).unwrap();
let original_f = fs::File::open("./decodecorpus_files/z000088").unwrap();
let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
if original.len() != result.len() {
panic!(
"Result has wrong length: {}, should be: {}",
result.len(),
original.len()
);
}
let mut counter = 0;
let min = if original.len() < result.len() {
original.len()
} else {
result.len()
};
for idx in 0..min {
if original[idx] != result[idx] {
counter += 1;
}
}
if counter > 0 {
panic!("Result differs in at least {} bytes from original", counter);
}
// Test resetting to a new file while keeping the old decoder
let mut content = fs::File::open("./decodecorpus_files/z000068.zst").unwrap();
let mut stream = crate::decoding::StreamingDecoder::new_with_decoder(
&mut content,
stream.into_frame_decoder(),
)
.unwrap();
let mut result = Vec::new();
Read::read_to_end(&mut stream, &mut result).unwrap();
let original_f = fs::File::open("./decodecorpus_files/z000068").unwrap();
let original: Vec<u8> = original_f.bytes().map(|x| x.unwrap()).collect();
std::println!("Results for file:");
if original.len() != result.len() {
panic!(
"Result has wrong length: {}, should be: {}",
result.len(),
original.len()
);
}
let mut counter = 0;
let min = if original.len() < result.len() {
original.len()
} else {
result.len()
};
for idx in 0..min {
if original[idx] != result[idx] {
counter += 1;
}
}
if counter > 0 {
panic!("Result differs in at least {} bytes from original", counter);
}
}
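// The compressed abc.txt.zst decodes to "abcdef": the first three bytes come
// out of decode_from_to(), the remaining three are drained afterwards with
// collect_to_writer().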
#[test]
fn test_incremental_read() {
use crate::decoding::FrameDecoder;
let mut unread_compressed_content =
include_bytes!("../../decodecorpus_files/abc.txt.zst").as_slice();
let mut frame_dec = FrameDecoder::new();
frame_dec.reset(&mut unread_compressed_content).unwrap();
let mut output = [0u8; 3];
let (_, written) = frame_dec
.decode_from_to(unread_compressed_content, &mut output)
.unwrap();
assert_eq!(written, 3);
assert_eq!(output.map(char::from), ['a', 'b', 'c']);
assert!(frame_dec.is_finished());
let written = frame_dec.collect_to_writer(&mut &mut output[..]).unwrap();
assert_eq!(written, 3);
assert_eq!(output.map(char::from), ['d', 'e', 'f']);
}
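// no_std variant of test_streaming: same files, but read through the crate's
// own Read trait from include_bytes! buffers.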
#[test]
#[cfg(not(feature = "std"))]
fn test_streaming_no_std() {
use crate::io::Read;
let content = include_bytes!("../../decodecorpus_files/z000088.zst");
let mut content = content.as_slice();
let mut stream = crate::decoding::StreamingDecoder::new(&mut content).unwrap();
let original = include_bytes!("../../decodecorpus_files/z000088");
let mut result = vec![0; original.len()];
Read::read_exact(&mut stream, &mut result).unwrap();
if original.len() != result.len() {
panic!(
"Result has wrong length: {}, should be: {}",
result.len(),
original.len()
);
}
let mut counter = 0;
let min = if original.len() < result.len() {
original.len()
} else {
result.len()
};
for idx in 0..min {
if original[idx] != result[idx] {
counter += 1;
}
}
if counter > 0 {
panic!("Result differs in at least {} bytes from original", counter);
}
// Test resetting to a new file while keeping the old decoder
let content = include_bytes!("../../decodecorpus_files/z000068.zst");
let mut content = content.as_slice();
let mut stream = crate::decoding::StreamingDecoder::new_with_decoder(
&mut content,
stream.into_frame_decoder(),
)
.unwrap();
let original = include_bytes!("../../decodecorpus_files/z000068");
let mut result = vec![0; original.len()];
Read::read_exact(&mut stream, &mut result).unwrap();
std::println!("Results for file:");
if original.len() != result.len() {
panic!(
"Result has wrong length: {}, should be: {}",
result.len(),
original.len()
);
}
let mut counter = 0;
let min = if original.len() < result.len() {
original.len()
} else {
result.len()
};
for idx in 0..min {
if original[idx] != result[idx] {
counter += 1;
}
}
if counter > 0 {
panic!("Result differs in at least {} bytes from original", counter);
}
}
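// decode_all() must handle several concatenated frames, skip skippable
// frames, and report TargetTooSmall or truncation errors; the same applies
// to decode_all_to_vec() with a preallocated capacity.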
#[test]
fn test_decode_all() {
use crate::decoding::errors::FrameDecoderError;
use crate::decoding::FrameDecoder;
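// Helper: append a skippable frame (magic number, little-endian length,
// then `length` bytes of zero payload) to the input buffer.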
let skip_frame = |input: &mut Vec<u8>, length: usize| {
input.extend_from_slice(&0x184D2A50u32.to_le_bytes());
input.extend_from_slice(&(length as u32).to_le_bytes());
input.resize(input.len() + length, 0);
};
let mut original = Vec::new();
let mut input = Vec::new();
skip_frame(&mut input, 300);
input.extend_from_slice(include_bytes!("../../decodecorpus_files/z000089.zst"));
original.extend_from_slice(include_bytes!("../../decodecorpus_files/z000089"));
skip_frame(&mut input, 400);
input.extend_from_slice(include_bytes!("../../decodecorpus_files/z000090.zst"));
original.extend_from_slice(include_bytes!("../../decodecorpus_files/z000090"));
skip_frame(&mut input, 500);
let mut decoder = FrameDecoder::new();
// decode_all with correct buffers.
let mut output = vec![0; original.len()];
let result = decoder.decode_all(&input, &mut output).unwrap();
assert_eq!(result, original.len());
assert_eq!(output, original);
// decode_all with smaller output length.
let mut output = vec![0; original.len() - 1];
let result = decoder.decode_all(&input, &mut output);
assert!(
matches!(result, Err(FrameDecoderError::TargetTooSmall)),
"{:?}",
result
);
// decode_all with larger output length.
let mut output = vec![0; original.len() + 1];
let result = decoder.decode_all(&input, &mut output).unwrap();
assert_eq!(result, original.len());
assert_eq!(&output[..result], original);
// decode_all with truncated regular frame.
let mut output = vec![0; original.len()];
let result = decoder.decode_all(&input[..input.len() - 600], &mut output);
assert!(
matches!(result, Err(FrameDecoderError::FailedToReadBlockBody(_))),
"{:?}",
result
);
// decode_all with truncated skip frame.
let mut output = vec![0; original.len()];
let result = decoder.decode_all(&input[..input.len() - 1], &mut output);
assert!(
matches!(result, Err(FrameDecoderError::FailedToSkipFrame)),
"{:?}",
result
);
// decode_all_to_vec with correct output capacity.
let mut output = Vec::new();
output.reserve_exact(original.len());
decoder.decode_all_to_vec(&input, &mut output).unwrap();
assert_eq!(output, original);
// decode_all_to_vec with smaller output capacity.
let mut output = Vec::new();
output.reserve_exact(original.len() - 1);
let result = decoder.decode_all_to_vec(&input, &mut output);
assert!(
matches!(result, Err(FrameDecoderError::TargetTooSmall)),
"{:?}",
result
);
// decode_all_to_vec with larger output capacity.
let mut output = Vec::new();
output.reserve_exact(original.len() + 1);
decoder.decode_all_to_vec(&input, &mut output).unwrap();
assert_eq!(output, original);
}
pub mod bit_reader;
pub mod decode_corpus;
pub mod dict_test;
#[cfg(feature = "std")]
pub mod encode_corpus;
pub mod fuzz_regressions;