Vendor dependencies for 0.3.0 release

2025-09-27 10:29:08 -05:00
parent 0c8d39d483
commit 82ab7f317b
26803 changed files with 16134934 additions and 0 deletions

View File

@@ -0,0 +1,64 @@
use crate::blocks::block::BlockType;
use alloc::vec::Vec;
#[derive(Debug)]
pub struct BlockHeader {
/// Signals if this block is the last one.
/// The frame will end after this block.
pub last_block: bool,
/// Influences the meaning of `block_size`.
pub block_type: BlockType,
/// - For `Raw` blocks, this is the size of the block's
/// content in bytes.
/// - For `RLE` blocks, there will be a single byte following
/// the header, repeated `block_size` times.
/// - For `Compressed` blocks, this is the length of
/// the compressed data.
///
/// **This value must not be greater than 21 bits in length.**
pub block_size: u32,
}
impl BlockHeader {
/// Write the encoded binary representation of this header into the provided buffer.
pub fn serialize(self, output: &mut Vec<u8>) {
vprintln!("Serializing block with the header: {self:?}");
let encoded_block_type = match self.block_type {
BlockType::Raw => 0,
BlockType::RLE => 1,
BlockType::Compressed => 2,
BlockType::Reserved => panic!("You cannot use a reserved block type"),
};
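// Pack the header into 24 bits: bit 0 is `last_block`, bits 1-2 are the
// block type, and bits 3-23 carry `block_size`; written as 3 little-endian bytes.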
let mut block_header = self.block_size << 3;
block_header |= encoded_block_type << 1;
block_header |= self.last_block as u32;
output.extend_from_slice(&block_header.to_le_bytes()[0..3]);
}
}
#[cfg(test)]
mod tests {
use super::BlockHeader;
use crate::{blocks::block::BlockType, decoding::block_decoder};
use alloc::vec::Vec;
#[test]
fn block_header_serialize() {
let header = BlockHeader {
last_block: true,
block_type: super::BlockType::Compressed,
block_size: 69,
};
let mut serialized_header = Vec::new();
header.serialize(&mut serialized_header);
let mut decoder = block_decoder::new();
let parsed_header = decoder
.read_block_header(serialized_header.as_slice())
.unwrap()
.0;
assert!(parsed_header.last_block);
assert_eq!(parsed_header.block_type, BlockType::Compressed);
assert_eq!(parsed_header.content_size, 69);
}
}

View File

@@ -0,0 +1,376 @@
use alloc::vec::Vec;
use crate::{
bit_io::BitWriter,
encoding::frame_compressor::CompressState,
encoding::{Matcher, Sequence},
fse::fse_encoder::{build_table_from_data, FSETable, State},
huff0::huff0_encoder,
};
pub fn compress_block<M: Matcher>(state: &mut CompressState<M>, output: &mut Vec<u8>) {
let mut literals_vec = Vec::new();
let mut sequences = Vec::new();
state.matcher.start_matching(|seq| {
match seq {
Sequence::Literals { literals } => literals_vec.extend_from_slice(literals),
Sequence::Triple {
literals,
offset,
match_len,
} => {
literals_vec.extend_from_slice(literals);
sequences.push(crate::blocks::sequence_section::Sequence {
ll: literals.len() as u32,
ml: match_len as u32,
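// Offset values 1-3 are reserved for the repeat offset history,
// so a fresh offset N is encoded as N + 3.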
of: (offset + 3) as u32, // TODO make use of the offset history
});
}
}
});
// literals section
let mut writer = BitWriter::from(output);
if literals_vec.len() > 1024 {
if let Some(table) =
compress_literals(&literals_vec, state.last_huff_table.as_ref(), &mut writer)
{
state.last_huff_table.replace(table);
}
} else {
raw_literals(&literals_vec, &mut writer);
}
// sequences section
if sequences.is_empty() {
writer.write_bits(0u8, 8);
} else {
encode_seqnum(sequences.len(), &mut writer);
// Choose the tables
// TODO store previously used tables
let ll_mode = choose_table(
state.fse_tables.ll_previous.as_ref(),
&state.fse_tables.ll_default,
sequences.iter().map(|seq| encode_literal_length(seq.ll).0),
9,
);
let ml_mode = choose_table(
state.fse_tables.ml_previous.as_ref(),
&state.fse_tables.ml_default,
sequences.iter().map(|seq| encode_match_len(seq.ml).0),
9,
);
let of_mode = choose_table(
state.fse_tables.of_previous.as_ref(),
&state.fse_tables.of_default,
sequences.iter().map(|seq| encode_offset(seq.of).0),
8,
);
writer.write_bits(encode_fse_table_modes(&ll_mode, &ml_mode, &of_mode), 8);
encode_table(&ll_mode, &mut writer);
encode_table(&of_mode, &mut writer);
encode_table(&ml_mode, &mut writer);
encode_sequences(
&sequences,
&mut writer,
ll_mode.as_ref(),
ml_mode.as_ref(),
of_mode.as_ref(),
);
if let FseTableMode::Encoded(table) = ll_mode {
state.fse_tables.ll_previous = Some(table)
}
if let FseTableMode::Encoded(table) = ml_mode {
state.fse_tables.ml_previous = Some(table)
}
if let FseTableMode::Encoded(table) = of_mode {
state.fse_tables.of_previous = Some(table)
}
}
writer.flush();
}
#[derive(Clone)]
#[allow(clippy::large_enum_variant)]
enum FseTableMode<'a> {
Predefined(&'a FSETable),
Encoded(FSETable),
RepeatLast(&'a FSETable),
}
impl FseTableMode<'_> {
pub fn as_ref(&self) -> &FSETable {
match self {
Self::Predefined(t) => t,
Self::RepeatLast(t) => t,
Self::Encoded(t) => t,
}
}
}
fn choose_table<'a>(
previous: Option<&'a FSETable>,
default_table: &'a FSETable,
data: impl Iterator<Item = u8>,
max_log: u8,
) -> FseTableMode<'a> {
// TODO check if the new table is better than the predefined and previous table
let use_new_table = true;
let use_previous_table = false;
if use_previous_table {
FseTableMode::RepeatLast(previous.unwrap())
} else if use_new_table {
FseTableMode::Encoded(build_table_from_data(data, max_log, true))
} else {
FseTableMode::Predefined(default_table)
}
}
fn encode_table(mode: &FseTableMode<'_>, writer: &mut BitWriter<&mut Vec<u8>>) {
match mode {
FseTableMode::Predefined(_) => {}
FseTableMode::RepeatLast(_) => {}
FseTableMode::Encoded(table) => table.write_table(writer),
}
}
fn encode_fse_table_modes(
ll_mode: &FseTableMode<'_>,
ml_mode: &FseTableMode<'_>,
of_mode: &FseTableMode<'_>,
) -> u8 {
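// The Symbol_Compression_Modes byte: bits 7-6 literal lengths mode,
// bits 5-4 offsets mode, bits 3-2 match lengths mode, bits 1-0 reserved.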
fn mode_to_bits(mode: &FseTableMode<'_>) -> u8 {
match mode {
FseTableMode::Predefined(_) => 0,
FseTableMode::Encoded(_) => 2,
FseTableMode::RepeatLast(_) => 3,
}
}
mode_to_bits(ll_mode) << 6 | mode_to_bits(of_mode) << 4 | mode_to_bits(ml_mode) << 2
}
fn encode_sequences(
sequences: &[crate::blocks::sequence_section::Sequence],
writer: &mut BitWriter<&mut Vec<u8>>,
ll_table: &FSETable,
ml_table: &FSETable,
of_table: &FSETable,
) {
let sequence = sequences[sequences.len() - 1];
let (ll_code, ll_add_bits, ll_num_bits) = encode_literal_length(sequence.ll);
let (of_code, of_add_bits, of_num_bits) = encode_offset(sequence.of);
let (ml_code, ml_add_bits, ml_num_bits) = encode_match_len(sequence.ml);
let mut ll_state: &State = ll_table.start_state(ll_code);
let mut ml_state: &State = ml_table.start_state(ml_code);
let mut of_state: &State = of_table.start_state(of_code);
writer.write_bits(ll_add_bits, ll_num_bits);
writer.write_bits(ml_add_bits, ml_num_bits);
writer.write_bits(of_add_bits, of_num_bits);
// encode backwards so the decoder reads the first sequence first
if sequences.len() > 1 {
for sequence in (0..=sequences.len() - 2).rev() {
let sequence = sequences[sequence];
let (ll_code, ll_add_bits, ll_num_bits) = encode_literal_length(sequence.ll);
let (of_code, of_add_bits, of_num_bits) = encode_offset(sequence.of);
let (ml_code, ml_add_bits, ml_num_bits) = encode_match_len(sequence.ml);
{
let next = of_table.next_state(of_code, of_state.index);
let diff = of_state.index - next.baseline;
writer.write_bits(diff as u64, next.num_bits as usize);
of_state = next;
}
{
let next = ml_table.next_state(ml_code, ml_state.index);
let diff = ml_state.index - next.baseline;
writer.write_bits(diff as u64, next.num_bits as usize);
ml_state = next;
}
{
let next = ll_table.next_state(ll_code, ll_state.index);
let diff = ll_state.index - next.baseline;
writer.write_bits(diff as u64, next.num_bits as usize);
ll_state = next;
}
writer.write_bits(ll_add_bits, ll_num_bits);
writer.write_bits(ml_add_bits, ml_num_bits);
writer.write_bits(of_add_bits, of_num_bits);
}
}
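// The decoder reads the bitstream back to front, so the initial states are
// written last (LL last of all, since the decoder needs it first).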
writer.write_bits(ml_state.index as u64, ml_table.table_size.ilog2() as usize);
writer.write_bits(of_state.index as u64, of_table.table_size.ilog2() as usize);
writer.write_bits(ll_state.index as u64, ll_table.table_size.ilog2() as usize);
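// Terminate the backwards bitstream with a single set bit (adding a whole
// byte if already aligned) so the decoder can find where the data ends.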
let bits_to_fill = writer.misaligned();
if bits_to_fill == 0 {
writer.write_bits(1u32, 8);
} else {
writer.write_bits(1u32, bits_to_fill);
}
}
fn encode_seqnum(seqnum: usize, writer: &mut BitWriter<impl AsMut<Vec<u8>>>) {
const UPPER_LIMIT: usize = 0xFFFF + 0x7F00;
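// Number_of_Sequences header: one byte for values up to 127, two bytes
// (high bit of the first set) up to 0x7EFF, and a three-byte form
// prefixed with 0xFF for everything up to UPPER_LIMIT.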
match seqnum {
1..=127 => writer.write_bits(seqnum as u32, 8),
128..=0x7EFF => {
let upper = ((seqnum >> 8) | 0x80) as u8;
let lower = seqnum as u8;
writer.write_bits(upper, 8);
writer.write_bits(lower, 8);
}
0x7F00..=UPPER_LIMIT => {
let encode = seqnum - 0x7F00;
let upper = (encode >> 8) as u8;
let lower = encode as u8;
writer.write_bits(255u8, 8);
writer.write_bits(upper, 8);
writer.write_bits(lower, 8);
}
_ => unreachable!(),
}
}
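// Each of the encode_* helpers below returns (code, value of the extra bits,
// number of extra bits), following the corresponding tables in the Zstd spec.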
fn encode_literal_length(len: u32) -> (u8, u32, usize) {
match len {
0..=15 => (len as u8, 0, 0),
16..=17 => (16, len - 16, 1),
18..=19 => (17, len - 18, 1),
20..=21 => (18, len - 20, 1),
22..=23 => (19, len - 22, 1),
24..=27 => (20, len - 24, 2),
28..=31 => (21, len - 28, 2),
32..=39 => (22, len - 32, 3),
40..=47 => (23, len - 40, 3),
48..=63 => (24, len - 48, 4),
64..=127 => (25, len - 64, 6),
128..=255 => (26, len - 128, 7),
256..=511 => (27, len - 256, 8),
512..=1023 => (28, len - 512, 9),
1024..=2047 => (29, len - 1024, 10),
2048..=4095 => (30, len - 2048, 11),
4096..=8191 => (31, len - 4096, 12),
8192..=16383 => (32, len - 8192, 13),
16384..=32767 => (33, len - 16384, 14),
32768..=65535 => (34, len - 32768, 15),
65536..=131071 => (35, len - 65536, 16),
131072.. => unreachable!(),
}
}
fn encode_match_len(len: u32) -> (u8, u32, usize) {
match len {
0..=2 => unreachable!(),
3..=34 => (len as u8 - 3, 0, 0),
35..=36 => (32, len - 35, 1),
37..=38 => (33, len - 37, 1),
39..=40 => (34, len - 39, 1),
41..=42 => (35, len - 41, 1),
43..=46 => (36, len - 43, 2),
47..=50 => (37, len - 47, 2),
51..=58 => (38, len - 51, 3),
59..=66 => (39, len - 59, 3),
67..=82 => (40, len - 67, 4),
83..=98 => (41, len - 83, 4),
99..=130 => (42, len - 99, 5),
131..=258 => (43, len - 131, 7),
259..=514 => (44, len - 259, 8),
515..=1026 => (45, len - 515, 9),
1027..=2050 => (46, len - 1027, 10),
2051..=4098 => (47, len - 2051, 11),
4099..=8194 => (48, len - 4099, 12),
8195..=16386 => (49, len - 8195, 13),
16387..=32770 => (50, len - 16387, 14),
32771..=65538 => (51, len - 32771, 15),
65539..=131074 => (52, len - 65539, 16),
131075.. => unreachable!(),
}
}
fn encode_offset(len: u32) -> (u8, u32, usize) {
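// The offset code is the bit length of the value: code = floor(log2(len)),
// with the low `code` bits transmitted as extra bits.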
let log = len.ilog2();
let lower = len & ((1 << log) - 1);
(log as u8, lower, log as usize)
}
fn raw_literals(literals: &[u8], writer: &mut BitWriter<&mut Vec<u8>>) {
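// Literals_Section_Header for a Raw_Literals_Block: type 0, size_format
// 0b11, followed by a 20-bit Regenerated_Size.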
writer.write_bits(0u8, 2);
writer.write_bits(0b11u8, 2);
writer.write_bits(literals.len() as u32, 20);
writer.append_bytes(literals);
}
fn compress_literals(
literals: &[u8],
last_table: Option<&huff0_encoder::HuffmanTable>,
writer: &mut BitWriter<&mut Vec<u8>>,
) -> Option<huff0_encoder::HuffmanTable> {
let reset_idx = writer.index();
let new_encoder_table = huff0_encoder::HuffmanTable::build_from_data(literals);
let (encoder_table, new_table) = if let Some(table) = last_table {
if let Some(diff) = table.can_encode(&new_encoder_table) {
// TODO this is a very simple heuristic, maybe we should try to do better
if diff > 5 {
(&new_encoder_table, true)
} else {
(table, false)
}
}
} else {
(&new_encoder_table, true)
}
} else {
(&new_encoder_table, true)
};
if new_table {
writer.write_bits(2u8, 2); // compressed literals type
} else {
writer.write_bits(3u8, 2); // treeless compressed literals type
}
let (size_format, size_bits) = match literals.len() {
0..6 => (0b00u8, 10),
6..1024 => (0b01, 10),
1024..16384 => (0b10, 14),
16384..262144 => (0b11, 18),
_ => unimplemented!("too many literals"),
};
writer.write_bits(size_format, 2);
writer.write_bits(literals.len() as u32, size_bits);
let size_index = writer.index();
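// Write a placeholder for the compressed size, patched via `change_bits`
// below once the actual encoded length is known.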
writer.write_bits(0u32, size_bits);
let index_before = writer.index();
let mut encoder = huff0_encoder::HuffmanEncoder::new(encoder_table, writer);
if size_format == 0 {
encoder.encode(literals, new_table)
} else {
encoder.encode4x(literals, new_table)
};
let encoded_len = (writer.index() - index_before) / 8;
writer.change_bits(size_index, encoded_len as u64, size_bits);
let total_len = (writer.index() - reset_idx) / 8;
// If the encoded length is at least as large as the raw literals, we are better off just writing the raw literals
if total_len >= literals.len() {
writer.reset_to(reset_idx);
raw_literals(literals, writer);
None
} else if new_table {
Some(new_encoder_table)
} else {
None
}
}

View File

@@ -0,0 +1,8 @@
//! After the Magic_Number and Frame_Header, a frame contains a number of blocks. Each frame must have at least one block,
//! but there is no upper limit on the number of blocks per frame.
//!
//! There are a few different kinds of blocks, and implementations for those kinds are
//! in this module.
mod compressed;
pub(super) use compressed::*;

View File

@@ -0,0 +1,461 @@
//! Utilities and interfaces for encoding an entire frame. Allows reusing resources
use alloc::vec::Vec;
use core::convert::TryInto;
#[cfg(feature = "hash")]
use twox_hash::XxHash64;
#[cfg(feature = "hash")]
use core::hash::Hasher;
use super::{
block_header::BlockHeader, frame_header::FrameHeader, levels::*,
match_generator::MatchGeneratorDriver, CompressionLevel, Matcher,
};
use crate::fse::fse_encoder::{default_ll_table, default_ml_table, default_of_table, FSETable};
use crate::io::{Read, Write};
/// An interface for compressing arbitrary data with the ZStandard compression algorithm.
///
/// `FrameCompressor` will generally be used by:
/// 1. Initializing a compressor with a compression level using `FrameCompressor::new()`
/// 2. Attaching a source and a drain using `set_source` and `set_drain`
/// 3. Starting compression and writing the output into the drain using `FrameCompressor::compress`
///
/// # Examples
/// ```
/// use ruzstd::encoding::{FrameCompressor, CompressionLevel};
/// let mock_data: &[_] = &[0x1, 0x2, 0x3, 0x4];
/// let mut output = std::vec::Vec::new();
/// // Initialize a compressor.
/// let mut compressor = FrameCompressor::new(CompressionLevel::Uncompressed);
/// compressor.set_source(mock_data);
/// compressor.set_drain(&mut output);
///
/// // `compress` writes the compressed output into the provided buffer.
/// compressor.compress();
/// ```
pub struct FrameCompressor<R: Read, W: Write, M: Matcher> {
uncompressed_data: Option<R>,
compressed_data: Option<W>,
compression_level: CompressionLevel,
state: CompressState<M>,
#[cfg(feature = "hash")]
hasher: XxHash64,
}
pub(crate) struct FseTables {
pub(crate) ll_default: FSETable,
pub(crate) ll_previous: Option<FSETable>,
pub(crate) ml_default: FSETable,
pub(crate) ml_previous: Option<FSETable>,
pub(crate) of_default: FSETable,
pub(crate) of_previous: Option<FSETable>,
}
impl FseTables {
pub fn new() -> Self {
Self {
ll_default: default_ll_table(),
ll_previous: None,
ml_default: default_ml_table(),
ml_previous: None,
of_default: default_of_table(),
of_previous: None,
}
}
}
pub(crate) struct CompressState<M: Matcher> {
pub(crate) matcher: M,
pub(crate) last_huff_table: Option<crate::huff0::huff0_encoder::HuffmanTable>,
pub(crate) fse_tables: FseTables,
}
impl<R: Read, W: Write> FrameCompressor<R, W, MatchGeneratorDriver> {
/// Create a new `FrameCompressor`
pub fn new(compression_level: CompressionLevel) -> Self {
Self {
uncompressed_data: None,
compressed_data: None,
compression_level,
state: CompressState {
matcher: MatchGeneratorDriver::new(1024 * 128, 1),
last_huff_table: None,
fse_tables: FseTables::new(),
},
#[cfg(feature = "hash")]
hasher: XxHash64::with_seed(0),
}
}
}
impl<R: Read, W: Write, M: Matcher> FrameCompressor<R, W, M> {
/// Create a new `FrameCompressor` with a custom matching algorithm implementation
pub fn new_with_matcher(matcher: M, compression_level: CompressionLevel) -> Self {
Self {
uncompressed_data: None,
compressed_data: None,
state: CompressState {
matcher,
last_huff_table: None,
fse_tables: FseTables::new(),
},
compression_level,
#[cfg(feature = "hash")]
hasher: XxHash64::with_seed(0),
}
}
/// Before calling [FrameCompressor::compress] you need to set the source.
///
/// This is the data that is compressed and written into the drain.
pub fn set_source(&mut self, uncompressed_data: R) -> Option<R> {
self.uncompressed_data.replace(uncompressed_data)
}
/// Before calling [FrameCompressor::compress] you need to set the drain.
///
/// As the compressor compresses data, the drain serves as the place where the output is written.
pub fn set_drain(&mut self, compressed_data: W) -> Option<W> {
self.compressed_data.replace(compressed_data)
}
/// Compress the uncompressed data from the provided source as one Zstd frame and write it to the provided drain
///
/// This will repeatedly call [Read::read] on the source to fill up blocks until the source returns 0 on the read call.
/// Also [Write::write_all] will be called on the drain after each block has been encoded.
///
/// To avoid endlessly encoding from a potentially endless source (like a network socket), you can use the
/// [Read::take] function.
pub fn compress(&mut self) {
// Clearing buffers to allow re-using of the compressor
self.state.matcher.reset(self.compression_level);
self.state.last_huff_table = None;
let source = self.uncompressed_data.as_mut().unwrap();
let drain = self.compressed_data.as_mut().unwrap();
// As the frame is compressed, it's stored here
let output: &mut Vec<u8> = &mut Vec::with_capacity(1024 * 130);
// First write the frame header
let header = FrameHeader {
frame_content_size: None,
single_segment: false,
content_checksum: cfg!(feature = "hash"),
dictionary_id: None,
window_size: Some(self.state.matcher.window_size()),
};
header.serialize(output);
// Now compress block by block
loop {
// Read a single block's worth of uncompressed data from the input
let mut uncompressed_data = self.state.matcher.get_next_space();
let mut read_bytes = 0;
let last_block;
'read_loop: loop {
let new_bytes = source.read(&mut uncompressed_data[read_bytes..]).unwrap();
if new_bytes == 0 {
last_block = true;
break 'read_loop;
}
read_bytes += new_bytes;
if read_bytes == uncompressed_data.len() {
last_block = false;
break 'read_loop;
}
}
uncompressed_data.resize(read_bytes, 0);
// As we read, hash that data too
#[cfg(feature = "hash")]
self.hasher.write(&uncompressed_data);
// Special handling is needed for compression of a totally empty file (why you'd want to do that, I don't know)
if uncompressed_data.is_empty() {
let header = BlockHeader {
last_block: true,
block_type: crate::blocks::block::BlockType::Raw,
block_size: 0,
};
// Write the header, then the block
header.serialize(output);
drain.write_all(output).unwrap();
output.clear();
break;
}
match self.compression_level {
CompressionLevel::Uncompressed => {
let header = BlockHeader {
last_block,
block_type: crate::blocks::block::BlockType::Raw,
block_size: read_bytes.try_into().unwrap(),
};
// Write the header, then the block
header.serialize(output);
output.extend_from_slice(&uncompressed_data);
}
CompressionLevel::Fastest => {
compress_fastest(&mut self.state, last_block, uncompressed_data, output)
}
_ => {
unimplemented!();
}
}
drain.write_all(output).unwrap();
output.clear();
if last_block {
break;
}
}
// If the `hash` feature is enabled, then `content_checksum` is set to true in the header
// and a 32 bit hash is written at the end of the data.
#[cfg(feature = "hash")]
{
// The hasher was fed incrementally as each block's data was read,
// so the checksum is ready once the final block has been written.
let content_checksum = self.hasher.finish();
drain
.write_all(&(content_checksum as u32).to_le_bytes())
.unwrap();
}
}
/// Get a mutable reference to the source
pub fn source_mut(&mut self) -> Option<&mut R> {
self.uncompressed_data.as_mut()
}
/// Get a mutable reference to the drain
pub fn drain_mut(&mut self) -> Option<&mut W> {
self.compressed_data.as_mut()
}
/// Get a reference to the source
pub fn source(&self) -> Option<&R> {
self.uncompressed_data.as_ref()
}
/// Get a reference to the drain
pub fn drain(&self) -> Option<&W> {
self.compressed_data.as_ref()
}
/// Retrieve the source
pub fn take_source(&mut self) -> Option<R> {
self.uncompressed_data.take()
}
/// Retrieve the drain
pub fn take_drain(&mut self) -> Option<W> {
self.compressed_data.take()
}
/// Before calling [FrameCompressor::compress] you can replace the matcher
pub fn replace_matcher(&mut self, mut match_generator: M) -> M {
core::mem::swap(&mut match_generator, &mut self.state.matcher);
match_generator
}
/// Before calling [FrameCompressor::compress] you can replace the compression level
pub fn set_compression_level(
&mut self,
compression_level: CompressionLevel,
) -> CompressionLevel {
let old = self.compression_level;
self.compression_level = compression_level;
old
}
/// Get the current compression level
pub fn compression_level(&self) -> CompressionLevel {
self.compression_level
}
}
#[cfg(test)]
mod tests {
use alloc::vec;
use super::FrameCompressor;
use crate::common::MAGIC_NUM;
use crate::decoding::FrameDecoder;
use alloc::vec::Vec;
#[test]
fn frame_starts_with_magic_num() {
let mock_data = [1_u8, 2, 3].as_slice();
let mut output: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
compressor.set_source(mock_data);
compressor.set_drain(&mut output);
compressor.compress();
assert!(output.starts_with(&MAGIC_NUM.to_le_bytes()));
}
#[test]
fn very_simple_raw_compress() {
let mock_data = [1_u8, 2, 3].as_slice();
let mut output: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
compressor.set_source(mock_data);
compressor.set_drain(&mut output);
compressor.compress();
}
#[test]
fn very_simple_compress() {
let mut mock_data = vec![0; 1 << 17];
mock_data.extend(vec![1; (1 << 17) - 1]);
mock_data.extend(vec![2; (1 << 18) - 1]);
mock_data.extend(vec![2; 1 << 17]);
mock_data.extend(vec![3; (1 << 17) - 1]);
let mut output: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
compressor.set_source(mock_data.as_slice());
compressor.set_drain(&mut output);
compressor.compress();
let mut decoder = FrameDecoder::new();
let mut decoded = Vec::with_capacity(mock_data.len());
decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
assert_eq!(mock_data, decoded);
let mut decoded = Vec::new();
zstd::stream::copy_decode(output.as_slice(), &mut decoded).unwrap();
assert_eq!(mock_data, decoded);
}
#[test]
fn rle_compress() {
let mock_data = vec![0; 1 << 19];
let mut output: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
compressor.set_source(mock_data.as_slice());
compressor.set_drain(&mut output);
compressor.compress();
let mut decoder = FrameDecoder::new();
let mut decoded = Vec::with_capacity(mock_data.len());
decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
assert_eq!(mock_data, decoded);
}
#[test]
fn aaa_compress() {
let mock_data = vec![0, 1, 3, 4, 5];
let mut output: Vec<u8> = Vec::new();
let mut compressor = FrameCompressor::new(super::CompressionLevel::Uncompressed);
compressor.set_source(mock_data.as_slice());
compressor.set_drain(&mut output);
compressor.compress();
let mut decoder = FrameDecoder::new();
let mut decoded = Vec::with_capacity(mock_data.len());
decoder.decode_all_to_vec(&output, &mut decoded).unwrap();
assert_eq!(mock_data, decoded);
let mut decoded = Vec::new();
zstd::stream::copy_decode(output.as_slice(), &mut decoded).unwrap();
assert_eq!(mock_data, decoded);
}
#[cfg(feature = "std")]
#[test]
fn fuzz_targets() {
use std::io::Read;
fn decode_ruzstd(data: &mut dyn std::io::Read) -> Vec<u8> {
let mut decoder = crate::decoding::StreamingDecoder::new(data).unwrap();
let mut result: Vec<u8> = Vec::new();
decoder.read_to_end(&mut result).expect("Decoding failed");
result
}
fn decode_ruzstd_writer(mut data: impl Read) -> Vec<u8> {
let mut decoder = crate::decoding::FrameDecoder::new();
decoder.reset(&mut data).unwrap();
let mut result = vec![];
while !decoder.is_finished() || decoder.can_collect() > 0 {
decoder
.decode_blocks(
&mut data,
crate::decoding::BlockDecodingStrategy::UptoBytes(1024 * 1024),
)
.unwrap();
decoder.collect_to_writer(&mut result).unwrap();
}
result
}
fn encode_zstd(data: &[u8]) -> Result<Vec<u8>, std::io::Error> {
zstd::stream::encode_all(std::io::Cursor::new(data), 3)
}
fn encode_ruzstd_uncompressed(data: &mut dyn std::io::Read) -> Vec<u8> {
let mut input = Vec::new();
data.read_to_end(&mut input).unwrap();
crate::encoding::compress_to_vec(
input.as_slice(),
crate::encoding::CompressionLevel::Uncompressed,
)
}
fn encode_ruzstd_compressed(data: &mut dyn std::io::Read) -> Vec<u8> {
let mut input = Vec::new();
data.read_to_end(&mut input).unwrap();
crate::encoding::compress_to_vec(
input.as_slice(),
crate::encoding::CompressionLevel::Fastest,
)
}
fn decode_zstd(data: &[u8]) -> Result<Vec<u8>, std::io::Error> {
let mut output = Vec::new();
zstd::stream::copy_decode(data, &mut output)?;
Ok(output)
}
if std::fs::exists("fuzz/artifacts/interop").unwrap_or(false) {
for file in std::fs::read_dir("fuzz/artifacts/interop").unwrap() {
if file.as_ref().unwrap().file_type().unwrap().is_file() {
let data = std::fs::read(file.unwrap().path()).unwrap();
let data = data.as_slice();
// Decoding
let compressed = encode_zstd(data).unwrap();
let decoded = decode_ruzstd(&mut compressed.as_slice());
let decoded2 = decode_ruzstd_writer(&mut compressed.as_slice());
assert!(
decoded == data,
"Decoded data did not match the original input during decompression"
);
assert_eq!(
decoded2, data,
"Decoded data did not match the original input during decompression"
);
// Encoding
// Uncompressed encoding
let mut input = data;
let compressed = encode_ruzstd_uncompressed(&mut input);
let decoded = decode_zstd(&compressed).unwrap();
assert_eq!(
decoded, data,
"Decoded data did not match the original input during compression"
);
// Compressed encoding
let mut input = data;
let compressed = encode_ruzstd_compressed(&mut input);
let decoded = decode_zstd(&compressed).unwrap();
assert_eq!(
decoded, data,
"Decoded data did not match the original input during compression"
);
}
}
}
}
}

View File

@@ -0,0 +1,231 @@
//! Utilities and representations for a frame header.
use crate::bit_io::BitWriter;
use crate::common::MAGIC_NUM;
use crate::encoding::util::{find_min_size, minify_val};
use alloc::vec::Vec;
/// A header for a single Zstandard frame.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header>
#[derive(Debug)]
pub struct FrameHeader {
/// Optionally, the original (uncompressed) size of the data within the frame in bytes.
/// If not present, `window_size` must be set.
pub frame_content_size: Option<u64>,
/// If set to true, data must be regenerated within a single
/// continuous memory segment.
pub single_segment: bool,
/// If set to true, a 32 bit content checksum will be present
/// at the end of the frame.
pub content_checksum: bool,
/// If a dictionary ID is provided, the ID of that dictionary.
pub dictionary_id: Option<u64>,
/// The minimum memory buffer size required to decompress a frame. If not present,
/// `single_segment` will be set to true. If present, this value must be at least 1 KB
/// and at most 3.75 TB. Encoders should not generate a frame that requires a window size
/// larger than 8 MB.
pub window_size: Option<u64>,
}
impl FrameHeader {
/// Writes the serialized frame header into the provided buffer.
///
/// The serialized header *does include* the frame header descriptor.
pub fn serialize(self, output: &mut Vec<u8>) {
vprintln!("Serializing frame with header: {self:?}");
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header
// Magic Number:
output.extend_from_slice(&MAGIC_NUM.to_le_bytes());
// `Frame_Header_Descriptor`:
output.push(self.descriptor());
// `Window_Descriptor`:
// TODO: https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
if !self.single_segment {
if let Some(window_size) = self.window_size {
let log = window_size.next_power_of_two().ilog2();
let exponent = if log > 10 { log - 10 } else { 1 } as u8;
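// The window descriptor stores the exponent in its upper five bits
// (mantissa zero), encoding a window size of 2^(10 + exponent) bytes.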
output.push(exponent << 3);
}
}
if let Some(id) = self.dictionary_id {
output.extend(minify_val(id));
}
if let Some(frame_content_size) = self.frame_content_size {
output.extend(minify_val_fcs(frame_content_size));
}
}
/// Generate a serialized frame header descriptor for the frame header.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header_descriptor>
fn descriptor(&self) -> u8 {
let mut bw = BitWriter::new();
// A frame header starts with a frame header descriptor.
// It describes what other fields are present
// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_header_descriptor
// Writing the frame header descriptor:
// `Frame_Content_Size_flag`:
// The Frame_Content_Size_flag specifies if
// the Frame_Content_Size field is provided within the header.
// TODO: The Frame_Content_Size field isn't currently set at all; we should prefer to always include it.
// If the `Single_Segment_flag` is set and this value is zero,
// the size of the FCS field is 1 byte.
// Otherwise, the FCS field is omitted.
// | Value | Size of field (Bytes)
// | 0 | 0 or 1
// | 1 | 2
// | 2 | 4
// | 3 | 8
// `Dictionary_ID_flag`:
if let Some(id) = self.dictionary_id {
let flag_value: u8 = match find_min_size(id) {
0 => 0,
1 => 1,
2 => 2,
4 => 3,
_ => panic!(),
};
bw.write_bits(flag_value, 2);
} else {
// A `Dictionary_ID` was not provided
bw.write_bits(0u8, 2);
}
// `Content_Checksum_flag`:
if self.content_checksum {
bw.write_bits(1u8, 1);
} else {
bw.write_bits(0u8, 1);
}
// `Reserved_bit`:
// This value must be zero
bw.write_bits(0u8, 1);
// `Unused_bit`:
// An encoder compliant with this spec must set this bit to zero
bw.write_bits(0u8, 1);
// `Single_Segment_flag`:
// If this flag is set, data must be regenerated within a single continuous memory segment,
// and the `Frame_Content_Size` field must be present in the header.
// If this flag is not set, the `Window_Descriptor` field must be present in the frame header.
if self.single_segment {
assert!(self.frame_content_size.is_some(), "if the `single_segment` flag is set to true, then a frame content size must be provided");
bw.write_bits(1u8, 1);
} else {
assert!(
self.window_size.is_some(),
"if the `single_segment` flag is set to false, then a window size must be provided"
);
bw.write_bits(0u8, 1);
}
if let Some(frame_content_size) = self.frame_content_size {
let field_size = find_min_size(frame_content_size);
let flag_value: u8 = match field_size {
1 => 0,
2 => 1,
4 => 2,
8 => 3,
_ => panic!(),
};
bw.write_bits(flag_value, 2);
} else {
// `Frame_Content_Size` was not provided
bw.write_bits(0u8, 2);
}
bw.dump()[0]
}
}
/// Identical to [`minify_val`], but it implements the following edge case:
///
/// > When FCS_Field_Size is 1, 4 or 8 bytes, the value is read directly. When FCS_Field_Size is 2, the offset of 256 is added.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_content_size>
fn minify_val_fcs(val: u64) -> Vec<u8> {
let new_size = find_min_size(val);
let mut val = val;
if new_size == 2 {
val -= 256;
}
val.to_le_bytes()[0..new_size].to_vec()
}
#[cfg(test)]
mod tests {
use super::FrameHeader;
use crate::decoding::frame::{read_frame_header, FrameDescriptor};
use alloc::vec::Vec;
#[test]
fn frame_header_descriptor_decode() {
let header = FrameHeader {
frame_content_size: Some(1),
single_segment: true,
content_checksum: false,
dictionary_id: None,
window_size: None,
};
let descriptor = header.descriptor();
let decoded_descriptor = FrameDescriptor(descriptor);
assert_eq!(decoded_descriptor.frame_content_size_bytes().unwrap(), 1);
assert!(!decoded_descriptor.content_checksum_flag());
assert_eq!(decoded_descriptor.dictionary_id_bytes().unwrap(), 0);
}
#[test]
fn frame_header_decode() {
let header = FrameHeader {
frame_content_size: Some(1),
single_segment: true,
content_checksum: false,
dictionary_id: None,
window_size: None,
};
let mut serialized_header = Vec::new();
header.serialize(&mut serialized_header);
let parsed_header = read_frame_header(serialized_header.as_slice()).unwrap().0;
assert!(parsed_header.dictionary_id().is_none());
assert_eq!(parsed_header.frame_content_size(), 1);
}
#[test]
#[should_panic]
fn catches_single_segment_no_fcs() {
let header = FrameHeader {
frame_content_size: None,
single_segment: true,
content_checksum: false,
dictionary_id: None,
window_size: Some(1),
};
let mut serialized_header = Vec::new();
header.serialize(&mut serialized_header);
}
#[test]
#[should_panic]
fn catches_no_single_segment_no_winsize() {
let header = FrameHeader {
frame_content_size: Some(7),
single_segment: false,
content_checksum: false,
dictionary_id: None,
window_size: None,
};
let mut serialized_header = Vec::new();
header.serialize(&mut serialized_header);
}
}

View File

@@ -0,0 +1,67 @@
use crate::{
common::MAX_BLOCK_SIZE,
encoding::{
block_header::BlockHeader, blocks::compress_block, frame_compressor::CompressState, Matcher,
},
};
use alloc::vec::Vec;
/// Compresses a single block at [`crate::encoding::CompressionLevel::Fastest`].
///
/// # Parameters
/// - `state`: [`CompressState`] so the compressor can refer to data before
/// the start of this block
/// - `last_block`: Whether or not this block is going to be the last block in the frame
/// (needed because this info is written into the block header)
/// - `uncompressed_data`: A block's worth of uncompressed data, taken from the
/// larger input
/// - `output`: As `uncompressed_data` is compressed, it's appended to `output`.
#[inline]
pub fn compress_fastest<M: Matcher>(
state: &mut CompressState<M>,
last_block: bool,
uncompressed_data: Vec<u8>,
output: &mut Vec<u8>,
) {
let block_size = uncompressed_data.len() as u32;
// First check to see if run length encoding can be used for the entire block
if uncompressed_data.iter().all(|x| uncompressed_data[0].eq(x)) {
let rle_byte = uncompressed_data[0];
state.matcher.commit_space(uncompressed_data);
state.matcher.skip_matching();
let header = BlockHeader {
last_block,
block_type: crate::blocks::block::BlockType::RLE,
block_size,
};
// Write the header, then the block
header.serialize(output);
output.push(rle_byte);
} else {
// Compress as a standard compressed block
let mut compressed = Vec::new();
state.matcher.commit_space(uncompressed_data);
compress_block(state, &mut compressed);
// If the compressed data is larger than the maximum
// allowable block size, instead store uncompressed
if compressed.len() >= MAX_BLOCK_SIZE as usize {
let header = BlockHeader {
last_block,
block_type: crate::blocks::block::BlockType::Raw,
block_size,
};
// Write the header, then the block
header.serialize(output);
output.extend_from_slice(state.matcher.get_last_space());
} else {
let header = BlockHeader {
last_block,
block_type: crate::blocks::block::BlockType::Compressed,
block_size: compressed.len() as u32,
};
// Write the header, then the block
header.serialize(output);
output.extend(compressed);
}
}
}

View File

@@ -0,0 +1,2 @@
mod fastest;
pub use fastest::compress_fastest;

View File

@@ -0,0 +1,619 @@
//! Matching algorithm used to find repeated parts in the original data
//!
//! The Zstd format relies on finding repeated sequences of data and compressing these sequences as instructions to the decoder.
//! A sequence basically tells the decoder "Go back X bytes and copy Y bytes to the end of your decode buffer".
//!
//! The task here is to efficiently find matches in the already encoded data for the current suffix of the not yet encoded data.
use alloc::vec::Vec;
use core::num::NonZeroUsize;
use super::CompressionLevel;
use super::Matcher;
use super::Sequence;
const MIN_MATCH_LEN: usize = 5;
/// Takes care of allocating and reusing vecs
pub struct MatchGeneratorDriver {
vec_pool: Vec<Vec<u8>>,
suffix_pool: Vec<SuffixStore>,
match_generator: MatchGenerator,
slice_size: usize,
}
impl MatchGeneratorDriver {
/// `slice_size` determines how big the allocated working slices are.
/// `max_slices_in_window` limits how many slices are used at most while looking for matches.
pub(crate) fn new(slice_size: usize, max_slices_in_window: usize) -> Self {
Self {
vec_pool: Vec::new(),
suffix_pool: Vec::new(),
match_generator: MatchGenerator::new(max_slices_in_window * slice_size),
slice_size,
}
}
}
impl Matcher for MatchGeneratorDriver {
fn reset(&mut self, _level: CompressionLevel) {
let vec_pool = &mut self.vec_pool;
let suffix_pool = &mut self.suffix_pool;
self.match_generator.reset(|mut data, mut suffixes| {
data.resize(data.capacity(), 0);
vec_pool.push(data);
suffixes.slots.clear();
suffixes.slots.resize(suffixes.slots.capacity(), None);
suffix_pool.push(suffixes);
});
}
fn window_size(&self) -> u64 {
self.match_generator.max_window_size as u64
}
fn get_next_space(&mut self) -> Vec<u8> {
self.vec_pool.pop().unwrap_or_else(|| {
let mut space = alloc::vec![0; self.slice_size];
space.resize(space.capacity(), 0);
space
})
}
fn get_last_space(&mut self) -> &[u8] {
self.match_generator.window.last().unwrap().data.as_slice()
}
fn commit_space(&mut self, space: Vec<u8>) {
let vec_pool = &mut self.vec_pool;
let suffixes = self
.suffix_pool
.pop()
.unwrap_or_else(|| SuffixStore::with_capacity(space.len()));
let suffix_pool = &mut self.suffix_pool;
self.match_generator
.add_data(space, suffixes, |mut data, mut suffixes| {
data.resize(data.capacity(), 0);
vec_pool.push(data);
suffixes.slots.clear();
suffixes.slots.resize(suffixes.slots.capacity(), None);
suffix_pool.push(suffixes);
});
}
fn start_matching(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) {
while self.match_generator.next_sequence(&mut handle_sequence) {}
}
fn skip_matching(&mut self) {
self.match_generator.skip_matching();
}
}
/// Stores the index of a suffix of a string, keyed by a hash of the first few bytes of that suffix.
/// Collisions simply overwrite, so a result must be validated after every `get`.
struct SuffixStore {
// We use NonZeroUsize to enable niche optimization here.
// On store we do +1 and on get -1
// This is ok since usize::MAX is never a valid offset
slots: Vec<Option<NonZeroUsize>>,
len_log: u32,
}
impl SuffixStore {
fn with_capacity(capacity: usize) -> Self {
Self {
slots: alloc::vec![None; capacity],
len_log: capacity.ilog2(),
}
}
#[inline(always)]
fn insert(&mut self, suffix: &[u8], idx: usize) {
let key = self.key(suffix);
self.slots[key] = Some(NonZeroUsize::new(idx + 1).unwrap());
}
#[inline(always)]
fn contains_key(&self, suffix: &[u8]) -> bool {
let key = self.key(suffix);
self.slots[key].is_some()
}
#[inline(always)]
fn get(&self, suffix: &[u8]) -> Option<usize> {
let key = self.key(suffix);
self.slots[key].map(|x| <NonZeroUsize as Into<usize>>::into(x) - 1)
}
#[inline(always)]
fn key(&self, suffix: &[u8]) -> usize {
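// Multiply-xor hash: spread each of the five key bytes with a constant,
// fold them together, and keep the top `len_log` bits as the slot index.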
let s0 = suffix[0] as u64;
let s1 = suffix[1] as u64;
let s2 = suffix[2] as u64;
let s3 = suffix[3] as u64;
let s4 = suffix[4] as u64;
const POLY: u64 = 0xCF3BCCDCABu64;
let s0 = (s0 << 24).wrapping_mul(POLY);
let s1 = (s1 << 32).wrapping_mul(POLY);
let s2 = (s2 << 40).wrapping_mul(POLY);
let s3 = (s3 << 48).wrapping_mul(POLY);
let s4 = (s4 << 56).wrapping_mul(POLY);
let index = s0 ^ s1 ^ s2 ^ s3 ^ s4;
let index = index >> (64 - self.len_log);
index as usize % self.slots.len()
}
}
/// We keep a window of a few of these entries
/// All of these are valid targets for a match to be generated for
struct WindowEntry {
data: Vec<u8>,
/// Stores indexes into data
suffixes: SuffixStore,
/// Makes offset calculations efficient
base_offset: usize,
}
pub(crate) struct MatchGenerator {
max_window_size: usize,
/// Data window we are operating on to find matches
/// The data we want to find matches for is in the last slice
window: Vec<WindowEntry>,
window_size: usize,
#[cfg(debug_assertions)]
concat_window: Vec<u8>,
/// Index in the last slice that we already processed
suffix_idx: usize,
/// Gets updated when a new sequence is returned to point right behind that sequence
last_idx_in_sequence: usize,
}
impl MatchGenerator {
/// max_size defines how many bytes will be used at most in the window used for matching
fn new(max_size: usize) -> Self {
Self {
max_window_size: max_size,
window: Vec::new(),
window_size: 0,
#[cfg(debug_assertions)]
concat_window: Vec::new(),
suffix_idx: 0,
last_idx_in_sequence: 0,
}
}
fn reset(&mut self, mut reuse_space: impl FnMut(Vec<u8>, SuffixStore)) {
self.window_size = 0;
#[cfg(debug_assertions)]
self.concat_window.clear();
self.suffix_idx = 0;
self.last_idx_in_sequence = 0;
self.window.drain(..).for_each(|entry| {
reuse_space(entry.data, entry.suffixes);
});
}
/// Processes bytes in the current window until either a match is found or no more matches can be found
/// * If a match is found handle_sequence is called with the Triple variant
/// * If no more matches can be found but there are bytes still left handle_sequence is called with the Literals variant
/// * If no more matches can be found and no more bytes are left this returns false
fn next_sequence(&mut self, mut handle_sequence: impl for<'a> FnMut(Sequence<'a>)) -> bool {
loop {
let last_entry = self.window.last().unwrap();
let data_slice = &last_entry.data;
// We already reached the end of the window, check if we need to return a Literals{}
if self.suffix_idx >= data_slice.len() {
if self.last_idx_in_sequence != self.suffix_idx {
let literals = &data_slice[self.last_idx_in_sequence..];
self.last_idx_in_sequence = self.suffix_idx;
handle_sequence(Sequence::Literals { literals });
return true;
} else {
return false;
}
}
// If the remaining data is smaller than the minimum match length we can stop and return a Literals{}
let data_slice = &data_slice[self.suffix_idx..];
if data_slice.len() < MIN_MATCH_LEN {
let last_idx_in_sequence = self.last_idx_in_sequence;
self.last_idx_in_sequence = last_entry.data.len();
self.suffix_idx = last_entry.data.len();
handle_sequence(Sequence::Literals {
literals: &last_entry.data[last_idx_in_sequence..],
});
return true;
}
// This is the key we are looking to find a match for
let key = &data_slice[..MIN_MATCH_LEN];
// Look in each window entry
let mut candidate = None;
for (match_entry_idx, match_entry) in self.window.iter().enumerate() {
let is_last = match_entry_idx == self.window.len() - 1;
if let Some(match_index) = match_entry.suffixes.get(key) {
let match_slice = if is_last {
&match_entry.data[match_index..self.suffix_idx]
} else {
&match_entry.data[match_index..]
};
// Check how long the common prefix actually is
let match_len = Self::common_prefix_len(match_slice, data_slice);
// Collisions in the suffix store might make this check fail
if match_len >= MIN_MATCH_LEN {
let offset = match_entry.base_offset + self.suffix_idx - match_index;
// If we are in debug/tests make sure the match we found is actually at the offset we calculated
#[cfg(debug_assertions)]
{
let unprocessed = last_entry.data.len() - self.suffix_idx;
let start = self.concat_window.len() - unprocessed - offset;
let end = start + match_len;
let check_slice = &self.concat_window[start..end];
debug_assert_eq!(check_slice, &match_slice[..match_len]);
}
if let Some((old_offset, old_match_len)) = candidate {
if match_len > old_match_len
|| (match_len == old_match_len && offset < old_offset)
{
candidate = Some((offset, match_len));
}
} else {
candidate = Some((offset, match_len));
}
}
}
}
if let Some((offset, match_len)) = candidate {
// For each index in the match we found we do not need to look for another match
// But we still want them registered in the suffix store
self.add_suffixes_till(self.suffix_idx + match_len);
// All literals that were not included between this match and the last are now included here
let last_entry = self.window.last().unwrap();
let literals = &last_entry.data[self.last_idx_in_sequence..self.suffix_idx];
// Update the indexes; all indexes up to and including the current one are now covered by a sequence
self.suffix_idx += match_len;
self.last_idx_in_sequence = self.suffix_idx;
handle_sequence(Sequence::Triple {
literals,
offset,
match_len,
});
return true;
}
let last_entry = self.window.last_mut().unwrap();
let key = &last_entry.data[self.suffix_idx..self.suffix_idx + MIN_MATCH_LEN];
if !last_entry.suffixes.contains_key(key) {
last_entry.suffixes.insert(key, self.suffix_idx);
}
self.suffix_idx += 1;
}
}
/// Find the common prefix length between two byte slices
#[inline(always)]
fn common_prefix_len(a: &[u8], b: &[u8]) -> usize {
Self::mismatch_chunks::<8>(a, b)
}
/// Find the common prefix length between two byte slices with a configurable chunk length
/// This enables vectorization optimizations
fn mismatch_chunks<const N: usize>(xs: &[u8], ys: &[u8]) -> usize {
let off = core::iter::zip(xs.chunks_exact(N), ys.chunks_exact(N))
.take_while(|(x, y)| x == y)
.count()
* N;
off + core::iter::zip(&xs[off..], &ys[off..])
.take_while(|(x, y)| x == y)
.count()
}
/// Process bytes and add the suffixes to the suffix store up to a specific index
#[inline(always)]
fn add_suffixes_till(&mut self, idx: usize) {
let last_entry = self.window.last_mut().unwrap();
if last_entry.data.len() < MIN_MATCH_LEN {
return;
}
let slice = &last_entry.data[self.suffix_idx..idx];
for (key_index, key) in slice.windows(MIN_MATCH_LEN).enumerate() {
if !last_entry.suffixes.contains_key(key) {
last_entry.suffixes.insert(key, self.suffix_idx + key_index);
}
}
}
/// Skip matching for the whole current window entry
fn skip_matching(&mut self) {
let len = self.window.last().unwrap().data.len();
self.add_suffixes_till(len);
self.suffix_idx = len;
self.last_idx_in_sequence = len;
}
/// Add a new window entry. Will panic if the last window entry hasn't been processed properly.
/// If any resources are released by pushing the new entry they are returned via the callback
fn add_data(
&mut self,
data: Vec<u8>,
suffixes: SuffixStore,
reuse_space: impl FnMut(Vec<u8>, SuffixStore),
) {
assert!(
self.window.is_empty() || self.suffix_idx == self.window.last().unwrap().data.len()
);
self.reserve(data.len(), reuse_space);
#[cfg(debug_assertions)]
self.concat_window.extend_from_slice(&data);
if let Some(last_len) = self.window.last().map(|last| last.data.len()) {
for entry in self.window.iter_mut() {
entry.base_offset += last_len;
}
}
let len = data.len();
self.window.push(WindowEntry {
data,
suffixes,
base_offset: 0,
});
self.window_size += len;
self.suffix_idx = 0;
self.last_idx_in_sequence = 0;
}
/// Reserve space for a new window entry
/// If any resources are released by pushing the new entry they are returned via the callback
fn reserve(&mut self, amount: usize, mut reuse_space: impl FnMut(Vec<u8>, SuffixStore)) {
assert!(self.max_window_size >= amount);
while self.window_size + amount > self.max_window_size {
let removed = self.window.remove(0);
self.window_size -= removed.data.len();
#[cfg(debug_assertions)]
self.concat_window.drain(0..removed.data.len());
let WindowEntry {
suffixes,
data: leaked_vec,
base_offset: _,
} = removed;
reuse_space(leaked_vec, suffixes);
}
}
}
#[test]
fn matches() {
let mut matcher = MatchGenerator::new(1000);
let mut original_data = Vec::new();
let mut reconstructed = Vec::new();
let assert_seq_equal = |seq1: Sequence<'_>, seq2: Sequence<'_>, reconstructed: &mut Vec<u8>| {
assert_eq!(seq1, seq2);
match seq2 {
Sequence::Literals { literals } => reconstructed.extend_from_slice(literals),
Sequence::Triple {
literals,
offset,
match_len,
} => {
reconstructed.extend_from_slice(literals);
let start = reconstructed.len() - offset;
let end = start + match_len;
reconstructed.extend_from_within(start..end);
}
}
};
matcher.add_data(
alloc::vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[0, 0, 0, 0, 0],
offset: 5,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[
1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0,
]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[1, 2, 3, 4, 5, 6],
offset: 6,
match_len: 6,
},
&mut reconstructed,
)
});
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 12,
match_len: 6,
},
&mut reconstructed,
)
});
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 28,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 23,
match_len: 6,
},
&mut reconstructed,
)
});
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[7, 8, 9, 10, 11],
offset: 16,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![0, 0, 0, 0, 0],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[0, 0, 0, 0, 0]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 5,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![7, 8, 9, 10, 11],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[7, 8, 9, 10, 11]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 15,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![1, 3, 5, 7, 9],
SuffixStore::with_capacity(100),
|_, _| {},
);
matcher.skip_matching();
original_data.extend_from_slice(&[1, 3, 5, 7, 9]);
reconstructed.extend_from_slice(&[1, 3, 5, 7, 9]);
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![1, 3, 5, 7, 9],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[1, 3, 5, 7, 9]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[],
offset: 5,
match_len: 5,
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
matcher.add_data(
alloc::vec![0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23],
SuffixStore::with_capacity(100),
|_, _| {},
);
original_data.extend_from_slice(&[0, 0, 11, 13, 15, 17, 20, 11, 13, 15, 17, 20, 21, 23]);
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Triple {
literals: &[0, 0, 11, 13, 15, 17, 20],
offset: 5,
match_len: 5,
},
&mut reconstructed,
)
});
matcher.next_sequence(|seq| {
assert_seq_equal(
seq,
Sequence::Literals {
literals: &[21, 23],
},
&mut reconstructed,
)
});
assert!(!matcher.next_sequence(|_| {}));
assert_eq!(reconstructed, original_data);
}

vendor/ruzstd/src/encoding/mod.rs vendored Normal file
View File

@@ -0,0 +1,118 @@
//! Structures and utilities used for compressing/encoding data into the Zstd format.
pub(crate) mod block_header;
pub(crate) mod blocks;
pub(crate) mod frame_header;
pub(crate) mod match_generator;
pub(crate) mod util;
mod frame_compressor;
mod levels;
pub use frame_compressor::FrameCompressor;
use crate::io::{Read, Write};
use alloc::vec::Vec;
/// Convenience function to compress some source into a target without reusing any resources of the compressor
/// ```rust
/// use ruzstd::encoding::{compress, CompressionLevel};
/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
/// let mut target = Vec::new();
/// compress(data, &mut target, CompressionLevel::Fastest);
/// ```
pub fn compress<R: Read, W: Write>(source: R, target: W, level: CompressionLevel) {
let mut frame_enc = FrameCompressor::new(level);
frame_enc.set_source(source);
frame_enc.set_drain(target);
frame_enc.compress();
}
/// Convenience function to compress some source into a Vec without reusing any resources of the compressor
/// ```rust
/// use ruzstd::encoding::{compress_to_vec, CompressionLevel};
/// let data: &[u8] = &[0,0,0,0,0,0,0,0,0,0,0,0];
/// let compressed = compress_to_vec(data, CompressionLevel::Fastest);
/// ```
pub fn compress_to_vec<R: Read>(source: R, level: CompressionLevel) -> Vec<u8> {
let mut vec = Vec::new();
compress(source, &mut vec, level);
vec
}
/// The compression mode used impacts the speed of compression,
/// and resulting compression ratios. Faster compression will result
/// in worse compression ratios, and vice versa.
#[derive(Copy, Clone)]
pub enum CompressionLevel {
/// This level does not compress the data at all, and simply wraps
/// it in a Zstandard frame.
Uncompressed,
/// This level is roughly equivalent to Zstd compression level 1
Fastest,
/// This level is roughly equivalent to Zstd level 3,
/// or the one used by the official compressor when no level
/// is specified.
///
/// UNIMPLEMENTED
Default,
/// This level is roughly equivalent to Zstd level 7.
///
/// UNIMPLEMENTED
Better,
/// This level is roughly equivalent to Zstd level 11.
///
/// UNIMPLEMENTED
Best,
}
/// Trait used by the encoder that users can implement to extend the matching facilities with their own algorithm,
/// making their own tradeoffs between runtime, memory usage, and compression ratio
///
/// This trait operates on buffers that represent the chunks of data the matching algorithm wants to work on.
/// Each one of these buffers is referred to as a *space*. One or more of these buffers represent the window
/// the decoder will need to decode the data again.
///
/// This library asks the Matcher for a new buffer using `get_next_space` to allow reuse of allocated buffers when they are no longer part of the
/// window of data that is being used for matching.
///
/// The library fills the buffer with data that is to be compressed and commits them back to the matcher using `commit_space`.
///
/// Then it will either call `start_matching` or, if the space is deemed not worth compressing, `skip_matching` is called.
///
/// This is repeated until no more data is left to be compressed.
pub trait Matcher {
/// Get a space where we can put data to be matched on. Will be encoded as one block. The maximum allowed size is 128 kB.
fn get_next_space(&mut self) -> alloc::vec::Vec<u8>;
/// Get a reference to the last committed space
fn get_last_space(&mut self) -> &[u8];
/// Commit a space to the matcher so it can be matched against
fn commit_space(&mut self, space: alloc::vec::Vec<u8>);
/// Just process the data in the last committed space for future matching
fn skip_matching(&mut self);
/// Process the data in the last committed space for future matching AND generate matches for the data
fn start_matching(&mut self, handle_sequence: impl for<'a> FnMut(Sequence<'a>));
/// Reset this matcher so it can be used for the next new frame
fn reset(&mut self, level: CompressionLevel);
/// The size of the window the decoder will need to execute all sequences produced by this matcher
///
/// May change after a call to reset with a different compression level
fn window_size(&self) -> u64;
}
#[derive(PartialEq, Eq, Debug)]
/// Sequences that a [`Matcher`] can produce
pub enum Sequence<'data> {
/// Is encoded as a sequence for the decoder sequence execution.
///
/// First the literals will be copied to the decoded data,
/// then `match_len` bytes are copied from `offset` bytes back in the buffer
Triple {
literals: &'data [u8],
offset: usize,
match_len: usize,
},
/// This is returned as the last sequence in a block
///
/// These literals will just be copied at the end of the sequence execution by the decoder
Literals { literals: &'data [u8] },
}

vendor/ruzstd/src/encoding/util.rs vendored Normal file
View File

@@ -0,0 +1,60 @@
use alloc::vec::Vec;
/// Returns the minimum number of bytes needed to represent this value, as
/// either 1, 2, 4, or 8 bytes. A value of 0 will still return one byte.
///
/// Used for variable length fields like `Dictionary_ID` or `Frame_Content_Size`.
pub fn find_min_size(val: u64) -> usize {
if val == 0 {
return 1;
}
if val >> 8 == 0 {
return 1;
}
if val >> 16 == 0 {
return 2;
}
if val >> 32 == 0 {
return 4;
}
8
}
/// Returns the same value, but represented using the smallest number of bytes needed.
/// Returned vector will be 1, 2, 4, or 8 bytes in length. Zero is represented as 1 byte.
///
/// Operates in **little-endian**.
pub fn minify_val(val: u64) -> Vec<u8> {
let new_size = find_min_size(val);
val.to_le_bytes()[0..new_size].to_vec()
}
#[cfg(test)]
mod tests {
use super::find_min_size;
use super::minify_val;
use alloc::vec;
#[test]
fn min_size_detection() {
assert_eq!(find_min_size(0), 1);
assert_eq!(find_min_size(0xff), 1);
assert_eq!(find_min_size(0xff_ff), 2);
assert_eq!(find_min_size(0x00_ff_ff_ff), 4);
assert_eq!(find_min_size(0xff_ff_ff_ff), 4);
assert_eq!(find_min_size(0x00ff_ffff_ffff_ffff), 8);
assert_eq!(find_min_size(0xffff_ffff_ffff_ffff), 8);
}
#[test]
fn bytes_minified() {
assert_eq!(minify_val(0), vec![0]);
assert_eq!(minify_val(0xff), vec![0xff]);
assert_eq!(minify_val(0xff_ff), vec![0xff, 0xff]);
assert_eq!(minify_val(0xff_ff_ff_ff), vec![0xff, 0xff, 0xff, 0xff]);
assert_eq!(
minify_val(0xffff_ffff_ffff_ffff),
vec![0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff]
);
}
}
}