fix cdc compile errors by integrating cdc

This commit is contained in:
Alexander Weiss 2023-01-14 06:51:48 +01:00
parent a260e948db
commit bdb68852ca
9 changed files with 362 additions and 10 deletions

7
Cargo.lock generated
View File

@ -211,12 +211,6 @@ dependencies = [
"jobserver",
]
[[package]]
name = "cdc"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f421655f68953d1cae92f72da23d7842679bd413e96735dac14ba1bbdf1c155b"
[[package]]
name = "cfg-if"
version = "1.0.0"
@ -1719,7 +1713,6 @@ dependencies = [
"bytes",
"bytesize",
"cachedir",
"cdc",
"chrono",
"clap",
"clap_complete",

View File

@ -39,7 +39,7 @@ sha2 = "0.10"
rand = "0.8"
scrypt = { version = "0.10", default-features = false }
# chunker / packer
cdc = "0.1"
# cdc = "0.1"
integer-sqrt = "0.1"
# serialization
base64 = "0.20"

21
src/cdc/LICENSE.txt Normal file
View File

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2016 Vincent Cantin (https://github.com/green-coder)
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

60
src/cdc/README.md Normal file
View File

@ -0,0 +1,60 @@
cdc
========
A library for performing *Content-Defined Chunking* (CDC) on data streams. Implemented using generic iterators, very easy to use.
- [API Documentation](https://docs.rs/cdc/)
## Example
```rust
let reader: BufReader<File> = BufReader::new(file);
let byte_iter = reader.bytes().map(|b| b.unwrap());
// Finds and iterates on the separators.
for separator in SeparatorIter::new(byte_iter) {
println!("Index: {}, hash: {:016x}", separator.index, separator.hash);
}
```
Each module is documented via an example which you can find in the `examples/` folder.
To run them, use a command like:
cargo run --example separator --release
**Note:** Some examples look for a file named `myLargeFile.bin`, which I didn't upload to GitHub. Please use your own files for testing.
## What's in the crate
From low level to high level:
* A `RollingHash64` trait, for rolling hashes with a 64-bit hash value.
* `Rabin64`, an implementation of the Rabin Fingerprint rolling hash with a 64-bit hash value.
* `Separator`, a struct which describes a place in a data stream identified as a separator.
* `SeparatorIter`, an adaptor which takes an `Iterator<Item=u8>` as input and which enumerates all the separators found.
* `Chunk`, a struct which describes a piece of the data stream (index and size).
* `ChunkIter`, an adaptor which takes an `Iterator<Item=Separator>` as input and which enumerates chunks.
## Implementation details
* The library is not cutting any files, it only provides information on how to do it.
* You can change the default window size used by `Rabin64`, and how the `SeparatorIter` chooses the separator.
* The design of this crate may be subject to change in the future. I am waiting for some features of `Rust` to mature, especially the [`impl Trait`](https://github.com/rust-lang/rust/issues/34511) feature.
## Performance
There is a **huge** difference between the debug build and the release build in terms of performance. Remember that when you test the lib, use `cargo run --release`.
I may try to improve the performance of the lib at some point, but for now it is good enough for most usages.
## License
Coded with ❤️ , licensed under the terms of the [MIT license](LICENSE.txt).

5
src/cdc/mod.rs Normal file
View File

@ -0,0 +1,5 @@
mod polynom;
mod rolling_hash;
pub use polynom::{Polynom, Polynom64};
pub use rolling_hash::{Rabin64, RollingHash64};

51
src/cdc/polynom.rs Normal file
View File

@ -0,0 +1,51 @@
/// Polynomial arithmetic used by the fingerprint function.
///
/// A value is read as a polynomial with binary coefficients: bit `i` of the
/// value is the coefficient of `x^i`. The modulus passed to [`Polynom::modulo`]
/// is the (irreducible) polynomial of the fingerprint.
pub trait Polynom {
    /// Returns the degree of the polynomial, i.e. the index of its highest
    /// set bit, or `-1` for the zero polynomial.
    fn degree(&self) -> i32;

    /// Returns the remainder of the carry-less division of `self` by `m`.
    fn modulo(&self, m: &Self) -> Self;
}

/// A polynomial with up to 64 binary coefficients, stored one per bit.
pub type Polynom64 = u64;

impl Polynom for Polynom64 {
    fn degree(&self) -> i32 {
        // `leading_zeros()` is 64 for zero, which yields the documented `-1`.
        63 - self.leading_zeros() as i32
    }

    /// # Panics
    ///
    /// Panics if `m` is the zero polynomial, mirroring integer division by
    /// zero. Without this check the reduction loop below would never end,
    /// because XOR-ing with a shifted zero leaves `p` unchanged.
    fn modulo(&self, m: &Self) -> Self {
        assert!(*m != 0, "modulo by the zero polynomial");

        // The degree of the modulus never changes, so compute it only once.
        let m_degree = m.degree();
        let mut p = *self;
        while p.degree() >= m_degree {
            // Align the modulus with the leading term of `p` and cancel it.
            // Every round clears the top bit of `p`, so the loop terminates.
            p ^= *m << (p.degree() - m_degree);
        }
        p
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The degree is the index of the highest set bit, and `-1` for zero.
    #[test]
    fn polynom_degree() {
        let cases: [(u64, i32); 5] = [
            (0, -1),
            (1, 0),
            ((1 << 7) - 1, 6),
            (1 << 7, 7),
            ((1 << 7) + 1, 7),
        ];
        for &(value, expected) in cases.iter() {
            assert_eq!(value.degree(), expected);
        }
    }

    /// Spot-checks of the carry-less remainder: (dividend, modulus, remainder).
    #[test]
    fn polynom_modulo() {
        let cases: [(u64, u64, u64); 7] = [
            (7, 3, 1),
            (7, 4, 3),
            (7, 2, 1),
            (16, 8, 0),
            (19, 8, 3),
            (16, 4, 0),
            (19, 4, 3),
        ];
        for &(value, modulus, expected) in cases.iter() {
            assert_eq!(value.modulo(&modulus), expected);
        }
    }
}

219
src/cdc/rolling_hash.rs Normal file
View File

@ -0,0 +1,219 @@
use super::{Polynom, Polynom64};
/// A rolling hash over a sliding window of bytes, producing a 64-bit hash.
pub trait RollingHash64 {
    /// Puts the hasher back into its initial state.
    fn reset(&mut self);

    /// Feeds bytes taken from `iter` into the window and returns how many
    /// bytes were consumed.
    fn prefill_window<I: Iterator<Item = u8>>(&mut self, iter: &mut I) -> usize;

    /// Restarts the hash and then feeds bytes taken from `iter` into the
    /// window; returns how many bytes were consumed.
    fn reset_and_prefill_window<I: Iterator<Item = u8>>(&mut self, iter: &mut I) -> usize;

    /// Pushes `byte` into the window and updates the hash accordingly.
    fn slide(&mut self, byte: u8);

    /// Returns a reference to the current hash value.
    fn get_hash(&self) -> &Polynom64;
}
/// Rabin fingerprint rolling hash with a 64-bit hash value.
///
/// The hasher keeps the last `window_size` bytes in a circular buffer so that
/// the contribution of the oldest byte can be removed when a new byte is
/// pushed in.
pub struct Rabin64 {
    // Configuration
    /// The size of the data window used in the hash calculation.
    window_size: usize,
    /// `window_size - 1`; used as a wrap-around bit mask for `window_index`,
    /// which relies on `window_size` being a power of two.
    window_size_mask: usize,
    // Precalculations
    /// Degree of the modulus polynomial minus 8: the right shift that moves
    /// the top eight bits of the hash into the low byte.
    polynom_shift: i32,
    /// Indexed by a byte value: the term XOR-ed out of the hash when that
    /// byte leaves the window.
    out_table: [Polynom64; 256],
    /// Indexed by the top eight bits of the hash: the term XOR-ed into the
    /// hash after it has been shifted left by one byte.
    mod_table: [Polynom64; 256],
    // Current state
    /// Circular buffer holding the bytes currently inside the window.
    window_data: Vec<u8>,
    /// Position in `window_data` that the next incoming byte overwrites.
    window_index: usize,
    /// The current hash value.
    pub hash: Polynom64,
}
impl Rabin64 {
    /// Builds the table used to remove the oldest byte of the window.
    ///
    /// Entry `b` is the byte value `b` multiplied by `x^(8 * (window_size - 1))`
    /// and reduced modulo `mod_polynom`.
    ///
    /// A `window_size` of zero is treated like a window of one byte instead of
    /// underflowing the loop bound.
    pub fn calculate_out_table(window_size: usize, mod_polynom: &Polynom64) -> [Polynom64; 256] {
        let mut out_table = [0; 256];
        for (b, elem) in out_table.iter_mut().enumerate() {
            let mut hash = (b as Polynom64).modulo(mod_polynom);
            // `1..window_size` runs `window_size - 1` times without the
            // subtraction that would underflow for `window_size == 0`.
            for _ in 1..window_size {
                hash <<= 8;
                hash = hash.modulo(mod_polynom);
            }
            *elem = hash;
        }
        out_table
    }

    /// Builds the table used to reduce the hash after it was shifted by a byte.
    ///
    /// With `k` the degree of `mod_polynom`, entry `b` is `b * x^k` combined
    /// with its remainder modulo `mod_polynom`. XOR-ing it into the hash clears
    /// the eight bits at and above position `k` and folds their remainder in.
    ///
    /// NOTE(review): `b << k` only keeps all of its bits when `k <= 56`, and
    /// `k` must not be negative; `new_with_polynom` enforces this range.
    pub fn calculate_mod_table(mod_polynom: &Polynom64) -> [Polynom64; 256] {
        let mut mod_table = [0; 256];
        let k = mod_polynom.degree();
        for (b, elem) in mod_table.iter_mut().enumerate() {
            let p: Polynom64 = (b as Polynom64) << k;
            *elem = p.modulo(mod_polynom) | p;
        }
        mod_table
    }

    /// Creates a hasher with a window of `2^window_size_nb_bits` bytes that
    /// reduces modulo `mod_polynom`.
    ///
    /// # Panics
    ///
    /// Panics if `window_size_nb_bits` is not smaller than the bit width of
    /// `usize`, or if the degree of `mod_polynom` is outside `8..=56`. Below 8
    /// the shift extracting the top byte of the hash would be negative; above
    /// 56 the hash shifted by one byte no longer fits into 64 bits.
    pub fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: &Polynom64) -> Rabin64 {
        assert!(
            window_size_nb_bits < usize::BITS,
            "window_size_nb_bits must be smaller than {}, got {}",
            usize::BITS,
            window_size_nb_bits
        );
        let degree = mod_polynom.degree();
        assert!(
            (8..=56).contains(&degree),
            "mod_polynom must have a degree between 8 and 56, got {}",
            degree
        );

        let window_size = 1usize << window_size_nb_bits;
        let window_data = vec![0; window_size];
        Rabin64 {
            window_size,
            window_size_mask: window_size - 1,
            polynom_shift: degree - 8,
            out_table: Self::calculate_out_table(window_size, mod_polynom),
            mod_table: Self::calculate_mod_table(mod_polynom),
            window_data,
            window_index: 0,
            hash: 0,
        }
    }

    /// Hashes `bytes` with the plain shift-and-reduce formula, without any
    /// table or window. Only used by tests as a reference implementation.
    #[cfg(test)]
    pub fn hash_block(&mut self, bytes: &[u8], mod_polynom: &Polynom64) {
        for v in bytes {
            self.hash <<= 8;
            self.hash |= *v as Polynom64;
            // `mod_polynom` already is a reference; no extra borrow needed.
            self.hash = self.hash.modulo(mod_polynom);
        }
    }
}
impl RollingHash64 for Rabin64 {
    /// Zeroes the window, rewinds the window position and clears the hash.
    fn reset(&mut self) {
        // The buffer always holds exactly `window_size` bytes, so overwriting
        // it in place equals the former clear-and-resize.
        self.window_data.fill(0);
        self.window_index = 0;
        self.hash = 0;
    }

    /// Attempts to fill the window except for its last byte.
    ///
    /// Slides at most `window_size - 1` bytes taken from `iter` into the
    /// window and returns the number of bytes consumed, which is smaller when
    /// `iter` runs out early.
    fn prefill_window<I>(&mut self, iter: &mut I) -> usize
    where
        I: Iterator<Item = u8>,
    {
        let mut nb_bytes_read = 0;
        for b in iter.by_ref().take(self.window_size - 1) {
            self.slide(b);
            nb_bytes_read += 1;
        }
        nb_bytes_read
    }

    /// Combines a reset with a prefill in an optimized way.
    ///
    /// The hash restarts from zero and at most `window_size - 1` bytes taken
    /// from `iter` are pushed in without first removing the bytes they
    /// replace. Returns the number of bytes consumed.
    fn reset_and_prefill_window<I>(&mut self, iter: &mut I) -> usize
    where
        I: Iterator<Item = u8>,
    {
        self.hash = 0;
        let mut nb_bytes_read = 0;
        for b in iter.by_ref().take(self.window_size - 1) {
            // Nothing is taken out of the hash: the window is treated as if
            // it contained zeroes, and the out-table entry of zero is zero.

            // Put the new value in the window and in the hash.
            self.window_data[self.window_index] = b;
            let mod_index = (self.hash >> self.polynom_shift) & 255;
            self.hash <<= 8;
            self.hash |= b as Polynom64;
            self.hash ^= self.mod_table[mod_index as usize];

            // Move the window index to the next position.
            self.window_index = (self.window_index + 1) & self.window_size_mask;
            nb_bytes_read += 1;
        }

        // Zero every slot the loop did not overwrite. That is always the one
        // slot left out on purpose, plus, when `iter` ended early, all slots
        // still holding bytes from before this call. Leaving those in place
        // would make later `slide` calls remove bytes that were never part of
        // the restarted hash.
        let mut index = self.window_index;
        for _ in nb_bytes_read..self.window_size {
            self.window_data[index] = 0;
            index = (index + 1) & self.window_size_mask;
        }

        nb_bytes_read
    }

    /// Moves the window forward by one byte: removes the oldest byte from the
    /// hash and pushes `byte` in.
    #[inline]
    fn slide(&mut self, byte: u8) {
        // Take the old value out of the window and the hash.
        let out_value = self.window_data[self.window_index];
        self.hash ^= self.out_table[out_value as usize];

        // Put the new value in the window and in the hash.
        self.window_data[self.window_index] = byte;
        let mod_index = (self.hash >> self.polynom_shift) & 255;
        self.hash <<= 8;
        self.hash |= byte as Polynom64;
        self.hash ^= self.mod_table[mod_index as usize];

        // Move the window index to the next position.
        self.window_index = (self.window_index + 1) & self.window_size_mask;
    }

    /// Returns a reference to the current hash value.
    #[inline]
    fn get_hash(&self) -> &Polynom64 {
        &self.hash
    }
}
#[cfg(test)]
mod tests {
    use super::super::polynom::Polynom64;
    use super::*;

    /// Modulus polynomial used by the tests (degree 53).
    ///
    /// The vendored module defines no default polynomial, so the tests carry
    /// their own. NOTE(review): believed to equal the `MOD_POLYNOM` default of
    /// the upstream `cdc` crate — confirm. The assertions below only require
    /// hashing and sliding to use the same polynomial.
    const MOD_POLYNOM: Polynom64 = 0x3DA3358B4DC173;

    /// Window size of the hashers under test, as a power of two (32 bytes).
    const WINDOW_SIZE_NB_BITS: u32 = 5;

    /// Renders one line per polynomial: the prefix, the value as 16 hex digits
    /// and a trailing ` 0`.
    fn to_hex_string(polynoms: &[Polynom64], prefix: &str) -> String {
        let strs: Vec<String> = polynoms
            .iter()
            .map(|p| format!("{}{:016x} {}", prefix, p, 0))
            .collect();
        strs.join("\n")
    }

    /// Prints both lookup tables; useful for comparing against other
    /// implementations by eye.
    #[test]
    fn print_tables() {
        let out_table = Rabin64::calculate_out_table(32, &MOD_POLYNOM);
        let mod_table = Rabin64::calculate_mod_table(&MOD_POLYNOM);
        println!("{}", to_hex_string(&out_table[..], "outTable "));
        println!("{}", to_hex_string(&mod_table[..], "modTable "));
    }

    /// The table-driven rolling hash must equal the raw shift-and-reduce hash
    /// of the bytes currently inside the window.
    #[test]
    fn rabin_hash() {
        use std::cmp::max;

        // Random meaningless data.
        let data = [
            17u8, 28, 53, 64, 175, 216, 27, 208, 109, 130, 143, 35, 93, 244, 45, 18, 64, 193, 204,
            59, 169, 139, 53, 59, 55, 65, 242, 73, 60, 198, 45, 22, 56, 90, 81, 181,
        ];

        let mut rabin1 = Rabin64::new_with_polynom(WINDOW_SIZE_NB_BITS, &MOD_POLYNOM);
        let mut rabin2 = Rabin64::new_with_polynom(WINDOW_SIZE_NB_BITS, &MOD_POLYNOM);

        for (i, &byte) in data.iter().enumerate() {
            // Reference: hash the last (up to) 32 bytes block by block with
            // the raw modulo formula, no optimization.
            let block = &data[max(31, i) - 31..i + 1];
            rabin1.reset();
            rabin1.hash_block(block, &MOD_POLYNOM);

            // Under test: slide a single byte into the rolling window.
            rabin2.slide(byte);

            assert_eq!(rabin1.hash, rabin2.hash);
        }
    }
}

View File

@ -1,9 +1,10 @@
use std::io::{self, Read};
use anyhow::{anyhow, Result};
use cdc::{Polynom, Polynom64, Rabin64, RollingHash64};
use rand::{thread_rng, Rng};
use crate::cdc::{Polynom, Polynom64, Rabin64, RollingHash64};
const SPLITMASK: u64 = (1u64 << 20) - 1;
const KB: usize = 1024;
const MB: usize = 1024 * KB;
@ -115,7 +116,7 @@ impl<R: Read + Send> Iterator for ChunkIter<R> {
let byte = self.buf[self.pos];
vec.push(byte);
self.pos += 1;
self.rabin.slide(&byte);
self.rabin.slide(byte);
}
self.size_hint -= vec.len();
Some(Ok(vec))

View File

@ -47,6 +47,8 @@ mod index;
mod repofile;
mod repository;
mod cdc;
fn main() -> Result<()> {
// this is a workaround until unix_sigpipe (https://github.com/rust-lang/rust/issues/97889) is available.
// See also https://github.com/rust-lang/rust/issues/46016