mirror of
https://github.com/rustic-rs/rustic.git
synced 2025-10-26 11:18:51 +00:00
fix cdc compile errors by integrating cdc
This commit is contained in:
parent
a260e948db
commit
bdb68852ca
7
Cargo.lock
generated
7
Cargo.lock
generated
@ -211,12 +211,6 @@ dependencies = [
|
||||
"jobserver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cdc"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f421655f68953d1cae92f72da23d7842679bd413e96735dac14ba1bbdf1c155b"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
@ -1719,7 +1713,6 @@ dependencies = [
|
||||
"bytes",
|
||||
"bytesize",
|
||||
"cachedir",
|
||||
"cdc",
|
||||
"chrono",
|
||||
"clap",
|
||||
"clap_complete",
|
||||
|
||||
@ -39,7 +39,7 @@ sha2 = "0.10"
|
||||
rand = "0.8"
|
||||
scrypt = { version = "0.10", default-features = false }
|
||||
# chunker / packer
|
||||
cdc = "0.1"
|
||||
# cdc = "0.1"
|
||||
integer-sqrt = "0.1"
|
||||
# serialization
|
||||
base64 = "0.20"
|
||||
|
||||
21
src/cdc/LICENSE.txt
Normal file
21
src/cdc/LICENSE.txt
Normal file
@ -0,0 +1,21 @@
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2016 Vincent Cantin (https://github.com/green-coder)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
||||
this software and associated documentation files (the "Software"), to deal in
|
||||
the Software without restriction, including without limitation the rights to
|
||||
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
||||
of the Software, and to permit persons to whom the Software is furnished to do
|
||||
so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
60
src/cdc/README.md
Normal file
60
src/cdc/README.md
Normal file
@ -0,0 +1,60 @@
|
||||
cdc
|
||||
========
|
||||
|
||||
A library for performing *Content-Defined Chunking* (CDC) on data streams. It is implemented using generic iterators and is very easy to use.
|
||||
|
||||
- [API Documentation](https://docs.rs/cdc/)
|
||||
|
||||
## Example
|
||||
|
||||
```rust
|
||||
let reader: BufReader<File> = BufReader::new(file);
|
||||
let byte_iter = reader.bytes().map(|b| b.unwrap());
|
||||
|
||||
// Finds and iterates on the separators.
|
||||
for separator in SeparatorIter::new(byte_iter) {
|
||||
println!("Index: {}, hash: {:016x}", separator.index, separator.hash);
|
||||
}
|
||||
```
|
||||
|
||||
Each module is documented via an example which you can find in the `examples/` folder.
|
||||
|
||||
To run them, use a command like:
|
||||
|
||||
cargo run --example separator --release
|
||||
|
||||
**Note:** Some examples look for a file named `myLargeFile.bin`, which I didn't upload to GitHub. Please use your own files for testing.
|
||||
|
||||
## What's in the crate
|
||||
|
||||
From low level to high level:
|
||||
|
||||
* A `RollingHash64` trait, for rolling hashes with a 64-bit hash value.
|
||||
|
||||
* `Rabin64`, an implementation of the Rabin Fingerprint rolling hash with a 64-bit hash value.
|
||||
|
||||
* `Separator`, a struct which describes a place in a data stream identified as a separator.
|
||||
|
||||
* `SeparatorIter`, an adaptor which takes an `Iterator<Item=u8>` as input and which enumerates all the separators found.
|
||||
|
||||
* `Chunk`, a struct which describes a piece of the data stream (index and size).
|
||||
|
||||
* `ChunkIter`, an adaptor which takes an `Iterator<Item=Separator>` as input and which enumerates chunks.
|
||||
|
||||
## Implementation details
|
||||
|
||||
* The library is not cutting any files, it only provides information on how to do it.
|
||||
|
||||
* You can change the default window size used by `Rabin64`, and how the `SeparatorIter` chooses the separator.
|
||||
|
||||
* The design of this crate may be subject to change in the future. I am waiting for some features of `Rust` to mature, especially the [`impl Trait`](https://github.com/rust-lang/rust/issues/34511) feature.
|
||||
|
||||
## Performance
|
||||
|
||||
There is a **huge** difference between the debug build and the release build in terms of performance. Remember that when you test the lib, use `cargo run --release`.
|
||||
|
||||
I may try to improve the performance of the lib at some point, but for now it is good enough for most usages.
|
||||
|
||||
## License
|
||||
|
||||
Coded with ❤️ , licensed under the terms of the [MIT license](LICENSE.txt).
|
||||
5
src/cdc/mod.rs
Normal file
5
src/cdc/mod.rs
Normal file
@ -0,0 +1,5 @@
|
||||
mod polynom;
|
||||
mod rolling_hash;
|
||||
|
||||
pub use polynom::{Polynom, Polynom64};
|
||||
pub use rolling_hash::{Rabin64, RollingHash64};
|
||||
51
src/cdc/polynom.rs
Normal file
51
src/cdc/polynom.rs
Normal file
@ -0,0 +1,51 @@
|
||||
/// A polynomial with binary coefficients.
///
/// The fingerprint function uses an irreducible polynomial of this kind as its modulus.
pub trait Polynom {
    /// Returns the degree of the polynomial.
    fn degree(&self) -> i32;

    /// Returns the remainder of the division of `self` by `m`.
    fn modulo(&self, m: &Self) -> Self;
}

/// A polynomial packed into a `u64`; the position of the highest set bit is its degree.
pub type Polynom64 = u64;

impl Polynom for Polynom64 {
    /// Returns the index of the highest set bit, or `-1` for the zero polynomial.
    fn degree(&self) -> i32 {
        63 - self.leading_zeros() as i32
    }

    /// Reduces `self` modulo `m` by repeatedly XOR-ing a shifted copy of `m`
    /// onto the leading bit of the running remainder.
    ///
    /// # Panics
    ///
    /// Panics if `m` is the zero polynomial. Without this check the loop below
    /// would never terminate, because XOR-ing with zero never lowers the degree.
    fn modulo(&self, m: &Self) -> Self {
        assert!(
            *m != 0,
            "Polynom64::modulo: the modulus must not be the zero polynomial"
        );

        // The modulus does not change, so its degree is computed only once.
        let m_degree = m.degree();

        let mut p = *self;
        while p.degree() >= m_degree {
            // Align the leading bit of `m` with the leading bit of `p` and cancel it.
            p ^= m << (p.degree() - m_degree);
        }

        p
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn polynom_degree() {
        assert_eq!(0u64.degree(), -1);
        assert_eq!(1u64.degree(), 0);

        assert_eq!(((1u64 << 7) - 1).degree(), 6);
        assert_eq!((1u64 << 7).degree(), 7);
        assert_eq!(((1u64 << 7) + 1).degree(), 7);
    }

    #[test]
    fn polynom_modulo() {
        assert_eq!(7u64.modulo(&3), 1);
        assert_eq!(7u64.modulo(&4), 3);
        assert_eq!(7u64.modulo(&2), 1);

        assert_eq!(16u64.modulo(&8), 0);
        assert_eq!(19u64.modulo(&8), 3);

        assert_eq!(16u64.modulo(&4), 0);
        assert_eq!(19u64.modulo(&4), 3);
    }

    #[test]
    #[should_panic(expected = "zero polynomial")]
    fn polynom_modulo_by_zero_panics() {
        let _ = 7u64.modulo(&0);
    }
}
|
||||
219
src/cdc/rolling_hash.rs
Normal file
219
src/cdc/rolling_hash.rs
Normal file
@ -0,0 +1,219 @@
|
||||
use super::{Polynom, Polynom64};
|
||||
|
||||
pub trait RollingHash64 {
|
||||
fn reset(&mut self);
|
||||
fn prefill_window<I>(&mut self, iter: &mut I) -> usize
|
||||
where
|
||||
I: Iterator<Item = u8>;
|
||||
fn reset_and_prefill_window<I>(&mut self, iter: &mut I) -> usize
|
||||
where
|
||||
I: Iterator<Item = u8>;
|
||||
fn slide(&mut self, byte: u8);
|
||||
fn get_hash(&self) -> &Polynom64;
|
||||
}
|
||||
|
||||
pub struct Rabin64 {
|
||||
// Configuration
|
||||
window_size: usize, // The size of the data window used in the hash calculation.
|
||||
window_size_mask: usize, // = window_size - 1, supposing that it is an exponent of 2.
|
||||
|
||||
// Precalculations
|
||||
polynom_shift: i32,
|
||||
out_table: [Polynom64; 256],
|
||||
mod_table: [Polynom64; 256],
|
||||
|
||||
// Current state
|
||||
window_data: Vec<u8>,
|
||||
window_index: usize,
|
||||
pub hash: Polynom64,
|
||||
}
|
||||
|
||||
|
||||
impl Rabin64 {
|
||||
pub fn calculate_out_table(window_size: usize, mod_polynom: &Polynom64) -> [Polynom64; 256] {
|
||||
let mut out_table = [0; 256];
|
||||
for (b, elem) in out_table.iter_mut().enumerate() {
|
||||
let mut hash = (b as Polynom64).modulo(mod_polynom);
|
||||
for _ in 0..window_size - 1 {
|
||||
hash <<= 8;
|
||||
hash = hash.modulo(mod_polynom);
|
||||
}
|
||||
*elem = hash;
|
||||
}
|
||||
|
||||
out_table
|
||||
}
|
||||
|
||||
pub fn calculate_mod_table(mod_polynom: &Polynom64) -> [Polynom64; 256] {
|
||||
let mut mod_table = [0; 256];
|
||||
let k = mod_polynom.degree();
|
||||
for (b, elem) in mod_table.iter_mut().enumerate() {
|
||||
let p: Polynom64 = (b as Polynom64) << k;
|
||||
*elem = p.modulo(mod_polynom) | p;
|
||||
}
|
||||
|
||||
mod_table
|
||||
}
|
||||
|
||||
pub fn new_with_polynom(window_size_nb_bits: u32, mod_polynom: &Polynom64) -> Rabin64 {
|
||||
let window_size = 1 << window_size_nb_bits;
|
||||
|
||||
let window_data = vec![0; window_size];
|
||||
|
||||
Rabin64 {
|
||||
window_size,
|
||||
window_size_mask: window_size - 1,
|
||||
polynom_shift: mod_polynom.degree() - 8,
|
||||
out_table: Self::calculate_out_table(window_size, mod_polynom),
|
||||
mod_table: Self::calculate_mod_table(mod_polynom),
|
||||
window_data,
|
||||
window_index: 0,
|
||||
hash: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn hash_block(&mut self, bytes: &[u8], mod_polynom: &Polynom64) {
|
||||
for v in bytes {
|
||||
self.hash <<= 8;
|
||||
self.hash |= *v as Polynom64;
|
||||
self.hash = self.hash.modulo(&mod_polynom);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RollingHash64 for Rabin64 {
|
||||
fn reset(&mut self) {
|
||||
self.window_data.clear();
|
||||
self.window_data.resize(self.window_size, 0);
|
||||
self.window_index = 0;
|
||||
self.hash = 0;
|
||||
|
||||
// Not needed.
|
||||
// self.slide(1);
|
||||
}
|
||||
|
||||
// Attempt to fills the window - 1 byte.
|
||||
fn prefill_window<I>(&mut self, iter: &mut I) -> usize
|
||||
where
|
||||
I: Iterator<Item = u8>,
|
||||
{
|
||||
let mut nb_bytes_read = 0;
|
||||
for _ in 0..self.window_size - 1 {
|
||||
match iter.next() {
|
||||
Some(b) => {
|
||||
self.slide(b);
|
||||
nb_bytes_read += 1;
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
nb_bytes_read
|
||||
}
|
||||
|
||||
// Combines a reset with a prefill in an optimized way.
|
||||
fn reset_and_prefill_window<I>(&mut self, iter: &mut I) -> usize
|
||||
where
|
||||
I: Iterator<Item = u8>,
|
||||
{
|
||||
self.hash = 0;
|
||||
let mut nb_bytes_read = 0;
|
||||
for _ in 0..self.window_size - 1 {
|
||||
match iter.next() {
|
||||
Some(b) => {
|
||||
// Take the old value out of the window and the hash.
|
||||
// ... let's suppose that the buffer contains zeroes, do nothing.
|
||||
|
||||
// Put the new value in the window and in the hash.
|
||||
self.window_data[self.window_index] = b;
|
||||
let mod_index = (self.hash >> self.polynom_shift) & 255;
|
||||
self.hash <<= 8;
|
||||
self.hash |= b as Polynom64;
|
||||
self.hash ^= self.mod_table[mod_index as usize];
|
||||
|
||||
// Move the windowIndex to the next position.
|
||||
self.window_index = (self.window_index + 1) & self.window_size_mask;
|
||||
|
||||
nb_bytes_read += 1;
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
|
||||
// Because we didn't overwrite that element in the loop above.
|
||||
self.window_data[self.window_index] = 0;
|
||||
|
||||
nb_bytes_read
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn slide(&mut self, byte: u8) {
|
||||
// Take the old value out of the window and the hash.
|
||||
let out_value = self.window_data[self.window_index];
|
||||
self.hash ^= self.out_table[out_value as usize];
|
||||
|
||||
// Put the new value in the window and in the hash.
|
||||
self.window_data[self.window_index] = byte;
|
||||
let mod_index = (self.hash >> self.polynom_shift) & 255;
|
||||
self.hash <<= 8;
|
||||
self.hash |= byte as Polynom64;
|
||||
self.hash ^= self.mod_table[mod_index as usize];
|
||||
|
||||
// Move the windowIndex to the next position.
|
||||
self.window_index = (self.window_index + 1) & self.window_size_mask;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_hash(&self) -> &Polynom64 {
|
||||
&self.hash
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::super::polynom::Polynom64;
    use super::*;

    /// Modulus polynomial shared by the tests in this module.
    ///
    /// Defined here because this file provides neither a default polynomial
    /// nor a constructor that uses one.
    const MOD_POLYNOM: Polynom64 = 0x3D_A335_8B4D_C173;

    /// Formats each polynomial as one `<prefix><16 hex digits> 0` line.
    fn to_hex_string(polynoms: &[Polynom64], prefix: &str) -> String {
        let strs: Vec<String> = polynoms
            .iter()
            .map(|p| format!("{}{:016x} {}", prefix, p, 0))
            .collect();
        strs.join("\n")
    }

    #[test]
    fn print_tables() {
        let out_table = Rabin64::calculate_out_table(32, &MOD_POLYNOM);
        let mod_table = Rabin64::calculate_mod_table(&MOD_POLYNOM);
        println!("{}", to_hex_string(&out_table[..], "outTable "));
        println!("{}", to_hex_string(&mod_table[..], "modTable "));
    }

    #[test]
    fn rabin_hash() {
        use std::cmp::max;

        // Random meaningless data.
        let data = [
            17u8, 28, 53, 64, 175, 216, 27, 208, 109, 130, 143, 35, 93, 244, 45, 18, 64, 193, 204,
            59, 169, 139, 53, 59, 55, 65, 242, 73, 60, 198, 45, 22, 56, 90, 81, 181,
        ];

        // 2^5 = 32 bytes of window, matching the 32-byte blocks hashed below.
        let mut rabin1 = Rabin64::new_with_polynom(5, &MOD_POLYNOM);
        let mut rabin2 = Rabin64::new_with_polynom(5, &MOD_POLYNOM);

        // Block by block, no optimization, used raw modulo formula.
        for (i, &byte) in data.iter().enumerate() {
            let block = &data[max(31, i) - 31..i + 1];
            rabin1.reset();
            rabin1.hash_block(block, &MOD_POLYNOM);

            rabin2.slide(byte);

            assert_eq!(rabin1.hash, rabin2.hash);
        }
    }
}
|
||||
@ -1,9 +1,10 @@
|
||||
use std::io::{self, Read};
|
||||
|
||||
use anyhow::{anyhow, Result};
|
||||
use cdc::{Polynom, Polynom64, Rabin64, RollingHash64};
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
use crate::cdc::{Polynom, Polynom64, Rabin64, RollingHash64};
|
||||
|
||||
const SPLITMASK: u64 = (1u64 << 20) - 1;
|
||||
const KB: usize = 1024;
|
||||
const MB: usize = 1024 * KB;
|
||||
@ -115,7 +116,7 @@ impl<R: Read + Send> Iterator for ChunkIter<R> {
|
||||
let byte = self.buf[self.pos];
|
||||
vec.push(byte);
|
||||
self.pos += 1;
|
||||
self.rabin.slide(&byte);
|
||||
self.rabin.slide(byte);
|
||||
}
|
||||
self.size_hint -= vec.len();
|
||||
Some(Ok(vec))
|
||||
|
||||
@ -47,6 +47,8 @@ mod index;
|
||||
mod repofile;
|
||||
mod repository;
|
||||
|
||||
mod cdc;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
// this is a workaround until unix_sigpipe (https://github.com/rust-lang/rust/issues/97889) is available.
|
||||
// See also https://github.com/rust-lang/rust/issues/46016
|
||||
|
||||
Loading…
Reference in New Issue
Block a user