diff --git a/src/archiver/file_archiver.rs b/src/archiver/file_archiver.rs index 35e5a58..95fc90c 100644 --- a/src/archiver/file_archiver.rs +++ b/src/archiver/file_archiver.rs @@ -6,7 +6,7 @@ use rayon::prelude::*; use crate::backend::{DecryptWriteBackend, ReadSourceOpen}; use crate::blob::{BlobType, Node, NodeType, Packer, PackerStats}; -use crate::chunker::ChunkIter; +use crate::chunker::{ChunkIter, Rabin64}; use crate::crypto::hash; use crate::index::{IndexedBackend, SharedIndexer}; use crate::repofile::ConfigFile; @@ -17,7 +17,7 @@ use super::{ItemWithParent, ParentResult, TreeItem, TreeType}; pub struct FileArchiver { index: I, data_packer: Packer, - poly: u64, + rabin: Rabin64, } impl FileArchiver { @@ -31,10 +31,11 @@ impl FileArchiver { config, index.total_size(BlobType::Data), )?; + let rabin = Rabin64::new_with_polynom(6, poly); Ok(Self { index, data_packer, - poly, + rabin, }) } @@ -68,24 +69,25 @@ impl FileArchiver { node: Node, p: ProgressBar, ) -> Result<(Node, u64)> { - let mut chunks: Vec<_> = ChunkIter::new(r, *node.meta().size() as usize, self.poly) - .enumerate() // see below - .par_bridge() - .map(|(num, chunk)| { - let chunk = chunk?; - let id = hash(&chunk); - let size = chunk.len() as u64; + let mut chunks: Vec<_> = + ChunkIter::new(r, *node.meta().size() as usize, self.rabin.clone()) + .enumerate() // see below + .par_bridge() + .map(|(num, chunk)| { + let chunk = chunk?; + let id = hash(&chunk); + let size = chunk.len() as u64; - if !self.index.has_data(&id) { - self.data_packer.add(&chunk, &id)?; - } - p.inc(size); - Ok((num, id, size)) - }) - .collect::>()?; + if !self.index.has_data(&id) { + self.data_packer.add(&chunk, &id)?; + } + p.inc(size); + Ok((num, id, size)) + }) + .collect::>()?; // As par_bridge doesn't guarantee to keep the order, we sort by the enumeration - chunks.par_sort_unstable_by_key(|x| x.0); + chunks.sort_unstable_by_key(|x| x.0); let filesize = chunks.iter().map(|x| x.2).sum(); let content = chunks.into_iter().map(|x| x.1).collect(); diff --git a/src/cdc/rolling_hash.rs b/src/cdc/rolling_hash.rs index 2bf1650..64ee954 100644 --- a/src/cdc/rolling_hash.rs +++ b/src/cdc/rolling_hash.rs @@ -12,6 +12,7 @@ pub trait RollingHash64 { fn get_hash(&self) -> &Polynom64; } +#[derive(Clone)] pub struct Rabin64 { // Configuration window_size: usize, // The size of the data window used in the hash calculation. diff --git a/src/chunker.rs b/src/chunker.rs index 6e843f8..7031571 100644 --- a/src/chunker.rs +++ b/src/chunker.rs @@ -3,7 +3,8 @@ use std::io::{self, Read}; use anyhow::{anyhow, Result}; use rand::{thread_rng, Rng}; -use crate::cdc::{Polynom, Polynom64, Rabin64, RollingHash64}; +pub use crate::cdc::Rabin64; +use crate::cdc::{Polynom, Polynom64, RollingHash64}; const SPLITMASK: u64 = (1u64 << 20) - 1; const KB: usize = 1024; @@ -30,13 +31,13 @@ pub struct ChunkIter { } impl ChunkIter { - pub fn new(reader: R, size_hint: usize, poly: Polynom64) -> Self { + pub fn new(reader: R, size_hint: usize, rabin: Rabin64) -> Self { Self { buf: Vec::with_capacity(4 * KB), pos: 0, reader, predicate: default_predicate, - rabin: Rabin64::new_with_polynom(6, poly), + rabin, size_hint, // size hint is used to optimize memory allocation; this should be an upper bound on the size min_size: MIN_SIZE, max_size: MAX_SIZE, @@ -244,7 +245,8 @@ mod tests { let mut reader = Cursor::new(empty); let poly = random_poly().unwrap(); - let chunker = ChunkIter::new(&mut reader, 0, poly); + let rabin = Rabin64::new_with_polynom(6, poly); + let chunker = ChunkIter::new(&mut reader, 0, rabin); let chunks: Vec<_> = chunker.into_iter().collect(); assert_eq!(0, chunks.len()); @@ -256,7 +258,8 @@ mod tests { let mut reader = Cursor::new(empty); let poly = random_poly().unwrap(); - let chunker = ChunkIter::new(&mut reader, 100, poly); + let rabin = Rabin64::new_with_polynom(6, poly); + let chunker = ChunkIter::new(&mut reader, 100, rabin); let chunks: Vec<_> = chunker.into_iter().collect(); assert_eq!(0, chunks.len()); @@ -267,7 +270,8 @@ mod tests { let mut reader = repeat(0u8); let poly = random_poly().unwrap(); - let mut chunker = ChunkIter::new(&mut reader, usize::MAX, poly); + let rabin = Rabin64::new_with_polynom(6, poly); + let mut chunker = ChunkIter::new(&mut reader, usize::MAX, rabin); let chunk = chunker.next().unwrap().unwrap(); assert_eq!(MIN_SIZE, chunk.len());