Merge pull request #590 from rustic-rs/optimize-chunker

Optimize chunker
This commit is contained in:
aawsome 2023-04-19 06:48:44 +02:00 committed by GitHub
commit dc17bffed2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 31 additions and 24 deletions

View File

@ -6,7 +6,7 @@ use rayon::prelude::*;
use crate::backend::{DecryptWriteBackend, ReadSourceOpen};
use crate::blob::{BlobType, Node, NodeType, Packer, PackerStats};
use crate::chunker::ChunkIter;
use crate::chunker::{ChunkIter, Rabin64};
use crate::crypto::hash;
use crate::index::{IndexedBackend, SharedIndexer};
use crate::repofile::ConfigFile;
@ -17,7 +17,7 @@ use super::{ItemWithParent, ParentResult, TreeItem, TreeType};
/// Holds the index, the data packer and the chunker configuration used when
/// archiving a file's contents.
///
/// NOTE(review): this listing is a rendered diff without +/- markers, so the
/// `poly` field and the `rabin` field both appear below; presumably `poly` is the
/// pre-change line and `rabin` its replacement, and only one of them exists in
/// any real revision of the file — confirm against the repository.
pub struct FileArchiver<BE: DecryptWriteBackend, I: IndexedBackend> {
// Index consulted via `has_data` to detect chunks that are already stored.
index: I,
// Packer that receives (via `add`) every chunk the index does not already have.
data_packer: Packer<BE>,
// Chunker polynomial; passed to `ChunkIter::new` for each file.
poly: u64,
// Rolling-hash state created with `Rabin64::new_with_polynom(6, poly)`; a clone is
// handed to `ChunkIter::new` for each file.
rabin: Rabin64,
}
impl<BE: DecryptWriteBackend, I: IndexedBackend> FileArchiver<BE, I> {
@ -31,10 +31,11 @@ impl<BE: DecryptWriteBackend, I: IndexedBackend> FileArchiver<BE, I> {
config,
index.total_size(BlobType::Data),
)?;
let rabin = Rabin64::new_with_polynom(6, poly);
Ok(Self {
index,
data_packer,
poly,
rabin,
})
}
@ -68,24 +69,25 @@ impl<BE: DecryptWriteBackend, I: IndexedBackend> FileArchiver<BE, I> {
node: Node,
p: ProgressBar,
) -> Result<(Node, u64)> {
let mut chunks: Vec<_> = ChunkIter::new(r, *node.meta().size() as usize, self.poly)
.enumerate() // see below
.par_bridge()
.map(|(num, chunk)| {
let chunk = chunk?;
let id = hash(&chunk);
let size = chunk.len() as u64;
let mut chunks: Vec<_> =
ChunkIter::new(r, *node.meta().size() as usize, self.rabin.clone())
.enumerate() // see below
.par_bridge()
.map(|(num, chunk)| {
let chunk = chunk?;
let id = hash(&chunk);
let size = chunk.len() as u64;
if !self.index.has_data(&id) {
self.data_packer.add(&chunk, &id)?;
}
p.inc(size);
Ok((num, id, size))
})
.collect::<Result<_>>()?;
if !self.index.has_data(&id) {
self.data_packer.add(&chunk, &id)?;
}
p.inc(size);
Ok((num, id, size))
})
.collect::<Result<_>>()?;
// As par_bridge doesn't guarantee to keep the order, we sort by the enumeration
chunks.par_sort_unstable_by_key(|x| x.0);
chunks.sort_unstable_by_key(|x| x.0);
let filesize = chunks.iter().map(|x| x.2).sum();
let content = chunks.into_iter().map(|x| x.1).collect();

View File

@ -12,6 +12,7 @@ pub trait RollingHash64 {
fn get_hash(&self) -> &Polynom64;
}
#[derive(Clone)]
pub struct Rabin64 {
// Configuration
window_size: usize, // The size of the data window used in the hash calculation.

View File

@ -3,7 +3,8 @@ use std::io::{self, Read};
use anyhow::{anyhow, Result};
use rand::{thread_rng, Rng};
use crate::cdc::{Polynom, Polynom64, Rabin64, RollingHash64};
pub use crate::cdc::Rabin64;
use crate::cdc::{Polynom, Polynom64, RollingHash64};
const SPLITMASK: u64 = (1u64 << 20) - 1;
const KB: usize = 1024;
@ -30,13 +31,13 @@ pub struct ChunkIter<R: Read + Send> {
}
impl<R: Read + Send> ChunkIter<R> {
pub fn new(reader: R, size_hint: usize, poly: Polynom64) -> Self {
pub fn new(reader: R, size_hint: usize, rabin: Rabin64) -> Self {
Self {
buf: Vec::with_capacity(4 * KB),
pos: 0,
reader,
predicate: default_predicate,
rabin: Rabin64::new_with_polynom(6, poly),
rabin,
size_hint, // size hint is used to optimize memory allocation; this should be an upper bound on the size
min_size: MIN_SIZE,
max_size: MAX_SIZE,
@ -244,7 +245,8 @@ mod tests {
let mut reader = Cursor::new(empty);
let poly = random_poly().unwrap();
let chunker = ChunkIter::new(&mut reader, 0, poly);
let rabin = Rabin64::new_with_polynom(6, poly);
let chunker = ChunkIter::new(&mut reader, 0, rabin);
let chunks: Vec<_> = chunker.into_iter().collect();
assert_eq!(0, chunks.len());
@ -256,7 +258,8 @@ mod tests {
let mut reader = Cursor::new(empty);
let poly = random_poly().unwrap();
let chunker = ChunkIter::new(&mut reader, 100, poly);
let rabin = Rabin64::new_with_polynom(6, poly);
let chunker = ChunkIter::new(&mut reader, 100, rabin);
let chunks: Vec<_> = chunker.into_iter().collect();
assert_eq!(0, chunks.len());
@ -267,7 +270,8 @@ mod tests {
let mut reader = repeat(0u8);
let poly = random_poly().unwrap();
let mut chunker = ChunkIter::new(&mut reader, usize::MAX, poly);
let rabin = Rabin64::new_with_polynom(6, poly);
let mut chunker = ChunkIter::new(&mut reader, usize::MAX, rabin);
let chunk = chunker.next().unwrap().unwrap();
assert_eq!(MIN_SIZE, chunk.len());