mirror of
https://github.com/rustic-rs/rustic.git
synced 2025-10-26 11:18:51 +00:00
Merge pull request #590 from rustic-rs/optimize-chunker
Optimize chunker
This commit is contained in:
commit
dc17bffed2
@ -6,7 +6,7 @@ use rayon::prelude::*;
|
||||
|
||||
use crate::backend::{DecryptWriteBackend, ReadSourceOpen};
|
||||
use crate::blob::{BlobType, Node, NodeType, Packer, PackerStats};
|
||||
use crate::chunker::ChunkIter;
|
||||
use crate::chunker::{ChunkIter, Rabin64};
|
||||
use crate::crypto::hash;
|
||||
use crate::index::{IndexedBackend, SharedIndexer};
|
||||
use crate::repofile::ConfigFile;
|
||||
@ -17,7 +17,7 @@ use super::{ItemWithParent, ParentResult, TreeItem, TreeType};
|
||||
pub struct FileArchiver<BE: DecryptWriteBackend, I: IndexedBackend> {
|
||||
index: I,
|
||||
data_packer: Packer<BE>,
|
||||
poly: u64,
|
||||
rabin: Rabin64,
|
||||
}
|
||||
|
||||
impl<BE: DecryptWriteBackend, I: IndexedBackend> FileArchiver<BE, I> {
|
||||
@ -31,10 +31,11 @@ impl<BE: DecryptWriteBackend, I: IndexedBackend> FileArchiver<BE, I> {
|
||||
config,
|
||||
index.total_size(BlobType::Data),
|
||||
)?;
|
||||
let rabin = Rabin64::new_with_polynom(6, poly);
|
||||
Ok(Self {
|
||||
index,
|
||||
data_packer,
|
||||
poly,
|
||||
rabin,
|
||||
})
|
||||
}
|
||||
|
||||
@ -68,24 +69,25 @@ impl<BE: DecryptWriteBackend, I: IndexedBackend> FileArchiver<BE, I> {
|
||||
node: Node,
|
||||
p: ProgressBar,
|
||||
) -> Result<(Node, u64)> {
|
||||
let mut chunks: Vec<_> = ChunkIter::new(r, *node.meta().size() as usize, self.poly)
|
||||
.enumerate() // see below
|
||||
.par_bridge()
|
||||
.map(|(num, chunk)| {
|
||||
let chunk = chunk?;
|
||||
let id = hash(&chunk);
|
||||
let size = chunk.len() as u64;
|
||||
let mut chunks: Vec<_> =
|
||||
ChunkIter::new(r, *node.meta().size() as usize, self.rabin.clone())
|
||||
.enumerate() // see below
|
||||
.par_bridge()
|
||||
.map(|(num, chunk)| {
|
||||
let chunk = chunk?;
|
||||
let id = hash(&chunk);
|
||||
let size = chunk.len() as u64;
|
||||
|
||||
if !self.index.has_data(&id) {
|
||||
self.data_packer.add(&chunk, &id)?;
|
||||
}
|
||||
p.inc(size);
|
||||
Ok((num, id, size))
|
||||
})
|
||||
.collect::<Result<_>>()?;
|
||||
if !self.index.has_data(&id) {
|
||||
self.data_packer.add(&chunk, &id)?;
|
||||
}
|
||||
p.inc(size);
|
||||
Ok((num, id, size))
|
||||
})
|
||||
.collect::<Result<_>>()?;
|
||||
|
||||
// As par_bridge doesn't guarantee to keep the order, we sort by the enumeration
|
||||
chunks.par_sort_unstable_by_key(|x| x.0);
|
||||
chunks.sort_unstable_by_key(|x| x.0);
|
||||
|
||||
let filesize = chunks.iter().map(|x| x.2).sum();
|
||||
let content = chunks.into_iter().map(|x| x.1).collect();
|
||||
|
||||
@ -12,6 +12,7 @@ pub trait RollingHash64 {
|
||||
fn get_hash(&self) -> &Polynom64;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Rabin64 {
|
||||
// Configuration
|
||||
window_size: usize, // The size of the data window used in the hash calculation.
|
||||
|
||||
@ -3,7 +3,8 @@ use std::io::{self, Read};
|
||||
use anyhow::{anyhow, Result};
|
||||
use rand::{thread_rng, Rng};
|
||||
|
||||
use crate::cdc::{Polynom, Polynom64, Rabin64, RollingHash64};
|
||||
pub use crate::cdc::Rabin64;
|
||||
use crate::cdc::{Polynom, Polynom64, RollingHash64};
|
||||
|
||||
const SPLITMASK: u64 = (1u64 << 20) - 1;
|
||||
const KB: usize = 1024;
|
||||
@ -30,13 +31,13 @@ pub struct ChunkIter<R: Read + Send> {
|
||||
}
|
||||
|
||||
impl<R: Read + Send> ChunkIter<R> {
|
||||
pub fn new(reader: R, size_hint: usize, poly: Polynom64) -> Self {
|
||||
pub fn new(reader: R, size_hint: usize, rabin: Rabin64) -> Self {
|
||||
Self {
|
||||
buf: Vec::with_capacity(4 * KB),
|
||||
pos: 0,
|
||||
reader,
|
||||
predicate: default_predicate,
|
||||
rabin: Rabin64::new_with_polynom(6, poly),
|
||||
rabin,
|
||||
size_hint, // size hint is used to optimize memory allocation; this should be an upper bound on the size
|
||||
min_size: MIN_SIZE,
|
||||
max_size: MAX_SIZE,
|
||||
@ -244,7 +245,8 @@ mod tests {
|
||||
let mut reader = Cursor::new(empty);
|
||||
|
||||
let poly = random_poly().unwrap();
|
||||
let chunker = ChunkIter::new(&mut reader, 0, poly);
|
||||
let rabin = Rabin64::new_with_polynom(6, poly);
|
||||
let chunker = ChunkIter::new(&mut reader, 0, rabin);
|
||||
|
||||
let chunks: Vec<_> = chunker.into_iter().collect();
|
||||
assert_eq!(0, chunks.len());
|
||||
@ -256,7 +258,8 @@ mod tests {
|
||||
let mut reader = Cursor::new(empty);
|
||||
|
||||
let poly = random_poly().unwrap();
|
||||
let chunker = ChunkIter::new(&mut reader, 100, poly);
|
||||
let rabin = Rabin64::new_with_polynom(6, poly);
|
||||
let chunker = ChunkIter::new(&mut reader, 100, rabin);
|
||||
|
||||
let chunks: Vec<_> = chunker.into_iter().collect();
|
||||
assert_eq!(0, chunks.len());
|
||||
@ -267,7 +270,8 @@ mod tests {
|
||||
let mut reader = repeat(0u8);
|
||||
|
||||
let poly = random_poly().unwrap();
|
||||
let mut chunker = ChunkIter::new(&mut reader, usize::MAX, poly);
|
||||
let rabin = Rabin64::new_with_polynom(6, poly);
|
||||
let mut chunker = ChunkIter::new(&mut reader, usize::MAX, rabin);
|
||||
|
||||
let chunk = chunker.next().unwrap().unwrap();
|
||||
assert_eq!(MIN_SIZE, chunk.len());
|
||||
|
||||
Loading…
Reference in New Issue
Block a user