rustic/src/commands/prune.rs

use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use std::str::FromStr;
use std::sync::{Arc, Mutex};

use anyhow::{anyhow, bail, Result};
use bytesize::ByteSize;
use chrono::{DateTime, Duration, Local};
use clap::{AppSettings, Parser};
use derive_more::Add;
use itertools::Itertools;
use log::*;
use rayon::prelude::*;

use super::{bytes, no_progress, progress_bytes, progress_counter, warm_up_wait};
use crate::backend::{DecryptReadBackend, DecryptWriteBackend, FileType, ReadBackend};
use crate::blob::{
    BlobType, BlobTypeMap, Initialize, NodeType, PackSizer, Repacker, Sum, TreeStreamerOnce,
};
use crate::commands::helpers::progress_spinner;
use crate::id::Id;
use crate::index::{IndexBackend, IndexCollector, IndexType, IndexedBackend, Indexer, ReadIndex};
use crate::repofile::{HeaderEntry, IndexBlob, IndexFile, IndexPack, SnapshotFile};
use crate::repository::OpenRepository;

#[derive(Parser)]
#[clap(global_setting(AppSettings::DeriveDisplayOrder))]
pub(super) struct Opts {
    /// Don't remove anything, only show what would be done
    #[clap(long, short = 'n')]
    pub(crate) dry_run: bool,

    /// Define maximum data to repack in % of reposize or as size (e.g. '5b', '2 kB', '3M', '4TiB') or 'unlimited'
    #[clap(long, value_name = "LIMIT", default_value = "unlimited")]
    max_repack: LimitOption,

    /// Tolerate limit of unused data in % of reposize after pruning or as size (e.g. '5b', '2 kB', '3M', '4TiB') or 'unlimited'
    #[clap(long, value_name = "LIMIT", default_value = "5%")]
    max_unused: LimitOption,

    /// Minimum duration (e.g. 90d) to keep packs before repacking or removing. More recently created
    /// packs won't be repacked or marked for deletion within this prune run.
    #[clap(long, value_name = "DURATION", default_value = "0d")]
    keep_pack: humantime::Duration,

    /// Minimum duration (e.g. 10m) to keep packs marked for deletion. More recently marked packs won't be
    /// deleted within this prune run.
    #[clap(long, value_name = "DURATION", default_value = "23h")]
    keep_delete: humantime::Duration,

    /// Delete files immediately instead of marking them. This also removes all files already marked for deletion.
    /// WARNING: Only use if you are sure the repository is not accessed by parallel processes!
    #[clap(long)]
    instant_delete: bool,

    /// Simply copy blobs when repacking instead of decrypting; possibly compressing; encrypting
    #[clap(long)]
    fast_repack: bool,

    /// Repack packs containing uncompressed blobs. This cannot be used with --fast-repack.
    /// Implies --max-unused=0.
    #[clap(long, conflicts_with = "fast-repack")]
    repack_uncompressed: bool,

    /// Only repack packs which are cacheable [default: true for a hot/cold repository, else false]
    #[clap(long, value_name = "TRUE/FALSE")]
    repack_cacheable_only: Option<bool>,

    /// Do not repack packs which only needs to be resized
    #[clap(long)]
    no_resize: bool,
}

pub(super) fn execute(repo: OpenRepository, opts: Opts, ignore_snaps: Vec<Id>) -> Result<()> {
    let be = &repo.dbe;
    if repo.config.version < 2 && opts.repack_uncompressed {
        bail!("--repack-uncompressed makes no sense for v1 repo!");
    }

    let mut index_files = Vec::new();

    let p = progress_counter("reading index...");
    let mut index_collector = IndexCollector::new(IndexType::OnlyTrees);

    for index in be.stream_all::<IndexFile>(p.clone())? {
        let (id, index) = index?;
        index_collector.extend(index.packs.clone());
        // we add the trees from packs_to_delete to the index such that searching for
        // used blobs doesn't abort if they are already marked for deletion
        index_collector.extend(index.packs_to_delete.clone());

        index_files.push((id, index));
    }
    p.finish();

    let (used_ids, total_size) = {
        let index = index_collector.into_index();
        let total_size = BlobTypeMap::init(|blob_type| index.total_size(blob_type));
        let indexed_be = IndexBackend::new_from_index(&be.clone(), index);
        let used_ids = find_used_blobs(&indexed_be, ignore_snaps)?;
        (used_ids, total_size)
    };

    // list existing pack files
    let p = progress_spinner("geting packs from repository...");
    let existing_packs: HashMap<_, _> = be.list_with_size(FileType::Pack)?.into_iter().collect();
    p.finish();

    let mut pruner = Pruner::new(used_ids, existing_packs, index_files);
    pruner.count_used_blobs();
    pruner.check()?;
    let repack_cacheable_only = opts
        .repack_cacheable_only
        .unwrap_or_else(|| repo.config.is_hot == Some(true));
    let pack_sizer = total_size.map(|tpe, size| PackSizer::from_config(&repo.config, tpe, size));
    pruner.decide_packs(
        Duration::from_std(*opts.keep_pack)?,
        Duration::from_std(*opts.keep_delete)?,
        repack_cacheable_only,
        opts.repack_uncompressed,
        &pack_sizer,
    )?;
    pruner.decide_repack(
        &opts.max_repack,
        &opts.max_unused,
        opts.repack_uncompressed,
        opts.no_resize,
        &pack_sizer,
    );
    pruner.check_existing_packs()?;
    pruner.filter_index_files(opts.instant_delete);
    pruner.print_stats();

    warm_up_wait(&repo, pruner.repack_packs().into_iter(), !opts.dry_run)?;

    if !opts.dry_run {
        pruner.do_prune(repo, opts)?;
    }
    Ok(())
}

enum LimitOption {
    Size(ByteSize),
    Percentage(u64),
    Unlimited,
}

impl FromStr for LimitOption {
    type Err = anyhow::Error;
    fn from_str(s: &str) -> Result<Self> {
        Ok(match s.chars().last().unwrap_or('0') {
            '%' => Self::Percentage({
                let mut copy = s.to_string();
                copy.pop();
                copy.parse()?
            }),
            'd' if s == "unlimited" => Self::Unlimited,
            _ => Self::Size(ByteSize::from_str(s).map_err(|err| anyhow!(err))?),
        })
    }
}

#[derive(Default)]
struct DeleteStats {
    remove: u64,
    recover: u64,
    keep: u64,
}

impl DeleteStats {
    fn total(&self) -> u64 {
        self.remove + self.recover + self.keep
    }
}
#[derive(Default)]
struct PackStats {
    used: u64,
    partly_used: u64,
    unused: u64, // this equals to packs-to-remove
    repack: u64,
    keep: u64,
}
#[derive(Default, Clone, Copy, Add)]
struct SizeStats {
    used: u64,
    unused: u64,
    remove: u64,
    repack: u64,
    repackrm: u64,
}

impl SizeStats {
    fn total(&self) -> u64 {
        self.used + self.unused
    }
    fn total_after_prune(&self) -> u64 {
        self.used + self.unused_after_prune()
    }
    fn unused_after_prune(&self) -> u64 {
        self.unused - self.remove - self.repackrm
    }
}

#[derive(Default)]
struct PruneStats {
    packs_to_delete: DeleteStats,
    size_to_delete: DeleteStats,
    packs: PackStats,
    blobs: BlobTypeMap<SizeStats>,
    size: BlobTypeMap<SizeStats>,
    size_unref: u64,
    index_files: u64,
}

#[derive(Debug)]
struct PruneIndex {
    id: Id,
    modified: bool,
    packs: Vec<PrunePack>,
}

impl PruneIndex {
    fn len(&self) -> usize {
        self.packs.iter().map(|p| p.blobs.len()).sum()
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PackToDo {
    Undecided,
    Keep,
    Repack,
    MarkDelete,
    KeepMarked,
    Recover,
    Delete,
}

#[derive(Debug)]
struct PrunePack {
    id: Id,
    blob_type: BlobType,
    size: u32,
    delete_mark: bool,
    to_do: PackToDo,
    time: Option<DateTime<Local>>,
    blobs: Vec<IndexBlob>,
}

impl PrunePack {
    fn from_index_pack(p: IndexPack, delete_mark: bool) -> Self {
        Self {
            id: p.id,
            blob_type: p.blob_type(),
            size: p.pack_size(),
            delete_mark,
            to_do: PackToDo::Undecided,
            time: p.time,
            blobs: p.blobs,
        }
    }

    fn from_index_pack_unmarked(p: IndexPack) -> Self {
        Self::from_index_pack(p, false)
    }

    fn from_index_pack_marked(p: IndexPack) -> Self {
        Self::from_index_pack(p, true)
    }

    fn into_index_pack(self) -> IndexPack {
        IndexPack {
            id: self.id,
            time: self.time,
            size: None,
            blobs: self.blobs,
        }
    }

    fn into_index_pack_with_time(self, time: DateTime<Local>) -> IndexPack {
        IndexPack {
            id: self.id,
            time: Some(time),
            size: None,
            blobs: self.blobs,
        }
    }

    fn set_todo(&mut self, todo: PackToDo, pi: &PackInfo, stats: &mut PruneStats) {
        let tpe = self.blob_type;
        match todo {
            PackToDo::Undecided => panic!("not possible"),
            PackToDo::Keep => {
                stats.blobs[tpe].used += u64::from(pi.used_blobs);
                stats.blobs[tpe].unused += u64::from(pi.unused_blobs);
                stats.size[tpe].used += u64::from(pi.used_size);
                stats.size[tpe].unused += u64::from(pi.unused_size);
                stats.packs.keep += 1;
            }
            PackToDo::Repack => {
                stats.blobs[tpe].used += u64::from(pi.used_blobs);
                stats.blobs[tpe].unused += u64::from(pi.unused_blobs);
                stats.size[tpe].used += u64::from(pi.used_size);
                stats.size[tpe].unused += u64::from(pi.unused_size);
                stats.packs.repack += 1;
                stats.blobs[tpe].repack += u64::from(pi.unused_blobs + pi.used_blobs);
                stats.blobs[tpe].repackrm += u64::from(pi.unused_blobs);
                stats.size[tpe].repack += u64::from(pi.unused_size + pi.used_size);
                stats.size[tpe].repackrm += u64::from(pi.unused_size);
            }

            PackToDo::MarkDelete => {
                stats.blobs[tpe].unused += u64::from(pi.unused_blobs);
                stats.size[tpe].unused += u64::from(pi.unused_size);
                stats.blobs[tpe].remove += u64::from(pi.unused_blobs);
                stats.size[tpe].remove += u64::from(pi.unused_size);
            }
            PackToDo::Recover => {
                stats.packs_to_delete.recover += 1;
                stats.size_to_delete.recover += u64::from(self.size);
            }
            PackToDo::Delete => {
                stats.packs_to_delete.remove += 1;
                stats.size_to_delete.remove += u64::from(self.size);
            }
            PackToDo::KeepMarked => {
                stats.packs_to_delete.keep += 1;
                stats.size_to_delete.keep += u64::from(self.size);
            }
        }
        self.to_do = todo;
    }

    fn is_compressed(&self) -> bool {
        self.blobs
            .iter()
            .all(|blob| blob.uncompressed_length.is_some())
    }
}

#[derive(PartialEq, Eq)]
enum RepackReason {
    PartlyUsed,
    ToCompress,
    SizeMismatch,
}
use RepackReason::*;

struct Pruner {
    time: DateTime<Local>,
    used_ids: HashMap<Id, u8>,
    existing_packs: HashMap<Id, u32>,
    repack_candidates: Vec<(PackInfo, RepackReason, usize, usize)>,
    index_files: Vec<PruneIndex>,
    stats: PruneStats,
}

impl Pruner {
    fn new(
        used_ids: HashMap<Id, u8>,
        existing_packs: HashMap<Id, u32>,
        index_files: Vec<(Id, IndexFile)>,
    ) -> Self {
        let mut processed_packs = HashSet::new();
        let mut processed_packs_delete = HashSet::new();
        let mut index_files: Vec<_> = index_files
            .into_iter()
            .map(|(id, index)| {
                let mut modified = false;
                let mut packs: Vec<_> = index
                    .packs
                    .into_iter()
                    // filter out duplicate packs
                    .filter(|p| {
                        let no_duplicate = processed_packs.insert(p.id);
                        modified |= !no_duplicate;
                        no_duplicate
                    })
                    .map(PrunePack::from_index_pack_unmarked)
                    .collect();
                packs.extend(
                    index
                        .packs_to_delete
                        .into_iter()
                        // filter out duplicate packs
                        .filter(|p| {
                            let no_duplicate = processed_packs_delete.insert(p.id);
                            modified |= !no_duplicate;
                            no_duplicate
                        })
                        .map(PrunePack::from_index_pack_marked),
                );

                PruneIndex {
                    id,
                    modified,
                    packs,
                }
            })
            .collect();

        // filter out "normally" indexed packs from packs_to_delete
        for index in index_files.iter_mut() {
            let mut modified = false;
            index.packs.retain(|p| {
                !p.delete_mark || {
                    let duplicate = processed_packs.contains(&p.id);
                    modified |= duplicate;
                    !duplicate
                }
            });

            index.modified |= modified;
        }

        Self {
            time: Local::now(),
            used_ids,
            existing_packs,
            repack_candidates: Vec::new(),
            index_files,
            stats: PruneStats::default(),
        }
    }

    fn count_used_blobs(&mut self) {
        for blob in self
            .index_files
            .iter()
            .flat_map(|index| &index.packs)
            .flat_map(|pack| &pack.blobs)
        {
            if let Some(count) = self.used_ids.get_mut(&blob.id) {
                // note that duplicates are only counted up to 255. If there are more
                // duplicates, the number is set to 255. This may imply that later on
                // not the "best" pack is chosen to have that blob marked as used.
                *count = count.saturating_add(1);
            }
        }
    }

    fn check(&self) -> Result<()> {
        // check that all used blobs are present in index
        for (id, count) in &self.used_ids {
            if *count == 0 {
                error!("used blob {} is missing", id);
                bail!("missing blobs");
            }
        }
        Ok(())
    }

    fn decide_packs(
        &mut self,
        keep_pack: Duration,
        keep_delete: Duration,
        repack_cacheable_only: bool,
        repack_uncompressed: bool,
        pack_sizer: &BlobTypeMap<PackSizer>,
    ) -> Result<()> {
        // first process all marked packs then the unmarked ones:
        // - first processed packs are more likely to have all blobs seen as unused
        // - if marked packs have used blob but these blobs are all present in
        //   unmarked packs, we want to perform the deletion!
        for mark_case in [true, false] {
            for (index_num, index) in self.index_files.iter_mut().enumerate() {
                for (pack_num, pack) in index
                    .packs
                    .iter_mut()
                    .enumerate()
                    .filter(|(_, p)| p.delete_mark == mark_case)
                {
                    let pi = PackInfo::from_pack(pack, &mut self.used_ids);

                    // Various checks to determine if packs need to be kept
                    let too_young = pack.time > Some(self.time - keep_pack);
                    let keep_uncacheable = repack_cacheable_only && !pack.blob_type.is_cacheable();

                    let to_compress = repack_uncompressed && !pack.is_compressed();
                    let size_mismatch = !pack_sizer[pack.blob_type].size_ok(pack.size);

                    match (pack.delete_mark, pi.used_blobs, pi.unused_blobs) {
                        (false, 0, _) => {
                            // unused pack
                            self.stats.packs.unused += 1;
                            if too_young {
                                // keep packs which are too young
                                pack.set_todo(PackToDo::Keep, &pi, &mut self.stats);
                            } else {
                                pack.set_todo(PackToDo::MarkDelete, &pi, &mut self.stats);
                            }
                        }
                        (false, 1.., 0) => {
                            // used pack
                            self.stats.packs.used += 1;
                            if too_young || keep_uncacheable {
                                pack.set_todo(PackToDo::Keep, &pi, &mut self.stats);
                            } else if to_compress {
                                self.repack_candidates
                                    .push((pi, ToCompress, index_num, pack_num));
                            } else if size_mismatch {
                                self.repack_candidates.push((
                                    pi,
                                    SizeMismatch,
                                    index_num,
                                    pack_num,
                                ));
                            } else {
                                pack.set_todo(PackToDo::Keep, &pi, &mut self.stats);
                            }
                        }

                        (false, 1.., 1..) => {
                            // partly used pack
                            self.stats.packs.partly_used += 1;

                            if too_young || keep_uncacheable {
                                // keep packs which are too young and non-cacheable packs if requested
                                pack.set_todo(PackToDo::Keep, &pi, &mut self.stats);
                            } else {
                                // other partly used pack => candidate for repacking
                                self.repack_candidates
                                    .push((pi, PartlyUsed, index_num, pack_num));
                            }
                        }
                        (true, 0, _) => {
                            if self.time - pack.time.expect("packs_to_delete has no time")
                                >= keep_delete
                            {
                                pack.set_todo(PackToDo::Delete, &pi, &mut self.stats);
                            } else {
                                pack.set_todo(PackToDo::KeepMarked, &pi, &mut self.stats);
                            }
                        }
                        (true, 1.., _) => {
                            // needed blobs; mark this pack for recovery
                            pack.set_todo(PackToDo::Recover, &pi, &mut self.stats);
                        }
                    }
                }
            }
        }

        Ok(())
    }

    fn decide_repack(
        &mut self,
        max_repack: &LimitOption,
        max_unused: &LimitOption,
        repack_uncompressed: bool,
        no_resize: bool,
        pack_sizer: &BlobTypeMap<PackSizer>,
    ) {
        let max_unused = match (repack_uncompressed, max_unused) {
            (true, _) => 0,
            (false, LimitOption::Unlimited) => u64::MAX,
            (false, LimitOption::Size(size)) => size.as_u64(),
            // if percentag is given, we want to have
            // unused <= p/100 * size_after = p/100 * (size_used + unused)
            // which equals (1 - p/100) * unused <= p/100 * size_used
            (false, LimitOption::Percentage(p)) => (p * self.stats.size.sum().used) / (100 - p),
        };

        let max_repack = match max_repack {
            LimitOption::Unlimited => u64::MAX,
            LimitOption::Size(size) => size.as_u64(),
            LimitOption::Percentage(p) => p * self.stats.size.sum().total(),
        };

        self.repack_candidates.sort_unstable_by_key(|rc| rc.0);
        let mut resize_packs = BlobTypeMap::<Vec<_>>::default();
        let mut do_repack = BlobTypeMap::default();
        let mut repack_size = BlobTypeMap::<u64>::default();

        for (pi, repack_reason, index_num, pack_num) in std::mem::take(&mut self.repack_candidates)
        {
            let pack = &mut self.index_files[index_num].packs[pack_num];
            let blob_type = pi.blob_type;

            let total_repack_size: u64 = repack_size.into_values().sum();
            if total_repack_size + u64::from(pi.used_size) >= max_repack
                || (self.stats.size.sum().unused_after_prune() < max_unused
                    && repack_reason == PartlyUsed
                    && blob_type == BlobType::Data)
                || (repack_reason == SizeMismatch && no_resize)
            {
                pack.set_todo(PackToDo::Keep, &pi, &mut self.stats);
            } else if repack_reason == SizeMismatch {
                resize_packs[blob_type].push((pi, index_num, pack_num));
                repack_size[blob_type] += u64::from(pi.used_size);
            } else {
                pack.set_todo(PackToDo::Repack, &pi, &mut self.stats);
                repack_size[blob_type] += u64::from(pi.used_size);
                do_repack[blob_type] = true;
            }
        }
        for (blob_type, resize_packs) in resize_packs {
            // packs in resize_packs are only repacked if we anyway repack this blob type or
            // if the target pack size is reached for the blob type.
            let todo = if do_repack[blob_type]
                || repack_size[blob_type] > u64::from(pack_sizer[blob_type].pack_size())
            {
                PackToDo::Repack
            } else {
                PackToDo::Keep
            };
            for (pi, index_num, pack_num) in resize_packs {
                let pack = &mut self.index_files[index_num].packs[pack_num];
                pack.set_todo(todo, &pi, &mut self.stats);
            }
        }
    }

    fn check_existing_packs(&mut self) -> Result<()> {
        for pack in self.index_files.iter().flat_map(|index| &index.packs) {
            let existing_size = self.existing_packs.remove(&pack.id);

            // TODO: Unused Packs which don't exist (i.e. only existing in index)
            let check_size = || {
                match existing_size {
                    Some(size) if size == pack.size => Ok(()), // size is ok => continue
                    Some(size) => bail!(
                        "used pack {}: size does not match! Expected size: {}, real size: {}",
                        pack.id,
                        pack.size,
                        size
                    ),
                    None => bail!("used pack {} does not exist!", pack.id),
                }
            };

            match pack.to_do {
                PackToDo::Undecided => {
                    bail!("should not happen!")
                }
                PackToDo::Keep | PackToDo::Recover => {
                    for blob in &pack.blobs {
                        self.used_ids.remove(&blob.id);
                    }
                    check_size()?;
                }
                PackToDo::Repack => {
                    check_size()?;
                }
                PackToDo::MarkDelete | PackToDo::Delete | PackToDo::KeepMarked => {}
            }
        }

        self.used_ids.shrink_to_fit();
        self.existing_packs.shrink_to_fit();

        // all remaining packs in existing_packs are unreferenced packs
        for size in self.existing_packs.values() {
            self.stats.size_unref += u64::from(*size);
        }

        Ok(())
    }

    fn filter_index_files(&mut self, instant_delete: bool) {
        const MIN_INDEX_LEN: usize = 10_000;

        let mut any_must_modify = false;
        self.stats.index_files = self.index_files.len() as u64;
        // filter out only the index files which need processing
        self.index_files.retain(|index| {
            // index must be processed if it has been modified
            // or if any pack is not kept
            let must_modify = index.modified
                || index.packs.iter().any(|p| {
                    p.to_do != PackToDo::Keep && (instant_delete || p.to_do != PackToDo::KeepMarked)
                });

            any_must_modify |= must_modify;

            // also process index files which are too small (i.e. rebuild them)
            must_modify || index.len() < MIN_INDEX_LEN
        });

        if !any_must_modify && self.index_files.len() == 1 {
            // only one index file to process but only because it is too small
            self.index_files.clear();
        }

        // TODO: Sort index files such that files with deletes come first and files with
        // repacks come at end
    }

    fn print_stats(&self) {
        let pack_stat = &self.stats.packs;
        let blob_stat = self.stats.blobs.sum();
        let size_stat = self.stats.size.sum();

        debug!(
            "used:   {:>10} blobs, {:>10}",
            blob_stat.used,
            bytes(size_stat.used)
        );

        debug!(
            "unused: {:>10} blobs, {:>10}",
            blob_stat.unused,
            bytes(size_stat.unused)
        );
        debug!(
            "total:  {:>10} blobs, {:>10}",
            blob_stat.total(),
            bytes(size_stat.total())
        );

        println!(
            "to repack: {:>10} packs, {:>10} blobs, {:>10}",
            pack_stat.repack,
            blob_stat.repack,
            bytes(size_stat.repack)
        );
        println!(
            "this removes:                {:>10} blobs, {:>10}",
            blob_stat.repackrm,
            bytes(size_stat.repackrm)
        );
        println!(
            "to delete: {:>10} packs, {:>10} blobs, {:>10}",
            pack_stat.unused,
            blob_stat.remove,
            bytes(size_stat.remove)
        );
        if !self.existing_packs.is_empty() {
            println!(
                "unindexed: {:>10} packs,         ?? blobs, {:>10}",
                self.existing_packs.len(),
                bytes(self.stats.size_unref)
            );
        }

        println!(
            "total prune:                 {:>10} blobs, {:>10}",
            blob_stat.repackrm + blob_stat.remove,
            bytes(size_stat.repackrm + size_stat.remove + self.stats.size_unref)
        );
        println!(
            "remaining:                   {:>10} blobs, {:>10}",
            blob_stat.total_after_prune(),
            bytes(size_stat.total_after_prune())
        );
        println!(
            "unused size after prune: {:>10} ({:.2}% of remaining size)",
            bytes(size_stat.unused_after_prune()),
            size_stat.unused_after_prune() as f64 / size_stat.total_after_prune() as f64 * 100.0
        );

        println!();

        println!(
            "packs marked for deletion: {:>10}, {:>10}",
            self.stats.packs_to_delete.total(),
            bytes(self.stats.size_to_delete.total()),
        );
        println!(
            " - complete deletion:      {:>10}, {:>10}",
            self.stats.packs_to_delete.remove,
            bytes(self.stats.size_to_delete.remove),
        );
        println!(
            " - keep marked:            {:>10}, {:>10}",
            self.stats.packs_to_delete.keep,
            bytes(self.stats.size_to_delete.keep),
        );
        println!(
            " - recover:                {:>10}, {:>10}",
            self.stats.packs_to_delete.recover,
            bytes(self.stats.size_to_delete.recover),
        );

        debug!(
            "index files to rebuild: {} / {}",
            self.index_files.len(),
            self.stats.index_files
        );
    }

    fn repack_packs(&self) -> Vec<Id> {
        self.index_files
            .iter()
            .flat_map(|index| &index.packs)
            .filter(|pack| pack.to_do == PackToDo::Repack)
            .map(|pack| pack.id)
            .collect()
    }

    fn do_prune(self, repo: OpenRepository, opts: Opts) -> Result<()> {
        let be = repo.dbe;

        let indexer = Indexer::new_unindexed(be.clone()).into_shared();

        // Calculate an approximation of sizes after pruning.
        // The size actually is:
        // total_size_of_all_blobs + total_size_of_pack_headers + #packs * pack_overhead
        // This is hard/impossible to compute because:
        // - the size of blobs can change during repacking if compression is changed
        // - the size of pack headers depends on wheter blobs are compressed or not
        // - we don't know the number of packs generated by repacking
        // So, we simply use the current size of the blobs and an estimation of the pack
        // header size.

        let size_after_prune = BlobTypeMap::init(|blob_type| {
            self.stats.size[blob_type].total_after_prune()
                + self.stats.blobs[blob_type].total_after_prune()
                    * u64::from(HeaderEntry::ENTRY_LEN_COMPRESSED)
        });

        let tree_repacker = Repacker::new(
            be.clone(),
            BlobType::Tree,
            indexer.clone(),
            &repo.config,
            size_after_prune[BlobType::Tree],
        )?;

        let data_repacker = Repacker::new(
            be.clone(),
            BlobType::Data,
            indexer.clone(),
            &repo.config,
            size_after_prune[BlobType::Data],
        )?;

        // mark unreferenced packs for deletion
        if !self.existing_packs.is_empty() {
            if opts.instant_delete {
                let p = progress_counter("removing unindexed packs...");
                let existing_packs: Vec<_> = self.existing_packs.into_keys().collect();
                be.delete_list(FileType::Pack, true, existing_packs.iter(), p)?;
            } else {
                info!("marking not needed unindexed pack files for deletion...");
                for (id, size) in self.existing_packs {
                    let pack = IndexPack {
                        id,
                        size: Some(size),
                        time: Some(Local::now()),
                        blobs: Vec::new(),
                    };
                    indexer.write().unwrap().add_remove(pack)?;
                }
            }
        }

        // process packs by index_file
        let p = match (self.index_files.is_empty(), self.stats.packs.repack > 0) {
            (true, _) => {
                info!("nothing to do!");
                no_progress()
            }
            // TODO: Use a MultiProgressBar here
            (false, true) => progress_bytes("repacking // rebuilding index..."),
            (false, false) => progress_spinner("rebuilding index..."),
        };

        p.set_length(self.stats.size.sum().repack - self.stats.size.sum().repackrm);

        let mut indexes_remove = Vec::new();
        let tree_packs_remove = Arc::new(Mutex::new(Vec::new()));
        let data_packs_remove = Arc::new(Mutex::new(Vec::new()));

        let delete_pack = |pack: PrunePack| {
            // delete pack
            match pack.blob_type {
                BlobType::Data => data_packs_remove.lock().unwrap().push(pack.id),
                BlobType::Tree => tree_packs_remove.lock().unwrap().push(pack.id),
            }
        };

        let used_ids = Arc::new(Mutex::new(self.used_ids));

        let packs: Vec<_> = self
            .index_files
            .into_iter()
            .map(|index| {
                indexes_remove.push(index.id);
                index
            })
            .flat_map(|index| index.packs)
            .collect();

        packs.into_par_iter().try_for_each(|pack| {
            match pack.to_do {
                PackToDo::Undecided => bail!("pack {} got no decicion what to do", pack.id),
                PackToDo::Keep => {
                    // keep pack: add to new index
                    let pack = pack.into_index_pack();
                    indexer.write().unwrap().add(pack)?;
                }
                PackToDo::Repack => {
                    // TODO: repack in parallel
                    for blob in &pack.blobs {
                        if used_ids.lock().unwrap().remove(&blob.id).is_none() {
                            // don't save duplicate blobs
                            continue;
                        }

                        let repacker = match blob.tpe {
                            BlobType::Data => &data_repacker,
                            BlobType::Tree => &tree_repacker,
                        };
                        if opts.fast_repack {
                            repacker.add_fast(&pack.id, blob)?;
                        } else {
                            repacker.add(&pack.id, blob)?;
                        }
                        p.inc(u64::from(blob.length));
                    }
                    if opts.instant_delete {
                        delete_pack(pack);
                    } else {
                        // mark pack for removal
                        let pack = pack.into_index_pack_with_time(self.time);
                        indexer.write().unwrap().add_remove(pack)?;
                    }
                }
                PackToDo::MarkDelete => {
                    if opts.instant_delete {
                        delete_pack(pack);
                    } else {
                        // mark pack for removal
                        let pack = pack.into_index_pack_with_time(self.time);
                        indexer.write().unwrap().add_remove(pack)?;
                    }
                }
                PackToDo::KeepMarked => {
                    if opts.instant_delete {
                        delete_pack(pack);
                    } else {
                        // keep pack: add to new index
                        let pack = pack.into_index_pack();
                        indexer.write().unwrap().add_remove(pack)?;
                    }
                }
                PackToDo::Recover => {
                    // recover pack: add to new index in section packs
                    let pack = pack.into_index_pack_with_time(self.time);
                    indexer.write().unwrap().add(pack)?;
                }
                PackToDo::Delete => delete_pack(pack),
            }
            Ok(())
        })?;
        tree_repacker.finalize()?;
        data_repacker.finalize()?;
        indexer.write().unwrap().finalize()?;
        p.finish();

        // remove old index files first as they may reference pack files which are removed soon.
        if !indexes_remove.is_empty() {
            let p = progress_counter("removing old index files...");
            be.delete_list(FileType::Index, true, indexes_remove.iter(), p)?;
        }

        // get variables out of Arc<Mutex<_>>
        let data_packs_remove = data_packs_remove.lock().unwrap();
        let tree_packs_remove = tree_packs_remove.lock().unwrap();

        if !data_packs_remove.is_empty() {
            let p = progress_counter("removing old data packs...");
            be.delete_list(FileType::Pack, false, data_packs_remove.iter(), p)?;
        }

        if !tree_packs_remove.is_empty() {
            let p = progress_counter("removing old tree packs...");
            be.delete_list(FileType::Pack, true, tree_packs_remove.iter(), p)?;
        }

        Ok(())
    }
}

#[derive(PartialEq, Eq, Clone, Copy)]
struct PackInfo {
    blob_type: BlobType,
    used_blobs: u16,
    unused_blobs: u16,
    used_size: u32,
    unused_size: u32,
}

impl PartialOrd<PackInfo> for PackInfo {
    fn partial_cmp(&self, other: &PackInfo) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for PackInfo {
    fn cmp(&self, other: &Self) -> Ordering {
        // first order by blob type such that tree packs are picked first
        self.blob_type.cmp(&other.blob_type).then(
            // then order such that packs with highest
            // ratio unused/used space are picked first.
            // This is equivalent to ordering by unused / total space.
            (u64::from(other.unused_size) * u64::from(self.used_size))
                .cmp(&(u64::from(self.unused_size) * u64::from(other.used_size))),
        )
    }
}

impl PackInfo {
    fn from_pack(pack: &PrunePack, used_ids: &mut HashMap<Id, u8>) -> Self {
        let mut pi = Self {
            blob_type: pack.blob_type,
            used_blobs: 0,
            unused_blobs: 0,
            used_size: 0,
            unused_size: 0,
        };

        // We search all blobs in the pack for needed ones. We do this by already marking
        // and decreasing the used blob counter for the processed blobs. If the counter
        // was decreased to 0, the blob and therefore the pack is actually used.
        // Note that by this processing, we are also able to handle duplicate blobs within a pack
        // correctly.
        // If we found a needed blob, we stop and process the information that the pack is actually needed.
        let first_needed = pack.blobs.iter().position(|blob| {
            match used_ids.get_mut(&blob.id) {
                None | Some(0) => {
                    pi.unused_size += blob.length;
                    pi.unused_blobs += 1;
                }
                Some(count) => {
                    // decrease counter
                    *count -= 1;
                    if *count == 0 {
                        // blob is actually needed
                        pi.used_size += blob.length;
                        pi.used_blobs += 1;
                        return true; // break the search
                    } else {
                        // blob is not needed
                        pi.unused_size += blob.length;
                        pi.unused_blobs += 1;
                    }
                }
            }
            false // continue with next blob
        });

        if let Some(first_needed) = first_needed {
            // The pack is actually needed.
            // We reprocess the blobs up to the first needed one and mark all blobs which are genarally needed as used.
            for blob in &pack.blobs[..first_needed] {
                match used_ids.get_mut(&blob.id) {
                    None | Some(0) => {} // already correctly marked
                    Some(count) => {
                        // remark blob as used
                        pi.unused_size -= blob.length;
                        pi.unused_blobs -= 1;
                        pi.used_size += blob.length;
                        pi.used_blobs += 1;
                        *count = 0; // count = 0 indicates to other packs that the blob is not needed anymore.
                    }
                }
            }
            // Then we process the remaining blobs and mark all blobs which are generally needed as used in this blob
            for blob in &pack.blobs[first_needed + 1..] {
                match used_ids.get_mut(&blob.id) {
                    None | Some(0) => {
                        pi.unused_size += blob.length;
                        pi.unused_blobs += 1;
                    }
                    Some(count) => {
                        // blob is used in this pack
                        pi.used_size += blob.length;
                        pi.used_blobs += 1;
                        *count = 0; // count = 0 indicates to other packs that the blob is not needed anymore.
                    }
                }
            }
        }

        pi
    }
}

// find used blobs in repo
fn find_used_blobs(
    index: &(impl IndexedBackend + Unpin),
    ignore_snaps: Vec<Id>,
) -> Result<HashMap<Id, u8>> {
    let ignore_snaps: HashSet<_> = ignore_snaps.into_iter().collect();

    let p = progress_counter("reading snapshots...");
    let list = index
        .be()
        .list(FileType::Snapshot)?
        .into_iter()
        .filter(|id| !ignore_snaps.contains(id))
        .collect();
    let snap_trees: Vec<_> = index
        .be()
        .stream_list::<SnapshotFile>(list, p.clone())?
        .into_iter()
        .map_ok(|(_, snap)| snap.tree)
        .try_collect()?;
    p.finish();

    let mut ids: HashMap<_, _> = snap_trees.iter().map(|id| (*id, 0)).collect();
    let p = progress_counter("finding used blobs...");

    let mut tree_streamer = TreeStreamerOnce::new(index.clone(), snap_trees, p)?;
    while let Some(item) = tree_streamer.next().transpose()? {
        let (_, tree) = item;
        for node in tree.nodes() {
            match node.node_type() {
                NodeType::File => ids.extend(node.content().iter().map(|id| (*id, 0))),
                NodeType::Dir => {
                    ids.insert(node.subtree().unwrap(), 0);
                }
                _ => {} // nothing to do
            }
        }
    }

    Ok(ids)
}