dedup/src/dedup.cpp

151 lines
4.2 KiB
C++

#include "dedup.h"
#include <cerrno>
#include <chrono>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <numeric>
#include <sys/stat.h>
#include <thread>
#include <unistd.h>
#include <Log.h>
#include "progressbar.h"
// Default constructor: performs no work beyond default member initialisation.
Dedup::Dedup() = default;
// Destructor: nothing to release explicitly; members clean up after themselves.
Dedup::~Dedup() = default;
// Registers one more folder for a later start() run.
//
// path:    forwarded unchanged to the new folder entry
// recurse: forwarded unchanged to the new folder entry
void Dedup::addSearchFolder(const std::string& path, bool recurse) {
    // The entry is constructed in place at the end of the folder list.
    folders.emplace_back(path, recurse);
}
// Returns the Files entry stored under key `fs` in `m`.
// If no entry exists yet, an empty one is inserted first and that one is returned.
Files& getFiles(uint64_t fs, std::map<uint64_t, Files>& m) {
    // operator[] value-initialises a missing entry and yields a reference either way.
    return m[fs];
}
void Dedup::start() {
FileIndexer::map_t foundfiles;
for(const SearchFolder& sf : folders) {
foundfiles = indexer.index(sf);
}
uint64_t accFileSize = std::accumulate(foundfiles.begin(), foundfiles.end(), 0ul, [](uint64_t val, auto r){ return val + r.second->filesize; });
Log::info << "indexing done " << foundfiles.size() << " unique files found with a total file size of " << FileSize(accFileSize);
// remove uninteresting files
removeUninterestingFiles(foundfiles);
// hashing
uint64_t bytesToHash = std::accumulate(foundfiles.begin(), foundfiles.end(), 0ul, [](uint64_t& c, const std::pair<uint64_t, std::shared_ptr<File>>& it) { return c + it.second->filesize; });
Log::info << foundfiles.size() << " files and " << FileSize(bytesToHash) << " are going to be hashed";
// hash files
std::map<uint64_t, Files> filesizes = hash(foundfiles, bytesToHash);
// resolve
resolveDuplicates(filesizes);
Log::info << "done. Deduplicated: " << deduplicatedFiles << " files, saving: " << FileSize(deduplicatedBytes);
}
// Removes every entry that cannot be part of a duplicate set:
//  - all files with a size of 0
//  - all files that are the only one with their size
//
// files: multimap keyed by file size; modified in place.
void Dedup::removeUninterestingFiles(std::multimap<uint64_t, std::shared_ptr<File>>& files) {
    // remove all files with a size of 0
    files.erase(uint64_t{0});

    // remove all files, where they are the only one with that size
    for(auto it = files.begin(); it != files.end(); ) {
        // one past the last entry sharing this key
        const auto groupEnd = files.upper_bound(it->first);

        // Advance a copy, not `it` itself: `it` must keep naming the first entry
        // of the group so that the erase below removes exactly that entry.
        auto second = it;
        ++second;

        // only one file with this filesize
        if(second == groupEnd) {
            // multimap::erase only invalidates iterators to the erased element,
            // so groupEnd stays valid.
            files.erase(it);
        }
        it = groupEnd;
    }
}
std::map<uint64_t, Files> Dedup::hash(std::multimap<uint64_t, std::shared_ptr<File>>& foundfiles, uint64_t bytesToHash) {
std::map<uint64_t, Files> filesizes;
ProgressBar progressBar(foundfiles.size(), bytesToHash);
for(auto it = foundfiles.begin(); it != foundfiles.end(); ++it) {
Hash hash = it->second->createHash();
progressBar.update(1, it->second->filesize);
Files& files = getFiles(it->first, filesizes);
files.addNewFile(it->second, std::move(hash));
std::cout << progressBar;
}
Log::info << "Hashing done after: " << (int) (progressBar.getDuration().count() / 1000) << "s ";
return filesizes;
}
void Dedup::resolveDuplicates(std::map<uint64_t, Files>& hashed) {
for(auto it = hashed.begin(); it != hashed.end(); ++it) {
if(!it->second.hasDuplicates()) continue;
Log::info << it->second.getFileCount() << " files with " << FileSize(it->first) << " found";
std::vector<MergeFile>& merges = it->second.getFiles();
for(MergeFile& mf : merges) {
mf.updateMainFile();
auto files = mf.getFiles();
if(files.size() == 1) continue;
Log::info << "\t" << mf.getHash() << " at " << files.size() << " places found:";
for(uint32_t i = 0; i < files.size(); ++i) {
const std::shared_ptr<File>& f = files.at(i);
Log::note << "\t\t" << f->linkcount << ": " << f->path;
// skip the first "base" file and dont link empty files
if(i > 0 && it->first > 0) {
if(!relinkFile(files.at(0)->path, f->path)) {
break;
}
deduplicatedBytes += f->filesize;
++deduplicatedFiles;
}
}
}
}
}
bool Dedup::relinkFile(const std::string& linkbase, const std::string& replacedfile) {
// symlinks are not implemented
if(dryrun || !hardlink) {
Log::note << "delete " << replacedfile;
Log::note << "link " << linkbase << " -> " << replacedfile;
} else {
int res = ::unlink(replacedfile.c_str());
if(res != 0) {
Log::error << "unlink(" << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
return false;
}
res = ::link(linkbase.c_str(), replacedfile.c_str());
if(res != 0) {
Log::error << "link(" << linkbase << ", " << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
// TODO try to symlink?
return false;
}
}
return true;
}