dedup/src/dedup.cpp

#include "dedup.h"
#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cstring>
#include <iostream>
#include <iterator>
#include <mutex>
#include <numeric>
#include <sys/stat.h>
#include <thread>
#include <unistd.h>
#include <Log.h>
#include "progressbar.h"
#include "hasherthread.h"
const uint_fast8_t Dedup::HASHTHREADCOUNT = 4;
Dedup::Dedup() {
}
Dedup::~Dedup() {
}
void Dedup::addSearchFolder(const std::string& path, bool recurse) {
    folders.emplace_back(path, recurse);
}
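// Returns the Files bucket for the given file size, inserting an empty bucket if none exists yet.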
Files& getFiles(uint64_t fs, std::map<uint64_t, Files>& m) {
    auto it = m.find(fs);
    if(it == m.end()) {
        auto pair = m.insert({fs, {}});
        return pair.first->second;
    }
    return it->second;
}
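// Runs the whole deduplication pass: index the configured folders, drop files that cannot
// have duplicates, hash the remaining candidates and resolve the duplicate groups.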
void Dedup::start() {
    FileIndexer::map_t foundfiles;
    for(const SearchFolder& sf : folders) {
        foundfiles.merge(indexer.index(sf));
    }
    uint64_t accFileSize = std::accumulate(foundfiles.begin(), foundfiles.end(), uint64_t(0), [](uint64_t val, const auto& r){ return val + r.second->filesize; });
    Log::info << "indexing done: " << foundfiles.size() << " unique files found with a total file size of " << FileSize(accFileSize);

    // remove uninteresting files
    removeUninterestingFiles(foundfiles);

    // hashing
    uint64_t bytesToHash = std::accumulate(foundfiles.begin(), foundfiles.end(), uint64_t(0), [](uint64_t c, const auto& it) { return c + it.second->filesize; });
    Log::info << foundfiles.size() << " files totaling " << FileSize(bytesToHash) << " are going to be hashed";

    // hash files
    std::map<uint64_t, Files> filesizes = hash(foundfiles, bytesToHash);

    // resolve
    resolveDuplicates(filesizes);
    Log::info << "done. Deduplicated: " << deduplicatedFiles << " files, saving: " << FileSize(deduplicatedBytes);
}
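// Drops files that can never have a duplicate: empty files and files whose size is unique in the index.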
void Dedup::removeUninterestingFiles(std::multimap<uint64_t, std::shared_ptr<File>>& files) {
    // remove all files with a size of 0
    files.erase(uint64_t(0));

    // remove all files that are the only one with their size (there can't be any duplicate, so there is no need to hash them)
    for(auto it = files.begin(); it != files.end(); ) {
        auto upper = files.upper_bound(it->first);
        auto next = it;
        ++next;
        // only one file with this filesize
        if(next == upper) {
            files.erase(it);
        }
        it = upper;
    }
}
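// Hashes every candidate file and groups the results by file size.
// The multimap is split into HASHTHREADCOUNT contiguous ranges, each processed by its own HasherThread.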
std::map<uint64_t, Files> Dedup::hash(std::multimap<uint64_t, std::shared_ptr<File>>& foundfiles, uint64_t bytesToHash) {
    std::map<uint64_t, Files> filesizes;
    ProgressBar progressBar(foundfiles.size(), bytesToHash);
    uint64_t stepsize = foundfiles.size() / HASHTHREADCOUNT;
    ProgressBar* bpptr = &progressBar;
    std::map<uint64_t, Files>* filesizesptr = &filesizes;
    std::mutex globalData;
    std::vector<HasherThread*> threads;
    std::vector<std::multimap<uint64_t, std::shared_ptr<File>>::iterator> threadit;
    threads.reserve(HASHTHREADCOUNT);
    threadit.reserve(HASHTHREADCOUNT + 1);

    // compute the range boundaries, one range per hashing thread
    Log::info << "spawning " << (int) HASHTHREADCOUNT << " hashing threads";
    threadit.push_back(foundfiles.begin());
    for(uint_fast8_t i = 0; i < HASHTHREADCOUNT; ++i) {
        // make a copy! (advance a copy of the previous boundary, not the stored iterator itself)
        std::multimap<uint64_t, std::shared_ptr<File>>::iterator it = threadit.at(i);
        std::advance(it, stepsize);
        threadit.push_back(it);
    }
    // make sure the last boundary really is the end of the map
    threadit.at(HASHTHREADCOUNT) = foundfiles.end();
    for(uint_fast8_t i = 0; i < HASHTHREADCOUNT; ++i) {
        std::multimap<uint64_t, std::shared_ptr<File>>::iterator* itptr = &(threadit.at(i));
        std::multimap<uint64_t, std::shared_ptr<File>>::iterator* endptr = &(threadit.at(i+1));
        threads.push_back(new HasherThread(
            globalData,
            // producer: hands the next file of this thread's range to the hasher, nullptr once the range is exhausted
            [itptr, endptr]() -> std::shared_ptr<File> {
                if(*itptr != *endptr) {
                    auto copy = *itptr;
                    ++(*itptr);
                    return copy->second;
                }
                return nullptr;
            },
            // consumer: records the finished hash in its file size bucket and updates the progress bar
            [this, bpptr, filesizesptr](Hash&& h, std::shared_ptr<File> f) -> void {
                bpptr->update(1, f->filesize);
                Files& files = getFiles(f->filesize, *filesizesptr);
                files.addNewFile(f, std::move(h));
                std::cout << *bpptr;
            }
        ));
    }
    // start all threads
    std::for_each(threads.begin(), threads.end(), [](HasherThread* ht){
        ht->start();
    });
    // wait for all threads
    std::for_each(threads.begin(), threads.end(), [](HasherThread* ht){
        ht->waitfor();
    });
    // free the finished threads
    std::for_each(threads.begin(), threads.end(), [](HasherThread* ht){
        delete ht;
    });
    std::cout << "\n";
    Log::info << "Hashing done after: " << (int) (progressBar.getDuration().count() / 1000) << "s";
    Log::info << filesizes.size() << " distinct file sizes hashed";
    return filesizes;
}
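// Walks every file size bucket, logs the duplicate groups and replaces every copy after the
// first ("base") file with a link to it via relinkFile().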
void Dedup::resolveDuplicates(std::map<uint64_t, Files>& hashed) {
    for(auto it = hashed.begin(); it != hashed.end(); ++it) {
        if(!it->second.hasDuplicates()) continue;
        Log::info << it->second.getFileCount() << " files with a size of " << FileSize(it->first) << " found";
        std::vector<MergeFile>& merges = it->second.getFiles();
        for(MergeFile& mf : merges) {
            mf.updateMainFile();
            auto files = mf.getFiles();
            if(files.size() == 1) continue;
            Log::info << "\t" << mf.getHash() << " found at " << files.size() << " places:";
            for(uint32_t i = 0; i < files.size(); ++i) {
                const std::shared_ptr<File>& f = files.at(i);
                Log::note << "\t\t" << f->linkcount << ": " << f->path;
                // skip the first "base" file and don't link empty files
                if(i > 0 && it->first > 0) {
                    if(!relinkFile(files.at(0)->path, f->path)) {
                        break;
                    }
                    deduplicatedBytes += f->filesize;
                    ++deduplicatedFiles;
                }
            }
        }
    }
}
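// Replaces 'replacedfile' with a hard link to 'linkbase'.
// In dry-run mode (or when hard links are disabled) the action is only logged.
// Returns false if unlinking or linking fails.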
bool Dedup::relinkFile(const std::string& linkbase, const std::string& replacedfile) {
    // symlinks are not implemented
    if(dryrun || !hardlink) {
        Log::note << "delete " << replacedfile;
        Log::note << "link " << linkbase << " -> " << replacedfile;
    } else {
        int res = ::unlink(replacedfile.c_str());
        if(res != 0) {
            Log::error << "unlink(" << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
            return false;
        }
        res = ::link(linkbase.c_str(), replacedfile.c_str());
        if(res != 0) {
            Log::error << "link(" << linkbase << ", " << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
            // TODO try to symlink?
            return false;
        }
    }
    return true;
}