#include "dedup.h" #include #include #include #include #include #include #include #include #include #include #include "progressbar.h" #include "hasherthread.h" const uint_fast8_t Dedup::HASHTHREADCOUNT = 4; Dedup::Dedup() { } Dedup::~Dedup() { } void Dedup::addSearchFolder(const std::string& path, bool recurse) { folders.emplace_back(path, recurse); } Files& getFiles(uint64_t fs, std::map& m) { auto it = m.find(fs); if(it == m.end()) { auto pair = m.insert({fs, {}}); return pair.first->second; } return it->second; } void Dedup::start() { FileIndexer::map_t foundfiles; indexer.setIgnoreDotFiles(ignoredotfiles); indexer.setIgnoreInodeIDs(ignoreInodeID); for(const SearchFolder& sf : folders) { foundfiles.merge(indexer.index(sf)); } uint64_t accFileSize = std::accumulate(foundfiles.begin(), foundfiles.end(), 0ul, [](uint64_t val, auto r){ return val + r.second->filesize; }); Log::info << "indexing done " << foundfiles.size() << " unique files found with a total file size of " << FileSize(accFileSize); // remove uninteresting files removeUninterestingFiles(foundfiles); // hashing uint64_t bytesToHash = std::accumulate(foundfiles.begin(), foundfiles.end(), 0ul, [](uint64_t& c, auto it) { return c + it.second->filesize; }); Log::info << foundfiles.size() << " files and " << FileSize(bytesToHash) << " are going to be hashed"; // hash files std::map filesizes = hash(foundfiles, bytesToHash); // resolve resolveDuplicates(filesizes); Log::info << "done. Deduplicated: " << deduplicatedFiles << " files, saving: " << FileSize(deduplicatedBytes); } void Dedup::removeUninterestingFiles(std::multimap>& files) { // remove all files with a size of 0 files.erase( (const uint64_t& ) 0 ); // remove all files, where they are the only one with that size (there cant be any duplicate so no need to hash them) for(auto it = files.begin(); it != files.end(); ) { auto upper = files.upper_bound(it->first); auto next = it; ++next; // only one file with this filesize if(next == upper) { files.erase(it); } it = upper; } } std::map Dedup::hash(std::multimap>& foundfiles, uint64_t bytesToHash) { std::map filesizes; ProgressBar progressBar(foundfiles.size(), bytesToHash); uint64_t stepsize = foundfiles.size() / HASHTHREADCOUNT; ProgressBar* bpptr = &progressBar; std::map* filesizesptr = &filesizes; std::mutex globalData; std::vector threads; std::vector>::iterator> threadends; std::vector>::iterator> threadits; threads.reserve(HASHTHREADCOUNT); threadends.resize(HASHTHREADCOUNT); threadits.resize(HASHTHREADCOUNT); // create threads Log::info << "spawning " << (int) HASHTHREADCOUNT << " hashing threads"; threadits.at(0) = foundfiles.begin(); for(uint_fast8_t i = 1; i < HASHTHREADCOUNT; ++i) { // make a copy! 
        std::multimap<uint64_t, std::shared_ptr<File>>::iterator it = threadits.at(i-1);
        std::advance(it, stepsize);
        threadits.at(i) = it;
        threadends.at(i-1) = it;
    }
    // make sure the last range really ends at the end of the map
    threadends.at(HASHTHREADCOUNT-1) = foundfiles.end();

    for(uint_fast8_t i = 0; i < HASHTHREADCOUNT; ++i) {
        std::multimap<uint64_t, std::shared_ptr<File>>::iterator* itptr = &(threadits.at(i));
        const std::multimap<uint64_t, std::shared_ptr<File>>::iterator* endptr = &(threadends.at(i));
        threads.push_back(new HasherThread(
            globalData,
            // producer: hand out the next file of this thread's range, nullptr once the range is exhausted
            [itptr, endptr]() -> std::shared_ptr<File> {
                if(*itptr != *endptr) {
                    std::shared_ptr<File> value = (*itptr)->second;
                    ++(*itptr);
                    return value;
                }
                return nullptr;
            },
            // consumer: store the finished hash and advance the progress bar
            [this, bpptr, filesizesptr](Hash&& h, std::shared_ptr<File> f) -> void {
                bpptr->update(1, f->filesize);
                Files& files = getFiles(f->filesize, *filesizesptr);
                files.addNewFile(f, std::move(h));
                std::cout << *bpptr;
            }
        ));
    }

    // start all threads
    std::for_each(threads.begin(), threads.end(), [](HasherThread* ht){ ht->start(); });
    // wait for all threads
    std::for_each(threads.begin(), threads.end(), [](HasherThread* ht){ ht->waitfor(); });
    std::cout << "\n";
    Log::info << "Hashing done after: " << (int) (progressBar.getDuration().count() / 1000) << "s";
    Log::info << filesizes.size() << " distinct file sizes hashed";
    return filesizes;
}

void Dedup::resolveDuplicates(std::map<uint64_t, Files>& hashed) {
    for(auto it = hashed.begin(); it != hashed.end(); ++it) {
        if(!it->second.hasDuplicates()) continue;

        Log::info << it->second.getFileCount() << " files of size " << FileSize(it->first) << " found";
        std::vector<MergeFile>& merges = it->second.getFiles();
        for(MergeFile& mf : merges) {
            mf.updateMainFile();
            auto files = mf.getFiles();
            if(files.size() == 1) continue;

            Log::info << "\t" << mf.getHash() << " found in " << files.size() << " places:";
            for(uint32_t i = 0; i < files.size(); ++i) {
                const std::shared_ptr<File>& f = files.at(i);
                Log::note << "\t\t" << f->linkcount << ": " << f->path;
                // skip the first "base" file and don't link empty files
                if(i > 0 && it->first > 0) {
                    if(!relinkFile(files.at(0)->path, f->path)) {
                        break;
                    }
                    deduplicatedBytes += f->filesize;
                    ++deduplicatedFiles;
                }
            }
        }
    }
}

// replace 'replacedfile' with a hard link to 'linkbase' (or only delete it when deleteDuplicates is set)
bool Dedup::relinkFile(const std::string& linkbase, const std::string& replacedfile) {
    // symlinks are not implemented
    if(dryrun || !hardlink) {
        Log::note << "delete " << replacedfile;
        if(!deleteDuplicates) {
            Log::note << "link " << linkbase << " -> " << replacedfile;
        }
    } else {
        int res = ::unlink(replacedfile.c_str());
        if(res != 0) {
            Log::error << "unlink(" << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
            return false;
        }
        if(!deleteDuplicates) {
            res = ::link(linkbase.c_str(), replacedfile.c_str());
            if(res != 0) {
                Log::error << "link(" << linkbase << ", " << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
                // TODO try to symlink?
                return false;
            }
        }
    }
    return true;
}