#include "dedup.h"

// NOTE(review): the original include list was mangled (header names were lost
// in extraction). The standard headers below are reconstructed from actual
// usage in this file — verify against the project's build.
#include <cerrno>    // errno
#include <cstdint>   // uint64_t, uint32_t
#include <cstring>   // strerror
#include <iostream>  // std::cout
#include <map>
#include <memory>    // std::shared_ptr
#include <numeric>   // std::accumulate
#include <string>
#include <vector>

#include <unistd.h>  // ::unlink, ::link

#include "progressbar.h"

Dedup::Dedup() {
}

Dedup::~Dedup() {
}

/// Register a folder that will be scanned for duplicates.
/// @param path    directory to scan
/// @param recurse whether to descend into subdirectories
void Dedup::addSearchFolder(const std::string& path, bool recurse) {
	folders.emplace_back(path, recurse);
}

/// Return the Files bucket for the given file size, creating an empty bucket
/// on first use.
// NOTE(review): the map's template arguments were lost in the original dump;
// <uint64_t, Files> is reconstructed from the call sites — confirm with dedup.h.
Files& getFiles(uint64_t fs, std::map<uint64_t, Files>& m) {
	// try_emplace only default-constructs a Files when the key is absent
	// (the file already relies on C++17: std::map::merge in start()).
	return m.try_emplace(fs).first->second;
}

/// Run the whole deduplication pipeline: index the registered folders,
/// discard files that cannot have a duplicate, hash the remainder and
/// finally hard-link identical files together.
void Dedup::start() {
	// index all search folders into one size -> file multimap
	FileIndexer::map_t foundfiles;
	for(const SearchFolder& sf : folders) {
		foundfiles.merge(indexer.index(sf));
	}

	// seed with uint64_t{0}: a plain 0ul would make accumulate sum in
	// "unsigned long", which is 32 bit on some platforms
	uint64_t accFileSize = std::accumulate(foundfiles.begin(), foundfiles.end(), uint64_t{0},
		[](uint64_t sum, const auto& entry) { return sum + entry.second->filesize; });
	Log::info << "indexing done " << foundfiles.size()
		<< " unique files found with a total file size of " << FileSize(accFileSize);

	// drop empty files and sizes that occur only once
	removeUninterestingFiles(foundfiles);

	uint64_t bytesToHash = std::accumulate(foundfiles.begin(), foundfiles.end(), uint64_t{0},
		[](uint64_t sum, const auto& entry) { return sum + entry.second->filesize; });
	Log::info << foundfiles.size() << " files and " << FileSize(bytesToHash)
		<< " are going to be hashed";

	// hash files, grouping them by size and content hash
	std::map<uint64_t, Files> filesizes = hash(foundfiles, bytesToHash);

	// replace duplicates by hard links
	resolveDuplicates(filesizes);
	Log::info << "done. Deduplicated: " << deduplicatedFiles
		<< " files, saving: " << FileSize(deduplicatedBytes);
}

/// Remove entries that cannot possibly have a duplicate so they are never
/// hashed: empty files and files whose size is unique in the index.
void Dedup::removeUninterestingFiles(std::multimap<uint64_t, std::shared_ptr<File>>& files) {
	// remove all files with a size of 0
	files.erase(uint64_t{0});

	// remove all files that are the only one with their size
	// (there cannot be any duplicate, so no need to hash them)
	for(auto it = files.begin(); it != files.end(); ) {
		auto upper = files.upper_bound(it->first);
		auto next = it;
		++next;
		// only one file with this filesize
		if(next == upper) {
			files.erase(it);
		}
		it = upper;
	}
}

/// Hash every remaining file and bucket the results by file size.
/// @param foundfiles size -> file candidates that may have duplicates
/// @param bytesToHash total payload, used to scale the progress bar
/// @return size -> Files, each Files grouping identical hashes together
std::map<uint64_t, Files> Dedup::hash(std::multimap<uint64_t, std::shared_ptr<File>>& foundfiles,
		uint64_t bytesToHash) {
	std::map<uint64_t, Files> filesizes;
	ProgressBar progressBar(foundfiles.size(), bytesToHash);

	for(auto& [filesize, file] : foundfiles) {
		Hash hash = file->createHash();
		progressBar.update(1, file->filesize);
		getFiles(filesize, filesizes).addNewFile(file, std::move(hash));
		std::cout << progressBar;
	}
	std::cout << "\n";
	Log::info << "Hashing done after: "
		<< (int) (progressBar.getDuration().count() / 1000) << "s ";
	return filesizes;
}

/// Walk the hashed buckets and hard-link every duplicate onto its first
/// ("base") file, accounting the saved bytes/files in the members.
void Dedup::resolveDuplicates(std::map<uint64_t, Files>& hashed) {
	for(auto it = hashed.begin(); it != hashed.end(); ++it) {
		if(!it->second.hasDuplicates()) continue;

		Log::info << it->second.getFileCount() << " files with "
			<< FileSize(it->first) << " found";
		std::vector<MergeFile>& merges = it->second.getFiles();
		for(MergeFile& mf : merges) {
			mf.updateMainFile();
			auto files = mf.getFiles();
			if(files.size() == 1) continue;

			Log::info << "\t" << mf.getHash() << " at " << files.size() << " places found:";
			for(uint32_t i = 0; i < files.size(); ++i) {
				const std::shared_ptr<File>& f = files.at(i);
				Log::note << "\t\t" << f->linkcount << ": " << f->path;

				// skip the first "base" file and dont link empty files
				if(i > 0 && it->first > 0) {
					// on failure stop processing this hash group; later
					// groups are still attempted
					if(!relinkFile(files.at(0)->path, f->path)) {
						break;
					}
					deduplicatedBytes += f->filesize;
					++deduplicatedFiles;
				}
			}
		}
	}
}

/// Replace `replacedfile` by a hard link to `linkbase`.
/// In dry-run mode (or when hard links are disabled) only logs what would
/// happen. Returns false when unlink() or link() fails; the old file may
/// already be gone at that point.
bool Dedup::relinkFile(const std::string& linkbase, const std::string& replacedfile) {
	// symlinks are not implemented
	if(dryrun || !hardlink) {
		Log::note << "delete " << replacedfile;
		Log::note << "link " << linkbase << " -> " << replacedfile;
	} else {
		int res = ::unlink(replacedfile.c_str());
		if(res != 0) {
			Log::error << "unlink(" << replacedfile << ") failed: "
				<< strerror(errno) << " (" << errno << ')';
			return false;
		}

		res = ::link(linkbase.c_str(), replacedfile.c_str());
		if(res != 0) {
			Log::error << "link(" << linkbase << ", " << replacedfile << ") failed: "
				<< strerror(errno) << " (" << errno << ')';
			// TODO try to symlink?
			return false;
		}
	}
	return true;
}