From 4caabf923a5f281dbc31bccede3477708ec63d8e Mon Sep 17 00:00:00 2001
From: mrbesen
Date: Sun, 23 Oct 2022 00:48:15 +0200
Subject: [PATCH] first index, then hash, then solve; also add progressbar

---
 inc/dedup.h         |  19 ++--
 inc/file.h          |   4 +-
 inc/fileindexer.h   |  25 ++++++
 inc/files.h         |   8 +-
 inc/mergefile.h     |   7 ++
 inc/progressbar.h   |  29 ++++++
 src/dedup.cpp       | 215 ++++++++++++++++++++++++++------------------
 src/file.cpp        |   2 +-
 src/fileindexer.cpp |  86 ++++++++++++++++++
 src/files.cpp       |  25 +++++-
 src/hash.cpp        |   4 +
 src/main.cpp        |  10 ++-
 src/mergefiles.cpp  |  34 +++++++
 src/progressbar.cpp |  44 +++++++
 14 files changed, 406 insertions(+), 106 deletions(-)
 create mode 100644 inc/fileindexer.h
 create mode 100644 inc/progressbar.h
 create mode 100644 src/fileindexer.cpp
 create mode 100644 src/progressbar.cpp

diff --git a/inc/dedup.h b/inc/dedup.h
index 73bf7c4..01c0b6b 100644
--- a/inc/dedup.h
+++ b/inc/dedup.h
@@ -6,6 +6,7 @@
 #include <vector>
 
 #include "file.h"
+#include "fileindexer.h"
 #include "files.h"
 #include "searchfolder.h"
@@ -19,21 +20,19 @@ public:
 	void start();
 
 private:
-	void indexFolder(const SearchFolder& sf);
-	void handleFolderContent(dirent* dircont, const SearchFolder& sf);
-	void handleNewFile(ino_t inode, const std::string& path);
-	Files& getFiles(uint64_t fs);
+	void removeUninterestingFiles(std::multimap<uint64_t, std::shared_ptr<File>>& files);
 
-	// all files that were found
-	std::map<uint64_t, Files> filesizes;
+	std::map<uint64_t, Files> hash(std::multimap<uint64_t, std::shared_ptr<File>>& files, uint64_t bytestohash);
 
-	// files that were found, ordered by inode (to detect already deduplicated files)
-	std::map<ino_t, std::shared_ptr<File>> inodes;
+	void resolveDuplicates(std::map<uint64_t, Files>& hashed);
+	bool relinkFile(const std::string& linkbase, const std::string& replacedfile);
 
-	// folders that should be scanned
 	std::vector<SearchFolder> folders;
+	FileIndexer indexer;
 
 	bool hardlink = true;
 	bool ignoredotfiles = false;
+	bool dryrun = true;
 
-	uint64_t deduplicated = 0; // amount of bytes deduplicated
+	uint64_t deduplicatedBytes = 0; // amount of bytes deduplicated
+	uint64_t deduplicatedFiles = 0;
 };
diff --git a/inc/file.h b/inc/file.h
index 379ea62..109cd0b 100644
--- a/inc/file.h
+++ b/inc/file.h
@@ -13,11 +13,11 @@ struct FileSize {
 
 class File {
 public:
-	File(FileSize, uint64_t inode, uint64_t linkcount, const std::string& path);
+	File(uint64_t, uint64_t inode, uint64_t linkcount, const std::string& path);
 
 	Hash createHash();
 
-	FileSize filesize;
+	uint64_t filesize;
 	uint64_t inode;
 	uint64_t linkcount;
 	std::string path;
diff --git a/inc/fileindexer.h b/inc/fileindexer.h
new file mode 100644
index 0000000..0ac040e
--- /dev/null
+++ b/inc/fileindexer.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <dirent.h>
+#include <sys/types.h>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+
+#include "searchfolder.h"
+#include "file.h"
+
+class FileIndexer {
+public:
+
+	using map_t = std::multimap<uint64_t, std::shared_ptr<File>>;
+	map_t index(const SearchFolder& sf);
+private:
+	void handleFolderContent(dirent* dircont, const SearchFolder& sf, std::list<SearchFolder>& newfolders, map_t& newfiles);
+	void handleNewFile(ino_t inode, const std::string& path, map_t& newfiles);
+
+	std::set<ino_t> knownInodes; // file inodes that are already known and should not be indexed again
+	bool ignoredotfiles = false;
+};
diff --git a/inc/files.h b/inc/files.h
index b1c6828..a3e54f0 100644
--- a/inc/files.h
+++ b/inc/files.h
@@ -9,8 +9,14 @@ class Files {
 public:
 	Files();
 
-	void addNewFile(std::shared_ptr<File> file);
+	void addNewFile(std::shared_ptr<File> file, Hash hash);
 
+	// check if at least one file can be deduplicated
+	bool hasDuplicates() const;
+
+	uint32_t getFileCount() const;
+	std::vector<MergeFile>& getFiles();
+	const std::vector<MergeFile>& getFiles() const;
 private:
 	std::vector<MergeFile> files;
 };
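Note (illustration only, not part of the patch): FileIndexer::map_t above is a std::multimap keyed by file size, so all duplicate candidates of one size form a contiguous key range, and upper_bound() jumps from one size group to the next. A minimal standalone sketch of that idiom, with std::string standing in for std::shared_ptr<File>:

#include <cstdint>
#include <iostream>
#include <iterator>
#include <map>
#include <string>

int main() {
	// size -> path, shaped like FileIndexer::map_t (size -> std::shared_ptr<File>)
	std::multimap<uint64_t, std::string> bySize {
		{4096, "a.bin"}, {4096, "b.bin"}, {123, "unique.txt"}
	};

	for(auto it = bySize.begin(); it != bySize.end(); ) {
		auto upper = bySize.upper_bound(it->first); // first entry past this size group
		auto count = std::distance(it, upper);

		std::cout << it->first << " bytes: " << count
		          << (count == 1 ? " file -> nothing to compare\n"
		                         : " files -> hash candidates\n");
		it = upper; // jump to the next size group
	}
}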
diff --git a/inc/mergefile.h b/inc/mergefile.h
index 3344985..22882f0 100644
--- a/inc/mergefile.h
+++ b/inc/mergefile.h
@@ -15,6 +15,13 @@ public:
 
 	// try to add a file, returns true on success
 	bool addFile(std::shared_ptr<File> f, const Hash& h);
+
+	std::vector<std::shared_ptr<File>>& getFiles();
+	const std::vector<std::shared_ptr<File>>& getFiles() const;
+
+	// make sure the main file, which should be the base for the other files, is stored at the beginning of the vector
+	void updateMainFile();
+
 private:
 	std::vector<std::shared_ptr<File>> files;
 	Hash hash;
diff --git a/inc/progressbar.h b/inc/progressbar.h
new file mode 100644
index 0000000..1d8dd66
--- /dev/null
+++ b/inc/progressbar.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <chrono>
+#include <cstdint>
+#include <ostream>
+
+class ProgressBar {
+public:
+	ProgressBar(uint64_t files, uint64_t bytes);
+
+	void update(uint64_t addFiles, uint64_t newBytes);
+
+	std::chrono::milliseconds getDuration() const;
+
+private:
+	static const uint_fast8_t BARLENGTH;
+
+	uint64_t maxFiles;
+	uint64_t maxBytes;
+
+	uint64_t currentFiles = 0;
+	uint64_t currentBytes = 0;
+
+	std::chrono::time_point<std::chrono::high_resolution_clock> start = std::chrono::high_resolution_clock::now();
+
+	friend std::ostream& operator<<(std::ostream& str, const ProgressBar&);
+};
+
+std::ostream& operator<<(std::ostream& str, const ProgressBar&);
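Note (illustration only, not part of the patch): a hypothetical driver for the ProgressBar declared above; it relies only on the constructor, update() and operator<< from this header. The bar repaints in place because operator<< ends its output with '\r':

#include <iostream>

#include "progressbar.h"

int main() {
	ProgressBar bar(3, 3000); // totals: 3 files, 3000 bytes

	for(int i = 0; i < 3; ++i) {
		// ... hash one 1000-byte file here ...
		bar.update(1, 1000); // one more file done, 1000 more bytes done
		std::cout << bar;    // repaints in place: the bar ends in '\r'
	}
	std::cout << '\n';
}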
diff --git a/src/dedup.cpp b/src/dedup.cpp
index f7ea89c..3c06019 100644
--- a/src/dedup.cpp
+++ b/src/dedup.cpp
@@ -1,11 +1,17 @@
 #include "dedup.h"
 
+#include <cerrno>
 #include <cstring>
+#include <iostream>
 #include <numeric>
 #include <dirent.h>
+#include <unistd.h>
+#include <utility>
 #include <sys/stat.h>
 
+#include "progressbar.h"
+
 Dedup::Dedup() {
 
 }
@@ -16,94 +22,129 @@
 void Dedup::addSearchFolder(const std::string& path, bool recurse) {
 	folders.emplace_back(path, recurse);
 }
 
-void Dedup::start() {
-	for(uint32_t i = 0; i < folders.size(); ++i) {
-		SearchFolder sf = folders.at(i);
-
-		indexFolder(sf);
-	}
-
-	uint64_t accFileSize = std::accumulate(inodes.begin(), inodes.end(), 0ul, [](uint64_t val, const auto& r){ return val + r.second->filesize.fs; });
-
-	Log::info << "indexing done " << inodes.size() << " unique files found with a total file size of " << FileSize(accFileSize);
-
-
-}
-
-void Dedup::indexFolder(const SearchFolder& sf) {
-	DIR* dirptr = ::opendir(sf.path.c_str());
-	if(!dirptr) {
-		Log::error << "Could not open directory: " << sf.path;
-		return;
-	}
-
-	errno = 0;
-	while(dirent* dircont = ::readdir(dirptr)) {
-		handleFolderContent(dircont, sf);
-	}
-
-	if(errno != 0) {
-		Log::error << "failed to read directory: " << sf.path;
-	}
-
-	::closedir(dirptr);
-}
-
-void Dedup::handleFolderContent(dirent* dircont, const SearchFolder& sf) {
-	std::string name(dircont->d_name);
-	std::string filepath = sf.path + "/" + name;
-
-	// handle dotfiles
-	if(name == "." || name == ".." || (ignoredotfiles && name.at(0) == '.')) {
-		return;
-	}
-
-	// handle subfolders
-	if(dircont->d_type == DT_DIR) {
-		if(sf.recurse) {
-			folders.emplace_back(filepath, true);
-			Log::note << "found new subdir to index: " << filepath;
-			return;
-		}
-	}
-
-	// regular file
-	if(dircont->d_type == DT_REG) {
-		handleNewFile(dircont->d_ino, filepath);
-	}
-}
-
-void Dedup::handleNewFile(ino_t inode, const std::string& path) {
-	// check for already scanned inodes
-	auto it = inodes.find(inode);
-	if(it != inodes.end()) {
-		Log::note << "found already detected file: " << inode << ' ' << path;
-		return;
-	}
-
-	struct stat statbuf;
-	int res = stat(path.c_str(), &statbuf);
-	if(res == -1) {
-		Log::error << "stat failed: " << path << " error: " << strerror(errno) << " (" << errno << ')';
-		return;
-	}
-
-	loff_t fileSize = statbuf.st_size;
-	nlink_t linkCount = statbuf.st_nlink;
-
-	auto file = std::make_shared<File>(fileSize, inode, linkCount, path);
-
-	inodes.insert({inode, file});
-	Files& files = getFiles(fileSize);
-
-	files.addNewFile(file);
-}
-
-Files& Dedup::getFiles(uint64_t fs) {
-	auto it = filesizes.find(fs);
-	if(it == filesizes.end()) {
-		auto pair = filesizes.insert({fs, {}});
+Files& getFiles(uint64_t fs, std::map<uint64_t, Files>& m) {
+	auto it = m.find(fs);
+	if(it == m.end()) {
+		auto pair = m.insert({fs, {}});
 		return pair.first->second;
 	}
 	return it->second;
 }
+
+void Dedup::start() {
+	FileIndexer::map_t foundfiles;
+	for(const SearchFolder& sf : folders) {
+		// insert instead of assign, so results from earlier folders are not thrown away
+		FileIndexer::map_t indexed = indexer.index(sf);
+		foundfiles.insert(indexed.begin(), indexed.end());
+	}
+
+	uint64_t accFileSize = std::accumulate(foundfiles.begin(), foundfiles.end(), 0ul, [](uint64_t val, const auto& r){ return val + r.second->filesize; });
+
+	Log::info << "indexing done " << foundfiles.size() << " unique files found with a total file size of " << FileSize(accFileSize);
+
+	// remove uninteresting files
+	removeUninterestingFiles(foundfiles);
+
+	// hashing
+	uint64_t bytesToHash = std::accumulate(foundfiles.begin(), foundfiles.end(), 0ul, [](uint64_t c, const std::pair<const uint64_t, std::shared_ptr<File>>& it) { return c + it.second->filesize; });
+
+	Log::info << foundfiles.size() << " files and " << FileSize(bytesToHash) << " are going to be hashed";
+
+	// hash files
+	std::map<uint64_t, Files> filesizes = hash(foundfiles, bytesToHash);
+
+	// resolve
+	resolveDuplicates(filesizes);
+
+	Log::info << "done. Deduplicated: " << deduplicatedFiles << " files, saving: " << FileSize(deduplicatedBytes);
+}
+
+void Dedup::removeUninterestingFiles(std::multimap<uint64_t, std::shared_ptr<File>>& files) {
+	// remove all files with a size of 0
+	files.erase(uint64_t(0));
+
+	// remove all files that are the only one with their file size
+	for(auto it = files.begin(); it != files.end(); ) {
+		auto upper = files.upper_bound(it->first);
+		auto next = it;
+		++next;
+
+		// only one file with this filesize
+		if(next == upper) {
+			files.erase(it);
+		}
+
+		it = upper;
+	}
+}
+
+std::map<uint64_t, Files> Dedup::hash(std::multimap<uint64_t, std::shared_ptr<File>>& foundfiles, uint64_t bytesToHash) {
+	std::map<uint64_t, Files> filesizes;
+
+	ProgressBar progressBar(foundfiles.size(), bytesToHash);
+	for(auto it = foundfiles.begin(); it != foundfiles.end(); ++it) {
+		Hash hash = it->second->createHash();
+		progressBar.update(1, it->second->filesize);
+
+		Files& files = getFiles(it->first, filesizes);
+		files.addNewFile(it->second, std::move(hash));
+
+		std::cout << progressBar;
+	}
+
+	Log::info << "Hashing done after: " << (int) (progressBar.getDuration().count() / 1000) << "s ";
+	return filesizes;
+}
+
+void Dedup::resolveDuplicates(std::map<uint64_t, Files>& hashed) {
+	for(auto it = hashed.begin(); it != hashed.end(); ++it) {
+		if(!it->second.hasDuplicates()) continue;
+
+		Log::info << it->second.getFileCount() << " files with " << FileSize(it->first) << " found";
+
+		std::vector<MergeFile>& merges = it->second.getFiles();
+
+		for(MergeFile& mf : merges) {
+			mf.updateMainFile();
+
+			auto& files = mf.getFiles();
+			if(files.size() == 1) continue;
+
+			Log::info << "\t" << mf.getHash() << " at " << files.size() << " places found:";
+
+			for(uint32_t i = 0; i < files.size(); ++i) {
+				const std::shared_ptr<File>& f = files.at(i);
+				Log::note << "\t\t" << f->linkcount << ": " << f->path;
+
+				// skip the first "base" file and don't link empty files
+				if(i > 0 && it->first > 0) {
+					if(!relinkFile(files.at(0)->path, f->path)) {
+						break;
+					}
+					deduplicatedBytes += f->filesize;
+					++deduplicatedFiles;
+				}
+			}
+		}
+	}
+}
+
+bool Dedup::relinkFile(const std::string& linkbase, const std::string& replacedfile) {
+	// symlinks are not implemented
+
+	if(dryrun || !hardlink) {
+		Log::note << "delete " << replacedfile;
+		Log::note << "link " << linkbase << " -> " << replacedfile;
+	} else {
+		int res = ::unlink(replacedfile.c_str());
+		if(res != 0) {
+			Log::error << "unlink(" << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
+			return false;
+		}
+
+		res = ::link(linkbase.c_str(), replacedfile.c_str());
+		if(res != 0) {
+			Log::error << "link(" << linkbase << ", " << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
+			// TODO try to symlink?
+			return false;
+		}
+	}
+	return true;
+}
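Note (illustration only, not part of the patch): relinkFile() above deduplicates by replacing a copy with a hard link, after which both paths refer to the same inode and stat() reports a higher st_nlink. A standalone POSIX sketch of that mechanism, with made-up file names:

#include <cstdio>
#include <fstream>

#include <sys/stat.h>
#include <unistd.h>

int main() {
	std::ofstream("base.txt") << "payload";
	std::ofstream("copy.txt") << "payload"; // byte-identical duplicate

	// the non-dry-run branch of relinkFile():
	::unlink("copy.txt");
	::link("base.txt", "copy.txt");

	struct stat st {};
	::stat("copy.txt", &st);
	std::printf("st_nlink: %ld\n", (long) st.st_nlink); // 2: both paths share one inode
}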
Deduplicated: " << deduplicatedFiles << " files, saving: " << FileSize(deduplicatedBytes); +} + +void Dedup::removeUninterestingFiles(std::multimap>& files) { + // remove all files with a size of 0 + files.erase( (const uint64_t& ) 0 ); + + // remove all files, where they are the only + for(auto it = files.begin(); it != files.end(); ) { + auto upper = files.upper_bound(it->first); + auto next = ++it; + + // only one file with this filesize + if(next == upper) { + files.erase(it); + } + + it = upper; + } +} + +std::map Dedup::hash(std::multimap>& foundfiles, uint64_t bytesToHash) { + std::map filesizes; + + ProgressBar progressBar(foundfiles.size(), bytesToHash); + for(auto it = foundfiles.begin(); it != foundfiles.end(); ++it) { + Hash hash = it->second->createHash(); + progressBar.update(1, it->second->filesize); + + Files& files = getFiles(it->first, filesizes); + files.addNewFile(it->second, std::move(hash)); + + std::cout << progressBar; + } + + Log::info << "Hashing done after: " << (int) (progressBar.getDuration().count() / 1000) << "s "; + return filesizes; +} + +void Dedup::resolveDuplicates(std::map& hashed) { + for(auto it = hashed.begin(); it != hashed.end(); ++it) { + if(!it->second.hasDuplicates()) continue; + + Log::info << it->second.getFileCount() << " files with " << FileSize(it->first) << " found"; + + std::vector& merges = it->second.getFiles(); + + for(MergeFile& mf : merges) { + mf.updateMainFile(); + + auto files = mf.getFiles(); + if(files.size() == 1) continue; + + Log::info << "\t" << mf.getHash() << " at " << files.size() << " places found:"; + + for(uint32_t i = 0; i < files.size(); ++i) { + const std::shared_ptr& f = files.at(i); + Log::note << "\t\t" << f->linkcount << ": " << f->path; + + // skip the first "base" file and dont link empty files + if(i > 0 && it->first > 0) { + if(!relinkFile(files.at(0)->path, f->path)) { + break; + } + deduplicatedBytes += f->filesize; + ++deduplicatedFiles; + } + } + } + } +} + +bool Dedup::relinkFile(const std::string& linkbase, const std::string& replacedfile) { + // symlinks are not implemented + + if(dryrun || !hardlink) { + Log::note << "delete " << replacedfile; + Log::note << "link " << linkbase << " -> " << replacedfile; + } else { + int res = ::unlink(replacedfile.c_str()); + if(res != 0) { + Log::error << "unlink(" << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')'; + return false; + } + + res = ::link(linkbase.c_str(), replacedfile.c_str()); + if(res != 0) { + Log::error << "link(" << linkbase << ", " << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')'; + // TODO try to symlink? 
diff --git a/src/files.cpp b/src/files.cpp
index 28ef35d..0399666 100644
--- a/src/files.cpp
+++ b/src/files.cpp
@@ -3,9 +3,7 @@ Files::Files() {
 
 }
 
-void Files::addNewFile(std::shared_ptr<File> file) {
-	Hash hash = file->createHash();
-
+void Files::addNewFile(std::shared_ptr<File> file, Hash hash) {
 	// find first matching file
 	for(uint32_t i = 0; i < files.size(); ++i) {
 		MergeFile& mf = files.at(i);
@@ -19,3 +17,24 @@
 	// no suitable mergefile found
 	files.emplace_back(file, std::move(hash));
 }
+
+bool Files::hasDuplicates() const {
+	for(const MergeFile& mf : files) {
+		if(mf.getFiles().size() > 1) {
+			return true;
+		}
+	}
+	return false;
+}
+
+uint32_t Files::getFileCount() const {
+	return files.size();
+}
+
+std::vector<MergeFile>& Files::getFiles() {
+	return files;
+}
+
+const std::vector<MergeFile>& Files::getFiles() const {
+	return files;
+}
diff --git a/src/hash.cpp b/src/hash.cpp
index 72123f9..45998d5 100644
--- a/src/hash.cpp
+++ b/src/hash.cpp
@@ -112,6 +112,10 @@ bool Hash::operator!=(const Hash& rhs) const {
 
 std::ostream& operator<<(std::ostream& str, const Hash& rhs) {
+	if(!rhs.data) {
+		return str << "[empty hash]";
+	}
+
 	str << std::hex << std::setfill('0');
 
 	for(size_t i = 0; i < HASHSIZE; ++i) {
diff --git a/src/main.cpp b/src/main.cpp
index 4279c7d..579461d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -10,12 +10,18 @@ int main(int argc, const char** argv) {
 #if __unix__
 	Log::setColoredOutput(true);
 #endif
-	Log::info << "Hello, World!";
+	if(argc < 2) {
+		Log::fatal << "Usage: " << argv[0] << " <path> [path2] [...]";
+		return -1;
+	}
 
 	Dedup dedup;
-	dedup.addSearchFolder(".");
+	for(int i = 1; i < argc; ++i) {
+		dedup.addSearchFolder(argv[i]);
+	}
 
 	dedup.start();
diff --git a/src/mergefiles.cpp b/src/mergefiles.cpp
index 7e5ffa0..d31caee 100644
--- a/src/mergefiles.cpp
+++ b/src/mergefiles.cpp
@@ -1,5 +1,7 @@
 #include "mergefile.h"
 
+#include <algorithm>
+
 MergeFile::MergeFile(std::shared_ptr<File> file, Hash&& h) : hash(std::move(h)) {
 	files.emplace_back(file);
 }
@@ -15,3 +17,35 @@
 	}
 	return false;
 }
+
+std::vector<std::shared_ptr<File>>& MergeFile::getFiles() {
+	return files;
+}
+
+const std::vector<std::shared_ptr<File>>& MergeFile::getFiles() const {
+	return files;
+}
+
+void MergeFile::updateMainFile() {
+	// count the files with more than one link
+	uint32_t complexFiles = std::count_if(files.begin(), files.end(), [](const std::shared_ptr<File>& f) {
+		return f->linkcount > 1;
+	});
+
+	if(complexFiles > 1) {
+		// move the file with the most hardlinks to the beginning
+		uint32_t maxlinkcount = 0;
+		uint32_t maxlinkcountoffset = 0;
+		for(uint32_t i = 0; i < files.size(); ++i) {
+			if(files.at(i)->linkcount > maxlinkcount) {
+				maxlinkcount = files.at(i)->linkcount;
+				maxlinkcountoffset = i;
+			}
+		}
+
+		if(maxlinkcountoffset != 0) {
+			std::swap(files.at(0), files.at(maxlinkcountoffset));
+		}
+	}
+}
\ No newline at end of file
diff --git a/src/progressbar.cpp b/src/progressbar.cpp
new file mode 100644
index 0000000..2485ede
--- /dev/null
+++ b/src/progressbar.cpp
@@ -0,0 +1,44 @@
+#include "progressbar.h"
+
+#include "file.h"
+
+const uint_fast8_t ProgressBar::BARLENGTH = 50;
+
+ProgressBar::ProgressBar(uint64_t files, uint64_t bytes) : maxFiles(files), maxBytes(bytes) {
+}
+
+void ProgressBar::update(uint64_t addFiles, uint64_t newBytes) {
+	currentFiles += addFiles;
+	currentBytes += newBytes;
+}
+
+std::chrono::milliseconds ProgressBar::getDuration() const {
+	auto currenttime = std::chrono::high_resolution_clock::now();
+	return std::chrono::duration_cast<std::chrono::milliseconds>(currenttime - start);
+}
+
+std::ostream& operator<<(std::ostream& str, const ProgressBar& pb) {
+	double progressd = pb.currentBytes / (double) pb.maxBytes;
+	uint64_t progressi = (progressd * 10000.0);
+
+	auto usedtime = pb.getDuration();
+	std::chrono::duration<int64_t> eta = std::chrono::duration_cast<std::chrono::duration<int64_t>>((usedtime / (double) pb.currentBytes) * pb.maxBytes);
+
+
+	// generate bar
+	str << "[";
+	for(uint_fast8_t i = 0; i < ProgressBar::BARLENGTH; ++i) {
+		if( (i / (double) ProgressBar::BARLENGTH) < progressd ) {
+			str << '#';
+		} else {
+			str << ' ';
+		}
+	}
+
+	// print info
+	str << "] " << (progressi / 100.0f) << "% " << pb.currentFiles << '/' << pb.maxFiles << " Files "
+		<< FileSize(pb.currentBytes) << '/' << FileSize(pb.maxBytes) << " ETA: " << eta.count()
+		<< "s \r" << std::flush;
+
+	return str;
+}
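Note (illustration only, not part of the patch): the ETA printed by operator<< above is a linear projection, elapsed * maxBytes / currentBytes, so it shows the projected total time rather than the remaining time. The same chrono arithmetic with made-up numbers:

#include <chrono>
#include <iostream>

int main() {
	using namespace std::chrono;

	auto elapsed = milliseconds(2000); // 2 s spent hashing so far
	double currentBytes = 500.0;       // bytes hashed so far
	double maxBytes = 2000.0;          // bytes overall

	// same arithmetic as ProgressBar's operator<<
	auto eta = duration_cast<seconds>((elapsed / currentBytes) * maxBytes);

	std::cout << "ETA: " << eta.count() << "s\n"; // 2 s for a quarter -> 8 s projected total
}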