From 532a3c969c98710cc0041ae378c2816d8ccb9dc8 Mon Sep 17 00:00:00 2001 From: mrbesen Date: Sat, 22 Oct 2022 00:43:34 +0200 Subject: [PATCH] reading of files --- inc/dedup.h | 16 ++++++- inc/file.h | 21 +++++++-- inc/files.h | 3 ++ inc/hash.h | 4 +- inc/mergefile.h | 5 ++ inc/searchfolder.h | 2 + src/dedup.cpp | 109 +++++++++++++++++++++++++++++++++++++++++++ src/file.cpp | 30 ++++++++++++ src/files.cpp | 21 +++++++++ src/hash.cpp | 10 +++- src/main.cpp | 10 +++- src/mergefiles.cpp | 17 +++++++ src/searchfolder.cpp | 5 ++ 13 files changed, 245 insertions(+), 8 deletions(-) create mode 100644 src/dedup.cpp create mode 100644 src/file.cpp create mode 100644 src/files.cpp create mode 100644 src/mergefiles.cpp create mode 100644 src/searchfolder.cpp diff --git a/inc/dedup.h b/inc/dedup.h index 7a1683d..73bf7c4 100644 --- a/inc/dedup.h +++ b/inc/dedup.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -18,10 +19,21 @@ public: void start(); private: - std::map filesizes; - std::map> inodes; + void indexFolder(const SearchFolder& sf); + void handleFolderContent(dirent* dircont, const SearchFolder& sf); + void handleNewFile(ino_t inode, const std::string& path); + Files& getFiles(uint64_t fs); + // all files that where found + std::map<uint64_t, Files> filesizes; + + // files that where found ordered by inode (to detect deduplicated files) + std::map<ino_t, std::shared_ptr<File>> inodes; + + // folders that should be scanned std::vector<SearchFolder> folders; bool hardlink = true; + bool ignoredotfiles = false; + uint64_t deduplicated = 0; // amount of bytes deduplicated }; diff --git a/inc/file.h b/inc/file.h index 72b8b66..379ea62 100644 --- a/inc/file.h +++ b/inc/file.h @@ -1,13 +1,28 @@ #pragma once #include +#include #include +#include "hash.h" + +struct FileSize { + FileSize(uint64_t); + uint64_t fs; +}; + class File { public: -private: - uint64_t filesize; - uint64_t inodeid; + File(FileSize, uint64_t inode, uint64_t linkcount, const std::string& path); + + Hash createHash(); + + FileSize filesize; + 
uint64_t inode; uint64_t linkcount; std::string path; }; + +// FileSize iomanip +// convert Bytes into human readable format +std::ostream& operator<<(std::ostream& str, const FileSize& fs); diff --git a/inc/files.h b/inc/files.h index 2ebd6c0..b1c6828 100644 --- a/inc/files.h +++ b/inc/files.h @@ -7,6 +7,9 @@ // a list of files with the same size class Files { public: + Files(); + + void addNewFile(std::shared_ptr<File> file); private: std::vector<MergeFile> files; diff --git a/inc/hash.h b/inc/hash.h index aac3ca3..29e14d4 100644 --- a/inc/hash.h +++ b/inc/hash.h @@ -6,6 +6,8 @@ class Hash { public: Hash(); + Hash(const Hash&); + Hash(Hash&&); virtual ~Hash(); static bool create(Hash& h, const std::string& file); @@ -18,7 +20,7 @@ public: bool operator!=(const Hash& rhs) const; private: - const unsigned char* data = nullptr; + unsigned char* data = nullptr; friend std::ostream& operator<<(std::ostream& str, const Hash& rhs); }; diff --git a/inc/mergefile.h b/inc/mergefile.h index 4862688..3344985 100644 --- a/inc/mergefile.h +++ b/inc/mergefile.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include "file.h" @@ -8,8 +9,12 @@ // eine liste an gleichen Dateien, die gemerged werden sollen class MergeFile { public: + MergeFile(std::shared_ptr<File> file, Hash&& h); + const Hash& getHash() const; + // try to add a file, returns true on success + bool addFile(std::shared_ptr<File> f, const Hash& h); private: std::vector<std::shared_ptr<File>> files; Hash hash; diff --git a/inc/searchfolder.h b/inc/searchfolder.h index 38f44f3..1055542 100644 --- a/inc/searchfolder.h +++ b/inc/searchfolder.h @@ -3,6 +3,8 @@ #include struct SearchFolder { + SearchFolder(const std::string& path, bool recurse); + std::string path; bool recurse; }; diff --git a/src/dedup.cpp b/src/dedup.cpp new file mode 100644 index 0000000..f7ea89c --- /dev/null +++ b/src/dedup.cpp @@ -0,0 +1,109 @@ +#include "dedup.h" + +#include +#include +#include + +#include + +Dedup::Dedup() { +} + +Dedup::~Dedup() { +} + +void Dedup::addSearchFolder(const 
std::string& path, bool recurse) { + folders.emplace_back(path, recurse); +} + +void Dedup::start() { + for(uint32_t i = 0; i < folders.size(); ++i) { + SearchFolder sf = folders.at(i); + + indexFolder(sf); + } + + uint64_t accFileSize = std::accumulate(inodes.begin(), inodes.end(), 0ul, [](uint64_t val, auto r){ return val + r.second->filesize.fs; }); + + Log::info << "indexing done " << inodes.size() << " unique files found with a total file size of " << FileSize(accFileSize); + + +} + +void Dedup::indexFolder(const SearchFolder& sf) { + DIR* dirptr = ::opendir(sf.path.c_str()); + if(!dirptr) { + Log::error << "Could not open directory: " << sf.path; + return; + } + + errno = 0; + while(dirent* dircont = ::readdir(dirptr)) { + handleFolderContent(dircont, sf); + } + + if(errno != 0) { + Log::error << "failed to read directory: " << sf.path; + } + + ::closedir(dirptr); +} + +void Dedup::handleFolderContent(dirent* dircont, const SearchFolder& sf) { + std::string name(dircont->d_name); + std::string filepath = sf.path + "/" + name; + + // handle dotfiles + if(name == "." || name == ".." 
|| (ignoredotfiles && name.at(0) == '.')) { + return; + } + + // handle subfolders + if(dircont->d_type == DT_DIR) { + if(sf.recurse) { + folders.emplace_back(filepath, true); + Log::note << "found new subdir to index: " << filepath; + return; + } + } + + // regular file + if(dircont->d_type == DT_REG) { + handleNewFile(dircont->d_ino, filepath); + } +} + +void Dedup::handleNewFile(ino_t inode, const std::string& path) { + // check for already scanned inodes + auto it = inodes.find(inode); + if(it != inodes.end()) { + Log::note << "found already detected file: " << inode << ' ' << path; + return; + } + + struct stat statbuf; + int res = stat(path.c_str(), &statbuf); + if(res == -1) { + Log::error << "stat failed: " << path << " error: " << strerror(errno) << " (" << errno << ')'; + return; + } + + loff_t fileSize = statbuf.st_size; + nlink_t linkCount = statbuf.st_nlink; + + auto file = std::make_shared<File>(fileSize, inode, linkCount, path); + + inodes.insert({inode, file}); + Files& files = getFiles(fileSize); + + files.addNewFile(file); +} + +Files& Dedup::getFiles(uint64_t fs) { + auto it = filesizes.find(fs); + if(it == filesizes.end()) { + auto pair = filesizes.insert({fs, {}}); + return pair.first->second; + } + return it->second; +} diff --git a/src/file.cpp b/src/file.cpp new file mode 100644 index 0000000..92020d0 --- /dev/null +++ b/src/file.cpp @@ -0,0 +1,30 @@ +#include "file.h" + +FileSize::FileSize(uint64_t fs) : fs(fs) {} + +File::File(FileSize fs, uint64_t inode, uint64_t linkcount, const std::string& path) : + filesize(fs), inode(inode), linkcount(linkcount), path(path) {} + +Hash File::createHash() { + Hash hash; + Hash::create(hash, path); + return hash; +} + +std::ostream& operator<<(std::ostream& str, const FileSize& fs) { + static const char PREFIX[] {' ', 'K', 'M', 'G', 'T', 'P', 'E'}; + static const uint_fast8_t PREFIXCOUNT = 7; + static const uint_fast32_t FACTOR = 1000; + static const uint_fast32_t COMMA = 100; // nach komma stellen + uint64_t 
cpy = fs.fs * COMMA; + uint_fast8_t prefix = 0; + while(cpy > (FACTOR * 3 * COMMA) && prefix < PREFIXCOUNT) { + cpy /= FACTOR; + ++prefix; + } + + str << (cpy / (float) COMMA); + if(prefix > 0) + str << PREFIX[prefix]; + return str << "B"; +} diff --git a/src/files.cpp b/src/files.cpp new file mode 100644 index 0000000..28ef35d --- /dev/null +++ b/src/files.cpp @@ -0,0 +1,21 @@ +#include "files.h" + +Files::Files() { +} + +void Files::addNewFile(std::shared_ptr<File> file) { + Hash hash = file->createHash(); + + // find first matching file + for(uint32_t i = 0; i < files.size(); ++i) { + MergeFile& mf = files.at(i); + + if(mf.addFile(file, std::move(hash))) { + // successfully added + return; + } + } + + // no suitable mergefile found + files.emplace_back(file, std::move(hash)); +} diff --git a/src/hash.cpp b/src/hash.cpp index 92b41f4..72123f9 100644 --- a/src/hash.cpp +++ b/src/hash.cpp @@ -15,9 +15,17 @@ const static size_t HASHSIZE = 32; // the size of the SHA256 Hashsum in bytes Hash::Hash() {} +Hash::Hash(const Hash& h) : data(new unsigned char[HASHSIZE]) { + std::memcpy(data, h.data, HASHSIZE); +} + +Hash::Hash(Hash&& h) : data(h.data) { + h.data = nullptr; +} + Hash::~Hash() { if(data) { - delete data; + delete[] data; data = nullptr; } } diff --git a/src/main.cpp b/src/main.cpp index 16ef404..4279c7d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,5 +1,7 @@ -#include "Log.h" +#include + +#include "dedup.h" int main(int argc, const char** argv) { Log::init(); @@ -11,6 +13,12 @@ int main(int argc, const char** argv) { Log::info << "Hello, World!"; + Dedup dedup; + + dedup.addSearchFolder("."); + + dedup.start(); + Log::stop(); return 0; } diff --git a/src/mergefiles.cpp b/src/mergefiles.cpp new file mode 100644 index 0000000..7e5ffa0 --- /dev/null +++ b/src/mergefiles.cpp @@ -0,0 +1,17 @@ +#include "mergefile.h" + +MergeFile::MergeFile(std::shared_ptr<File> file, Hash&& h) : hash(std::move(h)) { + files.emplace_back(file); +} + +const Hash& MergeFile::getHash() const 
{ + return hash; +} + +bool MergeFile::addFile(std::shared_ptr<File> f, const Hash& h) { + if(hash == h) { + files.emplace_back(f); + return true; + } + return false; +} diff --git a/src/searchfolder.cpp b/src/searchfolder.cpp new file mode 100644 index 0000000..0485b6b --- /dev/null +++ b/src/searchfolder.cpp @@ -0,0 +1,5 @@ +#include "searchfolder.h" + +SearchFolder::SearchFolder(const std::string& path, bool recurse) : path(path), recurse(recurse) { + +}