first index, then hash, then solve

also add progressbar
mrbesen 2022-10-23 00:48:15 +02:00
parent 532a3c969c
commit 4caabf923a
Signed by: MrBesen
GPG Key ID: 596B2350DCD67504
14 changed files with 406 additions and 106 deletions

inc/dedup.h

@@ -6,6 +6,7 @@
#include <memory>
#include "file.h"
#include "fileindexer.h"
#include "files.h"
#include "searchfolder.h"
@@ -19,21 +20,19 @@ public:
void start();
private:
void indexFolder(const SearchFolder& sf);
void handleFolderContent(dirent* dircont, const SearchFolder& sf);
void handleNewFile(ino_t inode, const std::string& path);
Files& getFiles(uint64_t fs);
void removeUninterestingFiles(std::multimap<uint64_t, std::shared_ptr<File>>& files);
// all files that were found
std::map<uint64_t, Files> filesizes;
std::map<uint64_t, Files> hash(std::multimap<uint64_t, std::shared_ptr<File>>& files, uint64_t bytestohash);
// files that were found, ordered by inode (to detect deduplicated files)
std::map<ino_t, std::shared_ptr<File>> inodes;
void resolveDuplicates(std::map<uint64_t, Files>& hashed);
bool relinkFile(const std::string& linkbase, const std::string& replacedfile);
// folders that should be scanned
std::vector<SearchFolder> folders;
FileIndexer indexer;
bool hardlink = true;
bool ignoredotfiles = false;
uint64_t deduplicated = 0; // amount of bytes deduplicated
bool dryrun = true;
uint64_t deduplicatedBytes = 0; // amount of bytes deduplicated
uint64_t deduplicatedFiles = 0;
};

inc/file.h

@@ -13,11 +13,11 @@ struct FileSize {
class File {
public:
File(FileSize, uint64_t inode, uint64_t linkcount, const std::string& path);
File(uint64_t, uint64_t inode, uint64_t linkcount, const std::string& path);
Hash createHash();
FileSize filesize;
uint64_t filesize;
uint64_t inode;
uint64_t linkcount;
std::string path;

inc/fileindexer.h (new file, 25 lines)

@@ -0,0 +1,25 @@
#pragma once
#include <cstring>
#include <dirent.h>
#include <list>
#include <map>
#include <memory>
#include <set>
#include <string>
#include "searchfolder.h"
#include "file.h"
class FileIndexer {
public:
using map_t = std::multimap<uint64_t, std::shared_ptr<File>>;
map_t index(const SearchFolder& sf);
private:
void handleFolderContent(dirent* dircont, const SearchFolder& sf, std::list<SearchFolder>& newfolders, map_t& newfiles);
void handleNewFile(ino_t inode, const std::string& path, map_t& newfiles);
std::set<uint64_t> knownInodes; // file inodes that are known and should not be indexed again
bool ignoredotfiles = false;
};
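A minimal usage sketch (the path and wiring here are made up; Dedup::start() in src/dedup.cpp does the real call): index() walks the folder iteratively and returns the files keyed by size, so every size class forms one equal_range in the multimap, while knownInodes keeps hardlinks of an already seen inode from being indexed twice.

FileIndexer indexer;
FileIndexer::map_t bySize = indexer.index(SearchFolder("/some/dir", true));
// all files of one size form one contiguous range of duplicate candidates:
auto range = bySize.equal_range(4096);
for(auto it = range.first; it != range.second; ++it) {
    // it->second is a std::shared_ptr<File> with filesize 4096
}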

inc/files.h

@@ -9,8 +9,14 @@ class Files {
public:
Files();
void addNewFile(std::shared_ptr<File> file);
void addNewFile(std::shared_ptr<File> file, Hash hash);
// check if at least one file can be deduplicated
bool hasDuplicates() const;
uint32_t getFileCount() const;
std::vector<MergeFile>& getFiles();
const std::vector<MergeFile>& getFiles() const;
private:
std::vector<MergeFile> files;
};

inc/mergefile.h

@@ -15,6 +15,13 @@ public:
// try to add a file, returns true on success
bool addFile(std::shared_ptr<File> f, const Hash& h);
std::vector<std::shared_ptr<File>>& getFiles();
const std::vector<std::shared_ptr<File>>& getFiles() const;
// make sure the main file, which should be the base for the other files, is stored at the beginning of the vector
void updateMainFile();
private:
std::vector<std::shared_ptr<File>> files;
Hash hash;

inc/progressbar.h (new file, 29 lines)

@@ -0,0 +1,29 @@
#pragma once
#include <chrono>
#include <cstdint>
#include <ostream>
class ProgressBar {
public:
ProgressBar(uint64_t files, uint64_t bytes);
void update(uint64_t addFiles, uint64_t newBytes);
std::chrono::duration<uint64_t, std::milli> getDuration() const;
private:
static const uint_fast8_t BARLENGTH;
uint64_t maxFiles;
uint64_t maxBytes;
uint64_t currentFiles = 0;
uint64_t currentBytes = 0;
std::chrono::time_point<std::chrono::high_resolution_clock> start = std::chrono::high_resolution_clock::now();
friend std::ostream& operator<<(std::ostream& str, const ProgressBar&);
};
std::ostream& operator<<(std::ostream& str, const ProgressBar&);
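Usage sketch (this mirrors what Dedup::hash() in src/dedup.cpp does; the work loop and item type are made up): construct the bar with the expected totals, report increments via update(), and stream the bar after each step. operator<< ends the line with '\r', so every print redraws the same terminal line.

ProgressBar bar(totalFiles, totalBytes); // totals assumed to be known up front
for(const WorkItem& w : work) {          // hypothetical work items
    processOne(w);                       // hypothetical per-item work
    bar.update(1, w.bytes);
    std::cout << bar;                    // redraws in place via '\r'
}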

src/dedup.cpp

@@ -1,11 +1,17 @@
#include "dedup.h"
#include <chrono>
#include <cstring>
#include <iostream>
#include <numeric>
#include <sys/stat.h>
#include <thread>
#include <unistd.h>
#include <Log.h>
#include "progressbar.h"
Dedup::Dedup() {
}
@@ -16,94 +22,129 @@ void Dedup::addSearchFolder(const std::string& path, bool recurse) {
folders.emplace_back(path, recurse);
}
void Dedup::start() {
for(uint32_t i = 0; i < folders.size(); ++i) {
SearchFolder sf = folders.at(i);
indexFolder(sf);
}
uint64_t accFileSize = std::accumulate(inodes.begin(), inodes.end(), 0ul, [](uint64_t val, auto r){ return val + r.second->filesize.fs; });
Log::info << "indexing done " << inodes.size() << " unique files found with a total file size of " << FileSize(accFileSize);
}
void Dedup::indexFolder(const SearchFolder& sf) {
DIR* dirptr = ::opendir(sf.path.c_str());
if(!dirptr) {
Log::error << "Could not open directory: " << sf.path;
return;
}
errno = 0;
while(dirent* dircont = ::readdir(dirptr)) {
handleFolderContent(dircont, sf);
}
if(errno != 0) {
Log::error << "failed to read directory: " << sf.path;
}
::closedir(dirptr);
}
void Dedup::handleFolderContent(dirent* dircont, const SearchFolder& sf) {
std::string name(dircont->d_name);
std::string filepath = sf.path + "/" + name;
// handle dotfiles
if(name == "." || name == ".." || (ignoredotfiles && name.at(0) == '.')) {
return;
}
// handle subfolders
if(dircont->d_type == DT_DIR) {
if(sf.recurse) {
folders.emplace_back(filepath, true);
Log::note << "found new subdir to index: " << filepath;
return;
}
}
// regular file
if(dircont->d_type == DT_REG) {
handleNewFile(dircont->d_ino, filepath);
}
}
void Dedup::handleNewFile(ino_t inode, const std::string& path) {
// check for already scanned inodes
auto it = inodes.find(inode);
if(it != inodes.end()) {
Log::note << "found already detected file: " << inode << ' ' << path;
return;
}
struct stat statbuf;
int res = stat(path.c_str(), &statbuf);
if(res == -1) {
Log::error << "stat failed: " << path << " error: " << strerror(errno) << " (" << errno << ')';
return;
}
loff_t fileSize = statbuf.st_size;
nlink_t linkCount = statbuf.st_nlink;
auto file = std::make_shared<File>(fileSize, inode, linkCount, path);
inodes.insert({inode, file});
Files& files = getFiles(fileSize);
files.addNewFile(file);
}
Files& Dedup::getFiles(uint64_t fs) {
auto it = filesizes.find(fs);
if(it == filesizes.end()) {
auto pair = filesizes.insert({fs, {}});
Files& getFiles(uint64_t fs, std::map<uint64_t, Files>& m) {
auto it = m.find(fs);
if(it == m.end()) {
auto pair = m.insert({fs, {}});
return pair.first->second;
}
return it->second;
}
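The get-or-create helper above predates C++17's try_emplace; if the project can target C++17, the same thing collapses to a one-liner (a sketch, not part of the commit; it relies on Files being default-constructible, which its declared Files() constructor provides):

// same semantics as getFiles() above: default-construct on first use
Files& getFilesCpp17(uint64_t fs, std::map<uint64_t, Files>& m) {
    return m.try_emplace(fs).first->second;
}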
void Dedup::start() {
FileIndexer::map_t foundfiles;
for(const SearchFolder& sf : folders) {
foundfiles = indexer.index(sf);
}
uint64_t accFileSize = std::accumulate(foundfiles.begin(), foundfiles.end(), 0ul, [](uint64_t val, auto r){ return val + r.second->filesize; });
Log::info << "indexing done " << foundfiles.size() << " unique files found with a total file size of " << FileSize(accFileSize);
// remove uninteresting files
removeUninterestingFiles(foundfiles);
// hashing
uint64_t bytesToHash = std::accumulate(foundfiles.begin(), foundfiles.end(), 0ul, [](uint64_t c, const auto& it) { return c + it.second->filesize; });
Log::info << foundfiles.size() << " files with a total of " << FileSize(bytesToHash) << " are going to be hashed";
// hash files
std::map<uint64_t, Files> filesizes = hash(foundfiles, bytesToHash);
// resolve
resolveDuplicates(filesizes);
Log::info << "done. Deduplicated: " << deduplicatedFiles << " files, saving: " << FileSize(deduplicatedBytes);
}
void Dedup::removeUninterestingFiles(std::multimap<uint64_t, std::shared_ptr<File>>& files) {
// remove all files with a size of 0
files.erase(0ul);
// remove all files that are the only one with their file size
for(auto it = files.begin(); it != files.end(); ) {
auto upper = files.upper_bound(it->first);
// peek one element ahead without moving it, so the erase below hits the right entry
auto next = it;
++next;
// only one file with this file size: it can not have a duplicate
if(next == upper) {
files.erase(it);
}
it = upper;
}
}
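The pruning above walks the multimap one size class at a time: upper_bound(key) jumps straight to the first element of the next class. A standalone illustration of the pattern with hypothetical values (not from the commit):

#include <cstdint>
#include <iostream>
#include <map>

int main() {
    std::multimap<uint64_t, const char*> bySize{
        {0, "empty"}, {512, "a"}, {512, "b"}, {4096, "lonely"}};
    // same pruning logic as removeUninterestingFiles():
    bySize.erase(0ul);
    for(auto it = bySize.begin(); it != bySize.end(); ) {
        auto upper = bySize.upper_bound(it->first);
        auto next = it;
        ++next;
        if(next == upper) bySize.erase(it); // singleton size class
        it = upper;
    }
    for(const auto& e : bySize) std::cout << e.first << ' ' << e.second << '\n';
    // prints only the 512 group: "a" and "b" survive
}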
std::map<uint64_t, Files> Dedup::hash(std::multimap<uint64_t, std::shared_ptr<File>>& foundfiles, uint64_t bytesToHash) {
std::map<uint64_t, Files> filesizes;
ProgressBar progressBar(foundfiles.size(), bytesToHash);
for(auto it = foundfiles.begin(); it != foundfiles.end(); ++it) {
Hash hash = it->second->createHash();
progressBar.update(1, it->second->filesize);
Files& files = getFiles(it->first, filesizes);
files.addNewFile(it->second, std::move(hash));
std::cout << progressBar;
}
Log::info << "Hashing done after: " << (int) (progressBar.getDuration().count() / 1000) << "s ";
return filesizes;
}
void Dedup::resolveDuplicates(std::map<uint64_t, Files>& hashed) {
for(auto it = hashed.begin(); it != hashed.end(); ++it) {
if(!it->second.hasDuplicates()) continue;
Log::info << it->second.getFileCount() << " files with " << FileSize(it->first) << " found";
std::vector<MergeFile>& merges = it->second.getFiles();
for(MergeFile& mf : merges) {
mf.updateMainFile();
auto files = mf.getFiles();
if(files.size() == 1) continue;
Log::info << "\t" << mf.getHash() << " at " << files.size() << " places found:";
for(uint32_t i = 0; i < files.size(); ++i) {
const std::shared_ptr<File>& f = files.at(i);
Log::note << "\t\t" << f->linkcount << ": " << f->path;
// skip the first "base" file and don't link empty files
if(i > 0 && it->first > 0) {
if(!relinkFile(files.at(0)->path, f->path)) {
break;
}
deduplicatedBytes += f->filesize;
++deduplicatedFiles;
}
}
}
}
}
bool Dedup::relinkFile(const std::string& linkbase, const std::string& replacedfile) {
// symlinks are not implemented
if(dryrun || !hardlink) {
Log::note << "delete " << replacedfile;
Log::note << "link " << linkbase << " -> " << replacedfile;
} else {
int res = ::unlink(replacedfile.c_str());
if(res != 0) {
Log::error << "unlink(" << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
return false;
}
res = ::link(linkbase.c_str(), replacedfile.c_str());
if(res != 0) {
Log::error << "link(" << linkbase << ", " << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
// TODO try to symlink?
return false;
}
}
return true;
}
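The TODO in relinkFile() hints at a symlink fallback. One plausible shape for it, purely as a hedged sketch and not part of this commit: link(2) fails with EXDEV when linkbase and replacedfile sit on different filesystems, and by that point the original file is already unlinked, so some recovery is needed anyway. Note that a symlink also changes semantics, since it dangles if the base file is later removed.

// hypothetical fallback inside the res != 0 branch after ::link():
if(errno == EXDEV) {
    if(::symlink(linkbase.c_str(), replacedfile.c_str()) == 0) {
        return true;
    }
    Log::error << "symlink(" << linkbase << ", " << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
}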

src/file.cpp

@@ -2,7 +2,7 @@
FileSize::FileSize(uint64_t fs) : fs(fs) {}
File::File(FileSize fs, uint64_t inode, uint64_t linkcount, const std::string& path) :
File::File(uint64_t fs, uint64_t inode, uint64_t linkcount, const std::string& path) :
filesize(fs), inode(inode), linkcount(linkcount), path(path) {}
Hash File::createHash() {

src/fileindexer.cpp (new file, 86 lines)

@@ -0,0 +1,86 @@
#include "fileindexer.h"
#include <sys/stat.h>
#include <unistd.h>
#include <Log.h>
std::multimap<uint64_t, std::shared_ptr<File>> FileIndexer::index(const SearchFolder& firstfolder) {
std::list<SearchFolder> folders;
folders.emplace_back(firstfolder);
map_t out;
while(!folders.empty()) {
SearchFolder sf = folders.front();
folders.pop_front();
DIR* dirptr = ::opendir(sf.path.c_str());
if(!dirptr) {
Log::error << "Could not open directory: " << sf.path;
continue;
}
errno = 0;
while(dirent* dircont = ::readdir(dirptr)) {
handleFolderContent(dircont, sf, folders, out);
}
if(errno != 0) {
Log::error << "failed to read directory: " << sf.path << ' ' << strerror(errno) << " (" << errno << ')';
}
::closedir(dirptr);
}
return out;
}
void FileIndexer::handleFolderContent(dirent* dircont, const SearchFolder& sf, std::list<SearchFolder>& newfolders, map_t& newfiles) {
std::string name(dircont->d_name);
std::string filepath = sf.path + "/" + name;
// handle dotfiles
if(name == "." || name == ".." || (ignoredotfiles && name.at(0) == '.')) {
return;
}
// handle subfolders
if(dircont->d_type == DT_DIR) {
if(sf.recurse) {
newfolders.emplace_back(filepath, true);
Log::note << "found new subdir to index: " << filepath;
return;
}
}
// regular file
if(dircont->d_type == DT_REG) {
handleNewFile(dircont->d_ino, filepath, newfiles);
}
}
void FileIndexer::handleNewFile(ino_t inode, const std::string& path, map_t& newfiles) {
// check for already scanned inodes
auto it = knownInodes.find(inode);
if(it != knownInodes.end()) {
Log::note << "found already detected file: " << inode << ' ' << path;
return;
}
struct stat statbuf;
int res = stat(path.c_str(), &statbuf);
if(res == -1) {
Log::error << "stat failed: " << path << " error: " << strerror(errno) << " (" << errno << ')';
return;
}
loff_t fileSize = statbuf.st_size;
nlink_t linkCount = statbuf.st_nlink;
knownInodes.insert(inode);
auto file = std::make_shared<File>(fileSize, inode, linkCount, path);
newfiles.insert({fileSize, file});
}
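One portability caveat with the d_type checks above (not addressed in this commit): readdir() is allowed to return DT_UNKNOWN on filesystems that do not fill in d_type, in which case both branches are skipped and the entry is silently ignored. A hedged sketch of a stat()-based fallback:

// hypothetical addition to handleFolderContent():
if(dircont->d_type == DT_UNKNOWN) {
    struct stat sb;
    if(::stat(filepath.c_str(), &sb) == 0) {
        if(S_ISDIR(sb.st_mode) && sf.recurse) {
            newfolders.emplace_back(filepath, true);
        } else if(S_ISREG(sb.st_mode)) {
            handleNewFile(dircont->d_ino, filepath, newfiles);
        }
    }
    return;
}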

src/files.cpp

@@ -3,9 +3,7 @@
Files::Files() {
}
void Files::addNewFile(std::shared_ptr<File> file) {
Hash hash = file->createHash();
void Files::addNewFile(std::shared_ptr<File> file, Hash hash) {
// find first matching file
for(uint32_t i = 0; i < files.size(); ++i) {
MergeFile& mf = files.at(i);
@@ -19,3 +17,24 @@ void Files::addNewFile(std::shared_ptr<File> file) {
// no suitable mergefile found
files.emplace_back(file, std::move(hash));
}
bool Files::hasDuplicates() const {
for(const MergeFile& mf : files) {
if(mf.getFiles().size() > 1) {
return true;
}
}
return false;
}
uint32_t Files::getFileCount() const {
return files.size();
}
std::vector<MergeFile>& Files::getFiles() {
return files;
}
const std::vector<MergeFile>& Files::getFiles() const {
return files;
}

src/hash.cpp

@@ -112,6 +112,10 @@ bool Hash::operator!=(const Hash& rhs) const {
std::ostream& operator<<(std::ostream& str, const Hash& rhs) {
if(!rhs.data) {
return str << "[empty hash]";
}
str << std::hex << std::setfill('0');
for(size_t i = 0; i < HASHSIZE; ++i) {

src/main.cpp

@@ -10,12 +10,18 @@ int main(int argc, const char** argv) {
#if __unix__
Log::setColoredOutput(true);
#endif
Log::info << "Hello, World!";
if(argc < 2) {
Log::fatal << "Usage: " << argv[0] << " <path> [path2] [...]";
return -1;
}
Dedup dedup;
dedup.addSearchFolder(".");
for(int i = 1; i < argc; ++i) {
dedup.addSearchFolder(argv[i]);
}
dedup.start();

src/mergefile.cpp

@@ -1,5 +1,7 @@
#include "mergefile.h"
#include <algorithm>
MergeFile::MergeFile(std::shared_ptr<File> file, Hash&& h) : hash(std::move(h)) {
files.emplace_back(file);
}
@@ -15,3 +17,35 @@ bool MergeFile::addFile(std::shared_ptr<File> f, const Hash& h) {
}
return false;
}
std::vector<std::shared_ptr<File>>& MergeFile::getFiles() {
return files;
}
const std::vector<std::shared_ptr<File>>& MergeFile::getFiles() const {
return files;
}
void MergeFile::updateMainFile() {
// count the files with more than one link
uint32_t complexFiles = std::count_if(files.begin(), files.end(), [](const std::shared_ptr<File>& f) {
return f->linkcount > 1;
});
if(complexFiles > 1) {
// move the file with the most hardlinks to the beginning
uint32_t maxlinkcount = 0;
uint32_t maxlinkcountoffset = 0;
for(uint32_t i = 0; i < files.size(); ++i) {
if(files.at(i)->linkcount > maxlinkcount) {
maxlinkcount = files.at(i)->linkcount;
maxlinkcountoffset = i;
}
}
if(maxlinkcountoffset != 0) {
std::swap(files.at(0), files.at(maxlinkcountoffset));
}
}
}
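The inner scan of updateMainFile() is a classic argmax loop; with <algorithm> (already included above) it could also be written as follows. A behavior-equivalent sketch: std::max_element returns the first maximum, just like the strict > comparison in the loop.

auto best = std::max_element(files.begin(), files.end(),
    [](const std::shared_ptr<File>& a, const std::shared_ptr<File>& b) {
        return a->linkcount < b->linkcount;
    });
if(best != files.begin()) {
    std::swap(files.front(), *best);
}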

src/progressbar.cpp (new file, 44 lines)

@@ -0,0 +1,44 @@
#include "progressbar.h"
#include "file.h"
const uint_fast8_t ProgressBar::BARLENGTH = 50;
ProgressBar::ProgressBar(uint64_t files, uint64_t bytes) : maxFiles(files), maxBytes(bytes) {
}
void ProgressBar::update(uint64_t addFiles, uint64_t newBytes) {
currentFiles += addFiles;
currentBytes += newBytes;
}
std::chrono::duration<uint64_t, std::milli> ProgressBar::getDuration() const {
auto currenttime = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::milliseconds>(currenttime - start);
}
std::ostream& operator<<(std::ostream& str, const ProgressBar& pb) {
double progressd = pb.currentBytes / (double) pb.maxBytes;
uint64_t progressi = (progressd * 10000.0);
auto usedtime = pb.getDuration();
std::chrono::duration<uint64_t> eta = std::chrono::duration_cast<std::chrono::seconds>((usedtime / (double) pb.currentBytes) * pb.maxBytes); // seconds, matching the "s" suffix printed below
// generate bar
str << "[";
for(uint_fast8_t i = 0; i < ProgressBar::BARLENGTH; ++i) {
if( (i / (double) ProgressBar::BARLENGTH) < progressd ) {
str << '#';
} else {
str << ' ';
}
}
// print info
str << "] " << (progressi / 100.0f) << "% " << pb.currentFiles << '/' << pb.maxFiles << " Files "
<< FileSize(pb.currentBytes) << '/' << FileSize(pb.maxBytes) << " ETA: " << eta.count()
<< "s \r" << std::flush;
return str;
}
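Two things worth noting about the ETA math above: it extrapolates total runtime (elapsed / currentBytes * maxBytes) rather than time remaining, and it divides by currentBytes, which is zero until the first update() call. A remaining-time variant with a cold-start guard might look like this (a sketch, not part of the commit):

// estimated seconds until completion, from elapsed milliseconds and byte counts
uint64_t etaRemainingSeconds(uint64_t elapsedMs, uint64_t currentBytes, uint64_t maxBytes) {
    if(currentBytes == 0) {
        return 0; // no throughput sample yet
    }
    double msPerByte = elapsedMs / (double) currentBytes;
    return (uint64_t) (msPerByte * (maxBytes - currentBytes) / 1000.0);
}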