reading of files
This commit is contained in:
parent
d2bb0d1ccc
commit
532a3c969c
16
inc/dedup.h
16
inc/dedup.h
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <dirent.h>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
|
||||
|
@ -18,10 +19,21 @@ public:
|
|||
void start();
|
||||
|
||||
private:
|
||||
std::map<uint64_t, Files> filesizes;
|
||||
std::map<uint64_t, std::shared_ptr<File>> inodes;
|
||||
void indexFolder(const SearchFolder& sf);
|
||||
void handleFolderContent(dirent* dircont, const SearchFolder& sf);
|
||||
void handleNewFile(ino_t inode, const std::string& path);
|
||||
Files& getFiles(uint64_t fs);
|
||||
|
||||
// all files that were found
|
||||
std::map<uint64_t, Files> filesizes;
|
||||
|
||||
// files that were found, ordered by inode (to detect deduplicated files)
|
||||
std::map<ino_t, std::shared_ptr<File>> inodes;
|
||||
|
||||
// folders that should be scanned
|
||||
std::vector<SearchFolder> folders;
|
||||
|
||||
bool hardlink = true;
|
||||
bool ignoredotfiles = false;
|
||||
uint64_t deduplicated = 0; // amount of bytes deduplicated
|
||||
};
|
||||
|
|
21
inc/file.h
21
inc/file.h
|
@ -1,13 +1,28 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
|
||||
#include "hash.h"
|
||||
|
||||
// Thin wrapper around a byte count; exists so that sizes can be streamed
// in a human readable form via the FileSize operator<<.
struct FileSize {
    // not explicit: plain integer sizes convert to a FileSize implicitly
    FileSize(uint64_t);

    // size in bytes
    uint64_t fs;
};
|
||||
|
||||
class File {
|
||||
public:
|
||||
private:
|
||||
uint64_t filesize;
|
||||
uint64_t inodeid;
|
||||
File(FileSize, uint64_t inode, uint64_t linkcount, const std::string& path);
|
||||
|
||||
Hash createHash();
|
||||
|
||||
FileSize filesize;
|
||||
uint64_t inode;
|
||||
uint64_t linkcount;
|
||||
std::string path;
|
||||
};
|
||||
|
||||
// FileSize iomanip
|
||||
// convert Bytes into human readable format
|
||||
std::ostream& operator<<(std::ostream& str, const FileSize& fs);
|
||||
|
|
|
@ -7,6 +7,9 @@
|
|||
// a list of files with the same size
|
||||
class Files {
|
||||
public:
|
||||
Files();
|
||||
|
||||
void addNewFile(std::shared_ptr<File> file);
|
||||
|
||||
private:
|
||||
std::vector<MergeFile> files;
|
||||
|
|
|
@ -6,6 +6,8 @@
|
|||
class Hash {
|
||||
public:
|
||||
Hash();
|
||||
Hash(const Hash&);
|
||||
Hash(Hash&&);
|
||||
virtual ~Hash();
|
||||
|
||||
static bool create(Hash& h, const std::string& file);
|
||||
|
@ -18,7 +20,7 @@ public:
|
|||
bool operator!=(const Hash& rhs) const;
|
||||
|
||||
private:
|
||||
const unsigned char* data = nullptr;
|
||||
unsigned char* data = nullptr;
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& str, const Hash& rhs);
|
||||
};
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
#include "file.h"
|
||||
|
@ -8,8 +9,12 @@
|
|||
// a list of identical files that should be merged
|
||||
class MergeFile {
|
||||
public:
|
||||
MergeFile(std::shared_ptr<File> file, Hash&& h);
|
||||
|
||||
const Hash& getHash() const;
|
||||
|
||||
// try to add a file, returns true on success
|
||||
bool addFile(std::shared_ptr<File> f, const Hash& h);
|
||||
private:
|
||||
std::vector<std::shared_ptr<File>> files;
|
||||
Hash hash;
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
#include <string>
|
||||
|
||||
// Describes one directory that is to be scanned.
struct SearchFolder {
    SearchFolder(const std::string& path, bool recurse);

    // path of the directory
    std::string path;

    // true if subdirectories found inside should be scanned as well
    bool recurse;
};
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
#include "dedup.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <numeric>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <Log.h>
|
||||
|
||||
// Nothing to set up beyond the in-class member initializers.
Dedup::Dedup() = default;
|
||||
|
||||
// All members clean up after themselves.
Dedup::~Dedup() = default;
|
||||
|
||||
void Dedup::addSearchFolder(const std::string& path, bool recurse) {
|
||||
folders.emplace_back(path, recurse);
|
||||
}
|
||||
|
||||
void Dedup::start() {
|
||||
for(uint32_t i = 0; i < folders.size(); ++i) {
|
||||
SearchFolder sf = folders.at(i);
|
||||
|
||||
indexFolder(sf);
|
||||
}
|
||||
|
||||
uint64_t accFileSize = std::accumulate(inodes.begin(), inodes.end(), 0ul, [](uint64_t val, auto r){ return val + r.second->filesize.fs; });
|
||||
|
||||
Log::info << "indexing done " << inodes.size() << " unique files found with a total file size of " << FileSize(accFileSize);
|
||||
|
||||
|
||||
}
|
||||
|
||||
void Dedup::indexFolder(const SearchFolder& sf) {
|
||||
DIR* dirptr = ::opendir(sf.path.c_str());
|
||||
if(!dirptr) {
|
||||
Log::error << "Could not open directory: " << sf.path;
|
||||
return;
|
||||
}
|
||||
|
||||
errno = 0;
|
||||
while(dirent* dircont = ::readdir(dirptr)) {
|
||||
handleFolderContent(dircont, sf);
|
||||
}
|
||||
|
||||
if(errno != 0) {
|
||||
Log::error << "failed to read directory: " << sf.path;
|
||||
}
|
||||
|
||||
::closedir(dirptr);
|
||||
}
|
||||
|
||||
void Dedup::handleFolderContent(dirent* dircont, const SearchFolder& sf) {
|
||||
std::string name(dircont->d_name);
|
||||
std::string filepath = sf.path + "/" + name;
|
||||
|
||||
// handle dotfiles
|
||||
if(name == "." || name == ".." || (ignoredotfiles && name.at(0) == '.')) {
|
||||
return;
|
||||
}
|
||||
|
||||
// handle subfolders
|
||||
if(dircont->d_type == DT_DIR) {
|
||||
if(sf.recurse) {
|
||||
folders.emplace_back(filepath, true);
|
||||
Log::note << "found new subdir to index: " << filepath;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// regular file
|
||||
if(dircont->d_type == DT_REG) {
|
||||
handleNewFile(dircont->d_ino, filepath);
|
||||
}
|
||||
}
|
||||
|
||||
void Dedup::handleNewFile(ino_t inode, const std::string& path) {
|
||||
// check for already scanned inodes
|
||||
auto it = inodes.find(inode);
|
||||
if(it != inodes.end()) {
|
||||
Log::note << "found already detected file: " << inode << ' ' << path;
|
||||
return;
|
||||
}
|
||||
|
||||
struct stat statbuf;
|
||||
int res = stat(path.c_str(), &statbuf);
|
||||
if(res == -1) {
|
||||
Log::error << "stat failed: " << path << " error: " << strerror(errno) << " (" << errno << ')';
|
||||
return;
|
||||
}
|
||||
|
||||
loff_t fileSize = statbuf.st_size;
|
||||
nlink_t linkCount = statbuf.st_nlink;
|
||||
|
||||
auto file = std::make_shared<File>(fileSize, inode, linkCount, path);
|
||||
|
||||
inodes.insert({inode, file});
|
||||
Files& files = getFiles(fileSize);
|
||||
|
||||
files.addNewFile(file);
|
||||
}
|
||||
|
||||
// Returns the group of files that have the size `fs`; an empty group is
// created on first access.
Files& Dedup::getFiles(uint64_t fs) {
    // operator[] default-constructs the entry if the key is not present yet
    return filesizes[fs];
}
|
|
@ -0,0 +1,30 @@
|
|||
#include "file.h"
|
||||
|
||||
// Wraps the given number of bytes.
FileSize::FileSize(uint64_t bytes) : fs(bytes) {}
|
||||
|
||||
// Stores the passed values unchanged in the members of the same name.
File::File(FileSize size, uint64_t inodeNumber, uint64_t links, const std::string& filePath)
    : filesize(size),
      inode(inodeNumber),
      linkcount(links),
      path(filePath) {}
|
||||
|
||||
// Builds a Hash for the file at `path` and returns it.
// NOTE(review): the result of Hash::create() is not checked here, so the
// returned Hash is whatever create() left in it when hashing fails.
Hash File::createHash() {
    Hash result;
    static_cast<void>(Hash::create(result, path));
    return result;
}
|
||||
|
||||
std::ostream& operator<<(std::ostream& str, const FileSize& fs) {
|
||||
static const char PREFIX[] {' ', 'K', 'M', 'G', 'T', 'P', 'E'};
|
||||
static const uint_fast8_t PREFIXCOUNT = 7;
|
||||
static const uint_fast32_t FACTOR = 1000;
|
||||
static const uint_fast32_t COMMA = 100; // nach komma stellen
|
||||
uint64_t cpy = fs.fs * COMMA;
|
||||
uint_fast8_t prefix = 0;
|
||||
while(cpy > (FACTOR * 3 * COMMA) && prefix < PREFIXCOUNT) {
|
||||
cpy /= FACTOR;
|
||||
++prefix;
|
||||
}
|
||||
|
||||
str << (cpy / (float) COMMA);
|
||||
if(prefix > 0)
|
||||
str << PREFIX[prefix];
|
||||
return str << "B";
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
#include "files.h"
|
||||
|
||||
// Starts out with an empty list of merge candidates.
Files::Files() = default;
|
||||
|
||||
void Files::addNewFile(std::shared_ptr<File> file) {
|
||||
Hash hash = file->createHash();
|
||||
|
||||
// find first matching file
|
||||
for(uint32_t i = 0; i < files.size(); ++i) {
|
||||
MergeFile& mf = files.at(i);
|
||||
|
||||
if(mf.addFile(file, std::move(hash))) {
|
||||
// successfully added
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// no suitable mergefile found
|
||||
files.emplace_back(file, std::move(hash));
|
||||
}
|
10
src/hash.cpp
10
src/hash.cpp
|
@ -15,9 +15,17 @@ const static size_t HASHSIZE = 32; // the size of the SHA256 Hashsum in bytes
|
|||
|
||||
// Creates a Hash without any data.
Hash::Hash() = default;
|
||||
|
||||
// Deep-copies the HASHSIZE bytes of `h`. A Hash without data (e.g. a
// default constructed one) is copied as a Hash without data; calling
// memcpy with a null source pointer would be undefined behaviour.
Hash::Hash(const Hash& h) : data(h.data ? new unsigned char[HASHSIZE] : nullptr) {
    if(data) {
        std::memcpy(data, h.data, HASHSIZE);
    }
}
|
||||
|
||||
// Takes over the buffer of `other` and leaves `other` without data so that
// its destructor does not free the buffer.
Hash::Hash(Hash&& other)
    : data(other.data)
{
    other.data = nullptr;
}
|
||||
|
||||
// Releases the hash buffer. The buffer is allocated with new[], so it has
// to be released with delete[] (exactly once); delete[] on a null pointer
// is a no-op, so no check is needed.
Hash::~Hash() {
    delete[] data;
    data = nullptr;
}
|
||||
|
|
10
src/main.cpp
10
src/main.cpp
|
@ -1,5 +1,7 @@
|
|||
|
||||
#include "Log.h"
|
||||
#include <Log.h>
|
||||
|
||||
#include "dedup.h"
|
||||
|
||||
int main(int argc, const char** argv) {
|
||||
Log::init();
|
||||
|
@ -11,6 +13,12 @@ int main(int argc, const char** argv) {
|
|||
|
||||
Log::info << "Hello, World!";
|
||||
|
||||
Dedup dedup;
|
||||
|
||||
dedup.addSearchFolder(".");
|
||||
|
||||
dedup.start();
|
||||
|
||||
Log::stop();
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
#include "mergefile.h"
|
||||
|
||||
// Starts a new group consisting of `file`; the group takes over the hash `h`.
MergeFile::MergeFile(std::shared_ptr<File> file, Hash&& h)
    : hash(std::move(h))
{
    files.push_back(file);
}
|
||||
|
||||
// Gives read access to the hash shared by all files of this group.
const Hash& MergeFile::getHash() const {
    return this->hash;
}
|
||||
|
||||
bool MergeFile::addFile(std::shared_ptr<File> f, const Hash& h) {
|
||||
if(hash == h) {
|
||||
files.emplace_back(f);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
#include "searchfolder.h"
|
||||
|
||||
// Stores the directory path and whether its subdirectories should be scanned.
SearchFolder::SearchFolder(const std::string& folderPath, bool recursive)
    : path(folderPath),
      recurse(recursive) {}
|
Loading…
Reference in New Issue