reading of files

This commit is contained in:
mrbesen 2022-10-22 00:43:34 +02:00
parent d2bb0d1ccc
commit 532a3c969c
Signed by: MrBesen
GPG Key ID: 596B2350DCD67504
13 changed files with 245 additions and 8 deletions

View File

@ -1,6 +1,7 @@
#pragma once
#include <cstdint>
#include <dirent.h>
#include <map>
#include <memory>
@ -18,10 +19,21 @@ public:
void start();
private:
std::map<uint64_t, Files> filesizes;
std::map<uint64_t, std::shared_ptr<File>> inodes;
void indexFolder(const SearchFolder& sf);
void handleFolderContent(dirent* dircont, const SearchFolder& sf);
void handleNewFile(ino_t inode, const std::string& path);
Files& getFiles(uint64_t fs);
// all files that were found, grouped by file size
std::map<uint64_t, Files> filesizes;
// files that were found, ordered by inode (to detect already deduplicated files)
std::map<ino_t, std::shared_ptr<File>> inodes;
// folders that should be scanned
std::vector<SearchFolder> folders;
bool hardlink = true;
bool ignoredotfiles = false;
uint64_t deduplicated = 0; // amount of bytes deduplicated
};

View File

@ -1,13 +1,28 @@
#pragma once
#include <cstdint>
#include <ostream>
#include <string>
#include "hash.h"
// Thin wrapper around a byte count. It exists so that a size can be written
// to a stream in human readable form through operator<<.
struct FileSize {
	uint64_t fs; // the wrapped size in bytes

	// not explicit, so plain integers convert implicitly to FileSize
	FileSize(uint64_t fs);
};
// Metadata of a single file that was found while scanning.
// NOTE(review): this block declares `filesize` twice (once as uint64_t, once as
// FileSize) and contains both `inodeid` and `inode`. It looks like the lines of
// an older and a newer revision are mixed here - confirm which set is current
// before relying on the access specifiers below.
class File {
public:
private:
uint64_t filesize;
uint64_t inodeid;
// builds a File from its size, inode number, hard link count and path
File(FileSize, uint64_t inode, uint64_t linkcount, const std::string& path);
// creates the Hash of the file stored at `path`
Hash createHash();
// size of the file in bytes
FileSize filesize;
// inode number of the file
uint64_t inode;
// number of hard links pointing to the inode
uint64_t linkcount;
// path under which the file was found
std::string path;
};
// FileSize iomanip
// convert Bytes into human readable format
std::ostream& operator<<(std::ostream& str, const FileSize& fs);

View File

@ -7,6 +7,9 @@
// a list of files with the same size
class Files {
public:
Files();
void addNewFile(std::shared_ptr<File> file);
private:
std::vector<MergeFile> files;

View File

@ -6,6 +6,8 @@
class Hash {
public:
Hash();
Hash(const Hash&);
Hash(Hash&&);
virtual ~Hash();
static bool create(Hash& h, const std::string& file);
@ -18,7 +20,7 @@ public:
bool operator!=(const Hash& rhs) const;
private:
const unsigned char* data = nullptr;
unsigned char* data = nullptr;
friend std::ostream& operator<<(std::ostream& str, const Hash& rhs);
};

View File

@ -1,5 +1,6 @@
#pragma once
#include <memory>
#include <vector>
#include "file.h"
@ -8,8 +9,12 @@
// a list of identical files that should be merged
class MergeFile {
public:
MergeFile(std::shared_ptr<File> file, Hash&& h);
const Hash& getHash() const;
// try to add a file, returns true on success
bool addFile(std::shared_ptr<File> f, const Hash& h);
private:
std::vector<std::shared_ptr<File>> files;
Hash hash;

View File

@ -3,6 +3,8 @@
#include <string>
// A directory that is queued for scanning.
struct SearchFolder {
	std::string path; // directory that gets scanned
	bool recurse;     // when true, subdirectories are queued for scanning too

	SearchFolder(const std::string& path, bool recurse);
};

109
src/dedup.cpp Normal file
View File

@ -0,0 +1,109 @@
#include "dedup.h"
#include <cstring>
#include <numeric>
#include <sys/stat.h>
#include <Log.h>
// Nothing to set up here; all members carry their own initial values.
Dedup::Dedup() = default;
// No manual cleanup required, the containers release their content themselves.
Dedup::~Dedup() = default;
void Dedup::addSearchFolder(const std::string& path, bool recurse) {
folders.emplace_back(path, recurse);
}
void Dedup::start() {
for(uint32_t i = 0; i < folders.size(); ++i) {
SearchFolder sf = folders.at(i);
indexFolder(sf);
}
uint64_t accFileSize = std::accumulate(inodes.begin(), inodes.end(), 0ul, [](uint64_t val, auto r){ return val + r.second->filesize.fs; });
Log::info << "indexing done " << inodes.size() << " unique files found with a total file size of " << FileSize(accFileSize);
}
// Reads all entries of the given folder and passes each one on to
// handleFolderContent(). Logs an error when the directory cannot be opened
// or when reading it fails.
void Dedup::indexFolder(const SearchFolder& sf) {
	DIR* dirptr = ::opendir(sf.path.c_str());
	if(!dirptr) {
		Log::error << "Could not open directory: " << sf.path;
		return;
	}

	int readError = 0;
	while(true) {
		// readdir() returns nullptr both at the end of the directory and on
		// failure; only errno tells them apart. It has to be cleared before
		// every call, because handling an entry (e.g. a failing stat) may
		// leave a stale value behind.
		errno = 0;
		dirent* dircont = ::readdir(dirptr);
		if(!dircont) {
			readError = errno;
			break;
		}
		handleFolderContent(dircont, sf);
	}

	if(readError != 0) {
		Log::error << "failed to read directory: " << sf.path;
	}

	::closedir(dirptr);
}
void Dedup::handleFolderContent(dirent* dircont, const SearchFolder& sf) {
std::string name(dircont->d_name);
std::string filepath = sf.path + "/" + name;
// handle dotfiles
if(name == "." || name == ".." || (ignoredotfiles && name.at(0) == '.')) {
return;
}
// handle subfolders
if(dircont->d_type == DT_DIR) {
if(sf.recurse) {
folders.emplace_back(filepath, true);
Log::note << "found new subdir to index: " << filepath;
return;
}
}
// regular file
if(dircont->d_type == DT_REG) {
handleNewFile(dircont->d_ino, filepath);
}
}
void Dedup::handleNewFile(ino_t inode, const std::string& path) {
// check for already scanned inodes
auto it = inodes.find(inode);
if(it != inodes.end()) {
Log::note << "found already detected file: " << inode << ' ' << path;
return;
}
struct stat statbuf;
int res = stat(path.c_str(), &statbuf);
if(res == -1) {
Log::error << "stat failed: " << path << " error: " << strerror(errno) << " (" << errno << ')';
return;
}
loff_t fileSize = statbuf.st_size;
nlink_t linkCount = statbuf.st_nlink;
auto file = std::make_shared<File>(fileSize, inode, linkCount, path);
inodes.insert({inode, file});
Files& files = getFiles(fileSize);
files.addNewFile(file);
}
// Returns the group of files that have the size `fs`.
// An empty group is created first when no file of that size is known yet.
Files& Dedup::getFiles(uint64_t fs) {
	// operator[] default-constructs the entry if it is missing
	return filesizes[fs];
}

30
src/file.cpp Normal file
View File

@ -0,0 +1,30 @@
#include "file.h"
FileSize::FileSize(uint64_t fs) : fs(fs) {}
File::File(FileSize fs, uint64_t inode, uint64_t linkcount, const std::string& path) :
filesize(fs), inode(inode), linkcount(linkcount), path(path) {}
// Creates the hash of the file stored at `path` and returns it.
// NOTE(review): the bool returned by Hash::create() is discarded, so a failed
// hashing attempt is not visible to the caller - confirm that this is intended.
Hash File::createHash() {
	Hash result;
	static_cast<void>(Hash::create(result, path));
	return result;
}
// Writes a FileSize in human readable form, e.g. "512B", "3.2KB", "18.44EB".
// Decimal (SI) prefixes are used; a value moves on to the next prefix once it
// exceeds 3000 of the current unit, and it is truncated to two decimal places.
std::ostream& operator<<(std::ostream& str, const FileSize& fs) {
	static const char PREFIX[] {' ', 'K', 'M', 'G', 'T', 'P', 'E'};
	static const uint_fast8_t PREFIXCOUNT = sizeof(PREFIX) / sizeof(PREFIX[0]);
	static const uint64_t FACTOR = 1000;
	static const uint64_t COMMA = 100; // fixed point scale: two decimal places
	static const uint64_t LIMIT = FACTOR * 3; // largest value printed without switching unit

	// `scaled` holds the value in hundredths of the selected unit.
	uint64_t scaled;
	uint_fast8_t prefix = 0;
	if(fs.fs <= LIMIT) {
		// small enough, multiplying cannot overflow here
		scaled = fs.fs * COMMA;
	} else {
		// First step to "K": (fs * COMMA) / FACTOR equals fs / (FACTOR / COMMA),
		// written this way so that huge sizes do not overflow the multiplication.
		scaled = fs.fs / (FACTOR / COMMA);
		prefix = 1;
		// stop at the last table entry so PREFIX is never indexed out of bounds
		while(scaled > LIMIT * COMMA && prefix < PREFIXCOUNT - 1) {
			scaled /= FACTOR;
			++prefix;
		}
	}

	str << (scaled / static_cast<float>(COMMA));
	if(prefix > 0) {
		str << PREFIX[prefix];
	}
	return str << "B";
}

21
src/files.cpp Normal file
View File

@ -0,0 +1,21 @@
#include "files.h"
// Starts out as an empty group.
Files::Files() = default;
void Files::addNewFile(std::shared_ptr<File> file) {
Hash hash = file->createHash();
// find first matching file
for(uint32_t i = 0; i < files.size(); ++i) {
MergeFile& mf = files.at(i);
if(mf.addFile(file, std::move(hash))) {
// successfully added
return;
}
}
// no suitable mergefile found
files.emplace_back(file, std::move(hash));
}

View File

@ -15,9 +15,17 @@ const static size_t HASHSIZE = 32; // the size of the SHA256 Hashsum in bytes
// Creates a Hash without any hash data.
Hash::Hash() = default;
// Copy constructor: duplicates the HASHSIZE bytes of `h`.
// A Hash without data (default constructed or moved-from) is copied as an
// empty Hash instead of reading through a null pointer.
Hash::Hash(const Hash& h) : data(h.data ? new unsigned char[HASHSIZE] : nullptr) {
	if(data) {
		std::memcpy(data, h.data, HASHSIZE);
	}
}
// Move constructor: takes over the buffer of `h` and leaves `h` without data.
Hash::Hash(Hash&& h) {
	unsigned char* const taken = h.data;
	h.data = nullptr;
	data = taken;
}
// Releases the hash buffer.
Hash::~Hash() {
	// The buffer is allocated with new[], so it has to be released exactly
	// once with delete[] (which is a no-op for nullptr).
	delete[] data;
	data = nullptr;
}

View File

@ -1,5 +1,7 @@
#include "Log.h"
#include <Log.h>
#include "dedup.h"
int main(int argc, const char** argv) {
Log::init();
@ -11,6 +13,12 @@ int main(int argc, const char** argv) {
Log::info << "Hello, World!";
Dedup dedup;
dedup.addSearchFolder(".");
dedup.start();
Log::stop();
return 0;
}

17
src/mergefiles.cpp Normal file
View File

@ -0,0 +1,17 @@
#include "mergefile.h"
// Starts a new group of identical files with `file` as its first member.
// The hash `h` that identifies the group is moved into the MergeFile.
MergeFile::MergeFile(std::shared_ptr<File> file, Hash&& h)
	: hash(std::move(h))
{
	files.push_back(file);
}
// Gives read access to the hash shared by all files of this group.
const Hash& MergeFile::getHash() const {
	const Hash& own = this->hash;
	return own;
}
// Tries to add `f` to this group.
// The file is only accepted when its hash `h` equals the hash of the group.
// Returns true if the file was added, false otherwise.
bool MergeFile::addFile(std::shared_ptr<File> f, const Hash& h) {
	const bool sameContent = (hash == h);
	if(!sameContent) {
		return false;
	}

	files.push_back(f);
	return true;
}

5
src/searchfolder.cpp Normal file
View File

@ -0,0 +1,5 @@
#include "searchfolder.h"
SearchFolder::SearchFolder(const std::string& path, bool recurse) : path(path), recurse(recurse) {
}