commit d2bb0d1ccc3ae25550016793b259bd6f9e29b3d4
Author: mrbesen
Date:   Fri Oct 21 19:39:55 2022 +0200

    initial

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f3ae4cb
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+*.d
+*.o
+*.img
+
+build/
+dedup
+test
+tests/data/
+
+.vscode/settings.json
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..5fd23c8
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "thirdparty/Log"]
+	path = thirdparty/Log
+	url = https://git.okaestne.de/okaestne/Log.git
diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json
new file mode 100644
index 0000000..35c1d58
--- /dev/null
+++ b/.vscode/c_cpp_properties.json
@@ -0,0 +1,18 @@
+{
+    "configurations": [
+        {
+            "name": "Linux",
+            "includePath": [
+                "${workspaceFolder}/src/",
+                "${workspaceFolder}/thirdparty/Log/",
+                "${workspaceFolder}/inc/**"
+            ],
+            "defines": [],
+            "compilerPath": "/usr/bin/g++",
+            "cStandard": "c11",
+            "cppStandard": "c++17",
+            "intelliSenseMode": "gcc-x64"
+        }
+    ],
+    "version": 4
+}
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 0000000..8c0aba9
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,50 @@
+{
+	// Use IntelliSense to learn about possible attributes.
+	// Hover to view descriptions of existing attributes.
+	// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+	"version": "0.2.0",
+	"configurations": [
+		{
+			"name": "Debuggen (gdb)",
+			"type": "cppdbg",
+			"request": "launch",
+			"program": "${workspaceFolder}/dedup",
+			"args": [],
+			"stopAtEntry": false,
+			"cwd": "${workspaceFolder}",
+			"environment": [],
+			"externalConsole": false,
+			"MIMode": "gdb",
+			"setupCommands": [
+				{
+					"description": "Automatische Strukturierung und Einrückung für \"gdb\" aktivieren",
+					"text": "-enable-pretty-printing",
+					"ignoreFailures": true
+				}
+			],
+			"preLaunchTask": "make all",
+			"miDebuggerPath": "/usr/bin/gdb"
+		},
+		{
+			"name": "test Debuggen (gdb)",
+			"type": "cppdbg",
+			"request": "launch",
+			"program": "${workspaceFolder}/test",
+			"args": [],
+			"stopAtEntry": false,
+			"cwd": "${workspaceFolder}",
+			"environment": [],
+			"externalConsole": false,
+			"MIMode": "gdb",
+			"setupCommands": [
+				{
+					"description": "Automatische Strukturierung und Einrückung für \"gdb\" aktivieren",
+					"text": "-enable-pretty-printing",
+					"ignoreFailures": true
+				}
+			],
+			"preLaunchTask": "make test",
+			"miDebuggerPath": "/usr/bin/gdb"
+		}
+	]
+}
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
new file mode 100644
index 0000000..c899ce9
--- /dev/null
+++ b/.vscode/tasks.json
@@ -0,0 +1,31 @@
+{
+	// See https://go.microsoft.com/fwlink/?LinkId=733558
+	// for the documentation about the tasks.json format
+	"version": "2.0.0",
+	"tasks": [
+		{
+			"label": "make all",
+			"type": "shell",
+			"command": "make -j all",
+			"group": {
+				"kind": "build",
+				"isDefault": true
+			},
+			"problemMatcher": [
+				"$gcc"
+			]
+		},
+		{
+			"label": "make clean",
+			"type": "shell",
+			"command": "make clean",
+			"problemMatcher": []
+		},
+		{
+			"label": "make test",
+			"type": "shell",
+			"command": "make -j test",
+			"problemMatcher": ["$gcc"]
+		}
+	]
+}
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..6422343
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,90 @@
+# Author Yannis Gerlach
+# Hochschule Osnabrück
+# 13.11.2020
+
+# Do not use `make clean all` together with -j! -> race condition in the makefile
+# use `make clean; make all -j` instead
+
+NAME = dedup
+NAMETEST = test
+CFLAGS = -std=c++17 -O2 -g -pipe -Wall -Wextra -Wno-unused-parameter -Wpedantic -rdynamic #-march=native -Wall
+CXX = g++
+SRCF = src/
+BUILDDIR = build/
+TESTF = tests/
+DEPF = $(BUILDDIR)deps/
+INCF = ./inc/
+INCFS = $(shell find $(INCF) -type d)
+
+LOGF = ./thirdparty/Log/
+LOGO = $(LOGF)Log.o
+export LOG_USEMUTEX = 0
+
+INCLUDES = -I$(LOGF) $(addprefix -I, $(INCFS))
+LDFLAGS = -lssl -lcrypto
+
+SRCFILES = $(shell find $(SRCF) -name "*.cpp")
+OBJFILES = $(patsubst $(SRCF)%, $(BUILDDIR)%, $(patsubst %.cpp, %.o, $(SRCFILES))) $(LOGO)
+DEPFILES = $(wildcard $(DEPF)*.d)
+
+SOURCEDIRS = $(shell find $(SRCF) -type d -printf "%p/\n")
+BUILDDIRS = $(patsubst $(SRCF)%, $(BUILDDIR)%, $(SOURCEDIRS))
+
+BUILDDIRTEST = $(BUILDDIR)tests/
+TESTSRCFILES = $(wildcard $(TESTF)*.cpp)
+TESTOBJFILES = $(patsubst $(TESTF)%, $(BUILDDIRTEST)%, $(patsubst %.cpp, %.o, $(TESTSRCFILES)))
+OBJFILESTEST = $(filter-out $(BUILDDIR)main.o, $(OBJFILES)) $(TESTOBJFILES)
+
+BUILDDIRS += $(BUILDDIRTEST)
+
+INCLUDES += $(addprefix -I, $(SOURCEDIRS))
+
+all: $(NAME) runtest
+
+$(NAME): $(BUILDDIRS) $(DEPF) $(OBJFILES)
+	@echo "Linking $@"
+	@$(CXX) $(CFLAGS) -o $@ $(filter %.o, $^) $(LDFLAGS)
+
+# normal cpp files
+$(BUILDDIR)%.o: $(SRCF)%.cpp
+	@echo "Compiling: $@"
+	@$(CXX) $(CFLAGS) $(INCLUDES) $< -MM -MT $@ > $(DEPF)$(subst /,_,$*).d
+	@$(CXX) -c -o $@ $(CFLAGS) $(INCLUDES) $<
+
+# test cpp files
+$(BUILDDIRTEST)%.o: $(TESTF)%.cpp
+	@echo "Compiling: $@"
+	@$(CXX) $(CFLAGS) $(INCLUDES) $< -MM -MT $@ > $(DEPF)test_$(subst /,_,$*).d
+	@$(CXX) -c -o $@ $(CFLAGS) $(INCLUDES) $<
+
+$(NAME)_strip: $(NAME)
+	@echo "Strip $<"
+	@strip -o $@ $<
+
+%/:
+	mkdir -p $@
+
+clean-depends:
+	$(RM) -r $(DEPF)
+
+$(LOGO):
+	$(MAKE) -C $(LOGF) all
+
+clean:
+	$(RM) -r $(NAME) $(BUILDDIR) $(NAMETEST) $(NAME)_strip
+	$(MAKE) -C $(LOGF) $@
+
+$(NAMETEST): $(BUILDDIRS) $(DEPF) $(OBJFILESTEST)
+	@echo "Linking tests"
+	@$(CXX) -o $@ $(filter %.o, $^) $(filter %.a, $^) $(CFLAGS) $(LDFLAGS)
+
+runtest: $(BUILDDIR)testrun
+
+$(BUILDDIR)testrun: $(NAMETEST)
+	@echo "Running tests"
+	bash -c '. test.env 2> /dev/null; ./$<'
+	@touch $(BUILDDIR)testrun
+
+.PHONY: clean all clean-depends runtest
+
+include $(DEPFILES)
diff --git a/inc/dedup.h b/inc/dedup.h
new file mode 100644
index 0000000..7a1683d
--- /dev/null
+++ b/inc/dedup.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "file.h"
+#include "files.h"
+#include "searchfolder.h"
+
+class Dedup {
+public:
+	Dedup();
+	~Dedup();
+
+	void addSearchFolder(const std::string& path, bool recurse = true);
+
+	void start();
+
+private:
+	std::map<uint64_t, Files> filesizes; // NOTE(review): key/value types reconstructed (presumably size -> files) - confirm
+	std::map<uint64_t, std::shared_ptr<File>> inodes; // NOTE(review): key/value types reconstructed (presumably inode -> file) - confirm
+
+	std::vector<SearchFolder> folders; // NOTE(review): element type reconstructed - confirm
+
+	bool hardlink = true;
+};
diff --git a/inc/file.h b/inc/file.h
new file mode 100644
index 0000000..72b8b66
--- /dev/null
+++ b/inc/file.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+// metadata of a single file; all counters start at 0 instead of being left uninitialized
+class File {
+public:
+private:
+	uint64_t filesize = 0;
+	uint64_t inodeid = 0;
+	uint64_t linkcount = 0;
+	std::string path;
+};
diff --git a/inc/files.h b/inc/files.h
new file mode 100644
index 0000000..2ebd6c0
--- /dev/null
+++ b/inc/files.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <vector>
+
+#include "mergefile.h"
+
+// a list of files with the same size
+class Files {
+public:
+
+private:
+	std::vector<MergeFile> files; // NOTE(review): element type reconstructed - confirm
+};
diff --git a/inc/hash.h b/inc/hash.h
new file mode 100644
index 0000000..aac3ca3
--- /dev/null
+++ b/inc/hash.h
@@ -0,0 +1,34 @@
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <ostream>
+#include <string>
+
+// SHA256 checksum of a files content; stays empty until create() succeeded
+class Hash {
+public:
+	Hash();
+	virtual ~Hash();
+
+	// both return false on any error and if h already holds a hash
+	static bool create(Hash& h, const std::string& file);
+	static bool create(Hash& h, int fd);
+
+	operator bool() const; // true if a hash is stored
+	operator std::string() const; // hex representation, empty string for an empty hash
+
+	bool operator==(const Hash& rhs) const;
+	bool operator!=(const Hash& rhs) const;
+
+private:
+	static constexpr std::size_t HASHSIZE = 32; // the size of the SHA256 Hashsum in bytes
+
+	// stored by value: copying a Hash can neither leak nor free the digest twice
+	std::array<unsigned char, HASHSIZE> data{};
+	bool valid = false;
+
+	friend std::ostream& operator<<(std::ostream& str, const Hash& rhs);
+};
+
+std::ostream& operator<<(std::ostream& str, const Hash& rhs);
diff --git a/inc/mergefile.h b/inc/mergefile.h
new file mode 100644
index 0000000..4862688
--- /dev/null
+++ b/inc/mergefile.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "file.h"
+#include "hash.h"
+
+// a list of identical files that are to be merged
+class MergeFile {
+public:
+
+
+private:
+	std::vector<std::shared_ptr<File>> files; // NOTE(review): element type reconstructed - confirm
+	Hash hash;
+};
diff --git a/inc/searchfolder.h b/inc/searchfolder.h
new file mode 100644
index 0000000..38f44f3
--- /dev/null
+++ b/inc/searchfolder.h
@@ -0,0 +1,8 @@
+#pragma once
+
+#include <string>
+
+struct SearchFolder {
+	std::string path;
+	bool recurse = true; // same default as Dedup::addSearchFolder
+};
diff --git a/src/hash.cpp b/src/hash.cpp
new file mode 100644
index 0000000..92b41f4
--- /dev/null
+++ b/src/hash.cpp
@@ -0,0 +1,134 @@
+#include "hash.h"
+
+#include <cstring>
+#include <fcntl.h>
+#include <iomanip>
+#include <sstream>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <openssl/sha.h>
+
+#include <Log.h>
+
+Hash::Hash() {}
+
+// nothing to release: the digest is stored by value
+Hash::~Hash() {}
+
+
+// Hashes the file at the given path.
+// returns false if the file can not be opened or hashing fails
+bool Hash::create(Hash& h, const std::string& file) {
+	const int fd = ::open(file.c_str(), O_RDONLY);
+	if(fd < 0) {
+		return false;
+	}
+
+	const bool result = Hash::create(h, fd);
+
+	::close(fd);
+
+	return result;
+}
+
+// Hashes the whole content behind an open file descriptor.
+// returns false if fd is invalid, h already holds a hash or any step fails
+bool Hash::create(Hash& h, int fd) {
+	static_assert(HASHSIZE == SHA256_DIGEST_LENGTH, "Hash must hold exactly one SHA256 digest");
+
+	if(fd < 0 || h.valid) {
+		return false;
+	}
+
+	SHA256_CTX hashctx;
+	if(SHA256_Init(&hashctx) != 1) {
+		Log::error << "Can not create SHA256 CTX";
+		return false;
+	}
+
+	//determine filesize
+	const off64_t fs = lseek64(fd, 0, SEEK_END);
+	if(fs < 0) {
+		Log::error << "Can not determine filesize";
+		return false;
+	}
+	lseek64(fd, 0, SEEK_SET);
+
+	// mmap() rejects a length of 0, so an empty file is hashed without a mapping
+	if(fs > 0) {
+		const size_t len = static_cast<size_t>(fs);
+
+		void* mapping = ::mmap(nullptr, len, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0);
+		if(mapping == MAP_FAILED) {
+			return false;
+		}
+
+		::madvise(mapping, len, MADV_DONTDUMP);
+		::madvise(mapping, len, MADV_DONTFORK);
+		::madvise(mapping, len, MADV_SEQUENTIAL);
+		::madvise(mapping, len, MADV_WILLNEED);
+
+		const bool updated = SHA256_Update(&hashctx, mapping, len) == 1;
+
+		// unmap before looking at the result, so the mapping is not leaked on failure
+		::munmap(mapping, len);
+
+		if(!updated) {
+			Log::error << "Can not update hash";
+			return false;
+		}
+	}
+
+	if(SHA256_Final(h.data.data(), &hashctx) != 1) {
+		Log::error << "Can not finalize hash";
+		return false;
+	}
+
+	h.valid = true;
+
+	return true;
+}
+
+
+Hash::operator bool() const {
+	return valid;
+}
+
+Hash::operator std::string() const {
+	std::stringstream str;
+	str << *this;
+	return str.str();
+}
+
+
+bool Hash::operator==(const Hash& rhs) const {
+	// an empty hash only equals another empty hash
+	if(!valid || !rhs.valid) return valid == rhs.valid;
+
+	return data == rhs.data;
+}
+
+bool Hash::operator!=(const Hash& rhs) const {
+	return !(*this == rhs);
+}
+
+
+// prints the hash as hex digits; an empty hash prints nothing
+std::ostream& operator<<(std::ostream& str, const Hash& rhs) {
+	if(!rhs.valid) return str;
+
+	// std::hex and the fill char are sticky: restore them so the callers stream is unchanged
+	const std::ios_base::fmtflags oldflags = str.flags();
+	const char oldfill = str.fill('0');
+
+	str << std::hex;
+	for(const unsigned char byte : rhs.data) {
+		str << std::setw(2) << static_cast<int>(byte);
+	}
+
+	str.flags(oldflags);
+	str.fill(oldfill);
+
+	return str;
+}
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..16ef404
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,16 @@
+
+#include "Log.h"
+
+int main(int argc, const char** argv) {
+	Log::init();
+	Log::setConsoleLogLevel(Log::Level::trace);
+	Log::addLogfile("log.txt", Log::Level::trace);
+#if __unix__
+	Log::setColoredOutput(true);
+#endif
+
+	Log::info << "Hello, World!";
+
+	Log::stop();
+	return 0;
+}
diff --git a/tests/hashtest.cpp b/tests/hashtest.cpp
new file mode 100644
index 0000000..5f3ec03
--- /dev/null
+++ b/tests/hashtest.cpp
@@ -0,0 +1,30 @@
+#include "test.h"
+
+#include "hash.h"
+
+TEST(hash) {
+
+	Hash h;
+
+	CMPASSERT(h, false);
+
+	CMPASSERT(Hash::create(h, TESTDATA "random.img"), true);
+	CMPASSERT(h, true);
+
+	// creating into a Hash that already holds a value must fail and keep it
+	CMPASSERT(Hash::create(h, TESTDATA "random.img"), false);
+	CMPASSERT(h, true);
+
+	CMPASSERT(static_cast<std::string>(h), "de05bb13b33f1cc593348d733b84820c77f8f6be89cf417d21d8b2af81d3ebd8");
+	CMPASSERT(h, true);
+
+	CMPASSERT(h, h);
+
+	// a missing file can not be hashed and leaves the hash empty
+	Hash missing;
+	CMPASSERT(Hash::create(missing, TESTDATA "does-not-exist.img"), false);
+	CMPASSERT(missing, false);
+	CMPASSERT(static_cast<std::string>(missing), "");
+	CMPASSERT(missing == h, false);
+
+} TESTEND
diff --git a/tests/main.cpp b/tests/main.cpp
new file mode 100644
index 0000000..e680fee
--- /dev/null
+++ b/tests/main.cpp
@@ -0,0 +1,60 @@
+#include <stdio.h>
+#include "test.h"
+
+#include <chrono>
+#include <exception>
+#include <iostream>
+
+#define RED "\033[1;91m"
+#define GREEN "\033[1;92m"
+#define YELLOW "\033[1;93m"
+#define AQUA "\033[1;36m"
+#define RESET "\033[;1m"
+
+int main(int argc, char** argv) {
+	auto start = std::chrono::high_resolution_clock::now();
+
+	const testdef* startit = &__start_testlist;
+	const testdef* endit = &__stop_testlist;
+	int failcount = 0;
+	int skipcount = 0;
+	int testcount = endit-startit;
+	int testnumber = 0;
+
+	// go through back -> front (tests are inserted in reverse order)
+	// counting an index down avoids forming a pointer in front of the first entry
+	for(int idx = testcount - 1; idx >= 0; --idx) {
+		const testdef* it = startit + idx;
+		printf("\033[1mRunning test: %d/%d " AQUA "%s " RESET, ++testnumber, testcount, it->name);
+
+		// run test
+		int result = TESTFAILED;
+		try {
+			result = (it->testf)();
+		} catch(const std::exception& e) {
+			std::cout << "catched exception: \"" << e.what() << "\" " << std::flush;
+		} catch(...) {
+			// still counted as failed, but do not hide that something was thrown
+			std::cout << "catched unknown exception " << std::flush;
+		}
+
+		if(result == TESTGOOD) {
+			printf(GREEN "succeeded" RESET "!\n");
+		} else if(result == TESTSKIPPED) {
+			printf(YELLOW "skipped" RESET "!\n");
+			skipcount++;
+		} else {
+			printf(RED "failed" RESET "\n");
+			failcount++;
+		}
+	}
+
+	const char* color = (failcount > 0 ? RED : GREEN); // red or green
+	printf("%s%d" RESET "/%d failed (" YELLOW "%d " RESET "skipped)\n", color, failcount, testcount, skipcount);
+
+	auto end = std::chrono::high_resolution_clock::now();
+	std::chrono::duration<double> t = end - start;
+	printf("Testing took: %fms\n", (t.count() * 1000));
+
+	return failcount > 0;
+}
diff --git a/tests/test.h b/tests/test.h
new file mode 100644
index 0000000..ef1456c
--- /dev/null
+++ b/tests/test.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#define TESTFAILED 0
+#define TESTGOOD 1
+#define TESTSKIPPED (-1)
+
+#include <iostream>
+
+#define TESTDATA "./tests/data/"
+
+// very helpful: https://mgalgs.io/2013/05/10/hacking-your-ELF-for-fun-and-profit.html
+
+#define TESTNAME(NAME) test_##NAME
+#define TESTFUNC(NAME) int TESTNAME(NAME)()
+
+#define REGISTERTEST(NAME) static const testdef __test_ ## NAME \
+	__attribute((__section__("testlist"))) \
+	__attribute((__used__)) = { \
+	TESTNAME(NAME), \
+	#NAME, \
+}
+
+#define TEST(NAME) static TESTFUNC(NAME); \
+	REGISTERTEST(NAME); \
+	TESTFUNC(NAME) {
+
+#define TESTEND return TESTGOOD; }
+
+
+// the asserts are wrapped in do { } while(0) so each one behaves like a single statement
+#define ASSERT(BED, ERR) do { if(!(BED)) { std::cout << __FILE__ << ":" << __LINE__ << " " << ERR << ' ' << std::flush; return TESTFAILED; } } while(0)
+#define CMPASSERTE(IS, SHOULD, ERR) do { if( !((IS) == (SHOULD))) { std::cout << __FILE__ << ":" << __LINE__ << " is: \"" << (IS) << "\" should: \"" << (SHOULD) << "\" "<< std::flush; return TESTFAILED; } } while(0)
+#define CMPASSERT(IS, SHOULD) CMPASSERTE(IS, SHOULD, "")
+
+#define SKIPTEST return TESTSKIPPED
+
+typedef int (*test_t)();
+
+struct testdef {
+	test_t testf;
+	const char* name;
+};
+
+// linker generates this <3
+extern struct testdef __start_testlist;
+extern struct testdef __stop_testlist;
diff --git a/thirdparty/Log b/thirdparty/Log
new file mode 160000
index 0000000..027f901
--- /dev/null
+++ b/thirdparty/Log
@@ -0,0 +1 @@
+Subproject commit 027f901dbe1002f40658ccc231997f94bb472bd1