dedup/src/dedup.cpp

#include "dedup.h"
#include <algorithm>
#include <cerrno>
#include <chrono>
#include <cstring>
#include <iostream>
#include <iterator>
#include <mutex>
#include <numeric>
#include <sys/stat.h>
#include <thread>
#include <unistd.h>
#include <Log.h>
#include "progressbar.h"
#include "hasherthread.h"
const uint_fast8_t Dedup::HASHTHREADCOUNT = 4;
Dedup::Dedup() {
}
Dedup::~Dedup() {
}
void Dedup::addSearchFolder(const std::string& path, bool recurse) {
    folders.emplace_back(path, recurse);
}
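// Returns the Files bucket for the given file size, inserting an empty bucket if none exists yet.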
Files& getFiles(uint64_t fs, std::map<uint64_t, Files>& m) {
    auto it = m.find(fs);
    if(it == m.end()) {
        auto pair = m.insert({fs, {}});
        return pair.first->second;
    }
    return it->second;
}
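// Runs the whole deduplication pass: index the configured folders, drop files that cannot
// have duplicates, hash the remaining candidates and resolve the duplicate groups.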
void Dedup::start() {
    FileIndexer::map_t foundfiles;
    for(const SearchFolder& sf : folders) {
        foundfiles.merge(indexer.index(sf));
    }
    uint64_t accFileSize = std::accumulate(foundfiles.begin(), foundfiles.end(), uint64_t(0), [](uint64_t val, const auto& r){ return val + r.second->filesize; });
    Log::info << "indexing done: " << foundfiles.size() << " unique files found with a total file size of " << FileSize(accFileSize);

    // remove uninteresting files
    removeUninterestingFiles(foundfiles);

    // hashing
    uint64_t bytesToHash = std::accumulate(foundfiles.begin(), foundfiles.end(), uint64_t(0), [](uint64_t c, const auto& it) { return c + it.second->filesize; });
    Log::info << foundfiles.size() << " files totaling " << FileSize(bytesToHash) << " are going to be hashed";

    // hash files
    std::map<uint64_t, Files> filesizes = hash(foundfiles, bytesToHash);

    // resolve
    resolveDuplicates(filesizes);
    Log::info << "done. Deduplicated: " << deduplicatedFiles << " files, saving: " << FileSize(deduplicatedBytes);
}
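// Drops files that can never have a duplicate: empty files and files whose size is unique in the index.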
void Dedup::removeUninterestingFiles(std::multimap<uint64_t, std::shared_ptr<File>>& files) {
    // remove all files with a size of 0
    files.erase(uint64_t(0));

    // remove all files that are the only one with their size (there can't be any duplicate, so there is no need to hash them)
    for(auto it = files.begin(); it != files.end(); ) {
        auto upper = files.upper_bound(it->first);
        auto next = it;
        ++next;
        // only one file with this filesize
        if(next == upper) {
            files.erase(it);
        }
        it = upper;
    }
}
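// Hashes every candidate file and groups the results by file size.
// The multimap is split into HASHTHREADCOUNT contiguous ranges, each processed by its own HasherThread.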
std::map<uint64_t, Files> Dedup::hash(std::multimap<uint64_t, std::shared_ptr<File>>& foundfiles, uint64_t bytesToHash) {
    std::map<uint64_t, Files> filesizes;
    ProgressBar progressBar(foundfiles.size(), bytesToHash);
    uint64_t stepsize = foundfiles.size() / HASHTHREADCOUNT;
    ProgressBar* bpptr = &progressBar;
    std::map<uint64_t, Files>* filesizesptr = &filesizes;
    std::mutex globalData;
    std::vector<HasherThread*> threads;
    std::vector<std::multimap<uint64_t, std::shared_ptr<File>>::iterator> threadit;
    threads.reserve(HASHTHREADCOUNT);
    threadit.reserve(HASHTHREADCOUNT + 1);

    // compute the range boundaries, one range per hashing thread
    Log::info << "spawning " << (int) HASHTHREADCOUNT << " hashing threads";
    threadit.push_back(foundfiles.begin());
    for(uint_fast8_t i = 0; i < HASHTHREADCOUNT; ++i) {
        // make a copy! (advance a copy of the previous boundary, not the stored iterator itself)
        std::multimap<uint64_t, std::shared_ptr<File>>::iterator it = threadit.at(i);
        std::advance(it, stepsize);
        threadit.push_back(it);
    }
    // make sure the last boundary really is the end of the map
    threadit.at(HASHTHREADCOUNT) = foundfiles.end();
    for(uint_fast8_t i = 0; i < HASHTHREADCOUNT; ++i) {
        std::multimap<uint64_t, std::shared_ptr<File>>::iterator* itptr = &(threadit.at(i));
        std::multimap<uint64_t, std::shared_ptr<File>>::iterator* endptr = &(threadit.at(i+1));
        threads.push_back(new HasherThread(
            globalData,
            // producer: hands the next file of this thread's range to the hasher, nullptr once the range is exhausted
            [itptr, endptr]() -> std::shared_ptr<File> {
                if(*itptr != *endptr) {
                    auto copy = *itptr;
                    ++(*itptr);
                    return copy->second;
                }
                return nullptr;
            },
            // consumer: records the finished hash in its file size bucket and updates the progress bar
            [this, bpptr, filesizesptr](Hash&& h, std::shared_ptr<File> f) -> void {
                bpptr->update(1, f->filesize);
                Files& files = getFiles(f->filesize, *filesizesptr);
                files.addNewFile(f, std::move(h));
                std::cout << *bpptr;
            }
        ));
    }
    // start all threads
    std::for_each(threads.begin(), threads.end(), [](HasherThread* ht){
        ht->start();
    });
    // wait for all threads
    std::for_each(threads.begin(), threads.end(), [](HasherThread* ht){
        ht->waitfor();
    });
    // free the finished threads
    std::for_each(threads.begin(), threads.end(), [](HasherThread* ht){
        delete ht;
    });
    std::cout << "\n";
    Log::info << "Hashing done after: " << (int) (progressBar.getDuration().count() / 1000) << "s";
    Log::info << filesizes.size() << " distinct file sizes hashed";
    return filesizes;
}
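// Walks every file size bucket, logs the duplicate groups and replaces every copy after the
// first ("base") file with a link to it via relinkFile().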
void Dedup::resolveDuplicates(std::map<uint64_t, Files>& hashed) {
    for(auto it = hashed.begin(); it != hashed.end(); ++it) {
        if(!it->second.hasDuplicates()) continue;
        Log::info << it->second.getFileCount() << " files with a size of " << FileSize(it->first) << " found";
        std::vector<MergeFile>& merges = it->second.getFiles();
        for(MergeFile& mf : merges) {
            mf.updateMainFile();
            auto files = mf.getFiles();
            if(files.size() == 1) continue;
            Log::info << "\t" << mf.getHash() << " found at " << files.size() << " places:";
            for(uint32_t i = 0; i < files.size(); ++i) {
                const std::shared_ptr<File>& f = files.at(i);
                Log::note << "\t\t" << f->linkcount << ": " << f->path;
                // skip the first "base" file and don't link empty files
                if(i > 0 && it->first > 0) {
                    if(!relinkFile(files.at(0)->path, f->path)) {
                        break;
                    }
                    deduplicatedBytes += f->filesize;
                    ++deduplicatedFiles;
                }
            }
        }
    }
}
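// Replaces 'replacedfile' with a hard link to 'linkbase'.
// In dry-run mode (or when hard links are disabled) the action is only logged.
// Returns false if unlinking or linking fails.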
bool Dedup::relinkFile(const std::string& linkbase, const std::string& replacedfile) {
    // symlinks are not implemented
    if(dryrun || !hardlink) {
        Log::note << "delete " << replacedfile;
        Log::note << "link " << linkbase << " -> " << replacedfile;
    } else {
        int res = ::unlink(replacedfile.c_str());
        if(res != 0) {
            Log::error << "unlink(" << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
            return false;
        }
        res = ::link(linkbase.c_str(), replacedfile.c_str());
        if(res != 0) {
            Log::error << "link(" << linkbase << ", " << replacedfile << ") failed: " << strerror(errno) << " (" << errno << ')';
            // TODO try to symlink?
            return false;
        }
    }
    return true;
}