From 94f4903786d2c9efcb60dec9792c9b959b8248dd Mon Sep 17 00:00:00 2001 From: mrbesen Date: Wed, 7 Apr 2021 17:52:32 +0200 Subject: [PATCH] deduplicating messages --- src/main.cpp | 1 + src/search.cpp | 25 ++++++++++++++++++++++++- src/search.h | 10 +++++++++- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 16c2524..5d1241c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -36,6 +36,7 @@ int main(int argc, const char** argv) { Log::info << "Load File: " << argv[i]; rootsearch.addFile(argv[i]); } + rootsearch.finalize(); time_t end = time(nullptr); Log::info << rootsearch.getChatCount() << " Chats with " << rootsearch.getMessageCount() << " Messages loaded in " << end-start << "s"; diff --git a/src/search.cpp b/src/search.cpp index 9cebf5b..f8e697b 100644 --- a/src/search.cpp +++ b/src/search.cpp @@ -8,6 +8,21 @@ #include using json = nlohmann::json; +bool Message::operator==(const Message& m) const { + return (m.chatid == chatid) && (m.messageid == messageid); +} + +bool Message::operator!=(const Message& m) const { + return (m.chatid != chatid) || (m.messageid != messageid); +} + +bool Message::operator<(const Message& m) const { + if (chatid < m.chatid) return true; + if (chatid > m.chatid) return false; + // chatid == m.chatid + return (messageid < m.messageid); +} + Searchflags operator|=(Searchflags& lhs, const Searchflags sf) { lhs = (Searchflags) ((uint32_t) lhs | (uint32_t) sf); return lhs; @@ -114,6 +129,14 @@ void Search::addFile(const std::string& file) { } } +void Search::finalize() { + msgs.reserve(msgs.size() + deduplicate.size()); + for(const auto it : deduplicate) { + msgs.push_back(it); + } + deduplicate.clear(); +} + std::list Search::search(std::string text, Searchflags flags) const { std::list out; @@ -197,7 +220,7 @@ void Search::loadMessages(const json& j, uint64_t chatid) { if(m.contains("text")) { std::string text; readText(m["text"], text); - msgs.push_back({text, chatid, m["id"]}); + deduplicate.insert({text, chatid, m["id"]}); } else { Log::warn << "text less message: " << m; } diff --git a/src/search.h b/src/search.h index f14008b..564179c 100644 --- a/src/search.h +++ b/src/search.h @@ -2,6 +2,8 @@ #include #include +#include +#include #include #include @@ -12,6 +14,10 @@ struct Message { std::string text; uint64_t chatid; uint64_t messageid; + + bool operator==(const Message& m) const; + bool operator!=(const Message& m) const; + bool operator<(const Message& m) const; }; enum class Searchflags { @@ -36,6 +42,7 @@ public: static Searchflags fromString(const std::string&); void addFile(const std::string& file); + void finalize(); //stop adding files and finalize deduplication, could be called twice, but then a deduplication is not guaranteed std::list search(std::string text, Searchflags flags = Searchflags::NONE) const; const std::string& getChatname(uint64_t id) const; @@ -51,6 +58,7 @@ private: void loadMessages(const json& j, uint64_t chatid); - std::list msgs; + std::vector msgs; + std::set deduplicate; //intermediate store, for reading files and deduplicate them std::map chatnames; }; \ No newline at end of file