deduplicating messages

This commit is contained in:
mrbesen 2021-04-07 17:52:32 +02:00
parent f167d51840
commit 94f4903786
Signed by: MrBesen
GPG Key ID: 596B2350DCD67504
3 changed files with 34 additions and 2 deletions

View File

@ -36,6 +36,7 @@ int main(int argc, const char** argv) {
Log::info << "Load File: " << argv[i];
rootsearch.addFile(argv[i]);
}
rootsearch.finalize();
time_t end = time(nullptr);
Log::info << rootsearch.getChatCount() << " Chats with " << rootsearch.getMessageCount() << " Messages loaded in " << end-start << "s";

View File

@ -8,6 +8,21 @@
#include <nlohmann/json.hpp>
using json = nlohmann::json;
// Two messages are equal when both their chat id and their message id match;
// the text is not part of the comparison.
bool Message::operator==(const Message& m) const {
    if (chatid != m.chatid) {
        return false;
    }
    return messageid == m.messageid;
}
// Two messages differ when their chat id or their message id differs;
// the text is not part of the comparison.
bool Message::operator!=(const Message& m) const {
    const bool sameChat = (chatid == m.chatid);
    const bool sameMessage = (messageid == m.messageid);
    return !(sameChat && sameMessage);
}
// Orders messages by chat id first and by message id second;
// the text does not take part in the ordering.
bool Message::operator<(const Message& m) const {
    if (chatid != m.chatid) {
        return chatid < m.chatid;
    }
    // Same chat: the message id decides.
    return messageid < m.messageid;
}
Searchflags operator|=(Searchflags& lhs, const Searchflags sf) {
lhs = (Searchflags) ((uint32_t) lhs | (uint32_t) sf);
return lhs;
@ -114,6 +129,14 @@ void Search::addFile(const std::string& file) {
}
}
// Ends a loading phase: appends every message collected in the intermediate
// `deduplicate` set to `msgs`, then empties the set.
//
// Messages are appended in the set's iteration order. Because the set is
// cleared afterwards, messages loaded after this call are no longer compared
// against the ones already stored in `msgs`.
void Search::finalize() {
    // Reserve up front so the bulk append below allocates at most once.
    msgs.reserve(msgs.size() + deduplicate.size());

    // The range insert copies each Message exactly once. A by-value loop
    // variable would copy every element (including its text) a second time.
    msgs.insert(msgs.end(), deduplicate.begin(), deduplicate.end());

    deduplicate.clear();
}
std::list<const Message*> Search::search(std::string text, Searchflags flags) const {
std::list<const Message*> out;
@ -197,7 +220,7 @@ void Search::loadMessages(const json& j, uint64_t chatid) {
if(m.contains("text")) {
std::string text;
readText(m["text"], text);
msgs.push_back({text, chatid, m["id"]});
deduplicate.insert({text, chatid, m["id"]});
} else {
Log::warn << "text less message: " << m;
}

View File

@ -2,6 +2,8 @@
#include <string>
#include <list>
#include <vector>
#include <set>
#include <map>
#include <cstdint>
@ -12,6 +14,10 @@ struct Message {
std::string text;
uint64_t chatid;
uint64_t messageid;
bool operator==(const Message& m) const;
bool operator!=(const Message& m) const;
bool operator<(const Message& m) const;
};
enum class Searchflags {
@ -36,6 +42,7 @@ public:
static Searchflags fromString(const std::string&);
void addFile(const std::string& file);
// Stops adding files and finishes deduplication: the messages collected so far
// are appended to the message store and the intermediate set is emptied.
// May be called more than once, but messages loaded after a call are not
// deduplicated against messages stored by earlier calls.
void finalize();
std::list<const Message*> search(std::string text, Searchflags flags = Searchflags::NONE) const;
const std::string& getChatname(uint64_t id) const;
@ -51,6 +58,7 @@ private:
void loadMessages(const json& j, uint64_t chatid);
std::list<Message> msgs;
std::vector<Message> msgs;
// Intermediate store used while reading files; as a set it keeps at most one
// message per key (Message::operator<), which is what deduplicates them.
std::set<Message> deduplicate;
std::map<uint64_t, std::string> chatnames;
};