diff --git a/.gitignore b/.gitignore index 6c724f6..304bb2e 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ crawl.conf crawl.txt *.so .idea/ -.vscode/settings.json \ No newline at end of file +.vscode/settings.json +includes/ \ No newline at end of file diff --git a/cpp/crawlerthread.cpp b/cpp/crawlerthread.cpp index c966df8..f7a7d93 100644 --- a/cpp/crawlerthread.cpp +++ b/cpp/crawlerthread.cpp @@ -1,6 +1,7 @@ #include "de_mrbesen_youtubecrawler_CrawlerThread.h" #include +#include #include #include #include @@ -8,6 +9,7 @@ #include + static const std::string YOUTUBEBASE = "https://youtube.com/watch?v="; static const std::regex YOUTUBELINKPATTERN("watch\\?v=([-_a-zA-Z0-9]{11})"); @@ -57,11 +59,11 @@ std::string download(CURL* curl, const std::string& url) { return out.str(); } -JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) { +JNIEXPORT jboolean JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) { // get videoid argument jboolean myfalseval = false; const char* cvideid = env->GetStringUTFChars(videoid, &myfalseval); - std::string svideoid(cvideid); + const std::string svideoid(cvideid); // makes a copy env->ReleaseStringUTFChars(videoid, cvideid); // std::cout << "crawl: " << svideoid << std::endl; @@ -71,27 +73,40 @@ JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv std::string webcontent = download(curl, YOUTUBEBASE + svideoid); if(webcontent.empty()) { std::cout << "webcontent is empty" << std::endl; + return JNI_FALSE; } - jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread"); - jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found" , "Ljava/util/LinkedList;"); - jclass jclass_ll = env->FindClass("java/util/LinkedList"); - jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z"); + jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread"); // class of CrawlerThread + jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found" , "Ljava/util/List;"); // fieldID of CrawlerThread.found + jclass jclass_ll = env->FindClass("java/util/List"); // Linked List Class + jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z"); // add() method of LinkedList + jobject ll_found = env->GetObjectField(that, fid_ctfound); // linked list to store the results to // match regex - auto it = std::sregex_iterator(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN); - auto itend = std::sregex_iterator(); + std::sregex_iterator it = std::sregex_iterator(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN); + const std::sregex_iterator itend = std::sregex_iterator(); + // this set is to find duplicated strings it does not catch everything, because it is only valid during this crawl, but it should filter a few with very little effort + std::set known; + known.insert(svideoid); // do not "find" the same ytid + + env->MonitorEnter(that); // syncronized(this) { for( ; it != itend; ++it) { - std::string ytid = (*it)[1].str(); + const std::string ytid = (*it)[1].str(); // get the String from the first sub-group match - // add to the found list - if(ytid != svideoid) { - //construct java string - jstring jytid = env->NewStringUTF(ytid.c_str()); - jobject ll_found = env->GetObjectField(that, fid_ctfound); - jboolean b = env->CallBooleanMethod(ll_found, mid_add, jytid); + // only if the ytid is not known + if(known.find(ytid) == known.end()) { + // add to the found list + // std::cout << ytid << std::endl; + + jstring jytid = env->NewStringUTF(ytid.c_str()); // create a java string object + jboolean b = env->CallBooleanMethod(ll_found, mid_add, jytid); // call add() on the LinkedList object with the String + + known.insert(ytid); } } + env->MonitorExit(that); // end of syncronized block + + return JNI_TRUE; } diff --git a/src/main/de/mrbesen/youtubecrawler/Crawler.java b/src/main/de/mrbesen/youtubecrawler/Crawler.java index 80bccac..a46846d 100644 --- a/src/main/de/mrbesen/youtubecrawler/Crawler.java +++ b/src/main/de/mrbesen/youtubecrawler/Crawler.java @@ -7,12 +7,7 @@ import java.io.IOException; import java.io.PrintWriter; import java.text.DateFormat; import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.LinkedList; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.Scanner; +import java.util.*; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; @@ -24,8 +19,8 @@ public class Crawler implements Runnable { int requestlimit = 5;//amount of videos to be left in the todo que of a thread until it requests new videos private int idlecount = 5;//amount of idle loops allowed - private LinkedList toSave = new LinkedList<>();//all found ytids, witch need to be analysed - private LinkedList toCrawl = new LinkedList<>();//all videos tu crawl + private Set toSave = new TreeSet<>();//all found ytids, witch need to be analysed + private Set toCrawl = new TreeSet<>();//all videos tu crawl //private LinkedList toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle private List threads;//list of all threads private List requested = new LinkedList<>(); @@ -73,7 +68,7 @@ public class Crawler implements Runnable { //if(! (toCrawl.contains(videoid) || toknown.contains(videoid))) //toknown.add(videoid); if(toCrawl.contains(videoid)) { - ArrayList str = new ArrayList(1); + ArrayList str = new ArrayList<>(1); str.add(videoid); db.storeTemp(str, false); } @@ -92,11 +87,13 @@ public class Crawler implements Runnable { } private void send(CrawlerThread t) { - // listlock.writeLock().lock(); - for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) { - t.todo.add(toCrawl.removeFirst()); + synchronized (toCrawl) { + for (int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) { + String s = toCrawl.stream().findAny().get(); + toCrawl.remove(s); + t.todo.add(s); + } } - // listlock.writeLock().unlock(); t.requested = false; } @@ -156,7 +153,7 @@ public class Crawler implements Runnable { for (CrawlerThread crawlerThread : threads) { String threadname = crawlerThread.thread.getName(); profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1)); - LinkedList[] report = crawlerThread.report(); + List[] report = crawlerThread.report(); crawlcount+= report[0].size(); toSave.addAll(report[0]); crawlerThread.crawled.clear(); @@ -186,17 +183,21 @@ public class Crawler implements Runnable { private void savetodb() { log.info("save " + toSave.size() + " videos to DB."); - while(!toSave.isEmpty()) { - LinkedList videoids = new LinkedList<>(); - for(int i = 0; i < 50 && !toSave.isEmpty(); i++) { - videoids.add(toSave.remove(0)); - } - if(videoids.size() > 0) { - profiler.startSection("getinfo"); - ArrayList