diff --git a/.gitignore b/.gitignore index acc0a9f..6c724f6 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,9 @@ .project .settings/* .classpath +admins +crawl.conf +crawl.txt +*.so +.idea/ +.vscode/settings.json \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e11d885 --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ + + +createhfiles: + mkdir -p includes/ + javah -d includes/ -classpath target/YoutubeCrawler-0.0.2-jar-with-dependencies.jar de.mrbesen.youtubecrawler.CrawlerThread + +compile: + g++ -shared -fPIC -o libcrawlerthread.so cpp/crawlerthread.cpp -Iincludes/ -I/usr/lib/jvm/java-8-openjdk-amd64/include/ -I/usr/lib/jvm/java-8-openjdk-amd64/include/linux/ -lcurl + +clean: + rm -rf includes/ libcrawlerthread.so \ No newline at end of file diff --git a/cpp/crawlerthread.cpp b/cpp/crawlerthread.cpp new file mode 100644 index 0000000..c966df8 --- /dev/null +++ b/cpp/crawlerthread.cpp @@ -0,0 +1,97 @@ +#include "de_mrbesen_youtubecrawler_CrawlerThread.h" + +#include +#include +#include +#include +#include + +#include + +static const std::string YOUTUBEBASE = "https://youtube.com/watch?v="; +static const std::regex YOUTUBELINKPATTERN("watch\\?v=([-_a-zA-Z0-9]{11})"); + +std::vector curls; + +static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp) { + *((std::ostringstream*) userp) << std::string((char*) contents, size * nmemb); + return size * nmemb; +} + +JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_initLib(JNIEnv* env, jclass clazz, jint threadcount) { + curls.reserve(threadcount); + for(int i = 0; i < threadcount; ++i) { + CURL* curl = curl_easy_init(); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback); + curls.push_back(curl); + } +} + +JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_deinitLib(JNIEnv* env, jclass) { + for(int i = 0; i < curls.size(); ++i) { + CURL* curl = curls.at(i); + curl_easy_cleanup(curl); + } + curls.clear(); +} + +std::string download(CURL* curl, const std::string& url) { + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + + std::ostringstream out; + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &out); + + CURLcode res = curl_easy_perform(curl); + long responsecode = 404; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &responsecode); + if(responsecode != 200) { + std::cout << "Curl error: got " << responsecode << std::endl; + return ""; + } + if(res != CURLE_OK) { + std::cout << "Curl error: " << res << std::endl; + return ""; + } + + return out.str(); +} + +JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) { + // get videoid argument + jboolean myfalseval = false; + const char* cvideid = env->GetStringUTFChars(videoid, &myfalseval); + std::string svideoid(cvideid); + env->ReleaseStringUTFChars(videoid, cvideid); + + // std::cout << "crawl: " << svideoid << std::endl; + + // use curl to get the website + CURL* curl = curls.at(threadid); + std::string webcontent = download(curl, YOUTUBEBASE + svideoid); + if(webcontent.empty()) { + std::cout << "webcontent is empty" << std::endl; + } + + jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread"); + jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found" , "Ljava/util/LinkedList;"); + jclass jclass_ll = env->FindClass("java/util/LinkedList"); + jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z"); + + // match regex + auto it = std::sregex_iterator(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN); + auto itend = std::sregex_iterator(); + + for( ; it != itend; ++it) { + std::string ytid = (*it)[1].str(); + + // add to the found list + if(ytid != svideoid) { + //construct java string + jstring jytid = env->NewStringUTF(ytid.c_str()); + jobject ll_found = env->GetObjectField(that, fid_ctfound); + jboolean b = env->CallBooleanMethod(ll_found, mid_add, jytid); + } + } +} + diff --git a/src/main/de/mrbesen/youtubecrawler/Crawler.java b/src/main/de/mrbesen/youtubecrawler/Crawler.java index 3ead8bd..80bccac 100644 --- a/src/main/de/mrbesen/youtubecrawler/Crawler.java +++ b/src/main/de/mrbesen/youtubecrawler/Crawler.java @@ -42,6 +42,7 @@ public class Crawler implements Runnable { private File crawlfile = new File("crawl.txt"); private Logger log = Logger.getLogger(this.getClass().getName()); private Profiler profiler = new Profiler(); + private long lastadminreport = 0; private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep @@ -99,14 +100,7 @@ public class Crawler implements Runnable { t.requested = false; } - @Override - public void run() { - profiler.profilingEnabled = true; - profiler.clearProfiling(); - profiler.startSection("root"); - profiler.startSection("startup"); - profiler.startSection("loadingcrawlfile"); - start = System.currentTimeMillis(); + private void loadCrawlFile() { log.info("Try to load crawlfile"); if(crawlfile.exists()) { try { @@ -136,7 +130,9 @@ public class Crawler implements Runnable { e.printStackTrace(); } } - profiler.endStartSection("populateThreads");//loading crawlfile closed + } + + private int createThreads() { //populate threads int threadcount = 4; try { @@ -145,13 +141,89 @@ public class Crawler implements Runnable { log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config."); } threads = new ArrayList<>(threadcount); - + CrawlerThread.initLib(threadcount); for(int i = 0; i < threadcount; i++) { - CrawlerThread thr = new CrawlerThread( this); + CrawlerThread thr = new CrawlerThread( this, i); thr.setThread(new Thread(thr, "Crawler #" + i)); threads.add(thr); thr.thread.start(); } + return threadcount; + } + + private void getreports() { + log.info("get report"); + for (CrawlerThread crawlerThread : threads) { + String threadname = crawlerThread.thread.getName(); + profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1)); + LinkedList[] report = crawlerThread.report(); + crawlcount+= report[0].size(); + toSave.addAll(report[0]); + crawlerThread.crawled.clear(); + + int count = 0; + while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden. + ArrayList store = null; + try { + if(report[1].size() <= 50) { + store = new ArrayList<>(report[1]); + count += report[1].size(); + report[1].clear(); + } else { + store = new ArrayList<>(report[1].subList(0, 50)); + report[1].removeAll(store); + count+=50; + } + } catch(NoSuchElementException ignored) {//concurrentmodification fuckery + log.info("no suchelement bla"); + } + db.storeTemp(store, false); + } + log.info(count + " videos added from " + threadname); + profiler.endSection(); + } + } + + private void savetodb() { + log.info("save " + toSave.size() + " videos to DB."); + while(!toSave.isEmpty()) { + LinkedList videoids = new LinkedList<>(); + for(int i = 0; i < 50 && !toSave.isEmpty(); i++) { + videoids.add(toSave.remove(0)); + } + if(videoids.size() > 0) { + profiler.startSection("getinfo"); + ArrayList