cpp crawl implementation

This commit is contained in:
mrbesen 2021-10-25 17:51:46 +02:00
parent ba270f85f6
commit e02d51b72c
Signed by: MrBesen
GPG Key ID: 596B2350DCD67504
6 changed files with 228 additions and 86 deletions

6
.gitignore vendored
View File

@ -3,3 +3,9 @@
.project
.settings/*
.classpath
admins
crawl.conf
crawl.txt
*.so
.idea/
.vscode/settings.json

11
Makefile Normal file
View File

@ -0,0 +1,11 @@
# Build helpers for the native (JNI) part of the crawler.
#
# None of the targets below produces a file named after itself, so they are
# declared phony; otherwise a stray file or directory called "compile" or
# "clean" would make the target look up to date and silently skip it.
.PHONY: createhfiles compile clean

# Generate the JNI header for CrawlerThread from the packaged jar.
createhfiles:
	mkdir -p includes/
	javah -d includes/ -classpath target/YoutubeCrawler-0.0.2-jar-with-dependencies.jar de.mrbesen.youtubecrawler.CrawlerThread

# Build the shared library that CrawlerThread loads via System.loadLibrary.
compile:
	g++ -shared -fPIC -o libcrawlerthread.so cpp/crawlerthread.cpp -Iincludes/ -I/usr/lib/jvm/java-8-openjdk-amd64/include/ -I/usr/lib/jvm/java-8-openjdk-amd64/include/linux/ -lcurl

# Remove the generated headers and the built library.
clean:
	rm -rf includes/ libcrawlerthread.so

97
cpp/crawlerthread.cpp Normal file
View File

@ -0,0 +1,97 @@
#include "de_mrbesen_youtubecrawler_CrawlerThread.h"
#include <iostream>
#include <string>
#include <sstream>
#include <regex>
#include <vector>
#include <curl/curl.h>
// Prefix that crawl() puts in front of a video id to build the page URL it downloads.
static const std::string YOUTUBEBASE = "https://youtube.com/watch?v=";
// Matches "watch?v=<id>" links; capture group 1 is the 11 character id
// (letters, digits, '-' and '_').
static const std::regex YOUTUBELINKPATTERN("watch\\?v=([-_a-zA-Z0-9]{11})");
// curl easy handles created by initLib() and released by deinitLib().
// crawl() picks its handle with curls.at(threadid).
std::vector<CURL*> curls;
/**
 * Write callback in the shape libcurl expects for CURLOPT_WRITEFUNCTION.
 *
 * @param contents pointer to the received chunk (not NUL terminated)
 * @param size     size of one element
 * @param nmemb    number of elements in the chunk
 * @param userp    the std::ostringstream the chunk is appended to
 * @return the number of bytes consumed, i.e. size * nmemb
 */
static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp) {
	const size_t byteCount = size * nmemb;
	std::ostringstream& sink = *static_cast<std::ostringstream*>(userp);
	// build the string with an explicit length so embedded NUL bytes survive
	sink << std::string(static_cast<const char*>(contents), byteCount);
	return byteCount;
}
/**
 * JNI entry point for CrawlerThread.initLib(int).
 *
 * Creates `threadcount` curl easy handles and stores them in `curls`, so that
 * crawl() can pick one by thread id. Handles left over from an earlier call
 * are released first. If a handle cannot be created, every handle created so
 * far is released and an IllegalStateException is raised in the calling Java
 * thread.
 *
 * @param env         JNI environment of the calling thread
 * @param clazz       the CrawlerThread class (unused)
 * @param threadcount number of handles to create; values <= 0 create none
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_initLib(JNIEnv* env, jclass clazz, jint threadcount) {
	// a second initLib() without deinitLib() must not leak the old handles
	for(CURL* old : curls) {
		curl_easy_cleanup(old);
	}
	curls.clear();

	// a negative count converted to size_t would make reserve() throw, and a
	// C++ exception must never propagate through a JNI function
	if(threadcount <= 0) {
		return;
	}

	curls.reserve(static_cast<std::vector<CURL*>::size_type>(threadcount));
	for(jint i = 0; i < threadcount; ++i) {
		CURL* curl = curl_easy_init();
		if(curl == nullptr) {
			// roll back so no half-initialised state is left behind
			for(CURL* created : curls) {
				curl_easy_cleanup(created);
			}
			curls.clear();
			jclass exc = env->FindClass("java/lang/IllegalStateException");
			if(exc != nullptr) {
				env->ThrowNew(exc, "curl_easy_init() failed");
			}
			return;
		}
		// curl_easy_setopt is variadic: this option expects a long, not an int
		curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
		curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
		curls.push_back(curl);
	}
}
/**
 * JNI entry point for CrawlerThread.deinitLib().
 *
 * Releases every curl easy handle stored in `curls` and empties the vector.
 *
 * @param env JNI environment of the calling thread (unused)
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_deinitLib(JNIEnv* env, jclass) {
	// range-for avoids comparing a signed index against the unsigned size()
	for(CURL* curl : curls) {
		curl_easy_cleanup(curl);
	}
	curls.clear();
}
/**
 * Downloads `url` with the given curl easy handle.
 *
 * @param curl easy handle to run the transfer on
 * @param url  address to fetch
 * @return the response body, or an empty string when the transfer failed or
 *         the HTTP status was not 200 (a message is printed in both cases)
 */
std::string download(CURL* curl, const std::string& url) {
	std::ostringstream out;
	curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
	// WriteCallback appends every received chunk to `out`
	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &out);

	const CURLcode res = curl_easy_perform(curl);
	// Look at the transfer result first: when the transfer itself fails there
	// is no HTTP status, and reporting "got 0" would hide the real cause.
	if(res != CURLE_OK) {
		std::cout << "Curl error: " << res << " (" << curl_easy_strerror(res) << ")" << std::endl;
		return "";
	}

	long responsecode = 0; // stays != 200 should the query below fail
	curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &responsecode);
	if(responsecode != 200) {
		std::cout << "Curl error: got " << responsecode << std::endl;
		return "";
	}
	return out.str();
}
/**
 * JNI entry point for CrawlerThread.crawl(String, int).
 *
 * Downloads the watch page of `videoid` with the curl handle at index
 * `threadid`, extracts every video id matched by YOUTUBELINKPATTERN and adds
 * each one that differs from `videoid` to the `found` LinkedList field of the
 * calling CrawlerThread object.
 *
 * The function returns early (leaving any pending Java exception in place)
 * when an argument is unusable, the download yields nothing, or a JNI lookup
 * fails.
 *
 * @param env      JNI environment of the calling thread
 * @param that     the CrawlerThread instance whose `found` list is filled
 * @param videoid  id of the video whose page is downloaded
 * @param threadid index into `curls` selecting the handle to use
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) {
	if(videoid == nullptr) {
		return;
	}
	// copy the id into a std::string so the JVM buffer can be released right away
	const char* cvideid = env->GetStringUTFChars(videoid, nullptr);
	if(cvideid == nullptr) {
		return; // the JVM has already raised an OutOfMemoryError
	}
	const std::string svideoid(cvideid);
	env->ReleaseStringUTFChars(videoid, cvideid);

	// Check the index by hand: curls.at() would throw std::out_of_range and a
	// C++ exception must never propagate through a JNI function.
	if(threadid < 0 || static_cast<std::vector<CURL*>::size_type>(threadid) >= curls.size()) {
		std::cout << "crawl: no curl handle for thread " << threadid << std::endl;
		return;
	}
	CURL* curl = curls[static_cast<std::vector<CURL*>::size_type>(threadid)];

	// use curl to get the website
	const std::string webcontent = download(curl, YOUTUBEBASE + svideoid);
	if(webcontent.empty()) {
		std::cout << "webcontent is empty" << std::endl;
		return; // nothing to match against
	}

	// resolve CrawlerThread.found and LinkedList.add(Object)
	jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread");
	if(jclass_ct == nullptr) {
		return;
	}
	jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found", "Ljava/util/LinkedList;");
	if(fid_ctfound == nullptr) {
		return;
	}
	jclass jclass_ll = env->FindClass("java/util/LinkedList");
	if(jclass_ll == nullptr) {
		return;
	}
	jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z");
	if(mid_add == nullptr) {
		return;
	}

	// match regex
	const std::sregex_iterator itend;
	for(std::sregex_iterator it(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN); it != itend; ++it) {
		const std::string ytid = (*it)[1].str();
		if(ytid == svideoid) {
			continue; // a page linking to itself is not a new find
		}

		// construct java string
		jstring jytid = env->NewStringUTF(ytid.c_str());
		if(jytid == nullptr) {
			return; // OutOfMemoryError pending
		}
		// the field is re-read for every id, exactly as before, so a list
		// swapped in by the Java side in the meantime is picked up
		jobject ll_found = env->GetObjectField(that, fid_ctfound);
		if(ll_found != nullptr) {
			env->CallBooleanMethod(ll_found, mid_add, jytid);
		}
		const bool failed = env->ExceptionCheck();

		// A page contains many links, but local references are only freed
		// automatically when this native call returns and the JNI spec only
		// guarantees room for 16 of them, so drop them per iteration.
		if(ll_found != nullptr) {
			env->DeleteLocalRef(ll_found);
		}
		env->DeleteLocalRef(jytid);

		if(failed) {
			return; // let the Java caller see the exception
		}
	}
}

View File

@ -42,6 +42,7 @@ public class Crawler implements Runnable {
private File crawlfile = new File("crawl.txt");
private Logger log = Logger.getLogger(this.getClass().getName());
private Profiler profiler = new Profiler();
private long lastadminreport = 0;
private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep
@ -99,14 +100,7 @@ public class Crawler implements Runnable {
t.requested = false;
}
@Override
public void run() {
profiler.profilingEnabled = true;
profiler.clearProfiling();
profiler.startSection("root");
profiler.startSection("startup");
profiler.startSection("loadingcrawlfile");
start = System.currentTimeMillis();
private void loadCrawlFile() {
log.info("Try to load crawlfile");
if(crawlfile.exists()) {
try {
@ -136,7 +130,9 @@ public class Crawler implements Runnable {
e.printStackTrace();
}
}
profiler.endStartSection("populateThreads");//loading crawlfile closed
}
private int createThreads() {
//populate threads
int threadcount = 4;
try {
@ -145,13 +141,89 @@ public class Crawler implements Runnable {
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
}
threads = new ArrayList<>(threadcount);
CrawlerThread.initLib(threadcount);
for(int i = 0; i < threadcount; i++) {
CrawlerThread thr = new CrawlerThread( this);
CrawlerThread thr = new CrawlerThread( this, i);
thr.setThread(new Thread(thr, "Crawler #" + i));
threads.add(thr);
thr.thread.start();
}
return threadcount;
}
/**
 * Collects the report of every crawler thread.
 *
 * For each thread, report[0] is counted into crawlcount and queued in toSave,
 * and the thread's crawled list is cleared. report[1] is handed to
 * db.storeTemp in batches of at most 50 ids.
 */
private void getreports() {
	log.info("get report");
	for (CrawlerThread crawlerThread : threads) {
		String threadname = crawlerThread.thread.getName();
		// profiler section is named after the number behind '#' in the thread name
		profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
		LinkedList<String>[] report = crawlerThread.report();
		crawlcount+= report[0].size();
		toSave.addAll(report[0]);
		crawlerThread.crawled.clear();

		int count = 0;
		// original note (translated): 2 videos may get deleted without ever being seen.
		// NOTE(review): with "> 1" a single trailing id is never stored - confirm that is intended.
		while(report[1].size() > 1) {
			ArrayList<String> store = null;
			try {
				if(report[1].size() <= 50) {
					store = new ArrayList<>(report[1]);
					count += report[1].size();
					report[1].clear();
				} else {
					// Copy and drop exactly the first 50 entries. removeAll(store)
					// scanned the whole list for every batch and also removed
					// later duplicates that were never part of this batch.
					java.util.List<String> head = report[1].subList(0, 50);
					store = new ArrayList<>(head);
					head.clear();
					count+=50;
				}
			} catch(NoSuchElementException ignored) {//concurrent modification trouble
				log.info("no suchelement bla");
			}
			// store stays null when the copy above failed; there is nothing to hand over then
			if(store != null) {
				db.storeTemp(store, false);
			}
		}
		log.info(count + " videos added from " + threadname);
		profiler.endSection();
	}
}
/**
 * Drains toSave in batches of at most 50 ids: each batch is resolved through
 * api.getInfos and the first element of its result is passed to db.addVideos.
 */
private void savetodb() {
	log.info("save " + toSave.size() + " videos to DB.");
	while(!toSave.isEmpty()) {
		// take up to 50 ids from the front of the queue
		LinkedList<String> batch = new LinkedList<>();
		int taken = 0;
		while(taken < 50 && !toSave.isEmpty()) {
			batch.add(toSave.remove(0));
			taken++;
		}
		if(batch.isEmpty()) {
			continue;
		}

		profiler.startSection("getinfo");
		ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(batch)[0];
		profiler.endStartSection("sendtoDB");
		db.addVideos(videos, false);
		profiler.endSection();//sendtoDB
	}
}
/**
 * Broadcasts the crawl rate (videos per second) and the total crawl count to
 * the admins, at most once per hour (3600 seconds since lastadminreport).
 */
private void sendAdminMessage() {
	long currenttime = System.currentTimeMillis();
	if((currenttime - lastadminreport) / 1000 > 3600) {
		long runtimes = (currenttime - start) / 1000;
		// lastadminreport starts at 0, so the first report can fire within the
		// first second of running; clamp 0 as well, or the division below
		// yields NaN/Infinity
		if (runtimes <= 0)
			runtimes = 1;
		float vidps = (crawlcount / (float) runtimes);//videos per second
		Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
		lastadminreport = currenttime;
	}
}
@Override
public void run() {
profiler.profilingEnabled = true;
profiler.clearProfiling();
profiler.startSection("root");
profiler.startSection("startup");
profiler.startSection("loadingcrawlfile");
start = System.currentTimeMillis();
loadCrawlFile();
profiler.endStartSection("populateThreads");//loading crawlfile closed
int threadcount = createThreads();
profiler.endStartSection("deleteDouble");//populate threads
long lastdoubledelete = System.currentTimeMillis();
//db.deleteDouble();
@ -189,7 +261,6 @@ public class Crawler implements Runnable {
break;
}
}
// updateDB();
}
}
//nothing left?
@ -197,27 +268,6 @@ public class Crawler implements Runnable {
log.warn("nothing left to crawl");
}
//refil the tocrawl list.
/*if(!toknown.isEmpty()) {
//check in db for known videos
log.info("Checking the DB");
currentstate = "get new tocrawl";
// listlock.writeLock().lock();
while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) {
LinkedList<String> tocheck = new LinkedList<>();
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
tocheck.add(toknown.removeFirst());
}
toCrawl.addAll(db.checkvideos(tocheck));
}
// listlock.writeLock().unlock();
}
while(toknown.size() < threadcount * jobspeerthread * 20 && crawl) {
currentstate = "restoretemp";
log.info("restoreTemp");
LinkedList<String> rest = db.restoreTemp();
toknown.addAll(rest);
}*/
{
profiler.endStartSection("loadCrawl");
boolean joined = true;
@ -246,60 +296,14 @@ public class Crawler implements Runnable {
//get reports
profiler.endStartSection("getreport");
log.info("get report");
for (CrawlerThread crawlerThread : threads) {
String threadname = crawlerThread.thread.getName();
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
LinkedList<String>[] report = crawlerThread.report();
crawlcount+= report[0].size();
toSave.addAll(report[0]);
crawlerThread.crawled.clear();
int count = 0;
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
ArrayList<String> store = null;
try {
if(report[1].size() <= 50) {
store = new ArrayList<>(report[1]);
count += report[1].size();
report[1].clear();
} else {
store = new ArrayList<>(report[1].subList(0, 50));
report[1].removeAll(store);
count+=50;
}
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
log.info("no suchelement bla");
}
db.storeTemp(store, false);
}
log.info(count + " videos added from " + threadname);
profiler.endSection();
}
getreports();
profiler.endStartSection("debug");
long runtimes = (System.currentTimeMillis() - start) / 1000;
if(runtimes < 0)
runtimes = 1;
float vidps = (crawlcount / (float) runtimes);//videos per second
Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
sendAdminMessage();
//save to db
profiler.endStartSection("save2DB");
log.info("save " + toSave.size() + " videos to DB.");
while(!toSave.isEmpty()) {
LinkedList<String> videoids = new LinkedList<>();
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
videoids.add(toSave.remove(0));
}
if(videoids.size() > 0) {
profiler.startSection("getinfo");
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
profiler.endStartSection("sendtoDB");
db.addVideos(videos, false);
profiler.endSection();//sendtoDB
}
}
savetodb();
profiler.endSection();//save2DB
//at the beginning there is maybe just one video to crawl, so keep it calm.
@ -326,6 +330,9 @@ public class Crawler implements Runnable {
}
profiler.endSection();//main
}
profiler.startSection("waitforthreads");
for(CrawlerThread ct : threads) {
try {
@ -355,6 +362,7 @@ public class Crawler implements Runnable {
int runtimem = (int) (runtimes / 60);
float vidps = (crawlcount / (float) runtimes);//videos per second
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
CrawlerThread.deinitLib();
Main.getMain().stopcallback();
}

View File

@ -21,10 +21,22 @@ public class CrawlerThread implements Runnable {
boolean requested = true;//is a request pending?
boolean lockforreport = false;
public CrawlerThread( Crawler root) {
private int threadid;
// Loads the native library that implements initLib, deinitLib and crawl.
static {
	// Kept for compatibility: append the working directory to java.library.path.
	// The JVM normally caches its library search path long before this class is
	// initialised, so this alone usually has no effect on loadLibrary().
	String libpath = System.getProperty("java.library.path");
	libpath += java.io.File.pathSeparator + "./";
	System.setProperty("java.library.path", libpath);
	try {
		System.loadLibrary("crawlerthread");
	} catch(UnsatisfiedLinkError notOnLibraryPath) {
		// Fall back to the library in the working directory. System.load needs
		// an absolute path and the platform specific file name.
		System.load(new java.io.File(System.mapLibraryName("crawlerthread")).getAbsolutePath());
	}
}
public static native void initLib(int threadCount);
public static native void deinitLib();
/**
 * @param root     the crawler this thread reports to and requests work from
 * @param threadid id passed to the native crawl() call on every crawl
 */
public CrawlerThread( Crawler root, int threadid) {
	parent = root;
	// set every field before handing `this` to the crawler, so it never sees
	// a half-initialised thread object
	this.threadid = threadid;
	root.request(this);
}
void setThread(Thread t) {
@ -50,13 +62,16 @@ public class CrawlerThread implements Runnable {
lockforreport = false;
}
}
crawl(todo.removeFirst());
String vid = todo.removeFirst();
// System.out.println("crawling: " + vid + " size: " + found.size());
crawled.add(vid);
crawl(vid, threadid);
if(todo.size() < parent.requestlimit && !requested) {
requested = true;
parent.request(this);
}
}
if(todo.isEmpty() && !requested) {
requested = true;
parent.request(this);
@ -85,6 +100,7 @@ public class CrawlerThread implements Runnable {
return out;
}
/*
private void crawl(String videoid) {
try {
crawled.add(videoid);
@ -103,4 +119,7 @@ public class CrawlerThread implements Runnable {
e.printStackTrace();
}
}
*/
private native void crawl(String videid, int threadid);
}

View File

@ -308,6 +308,7 @@ public class DB implements Runnable {
if(con != null) {
if(!con.isClosed()) {
addVideos(null, true);
con.commit();
con.close();
}
}