cpp crawl implementation
This commit is contained in:
parent
ba270f85f6
commit
e02d51b72c
|
@ -3,3 +3,9 @@
|
||||||
.project
|
.project
|
||||||
.settings/*
|
.settings/*
|
||||||
.classpath
|
.classpath
|
||||||
|
admins
|
||||||
|
crawl.conf
|
||||||
|
crawl.txt
|
||||||
|
*.so
|
||||||
|
.idea/
|
||||||
|
.vscode/settings.json
|
|
@ -0,0 +1,11 @@
|
||||||
|
|
||||||
|
|
||||||
|
# JDK whose include/ directory provides jni.h. Override if the JDK lives
# elsewhere, e.g. `make compile JDK_HOME=/opt/jdk8`.
JDK_HOME ?= /usr/lib/jvm/java-8-openjdk-amd64

# None of these targets produce a file named after themselves.
.PHONY: createhfiles compile clean

# Generate the JNI header for CrawlerThread from the already built jar.
createhfiles:
	mkdir -p includes/
	javah -d includes/ -classpath target/YoutubeCrawler-0.0.2-jar-with-dependencies.jar de.mrbesen.youtubecrawler.CrawlerThread

# Build the native library that CrawlerThread loads ("crawlerthread").
compile:
	g++ -shared -fPIC -o libcrawlerthread.so cpp/crawlerthread.cpp -Iincludes/ -I$(JDK_HOME)/include/ -I$(JDK_HOME)/include/linux/ -lcurl

# Remove the generated headers and the native library.
clean:
	rm -rf includes/ libcrawlerthread.so
|
|
@ -0,0 +1,97 @@
|
||||||
|
#include "de_mrbesen_youtubecrawler_CrawlerThread.h"
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <sstream>
|
||||||
|
#include <regex>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <curl/curl.h>
|
||||||
|
|
||||||
|
static const std::string YOUTUBEBASE = "https://youtube.com/watch?v=";
|
||||||
|
static const std::regex YOUTUBELINKPATTERN("watch\\?v=([-_a-zA-Z0-9]{11})");
|
||||||
|
|
||||||
|
std::vector<CURL*> curls;
|
||||||
|
|
||||||
|
/**
 * libcurl write callback: appends one received chunk to the output stream.
 *
 * @param contents pointer to the received bytes (length is given explicitly)
 * @param size     size of one element in bytes
 * @param nmemb    number of elements in this chunk
 * @param userp    the std::ostringstream* that was set as CURLOPT_WRITEDATA
 * @return the number of bytes taken, always size * nmemb
 */
static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp) {
	const size_t total = size * nmemb;
	// write() copies directly from the receive buffer, so no temporary
	// std::string has to be allocated for every chunk
	static_cast<std::ostringstream*>(userp)->write(static_cast<const char*>(contents), static_cast<std::streamsize>(total));
	return total;
}
|
||||||
|
|
||||||
|
/**
 * JNI entry point for CrawlerThread.initLib(int): creates one curl easy handle
 * per crawler thread and stores it in `curls`, indexed by thread id.
 *
 * Handles left over from an earlier call are released first. A non-positive
 * threadcount leaves `curls` empty. If a handle cannot be created, all handles
 * created so far are released and a java.lang.IllegalStateException is raised
 * in the calling Java thread.
 *
 * @param env         JNI environment of the calling thread
 * @param clazz       the CrawlerThread class (unused)
 * @param threadcount number of handles to create
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_initLib(JNIEnv* env, jclass clazz, jint threadcount) {
	// a second initLib() must not leak the handles of the first one
	for(CURL* old : curls) {
		curl_easy_cleanup(old);
	}
	curls.clear();

	if(threadcount <= 0) {
		return; // a negative count would become a huge size_t inside reserve()
	}

	curls.reserve(static_cast<size_t>(threadcount));
	for(jint i = 0; i < threadcount; ++i) {
		CURL* curl = curl_easy_init();
		if(curl == nullptr) {
			// roll back so that no half-initialised state stays behind
			for(CURL* created : curls) {
				curl_easy_cleanup(created);
			}
			curls.clear();
			jclass exc = env->FindClass("java/lang/IllegalStateException");
			if(exc != nullptr) {
				env->ThrowNew(exc, "curl_easy_init() failed");
			}
			return;
		}
		// curl_easy_setopt is variadic; this option is read as a long
		curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
		curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
		curls.push_back(curl);
	}
}
|
||||||
|
|
||||||
|
/**
 * JNI entry point for CrawlerThread.deinitLib(): releases every curl easy
 * handle stored in `curls` and empties the vector.
 *
 * @param env JNI environment of the calling thread (unused)
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_deinitLib(JNIEnv* env, jclass) {
	// range-for avoids comparing a signed index against the unsigned size()
	for(CURL* curl : curls) {
		curl_easy_cleanup(curl);
	}
	curls.clear();
}
|
||||||
|
|
||||||
|
/**
 * Fetches `url` with the given curl handle and returns the response body.
 *
 * @param curl easy handle to use for the transfer
 * @param url  address to download
 * @return the body on a successful transfer with HTTP status 200, otherwise
 *         an empty string (the reason is printed to stdout)
 */
std::string download(CURL* curl, const std::string& url) {
	std::ostringstream out;
	curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &out);

	const CURLcode res = curl_easy_perform(curl);

	// Look at the transfer result first: when the transfer itself failed the
	// HTTP status carries no useful information and would hide the real cause.
	if(res != CURLE_OK) {
		std::cout << "Curl error: " << res << " (" << curl_easy_strerror(res) << ")" << std::endl;
		return "";
	}

	long responsecode = 0;
	curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &responsecode);
	if(responsecode != 200) {
		std::cout << "Curl error: got " << responsecode << std::endl;
		return "";
	}

	return out.str();
}
|
||||||
|
|
||||||
|
/**
 * JNI entry point for CrawlerThread.crawl(String, int): downloads the watch
 * page of `videoid` and adds every other video id matched by
 * YOUTUBELINKPATTERN to the `found` LinkedList field of `that`.
 *
 * Returns without adding anything when the arguments are unusable, the
 * download yields no content or a JNI lookup fails; in the JNI failure cases
 * the pending Java exception is left for the caller.
 *
 * @param env      JNI environment of the calling thread
 * @param that     the CrawlerThread instance
 * @param videoid  id of the video whose page is downloaded
 * @param threadid index into `curls` selecting the handle to use
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) {
	// get videoid argument
	if(videoid == nullptr) {
		return;
	}
	const char* cvideoid = env->GetStringUTFChars(videoid, nullptr);
	if(cvideoid == nullptr) {
		return; // allocation failed, an exception is already pending
	}
	const std::string svideoid(cvideoid);
	env->ReleaseStringUTFChars(videoid, cvideoid);

	// a C++ exception from vector::at() must never cross the JNI boundary
	if(threadid < 0 || static_cast<size_t>(threadid) >= curls.size()) {
		std::cout << "crawl: invalid thread id " << threadid << std::endl;
		return;
	}

	// use curl to get the website
	const std::string webcontent = download(curls[static_cast<size_t>(threadid)], YOUTUBEBASE + svideoid);
	if(webcontent.empty()) {
		std::cout << "webcontent is empty" << std::endl;
		return; // nothing to match, skip the JNI lookups
	}

	// Each lookup is checked before the next JNI call, because most JNI
	// functions must not be called while an exception is pending.
	jclass jclass_ct = env->GetObjectClass(that);
	jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found", "Ljava/util/LinkedList;");
	if(fid_ctfound == nullptr) {
		return;
	}
	jclass jclass_ll = env->FindClass("java/util/LinkedList");
	if(jclass_ll == nullptr) {
		return;
	}
	jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z");
	if(mid_add == nullptr) {
		return;
	}

	// match regex
	const std::sregex_iterator itend;
	for(std::sregex_iterator it(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN); it != itend; ++it) {
		const std::string ytid = (*it)[1].str();
		if(ytid == svideoid) {
			continue; // the page links to itself
		}

		// construct java string
		jstring jytid = env->NewStringUTF(ytid.c_str());
		if(jytid == nullptr) {
			return; // exception pending
		}

		// the field is re-read for every id, as before, so a list swapped in
		// by the Java side is picked up
		jobject ll_found = env->GetObjectField(that, fid_ctfound);
		if(ll_found != nullptr) {
			env->CallBooleanMethod(ll_found, mid_add, jytid);
			env->DeleteLocalRef(ll_found);
		}
		// one page can contain many matches; free the local references now
		// instead of piling them up until the native method returns
		env->DeleteLocalRef(jytid);

		if(env->ExceptionCheck()) {
			return; // let the Java caller see the exception
		}
	}
}
|
||||||
|
|
|
@ -42,6 +42,7 @@ public class Crawler implements Runnable {
|
||||||
private File crawlfile = new File("crawl.txt");
|
private File crawlfile = new File("crawl.txt");
|
||||||
private Logger log = Logger.getLogger(this.getClass().getName());
|
private Logger log = Logger.getLogger(this.getClass().getName());
|
||||||
private Profiler profiler = new Profiler();
|
private Profiler profiler = new Profiler();
|
||||||
|
private long lastadminreport = 0;
|
||||||
|
|
||||||
private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep
|
private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep
|
||||||
|
|
||||||
|
@ -99,14 +100,7 @@ public class Crawler implements Runnable {
|
||||||
t.requested = false;
|
t.requested = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
private void loadCrawlFile() {
|
||||||
public void run() {
|
|
||||||
profiler.profilingEnabled = true;
|
|
||||||
profiler.clearProfiling();
|
|
||||||
profiler.startSection("root");
|
|
||||||
profiler.startSection("startup");
|
|
||||||
profiler.startSection("loadingcrawlfile");
|
|
||||||
start = System.currentTimeMillis();
|
|
||||||
log.info("Try to load crawlfile");
|
log.info("Try to load crawlfile");
|
||||||
if(crawlfile.exists()) {
|
if(crawlfile.exists()) {
|
||||||
try {
|
try {
|
||||||
|
@ -136,7 +130,9 @@ public class Crawler implements Runnable {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
profiler.endStartSection("populateThreads");//loading crawlfile closed
|
}
|
||||||
|
|
||||||
|
private int createThreads() {
|
||||||
//populate threads
|
//populate threads
|
||||||
int threadcount = 4;
|
int threadcount = 4;
|
||||||
try {
|
try {
|
||||||
|
@ -145,13 +141,89 @@ public class Crawler implements Runnable {
|
||||||
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
|
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
|
||||||
}
|
}
|
||||||
threads = new ArrayList<>(threadcount);
|
threads = new ArrayList<>(threadcount);
|
||||||
|
CrawlerThread.initLib(threadcount);
|
||||||
for(int i = 0; i < threadcount; i++) {
|
for(int i = 0; i < threadcount; i++) {
|
||||||
CrawlerThread thr = new CrawlerThread( this);
|
CrawlerThread thr = new CrawlerThread( this, i);
|
||||||
thr.setThread(new Thread(thr, "Crawler #" + i));
|
thr.setThread(new Thread(thr, "Crawler #" + i));
|
||||||
threads.add(thr);
|
threads.add(thr);
|
||||||
thr.thread.start();
|
thr.thread.start();
|
||||||
}
|
}
|
||||||
|
return threadcount;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void getreports() {
|
||||||
|
log.info("get report");
|
||||||
|
for (CrawlerThread crawlerThread : threads) {
|
||||||
|
String threadname = crawlerThread.thread.getName();
|
||||||
|
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
|
||||||
|
LinkedList<String>[] report = crawlerThread.report();
|
||||||
|
crawlcount+= report[0].size();
|
||||||
|
toSave.addAll(report[0]);
|
||||||
|
crawlerThread.crawled.clear();
|
||||||
|
|
||||||
|
int count = 0;
|
||||||
|
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
|
||||||
|
ArrayList<String> store = null;
|
||||||
|
try {
|
||||||
|
if(report[1].size() <= 50) {
|
||||||
|
store = new ArrayList<>(report[1]);
|
||||||
|
count += report[1].size();
|
||||||
|
report[1].clear();
|
||||||
|
} else {
|
||||||
|
store = new ArrayList<>(report[1].subList(0, 50));
|
||||||
|
report[1].removeAll(store);
|
||||||
|
count+=50;
|
||||||
|
}
|
||||||
|
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
|
||||||
|
log.info("no suchelement bla");
|
||||||
|
}
|
||||||
|
db.storeTemp(store, false);
|
||||||
|
}
|
||||||
|
log.info(count + " videos added from " + threadname);
|
||||||
|
profiler.endSection();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void savetodb() {
|
||||||
|
log.info("save " + toSave.size() + " videos to DB.");
|
||||||
|
while(!toSave.isEmpty()) {
|
||||||
|
LinkedList<String> videoids = new LinkedList<>();
|
||||||
|
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
||||||
|
videoids.add(toSave.remove(0));
|
||||||
|
}
|
||||||
|
if(videoids.size() > 0) {
|
||||||
|
profiler.startSection("getinfo");
|
||||||
|
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
|
||||||
|
profiler.endStartSection("sendtoDB");
|
||||||
|
db.addVideos(videos, false);
|
||||||
|
profiler.endSection();//sendtoDB
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void sendAdminMessage() {
|
||||||
|
long currenttime = System.currentTimeMillis();
|
||||||
|
if((currenttime - lastadminreport) / 1000 > 3600) {
|
||||||
|
long runtimes = (currenttime - start) / 1000;
|
||||||
|
if (runtimes < 0)
|
||||||
|
runtimes = 1;
|
||||||
|
float vidps = (crawlcount / (float) runtimes);//videos per second
|
||||||
|
Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
|
||||||
|
lastadminreport = currenttime;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
profiler.profilingEnabled = true;
|
||||||
|
profiler.clearProfiling();
|
||||||
|
profiler.startSection("root");
|
||||||
|
profiler.startSection("startup");
|
||||||
|
profiler.startSection("loadingcrawlfile");
|
||||||
|
start = System.currentTimeMillis();
|
||||||
|
loadCrawlFile();
|
||||||
|
profiler.endStartSection("populateThreads");//loading crawlfile closed
|
||||||
|
int threadcount = createThreads();
|
||||||
profiler.endStartSection("deleteDouble");//populate threads
|
profiler.endStartSection("deleteDouble");//populate threads
|
||||||
long lastdoubledelete = System.currentTimeMillis();
|
long lastdoubledelete = System.currentTimeMillis();
|
||||||
//db.deleteDouble();
|
//db.deleteDouble();
|
||||||
|
@ -189,7 +261,6 @@ public class Crawler implements Runnable {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// updateDB();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//nothing left?
|
//nothing left?
|
||||||
|
@ -197,27 +268,6 @@ public class Crawler implements Runnable {
|
||||||
log.warn("nothing left to crawl");
|
log.warn("nothing left to crawl");
|
||||||
}
|
}
|
||||||
|
|
||||||
//refil the tocrawl list.
|
|
||||||
/*if(!toknown.isEmpty()) {
|
|
||||||
//check in db for known videos
|
|
||||||
log.info("Checking the DB");
|
|
||||||
currentstate = "get new tocrawl";
|
|
||||||
// listlock.writeLock().lock();
|
|
||||||
while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) {
|
|
||||||
LinkedList<String> tocheck = new LinkedList<>();
|
|
||||||
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
|
|
||||||
tocheck.add(toknown.removeFirst());
|
|
||||||
}
|
|
||||||
toCrawl.addAll(db.checkvideos(tocheck));
|
|
||||||
}
|
|
||||||
// listlock.writeLock().unlock();
|
|
||||||
}
|
|
||||||
while(toknown.size() < threadcount * jobspeerthread * 20 && crawl) {
|
|
||||||
currentstate = "restoretemp";
|
|
||||||
log.info("restoreTemp");
|
|
||||||
LinkedList<String> rest = db.restoreTemp();
|
|
||||||
toknown.addAll(rest);
|
|
||||||
}*/
|
|
||||||
{
|
{
|
||||||
profiler.endStartSection("loadCrawl");
|
profiler.endStartSection("loadCrawl");
|
||||||
boolean joined = true;
|
boolean joined = true;
|
||||||
|
@ -246,60 +296,14 @@ public class Crawler implements Runnable {
|
||||||
|
|
||||||
//get reports
|
//get reports
|
||||||
profiler.endStartSection("getreport");
|
profiler.endStartSection("getreport");
|
||||||
log.info("get report");
|
getreports();
|
||||||
for (CrawlerThread crawlerThread : threads) {
|
|
||||||
String threadname = crawlerThread.thread.getName();
|
|
||||||
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
|
|
||||||
LinkedList<String>[] report = crawlerThread.report();
|
|
||||||
crawlcount+= report[0].size();
|
|
||||||
toSave.addAll(report[0]);
|
|
||||||
crawlerThread.crawled.clear();
|
|
||||||
|
|
||||||
int count = 0;
|
|
||||||
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
|
|
||||||
ArrayList<String> store = null;
|
|
||||||
try {
|
|
||||||
if(report[1].size() <= 50) {
|
|
||||||
store = new ArrayList<>(report[1]);
|
|
||||||
count += report[1].size();
|
|
||||||
report[1].clear();
|
|
||||||
} else {
|
|
||||||
store = new ArrayList<>(report[1].subList(0, 50));
|
|
||||||
report[1].removeAll(store);
|
|
||||||
count+=50;
|
|
||||||
}
|
|
||||||
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
|
|
||||||
log.info("no suchelement bla");
|
|
||||||
}
|
|
||||||
db.storeTemp(store, false);
|
|
||||||
}
|
|
||||||
log.info(count + " videos added from " + threadname);
|
|
||||||
profiler.endSection();
|
|
||||||
}
|
|
||||||
|
|
||||||
profiler.endStartSection("debug");
|
profiler.endStartSection("debug");
|
||||||
long runtimes = (System.currentTimeMillis() - start) / 1000;
|
sendAdminMessage();
|
||||||
if(runtimes < 0)
|
|
||||||
runtimes = 1;
|
|
||||||
float vidps = (crawlcount / (float) runtimes);//videos per second
|
|
||||||
Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
|
|
||||||
|
|
||||||
//save to db
|
//save to db
|
||||||
profiler.endStartSection("save2DB");
|
profiler.endStartSection("save2DB");
|
||||||
log.info("save " + toSave.size() + " videos to DB.");
|
savetodb();
|
||||||
while(!toSave.isEmpty()) {
|
|
||||||
LinkedList<String> videoids = new LinkedList<>();
|
|
||||||
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
|
||||||
videoids.add(toSave.remove(0));
|
|
||||||
}
|
|
||||||
if(videoids.size() > 0) {
|
|
||||||
profiler.startSection("getinfo");
|
|
||||||
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
|
|
||||||
profiler.endStartSection("sendtoDB");
|
|
||||||
db.addVideos(videos, false);
|
|
||||||
profiler.endSection();//sendtoDB
|
|
||||||
}
|
|
||||||
}
|
|
||||||
profiler.endSection();//save2DB
|
profiler.endSection();//save2DB
|
||||||
|
|
||||||
//at the beginning there is maybe just one video to crawl, so keep it calm.
|
//at the beginning there is maybe just one video to crawl, so keep it calm.
|
||||||
|
@ -326,6 +330,9 @@ public class Crawler implements Runnable {
|
||||||
}
|
}
|
||||||
profiler.endSection();//main
|
profiler.endSection();//main
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
profiler.startSection("waitforthreads");
|
profiler.startSection("waitforthreads");
|
||||||
for(CrawlerThread ct : threads) {
|
for(CrawlerThread ct : threads) {
|
||||||
try {
|
try {
|
||||||
|
@ -355,6 +362,7 @@ public class Crawler implements Runnable {
|
||||||
int runtimem = (int) (runtimes / 60);
|
int runtimem = (int) (runtimes / 60);
|
||||||
float vidps = (crawlcount / (float) runtimes);//videos per second
|
float vidps = (crawlcount / (float) runtimes);//videos per second
|
||||||
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
|
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
|
||||||
|
CrawlerThread.deinitLib();
|
||||||
Main.getMain().stopcallback();
|
Main.getMain().stopcallback();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,10 +21,22 @@ public class CrawlerThread implements Runnable {
|
||||||
|
|
||||||
boolean requested = true;//is a request pending?
|
boolean requested = true;//is a request pending?
|
||||||
boolean lockforreport = false;
|
boolean lockforreport = false;
|
||||||
|
private int threadid;
|
||||||
public CrawlerThread( Crawler root) {
|
|
||||||
|
// Loads the native "crawlerthread" library that implements initLib, deinitLib
// and crawl.
static {
	// The property is still extended as before, now with the platform's path
	// separator instead of a hard-coded ':'.
	String libpath = System.getProperty("java.library.path");
	libpath += java.io.File.pathSeparator + "./";
	System.setProperty("java.library.path", libpath);
	try {
		System.loadLibrary("crawlerthread");
	} catch (UnsatisfiedLinkError notOnLibraryPath) {
		// Changing java.library.path at runtime is not reliably honoured by
		// loadLibrary, so fall back to loading the library from the working
		// directory by its absolute path.
		System.load(new java.io.File(System.mapLibraryName("crawlerthread")).getAbsolutePath());
	}
}
|
||||||
|
|
||||||
|
public static native void initLib(int threadCount);
|
||||||
|
public static native void deinitLib();
|
||||||
|
|
||||||
|
public CrawlerThread( Crawler root, int threadid) {
|
||||||
parent = root;
|
parent = root;
|
||||||
root.request(this);
|
root.request(this);
|
||||||
|
this.threadid = threadid;
|
||||||
}
|
}
|
||||||
|
|
||||||
void setThread(Thread t) {
|
void setThread(Thread t) {
|
||||||
|
@ -50,13 +62,16 @@ public class CrawlerThread implements Runnable {
|
||||||
lockforreport = false;
|
lockforreport = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
crawl(todo.removeFirst());
|
String vid = todo.removeFirst();
|
||||||
|
// System.out.println("crawling: " + vid + " size: " + found.size());
|
||||||
|
crawled.add(vid);
|
||||||
|
crawl(vid, threadid);
|
||||||
if(todo.size() < parent.requestlimit && !requested) {
|
if(todo.size() < parent.requestlimit && !requested) {
|
||||||
requested = true;
|
requested = true;
|
||||||
parent.request(this);
|
parent.request(this);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if(todo.isEmpty() && !requested) {
|
if(todo.isEmpty() && !requested) {
|
||||||
requested = true;
|
requested = true;
|
||||||
parent.request(this);
|
parent.request(this);
|
||||||
|
@ -85,6 +100,7 @@ public class CrawlerThread implements Runnable {
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
private void crawl(String videoid) {
|
private void crawl(String videoid) {
|
||||||
try {
|
try {
|
||||||
crawled.add(videoid);
|
crawled.add(videoid);
|
||||||
|
@ -103,4 +119,7 @@ public class CrawlerThread implements Runnable {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
private native void crawl(String videid, int threadid);
|
||||||
}
|
}
|
|
@ -308,6 +308,7 @@ public class DB implements Runnable {
|
||||||
if(con != null) {
|
if(con != null) {
|
||||||
if(!con.isClosed()) {
|
if(!con.isClosed()) {
|
||||||
addVideos(null, true);
|
addVideos(null, true);
|
||||||
|
con.commit();
|
||||||
con.close();
|
con.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue