cpp crawl implementation
This commit is contained in:
parent
ba270f85f6
commit
e02d51b72c
|
@ -3,3 +3,9 @@
|
|||
.project
|
||||
.settings/*
|
||||
.classpath
|
||||
admins
|
||||
crawl.conf
|
||||
crawl.txt
|
||||
*.so
|
||||
.idea/
|
||||
.vscode/settings.json
|
|
@ -0,0 +1,11 @@
|
|||
|
||||
|
||||
# Root of the JDK that provides javah and the JNI headers.
# Override on the command line if the JDK lives elsewhere: make JDK_HOME=/path/to/jdk
JDK_HOME ?= /usr/lib/jvm/java-8-openjdk-amd64

# None of these targets produce a file with their own name; without .PHONY a
# stray file called "clean" or "compile" would silently disable the target.
.PHONY: createhfiles compile clean

# Generate the JNI header for CrawlerThread from the built jar.
createhfiles:
	mkdir -p includes/
	javah -d includes/ -classpath target/YoutubeCrawler-0.0.2-jar-with-dependencies.jar de.mrbesen.youtubecrawler.CrawlerThread

# Build the native library that CrawlerThread loads.
compile:
	g++ -shared -fPIC -o libcrawlerthread.so cpp/crawlerthread.cpp -Iincludes/ -I$(JDK_HOME)/include/ -I$(JDK_HOME)/include/linux/ -lcurl

# Remove the generated headers and the built library.
clean:
	rm -rf includes/ libcrawlerthread.so
|
|
@ -0,0 +1,97 @@
|
|||
#include "de_mrbesen_youtubecrawler_CrawlerThread.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <regex>
|
||||
#include <vector>
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
// Prefix a video id is appended to in order to build the URL that crawl() downloads.
static const std::string YOUTUBEBASE = "https://youtube.com/watch?v=";
|
||||
// Matches "watch?v=<id>" in the downloaded page; capture group 1 is the 11-character id.
static const std::regex YOUTUBELINKPATTERN("watch\\?v=([-_a-zA-Z0-9]{11})");
|
||||
|
||||
// curl easy handles: filled by initLib, released by deinitLib, indexed by thread id in crawl.
std::vector<CURL*> curls;
|
||||
|
||||
/**
 * libcurl write callback: appends the received chunk to the std::ostringstream
 * that was registered through CURLOPT_WRITEDATA.
 *
 * @param contents start of the received bytes (binary safe, not NUL-terminated)
 * @param size     size of one element in bytes
 * @param nmemb    number of elements in this chunk
 * @param userp    the std::ostringstream* that collects the response body
 * @return the number of bytes consumed, always size * nmemb
 */
static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp) {
    const size_t total = size * nmemb;
    auto* sink = static_cast<std::ostringstream*>(userp);
    // write() copies the bytes straight into the stream, so no temporary
    // std::string has to be built for every chunk.
    sink->write(static_cast<const char*>(contents), static_cast<std::streamsize>(total));
    return total;
}
|
||||
|
||||
/**
 * JNI entry point for CrawlerThread.initLib(int).
 *
 * Creates one curl easy handle per crawler thread and stores it in `curls`,
 * so that thread id i uses curls[i]. Handles left over from an earlier call
 * are released first. A threadcount <= 0 leaves `curls` empty.
 *
 * @param env         JNI environment of the calling thread
 * @param clazz       the CrawlerThread class (unused)
 * @param threadcount number of handles to create
 *
 * If a handle cannot be created, every handle created so far is released and
 * a java.lang.IllegalStateException is raised in the caller.
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_initLib(JNIEnv* env, jclass clazz, jint threadcount) {
    // A second initLib without deinitLib must not leak the old handles.
    for (CURL* stale : curls) {
        curl_easy_cleanup(stale);
    }
    curls.clear();

    // A negative count converted to size_t would make reserve() throw through JNI.
    if (threadcount <= 0) {
        return;
    }
    curls.reserve(static_cast<size_t>(threadcount));

    for (jint i = 0; i < threadcount; ++i) {
        CURL* curl = curl_easy_init();
        if (curl == nullptr) {
            // Roll back so the library is never left half initialised.
            for (CURL* created : curls) {
                curl_easy_cleanup(created);
            }
            curls.clear();
            jclass excClass = env->FindClass("java/lang/IllegalStateException");
            if (excClass != nullptr) {
                env->ThrowNew(excClass, "curl_easy_init failed");
            }
            return;
        }
        // curl_easy_setopt is variadic: numeric options must be passed as long.
        curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
        // The handles are used from several threads; libcurl asks for
        // CURLOPT_NOSIGNAL in that case so it never installs signal handlers.
        curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
        curls.push_back(curl);
    }
}
|
||||
|
||||
/**
 * JNI entry point for CrawlerThread.deinitLib().
 *
 * Releases every curl easy handle stored in `curls` and empties the vector.
 *
 * @param env JNI environment of the calling thread (unused)
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_deinitLib(JNIEnv* env, jclass) {
    // Range-for avoids the signed/unsigned index comparison of a counting loop.
    for (CURL* curl : curls) {
        curl_easy_cleanup(curl);
    }
    curls.clear();
}
|
||||
|
||||
/**
 * Downloads `url` with the given curl easy handle and returns the response body.
 *
 * @param curl handle to use; its write callback must accept a std::ostringstream*
 * @param url  address to fetch
 * @return the body on HTTP 200, otherwise an empty string (the reason is
 *         printed to stdout)
 */
std::string download(CURL* curl, const std::string& url) {
    std::ostringstream body;
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &body);

    const CURLcode res = curl_easy_perform(curl);

    // Check the transfer result first: when the transfer itself failed there is
    // no meaningful HTTP status, and reporting "got 0" would hide the real cause.
    if (res != CURLE_OK) {
        std::cout << "Curl error: " << res << " (" << curl_easy_strerror(res) << ")" << std::endl;
        return "";
    }

    // Stays 404 if libcurl cannot report a status code.
    long responsecode = 404;
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &responsecode);
    if (responsecode != 200) {
        std::cout << "Curl error: got " << responsecode << std::endl;
        return "";
    }

    return body.str();
}
|
||||
|
||||
/**
 * JNI entry point for CrawlerThread.crawl(String, int).
 *
 * Downloads the watch page of `videoid` with the curl handle that belongs to
 * `threadid`, extracts every video id matched by YOUTUBELINKPATTERN and adds
 * each id that differs from `videoid` to the `found` LinkedList of `that`.
 *
 * @param env      JNI environment of the calling thread
 * @param that     the CrawlerThread instance whose `found` list is filled
 * @param videoid  id of the video whose page is crawled
 * @param threadid index into `curls`
 *
 * Returns early without adding anything when the arguments are invalid, the
 * download yields no content, or a JNI lookup fails. In the lookup case the
 * Java exception raised by the JVM stays pending for the caller.
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) {
    // get videoid argument
    if (videoid == nullptr) {
        return;
    }
    const char* cvideoid = env->GetStringUTFChars(videoid, nullptr);
    if (cvideoid == nullptr) {
        return; // the JVM has already raised OutOfMemoryError
    }
    const std::string svideoid(cvideoid);
    env->ReleaseStringUTFChars(videoid, cvideoid);

    // A C++ exception such as std::out_of_range must never cross the JNI
    // boundary, so validate the index instead of relying on vector::at.
    if (threadid < 0 || static_cast<size_t>(threadid) >= curls.size()) {
        std::cout << "crawl: invalid thread id " << threadid << std::endl;
        return;
    }

    // use curl to get the website
    const std::string webcontent = download(curls[static_cast<size_t>(threadid)], YOUTUBEBASE + svideoid);
    if (webcontent.empty()) {
        std::cout << "webcontent is empty" << std::endl;
        return; // nothing to match
    }

    // Every lookup returns null with a Java exception pending on failure.
    jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread");
    if (jclass_ct == nullptr) {
        return;
    }
    jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found", "Ljava/util/LinkedList;");
    if (fid_ctfound == nullptr) {
        return;
    }
    jclass jclass_ll = env->FindClass("java/util/LinkedList");
    if (jclass_ll == nullptr) {
        return;
    }
    jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z");
    if (mid_add == nullptr) {
        return;
    }

    // match regex
    const std::sregex_iterator itend;
    for (std::sregex_iterator it(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN); it != itend; ++it) {
        const std::string ytid = (*it)[1].str();
        if (ytid == svideoid) {
            continue; // do not report the crawled video itself
        }

        // construct java string
        jstring jytid = env->NewStringUTF(ytid.c_str());
        if (jytid == nullptr) {
            return; // OutOfMemoryError pending
        }

        // The field is re-read for every match, as before, so a list that is
        // swapped on the Java side is picked up immediately.
        jobject ll_found = env->GetObjectField(that, fid_ctfound);
        if (ll_found != nullptr) {
            env->CallBooleanMethod(ll_found, mid_add, jytid);
        }
        const bool failed = env->ExceptionCheck();

        // A page can contain hundreds of links; without releasing the local
        // references the local reference table grows with every match.
        if (ll_found != nullptr) {
            env->DeleteLocalRef(ll_found);
        }
        env->DeleteLocalRef(jytid);

        if (failed) {
            return; // leave the exception pending for the Java caller
        }
    }
}
|
||||
|
|
@ -42,6 +42,7 @@ public class Crawler implements Runnable {
|
|||
private File crawlfile = new File("crawl.txt");
|
||||
private Logger log = Logger.getLogger(this.getClass().getName());
|
||||
private Profiler profiler = new Profiler();
|
||||
private long lastadminreport = 0;
|
||||
|
||||
private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep
|
||||
|
||||
|
@ -99,14 +100,7 @@ public class Crawler implements Runnable {
|
|||
t.requested = false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
profiler.profilingEnabled = true;
|
||||
profiler.clearProfiling();
|
||||
profiler.startSection("root");
|
||||
profiler.startSection("startup");
|
||||
profiler.startSection("loadingcrawlfile");
|
||||
start = System.currentTimeMillis();
|
||||
private void loadCrawlFile() {
|
||||
log.info("Try to load crawlfile");
|
||||
if(crawlfile.exists()) {
|
||||
try {
|
||||
|
@ -136,7 +130,9 @@ public class Crawler implements Runnable {
|
|||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
profiler.endStartSection("populateThreads");//loading crawlfile closed
|
||||
}
|
||||
|
||||
private int createThreads() {
|
||||
//populate threads
|
||||
int threadcount = 4;
|
||||
try {
|
||||
|
@ -145,13 +141,89 @@ public class Crawler implements Runnable {
|
|||
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
|
||||
}
|
||||
threads = new ArrayList<>(threadcount);
|
||||
|
||||
CrawlerThread.initLib(threadcount);
|
||||
for(int i = 0; i < threadcount; i++) {
|
||||
CrawlerThread thr = new CrawlerThread( this);
|
||||
CrawlerThread thr = new CrawlerThread( this, i);
|
||||
thr.setThread(new Thread(thr, "Crawler #" + i));
|
||||
threads.add(thr);
|
||||
thr.thread.start();
|
||||
}
|
||||
return threadcount;
|
||||
}
|
||||
|
||||
private void getreports() {
|
||||
log.info("get report");
|
||||
for (CrawlerThread crawlerThread : threads) {
|
||||
String threadname = crawlerThread.thread.getName();
|
||||
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
|
||||
LinkedList<String>[] report = crawlerThread.report();
|
||||
crawlcount+= report[0].size();
|
||||
toSave.addAll(report[0]);
|
||||
crawlerThread.crawled.clear();
|
||||
|
||||
int count = 0;
|
||||
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
|
||||
ArrayList<String> store = null;
|
||||
try {
|
||||
if(report[1].size() <= 50) {
|
||||
store = new ArrayList<>(report[1]);
|
||||
count += report[1].size();
|
||||
report[1].clear();
|
||||
} else {
|
||||
store = new ArrayList<>(report[1].subList(0, 50));
|
||||
report[1].removeAll(store);
|
||||
count+=50;
|
||||
}
|
||||
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
|
||||
log.info("no suchelement bla");
|
||||
}
|
||||
db.storeTemp(store, false);
|
||||
}
|
||||
log.info(count + " videos added from " + threadname);
|
||||
profiler.endSection();
|
||||
}
|
||||
}
|
||||
|
||||
private void savetodb() {
|
||||
log.info("save " + toSave.size() + " videos to DB.");
|
||||
while(!toSave.isEmpty()) {
|
||||
LinkedList<String> videoids = new LinkedList<>();
|
||||
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
||||
videoids.add(toSave.remove(0));
|
||||
}
|
||||
if(videoids.size() > 0) {
|
||||
profiler.startSection("getinfo");
|
||||
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
|
||||
profiler.endStartSection("sendtoDB");
|
||||
db.addVideos(videos, false);
|
||||
profiler.endSection();//sendtoDB
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void sendAdminMessage() {
|
||||
long currenttime = System.currentTimeMillis();
|
||||
if((currenttime - lastadminreport) / 1000 > 3600) {
|
||||
long runtimes = (currenttime - start) / 1000;
|
||||
if (runtimes < 0)
|
||||
runtimes = 1;
|
||||
float vidps = (crawlcount / (float) runtimes);//videos per second
|
||||
Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
|
||||
lastadminreport = currenttime;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
profiler.profilingEnabled = true;
|
||||
profiler.clearProfiling();
|
||||
profiler.startSection("root");
|
||||
profiler.startSection("startup");
|
||||
profiler.startSection("loadingcrawlfile");
|
||||
start = System.currentTimeMillis();
|
||||
loadCrawlFile();
|
||||
profiler.endStartSection("populateThreads");//loading crawlfile closed
|
||||
int threadcount = createThreads();
|
||||
profiler.endStartSection("deleteDouble");//populate threads
|
||||
long lastdoubledelete = System.currentTimeMillis();
|
||||
//db.deleteDouble();
|
||||
|
@ -189,7 +261,6 @@ public class Crawler implements Runnable {
|
|||
break;
|
||||
}
|
||||
}
|
||||
// updateDB();
|
||||
}
|
||||
}
|
||||
//nothing left?
|
||||
|
@ -197,27 +268,6 @@ public class Crawler implements Runnable {
|
|||
log.warn("nothing left to crawl");
|
||||
}
|
||||
|
||||
//refil the tocrawl list.
|
||||
/*if(!toknown.isEmpty()) {
|
||||
//check in db for known videos
|
||||
log.info("Checking the DB");
|
||||
currentstate = "get new tocrawl";
|
||||
// listlock.writeLock().lock();
|
||||
while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) {
|
||||
LinkedList<String> tocheck = new LinkedList<>();
|
||||
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
|
||||
tocheck.add(toknown.removeFirst());
|
||||
}
|
||||
toCrawl.addAll(db.checkvideos(tocheck));
|
||||
}
|
||||
// listlock.writeLock().unlock();
|
||||
}
|
||||
while(toknown.size() < threadcount * jobspeerthread * 20 && crawl) {
|
||||
currentstate = "restoretemp";
|
||||
log.info("restoreTemp");
|
||||
LinkedList<String> rest = db.restoreTemp();
|
||||
toknown.addAll(rest);
|
||||
}*/
|
||||
{
|
||||
profiler.endStartSection("loadCrawl");
|
||||
boolean joined = true;
|
||||
|
@ -246,60 +296,14 @@ public class Crawler implements Runnable {
|
|||
|
||||
//get reports
|
||||
profiler.endStartSection("getreport");
|
||||
log.info("get report");
|
||||
for (CrawlerThread crawlerThread : threads) {
|
||||
String threadname = crawlerThread.thread.getName();
|
||||
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
|
||||
LinkedList<String>[] report = crawlerThread.report();
|
||||
crawlcount+= report[0].size();
|
||||
toSave.addAll(report[0]);
|
||||
crawlerThread.crawled.clear();
|
||||
|
||||
int count = 0;
|
||||
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
|
||||
ArrayList<String> store = null;
|
||||
try {
|
||||
if(report[1].size() <= 50) {
|
||||
store = new ArrayList<>(report[1]);
|
||||
count += report[1].size();
|
||||
report[1].clear();
|
||||
} else {
|
||||
store = new ArrayList<>(report[1].subList(0, 50));
|
||||
report[1].removeAll(store);
|
||||
count+=50;
|
||||
}
|
||||
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
|
||||
log.info("no suchelement bla");
|
||||
}
|
||||
db.storeTemp(store, false);
|
||||
}
|
||||
log.info(count + " videos added from " + threadname);
|
||||
profiler.endSection();
|
||||
}
|
||||
getreports();
|
||||
|
||||
profiler.endStartSection("debug");
|
||||
long runtimes = (System.currentTimeMillis() - start) / 1000;
|
||||
if(runtimes < 0)
|
||||
runtimes = 1;
|
||||
float vidps = (crawlcount / (float) runtimes);//videos per second
|
||||
Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
|
||||
sendAdminMessage();
|
||||
|
||||
//save to db
|
||||
profiler.endStartSection("save2DB");
|
||||
log.info("save " + toSave.size() + " videos to DB.");
|
||||
while(!toSave.isEmpty()) {
|
||||
LinkedList<String> videoids = new LinkedList<>();
|
||||
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
||||
videoids.add(toSave.remove(0));
|
||||
}
|
||||
if(videoids.size() > 0) {
|
||||
profiler.startSection("getinfo");
|
||||
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
|
||||
profiler.endStartSection("sendtoDB");
|
||||
db.addVideos(videos, false);
|
||||
profiler.endSection();//sendtoDB
|
||||
}
|
||||
}
|
||||
savetodb();
|
||||
profiler.endSection();//save2DB
|
||||
|
||||
//at the beginning there is maybe just one video to crawl, so keep it calm.
|
||||
|
@ -326,6 +330,9 @@ public class Crawler implements Runnable {
|
|||
}
|
||||
profiler.endSection();//main
|
||||
}
|
||||
|
||||
|
||||
|
||||
profiler.startSection("waitforthreads");
|
||||
for(CrawlerThread ct : threads) {
|
||||
try {
|
||||
|
@ -355,6 +362,7 @@ public class Crawler implements Runnable {
|
|||
int runtimem = (int) (runtimes / 60);
|
||||
float vidps = (crawlcount / (float) runtimes);//videos per second
|
||||
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
|
||||
CrawlerThread.deinitLib();
|
||||
Main.getMain().stopcallback();
|
||||
}
|
||||
|
||||
|
|
|
@ -21,10 +21,22 @@ public class CrawlerThread implements Runnable {
|
|||
|
||||
boolean requested = true;//is a request pending?
|
||||
boolean lockforreport = false;
|
||||
|
||||
public CrawlerThread( Crawler root) {
|
||||
private int threadid;
|
||||
|
||||
// Loads the native library that implements initLib, deinitLib and crawl.
static {
    String libpath = System.getProperty("java.library.path");
    libpath += ":./";
    System.setProperty("java.library.path", libpath);
    try {
        System.loadLibrary("crawlerthread");
    } catch (UnsatisfiedLinkError notOnLibraryPath) {
        // The JVM normally reads java.library.path only once, so the property
        // change above usually does not influence loadLibrary. Fall back to
        // the library file in the working directory via its absolute path.
        System.load(new java.io.File(System.mapLibraryName("crawlerthread")).getAbsolutePath());
    }
}
|
||||
|
||||
/**
 * Native setup: creates {@code threadCount} curl handles, one for each crawler thread id.
 *
 * @param threadCount number of handles to create
 */
public static native void initLib(int threadCount);
|
||||
/** Native teardown: releases all curl handles held by the native library. */
public static native void deinitLib();
|
||||
|
||||
/**
 * Creates a crawler thread object and registers a request with its crawler.
 *
 * @param root     the crawler this thread belongs to
 * @param threadid id that is passed to the native crawl call
 */
public CrawlerThread( Crawler root, int threadid) {
    parent = root;
    // Assign every field before handing "this" to another object, so the
    // instance is complete when request() sees it.
    this.threadid = threadid;
    root.request(this);
}
|
||||
|
||||
void setThread(Thread t) {
|
||||
|
@ -50,13 +62,16 @@ public class CrawlerThread implements Runnable {
|
|||
lockforreport = false;
|
||||
}
|
||||
}
|
||||
crawl(todo.removeFirst());
|
||||
String vid = todo.removeFirst();
|
||||
// System.out.println("crawling: " + vid + " size: " + found.size());
|
||||
crawled.add(vid);
|
||||
crawl(vid, threadid);
|
||||
if(todo.size() < parent.requestlimit && !requested) {
|
||||
requested = true;
|
||||
parent.request(this);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if(todo.isEmpty() && !requested) {
|
||||
requested = true;
|
||||
parent.request(this);
|
||||
|
@ -85,6 +100,7 @@ public class CrawlerThread implements Runnable {
|
|||
return out;
|
||||
}
|
||||
|
||||
/*
|
||||
private void crawl(String videoid) {
|
||||
try {
|
||||
crawled.add(videoid);
|
||||
|
@ -103,4 +119,7 @@ public class CrawlerThread implements Runnable {
|
|||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
/**
 * Native crawl step: downloads the YouTube watch page of the given video and adds
 * every other video id found on that page to {@code found}.
 *
 * @param videid   id of the video whose page is downloaded
 * @param threadid index of the native curl handle to use
 */
private native void crawl(String videid, int threadid);
|
||||
}
|
|
@ -308,6 +308,7 @@ public class DB implements Runnable {
|
|||
if(con != null) {
|
||||
if(!con.isClosed()) {
|
||||
addVideos(null, true);
|
||||
con.commit();
|
||||
con.close();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue