improved crawling
This commit is contained in:
parent
928942e3ef
commit
01104a2f5e
|
@ -8,4 +8,5 @@ crawl.conf
|
|||
crawl.txt
|
||||
*.so
|
||||
.idea/
|
||||
.vscode/settings.json
|
||||
.vscode/settings.json
|
||||
includes/
|
|
@ -1,6 +1,7 @@
|
|||
#include "de_mrbesen_youtubecrawler_CrawlerThread.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <regex>
|
||||
|
@ -8,6 +9,7 @@
|
|||
|
||||
#include <curl/curl.h>
|
||||
|
||||
|
||||
static const std::string YOUTUBEBASE = "https://youtube.com/watch?v=";
|
||||
static const std::regex YOUTUBELINKPATTERN("watch\\?v=([-_a-zA-Z0-9]{11})");
|
||||
|
||||
|
@ -57,11 +59,11 @@ std::string download(CURL* curl, const std::string& url) {
|
|||
return out.str();
|
||||
}
|
||||
|
||||
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) {
|
||||
JNIEXPORT jboolean JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) {
|
||||
// get videoid argument
|
||||
jboolean myfalseval = false;
|
||||
const char* cvideid = env->GetStringUTFChars(videoid, &myfalseval);
|
||||
std::string svideoid(cvideid);
|
||||
const std::string svideoid(cvideid); // makes a copy
|
||||
env->ReleaseStringUTFChars(videoid, cvideid);
|
||||
|
||||
// std::cout << "crawl: " << svideoid << std::endl;
|
||||
|
@ -71,27 +73,40 @@ JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv
|
|||
std::string webcontent = download(curl, YOUTUBEBASE + svideoid);
|
||||
if(webcontent.empty()) {
|
||||
std::cout << "webcontent is empty" << std::endl;
|
||||
return JNI_FALSE;
|
||||
}
|
||||
|
||||
jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread");
|
||||
jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found" , "Ljava/util/LinkedList;");
|
||||
jclass jclass_ll = env->FindClass("java/util/LinkedList");
|
||||
jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z");
|
||||
jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread"); // class of CrawlerThread
|
||||
jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found" , "Ljava/util/List;"); // fieldID of CrawlerThread.found
|
||||
jclass jclass_ll = env->FindClass("java/util/List"); // Linked List Class
|
||||
jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z"); // add() method of LinkedList
|
||||
jobject ll_found = env->GetObjectField(that, fid_ctfound); // linked list to store the results to
|
||||
|
||||
// match regex
|
||||
auto it = std::sregex_iterator(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN);
|
||||
auto itend = std::sregex_iterator();
|
||||
std::sregex_iterator it = std::sregex_iterator(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN);
|
||||
const std::sregex_iterator itend = std::sregex_iterator();
|
||||
|
||||
// This set is used to filter out duplicate IDs. It does not catch everything, because it is only valid during this crawl, but it should filter out a few with very little effort.
|
||||
std::set<std::string> known;
|
||||
known.insert(svideoid); // do not "find" the same ytid
|
||||
|
||||
env->MonitorEnter(that); // syncronized(this) {
|
||||
for( ; it != itend; ++it) {
|
||||
std::string ytid = (*it)[1].str();
|
||||
const std::string ytid = (*it)[1].str(); // get the String from the first sub-group match
|
||||
|
||||
// add to the found list
|
||||
if(ytid != svideoid) {
|
||||
//construct java string
|
||||
jstring jytid = env->NewStringUTF(ytid.c_str());
|
||||
jobject ll_found = env->GetObjectField(that, fid_ctfound);
|
||||
jboolean b = env->CallBooleanMethod(ll_found, mid_add, jytid);
|
||||
// only if the ytid is not known
|
||||
if(known.find(ytid) == known.end()) {
|
||||
// add to the found list
|
||||
// std::cout << ytid << std::endl;
|
||||
|
||||
jstring jytid = env->NewStringUTF(ytid.c_str()); // create a java string object
|
||||
jboolean b = env->CallBooleanMethod(ll_found, mid_add, jytid); // call add() on the LinkedList object with the String
|
||||
|
||||
known.insert(ytid);
|
||||
}
|
||||
}
|
||||
env->MonitorExit(that); // end of syncronized block
|
||||
|
||||
return JNI_TRUE;
|
||||
}
|
||||
|
||||
|
|
|
@ -7,12 +7,7 @@ import java.io.IOException;
|
|||
import java.io.PrintWriter;
|
||||
import java.text.DateFormat;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Scanner;
|
||||
import java.util.*;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
@ -24,8 +19,8 @@ public class Crawler implements Runnable {
|
|||
int requestlimit = 5;//amount of videos to be left in the todo que of a thread until it requests new videos
|
||||
private int idlecount = 5;//amount of idle loops allowed
|
||||
|
||||
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, witch need to be analysed
|
||||
private LinkedList<String> toCrawl = new LinkedList<>();//all videos tu crawl
|
||||
private Set<String> toSave = new TreeSet<>();//all found ytids, witch need to be analysed
|
||||
private Set<String> toCrawl = new TreeSet<>();//all videos tu crawl
|
||||
//private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
|
||||
private List<CrawlerThread> threads;//list of all threads
|
||||
private List<CrawlerThread> requested = new LinkedList<>();
|
||||
|
@ -73,7 +68,7 @@ public class Crawler implements Runnable {
|
|||
//if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
||||
//toknown.add(videoid);
|
||||
if(toCrawl.contains(videoid)) {
|
||||
ArrayList<String> str = new ArrayList<String>(1);
|
||||
ArrayList<String> str = new ArrayList<>(1);
|
||||
str.add(videoid);
|
||||
db.storeTemp(str, false);
|
||||
}
|
||||
|
@ -92,11 +87,13 @@ public class Crawler implements Runnable {
|
|||
}
|
||||
|
||||
private void send(CrawlerThread t) {
|
||||
// listlock.writeLock().lock();
|
||||
for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
|
||||
t.todo.add(toCrawl.removeFirst());
|
||||
synchronized (toCrawl) {
|
||||
for (int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
|
||||
String s = toCrawl.stream().findAny().get();
|
||||
toCrawl.remove(s);
|
||||
t.todo.add(s);
|
||||
}
|
||||
}
|
||||
// listlock.writeLock().unlock();
|
||||
t.requested = false;
|
||||
}
|
||||
|
||||
|
@ -156,7 +153,7 @@ public class Crawler implements Runnable {
|
|||
for (CrawlerThread crawlerThread : threads) {
|
||||
String threadname = crawlerThread.thread.getName();
|
||||
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
|
||||
LinkedList<String>[] report = crawlerThread.report();
|
||||
List<String>[] report = crawlerThread.report();
|
||||
crawlcount+= report[0].size();
|
||||
toSave.addAll(report[0]);
|
||||
crawlerThread.crawled.clear();
|
||||
|
@ -186,17 +183,21 @@ public class Crawler implements Runnable {
|
|||
|
||||
private void savetodb() {
|
||||
log.info("save " + toSave.size() + " videos to DB.");
|
||||
while(!toSave.isEmpty()) {
|
||||
LinkedList<String> videoids = new LinkedList<>();
|
||||
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
||||
videoids.add(toSave.remove(0));
|
||||
}
|
||||
if(videoids.size() > 0) {
|
||||
profiler.startSection("getinfo");
|
||||
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
|
||||
profiler.endStartSection("sendtoDB");
|
||||
db.addVideos(videos, false);
|
||||
profiler.endSection();//sendtoDB
|
||||
synchronized (toSave) {
|
||||
while (!toSave.isEmpty()) {
|
||||
Set<String> videoids = new TreeSet<>();
|
||||
for (int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
||||
String save = toSave.stream().findAny().get();
|
||||
toSave.remove(save);
|
||||
videoids.add(save);
|
||||
}
|
||||
if (videoids.size() > 0) {
|
||||
profiler.startSection("getinfo");
|
||||
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
|
||||
profiler.endStartSection("sendtoDB");
|
||||
db.addVideos(videos, false);
|
||||
profiler.endSection();//sendtoDB
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -393,7 +394,7 @@ public class Crawler implements Runnable {
|
|||
out += "\nThread Nr, todo size, requested, crawledsize, foundsize";
|
||||
for (int i = 0; i < threads.size(); i++) {
|
||||
CrawlerThread thre = threads.get(i);
|
||||
out += "\n " + i + " " + (thre.lockforreport ? "\uD83D\uDD12" : "\uD83D\uDD13") + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
|
||||
out += "\n " + i + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
|
||||
}
|
||||
}
|
||||
return out;
|
||||
|
|
|
@ -2,6 +2,8 @@ package de.mrbesen.youtubecrawler;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
@ -16,11 +18,13 @@ public class CrawlerThread implements Runnable {
|
|||
Thread thread;
|
||||
|
||||
LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
|
||||
LinkedList<String> crawled = new LinkedList<>();//videos this thread had crawled
|
||||
LinkedList<String> found = new LinkedList<>();//videos this thread had found
|
||||
List<String> crawled = new LinkedList<>();//videos this thread had crawled
|
||||
List<String> found = new LinkedList<>();//videos this thread had found
|
||||
|
||||
static AtomicInteger fails = new AtomicInteger(0);
|
||||
private static int MAXFAILS = 100;
|
||||
|
||||
boolean requested = true;//is a request pending?
|
||||
boolean lockforreport = false;
|
||||
private int threadid;
|
||||
|
||||
static {
|
||||
|
@ -54,50 +58,55 @@ public class CrawlerThread implements Runnable {
|
|||
@Override
|
||||
public void run() {
|
||||
while(parent.isCrawling()) {
|
||||
while(!todo.isEmpty() && parent.isCrawling()) {
|
||||
if(lockforreport) {
|
||||
try {
|
||||
Thread.sleep(10);
|
||||
} catch(InterruptedException e) {
|
||||
lockforreport = false;
|
||||
synchronized (this) {
|
||||
while (!todo.isEmpty() && parent.isCrawling()) {
|
||||
String vid = todo.removeFirst();
|
||||
// System.out.println("crawling: " + vid + " size: " + found.size());
|
||||
crawled.add(vid);
|
||||
|
||||
boolean success = crawl(vid, threadid);
|
||||
|
||||
if (todo.size() < parent.requestlimit && !requested) {
|
||||
requested = true;
|
||||
parent.request(this);
|
||||
}
|
||||
if (!success) {
|
||||
int val = fails.addAndGet(1);
|
||||
if (val > MAXFAILS) {
|
||||
System.err.println("Max Crawlfails reached, stopping");
|
||||
parent.stop();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
String vid = todo.removeFirst();
|
||||
// System.out.println("crawling: " + vid + " size: " + found.size());
|
||||
crawled.add(vid);
|
||||
crawl(vid, threadid);
|
||||
if(todo.size() < parent.requestlimit && !requested) {
|
||||
|
||||
if (todo.isEmpty() && !requested) {
|
||||
requested = true;
|
||||
parent.request(this);
|
||||
}
|
||||
}
|
||||
|
||||
if(todo.isEmpty() && !requested) {
|
||||
requested = true;
|
||||
parent.request(this);
|
||||
log.warn("No Object left!");
|
||||
Thread.yield();
|
||||
try {
|
||||
Thread.sleep(10000);//sleep for 10 seconds
|
||||
} catch (InterruptedException ignored) {
|
||||
}
|
||||
}
|
||||
|
||||
log.warn("No Object left!");
|
||||
Thread.yield();
|
||||
try {
|
||||
Thread.sleep(10000);//sleep for 10 seconds
|
||||
} catch (InterruptedException ignored) {}
|
||||
}
|
||||
log.info("Stopped.");
|
||||
}
|
||||
|
||||
/**
|
||||
* returns a linkedlist of all crawled videos
|
||||
* returns the lists of crawled and found videos (in that order) and replaces both with new empty lists
|
||||
* @return
|
||||
*/
|
||||
LinkedList<String>[] report() {
|
||||
lockforreport = true;
|
||||
LinkedList<String>[] out = new LinkedList[] {(LinkedList) crawled, (LinkedList) found};
|
||||
crawled = new LinkedList<>();
|
||||
found = new LinkedList<>();
|
||||
lockforreport = false;
|
||||
thread.interrupt();
|
||||
return out;
|
||||
List<String>[] report() {
|
||||
synchronized (this) {
|
||||
List<String>[] out = new List[]{crawled, found};
|
||||
crawled = new LinkedList<>();
|
||||
found = new LinkedList<>();
|
||||
return out;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -121,5 +130,6 @@ public class CrawlerThread implements Runnable {
|
|||
}
|
||||
*/
|
||||
|
||||
private native void crawl(String videid, int threadid);
|
||||
// returns false when it fails
|
||||
private native boolean crawl(String videid, int threadid);
|
||||
}
|
|
@ -3,6 +3,7 @@ package de.mrbesen.youtubecrawler;
|
|||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParseException;
|
||||
|
@ -10,7 +11,6 @@ import java.text.SimpleDateFormat;
|
|||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javax.net.ssl.HttpsURLConnection;
|
||||
import javax.xml.datatype.DatatypeConfigurationException;
|
||||
import javax.xml.datatype.DatatypeFactory;
|
||||
import javax.xml.datatype.Duration;
|
||||
|
@ -42,13 +42,22 @@ public class YoutubeAPI {
|
|||
return getInfos(id)[0].get(0);
|
||||
}
|
||||
|
||||
public List<Crawler.Video>[] getInfos(List<String> ids) {
|
||||
public List<Crawler.Video>[] getInfos(Collection<String> ids) {
|
||||
if(ids.isEmpty())
|
||||
return null;
|
||||
|
||||
StringBuilder sb = new StringBuilder(ids.remove(0));
|
||||
while(!ids.isEmpty()) {
|
||||
sb.append(',').append(ids.remove(0));
|
||||
StringBuilder sb = new StringBuilder();
|
||||
boolean isFirst = true;
|
||||
for(String id : ids) {
|
||||
if(id.matches("[a-zA-Z0-9_-]{11}")) {
|
||||
if(!isFirst) {
|
||||
sb.append(',');
|
||||
}
|
||||
sb.append(id);
|
||||
isFirst = false;
|
||||
} else {
|
||||
System.out.println("non matching id: \"" + id + "\"");
|
||||
}
|
||||
}
|
||||
return getInfos(sb.toString());
|
||||
}
|
||||
|
@ -146,9 +155,12 @@ public class YoutubeAPI {
|
|||
}
|
||||
|
||||
public BufferedReader connect(String url) {
|
||||
if(url == null)
|
||||
return null;
|
||||
try {
|
||||
URL urll = new URL(url);
|
||||
HttpsURLConnection con = (HttpsURLConnection) urll.openConnection();
|
||||
//System.out.println("url: " + urll.toString());
|
||||
HttpURLConnection con = (HttpURLConnection) urll.openConnection();
|
||||
con.connect();
|
||||
//System.out.println(con.getResponseCode());
|
||||
return new BufferedReader(new InputStreamReader(con.getInputStream()));
|
||||
|
|
|
@ -1,10 +1,8 @@
|
|||
package de.mrbesen.youtubecrawler;
|
||||
|
||||
import org.json.JSONObject;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import javax.xml.datatype.DatatypeConfigurationException;
|
||||
import java.util.Map;
|
||||
|
||||
public class YoutubeAPITest {
|
||||
|
@ -17,7 +15,7 @@ public class YoutubeAPITest {
|
|||
|
||||
Assertions.assertEquals(vid.id, "gsvKF2ojUzs");
|
||||
Assertions.assertEquals(vid.title, "Mikebøi - Missed");
|
||||
Assertions.assertEquals(vid.channel, "Trap Nation");
|
||||
Assertions.assertEquals(vid.channel, "TrapNation");
|
||||
Assertions.assertEquals(vid.tags, "mikeboi missed,trap nation,trapnation,трап натион,electronic music 2020,Trap,Electronic Dance Music,missed trap nation,trap music,Electronic Music,Trap Music,Dance Music,missed mike boi,gaming music,Trap Music 2017,mike boy missed,mikebøi - missed,нас не догонят ремикс,Trap Nation,TrapNation,Mikebøi - Missed,trap nation 2020,trap music 2020 remix,EDM,missed mikeboi,music");
|
||||
Assertions.assertEquals(vid.length, 213);
|
||||
Assertions.assertEquals(vid.languageCode, "en");
|
||||
|
|
Loading…
Reference in New Issue