improved crawling

This commit is contained in:
mrbesen 2022-02-20 23:53:05 +01:00
parent 928942e3ef
commit 01104a2f5e
Signed by: MrBesen
GPG Key ID: 596B2350DCD67504
6 changed files with 122 additions and 85 deletions

3
.gitignore vendored
View File

@ -8,4 +8,5 @@ crawl.conf
crawl.txt
*.so
.idea/
.vscode/settings.json
.vscode/settings.json
includes/

View File

@ -1,6 +1,7 @@
#include "de_mrbesen_youtubecrawler_CrawlerThread.h"
#include <iostream>
#include <set>
#include <string>
#include <sstream>
#include <regex>
@ -8,6 +9,7 @@
#include <curl/curl.h>
static const std::string YOUTUBEBASE = "https://youtube.com/watch?v=";
static const std::regex YOUTUBELINKPATTERN("watch\\?v=([-_a-zA-Z0-9]{11})");
@ -57,11 +59,11 @@ std::string download(CURL* curl, const std::string& url) {
return out.str();
}
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) {
JNIEXPORT jboolean JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) {
// get videoid argument
jboolean myfalseval = false;
const char* cvideid = env->GetStringUTFChars(videoid, &myfalseval);
std::string svideoid(cvideid);
const std::string svideoid(cvideid); // makes a copy
env->ReleaseStringUTFChars(videoid, cvideid);
// std::cout << "crawl: " << svideoid << std::endl;
@ -71,27 +73,40 @@ JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv
std::string webcontent = download(curl, YOUTUBEBASE + svideoid);
if(webcontent.empty()) {
std::cout << "webcontent is empty" << std::endl;
return JNI_FALSE;
}
jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread");
jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found" , "Ljava/util/LinkedList;");
jclass jclass_ll = env->FindClass("java/util/LinkedList");
jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z");
jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread"); // class of CrawlerThread
jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found" , "Ljava/util/List;"); // fieldID of CrawlerThread.found
jclass jclass_ll = env->FindClass("java/util/List"); // java.util.List class
jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z"); // add() method of the List interface
jobject ll_found = env->GetObjectField(that, fid_ctfound); // linked list to store the results to
// match regex
auto it = std::sregex_iterator(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN);
auto itend = std::sregex_iterator();
std::sregex_iterator it = std::sregex_iterator(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN);
const std::sregex_iterator itend = std::sregex_iterator();
// this set is used to find duplicated strings; it does not catch everything, because it is only valid during this crawl, but it should filter out a few with very little effort
std::set<std::string> known;
known.insert(svideoid); // do not "find" the same ytid
env->MonitorEnter(that); // synchronized(this) {
for( ; it != itend; ++it) {
std::string ytid = (*it)[1].str();
const std::string ytid = (*it)[1].str(); // get the String from the first sub-group match
// add to the found list
if(ytid != svideoid) {
//construct java string
jstring jytid = env->NewStringUTF(ytid.c_str());
jobject ll_found = env->GetObjectField(that, fid_ctfound);
jboolean b = env->CallBooleanMethod(ll_found, mid_add, jytid);
// only if the ytid is not known
if(known.find(ytid) == known.end()) {
// add to the found list
// std::cout << ytid << std::endl;
jstring jytid = env->NewStringUTF(ytid.c_str()); // create a java string object
jboolean b = env->CallBooleanMethod(ll_found, mid_add, jytid); // call add() on the LinkedList object with the String
known.insert(ytid);
}
}
env->MonitorExit(that); // end of synchronized block
return JNI_TRUE;
}

View File

@ -7,12 +7,7 @@ import java.io.IOException;
import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.*;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
@ -24,8 +19,8 @@ public class Crawler implements Runnable {
int requestlimit = 5;//amount of videos to be left in the todo queue of a thread until it requests new videos
private int idlecount = 5;//amount of idle loops allowed
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, which need to be analysed
private LinkedList<String> toCrawl = new LinkedList<>();//all videos to crawl
private Set<String> toSave = new TreeSet<>();//all found ytids, which need to be analysed
private Set<String> toCrawl = new TreeSet<>();//all videos to crawl
//private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test whether they are already known; if not, they are moved to toCrawl
private List<CrawlerThread> threads;//list of all threads
private List<CrawlerThread> requested = new LinkedList<>();
@ -73,7 +68,7 @@ public class Crawler implements Runnable {
//if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
//toknown.add(videoid);
if(toCrawl.contains(videoid)) {
ArrayList<String> str = new ArrayList<String>(1);
ArrayList<String> str = new ArrayList<>(1);
str.add(videoid);
db.storeTemp(str, false);
}
@ -92,11 +87,13 @@ public class Crawler implements Runnable {
}
private void send(CrawlerThread t) {
// listlock.writeLock().lock();
for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
t.todo.add(toCrawl.removeFirst());
synchronized (toCrawl) {
for (int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
String s = toCrawl.stream().findAny().get();
toCrawl.remove(s);
t.todo.add(s);
}
}
// listlock.writeLock().unlock();
t.requested = false;
}
@ -156,7 +153,7 @@ public class Crawler implements Runnable {
for (CrawlerThread crawlerThread : threads) {
String threadname = crawlerThread.thread.getName();
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
LinkedList<String>[] report = crawlerThread.report();
List<String>[] report = crawlerThread.report();
crawlcount+= report[0].size();
toSave.addAll(report[0]);
crawlerThread.crawled.clear();
@ -186,17 +183,21 @@ public class Crawler implements Runnable {
private void savetodb() {
log.info("save " + toSave.size() + " videos to DB.");
while(!toSave.isEmpty()) {
LinkedList<String> videoids = new LinkedList<>();
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
videoids.add(toSave.remove(0));
}
if(videoids.size() > 0) {
profiler.startSection("getinfo");
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
profiler.endStartSection("sendtoDB");
db.addVideos(videos, false);
profiler.endSection();//sendtoDB
synchronized (toSave) {
while (!toSave.isEmpty()) {
Set<String> videoids = new TreeSet<>();
for (int i = 0; i < 50 && !toSave.isEmpty(); i++) {
String save = toSave.stream().findAny().get();
toSave.remove(save);
videoids.add(save);
}
if (videoids.size() > 0) {
profiler.startSection("getinfo");
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
profiler.endStartSection("sendtoDB");
db.addVideos(videos, false);
profiler.endSection();//sendtoDB
}
}
}
}
@ -393,7 +394,7 @@ public class Crawler implements Runnable {
out += "\nThread Nr, todo size, requested, crawledsize, foundsize";
for (int i = 0; i < threads.size(); i++) {
CrawlerThread thre = threads.get(i);
out += "\n " + i + " " + (thre.lockforreport ? "\uD83D\uDD12" : "\uD83D\uDD13") + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
out += "\n " + i + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
}
}
return out;

View File

@ -2,6 +2,8 @@ package de.mrbesen.youtubecrawler;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -16,11 +18,13 @@ public class CrawlerThread implements Runnable {
Thread thread;
LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
LinkedList<String> crawled = new LinkedList<>();//videos this thread had crawled
LinkedList<String> found = new LinkedList<>();//videos this thread had found
List<String> crawled = new LinkedList<>();//videos this thread had crawled
List<String> found = new LinkedList<>();//videos this thread had found
static AtomicInteger fails = new AtomicInteger(0);
private static int MAXFAILS = 100;
boolean requested = true;//is a request pending?
boolean lockforreport = false;
private int threadid;
static {
@ -54,50 +58,55 @@ public class CrawlerThread implements Runnable {
@Override
public void run() {
while(parent.isCrawling()) {
while(!todo.isEmpty() && parent.isCrawling()) {
if(lockforreport) {
try {
Thread.sleep(10);
} catch(InterruptedException e) {
lockforreport = false;
synchronized (this) {
while (!todo.isEmpty() && parent.isCrawling()) {
String vid = todo.removeFirst();
// System.out.println("crawling: " + vid + " size: " + found.size());
crawled.add(vid);
boolean success = crawl(vid, threadid);
if (todo.size() < parent.requestlimit && !requested) {
requested = true;
parent.request(this);
}
if (!success) {
int val = fails.addAndGet(1);
if (val > MAXFAILS) {
System.err.println("Max Crawlfails reached, stopping");
parent.stop();
break;
}
}
}
String vid = todo.removeFirst();
// System.out.println("crawling: " + vid + " size: " + found.size());
crawled.add(vid);
crawl(vid, threadid);
if(todo.size() < parent.requestlimit && !requested) {
if (todo.isEmpty() && !requested) {
requested = true;
parent.request(this);
}
}
if(todo.isEmpty() && !requested) {
requested = true;
parent.request(this);
log.warn("No Object left!");
Thread.yield();
try {
Thread.sleep(10000);//sleep for 10 seconds
} catch (InterruptedException ignored) {
}
}
log.warn("No Object left!");
Thread.yield();
try {
Thread.sleep(10000);//sleep for 10 seconds
} catch (InterruptedException ignored) {}
}
log.info("Stopped.");
}
/**
* returns a linkedlist of all crawled videos
* returns a list of all crawled videos
* @return
*/
LinkedList<String>[] report() {
lockforreport = true;
LinkedList<String>[] out = new LinkedList[] {(LinkedList) crawled, (LinkedList) found};
crawled = new LinkedList<>();
found = new LinkedList<>();
lockforreport = false;
thread.interrupt();
return out;
List<String>[] report() {
synchronized (this) {
List<String>[] out = new List[]{crawled, found};
crawled = new LinkedList<>();
found = new LinkedList<>();
return out;
}
}
/*
@ -121,5 +130,6 @@ public class CrawlerThread implements Runnable {
}
*/
private native void crawl(String videid, int threadid);
// returns false when it fails
private native boolean crawl(String videid, int threadid);
}

View File

@ -3,6 +3,7 @@ package de.mrbesen.youtubecrawler;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
@ -10,7 +11,6 @@ import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
import javax.net.ssl.HttpsURLConnection;
import javax.xml.datatype.DatatypeConfigurationException;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.Duration;
@ -42,13 +42,22 @@ public class YoutubeAPI {
return getInfos(id)[0].get(0);
}
public List<Crawler.Video>[] getInfos(List<String> ids) {
public List<Crawler.Video>[] getInfos(Collection<String> ids) {
if(ids.isEmpty())
return null;
StringBuilder sb = new StringBuilder(ids.remove(0));
while(!ids.isEmpty()) {
sb.append(',').append(ids.remove(0));
StringBuilder sb = new StringBuilder();
boolean isFirst = true;
for(String id : ids) {
if(id.matches("[a-zA-Z0-9_-]{11}")) {
if(!isFirst) {
sb.append(',');
}
sb.append(id);
isFirst = false;
} else {
System.out.println("non matching id: \"" + id + "\"");
}
}
return getInfos(sb.toString());
}
@ -146,9 +155,12 @@ public class YoutubeAPI {
}
public BufferedReader connect(String url) {
if(url == null)
return null;
try {
URL urll = new URL(url);
HttpsURLConnection con = (HttpsURLConnection) urll.openConnection();
//System.out.println("url: " + urll.toString());
HttpURLConnection con = (HttpURLConnection) urll.openConnection();
con.connect();
//System.out.println(con.getResponseCode());
return new BufferedReader(new InputStreamReader(con.getInputStream()));

View File

@ -1,10 +1,8 @@
package de.mrbesen.youtubecrawler;
import org.json.JSONObject;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import javax.xml.datatype.DatatypeConfigurationException;
import java.util.Map;
public class YoutubeAPITest {
@ -17,7 +15,7 @@ public class YoutubeAPITest {
Assertions.assertEquals(vid.id, "gsvKF2ojUzs");
Assertions.assertEquals(vid.title, "Mikebøi - Missed");
Assertions.assertEquals(vid.channel, "Trap Nation");
Assertions.assertEquals(vid.channel, "TrapNation");
Assertions.assertEquals(vid.tags, "mikeboi missed,trap nation,trapnation,трап натион,electronic music 2020,Trap,Electronic Dance Music,missed trap nation,trap music,Electronic Music,Trap Music,Dance Music,missed mike boi,gaming music,Trap Music 2017,mike boy missed,mikebøi - missed,нас не догонят ремикс,Trap Nation,TrapNation,Mikebøi - Missed,trap nation 2020,trap music 2020 remix,EDM,missed mikeboi,music");
Assertions.assertEquals(vid.length, 213);
Assertions.assertEquals(vid.languageCode, "en");