improved crawling

This commit is contained in:
mrbesen 2022-02-20 23:53:05 +01:00
parent 928942e3ef
commit 01104a2f5e
Signed by: MrBesen
GPG Key ID: 596B2350DCD67504
6 changed files with 122 additions and 85 deletions

3
.gitignore vendored
View File

@ -8,4 +8,5 @@ crawl.conf
crawl.txt
*.so
.idea/
.vscode/settings.json
.vscode/settings.json
includes/

View File

@ -1,6 +1,7 @@
#include "de_mrbesen_youtubecrawler_CrawlerThread.h"
#include <iostream>
#include <set>
#include <string>
#include <sstream>
#include <regex>
@ -8,6 +9,7 @@
#include <curl/curl.h>
static const std::string YOUTUBEBASE = "https://youtube.com/watch?v=";
static const std::regex YOUTUBELINKPATTERN("watch\\?v=([-_a-zA-Z0-9]{11})");
@ -57,11 +59,11 @@ std::string download(CURL* curl, const std::string& url) {
return out.str();
}
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) {
JNIEXPORT jboolean JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) {
// get videoid argument
jboolean myfalseval = false;
const char* cvideid = env->GetStringUTFChars(videoid, &myfalseval);
std::string svideoid(cvideid);
const std::string svideoid(cvideid); // makes a copy
env->ReleaseStringUTFChars(videoid, cvideid);
// std::cout << "crawl: " << svideoid << std::endl;
@ -71,27 +73,40 @@ JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv
std::string webcontent = download(curl, YOUTUBEBASE + svideoid);
if(webcontent.empty()) {
std::cout << "webcontent is empty" << std::endl;
return JNI_FALSE;
}
jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread");
jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found" , "Ljava/util/LinkedList;");
jclass jclass_ll = env->FindClass("java/util/LinkedList");
jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z");
jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread"); // class of CrawlerThread
jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found" , "Ljava/util/List;"); // fieldID of CrawlerThread.found
jclass jclass_ll = env->FindClass("java/util/List"); // java.util.List class
jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z"); // add() method of the List interface
jobject ll_found = env->GetObjectField(that, fid_ctfound); // linked list to store the results to
// match regex
auto it = std::sregex_iterator(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN);
auto itend = std::sregex_iterator();
std::sregex_iterator it = std::sregex_iterator(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN);
const std::sregex_iterator itend = std::sregex_iterator();
// this set is used to find duplicated strings; it does not catch everything, because it is only valid during this crawl, but it should filter out a few with very little effort
std::set<std::string> known;
known.insert(svideoid); // do not "find" the same ytid
env->MonitorEnter(that); // synchronized(this) {
for( ; it != itend; ++it) {
std::string ytid = (*it)[1].str();
const std::string ytid = (*it)[1].str(); // get the String from the first sub-group match
// add to the found list
if(ytid != svideoid) {
//construct java string
jstring jytid = env->NewStringUTF(ytid.c_str());
jobject ll_found = env->GetObjectField(that, fid_ctfound);
jboolean b = env->CallBooleanMethod(ll_found, mid_add, jytid);
// only if the ytid is not known
if(known.find(ytid) == known.end()) {
// add to the found list
// std::cout << ytid << std::endl;
jstring jytid = env->NewStringUTF(ytid.c_str()); // create a java string object
jboolean b = env->CallBooleanMethod(ll_found, mid_add, jytid); // call add() on the LinkedList object with the String
known.insert(ytid);
}
}
env->MonitorExit(that); // end of synchronized block
return JNI_TRUE;
}

View File

@ -7,12 +7,7 @@ import java.io.IOException;
import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.*;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
@ -24,8 +19,8 @@ public class Crawler implements Runnable {
int requestlimit = 5;//amount of videos to be left in the todo queue of a thread until it requests new videos
private int idlecount = 5;//amount of idle loops allowed
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, which need to be analysed
private LinkedList<String> toCrawl = new LinkedList<>();//all videos to crawl
private Set<String> toSave = new TreeSet<>();//all found ytids, which need to be analysed
private Set<String> toCrawl = new TreeSet<>();//all videos to crawl
//private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test whether they are already known; if not, they are moved to toCrawl
private List<CrawlerThread> threads;//list of all threads
private List<CrawlerThread> requested = new LinkedList<>();
@ -73,7 +68,7 @@ public class Crawler implements Runnable {
//if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
//toknown.add(videoid);
if(toCrawl.contains(videoid)) {
ArrayList<String> str = new ArrayList<String>(1);
ArrayList<String> str = new ArrayList<>(1);
str.add(videoid);
db.storeTemp(str, false);
}
@ -92,11 +87,13 @@ public class Crawler implements Runnable {
}
private void send(CrawlerThread t) {
// listlock.writeLock().lock();
for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
t.todo.add(toCrawl.removeFirst());
synchronized (toCrawl) {
for (int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
String s = toCrawl.stream().findAny().get();
toCrawl.remove(s);
t.todo.add(s);
}
}
// listlock.writeLock().unlock();
t.requested = false;
}
@ -156,7 +153,7 @@ public class Crawler implements Runnable {
for (CrawlerThread crawlerThread : threads) {
String threadname = crawlerThread.thread.getName();
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
LinkedList<String>[] report = crawlerThread.report();
List<String>[] report = crawlerThread.report();
crawlcount+= report[0].size();
toSave.addAll(report[0]);
crawlerThread.crawled.clear();
@ -186,17 +183,21 @@ public class Crawler implements Runnable {
private void savetodb() {
log.info("save " + toSave.size() + " videos to DB.");
while(!toSave.isEmpty()) {
LinkedList<String> videoids = new LinkedList<>();
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
videoids.add(toSave.remove(0));
}
if(videoids.size() > 0) {
profiler.startSection("getinfo");
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
profiler.endStartSection("sendtoDB");
db.addVideos(videos, false);
profiler.endSection();//sendtoDB
synchronized (toSave) {
while (!toSave.isEmpty()) {
Set<String> videoids = new TreeSet<>();
for (int i = 0; i < 50 && !toSave.isEmpty(); i++) {
String save = toSave.stream().findAny().get();
toSave.remove(save);
videoids.add(save);
}
if (videoids.size() > 0) {
profiler.startSection("getinfo");
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
profiler.endStartSection("sendtoDB");
db.addVideos(videos, false);
profiler.endSection();//sendtoDB
}
}
}
}
@ -393,7 +394,7 @@ public class Crawler implements Runnable {
out += "\nThread Nr, todo size, requested, crawledsize, foundsize";
for (int i = 0; i < threads.size(); i++) {
CrawlerThread thre = threads.get(i);
out += "\n " + i + " " + (thre.lockforreport ? "\uD83D\uDD12" : "\uD83D\uDD13") + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
out += "\n " + i + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
}
}
return out;

View File

@ -2,6 +2,8 @@ package de.mrbesen.youtubecrawler;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -16,11 +18,13 @@ public class CrawlerThread implements Runnable {
Thread thread;
LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
LinkedList<String> crawled = new LinkedList<>();//videos this thread had crawled
LinkedList<String> found = new LinkedList<>();//videos this thread had found
List<String> crawled = new LinkedList<>();//videos this thread had crawled
List<String> found = new LinkedList<>();//videos this thread had found
static AtomicInteger fails = new AtomicInteger(0);
private static int MAXFAILS = 100;
boolean requested = true;//is a request pending?
boolean lockforreport = false;
private int threadid;
static {
@ -54,50 +58,55 @@ public class CrawlerThread implements Runnable {
@Override
public void run() {
while(parent.isCrawling()) {
while(!todo.isEmpty() && parent.isCrawling()) {
if(lockforreport) {
try {
Thread.sleep(10);
} catch(InterruptedException e) {
lockforreport = false;
synchronized (this) {
while (!todo.isEmpty() && parent.isCrawling()) {
String vid = todo.removeFirst();
// System.out.println("crawling: " + vid + " size: " + found.size());
crawled.add(vid);
boolean success = crawl(vid, threadid);
if (todo.size() < parent.requestlimit && !requested) {
requested = true;
parent.request(this);
}
if (!success) {
int val = fails.addAndGet(1);
if (val > MAXFAILS) {
System.err.println("Max Crawlfails reached, stopping");
parent.stop();
break;
}
}
}
String vid = todo.removeFirst();
// System.out.println("crawling: " + vid + " size: " + found.size());
crawled.add(vid);
crawl(vid, threadid);
if(todo.size() < parent.requestlimit && !requested) {
if (todo.isEmpty() && !requested) {
requested = true;
parent.request(this);
}
}
if(todo.isEmpty() && !requested) {
requested = true;
parent.request(this);
log.warn("No Object left!");
Thread.yield();
try {
Thread.sleep(10000);//sleep for 10 seconds
} catch (InterruptedException ignored) {
}
}
log.warn("No Object left!");
Thread.yield();
try {
Thread.sleep(10000);//sleep for 10 seconds
} catch (InterruptedException ignored) {}
}
log.info("Stopped.");
}
/**
* returns a linkedlist of all crawled videos
* returns a list of all crawled videos
* @return
*/
LinkedList<String>[] report() {
lockforreport = true;
LinkedList<String>[] out = new LinkedList[] {(LinkedList) crawled, (LinkedList) found};
crawled = new LinkedList<>();
found = new LinkedList<>();
lockforreport = false;
thread.interrupt();
return out;
List<String>[] report() {
synchronized (this) {
List<String>[] out = new List[]{crawled, found};
crawled = new LinkedList<>();
found = new LinkedList<>();
return out;
}
}
/*
@ -121,5 +130,6 @@ public class CrawlerThread implements Runnable {
}
*/
private native void crawl(String videid, int threadid);
// returns false when it fails
private native boolean crawl(String videid, int threadid);
}

View File

@ -3,6 +3,7 @@ package de.mrbesen.youtubecrawler;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
@ -10,7 +11,6 @@ import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
import javax.net.ssl.HttpsURLConnection;
import javax.xml.datatype.DatatypeConfigurationException;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.Duration;
@ -42,13 +42,22 @@ public class YoutubeAPI {
return getInfos(id)[0].get(0);
}
public List<Crawler.Video>[] getInfos(List<String> ids) {
public List<Crawler.Video>[] getInfos(Collection<String> ids) {
if(ids.isEmpty())
return null;
StringBuilder sb = new StringBuilder(ids.remove(0));
while(!ids.isEmpty()) {
sb.append(',').append(ids.remove(0));
StringBuilder sb = new StringBuilder();
boolean isFirst = true;
for(String id : ids) {
if(id.matches("[a-zA-Z0-9_-]{11}")) {
if(!isFirst) {
sb.append(',');
}
sb.append(id);
isFirst = false;
} else {
System.out.println("non matching id: \"" + id + "\"");
}
}
return getInfos(sb.toString());
}
@ -146,9 +155,12 @@ public class YoutubeAPI {
}
public BufferedReader connect(String url) {
if(url == null)
return null;
try {
URL urll = new URL(url);
HttpsURLConnection con = (HttpsURLConnection) urll.openConnection();
//System.out.println("url: " + urll.toString());
HttpURLConnection con = (HttpURLConnection) urll.openConnection();
con.connect();
//System.out.println(con.getResponseCode());
return new BufferedReader(new InputStreamReader(con.getInputStream()));

View File

@ -1,10 +1,8 @@
package de.mrbesen.youtubecrawler;
import org.json.JSONObject;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import javax.xml.datatype.DatatypeConfigurationException;
import java.util.Map;
public class YoutubeAPITest {
@ -17,7 +15,7 @@ public class YoutubeAPITest {
Assertions.assertEquals(vid.id, "gsvKF2ojUzs");
Assertions.assertEquals(vid.title, "Mikebøi - Missed");
Assertions.assertEquals(vid.channel, "Trap Nation");
Assertions.assertEquals(vid.channel, "TrapNation");
Assertions.assertEquals(vid.tags, "mikeboi missed,trap nation,trapnation,трап натион,electronic music 2020,Trap,Electronic Dance Music,missed trap nation,trap music,Electronic Music,Trap Music,Dance Music,missed mike boi,gaming music,Trap Music 2017,mike boy missed,mikebøi - missed,нас не догонят ремикс,Trap Nation,TrapNation,Mikebøi - Missed,trap nation 2020,trap music 2020 remix,EDM,missed mikeboi,music");
Assertions.assertEquals(vid.length, 213);
Assertions.assertEquals(vid.languageCode, "en");