YoutubeCrawler/src/main/de/mrbesen/youtubecrawler/CrawlerThread.java

135 lines
3.1 KiB
Java
Raw Normal View History

2018-07-16 23:22:32 +02:00
package de.mrbesen.youtubecrawler;
import java.io.IOException;
import java.util.LinkedList;
2022-02-20 23:53:05 +01:00
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
2018-07-16 23:22:32 +02:00
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
public class CrawlerThread implements Runnable {
2021-10-18 16:42:44 +02:00
private static Pattern linkpattern = Pattern.compile("watch\\?v=([-_a-zA-Z0-9]{11})");
2018-07-16 23:22:32 +02:00
private Logger log = Logger.getLogger(this.getClass().getName());
private Crawler parent;
Thread thread;
2018-07-16 23:22:32 +02:00
LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
2022-02-20 23:53:05 +01:00
List<String> crawled = new LinkedList<>();//videos this thread had crawled
List<String> found = new LinkedList<>();//videos this thread had found
static AtomicInteger fails = new AtomicInteger(0);
private static int MAXFAILS = 100;
2018-07-16 23:22:32 +02:00
boolean requested = true;//is a request pending?
2021-10-25 17:51:46 +02:00
private int threadid;
static {
String libpath = System.getProperty("java.library.path");
libpath += ":./";
System.setProperty("java.library.path", libpath);
System.loadLibrary("crawlerthread");
}
public static native void initLib(int threadCount);
public static native void deinitLib();
public CrawlerThread( Crawler root, int threadid) {
2018-07-16 23:22:32 +02:00
parent = root;
root.request(this);
2021-10-25 17:51:46 +02:00
this.threadid = threadid;
2018-07-16 23:22:32 +02:00
}
void setThread(Thread t) {
thread = t;
}
2018-07-16 23:22:32 +02:00
2018-11-14 11:39:21 +01:00
LinkedList<String> undone() {
return todo;
}
int undoneSize() {
return todo.size();
}
2018-07-16 23:22:32 +02:00
@Override
public void run() {
while(parent.isCrawling()) {
2022-02-20 23:53:05 +01:00
synchronized (this) {
while (!todo.isEmpty() && parent.isCrawling()) {
String vid = todo.removeFirst();
// System.out.println("crawling: " + vid + " size: " + found.size());
crawled.add(vid);
boolean success = crawl(vid, threadid);
if (todo.size() < parent.requestlimit && !requested) {
requested = true;
parent.request(this);
}
if (!success) {
int val = fails.addAndGet(1);
if (val > MAXFAILS) {
System.err.println("Max Crawlfails reached, stopping");
parent.stop();
break;
}
}
}
2022-02-20 23:53:05 +01:00
if (todo.isEmpty() && !requested) {
2018-07-16 23:22:32 +02:00
requested = true;
parent.request(this);
}
2021-10-25 17:51:46 +02:00
2022-02-20 23:53:05 +01:00
log.warn("No Object left!");
Thread.yield();
try {
Thread.sleep(10000);//sleep for 10 seconds
} catch (InterruptedException ignored) {
}
2018-07-16 23:22:32 +02:00
}
}
log.info("Stopped.");
}
/**
2022-02-20 23:53:05 +01:00
* returns a list of all crawled videos
2018-07-16 23:22:32 +02:00
* @return
*/
2022-02-20 23:53:05 +01:00
List<String>[] report() {
synchronized (this) {
List<String>[] out = new List[]{crawled, found};
crawled = new LinkedList<>();
found = new LinkedList<>();
return out;
}
2018-07-16 23:22:32 +02:00
}
2021-10-25 17:51:46 +02:00
/*
2018-07-16 23:22:32 +02:00
private void crawl(String videoid) {
try {
crawled.add(videoid);
2018-07-16 23:22:32 +02:00
// log.info("crawling: " + videoid);
2021-10-18 16:42:44 +02:00
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
2018-07-16 23:22:32 +02:00
String s = con.getContent();
Matcher matcher = linkpattern.matcher(s);
while(matcher.find()) {
2021-10-18 16:42:44 +02:00
String ytid = matcher.group(1);
if(!ytid.equals(videoid)) {
found.add(ytid);
2018-07-16 23:22:32 +02:00
}
}
} catch(IOException e) {
e.printStackTrace();
}
}
2021-10-25 17:51:46 +02:00
*/
2022-02-20 23:53:05 +01:00
// returns false when it fails
private native boolean crawl(String videid, int threadid);
2018-07-16 23:22:32 +02:00
}