YoutubeCrawler/src/main/de/mrbesen/youtubecrawler/CrawlerThread.java

135 lines
3.1 KiB
Java

package de.mrbesen.youtubecrawler;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
/**
 * Worker that crawls the video ids queued in {@link #todo}, one after another, by
 * delegating each id to the native {@code crawl} implementation from the
 * {@code crawlerthread} library.
 *
 * <p>Ids that were processed are collected in {@link #crawled}; {@link #found} holds the
 * ids discovered while crawling (nothing in the Java code adds to it, so it is presumably
 * filled by the native side - verify against the native sources). Both lists are handed
 * over to the owner through {@link #report()}.
 *
 * <p>The crawl loop and {@link #report()} synchronize on this instance. The package-private
 * fields are accessed directly by other classes of this package (and possibly by the native
 * library via JNI), so their names and types must stay as they are.
 */
public class CrawlerThread implements Runnable {

    /**
     * Extracts video ids from watch links. Only referenced by the retired pure-Java crawl
     * that is kept as a comment near the end of this class.
     */
    private static final Pattern linkpattern = Pattern.compile("watch\\?v=([-_a-zA-Z0-9]{11})");

    /** Number of failed crawls (summed over all workers) above which crawling is stopped. */
    private static final int MAXFAILS = 100;

    /** How long an idle worker waits before it looks for new work again, in milliseconds. */
    private static final long IDLE_WAIT_MILLIS = 10000;

    /** Failed crawls, shared by all workers. */
    static AtomicInteger fails = new AtomicInteger(0);

    private final Logger log = Logger.getLogger(this.getClass().getName());
    private final Crawler parent;
    private final int threadid;

    Thread thread;
    LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
    List<String> crawled = new LinkedList<>();//videos this thread had crawled
    List<String> found = new LinkedList<>();//videos this thread had found

    /**
     * Is a request pending? Set by this worker right before it asks the parent for more
     * work; it is never cleared in this class, so it has to be reset from outside
     * (presumably by the parent once it delivered new ids - verify against Crawler).
     * Volatile so that such a reset is visible here even if it happens on another thread.
     */
    volatile boolean requested = true;

    static {
        final String libName = "crawlerthread";

        // Kept from the original: extend java.library.path by the working directory.
        // NOTE: the JVM typically reads this property only once during start-up, so the
        // change usually does not influence System.loadLibrary; the fallback below is what
        // actually makes a library in the working directory loadable.
        String libpath = System.getProperty("java.library.path");
        libpath += java.io.File.pathSeparator + "./";
        System.setProperty("java.library.path", libpath);

        try {
            System.loadLibrary(libName);
        } catch (UnsatisfiedLinkError notOnLibraryPath) {
            // Not found on the library path: try the platform specific file name
            // (e.g. libcrawlerthread.so) in the working directory.
            java.io.File local = new java.io.File(System.mapLibraryName(libName));
            if (!local.isFile()) {
                throw notOnLibraryPath;
            }
            System.load(local.getAbsolutePath());
        }
    }

    /** Native set-up of the crawler library; implemented in the crawlerthread library. */
    public static native void initLib(int threadCount);

    /** Native counterpart of {@link #initLib(int)}; implemented in the crawlerthread library. */
    public static native void deinitLib();

    /**
     * Creates a worker and immediately asks {@code root} for work.
     *
     * @param root     the crawler that owns this worker and supplies the video ids
     * @param threadid id passed on to the native crawl call
     */
    public CrawlerThread( Crawler root, int threadid) {
        // assign every field first, so the parent never sees a half initialised worker
        this.parent = root;
        this.threadid = threadid;
        root.request(this);
    }

    /** Stores the thread that executes this worker. */
    void setThread(Thread t) {
        thread = t;
    }

    /**
     * @return the live queue of ids that were not crawled yet (not a copy)
     */
    LinkedList<String> undone() {
        return todo;
    }

    /**
     * @return number of ids that are still waiting to be crawled
     */
    int undoneSize() {
        return todo.size();
    }

    /**
     * Crawls queued ids for as long as the parent reports that crawling is active.
     *
     * <p>Whenever the queue drops below {@code parent.requestlimit} (or runs empty) and no
     * request is pending, more work is requested from the parent. With an empty queue the
     * worker idles for up to {@link #IDLE_WAIT_MILLIS} ms. Once more than {@link #MAXFAILS}
     * crawls have failed, the parent is told to stop.
     */
    @Override
    public void run() {
        while (parent.isCrawling()) {
            synchronized (this) {
                crawlQueued();

                if (!parent.isCrawling()) {
                    // stopped while crawling: terminate right away instead of asking for
                    // more work and idling for another full interval
                    break;
                }

                if (todo.isEmpty() && !requested) {
                    requested = true;
                    parent.request(this);
                }
                log.warn("No Object left!");
                try {
                    // wait() instead of Thread.sleep(): the monitor is released while
                    // idling, so report() is not blocked for the whole interval
                    wait(IDLE_WAIT_MILLIS);
                } catch (InterruptedException ignored) {
                    // Deliberately treated as an early wake-up: the loop condition decides
                    // whether to go on. The interrupt flag is not restored, because that
                    // would make every following wait() return immediately.
                }
            }
        }
        log.info("Stopped.");
    }

    /**
     * Works through the queue until it is empty, crawling stops or the fail limit is hit.
     * Must be called while holding the monitor of this instance.
     */
    private void crawlQueued() {
        while (!todo.isEmpty() && parent.isCrawling()) {
            String vid = todo.removeFirst();
            crawled.add(vid);
            boolean success = crawl(vid, threadid);

            // refill early, before the queue runs dry
            if (todo.size() < parent.requestlimit && !requested) {
                requested = true;
                parent.request(this);
            }

            if (!success && fails.incrementAndGet() > MAXFAILS) {
                log.error("Max Crawlfails reached, stopping");
                parent.stop();
                return;
            }
        }
    }

    /**
     * Hands over the results collected since the previous call and starts new, empty
     * result lists.
     *
     * @return array with two entries: index 0 holds the crawled ids, index 1 the found ids
     */
    @SuppressWarnings("unchecked") // generic array creation; only List<String> is stored
    List<String>[] report() {
        synchronized (this) {
            List<String>[] out = new List[]{crawled, found};
            crawled = new LinkedList<>();
            found = new LinkedList<>();
            return out;
        }
    }

    /*
    Former pure-Java implementation, superseded by the native crawl below; kept for reference.

    private void crawl(String videoid) {
        try {
            crawled.add(videoid);
            // log.info("crawling: " + videoid);
            HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
            String s = con.getContent();
            Matcher matcher = linkpattern.matcher(s);
            while(matcher.find()) {
                String ytid = matcher.group(1);
                if(!ytid.equals(videoid)) {
                    found.add(ytid);
                }
            }
        } catch(IOException e) {
            e.printStackTrace();
        }
    }
    */

    /**
     * Crawls a single video; implemented in the crawlerthread library.
     *
     * @param videid   id of the video to crawl
     * @param threadid id of this worker
     * @return false when it fails
     */
    private native boolean crawl(String videid, int threadid);
}