135 lines
3.1 KiB
Java
135 lines
3.1 KiB
Java
package de.mrbesen.youtubecrawler;
|
|
|
|
import java.io.IOException;
|
|
import java.util.LinkedList;
|
|
import java.util.List;
|
|
import java.util.concurrent.atomic.AtomicInteger;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
import org.apache.log4j.Logger;
|
|
|
|
public class CrawlerThread implements Runnable {
|
|
|
|
private static Pattern linkpattern = Pattern.compile("watch\\?v=([-_a-zA-Z0-9]{11})");
|
|
|
|
private Logger log = Logger.getLogger(this.getClass().getName());
|
|
private Crawler parent;
|
|
Thread thread;
|
|
|
|
LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
|
|
List<String> crawled = new LinkedList<>();//videos this thread had crawled
|
|
List<String> found = new LinkedList<>();//videos this thread had found
|
|
|
|
static AtomicInteger fails = new AtomicInteger(0);
|
|
private static int MAXFAILS = 100;
|
|
|
|
boolean requested = true;//is a request pending?
|
|
private int threadid;
|
|
|
|
static {
|
|
String libpath = System.getProperty("java.library.path");
|
|
libpath += ":./";
|
|
System.setProperty("java.library.path", libpath);
|
|
System.loadLibrary("crawlerthread");
|
|
}
|
|
|
|
public static native void initLib(int threadCount);
|
|
public static native void deinitLib();
|
|
|
|
public CrawlerThread( Crawler root, int threadid) {
|
|
parent = root;
|
|
root.request(this);
|
|
this.threadid = threadid;
|
|
}
|
|
|
|
void setThread(Thread t) {
|
|
thread = t;
|
|
}
|
|
|
|
LinkedList<String> undone() {
|
|
return todo;
|
|
}
|
|
|
|
int undoneSize() {
|
|
return todo.size();
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
while(parent.isCrawling()) {
|
|
synchronized (this) {
|
|
while (!todo.isEmpty() && parent.isCrawling()) {
|
|
String vid = todo.removeFirst();
|
|
// System.out.println("crawling: " + vid + " size: " + found.size());
|
|
crawled.add(vid);
|
|
|
|
boolean success = crawl(vid, threadid);
|
|
|
|
if (todo.size() < parent.requestlimit && !requested) {
|
|
requested = true;
|
|
parent.request(this);
|
|
}
|
|
if (!success) {
|
|
int val = fails.addAndGet(1);
|
|
if (val > MAXFAILS) {
|
|
System.err.println("Max Crawlfails reached, stopping");
|
|
parent.stop();
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (todo.isEmpty() && !requested) {
|
|
requested = true;
|
|
parent.request(this);
|
|
}
|
|
|
|
log.warn("No Object left!");
|
|
Thread.yield();
|
|
try {
|
|
Thread.sleep(10000);//sleep for 10 seconds
|
|
} catch (InterruptedException ignored) {
|
|
}
|
|
}
|
|
}
|
|
log.info("Stopped.");
|
|
}
|
|
|
|
/**
|
|
* returns a list of all crawled videos
|
|
* @return
|
|
*/
|
|
List<String>[] report() {
|
|
synchronized (this) {
|
|
List<String>[] out = new List[]{crawled, found};
|
|
crawled = new LinkedList<>();
|
|
found = new LinkedList<>();
|
|
return out;
|
|
}
|
|
}
|
|
|
|
/*
|
|
private void crawl(String videoid) {
|
|
try {
|
|
crawled.add(videoid);
|
|
|
|
// log.info("crawling: " + videoid);
|
|
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
|
|
String s = con.getContent();
|
|
Matcher matcher = linkpattern.matcher(s);
|
|
while(matcher.find()) {
|
|
String ytid = matcher.group(1);
|
|
if(!ytid.equals(videoid)) {
|
|
found.add(ytid);
|
|
}
|
|
}
|
|
} catch(IOException e) {
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
*/
|
|
|
|
// returns false when it fails
|
|
private native boolean crawl(String videid, int threadid);
|
|
} |