YoutubeCrawler/src/de/mrbesen/youtubecrawler/CrawlerThread.java

114 lines
2.7 KiB
Java

package de.mrbesen.youtubecrawler;
import java.io.IOException;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
/**
 * Worker that downloads YouTube watch pages and collects the video ids linked from them.
 *
 * <p>The worker takes ids from {@link #todo}, fetches each watch page, and records the id in
 * {@link #crawled} and every linked id in {@link #found}. When its queue runs low it asks the
 * parent {@link Crawler} for more work via {@code parent.request(this)}. Results are collected
 * through {@link #report()}.
 *
 * <p>NOTE(review): {@link #todo} and {@link #requested} are plain package-private fields; if
 * the parent touches them from another thread, that access is not synchronized here — confirm
 * against {@code Crawler}.
 */
public class CrawlerThread implements Runnable {

    /** Matches the start of a relative watch link; the video id begins right after the match. */
    private static final Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");

    private final Logger log = Logger.getLogger(this.getClass().getName());
    private final Crawler parent;

    /** Guards {@link #crawled} and {@link #found} against a concurrent {@link #report()}. */
    private final Object reportLock = new Object();

    Thread thread;
    LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
    LinkedList<String> crawled = new LinkedList<>();//videos this thread had crawled
    LinkedList<String> found = new LinkedList<>();//videos this thread had found
    boolean requested = true;//is a request pending?
    /**
     * True while {@link #report()} is swapping the result lists. Kept for package-level
     * compatibility; the actual mutual exclusion is provided by {@link #reportLock}.
     */
    volatile boolean lockforreport = false;

    /**
     * Creates a worker and immediately asks {@code root} for work.
     *
     * @param root the crawler this worker belongs to
     */
    public CrawlerThread(Crawler root) {
        parent = root;
        root.request(this);
    }

    /**
     * Stores the thread that executes this worker so that {@link #report()} can interrupt it.
     *
     * @param t the executing thread
     */
    void setThread(Thread t) {
        thread = t;
    }

    /**
     * @return the live queue of ids that have not been crawled yet (not a copy)
     */
    LinkedList<String> undone() {
        return todo;
    }

    /**
     * @return the number of ids still waiting in the queue
     */
    int undoneSize() {
        return todo.size();
    }

    /**
     * Crawls queued ids until the parent stops crawling. While the queue is empty the worker
     * sleeps for ten seconds between checks.
     */
    @Override
    public void run() {
        while (parent.isCrawling()) {
            while (!todo.isEmpty() && parent.isCrawling()) {
                crawl(todo.removeFirst());
                if (todo.size() < parent.requestlimit && !requested) {
                    requested = true;
                    parent.request(this);
                }
            }
            if (!parent.isCrawling()) {
                // Stopped while working: do not request more work, log a misleading
                // "empty" warning, or delay shutdown by the idle sleep below.
                break;
            }
            if (todo.isEmpty() && !requested) {
                requested = true;
                parent.request(this);
            }
            log.warn("No Object left!");
            try {
                Thread.sleep(10000);//sleep for 10 seconds
            } catch (InterruptedException ignored) {
                // report() interrupts this thread, which merely cuts the idle wait short.
                // The interrupt flag is deliberately not restored: doing so would make every
                // following sleep return immediately and turn this loop into a busy spin.
            }
        }
        log.info("Stopped.");
    }

    /**
     * Hands over the results gathered since the previous call and starts fresh lists.
     * Afterwards the worker thread, if one was set, is interrupted.
     *
     * @return a two-element array: index 0 holds the crawled ids, index 1 the found ids
     */
    @SuppressWarnings({"unchecked", "rawtypes"}) // generic array creation is not possible in Java
    LinkedList<String>[] report() {
        LinkedList<String>[] out;
        synchronized (reportLock) {
            lockforreport = true;
            out = new LinkedList[] {crawled, found};
            crawled = new LinkedList<>();
            found = new LinkedList<>();
            lockforreport = false;
        }
        Thread worker = thread;
        if (worker != null) { // setThread may not have been called yet
            worker.interrupt();
        }
        return out;
    }

    /**
     * Fetches the watch page of {@code videoid} and records all video ids linked from it.
     * The id is added to {@link #crawled} even when the download fails.
     *
     * @param videoid the id of the video whose watch page is fetched
     */
    private void crawl(String videoid) {
        try {
            synchronized (reportLock) {
                crawled.add(videoid);
            }
            HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
            LinkedList<String> ids = extractVideoIds(con.getContent());
            synchronized (reportLock) {
                found.addAll(ids);
            }
        } catch (IOException e) {
            log.error("Could not crawl video " + videoid, e);
        }
    }

    /**
     * Extracts the video ids of all watch links in {@code content}.
     *
     * <p>An id ends at the first {@code "} or {@code &} after the link prefix, whichever comes
     * first. Links with neither terminator are skipped, as are ids whose length is not
     * between 10 and 12 characters.
     *
     * @param content page content; {@code null} yields an empty list (defensive, the contract
     *                of {@code HTTPS.getContent()} is not visible here)
     * @return the ids in order of appearance, possibly containing duplicates
     */
    private static LinkedList<String> extractVideoIds(String content) {
        LinkedList<String> ids = new LinkedList<>();
        if (content == null) {
            return ids;
        }
        Matcher matcher = linkpattern.matcher(content);
        while (matcher.find()) {
            int start = matcher.end();
            int end = content.indexOf('"', start);
            int ampersand = content.indexOf('&', start);
            // indexOf returns -1 when nothing is found; only let '&' win if it really exists.
            if (ampersand >= 0 && (end < 0 || ampersand < end)) {
                end = ampersand;
            }
            if (end < 0) {
                continue; // unterminated link, e.g. truncated content
            }
            String ytid = content.substring(start, end);
            if (ytid.length() > 9 && ytid.length() <= 12) {
                ids.add(ytid);
            }
        }
        return ids;
    }
}