86 lines
2.0 KiB
Java
86 lines
2.0 KiB
Java
package de.mrbesen.youtubecrawler;
|
|
|
|
import java.io.IOException;
|
|
import java.util.LinkedList;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
import org.apache.log4j.Logger;
|
|
|
|
public class CrawlerThread implements Runnable {
|
|
|
|
private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");
|
|
|
|
private Logger log = Logger.getLogger(this.getClass().getName());
|
|
private Crawler parent;
|
|
|
|
LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
|
|
LinkedList<String> list = new LinkedList<>();//videos this thread had crawled
|
|
|
|
boolean requested = true;//is a request pending?
|
|
|
|
public CrawlerThread( Crawler root) {
|
|
parent = root;
|
|
root.request(this);
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
while(parent.isCrawling()) {
|
|
while(!todo.isEmpty() && parent.isCrawling()) {
|
|
crawl(todo.removeFirst());
|
|
if(todo.size() < 5 && !requested) {
|
|
requested = true;
|
|
parent.request(this);
|
|
}
|
|
}
|
|
|
|
if(todo.isEmpty() && !requested) {
|
|
requested = true;
|
|
parent.request(this);
|
|
}
|
|
|
|
log.warn("No Object left!");
|
|
Thread.yield();
|
|
try {
|
|
Thread.sleep(10000);//sleep for 10 seconds
|
|
} catch (InterruptedException ignored) {}
|
|
}
|
|
log.info("Stopped.");
|
|
}
|
|
|
|
/**
|
|
* returns a linkedlist of all crawled videos
|
|
* @return
|
|
*/
|
|
LinkedList<String> report() {
|
|
return list;
|
|
}
|
|
|
|
private void crawl(String videoid) {
|
|
try {
|
|
list.add(videoid);
|
|
|
|
// log.info("crawling: " + videoid);
|
|
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
|
|
String s = con.getContent();
|
|
Matcher matcher = linkpattern.matcher(s);
|
|
while(matcher.find()) {
|
|
int beginytid = matcher.end();
|
|
int endxtid = s.indexOf('"', beginytid);
|
|
int endid = s.indexOf('&', beginytid);
|
|
if(endid < endxtid) {
|
|
endxtid = endid;
|
|
}
|
|
String ytid = s.substring(beginytid, endxtid);
|
|
if(ytid.length() > 9 && ytid.length() <= 12) {
|
|
parent.addtoCrawl(ytid);
|
|
} else {
|
|
// log.warn("youtube id has wrong length: \"" + ytid + "\"");
|
|
}
|
|
}
|
|
} catch(IOException e) {
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
} |