package de.mrbesen.youtubecrawler;

import java.io.IOException;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

/**
 * Worker that takes video ids from its {@code todo} queue, downloads the
 * matching {@code https://youtube.com/watch?v=...} page and collects the ids
 * of all watch links found on that page.
 *
 * <p>The results are handed to the owner through {@link #report()}. The
 * hand-off of {@code crawled} and {@code found} is guarded by an internal
 * monitor so that a report never observes a list that is still being filled.
 *
 * <p>NOTE(review): {@code todo} is a plain {@link LinkedList} that is consumed
 * here and presumably filled by {@link Crawler} from another thread; that
 * access is not synchronized in this class - confirm against the caller.
 */
public class CrawlerThread implements Runnable {

    /** Matches the start of a relative watch link; the video id starts right after the match. */
    private static final Pattern LINK_PATTERN = Pattern.compile("href=\"\\/watch\\?v=");

    /** Pause between two polls of an empty queue, in milliseconds. */
    private static final long IDLE_SLEEP_MILLIS = 10000;

    private final Logger log = Logger.getLogger(this.getClass().getName());
    private final Crawler parent;

    /** Guards every access to {@link #crawled} and {@link #found} made by this class. */
    private final Object reportLock = new Object();

    /** Thread executing {@link #run()}; set through {@link #setThread(Thread)}, may be {@code null}. */
    Thread thread;

    /** Videos this thread should crawl. */
    LinkedList<String> todo = new LinkedList<>();
    /** Videos this thread has crawled since the last report. */
    LinkedList<String> crawled = new LinkedList<>();
    /** Videos this thread has found since the last report. */
    LinkedList<String> found = new LinkedList<>();

    /** Is a request for more work pending? Volatile because it is shared with the parent. */
    volatile boolean requested = true;

    /**
     * True while {@link #report()} swaps the result lists.
     * NOTE(review): mutual exclusion is provided by {@code reportLock}; this flag
     * is only kept for package-level code that may read it - no reader is visible
     * in this file.
     */
    volatile boolean lockforreport = false;

    /**
     * Creates a worker and immediately asks {@code root} for work.
     *
     * @param root the crawler that owns this worker and supplies its work
     */
    public CrawlerThread(Crawler root) {
        parent = root;
        root.request(this);
    }

    /**
     * Registers the thread that executes this worker so {@link #report()} can wake it.
     *
     * @param t the thread running {@link #run()}
     */
    void setThread(Thread t) {
        thread = t;
    }

    /**
     * @return the live queue of videos that have not been crawled yet (not a copy)
     */
    LinkedList<String> undone() {
        return todo;
    }

    /**
     * @return the number of videos that have not been crawled yet
     */
    int undoneSize() {
        return todo.size();
    }

    /**
     * Crawls queued videos for as long as the parent is crawling. More work is
     * requested once the queue drops below {@code parent.requestlimit}; when the
     * queue is empty the thread sleeps until it is interrupted or the idle
     * period has passed.
     */
    @Override
    public void run() {
        while (parent.isCrawling()) {
            while (!todo.isEmpty() && parent.isCrawling()) {
                crawl(todo.removeFirst());
                if (todo.size() < parent.requestlimit && !requested) {
                    requestWork();
                }
            }

            if (!parent.isCrawling()) {
                // Shutting down: do not log a misleading warning or sleep another idle period.
                break;
            }

            if (todo.isEmpty() && !requested) {
                requestWork();
            }
            log.warn("No Object left!");
            Thread.yield();
            try {
                Thread.sleep(IDLE_SLEEP_MILLIS);
            } catch (InterruptedException ignored) {
                // report() interrupts this thread on purpose; it is only a wake-up
                // signal, so the loop simply re-checks its conditions.
            }
        }
        log.info("Stopped.");
    }

    /**
     * Hands over everything collected since the previous call and starts new,
     * empty result lists.
     *
     * @return an array of two lists: index 0 holds the crawled video ids,
     *         index 1 the video ids found on the crawled pages
     */
    LinkedList<String>[] report() {
        @SuppressWarnings("unchecked") // generic array creation; only LinkedList<String> is stored
        LinkedList<String>[] out = new LinkedList[2];

        synchronized (reportLock) {
            lockforreport = true;
            try {
                out[0] = crawled;
                out[1] = found;
                crawled = new LinkedList<>();
                found = new LinkedList<>();
            } finally {
                lockforreport = false;
            }
        }

        Thread worker = thread;
        if (worker != null) { // setThread() may not have been called yet
            worker.interrupt();
        }
        return out;
    }

    /** Marks a work request as pending and forwards it to the parent. */
    private void requestWork() {
        requested = true;
        parent.request(this);
    }

    /**
     * Downloads the watch page of {@code videoid} and records every plausible
     * video id linked from it. The video counts as crawled even when the
     * download fails.
     *
     * @param videoid id of the video whose page is fetched
     */
    private void crawl(String videoid) {
        synchronized (reportLock) {
            crawled.add(videoid);
        }
        try {
            HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
            String s = con.getContent();
            if (s == null) {
                // Defensive: the contract of HTTPS.getContent() is not visible here.
                return;
            }
            LinkedList<String> ids = extractVideoIds(s);
            synchronized (reportLock) {
                found.addAll(ids);
            }
        } catch (IOException e) {
            log.error("Could not crawl video " + videoid, e);
        }
    }

    /**
     * Collects the ids of all watch links in {@code html}, in order of
     * appearance. An id ends at the next {@code "} or {@code &}; only ids of
     * 10 to 12 characters are kept.
     *
     * @param html page source to scan
     * @return the ids found, possibly empty, duplicates included
     */
    private static LinkedList<String> extractVideoIds(String html) {
        LinkedList<String> ids = new LinkedList<>();
        Matcher matcher = LINK_PATTERN.matcher(html);
        while (matcher.find()) {
            int begin = matcher.end();
            int end = firstDelimiter(html, begin);
            if (end == -1) {
                // No terminator after this link, so none can follow a later link either.
                break;
            }
            String ytid = html.substring(begin, end);
            if (ytid.length() > 9 && ytid.length() <= 12) {
                ids.add(ytid);
            }
        }
        return ids;
    }

    /**
     * Finds the position of the first {@code "} or {@code &} at or after {@code from}.
     *
     * @return the smaller of both indices, the only one that exists, or -1 if neither occurs
     */
    private static int firstDelimiter(String s, int from) {
        int quote = s.indexOf('"', from);
        int amp = s.indexOf('&', from);
        if (quote == -1) {
            return amp;
        }
        if (amp == -1) {
            return quote;
        }
        return Math.min(quote, amp);
    }
}