package de.mrbesen.youtubecrawler; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.LinkedList; import java.util.List; import java.util.Scanner; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; public class Crawler implements Runnable { private LinkedList toSave = new LinkedList<>();//all found ytids, witch need to be analysed private LinkedList toCrawl = new LinkedList<>();//all videos tu crawl private LinkedList toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v="); private boolean crawl = true; private int crawlcount = 0; private DB db = new DB(); private YoutubeAPI api = new YoutubeAPI(); private File crawlfile = new File("crawl.txt"); private Logger log = Logger.getLogger(Crawler.class.getName()); private static int maxvideostotest = 100; public void stop() { crawl = false; } public void addtoCrawl(String videoid) { if(! (toCrawl.contains(videoid) || toknown.contains(videoid))) toknown.add(videoid); } private void crawl(String videoid) { try { crawlcount++; log.info("crawling: " + videoid); toSave.add(videoid); HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid); String s = con.getContent(); Matcher matcher = linkpattern.matcher(s); while(matcher.find()) { int beginytid = matcher.end(); int endxtid = s.indexOf('"', beginytid); int endid = s.indexOf('&', beginytid); if(endid < endxtid) { endxtid = endid; } String ytid = s.substring(beginytid, endxtid); if(ytid.length() > 9 && ytid.length() <= 12) { addtoCrawl(ytid); } else { log.warn("youtube id has wrong length: \"" + ytid + "\""); } } } catch(IOException e) { e.printStackTrace(); } } @Override public void run() { long start = System.currentTimeMillis(); log.info("Try to load crawlfile"); if(crawlfile.exists()) { try { Scanner in = new Scanner(crawlfile); while(in.hasNextLine()) { String line = in.nextLine(); if(line == null) { break; } else { if(!line.isEmpty()) { addtoCrawl(line); } } } in.close(); } catch(IOException e) { log.warn("Error while loading crawl file."); e.printStackTrace(); } } while(crawl) { log.info("to Crawl: " + toCrawl.size()); while(!toCrawl.isEmpty() && crawl) { crawl(toCrawl.remove(0)); } if(toknown.isEmpty()) {//very uncommon log.warn("nothing left."); crawl = false; //delete / clear crawl file } else { LinkedList tocheck = new LinkedList<>(); for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) { tocheck.add(toknown.removeFirst()); } toCrawl.addAll(db.checkvideos(tocheck)); try { PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile))); for(String t : toCrawl) { p.println(t); } p.close(); } catch (IOException e) { log.error("Error writing crawlfile.", e); } } // System.out.println("try to save " + toSave.size() + " videos."); while(!toSave.isEmpty()) { LinkedList videoids = new LinkedList<>(); for(int i = 0; i < 50 && !toSave.isEmpty(); i++) { videoids.add(toSave.remove(0)); } if(videoids.size() > 0) { List