package de.mrbesen.youtubecrawler; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.LinkedList; import java.util.List; import java.util.NoSuchElementException; import java.util.Scanner; import org.apache.log4j.Logger; public class Crawler implements Runnable { private int jobspeerthread = 100; //the amount of jobs a thread get peer request private LinkedList toSave = new LinkedList<>();//all found ytids, witch need to be analysed private LinkedList toCrawl = new LinkedList<>();//all videos tu crawl private LinkedList toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle private List threads;//list of all threads private List requested = new LinkedList<>(); private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss"); private String currentstate = "undefined"; private long start; private boolean crawl = true; private int crawlcount = 0; private int updateOffset = 0; private DB db = new DB(); private YoutubeAPI api = new YoutubeAPI(); private File crawlfile = new File("crawl.txt"); private Logger log = Logger.getLogger(this.getClass().getName()); private int maxvideostotest = 100; private int startup = 2;//to keep the beginning cool public Crawler() { try { maxvideostotest = Integer.parseInt(Config.prop.getProperty("crawler.maxvideos")); } catch(NumberFormatException e) { log.warn("could not read the number \"" + Config.prop.getProperty("crawler.maxvideos") + "\" from the config file. maxvideo"); maxvideostotest = 100; } try { jobspeerthread = Integer.parseInt(Config.prop.getProperty("crawler.jobspeerthread")); } catch(NumberFormatException e) { log.warn("could not read the number \"" + Config.prop.getProperty("crawler.jobspeerthread") + "\" from the config file. maxvideo"); jobspeerthread = 100; } } public void stop() { crawl = false; db.stop(); } public synchronized void addtoCrawl(String videoid) { if(! (toCrawl.contains(videoid) || toknown.contains(videoid))) toknown.add(videoid); } public boolean isCrawling() { return crawl; } public void request(CrawlerThread t) { if(!toCrawl.isEmpty()) { send(t); } else { requested.add(t); } } private void send(CrawlerThread t) { // listlock.writeLock().lock(); for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) { t.todo.add(toCrawl.removeFirst()); } // listlock.writeLock().unlock(); t.requested = false; } @Override public void run() { currentstate = "loading crawlfile"; start = System.currentTimeMillis(); log.info("Try to load crawlfile"); if(crawlfile.exists()) { try { Scanner in = new Scanner(crawlfile); boolean crawl = true;//section of file while(in.hasNextLine()) { String line = in.nextLine(); if(line == null) { break; } else { if(!line.isEmpty()) { if(line.equals("-")) {//section delimiter crawl = false; } else { if(crawl) { toCrawl.add(line); } else { toknown.add(line); } } } } } in.close(); } catch(IOException e) { log.warn("Error while loading crawl file."); e.printStackTrace(); } } //populate threads int threadcount = 4; try { threadcount = Integer.parseInt(Config.prop.getProperty("crawler.threadcount")); } catch(NumberFormatException e) { log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config."); } threads = new ArrayList<>(threadcount); for(int i = 0; i < threadcount; i++) { CrawlerThread thr = new CrawlerThread( this); thr.setThread(new Thread(thr, "Crawler #" + i)); threads.add(thr); thr.thread.start(); } long lastdoubledelete = System.currentTimeMillis(); db.deleteDouble(); while(crawl) { log.info("to Crawl: " + toCrawl.size() + " known: " + toknown.size() + " Time: " + dateform.format(new Date())); try { //fullfill request while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) { log.info("fullfill request"); currentstate = "fullfill requests"; send(requested.remove(0)); } //kindof idle while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) { startup = 0;//stop startup count currentstate = "idle"; if((System.currentTimeMillis() - lastdoubledelete) / 1000 > 1800) { db.deleteDouble(); lastdoubledelete = System.currentTimeMillis(); } else { Thread.yield(); try { Thread.sleep(100); } catch(InterruptedException ignored) { break; } } // updateDB(); } //nothing left? if(toknown.isEmpty() && toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon log.warn("nothing left to crawl"); crawl = false; } //refil the tocrawl list. if(!toknown.isEmpty()) { //check in db for known videos log.info("Checking the DB"); currentstate = "get new tocrawl"; // listlock.writeLock().lock(); while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) { LinkedList tocheck = new LinkedList<>(); for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) { tocheck.add(toknown.removeFirst()); } toCrawl.addAll(db.checkvideos(tocheck)); } // listlock.writeLock().unlock(); } if(toknown.size() < threadcount * jobspeerthread * 20 && crawl) { currentstate = "restoretemp"; log.info("restoreTemp"); LinkedList rest = db.restoreTemp(); toknown.addAll(rest); } //writing crawlfile log.info("Writing Crawlfile"); currentstate = "writing crawlfile"; // listlock.writeLock().lock(); try { PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile))); for(String t : toCrawl) { p.println(t); } p.println("-"); for(String t : toknown) { p.println(t); } p.close(); } catch (IOException e) { log.error("Error writing crawlfile.", e); } //get reports currentstate = "get report"; log.info("get report"); int count = 0; for (CrawlerThread crawlerThread : threads) { LinkedList[] report = crawlerThread.report(); crawlcount+= report[0].size(); toSave.addAll(report[0]); crawlerThread.crawled.clear(); while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden. LinkedList store = new LinkedList<>(); try { while(!report[1].isEmpty() && store.size() < 50) { store.add(report[1].removeFirst()); count++; } } catch(NoSuchElementException ignored) {//concurrentmodification fuckery log.info("no suchelement bla"); } db.storeTemp(store); } log.info(count + " videos added."); crawlerThread.found.clear(); crawlerThread.thread.interrupt();//free from lock } long runtimes = (System.currentTimeMillis() - start) / 1000; if(runtimes < 0) runtimes = 1; float vidps = (crawlcount / (float) runtimes);//videos per second Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V"); //save to db currentstate = "save to DB"; log.info("save " + toSave.size() + " videos to DB."); while(!toSave.isEmpty()) { LinkedList videoids = new LinkedList<>(); for(int i = 0; i < 50 && !toSave.isEmpty(); i++) { videoids.add(toSave.remove(0)); } if(videoids.size() > 0) { List