package de.mrbesen.youtubecrawler; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.LinkedList; import java.util.List; import java.util.NoSuchElementException; import java.util.Scanner; import org.apache.log4j.Logger; public class Crawler implements Runnable { private int jobspeerthread = 100; //the amount of jobs a thread get peer request int requestlimit = 5;//amount of videos to be left in the todo que of a thread until it requests new videos private int idlecount = 5;//amount of idle loops allowed private LinkedList toSave = new LinkedList<>();//all found ytids, witch need to be analysed private LinkedList toCrawl = new LinkedList<>();//all videos tu crawl //private LinkedList toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle private List threads;//list of all threads private List requested = new LinkedList<>(); private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss"); private long start; private boolean crawl = true; private int crawlcount = 0; //private int updateOffset = 0; private DB db = new DB(); private YoutubeAPI api = new YoutubeAPI(); private File crawlfile = new File("crawl.txt"); private Logger log = Logger.getLogger(this.getClass().getName()); private Profiler profiler = new Profiler(); private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep public Crawler() { try { jobspeerthread = Integer.parseInt(Config.prop.getProperty("crawler.jobspeerthread")); } catch(NumberFormatException e) { log.warn("could not read the number \"" + Config.prop.getProperty("crawler.jobspeerthread") + "\" from the config file. maxvideo"); jobspeerthread = 100; } try { requestlimit = Integer.parseInt(Config.prop.getProperty("crawler.requestlimit")); } catch(NumberFormatException e) { log.warn("could not read the number \"" + Config.prop.getProperty("crawler.requestlimit") + "\" from the config file. crawler.requestlimit"); } try { idlecount = Integer.parseInt(Config.prop.getProperty("crawler.idlecount")); } catch(NumberFormatException e) { log.warn("could not read the number \"" + Config.prop.getProperty("crawler.idlecount") + "\" from the config file. crawler.idlecount"); } } public void stop() { crawl = false; } public synchronized void addtoCrawl(String videoid) { //if(! (toCrawl.contains(videoid) || toknown.contains(videoid))) //toknown.add(videoid); if(toCrawl.contains(videoid)) { LinkedList str = new LinkedList(); str.add(videoid); db.storeTemp(str); } } public boolean isCrawling() { return crawl; } public void request(CrawlerThread t) { if(!toCrawl.isEmpty()) { send(t); } else { requested.add(t); } } private void send(CrawlerThread t) { // listlock.writeLock().lock(); for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) { t.todo.add(toCrawl.removeFirst()); } // listlock.writeLock().unlock(); t.requested = false; } @Override public void run() { profiler.profilingEnabled = true; profiler.clearProfiling(); profiler.startSection("root"); profiler.startSection("startup"); profiler.startSection("loadingcrawlfile"); start = System.currentTimeMillis(); log.info("Try to load crawlfile"); if(crawlfile.exists()) { try { Scanner in = new Scanner(crawlfile); //boolean crawl = true;//section of file while(in.hasNextLine()) { String line = in.nextLine(); if(line == null) { break; } else { if(!line.isEmpty()) { if(line.equals("-")) {//section delimiter continue; } else { //if(crawl) { toCrawl.add(line); /*} else { toknown.add(line); }*/ } } } } in.close(); } catch(IOException e) { log.warn("Error while loading crawl file."); e.printStackTrace(); } } profiler.endStartSection("populateThreads");//loading crawlfile closed //populate threads int threadcount = 4; try { threadcount = Integer.parseInt(Config.prop.getProperty("crawler.threadcount")); } catch(NumberFormatException e) { log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config."); } threads = new ArrayList<>(threadcount); for(int i = 0; i < threadcount; i++) { CrawlerThread thr = new CrawlerThread( this); thr.setThread(new Thread(thr, "Crawler #" + i)); threads.add(thr); thr.thread.start(); } profiler.endStartSection("deleteDouble");//populate threads long lastdoubledelete = System.currentTimeMillis(); //db.deleteDouble(); profiler.endSection();//deletedouble profiler.endSection();//startup boolean savedall = false;//ein 2. durch lauf, um wirklich alles zu speichern while(crawl || savedall) { profiler.startSection("main"); log.info("to Crawl: " + toCrawl.size() + /*" known: " + toknown.size() +*/ " Time: " + dateform.format(new Date())); if(!crawl) savedall = true; try { //fullfill request profiler.startSection("fullfill request"); while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) { log.info("fullfill request"); send(requested.remove(0)); } //kindof idle { int count = 0;//donst stay to long in idle! profiler.endStartSection("idle"); while(toCrawl.size() > (jobspeerthread * threads.size() * 2) && crawl && requested.isEmpty() && count < idlecount) { count ++; startup = 0;//stop startup count if((System.currentTimeMillis() - lastdoubledelete) / 1000 > 1800) { //db.deleteDouble(); lastdoubledelete = System.currentTimeMillis(); } else { Thread.yield(); try { Thread.sleep(100); } catch(InterruptedException ignored) { break; } } // updateDB(); } } //nothing left? if(/*toknown.isEmpty() && */toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon log.warn("nothing left to crawl"); } //refil the tocrawl list. /*if(!toknown.isEmpty()) { //check in db for known videos log.info("Checking the DB"); currentstate = "get new tocrawl"; // listlock.writeLock().lock(); while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) { LinkedList tocheck = new LinkedList<>(); for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) { tocheck.add(toknown.removeFirst()); } toCrawl.addAll(db.checkvideos(tocheck)); } // listlock.writeLock().unlock(); } while(toknown.size() < threadcount * jobspeerthread * 20 && crawl) { currentstate = "restoretemp"; log.info("restoreTemp"); LinkedList rest = db.restoreTemp(); toknown.addAll(rest); }*/ { profiler.endStartSection("loadCrawl"); boolean joined = true; while(toCrawl.size() < ( threadcount * jobspeerthread * 3) && crawl) { if(joined) { joined = false; log.info("loadCrawl"); } LinkedList rest = db.restoreTemp(); toCrawl.addAll(rest); } } //writing crawlfile profiler.endStartSection("writingcrawlfile"); log.info("Writing Crawlfile"); try { PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile))); for(String t : toCrawl) { p.println(t); } p.close(); } catch (IOException e) { log.error("Error writing crawlfile.", e); } //get reports profiler.endStartSection("getreport"); log.info("get report"); for (CrawlerThread crawlerThread : threads) { String threadname = crawlerThread.thread.getName(); profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1)); LinkedList[] report = crawlerThread.report(); crawlcount+= report[0].size(); toSave.addAll(report[0]); crawlerThread.crawled.clear(); int count = 0; while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden. LinkedList store = null; try { if(report[1].size() <= 50) { store = report[1]; count += report[1].size(); report[1].clear(); } else { store = new LinkedList<>(); while(!report[1].isEmpty() && store.size() < 50) { store.add(report[1].removeFirst()); count++; } } } catch(NoSuchElementException ignored) {//concurrentmodification fuckery log.info("no suchelement bla"); } db.storeTemp(store); } log.info(count + " videos added from " + threadname); profiler.endSection(); } profiler.endStartSection("debug"); long runtimes = (System.currentTimeMillis() - start) / 1000; if(runtimes < 0) runtimes = 1; float vidps = (crawlcount / (float) runtimes);//videos per second Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V"); //save to db profiler.endStartSection("save2DB"); log.info("save " + toSave.size() + " videos to DB."); while(!toSave.isEmpty()) { LinkedList videoids = new LinkedList<>(); for(int i = 0; i < 50 && !toSave.isEmpty(); i++) { videoids.add(toSave.remove(0)); } if(videoids.size() > 0) { profiler.startSection("getinfo"); ArrayList