package de.mrbesen.youtubecrawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.*;

import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;

import org.apache.log4j.Logger;

public class Crawler implements Runnable {

	private int jobspeerthread = 100; //number of jobs a thread gets per request
	int requestlimit = 5; //number of videos left in a thread's todo queue before it requests new ones
	private int idlecount = 5; //number of idle loops allowed

	private Set<String> toSave = new TreeSet<>(); //all found ytids which still need to be analyzed
	private Set<String> toCrawl = new TreeSet<>(); //all videos to crawl
	//private LinkedList<String> toknown = new LinkedList<>(); //all videos to test whether they are already known; if not, they are moved to toCrawl
	private List<CrawlerThread> threads; //list of all crawler threads
	private List<CrawlerThread> requested = new LinkedList<>(); //threads waiting for new jobs
	private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss");
	private long start;
	private boolean crawl = true;
	private int crawlcount = 0;
	//private int updateOffset = 0;
	private DB db = new DB();
	private YoutubeAPI api = new YoutubeAPI(Config.prop.getProperty("youtube.apikey"));
	private File crawlfile = new File("crawl.txt");
	private Logger log = Logger.getLogger(this.getClass().getName());
	private Profiler profiler = new Profiler();
	private long lastadminreport = 0;
	private int startup = 2; //to keep the beginning cool - how often the program may still enter the startup sleep

	public Crawler() {
		try {
			jobspeerthread = Integer.parseInt(Config.prop.getProperty("crawler.jobspeerthread"));
		} catch(NumberFormatException e) {
			log.warn("could not read the number \"" + Config.prop.getProperty("crawler.jobspeerthread") + "\" from the config file: crawler.jobspeerthread");
			jobspeerthread = 100;
		}
		try {
			requestlimit = Integer.parseInt(Config.prop.getProperty("crawler.requestlimit"));
		} catch(NumberFormatException e) {
			log.warn("could not read the number \"" + Config.prop.getProperty("crawler.requestlimit") + "\" from the config file: crawler.requestlimit");
		}
		try {
			idlecount = Integer.parseInt(Config.prop.getProperty("crawler.idlecount"));
		} catch(NumberFormatException e) {
			log.warn("could not read the number \"" + Config.prop.getProperty("crawler.idlecount") + "\" from the config file: crawler.idlecount");
		}
	}

	public void stop() {
		crawl = false;
	}
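	/*
	 * Work-distribution handshake (see request()/send() below): when a CrawlerThread's
	 * local todo queue drops below requestlimit, it asks this class for more ids.
	 * If toCrawl is non-empty, send() immediately moves up to jobspeerthread ids into
	 * the thread's todo list; otherwise the thread is parked in `requested` until new
	 * ids show up. A sketch of the caller side (hypothetical - CrawlerThread is not
	 * part of this file; only its `todo` and `requested` fields are known from the
	 * accesses below):
	 *
	 *   if (todo.size() < crawler.requestlimit && !requested) {
	 *       requested = true;
	 *       crawler.request(this); //either refills todo right away or parks this thread
	 *   }
	 */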
	public synchronized void addtoCrawl(String videoid) {
		//if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
			//toknown.add(videoid);
		if(!toCrawl.contains(videoid)) { //only store ids that are not already queued
			ArrayList<String> str = new ArrayList<>(1);
			str.add(videoid);
			db.storeTemp(str, false);
		}
	}

	public boolean isCrawling() {
		return crawl;
	}

	public void request(CrawlerThread t) {
		if(!toCrawl.isEmpty()) {
			send(t);
		} else {
			requested.add(t); //no work available - park the thread until new ids arrive
		}
	}

	private void send(CrawlerThread t) {
		synchronized (toCrawl) {
			//hand out up to jobspeerthread ids to the requesting thread
			for (int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
				String s = toCrawl.stream().findAny().get();
				toCrawl.remove(s);
				t.todo.add(s);
			}
		}
		t.requested = false;
	}

	private void loadCrawlFile() {
		log.info("Trying to load the crawl file");
		if(crawlfile.exists()) {
			try (Scanner in = new Scanner(crawlfile)) {
				while(in.hasNextLine()) {
					String line = in.nextLine();
					if(line.isEmpty() || line.equals("-")) { //skip empty lines and section delimiters
						continue;
					}
					toCrawl.add(line);
				}
			} catch(IOException e) {
				log.warn("Error while loading crawl file.");
				e.printStackTrace();
			}
		}
	}

	private int createThreads() {
		//populate threads
		int threadcount = 4;
		try {
			threadcount = Integer.parseInt(Config.prop.getProperty("crawler.threadcount"));
		} catch(NumberFormatException e) {
			log.warn("could not read the number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the config file: crawler.threadcount");
		}
		threads = new ArrayList<>(threadcount);
		CrawlerThread.initLib(threadcount);
		for(int i = 0; i < threadcount; i++) {
			CrawlerThread thr = new CrawlerThread(this, i);
			thr.setThread(new Thread(thr, "Crawler #" + i));
			threads.add(thr);
			thr.thread.start();
		}
		return threadcount;
	}

	private void getreports() {
		log.info("get report");
		for (CrawlerThread crawlerThread : threads) {
			String threadname = crawlerThread.thread.getName();
			profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#') + 1));
			List<String>[] report = crawlerThread.report();
			crawlcount += report[0].size();
			toSave.addAll(report[0]);
			crawlerThread.crawled.clear();
			int count = 0;
			while(report[1].size() > 1) { //2 videos may get deleted without ever being seen
				ArrayList<String> store = null;
				try {
					if(report[1].size() <= 50) {
						store = new ArrayList<>(report[1]);
						count += report[1].size();
						report[1].clear();
					} else {
						store = new ArrayList<>(report[1].subList(0, 50));
						report[1].removeAll(store);
						count += 50;
					}
				} catch(NoSuchElementException ignored) { //concurrent-modification fuckery
					log.info("NoSuchElementException while batching a report - skipping");
				}
				if(store != null) { //may be null if the batch failed
					db.storeTemp(store, false);
				}
			}
			log.info(count + " videos added from " + threadname);
			profiler.endSection();
		}
	}

	private void savetodb() {
		log.info("save " + toSave.size() + " videos to DB.");
		synchronized (toSave) {
			while (!toSave.isEmpty()) {
				//collect a batch of at most 50 ids (YouTube's API accepts up to 50 video ids per request)
				Set<String> videoids = new TreeSet<>();
				for (int i = 0; i < 50 && !toSave.isEmpty(); i++) {
					String save = toSave.stream().findAny().get();
					toSave.remove(save);
					videoids.add(save);
				}
				if (!videoids.isEmpty()) {
					profiler.startSection("getinfo");
					ArrayList