package de.mrbesen.youtubecrawler; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.LinkedList; import java.util.List; import java.util.Scanner; import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.log4j.Logger; public class Crawler implements Runnable { private static int jobspeerthread = 100; //the amount of jobs a thread get peer request private ReentrantReadWriteLock listlock = new ReentrantReadWriteLock(true);//only writelock is used, this lock should lock the list toCrawl and toknown because they may be accsessed by other threads private LinkedList toSave = new LinkedList<>();//all found ytids, witch need to be analysed private LinkedList toCrawl = new LinkedList<>();//all videos tu crawl private LinkedList toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle private List threads;//list of all threads private List requested = new LinkedList<>(); private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss"); private long start; private boolean crawl = true; private int crawlcount = 0; private DB db = new DB(); private YoutubeAPI api = new YoutubeAPI(); private File crawlfile = new File("crawl.txt"); private Logger log = Logger.getLogger(Crawler.class.getName()); private int maxvideostotest; private int startup = 10;//to keep the beginning cool public Crawler() { try { maxvideostotest = Integer.parseInt(Config.prop.getProperty("crawler.maxvideos")); } catch(NumberFormatException e) { log.warn("could not read the number \"" + Config.prop.getProperty("") + "\" from the config file. maxvideo"); maxvideostotest = 100; } } public void stop() { crawl = false; } public synchronized void addtoCrawl(String videoid) { listlock.writeLock().lock(); if(! (toCrawl.contains(videoid) || toknown.contains(videoid))) toknown.add(videoid); listlock.writeLock().unlock(); } public boolean isCrawling() { return crawl; } public void request(CrawlerThread t) { if(!toCrawl.isEmpty()) { send(t); } else { requested.add(t); } } private void send(CrawlerThread t) { listlock.writeLock().lock(); for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) { t.todo.add(toCrawl.removeFirst()); } listlock.writeLock().unlock(); t.requested = false; } @Override public void run() { start = System.currentTimeMillis(); log.info("Try to load crawlfile"); if(crawlfile.exists()) { listlock.writeLock().lock(); try { Scanner in = new Scanner(crawlfile); boolean crawl = true;//section of file while(in.hasNextLine()) { String line = in.nextLine(); if(line == null) { break; } else { if(!line.isEmpty()) { if(line.equals("-")) {//section delimiter crawl = false; } else { if(crawl) { toCrawl.add(line); } else { toknown.add(line); } } } } } in.close(); } catch(IOException e) { log.warn("Error while loading crawl file."); e.printStackTrace(); } finally { listlock.writeLock().unlock(); } } //populate threads int threadcount = 4; try { threadcount = Integer.parseInt(Config.prop.getProperty("crawler.threadcount")); } catch(NumberFormatException e) { log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config."); } threads = new ArrayList<>(threadcount); for(int i = 0; i < threadcount; i++) { CrawlerThread thr = new CrawlerThread( this); new Thread(thr, "Crawler #" + i).start(); threads.add(thr); } int updateOffset = 0; while(crawl) { log.info("to Crawl: " + toCrawl.size() + " known: " + toknown.size() + " Time: " + dateform.format(new Date())); //fullfill request while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) { send(requested.remove(0)); } //kindof idle while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) { startup = 0;//stop startup count Thread.yield(); try { Thread.sleep(5000); } catch(InterruptedException ignored) { break; } log.info("updating DB Offset= " + updateOffset); LinkedList vids = db.getUncompleted(50, updateOffset); LinkedList