diff --git a/src/de/mrbesen/youtubecrawler/Crawler.java b/src/de/mrbesen/youtubecrawler/Crawler.java index 3e74ab6..3914847 100644 --- a/src/de/mrbesen/youtubecrawler/Crawler.java +++ b/src/de/mrbesen/youtubecrawler/Crawler.java @@ -22,12 +22,11 @@ public class Crawler implements Runnable { private LinkedList toSave = new LinkedList<>();//all found ytids, witch need to be analysed private LinkedList toCrawl = new LinkedList<>();//all videos tu crawl - private LinkedList toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle + //private LinkedList toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle private List threads;//list of all threads private List requested = new LinkedList<>(); private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss"); - private String currentstate = "undefined"; private long start; private boolean crawl = true; @@ -38,17 +37,11 @@ public class Crawler implements Runnable { private YoutubeAPI api = new YoutubeAPI(); private File crawlfile = new File("crawl.txt"); private Logger log = Logger.getLogger(this.getClass().getName()); + private Profiler profiler = new Profiler(); - private int maxvideostotest = 100; - private int startup = 2;//to keep the beginning cool + private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep public Crawler() { - try { - maxvideostotest = Integer.parseInt(Config.prop.getProperty("crawler.maxvideos")); - } catch(NumberFormatException e) { - log.warn("could not read the number \"" + Config.prop.getProperty("crawler.maxvideos") + "\" from the config file. maxvideo"); - maxvideostotest = 100; - } try { jobspeerthread = Integer.parseInt(Config.prop.getProperty("crawler.jobspeerthread")); } catch(NumberFormatException e) { @@ -62,8 +55,13 @@ public class Crawler implements Runnable { } public synchronized void addtoCrawl(String videoid) { - if(! (toCrawl.contains(videoid) || toknown.contains(videoid))) - toknown.add(videoid); + //if(! (toCrawl.contains(videoid) || toknown.contains(videoid))) + //toknown.add(videoid); + if(toCrawl.contains(videoid)) { + LinkedList str = new LinkedList(); + str.add(videoid); + db.storeTemp(str); + } } public boolean isCrawling() { @@ -89,13 +87,17 @@ public class Crawler implements Runnable { @Override public void run() { - currentstate = "loading crawlfile"; + profiler.profilingEnabled = true; + profiler.clearProfiling(); + profiler.startSection("root"); + profiler.startSection("startup"); + profiler.startSection("loadingcrawlfile"); start = System.currentTimeMillis(); log.info("Try to load crawlfile"); if(crawlfile.exists()) { try { Scanner in = new Scanner(crawlfile); - boolean crawl = true;//section of file + //boolean crawl = true;//section of file while(in.hasNextLine()) { String line = in.nextLine(); if(line == null) { @@ -103,13 +105,13 @@ public class Crawler implements Runnable { } else { if(!line.isEmpty()) { if(line.equals("-")) {//section delimiter - crawl = false; + continue; } else { - if(crawl) { - toCrawl.add(line); - } else { + //if(crawl) { + toCrawl.add(line); + /*} else { toknown.add(line); - } + }*/ } } } @@ -120,8 +122,7 @@ public class Crawler implements Runnable { e.printStackTrace(); } } - - currentstate = "populate threads"; + profiler.endStartSection("populateThreads"); //populate threads int threadcount = 4; try { @@ -137,24 +138,28 @@ public class Crawler implements Runnable { threads.add(thr); thr.thread.start(); } - currentstate = "delete Double"; + profiler.endStartSection("deleteDouble"); long lastdoubledelete = System.currentTimeMillis(); - db.deleteDouble(); - currentstate = "crawl"; - while(crawl) { - log.info("to Crawl: " + toCrawl.size() + " known: " + toknown.size() + " Time: " + dateform.format(new Date())); + //db.deleteDouble(); + profiler.endSection();//startup + profiler.endStartSection("main"); + boolean savedall = false;//ein 2. durch lauf, um wirklich alles zu speichern + while(crawl || savedall) { + log.info("to Crawl: " + toCrawl.size() + /*" known: " + toknown.size() +*/ " Time: " + dateform.format(new Date())); + if(!crawl) + savedall = true; try { //fullfill request + profiler.startSection("fullfill request"); while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) { log.info("fullfill request"); - currentstate = "fullfill requests"; send(requested.remove(0)); } //kindof idle + profiler.endStartSection("idle"); while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) { startup = 0;//stop startup count - currentstate = "idle"; if((System.currentTimeMillis() - lastdoubledelete) / 1000 > 1800) { db.deleteDouble(); lastdoubledelete = System.currentTimeMillis(); @@ -169,13 +174,12 @@ public class Crawler implements Runnable { // updateDB(); } //nothing left? - if(toknown.isEmpty() && toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon + if(/*toknown.isEmpty() && */toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon log.warn("nothing left to crawl"); - crawl = false; } //refil the tocrawl list. - if(!toknown.isEmpty()) { + /*if(!toknown.isEmpty()) { //check in db for known videos log.info("Checking the DB"); currentstate = "get new tocrawl"; @@ -188,50 +192,69 @@ public class Crawler implements Runnable { toCrawl.addAll(db.checkvideos(tocheck)); } // listlock.writeLock().unlock(); - } - if(toknown.size() < threadcount * jobspeerthread * 20 && crawl) { + } + while(toknown.size() < threadcount * jobspeerthread * 20 && crawl) { currentstate = "restoretemp"; log.info("restoreTemp"); LinkedList rest = db.restoreTemp(); toknown.addAll(rest); + }*/ + { + profiler.endStartSection("loadCrawl"); + boolean joined = true; + while(toCrawl.size() < ( threadcount * jobspeerthread * 3) && crawl) { + if(joined) { + joined = false; + log.info("loadCrawl"); + } + LinkedList rest = db.restoreTemp(); + toCrawl.addAll(rest); + } } //writing crawlfile + profiler.endStartSection("writingcrawlfile"); log.info("Writing Crawlfile"); - currentstate = "writing crawlfile"; - // listlock.writeLock().lock(); try { PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile))); for(String t : toCrawl) { p.println(t); } + /* p.println("-"); for(String t : toknown) { p.println(t); - } + }*/ p.close(); } catch (IOException e) { log.error("Error writing crawlfile.", e); } //get reports - currentstate = "get report"; + profiler.endStartSection("getreport"); log.info("get report"); - int count = 0; for (CrawlerThread crawlerThread : threads) { - currentstate = "get report: " + crawlerThread.thread.getName(); + String threadname = crawlerThread.thread.getName(); + profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1)); LinkedList[] report = crawlerThread.report(); crawlcount+= report[0].size(); toSave.addAll(report[0]); crawlerThread.crawled.clear(); - - while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden. - LinkedList store = new LinkedList<>(); - try { - while(!report[1].isEmpty() && store.size() < 50) { - store.add(report[1].removeFirst()); - count++; + int count = 0; + while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden. + LinkedList store = null; + try { + if(report[1].size() <= 50) { + store = report[1]; + count += report[1].size(); + report[1].clear(); + } else { + store = new LinkedList<>(); + while(!report[1].isEmpty() && store.size() < 50) { + store.add(report[1].removeFirst()); + count++; + } } } catch(NoSuchElementException ignored) {//concurrentmodification fuckery log.info("no suchelement bla"); @@ -239,10 +262,10 @@ public class Crawler implements Runnable { db.storeTemp(store); } log.info(count + " videos added."); - crawlerThread.found.clear(); - crawlerThread.thread.interrupt();//free from lock + profiler.endSection(); } + profiler.endStartSection("debug"); long runtimes = (System.currentTimeMillis() - start) / 1000; if(runtimes < 0) runtimes = 1; @@ -250,7 +273,7 @@ public class Crawler implements Runnable { Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V"); //save to db - currentstate = "save to DB"; + profiler.endStartSection("save2DB"); log.info("save " + toSave.size() + " videos to DB."); while(!toSave.isEmpty()) { LinkedList videoids = new LinkedList<>(); @@ -258,19 +281,26 @@ public class Crawler implements Runnable { videoids.add(toSave.remove(0)); } if(videoids.size() > 0) { + profiler.startSection("getinfo"); List