From c7a6a28d49785e4e7526561e66b8c77b3f62ca6d Mon Sep 17 00:00:00 2001 From: mrbesen Date: Tue, 17 Jul 2018 13:42:06 +0200 Subject: [PATCH] lock for the lists, save toknown list --- src/de/mrbesen/youtubecrawler/Crawler.java | 46 ++++++++++++++++--- src/de/mrbesen/youtubecrawler/DB.java | 2 +- src/de/mrbesen/youtubecrawler/YoutubeAPI.java | 1 + 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/de/mrbesen/youtubecrawler/Crawler.java b/src/de/mrbesen/youtubecrawler/Crawler.java index 278ff57..e02a28f 100644 --- a/src/de/mrbesen/youtubecrawler/Crawler.java +++ b/src/de/mrbesen/youtubecrawler/Crawler.java @@ -5,10 +5,14 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; +import java.text.DateFormat; +import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Date; import java.util.LinkedList; import java.util.List; import java.util.Scanner; +import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.log4j.Logger; @@ -16,11 +20,13 @@ public class Crawler implements Runnable { private static int jobspeerthread = 100; //the amount of jobs a thread get peer request + private ReentrantReadWriteLock listlock = new ReentrantReadWriteLock(true);//only writelock is used, this lock should lock the lists toCrawl and toknown because they may be accessed by other threads private LinkedList toSave = new LinkedList<>();//all found ytids, witch need to be analysed private LinkedList toCrawl = new LinkedList<>();//all videos tu crawl private LinkedList toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle private List threads;//list of all threads private List requested = new LinkedList<>(); + private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss"); private boolean crawl = true; private int crawlcount = 0; @@ -46,9 +52,11 @@ public class Crawler implements Runnable { crawl = false; } 
- public synchronized void addtoCrawl(String videoid) {//TODO some kind of lock! + public synchronized void addtoCrawl(String videoid) { + listlock.writeLock().lock(); if(! (toCrawl.contains(videoid) || toknown.contains(videoid))) toknown.add(videoid); + listlock.writeLock().unlock(); } public boolean isCrawling() { @@ -64,9 +72,11 @@ public class Crawler implements Runnable { } private void send(CrawlerThread t) { + listlock.writeLock().lock(); for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) { t.todo.add(toCrawl.removeFirst()); } + listlock.writeLock().unlock(); t.requested = false; } @@ -75,15 +85,25 @@ public class Crawler implements Runnable { long start = System.currentTimeMillis(); log.info("Try to load crawlfile"); if(crawlfile.exists()) { + listlock.writeLock().lock(); try { Scanner in = new Scanner(crawlfile); + boolean crawl = true;//section of file while(in.hasNextLine()) { String line = in.nextLine(); if(line == null) { break; } else { if(!line.isEmpty()) { - addtoCrawl(line); + if(line.equals("-")) {//section delimiter + crawl = false; + } else { + if(crawl) { + toCrawl.add(line); + } else { + toknown.add(line); + } + } } } } @@ -91,6 +111,8 @@ public class Crawler implements Runnable { } catch(IOException e) { log.warn("Error while loading crawl file."); e.printStackTrace(); + } finally { + listlock.writeLock().unlock(); } } @@ -110,7 +132,7 @@ public class Crawler implements Runnable { } while(crawl) { - log.info("to Crawl: " + toCrawl.size()); + log.info("to Crawl: " + toCrawl.size() + " known: " + toknown.size() + " Time: " + dateform.format(new Date())); //fullfill request while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) { @@ -119,6 +141,7 @@ public class Crawler implements Runnable { //kindof idle while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) { + startup = 0;//stop startup count Thread.yield(); try { Thread.sleep(100); @@ -134,6 +157,7 @@ public class Crawler implements Runnable 
{ if(!toknown.isEmpty()) { //check in db for known videos log.info("Checking the DB"); + listlock.writeLock().lock(); while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) { LinkedList tocheck = new LinkedList<>(); for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) { @@ -141,18 +165,26 @@ public class Crawler implements Runnable { } toCrawl.addAll(db.checkvideos(tocheck)); } + listlock.writeLock().unlock(); } //writing crawlfile log.info("Writing Crawlfile"); + listlock.writeLock().lock(); try { PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile))); for(String t : toCrawl) { p.println(t); } + p.println("-"); + for(String t : toknown) { + p.println(t); + } p.close(); } catch (IOException e) { log.error("Error writing crawlfile.", e); + } finally { + listlock.writeLock().unlock(); } //get reports @@ -164,6 +196,7 @@ public class Crawler implements Runnable { } //save to db + log.info("save " + toSave.size() + " videos to DB."); while(!toSave.isEmpty()) { LinkedList videoids = new LinkedList<>(); for(int i = 0; i < 50 && !toSave.isEmpty(); i++) { @@ -175,6 +208,7 @@ public class Crawler implements Runnable { } } + //at the beginning there is maybe just one video to crawl, so keep it calm. if(startup > 0) { startup --; try { @@ -186,8 +220,8 @@ public class Crawler implements Runnable { //end long runtimes = (System.currentTimeMillis() - start) / 1000; int runtimem = (int) (runtimes / 60); - float vidps = (crawlcount / (float) runtimes); - log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " )"); + float vidps = (crawlcount / (float) runtimes);//videos per second + log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. 
( " + vidps + " v/s )"); } public static Video getVideo() { @@ -201,6 +235,4 @@ public class Crawler implements Runnable { byte categorie; long created; } - - } diff --git a/src/de/mrbesen/youtubecrawler/DB.java b/src/de/mrbesen/youtubecrawler/DB.java index e183889..c211a72 100644 --- a/src/de/mrbesen/youtubecrawler/DB.java +++ b/src/de/mrbesen/youtubecrawler/DB.java @@ -86,7 +86,7 @@ public class DB { * @param input */ public void addVideos(List