From 4917369b34e5cf6d57d1a5cd4bcd5a3eaa6fb6cf Mon Sep 17 00:00:00 2001 From: MrBesen Date: Mon, 23 Jul 2018 12:27:51 +0200 Subject: [PATCH] non-lock multithreading, cache DB, TelegramAdmins --- src/de/mrbesen/youtubecrawler/Crawler.java | 141 ++++++++++------ .../mrbesen/youtubecrawler/CrawlerThread.java | 11 +- src/de/mrbesen/youtubecrawler/DB.java | 42 ++++- src/de/mrbesen/youtubecrawler/Main.java | 157 +++++++++++++----- src/de/mrbesen/youtubecrawler/YoutubeAPI.java | 2 +- 5 files changed, 247 insertions(+), 106 deletions(-) diff --git a/src/de/mrbesen/youtubecrawler/Crawler.java b/src/de/mrbesen/youtubecrawler/Crawler.java index b28698b..622a1a5 100644 --- a/src/de/mrbesen/youtubecrawler/Crawler.java +++ b/src/de/mrbesen/youtubecrawler/Crawler.java @@ -11,42 +11,50 @@ import java.util.ArrayList; import java.util.Date; import java.util.LinkedList; import java.util.List; +import java.util.NoSuchElementException; import java.util.Scanner; -import java.util.concurrent.locks.ReentrantReadWriteLock; import org.apache.log4j.Logger; public class Crawler implements Runnable { - private static int jobspeerthread = 100; //the amount of jobs a thread get peer request + private int jobspeerthread = 100; //the amount of jobs a thread get peer request - private ReentrantReadWriteLock listlock = new ReentrantReadWriteLock(true);//only writelock is used, this lock should lock the list toCrawl and toknown because they may be accsessed by other threads private LinkedList toSave = new LinkedList<>();//all found ytids, witch need to be analysed private LinkedList toCrawl = new LinkedList<>();//all videos tu crawl private LinkedList toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle private List threads;//list of all threads private List requested = new LinkedList<>(); + private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss"); + private String currentstate = "undefined"; private long start; private boolean crawl = true; private int crawlcount = 0; + private int updateOffset = 0; private DB db = new DB(); private YoutubeAPI api = new YoutubeAPI(); private File crawlfile = new File("crawl.txt"); - private Logger log = Logger.getLogger(Crawler.class.getName()); + private Logger log = Logger.getLogger(this.getClass().getName()); - private int maxvideostotest; - private int startup = 10;//to keep the beginning cool + private int maxvideostotest = 100; + private int startup = 2;//to keep the beginning cool public Crawler() { try { maxvideostotest = Integer.parseInt(Config.prop.getProperty("crawler.maxvideos")); } catch(NumberFormatException e) { - log.warn("could not read the number \"" + Config.prop.getProperty("") + "\" from the config file. maxvideo"); + log.warn("could not read the number \"" + Config.prop.getProperty("crawler.maxvideos") + "\" from the config file. maxvideo"); maxvideostotest = 100; } + try { + jobspeerthread = Integer.parseInt(Config.prop.getProperty("crawler.jobspeerthread")); + } catch(NumberFormatException e) { + log.warn("could not read the number \"" + Config.prop.getProperty("crawler.jobspeerthread") + "\" from the config file. maxvideo"); + jobspeerthread = 100; + } } public void stop() { @@ -54,10 +62,8 @@ public class Crawler implements Runnable { } public synchronized void addtoCrawl(String videoid) { - listlock.writeLock().lock(); if(! (toCrawl.contains(videoid) || toknown.contains(videoid))) toknown.add(videoid); - listlock.writeLock().unlock(); } public boolean isCrawling() { @@ -73,20 +79,20 @@ public class Crawler implements Runnable { } private void send(CrawlerThread t) { - listlock.writeLock().lock(); + // listlock.writeLock().lock(); for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) { t.todo.add(toCrawl.removeFirst()); } - listlock.writeLock().unlock(); + // listlock.writeLock().unlock(); t.requested = false; } @Override public void run() { + currentstate = "loading crawlfile"; start = System.currentTimeMillis(); log.info("Try to load crawlfile"); if(crawlfile.exists()) { - listlock.writeLock().lock(); try { Scanner in = new Scanner(crawlfile); boolean crawl = true;//section of file @@ -112,8 +118,6 @@ public class Crawler implements Runnable { } catch(IOException e) { log.warn("Error while loading crawl file."); e.printStackTrace(); - } finally { - listlock.writeLock().unlock(); } } @@ -131,40 +135,27 @@ public class Crawler implements Runnable { new Thread(thr, "Crawler #" + i).start(); threads.add(thr); } - int updateOffset = 0; while(crawl) { log.info("to Crawl: " + toCrawl.size() + " known: " + toknown.size() + " Time: " + dateform.format(new Date())); //fullfill request while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) { + log.info("fullfill request"); + currentstate = "fullfill requests"; send(requested.remove(0)); } //kindof idle while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) { startup = 0;//stop startup count + currentstate = "idle"; Thread.yield(); try { - Thread.sleep(5000); + Thread.sleep(100); } catch(InterruptedException ignored) { break; } - log.info("updating DB Offset= " + updateOffset); - LinkedList vids = db.getUncompleted(50, updateOffset); - LinkedList