lock for the lists, save toknown list

This commit is contained in:
mrbesen 2018-07-17 13:42:06 +02:00
parent 04b098d9ff
commit c7a6a28d49
3 changed files with 41 additions and 8 deletions

View File

@ -5,10 +5,14 @@ import java.io.File;
import java.io.FileWriter; import java.io.FileWriter;
import java.io.IOException; import java.io.IOException;
import java.io.PrintWriter; import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Scanner; import java.util.Scanner;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@ -16,11 +20,13 @@ public class Crawler implements Runnable {
private static int jobspeerthread = 100; //the amount of jobs a thread get peer request private static int jobspeerthread = 100; //the amount of jobs a thread get peer request
private ReentrantReadWriteLock listlock = new ReentrantReadWriteLock(true);//only writelock is used, this lock should lock the list toCrawl and toknown because they may be accsessed by other threads
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, witch need to be analysed private LinkedList<String> toSave = new LinkedList<>();//all found ytids, witch need to be analysed
private LinkedList<String> toCrawl = new LinkedList<>();//all videos tu crawl private LinkedList<String> toCrawl = new LinkedList<>();//all videos tu crawl
private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
private List<CrawlerThread> threads;//list of all threads private List<CrawlerThread> threads;//list of all threads
private List<CrawlerThread> requested = new LinkedList<>(); private List<CrawlerThread> requested = new LinkedList<>();
private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss");
private boolean crawl = true; private boolean crawl = true;
private int crawlcount = 0; private int crawlcount = 0;
@ -46,9 +52,11 @@ public class Crawler implements Runnable {
crawl = false; crawl = false;
} }
public synchronized void addtoCrawl(String videoid) {//TODO some kind of lock! public synchronized void addtoCrawl(String videoid) {
listlock.writeLock().lock();
if(! (toCrawl.contains(videoid) || toknown.contains(videoid))) if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
toknown.add(videoid); toknown.add(videoid);
listlock.writeLock().unlock();
} }
public boolean isCrawling() { public boolean isCrawling() {
@ -64,9 +72,11 @@ public class Crawler implements Runnable {
} }
private void send(CrawlerThread t) { private void send(CrawlerThread t) {
listlock.writeLock().lock();
for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) { for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
t.todo.add(toCrawl.removeFirst()); t.todo.add(toCrawl.removeFirst());
} }
listlock.writeLock().unlock();
t.requested = false; t.requested = false;
} }
@ -75,15 +85,25 @@ public class Crawler implements Runnable {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
log.info("Try to load crawlfile"); log.info("Try to load crawlfile");
if(crawlfile.exists()) { if(crawlfile.exists()) {
listlock.writeLock().lock();
try { try {
Scanner in = new Scanner(crawlfile); Scanner in = new Scanner(crawlfile);
boolean crawl = true;//section of file
while(in.hasNextLine()) { while(in.hasNextLine()) {
String line = in.nextLine(); String line = in.nextLine();
if(line == null) { if(line == null) {
break; break;
} else { } else {
if(!line.isEmpty()) { if(!line.isEmpty()) {
addtoCrawl(line); if(line.equals("-")) {//section delimiter
crawl = false;
} else {
if(crawl) {
toCrawl.add(line);
} else {
toknown.add(line);
}
}
} }
} }
} }
@ -91,6 +111,8 @@ public class Crawler implements Runnable {
} catch(IOException e) { } catch(IOException e) {
log.warn("Error while loading crawl file."); log.warn("Error while loading crawl file.");
e.printStackTrace(); e.printStackTrace();
} finally {
listlock.writeLock().unlock();
} }
} }
@ -110,7 +132,7 @@ public class Crawler implements Runnable {
} }
while(crawl) { while(crawl) {
log.info("to Crawl: " + toCrawl.size()); log.info("to Crawl: " + toCrawl.size() + " known: " + toknown.size() + " Time: " + dateform.format(new Date()));
//fullfill request //fullfill request
while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) { while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) {
@ -119,6 +141,7 @@ public class Crawler implements Runnable {
//kindof idle //kindof idle
while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) { while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) {
startup = 0;//stop startup count
Thread.yield(); Thread.yield();
try { try {
Thread.sleep(100); Thread.sleep(100);
@ -134,6 +157,7 @@ public class Crawler implements Runnable {
if(!toknown.isEmpty()) { if(!toknown.isEmpty()) {
//check in db for known videos //check in db for known videos
log.info("Checking the DB"); log.info("Checking the DB");
listlock.writeLock().lock();
while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) { while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) {
LinkedList<String> tocheck = new LinkedList<>(); LinkedList<String> tocheck = new LinkedList<>();
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) { for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
@ -141,18 +165,26 @@ public class Crawler implements Runnable {
} }
toCrawl.addAll(db.checkvideos(tocheck)); toCrawl.addAll(db.checkvideos(tocheck));
} }
listlock.writeLock().unlock();
} }
//writing crawlfile //writing crawlfile
log.info("Writing Crawlfile"); log.info("Writing Crawlfile");
listlock.writeLock().lock();
try { try {
PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile))); PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
for(String t : toCrawl) { for(String t : toCrawl) {
p.println(t); p.println(t);
} }
p.println("-");
for(String t : toknown) {
p.println(t);
}
p.close(); p.close();
} catch (IOException e) { } catch (IOException e) {
log.error("Error writing crawlfile.", e); log.error("Error writing crawlfile.", e);
} finally {
listlock.writeLock().unlock();
} }
//get reports //get reports
@ -164,6 +196,7 @@ public class Crawler implements Runnable {
} }
//save to db //save to db
log.info("save " + toSave.size() + " videos to DB.");
while(!toSave.isEmpty()) { while(!toSave.isEmpty()) {
LinkedList<String> videoids = new LinkedList<>(); LinkedList<String> videoids = new LinkedList<>();
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) { for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
@ -175,6 +208,7 @@ public class Crawler implements Runnable {
} }
} }
//at the beginning there is maybe just one video to crawl, so keep it calm.
if(startup > 0) { if(startup > 0) {
startup --; startup --;
try { try {
@ -186,8 +220,8 @@ public class Crawler implements Runnable {
//end //end
long runtimes = (System.currentTimeMillis() - start) / 1000; long runtimes = (System.currentTimeMillis() - start) / 1000;
int runtimem = (int) (runtimes / 60); int runtimem = (int) (runtimes / 60);
float vidps = (crawlcount / (float) runtimes); float vidps = (crawlcount / (float) runtimes);//videos per second
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " )"); log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
} }
public static Video getVideo() { public static Video getVideo() {
@ -201,6 +235,4 @@ public class Crawler implements Runnable {
byte categorie; byte categorie;
long created; long created;
} }
} }

View File

@ -86,7 +86,7 @@ public class DB {
* @param input * @param input
*/ */
public void addVideos(List<Video> input) { public void addVideos(List<Video> input) {
log.info("add " + input.size() + " videos"); //log.info("add " + input.size() + " videos");
if(input.size() > 0) { if(input.size() > 0) {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for(int i = 0; i< input.size(); i++) { for(int i = 0; i< input.size(); i++) {

View File

@ -36,6 +36,7 @@ public class YoutubeAPI {
} }
public LinkedList<Video> getInfos(List<String> ids) { public LinkedList<Video> getInfos(List<String> ids) {
log.info("get " + ids.size() + " infos");
if(ids.isEmpty()) if(ids.isEmpty())
return new LinkedList<Video>(); return new LinkedList<Video>();