Add lock for the lists, save toknown list
This commit is contained in:
parent
04b098d9ff
commit
c7a6a28d49
|
@ -5,10 +5,14 @@ import java.io.File;
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
|
import java.text.DateFormat;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Date;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Scanner;
|
import java.util.Scanner;
|
||||||
|
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
|
@ -16,11 +20,13 @@ public class Crawler implements Runnable {
|
||||||
|
|
||||||
private static int jobspeerthread = 100; //the amount of jobs a thread get peer request
|
private static int jobspeerthread = 100; //the amount of jobs a thread get peer request
|
||||||
|
|
||||||
|
private ReentrantReadWriteLock listlock = new ReentrantReadWriteLock(true);//only writelock is used, this lock should lock the list toCrawl and toknown because they may be accsessed by other threads
|
||||||
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, witch need to be analysed
|
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, witch need to be analysed
|
||||||
private LinkedList<String> toCrawl = new LinkedList<>();//all videos tu crawl
|
private LinkedList<String> toCrawl = new LinkedList<>();//all videos tu crawl
|
||||||
private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
|
private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
|
||||||
private List<CrawlerThread> threads;//list of all threads
|
private List<CrawlerThread> threads;//list of all threads
|
||||||
private List<CrawlerThread> requested = new LinkedList<>();
|
private List<CrawlerThread> requested = new LinkedList<>();
|
||||||
|
private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss");
|
||||||
|
|
||||||
private boolean crawl = true;
|
private boolean crawl = true;
|
||||||
private int crawlcount = 0;
|
private int crawlcount = 0;
|
||||||
|
@ -46,9 +52,11 @@ public class Crawler implements Runnable {
|
||||||
crawl = false;
|
crawl = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized void addtoCrawl(String videoid) {//TODO some kind of lock!
|
public synchronized void addtoCrawl(String videoid) {
|
||||||
|
listlock.writeLock().lock();
|
||||||
if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
||||||
toknown.add(videoid);
|
toknown.add(videoid);
|
||||||
|
listlock.writeLock().unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isCrawling() {
|
public boolean isCrawling() {
|
||||||
|
@ -64,9 +72,11 @@ public class Crawler implements Runnable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void send(CrawlerThread t) {
|
private void send(CrawlerThread t) {
|
||||||
|
listlock.writeLock().lock();
|
||||||
for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
|
for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
|
||||||
t.todo.add(toCrawl.removeFirst());
|
t.todo.add(toCrawl.removeFirst());
|
||||||
}
|
}
|
||||||
|
listlock.writeLock().unlock();
|
||||||
t.requested = false;
|
t.requested = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -75,15 +85,25 @@ public class Crawler implements Runnable {
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
log.info("Try to load crawlfile");
|
log.info("Try to load crawlfile");
|
||||||
if(crawlfile.exists()) {
|
if(crawlfile.exists()) {
|
||||||
|
listlock.writeLock().lock();
|
||||||
try {
|
try {
|
||||||
Scanner in = new Scanner(crawlfile);
|
Scanner in = new Scanner(crawlfile);
|
||||||
|
boolean crawl = true;//section of file
|
||||||
while(in.hasNextLine()) {
|
while(in.hasNextLine()) {
|
||||||
String line = in.nextLine();
|
String line = in.nextLine();
|
||||||
if(line == null) {
|
if(line == null) {
|
||||||
break;
|
break;
|
||||||
} else {
|
} else {
|
||||||
if(!line.isEmpty()) {
|
if(!line.isEmpty()) {
|
||||||
addtoCrawl(line);
|
if(line.equals("-")) {//section delimiter
|
||||||
|
crawl = false;
|
||||||
|
} else {
|
||||||
|
if(crawl) {
|
||||||
|
toCrawl.add(line);
|
||||||
|
} else {
|
||||||
|
toknown.add(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -91,6 +111,8 @@ public class Crawler implements Runnable {
|
||||||
} catch(IOException e) {
|
} catch(IOException e) {
|
||||||
log.warn("Error while loading crawl file.");
|
log.warn("Error while loading crawl file.");
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
|
} finally {
|
||||||
|
listlock.writeLock().unlock();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -110,7 +132,7 @@ public class Crawler implements Runnable {
|
||||||
}
|
}
|
||||||
|
|
||||||
while(crawl) {
|
while(crawl) {
|
||||||
log.info("to Crawl: " + toCrawl.size());
|
log.info("to Crawl: " + toCrawl.size() + " known: " + toknown.size() + " Time: " + dateform.format(new Date()));
|
||||||
|
|
||||||
//fullfill request
|
//fullfill request
|
||||||
while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) {
|
while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) {
|
||||||
|
@ -119,6 +141,7 @@ public class Crawler implements Runnable {
|
||||||
|
|
||||||
//kindof idle
|
//kindof idle
|
||||||
while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) {
|
while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) {
|
||||||
|
startup = 0;//stop startup count
|
||||||
Thread.yield();
|
Thread.yield();
|
||||||
try {
|
try {
|
||||||
Thread.sleep(100);
|
Thread.sleep(100);
|
||||||
|
@ -134,6 +157,7 @@ public class Crawler implements Runnable {
|
||||||
if(!toknown.isEmpty()) {
|
if(!toknown.isEmpty()) {
|
||||||
//check in db for known videos
|
//check in db for known videos
|
||||||
log.info("Checking the DB");
|
log.info("Checking the DB");
|
||||||
|
listlock.writeLock().lock();
|
||||||
while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) {
|
while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) {
|
||||||
LinkedList<String> tocheck = new LinkedList<>();
|
LinkedList<String> tocheck = new LinkedList<>();
|
||||||
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
|
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
|
||||||
|
@ -141,18 +165,26 @@ public class Crawler implements Runnable {
|
||||||
}
|
}
|
||||||
toCrawl.addAll(db.checkvideos(tocheck));
|
toCrawl.addAll(db.checkvideos(tocheck));
|
||||||
}
|
}
|
||||||
|
listlock.writeLock().unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
//writing crawlfile
|
//writing crawlfile
|
||||||
log.info("Writing Crawlfile");
|
log.info("Writing Crawlfile");
|
||||||
|
listlock.writeLock().lock();
|
||||||
try {
|
try {
|
||||||
PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
|
PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
|
||||||
for(String t : toCrawl) {
|
for(String t : toCrawl) {
|
||||||
p.println(t);
|
p.println(t);
|
||||||
}
|
}
|
||||||
|
p.println("-");
|
||||||
|
for(String t : toknown) {
|
||||||
|
p.println(t);
|
||||||
|
}
|
||||||
p.close();
|
p.close();
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
log.error("Error writing crawlfile.", e);
|
log.error("Error writing crawlfile.", e);
|
||||||
|
} finally {
|
||||||
|
listlock.writeLock().unlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
//get reports
|
//get reports
|
||||||
|
@ -164,6 +196,7 @@ public class Crawler implements Runnable {
|
||||||
}
|
}
|
||||||
|
|
||||||
//save to db
|
//save to db
|
||||||
|
log.info("save " + toSave.size() + " videos to DB.");
|
||||||
while(!toSave.isEmpty()) {
|
while(!toSave.isEmpty()) {
|
||||||
LinkedList<String> videoids = new LinkedList<>();
|
LinkedList<String> videoids = new LinkedList<>();
|
||||||
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
||||||
|
@ -175,6 +208,7 @@ public class Crawler implements Runnable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//at the beginning there may be just one video to crawl, so keep it calm.
|
||||||
if(startup > 0) {
|
if(startup > 0) {
|
||||||
startup --;
|
startup --;
|
||||||
try {
|
try {
|
||||||
|
@ -186,8 +220,8 @@ public class Crawler implements Runnable {
|
||||||
//end
|
//end
|
||||||
long runtimes = (System.currentTimeMillis() - start) / 1000;
|
long runtimes = (System.currentTimeMillis() - start) / 1000;
|
||||||
int runtimem = (int) (runtimes / 60);
|
int runtimem = (int) (runtimes / 60);
|
||||||
float vidps = (crawlcount / (float) runtimes);
|
float vidps = (crawlcount / (float) runtimes);//videos per second
|
||||||
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " )");
|
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Video getVideo() {
|
public static Video getVideo() {
|
||||||
|
@ -201,6 +235,4 @@ public class Crawler implements Runnable {
|
||||||
byte categorie;
|
byte categorie;
|
||||||
long created;
|
long created;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -86,7 +86,7 @@ public class DB {
|
||||||
* @param input
|
* @param input
|
||||||
*/
|
*/
|
||||||
public void addVideos(List<Video> input) {
|
public void addVideos(List<Video> input) {
|
||||||
log.info("add " + input.size() + " videos");
|
//log.info("add " + input.size() + " videos");
|
||||||
if(input.size() > 0) {
|
if(input.size() > 0) {
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
for(int i = 0; i< input.size(); i++) {
|
for(int i = 0; i< input.size(); i++) {
|
||||||
|
|
|
@ -36,6 +36,7 @@ public class YoutubeAPI {
|
||||||
}
|
}
|
||||||
|
|
||||||
public LinkedList<Video> getInfos(List<String> ids) {
|
public LinkedList<Video> getInfos(List<String> ids) {
|
||||||
|
log.info("get " + ids.size() + " infos");
|
||||||
if(ids.isEmpty())
|
if(ids.isEmpty())
|
||||||
return new LinkedList<Video>();
|
return new LinkedList<Video>();
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue