package de.mrbesen.youtubecrawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import org.apache.log4j.Logger;
public class Crawler implements Runnable {

    private static int jobspeerthread = 100; // the amount of jobs a thread gets per request

    // Only the write lock is used; this lock guards the lists toCrawl and
    // toknown because they may be accessed by other threads.
    private ReentrantReadWriteLock listlock = new ReentrantReadWriteLock(true);

    private LinkedList<String> toSave = new LinkedList<>(); // all found yt ids which still need to be analysed
    private LinkedList<String> toCrawl = new LinkedList<>(); // all videos to crawl
    private LinkedList<String> toknown = new LinkedList<>(); // ids to test against the DB; unknown ones are moved to toCrawl

    private List<CrawlerThread> threads; // list of all worker threads
    private List<CrawlerThread> requested = new LinkedList<>(); // workers currently waiting for new jobs

    // NOTE(review): SimpleDateFormat is not thread-safe; assumes this is only
    // used from the crawler's own thread — verify.
    private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss");

    private long start; // crawl start time (ms since epoch), set at the top of run()

    private boolean crawl = true; // main-loop flag, cleared by stop()
    private int crawlcount = 0; // total number of videos crawled so far

    private DB db = new DB();
    private YoutubeAPI api = new YoutubeAPI();
    private File crawlfile = new File("crawl.txt"); // persisted queue state between runs

    private Logger log = Logger.getLogger(Crawler.class.getName());

    private int maxvideostotest; // max ids per DB known-check query (config key: crawler.maxvideos)
    private int startup = 10; // to keep the beginning cool
/**
 * Reads {@code crawler.maxvideos} (max ids per DB known-check query) from
 * the config, falling back to 100 when the value is missing or malformed.
 * Note: {@code Integer.parseInt(null)} also throws NumberFormatException,
 * so an absent property is covered by the catch below.
 */
public Crawler() {
    String configured = Config.prop.getProperty("crawler.maxvideos");
    try {
        maxvideostotest = Integer.parseInt(configured);
    } catch (NumberFormatException e) {
        // Bug fix: the original logged Config.prop.getProperty("") — an empty
        // property key that always yields null — instead of the bad value.
        log.warn("could not read the number \"" + configured + "\" from the config file. maxvideo");
        maxvideostotest = 100; // sane default
    }
}
public void stop ( ) {
crawl = false ;
}
2018-07-17 13:42:06 +02:00
public synchronized void addtoCrawl ( String videoid ) {
listlock . writeLock ( ) . lock ( ) ;
2018-07-15 21:30:12 +02:00
if ( ! ( toCrawl . contains ( videoid ) | | toknown . contains ( videoid ) ) )
toknown . add ( videoid ) ;
2018-07-17 13:42:06 +02:00
listlock . writeLock ( ) . unlock ( ) ;
2018-07-15 21:30:12 +02:00
}
2018-07-16 23:22:32 +02:00
/** @return {@code true} while the crawler has not been stopped. */
public boolean isCrawling() {
    return this.crawl;
}
/**
 * Called by a worker thread that ran out of jobs: hands out work immediately
 * if any is queued, otherwise parks the thread on the {@code requested} list
 * until the main loop refills {@code toCrawl} and serves it.
 *
 * NOTE(review): toCrawl.isEmpty() is read here without holding listlock, and
 * {@code requested} is a plain LinkedList mutated from multiple threads —
 * assumes the main-loop/worker interleaving makes this safe; verify.
 *
 * @param t the worker thread asking for jobs
 */
public void request(CrawlerThread t) {
    if (!toCrawl.isEmpty()) {
        send(t);
    } else {
        requested.add(t);
    }
}
private void send ( CrawlerThread t ) {
2018-07-17 13:42:06 +02:00
listlock . writeLock ( ) . lock ( ) ;
2018-07-16 23:22:32 +02:00
for ( int i = 0 ; i < jobspeerthread & & ! toCrawl . isEmpty ( ) ; i + + ) {
t . todo . add ( toCrawl . removeFirst ( ) ) ;
}
2018-07-17 13:42:06 +02:00
listlock . writeLock ( ) . unlock ( ) ;
2018-07-16 23:22:32 +02:00
t . requested = false ;
}
2018-07-15 21:30:12 +02:00
@Override
public void run ( ) {
2018-07-19 20:34:11 +02:00
start = System . currentTimeMillis ( ) ;
2018-07-15 21:30:12 +02:00
log . info ( " Try to load crawlfile " ) ;
if ( crawlfile . exists ( ) ) {
2018-07-17 13:42:06 +02:00
listlock . writeLock ( ) . lock ( ) ;
2018-07-15 21:30:12 +02:00
try {
Scanner in = new Scanner ( crawlfile ) ;
2018-07-17 13:42:06 +02:00
boolean crawl = true ; //section of file
2018-07-15 21:30:12 +02:00
while ( in . hasNextLine ( ) ) {
String line = in . nextLine ( ) ;
if ( line = = null ) {
break ;
} else {
if ( ! line . isEmpty ( ) ) {
2018-07-17 13:42:06 +02:00
if ( line . equals ( " - " ) ) { //section delimiter
crawl = false ;
} else {
if ( crawl ) {
toCrawl . add ( line ) ;
} else {
toknown . add ( line ) ;
}
}
2018-07-15 21:30:12 +02:00
}
}
}
in . close ( ) ;
} catch ( IOException e ) {
log . warn ( " Error while loading crawl file. " ) ;
e . printStackTrace ( ) ;
2018-07-17 13:42:06 +02:00
} finally {
listlock . writeLock ( ) . unlock ( ) ;
2018-07-15 21:30:12 +02:00
}
}
2018-07-16 23:22:32 +02:00
//populate threads
int threadcount = 4 ;
try {
threadcount = Integer . parseInt ( Config . prop . getProperty ( " crawler.threadcount " ) ) ;
} catch ( NumberFormatException e ) {
log . warn ( " Could not read the Number \" " + Config . prop . getProperty ( " crawler.threadcount " ) + " \" from the Config. " ) ;
}
threads = new ArrayList < > ( threadcount ) ;
for ( int i = 0 ; i < threadcount ; i + + ) {
CrawlerThread thr = new CrawlerThread ( this ) ;
new Thread ( thr , " Crawler # " + i ) . start ( ) ;
threads . add ( thr ) ;
}
2018-07-15 21:30:12 +02:00
while ( crawl ) {
2018-07-17 13:42:06 +02:00
log . info ( " to Crawl: " + toCrawl . size ( ) + " known: " + toknown . size ( ) + " Time: " + dateform . format ( new Date ( ) ) ) ;
2018-07-15 21:30:12 +02:00
2018-07-16 23:22:32 +02:00
//fullfill request
while ( ! requested . isEmpty ( ) & & ! toCrawl . isEmpty ( ) & & crawl ) {
send ( requested . remove ( 0 ) ) ;
2018-07-15 21:30:12 +02:00
}
2018-07-16 23:22:32 +02:00
//kindof idle
while ( toCrawl . size ( ) > ( jobspeerthread * threads . size ( ) ) & & crawl & & requested . isEmpty ( ) ) {
2018-07-17 13:42:06 +02:00
startup = 0 ; //stop startup count
2018-07-16 23:22:32 +02:00
Thread . yield ( ) ;
2018-07-16 20:12:20 +02:00
try {
2018-07-16 23:22:32 +02:00
Thread . sleep ( 100 ) ;
2018-07-19 20:34:11 +02:00
} catch ( InterruptedException ignored ) {
break ;
}
2018-07-16 23:22:32 +02:00
}
//nothing left?
if ( toknown . isEmpty ( ) & & toCrawl . isEmpty ( ) & & requested . size ( ) = = threads . size ( ) ) { //very uncommon
log . warn ( " nothing left to crawl " ) ;
crawl = false ;
}
//refil the tocrawl list.
if ( ! toknown . isEmpty ( ) ) {
//check in db for known videos
log . info ( " Checking the DB " ) ;
2018-07-17 13:42:06 +02:00
listlock . writeLock ( ) . lock ( ) ;
2018-07-16 23:22:32 +02:00
while ( toCrawl . size ( ) < jobspeerthread * threads . size ( ) * 2 & & crawl & & ! toknown . isEmpty ( ) ) {
LinkedList < String > tocheck = new LinkedList < > ( ) ;
for ( int i = 0 ; i < toknown . size ( ) & & i < maxvideostotest ; i + + ) {
tocheck . add ( toknown . removeFirst ( ) ) ;
2018-07-16 20:12:20 +02:00
}
2018-07-16 23:22:32 +02:00
toCrawl . addAll ( db . checkvideos ( tocheck ) ) ;
}
2018-07-17 13:42:06 +02:00
listlock . writeLock ( ) . unlock ( ) ;
2018-07-16 23:22:32 +02:00
}
//writing crawlfile
log . info ( " Writing Crawlfile " ) ;
2018-07-17 13:42:06 +02:00
listlock . writeLock ( ) . lock ( ) ;
2018-07-16 23:22:32 +02:00
try {
PrintWriter p = new PrintWriter ( new BufferedWriter ( new FileWriter ( crawlfile ) ) ) ;
for ( String t : toCrawl ) {
p . println ( t ) ;
2018-07-15 21:30:12 +02:00
}
2018-07-17 13:42:06 +02:00
p . println ( " - " ) ;
for ( String t : toknown ) {
p . println ( t ) ;
}
2018-07-16 23:22:32 +02:00
p . close ( ) ;
} catch ( IOException e ) {
log . error ( " Error writing crawlfile. " , e ) ;
2018-07-17 13:42:06 +02:00
} finally {
listlock . writeLock ( ) . unlock ( ) ;
2018-07-16 23:22:32 +02:00
}
//get reports
for ( CrawlerThread crawlerThread : threads ) {
LinkedList < String > report = crawlerThread . report ( ) ;
crawlcount + = report . size ( ) ;
toSave . addAll ( report ) ;
crawlerThread . list . clear ( ) ;
2018-07-15 21:30:12 +02:00
}
2018-07-16 23:22:32 +02:00
//save to db
2018-07-17 13:42:06 +02:00
log . info ( " save " + toSave . size ( ) + " videos to DB. " ) ;
2018-07-15 21:30:12 +02:00
while ( ! toSave . isEmpty ( ) ) {
LinkedList < String > videoids = new LinkedList < > ( ) ;
for ( int i = 0 ; i < 50 & & ! toSave . isEmpty ( ) ; i + + ) {
videoids . add ( toSave . remove ( 0 ) ) ;
}
if ( videoids . size ( ) > 0 ) {
List < Video > videos = api . getInfos ( videoids ) ;
db . addVideos ( videos ) ;
}
}
2018-07-16 23:22:32 +02:00
2018-07-17 13:42:06 +02:00
//at the beginning there is maybe just one video to crawl, so keep it calm.
2018-07-16 23:22:32 +02:00
if ( startup > 0 ) {
startup - - ;
try {
Thread . sleep ( 20000 ) ;
} catch ( InterruptedException e ) { }
}
2018-07-15 21:30:12 +02:00
}
2018-07-16 23:22:32 +02:00
//end
long runtimes = ( System . currentTimeMillis ( ) - start ) / 1000 ;
2018-07-19 20:34:11 +02:00
if ( runtimes < 0 )
runtimes = 1 ;
2018-07-16 23:22:32 +02:00
int runtimem = ( int ) ( runtimes / 60 ) ;
2018-07-17 13:42:06 +02:00
float vidps = ( crawlcount / ( float ) runtimes ) ; //videos per second
log . info ( " Crawling Stopped. Runtime: " + runtimem + " min and " + crawlcount + " videos crawled. ( " + vidps + " v/s ) " ) ;
2018-07-15 21:30:12 +02:00
}
2018-07-19 17:59:26 +02:00
public DB getDB ( ) {
return db ;
}
2018-07-15 21:30:12 +02:00
public static Video getVideo ( ) {
return new Video ( ) ;
}
2018-07-19 20:34:11 +02:00
public void printStats ( ) {
long runtimes = ( System . currentTimeMillis ( ) - start ) / 1000 ;
if ( runtimes < 0 )
runtimes = 1 ;
int runtimem = ( int ) ( runtimes / 60 ) ;
float vidps = ( crawlcount / ( float ) runtimes ) ; //videos per second
log . info ( " ToCrawl: " + toCrawl . size ( ) ) ;
log . info ( " Toknown: " + toknown . size ( ) ) ;
log . info ( " ToSave: " + toSave . size ( ) ) ;
log . info ( " Runtime: " + runtimem + " min and " + crawlcount + " videos crawled. ( " + vidps + " v/s ) " ) ;
}
2018-07-15 21:30:12 +02:00
/** Plain value object describing one crawled YouTube video. */
public static class Video {
    String id; // YouTube video id
    int length; // the length of the video in seconds
    String languageCode; // NOTE(review): presumably an ISO language code from the API — verify
    byte categorie; // NOTE(review): presumably the YouTube category id — verify against YoutubeAPI
    long created; // NOTE(review): presumably the upload timestamp — confirm unit (ms vs s)
}
}