Multithreading
This commit is contained in:
parent
73643625f3
commit
04b098d9ff
|
@ -19,12 +19,14 @@ public class Config {
|
|||
|
||||
private static Map<String,String> properties = new HashMap<String, String>() {
|
||||
{
|
||||
put("db.host" , "localhost" );
|
||||
put("db.port" , "3306" );
|
||||
put("db.user" , "ytcrawler" );
|
||||
put("db.pw" , "" );
|
||||
put("db.dbname" , "ytcrawler" );
|
||||
put("youtube.apikey", "" );
|
||||
put("db.host" , "localhost" );
|
||||
put("db.port" , "3306" );
|
||||
put("db.user" , "ytcrawler" );
|
||||
put("db.pw" , "" );
|
||||
put("db.dbname" , "ytcrawler" );
|
||||
put("youtube.apikey" , "" );
|
||||
put("crawler.maxvideos" , "100" );
|
||||
put("crawler.threadcount", "4" );
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -5,63 +5,70 @@ import java.io.File;
|
|||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Scanner;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
public class Crawler implements Runnable {
|
||||
|
||||
private static int jobspeerthread = 100; //the maximum number of jobs a thread gets per request
|
||||
|
||||
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, which still need to be analysed
|
||||
private LinkedList<String> toCrawl = new LinkedList<>();//all videos to crawl
|
||||
private LinkedList<String> toknown = new LinkedList<>();//list with all videos that still have to be tested if they are already known; if not, they are moved to toCrawl
|
||||
private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");
|
||||
private List<CrawlerThread> threads;//list of all threads
|
||||
private List<CrawlerThread> requested = new LinkedList<>();
|
||||
|
||||
private volatile boolean crawl = true;//stop flag; volatile because worker threads poll it through isCrawling() while stop() may be called from another thread
|
||||
private int crawlcount = 0;
|
||||
|
||||
private DB db = new DB();
|
||||
private YoutubeAPI api = new YoutubeAPI();
|
||||
private File crawlfile = new File("crawl.txt");
|
||||
private Logger log = Logger.getLogger(Crawler.class.getName());
|
||||
private static int maxvideostotest = 100;
|
||||
|
||||
private int maxvideostotest;
|
||||
private int startup = 10;//to keep the beginning cool
|
||||
|
||||
/**
 * Creates the crawler and reads the "crawler.maxvideos" setting into maxvideostotest.
 * Falls back to 100 when the setting is missing or not a valid integer.
 */
public Crawler() {
	//read the value once so the warning shows exactly what failed to parse
	String configured = Config.prop.getProperty("crawler.maxvideos");
	try {
		maxvideostotest = Integer.parseInt(configured);
	} catch(NumberFormatException e) {
		log.warn("could not read the number \"" + configured + "\" from the config file. maxvideo");
		maxvideostotest = 100;
	}
}
|
||||
|
||||
public void stop() {
|
||||
crawl = false;
|
||||
}
|
||||
|
||||
public void addtoCrawl(String videoid) {
|
||||
public synchronized void addtoCrawl(String videoid) {//TODO some kind of lock!
|
||||
if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
||||
toknown.add(videoid);
|
||||
}
|
||||
|
||||
private void crawl(String videoid) {
|
||||
try {
|
||||
crawlcount++;
|
||||
log.info("crawling: " + videoid);
|
||||
toSave.add(videoid);
|
||||
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
|
||||
String s = con.getContent();
|
||||
Matcher matcher = linkpattern.matcher(s);
|
||||
while(matcher.find()) {
|
||||
int beginytid = matcher.end();
|
||||
int endxtid = s.indexOf('"', beginytid);
|
||||
int endid = s.indexOf('&', beginytid);
|
||||
if(endid < endxtid) {
|
||||
endxtid = endid;
|
||||
}
|
||||
String ytid = s.substring(beginytid, endxtid);
|
||||
if(ytid.length() > 9 && ytid.length() <= 12) {
|
||||
addtoCrawl(ytid);
|
||||
} else {
|
||||
log.warn("youtube id has wrong length: \"" + ytid + "\"");
|
||||
}
|
||||
}
|
||||
} catch(IOException e) {
|
||||
e.printStackTrace();
|
||||
public boolean isCrawling() {
|
||||
return crawl;
|
||||
}
|
||||
|
||||
public void request(CrawlerThread t) {
|
||||
if(!toCrawl.isEmpty()) {
|
||||
send(t);
|
||||
} else {
|
||||
requested.add(t);
|
||||
}
|
||||
}
|
||||
|
||||
private void send(CrawlerThread t) {
|
||||
for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
|
||||
t.todo.add(toCrawl.removeFirst());
|
||||
}
|
||||
t.requested = false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
|
@ -86,34 +93,77 @@ public class Crawler implements Runnable {
|
|||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
//populate threads
|
||||
int threadcount = 4;
|
||||
try {
|
||||
threadcount = Integer.parseInt(Config.prop.getProperty("crawler.threadcount"));
|
||||
} catch(NumberFormatException e) {
|
||||
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
|
||||
}
|
||||
threads = new ArrayList<>(threadcount);
|
||||
|
||||
for(int i = 0; i < threadcount; i++) {
|
||||
CrawlerThread thr = new CrawlerThread( this);
|
||||
new Thread(thr, "Crawler #" + i).start();
|
||||
threads.add(thr);
|
||||
}
|
||||
|
||||
while(crawl) {
|
||||
log.info("to Crawl: " + toCrawl.size());
|
||||
|
||||
while(!toCrawl.isEmpty() && crawl) {
|
||||
crawl(toCrawl.remove(0));
|
||||
//fullfill request
|
||||
while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) {
|
||||
send(requested.remove(0));
|
||||
}
|
||||
if(toknown.isEmpty()) {//very uncommon
|
||||
log.warn("nothing left.");
|
||||
crawl = false;
|
||||
//delete / clear crawl file
|
||||
} else {
|
||||
LinkedList<String> tocheck = new LinkedList<>();
|
||||
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
|
||||
tocheck.add(toknown.removeFirst());
|
||||
}
|
||||
toCrawl.addAll(db.checkvideos(tocheck));
|
||||
|
||||
|
||||
//kindof idle
|
||||
while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) {
|
||||
Thread.yield();
|
||||
try {
|
||||
PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
|
||||
for(String t : toCrawl) {
|
||||
p.println(t);
|
||||
Thread.sleep(100);
|
||||
} catch(InterruptedException ignored) { }
|
||||
}
|
||||
//nothing left?
|
||||
if(toknown.isEmpty() && toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon
|
||||
log.warn("nothing left to crawl");
|
||||
crawl = false;
|
||||
}
|
||||
|
||||
//refil the tocrawl list.
|
||||
if(!toknown.isEmpty()) {
|
||||
//check in db for known videos
|
||||
log.info("Checking the DB");
|
||||
while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) {
|
||||
LinkedList<String> tocheck = new LinkedList<>();
|
||||
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
|
||||
tocheck.add(toknown.removeFirst());
|
||||
}
|
||||
p.close();
|
||||
} catch (IOException e) {
|
||||
log.error("Error writing crawlfile.", e);
|
||||
toCrawl.addAll(db.checkvideos(tocheck));
|
||||
}
|
||||
}
|
||||
// System.out.println("try to save " + toSave.size() + " videos.");
|
||||
|
||||
//writing crawlfile
|
||||
log.info("Writing Crawlfile");
|
||||
try {
|
||||
PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
|
||||
for(String t : toCrawl) {
|
||||
p.println(t);
|
||||
}
|
||||
p.close();
|
||||
} catch (IOException e) {
|
||||
log.error("Error writing crawlfile.", e);
|
||||
}
|
||||
|
||||
//get reports
|
||||
for (CrawlerThread crawlerThread : threads) {
|
||||
LinkedList<String> report = crawlerThread.report();
|
||||
crawlcount+= report.size();
|
||||
toSave.addAll(report);
|
||||
crawlerThread.list.clear();
|
||||
}
|
||||
|
||||
//save to db
|
||||
while(!toSave.isEmpty()) {
|
||||
LinkedList<String> videoids = new LinkedList<>();
|
||||
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
||||
|
@ -124,9 +174,20 @@ public class Crawler implements Runnable {
|
|||
db.addVideos(videos);
|
||||
}
|
||||
}
|
||||
|
||||
if(startup > 0) {
|
||||
startup --;
|
||||
try {
|
||||
Thread.sleep(20000);
|
||||
} catch(InterruptedException e) {}
|
||||
}
|
||||
}
|
||||
long diff = (System.currentTimeMillis() - start)/ 60000;
|
||||
log.info("Crawling Stopped. Runtime: " + ((int) diff) + "min and " + crawlcount + " videos crawled.");
|
||||
|
||||
//end
|
||||
long runtimes = (System.currentTimeMillis() - start) / 1000;
|
||||
int runtimem = (int) (runtimes / 60);
|
||||
float vidps = (crawlcount / (float) runtimes);
|
||||
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " )");
|
||||
}
|
||||
|
||||
public static Video getVideo() {
|
||||
|
@ -140,4 +201,6 @@ public class Crawler implements Runnable {
|
|||
byte categorie;
|
||||
long created;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
package de.mrbesen.youtubecrawler;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
public class CrawlerThread implements Runnable {
|
||||
|
||||
private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");
|
||||
|
||||
private Logger log = Logger.getLogger(this.getClass().getName());
|
||||
private Crawler parent;
|
||||
|
||||
LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
|
||||
LinkedList<String> list = new LinkedList<>();//videos this thread had crawled
|
||||
|
||||
boolean requested = true;//is a request pending?
|
||||
|
||||
public CrawlerThread( Crawler root) {
|
||||
parent = root;
|
||||
root.request(this);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
while(parent.isCrawling()) {
|
||||
while(!todo.isEmpty() && parent.isCrawling()) {
|
||||
crawl(todo.removeFirst());
|
||||
if(todo.size() < 5 && !requested) {
|
||||
requested = true;
|
||||
parent.request(this);
|
||||
}
|
||||
}
|
||||
|
||||
if(todo.isEmpty() && !requested) {
|
||||
requested = true;
|
||||
parent.request(this);
|
||||
}
|
||||
|
||||
log.warn("No Object left!");
|
||||
Thread.yield();
|
||||
try {
|
||||
Thread.sleep(10000);//sleep for 10 seconds
|
||||
} catch (InterruptedException ignored) {}
|
||||
}
|
||||
log.info("Stopped.");
|
||||
}
|
||||
|
||||
/**
|
||||
* returns a linkedlist of all crawled videos
|
||||
* @return
|
||||
*/
|
||||
LinkedList<String> report() {
|
||||
return list;
|
||||
}
|
||||
|
||||
private void crawl(String videoid) {
|
||||
try {
|
||||
list.add(videoid);
|
||||
|
||||
// log.info("crawling: " + videoid);
|
||||
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
|
||||
String s = con.getContent();
|
||||
Matcher matcher = linkpattern.matcher(s);
|
||||
while(matcher.find()) {
|
||||
int beginytid = matcher.end();
|
||||
int endxtid = s.indexOf('"', beginytid);
|
||||
int endid = s.indexOf('&', beginytid);
|
||||
if(endid < endxtid) {
|
||||
endxtid = endid;
|
||||
}
|
||||
String ytid = s.substring(beginytid, endxtid);
|
||||
if(ytid.length() > 9 && ytid.length() <= 12) {
|
||||
parent.addtoCrawl(ytid);
|
||||
} else {
|
||||
// log.warn("youtube id has wrong length: \"" + ytid + "\"");
|
||||
}
|
||||
}
|
||||
} catch(IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -13,7 +13,6 @@ import de.mrbesen.youtubecrawler.Crawler.Video;
|
|||
public class DB {
|
||||
|
||||
private Connection con;
|
||||
//private String server = "localhost", user = "ytcrawler", pw ="pDWmDhmZKArwvG2q", db = "ytcrawler";
|
||||
private String server = Config.prop.getProperty("db.host", "localhost"), user = Config.prop.getProperty("db.user", "ytcrawler"), pw = Config.prop.getProperty("db.pw", ""), db = Config.prop.getProperty("db.dbname", "ytcrawler");
|
||||
private int port = Integer.parseInt(Config.prop.getProperty("db.port", "3306"));
|
||||
private Logger log = Logger.getLogger(DB.class.getName());
|
||||
|
@ -22,7 +21,7 @@ public class DB {
|
|||
public DB() {
|
||||
try {
|
||||
connect(false);
|
||||
|
||||
|
||||
//set the database up!
|
||||
boolean found = false;
|
||||
ResultSet set = con.getMetaData().getCatalogs();//does the db exists?
|
||||
|
@ -41,7 +40,7 @@ public class DB {
|
|||
con.setCatalog(db);
|
||||
update("CREATE TABLE `videos` (`id` varchar(13) NOT NULL,`length` int(11) NOT NULL,`created` int(11) NOT NULL,`langcode` varchar(3) NOT NULL DEFAULT 'en',`category` int(11) DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `id_UNIQUE` (`id`)) ENGINE=InnoDB DEFAULT CHARSET=latin1;");
|
||||
|
||||
log.info("Database is set up! -> \n\nFirst Entry in uploaded needed!!!!!!\nPlease insert MANUALY!\n ");
|
||||
log.info("Database is set up!");
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
log.error("Error while connecting to the database! ", e);
|
||||
|
@ -58,25 +57,26 @@ public class DB {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* removes all videos, that are known from the db
|
||||
* @param input
|
||||
* @return
|
||||
*/
|
||||
public List<String> checkvideos(List<String> input) {
|
||||
StringBuilder ids = new StringBuilder();
|
||||
for(int i = 0; i < input.size(); i++) {
|
||||
ids.append(',').append(input.get(i));
|
||||
}
|
||||
String query = "SELECT `id` FROM `videos` WHERE concat('%',`id`,'%') LIKE '" + ids.toString() + "';";
|
||||
ResultSet res = query(query);
|
||||
try {
|
||||
while(res.next()) {
|
||||
input.remove(res.getString(1));
|
||||
if(!input.isEmpty()) {
|
||||
StringBuilder ids = new StringBuilder();
|
||||
for(int i = 0; i < input.size(); i++) {
|
||||
ids.append(',').append(input.get(i));
|
||||
}
|
||||
String query = "SELECT `id` FROM `videos` WHERE concat('%',`id`,'%') LIKE '" + ids.toString() + "';";
|
||||
ResultSet res = query(query);
|
||||
try {
|
||||
while(res.next()) {
|
||||
input.remove(res.getString(1));
|
||||
}
|
||||
} catch(SQLException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
} catch(SQLException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return input;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue