Multithreading
This commit is contained in:
parent
73643625f3
commit
04b098d9ff
|
@ -19,12 +19,14 @@ public class Config {
|
||||||
|
|
||||||
private static Map<String,String> properties = new HashMap<String, String>() {
|
private static Map<String,String> properties = new HashMap<String, String>() {
|
||||||
{
|
{
|
||||||
put("db.host" , "localhost" );
|
put("db.host" , "localhost" );
|
||||||
put("db.port" , "3306" );
|
put("db.port" , "3306" );
|
||||||
put("db.user" , "ytcrawler" );
|
put("db.user" , "ytcrawler" );
|
||||||
put("db.pw" , "" );
|
put("db.pw" , "" );
|
||||||
put("db.dbname" , "ytcrawler" );
|
put("db.dbname" , "ytcrawler" );
|
||||||
put("youtube.apikey", "" );
|
put("youtube.apikey" , "" );
|
||||||
|
put("crawler.maxvideos" , "100" );
|
||||||
|
put("crawler.threadcount", "4" );
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -5,63 +5,70 @@ import java.io.File;
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Scanner;
|
import java.util.Scanner;
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
public class Crawler implements Runnable {
|
public class Crawler implements Runnable {
|
||||||
|
|
||||||
|
private static int jobspeerthread = 100; //the amount of jobs a thread get peer request
|
||||||
|
|
||||||
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, witch need to be analysed
|
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, witch need to be analysed
|
||||||
private LinkedList<String> toCrawl = new LinkedList<>();//all videos tu crawl
|
private LinkedList<String> toCrawl = new LinkedList<>();//all videos tu crawl
|
||||||
private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
|
private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
|
||||||
private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");
|
private List<CrawlerThread> threads;//list of all threads
|
||||||
|
private List<CrawlerThread> requested = new LinkedList<>();
|
||||||
|
|
||||||
private boolean crawl = true;
|
private boolean crawl = true;
|
||||||
private int crawlcount = 0;
|
private int crawlcount = 0;
|
||||||
|
|
||||||
private DB db = new DB();
|
private DB db = new DB();
|
||||||
private YoutubeAPI api = new YoutubeAPI();
|
private YoutubeAPI api = new YoutubeAPI();
|
||||||
private File crawlfile = new File("crawl.txt");
|
private File crawlfile = new File("crawl.txt");
|
||||||
private Logger log = Logger.getLogger(Crawler.class.getName());
|
private Logger log = Logger.getLogger(Crawler.class.getName());
|
||||||
private static int maxvideostotest = 100;
|
|
||||||
|
private int maxvideostotest;
|
||||||
|
private int startup = 10;//to keep the beginning cool
|
||||||
|
|
||||||
|
/**
 * Creates the crawler and reads the batch size used when checking video ids
 * against the database from the {@code crawler.maxvideos} config property.
 * Falls back to 100 when the property is missing or not a valid integer.
 */
public Crawler() {
    // read the value once so the warning below reports exactly what failed to parse
    String configured = Config.prop.getProperty("crawler.maxvideos");
    try {
        maxvideostotest = Integer.parseInt(configured);
    } catch(NumberFormatException e) {
        // previously the message looked up the empty key "" and therefore never showed the offending value
        log.warn("could not read the number \"" + configured + "\" from the config file. maxvideo");
        maxvideostotest = 100;
    }
}
|
||||||
|
|
||||||
public void stop() {
|
public void stop() {
|
||||||
crawl = false;
|
crawl = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addtoCrawl(String videoid) {
|
public synchronized void addtoCrawl(String videoid) {//TODO some kind of lock!
|
||||||
if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
||||||
toknown.add(videoid);
|
toknown.add(videoid);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void crawl(String videoid) {
|
public boolean isCrawling() {
|
||||||
try {
|
return crawl;
|
||||||
crawlcount++;
|
}
|
||||||
log.info("crawling: " + videoid);
|
|
||||||
toSave.add(videoid);
|
public void request(CrawlerThread t) {
|
||||||
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
|
if(!toCrawl.isEmpty()) {
|
||||||
String s = con.getContent();
|
send(t);
|
||||||
Matcher matcher = linkpattern.matcher(s);
|
} else {
|
||||||
while(matcher.find()) {
|
requested.add(t);
|
||||||
int beginytid = matcher.end();
|
|
||||||
int endxtid = s.indexOf('"', beginytid);
|
|
||||||
int endid = s.indexOf('&', beginytid);
|
|
||||||
if(endid < endxtid) {
|
|
||||||
endxtid = endid;
|
|
||||||
}
|
|
||||||
String ytid = s.substring(beginytid, endxtid);
|
|
||||||
if(ytid.length() > 9 && ytid.length() <= 12) {
|
|
||||||
addtoCrawl(ytid);
|
|
||||||
} else {
|
|
||||||
log.warn("youtube id has wrong length: \"" + ytid + "\"");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch(IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void send(CrawlerThread t) {
|
||||||
|
for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
|
||||||
|
t.todo.add(toCrawl.removeFirst());
|
||||||
|
}
|
||||||
|
t.requested = false;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
|
@ -86,34 +93,77 @@ public class Crawler implements Runnable {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//populate threads
|
||||||
|
int threadcount = 4;
|
||||||
|
try {
|
||||||
|
threadcount = Integer.parseInt(Config.prop.getProperty("crawler.threadcount"));
|
||||||
|
} catch(NumberFormatException e) {
|
||||||
|
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
|
||||||
|
}
|
||||||
|
threads = new ArrayList<>(threadcount);
|
||||||
|
|
||||||
|
for(int i = 0; i < threadcount; i++) {
|
||||||
|
CrawlerThread thr = new CrawlerThread( this);
|
||||||
|
new Thread(thr, "Crawler #" + i).start();
|
||||||
|
threads.add(thr);
|
||||||
|
}
|
||||||
|
|
||||||
while(crawl) {
|
while(crawl) {
|
||||||
log.info("to Crawl: " + toCrawl.size());
|
log.info("to Crawl: " + toCrawl.size());
|
||||||
|
|
||||||
while(!toCrawl.isEmpty() && crawl) {
|
//fullfill request
|
||||||
crawl(toCrawl.remove(0));
|
while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) {
|
||||||
|
send(requested.remove(0));
|
||||||
}
|
}
|
||||||
if(toknown.isEmpty()) {//very uncommon
|
|
||||||
log.warn("nothing left.");
|
//kindof idle
|
||||||
crawl = false;
|
while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) {
|
||||||
//delete / clear crawl file
|
Thread.yield();
|
||||||
} else {
|
|
||||||
LinkedList<String> tocheck = new LinkedList<>();
|
|
||||||
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
|
|
||||||
tocheck.add(toknown.removeFirst());
|
|
||||||
}
|
|
||||||
toCrawl.addAll(db.checkvideos(tocheck));
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
|
Thread.sleep(100);
|
||||||
for(String t : toCrawl) {
|
} catch(InterruptedException ignored) { }
|
||||||
p.println(t);
|
}
|
||||||
|
//nothing left?
|
||||||
|
if(toknown.isEmpty() && toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon
|
||||||
|
log.warn("nothing left to crawl");
|
||||||
|
crawl = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
//refil the tocrawl list.
|
||||||
|
if(!toknown.isEmpty()) {
|
||||||
|
//check in db for known videos
|
||||||
|
log.info("Checking the DB");
|
||||||
|
while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) {
|
||||||
|
LinkedList<String> tocheck = new LinkedList<>();
|
||||||
|
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
|
||||||
|
tocheck.add(toknown.removeFirst());
|
||||||
}
|
}
|
||||||
p.close();
|
toCrawl.addAll(db.checkvideos(tocheck));
|
||||||
} catch (IOException e) {
|
|
||||||
log.error("Error writing crawlfile.", e);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// System.out.println("try to save " + toSave.size() + " videos.");
|
|
||||||
|
//writing crawlfile
|
||||||
|
log.info("Writing Crawlfile");
|
||||||
|
try {
|
||||||
|
PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
|
||||||
|
for(String t : toCrawl) {
|
||||||
|
p.println(t);
|
||||||
|
}
|
||||||
|
p.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.error("Error writing crawlfile.", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
//get reports
|
||||||
|
for (CrawlerThread crawlerThread : threads) {
|
||||||
|
LinkedList<String> report = crawlerThread.report();
|
||||||
|
crawlcount+= report.size();
|
||||||
|
toSave.addAll(report);
|
||||||
|
crawlerThread.list.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
//save to db
|
||||||
while(!toSave.isEmpty()) {
|
while(!toSave.isEmpty()) {
|
||||||
LinkedList<String> videoids = new LinkedList<>();
|
LinkedList<String> videoids = new LinkedList<>();
|
||||||
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
||||||
|
@ -124,9 +174,20 @@ public class Crawler implements Runnable {
|
||||||
db.addVideos(videos);
|
db.addVideos(videos);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(startup > 0) {
|
||||||
|
startup --;
|
||||||
|
try {
|
||||||
|
Thread.sleep(20000);
|
||||||
|
} catch(InterruptedException e) {}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
long diff = (System.currentTimeMillis() - start)/ 60000;
|
|
||||||
log.info("Crawling Stopped. Runtime: " + ((int) diff) + "min and " + crawlcount + " videos crawled.");
|
//end
|
||||||
|
long runtimes = (System.currentTimeMillis() - start) / 1000;
|
||||||
|
int runtimem = (int) (runtimes / 60);
|
||||||
|
float vidps = (crawlcount / (float) runtimes);
|
||||||
|
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " )");
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Video getVideo() {
|
public static Video getVideo() {
|
||||||
|
@ -140,4 +201,6 @@ public class Crawler implements Runnable {
|
||||||
byte categorie;
|
byte categorie;
|
||||||
long created;
|
long created;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,86 @@
|
||||||
|
package de.mrbesen.youtubecrawler;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
|
public class CrawlerThread implements Runnable {
|
||||||
|
|
||||||
|
private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");
|
||||||
|
|
||||||
|
private Logger log = Logger.getLogger(this.getClass().getName());
|
||||||
|
private Crawler parent;
|
||||||
|
|
||||||
|
LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
|
||||||
|
LinkedList<String> list = new LinkedList<>();//videos this thread had crawled
|
||||||
|
|
||||||
|
boolean requested = true;//is a request pending?
|
||||||
|
|
||||||
|
public CrawlerThread( Crawler root) {
|
||||||
|
parent = root;
|
||||||
|
root.request(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
while(parent.isCrawling()) {
|
||||||
|
while(!todo.isEmpty() && parent.isCrawling()) {
|
||||||
|
crawl(todo.removeFirst());
|
||||||
|
if(todo.size() < 5 && !requested) {
|
||||||
|
requested = true;
|
||||||
|
parent.request(this);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(todo.isEmpty() && !requested) {
|
||||||
|
requested = true;
|
||||||
|
parent.request(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
log.warn("No Object left!");
|
||||||
|
Thread.yield();
|
||||||
|
try {
|
||||||
|
Thread.sleep(10000);//sleep for 10 seconds
|
||||||
|
} catch (InterruptedException ignored) {}
|
||||||
|
}
|
||||||
|
log.info("Stopped.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* returns a linkedlist of all crawled videos
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
LinkedList<String> report() {
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void crawl(String videoid) {
|
||||||
|
try {
|
||||||
|
list.add(videoid);
|
||||||
|
|
||||||
|
// log.info("crawling: " + videoid);
|
||||||
|
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
|
||||||
|
String s = con.getContent();
|
||||||
|
Matcher matcher = linkpattern.matcher(s);
|
||||||
|
while(matcher.find()) {
|
||||||
|
int beginytid = matcher.end();
|
||||||
|
int endxtid = s.indexOf('"', beginytid);
|
||||||
|
int endid = s.indexOf('&', beginytid);
|
||||||
|
if(endid < endxtid) {
|
||||||
|
endxtid = endid;
|
||||||
|
}
|
||||||
|
String ytid = s.substring(beginytid, endxtid);
|
||||||
|
if(ytid.length() > 9 && ytid.length() <= 12) {
|
||||||
|
parent.addtoCrawl(ytid);
|
||||||
|
} else {
|
||||||
|
// log.warn("youtube id has wrong length: \"" + ytid + "\"");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch(IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -13,7 +13,6 @@ import de.mrbesen.youtubecrawler.Crawler.Video;
|
||||||
public class DB {
|
public class DB {
|
||||||
|
|
||||||
private Connection con;
|
private Connection con;
|
||||||
//private String server = "localhost", user = "ytcrawler", pw ="pDWmDhmZKArwvG2q", db = "ytcrawler";
|
|
||||||
private String server = Config.prop.getProperty("db.host", "localhost"), user = Config.prop.getProperty("db.user", "ytcrawler"), pw = Config.prop.getProperty("db.pw", ""), db = Config.prop.getProperty("db.dbname", "ytcrawler");
|
private String server = Config.prop.getProperty("db.host", "localhost"), user = Config.prop.getProperty("db.user", "ytcrawler"), pw = Config.prop.getProperty("db.pw", ""), db = Config.prop.getProperty("db.dbname", "ytcrawler");
|
||||||
private int port = Integer.parseInt(Config.prop.getProperty("db.port", "3306"));
|
private int port = Integer.parseInt(Config.prop.getProperty("db.port", "3306"));
|
||||||
private Logger log = Logger.getLogger(DB.class.getName());
|
private Logger log = Logger.getLogger(DB.class.getName());
|
||||||
|
@ -22,7 +21,7 @@ public class DB {
|
||||||
public DB() {
|
public DB() {
|
||||||
try {
|
try {
|
||||||
connect(false);
|
connect(false);
|
||||||
|
|
||||||
//set the database up!
|
//set the database up!
|
||||||
boolean found = false;
|
boolean found = false;
|
||||||
ResultSet set = con.getMetaData().getCatalogs();//does the db exists?
|
ResultSet set = con.getMetaData().getCatalogs();//does the db exists?
|
||||||
|
@ -41,7 +40,7 @@ public class DB {
|
||||||
con.setCatalog(db);
|
con.setCatalog(db);
|
||||||
update("CREATE TABLE `videos` (`id` varchar(13) NOT NULL,`length` int(11) NOT NULL,`created` int(11) NOT NULL,`langcode` varchar(3) NOT NULL DEFAULT 'en',`category` int(11) DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `id_UNIQUE` (`id`)) ENGINE=InnoDB DEFAULT CHARSET=latin1;");
|
update("CREATE TABLE `videos` (`id` varchar(13) NOT NULL,`length` int(11) NOT NULL,`created` int(11) NOT NULL,`langcode` varchar(3) NOT NULL DEFAULT 'en',`category` int(11) DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `id_UNIQUE` (`id`)) ENGINE=InnoDB DEFAULT CHARSET=latin1;");
|
||||||
|
|
||||||
log.info("Database is set up! -> \n\nFirst Entry in uploaded needed!!!!!!\nPlease insert MANUALY!\n ");
|
log.info("Database is set up!");
|
||||||
}
|
}
|
||||||
} catch (SQLException e) {
|
} catch (SQLException e) {
|
||||||
log.error("Error while connecting to the database! ", e);
|
log.error("Error while connecting to the database! ", e);
|
||||||
|
@ -58,25 +57,26 @@ public class DB {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* removes all videos, that are known from the db
|
* removes all videos, that are known from the db
|
||||||
* @param input
|
* @param input
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public List<String> checkvideos(List<String> input) {
|
public List<String> checkvideos(List<String> input) {
|
||||||
StringBuilder ids = new StringBuilder();
|
if(!input.isEmpty()) {
|
||||||
for(int i = 0; i < input.size(); i++) {
|
StringBuilder ids = new StringBuilder();
|
||||||
ids.append(',').append(input.get(i));
|
for(int i = 0; i < input.size(); i++) {
|
||||||
}
|
ids.append(',').append(input.get(i));
|
||||||
String query = "SELECT `id` FROM `videos` WHERE concat('%',`id`,'%') LIKE '" + ids.toString() + "';";
|
}
|
||||||
ResultSet res = query(query);
|
String query = "SELECT `id` FROM `videos` WHERE concat('%',`id`,'%') LIKE '" + ids.toString() + "';";
|
||||||
try {
|
ResultSet res = query(query);
|
||||||
while(res.next()) {
|
try {
|
||||||
input.remove(res.getString(1));
|
while(res.next()) {
|
||||||
|
input.remove(res.getString(1));
|
||||||
|
}
|
||||||
|
} catch(SQLException e) {
|
||||||
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
} catch(SQLException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
}
|
||||||
return input;
|
return input;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue