Multithreading

This commit is contained in:
mrbesen 2018-07-16 23:22:32 +02:00
parent 73643625f3
commit 04b098d9ff
4 changed files with 223 additions and 72 deletions

View File

@ -19,12 +19,14 @@ public class Config {
// Configuration keys known to this class, each mapped to a string value.
// Every key is listed exactly once.
// NOTE(review): presumably these are the defaults used when the config file lacks a key - confirm against the loading code.
private static Map<String,String> properties = new HashMap<String, String>() {
    {
        // database connection
        put("db.host"            , "localhost");
        put("db.port"            , "3306"     );
        put("db.user"            , "ytcrawler");
        put("db.pw"              , ""         );
        put("db.dbname"          , "ytcrawler");
        // YouTube API access
        put("youtube.apikey"     , ""         );
        // crawler tuning
        put("crawler.maxvideos"  , "100"      );
        put("crawler.threadcount", "4"        );
    }
};

View File

@ -5,63 +5,70 @@ import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
public class Crawler implements Runnable {
/** Maximum number of video ids handed to a worker thread per request. */
private static int jobspeerthread = 100;

/** Ids reported as crawled by the workers that still have to be written to the database. */
private LinkedList<String> toSave = new LinkedList<>();
/** Ids that passed the database check and are waiting to be handed to a worker. */
private LinkedList<String> toCrawl = new LinkedList<>();
/** Ids found while crawling that have not been checked against the database yet. */
private LinkedList<String> toknown = new LinkedList<>();

/** Matches the start of a watch link; the video id begins right after the match. */
private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");

/** All worker threads started by {@code run()}. */
private List<CrawlerThread> threads;
/** Workers whose request for new jobs could not be served yet. */
private List<CrawlerThread> requested = new LinkedList<>();

/**
 * Main loop flag. It is volatile because the worker threads poll it through
 * {@code isCrawling()} while another thread clears it, and a plain field gives
 * no guarantee that the workers ever see the change.
 */
private volatile boolean crawl = true;
/** Number of videos the workers have reported as crawled so far. */
private int crawlcount = 0;

private DB db = new DB();
// NOTE(review): no use of this field is visible in this part of the file - presumably it fetches the video metadata; confirm.
private YoutubeAPI api = new YoutubeAPI();
/** File the pending {@code toCrawl} ids are written to. */
private File crawlfile = new File("crawl.txt");
private Logger log = Logger.getLogger(Crawler.class.getName());

/** Maximum number of ids checked against the database in one batch; set by the constructor. */
private int maxvideostotest;
/** Number of initial main-loop rounds that are followed by a 20 second pause, to keep the beginning cool. */
private int startup = 10;
public Crawler() {
try {
maxvideostotest = Integer.parseInt(Config.prop.getProperty("crawler.maxvideos"));
} catch(NumberFormatException e) {
log.warn("could not read the number \"" + Config.prop.getProperty("") + "\" from the config file. maxvideo");
maxvideostotest = 100;
}
}
/**
 * Asks the crawler to stop by clearing the flag that the main loop
 * and {@link #isCrawling()} read.
 */
public void stop() {
    this.crawl = false;
}
public void addtoCrawl(String videoid) {
public synchronized void addtoCrawl(String videoid) {//TODO some kind of lock!
if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
toknown.add(videoid);
}
private void crawl(String videoid) {
try {
crawlcount++;
log.info("crawling: " + videoid);
toSave.add(videoid);
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
String s = con.getContent();
Matcher matcher = linkpattern.matcher(s);
while(matcher.find()) {
int beginytid = matcher.end();
int endxtid = s.indexOf('"', beginytid);
int endid = s.indexOf('&', beginytid);
if(endid < endxtid) {
endxtid = endid;
}
String ytid = s.substring(beginytid, endxtid);
if(ytid.length() > 9 && ytid.length() <= 12) {
addtoCrawl(ytid);
} else {
log.warn("youtube id has wrong length: \"" + ytid + "\"");
}
}
} catch(IOException e) {
e.printStackTrace();
/**
 * Tells whether the crawler is still supposed to run.
 *
 * @return the current value of the crawl flag; {@code false} once a stop was requested
 */
public boolean isCrawling() {
    return this.crawl;
}
/**
 * Called by a worker that wants new jobs. If ids are available they are handed
 * over immediately, otherwise the worker is remembered in {@code requested}.
 * The method is synchronized because several workers may call it at the same
 * time and the lists it modifies are not thread-safe.
 * NOTE(review): run() accesses the same lists without this lock, so the
 * synchronization only covers worker-against-worker access.
 *
 * @param t the worker asking for jobs
 */
public synchronized void request(CrawlerThread t) {
    if(toCrawl.isEmpty()) {
        // nothing to hand out right now, remember the worker for later
        requested.add(t);
        return;
    }
    send(t);
}
/**
 * Moves up to {@code jobspeerthread} ids from the front of {@code toCrawl}
 * into the worker's todo list and marks its request as served.
 *
 * @param t the worker that receives the jobs
 */
private void send(CrawlerThread t) {
    int handedOut = 0;
    while(handedOut < jobspeerthread && !toCrawl.isEmpty()) {
        String next = toCrawl.removeFirst();
        t.todo.add(next);
        handedOut++;
    }
    t.requested = false;
}
@Override
public void run() {
@ -86,34 +93,77 @@ public class Crawler implements Runnable {
e.printStackTrace();
}
}
//populate threads
int threadcount = 4;
try {
threadcount = Integer.parseInt(Config.prop.getProperty("crawler.threadcount"));
} catch(NumberFormatException e) {
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
}
threads = new ArrayList<>(threadcount);
for(int i = 0; i < threadcount; i++) {
CrawlerThread thr = new CrawlerThread( this);
new Thread(thr, "Crawler #" + i).start();
threads.add(thr);
}
while(crawl) {
log.info("to Crawl: " + toCrawl.size());
while(!toCrawl.isEmpty() && crawl) {
crawl(toCrawl.remove(0));
//fullfill request
while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) {
send(requested.remove(0));
}
if(toknown.isEmpty()) {//very uncommon
log.warn("nothing left.");
crawl = false;
//delete / clear crawl file
} else {
LinkedList<String> tocheck = new LinkedList<>();
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
tocheck.add(toknown.removeFirst());
}
toCrawl.addAll(db.checkvideos(tocheck));
//kindof idle
while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) {
Thread.yield();
try {
PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
for(String t : toCrawl) {
p.println(t);
Thread.sleep(100);
} catch(InterruptedException ignored) { }
}
//nothing left?
if(toknown.isEmpty() && toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon
log.warn("nothing left to crawl");
crawl = false;
}
//refil the tocrawl list.
if(!toknown.isEmpty()) {
//check in db for known videos
log.info("Checking the DB");
while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) {
LinkedList<String> tocheck = new LinkedList<>();
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
tocheck.add(toknown.removeFirst());
}
p.close();
} catch (IOException e) {
log.error("Error writing crawlfile.", e);
toCrawl.addAll(db.checkvideos(tocheck));
}
}
// System.out.println("try to save " + toSave.size() + " videos.");
//writing crawlfile
log.info("Writing Crawlfile");
try {
PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
for(String t : toCrawl) {
p.println(t);
}
p.close();
} catch (IOException e) {
log.error("Error writing crawlfile.", e);
}
//get reports
for (CrawlerThread crawlerThread : threads) {
LinkedList<String> report = crawlerThread.report();
crawlcount+= report.size();
toSave.addAll(report);
crawlerThread.list.clear();
}
//save to db
while(!toSave.isEmpty()) {
LinkedList<String> videoids = new LinkedList<>();
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
@ -124,9 +174,20 @@ public class Crawler implements Runnable {
db.addVideos(videos);
}
}
if(startup > 0) {
startup --;
try {
Thread.sleep(20000);
} catch(InterruptedException e) {}
}
}
long diff = (System.currentTimeMillis() - start)/ 60000;
log.info("Crawling Stopped. Runtime: " + ((int) diff) + "min and " + crawlcount + " videos crawled.");
//end
long runtimes = (System.currentTimeMillis() - start) / 1000;
int runtimem = (int) (runtimes / 60);
float vidps = (crawlcount / (float) runtimes);
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " )");
}
public static Video getVideo() {
@ -140,4 +201,6 @@ public class Crawler implements Runnable {
byte categorie;
long created;
}
}

View File

@ -0,0 +1,86 @@
package de.mrbesen.youtubecrawler;
import java.io.IOException;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
/**
 * Worker that downloads YouTube watch pages for the ids in its todo list and
 * passes every video id linked from those pages to its parent {@link Crawler}.
 * New jobs are requested from the parent when the todo list runs low.
 *
 * NOTE(review): {@code todo} and {@code list} are plain LinkedLists that the
 * parent crawler accesses directly from its own thread; they are not synchronized.
 */
public class CrawlerThread implements Runnable {

    /** Matches the start of a watch link; the video id begins right after the match. */
    private static final Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");
    /** New jobs are requested once fewer than this many ids are left. */
    private static final int REFILL_THRESHOLD = 5;
    /** Pause in milliseconds while there is nothing to crawl. */
    private static final long IDLE_SLEEP_MS = 10000;

    private final Logger log = Logger.getLogger(this.getClass().getName());

    private final Crawler parent;

    LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
    LinkedList<String> list = new LinkedList<>();//videos this thread had crawled

    // volatile: the parent resets this flag from its own thread when it serves the request
    volatile boolean requested = true;//is a request pending?

    /**
     * Creates the worker and immediately files a first job request with the parent.
     *
     * @param root the crawler that hands out jobs and collects found ids
     */
    public CrawlerThread( Crawler root) {
        parent = root;
        root.request(this);
    }

    /**
     * Crawls the todo list until the parent stops crawling or the thread is
     * interrupted. While no jobs are available the thread sleeps and checks again.
     */
    @Override
    public void run() {
        while(parent.isCrawling()) {
            while(!todo.isEmpty() && parent.isCrawling()) {
                crawl(todo.removeFirst());
                if(todo.size() < REFILL_THRESHOLD) {
                    requestJobs();
                }
            }

            if(!parent.isCrawling()) {
                break; // stop requested: leave without the idle warning and pause
            }

            if(todo.isEmpty()) {
                requestJobs();
            }
            if(!todo.isEmpty()) {
                continue; // the request was served right away, keep crawling
            }

            log.warn("No Object left!");
            try {
                Thread.sleep(IDLE_SLEEP_MS);
            } catch (InterruptedException e) {
                // keep the interrupt visible to whoever runs this thread and treat it as a stop request
                Thread.currentThread().interrupt();
                break;
            }
        }
        log.info("Stopped.");
    }

    /** Files a job request with the parent unless one is already pending. */
    private void requestJobs() {
        if(!requested) {
            requested = true;
            parent.request(this);
        }
    }

    /**
     * Returns the list of all videos this thread has crawled.
     *
     * @return the internal list itself, not a copy
     */
    LinkedList<String> report() {
        return list;
    }

    /**
     * Downloads the watch page of the given video and passes every linked id of
     * plausible length (10 to 12 characters) to the parent. The video is recorded
     * as crawled even when the download fails.
     *
     * @param videoid id of the video whose page is fetched
     */
    private void crawl(String videoid) {
        list.add(videoid);
        try {
            HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
            String s = con.getContent();
            Matcher matcher = linkpattern.matcher(s);
            while(matcher.find()) {
                String ytid = extractId(s, matcher.end());
                if(ytid != null && ytid.length() > 9 && ytid.length() <= 12) {
                    parent.addtoCrawl(ytid);
                }
            }
        } catch(IOException e) {
            log.error("Could not crawl video \"" + videoid + "\".", e);
        }
    }

    /**
     * Cuts the video id out of the page. The id ends at the next {@code "} or
     * {@code &}, whichever comes first.
     *
     * @param html  the page content
     * @param begin index of the first character of the id
     * @return the id, or {@code null} when neither terminator follows {@code begin}
     */
    private static String extractId(String html, int begin) {
        int quote = html.indexOf('"', begin);
        int amp = html.indexOf('&', begin);
        int end;
        if(quote < 0) {
            end = amp;
        } else if(amp < 0) {
            end = quote;
        } else {
            end = Math.min(quote, amp);
        }
        // indexOf yields -1 when nothing is found; substring would throw on that
        return end < 0 ? null : html.substring(begin, end);
    }
}

View File

@ -13,7 +13,6 @@ import de.mrbesen.youtubecrawler.Crawler.Video;
public class DB {
private Connection con;
//private String server = "localhost", user = "ytcrawler", pw ="pDWmDhmZKArwvG2q", db = "ytcrawler";
private String server = Config.prop.getProperty("db.host", "localhost"), user = Config.prop.getProperty("db.user", "ytcrawler"), pw = Config.prop.getProperty("db.pw", ""), db = Config.prop.getProperty("db.dbname", "ytcrawler");
private int port = Integer.parseInt(Config.prop.getProperty("db.port", "3306"));
private Logger log = Logger.getLogger(DB.class.getName());
@ -22,7 +21,7 @@ public class DB {
public DB() {
try {
connect(false);
//set the database up!
boolean found = false;
ResultSet set = con.getMetaData().getCatalogs();//does the db exists?
@ -41,7 +40,7 @@ public class DB {
con.setCatalog(db);
update("CREATE TABLE `videos` (`id` varchar(13) NOT NULL,`length` int(11) NOT NULL,`created` int(11) NOT NULL,`langcode` varchar(3) NOT NULL DEFAULT 'en',`category` int(11) DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `id_UNIQUE` (`id`)) ENGINE=InnoDB DEFAULT CHARSET=latin1;");
log.info("Database is set up! -> \n\nFirst Entry in uploaded needed!!!!!!\nPlease insert MANUALY!\n ");
log.info("Database is set up!");
}
} catch (SQLException e) {
log.error("Error while connecting to the database! ", e);
@ -58,25 +57,26 @@ public class DB {
}
}
/**
* removes all videos, that are known from the db
* @param input
* @return
*/
public List<String> checkvideos(List<String> input) {
StringBuilder ids = new StringBuilder();
for(int i = 0; i < input.size(); i++) {
ids.append(',').append(input.get(i));
}
String query = "SELECT `id` FROM `videos` WHERE concat('%',`id`,'%') LIKE '" + ids.toString() + "';";
ResultSet res = query(query);
try {
while(res.next()) {
input.remove(res.getString(1));
if(!input.isEmpty()) {
StringBuilder ids = new StringBuilder();
for(int i = 0; i < input.size(); i++) {
ids.append(',').append(input.get(i));
}
String query = "SELECT `id` FROM `videos` WHERE concat('%',`id`,'%') LIKE '" + ids.toString() + "';";
ResultSet res = query(query);
try {
while(res.next()) {
input.remove(res.getString(1));
}
} catch(SQLException e) {
e.printStackTrace();
}
} catch(SQLException e) {
e.printStackTrace();
}
return input;
}