diff --git a/src/de/mrbesen/youtubecrawler/Config.java b/src/de/mrbesen/youtubecrawler/Config.java index a8c4731..003e594 100644 --- a/src/de/mrbesen/youtubecrawler/Config.java +++ b/src/de/mrbesen/youtubecrawler/Config.java @@ -19,12 +19,14 @@ public class Config { private static Map properties = new HashMap() { { - put("db.host" , "localhost" ); - put("db.port" , "3306" ); - put("db.user" , "ytcrawler" ); - put("db.pw" , "" ); - put("db.dbname" , "ytcrawler" ); - put("youtube.apikey", "" ); + put("db.host" , "localhost" ); + put("db.port" , "3306" ); + put("db.user" , "ytcrawler" ); + put("db.pw" , "" ); + put("db.dbname" , "ytcrawler" ); + put("youtube.apikey" , "" ); + put("crawler.maxvideos" , "100" ); + put("crawler.threadcount", "4" ); } }; diff --git a/src/de/mrbesen/youtubecrawler/Crawler.java b/src/de/mrbesen/youtubecrawler/Crawler.java index fd8a37b..278ff57 100644 --- a/src/de/mrbesen/youtubecrawler/Crawler.java +++ b/src/de/mrbesen/youtubecrawler/Crawler.java @@ -5,63 +5,70 @@ import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; +import java.util.ArrayList; import java.util.LinkedList; import java.util.List; import java.util.Scanner; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.log4j.Logger; public class Crawler implements Runnable { + private static int jobspeerthread = 100; //the amount of jobs a thread get peer request + private LinkedList toSave = new LinkedList<>();//all found ytids, witch need to be analysed private LinkedList toCrawl = new LinkedList<>();//all videos tu crawl private LinkedList toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle - private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v="); + private List threads;//list of all threads + private List requested = new LinkedList<>(); + private boolean crawl = true; private int crawlcount = 0; + private DB db = new DB(); private YoutubeAPI api = new YoutubeAPI(); private File crawlfile = new File("crawl.txt"); private Logger log = Logger.getLogger(Crawler.class.getName()); - private static int maxvideostotest = 100; + + private int maxvideostotest; + private int startup = 10;//to keep the beginning cool + + public Crawler() { + try { + maxvideostotest = Integer.parseInt(Config.prop.getProperty("crawler.maxvideos")); + } catch(NumberFormatException e) { + log.warn("could not read the number \"" + Config.prop.getProperty("") + "\" from the config file. maxvideo"); + maxvideostotest = 100; + } + } public void stop() { crawl = false; } - public void addtoCrawl(String videoid) { + public synchronized void addtoCrawl(String videoid) {//TODO some kind of lock! if(! (toCrawl.contains(videoid) || toknown.contains(videoid))) toknown.add(videoid); } - private void crawl(String videoid) { - try { - crawlcount++; - log.info("crawling: " + videoid); - toSave.add(videoid); - HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid); - String s = con.getContent(); - Matcher matcher = linkpattern.matcher(s); - while(matcher.find()) { - int beginytid = matcher.end(); - int endxtid = s.indexOf('"', beginytid); - int endid = s.indexOf('&', beginytid); - if(endid < endxtid) { - endxtid = endid; - } - String ytid = s.substring(beginytid, endxtid); - if(ytid.length() > 9 && ytid.length() <= 12) { - addtoCrawl(ytid); - } else { - log.warn("youtube id has wrong length: \"" + ytid + "\""); - } - } - } catch(IOException e) { - e.printStackTrace(); + public boolean isCrawling() { + return crawl; + } + + public void request(CrawlerThread t) { + if(!toCrawl.isEmpty()) { + send(t); + } else { + requested.add(t); } } + + private void send(CrawlerThread t) { + for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) { + t.todo.add(toCrawl.removeFirst()); + } + t.requested = false; + } @Override public void run() { @@ -86,34 +93,77 @@ public class Crawler implements Runnable { e.printStackTrace(); } } + + //populate threads + int threadcount = 4; + try { + threadcount = Integer.parseInt(Config.prop.getProperty("crawler.threadcount")); + } catch(NumberFormatException e) { + log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config."); + } + threads = new ArrayList<>(threadcount); + + for(int i = 0; i < threadcount; i++) { + CrawlerThread thr = new CrawlerThread( this); + new Thread(thr, "Crawler #" + i).start(); + threads.add(thr); + } + while(crawl) { log.info("to Crawl: " + toCrawl.size()); - while(!toCrawl.isEmpty() && crawl) { - crawl(toCrawl.remove(0)); + //fullfill request + while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) { + send(requested.remove(0)); } - if(toknown.isEmpty()) {//very uncommon - log.warn("nothing left."); - crawl = false; - //delete / clear crawl file - } else { - LinkedList tocheck = new LinkedList<>(); - for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) { - tocheck.add(toknown.removeFirst()); - } - toCrawl.addAll(db.checkvideos(tocheck)); - + + //kindof idle + while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) { + Thread.yield(); try { - PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile))); - for(String t : toCrawl) { - p.println(t); + Thread.sleep(100); + } catch(InterruptedException ignored) { } + } + //nothing left? + if(toknown.isEmpty() && toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon + log.warn("nothing left to crawl"); + crawl = false; + } + + //refil the tocrawl list. + if(!toknown.isEmpty()) { + //check in db for known videos + log.info("Checking the DB"); + while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) { + LinkedList tocheck = new LinkedList<>(); + for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) { + tocheck.add(toknown.removeFirst()); } - p.close(); - } catch (IOException e) { - log.error("Error writing crawlfile.", e); + toCrawl.addAll(db.checkvideos(tocheck)); } } - // System.out.println("try to save " + toSave.size() + " videos."); + + //writing crawlfile + log.info("Writing Crawlfile"); + try { + PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile))); + for(String t : toCrawl) { + p.println(t); + } + p.close(); + } catch (IOException e) { + log.error("Error writing crawlfile.", e); + } + + //get reports + for (CrawlerThread crawlerThread : threads) { + LinkedList report = crawlerThread.report(); + crawlcount+= report.size(); + toSave.addAll(report); + crawlerThread.list.clear(); + } + + //save to db while(!toSave.isEmpty()) { LinkedList videoids = new LinkedList<>(); for(int i = 0; i < 50 && !toSave.isEmpty(); i++) { @@ -124,9 +174,20 @@ public class Crawler implements Runnable { db.addVideos(videos); } } + + if(startup > 0) { + startup --; + try { + Thread.sleep(20000); + } catch(InterruptedException e) {} + } } - long diff = (System.currentTimeMillis() - start)/ 60000; - log.info("Crawling Stopped. Runtime: " + ((int) diff) + "min and " + crawlcount + " videos crawled."); + + //end + long runtimes = (System.currentTimeMillis() - start) / 1000; + int runtimem = (int) (runtimes / 60); + float vidps = (crawlcount / (float) runtimes); + log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " )"); } public static Video getVideo() { @@ -140,4 +201,6 @@ public class Crawler implements Runnable { byte categorie; long created; } + + } diff --git a/src/de/mrbesen/youtubecrawler/CrawlerThread.java b/src/de/mrbesen/youtubecrawler/CrawlerThread.java new file mode 100644 index 0000000..be2a060 --- /dev/null +++ b/src/de/mrbesen/youtubecrawler/CrawlerThread.java @@ -0,0 +1,86 @@ +package de.mrbesen.youtubecrawler; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.log4j.Logger; + +public class CrawlerThread implements Runnable { + + private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v="); + + private Logger log = Logger.getLogger(this.getClass().getName()); + private Crawler parent; + + LinkedList todo = new LinkedList<>();//videos, this thread should crawl + LinkedList list = new LinkedList<>();//videos this thread had crawled + + boolean requested = true;//is a request pending? + + public CrawlerThread( Crawler root) { + parent = root; + root.request(this); + } + + @Override + public void run() { + while(parent.isCrawling()) { + while(!todo.isEmpty() && parent.isCrawling()) { + crawl(todo.removeFirst()); + if(todo.size() < 5 && !requested) { + requested = true; + parent.request(this); + } + } + + if(todo.isEmpty() && !requested) { + requested = true; + parent.request(this); + } + + log.warn("No Object left!"); + Thread.yield(); + try { + Thread.sleep(10000);//sleep for 10 seconds + } catch (InterruptedException ignored) {} + } + log.info("Stopped."); + } + + /** + * returns a linkedlist of all crawled videos + * @return + */ + LinkedList report() { + return list; + } + + private void crawl(String videoid) { + try { + list.add(videoid); + +// log.info("crawling: " + videoid); + HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid); + String s = con.getContent(); + Matcher matcher = linkpattern.matcher(s); + while(matcher.find()) { + int beginytid = matcher.end(); + int endxtid = s.indexOf('"', beginytid); + int endid = s.indexOf('&', beginytid); + if(endid < endxtid) { + endxtid = endid; + } + String ytid = s.substring(beginytid, endxtid); + if(ytid.length() > 9 && ytid.length() <= 12) { + parent.addtoCrawl(ytid); + } else { +// log.warn("youtube id has wrong length: \"" + ytid + "\""); + } + } + } catch(IOException e) { + e.printStackTrace(); + } + } +} \ No newline at end of file diff --git a/src/de/mrbesen/youtubecrawler/DB.java b/src/de/mrbesen/youtubecrawler/DB.java index facad79..e183889 100644 --- a/src/de/mrbesen/youtubecrawler/DB.java +++ b/src/de/mrbesen/youtubecrawler/DB.java @@ -13,7 +13,6 @@ import de.mrbesen.youtubecrawler.Crawler.Video; public class DB { private Connection con; - //private String server = "localhost", user = "ytcrawler", pw ="pDWmDhmZKArwvG2q", db = "ytcrawler"; private String server = Config.prop.getProperty("db.host", "localhost"), user = Config.prop.getProperty("db.user", "ytcrawler"), pw = Config.prop.getProperty("db.pw", ""), db = Config.prop.getProperty("db.dbname", "ytcrawler"); private int port = Integer.parseInt(Config.prop.getProperty("db.port", "3306")); private Logger log = Logger.getLogger(DB.class.getName()); @@ -22,7 +21,7 @@ public class DB { public DB() { try { connect(false); - + //set the database up! boolean found = false; ResultSet set = con.getMetaData().getCatalogs();//does the db exists? @@ -41,7 +40,7 @@ public class DB { con.setCatalog(db); update("CREATE TABLE `videos` (`id` varchar(13) NOT NULL,`length` int(11) NOT NULL,`created` int(11) NOT NULL,`langcode` varchar(3) NOT NULL DEFAULT 'en',`category` int(11) DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `id_UNIQUE` (`id`)) ENGINE=InnoDB DEFAULT CHARSET=latin1;"); - log.info("Database is set up! -> \n\nFirst Entry in uploaded needed!!!!!!\nPlease insert MANUALY!\n "); + log.info("Database is set up!"); } } catch (SQLException e) { log.error("Error while connecting to the database! ", e); @@ -58,25 +57,26 @@ public class DB { } } - /** * removes all videos, that are known from the db * @param input * @return */ public List checkvideos(List input) { - StringBuilder ids = new StringBuilder(); - for(int i = 0; i < input.size(); i++) { - ids.append(',').append(input.get(i)); - } - String query = "SELECT `id` FROM `videos` WHERE concat('%',`id`,'%') LIKE '" + ids.toString() + "';"; - ResultSet res = query(query); - try { - while(res.next()) { - input.remove(res.getString(1)); + if(!input.isEmpty()) { + StringBuilder ids = new StringBuilder(); + for(int i = 0; i < input.size(); i++) { + ids.append(',').append(input.get(i)); + } + String query = "SELECT `id` FROM `videos` WHERE concat('%',`id`,'%') LIKE '" + ids.toString() + "';"; + ResultSet res = query(query); + try { + while(res.next()) { + input.remove(res.getString(1)); + } + } catch(SQLException e) { + e.printStackTrace(); } - } catch(SQLException e) { - e.printStackTrace(); } return input; }