447 lines
13 KiB
Java
447 lines
13 KiB
Java
package de.mrbesen.youtubecrawler;
|
|
|
|
import java.io.BufferedWriter;
|
|
import java.io.File;
|
|
import java.io.FileWriter;
|
|
import java.io.IOException;
|
|
import java.io.PrintWriter;
|
|
import java.text.DateFormat;
|
|
import java.text.SimpleDateFormat;
|
|
import java.util.*;
|
|
|
|
import lombok.AllArgsConstructor;
|
|
import lombok.NoArgsConstructor;
|
|
import org.apache.log4j.Logger;
|
|
|
|
public class Crawler implements Runnable {
|
|
|
|
private int jobspeerthread = 100; //the amount of jobs a thread get peer request
|
|
int requestlimit = 5;//amount of videos to be left in the todo que of a thread until it requests new videos
|
|
private int idlecount = 5;//amount of idle loops allowed
|
|
|
|
private Set<String> toSave = new TreeSet<>();//all found ytids, which need to be analysed
|
|
private Set<String> toCrawl = new TreeSet<>();//all videos to crawl
|
|
//private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
|
|
private List<CrawlerThread> threads;//list of all threads
|
|
private List<CrawlerThread> requested = new LinkedList<>();
|
|
|
|
private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss");
|
|
private long start;
|
|
|
|
private boolean crawl = true;
|
|
private int crawlcount = 0;
|
|
//private int updateOffset = 0;
|
|
|
|
private DB db = new DB();
|
|
private YoutubeAPI api = new YoutubeAPI(Config.prop.getProperty("youtube.apikey"));
|
|
private File crawlfile = new File("crawl.txt");
|
|
private Logger log = Logger.getLogger(this.getClass().getName());
|
|
private Profiler profiler = new Profiler();
|
|
private long lastadminreport = 0;
|
|
|
|
private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep
|
|
|
|
/**
 * Reads the tuning parameters from the config file; each falls back to its
 * field default when the property is missing or not a number.
 */
public Crawler() {
	jobspeerthread = readIntConfig("crawler.jobspeerthread", jobspeerthread);
	requestlimit = readIntConfig("crawler.requestlimit", requestlimit);
	idlecount = readIntConfig("crawler.idlecount", idlecount);
}

/**
 * Parses an integer property from {@link Config#prop}.
 *
 * @param key      property name to look up
 * @param fallback value returned (and kept) when the property is absent or malformed
 * @return the parsed value, or {@code fallback} on failure
 */
private int readIntConfig(String key, int fallback) {
	try {
		// Integer.parseInt(null) also throws NumberFormatException, so a
		// missing property falls through to the fallback as well.
		return Integer.parseInt(Config.prop.getProperty(key));
	} catch(NumberFormatException e) {
		log.warn("could not read the number \"" + Config.prop.getProperty(key) + "\" from the config file. " + key);
		return fallback;
	}
}
|
|
|
|
public void stop() {
|
|
crawl = false;
|
|
}
|
|
|
|
public synchronized void addtoCrawl(String videoid) {
|
|
//if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
|
//toknown.add(videoid);
|
|
if(toCrawl.contains(videoid)) {
|
|
ArrayList<String> str = new ArrayList<>(1);
|
|
str.add(videoid);
|
|
db.storeTemp(str, false);
|
|
}
|
|
}
|
|
|
|
public boolean isCrawling() {
|
|
return crawl;
|
|
}
|
|
|
|
public void request(CrawlerThread t) {
|
|
if(!toCrawl.isEmpty()) {
|
|
send(t);
|
|
} else {
|
|
requested.add(t);
|
|
}
|
|
}
|
|
|
|
private void send(CrawlerThread t) {
|
|
synchronized (toCrawl) {
|
|
for (int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
|
|
String s = toCrawl.stream().findAny().get();
|
|
toCrawl.remove(s);
|
|
t.todo.add(s);
|
|
}
|
|
}
|
|
t.requested = false;
|
|
}
|
|
|
|
private void loadCrawlFile() {
|
|
log.info("Try to load crawlfile");
|
|
if(crawlfile.exists()) {
|
|
try {
|
|
Scanner in = new Scanner(crawlfile);
|
|
//boolean crawl = true;//section of file
|
|
while(in.hasNextLine()) {
|
|
String line = in.nextLine();
|
|
if(line == null) {
|
|
break;
|
|
} else {
|
|
if(!line.isEmpty()) {
|
|
if(line.equals("-")) {//section delimiter
|
|
continue;
|
|
} else {
|
|
//if(crawl) {
|
|
toCrawl.add(line);
|
|
/*} else {
|
|
toknown.add(line);
|
|
}*/
|
|
}
|
|
}
|
|
}
|
|
}
|
|
in.close();
|
|
} catch(IOException e) {
|
|
log.warn("Error while loading crawl file.");
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
}
|
|
|
|
private int createThreads() {
|
|
//populate threads
|
|
int threadcount = 4;
|
|
try {
|
|
threadcount = Integer.parseInt(Config.prop.getProperty("crawler.threadcount"));
|
|
} catch(NumberFormatException e) {
|
|
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
|
|
}
|
|
threads = new ArrayList<>(threadcount);
|
|
CrawlerThread.initLib(threadcount);
|
|
for(int i = 0; i < threadcount; i++) {
|
|
CrawlerThread thr = new CrawlerThread( this, i);
|
|
thr.setThread(new Thread(thr, "Crawler #" + i));
|
|
threads.add(thr);
|
|
thr.thread.start();
|
|
}
|
|
return threadcount;
|
|
}
|
|
|
|
private void getreports() {
|
|
log.info("get report");
|
|
for (CrawlerThread crawlerThread : threads) {
|
|
String threadname = crawlerThread.thread.getName();
|
|
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
|
|
List<String>[] report = crawlerThread.report();
|
|
crawlcount+= report[0].size();
|
|
toSave.addAll(report[0]);
|
|
crawlerThread.crawled.clear();
|
|
|
|
int count = 0;
|
|
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
|
|
ArrayList<String> store = null;
|
|
try {
|
|
if(report[1].size() <= 50) {
|
|
store = new ArrayList<>(report[1]);
|
|
count += report[1].size();
|
|
report[1].clear();
|
|
} else {
|
|
store = new ArrayList<>(report[1].subList(0, 50));
|
|
report[1].removeAll(store);
|
|
count+=50;
|
|
}
|
|
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
|
|
log.info("no suchelement bla");
|
|
}
|
|
db.storeTemp(store, false);
|
|
}
|
|
log.info(count + " videos added from " + threadname);
|
|
profiler.endSection();
|
|
}
|
|
}
|
|
|
|
private void savetodb() {
|
|
log.info("save " + toSave.size() + " videos to DB.");
|
|
synchronized (toSave) {
|
|
while (!toSave.isEmpty()) {
|
|
Set<String> videoids = new TreeSet<>();
|
|
for (int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
|
String save = toSave.stream().findAny().get();
|
|
toSave.remove(save);
|
|
videoids.add(save);
|
|
}
|
|
if (videoids.size() > 0) {
|
|
profiler.startSection("getinfo");
|
|
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
|
|
profiler.endStartSection("sendtoDB");
|
|
db.addVideos(videos, false);
|
|
profiler.endSection();//sendtoDB
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
private void sendAdminMessage() {
|
|
long currenttime = System.currentTimeMillis();
|
|
if((currenttime - lastadminreport) / 1000 > 3600) {
|
|
long runtimes = (currenttime - start) / 1000;
|
|
if (runtimes < 0)
|
|
runtimes = 1;
|
|
float vidps = (crawlcount / (float) runtimes);//videos per second
|
|
Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
|
|
lastadminreport = currenttime;
|
|
}
|
|
}
|
|
|
|
/**
 * Main crawler loop. Startup: load the persisted queue from crawl.txt, spawn
 * the worker threads. Each iteration then: serves parked worker requests,
 * idles while the queue is comfortably full, refills the queue from the temp
 * table, checkpoints the queue to crawl.txt, collects worker reports, sends
 * the hourly admin stats and flushes finished videos to the DB. On stop the
 * loop runs one extra pass (savedall) to persist everything, joins the
 * workers and writes their unfinished work back to the temp table.
 * Profiler sections are opened/closed in strict pairs throughout.
 */
@Override
public void run() {
	profiler.profilingEnabled = true;
	profiler.clearProfiling();
	profiler.startSection("root");
	profiler.startSection("startup");
	profiler.startSection("loadingcrawlfile");
	start = System.currentTimeMillis();
	loadCrawlFile();
	profiler.endStartSection("populateThreads");//loading crawlfile closed
	int threadcount = createThreads();
	profiler.endStartSection("deleteDouble");//populate threads
	long lastdoubledelete = System.currentTimeMillis();
	//db.deleteDouble();
	profiler.endSection();//deletedouble
	profiler.endSection();//startup
	boolean savedall = false;//a second pass, to really save everything
	while(crawl || savedall) {
		profiler.startSection("main");
		log.info("to Crawl: " + toCrawl.size() + /*" known: " + toknown.size() +*/ " Time: " + dateform.format(new Date()));
		if(!crawl)
			savedall = true;//stop was requested: this is the final flush pass
		try {
			//fullfill request: hand queued ids to workers that ran dry
			profiler.startSection("fullfill request");
			while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) {
				log.info("fullfill request");
				send(requested.remove(0));
			}

			//kindof idle: nothing to do while the queue is more than 2x full
			{
				int count = 0;//donst stay to long in idle!
				profiler.endStartSection("idle");
				while(toCrawl.size() > (jobspeerthread * threads.size() * 2) && crawl && requested.isEmpty() && count < idlecount) {
					count ++;
					startup = 0;//stop startup count
					if((System.currentTimeMillis() - lastdoubledelete) / 1000 > 1800) {
						//db.deleteDouble();
						lastdoubledelete = System.currentTimeMillis();
					} else {
						Thread.yield();
						try {
							Thread.sleep(100);
						} catch(InterruptedException ignored) {
							break;
						}
					}
				}
			}
			//nothing left? all workers parked and the queue is empty
			if(/*toknown.isEmpty() && */toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon
				log.warn("nothing left to crawl");
			}

			//refill the in-memory queue from the temp table up to 3x capacity
			{
				profiler.endStartSection("loadCrawl");
				boolean joined = true;//log "loadCrawl" only once per refill burst
				while(toCrawl.size() < ( threadcount * jobspeerthread * 3) && crawl) {
					if(joined) {
						joined = false;
						log.info("loadCrawl");
					}
					LinkedList<String> rest = db.restoreTemp();
					toCrawl.addAll(rest);
				}
			}

			//writing crawlfile: checkpoint the queue so a crash loses nothing
			profiler.endStartSection("writingcrawlfile");
			log.info("Writing Crawlfile");
			try {
				PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
				for(String t : toCrawl) {
					p.println(t);
				}
				p.close();
			} catch (IOException e) {
				log.error("Error writing crawlfile.", e);
			}

			//get reports from the worker threads
			profiler.endStartSection("getreport");
			getreports();

			profiler.endStartSection("debug");
			sendAdminMessage();

			//save to db
			profiler.endStartSection("save2DB");
			savetodb();
			profiler.endSection();//save2DB

			//at the beginning there is maybe just one video to crawl, so keep it calm.
			if(startup > 0) {
				profiler.startSection("startupsleep");
				startup --;
				log.info("startup sleep");
				try {
					Thread.sleep(2000);
				} catch(InterruptedException e) {}
				finally {
					profiler.endSection();//startupsleep
				}
			}
		} catch(Throwable t) {
			//any uncaught problem: report to the admins and shut the whole crawler down
			log.warn("exception in Crawler!", t);
			StringBuilder sb = new StringBuilder();
			for(StackTraceElement elem : t.getStackTrace()) {
				sb.append(elem.getFileName() + "(").append(elem.getMethodName() + ":").append(elem.getLineNumber() + ")\n");
			}
			Main.getMain().broadcastAdmin("Excpetion in crawler: " + t.toString() + "\n" + sb.toString() );
			crawl = false;
			Main.getMain().stop();
		}
		profiler.endSection();//main
	}

	//shutdown: wait for every worker to finish its current batch
	profiler.startSection("waitforthreads");
	for(CrawlerThread ct : threads) {
		try {
			ct.thread.join();
		} catch (InterruptedException ignore) {}
	}
	log.info("All Threads Terminated.");

	//persist the work the workers did not get to
	profiler.endStartSection("insertback");
	ArrayList<String> putback = new ArrayList<>(threadcount * threads.get(0).undoneSize());//create list with approximated size
	for(CrawlerThread ct : threads) {
		putback.addAll(ct.undone());
	}
	db.storeTemp(putback, true);
	profiler.endSection();//insertback

	profiler.endSection();//root
	log.info("Profiler:");
	for(String s : profiler.getTreeView()) {
		log.info(s);
	}

	//end: final runtime statistics
	long runtimes = (System.currentTimeMillis() - start) / 1000;
	if(runtimes < 0)
		runtimes = 1;
	int runtimem = (int) (runtimes / 60);
	float vidps = (crawlcount / (float) runtimes);//videos per second
	log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
	CrawlerThread.deinitLib();
	Main.getMain().stopcallback();
}
|
|
|
|
public DB getDB() {
|
|
return db;
|
|
}
|
|
|
|
public static Video getVideo() {
|
|
return new Video();
|
|
}
|
|
|
|
public String getStats() {
|
|
long runtimes = (System.currentTimeMillis() - start) / 1000;
|
|
if(runtimes < 0)
|
|
runtimes = 1;
|
|
float vidps = (crawlcount / (float) runtimes);//videos per second
|
|
int runtimem = (int) (runtimes / 60);
|
|
String out = "";
|
|
out += "ToCrawl: " + toCrawl.size();
|
|
//out += "\nToknown: " + toknown.size();
|
|
out += "\nToSave: " + toSave.size();
|
|
out += "\nrequested: " + requested.size();
|
|
out += "\nRandomBuffer: " + db.getRandomCount();
|
|
out += "\nRuntime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )";
|
|
out += "\nprofiler: " + profiler.getNameOfLastSection();
|
|
out += "\nDBSize: " + db.getDBSize();
|
|
if(threads != null) {
|
|
out += "\nThread Nr, todo size, requested, crawledsize, foundsize";
|
|
for (int i = 0; i < threads.size(); i++) {
|
|
CrawlerThread thre = threads.get(i);
|
|
out += "\n " + i + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
|
|
}
|
|
}
|
|
return out;
|
|
}
|
|
|
|
public LinkedList<String> getProfiling() {
|
|
return profiler.getTreeView();
|
|
}
|
|
|
|
/**
 * Updates old entries of the DB. Currently unused.
 */
|
|
/*
|
|
private void updateDB() {
|
|
log.info("updating DB Offset= " + updateOffset);
|
|
LinkedList<String> vids = db.getUncompleted(50, updateOffset);
|
|
LinkedList<Video>[] infos = api.getInfos(vids);
|
|
if(infos != null) {
|
|
int size = infos[0].size() + infos[1].size();
|
|
if(size < 50) {
|
|
updateOffset += ((50-size)/2)+1;
|
|
}
|
|
if(infos[1].size() > 0) {
|
|
log.info("delete " + infos[1].size() + " livestreams");
|
|
db.removeVideos(infos[1]);
|
|
}
|
|
|
|
db.updateVideos(infos[0]);
|
|
log.info("Updated " + infos[0].size() + " Videos.");
|
|
}
|
|
}
|
|
*/
|
|
|
|
/**
 * Plain value holder for one YouTube video's metadata. Lombok generates the
 * all-args and no-args constructors; fields are package-visible and mutable
 * so the DB/API layers in this package can fill them directly.
 */
@AllArgsConstructor
@NoArgsConstructor
public static class Video {
	String id = "";//YouTube video id
	String title = "";
	String channel = "";//channel the video belongs to
	String tags = "";//tag list, stored as one string
	int length = 0;//the length of the video in seconds
	String languageCode = "";
	byte categorie = 0;//YouTube category id
	long created = 0;//upload timestamp (epoch-based; unit not evident here — TODO confirm ms vs s)
	boolean live = false;//true if this is/was a livestream
}
|
|
|
|
}
|
|
|