Fehler behoben, Profiler, deleteDouble entfernt

This commit is contained in:
MrBesen 2018-10-11 15:32:19 +02:00
parent 33bdc8cf5c
commit 2080602278
7 changed files with 363 additions and 80 deletions

View File

@ -22,12 +22,11 @@ public class Crawler implements Runnable {
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, witch need to be analysed
private LinkedList<String> toCrawl = new LinkedList<>();//all videos tu crawl
private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
//private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
private List<CrawlerThread> threads;//list of all threads
private List<CrawlerThread> requested = new LinkedList<>();
private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss");
private String currentstate = "undefined";
private long start;
private boolean crawl = true;
@ -38,17 +37,11 @@ public class Crawler implements Runnable {
private YoutubeAPI api = new YoutubeAPI();
private File crawlfile = new File("crawl.txt");
private Logger log = Logger.getLogger(this.getClass().getName());
private Profiler profiler = new Profiler();
private int maxvideostotest = 100;
private int startup = 2;//to keep the beginning cool
private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep
public Crawler() {
try {
maxvideostotest = Integer.parseInt(Config.prop.getProperty("crawler.maxvideos"));
} catch(NumberFormatException e) {
log.warn("could not read the number \"" + Config.prop.getProperty("crawler.maxvideos") + "\" from the config file. maxvideo");
maxvideostotest = 100;
}
try {
jobspeerthread = Integer.parseInt(Config.prop.getProperty("crawler.jobspeerthread"));
} catch(NumberFormatException e) {
@ -62,8 +55,13 @@ public class Crawler implements Runnable {
}
public synchronized void addtoCrawl(String videoid) {
if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
toknown.add(videoid);
//if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
//toknown.add(videoid);
if(toCrawl.contains(videoid)) {
LinkedList<String> str = new LinkedList<String>();
str.add(videoid);
db.storeTemp(str);
}
}
public boolean isCrawling() {
@ -89,13 +87,17 @@ public class Crawler implements Runnable {
@Override
public void run() {
currentstate = "loading crawlfile";
profiler.profilingEnabled = true;
profiler.clearProfiling();
profiler.startSection("root");
profiler.startSection("startup");
profiler.startSection("loadingcrawlfile");
start = System.currentTimeMillis();
log.info("Try to load crawlfile");
if(crawlfile.exists()) {
try {
Scanner in = new Scanner(crawlfile);
boolean crawl = true;//section of file
//boolean crawl = true;//section of file
while(in.hasNextLine()) {
String line = in.nextLine();
if(line == null) {
@ -103,13 +105,13 @@ public class Crawler implements Runnable {
} else {
if(!line.isEmpty()) {
if(line.equals("-")) {//section delimiter
crawl = false;
continue;
} else {
if(crawl) {
toCrawl.add(line);
} else {
//if(crawl) {
toCrawl.add(line);
/*} else {
toknown.add(line);
}
}*/
}
}
}
@ -120,8 +122,7 @@ public class Crawler implements Runnable {
e.printStackTrace();
}
}
currentstate = "populate threads";
profiler.endStartSection("populateThreads");
//populate threads
int threadcount = 4;
try {
@ -137,24 +138,28 @@ public class Crawler implements Runnable {
threads.add(thr);
thr.thread.start();
}
currentstate = "delete Double";
profiler.endStartSection("deleteDouble");
long lastdoubledelete = System.currentTimeMillis();
db.deleteDouble();
currentstate = "crawl";
while(crawl) {
log.info("to Crawl: " + toCrawl.size() + " known: " + toknown.size() + " Time: " + dateform.format(new Date()));
//db.deleteDouble();
profiler.endSection();//startup
profiler.endStartSection("main");
boolean savedall = false;//ein 2. durch lauf, um wirklich alles zu speichern
while(crawl || savedall) {
log.info("to Crawl: " + toCrawl.size() + /*" known: " + toknown.size() +*/ " Time: " + dateform.format(new Date()));
if(!crawl)
savedall = true;
try {
//fullfill request
profiler.startSection("fullfill request");
while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) {
log.info("fullfill request");
currentstate = "fullfill requests";
send(requested.remove(0));
}
//kindof idle
profiler.endStartSection("idle");
while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) {
startup = 0;//stop startup count
currentstate = "idle";
if((System.currentTimeMillis() - lastdoubledelete) / 1000 > 1800) {
db.deleteDouble();
lastdoubledelete = System.currentTimeMillis();
@ -169,13 +174,12 @@ public class Crawler implements Runnable {
// updateDB();
}
//nothing left?
if(toknown.isEmpty() && toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon
if(/*toknown.isEmpty() && */toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon
log.warn("nothing left to crawl");
crawl = false;
}
//refil the tocrawl list.
if(!toknown.isEmpty()) {
/*if(!toknown.isEmpty()) {
//check in db for known videos
log.info("Checking the DB");
currentstate = "get new tocrawl";
@ -188,50 +192,69 @@ public class Crawler implements Runnable {
toCrawl.addAll(db.checkvideos(tocheck));
}
// listlock.writeLock().unlock();
}
if(toknown.size() < threadcount * jobspeerthread * 20 && crawl) {
}
while(toknown.size() < threadcount * jobspeerthread * 20 && crawl) {
currentstate = "restoretemp";
log.info("restoreTemp");
LinkedList<String> rest = db.restoreTemp();
toknown.addAll(rest);
}*/
{
profiler.endStartSection("loadCrawl");
boolean joined = true;
while(toCrawl.size() < ( threadcount * jobspeerthread * 3) && crawl) {
if(joined) {
joined = false;
log.info("loadCrawl");
}
LinkedList<String> rest = db.restoreTemp();
toCrawl.addAll(rest);
}
}
//writing crawlfile
profiler.endStartSection("writingcrawlfile");
log.info("Writing Crawlfile");
currentstate = "writing crawlfile";
// listlock.writeLock().lock();
try {
PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
for(String t : toCrawl) {
p.println(t);
}
/*
p.println("-");
for(String t : toknown) {
p.println(t);
}
}*/
p.close();
} catch (IOException e) {
log.error("Error writing crawlfile.", e);
}
//get reports
currentstate = "get report";
profiler.endStartSection("getreport");
log.info("get report");
int count = 0;
for (CrawlerThread crawlerThread : threads) {
currentstate = "get report: " + crawlerThread.thread.getName();
String threadname = crawlerThread.thread.getName();
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
LinkedList<String>[] report = crawlerThread.report();
crawlcount+= report[0].size();
toSave.addAll(report[0]);
crawlerThread.crawled.clear();
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
LinkedList<String> store = new LinkedList<>();
try {
while(!report[1].isEmpty() && store.size() < 50) {
store.add(report[1].removeFirst());
count++;
int count = 0;
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
LinkedList<String> store = null;
try {
if(report[1].size() <= 50) {
store = report[1];
count += report[1].size();
report[1].clear();
} else {
store = new LinkedList<>();
while(!report[1].isEmpty() && store.size() < 50) {
store.add(report[1].removeFirst());
count++;
}
}
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
log.info("no suchelement bla");
@ -239,10 +262,10 @@ public class Crawler implements Runnable {
db.storeTemp(store);
}
log.info(count + " videos added.");
crawlerThread.found.clear();
crawlerThread.thread.interrupt();//free from lock
profiler.endSection();
}
profiler.endStartSection("debug");
long runtimes = (System.currentTimeMillis() - start) / 1000;
if(runtimes < 0)
runtimes = 1;
@ -250,7 +273,7 @@ public class Crawler implements Runnable {
Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
//save to db
currentstate = "save to DB";
profiler.endStartSection("save2DB");
log.info("save " + toSave.size() + " videos to DB.");
while(!toSave.isEmpty()) {
LinkedList<String> videoids = new LinkedList<>();
@ -258,19 +281,26 @@ public class Crawler implements Runnable {
videoids.add(toSave.remove(0));
}
if(videoids.size() > 0) {
profiler.startSection("getinfo");
List<Video> videos = api.getInfos(videoids)[0];
profiler.endStartSection("sendtoDB");
db.addVideos(videos);
profiler.endSection();
}
}
profiler.endSection();
//at the beginning there is maybe just one video to crawl, so keep it calm.
if(startup > 0) {
profiler.startSection("startupsleep");
startup --;
currentstate = "startup sleep";
log.info("startup sleep");
try {
Thread.sleep(2000);
} catch(InterruptedException e) {}
finally {
profiler.endSection();
}
}
} catch(Throwable t) {
log.warn("exception in Crawler!", t);
@ -283,10 +313,20 @@ public class Crawler implements Runnable {
Main.getMain().stop();
}
}
currentstate = "delete Double";
db.deleteDouble();
currentstate = "stop DB";
profiler.endStartSection("cleanup");
profiler.startSection("deleteDouble");
//db.deleteDouble();
profiler.endStartSection("stopDB");
db.stop();
profiler.endSection();
profiler.endSection();//root
log.info("Profiler:");
//for (Result res : profiler.getProfilingData("root")) {
// log.info(res.profilerName + " " + res.usePercentage + "% total: " + res.usePercentage + "%");
//}
for(String s : profiler.getTreeView()) {
log.info(s);
}
//end
long runtimes = (System.currentTimeMillis() - start) / 1000;
@ -295,7 +335,7 @@ public class Crawler implements Runnable {
int runtimem = (int) (runtimes / 60);
float vidps = (crawlcount / (float) runtimes);//videos per second
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
currentstate = "ended.";
Main.getMain().stopcallback();
}
public DB getDB() {
@ -314,21 +354,27 @@ public class Crawler implements Runnable {
int runtimem = (int) (runtimes / 60);
String out = "";
out += "ToCrawl: " + toCrawl.size();
out += "\nToknown: " + toknown.size();
//out += "\nToknown: " + toknown.size();
out += "\nToSave: " + toSave.size();
out += "\nrequested: " + requested.size();
out += "\nRandomBuffer: " + db.getRandomCount();
out += "\nRuntime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )";
out += "\nState: " + currentstate;
out += "\nprofiler: " + profiler.getNameOfLastSection();
out += "\nDBSize: " + db.getDBSize();
out += "\nThread Nr, todo size, locked, requested, crawledsize, foundsize";
for (int i = 0; i < threads.size(); i++) {
CrawlerThread thre = threads.get(i);
out += "\n " + i + " " + thre.todo.size() + " " + thre.lockforreport + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
if(threads != null) {
out += "\nThread Nr, todo size, requested, crawledsize, foundsize";
for (int i = 0; i < threads.size(); i++) {
CrawlerThread thre = threads.get(i);
out += "\n " + i + " " + (thre.lockforreport ? "\uD83D\uDD12" : "\uD83D\uDD13") + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
}
}
return out;
}
/**
 * Returns the current profiling report of this crawler.
 *
 * @return the lines produced by the crawler's {@code Profiler#getTreeView()}
 */
public LinkedList<String> getProfiling() {
return profiler.getTreeView();
}
/**
* Updates old entrys of the DB. currently unused.
*/

View File

@ -69,7 +69,12 @@ public class CrawlerThread implements Runnable {
*/
LinkedList<String>[] report() {
lockforreport = true;
return new LinkedList[] {(LinkedList) crawled.clone(), (LinkedList) found.clone()};
LinkedList[] out = new LinkedList[] {(LinkedList) crawled, (LinkedList) found};
crawled = new LinkedList<>();
found = new LinkedList<>();
lockforreport = false;
thread.interrupt();
return out;
}
private void crawl(String videoid) {

View File

@ -255,6 +255,8 @@ public class DB implements Runnable {
}
public void storeTemp(LinkedList<String> strings) {
if(strings == null)
return;
if(!strings.isEmpty()) {
StringBuilder sb = new StringBuilder();
for(String s : strings) {

View File

@ -2,17 +2,21 @@ package de.mrbesen.youtubecrawler;
import java.io.File;
import java.io.IOException;
import java.lang.Thread.State;
import java.util.ArrayList;
import java.util.Random;
import java.util.Scanner;
import org.apache.log4j.Logger;
import org.json.JSONObject;
import de.mrbesen.telegram.MessageBuilder;
import de.mrbesen.telegram.TelegramAPI;
import de.mrbesen.telegram.commands.CommandHandler;
import de.mrbesen.telegram.event.EventHandler;
import de.mrbesen.telegram.event.EventListener;
import de.mrbesen.telegram.event.events.UserSendMessageEvent;
import de.mrbesen.telegram.log.Log4JLog;
import de.mrbesen.telegram.objects.TUser;
public class Main implements CommandHandler, EventListener{
@ -20,12 +24,14 @@ public class Main implements CommandHandler, EventListener{
private ArrayList<String> admins = new ArrayList<>();//usernames of admins
private String adminstr = null;
private long setadminstr = -1;
private static String abc = "abcdefghijklmnopqrstuvwxyz";
private static String abc = "abcdefghijklmnopqrstuvwxyz";//used for random string generation
private Logger log = Logger.getLogger(this.getClass().getName());
private TelegramAPI tapi;
private Thread mainthread;
private static Main main;
private boolean startcrawler = true;
private Crawler cra;
private Thread crawlerthread;
public static Main getMain() {
return main;
@ -38,16 +44,15 @@ public class Main implements CommandHandler, EventListener{
startcrawl = false;
}
}
main = new Main(startcrawl);
main.run();
}
public Main(boolean startcra) {
startcrawler = startcra;
}
private Crawler cra;
private void run() {
mainthread = Thread.currentThread();
@ -61,9 +66,9 @@ public class Main implements CommandHandler, EventListener{
//starting crawler
cra = new Crawler();
Thread t = new Thread(cra, "Crawler");
crawlerthread = new Thread(cra, "Crawler");
if(startcrawler) {
t.start();
crawlerthread.start();
}
//starting BOT API
@ -72,9 +77,11 @@ public class Main implements CommandHandler, EventListener{
tapi.getCommandManager().registerCommand("admin", this);
tapi.getCommandManager().registerCommand("stats", this);
tapi.getCommandManager().registerCommand("stop", this);
tapi.getCommandManager().registerCommand("profiler", this);
tapi.getEventManager().registerEvent(this);
tapi.setHelpText("Send the command /random to get a random video.");
tapi.setUpdateInterval(2000);
tapi.setLog(new Log4JLog());
tapi.start();
//load admins
@ -94,10 +101,9 @@ public class Main implements CommandHandler, EventListener{
//CLI
Scanner s = new Scanner(System.in);
String in;
while((in= s.nextLine()) != null && t.isAlive()) {
while((in= s.nextLine()) != null && crawlerthread.isAlive()) {
if(in.equalsIgnoreCase("stop")) {
stop();
break;
} else if(in.equalsIgnoreCase("add")) {
log.info("please enter ytid:");
String id = s.nextLine().trim();
@ -113,31 +119,76 @@ public class Main implements CommandHandler, EventListener{
} else if(in.equalsIgnoreCase("crastop")) {
log.info("Stop crawler");
cra.stop();
} else if(in.equalsIgnoreCase("profiler")) {
for(String profline : cra.getProfiling()) {
log.info(profline);
}
}
}
s.close();
log.info("Terminated.");
}
/**
 * Forces a string to an exact width: shorter input is right-padded with
 * spaces, longer input is cut off after {@code length} characters.
 *
 * @param in     the text to fit
 * @param length the exact number of characters the result must have
 * @return {@code in} padded or truncated to {@code length} characters
 */
private String format(String in, int length) {
    StringBuilder padded = new StringBuilder(in);
    while (padded.length() < length) {
        padded.append(' ');
    }
    String fitted = padded.toString();
    return fitted.length() > length ? fitted.substring(0, length) : fitted;
}
public void stop() {
log.info("Stop.");
cra.stop();
log.info("cra stopped");
try {
Thread.sleep(100);
} catch(InterruptedException e) {
e.printStackTrace();
if(startcrawler) {
new Thread(new Runnable() {
@Override
public void run() {
int count = 0;
while(crawlerthread.isAlive()) {
try {
Thread.sleep(20000);
} catch(InterruptedException ignored) {}
count = 0;
for(Thread t : Thread.getAllStackTraces().keySet()) {
String name = format(t.getName(), 15);
log.info(name + ":\t" + t.getState().name());
if(t.getState() == State.RUNNABLE) {
count ++;
}
}
log.info("count: " + count);
}
log.info("Stoped Shutdown Watchdog");
}
}, "Shutdown Watchdog").start();
}
}
/**
 * Shutdown callback: stops the Telegram API, logs that it was stopped and
 * then interrupts the thread stored in {@code mainthread}.
 */
void stopcallback() {
tapi.stop();
log.info("tapi stopped");
mainthread.interrupt();
}
@Override
public boolean onCommand(TUser sender, String cmd, String[] args) {
if(cmd.equals("random")) {
public boolean onCommand(TUser sender, String cmd, String[] args, JSONObject j) {
if(cmd.startsWith("random")) {
String ytid = cra.getDB().getRandom();
sender.sendMessage("https://youtube.com/watch?v=" + ytid);
int chatid = 0;
try {
//log.debug(j.toString());
chatid = j.getJSONObject("chat").getInt("id");
} catch(Throwable t) { t.printStackTrace(); }
//log.info("chatid: " + chatid);
if(chatid != 0) {
tapi.sendMessage(new MessageBuilder().setText("https://youtube.com/watch?v=" + ytid).setReciver(chatid).build());
} else {
sender.sendMessage("https://youtube.com/watch?v=" + ytid);
}
return true;
} else if(cmd.equals("admin")) {
if(admins.contains(sender.getName())) {
@ -153,14 +204,25 @@ public class Main implements CommandHandler, EventListener{
sender.sendMessage(cra.getStats());
return true;
}
} else if(cmd.equals("stop")) {
} else if(cmd.equals("profiler")) {
if(admins.contains(sender.getName())) {
StringBuilder sb = new StringBuilder();
for(String s : cra.getProfiling()) {
sb.append(s).append('\n');
}
sender.sendMessage(sb.toString());
return true;
}
}
/*else if(cmd.equals("stop")) {
if(admins.contains(sender.getName())) {
stop();
sender.sendMessage("Stop.");
log.info("Stopped via Telegram by " + sender.getFirstName());
return true;
}
}
}*/
return false;
}

View File

@ -0,0 +1,168 @@
package de.mrbesen.youtubecrawler;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
/**
 * Hierarchical wall-clock profiler.
 *
 * <p>Sections are opened with {@link #startSection(String)} and closed with
 * {@link #endSection()}; nested sections get a dot-separated, fully-qualified
 * name such as {@code root.main.idle}. The time spent in every section is
 * accumulated (in nanoseconds) and can be rendered as an indented tree with
 * {@link #getTreeView()}.
 *
 * <p>All public methods are synchronized, so a report may be requested from
 * another thread than the one that opens and closes the sections.
 */
public class Profiler {

    /** Maximum nesting depth rendered by the tree view; bounds the recursion. */
    private static final int MAX_TREE_DEPTH = 20;

    /** Stack of the fully-qualified names of all currently open sections. */
    private final List<String> sectionList = new ArrayList<>();
    /** Start timestamps ({@link System#nanoTime()}), parallel to {@link #sectionList}. */
    private final List<Long> timestampList = new ArrayList<>();
    /** Flag profiling enabled; while false every start/end call is a no-op. */
    public volatile boolean profilingEnabled;
    /** Fully-qualified name of the innermost open section, or "" if none is open. */
    private String profilingSection = "";
    /** Accumulated nanoseconds per fully-qualified section name (sorted for stable output). */
    private final Map<String, Long> profilingMap = new TreeMap<>();
    /** {@link System#nanoTime()} of the first started section, or -1 if nothing was started yet. */
    private long start = -1;

    /**
     * Clear profiling: drops all accumulated times, closes every open section
     * and resets the reference time used for the percentages.
     */
    public synchronized void clearProfiling() {
        this.profilingMap.clear();
        this.profilingSection = "";
        this.sectionList.clear();
        // Must be cleared together with sectionList, otherwise both stacks get out of step.
        this.timestampList.clear();
        this.start = -1;
    }

    /**
     * Start section
     *
     * @param name the simple (not dot-qualified) name of the section to open
     *             below the currently open section
     */
    public synchronized void startSection(String name) {
        if (!this.profilingEnabled) {
            return;
        }
        long now = System.nanoTime();
        if (this.start == -1) {
            this.start = now;
        }
        if (this.profilingSection.length() > 0) {
            this.profilingSection = this.profilingSection + ".";
        }
        this.profilingSection = this.profilingSection + name;
        this.sectionList.add(this.profilingSection);
        this.timestampList.add(Long.valueOf(now));
    }

    /**
     * End section: closes the innermost open section and adds the elapsed
     * time to its accumulated total. Does nothing if no section is open.
     */
    public synchronized void endSection() {
        if (!this.profilingEnabled || this.sectionList.isEmpty() || this.timestampList.isEmpty()) {
            // Unbalanced endSection(): nothing to close, so ignore it instead of throwing.
            return;
        }
        long now = System.nanoTime();
        long startedAt = this.timestampList.remove(this.timestampList.size() - 1).longValue();
        this.sectionList.remove(this.sectionList.size() - 1);
        long delta = now - startedAt;

        Long previous = this.profilingMap.get(this.profilingSection);
        long total = previous == null ? delta : previous.longValue() + delta;
        this.profilingMap.put(this.profilingSection, Long.valueOf(total));

        this.profilingSection = this.sectionList.isEmpty() ? ""
                : this.sectionList.get(this.sectionList.size() - 1);
    }

    /** Returns the number of dot-separated segments in a section name. */
    private int sectioncount(String profilername) {
        int count = 1;
        for (char c : profilername.toCharArray()) {
            if (c == '.') {
                count++;
            }
        }
        return count;
    }

    /**
     * Returns the recorded direct children of a section.
     *
     * <p>Reading is side-effect free: the accumulated times are not modified.
     *
     * @param profilerName fully-qualified name of the parent section
     * @return one result per direct child that has been closed at least once,
     *         ordered by name; empty if profiling is disabled
     */
    public synchronized List<Profiler.Result> getProfilingData(String profilerName) {
        if (!this.profilingEnabled) {
            return Collections.<Profiler.Result>emptyList();
        }
        List<Profiler.Result> out = new ArrayList<>();
        long totaltime = System.nanoTime() - this.start;
        int childDepth = sectioncount(profilerName) + 1;
        // Match on "parent." so that e.g. "root.main" does not pick up children of "root.mainloop".
        String prefix = profilerName.isEmpty() ? "" : profilerName + ".";
        for (Map.Entry<String, Long> entry : this.profilingMap.entrySet()) {
            String childName = entry.getKey();
            if (childName.startsWith(prefix) && sectioncount(childName) == childDepth) {
                long subsectiontime = entry.getValue().longValue();
                double totaltimep = (double) subsectiontime * 100.0D / (double) totaltime;
                out.add(new Profiler.Result(childName, totaltimep, subsectiontime));
            }
        }
        return out;
    }

    /**
     * Renders everything recorded below the section {@code root} as an
     * indented tree, one line per section (name, seconds, percentage of the
     * total profiled time).
     *
     * @return the report lines; never {@code null}
     */
    public synchronized LinkedList<String> getTreeView() {
        try {
            return getTreeView("root", "", 0);
        } catch (OutOfMemoryError e) {
            e.printStackTrace();
        }
        // Callers iterate over the result, so hand back an empty report rather than null.
        return new LinkedList<>();
    }

    /**
     * Recursively renders the children of {@code name}.
     *
     * @param name    fully-qualified name of the parent section
     * @param leading indentation prefix for the lines of this level
     * @param depth   current recursion depth, starting at 0
     */
    private LinkedList<String> getTreeView(String name, String leading, int depth) {
        LinkedList<String> out = new LinkedList<>();
        if (depth >= MAX_TREE_DEPTH) {
            return out;
        }
        for (Result res : getProfilingData(name)) {
            // time is in nanoseconds: /10^7 gives centiseconds, /100 gives seconds with two decimals
            out.add(leading + res.profilerName + " " + (res.time / 10000000) / 100D + "s "
                    + form(res.totalUsePercentage) + "% ");
            out.addAll(getTreeView(res.profilerName, leading + "\t", depth + 1));
        }
        return out;
    }

    /** Formats a number truncated to three decimal places. */
    private String form(double d) {
        return (((long) (d * 1000)) / 1000D) + "";
    }

    /**
     * End current section and start a new section
     *
     * @param name the simple name of the section to open next
     */
    public synchronized void endStartSection(String name) {
        this.endSection();
        this.startSection(name);
    }

    /**
     * @return the fully-qualified name of the innermost open section, or
     *         {@code "[UNKNOWN]"} if no section is open
     */
    public synchronized String getNameOfLastSection() {
        return this.sectionList.isEmpty() ? "[UNKNOWN]" : this.sectionList.get(this.sectionList.size() - 1);
    }

    /** Accumulated measurement of a single section. */
    public static final class Result {
        /** Share of the total profiled time, in percent. */
        public double totalUsePercentage;
        /** Fully-qualified (dot-separated) section name. */
        public String profilerName;
        /** Accumulated time in nanoseconds. */
        public long time;

        public Result(String profilerName, double totalUsePercentage, long time) {
            this.profilerName = profilerName;
            this.totalUsePercentage = totalUsePercentage;
            this.time = time;
        }
    }
}

View File

@ -3,7 +3,6 @@ package de.mrbesen.youtubecrawler;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketTimeoutException;

View File

@ -107,6 +107,7 @@ public class YoutubeAPI {
//Seconds
v.length += Integer.parseInt(timeparts[timeparts.length-1]);
} catch(NumberFormatException e) {//failed: P6DT17H59M53S and P15W3DT4H1M11S and P1W2DT20H47M55S video id: 1NPyC0psMaI and P2W2DT23H58M58S video id: Jd9KjbRxhN4 For input string: "W2DT23"
Main.getMain().broadcastAdmin(removeunwanted(split[1]) + " video id: " + v.id);
log.warn("Error saving the time string: " + removeunwanted(split[1]) + " video id: " + v.id, e);
}
} else if(split[0].equals("publishedAt")) {