forked from MrBesen/YoutubeCrawler
performance improved
This commit is contained in:
parent
2080602278
commit
4471c0f01d
|
@ -19,6 +19,8 @@ import org.apache.log4j.Logger;
|
|||
public class Crawler implements Runnable {
|
||||
|
||||
private int jobspeerthread = 100; //the number of jobs a thread gets per request
|
||||
int requestlimit = 5;//number of videos left in a thread's todo queue below which it requests new videos
|
||||
private int idlecount = 5;//amount of idle loops allowed
|
||||
|
||||
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, which need to be analysed
|
||||
private LinkedList<String> toCrawl = new LinkedList<>();//all videos to crawl
|
||||
|
@ -31,7 +33,7 @@ public class Crawler implements Runnable {
|
|||
|
||||
private boolean crawl = true;
|
||||
private int crawlcount = 0;
|
||||
private int updateOffset = 0;
|
||||
//private int updateOffset = 0;
|
||||
|
||||
private DB db = new DB();
|
||||
private YoutubeAPI api = new YoutubeAPI();
|
||||
|
@ -48,6 +50,16 @@ public class Crawler implements Runnable {
|
|||
log.warn("could not read the number \"" + Config.prop.getProperty("crawler.jobspeerthread") + "\" from the config file. maxvideo");
|
||||
jobspeerthread = 100;
|
||||
}
|
||||
try {
|
||||
requestlimit = Integer.parseInt(Config.prop.getProperty("crawler.requestlimit"));
|
||||
} catch(NumberFormatException e) {
|
||||
log.warn("could not read the number \"" + Config.prop.getProperty("crawler.requestlimit") + "\" from the config file. crawler.requestlimit");
|
||||
}
|
||||
try {
|
||||
idlecount = Integer.parseInt(Config.prop.getProperty("crawler.idlecount"));
|
||||
} catch(NumberFormatException e) {
|
||||
log.warn("could not read the number \"" + Config.prop.getProperty("crawler.idlecount") + "\" from the config file. crawler.idlecount");
|
||||
}
|
||||
}
|
||||
|
||||
public void stop() {
|
||||
|
@ -122,7 +134,7 @@ public class Crawler implements Runnable {
|
|||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
profiler.endStartSection("populateThreads");
|
||||
profiler.endStartSection("populateThreads");//loading crawlfile closed
|
||||
//populate threads
|
||||
int threadcount = 4;
|
||||
try {
|
||||
|
@ -138,13 +150,14 @@ public class Crawler implements Runnable {
|
|||
threads.add(thr);
|
||||
thr.thread.start();
|
||||
}
|
||||
profiler.endStartSection("deleteDouble");
|
||||
profiler.endStartSection("deleteDouble");//populate threads
|
||||
long lastdoubledelete = System.currentTimeMillis();
|
||||
//db.deleteDouble();
|
||||
profiler.endSection();//deletedouble
|
||||
profiler.endSection();//startup
|
||||
profiler.endStartSection("main");
|
||||
boolean savedall = false;//a second pass, to really save everything
|
||||
while(crawl || savedall) {
|
||||
profiler.startSection("main");
|
||||
log.info("to Crawl: " + toCrawl.size() + /*" known: " + toknown.size() +*/ " Time: " + dateform.format(new Date()));
|
||||
if(!crawl)
|
||||
savedall = true;
|
||||
|
@ -157,21 +170,25 @@ public class Crawler implements Runnable {
|
|||
}
|
||||
|
||||
//kindof idle
|
||||
profiler.endStartSection("idle");
|
||||
while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) {
|
||||
startup = 0;//stop startup count
|
||||
if((System.currentTimeMillis() - lastdoubledelete) / 1000 > 1800) {
|
||||
db.deleteDouble();
|
||||
lastdoubledelete = System.currentTimeMillis();
|
||||
} else {
|
||||
Thread.yield();
|
||||
try {
|
||||
Thread.sleep(100);
|
||||
} catch(InterruptedException ignored) {
|
||||
break;
|
||||
{
|
||||
int count = 0;//don't stay too long in idle!
|
||||
profiler.endStartSection("idle");
|
||||
while(toCrawl.size() > (jobspeerthread * threads.size() * 2) && crawl && requested.isEmpty() && count < idlecount) {
|
||||
count ++;
|
||||
startup = 0;//stop startup count
|
||||
if((System.currentTimeMillis() - lastdoubledelete) / 1000 > 1800) {
|
||||
//db.deleteDouble();
|
||||
lastdoubledelete = System.currentTimeMillis();
|
||||
} else {
|
||||
Thread.yield();
|
||||
try {
|
||||
Thread.sleep(100);
|
||||
} catch(InterruptedException ignored) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// updateDB();
|
||||
}
|
||||
// updateDB();
|
||||
}
|
||||
//nothing left?
|
||||
if(/*toknown.isEmpty() && */toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon
|
||||
|
@ -282,13 +299,13 @@ public class Crawler implements Runnable {
|
|||
}
|
||||
if(videoids.size() > 0) {
|
||||
profiler.startSection("getinfo");
|
||||
List<Video> videos = api.getInfos(videoids)[0];
|
||||
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
|
||||
profiler.endStartSection("sendtoDB");
|
||||
db.addVideos(videos);
|
||||
db.addVideos(videos, false);
|
||||
profiler.endSection();
|
||||
}
|
||||
}
|
||||
profiler.endSection();
|
||||
profiler.endSection();//save2DB
|
||||
|
||||
//at the beginning there is maybe just one video to crawl, so keep it calm.
|
||||
if(startup > 0) {
|
||||
|
@ -299,7 +316,7 @@ public class Crawler implements Runnable {
|
|||
Thread.sleep(2000);
|
||||
} catch(InterruptedException e) {}
|
||||
finally {
|
||||
profiler.endSection();
|
||||
profiler.endSection();//startupsleep
|
||||
}
|
||||
}
|
||||
} catch(Throwable t) {
|
||||
|
@ -312,8 +329,9 @@ public class Crawler implements Runnable {
|
|||
crawl = false;
|
||||
Main.getMain().stop();
|
||||
}
|
||||
profiler.endSection();//main
|
||||
}
|
||||
profiler.endStartSection("cleanup");
|
||||
profiler.startSection("cleanup");
|
||||
profiler.startSection("deleteDouble");
|
||||
//db.deleteDouble();
|
||||
profiler.endStartSection("stopDB");
|
||||
|
@ -321,9 +339,6 @@ public class Crawler implements Runnable {
|
|||
profiler.endSection();
|
||||
profiler.endSection();//root
|
||||
log.info("Profiler:");
|
||||
//for (Result res : profiler.getProfilingData("root")) {
|
||||
// log.info(res.profilerName + " " + res.usePercentage + "% total: " + res.usePercentage + "%");
|
||||
//}
|
||||
for(String s : profiler.getTreeView()) {
|
||||
log.info(s);
|
||||
}
|
||||
|
@ -378,6 +393,7 @@ public class Crawler implements Runnable {
|
|||
/**
|
||||
* Updates old entries of the DB. Currently unused.
|
||||
*/
|
||||
/*
|
||||
private void updateDB() {
|
||||
log.info("updating DB Offset= " + updateOffset);
|
||||
LinkedList<String> vids = db.getUncompleted(50, updateOffset);
|
||||
|
@ -396,6 +412,7 @@ public class Crawler implements Runnable {
|
|||
log.info("Updated " + infos[0].size() + " Videos.");
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
public static class Video {
|
||||
String id = "";
|
||||
|
|
|
@ -43,7 +43,7 @@ public class CrawlerThread implements Runnable {
|
|||
}
|
||||
}
|
||||
crawl(todo.removeFirst());
|
||||
if(todo.size() < 5 && !requested) {
|
||||
if(todo.size() < parent.requestlimit && !requested) {
|
||||
requested = true;
|
||||
parent.request(this);
|
||||
}
|
||||
|
|
|
@ -26,6 +26,8 @@ public class DB implements Runnable {
|
|||
private Server serv = new Server(this);
|
||||
private Thread randomrefill = null;
|
||||
private int dbsize = 0;
|
||||
private ArrayList<Video> tostorebuffer;
|
||||
private int writebuffersize = 500;
|
||||
|
||||
public DB() {
|
||||
try {
|
||||
|
@ -52,20 +54,28 @@ public class DB implements Runnable {
|
|||
|
||||
log.info("Database is set up!");
|
||||
}
|
||||
|
||||
|
||||
serv.start();
|
||||
refillbuffer();
|
||||
|
||||
|
||||
//get db size
|
||||
dbsize();
|
||||
|
||||
//config data
|
||||
try {
|
||||
writebuffersize = Integer.parseInt(Config.prop.getProperty("db.writebuffersize"));
|
||||
} catch(NumberFormatException e) {
|
||||
log.warn("could not read the number \"" + Config.prop.getProperty("db.writebuffersize") + "\" from the config file. db.writebuffersize");
|
||||
}
|
||||
tostorebuffer = new ArrayList<>(writebuffersize);
|
||||
} catch (SQLException e) {
|
||||
log.error("Error while connecting to the database! ", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void dbsize() {
|
||||
try {
|
||||
ResultSet set = query("SELECT count(*) as count FROM `videos`;");
|
||||
ResultSet set = query("SELECT count(*) as count FROM `videos`;");
|
||||
if(set != null) {
|
||||
if(set.next()) {
|
||||
dbsize = set.getInt(1);
|
||||
|
@ -75,12 +85,11 @@ public class DB implements Runnable {
|
|||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns the current value of the {@code dbsize} field.
 *
 * @return the value held in {@code dbsize}
 */
public int getDBSize() {
|
||||
return dbsize;
|
||||
}
|
||||
|
||||
|
||||
private void connect(boolean selectdb) {
|
||||
try {
|
||||
Class.forName("com.mysql.jdbc.Driver");//load the driver; alternative driver to try: com.mysql.cj.jdbc.Driver
|
||||
|
@ -119,18 +128,24 @@ public class DB implements Runnable {
|
|||
* save the list of videos to the DB
|
||||
* @param input
|
||||
*/
|
||||
public void addVideos(List<Video> input) {
|
||||
public void addVideos(List<Video> input, boolean force) {
|
||||
//log.info("add " + input.size() + " videos");
|
||||
if(input.size() > 0) {
|
||||
dbsize += input.size();
|
||||
if(input != null) {
|
||||
if(input.size() > 0) {
|
||||
tostorebuffer.addAll(input);
|
||||
}
|
||||
}
|
||||
if(tostorebuffer.size() > writebuffersize || force) {
|
||||
dbsize += tostorebuffer.size();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for(int i = 0; i< input.size(); i++) {
|
||||
for(int i = 0; i < tostorebuffer.size(); i++) {
|
||||
if(i > 0)
|
||||
sb.append(',');
|
||||
Video v = input.get(i);
|
||||
Video v = tostorebuffer.get(i);
|
||||
if(v != null)
|
||||
sb.append("('").append(v.id).append("','").append(v.length).append("','").append(v.created).append("','").append(v.languageCode).append("','").append(v.categorie).append("','").append(v.title).append("','").append(v.channel).append("','").append(v.tags).append("') ");
|
||||
}
|
||||
tostorebuffer.clear();
|
||||
if(sb.length() > 2) {
|
||||
String qu = "INSERT IGNORE INTO `ytcrawler`.`videos`(`id`, `length`, `created`, `langcode`, `category`, `videotitle`, `channel`, `tags`) VALUES " + sb.toString();
|
||||
update(qu);
|
||||
|
@ -217,7 +232,7 @@ public class DB implements Runnable {
|
|||
randomrefill.start();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public String getRandom() {
|
||||
log.info("Get random Video");
|
||||
if(randombuffer.size() < 10 ) {
|
||||
|
@ -233,7 +248,7 @@ public class DB implements Runnable {
|
|||
/**
 * Returns the number of entries currently held in {@code randombuffer}.
 *
 * @return {@code randombuffer.size()}
 */
public int getRandomCount() {
|
||||
return randombuffer.size();
|
||||
}
|
||||
|
||||
|
||||
public LinkedList<String> restoreTemp() {
|
||||
ResultSet res = query("SELECT * FROM `ytcrawler`.`temp` LIMIT 500;");
|
||||
LinkedList<String> out = new LinkedList<>();
|
||||
|
@ -266,7 +281,7 @@ public class DB implements Runnable {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Stops the randomness server and disconnects
|
||||
*/
|
||||
|
@ -275,6 +290,7 @@ public class DB implements Runnable {
|
|||
try {
|
||||
if(con != null) {
|
||||
if(!con.isClosed()) {
|
||||
addVideos(null, true);
|
||||
con.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,14 +12,14 @@ import org.json.JSONObject;
|
|||
|
||||
import de.mrbesen.telegram.MessageBuilder;
|
||||
import de.mrbesen.telegram.TelegramAPI;
|
||||
import de.mrbesen.telegram.commands.CommandHandler;
|
||||
import de.mrbesen.telegram.commands.JSONCommandHandler;
|
||||
import de.mrbesen.telegram.event.EventHandler;
|
||||
import de.mrbesen.telegram.event.EventListener;
|
||||
import de.mrbesen.telegram.event.events.UserSendMessageEvent;
|
||||
import de.mrbesen.telegram.log.Log4JLog;
|
||||
import de.mrbesen.telegram.objects.TUser;
|
||||
|
||||
public class Main implements CommandHandler, EventListener{
|
||||
public class Main implements JSONCommandHandler, EventListener{
|
||||
|
||||
private ArrayList<String> admins = new ArrayList<>();//usernames of admins
|
||||
private String adminstr = null;
|
||||
|
@ -174,6 +174,12 @@ public class Main implements CommandHandler, EventListener{
|
|||
mainthread.interrupt();
|
||||
}
|
||||
|
||||
/**
 * Three-argument command callback. Marked as unused in this class: it
 * performs no work and reports the command as not handled.
 *
 * @param sender the user that issued the command (ignored here)
 * @param cmd    the command name (ignored here)
 * @param args   the command arguments (ignored here)
 * @return always {@code false}
 */
@Override
|
||||
public boolean onCommand(TUser sender, String cmd, String[] args) {
|
||||
//unused
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean onCommand(TUser sender, String cmd, String[] args, JSONObject j) {
|
||||
if(cmd.startsWith("random")) {
|
||||
|
@ -210,6 +216,8 @@ public class Main implements CommandHandler, EventListener{
|
|||
for(String s : cra.getProfiling()) {
|
||||
sb.append(s).append('\n');
|
||||
}
|
||||
if(sb.length() == 0)
|
||||
sb.append("No Data.");
|
||||
sender.sendMessage(sb.toString());
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -27,6 +27,7 @@ public class Profiler {
|
|||
this.profilingMap.clear();
|
||||
this.profilingSection = "";
|
||||
this.sectionList.clear();
|
||||
start = -1;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -36,6 +37,7 @@ public class Profiler {
|
|||
if (this.profilingEnabled) {
|
||||
if(start == -1)
|
||||
start = System.nanoTime();
|
||||
|
||||
if (this.profilingSection.length() > 0) {
|
||||
this.profilingSection = this.profilingSection + ".";
|
||||
}
|
||||
|
@ -43,6 +45,7 @@ public class Profiler {
|
|||
this.profilingSection = this.profilingSection + name;
|
||||
this.sectionList.add(this.profilingSection);
|
||||
this.timestampList.add(Long.valueOf(System.nanoTime()));
|
||||
//System.out.println("started section: " + name + " currently in: " + profilingSection);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -51,20 +54,23 @@ public class Profiler {
|
|||
*/
|
||||
public void endSection() {
|
||||
if (this.profilingEnabled) {
|
||||
long start = System.nanoTime();
|
||||
long stop = ((Long) this.timestampList.remove(this.timestampList.size() - 1)).longValue();
|
||||
this.sectionList.remove(this.sectionList.size() - 1);
|
||||
long delta = start - stop;
|
||||
try {
|
||||
long start = System.nanoTime();
|
||||
long stop = ((Long) this.timestampList.remove(this.timestampList.size() - 1)).longValue();
|
||||
this.sectionList.remove(this.sectionList.size() - 1);
|
||||
long delta = start - stop;
|
||||
|
||||
if (this.profilingMap.containsKey(this.profilingSection)) {
|
||||
this.profilingMap.put(this.profilingSection,
|
||||
Long.valueOf(((Long) this.profilingMap.get(this.profilingSection)).longValue() + delta));
|
||||
} else {
|
||||
this.profilingMap.put(this.profilingSection, Long.valueOf(delta));
|
||||
if (this.profilingMap.containsKey(this.profilingSection)) {
|
||||
this.profilingMap.put(this.profilingSection,
|
||||
Long.valueOf(((Long) this.profilingMap.get(this.profilingSection)).longValue() + delta));
|
||||
} else {
|
||||
this.profilingMap.put(this.profilingSection, Long.valueOf(delta));
|
||||
}
|
||||
|
||||
this.profilingSection = this.sectionList.isEmpty() ? "" : (String) this.sectionList.get(this.sectionList.size() - 1);
|
||||
} catch (Exception e ) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
this.profilingSection = this.sectionList.isEmpty() ? ""
|
||||
: (String) this.sectionList.get(this.sectionList.size() - 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -78,10 +84,8 @@ public class Profiler {
|
|||
}
|
||||
|
||||
public List<Profiler.Result> getProfilingData(String profilerName) {
|
||||
if (!this.profilingEnabled) {
|
||||
return Collections.<Profiler.Result>emptyList();
|
||||
} else {
|
||||
List<Profiler.Result> out = Lists.<Profiler.Result>newArrayList();
|
||||
if (this.profilingEnabled) {
|
||||
List<Profiler.Result> out = Lists.<Profiler.Result>newArrayList();
|
||||
long totaltime = System.nanoTime() - start;
|
||||
|
||||
//calculate percentage of each child section
|
||||
|
@ -93,44 +97,26 @@ public class Profiler {
|
|||
out.add(new Profiler.Result(prfiler_name, totaltimep, subsectiontime));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (String key : this.profilingMap.keySet()) {
|
||||
this.profilingMap.put(key, Long.valueOf(((Long) this.profilingMap.get(key)).longValue() * 999L / 1000L));
|
||||
}
|
||||
|
||||
return out;
|
||||
} else {
|
||||
return Collections.<Profiler.Result>emptyList();
|
||||
}
|
||||
}
|
||||
|
||||
public LinkedList<String> getTreeView() {
|
||||
/*for(String key : profilingMap.keySet()) {
|
||||
System.out.println(key);
|
||||
}*/
|
||||
//end all sections
|
||||
try {
|
||||
/*LinkedList<String> sections = new LinkedList<>();
|
||||
while(!sectionList.isEmpty()) {
|
||||
String current = sectionList.get(sectionList.size() -1);
|
||||
if(current != null) {
|
||||
sections.add(current.substring(current.lastIndexOf('.')+1));
|
||||
endSection();
|
||||
}
|
||||
}
|
||||
for(String section : sections) {
|
||||
startSection(section);
|
||||
}
|
||||
*/
|
||||
return getTreeView("root", "");
|
||||
} catch(OutOfMemoryError e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
return null;
|
||||
return getTreeView("root", "");
|
||||
}
|
||||
|
||||
private LinkedList<String> getTreeView(String name, String leading) {
|
||||
LinkedList<String> out = new LinkedList<>();
|
||||
if(new Exception().getStackTrace().length > 20)//prevent stack overflow debug only
|
||||
if(new Exception().getStackTrace().length > 50)//prevent stack overflow debug only
|
||||
return out;
|
||||
|
||||
for(Result res : getProfilingData(name)) {
|
||||
out.add(leading + res.profilerName + " " + (res.time/10000000)/100D + "s " + form(res.totalUsePercentage) + "% ");
|
||||
out.addAll(getTreeView(res.profilerName, leading + "\t"));
|
||||
|
|
|
@ -7,6 +7,7 @@ import java.net.URL;
|
|||
import java.text.DateFormat;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
@ -35,7 +36,7 @@ public class YoutubeAPI {
|
|||
return (Video) getInfos(id)[0].get(0);
|
||||
}
|
||||
|
||||
public LinkedList<Video>[] getInfos(List<String> ids) {
|
||||
public List<Video>[] getInfos(List<String> ids) {
|
||||
log.info("get " + ids.size() + " infos");
|
||||
if(ids.isEmpty())
|
||||
return null;
|
||||
|
@ -47,8 +48,8 @@ public class YoutubeAPI {
|
|||
return getInfos(sb.toString());
|
||||
}
|
||||
|
||||
public LinkedList<Video>[] getInfos(String idlist) {
|
||||
LinkedList<Video> out = new LinkedList<Video>();
|
||||
public List<Video>[] getInfos(String idlist) {
|
||||
ArrayList<Video> out = new ArrayList<Video>(idlist.length() / 12);//approximate expected length
|
||||
LinkedList<Video> livestr = new LinkedList<Video>();
|
||||
String nextpage = "";
|
||||
do {
|
||||
|
@ -138,7 +139,7 @@ public class YoutubeAPI {
|
|||
}
|
||||
} while(!nextpage.equals(""));
|
||||
log.info("got " + (out.size() + livestr.size()) + " infos");
|
||||
return new LinkedList[] {out, livestr};
|
||||
return new List[] {out, livestr};
|
||||
}
|
||||
|
||||
private String removeunwanted(String in) {
|
||||
|
|
Loading…
Reference in New Issue