forked from MrBesen/YoutubeCrawler
Merge branch 'master' of ssh://git.mrbesen.de:2222/MrBesen/YoutubeCrawler
This commit is contained in:
commit
d56f1271c4
|
@ -70,9 +70,9 @@ public class Crawler implements Runnable {
|
||||||
//if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
//if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
||||||
//toknown.add(videoid);
|
//toknown.add(videoid);
|
||||||
if(toCrawl.contains(videoid)) {
|
if(toCrawl.contains(videoid)) {
|
||||||
LinkedList<String> str = new LinkedList<String>();
|
ArrayList<String> str = new ArrayList<String>(1);
|
||||||
str.add(videoid);
|
str.add(videoid);
|
||||||
db.storeTemp(str);
|
db.storeTemp(str, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -255,23 +255,21 @@ public class Crawler implements Runnable {
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
|
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
|
||||||
LinkedList<String> store = null;
|
ArrayList<String> store = null;
|
||||||
try {
|
try {
|
||||||
if(report[1].size() <= 50) {
|
if(report[1].size() <= 50) {
|
||||||
store = report[1];
|
store = new ArrayList<>(report[1]);
|
||||||
count += report[1].size();
|
count += report[1].size();
|
||||||
report[1].clear();
|
report[1].clear();
|
||||||
} else {
|
} else {
|
||||||
store = new LinkedList<>();
|
store = new ArrayList<>(report[1].subList(0, 50));
|
||||||
while(!report[1].isEmpty() && store.size() < 50) {
|
report[1].removeAll(store);
|
||||||
store.add(report[1].removeFirst());
|
count+=50;
|
||||||
count++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
|
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
|
||||||
log.info("no suchelement bla");
|
log.info("no suchelement bla");
|
||||||
}
|
}
|
||||||
db.storeTemp(store);
|
db.storeTemp(store, false);
|
||||||
}
|
}
|
||||||
log.info(count + " videos added from " + threadname);
|
log.info(count + " videos added from " + threadname);
|
||||||
profiler.endSection();
|
profiler.endSection();
|
||||||
|
@ -326,6 +324,22 @@ public class Crawler implements Runnable {
|
||||||
}
|
}
|
||||||
profiler.endSection();//main
|
profiler.endSection();//main
|
||||||
}
|
}
|
||||||
|
profiler.startSection("waitforthreads");
|
||||||
|
for(CrawlerThread ct : threads) {
|
||||||
|
try {
|
||||||
|
ct.thread.join();
|
||||||
|
} catch (InterruptedException ignore) {}
|
||||||
|
}
|
||||||
|
log.info("All Threads Terminated.");
|
||||||
|
|
||||||
|
profiler.endStartSection("insertback");
|
||||||
|
ArrayList<String> putback = new ArrayList<>(threadcount * threads.get(0).undoneSize());//create list with approximated size
|
||||||
|
for(CrawlerThread ct : threads) {
|
||||||
|
putback.addAll(ct.undone());
|
||||||
|
}
|
||||||
|
db.storeTemp(putback, true);
|
||||||
|
profiler.endSection();//insertback
|
||||||
|
|
||||||
profiler.endSection();//root
|
profiler.endSection();//root
|
||||||
log.info("Profiler:");
|
log.info("Profiler:");
|
||||||
for(String s : profiler.getTreeView()) {
|
for(String s : profiler.getTreeView()) {
|
||||||
|
@ -416,3 +430,4 @@ public class Crawler implements Runnable {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package de.mrbesen.youtubecrawler;
|
package de.mrbesen.youtubecrawler;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
@ -31,6 +32,14 @@ public class CrawlerThread implements Runnable {
|
||||||
thread = t;
|
thread = t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LinkedList<String> undone() {
|
||||||
|
return todo;
|
||||||
|
}
|
||||||
|
|
||||||
|
int undoneSize() {
|
||||||
|
return todo.size();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
while(parent.isCrawling()) {
|
while(parent.isCrawling()) {
|
||||||
|
|
|
@ -26,9 +26,13 @@ public class DB implements Runnable {
|
||||||
private Server serv = new Server(this);
|
private Server serv = new Server(this);
|
||||||
private Thread randomrefill = null;
|
private Thread randomrefill = null;
|
||||||
private int dbsize = 0;
|
private int dbsize = 0;
|
||||||
|
|
||||||
private StringBuilder tostorebuffer ;
|
private StringBuilder tostorebuffer ;
|
||||||
private int writebuffersize = 500;
|
private int writebuffersize = 500;
|
||||||
private int writebuffercurrentsize = 0;
|
private int writebuffercurrentsize = 0;
|
||||||
|
|
||||||
|
private StringBuilder totempbuffer;
|
||||||
|
private int writetempbuffercurrentsize = 0;
|
||||||
|
|
||||||
public DB() {
|
public DB() {
|
||||||
try {
|
try {
|
||||||
|
@ -69,6 +73,7 @@ public class DB implements Runnable {
|
||||||
log.warn("could not read the number \"" + Config.prop.getProperty("db.writebuffersize") + "\" from the config file. db.writebuffersize");
|
log.warn("could not read the number \"" + Config.prop.getProperty("db.writebuffersize") + "\" from the config file. db.writebuffersize");
|
||||||
}
|
}
|
||||||
tostorebuffer = new StringBuilder(writebuffersize);
|
tostorebuffer = new StringBuilder(writebuffersize);
|
||||||
|
totempbuffer = new StringBuilder(writebuffersize);
|
||||||
} catch (SQLException e) {
|
} catch (SQLException e) {
|
||||||
log.error("Error while connecting to the database! ", e);
|
log.error("Error while connecting to the database! ", e);
|
||||||
}
|
}
|
||||||
|
@ -143,15 +148,16 @@ public class DB implements Runnable {
|
||||||
}
|
}
|
||||||
if(writebuffercurrentsize > writebuffersize || force) {
|
if(writebuffercurrentsize > writebuffersize || force) {
|
||||||
if(tostorebuffer.length() > 10) {
|
if(tostorebuffer.length() > 10) {
|
||||||
log.info("Write data to DB video count: " + writebuffercurrentsize);
|
log.info("Write databuffer to DB video count: " + writebuffercurrentsize);
|
||||||
dbsize += writebuffercurrentsize;
|
dbsize += writebuffercurrentsize;
|
||||||
tostorebuffer.deleteCharAt(0);//delete leading ','
|
tostorebuffer.deleteCharAt(0);//delete leading ','
|
||||||
String qu = "INSERT IGNORE INTO `ytcrawler`.`videos`(`id`, `length`, `created`, `langcode`, `category`, `videotitle`, `channel`, `tags`) VALUES " + tostorebuffer.toString();
|
String qu = "INSERT IGNORE INTO `ytcrawler`.`videos`(`id`, `length`, `created`, `langcode`, `category`, `videotitle`, `channel`, `tags`) VALUES " + tostorebuffer.toString();
|
||||||
update(qu);
|
update(qu);
|
||||||
|
|
||||||
|
//reset buffer
|
||||||
|
writebuffercurrentsize = 0;
|
||||||
|
tostorebuffer = new StringBuilder(writebuffersize);
|
||||||
}
|
}
|
||||||
//reset buffer
|
|
||||||
writebuffercurrentsize = 0;
|
|
||||||
tostorebuffer = new StringBuilder(writebuffersize);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -271,15 +277,25 @@ public class DB implements Runnable {
|
||||||
log.info("Delete Double done in " + ((System.currentTimeMillis() - start)/60000) + " min");
|
log.info("Delete Double done in " + ((System.currentTimeMillis() - start)/60000) + " min");
|
||||||
}
|
}
|
||||||
|
|
||||||
public void storeTemp(LinkedList<String> strings) {
|
public void storeTemp(ArrayList<String> strings, boolean force) {
|
||||||
if(strings == null)
|
if(strings == null)
|
||||||
return;
|
return;
|
||||||
if(!strings.isEmpty()) {
|
if(!strings.isEmpty()) {
|
||||||
StringBuilder sb = new StringBuilder();
|
log.info("store Temp to buffer: " + strings.size());
|
||||||
|
writetempbuffercurrentsize += strings.size();
|
||||||
for(String s : strings) {
|
for(String s : strings) {
|
||||||
sb.append("'), ('").append(s);
|
totempbuffer.append(", ('").append(s).append("')");
|
||||||
}
|
}
|
||||||
update("INSERT IGNORE INTO `ytcrawler`.`temp` (`ytid`) VALUES ('" + sb.substring(6).toString() + "');");
|
}
|
||||||
|
if(writetempbuffercurrentsize > writebuffersize || force) {
|
||||||
|
log.info("Write Buffer: " + writetempbuffercurrentsize);
|
||||||
|
totempbuffer.deleteCharAt(0);//delete leading ','
|
||||||
|
String qu = "INSERT IGNORE INTO `ytcrawler`.`temp` (`ytid`) VALUES " + totempbuffer.toString() + ";";
|
||||||
|
update(qu);
|
||||||
|
|
||||||
|
//reset
|
||||||
|
writetempbuffercurrentsize = 0;
|
||||||
|
totempbuffer = new StringBuilder(writebuffersize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue