YoutubeCrawler/src/main/de/mrbesen/youtubecrawler/DB.java

343 lines
9.8 KiB
Java
Raw Normal View History

2018-07-15 21:30:12 +02:00
package de.mrbesen.youtubecrawler;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
2018-07-20 19:57:43 +02:00
import java.util.LinkedList;
2018-07-15 21:30:12 +02:00
import java.util.List;
import java.util.Random;
2018-07-15 21:30:12 +02:00
import org.apache.log4j.Logger;
2018-07-20 19:57:43 +02:00
import com.mysql.cj.jdbc.exceptions.MysqlDataTruncation;
2018-07-15 21:30:12 +02:00
import de.mrbesen.youtubecrawler.Crawler.Video;
public class DB implements Runnable {
2018-07-15 21:30:12 +02:00
private Connection con;
private String server = Config.prop.getProperty("db.host", "localhost"), user = Config.prop.getProperty("db.user", "ytcrawler"), pw = Config.prop.getProperty("db.pw", ""), db = Config.prop.getProperty("db.dbname", "ytcrawler");
private int port = Integer.parseInt(Config.prop.getProperty("db.port", "3306"));
2018-07-15 22:09:37 +02:00
private Logger log = Logger.getLogger(DB.class.getName());
private ArrayList<String> randombuffer = new ArrayList<>(100);
private Random rand = new Random();
private Thread randomrefill = null;
2018-10-08 11:22:45 +02:00
private int dbsize = 0;
2018-11-14 11:39:21 +01:00
2018-10-12 20:03:53 +02:00
private StringBuilder tostorebuffer ;
2018-10-12 17:22:36 +02:00
private int writebuffersize = 500;
2018-10-12 20:03:53 +02:00
private int writebuffercurrentsize = 0;
2018-11-14 11:39:21 +01:00
private StringBuilder totempbuffer;
private int writetempbuffercurrentsize = 0;
2018-07-15 21:30:12 +02:00
2018-07-16 12:43:56 +02:00
/**
 * Sets up the database access: prepares the write buffers, connects to the server, selects the
 * configured database (creating it and its tables when it does not exist yet), starts the first
 * refill of the random buffer and reads the current number of stored videos.
 *
 * Failures are logged, not thrown. When no connection could be established the object is still
 * constructed, but no database work is attempted here.
 */
public DB() {
    // Prepare the write buffers first: they must exist even when the database is unreachable,
    // otherwise addVideos/storeTemp would later fail on a null buffer.
    String configuredSize = Config.prop.getProperty("db.writebuffersize");
    try {
        writebuffersize = Integer.parseInt(configuredSize);
    } catch(NumberFormatException e) {
        log.warn("could not read the number \"" + configuredSize + "\" from the config file. db.writebuffersize");
    }
    tostorebuffer = new StringBuilder(writebuffersize);
    totempbuffer = new StringBuilder(writebuffersize);

    connect(false);
    if(con == null) {
        // connect() has already logged the cause
        log.error("No database connection available, skipping the database setup.");
        return;
    }

    try {
        // does the configured database exist?
        boolean found = false;
        try (ResultSet set = con.getMetaData().getCatalogs()) {
            while(set.next()) {
                if(set.getString(1).equalsIgnoreCase(db)) {
                    found = true;
                    con.setCatalog(db);
                    break;
                }
            }
        }

        if(!found) {
            log.warn("Database not found! tring to create!");
            // create the database and its two tables: videos and temp
            // NOTE(review): the `videos` table created here has no `videotitle`, `channel` or
            // `tags` column, although the INSERT/UPDATE statements of this class write them.
            // Confirm the intended column types and extend this statement.
            update("CREATE DATABASE `" + db + "` /*!40100 DEFAULT CHARACTER SET latin1*/;");
            con.setCatalog(db);
            update("CREATE TABLE `videos` (`id` varchar(13) NOT NULL,`length` int(11) NOT NULL,`created` int(11) NOT NULL,`langcode` varchar(3) NOT NULL DEFAULT 'en',`category` int(11) DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `id_UNIQUE` (`id`)) ENGINE=InnoDB DEFAULT CHARSET=latin1;");
            update("CREATE TABLE `temp` ( `ytid` varchar(13) NOT NULL COMMENT 'a Table to store Video ids, when they are found to process them later', PRIMARY KEY (`ytid`), UNIQUE KEY `ytid_UNIQUE` (`ytid`)) ENGINE=InnoDB DEFAULT CHARSET=utf8;");
            log.info("Database is set up!");
        }

        refillbuffer();

        // get db size
        dbsize();
    } catch (SQLException e) {
        log.error("Error while connecting to the database! ", e);
    }
}
2018-10-12 17:22:36 +02:00
2018-10-08 11:22:45 +02:00
private void dbsize() {
try {
2018-10-12 17:22:36 +02:00
ResultSet set = query("SELECT count(*) as count FROM `videos`;");
2018-10-08 11:22:45 +02:00
if(set != null) {
if(set.next()) {
dbsize = set.getInt(1);
}
}
} catch(SQLException e) {
e.printStackTrace();
}
}
2018-10-12 17:22:36 +02:00
2018-10-08 11:22:45 +02:00
/**
 * @return the cached number of videos: the count read by dbsize() plus the entries
 *         added to it on each write-buffer flush
 */
public int getDBSize() {
    return this.dbsize;
}
2018-07-16 12:43:56 +02:00
private void connect(boolean selectdb) {
try {
//verbinden
con = DriverManager.getConnection("jdbc:mysql://" + server + ":" + port + "/" + (selectdb ? db : "") + "?serverTimezone=UTC&verifyServerCertificate=false&useSSL=true&useUnicode=true&characterEncoding=utf-8", user, pw);
}catch (SQLException e) {
2018-07-16 12:43:56 +02:00
log.error("Error while connecting to the database! ", e);
}
2018-07-15 21:30:12 +02:00
}
/**
* removes all videos, that are known from the db
* @param input
* @return
*/
public List<String> checkvideos(List<String> input) {
2018-07-16 23:22:32 +02:00
if(!input.isEmpty()) {
StringBuilder ids = new StringBuilder();
for(int i = 0; i < input.size(); i++) {
ids.append(',').append(input.get(i));
}
String query = "SELECT `id` FROM `videos` WHERE concat('%',`id`,'%') LIKE '" + ids.toString() + "';";
ResultSet res = query(query);
try {
while(res.next()) {
input.remove(res.getString(1));
}
} catch(SQLException e) {
e.printStackTrace();
2018-07-15 21:30:12 +02:00
}
}
return input;
}
/**
* save the list of videos to the DB
* @param input
*/
2018-10-12 20:03:53 +02:00
public void addVideos(ArrayList<Video> input, boolean force) {
2018-10-12 17:22:36 +02:00
if(input != null) {
if(input.size() > 0) {
2018-10-21 21:57:36 +02:00
writebuffercurrentsize += input.size();
2018-10-12 20:03:53 +02:00
for(int i = 0; i < input.size(); i++) {
Video v = input.get(i);
if(v != null)
2021-10-24 23:10:37 +02:00
tostorebuffer.append(",('").append(escape(v.id)).append("',").append(v.length).append(",").append(v.created).append(",'").append(escape(v.languageCode)).append("',").append(v.categorie).append(",'").append(escape(v.title)).append("','").append(escape(v.channel)).append("','").append(escape(v.tags)).append("') ");
2018-10-12 20:03:53 +02:00
}
2018-10-12 17:22:36 +02:00
}
}
2018-10-12 20:03:53 +02:00
if(writebuffercurrentsize > writebuffersize || force) {
if(tostorebuffer.length() > 10) {
2018-11-14 11:39:21 +01:00
log.info("Write databuffer to DB video count: " + writebuffercurrentsize);
2018-10-12 20:03:53 +02:00
dbsize += writebuffercurrentsize;
tostorebuffer.deleteCharAt(0);//delete leading ','
2021-10-18 17:19:22 +02:00
String qu = "INSERT IGNORE INTO `videos`(`id`, `length`, `created`, `langcode`, `category`, `videotitle`, `channel`, `tags`) VALUES " + tostorebuffer.toString();
2018-07-20 19:57:43 +02:00
update(qu);
2018-11-14 11:39:21 +01:00
//reset buffer
writebuffercurrentsize = 0;
tostorebuffer = new StringBuilder(writebuffersize);
2018-07-15 21:30:12 +02:00
}
2018-07-20 19:57:43 +02:00
}
}
2021-10-24 23:10:37 +02:00
/**
 * Escapes a value so that it can be placed between single quotes in a MySQL statement.
 *
 * Backslashes are doubled before the quotes are escaped. Without that, a value ending in a
 * backslash would turn the closing quote of the literal into an escaped quote.
 *
 * @param e the raw value; must not be null
 * @return the value with every backslash and single quote prefixed by a backslash
 * @throws NullPointerException when e is null
 */
private String escape(String e) {
    return e.replace("\\", "\\\\").replace("'", "\\'");
}
2018-07-20 19:57:43 +02:00
/**
 * Writes the current data of every given video to its existing row, one statement per video.
 *
 * @param input the videos to update; null entries are skipped
 */
public void updateVideos(List<Video> input) {
    log.info("Updateing " + input.size() + " videos.");
    for(Video candidate : input) {
        if(candidate == null) {
            continue;
        }
        updateVideo(candidate);
    }
}
2018-07-20 19:57:43 +02:00
private void updateVideo(Video v) {
try {
2021-10-24 23:10:37 +02:00
String qu = "UPDATE `videos` SET `length` = " + v.length + ", `created` = " + v.created + ", `langcode` = SUBSTR('" + v.languageCode + "', 1, 3) ,`category` = " + v.categorie + ",`videotitle` = SUBSTR('" + escape(v.title) + "',1,100),`channel` = SUBSTR('" + escape(v.channel) + "',1,20),`tags` = '" + escape(v.tags) + "' WHERE `id` = '" + escape(v.id) + "';";
2018-07-15 21:30:12 +02:00
update(qu);
2018-07-20 19:57:43 +02:00
} catch(NullPointerException e) {
2018-07-20 19:57:43 +02:00
}
}
2018-07-20 19:57:43 +02:00
/**
 * Reads the ids of videos whose `channel` column is still NULL.
 *
 * @param limit the maximum number of ids to return
 * @param offset the number of matching rows to skip
 * @return the ids found; empty when nothing matches or the query failed
 */
public LinkedList<String> getUncompleted(int limit, int offset) {
    LinkedList<String> out = new LinkedList<>();
    String sql = "SELECT `id` FROM `videos` WHERE `channel` IS NULL LIMIT " + offset + "," + limit + ";";
    // query() returns null on failure; try-with-resources accepts a null resource and skips closing it
    try (ResultSet resu = query(sql)) {
        if(resu != null) {
            while(resu.next()) {
                out.add(resu.getString(1));
            }
        }
    } catch (SQLException e) {
        log.error("error", e);
    }
    return out;
}
2018-07-20 19:57:43 +02:00
public void removeVideos(LinkedList<Video> vids) {
log.info("Delete " + vids.size() + " videos.");
for(Video s : vids) {
2021-10-24 23:10:37 +02:00
update("DELETE FROM `videos` WHERE `id`='" + escape(s.id) + "';");
2018-07-15 21:30:12 +02:00
}
}
/**
* instant query
* @param q
* @return Das resultSet der Query
*/
public ResultSet query(String q) {
2018-07-15 21:30:12 +02:00
try {
2018-07-16 12:43:56 +02:00
if(con.isClosed()) {
connect(true);
}
2018-07-15 21:30:12 +02:00
return con.prepareStatement(q).executeQuery();
} catch (SQLException e) {
log.error("Fehler bim ausführen der Query: " + q, e);
}
return null;//ERROR!
}
/**
* instant update
* @param q
*/
public void update(String q) {
try {
2018-07-16 12:43:56 +02:00
if(con.isClosed()) {
connect(true);
}
2018-07-15 21:30:12 +02:00
con.prepareStatement(q).executeUpdate();
2018-07-20 19:57:43 +02:00
} catch (MysqlDataTruncation ignore) {
log.info("truncated.", ignore);
2018-07-15 21:30:12 +02:00
} catch (SQLException e) {
log.error("Fehler bim ausführen der Update-Query: " + q, e);
}
}
2018-07-19 17:59:26 +02:00
/**
 * Starts a thread named "Randomrefill" that executes run(), unless the randomrefill field
 * already references a thread.
 */
private void refillbuffer() {
    if(randomrefill != null) {
        return;
    }
    Thread worker = new Thread(this, "Randomrefill");
    randomrefill = worker;
    worker.start();
}
2018-10-12 17:22:36 +02:00
2018-07-19 17:59:26 +02:00
public String getRandom() {
log.info("Get random Video");
if(randombuffer.size() < 10 ) {
refillbuffer();
}
if(randombuffer.isEmpty()) {
log.warn("randombuffer is empty!");
2018-07-19 17:59:26 +02:00
return null;
}
return randombuffer.remove(0);
2018-07-19 17:59:26 +02:00
}
2018-10-04 22:52:18 +02:00
/**
 * @return the number of video ids currently held in the random buffer
 */
public int getRandomCount() {
    final int buffered = randombuffer.size();
    return buffered;
}
2018-10-12 17:22:36 +02:00
public LinkedList<String> restoreTemp() {
2021-10-18 17:19:22 +02:00
ResultSet res = query("SELECT * FROM `temp` LIMIT 500;");
LinkedList<String> out = new LinkedList<>();
log.info("RestoreTemp");
try {
while(res.next()) {
out.add(res.getString(1));
}
2021-10-18 17:19:22 +02:00
update("DELETE FROM `temp` LIMIT 500;");
} catch (Exception e) {}
return out;
}
2018-09-11 00:05:10 +02:00
public void deleteDouble() {
log.info("Started Delete Double");
long start = System.currentTimeMillis();
2021-10-18 17:19:22 +02:00
update("CALL deletedouble();");
log.info("Delete Double done in " + ((System.currentTimeMillis() - start)/60000) + " min");
2018-09-11 00:05:10 +02:00
}
2018-11-14 11:39:21 +01:00
public void storeTemp(ArrayList<String> strings, boolean force) {
if(strings == null)
return;
if(!strings.isEmpty()) {
2018-11-14 11:39:21 +01:00
log.info("store Temp to buffer: " + strings.size());
writetempbuffercurrentsize += strings.size();
for(String s : strings) {
2021-10-24 23:10:37 +02:00
totempbuffer.append(", ('").append(escape(s)).append("')");
}
2018-11-14 11:39:21 +01:00
}
if(writetempbuffercurrentsize > writebuffersize || force) {
log.info("Write Buffer: " + writetempbuffercurrentsize);
totempbuffer.deleteCharAt(0);//delete leading ','
2021-10-18 17:19:22 +02:00
String qu = "INSERT IGNORE INTO `temp` (`ytid`) VALUES " + totempbuffer.toString() + ";";
2018-11-14 11:39:21 +01:00
update(qu);
//reset
writetempbuffercurrentsize = 0;
totempbuffer = new StringBuilder(writebuffersize);
}
}
2018-10-12 17:22:36 +02:00
/**
* Stops the randomnes-Server and disconnect
*/
public void stop() {
try {
if(con != null) {
if(!con.isClosed()) {
2018-10-12 17:22:36 +02:00
addVideos(null, true);
2021-10-25 17:51:46 +02:00
con.commit();
con.close();
}
}
} catch (SQLException e) {
e.printStackTrace();
}
}
/**
* runable, of Thread for randomrefill
*/
@Override
public void run() {
2018-09-25 23:56:20 +02:00
log.info("Started Refilling.");
try {
2021-10-18 15:36:01 +02:00
ResultSet set = query("SELECT `id` FROM `videos` ORDER BY rand() LIMIT 100;");
if(set != null) {
while(set.next()) {
randombuffer.add(set.getString(1));
}
2021-10-18 15:36:01 +02:00
log.info("refilled randombuffer to " + randombuffer.size() + " videos.");
}
} catch (SQLException e) {
log.warn("error getting a random video", e);
}
2021-10-18 15:36:01 +02:00
if(randombuffer.isEmpty()) {
log.error("Unable to retrieve RandomVideos");
}
randomrefill = null;
}
2018-07-15 21:30:12 +02:00
}