DB updater & stuff
This commit is contained in:
parent
442c4c0630
commit
0a7dc697a0
|
@ -28,7 +28,7 @@ public class Crawler implements Runnable {
|
|||
private List<CrawlerThread> requested = new LinkedList<>();
|
||||
private static DateFormat dateform = new SimpleDateFormat("dd-MM-yyyy HH:mm:ss");
|
||||
private long start;
|
||||
|
||||
|
||||
private boolean crawl = true;
|
||||
private int crawlcount = 0;
|
||||
|
||||
|
@ -71,7 +71,7 @@ public class Crawler implements Runnable {
|
|||
requested.add(t);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void send(CrawlerThread t) {
|
||||
listlock.writeLock().lock();
|
||||
for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
|
||||
|
@ -125,13 +125,13 @@ public class Crawler implements Runnable {
|
|||
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
|
||||
}
|
||||
threads = new ArrayList<>(threadcount);
|
||||
|
||||
|
||||
for(int i = 0; i < threadcount; i++) {
|
||||
CrawlerThread thr = new CrawlerThread( this);
|
||||
new Thread(thr, "Crawler #" + i).start();
|
||||
threads.add(thr);
|
||||
}
|
||||
|
||||
int updateOffset = 0;
|
||||
while(crawl) {
|
||||
log.info("to Crawl: " + toCrawl.size() + " known: " + toknown.size() + " Time: " + dateform.format(new Date()));
|
||||
|
||||
|
@ -139,23 +139,39 @@ public class Crawler implements Runnable {
|
|||
while(!requested.isEmpty() && !toCrawl.isEmpty() && crawl) {
|
||||
send(requested.remove(0));
|
||||
}
|
||||
|
||||
|
||||
//kind of idle
|
||||
while(toCrawl.size() > (jobspeerthread * threads.size()) && crawl && requested.isEmpty()) {
|
||||
startup = 0;//stop startup count
|
||||
Thread.yield();
|
||||
try {
|
||||
Thread.sleep(100);
|
||||
Thread.sleep(5000);
|
||||
} catch(InterruptedException ignored) {
|
||||
break;
|
||||
}
|
||||
log.info("updating DB Offset= " + updateOffset);
|
||||
LinkedList<String> vids = db.getUncompleted(50, updateOffset);
|
||||
LinkedList<Video>[] infos = api.getInfos(vids);
|
||||
if(infos != null) {
|
||||
int size = infos[0].size() + infos[1].size();
|
||||
if(size < 50) {
|
||||
updateOffset += ((50-size)/2)+1;
|
||||
}
|
||||
if(infos[1].size() > 0) {
|
||||
log.info("delete " + infos[1].size() + " livestreams");
|
||||
db.removeVideos(infos[1]);
|
||||
}
|
||||
|
||||
db.updateVideos(infos[0]);
|
||||
log.info("Updated " + infos[0].size() + " Videos.");
|
||||
}
|
||||
}
|
||||
//nothing left?
|
||||
if(toknown.isEmpty() && toCrawl.isEmpty() && requested.size() == threads.size()) {//very uncommon
|
||||
log.warn("nothing left to crawl");
|
||||
crawl = false;
|
||||
}
|
||||
|
||||
|
||||
//refill the toCrawl list.
|
||||
if(!toknown.isEmpty()) {
|
||||
//check in db for known videos
|
||||
|
@ -170,7 +186,7 @@ public class Crawler implements Runnable {
|
|||
}
|
||||
listlock.writeLock().unlock();
|
||||
}
|
||||
|
||||
|
||||
//writing crawlfile
|
||||
log.info("Writing Crawlfile");
|
||||
listlock.writeLock().lock();
|
||||
|
@ -189,7 +205,7 @@ public class Crawler implements Runnable {
|
|||
} finally {
|
||||
listlock.writeLock().unlock();
|
||||
}
|
||||
|
||||
|
||||
//get reports
|
||||
for (CrawlerThread crawlerThread : threads) {
|
||||
LinkedList<String> report = crawlerThread.report();
|
||||
|
@ -206,11 +222,11 @@ public class Crawler implements Runnable {
|
|||
videoids.add(toSave.remove(0));
|
||||
}
|
||||
if(videoids.size() > 0) {
|
||||
List<Video> videos = api.getInfos(videoids);
|
||||
List<Video> videos = api.getInfos(videoids)[0];
|
||||
db.addVideos(videos);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//at the beginning there is maybe just one video to crawl, so keep it calm.
|
||||
if(startup > 0) {
|
||||
startup --;
|
||||
|
@ -232,7 +248,7 @@ public class Crawler implements Runnable {
|
|||
public DB getDB() {
|
||||
return db;
|
||||
}
|
||||
|
||||
|
||||
public static Video getVideo() {
|
||||
return new Video();
|
||||
}
|
||||
|
@ -247,14 +263,23 @@ public class Crawler implements Runnable {
|
|||
log.info("Toknown:" + toknown.size());
|
||||
log.info("ToSave:" + toSave.size());
|
||||
log.info("Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
|
||||
log.info("Thread Nr, todo size, requested, listsize");
|
||||
for (int i = 0; i < threads.size(); i++) {
|
||||
CrawlerThread thre = threads.get(i);
|
||||
log.info(" " + i + " " + thre.todo.size() + " " + thre.requested + " " + thre.list.size());
|
||||
}
|
||||
}
|
||||
|
||||
public static class Video {
|
||||
String id;
|
||||
int length;//the length of the video in seconds
|
||||
String languageCode;
|
||||
byte categorie;
|
||||
long created;
|
||||
String id = "";
|
||||
String title = "";
|
||||
String channel = "";
|
||||
String tags = "";
|
||||
int length = 0;//the length of the video in seconds
|
||||
String languageCode = "";
|
||||
byte categorie = 0;
|
||||
long created = 0;
|
||||
boolean live = false;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -4,10 +4,13 @@ import java.sql.Connection;
|
|||
import java.sql.DriverManager;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import com.mysql.cj.jdbc.exceptions.MysqlDataTruncation;
|
||||
|
||||
import de.mrbesen.youtubecrawler.Crawler.Video;
|
||||
|
||||
public class DB {
|
||||
|
@ -93,10 +96,50 @@ public class DB {
|
|||
if(i > 0)
|
||||
sb.append(',');
|
||||
Video v = input.get(i);
|
||||
sb.append("('").append(v.id).append("','").append(v.length).append("','").append(v.created).append("','").append(v.languageCode).append("','").append(v.categorie).append("') ");
|
||||
sb.append("('").append(v.id).append("','").append(v.length).append("','").append(v.created).append("','").append(v.languageCode).append("','").append(v.categorie).append("','").append(v.title).append("','").append(v.channel).append("','").append(v.tags).append("') ");
|
||||
}
|
||||
String qu = "INSERT IGNORE INTO `ytcrawler`.`videos`(`id`, `length`, `created`, `langcode`, `category`) VALUES " + sb.toString();
|
||||
if(sb.length() > 2) {
|
||||
String qu = "INSERT IGNORE INTO `ytcrawler`.`videos`(`id`, `length`, `created`, `langcode`, `category`, `videotitle`, `channel`, `tags`) VALUES " + sb.toString();
|
||||
update(qu);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void updateVideos(List<Video> input) {
|
||||
log.info("Updateing " + input.size() + " videos.");
|
||||
for(Video v : input) {
|
||||
if(v != null)
|
||||
updateVideo(v);
|
||||
}
|
||||
}
|
||||
|
||||
private void updateVideo(Video v) {
|
||||
try {
|
||||
String qu = "UPDATE `ytcrawler`.`videos` SET `length` = '" + v.length + "', `created` = '" + v.created + "', `langcode` = SUBSTR('" + v.languageCode + "', 1, 3) ,`category` = '" + v.categorie + "',`videotitle` = SUBSTR('" + v.title + "',1,100),`channel` = SUBSTR('" + v.channel + "',1,20),`tags` = '" + v.tags.substring(0, v.tags.length() > 40 ? 40 : v.tags.length()) + "' WHERE `id` = '" + v.id + "';";
|
||||
update(qu);
|
||||
} catch(NullPointerException e) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public LinkedList<String> getUncompleted(int limit, int offset) {
|
||||
LinkedList<String> out = new LinkedList<>();
|
||||
String sql = "SELECT `id` FROM `videos` WHERE `channel` IS NULL LIMIT " + offset + "," + limit + ";";
|
||||
ResultSet resu = query(sql);
|
||||
try {
|
||||
while(resu.next()) {
|
||||
out.add(resu.getString(1));
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
log.info("error", e);
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
public void removeVideos(LinkedList<Video> vids) {
|
||||
log.info("Delete " + vids.size() + " videos.");
|
||||
for(Video s : vids) {
|
||||
update("DELETE FROM `ytcrawler`.`videos` WHERE `id`='" + s.id + "';");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -105,7 +148,8 @@ public class DB {
|
|||
* @param q
|
||||
* @return Das resultSet der Query
|
||||
*/
|
||||
public ResultSet query(String q) {
|
||||
|
||||
public ResultSet query(String q) {
|
||||
try {
|
||||
if(con.isClosed()) {
|
||||
connect(true);
|
||||
|
@ -127,6 +171,8 @@ public class DB {
|
|||
connect(true);
|
||||
}
|
||||
con.prepareStatement(q).executeUpdate();
|
||||
} catch (MysqlDataTruncation ignore) {
|
||||
log.info("truncated.", ignore);
|
||||
} catch (SQLException e) {
|
||||
log.error("Fehler bim ausführen der Update-Query: " + q, e);
|
||||
}
|
||||
|
|
|
@ -32,13 +32,13 @@ public class YoutubeAPI {
|
|||
}
|
||||
|
||||
public Video getInfo(String id) {
|
||||
return getInfos(id).get(0);
|
||||
return (Video) getInfos(id)[0].get(0);
|
||||
}
|
||||
|
||||
public LinkedList<Video> getInfos(List<String> ids) {
|
||||
public LinkedList<Video>[] getInfos(List<String> ids) {
|
||||
log.info("get " + ids.size() + " infos");
|
||||
if(ids.isEmpty())
|
||||
return new LinkedList<Video>();
|
||||
return null;
|
||||
|
||||
StringBuilder sb = new StringBuilder(ids.remove(0));
|
||||
while(!ids.isEmpty()) {
|
||||
|
@ -47,10 +47,9 @@ public class YoutubeAPI {
|
|||
return getInfos(sb.toString());
|
||||
}
|
||||
|
||||
|
||||
public LinkedList<Video> getInfos(String idlist) {
|
||||
//System.out.println("get info vor " + idlist);
|
||||
public LinkedList<Video>[] getInfos(String idlist) {
|
||||
LinkedList<Video> out = new LinkedList<Video>();
|
||||
LinkedList<Video> livestr = new LinkedList<Video>();
|
||||
String nextpage = "";
|
||||
do {
|
||||
String query = basequery + idlist + "&key=" + api_key;
|
||||
|
@ -60,6 +59,7 @@ public class YoutubeAPI {
|
|||
try {
|
||||
String line;
|
||||
Video v = null;
|
||||
boolean tags = false;
|
||||
while((line = br.readLine()) != null) {
|
||||
String split[] = line.split(":",2);
|
||||
if(split.length == 2) {
|
||||
|
@ -70,9 +70,26 @@ public class YoutubeAPI {
|
|||
v.languageCode = removeunwanted(split[1]);
|
||||
} else if(split[0].equals("defaultLanguage")) {
|
||||
v.languageCode = removeunwanted(split[1]);
|
||||
} else if(split[0].equals("title")) {
|
||||
if(v.title.isEmpty())
|
||||
v.title = removeunwanted(split[1]);
|
||||
} else if(split[0].equals("channelTitle")) {
|
||||
v.channel = removeunwanted(split[1]);
|
||||
} else if(split[0].equals("defaultLanguage")) {
|
||||
v.languageCode = removeunwanted(split[1]);
|
||||
} else if(split[0].equals("tags")) {
|
||||
tags = true;
|
||||
} else if(split[0].equals("liveBroadcastContent")) {
|
||||
v.live = !removeunwanted(split[1]).equalsIgnoreCase("none");
|
||||
} else if(split[0].equals("id")) {
|
||||
if(v != null)
|
||||
out.add(v);
|
||||
if(v != null) {
|
||||
if(!v.live)
|
||||
out.add(v);
|
||||
else {
|
||||
livestr.add(v);
|
||||
log.info("livestream found! " + v.id + " " + v.channel);
|
||||
}
|
||||
}
|
||||
v = new Video();
|
||||
v.id = removeunwanted(split[1]);
|
||||
//System.out.println("new video: " + v.id + " " + v.length + " " + v.languageCode);
|
||||
|
@ -89,19 +106,27 @@ public class YoutubeAPI {
|
|||
}
|
||||
//Seconds
|
||||
v.length += Integer.parseInt(timeparts[timeparts.length-1]);
|
||||
} catch(NumberFormatException e) {
|
||||
log.warn("Error saving the time string: " + removeunwanted(split[1]), e);
|
||||
} catch(NumberFormatException e) {//failed: P6DT17H59M53S and P15W3DT4H1M11S and P1W2DT20H47M55S video id: 1NPyC0psMaI
|
||||
log.warn("Error saving the time string: " + removeunwanted(split[1]) + " video id: " + v.id, e);
|
||||
}
|
||||
} else if(split[0].equals("publishedAt")) {
|
||||
String tmp = removeunwanted(split[1]);
|
||||
tmp = tmp.replace('T', ' ');
|
||||
tmp = tmp.substring(0, tmp.length()-5);
|
||||
Date d = dateformat.parse(tmp);
|
||||
v.created = d.getTime();
|
||||
v.created = d.getTime() / 1000;
|
||||
}else if(split[0].equals("nextPageToken")) {
|
||||
nextpage = "&pageToken=" + removeunwanted(split[1]);
|
||||
// System.out.println("nextpage set to " + nextpage);
|
||||
}
|
||||
} else {
|
||||
if(line.contains("]")) {
|
||||
if(v.tags.length() > 1)
|
||||
v.tags = v.tags.substring(1);
|
||||
tags = false;
|
||||
} else if(tags) {
|
||||
v.tags += ", " + removeunwanted(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
out.add(v);//add the last video
|
||||
|
@ -111,11 +136,12 @@ public class YoutubeAPI {
|
|||
}
|
||||
}
|
||||
} while(!nextpage.equals(""));
|
||||
return out;
|
||||
log.info("got " + (out.size() + livestr.size()) + " infos");
|
||||
return new LinkedList[] {out, livestr};
|
||||
}
|
||||
|
||||
private String removeunwanted(String in) {
|
||||
return in.replaceAll("[\"}{\\,]", "").replaceAll("'", "''").trim();
|
||||
return in.replaceAll("[\"}{\\,\\\\]", "").replaceAll("'", "").trim();
|
||||
}
|
||||
|
||||
public BufferedReader connect(String url) {
|
||||
|
@ -130,4 +156,4 @@ public class YoutubeAPI {
|
|||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue