Compare commits

...

17 Commits

18 changed files with 648 additions and 471 deletions

7
.gitignore vendored
View File

@ -3,3 +3,10 @@
.project
.settings/*
.classpath
admins
crawl.conf
crawl.txt
*.so
.idea/
.vscode/settings.json
includes/

18
.vscode/c_cpp_properties.json vendored Normal file
View File

@ -0,0 +1,18 @@
{
"configurations": [
{
"name": "Linux",
"includePath": [
"${workspaceFolder}/**",
"${workspaceFolder}/includes/",
"/usr/lib/jvm/java-8-openjdk-amd64/include/",
"/usr/lib/jvm/java-8-openjdk-amd64/include/linux/"
],
"defines": [],
"compilerPath": "/usr/bin/clang",
"cStandard": "c11",
"cppStandard": "c++17"
}
],
"version": 4
}

11
Makefile Normal file
View File

@ -0,0 +1,11 @@
# Generates the JNI header for de.mrbesen.youtubecrawler.CrawlerThread into includes/,
# reading the class from the jar-with-dependencies in target/.
createhfiles:
mkdir -p includes/
javah -d includes/ -classpath target/YoutubeCrawler-0.0.2-jar-with-dependencies.jar de.mrbesen.youtubecrawler.CrawlerThread
# Builds cpp/crawlerthread.cpp into the shared library libcrawlerthread.so and links libcurl.
# The JNI include paths are hard-coded to a java-8-openjdk-amd64 installation.
compile:
g++ -shared -fPIC -o libcrawlerthread.so cpp/crawlerthread.cpp -Iincludes/ -I/usr/lib/jvm/java-8-openjdk-amd64/include/ -I/usr/lib/jvm/java-8-openjdk-amd64/include/linux/ -lcurl
# Removes the generated headers and the built library.
clean:
rm -rf includes/ libcrawlerthread.so

112
cpp/crawlerthread.cpp Normal file
View File

@ -0,0 +1,112 @@
#include "de_mrbesen_youtubecrawler_CrawlerThread.h"
#include <iostream>
#include <set>
#include <string>
#include <sstream>
#include <regex>
#include <vector>
#include <curl/curl.h>
// Prefix of a watch-page URL; crawl() appends the video id to it.
static const std::string YOUTUBEBASE = "https://youtube.com/watch?v=";
// Matches "watch?v=<id>" and captures the id: exactly 11 characters out of [-_a-zA-Z0-9].
static const std::regex YOUTUBELINKPATTERN("watch\\?v=([-_a-zA-Z0-9]{11})");
// curl easy handles, filled by initLib() (one per requested thread) and indexed by thread id in crawl().
std::vector<CURL*> curls;
/**
 * libcurl write callback: appends one received chunk to the
 * std::ostringstream passed in `userp` (installed via CURLOPT_WRITEDATA).
 *
 * @param contents pointer to the received bytes (not NUL-terminated)
 * @param size     size of one element
 * @param nmemb    number of elements
 * @param userp    the std::ostringstream* that collects the response body
 * @return number of bytes consumed; any value other than size * nmemb
 *         makes libcurl abort the transfer
 */
static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp) {
    const size_t total = size * nmemb;
    std::ostringstream& out = *static_cast<std::ostringstream*>(userp);
    // write the raw bytes directly instead of building a temporary std::string per chunk
    out.write(static_cast<const char*>(contents), static_cast<std::streamsize>(total));
    // if the stream went bad, report a short write so the transfer is aborted
    return out ? total : 0;
}
/**
 * JNI: CrawlerThread.initLib(int threadCount).
 * Creates `threadcount` curl easy handles and appends them to `curls`;
 * crawl() later selects its handle by thread id.
 *
 * @param threadcount number of handles to create; values <= 0 create nothing
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_initLib(JNIEnv* env, jclass clazz, jint threadcount) {
    if(threadcount <= 0) {
        // a negative count would be converted to a huge size_t by reserve() and throw into the JVM
        return;
    }
    curls.reserve(curls.size() + static_cast<size_t>(threadcount));
    for(jint i = 0; i < threadcount; ++i) {
        CURL* curl = curl_easy_init();
        if(curl == nullptr) {
            std::cout << "Curl error: could not create handle #" << i << std::endl;
        } else {
            // libcurl reads this option as a long through varargs, so the literal must be 1L
            curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
            curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
        }
        // always push (even a null handle) so that vector index == thread id stays true
        curls.push_back(curl);
    }
}
/**
 * JNI: CrawlerThread.deinitLib().
 * Hands every handle stored in `curls` to curl_easy_cleanup() and empties the vector.
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_deinitLib(JNIEnv* env, jclass) {
    for(CURL* handle : curls) {
        curl_easy_cleanup(handle);
    }
    curls.clear();
}
/**
 * Fetches `url` with the given curl handle and returns the response body.
 *
 * @param curl easy handle that has WriteCallback installed as write function
 * @param url  URL to fetch
 * @return the body when the transfer succeeded with HTTP status 200,
 *         otherwise an empty string (the reason is printed to stdout)
 */
std::string download(CURL* curl, const std::string& url) {
    if(curl == nullptr) {
        std::cout << "Curl error: no handle" << std::endl;
        return "";
    }
    std::ostringstream out;
    curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &out);
    const CURLcode res = curl_easy_perform(curl);
    // check the transport result first: when the transfer itself failed there is
    // no meaningful HTTP status, and reporting one would hide the real cause
    if(res != CURLE_OK) {
        std::cout << "Curl error: " << res << " (" << curl_easy_strerror(res) << ")" << std::endl;
        return "";
    }
    long responsecode = 0; // stays 0 if no status could be read
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &responsecode);
    if(responsecode != 200) {
        std::cout << "Curl error: got " << responsecode << std::endl;
        return "";
    }
    return out.str();
}
JNIEXPORT jboolean JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) {
// get videoid argument
jboolean myfalseval = false;
const char* cvideid = env->GetStringUTFChars(videoid, &myfalseval);
const std::string svideoid(cvideid); // makes a copy
env->ReleaseStringUTFChars(videoid, cvideid);
// std::cout << "crawl: " << svideoid << std::endl;
// use curl to get the website
CURL* curl = curls.at(threadid);
std::string webcontent = download(curl, YOUTUBEBASE + svideoid);
if(webcontent.empty()) {
std::cout << "webcontent is empty" << std::endl;
return JNI_FALSE;
}
jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread"); // class of CrawlerThread
jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found" , "Ljava/util/List;"); // fieldID of CrawlerThread.found
jclass jclass_ll = env->FindClass("java/util/List"); // Linked List Class
jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z"); // add() method of LinkedList
jobject ll_found = env->GetObjectField(that, fid_ctfound); // linked list to store the results to
// match regex
std::sregex_iterator it = std::sregex_iterator(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN);
const std::sregex_iterator itend = std::sregex_iterator();
// this set is to find duplicated strings it does not catch everything, because it is only valid during this crawl, but it should filter a few with very little effort
std::set<std::string> known;
known.insert(svideoid); // do not "find" the same ytid
env->MonitorEnter(that); // syncronized(this) {
for( ; it != itend; ++it) {
const std::string ytid = (*it)[1].str(); // get the String from the first sub-group match
// only if the ytid is not known
if(known.find(ytid) == known.end()) {
// add to the found list
// std::cout << ytid << std::endl;
jstring jytid = env->NewStringUTF(ytid.c_str()); // create a java string object
jboolean b = env->CallBooleanMethod(ll_found, mid_add, jytid); // call add() on the LinkedList object with the String
known.insert(ytid);
}
}
env->MonitorExit(that); // end of syncronized block
return JNI_TRUE;
}

18
pom.xml
View File

@ -5,7 +5,8 @@
<artifactId>YoutubeCrawler</artifactId>
<version>0.0.2</version>
<build>
<sourceDirectory>src</sourceDirectory>
<sourceDirectory>src/main</sourceDirectory>
<testSourceDirectory>src/test</testSourceDirectory>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
@ -59,8 +60,8 @@
</plugins>
</build>
<properties>
<maven.compiler.target>1.8</maven.compiler.target>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.11</maven.compiler.target>
<maven.compiler.source>1.11</maven.compiler.source>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
@ -85,5 +86,16 @@
<artifactId>guava</artifactId>
<version>11.0.2</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>5.8.1</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.12</version>
</dependency>
</dependencies>
</project>

View File

@ -1,114 +0,0 @@
package de.mrbesen.youtubecrawler;
import java.io.IOException;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
/**
 * Crawler worker: downloads the YouTube watch page of every id in {@link #todo}
 * and collects the video ids linked from those pages in {@link #found}.
 * New work is requested from the parent {@link Crawler}.
 */
public class CrawlerThread implements Runnable {

    private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");

    private Logger log = Logger.getLogger(this.getClass().getName());
    private Crawler parent;
    Thread thread;

    LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
    LinkedList<String> crawled = new LinkedList<>();//videos this thread had crawled
    LinkedList<String> found = new LinkedList<>();//videos this thread had found

    boolean requested = true;//is a request pending?
    boolean lockforreport = false;

    /** Registers this worker with {@code root} and immediately requests work from it. */
    public CrawlerThread( Crawler root) {
        parent = root;
        root.request(this);
    }

    void setThread(Thread t) {
        thread = t;
    }

    /** @return the ids that are still waiting to be crawled (the live list, not a copy) */
    LinkedList<String> undone() {
        return todo;
    }

    int undoneSize() {
        return todo.size();
    }

    /**
     * Works through {@link #todo} while the parent is crawling, requests more
     * work when the list runs low and sleeps 10 seconds whenever it is empty.
     */
    @Override
    public void run() {
        while(parent.isCrawling()) {
            while(!todo.isEmpty() && parent.isCrawling()) {
                if(lockforreport) {
                    // NOTE(review): this waits at most once for 10 ms; it is not a real lock against report()
                    try {
                        Thread.sleep(10);
                    } catch(InterruptedException e) {
                        lockforreport = false;
                    }
                }
                crawl(todo.removeFirst());

                if(todo.size() < parent.requestlimit && !requested) {
                    requested = true;
                    parent.request(this);
                }
            }
            if(todo.isEmpty() && !requested) {
                requested = true;
                parent.request(this);
            }
            log.warn("No Object left!");
            Thread.yield();
            try {
                Thread.sleep(10000);//sleep for 10 seconds
            } catch (InterruptedException ignored) {}
        }
        log.info("Stopped.");
    }

    /**
     * Hands over the results collected so far and starts fresh lists.
     * @return an array of two lists: index 0 the crawled ids, index 1 the found ids
     */
    LinkedList<String>[] report() {
        lockforreport = true;
        LinkedList<String>[] out = new LinkedList[] {(LinkedList) crawled, (LinkedList) found};
        crawled = new LinkedList<>();
        found = new LinkedList<>();
        lockforreport = false;
        thread.interrupt();
        return out;
    }

    /**
     * Downloads the watch page of {@code videoid} and adds every linked id
     * with a length of 10 to 12 characters to {@link #found}.
     */
    private void crawl(String videoid) {
        try {
            crawled.add(videoid);
            // log.info("crawling: " + videoid);
            HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
            String s = con.getContent();
            Matcher matcher = linkpattern.matcher(s);
            while(matcher.find()) {
                int beginytid = matcher.end();
                int endxtid = s.indexOf('"', beginytid);
                int endid = s.indexOf('&', beginytid);
                // indexOf returns -1 when the character is missing; '&' may only win if it was actually found,
                // otherwise -1 would be used as end index and substring() would throw
                if(endid != -1 && (endxtid == -1 || endid < endxtid)) {
                    endxtid = endid;
                }
                if(endxtid == -1) {
                    continue;//neither '"' nor '&' follows: the id cannot be delimited
                }
                String ytid = s.substring(beginytid, endxtid);
                if(ytid.length() > 9 && ytid.length() <= 12) {
                    found.add(ytid);
                }
            }
        } catch(IOException e) {
            e.printStackTrace();
        }
    }
}

View File

@ -1,60 +0,0 @@
package de.mrbesen.youtubecrawler;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketTimeoutException;
import org.apache.log4j.Logger;
/**
 * Minimal TCP server on port 2419: every client that connects receives one
 * line containing {@code db.getRandom()} and is then disconnected.
 */
public class Server implements Runnable {

    private ServerSocket ssoc;
    private Thread t;
    // written by stop() from another thread and read in the accept loop, hence volatile
    private volatile boolean run = false;
    private Logger log = Logger.getLogger(this.getClass().getName());
    private DB db;

    public Server(DB d) {
        db = d;
    }

    /** Starts the accept loop in a new thread named "Server". */
    void start() {
        run = true;
        t = new Thread(this, "Server");
        t.start();
    }

    /** Asks the accept loop to end and interrupts the server thread. */
    void stop() {
        run = false;
        t.interrupt();
    }

    /**
     * Accept loop. accept() uses a 5 ms timeout so that the run flag is
     * re-checked continuously. All sockets are closed even when an
     * IOException ends the loop.
     */
    public void run() {
        final int port = 2419;
        try (ServerSocket server = new ServerSocket(port, -1)) {
            ssoc = server;
            //ssoc.bind(new InetSocketAddress(, port));
            server.setSoTimeout(5);
            log.info("opened Server at port " + port);
            while(run) {
                // try-with-resources closes the writer first, then the client socket
                try (Socket client = server.accept();
                     PrintWriter out = new PrintWriter(client.getOutputStream())) {
                    //if(client.getInetAddress().isLoopbackAddress()) {
                    out.println(db.getRandom());
                    out.flush();
                    /*} else {
                    log.info("client connected: " + client.getInetAddress().toString());
                    }*/
                } catch(SocketTimeoutException ignored) {}
            }
        } catch(IOException e) {
            e.printStackTrace();
        }
    }
}

View File

@ -1,162 +0,0 @@
package de.mrbesen.youtubecrawler;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import javax.net.ssl.HttpsURLConnection;
import org.apache.log4j.Logger;
import de.mrbesen.youtubecrawler.Crawler.Video;
/**
 * Client for the YouTube Data API "videos" endpoint. The response is read
 * line by line and split at the first ':' of each line; it is not parsed as JSON.
 */
public class YoutubeAPI {

    private final String api_key = Config.prop.getProperty("youtube.apikey");
    private static String basequery = "https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails&id=";
    // NOTE(review): SimpleDateFormat is not thread-safe; confirm getInfos() is only called from one thread
    private static DateFormat dateformat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    private Logger log = Logger.getLogger(YoutubeAPI.class.getName());

    /** Exits the program when the property "youtube.apikey" is missing or empty. */
    public YoutubeAPI() {
        if(api_key == null || api_key.isEmpty()) {
            log.error("apikey is not defined!");
            System.exit(1);
        }
    }

    /**
     * @param id a single video id
     * @return the video, or null when the API returned no video for the id
     */
    public Video getInfo(String id) {
        List<Video>[] infos = getInfos(id);
        if(!infos[0].isEmpty())
            return infos[0].get(0);
        // a livestream is filed in the second list
        return infos[1].isEmpty() ? null : infos[1].get(0);
    }

    /**
     * Fetches the infos for all given ids. The passed list is emptied.
     * @return see {@link #getInfos(String)}; null when {@code ids} is empty
     */
    public List<Video>[] getInfos(List<String> ids) {
        //log.info("get " + ids.size() + " infos");
        if(ids.isEmpty())
            return null;
        String idlist = String.join(",", ids);
        ids.clear();//the list has always been handed back drained
        return getInfos(idlist);
    }

    /**
     * @param idlist comma separated video ids
     * @return an array of two lists: index 0 the regular videos, index 1 the livestreams
     */
    public List<Video>[] getInfos(String idlist) {
        ArrayList<Video> out = new ArrayList<Video>(idlist.length() / 12);//approximated expected size
        LinkedList<Video> livestr = new LinkedList<Video>();

        String nextpage = "";
        do {
            // the token of the previous response has to be part of the query, otherwise the same page is fetched forever
            String query = basequery + idlist + nextpage + "&key=" + api_key;
            nextpage = "";
            // try-with-resources also closes the reader when parsing fails; a null reader is simply skipped
            try (BufferedReader br = connect(query)) {
                if(br != null) {
                    String line;
                    Video v = null;
                    boolean tags = false;
                    while((line = br.readLine()) != null) {
                        String split[] = line.split(":",2);
                        if(split.length == 2) {
                            String key = removeunwanted(split[0]);
                            String value = removeunwanted(split[1]);
                            //System.out.println(split[0] + " " + split[1]);
                            switch(key) {
                            case "defaultAudioLanguage":
                            case "defaultLanguage":
                                v.languageCode = value;
                                break;
                            case "title":
                                if(v.title.isEmpty())//only the first title of a video is kept
                                    v.title = value;
                                break;
                            case "channelTitle":
                                v.channel = value;
                                break;
                            case "tags":
                                tags = true;
                                break;
                            case "liveBroadcastContent":
                                v.live = !value.equalsIgnoreCase("none");
                                break;
                            case "id":
                                //a new "id" starts the next video, so the previous one is complete
                                if(v != null) {
                                    file(v, out, livestr);
                                }
                                v = new Video();
                                v.id = value;
                                //System.out.println("new video: " + v.id + " " + v.length + " " + v.languageCode);
                                break;
                            case "categoryId":
                                v.categorie = Byte.parseByte(value);
                                break;
                            case "duration":
                                String timeparts[] = value.substring(2).split("[H,M,S]");
                                try {
                                    if(timeparts.length > 2) {//hours
                                        v.length = 3600 * Integer.parseInt(timeparts[0]);
                                    }
                                    if(timeparts.length > 1) {//minutes
                                        v.length += 60 * Integer.parseInt(timeparts[timeparts.length -2]);
                                    }
                                    //Seconds
                                    v.length += Integer.parseInt(timeparts[timeparts.length-1]);
                                } catch(NumberFormatException e) {//failed: P6DT17H59M53S and P15W3DT4H1M11S and P1W2DT20H47M55S video id: 1NPyC0psMaI and P2W2DT23H58M58S video id: Jd9KjbRxhN4 For input string: "W2DT23"
                                    Main.getMain().broadcastAdmin(value + " video id: " + v.id);
                                    log.warn("Error saving the time string: " + value + " video id: " + v.id, e);
                                }
                                break;
                            case "publishedAt":
                                String tmp = value.replace('T', ' ');
                                tmp = tmp.substring(0, tmp.length()-5);
                                Date d = dateformat.parse(tmp);
                                v.created = d.getTime() / 1000;
                                break;
                            case "nextPageToken":
                                nextpage = "&pageToken=" + value;
                                // System.out.println("nextpage set to " + nextpage);
                                break;
                            default:
                                break;
                            }
                        } else {
                            if(line.contains("]")) {
                                if(v.tags.length() > 1)
                                    v.tags = v.tags.substring(1);
                                tags = false;
                            } else if(tags) {
                                v.tags += ", " + removeunwanted(line);
                            }
                        }
                    }
                    //file the last video; there is none when the response contained no "id" at all
                    if(v != null) {
                        file(v, out, livestr);
                    }
                }
            } catch(IOException | ParseException e) {
                e.printStackTrace();
            }
        } while(!nextpage.equals(""));
        //log.info("got " + (out.size() + livestr.size()) + " infos");
        return new List[] {out, livestr};
    }

    /** Adds a completed video to the list it belongs to: livestreams to {@code livestr}, everything else to {@code out}. */
    private void file(Video v, List<Video> out, List<Video> livestr) {
        if(!v.live)
            out.add(v);
        else {
            livestr.add(v);
            log.info("livestream found! " + v.id + " " + v.channel);
        }
    }

    /** Removes quotes, braces, commas and backslashes from {@code in} and trims the result. */
    private String removeunwanted(String in) {
        return in.replaceAll("[\"}{\\,\\\\]", "").replaceAll("'", "").trim();
    }

    /**
     * Opens {@code url} over HTTPS.
     * @return a reader on the response body, or null when the connection failed
     */
    public BufferedReader connect(String url) {
        try {
            URL urll = new URL(url);
            HttpsURLConnection con = (HttpsURLConnection) urll.openConnection();
            con.connect();
            //System.out.println(con.getResponseCode());
            return new BufferedReader(new InputStreamReader(con.getInputStream()));
        } catch(IOException e) {
            e.printStackTrace();
        }
        return null;
    }
}
// no suchelement bla

View File

@ -7,13 +7,10 @@ import java.io.IOException;
import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.*;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import org.apache.log4j.Logger;
public class Crawler implements Runnable {
@ -22,8 +19,8 @@ public class Crawler implements Runnable {
int requestlimit = 5;//amount of videos to be left in the todo que of a thread until it requests new videos
private int idlecount = 5;//amount of idle loops allowed
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, witch need to be analysed
private LinkedList<String> toCrawl = new LinkedList<>();//all videos tu crawl
private Set<String> toSave = new TreeSet<>();//all found ytids, witch need to be analysed
private Set<String> toCrawl = new TreeSet<>();//all videos tu crawl
//private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
private List<CrawlerThread> threads;//list of all threads
private List<CrawlerThread> requested = new LinkedList<>();
@ -36,10 +33,11 @@ public class Crawler implements Runnable {
//private int updateOffset = 0;
private DB db = new DB();
private YoutubeAPI api = new YoutubeAPI();
private YoutubeAPI api = new YoutubeAPI(Config.prop.getProperty("youtube.apikey"));
private File crawlfile = new File("crawl.txt");
private Logger log = Logger.getLogger(this.getClass().getName());
private Profiler profiler = new Profiler();
private long lastadminreport = 0;
private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep
@ -70,7 +68,7 @@ public class Crawler implements Runnable {
//if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
//toknown.add(videoid);
if(toCrawl.contains(videoid)) {
ArrayList<String> str = new ArrayList<String>(1);
ArrayList<String> str = new ArrayList<>(1);
str.add(videoid);
db.storeTemp(str, false);
}
@ -89,22 +87,17 @@ public class Crawler implements Runnable {
}
private void send(CrawlerThread t) {
// listlock.writeLock().lock();
for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
t.todo.add(toCrawl.removeFirst());
synchronized (toCrawl) {
for (int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
String s = toCrawl.stream().findAny().get();
toCrawl.remove(s);
t.todo.add(s);
}
}
// listlock.writeLock().unlock();
t.requested = false;
}
@Override
public void run() {
profiler.profilingEnabled = true;
profiler.clearProfiling();
profiler.startSection("root");
profiler.startSection("startup");
profiler.startSection("loadingcrawlfile");
start = System.currentTimeMillis();
private void loadCrawlFile() {
log.info("Try to load crawlfile");
if(crawlfile.exists()) {
try {
@ -134,7 +127,9 @@ public class Crawler implements Runnable {
e.printStackTrace();
}
}
profiler.endStartSection("populateThreads");//loading crawlfile closed
}
private int createThreads() {
//populate threads
int threadcount = 4;
try {
@ -143,13 +138,93 @@ public class Crawler implements Runnable {
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
}
threads = new ArrayList<>(threadcount);
CrawlerThread.initLib(threadcount);
for(int i = 0; i < threadcount; i++) {
CrawlerThread thr = new CrawlerThread( this);
CrawlerThread thr = new CrawlerThread( this, i);
thr.setThread(new Thread(thr, "Crawler #" + i));
threads.add(thr);
thr.thread.start();
}
return threadcount;
}
private void getreports() {
log.info("get report");
for (CrawlerThread crawlerThread : threads) {
String threadname = crawlerThread.thread.getName();
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
List<String>[] report = crawlerThread.report();
crawlcount+= report[0].size();
toSave.addAll(report[0]);
crawlerThread.crawled.clear();
int count = 0;
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
ArrayList<String> store = null;
try {
if(report[1].size() <= 50) {
store = new ArrayList<>(report[1]);
count += report[1].size();
report[1].clear();
} else {
store = new ArrayList<>(report[1].subList(0, 50));
report[1].removeAll(store);
count+=50;
}
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
log.info("no suchelement bla");
}
db.storeTemp(store, false);
}
log.info(count + " videos added from " + threadname);
profiler.endSection();
}
}
private void savetodb() {
log.info("save " + toSave.size() + " videos to DB.");
synchronized (toSave) {
while (!toSave.isEmpty()) {
Set<String> videoids = new TreeSet<>();
for (int i = 0; i < 50 && !toSave.isEmpty(); i++) {
String save = toSave.stream().findAny().get();
toSave.remove(save);
videoids.add(save);
}
if (videoids.size() > 0) {
profiler.startSection("getinfo");
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
profiler.endStartSection("sendtoDB");
db.addVideos(videos, false);
profiler.endSection();//sendtoDB
}
}
}
}
private void sendAdminMessage() {
long currenttime = System.currentTimeMillis();
if((currenttime - lastadminreport) / 1000 > 3600) {
long runtimes = (currenttime - start) / 1000;
if (runtimes < 0)
runtimes = 1;
float vidps = (crawlcount / (float) runtimes);//videos per second
Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
lastadminreport = currenttime;
}
}
@Override
public void run() {
profiler.profilingEnabled = true;
profiler.clearProfiling();
profiler.startSection("root");
profiler.startSection("startup");
profiler.startSection("loadingcrawlfile");
start = System.currentTimeMillis();
loadCrawlFile();
profiler.endStartSection("populateThreads");//loading crawlfile closed
int threadcount = createThreads();
profiler.endStartSection("deleteDouble");//populate threads
long lastdoubledelete = System.currentTimeMillis();
//db.deleteDouble();
@ -187,7 +262,6 @@ public class Crawler implements Runnable {
break;
}
}
// updateDB();
}
}
//nothing left?
@ -195,27 +269,6 @@ public class Crawler implements Runnable {
log.warn("nothing left to crawl");
}
//refil the tocrawl list.
/*if(!toknown.isEmpty()) {
//check in db for known videos
log.info("Checking the DB");
currentstate = "get new tocrawl";
// listlock.writeLock().lock();
while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) {
LinkedList<String> tocheck = new LinkedList<>();
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
tocheck.add(toknown.removeFirst());
}
toCrawl.addAll(db.checkvideos(tocheck));
}
// listlock.writeLock().unlock();
}
while(toknown.size() < threadcount * jobspeerthread * 20 && crawl) {
currentstate = "restoretemp";
log.info("restoreTemp");
LinkedList<String> rest = db.restoreTemp();
toknown.addAll(rest);
}*/
{
profiler.endStartSection("loadCrawl");
boolean joined = true;
@ -244,60 +297,14 @@ public class Crawler implements Runnable {
//get reports
profiler.endStartSection("getreport");
log.info("get report");
for (CrawlerThread crawlerThread : threads) {
String threadname = crawlerThread.thread.getName();
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
LinkedList<String>[] report = crawlerThread.report();
crawlcount+= report[0].size();
toSave.addAll(report[0]);
crawlerThread.crawled.clear();
int count = 0;
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
ArrayList<String> store = null;
try {
if(report[1].size() <= 50) {
store = new ArrayList<>(report[1]);
count += report[1].size();
report[1].clear();
} else {
store = new ArrayList<>(report[1].subList(0, 50));
report[1].removeAll(store);
count+=50;
}
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
log.info("no suchelement bla");
}
db.storeTemp(store, false);
}
log.info(count + " videos added from " + threadname);
profiler.endSection();
}
getreports();
profiler.endStartSection("debug");
long runtimes = (System.currentTimeMillis() - start) / 1000;
if(runtimes < 0)
runtimes = 1;
float vidps = (crawlcount / (float) runtimes);//videos per second
Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
sendAdminMessage();
//save to db
profiler.endStartSection("save2DB");
log.info("save " + toSave.size() + " videos to DB.");
while(!toSave.isEmpty()) {
LinkedList<String> videoids = new LinkedList<>();
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
videoids.add(toSave.remove(0));
}
if(videoids.size() > 0) {
profiler.startSection("getinfo");
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
profiler.endStartSection("sendtoDB");
db.addVideos(videos, false);
profiler.endSection();//sendtoDB
}
}
savetodb();
profiler.endSection();//save2DB
//at the beginning there is maybe just one video to crawl, so keep it calm.
@ -324,6 +331,9 @@ public class Crawler implements Runnable {
}
profiler.endSection();//main
}
profiler.startSection("waitforthreads");
for(CrawlerThread ct : threads) {
try {
@ -353,6 +363,7 @@ public class Crawler implements Runnable {
int runtimem = (int) (runtimes / 60);
float vidps = (crawlcount / (float) runtimes);//videos per second
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
CrawlerThread.deinitLib();
Main.getMain().stopcallback();
}
@ -383,7 +394,7 @@ public class Crawler implements Runnable {
out += "\nThread Nr, todo size, requested, crawledsize, foundsize";
for (int i = 0; i < threads.size(); i++) {
CrawlerThread thre = threads.get(i);
out += "\n " + i + " " + (thre.lockforreport ? "\uD83D\uDD12" : "\uD83D\uDD13") + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
out += "\n " + i + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
}
}
return out;
@ -417,6 +428,8 @@ public class Crawler implements Runnable {
}
*/
@AllArgsConstructor
@NoArgsConstructor
public static class Video {
String id = "";
String title = "";

View File

@ -0,0 +1,135 @@
package de.mrbesen.youtubecrawler;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
/**
 * Crawler worker backed by the native library "crawlerthread": the native
 * {@link #crawl(String, int)} downloads a watch page and appends the ids it
 * finds to {@link #found}.
 */
public class CrawlerThread implements Runnable {

    private static Pattern linkpattern = Pattern.compile("watch\\?v=([-_a-zA-Z0-9]{11})");

    private Logger log = Logger.getLogger(this.getClass().getName());
    private Crawler parent;
    Thread thread;

    LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
    List<String> crawled = new LinkedList<>();//videos this thread had crawled
    List<String> found = new LinkedList<>();//videos this thread had found

    static AtomicInteger fails = new AtomicInteger(0);//failed crawls, shared by all threads
    private static int MAXFAILS = 100;

    boolean requested = true;//is a request pending?
    private int threadid;

    static {
        // NOTE(review): changing java.library.path at runtime may be ignored by the JVM once its
        // library search path has been initialised - confirm the library is really found via "./"
        String libpath = System.getProperty("java.library.path");
        libpath += ":./";
        System.setProperty("java.library.path", libpath);
        System.loadLibrary("crawlerthread");
    }

    public static native void initLib(int threadCount);
    public static native void deinitLib();

    /**
     * Registers this worker with {@code root} and immediately requests work from it.
     * @param threadid id that is passed to the native crawl()
     */
    public CrawlerThread( Crawler root, int threadid) {
        parent = root;
        root.request(this);
        this.threadid = threadid;
    }

    void setThread(Thread t) {
        thread = t;
    }

    /** @return the ids that are still waiting to be crawled (the live list, not a copy) */
    LinkedList<String> undone() {
        return todo;
    }

    int undoneSize() {
        return todo.size();
    }

    /**
     * Works through {@link #todo} while the parent is crawling and stops the
     * parent once more than MAXFAILS crawls have failed. The work loop holds
     * the monitor of this object; the idle sleep runs outside of it.
     */
    @Override
    public void run() {
        while(parent.isCrawling()) {
            synchronized (this) {
                while (!todo.isEmpty() && parent.isCrawling()) {
                    String vid = todo.removeFirst();
                    // System.out.println("crawling: " + vid + " size: " + found.size());
                    crawled.add(vid);
                    boolean success = crawl(vid, threadid);

                    if (todo.size() < parent.requestlimit && !requested) {
                        requested = true;
                        parent.request(this);
                    }

                    if (!success) {
                        int val = fails.incrementAndGet();
                        if (val > MAXFAILS) {
                            System.err.println("Max Crawlfails reached, stopping");
                            parent.stop();
                            break;
                        }
                    }
                }
                if (todo.isEmpty() && !requested) {
                    requested = true;
                    parent.request(this);
                }
            }
            // idle outside the synchronized block: sleeping while holding the monitor
            // would block report() for the whole 10 seconds
            log.warn("No Object left!");
            Thread.yield();
            try {
                Thread.sleep(10000);//sleep for 10 seconds
            } catch (InterruptedException ignored) {
            }
        }
        log.info("Stopped.");
    }

    /**
     * Hands over the results collected so far and starts fresh lists.
     * @return an array of two lists: index 0 the crawled ids, index 1 the found ids
     */
    List<String>[] report() {
        synchronized (this) {
            List<String>[] out = new List[]{crawled, found};
            crawled = new LinkedList<>();
            found = new LinkedList<>();
            return out;
        }
    }

    /*
    private void crawl(String videoid) {
        try {
            crawled.add(videoid);
            // log.info("crawling: " + videoid);
            HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
            String s = con.getContent();
            Matcher matcher = linkpattern.matcher(s);
            while(matcher.find()) {
                String ytid = matcher.group(1);
                if(!ytid.equals(videoid)) {
                    found.add(ytid);
                }
            }
        } catch(IOException e) {
            e.printStackTrace();
        }
    }
    */

    // returns false when it fails
    private native boolean crawl(String videid, int threadid);
}

View File

@ -23,17 +23,19 @@ public class DB implements Runnable {
private Logger log = Logger.getLogger(DB.class.getName());
private ArrayList<String> randombuffer = new ArrayList<>(100);
private Random rand = new Random();
private Server serv = new Server(this);
private Thread randomrefill = null;
private int dbsize = 0;
private StringBuilder tostorebuffer ;
private StringBuilder tostorebuffer;
private int writebuffersize = 500;
private int writebuffercurrentsize = 0;
private StringBuilder totempbuffer;
private int writetempbuffercurrentsize = 0;
private final int TEMPBUFFERRATIO = 15;
private final int STOREBUFFERRATIO = 100;
public DB() {
try {
connect(false);
@ -60,7 +62,6 @@ public class DB implements Runnable {
log.info("Database is set up!");
}
serv.start();
refillbuffer();
//get db size
@ -72,8 +73,8 @@ public class DB implements Runnable {
} catch(NumberFormatException e) {
log.warn("could not read the number \"" + Config.prop.getProperty("db.writebuffersize") + "\" from the config file. db.writebuffersize");
}
tostorebuffer = new StringBuilder(writebuffersize);
totempbuffer = new StringBuilder(writebuffersize);
tostorebuffer = new StringBuilder(writebuffersize * STOREBUFFERRATIO);
totempbuffer = new StringBuilder(writebuffersize * TEMPBUFFERRATIO);
} catch (SQLException e) {
log.error("Error while connecting to the database! ", e);
}
@ -98,10 +99,9 @@ public class DB implements Runnable {
private void connect(boolean selectdb) {
try {
Class.forName("com.mysql.jdbc.Driver");//Treiber laden try this driver: com.mysql.cj.jdbc.Driver
//verbinden
con = DriverManager.getConnection("jdbc:mysql://" + server + ":" + port + "/" + (selectdb ? db : "") + "?serverTimezone=UTC" ,user,pw);
}catch (ClassNotFoundException | SQLException e) {
con = DriverManager.getConnection("jdbc:mysql://" + server + ":" + port + "/" + (selectdb ? db : "") + "?serverTimezone=UTC&verifyServerCertificate=false&useSSL=true&useUnicode=true&characterEncoding=utf-8", user, pw);
}catch (SQLException e) {
log.error("Error while connecting to the database! ", e);
}
}
@ -135,14 +135,13 @@ public class DB implements Runnable {
* @param input
*/
public void addVideos(ArrayList<Video> input, boolean force) {
//log.info("add " + input.size() + " videos");
if(input != null) {
if(input.size() > 0) {
writebuffercurrentsize += input.size();
for(int i = 0; i < input.size(); i++) {
Video v = input.get(i);
if(v != null)
tostorebuffer.append(",('").append(v.id).append("','").append(v.length).append("','").append(v.created).append("','").append(v.languageCode).append("','").append(v.categorie).append("','").append(v.title).append("','").append(v.channel).append("','").append(v.tags).append("') ");
tostorebuffer.append("('").append(escape(v.id)).append("',").append(v.length).append(",").append(v.created).append(",'").append(escape(v.languageCode)).append("',").append(v.categorie).append(",'").append(escape(v.title)).append("','").append(escape(v.channel)).append("','").append(escape(v.tags)).append("'),");
}
}
}
@ -150,17 +149,21 @@ public class DB implements Runnable {
if(tostorebuffer.length() > 10) {
log.info("Write databuffer to DB video count: " + writebuffercurrentsize);
dbsize += writebuffercurrentsize;
tostorebuffer.deleteCharAt(0);//delete leading ','
String qu = "INSERT IGNORE INTO `ytcrawler`.`videos`(`id`, `length`, `created`, `langcode`, `category`, `videotitle`, `channel`, `tags`) VALUES " + tostorebuffer.toString();
tostorebuffer.deleteCharAt(tostorebuffer.length()-1);//delete trailing ','
String qu = "INSERT IGNORE INTO `videos`(`id`, `length`, `created`, `langcode`, `category`, `videotitle`, `channel`, `tags`) VALUES " + tostorebuffer.toString();
update(qu);
//reset buffer
writebuffercurrentsize = 0;
tostorebuffer = new StringBuilder(writebuffersize);
tostorebuffer = new StringBuilder(writebuffersize * STOREBUFFERRATIO);
}
}
}
private String escape(String e) {
return e.replace("'", "\\'");
}
public void updateVideos(List<Video> input) {
log.info("Updateing " + input.size() + " videos.");
for(Video v : input) {
@ -171,7 +174,7 @@ public class DB implements Runnable {
private void updateVideo(Video v) {
try {
String qu = "UPDATE `ytcrawler`.`videos` SET `length` = '" + v.length + "', `created` = '" + v.created + "', `langcode` = SUBSTR('" + v.languageCode + "', 1, 3) ,`category` = '" + v.categorie + "',`videotitle` = SUBSTR('" + v.title + "',1,100),`channel` = SUBSTR('" + v.channel + "',1,20),`tags` = '" + v.tags.substring(0, v.tags.length() > 40 ? 40 : v.tags.length()) + "' WHERE `id` = '" + v.id + "';";
String qu = "UPDATE `videos` SET `length` = " + v.length + ", `created` = " + v.created + ", `langcode` = SUBSTR('" + v.languageCode + "', 1, 3) ,`category` = " + v.categorie + ",`videotitle` = SUBSTR('" + escape(v.title) + "',1,100),`channel` = SUBSTR('" + escape(v.channel) + "',1,20),`tags` = '" + escape(v.tags) + "' WHERE `id` = '" + escape(v.id) + "';";
update(qu);
} catch(NullPointerException e) {
@ -195,7 +198,7 @@ public class DB implements Runnable {
public void removeVideos(LinkedList<Video> vids) {
log.info("Delete " + vids.size() + " videos.");
for(Video s : vids) {
update("DELETE FROM `ytcrawler`.`videos` WHERE `id`='" + s.id + "';");
update("DELETE FROM `videos` WHERE `id`='" + escape(s.id) + "';");
}
}
@ -258,14 +261,14 @@ public class DB implements Runnable {
}
public LinkedList<String> restoreTemp() {
ResultSet res = query("SELECT * FROM `ytcrawler`.`temp` LIMIT 500;");
ResultSet res = query("SELECT * FROM `temp` LIMIT 500;");
LinkedList<String> out = new LinkedList<>();
log.info("RestoreTemp");
try {
while(res.next()) {
out.add(res.getString(1));
}
update("DELETE FROM `ytcrawler`.`temp` LIMIT 500;");
update("DELETE FROM `temp` LIMIT 500;");
} catch (Exception e) {}
return out;
}
@ -273,7 +276,7 @@ public class DB implements Runnable {
public void deleteDouble() {
log.info("Started Delete Double");
long start = System.currentTimeMillis();
update("call ytcrawler.deletedouble();");
update("CALL deletedouble();");
log.info("Delete Double done in " + ((System.currentTimeMillis() - start)/60000) + " min");
}
@ -284,18 +287,18 @@ public class DB implements Runnable {
log.info("store Temp to buffer: " + strings.size());
writetempbuffercurrentsize += strings.size();
for(String s : strings) {
totempbuffer.append(", ('").append(s).append("')");
totempbuffer.append("('").append(escape(s)).append("'),");
}
}
if(writetempbuffercurrentsize > writebuffersize || force) {
log.info("Write Buffer: " + writetempbuffercurrentsize);
totempbuffer.deleteCharAt(0);//delete leading ','
String qu = "INSERT IGNORE INTO `ytcrawler`.`temp` (`ytid`) VALUES " + totempbuffer.toString() + ";";
totempbuffer.deleteCharAt(totempbuffer.length()-1);//delete trailing ','
String qu = "INSERT IGNORE INTO `temp` (`ytid`) VALUES " + totempbuffer.toString() + ";";
update(qu);
//reset
writetempbuffercurrentsize = 0;
totempbuffer = new StringBuilder(writebuffersize);
totempbuffer = new StringBuilder(writebuffersize * TEMPBUFFERRATIO);
}
}
@ -304,7 +307,6 @@ public class DB implements Runnable {
* Stops the randomness server and disconnects.
*/
public void stop() {
serv.stop();
try {
if(con != null) {
if(!con.isClosed()) {
@ -323,23 +325,18 @@ public class DB implements Runnable {
@Override
public void run() {
log.info("Started Refilling.");
ResultSet count = query("SELECT `" + db + "`.`getLimit`() as 'l';");
if(count != null) {
try {
if(count.next()) {
int max = count.getInt("l");
ResultSet set = query("SELECT `id` FROM `videos` LIMIT " + rand.nextInt(max) + ",100;");
if(set != null) {
while(set.next()) {
randombuffer.add(set.getString(1));
}
log.info("refilled randombuffer to " + randombuffer.size() + " videos.");
ResultSet set = query("SELECT `id` FROM `videos` ORDER BY rand() LIMIT 100;");
if(set != null) {
while(set.next()) {
randombuffer.add(set.getString(1));
}
log.info("refilled randombuffer to " + randombuffer.size() + " videos.");
}
} catch (SQLException e) {
log.warn("error getting a random video", e);
}
}
if(randombuffer.isEmpty()) {
log.error("Unable to retrieve RandomVideos");
}

View File

@ -68,7 +68,7 @@ public class Main implements JSONCommandHandler {
}
//starting BOT API
tapi = new TelegramAPI(Config.prop.getProperty("telegramapi.key"));
tapi = new TelegramAPI(Config.prop.getProperty("telegramapi.key"), "randomytvideobot");
tapi.getCommandManager().registerCommand( this);
tapi.getEventManager().registerEvent(UserSendMessageEvent.class, this::onAdmin);
tapi.setHelpText("Send the command /random to get a random video.");

View File

@ -0,0 +1,178 @@
package de.mrbesen.youtubecrawler;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
import javax.xml.datatype.DatatypeConfigurationException;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.Duration;
import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;
import org.json.JSONTokener;
/**
 * Small client for the YouTube Data API v3 (see {@link #BASEQUERY}).
 * It requests video metadata and the video category list and converts the
 * JSON answers into {@link Crawler.Video} objects.
 */
public class YoutubeAPI {

	private static final String BASEQUERY = "https://www.googleapis.com/youtube/v3/";

	// NOTE(review): SimpleDateFormat is not synchronized and parses in the JVM default
	// time zone, while the API timestamps carry a zone suffix that getDate() cuts off.
	// Confirm whether this class is shared between threads and whether the stored
	// epoch values are meant to be UTC.
	private static final DateFormat dateformat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

	/** Number of leading characters of a timestamp that match the pattern of {@link #dateformat}. */
	private static final int DATELENGTH = 19;

	/** Shape of a video id; compiled once instead of once per checked id. */
	private static final java.util.regex.Pattern IDPATTERN = java.util.regex.Pattern.compile("[a-zA-Z0-9_-]{11}");

	private String api_key = null;
	private Logger log = Logger.getLogger(YoutubeAPI.class.getName());
	private DatatypeFactory durationfactory = null;

	/**
	 * Creates a client that appends the given key to every request.
	 * Terminates the JVM when no {@link DatatypeFactory} can be created.
	 *
	 * @param apikey the API key sent as the "key" query parameter
	 */
	public YoutubeAPI(String apikey) {
		api_key = apikey;
		try {
			durationfactory = DatatypeFactory.newInstance();
		} catch(DatatypeConfigurationException e) {
			log.error("Unable to create a DatatypeFactory", e);
			System.exit(1);
		}
	}

	/**
	 * Requests the metadata of a single video.
	 *
	 * @param id the video id
	 * @return the video, or null when the API returned no item for the id
	 */
	public Crawler.Video getInfo(String id) {
		List<Crawler.Video>[] infos = getInfos(id);
		if(infos == null || infos[0].isEmpty()) {
			return null;
		}
		return infos[0].get(0);
	}

	/**
	 * Requests the metadata of several videos. Ids that do not look like a
	 * video id (11 characters of a-z, A-Z, 0-9, '_' or '-') are logged and skipped.
	 *
	 * @param ids the video ids
	 * @return the same array as {@link #getInfos(String)}, or null when ids is null or empty
	 */
	public List<Crawler.Video>[] getInfos(Collection<String> ids) {
		if(ids == null || ids.isEmpty())
			return null;

		StringJoiner joined = new StringJoiner(",");
		for(String id : ids) {
			if(id != null && IDPATTERN.matcher(id).matches()) {
				joined.add(id);
			} else {
				log.warn("non matching id: \"" + id + "\"");
			}
		}
		return getInfos(joined.toString());
	}

	/**
	 * Requests the video categories for the region "us".
	 *
	 * @return category id mapped to category title; empty when the request failed
	 */
	public Map<Integer, String> getCategories() {
		String query = BASEQUERY + "videoCategories?part=snippet&regionCode=us&key=" + api_key;
		JSONObject obj = parse(connect(query));

		Map<Integer, String> out = new TreeMap<>();
		if(obj == null) {
			return out;
		}

		JSONArray items = obj.optJSONArray("items");
		if(items == null) {
			return out;
		}

		for(int i = 0; i < items.length(); ++i) {
			JSONObject item = items.optJSONObject(i);
			if(item == null) {
				continue; // skip holes instead of aborting the whole list
			}
			String id = item.getString("id");
			String name = item.getJSONObject("snippet").getString("title");
			try {
				out.put(Integer.parseInt(id), name);
			} catch (NumberFormatException e) {
				log.warn("Category id is not a number: " + id, e);
			}
		}
		return out;
	}

	/**
	 * Requests the metadata of the videos in a comma separated id list and
	 * follows "nextPageToken" until the answer contains no further page.
	 *
	 * @param idlist comma separated video ids
	 * @return an array of two lists: index 0 holds the parsed videos, index 1 is
	 *         a list reserved for live streams
	 */
	@SuppressWarnings("unchecked")
	public List<Crawler.Video>[] getInfos(String idlist) {
		int expected = idlist == null ? 0 : idlist.length() / 12; // approximate expected number of videos (11 id chars + separator)
		ArrayList<Crawler.Video> out = new ArrayList<>(expected);
		// NOTE(review): nothing is ever added to this list, live videos end up in
		// 'out' as well - confirm whether they should be routed here instead.
		LinkedList<Crawler.Video> livestr = new LinkedList<>();

		if(idlist == null || idlist.isEmpty()) {
			// nothing to ask for, so skip the network round trip
			return new List[] {out, livestr};
		}

		String nextpage = "";
		do {
			String query = BASEQUERY + "videos?part=snippet,contentDetails&id=" + idlist + nextpage + "&key=" + api_key;
			JSONObject json = parse(connect(query));

			nextpage = "";
			if(json != null && json.has("items")) {
				json.getJSONArray("items").forEach(item -> out.add(getVid((JSONObject) item)));
				if(json.has("nextPageToken")) {
					nextpage = "&pageToken=" + json.getString("nextPageToken");
				}
			}
		} while(!nextpage.isEmpty());

		return new List[] {out, livestr};
	}

	/**
	 * Converts one entry of the "items" array into a video object.
	 * Optional fields fall back to an empty string or 0.
	 *
	 * @param json the item, it has to contain "id", "snippet" and "contentDetails"
	 * @return the parsed video
	 */
	private Crawler.Video getVid(JSONObject json) {
		String vdid = json.getString("id");
		JSONObject snippet = json.getJSONObject("snippet");
		JSONObject contentDetails = json.getJSONObject("contentDetails");

		String title = snippet.optString("title", ""); //maxlen: 100
		long published = getDate(snippet.optString("publishedAt", ""));
		String channel = snippet.optString("channelTitle", "");

		String tags = "";
		JSONArray tagarray = snippet.optJSONArray("tags");
		if(tagarray != null) {
			tags = tagarray.toList().stream().map(String::valueOf).collect(Collectors.joining(",")); // max len: ~500
		}

		byte category = 0;
		try {
			category = Byte.parseByte(snippet.optString("categoryId", "0"));
		} catch(NumberFormatException e) {
			// keep the default category 0
		}

		int duration = (int) getDuration(contentDetails.optString("duration", ""));
		boolean live = !snippet.optString("liveBroadcastContent", "none").equalsIgnoreCase("none");

		// reduce the language code to at most 3 characters and drop a dash left over from e.g. "en-US"
		String langCode = snippet.optString("defaultLanguage", snippet.optString("defaultAudioLanguage", ""));
		if(langCode.length() > 3) {
			langCode = langCode.substring(0, 3);
		}
		if(langCode.endsWith("-")) {
			langCode = langCode.substring(0, langCode.length() - 1);
		}

		return new Crawler.Video(vdid, title, channel, tags, duration, langCode, category, published, live);
	}

	/**
	 * Parses the first 19 characters of a timestamp ("yyyy-MM-ddTHH:mm:ss").
	 *
	 * @param format the timestamp as delivered by the API
	 * @return seconds since the epoch, or 0 when the value is missing or not parseable
	 */
	private long getDate(String format) {
		if(format == null || format.length() < DATELENGTH) {
			// the field is optional; substring() would throw on a short value
			return 0;
		}
		try {
			Date d = dateformat.parse(format.substring(0, DATELENGTH).replace('T', ' '));
			return d.getTime() / 1000;
		} catch (ParseException e) {
			log.warn("Failed to parse date: " + format, e);
		}
		return 0;
	}

	/**
	 * Converts an ISO 8601 duration into seconds.
	 *
	 * @param iso8601 the duration string
	 * @return the duration in seconds, or 0 when the value is missing or invalid
	 */
	private long getDuration(String iso8601) {
		if(iso8601 == null || iso8601.isEmpty()) {
			return 0;
		}
		try {
			Duration dur = durationfactory.newDuration(iso8601);
			return dur.getTimeInMillis(new Date(0)) / 1000;
		} catch(IllegalArgumentException e) {
			log.warn("Failed to parse duration: " + iso8601, e);
			return 0;
		}
	}

	/**
	 * Opens an HTTP connection and returns a reader for the response body.
	 * The body is decoded as UTF-8 so the result does not depend on the
	 * platform default charset.
	 *
	 * @param url the address to request
	 * @return a reader for the body, or null when url is null or the request failed
	 */
	public BufferedReader connect(String url) {
		if(url == null)
			return null;

		HttpURLConnection con = null;
		try {
			con = (HttpURLConnection) new URL(url).openConnection();
			con.connect();
			return new BufferedReader(new InputStreamReader(con.getInputStream(), java.nio.charset.StandardCharsets.UTF_8));
		} catch(IOException e) {
			log.warn("Request failed", e);
			if(con != null) {
				con.disconnect(); // do not keep a failed connection around
			}
		}
		return null;
	}

	/**
	 * Reads a JSON object from the reader and closes the reader afterwards.
	 *
	 * @param in the reader, may be null
	 * @return the parsed object, or null when in is null
	 */
	public JSONObject parse(BufferedReader in) {
		if(in == null)
			return null;

		try {
			return new JSONObject(new JSONTokener(in));
		} finally {
			try {
				in.close();
			} catch(IOException e) {
				log.warn("Unable to close the response reader", e);
			}
		}
	}
}

View File

@ -0,0 +1,30 @@
package de.mrbesen.youtubecrawler;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.Map;
/**
 * Integration test for {@link YoutubeAPI}. It talks to the real API and takes
 * the key from the environment variable YOUTUBEAPIKEY.
 */
public class YoutubeAPITest {

	/**
	 * Requests one known video and compares every parsed field, then checks
	 * that the category list can be requested.
	 * JUnit expects the arguments in the order (expected, actual).
	 */
	@Test
	public void testgetVideo() {
		YoutubeAPI api = new YoutubeAPI(System.getenv("YOUTUBEAPIKEY"));

		Crawler.Video vid = api.getInfo("gsvKF2ojUzs");
		Assertions.assertNotNull(vid);

		Assertions.assertEquals("gsvKF2ojUzs", vid.id);
		Assertions.assertEquals("Mikebøi - Missed", vid.title);
		Assertions.assertEquals("TrapNation", vid.channel);
		Assertions.assertEquals("mikeboi missed,trap nation,trapnation,трап натион,electronic music 2020,Trap,Electronic Dance Music,missed trap nation,trap music,Electronic Music,Trap Music,Dance Music,missed mike boi,gaming music,Trap Music 2017,mike boy missed,mikebøi - missed,нас не догонят ремикс,Trap Nation,TrapNation,Mikebøi - Missed,trap nation 2020,trap music 2020 remix,EDM,missed mikeboi,music", vid.tags);
		Assertions.assertEquals(213, vid.length);
		Assertions.assertEquals("en", vid.languageCode);
		Assertions.assertEquals(10, vid.categorie);
		Assertions.assertEquals(1491571496, vid.created);
		Assertions.assertFalse(vid.live);

		Map<Integer, String> obj = api.getCategories();
		Assertions.assertNotNull(obj);
	}
}