forked from MrBesen/YoutubeCrawler
Compare commits
17 Commits
Author | SHA1 | Date |
---|---|---|
mrbesen | 01104a2f5e | |
mrbesen | 928942e3ef | |
mrbesen | e02d51b72c | |
mrbesen | ba270f85f6 | |
mrbesen | 9d1d73137e | |
mrbesen | a0b0ce6bfb | |
mrbesen | bdcc276d58 | |
mrbesen | 2df0029c18 | |
mrbesen | 2aaa788aac | |
mrbesen | 9925f5fa94 | |
mrbesen | ba4134345a | |
mrbesen | e57842d3e1 | |
mrbesen | 7180007dea | |
mrbesen | d3fa21cbc0 | |
mrbesen | 25c82acfd9 | |
MrBesen | 4572e67e76 | |
MrBesen | e34622e7e7 |
|
@ -3,3 +3,10 @@
|
||||||
.project
|
.project
|
||||||
.settings/*
|
.settings/*
|
||||||
.classpath
|
.classpath
|
||||||
|
admins
|
||||||
|
crawl.conf
|
||||||
|
crawl.txt
|
||||||
|
*.so
|
||||||
|
.idea/
|
||||||
|
.vscode/settings.json
|
||||||
|
includes/
|
|
@ -0,0 +1,18 @@
|
||||||
|
{
|
||||||
|
"configurations": [
|
||||||
|
{
|
||||||
|
"name": "Linux",
|
||||||
|
"includePath": [
|
||||||
|
"${workspaceFolder}/**",
|
||||||
|
"${workspaceFolder}/includes/",
|
||||||
|
"/usr/lib/jvm/java-8-openjdk-amd64/include/",
|
||||||
|
"/usr/lib/jvm/java-8-openjdk-amd64/include/linux/"
|
||||||
|
],
|
||||||
|
"defines": [],
|
||||||
|
"compilerPath": "/usr/bin/clang",
|
||||||
|
"cStandard": "c11",
|
||||||
|
"cppStandard": "c++17"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"version": 4
|
||||||
|
}
|
|
@ -0,0 +1,11 @@
|
||||||
|
|
||||||
|
|
||||||
|
# None of these targets produces a file of the same name; declaring them phony
# keeps a stray file called "clean" or "compile" from disabling the rule.
.PHONY: createhfiles compile clean

# Generate the JNI header for CrawlerThread into includes/ from the packaged jar.
createhfiles:
	mkdir -p includes/
	javah -d includes/ -classpath target/YoutubeCrawler-0.0.2-jar-with-dependencies.jar de.mrbesen.youtubecrawler.CrawlerThread

# Build the native crawler library against the generated header, the JDK JNI headers and libcurl.
compile:
	g++ -shared -fPIC -o libcrawlerthread.so cpp/crawlerthread.cpp -Iincludes/ -I/usr/lib/jvm/java-8-openjdk-amd64/include/ -I/usr/lib/jvm/java-8-openjdk-amd64/include/linux/ -lcurl

# Remove everything the targets above generate.
clean:
	rm -rf includes/ libcrawlerthread.so
|
|
@ -0,0 +1,112 @@
|
||||||
|
#include "de_mrbesen_youtubecrawler_CrawlerThread.h"
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <set>
|
||||||
|
#include <string>
|
||||||
|
#include <sstream>
|
||||||
|
#include <regex>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include <curl/curl.h>
|
||||||
|
|
||||||
|
|
||||||
|
static const std::string YOUTUBEBASE = "https://youtube.com/watch?v=";
|
||||||
|
static const std::regex YOUTUBELINKPATTERN("watch\\?v=([-_a-zA-Z0-9]{11})");
|
||||||
|
|
||||||
|
std::vector<CURL*> curls;
|
||||||
|
|
||||||
|
/**
 * Write callback handed to curl: appends the received chunk to the
 * std::ostringstream passed through userp.
 *
 * @param contents start of the received bytes
 * @param size     size of one element
 * @param nmemb    number of elements
 * @param userp    pointer to the std::ostringstream collecting the body
 * @return size * nmemb, i.e. the whole chunk is reported as consumed
 */
static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp) {
	const size_t chunksize = size * nmemb;
	std::ostringstream& sink = *static_cast<std::ostringstream*>(userp);
	sink.write(static_cast<const char*>(contents), static_cast<std::streamsize>(chunksize));
	return chunksize;
}
|
||||||
|
|
||||||
|
/**
 * Creates one curl easy handle per crawler thread and appends them to `curls`.
 * Every handle follows redirects and delivers the body through WriteCallback.
 *
 * @param env         JNI environment, used to raise a Java exception on failure
 * @param clazz       the CrawlerThread class (unused)
 * @param threadcount number of handles to create; values <= 0 create nothing
 *
 * If curl cannot create a handle, a java.lang.IllegalStateException is left
 * pending and the function returns early; handles created so far stay in
 * `curls` and are released by deinitLib().
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_initLib(JNIEnv* env, jclass clazz, jint threadcount) {
	if(threadcount <= 0) {
		// a negative count would turn into a huge size_t and make reserve() throw into the JVM
		return;
	}

	curls.reserve(threadcount);
	for(jint i = 0; i < threadcount; ++i) {
		CURL* curl = curl_easy_init();
		if(curl == nullptr) {
			// never store a NULL handle: crawl() would hand it straight to curl
			jclass exclass = env->FindClass("java/lang/IllegalStateException");
			if(exclass != nullptr) {
				env->ThrowNew(exclass, "curl_easy_init() failed");
			}
			return;
		}
		// the option is read as a long through varargs, so pass a long literal
		curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
		curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
		curls.push_back(curl);
	}
}
|
||||||
|
|
||||||
|
/**
 * Releases every curl easy handle stored in `curls` and empties the vector.
 */
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_deinitLib(JNIEnv* env, jclass) {
	for(CURL* handle : curls) {
		curl_easy_cleanup(handle);
	}
	curls.clear();
}
|
||||||
|
|
||||||
|
/**
 * Downloads `url` with the given curl handle and returns the response body.
 *
 * @param curl handle to perform the transfer with
 * @param url  address to fetch
 * @return the body on HTTP 200; an empty string if the transfer failed or the
 *         server answered with any other status (the reason is printed to stdout)
 */
std::string download(CURL* curl, const std::string& url) {
	std::ostringstream out;
	curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &out);

	// Check the transfer result first: when the transfer itself fails there is
	// no meaningful HTTP status, and the curl error is what explains the failure.
	const CURLcode res = curl_easy_perform(curl);
	if(res != CURLE_OK) {
		std::cout << "Curl error: " << res << " (" << curl_easy_strerror(res) << ")" << std::endl;
		return "";
	}

	long responsecode = 404; // reported if curl cannot provide the status
	curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &responsecode);
	if(responsecode != 200) {
		std::cout << "Curl error: got " << responsecode << std::endl;
		return "";
	}

	return out.str();
}
|
||||||
|
|
||||||
|
/**
 * Downloads the watch page of `videoid` and adds every other video id linked
 * from it to the `found` list of the calling CrawlerThread object.
 *
 * @param env      JNI environment
 * @param that     the CrawlerThread instance; its monitor is held while `found` is filled
 * @param videoid  id of the video whose page is crawled
 * @param threadid index into `curls` selecting the curl handle to use
 * @return JNI_TRUE if the page was downloaded and scanned; JNI_FALSE if an
 *         argument is invalid, the download produced nothing, a JNI lookup
 *         failed, or a Java exception is pending
 */
JNIEXPORT jboolean JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl(JNIEnv* env, jobject that, jstring videoid, jint threadid) {
	// get videoid argument
	if(videoid == nullptr) {
		return JNI_FALSE;
	}
	const char* cvideid = env->GetStringUTFChars(videoid, nullptr);
	if(cvideid == nullptr) {
		return JNI_FALSE; // allocation failed, the JVM has an exception pending
	}
	const std::string svideoid(cvideid); // makes a copy
	env->ReleaseStringUTFChars(videoid, cvideid);

	// an out-of-range id must not raise a C++ exception inside a JNI call
	if(threadid < 0 || static_cast<size_t>(threadid) >= curls.size()) {
		std::cout << "invalid thread id: " << threadid << std::endl;
		return JNI_FALSE;
	}

	// use curl to get the website
	CURL* curl = curls[threadid];
	const std::string webcontent = download(curl, YOUTUBEBASE + svideoid);
	if(webcontent.empty()) {
		std::cout << "webcontent is empty" << std::endl;
		return JNI_FALSE;
	}

	// resolve CrawlerThread.found and List.add(); each failed lookup leaves an exception pending
	jclass jclass_ct = env->FindClass("de/mrbesen/youtubecrawler/CrawlerThread");
	if(jclass_ct == nullptr) {
		return JNI_FALSE;
	}
	jfieldID fid_ctfound = env->GetFieldID(jclass_ct, "found" , "Ljava/util/List;");
	if(fid_ctfound == nullptr) {
		return JNI_FALSE;
	}
	jclass jclass_ll = env->FindClass("java/util/List");
	if(jclass_ll == nullptr) {
		return JNI_FALSE;
	}
	jmethodID mid_add = env->GetMethodID(jclass_ll, "add", "(Ljava/lang/Object;)Z");
	if(mid_add == nullptr) {
		return JNI_FALSE;
	}
	jobject ll_found = env->GetObjectField(that, fid_ctfound); // list to store the results to
	if(ll_found == nullptr) {
		return JNI_FALSE;
	}

	// match regex
	std::sregex_iterator it(webcontent.begin(), webcontent.end(), YOUTUBELINKPATTERN);
	const std::sregex_iterator itend;

	// Filters ids that repeat within this page. It only lives for this call,
	// so duplicates across pages are not caught here.
	std::set<std::string> known;
	known.insert(svideoid); // do not "find" the same ytid

	if(env->MonitorEnter(that) != JNI_OK) { // synchronized(this) {
		return JNI_FALSE;
	}
	for( ; it != itend; ++it) {
		const std::string ytid = (*it)[1].str(); // first sub-group match

		// only if the ytid is not known yet
		if(!known.insert(ytid).second) {
			continue;
		}

		jstring jytid = env->NewStringUTF(ytid.c_str()); // create a java string object
		if(jytid == nullptr) {
			break; // allocation failed, exception pending
		}
		env->CallBooleanMethod(ll_found, mid_add, jytid); // found.add(ytid)
		// release the reference right away so the local reference table does not fill up
		env->DeleteLocalRef(jytid);
		if(env->ExceptionCheck()) {
			break; // no further JNI calls that are unsafe with a pending exception
		}
	}
	env->MonitorExit(that); // end of synchronized block

	return env->ExceptionCheck() ? JNI_FALSE : JNI_TRUE;
}
|
||||||
|
|
18
pom.xml
18
pom.xml
|
@ -5,7 +5,8 @@
|
||||||
<artifactId>YoutubeCrawler</artifactId>
|
<artifactId>YoutubeCrawler</artifactId>
|
||||||
<version>0.0.2</version>
|
<version>0.0.2</version>
|
||||||
<build>
|
<build>
|
||||||
<sourceDirectory>src</sourceDirectory>
|
<sourceDirectory>src/main</sourceDirectory>
|
||||||
|
<testSourceDirectory>src/test</testSourceDirectory>
|
||||||
<plugins>
|
<plugins>
|
||||||
<plugin>
|
<plugin>
|
||||||
<artifactId>maven-compiler-plugin</artifactId>
|
<artifactId>maven-compiler-plugin</artifactId>
|
||||||
|
@ -59,8 +60,8 @@
|
||||||
</plugins>
|
</plugins>
|
||||||
</build>
|
</build>
|
||||||
<properties>
|
<properties>
|
||||||
<maven.compiler.target>1.8</maven.compiler.target>
|
<!-- Java 11 is named "11"; javac accepts the "1.x" spelling only up to 1.10 -->
<maven.compiler.target>11</maven.compiler.target>
|
||||||
<maven.compiler.source>1.8</maven.compiler.source>
|
<!-- Java 11 is named "11"; javac accepts the "1.x" spelling only up to 1.10 -->
<maven.compiler.source>11</maven.compiler.source>
|
||||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
|
||||||
</properties>
|
</properties>
|
||||||
|
@ -85,5 +86,16 @@
|
||||||
<artifactId>guava</artifactId>
|
<artifactId>guava</artifactId>
|
||||||
<version>11.0.2</version>
|
<version>11.0.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.junit.jupiter</groupId>
|
||||||
|
<artifactId>junit-jupiter-engine</artifactId>
|
||||||
|
<version>5.8.1</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.projectlombok</groupId>
|
||||||
|
<artifactId>lombok</artifactId>
|
||||||
|
<version>1.18.12</version>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -1,114 +0,0 @@
|
||||||
package de.mrbesen.youtubecrawler;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
|
|
||||||
public class CrawlerThread implements Runnable {
|
|
||||||
|
|
||||||
private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");
|
|
||||||
|
|
||||||
private Logger log = Logger.getLogger(this.getClass().getName());
|
|
||||||
private Crawler parent;
|
|
||||||
Thread thread;
|
|
||||||
|
|
||||||
LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
|
|
||||||
LinkedList<String> crawled = new LinkedList<>();//videos this thread had crawled
|
|
||||||
LinkedList<String> found = new LinkedList<>();//videos this thread had found
|
|
||||||
|
|
||||||
boolean requested = true;//is a request pending?
|
|
||||||
boolean lockforreport = false;
|
|
||||||
|
|
||||||
public CrawlerThread( Crawler root) {
|
|
||||||
parent = root;
|
|
||||||
root.request(this);
|
|
||||||
}
|
|
||||||
|
|
||||||
void setThread(Thread t) {
|
|
||||||
thread = t;
|
|
||||||
}
|
|
||||||
|
|
||||||
LinkedList<String> undone() {
|
|
||||||
return todo;
|
|
||||||
}
|
|
||||||
|
|
||||||
int undoneSize() {
|
|
||||||
return todo.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void run() {
|
|
||||||
while(parent.isCrawling()) {
|
|
||||||
while(!todo.isEmpty() && parent.isCrawling()) {
|
|
||||||
if(lockforreport) {
|
|
||||||
try {
|
|
||||||
Thread.sleep(10);
|
|
||||||
} catch(InterruptedException e) {
|
|
||||||
lockforreport = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
crawl(todo.removeFirst());
|
|
||||||
if(todo.size() < parent.requestlimit && !requested) {
|
|
||||||
requested = true;
|
|
||||||
parent.request(this);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if(todo.isEmpty() && !requested) {
|
|
||||||
requested = true;
|
|
||||||
parent.request(this);
|
|
||||||
}
|
|
||||||
|
|
||||||
log.warn("No Object left!");
|
|
||||||
Thread.yield();
|
|
||||||
try {
|
|
||||||
Thread.sleep(10000);//sleep for 10 seconds
|
|
||||||
} catch (InterruptedException ignored) {}
|
|
||||||
}
|
|
||||||
log.info("Stopped.");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* returns a linkedlist of all crawled videos
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
LinkedList<String>[] report() {
|
|
||||||
lockforreport = true;
|
|
||||||
LinkedList<String>[] out = new LinkedList[] {(LinkedList) crawled, (LinkedList) found};
|
|
||||||
crawled = new LinkedList<>();
|
|
||||||
found = new LinkedList<>();
|
|
||||||
lockforreport = false;
|
|
||||||
thread.interrupt();
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void crawl(String videoid) {
|
|
||||||
try {
|
|
||||||
crawled.add(videoid);
|
|
||||||
|
|
||||||
// log.info("crawling: " + videoid);
|
|
||||||
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
|
|
||||||
String s = con.getContent();
|
|
||||||
Matcher matcher = linkpattern.matcher(s);
|
|
||||||
while(matcher.find()) {
|
|
||||||
int beginytid = matcher.end();
|
|
||||||
int endxtid = s.indexOf('"', beginytid);
|
|
||||||
int endid = s.indexOf('&', beginytid);
|
|
||||||
if(endid < endxtid) {
|
|
||||||
endxtid = endid;
|
|
||||||
}
|
|
||||||
String ytid = s.substring(beginytid, endxtid);
|
|
||||||
if(ytid.length() > 9 && ytid.length() <= 12) {
|
|
||||||
found.add(ytid);
|
|
||||||
} else {
|
|
||||||
// log.warn("youtube id has wrong length: \"" + ytid + "\"");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch(IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,60 +0,0 @@
|
||||||
package de.mrbesen.youtubecrawler;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.PrintWriter;
|
|
||||||
import java.net.ServerSocket;
|
|
||||||
import java.net.Socket;
|
|
||||||
import java.net.SocketTimeoutException;
|
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
|
|
||||||
public class Server implements Runnable {
|
|
||||||
|
|
||||||
private ServerSocket ssoc;
|
|
||||||
private Thread t;
|
|
||||||
private boolean run = false;
|
|
||||||
private Logger log = Logger.getLogger(this.getClass().getName());
|
|
||||||
private DB db;
|
|
||||||
|
|
||||||
public Server(DB d) {
|
|
||||||
db = d;
|
|
||||||
}
|
|
||||||
|
|
||||||
void start() {
|
|
||||||
run = true;
|
|
||||||
t = new Thread(this, "Server");
|
|
||||||
t.start();
|
|
||||||
}
|
|
||||||
|
|
||||||
void stop() {
|
|
||||||
run = false;
|
|
||||||
t.interrupt();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void run() {
|
|
||||||
try {
|
|
||||||
final int port = 2419;
|
|
||||||
ssoc = new ServerSocket(port, -1);
|
|
||||||
//ssoc.bind(new InetSocketAddress(, port));
|
|
||||||
ssoc.setSoTimeout(5);
|
|
||||||
log.info("opened Server at port " + port);
|
|
||||||
while(run) {
|
|
||||||
try {
|
|
||||||
Socket client = ssoc.accept();
|
|
||||||
//if(client.getInetAddress().isLoopbackAddress()) {
|
|
||||||
PrintWriter out = new PrintWriter(client.getOutputStream());
|
|
||||||
out.println(db.getRandom());
|
|
||||||
out.flush();
|
|
||||||
out.close();
|
|
||||||
/*} else {
|
|
||||||
log.info("client connected: " + client.getInetAddress().toString());
|
|
||||||
}*/
|
|
||||||
client.close();
|
|
||||||
} catch(SocketTimeoutException ignored) {}
|
|
||||||
}
|
|
||||||
ssoc.close();
|
|
||||||
} catch(IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,162 +0,0 @@
|
||||||
package de.mrbesen.youtubecrawler;
|
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.text.DateFormat;
|
|
||||||
import java.text.ParseException;
|
|
||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import javax.net.ssl.HttpsURLConnection;
|
|
||||||
|
|
||||||
import org.apache.log4j.Logger;
|
|
||||||
|
|
||||||
import de.mrbesen.youtubecrawler.Crawler.Video;
|
|
||||||
|
|
||||||
public class YoutubeAPI {
|
|
||||||
|
|
||||||
private final String api_key = Config.prop.getProperty("youtube.apikey");
|
|
||||||
private static String basequery = "https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails&id=";
|
|
||||||
private static DateFormat dateformat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
|
||||||
private Logger log = Logger.getLogger(YoutubeAPI.class.getName());
|
|
||||||
|
|
||||||
public YoutubeAPI() {
|
|
||||||
if(api_key.isEmpty()) {
|
|
||||||
log.error("apikey is not defined!");
|
|
||||||
System.exit(1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public Video getInfo(String id) {
|
|
||||||
return (Video) getInfos(id)[0].get(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<Video>[] getInfos(List<String> ids) {
|
|
||||||
//log.info("get " + ids.size() + " infos");
|
|
||||||
if(ids.isEmpty())
|
|
||||||
return null;
|
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder(ids.remove(0));
|
|
||||||
while(!ids.isEmpty()) {
|
|
||||||
sb.append(',').append(ids.remove(0));
|
|
||||||
}
|
|
||||||
return getInfos(sb.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<Video>[] getInfos(String idlist) {
|
|
||||||
ArrayList<Video> out = new ArrayList<Video>(idlist.length() / 12);//approximierte vorraussichtliche länge
|
|
||||||
LinkedList<Video> livestr = new LinkedList<Video>();
|
|
||||||
String nextpage = "";
|
|
||||||
do {
|
|
||||||
String query = basequery + idlist + "&key=" + api_key;
|
|
||||||
BufferedReader br = connect(query);
|
|
||||||
nextpage = "";
|
|
||||||
if(br != null) {
|
|
||||||
try {
|
|
||||||
String line;
|
|
||||||
Video v = null;
|
|
||||||
boolean tags = false;
|
|
||||||
while((line = br.readLine()) != null) {
|
|
||||||
String split[] = line.split(":",2);
|
|
||||||
if(split.length == 2) {
|
|
||||||
split[0] = removeunwanted(split[0]);
|
|
||||||
|
|
||||||
//System.out.println(split[0] + " " + split[1]);
|
|
||||||
if(split[0].equals("defaultAudioLanguage")) {
|
|
||||||
v.languageCode = removeunwanted(split[1]);
|
|
||||||
} else if(split[0].equals("defaultLanguage")) {
|
|
||||||
v.languageCode = removeunwanted(split[1]);
|
|
||||||
} else if(split[0].equals("title")) {
|
|
||||||
if(v.title.isEmpty())
|
|
||||||
v.title = removeunwanted(split[1]);
|
|
||||||
} else if(split[0].equals("channelTitle")) {
|
|
||||||
v.channel = removeunwanted(split[1]);
|
|
||||||
} else if(split[0].equals("defaultLanguage")) {
|
|
||||||
v.languageCode = removeunwanted(split[1]);
|
|
||||||
} else if(split[0].equals("tags")) {
|
|
||||||
tags = true;
|
|
||||||
} else if(split[0].equals("liveBroadcastContent")) {
|
|
||||||
v.live = !removeunwanted(split[1]).equalsIgnoreCase("none");
|
|
||||||
} else if(split[0].equals("id")) {
|
|
||||||
if(v != null) {
|
|
||||||
if(!v.live)
|
|
||||||
out.add(v);
|
|
||||||
else {
|
|
||||||
livestr.add(v);
|
|
||||||
log.info("livestream found! " + v.id + " " + v.channel);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
v = new Video();
|
|
||||||
v.id = removeunwanted(split[1]);
|
|
||||||
//System.out.println("new video: " + v.id + " " + v.length + " " + v.languageCode);
|
|
||||||
} else if(split[0].equals("categoryId")) {
|
|
||||||
v.categorie = Byte.parseByte(removeunwanted(split[1]));
|
|
||||||
} else if(split[0].equals("duration")) {
|
|
||||||
String timeparts[] = removeunwanted(split[1]).substring(2).split("[H,M,S]");
|
|
||||||
try {
|
|
||||||
if(timeparts.length > 2) {//hours
|
|
||||||
v.length = 3600 * Integer.parseInt(timeparts[0]);
|
|
||||||
}
|
|
||||||
if(timeparts.length > 1) {//minutes
|
|
||||||
v.length += 60 * Integer.parseInt(timeparts[timeparts.length -2]);
|
|
||||||
}
|
|
||||||
//Seconds
|
|
||||||
v.length += Integer.parseInt(timeparts[timeparts.length-1]);
|
|
||||||
} catch(NumberFormatException e) {//failed: P6DT17H59M53S and P15W3DT4H1M11S and P1W2DT20H47M55S video id: 1NPyC0psMaI and P2W2DT23H58M58S video id: Jd9KjbRxhN4 For input string: "W2DT23"
|
|
||||||
Main.getMain().broadcastAdmin(removeunwanted(split[1]) + " video id: " + v.id);
|
|
||||||
log.warn("Error saving the time string: " + removeunwanted(split[1]) + " video id: " + v.id, e);
|
|
||||||
}
|
|
||||||
} else if(split[0].equals("publishedAt")) {
|
|
||||||
String tmp = removeunwanted(split[1]);
|
|
||||||
tmp = tmp.replace('T', ' ');
|
|
||||||
tmp = tmp.substring(0, tmp.length()-5);
|
|
||||||
Date d = dateformat.parse(tmp);
|
|
||||||
v.created = d.getTime() / 1000;
|
|
||||||
}else if(split[0].equals("nextPageToken")) {
|
|
||||||
nextpage = "&pageToken=" + removeunwanted(split[1]);
|
|
||||||
// System.out.println("nextpage set to " + nextpage);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if(line.contains("]")) {
|
|
||||||
if(v.tags.length() > 1)
|
|
||||||
v.tags = v.tags.substring(1);
|
|
||||||
tags = false;
|
|
||||||
} else if(tags) {
|
|
||||||
v.tags += ", " + removeunwanted(line);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
out.add(v);//add the last video
|
|
||||||
br.close();
|
|
||||||
} catch(IOException | ParseException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} while(!nextpage.equals(""));
|
|
||||||
//log.info("got " + (out.size() + livestr.size()) + " infos");
|
|
||||||
return new List[] {out, livestr};
|
|
||||||
}
|
|
||||||
|
|
||||||
private String removeunwanted(String in) {
|
|
||||||
return in.replaceAll("[\"}{\\,\\\\]", "").replaceAll("'", "").trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
public BufferedReader connect(String url) {
|
|
||||||
try {
|
|
||||||
URL urll = new URL(url);
|
|
||||||
HttpsURLConnection con = (HttpsURLConnection) urll.openConnection();
|
|
||||||
con.connect();
|
|
||||||
//System.out.println(con.getResponseCode());
|
|
||||||
return new BufferedReader(new InputStreamReader(con.getInputStream()));
|
|
||||||
} catch(IOException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// no suchelement bla
|
|
|
@ -7,13 +7,10 @@ import java.io.IOException;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
import java.text.DateFormat;
|
import java.text.DateFormat;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Date;
|
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.NoSuchElementException;
|
|
||||||
import java.util.Scanner;
|
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
public class Crawler implements Runnable {
|
public class Crawler implements Runnable {
|
||||||
|
@ -22,8 +19,8 @@ public class Crawler implements Runnable {
|
||||||
int requestlimit = 5;//amount of videos to be left in the todo que of a thread until it requests new videos
|
int requestlimit = 5;//amount of videos to be left in the todo que of a thread until it requests new videos
|
||||||
private int idlecount = 5;//amount of idle loops allowed
|
private int idlecount = 5;//amount of idle loops allowed
|
||||||
|
|
||||||
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, witch need to be analysed
|
private Set<String> toSave = new TreeSet<>();//all found ytids, witch need to be analysed
|
||||||
private LinkedList<String> toCrawl = new LinkedList<>();//all videos tu crawl
|
private Set<String> toCrawl = new TreeSet<>();//all videos tu crawl
|
||||||
//private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
|
//private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
|
||||||
private List<CrawlerThread> threads;//list of all threads
|
private List<CrawlerThread> threads;//list of all threads
|
||||||
private List<CrawlerThread> requested = new LinkedList<>();
|
private List<CrawlerThread> requested = new LinkedList<>();
|
||||||
|
@ -36,10 +33,11 @@ public class Crawler implements Runnable {
|
||||||
//private int updateOffset = 0;
|
//private int updateOffset = 0;
|
||||||
|
|
||||||
private DB db = new DB();
|
private DB db = new DB();
|
||||||
private YoutubeAPI api = new YoutubeAPI();
|
private YoutubeAPI api = new YoutubeAPI(Config.prop.getProperty("youtube.apikey"));
|
||||||
private File crawlfile = new File("crawl.txt");
|
private File crawlfile = new File("crawl.txt");
|
||||||
private Logger log = Logger.getLogger(this.getClass().getName());
|
private Logger log = Logger.getLogger(this.getClass().getName());
|
||||||
private Profiler profiler = new Profiler();
|
private Profiler profiler = new Profiler();
|
||||||
|
private long lastadminreport = 0;
|
||||||
|
|
||||||
private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep
|
private int startup = 2;//to keep the beginning cool - counter how often the program is allowed to enter startup sleep
|
||||||
|
|
||||||
|
@ -70,7 +68,7 @@ public class Crawler implements Runnable {
|
||||||
//if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
//if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
||||||
//toknown.add(videoid);
|
//toknown.add(videoid);
|
||||||
if(toCrawl.contains(videoid)) {
|
if(toCrawl.contains(videoid)) {
|
||||||
ArrayList<String> str = new ArrayList<String>(1);
|
ArrayList<String> str = new ArrayList<>(1);
|
||||||
str.add(videoid);
|
str.add(videoid);
|
||||||
db.storeTemp(str, false);
|
db.storeTemp(str, false);
|
||||||
}
|
}
|
||||||
|
@ -89,22 +87,17 @@ public class Crawler implements Runnable {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void send(CrawlerThread t) {
|
private void send(CrawlerThread t) {
|
||||||
// listlock.writeLock().lock();
|
synchronized (toCrawl) {
|
||||||
for(int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
|
for (int i = 0; i < jobspeerthread && !toCrawl.isEmpty(); i++) {
|
||||||
t.todo.add(toCrawl.removeFirst());
|
String s = toCrawl.stream().findAny().get();
|
||||||
|
toCrawl.remove(s);
|
||||||
|
t.todo.add(s);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// listlock.writeLock().unlock();
|
|
||||||
t.requested = false;
|
t.requested = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
private void loadCrawlFile() {
|
||||||
public void run() {
|
|
||||||
profiler.profilingEnabled = true;
|
|
||||||
profiler.clearProfiling();
|
|
||||||
profiler.startSection("root");
|
|
||||||
profiler.startSection("startup");
|
|
||||||
profiler.startSection("loadingcrawlfile");
|
|
||||||
start = System.currentTimeMillis();
|
|
||||||
log.info("Try to load crawlfile");
|
log.info("Try to load crawlfile");
|
||||||
if(crawlfile.exists()) {
|
if(crawlfile.exists()) {
|
||||||
try {
|
try {
|
||||||
|
@ -134,7 +127,9 @@ public class Crawler implements Runnable {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
profiler.endStartSection("populateThreads");//loading crawlfile closed
|
}
|
||||||
|
|
||||||
|
private int createThreads() {
|
||||||
//populate threads
|
//populate threads
|
||||||
int threadcount = 4;
|
int threadcount = 4;
|
||||||
try {
|
try {
|
||||||
|
@ -143,13 +138,93 @@ public class Crawler implements Runnable {
|
||||||
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
|
log.warn("Could not read the Number \"" + Config.prop.getProperty("crawler.threadcount") + "\" from the Config.");
|
||||||
}
|
}
|
||||||
threads = new ArrayList<>(threadcount);
|
threads = new ArrayList<>(threadcount);
|
||||||
|
CrawlerThread.initLib(threadcount);
|
||||||
for(int i = 0; i < threadcount; i++) {
|
for(int i = 0; i < threadcount; i++) {
|
||||||
CrawlerThread thr = new CrawlerThread( this);
|
CrawlerThread thr = new CrawlerThread( this, i);
|
||||||
thr.setThread(new Thread(thr, "Crawler #" + i));
|
thr.setThread(new Thread(thr, "Crawler #" + i));
|
||||||
threads.add(thr);
|
threads.add(thr);
|
||||||
thr.thread.start();
|
thr.thread.start();
|
||||||
}
|
}
|
||||||
|
return threadcount;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void getreports() {
|
||||||
|
log.info("get report");
|
||||||
|
for (CrawlerThread crawlerThread : threads) {
|
||||||
|
String threadname = crawlerThread.thread.getName();
|
||||||
|
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
|
||||||
|
List<String>[] report = crawlerThread.report();
|
||||||
|
crawlcount+= report[0].size();
|
||||||
|
toSave.addAll(report[0]);
|
||||||
|
crawlerThread.crawled.clear();
|
||||||
|
|
||||||
|
int count = 0;
|
||||||
|
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
|
||||||
|
ArrayList<String> store = null;
|
||||||
|
try {
|
||||||
|
if(report[1].size() <= 50) {
|
||||||
|
store = new ArrayList<>(report[1]);
|
||||||
|
count += report[1].size();
|
||||||
|
report[1].clear();
|
||||||
|
} else {
|
||||||
|
store = new ArrayList<>(report[1].subList(0, 50));
|
||||||
|
report[1].removeAll(store);
|
||||||
|
count+=50;
|
||||||
|
}
|
||||||
|
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
|
||||||
|
log.info("no suchelement bla");
|
||||||
|
}
|
||||||
|
db.storeTemp(store, false);
|
||||||
|
}
|
||||||
|
log.info(count + " videos added from " + threadname);
|
||||||
|
profiler.endSection();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void savetodb() {
|
||||||
|
log.info("save " + toSave.size() + " videos to DB.");
|
||||||
|
synchronized (toSave) {
|
||||||
|
while (!toSave.isEmpty()) {
|
||||||
|
Set<String> videoids = new TreeSet<>();
|
||||||
|
for (int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
||||||
|
String save = toSave.stream().findAny().get();
|
||||||
|
toSave.remove(save);
|
||||||
|
videoids.add(save);
|
||||||
|
}
|
||||||
|
if (videoids.size() > 0) {
|
||||||
|
profiler.startSection("getinfo");
|
||||||
|
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
|
||||||
|
profiler.endStartSection("sendtoDB");
|
||||||
|
db.addVideos(videos, false);
|
||||||
|
profiler.endSection();//sendtoDB
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void sendAdminMessage() {
|
||||||
|
long currenttime = System.currentTimeMillis();
|
||||||
|
if((currenttime - lastadminreport) / 1000 > 3600) {
|
||||||
|
long runtimes = (currenttime - start) / 1000;
|
||||||
|
if (runtimes < 0)
|
||||||
|
runtimes = 1;
|
||||||
|
float vidps = (crawlcount / (float) runtimes);//videos per second
|
||||||
|
Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
|
||||||
|
lastadminreport = currenttime;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
profiler.profilingEnabled = true;
|
||||||
|
profiler.clearProfiling();
|
||||||
|
profiler.startSection("root");
|
||||||
|
profiler.startSection("startup");
|
||||||
|
profiler.startSection("loadingcrawlfile");
|
||||||
|
start = System.currentTimeMillis();
|
||||||
|
loadCrawlFile();
|
||||||
|
profiler.endStartSection("populateThreads");//loading crawlfile closed
|
||||||
|
int threadcount = createThreads();
|
||||||
profiler.endStartSection("deleteDouble");//populate threads
|
profiler.endStartSection("deleteDouble");//populate threads
|
||||||
long lastdoubledelete = System.currentTimeMillis();
|
long lastdoubledelete = System.currentTimeMillis();
|
||||||
//db.deleteDouble();
|
//db.deleteDouble();
|
||||||
|
@ -187,7 +262,6 @@ public class Crawler implements Runnable {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// updateDB();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
//nothing left?
|
//nothing left?
|
||||||
|
@ -195,27 +269,6 @@ public class Crawler implements Runnable {
|
||||||
log.warn("nothing left to crawl");
|
log.warn("nothing left to crawl");
|
||||||
}
|
}
|
||||||
|
|
||||||
//refil the tocrawl list.
|
|
||||||
/*if(!toknown.isEmpty()) {
|
|
||||||
//check in db for known videos
|
|
||||||
log.info("Checking the DB");
|
|
||||||
currentstate = "get new tocrawl";
|
|
||||||
// listlock.writeLock().lock();
|
|
||||||
while(toCrawl.size() < jobspeerthread * threads.size() * 2 && crawl && !toknown.isEmpty()) {
|
|
||||||
LinkedList<String> tocheck = new LinkedList<>();
|
|
||||||
for(int i = 0; i < toknown.size() && i < maxvideostotest; i++) {
|
|
||||||
tocheck.add(toknown.removeFirst());
|
|
||||||
}
|
|
||||||
toCrawl.addAll(db.checkvideos(tocheck));
|
|
||||||
}
|
|
||||||
// listlock.writeLock().unlock();
|
|
||||||
}
|
|
||||||
while(toknown.size() < threadcount * jobspeerthread * 20 && crawl) {
|
|
||||||
currentstate = "restoretemp";
|
|
||||||
log.info("restoreTemp");
|
|
||||||
LinkedList<String> rest = db.restoreTemp();
|
|
||||||
toknown.addAll(rest);
|
|
||||||
}*/
|
|
||||||
{
|
{
|
||||||
profiler.endStartSection("loadCrawl");
|
profiler.endStartSection("loadCrawl");
|
||||||
boolean joined = true;
|
boolean joined = true;
|
||||||
|
@ -244,60 +297,14 @@ public class Crawler implements Runnable {
|
||||||
|
|
||||||
//get reports
|
//get reports
|
||||||
profiler.endStartSection("getreport");
|
profiler.endStartSection("getreport");
|
||||||
log.info("get report");
|
getreports();
|
||||||
for (CrawlerThread crawlerThread : threads) {
|
|
||||||
String threadname = crawlerThread.thread.getName();
|
|
||||||
profiler.startSection("T" + threadname.substring(threadname.lastIndexOf('#')+1));
|
|
||||||
LinkedList<String>[] report = crawlerThread.report();
|
|
||||||
crawlcount+= report[0].size();
|
|
||||||
toSave.addAll(report[0]);
|
|
||||||
crawlerThread.crawled.clear();
|
|
||||||
|
|
||||||
int count = 0;
|
|
||||||
while(report[1].size() > 1) {//2 videos werden ggf. gelöscht ohne gesehen zu werden.
|
|
||||||
ArrayList<String> store = null;
|
|
||||||
try {
|
|
||||||
if(report[1].size() <= 50) {
|
|
||||||
store = new ArrayList<>(report[1]);
|
|
||||||
count += report[1].size();
|
|
||||||
report[1].clear();
|
|
||||||
} else {
|
|
||||||
store = new ArrayList<>(report[1].subList(0, 50));
|
|
||||||
report[1].removeAll(store);
|
|
||||||
count+=50;
|
|
||||||
}
|
|
||||||
} catch(NoSuchElementException ignored) {//concurrentmodification fuckery
|
|
||||||
log.info("no suchelement bla");
|
|
||||||
}
|
|
||||||
db.storeTemp(store, false);
|
|
||||||
}
|
|
||||||
log.info(count + " videos added from " + threadname);
|
|
||||||
profiler.endSection();
|
|
||||||
}
|
|
||||||
|
|
||||||
profiler.endStartSection("debug");
|
profiler.endStartSection("debug");
|
||||||
long runtimes = (System.currentTimeMillis() - start) / 1000;
|
sendAdminMessage();
|
||||||
if(runtimes < 0)
|
|
||||||
runtimes = 1;
|
|
||||||
float vidps = (crawlcount / (float) runtimes);//videos per second
|
|
||||||
Main.getMain().broadcastAdmin(vidps + "v/s " + crawlcount + " total V");
|
|
||||||
|
|
||||||
//save to db
|
//save to db
|
||||||
profiler.endStartSection("save2DB");
|
profiler.endStartSection("save2DB");
|
||||||
log.info("save " + toSave.size() + " videos to DB.");
|
savetodb();
|
||||||
while(!toSave.isEmpty()) {
|
|
||||||
LinkedList<String> videoids = new LinkedList<>();
|
|
||||||
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
|
||||||
videoids.add(toSave.remove(0));
|
|
||||||
}
|
|
||||||
if(videoids.size() > 0) {
|
|
||||||
profiler.startSection("getinfo");
|
|
||||||
ArrayList<Video> videos = (ArrayList<Video>) api.getInfos(videoids)[0];
|
|
||||||
profiler.endStartSection("sendtoDB");
|
|
||||||
db.addVideos(videos, false);
|
|
||||||
profiler.endSection();//sendtoDB
|
|
||||||
}
|
|
||||||
}
|
|
||||||
profiler.endSection();//save2DB
|
profiler.endSection();//save2DB
|
||||||
|
|
||||||
//at the beginning there is maybe just one video to crawl, so keep it calm.
|
//at the beginning there is maybe just one video to crawl, so keep it calm.
|
||||||
|
@ -324,6 +331,9 @@ public class Crawler implements Runnable {
|
||||||
}
|
}
|
||||||
profiler.endSection();//main
|
profiler.endSection();//main
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
profiler.startSection("waitforthreads");
|
profiler.startSection("waitforthreads");
|
||||||
for(CrawlerThread ct : threads) {
|
for(CrawlerThread ct : threads) {
|
||||||
try {
|
try {
|
||||||
|
@ -353,6 +363,7 @@ public class Crawler implements Runnable {
|
||||||
int runtimem = (int) (runtimes / 60);
|
int runtimem = (int) (runtimes / 60);
|
||||||
float vidps = (crawlcount / (float) runtimes);//videos per second
|
float vidps = (crawlcount / (float) runtimes);//videos per second
|
||||||
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
|
log.info("Crawling Stopped. Runtime: " + runtimem + "min and " + crawlcount + " videos crawled. ( " + vidps + " v/s )");
|
||||||
|
CrawlerThread.deinitLib();
|
||||||
Main.getMain().stopcallback();
|
Main.getMain().stopcallback();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -383,7 +394,7 @@ public class Crawler implements Runnable {
|
||||||
out += "\nThread Nr, todo size, requested, crawledsize, foundsize";
|
out += "\nThread Nr, todo size, requested, crawledsize, foundsize";
|
||||||
for (int i = 0; i < threads.size(); i++) {
|
for (int i = 0; i < threads.size(); i++) {
|
||||||
CrawlerThread thre = threads.get(i);
|
CrawlerThread thre = threads.get(i);
|
||||||
out += "\n " + i + " " + (thre.lockforreport ? "\uD83D\uDD12" : "\uD83D\uDD13") + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
|
out += "\n " + i + " " + thre.todo.size() + " " + thre.requested + " " + thre.crawled.size() + " " + thre.found.size();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return out;
|
return out;
|
||||||
|
@ -417,6 +428,8 @@ public class Crawler implements Runnable {
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@AllArgsConstructor
|
||||||
|
@NoArgsConstructor
|
||||||
public static class Video {
|
public static class Video {
|
||||||
String id = "";
|
String id = "";
|
||||||
String title = "";
|
String title = "";
|
|
@ -0,0 +1,135 @@
|
||||||
|
package de.mrbesen.youtubecrawler;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
|
public class CrawlerThread implements Runnable {
|
||||||
|
|
||||||
|
private static Pattern linkpattern = Pattern.compile("watch\\?v=([-_a-zA-Z0-9]{11})");
|
||||||
|
|
||||||
|
private Logger log = Logger.getLogger(this.getClass().getName());
|
||||||
|
private Crawler parent;
|
||||||
|
Thread thread;
|
||||||
|
|
||||||
|
LinkedList<String> todo = new LinkedList<>();//videos, this thread should crawl
|
||||||
|
List<String> crawled = new LinkedList<>();//videos this thread had crawled
|
||||||
|
List<String> found = new LinkedList<>();//videos this thread had found
|
||||||
|
|
||||||
|
static AtomicInteger fails = new AtomicInteger(0);
|
||||||
|
private static int MAXFAILS = 100;
|
||||||
|
|
||||||
|
boolean requested = true;//is a request pending?
|
||||||
|
private int threadid;
|
||||||
|
|
||||||
|
// Loads the native "crawlerthread" JNI library that backs initLib, deinitLib and crawl.
static {
    // Kept from the original: append the working directory to java.library.path.
    // NOTE(review): the JVM may already have read this property before this class is
    // initialised, in which case the change is not picked up - hence the fallback below.
    String libpath = System.getProperty("java.library.path");
    libpath += ":./";
    System.setProperty("java.library.path", libpath);
    try {
        System.loadLibrary("crawlerthread");
    } catch (UnsatisfiedLinkError notOnLibraryPath) {
        // Fallback: load the platform-specific file name (e.g. libcrawlerthread.so) from the
        // working directory by absolute path. A failure here still surfaces as UnsatisfiedLinkError.
        String fileName = System.mapLibraryName("crawlerthread");
        System.load(new java.io.File(fileName).getAbsolutePath());
    }
}
|
||||||
|
|
||||||
|
public static native void initLib(int threadCount);
|
||||||
|
public static native void deinitLib();
|
||||||
|
|
||||||
|
/**
 * Creates a worker for the given crawler and registers a first request with it.
 *
 * @param root     crawler that owns this worker; kept in {@code parent} and asked via {@code request(this)}
 * @param threadid id of this worker; later handed to the native {@code crawl} call
 */
public CrawlerThread(Crawler root, int threadid) {
    this.parent = root;
    // the field 'requested' starts out true, matching the request issued here
    this.parent.request(this);
    this.threadid = threadid;
}
|
||||||
|
|
||||||
|
void setThread(Thread t) {
|
||||||
|
thread = t;
|
||||||
|
}
|
||||||
|
|
||||||
|
LinkedList<String> undone() {
|
||||||
|
return todo;
|
||||||
|
}
|
||||||
|
|
||||||
|
int undoneSize() {
|
||||||
|
return todo.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
while(parent.isCrawling()) {
|
||||||
|
synchronized (this) {
|
||||||
|
while (!todo.isEmpty() && parent.isCrawling()) {
|
||||||
|
String vid = todo.removeFirst();
|
||||||
|
// System.out.println("crawling: " + vid + " size: " + found.size());
|
||||||
|
crawled.add(vid);
|
||||||
|
|
||||||
|
boolean success = crawl(vid, threadid);
|
||||||
|
|
||||||
|
if (todo.size() < parent.requestlimit && !requested) {
|
||||||
|
requested = true;
|
||||||
|
parent.request(this);
|
||||||
|
}
|
||||||
|
if (!success) {
|
||||||
|
int val = fails.addAndGet(1);
|
||||||
|
if (val > MAXFAILS) {
|
||||||
|
System.err.println("Max Crawlfails reached, stopping");
|
||||||
|
parent.stop();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (todo.isEmpty() && !requested) {
|
||||||
|
requested = true;
|
||||||
|
parent.request(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
log.warn("No Object left!");
|
||||||
|
Thread.yield();
|
||||||
|
try {
|
||||||
|
Thread.sleep(10000);//sleep for 10 seconds
|
||||||
|
} catch (InterruptedException ignored) {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.info("Stopped.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* returns a list of all crawled videos
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
List<String>[] report() {
|
||||||
|
synchronized (this) {
|
||||||
|
List<String>[] out = new List[]{crawled, found};
|
||||||
|
crawled = new LinkedList<>();
|
||||||
|
found = new LinkedList<>();
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
private void crawl(String videoid) {
|
||||||
|
try {
|
||||||
|
crawled.add(videoid);
|
||||||
|
|
||||||
|
// log.info("crawling: " + videoid);
|
||||||
|
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
|
||||||
|
String s = con.getContent();
|
||||||
|
Matcher matcher = linkpattern.matcher(s);
|
||||||
|
while(matcher.find()) {
|
||||||
|
String ytid = matcher.group(1);
|
||||||
|
if(!ytid.equals(videoid)) {
|
||||||
|
found.add(ytid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch(IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
// returns false when it fails
|
||||||
|
private native boolean crawl(String videid, int threadid);
|
||||||
|
}
|
|
@ -23,17 +23,19 @@ public class DB implements Runnable {
|
||||||
private Logger log = Logger.getLogger(DB.class.getName());
|
private Logger log = Logger.getLogger(DB.class.getName());
|
||||||
private ArrayList<String> randombuffer = new ArrayList<>(100);
|
private ArrayList<String> randombuffer = new ArrayList<>(100);
|
||||||
private Random rand = new Random();
|
private Random rand = new Random();
|
||||||
private Server serv = new Server(this);
|
|
||||||
private Thread randomrefill = null;
|
private Thread randomrefill = null;
|
||||||
private int dbsize = 0;
|
private int dbsize = 0;
|
||||||
|
|
||||||
private StringBuilder tostorebuffer ;
|
private StringBuilder tostorebuffer;
|
||||||
private int writebuffersize = 500;
|
private int writebuffersize = 500;
|
||||||
private int writebuffercurrentsize = 0;
|
private int writebuffercurrentsize = 0;
|
||||||
|
|
||||||
private StringBuilder totempbuffer;
|
private StringBuilder totempbuffer;
|
||||||
private int writetempbuffercurrentsize = 0;
|
private int writetempbuffercurrentsize = 0;
|
||||||
|
|
||||||
|
private final int TEMPBUFFERRATIO = 15;
|
||||||
|
private final int STOREBUFFERRATIO = 100;
|
||||||
|
|
||||||
public DB() {
|
public DB() {
|
||||||
try {
|
try {
|
||||||
connect(false);
|
connect(false);
|
||||||
|
@ -60,7 +62,6 @@ public class DB implements Runnable {
|
||||||
log.info("Database is set up!");
|
log.info("Database is set up!");
|
||||||
}
|
}
|
||||||
|
|
||||||
serv.start();
|
|
||||||
refillbuffer();
|
refillbuffer();
|
||||||
|
|
||||||
//get db size
|
//get db size
|
||||||
|
@ -72,8 +73,8 @@ public class DB implements Runnable {
|
||||||
} catch(NumberFormatException e) {
|
} catch(NumberFormatException e) {
|
||||||
log.warn("could not read the number \"" + Config.prop.getProperty("db.writebuffersize") + "\" from the config file. db.writebuffersize");
|
log.warn("could not read the number \"" + Config.prop.getProperty("db.writebuffersize") + "\" from the config file. db.writebuffersize");
|
||||||
}
|
}
|
||||||
tostorebuffer = new StringBuilder(writebuffersize);
|
tostorebuffer = new StringBuilder(writebuffersize * STOREBUFFERRATIO);
|
||||||
totempbuffer = new StringBuilder(writebuffersize);
|
totempbuffer = new StringBuilder(writebuffersize * TEMPBUFFERRATIO);
|
||||||
} catch (SQLException e) {
|
} catch (SQLException e) {
|
||||||
log.error("Error while connecting to the database! ", e);
|
log.error("Error while connecting to the database! ", e);
|
||||||
}
|
}
|
||||||
|
@ -98,10 +99,9 @@ public class DB implements Runnable {
|
||||||
|
|
||||||
private void connect(boolean selectdb) {
|
private void connect(boolean selectdb) {
|
||||||
try {
|
try {
|
||||||
Class.forName("com.mysql.jdbc.Driver");//Treiber laden try this driver: com.mysql.cj.jdbc.Driver
|
|
||||||
//verbinden
|
//verbinden
|
||||||
con = DriverManager.getConnection("jdbc:mysql://" + server + ":" + port + "/" + (selectdb ? db : "") + "?serverTimezone=UTC" ,user,pw);
|
con = DriverManager.getConnection("jdbc:mysql://" + server + ":" + port + "/" + (selectdb ? db : "") + "?serverTimezone=UTC&verifyServerCertificate=false&useSSL=true&useUnicode=true&characterEncoding=utf-8", user, pw);
|
||||||
}catch (ClassNotFoundException | SQLException e) {
|
}catch (SQLException e) {
|
||||||
log.error("Error while connecting to the database! ", e);
|
log.error("Error while connecting to the database! ", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -135,14 +135,13 @@ public class DB implements Runnable {
|
||||||
* @param input
|
* @param input
|
||||||
*/
|
*/
|
||||||
public void addVideos(ArrayList<Video> input, boolean force) {
|
public void addVideos(ArrayList<Video> input, boolean force) {
|
||||||
//log.info("add " + input.size() + " videos");
|
|
||||||
if(input != null) {
|
if(input != null) {
|
||||||
if(input.size() > 0) {
|
if(input.size() > 0) {
|
||||||
writebuffercurrentsize += input.size();
|
writebuffercurrentsize += input.size();
|
||||||
for(int i = 0; i < input.size(); i++) {
|
for(int i = 0; i < input.size(); i++) {
|
||||||
Video v = input.get(i);
|
Video v = input.get(i);
|
||||||
if(v != null)
|
if(v != null)
|
||||||
tostorebuffer.append(",('").append(v.id).append("','").append(v.length).append("','").append(v.created).append("','").append(v.languageCode).append("','").append(v.categorie).append("','").append(v.title).append("','").append(v.channel).append("','").append(v.tags).append("') ");
|
tostorebuffer.append("('").append(escape(v.id)).append("',").append(v.length).append(",").append(v.created).append(",'").append(escape(v.languageCode)).append("',").append(v.categorie).append(",'").append(escape(v.title)).append("','").append(escape(v.channel)).append("','").append(escape(v.tags)).append("'),");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -150,17 +149,21 @@ public class DB implements Runnable {
|
||||||
if(tostorebuffer.length() > 10) {
|
if(tostorebuffer.length() > 10) {
|
||||||
log.info("Write databuffer to DB video count: " + writebuffercurrentsize);
|
log.info("Write databuffer to DB video count: " + writebuffercurrentsize);
|
||||||
dbsize += writebuffercurrentsize;
|
dbsize += writebuffercurrentsize;
|
||||||
tostorebuffer.deleteCharAt(0);//delete leading ','
|
tostorebuffer.deleteCharAt(tostorebuffer.length()-1);//delete trailing ','
|
||||||
String qu = "INSERT IGNORE INTO `ytcrawler`.`videos`(`id`, `length`, `created`, `langcode`, `category`, `videotitle`, `channel`, `tags`) VALUES " + tostorebuffer.toString();
|
String qu = "INSERT IGNORE INTO `videos`(`id`, `length`, `created`, `langcode`, `category`, `videotitle`, `channel`, `tags`) VALUES " + tostorebuffer.toString();
|
||||||
update(qu);
|
update(qu);
|
||||||
|
|
||||||
//reset buffer
|
//reset buffer
|
||||||
writebuffercurrentsize = 0;
|
writebuffercurrentsize = 0;
|
||||||
tostorebuffer = new StringBuilder(writebuffersize);
|
tostorebuffer = new StringBuilder(writebuffersize * STOREBUFFERRATIO);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String escape(String e) {
|
||||||
|
return e.replace("'", "\\'");
|
||||||
|
}
|
||||||
|
|
||||||
public void updateVideos(List<Video> input) {
|
public void updateVideos(List<Video> input) {
|
||||||
log.info("Updateing " + input.size() + " videos.");
|
log.info("Updateing " + input.size() + " videos.");
|
||||||
for(Video v : input) {
|
for(Video v : input) {
|
||||||
|
@ -171,7 +174,7 @@ public class DB implements Runnable {
|
||||||
|
|
||||||
private void updateVideo(Video v) {
|
private void updateVideo(Video v) {
|
||||||
try {
|
try {
|
||||||
String qu = "UPDATE `ytcrawler`.`videos` SET `length` = '" + v.length + "', `created` = '" + v.created + "', `langcode` = SUBSTR('" + v.languageCode + "', 1, 3) ,`category` = '" + v.categorie + "',`videotitle` = SUBSTR('" + v.title + "',1,100),`channel` = SUBSTR('" + v.channel + "',1,20),`tags` = '" + v.tags.substring(0, v.tags.length() > 40 ? 40 : v.tags.length()) + "' WHERE `id` = '" + v.id + "';";
|
String qu = "UPDATE `videos` SET `length` = " + v.length + ", `created` = " + v.created + ", `langcode` = SUBSTR('" + v.languageCode + "', 1, 3) ,`category` = " + v.categorie + ",`videotitle` = SUBSTR('" + escape(v.title) + "',1,100),`channel` = SUBSTR('" + escape(v.channel) + "',1,20),`tags` = '" + escape(v.tags) + "' WHERE `id` = '" + escape(v.id) + "';";
|
||||||
update(qu);
|
update(qu);
|
||||||
} catch(NullPointerException e) {
|
} catch(NullPointerException e) {
|
||||||
|
|
||||||
|
@ -195,7 +198,7 @@ public class DB implements Runnable {
|
||||||
public void removeVideos(LinkedList<Video> vids) {
|
public void removeVideos(LinkedList<Video> vids) {
|
||||||
log.info("Delete " + vids.size() + " videos.");
|
log.info("Delete " + vids.size() + " videos.");
|
||||||
for(Video s : vids) {
|
for(Video s : vids) {
|
||||||
update("DELETE FROM `ytcrawler`.`videos` WHERE `id`='" + s.id + "';");
|
update("DELETE FROM `videos` WHERE `id`='" + escape(s.id) + "';");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -258,14 +261,14 @@ public class DB implements Runnable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public LinkedList<String> restoreTemp() {
|
public LinkedList<String> restoreTemp() {
|
||||||
ResultSet res = query("SELECT * FROM `ytcrawler`.`temp` LIMIT 500;");
|
ResultSet res = query("SELECT * FROM `temp` LIMIT 500;");
|
||||||
LinkedList<String> out = new LinkedList<>();
|
LinkedList<String> out = new LinkedList<>();
|
||||||
log.info("RestoreTemp");
|
log.info("RestoreTemp");
|
||||||
try {
|
try {
|
||||||
while(res.next()) {
|
while(res.next()) {
|
||||||
out.add(res.getString(1));
|
out.add(res.getString(1));
|
||||||
}
|
}
|
||||||
update("DELETE FROM `ytcrawler`.`temp` LIMIT 500;");
|
update("DELETE FROM `temp` LIMIT 500;");
|
||||||
} catch (Exception e) {}
|
} catch (Exception e) {}
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
@ -273,7 +276,7 @@ public class DB implements Runnable {
|
||||||
public void deleteDouble() {
|
public void deleteDouble() {
|
||||||
log.info("Started Delete Double");
|
log.info("Started Delete Double");
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
update("call ytcrawler.deletedouble();");
|
update("CALL deletedouble();");
|
||||||
log.info("Delete Double done in " + ((System.currentTimeMillis() - start)/60000) + " min");
|
log.info("Delete Double done in " + ((System.currentTimeMillis() - start)/60000) + " min");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -284,18 +287,18 @@ public class DB implements Runnable {
|
||||||
log.info("store Temp to buffer: " + strings.size());
|
log.info("store Temp to buffer: " + strings.size());
|
||||||
writetempbuffercurrentsize += strings.size();
|
writetempbuffercurrentsize += strings.size();
|
||||||
for(String s : strings) {
|
for(String s : strings) {
|
||||||
totempbuffer.append(", ('").append(s).append("')");
|
totempbuffer.append("('").append(escape(s)).append("'),");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if(writetempbuffercurrentsize > writebuffersize || force) {
|
if(writetempbuffercurrentsize > writebuffersize || force) {
|
||||||
log.info("Write Buffer: " + writetempbuffercurrentsize);
|
log.info("Write Buffer: " + writetempbuffercurrentsize);
|
||||||
totempbuffer.deleteCharAt(0);//delete leading ','
|
totempbuffer.deleteCharAt(totempbuffer.length()-1);//delete trailing ','
|
||||||
String qu = "INSERT IGNORE INTO `ytcrawler`.`temp` (`ytid`) VALUES " + totempbuffer.toString() + ";";
|
String qu = "INSERT IGNORE INTO `temp` (`ytid`) VALUES " + totempbuffer.toString() + ";";
|
||||||
update(qu);
|
update(qu);
|
||||||
|
|
||||||
//reset
|
//reset
|
||||||
writetempbuffercurrentsize = 0;
|
writetempbuffercurrentsize = 0;
|
||||||
totempbuffer = new StringBuilder(writebuffersize);
|
totempbuffer = new StringBuilder(writebuffersize * TEMPBUFFERRATIO);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -304,7 +307,6 @@ public class DB implements Runnable {
|
||||||
* Stops the randomnes-Server and disconnect
|
* Stops the randomnes-Server and disconnect
|
||||||
*/
|
*/
|
||||||
public void stop() {
|
public void stop() {
|
||||||
serv.stop();
|
|
||||||
try {
|
try {
|
||||||
if(con != null) {
|
if(con != null) {
|
||||||
if(!con.isClosed()) {
|
if(!con.isClosed()) {
|
||||||
|
@ -323,23 +325,18 @@ public class DB implements Runnable {
|
||||||
@Override
|
@Override
|
||||||
public void run() {
|
public void run() {
|
||||||
log.info("Started Refilling.");
|
log.info("Started Refilling.");
|
||||||
ResultSet count = query("SELECT `" + db + "`.`getLimit`() as 'l';");
|
|
||||||
if(count != null) {
|
|
||||||
try {
|
try {
|
||||||
if(count.next()) {
|
ResultSet set = query("SELECT `id` FROM `videos` ORDER BY rand() LIMIT 100;");
|
||||||
int max = count.getInt("l");
|
if(set != null) {
|
||||||
ResultSet set = query("SELECT `id` FROM `videos` LIMIT " + rand.nextInt(max) + ",100;");
|
while(set.next()) {
|
||||||
if(set != null) {
|
randombuffer.add(set.getString(1));
|
||||||
while(set.next()) {
|
|
||||||
randombuffer.add(set.getString(1));
|
|
||||||
}
|
|
||||||
log.info("refilled randombuffer to " + randombuffer.size() + " videos.");
|
|
||||||
}
|
}
|
||||||
|
log.info("refilled randombuffer to " + randombuffer.size() + " videos.");
|
||||||
}
|
}
|
||||||
} catch (SQLException e) {
|
} catch (SQLException e) {
|
||||||
log.warn("error getting a random video", e);
|
log.warn("error getting a random video", e);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
if(randombuffer.isEmpty()) {
|
if(randombuffer.isEmpty()) {
|
||||||
log.error("Unable to retrieve RandomVideos");
|
log.error("Unable to retrieve RandomVideos");
|
||||||
}
|
}
|
|
@ -68,7 +68,7 @@ public class Main implements JSONCommandHandler {
|
||||||
}
|
}
|
||||||
|
|
||||||
//starting BOT API
|
//starting BOT API
|
||||||
tapi = new TelegramAPI(Config.prop.getProperty("telegramapi.key"));
|
tapi = new TelegramAPI(Config.prop.getProperty("telegramapi.key"), "randomytvideobot");
|
||||||
tapi.getCommandManager().registerCommand( this);
|
tapi.getCommandManager().registerCommand( this);
|
||||||
tapi.getEventManager().registerEvent(UserSendMessageEvent.class, this::onAdmin);
|
tapi.getEventManager().registerEvent(UserSendMessageEvent.class, this::onAdmin);
|
||||||
tapi.setHelpText("Send the command /random to get a random video.");
|
tapi.setHelpText("Send the command /random to get a random video.");
|
|
@ -0,0 +1,178 @@
|
||||||
|
package de.mrbesen.youtubecrawler;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.net.HttpURLConnection;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.text.DateFormat;
|
||||||
|
import java.text.ParseException;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import javax.xml.datatype.DatatypeConfigurationException;
|
||||||
|
import javax.xml.datatype.DatatypeFactory;
|
||||||
|
import javax.xml.datatype.Duration;
|
||||||
|
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.json.JSONArray;
|
||||||
|
import org.json.JSONObject;
|
||||||
|
import org.json.JSONTokener;
|
||||||
|
|
||||||
|
public class YoutubeAPI {
|
||||||
|
|
||||||
|
private String api_key = null;
|
||||||
|
private static String BASEQUERY = "https://www.googleapis.com/youtube/v3/";
|
||||||
|
private static DateFormat dateformat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
||||||
|
private Logger log = Logger.getLogger(YoutubeAPI.class.getName());
|
||||||
|
private DatatypeFactory durationfactory = null;
|
||||||
|
|
||||||
|
/**
 * Stores the API key and creates the {@link DatatypeFactory} used by getDuration.
 * If the factory cannot be created, the stack trace is printed and the JVM exits with status 1.
 *
 * @param apikey key appended to every request as the {@code key} parameter
 */
public YoutubeAPI(String apikey) {
    this.api_key = apikey;
    try {
        this.durationfactory = DatatypeFactory.newInstance();
    } catch (DatatypeConfigurationException factoryFailure) {
        factoryFailure.printStackTrace();
        System.exit(1);
    }
}
|
||||||
|
|
||||||
|
public Crawler.Video getInfo(String id) {
|
||||||
|
return getInfos(id)[0].get(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Crawler.Video>[] getInfos(Collection<String> ids) {
|
||||||
|
if(ids.isEmpty())
|
||||||
|
return null;
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
boolean isFirst = true;
|
||||||
|
for(String id : ids) {
|
||||||
|
if(id.matches("[a-zA-Z0-9_-]{11}")) {
|
||||||
|
if(!isFirst) {
|
||||||
|
sb.append(',');
|
||||||
|
}
|
||||||
|
sb.append(id);
|
||||||
|
isFirst = false;
|
||||||
|
} else {
|
||||||
|
System.out.println("non matching id: \"" + id + "\"");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return getInfos(sb.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<Integer, String> getCategories() {
|
||||||
|
String query = BASEQUERY + "videoCategories?part=snippet®ionCode=us&key=" + api_key;
|
||||||
|
JSONObject obj = parse(connect(query));
|
||||||
|
Map<Integer, String> out = new TreeMap<>();
|
||||||
|
if(obj != null) {
|
||||||
|
JSONArray items = obj.getJSONArray("items");
|
||||||
|
for(int i = 0; !items.isNull(i); ++i) {
|
||||||
|
JSONObject item = items.getJSONObject(i);
|
||||||
|
String id = item.getString("id");
|
||||||
|
String name = item.getJSONObject("snippet").getString("title");
|
||||||
|
try {
|
||||||
|
int intid = Integer.parseInt(id);
|
||||||
|
out.put(intid, name);
|
||||||
|
// System.out.println(intid + ";" + name);
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Crawler.Video>[] getInfos(String idlist) {
|
||||||
|
ArrayList<Crawler.Video> out = new ArrayList<>(idlist.length() / 12);//approximierte vorraussichtliche länge
|
||||||
|
LinkedList<Crawler.Video> livestr = new LinkedList<>();
|
||||||
|
String nextpage = "";
|
||||||
|
do {
|
||||||
|
String query = BASEQUERY + "videos?part=snippet,contentDetails&id=" + idlist + nextpage + "&key=" + api_key;
|
||||||
|
JSONObject json = parse(connect(query));
|
||||||
|
nextpage = "";
|
||||||
|
if(json != null) {
|
||||||
|
if(json.has("items")) {
|
||||||
|
//get video list
|
||||||
|
json.getJSONArray("items").forEach(item -> out.add( getVid((JSONObject) item) ));
|
||||||
|
|
||||||
|
if(json.has("nextPageToken")) {
|
||||||
|
nextpage = "&pageToken=" + json.getString("nextPageToken");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while(!nextpage.isEmpty());
|
||||||
|
|
||||||
|
return new List[] {out, livestr};
|
||||||
|
}
|
||||||
|
|
||||||
|
private Crawler.Video getVid(JSONObject json) {
|
||||||
|
String vdid = json.getString("id");
|
||||||
|
JSONObject snippet = json.getJSONObject("snippet");
|
||||||
|
String title = snippet.optString("title", ""); //maxlen: 100
|
||||||
|
long published = getDate(snippet.optString("publishedAt", ""));
|
||||||
|
String channel = snippet.optString("channelTitle", "");
|
||||||
|
|
||||||
|
String tags = "";
|
||||||
|
if(snippet.optJSONArray("tags") != null)
|
||||||
|
tags = snippet.getJSONArray("tags").toList().stream().map(o -> (String) o).collect(Collectors.joining(",")); // max len: ~500
|
||||||
|
|
||||||
|
byte category = 0;
|
||||||
|
try {
|
||||||
|
category = Byte.parseByte(snippet.getString("categoryId"));
|
||||||
|
} catch(NumberFormatException e) {}
|
||||||
|
|
||||||
|
JSONObject contentDetails = json.getJSONObject("contentDetails");
|
||||||
|
int duration = (int) getDuration(contentDetails.optString("duration", ""));
|
||||||
|
boolean live = !snippet.getString("liveBroadcastContent").equalsIgnoreCase("none");
|
||||||
|
|
||||||
|
String langCode = snippet.optString("defaultLanguage", snippet.optString("defaultAudioLanguage", ""));
|
||||||
|
if(langCode.length() > 3) {
|
||||||
|
langCode = langCode.substring(0, 3);
|
||||||
|
}
|
||||||
|
if(langCode.endsWith("-")) {
|
||||||
|
langCode = langCode.substring(0, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new Crawler.Video(vdid, title, channel, tags, duration, langCode, category, published, live);
|
||||||
|
}
|
||||||
|
|
||||||
|
private long getDate(String format) {
|
||||||
|
try {
|
||||||
|
Date d = dateformat.parse(format.substring(0, 19).replace('T', ' '));
|
||||||
|
return d.getTime() / 1000;
|
||||||
|
} catch (ParseException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
System.err.println("Failed to parse date: " + format);
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private long getDuration(String iso8601) {
|
||||||
|
Duration dur = durationfactory.newDuration(iso8601);
|
||||||
|
return dur.getTimeInMillis(new Date(0)) / 1000;
|
||||||
|
}
|
||||||
|
|
||||||
|
public BufferedReader connect(String url) {
|
||||||
|
if(url == null)
|
||||||
|
return null;
|
||||||
|
try {
|
||||||
|
URL urll = new URL(url);
|
||||||
|
//System.out.println("url: " + urll.toString());
|
||||||
|
HttpURLConnection con = (HttpURLConnection) urll.openConnection();
|
||||||
|
con.connect();
|
||||||
|
//System.out.println(con.getResponseCode());
|
||||||
|
return new BufferedReader(new InputStreamReader(con.getInputStream()));
|
||||||
|
} catch(IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public JSONObject parse(BufferedReader in) {
|
||||||
|
if(in == null)
|
||||||
|
return null;
|
||||||
|
return new JSONObject(new JSONTokener(in));
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
package de.mrbesen.youtubecrawler;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
public class YoutubeAPITest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testgetVideo() {
|
||||||
|
YoutubeAPI api = new YoutubeAPI(System.getenv("YOUTUBEAPIKEY"));
|
||||||
|
|
||||||
|
Crawler.Video vid = api.getInfo("gsvKF2ojUzs");
|
||||||
|
|
||||||
|
Assertions.assertEquals(vid.id, "gsvKF2ojUzs");
|
||||||
|
Assertions.assertEquals(vid.title, "Mikebøi - Missed");
|
||||||
|
Assertions.assertEquals(vid.channel, "TrapNation");
|
||||||
|
Assertions.assertEquals(vid.tags, "mikeboi missed,trap nation,trapnation,трап натион,electronic music 2020,Trap,Electronic Dance Music,missed trap nation,trap music,Electronic Music,Trap Music,Dance Music,missed mike boi,gaming music,Trap Music 2017,mike boy missed,mikebøi - missed,нас не догонят ремикс,Trap Nation,TrapNation,Mikebøi - Missed,trap nation 2020,trap music 2020 remix,EDM,missed mikeboi,music");
|
||||||
|
Assertions.assertEquals(vid.length, 213);
|
||||||
|
Assertions.assertEquals(vid.languageCode, "en");
|
||||||
|
Assertions.assertEquals(vid.categorie, 10);
|
||||||
|
Assertions.assertEquals(vid.created, 1491571496);
|
||||||
|
Assertions.assertFalse(vid.live);
|
||||||
|
|
||||||
|
Map<Integer, String> obj = api.getCategories();
|
||||||
|
|
||||||
|
Assertions.assertNotNull(obj);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue