added tests, new YoutubeAPI with JSON

This commit is contained in:
mrbesen 2021-10-18 21:12:08 +02:00
parent e57842d3e1
commit ba4134345a
Signed by: MrBesen
GPG Key ID: 596B2350DCD67504
11 changed files with 196 additions and 169 deletions

19
pom.xml
View File

@ -5,7 +5,8 @@
<artifactId>YoutubeCrawler</artifactId>
<version>0.0.2</version>
<build>
<sourceDirectory>src</sourceDirectory>
<sourceDirectory>src/main</sourceDirectory>
<testSourceDirectory>src/test</testSourceDirectory>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
@ -22,7 +23,7 @@
<configuration>
<archive>
<manifest>
<mainClass>de.mrbesen.youtubecrawler.Main</mainClass>
<mainClass>Main</mainClass>
</manifest>
</archive>
</configuration>
@ -32,7 +33,7 @@
<artifactId>exec-maven-plugin</artifactId>
<version>1.2.1</version>
<configuration>
<mainClass>de.mrbesen.youtubecrawler.Main</mainClass>
<mainClass>Main</mainClass>
</configuration>
</plugin>
<plugin>
@ -51,7 +52,7 @@
</descriptorRefs>
<archive>
<manifest>
<mainClass>de.mrbesen.youtubecrawler.Main</mainClass>
<mainClass>Main</mainClass>
</manifest>
</archive>
</configuration>
@ -59,8 +60,8 @@
</plugins>
</build>
<properties>
<maven.compiler.target>1.8</maven.compiler.target>
<maven.compiler.source>1.8</maven.compiler.source>
<!-- Releases from Java 9 on are written without the "1." prefix; javac rejects "1.11" as an invalid release. -->
<maven.compiler.target>11</maven.compiler.target>
<maven.compiler.source>11</maven.compiler.source>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
@ -85,5 +86,11 @@
<artifactId>guava</artifactId>
<version>11.0.2</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-engine</artifactId>
<version>5.8.1</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -1,162 +0,0 @@
package de.mrbesen.youtubecrawler;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import javax.net.ssl.HttpsURLConnection;
import org.apache.log4j.Logger;
import de.mrbesen.youtubecrawler.Crawler.Video;
/**
 * Client for the YouTube Data API v3 {@code videos} endpoint.
 *
 * <p>The response is not parsed as JSON: it is scanned line by line and every line is
 * split at its first colon into a key and a value. This only works as long as each
 * key/value pair of the response sits on its own line.
 */
public class YoutubeAPI {

    private final String api_key = Config.prop.getProperty("youtube.apikey");
    private static String basequery = "https://www.googleapis.com/youtube/v3/videos?part=snippet,contentDetails&id=";
    // NOTE(review): SimpleDateFormat is not thread-safe and parses in the JVM default
    // time zone - confirm that is intended for the "publishedAt" values.
    private static DateFormat dateformat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    private Logger log = Logger.getLogger(YoutubeAPI.class.getName());

    /**
     * Terminates the JVM with exit code 1 when the {@code youtube.apikey} property is
     * missing or empty.
     */
    public YoutubeAPI() {
        // getProperty returns null for a missing key, so check for null before isEmpty()
        if(api_key == null || api_key.isEmpty()) {
            log.error("apikey is not defined!");
            System.exit(1);
        }
    }

    /**
     * Fetches the metadata of a single video.
     *
     * @param id the video id
     * @return the video (regular or livestream), or {@code null} when none was found
     */
    public Video getInfo(String id) {
        List<Video>[] infos = getInfos(id);
        if(!infos[0].isEmpty()) {
            return infos[0].get(0);
        }
        if(!infos[1].isEmpty()) {
            return infos[1].get(0);
        }
        return null;
    }

    /**
     * Fetches the metadata of several videos. The passed list is emptied.
     *
     * @param ids the video ids; drained by this call
     * @return {@code null} when {@code ids} is empty, otherwise see {@link #getInfos(String)}
     */
    public List<Video>[] getInfos(List<String> ids) {
        if(ids.isEmpty())
            return null;

        String idlist = String.join(",", ids);
        // the list has always been consumed by this method; keep that contract
        ids.clear();
        return getInfos(idlist);
    }

    /**
     * Fetches the metadata of the videos named in a comma separated id list.
     *
     * @param idlist comma separated video ids
     * @return a two element array: index 0 holds the regular videos, index 1 the livestreams
     */
    @SuppressWarnings("unchecked")
    public List<Video>[] getInfos(String idlist) {
        ArrayList<Video> out = new ArrayList<Video>(idlist.length() / 12);// estimated number of results
        LinkedList<Video> livestr = new LinkedList<Video>();

        String nextpage = "";
        do {
            // the page token has to be part of the query, otherwise the same page is fetched again
            String query = basequery + idlist + nextpage + "&key=" + api_key;
            nextpage = "";
            try(BufferedReader br = connect(query)) {
                if(br != null) {
                    nextpage = parsePage(br, out, livestr);
                }
            } catch(IOException | ParseException e) {
                log.warn("Failed to read the video list response", e);
            }
        } while(!nextpage.isEmpty());

        return new List[] {out, livestr};
    }

    /**
     * Scans one response page and stores every completed video in the matching list.
     *
     * @return the {@code &pageToken=...} query suffix for the next page, or "" when there is none
     */
    private String parsePage(BufferedReader br, List<Video> out, List<Video> livestr) throws IOException, ParseException {
        String nextpage = "";
        Video v = null;
        boolean tags = false;
        String line;
        while((line = br.readLine()) != null) {
            String[] split = line.split(":", 2);
            if(split.length != 2) {
                // line without a key: either an entry of the tag list or a closing bracket
                if(line.contains("]")) {
                    if(tags && v != null && v.tags.startsWith(", ")) {
                        // drop the separator that was put in front of the first tag
                        v.tags = v.tags.substring(2);
                    }
                    tags = false;
                } else if(tags && v != null) {
                    v.tags += ", " + removeunwanted(line);
                }
                continue;
            }

            String key = removeunwanted(split[0]);
            String value = removeunwanted(split[1]);

            if(key.equals("id")) {
                // an id starts a new video, so the previous one is complete
                store(v, out, livestr);
                v = new Video();
                v.id = value;
                continue;
            }
            if(key.equals("nextPageToken")) {
                nextpage = "&pageToken=" + value;
                continue;
            }
            if(v == null) {
                // properties in front of the first id do not belong to any video
                continue;
            }

            switch(key) {
                case "defaultAudioLanguage":
                case "defaultLanguage":
                    v.languageCode = value;
                    break;
                case "title":
                    // the title shows up more than once per video, keep the first one
                    if(v.title.isEmpty())
                        v.title = value;
                    break;
                case "channelTitle":
                    v.channel = value;
                    break;
                case "tags":
                    // a list that is closed on the same line has no entries to collect
                    tags = !split[1].contains("]");
                    break;
                case "liveBroadcastContent":
                    v.live = !value.equalsIgnoreCase("none");
                    break;
                case "categoryId":
                    try {
                        v.categorie = Byte.parseByte(value);
                    } catch(NumberFormatException e) {
                        log.warn("Unexpected category id: " + value + " video id: " + v.id, e);
                    }
                    break;
                case "duration":
                    try {
                        v.length = parseDuration(value);
                    } catch(NumberFormatException e) {
                        Main.getMain().broadcastAdmin(value + " video id: " + v.id);
                        log.warn("Error saving the time string: " + value + " video id: " + v.id, e);
                    }
                    break;
                case "publishedAt":
                    if(value.length() >= 19) {
                        Date d = dateformat.parse(value.substring(0, 19).replace('T', ' '));
                        v.created = d.getTime() / 1000;
                    }
                    break;
                default:
                    break;
            }
        }
        store(v, out, livestr);// the last video is not followed by another id
        return nextpage;
    }

    /** Adds a completed video to the livestream or the regular list; {@code null} is ignored. */
    private void store(Video v, List<Video> out, List<Video> livestr) {
        if(v == null) {
            return;
        }
        if(v.live) {
            livestr.add(v);
            log.info("livestream found! " + v.id + " " + v.channel);
        } else {
            out.add(v);
        }
    }

    /**
     * Converts an ISO 8601 duration made of weeks, days, hours, minutes and seconds
     * (for example {@code PT4M13S} or {@code P1W2DT20H47M55S}) into seconds.
     *
     * @throws NumberFormatException when the text does not have that form
     */
    private static int parseDuration(String iso) {
        if(!iso.startsWith("P")) {
            throw new NumberFormatException("For input string: \"" + iso + "\"");
        }
        long seconds = 0;
        int digitsStart = -1;
        boolean timePart = false;
        for(int i = 1; i < iso.length(); i++) {
            char c = iso.charAt(i);
            if(c >= '0' && c <= '9') {
                if(digitsStart < 0) {
                    digitsStart = i;
                }
                continue;
            }
            if(c == 'T' && digitsStart < 0) {
                timePart = true;
                continue;
            }
            if(digitsStart < 0) {
                throw new NumberFormatException("For input string: \"" + iso + "\"");
            }
            long amount = Long.parseLong(iso.substring(digitsStart, i));
            digitsStart = -1;

            long unit;
            if(c == 'W' && !timePart) {
                unit = 7L * 24L * 3600L;
            } else if(c == 'D' && !timePart) {
                unit = 24L * 3600L;
            } else if(c == 'H' && timePart) {
                unit = 3600L;
            } else if(c == 'M' && timePart) {
                unit = 60L;
            } else if(c == 'S' && timePart) {
                unit = 1L;
            } else {
                throw new NumberFormatException("For input string: \"" + iso + "\"");
            }
            seconds += amount * unit;
        }
        if(digitsStart >= 0) {
            // digits without a unit designator
            throw new NumberFormatException("For input string: \"" + iso + "\"");
        }
        return (int) seconds;
    }

    /** Removes quotes, braces, commas and backslashes and trims the rest. */
    private String removeunwanted(String in) {
        return in.replaceAll("[\"}{\\,\\\\]", "").replaceAll("'", "").trim();
    }

    /**
     * Opens an HTTPS connection and returns a reader for the response body.
     *
     * @param url the complete request url
     * @return the reader, or {@code null} when the connection failed
     */
    public BufferedReader connect(String url) {
        try {
            URL urll = new URL(url);
            HttpsURLConnection con = (HttpsURLConnection) urll.openConnection();
            con.connect();
            // decode explicitly as UTF-8 instead of the platform default charset
            return new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
        } catch(IOException e) {
            e.printStackTrace();
        }
        return null;
    }
}
// no suchelement bla

View File

@ -14,6 +14,8 @@ import java.util.List;
import java.util.NoSuchElementException;
import java.util.Scanner;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import org.apache.log4j.Logger;
public class Crawler implements Runnable {
@ -36,7 +38,7 @@ public class Crawler implements Runnable {
//private int updateOffset = 0;
private DB db = new DB();
private YoutubeAPI api = new YoutubeAPI();
private YoutubeAPI api = new YoutubeAPI(Config.prop.getProperty("youtube.apikey"));
private File crawlfile = new File("crawl.txt");
private Logger log = Logger.getLogger(this.getClass().getName());
private Profiler profiler = new Profiler();
@ -417,6 +419,8 @@ public class Crawler implements Runnable {
}
*/
@AllArgsConstructor
@NoArgsConstructor
public static class Video {
String id = "";
String title = "";

View File

@ -0,0 +1,178 @@
package de.mrbesen.youtubecrawler;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.stream.Collectors;
import javax.net.ssl.HttpsURLConnection;
import javax.xml.datatype.DatatypeConfigurationException;
import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.Duration;
import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONObject;
import org.json.JSONTokener;
/**
 * Client for the YouTube Data API v3 that reads video metadata and video categories
 * from the JSON responses.
 */
public class YoutubeAPI {

    private final String api_key;
    private static String BASEQUERY = "https://www.googleapis.com/youtube/v3/";
    // NOTE(review): SimpleDateFormat is not thread-safe and parses in the JVM default
    // time zone - confirm that is intended for the "publishedAt" values.
    private static DateFormat dateformat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    private Logger log = Logger.getLogger(YoutubeAPI.class.getName());
    private DatatypeFactory durationfactory = null;

    /**
     * Creates a client that sends the given key with every request. Terminates the JVM
     * with exit code 1 when no {@link DatatypeFactory} can be created.
     *
     * @param apikey the API key appended to every query
     */
    public YoutubeAPI(String apikey) {
        api_key = apikey;
        try {
            durationfactory = DatatypeFactory.newInstance();
        } catch(DatatypeConfigurationException e) {
            log.error("Could not create the duration factory", e);
            System.exit(1);
        }
    }

    /**
     * Creates a client without an API key. No key is available on this path, so the
     * check below always reports it as undefined and terminates the JVM with exit
     * code 1; use {@link #YoutubeAPI(String)} instead.
     */
    public YoutubeAPI() {
        this(null);
        // null check first: calling isEmpty() on the unset key used to throw a NullPointerException
        if(api_key == null || api_key.isEmpty()) {
            log.error("apikey is not defined!");
            System.exit(1);
        }
    }

    /**
     * Fetches the metadata of a single video.
     *
     * @param id the video id
     * @return the video, regular or livestream
     * @throws IndexOutOfBoundsException when no video was returned for the id
     */
    public Crawler.Video getInfo(String id) {
        List<Crawler.Video>[] infos = getInfos(id);
        if(!infos[0].isEmpty()) {
            return infos[0].get(0);
        }
        // livestreams are kept in the second list
        return infos[1].get(0);
    }

    /**
     * Fetches the metadata of several videos. The passed list is emptied.
     *
     * @param ids the video ids; drained by this call
     * @return {@code null} when {@code ids} is empty, otherwise see {@link #getInfos(String)}
     */
    public List<Crawler.Video>[] getInfos(List<String> ids) {
        if(ids.isEmpty())
            return null;

        String idlist = String.join(",", ids);
        // the list has always been consumed by this method; keep that contract
        ids.clear();
        return getInfos(idlist);
    }

    /**
     * Requests the video categories of the region {@code us}.
     *
     * @return category id to category title, sorted by id; empty when the request failed
     */
    public Map<Integer, String> getCategories() {
        String query = BASEQUERY + "videoCategories?part=snippet&regionCode=us&key=" + api_key;
        JSONObject obj = fetch(query);

        Map<Integer, String> out = new TreeMap<>();
        if(obj != null) {
            JSONArray items = obj.getJSONArray("items");
            // isNull is also true for an index behind the last element, which ends the loop
            for(int i = 0; !items.isNull(i); ++i) {
                JSONObject item = items.getJSONObject(i);
                String id = item.getString("id");
                String name = item.getJSONObject("snippet").getString("title");
                try {
                    int intid = Integer.parseInt(id);
                    out.put(intid, name);
                    System.out.println(intid + ";" + name);
                } catch (NumberFormatException e) {
                    log.warn("Category id is not a number: " + id, e);
                }
            }
        }
        return out;
    }

    /**
     * Fetches the metadata of the videos named in a comma separated id list, following
     * {@code nextPageToken} until the last page.
     *
     * @param idlist comma separated video ids
     * @return a two element array: index 0 holds the regular videos, index 1 the livestreams
     */
    @SuppressWarnings("unchecked")
    public List<Crawler.Video>[] getInfos(String idlist) {
        ArrayList<Crawler.Video> out = new ArrayList<>(idlist.length() / 12);// estimated number of results
        LinkedList<Crawler.Video> livestr = new LinkedList<>();

        String nextpage = "";
        do {
            String query = BASEQUERY + "videos?part=snippet,contentDetails&id=" + idlist + nextpage + "&key=" + api_key;
            JSONObject json = fetch(query);
            nextpage = "";
            if(json != null && json.has("items")) {
                JSONArray items = json.getJSONArray("items");
                for(int i = 0; i < items.length(); ++i) {
                    JSONObject item = items.optJSONObject(i);
                    if(item == null) {
                        continue;
                    }
                    try {
                        Crawler.Video v = getVid(item);
                        if(v.live) {
                            livestr.add(v);
                            log.info("livestream found! " + v.id + " " + v.channel);
                        } else {
                            out.add(v);
                        }
                    } catch(RuntimeException e) {
                        // one unreadable item must not discard the rest of the batch
                        log.warn("Skipping a video that could not be read", e);
                    }
                }
                if(json.has("nextPageToken")) {
                    nextpage = "&pageToken=" + json.getString("nextPageToken");
                }
            }
        } while(!nextpage.isEmpty());

        return new List[] {out, livestr};
    }

    /**
     * Builds a video from one element of the {@code items} array. {@code id},
     * {@code snippet}, {@code title} and {@code publishedAt} are required; every other
     * property falls back to an empty or zero value when it is absent.
     */
    private Crawler.Video getVid(JSONObject json) {
        String vdid = json.getString("id");
        JSONObject snippet = json.getJSONObject("snippet");
        String title = snippet.getString("title"); //maxlen: 100
        long published = getDate(snippet.getString("publishedAt"));
        String channel = snippet.optString("channelTitle", "");

        // not every item carries a tag list
        String tags = "";
        JSONArray tagArray = snippet.optJSONArray("tags");
        if(tagArray != null) {
            tags = tagArray.toList().stream().map(String::valueOf).collect(Collectors.joining(", ")); // max len: ~500
        }

        byte category = 0;
        try {
            category = Byte.parseByte(snippet.optString("categoryId", "0"));
        } catch(NumberFormatException ignored) {
            // keep category 0 for ids that do not fit into a byte
        }

        int duration = 0;
        JSONObject contentDetails = json.optJSONObject("contentDetails");
        if(contentDetails != null) {
            duration = (int) getDuration(contentDetails.optString("duration", ""));
        }

        boolean live = !snippet.optString("liveBroadcastContent", "none").equalsIgnoreCase("none");

        // the language properties are optional; prefer defaultLanguage
        String langCode = snippet.optString("defaultLanguage", "");
        if(langCode.isEmpty()) {
            langCode = snippet.optString("defaultAudioLanguage", "");
        }
        if(langCode.length() > 3) {
            langCode = langCode.substring(0, 3);
        }
        if(langCode.endsWith("-")) {
            // "en-US" was cut to "en-" above; drop the dangling separator
            langCode = langCode.substring(0, langCode.length() - 1);
        }

        return new Crawler.Video(vdid, title, channel, tags, duration, langCode, category, published, live);
    }

    /**
     * Converts a timestamp of the form {@code yyyy-MM-ddTHH:mm:ss...} into seconds
     * since the epoch.
     *
     * @return the seconds, or 0 when the text could not be parsed
     */
    private long getDate(String format) {
        if(format.length() < 19) {
            log.warn("Failed to parse date: " + format);
            return 0;
        }
        try {
            Date d = dateformat.parse(format.substring(0, 19).replace('T', ' '));
            return d.getTime() / 1000;
        } catch (ParseException e) {
            log.warn("Failed to parse date: " + format, e);
        }
        return 0;
    }

    /**
     * Converts an ISO 8601 duration into seconds. A leading week component
     * ({@code P2W...}) is handled here because it is not part of the lexical format
     * accepted by {@link DatatypeFactory#newDuration(String)}.
     *
     * @return the seconds, or 0 when the text is empty or not a valid duration
     */
    private long getDuration(String iso8601) {
        if(iso8601 == null || iso8601.isEmpty()) {
            return 0;
        }
        try {
            long weekSeconds = 0;
            String rest = iso8601;
            int w = iso8601.indexOf('W');
            if(w > 1 && iso8601.charAt(0) == 'P') {
                weekSeconds = Long.parseLong(iso8601.substring(1, w)) * 7L * 24L * 3600L;
                rest = "P" + iso8601.substring(w + 1);
                if(rest.equals("P")) {
                    // weeks only
                    return weekSeconds;
                }
            }
            Duration dur = durationfactory.newDuration(rest);
            return weekSeconds + dur.getTimeInMillis(new Date(0)) / 1000;
        } catch(IllegalArgumentException | UnsupportedOperationException e) {
            log.warn("Failed to parse duration: " + iso8601, e);
            return 0;
        }
    }

    /** Requests the url, parses the response and closes the response stream afterwards. */
    private JSONObject fetch(String query) {
        BufferedReader reader = connect(query);
        try {
            return parse(reader);
        } finally {
            if(reader != null) {
                try {
                    reader.close();
                } catch(IOException e) {
                    log.warn("Failed to close the response stream", e);
                }
            }
        }
    }

    /**
     * Opens an HTTPS connection and returns a reader for the response body. The caller
     * has to close the reader.
     *
     * @param url the complete request url
     * @return the reader, or {@code null} when the connection failed
     */
    public BufferedReader connect(String url) {
        try {
            URL urll = new URL(url);
            HttpsURLConnection con = (HttpsURLConnection) urll.openConnection();
            con.connect();
            // decode explicitly as UTF-8 instead of the platform default charset
            return new BufferedReader(new InputStreamReader(con.getInputStream(), "UTF-8"));
        } catch(IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Parses the content of the reader as one JSON object. The reader is not closed.
     *
     * @param in the reader, may be {@code null}
     * @return the parsed object, or {@code null} when {@code in} is {@code null}
     */
    public JSONObject parse(BufferedReader in) {
        if(in == null)
            return null;
        return new JSONObject(new JSONTokener(in));
    }
}
// no suchelement bla