139 lines
3.8 KiB
Java
139 lines
3.8 KiB
Java
package de.mrbesen.youtubecrawler;
|
|
|
|
import java.io.BufferedWriter;
|
|
import java.io.File;
|
|
import java.io.FileWriter;
|
|
import java.io.IOException;
|
|
import java.io.PrintWriter;
|
|
import java.util.LinkedList;
|
|
import java.util.List;
|
|
import java.util.Scanner;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
import org.apache.log4j.Logger;
|
|
|
|
public class Crawler implements Runnable {
|
|
|
|
private LinkedList<String> toSave = new LinkedList<>();//all found ytids, witch need to be analysed
|
|
private LinkedList<String> toCrawl = new LinkedList<>();//all videos tu crawl
|
|
private LinkedList<String> toknown = new LinkedList<>();//list with all videos, to test if they are allready known, if not they are moved to tocrawle
|
|
private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");
|
|
private boolean crawl = true;
|
|
private int crawlcount = 0;
|
|
private DB db = new DB();
|
|
private YoutubeAPI api = new YoutubeAPI();
|
|
private File crawlfile = new File("crawl.txt");
|
|
private Logger log = Logger.getLogger(Config.class.getName());
|
|
|
|
public void stop() {
|
|
crawl = false;
|
|
}
|
|
|
|
public void addtoCrawl(String videoid) {
|
|
if(! (toCrawl.contains(videoid) || toknown.contains(videoid)))
|
|
toknown.add(videoid);
|
|
}
|
|
|
|
private void crawl(String videoid) {
|
|
try {
|
|
crawlcount++;
|
|
log.info("crawling: " + videoid);
|
|
toSave.add(videoid);
|
|
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
|
|
String s = con.getContent();
|
|
Matcher matcher = linkpattern.matcher(s);
|
|
while(matcher.find()) {
|
|
int beginytid = matcher.end();
|
|
int endxtid = s.indexOf('"', beginytid);
|
|
int endid = s.indexOf('&', beginytid);
|
|
if(endid < endxtid) {
|
|
endxtid = endid;
|
|
}
|
|
String ytid = s.substring(beginytid, endxtid);
|
|
if(ytid.length() > 9 && ytid.length() <= 12) {
|
|
addtoCrawl(ytid);
|
|
} else {
|
|
log.warn("youtube id has wrong length: \"" + ytid + "\"");
|
|
}
|
|
}
|
|
} catch(IOException e) {
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
long start = System.currentTimeMillis();
|
|
log.info("Try to load crawlfile");
|
|
if(crawlfile.exists()) {
|
|
try {
|
|
Scanner in = new Scanner(crawlfile);
|
|
while(in.hasNextLine()) {
|
|
String line = in.nextLine();
|
|
if(line == null) {
|
|
break;
|
|
} else {
|
|
if(!line.isEmpty()) {
|
|
addtoCrawl(line);
|
|
}
|
|
}
|
|
}
|
|
in.close();
|
|
} catch(IOException e) {
|
|
log.warn("Error while loading crawl file.");
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
while(crawl) {
|
|
log.info("to Crawl: " + toCrawl.size());
|
|
|
|
while(!toCrawl.isEmpty() && crawl) {
|
|
crawl(toCrawl.remove(0));
|
|
}
|
|
if(toknown.isEmpty()) {//very uncommon
|
|
log.warn("nothing left.");
|
|
crawl = false;
|
|
//delete / clear crawl file
|
|
} else {
|
|
toCrawl.addAll(db.checkvideos(toknown));//TODO cap it at something like 1000 videos
|
|
toknown.clear();
|
|
try {
|
|
PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)));
|
|
for(String t : toCrawl) {
|
|
p.println(t);
|
|
}
|
|
p.close();
|
|
} catch (IOException e) {
|
|
log.error("Error writing crawlfile.", e);
|
|
}
|
|
}
|
|
// System.out.println("try to save " + toSave.size() + " videos.");
|
|
while(!toSave.isEmpty()) {
|
|
LinkedList<String> videoids = new LinkedList<>();
|
|
for(int i = 0; i < 50 && !toSave.isEmpty(); i++) {
|
|
videoids.add(toSave.remove(0));
|
|
}
|
|
if(videoids.size() > 0) {
|
|
List<Video> videos = api.getInfos(videoids);
|
|
db.addVideos(videos);
|
|
}
|
|
}
|
|
}
|
|
long diff = (System.currentTimeMillis() - start)/ 60000;
|
|
log.info("Crawling Stopped. Runtime: " + ((int) diff) + "min and " + crawlcount + " videos crawled.");
|
|
}
|
|
|
|
public static Video getVideo() {
|
|
return new Video();
|
|
}
|
|
|
|
public static class Video {
|
|
String id;
|
|
int length;//the length of the video in seconds
|
|
String languageCode;
|
|
byte categorie;
|
|
long created;
|
|
}
|
|
}
|