YoutubeCrawler/src/de/mrbesen/youtubecrawler/Crawler.java

144 lines
3.9 KiB
Java

package de.mrbesen.youtubecrawler;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.LinkedList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
/**
 * Crawls YouTube watch pages for links to further videos.
 *
 * <p>Each crawled video id is queued for saving; every video id found on its
 * watch page is queued to be checked against the database. Ids returned by
 * {@code db.checkvideos} are then queued for crawling themselves. The crawl
 * queue is persisted to {@code crawl.txt} so that a later run can pick it up
 * again.
 *
 * <p>NOTE(review): the three queues are plain {@link LinkedList}s. Only the
 * stop flag is safe to touch from another thread; if {@link #addtoCrawl} is
 * called from a different thread while {@link #run()} is active, confirm that
 * callers synchronize externally.
 */
public class Crawler implements Runnable {

    /** Matches the start of a watch link; the video id begins right after the match. */
    private static final Pattern LINK_PATTERN = Pattern.compile("href=\"\\/watch\\?v=");
    /** Maximum number of ids handed to the database check per round. */
    private static final int MAX_VIDEOS_TO_TEST = 100;
    /** Maximum number of ids handed to the API per request when saving. */
    private static final int SAVE_BATCH_SIZE = 50;

    /** Ids of crawled videos that still need to be analysed and stored. */
    private final LinkedList<String> toSave = new LinkedList<>();
    /** Ids of videos whose watch page still has to be crawled. */
    private final LinkedList<String> toCrawl = new LinkedList<>();
    /** Ids that still have to be checked against the database before they may be crawled. */
    private final LinkedList<String> toknown = new LinkedList<>();

    /** Loop flag; volatile because {@link #stop()} is meant to be called from another thread. */
    private volatile boolean crawl = true;
    private int crawlcount = 0;
    private final DB db = new DB();
    private final YoutubeAPI api = new YoutubeAPI();
    private final File crawlfile = new File("crawl.txt");
    private final Logger log = Logger.getLogger(Crawler.class.getName());

    /**
     * Requests the crawler to stop. The running loop finishes the page it is
     * currently crawling and its current round before {@link #run()} returns.
     */
    public void stop() {
        crawl = false;
    }

    /**
     * Queues a video id for the database check, unless it is already queued
     * for crawling or for checking.
     *
     * @param videoid the YouTube video id to queue
     */
    public void addtoCrawl(String videoid) {
        if (!(toCrawl.contains(videoid) || toknown.contains(videoid))) {
            toknown.add(videoid);
        }
    }

    /**
     * Downloads the watch page of the given video, queues the video for
     * saving and queues every linked video id of plausible length.
     *
     * @param videoid the id of the video whose watch page is fetched
     */
    private void crawl(String videoid) {
        crawlcount++;
        log.info("crawling: " + videoid);
        toSave.add(videoid);
        try {
            HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
            String s = con.getContent();
            if (s == null) {
                // Defensive: getContent() is not visible here, so do not assume it never returns null.
                log.warn("no content received for video " + videoid);
                return;
            }
            Matcher matcher = LINK_PATTERN.matcher(s);
            while (matcher.find()) {
                int beginytid = matcher.end();
                String ytid = s.substring(beginytid, findIdEnd(s, beginytid));
                // Only ids of 10 to 12 characters are accepted.
                if (ytid.length() > 9 && ytid.length() <= 12) {
                    addtoCrawl(ytid);
                } else {
                    log.warn("youtube id has wrong length: \"" + ytid + "\"");
                }
            }
        } catch (IOException e) {
            log.warn("Error while crawling video " + videoid, e);
        }
    }

    /**
     * Finds the index at which a video id starting at {@code begin} ends: the
     * first {@code "} or {@code &} at or after {@code begin}, whichever comes
     * first. If neither character occurs, the id runs to the end of the
     * string, so that the caller's substring call can never receive -1.
     */
    private static int findIdEnd(String s, int begin) {
        int quote = s.indexOf('"', begin);
        int amp = s.indexOf('&', begin);
        if (quote < 0 && amp < 0) {
            return s.length();
        }
        if (quote < 0) {
            return amp;
        }
        if (amp < 0) {
            return quote;
        }
        return Math.min(quote, amp);
    }

    /**
     * Loads the persisted crawl file, then alternates between crawling all
     * queued videos, checking newly found ids against the database and saving
     * the crawled videos, until {@link #stop()} is called or no ids are left.
     */
    @Override
    public void run() {
        long start = System.currentTimeMillis();
        loadCrawlFile();

        while (crawl) {
            log.info("to Crawl: " + toCrawl.size());
            while (!toCrawl.isEmpty() && crawl) {
                crawl(toCrawl.removeFirst());
            }

            if (toknown.isEmpty()) { // very uncommon
                log.warn("nothing left.");
                crawl = false;
                // TODO: delete / clear crawl file
            } else {
                checkKnown();
                writeCrawlFile();
            }

            savePending();
        }

        long diff = (System.currentTimeMillis() - start) / 60000;
        log.info("Crawling Stopped. Runtime: " + ((int) diff) + "min and " + crawlcount + " videos crawled.");
    }

    /** Reads the crawl file, if present, and queues every non-empty line as a video id. */
    private void loadCrawlFile() {
        log.info("Try to load crawlfile");
        if (!crawlfile.exists()) {
            return;
        }
        try (Scanner in = new Scanner(crawlfile)) {
            while (in.hasNextLine()) {
                String line = in.nextLine();
                if (!line.isEmpty()) {
                    addtoCrawl(line);
                }
            }
        } catch (IOException e) {
            log.warn("Error while loading crawl file.", e);
        }
    }

    /**
     * Moves up to {@link #MAX_VIDEOS_TO_TEST} ids from the check queue to the
     * database check and queues the ids it returns for crawling.
     */
    private void checkKnown() {
        LinkedList<String> tocheck = new LinkedList<>();
        // Test for emptiness instead of comparing i with the shrinking size,
        // otherwise only about half of the batch would be moved.
        for (int i = 0; i < MAX_VIDEOS_TO_TEST && !toknown.isEmpty(); i++) {
            tocheck.add(toknown.removeFirst());
        }
        toCrawl.addAll(db.checkvideos(tocheck));
    }

    /** Overwrites the crawl file with the current crawl queue, one id per line. */
    private void writeCrawlFile() {
        try (PrintWriter p = new PrintWriter(new BufferedWriter(new FileWriter(crawlfile)))) {
            for (String t : toCrawl) {
                p.println(t);
            }
            // PrintWriter never throws on write; checkError() flushes and reports failures.
            if (p.checkError()) {
                log.error("Error writing crawlfile.");
            }
        } catch (IOException e) {
            log.error("Error writing crawlfile.", e);
        }
    }

    /**
     * Drains the save queue in batches of up to {@link #SAVE_BATCH_SIZE} ids,
     * fetching the video infos from the API and storing them in the database.
     */
    private void savePending() {
        while (!toSave.isEmpty()) {
            LinkedList<String> videoids = new LinkedList<>();
            for (int i = 0; i < SAVE_BATCH_SIZE && !toSave.isEmpty(); i++) {
                videoids.add(toSave.removeFirst());
            }
            List<Video> videos = api.getInfos(videoids);
            db.addVideos(videos);
        }
    }

    /**
     * Creates a new, empty {@link Video}.
     *
     * @return a fresh video object with default field values
     */
    public static Video getVideo() {
        return new Video();
    }

    /** Plain data holder for the information stored about one video. */
    public static class Video {
        String id;
        int length; // the length of the video in seconds
        String languageCode;
        byte categorie;
        long created; // NOTE(review): unit/epoch not visible here - confirm against the code that fills it
    }
}