changed crawler regex

This commit is contained in:
mrbesen 2021-10-18 16:42:44 +02:00
parent d3fa21cbc0
commit 7180007dea
Signed by: MrBesen
GPG Key ID: 596B2350DCD67504
1 changed file with 4 additions and 12 deletions

View File

@@ -9,7 +9,7 @@ import org.apache.log4j.Logger;
public class CrawlerThread implements Runnable {
private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");
private static Pattern linkpattern = Pattern.compile("watch\\?v=([-_a-zA-Z0-9]{11})");
private Logger log = Logger.getLogger(this.getClass().getName());
private Crawler parent;
@@ -90,21 +90,13 @@ public class CrawlerThread implements Runnable {
crawled.add(videoid);
// log.info("crawling: " + videoid);
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
String s = con.getContent();
Matcher matcher = linkpattern.matcher(s);
while(matcher.find()) {
int beginytid = matcher.end();
int endxtid = s.indexOf('"', beginytid);
int endid = s.indexOf('&', beginytid);
if(endid < endxtid) {
endxtid = endid;
}
String ytid = s.substring(beginytid, endxtid);
if(ytid.length() > 9 && ytid.length() <= 12) {
String ytid = matcher.group(1);
if(!ytid.equals(videoid)) {
found.add(ytid);
} else {
// log.warn("youtube id has wrong length: \"" + ytid + "\"");
}
}
} catch(IOException e) {