Changed crawler regex to capture the 11-character video ID directly

This commit is contained in:
mrbesen 2021-10-18 16:42:44 +02:00
parent d3fa21cbc0
commit 7180007dea
Signed by: MrBesen
GPG Key ID: 596B2350DCD67504
1 changed file with 4 additions and 12 deletions

View File

@ -9,7 +9,7 @@ import org.apache.log4j.Logger;
public class CrawlerThread implements Runnable { public class CrawlerThread implements Runnable {
private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v="); private static Pattern linkpattern = Pattern.compile("watch\\?v=([-_a-zA-Z0-9]{11})");
private Logger log = Logger.getLogger(this.getClass().getName()); private Logger log = Logger.getLogger(this.getClass().getName());
private Crawler parent; private Crawler parent;
@ -90,21 +90,13 @@ public class CrawlerThread implements Runnable {
crawled.add(videoid); crawled.add(videoid);
// log.info("crawling: " + videoid); // log.info("crawling: " + videoid);
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid); HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
String s = con.getContent(); String s = con.getContent();
Matcher matcher = linkpattern.matcher(s); Matcher matcher = linkpattern.matcher(s);
while(matcher.find()) { while(matcher.find()) {
int beginytid = matcher.end(); String ytid = matcher.group(1);
int endxtid = s.indexOf('"', beginytid); if(!ytid.equals(videoid)) {
int endid = s.indexOf('&', beginytid);
if(endid < endxtid) {
endxtid = endid;
}
String ytid = s.substring(beginytid, endxtid);
if(ytid.length() > 9 && ytid.length() <= 12) {
found.add(ytid); found.add(ytid);
} else {
// log.warn("youtube id has wrong length: \"" + ytid + "\"");
} }
} }
} catch(IOException e) { } catch(IOException e) {