Changed crawler regex to capture the 11-character video ID directly
This commit is contained in:
parent
d3fa21cbc0
commit
7180007dea
|
@ -9,7 +9,7 @@ import org.apache.log4j.Logger;
|
|||
|
||||
public class CrawlerThread implements Runnable {
|
||||
|
||||
private static Pattern linkpattern = Pattern.compile("href=\"\\/watch\\?v=");
|
||||
private static Pattern linkpattern = Pattern.compile("watch\\?v=([-_a-zA-Z0-9]{11})");
|
||||
|
||||
private Logger log = Logger.getLogger(this.getClass().getName());
|
||||
private Crawler parent;
|
||||
|
@ -90,21 +90,13 @@ public class CrawlerThread implements Runnable {
|
|||
crawled.add(videoid);
|
||||
|
||||
// log.info("crawling: " + videoid);
|
||||
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
|
||||
HTTPS con = new HTTPS("https://youtube.com/watch?v=" + videoid);
|
||||
String s = con.getContent();
|
||||
Matcher matcher = linkpattern.matcher(s);
|
||||
while(matcher.find()) {
|
||||
int beginytid = matcher.end();
|
||||
int endxtid = s.indexOf('"', beginytid);
|
||||
int endid = s.indexOf('&', beginytid);
|
||||
if(endid < endxtid) {
|
||||
endxtid = endid;
|
||||
}
|
||||
String ytid = s.substring(beginytid, endxtid);
|
||||
if(ytid.length() > 9 && ytid.length() <= 12) {
|
||||
String ytid = matcher.group(1);
|
||||
if(!ytid.equals(videoid)) {
|
||||
found.add(ytid);
|
||||
} else {
|
||||
// log.warn("youtube id has wrong length: \"" + ytid + "\"");
|
||||
}
|
||||
}
|
||||
} catch(IOException e) {
|
||||
|
|
Loading…
Reference in New Issue