2021-10-25 17:51:46 +02:00
# include "de_mrbesen_youtubecrawler_CrawlerThread.h"
# include <iostream>
2022-02-20 23:53:05 +01:00
# include <set>
2021-10-25 17:51:46 +02:00
# include <string>
# include <sstream>
# include <regex>
# include <vector>
# include <curl/curl.h>
2022-02-20 23:53:05 +01:00
2021-10-25 17:51:46 +02:00
// Prefix of a video watch page; crawl() appends the video id to it to build the URL to fetch.
// The literal must not contain any whitespace, otherwise the resulting URL is invalid.
static const std::string YOUTUBEBASE = "https://youtube.com/watch?v=";
// Matches watch links inside the downloaded page. The '?' is escaped so it is matched
// literally; capture group 1 is the 11-character video id.
static const std::regex YOUTUBELINKPATTERN("watch\\?v=([-_a-zA-Z0-9]{11})");
// Pool of curl easy handles: filled by initLib() (one handle per requested thread),
// indexed with the threadid argument in crawl(), and released again by deinitLib().
std : : vector < CURL * > curls ;
// Write callback handed to libcurl: appends the received chunk (size * nmemb bytes starting
// at `contents`) to the std::ostringstream passed in through `userp`, and reports the whole
// chunk as consumed.
static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp) {
	const size_t chunkBytes = size * nmemb;
	std::ostringstream& sink = *static_cast<std::ostringstream*>(userp);
	sink << std::string(static_cast<char*>(contents), chunkBytes);
	return chunkBytes;
}
// JNI entry point for CrawlerThread.initLib: creates `threadcount` curl easy handles and
// appends them to `curls`. Every handle is configured to follow redirects and to deliver
// the response body through WriteCallback.
//
// env, clazz   unused
// threadcount  number of handles to create; values <= 0 create nothing
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_initLib(JNIEnv* env, jclass clazz, jint threadcount) {
	(void) env;
	(void) clazz;

	// A negative count would turn into a huge size_t in reserve() and throw a C++ exception,
	// which must never unwind into the JVM.
	if (threadcount <= 0) {
		return;
	}

	curls.reserve(curls.size() + static_cast<size_t>(threadcount));
	for (jint i = 0; i < threadcount; ++i) {
		CURL* curl = curl_easy_init();
		if (curl != nullptr) {
			// libcurl reads this option as a long through varargs, so the literal must be a long.
			curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
			curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
		} else {
			std::cout << "Curl error: curl_easy_init failed" << std::endl;
		}
		// The slot is appended even when the handle is null so that positions in `curls`
		// keep lining up with the thread ids used to index it.
		curls.push_back(curl);
	}
}
// JNI entry point for CrawlerThread.deinitLib: hands every curl easy handle stored in
// `curls` to curl_easy_cleanup and then empties the pool.
JNIEXPORT void JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_deinitLib(JNIEnv* env, jclass) {
	for (CURL* handle : curls) {
		curl_easy_cleanup(handle);
	}
	curls.clear();
}
std : : string download ( CURL * curl , const std : : string & url ) {
curl_easy_setopt ( curl , CURLOPT_URL , url . c_str ( ) ) ;
std : : ostringstream out ;
curl_easy_setopt ( curl , CURLOPT_WRITEDATA , & out ) ;
CURLcode res = curl_easy_perform ( curl ) ;
long responsecode = 404 ;
curl_easy_getinfo ( curl , CURLINFO_RESPONSE_CODE , & responsecode ) ;
if ( responsecode ! = 200 ) {
std : : cout < < " Curl error: got " < < responsecode < < std : : endl ;
return " " ;
}
if ( res ! = CURLE_OK ) {
std : : cout < < " Curl error: " < < res < < std : : endl ;
return " " ;
}
return out . str ( ) ;
}
2022-02-20 23:53:05 +01:00
JNIEXPORT jboolean JNICALL Java_de_mrbesen_youtubecrawler_CrawlerThread_crawl ( JNIEnv * env , jobject that , jstring videoid , jint threadid ) {
2021-10-25 17:51:46 +02:00
// get videoid argument
jboolean myfalseval = false ;
const char * cvideid = env - > GetStringUTFChars ( videoid , & myfalseval ) ;
2022-02-20 23:53:05 +01:00
const std : : string svideoid ( cvideid ) ; // makes a copy
2021-10-25 17:51:46 +02:00
env - > ReleaseStringUTFChars ( videoid , cvideid ) ;
// std::cout << "crawl: " << svideoid << std::endl;
// use curl to get the website
CURL * curl = curls . at ( threadid ) ;
std : : string webcontent = download ( curl , YOUTUBEBASE + svideoid ) ;
if ( webcontent . empty ( ) ) {
std : : cout < < " webcontent is empty " < < std : : endl ;
2022-02-20 23:53:05 +01:00
return JNI_FALSE ;
2021-10-25 17:51:46 +02:00
}
2022-02-20 23:53:05 +01:00
jclass jclass_ct = env - > FindClass ( " de/mrbesen/youtubecrawler/CrawlerThread " ) ; // class of CrawlerThread
jfieldID fid_ctfound = env - > GetFieldID ( jclass_ct , " found " , " Ljava/util/List; " ) ; // fieldID of CrawlerThread.found
jclass jclass_ll = env - > FindClass ( " java/util/List " ) ; // Linked List Class
jmethodID mid_add = env - > GetMethodID ( jclass_ll , " add " , " (Ljava/lang/Object;)Z " ) ; // add() method of LinkedList
jobject ll_found = env - > GetObjectField ( that , fid_ctfound ) ; // linked list to store the results to
2021-10-25 17:51:46 +02:00
// match regex
2022-02-20 23:53:05 +01:00
std : : sregex_iterator it = std : : sregex_iterator ( webcontent . begin ( ) , webcontent . end ( ) , YOUTUBELINKPATTERN ) ;
const std : : sregex_iterator itend = std : : sregex_iterator ( ) ;
// this set is to find duplicated strings it does not catch everything, because it is only valid during this crawl, but it should filter a few with very little effort
std : : set < std : : string > known ;
known . insert ( svideoid ) ; // do not "find" the same ytid
2021-10-25 17:51:46 +02:00
2022-02-20 23:53:05 +01:00
env - > MonitorEnter ( that ) ; // syncronized(this) {
2021-10-25 17:51:46 +02:00
for ( ; it ! = itend ; + + it ) {
2022-02-20 23:53:05 +01:00
const std : : string ytid = ( * it ) [ 1 ] . str ( ) ; // get the String from the first sub-group match
// only if the ytid is not known
if ( known . find ( ytid ) = = known . end ( ) ) {
// add to the found list
// std::cout << ytid << std::endl;
jstring jytid = env - > NewStringUTF ( ytid . c_str ( ) ) ; // create a java string object
jboolean b = env - > CallBooleanMethod ( ll_found , mid_add , jytid ) ; // call add() on the LinkedList object with the String
known . insert ( ytid ) ;
2021-10-25 17:51:46 +02:00
}
}
2022-02-20 23:53:05 +01:00
env - > MonitorExit ( that ) ; // end of syncronized block
return JNI_TRUE ;
2021-10-25 17:51:46 +02:00
}