pages="links.txt" output="postillon.txt" #get links #nice way with crawling link="https://www.der-postillon.com/search/label/Newsticker" append="" rm "$pages" for i in {1..100}; do echo "get: $link$append" page=$(curl -s "$link$append") echo "$page" | grep -Po "href='https.*\\.html" | cut -c 7- | uniq >> $pages append=$(echo "$page" | grep -Po "\\?updated-max.*?by-date=false" | grep -v "&" | uniq -c | sort | head -n 1 | cut -d " " -f 8-) #echo "possibles: " #echo "$page" | grep -Po "\\?updated-max.*?by-date=false" | grep -v "&" | uniq -c | sort sleep 1 # dont ban me pls done # lame way with rss #curl http://feeds.feedburner.com/blogspot/rkEL -s | sed 's/item>/item>\n/g' | grep "Newsticker" | grep -Po ".*?" | cut -c 7-86 > $pages #remove duplicates cat "$pages" | sort | uniq > temp mv temp "$pages" echo "page count:" wc -l "$pages" #download pages wget -nc -nv -w 1 -i $pages for f in newsticker-*.html; do echo "check file: $f" cat $f | grep "+++" | grep -v "_Widget" | grep -v "meta" | grep -v "var" | grep -v "'\/>" | sed 's#
##g;s#+++##g' >> $output done # make them uniq cat $output | sort | uniq > temp mv temp $output #remove temp rm $pages newsticker-*.html
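
# usage sketch (assumptions: bash, curl, wget and GNU grep/sed are installed, and the
# script is run from an empty directory so the newsticker-*.html glob only matches its
# own downloads; the script file name below is assumed):
#   bash postillon.sh
#   wc -l postillon.txt   # one ticker line per row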