#!/bin/bash
# Scrape der-postillon.com "Newsticker" one-liners into postillon.txt.
# File that collects the article URLs found while crawling.
pages="links.txt"
# File that receives the final, de-duplicated ticker lines.
output="postillon.txt"
|
#get links

# Nice way with crawling: follow Blogger's "older posts" pagination by
# extracting the ?updated-max=... token from each page and appending it
# to the label URL for the next request.
link="https://www.der-postillon.com/search/label/Newsticker"
append=""

# Start from a clean link list; -f so a missing file is not an error.
rm -f -- "$pages"

for i in {1..100}; do
  echo "get: $link$append"
  page=$(curl -s "$link$append")

  # Pull every article URL (href='https...html') out of the page markup;
  # cut -c 7- strips the leading "href='".
  echo "$page" | grep -Po "href='https.*\\.html" | cut -c 7- | uniq >> "$pages"

  # Candidate pagination tokens; the plain "older posts" link is the one
  # without '&'. uniq -c | sort | head picks the most common candidate,
  # cut -f 8- drops uniq's count column.
  append=$(echo "$page" | grep -Po "\\?updated-max.*?by-date=false" | grep -v "&" | uniq -c | sort | head -n 1 | cut -d " " -f 8-)

  #echo "possibles: "
  #echo "$page" | grep -Po "\\?updated-max.*?by-date=false" | grep -v "&" | uniq -c | sort

  # No token left -> reached the oldest page; stop instead of refetching
  # the first page for all remaining iterations.
  [ -n "$append" ] || break

  sleep 1 # dont ban me pls
done
|
# lame way with rss (alternative to crawling, kept for reference)
#curl http://feeds.feedburner.com/blogspot/rkEL -s | sed 's/item>/item>\n/g' | grep "Newsticker" | grep -Po "<link>.*?</link>" | cut -c 7-86 > $pages

# Remove duplicate links in place. sort -u -o replaces the
# "sort | uniq > temp && mv" dance and is safe because sort reads all
# input before writing the output file.
sort -u -o "$pages" -- "$pages"

echo "page count:"
wc -l "$pages"
|
# Download every collected page: -nc skips already-downloaded files,
# -nv keeps output terse, -w 1 waits one second between requests.
wget -nc -nv -w 1 -i "$pages"
|
# Extract the ticker lines ("+++ ... +++") from each downloaded page,
# filtering script/markup noise and stripping the <br /> and +++ wrappers.
for f in newsticker-*.html; do
  # If wget downloaded nothing, the glob stays literal -- skip it.
  [ -e "$f" ] || continue
  echo "check file: $f"
  grep "+++" "$f" | grep -v "_Widget" | grep -v "meta" | grep -v "var" | grep -v "'\/>" | sed 's#<br />##g;s#+++##g' >> "$output"
done
|
# make them uniq: de-duplicate the collected ticker lines in place
# (sort -u -o avoids the temp-file-and-mv dance).
sort -u -o "$output" -- "$output"

# Remove intermediate files; -f so already-missing files are not errors.
rm -f -- "$pages" newsticker-*.html