added postillon, added createchroot
This commit is contained in:
parent
f775343ca1
commit
f1a4442a53
|
@ -0,0 +1,12 @@
|
||||||
|
#!/bin/bash
# Build a minimal chroot tree in ./chroot2 by copying the host's binaries and
# libraries into it, then start a root bash shell inside that tree.

chroot_dir="chroot2"

# -p: running the script again on an existing tree is not an error.
mkdir -p -- "$chroot_dir" || exit 1

# Stop here if the target cannot be entered; otherwise the copies below would
# land in the caller's working directory and that directory would become the
# chroot root.
cd -- "$chroot_dir" || exit 1

cp -r /bin/ .
cp -r /lib/ .
# /lib64 is not present on every host; skip it instead of failing the copy.
if [[ -d /lib64 ]]; then
  cp -r /lib64/ .
fi

mkdir -p usr
cp -r /usr/bin ./usr/

sudo chroot ./ /bin/bash
|
|
@ -0,0 +1,43 @@
|
||||||
|
|
||||||
|
# Scrape the "Newsticker" label pages of der-postillon.com, download every
# linked article and collect the "+++" ticker lines into $output (sorted and
# de-duplicated).
# Needs bash (brace expansion, [[ ]]), GNU grep (-P), curl and wget.

pages="links.txt"
output="postillon.txt"

#get links

#nice way with crawling
link="https://www.der-postillon.com/search/label/Newsticker"
append="" # query string of the next listing page; empty for the first request
rm -f -- "$pages" # -f: a missing list (first run) is not an error
for _ in {1..100}; do
  echo "get: $link$append"
  page=$(curl -s "$link$append")

  # Article URLs: match href='https...html, then drop the 6-char "href='" prefix.
  printf '%s\n' "$page" \
    | grep -Po "href='https.*\\.html" \
    | cut -c 7- \
    | uniq >>"$pages"

  # Next-page query string: among the "?updated-max...by-date=false" candidates
  # without "&", take the one with the lowest occurrence count. The count column
  # written by `uniq -c` is stripped with sed so any count width works.
  append=$(printf '%s\n' "$page" \
    | grep -Po "\\?updated-max.*?by-date=false" \
    | grep -v "&" \
    | uniq -c \
    | sort -n \
    | head -n 1 \
    | sed -E 's/^ *[0-9]+ //')
  #echo "possibles: "
  #echo "$page" | grep -Po "\\?updated-max.*?by-date=false" | grep -v "&" | uniq -c | sort

  # No follow-up link found: stop, otherwise the next round would request the
  # first page again and restart the crawl.
  [[ -n "$append" ]] || break

  sleep 1 # dont ban me pls
done

# lame way with rss
#curl http://feeds.feedburner.com/blogspot/rkEL -s | sed 's/item>/item>\n/g' | grep "Newsticker" | grep -Po "<link>.*?</link>" | cut -c 7-86 > $pages

#remove duplicates
# sort -o may name its own input as output, so no scratch file is needed.
sort -u -o "$pages" "$pages"
echo "page count:"
wc -l "$pages"

#download pages
wget -nc -nv -w 1 -i "$pages"

# Make sure the result file exists even when no article was downloaded.
: >>"$output"
for f in newsticker-*.html; do
  # An unmatched glob stays as the literal pattern; skip it.
  [[ -e "$f" ]] || continue
  echo "check file: $f"
  # Keep lines containing "+++", drop widget/meta/script/tag lines, then strip
  # the <br /> tags and the "+++" markers themselves.
  grep -F -- "+++" "$f" \
    | grep -v -e "_Widget" -e "meta" -e "var" -e "'/>" \
    | sed 's#<br />##g;s#+++##g' >>"$output"
done

# make them uniq
sort -u -o "$output" "$output"

#remove temp
rm -f -- "$pages" newsticker-*.html
|
Loading…
Reference in New Issue