#!/bin/bash
#
# This script downloads all the pages from Mr Money Mustache's blog archive
# and extracts the post content's HTML. Then, if Calibre is available in the
# system, converts it into a nice and easy epub file for viewing in your
# favorite ebook reader.
#
# Requirements besides the shell built-ins and common distro programs are:
#
# - pup (https://github.com/ericchiang/pup): HTML parser written in Go
# - calibre: optional, required to create epub files.
# - wget: for downloading article pages
# - curl: for downloading index file
#
# Output:
#
# - /tmp/mmmbook/mmm_book.html: all posts, oldest first, in one HTML file
# - /tmp/mmmbook/mmm_book.epub: only when Calibre's ebook-convert is installed
# - /tmp/mmmbook/pages/: the raw downloaded pages (recreated on every run)
#
# Warning: running this may put a large load on Mr Money Mustache's servers,
# which could be wrongly identified as a DDoS attack. Be polite and refrain
# from running this very often.
#
# TODO: implement a tool to download only updates, so you don't have to
# download 500 pages every time one article comes out.
#
# Copyright 2018 - kzimmermann - https://quitter.se/kzimmermann
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

set -euo pipefail

readonly ARTICLE_LIST_URL="http://www.mrmoneymustache.com/all-the-posts-since-the-beginning-of-time/"
readonly OUTPUT_DIR="/tmp/mmmbook"
readonly PAGES_DIR="${OUTPUT_DIR}/pages"
readonly BOOK_HTML="${OUTPUT_DIR}/mmm_book.html"
readonly BOOK_EPUB="${OUTPUT_DIR}/mmm_book.epub"

# Private scratch directory (index page + link list); set in main().
work_dir=""

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print a non-fatal warning to stderr.
warn() {
  printf 'Warning: %s\n' "$*" >&2
}

# Remove the scratch directory; installed as the EXIT trap.
cleanup() {
  if [[ -n "$work_dir" ]]; then
    rm -rf -- "$work_dir"
  fi
}

# Abort early when a mandatory external tool is missing.
check_requirements() {
  local tool

  if ! command -v pup > /dev/null; then
    echo "This script requires pup to work, which we couldn't find here." >&2
    echo "Please download the latest version before proceeding." >&2
    exit 1
  fi

  for tool in curl wget; do
    command -v "$tool" > /dev/null \
      || die "This script requires ${tool} to work, which we couldn't find here."
  done
}

#######################################
# Download the archive index and write the article URLs, one per line and
# sorted oldest first, to the given file.
# Arguments: $1 - file that receives the sorted list of links
#######################################
build_link_list() {
  local links_file=$1
  local index_file="${work_dir}/index.html"

  curl -fsSL "$ARTICLE_LIST_URL" > "$index_file" \
    || die "Could not download the index page: ${ARTICLE_LIST_URL}"

  # We need just the raw links, so extract only the URL from each anchor.
  # His HTTPS cert is wrong, which means that GnuTLS will complain if the
  # links are left as they are. Turn them into HTTP instead.
  # The URLs start with /YYYY/MM/DD/, so a plain byte-wise sort puts the
  # older posts first and the result reads more like a book.
  # grep exits with 1 when nothing matches; the caller reports an empty list.
  { grep -o 'https://mrmoneymustache\.com/20[^"]*' "$index_file" || true; } \
    | sed 's/^https:/http:/' \
    | LC_ALL=C sort -u > "$links_file"
}

#######################################
# Download every article in the list into PAGES_DIR.
# Pages are saved under zero-padded sequence numbers so that a glob over the
# directory returns them in the same (chronological) order as the list.
# Arguments: $1 - file with one URL per line
#            $2 - total number of articles (for progress messages)
#######################################
download_articles() {
  local links_file=$1
  local total=$2
  local url page
  local counter=1

  # Start from an empty directory so pages from an earlier run cannot end up
  # in the new book.
  rm -rf -- "${PAGES_DIR:?}"
  mkdir -p -- "$PAGES_DIR" || die "Could not create ${PAGES_DIR}"

  while IFS= read -r url; do
    echo "Downloading article $counter of $total..."
    printf -v page '%s/%05d.html' "$PAGES_DIR" "$counter"
    # Best effort: one failed download should not throw away the whole run.
    if ! wget -q -O "$page" "$url"; then
      warn "could not download ${url}; skipping it."
      rm -f -- "$page"
    fi
    counter=$((counter + 1))
  done < "$links_file"
}

# Trim the garbage (headers, comments, etc) from every downloaded page and
# concatenate the post bodies into one long, readable HTML document.
compose_book() {
  local page

  # Truncate first so repeated runs do not append to an old book.
  : > "$BOOK_HTML"

  for page in "$PAGES_DIR"/*.html; do
    [[ -e "$page" ]] || continue
    pup "div[class~=post_box]" < "$page" >> "$BOOK_HTML" \
      || warn "could not extract the post from ${page}."
  done

  [[ -s "$BOOK_HTML" ]] \
    || die "No post content could be extracted; the book is empty."
}

# Convert the HTML book into EPUB when Calibre's ebook-convert is installed;
# otherwise just tell the user where the HTML version is.
convert_to_epub() {
  if command -v ebook-convert > /dev/null; then
    echo "Converting into EPUB. This can take a while..."
    ebook-convert "$BOOK_HTML" "$BOOK_EPUB" \
      || die "ebook-convert failed. Your book (in html format) is available here: ${BOOK_HTML}"
    echo "Conversion complete. Your ebook is available here:"
    echo "$BOOK_EPUB"
  else
    echo "Calibre is not available in this system. Cannot convert to epub."
    echo "Your book (in html format) is available here:"
    echo "$BOOK_HTML"
  fi
}

main() {
  local links_file number

  check_requirements

  work_dir=$(mktemp -d) || die "Could not create a temporary directory."
  trap cleanup EXIT
  links_file="${work_dir}/links"

  # Parse article list to produce an "index" from which we will download pages:
  echo "Getting MMM's index page..."
  build_link_list "$links_file"

  number=$(wc -l < "$links_file")
  number=$((number)) # strip the padding some wc implementations add
  ((number > 0)) || die "No article links were found in the index page."
  echo "There are $number articles to be downloaded."

  # This can take a loooong time (about 500 articles!)
  echo "Downloading article pages. Please be patient, as it can take a long time."
  download_articles "$links_file" "$number"
  echo "Download completed."

  echo "Composing HTML document..."
  compose_book

  # At this point, you are good to go. However, if you would like to make it
  # into an ebook, install calibre and follow up on converting it into epub!
  convert_to_epub
}

main "$@"