#!/bin/bash
#
# createbook.sh
#
# This script downloads all the pages from Mr Money Mustache's blog archive
# and extracts the post content's HTML. Then, if Calibre is available in the
# system, converts it into a nice and easy epub file for viewing in your
# favorite ebook reader.
#
# Requirements besides the shell built-ins and common distro programs are:
#
# - pup (https://github.com/ericchiang/pup): HTML parser written in Go
# - calibre: optional, required to create epub files.
# - wget: for downloading articles (with images!)
# - curl: for downloading index file
#
# Warning: running this may put a large load on Mr Money Mustache's servers,
# which could be wrongly identified as a DDoS attack. Be polite and refrain
# from running this very often.
#
# TODO: implement a tool to download only updates, so you don't have to
# download 500 pages every time one article comes out.
#
# Copyright 2018 - kzimmermann - https://quitter.se/kzimmermann
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

set -u

# Archive page that lists every post.
readonly ARTICLE_LIST="http://www.mrmoneymustache.com/all-the-posts-since-the-beginning-of-time/"
# Working directory; the finished book is written here.
readonly BOOK_DIR="/tmp/mmmbook"
readonly BOOK_HTML="mmm_book.html"
readonly BOOK_EPUB="mmm_book.epub"

# Temporary file holding one article URL per line; set in main().
links_file=""

#######################################
# Print a message to stderr and abort the script.
# Arguments: $@ - message lines, printed one per line
#######################################
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

#######################################
# Abort unless the given program can be found in PATH.
# Arguments: $1 - program name
#######################################
require_tool() {
  local tool=$1
  if ! command -v "$tool" > /dev/null 2>&1; then
    die "This script requires $tool to work, which we couldn't find here." \
      "Please download the latest version before proceeding."
  fi
}

#######################################
# Fetch the archive page and write the article URLs, oldest first, to a file.
# Globals:   ARTICLE_LIST (read)
# Arguments: $1 - file that receives one URL per line
#######################################
collect_links() {
  local out_file=$1
  # The whole transformation happens in one pipeline that writes to a file
  # none of its stages read from: redirecting a command's output onto its own
  # input file truncates that file before it is read.
  #
  # His HTTPS cert is wrong, which means that GnuTLS will complain if the
  # links are left as they are, so they are turned into HTTP.
  #
  # Article URLs embed the date (/YYYY/MM/DD/...), so a plain byte-wise sort
  # orders them oldest first, which reads more like a book. -u drops links
  # that show up more than once on the page.
  curl -sL "$ARTICLE_LIST" \
    | grep -o 'https://mrmoneymustache\.com/20[^"]*' \
    | sed 's/^https:/http:/' \
    | LC_ALL=C sort -u > "$out_file"
}

#######################################
# Download every URL listed in a file into the current directory.
# Pages are saved as article_NNNN.html (zero-padded) so that a plain glob
# later returns them in the same order as the link list.
# Arguments: $1 - file with one URL per line
#            $2 - total number of URLs (for progress messages)
#######################################
download_articles() {
  local list=$1
  local total=$2
  local link page
  local counter=0

  while IFS= read -r link; do
    counter=$((counter + 1))
    echo "Downloading article $counter of $total..."
    printf -v page 'article_%04d.html' "$counter"
    if ! wget -q -O "$page" "$link"; then
      printf 'Warning: could not download %s\n' "$link" >&2
      # wget -O leaves an empty file behind on failure; do not keep it.
      rm -f -- "$page"
    fi
  done < "$list"
}

#######################################
# Trim the garbage (headers, comments, etc) from every downloaded page and
# concatenate the post bodies into one long, readable HTML document.
# Globals: BOOK_HTML (read)
#######################################
compose_book() {
  local page
  # Start from an empty document so nothing from an earlier run leaks in.
  : > "$BOOK_HTML"
  for page in article_*.html; do
    [[ -e "$page" ]] || continue
    pup 'div[class~=post_box]' < "$page" >> "$BOOK_HTML"
  done
}

#######################################
# Convert the HTML book into EPUB when Calibre's ebook-convert is installed;
# otherwise just report where the HTML book is.
# Globals: BOOK_DIR, BOOK_HTML, BOOK_EPUB (read)
#######################################
convert_book() {
  if ! command -v ebook-convert > /dev/null 2>&1; then
    echo "Calibre is not available in this system. Cannot convert to epub."
    echo "Your book (in html format) is available here:"
    echo "$BOOK_DIR/$BOOK_HTML"
    return 0
  fi

  echo "Converting into EPUB. This can take a while..."
  # stderr is silenced on purpose to keep the output quiet; failure is
  # detected through the exit status instead.
  if ebook-convert "$BOOK_HTML" "$BOOK_EPUB" 2> /dev/null; then
    echo "Conversion complete. Your ebook is available here:"
    echo "$BOOK_DIR/$BOOK_EPUB"
  else
    die "Conversion to epub failed." \
      "Your book (in html format) is available here:" \
      "$BOOK_DIR/$BOOK_HTML"
  fi
}

main() {
  local number

  require_tool pup
  require_tool curl
  require_tool wget

  links_file=$(mktemp "${TMPDIR:-/tmp}/mmm_links.XXXXXX") \
    || die "Could not create a temporary file."
  trap 'rm -f -- "$links_file"' EXIT

  # Parse article list to produce an "index" from which we will download pages:
  echo "Getting MMM's index page..."
  collect_links "$links_file"
  number=$(grep -c '' "$links_file")
  if ((number == 0)); then
    die "Could not find any article links in $ARTICLE_LIST"
  fi
  echo "There are $number articles to be downloaded."

  # Create a neat directory for tidiness, and start downloading pages.
  # This can take a loooong time (about 500 articles!)
  mkdir -p "$BOOK_DIR" || die "Could not create $BOOK_DIR"
  cd "$BOOK_DIR" || die "Could not enter $BOOK_DIR"
  # Pages left over from a previous run would end up in the new book.
  rm -f -- article_*.html

  echo "Downloading article pages. Please be patient, as it can take a long time."
  download_articles "$links_file" "$number"
  echo "Download completed."

  echo "Composing HTML document..."
  compose_book
  [[ -s "$BOOK_HTML" ]] || die "No article content could be extracted."

  # At this point, you are good to go. However, if you would like to make it
  # into an ebook, install calibre and follow up on converting it into epub!
  convert_book
}

main "$@"
exit 0