#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
================================================================================
Programmer:   Jonathan E. Landrum
Program:      complaint-scraper.py
Description:  Scrapes phone number complaints from a website and makes the
              data available via a RESTful API.
Dependencies: bs4
              bottle
              bottle.ext.sqlite
              cfscrape
              json
              lxml
              os
              socket
              socks
              sqlite3
              sys
References:   https://deshmukhsuraj.wordpress.com/2015/03/08/anonymous-web-scraping-using-python-and-tor/
              http://www.gregreda.com/2013/03/03/web-scraping-101-with-python/
              http://docs.python-guide.org/en/latest/scenarios/scrape/
              https://github.com/Anorov/cloudflare-scrape
Usage:        Run the script with `python complaint-scraper.py`
              Visit `http://localhost:8080` to view results
              Routes defined:
                /     Returns all results in the database
                /123  Returns the results for this area code only, where
                      123 is some valid area code
Version:      1.3
================================================================================
Changelog:
Time                 Note                                             Version
--------------------------------------------------------------------------------
14 Dec 15 13:00 CST  Added route for favicon.ico to remove errors     1.3
14 Dec 15 11:30 CST  Refactored purge() and getComments()             1.2
14 Dec 15 11:00 CST  Modified the API to be more user-friendly        1.1
12 Dec 15 03:30 CST  Completed the API                                1.0
12 Dec 15 02:15 CST  Successfully getting JSON results from the API   0.4
11 Dec 15 23:45 CST  Successfully storing results in the database     0.3
11 Dec 15 22:30 CST  Successfully pulling data from target site       0.2
11 Dec 15 19:00 CST  Testing database connection                      0.1
================================================================================
"""
from bottle import response, route, run
from bs4 import BeautifulSoup
from lxml import html
import bottle
import bottle.ext.sqlite
import cfscrape
import json
import os
# socket and socks support the Tor-based anonymous scraping described in the
# references; this version does not route its traffic through Tor
import socket
import socks
import sqlite3 as sql
import sys

app = bottle.Bottle()
plugin = bottle.ext.sqlite.Plugin(dbfile='complaint-scraper.db')
app.install(plugin)
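# The sqlite plugin installed above passes an open `db` connection to any
# route callback that declares a `db` keyword argument; the handlers below
# open their own connections instead, so the plugin goes unused in practice.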

def touch():
    # See if the database file exists, and create it if not
    if not os.path.exists('complaint-scraper.db'):
        open('complaint-scraper.db', 'w').close()

def init():
    try:
        touch()
        # Create the table
        with sql.connect('complaint-scraper.db') as con:
            con.execute('''
                CREATE TABLE IF NOT EXISTS Comments(
                    ID INTEGER PRIMARY KEY AUTOINCREMENT,
                    Area_Code INTEGER,
                    Full_Number INTEGER,
                    Comment TEXT,
                    Num_Comments INTEGER)
                ''')
    except sql.Error as e:
        print "Error: %s" % e.args[0]
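# Note: Area_Code, Full_Number, and Num_Comments are declared INTEGER, so
# sqlite's type affinity coerces the digit strings scraped below to integers
# on insert; only Comment is stored as text.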

def purge():
    try:
        touch()
        # Drop the table, then recreate it empty
        with sql.connect('complaint-scraper.db') as con:
            con.execute('''DROP TABLE IF EXISTS Comments''')
        init()
    except sql.Error as e:
        print "Error: %s" % e.args[0]

def scrape():
    try:
        purge()
        # Connect to the site; cfscrape solves CloudFlare's anti-bot
        # challenge before handing back the page
        scrp = cfscrape.create_scraper()
        rqst = scrp.get('http://800notes.com/').content
        soup = BeautifulSoup(rqst, 'lxml')
        # Connect to the database
        with sql.connect('complaint-scraper.db') as con:
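            # The selectors below assume front-page markup along these lines
            # (as observed in late 2015; the site can change at any time):
            #
            #   <div class="oos_preview">
            #     <div class="oos_previewSide">3</div>
            #     <div class="oos_previewMain">
            #       <div class="oos_previewHeader">
            #         <a class="oos_previewTitle" href="...">800-555-1234</a>
            #       </div>
            #       <div class="oos_previewBody">Comment text...</div>
            #     </div>
            #   </div>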
            for div in soup.findAll('div', class_='oos_preview'):
                cnt = div.find('div', class_='oos_previewSide')
                wrp = div.find('div', class_='oos_previewMain')
                num = wrp.find('div', class_='oos_previewHeader')
                lnk = num.find('a', class_='oos_previewTitle')
                txt = wrp.find('div', class_='oos_previewBody')
                # The link text is a number like '800-555-1234': the first
                # three characters are the area code, and skipping the
                # dashes yields the full number
                areaCode = lnk.text[:3]
                fullNmbr = areaCode + lnk.text[4:7] + lnk.text[8:]
                cmntText = txt.text
                numCmnts = cnt.text
                con.execute('''
                    INSERT INTO Comments(
                        Area_Code, Full_Number, Comment, Num_Comments)
                    VALUES(?,?,?,?)
                    ''', (areaCode, fullNmbr, cmntText, numCmnts))
    except sql.IntegrityError as e:
        print "Error: %s" % e.args[0]
    except sql.Error as e:
        print "Error: %s" % e.args[0]
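# Because scrape() purges and rebuilds the table on every call, each API
# response reflects only the complaints on the 800notes front page at the
# moment of the request.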

@app.route('/favicon.ico')
def favicon():
    # Browsers request this automatically; returning an empty body keeps
    # 404 errors out of the server log
    return ""

@app.route('/', method='GET')
@app.route('/<ac>', method='GET')
@app.route('/<ac>/', method='GET')
def get(ac=''):
    scrape()
    response.content_type = 'application/json'
    result = []
    try:
        # Connect to the database
        with sql.connect('complaint-scraper.db') as con:
            cmd = '''SELECT * FROM Comments'''
            args = ()
            # Bind the area code as a query parameter instead of splicing it
            # into the SQL string, so a crafted URL can't inject SQL
            if ac:
                cmd += ''' WHERE Area_Code = ?'''
                args = (ac,)
            for row in con.execute(cmd, args):
                res = {
                    'Area Code': row[1],
                    'Full Number': row[2],
                    'Comment': row[3],
                    'Number of Comments': row[4]
                }
                result.append(res)
        return json.dumps(result)
    except sql.IntegrityError as e:
        print "Error: %s" % e.args[0]
    except sql.Error as e:
        print "Error: %s" % e.args[0]

app.run(host='localhost', port=8080, debug=True)