#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
================================================================================
Programmer:   Jonathan E. Landrum
Program:      complaint-scraper.py
Description:  Scrapes phone number complaints from a website and makes the
              data available via a RESTful API.
Dependencies: bs4
              bottle
              bottle.ext.sqlite
              cfscrape
              json
              lxml
              os
              socket
              socks
              sqlite3
              sys
References:   https://deshmukhsuraj.wordpress.com/2015/03/08/anonymous-web-scraping-using-python-and-tor/
              http://www.gregreda.com/2013/03/03/web-scraping-101-with-python/
              http://docs.python-guide.org/en/latest/scenarios/scrape/
              https://github.com/Anorov/cloudflare-scrape
Usage:        Run the script with `python complaint-scraper.py`
              Visit `http://localhost:8080` to view results
              Routes defined:
                /     Returns all results in the database
                /123  Returns the results for this area code only, where
                      123 is some valid area code
Version:      1.3
================================================================================
Changelog:
  Time                 Note                                              Version
--------------------------------------------------------------------------------
  14 Dec 15 13:00 CST  Added route for favicon.ico to remove errors      1.3
  14 Dec 15 11:30 CST  Refactored purge() and getComments()              1.2
  14 Dec 15 11:00 CST  Modified the API to be more user-friendly         1.1
  12 Dec 15 03:30 CST  Completed the API                                 1.0
  12 Dec 15 02:15 CST  Successfully getting JSON results from the API    0.4
  11 Dec 15 23:45 CST  Successfully storing results in the database      0.3
  11 Dec 15 22:30 CST  Successfully pulling data from target site        0.2
  11 Dec 15 19:00 CST  Testing database connection                       0.1
================================================================================
"""
from bottle import response, route, run
from bs4 import BeautifulSoup
from lxml import html
import bottle
import bottle.ext.sqlite
import cfscrape
import json
import os
import socket
import socks
import sqlite3 as sql
import sys

# Bottle application with the sqlite plugin pointed at the local scrape DB.
# NOTE(review): the route handlers below open sqlite3 connections directly
# rather than taking the plugin's injected `db` argument — the plugin appears
# unused beyond installation; confirm before removing.
app = bottle.Bottle()
plugin = bottle.ext.sqlite.Plugin(dbfile='complaint-scraper.db')
app.install(plugin)
def touch():
    """Ensure the SQLite database file exists, creating an empty one if absent."""
    db_path = 'complaint-scraper.db'
    if not os.path.exists(db_path):
        # Opening for write and closing immediately leaves a zero-byte file.
        with open(db_path, 'w'):
            pass
def init():
    """Create the Comments table (and the database file) if they do not exist.

    sqlite errors are printed rather than raised, matching the best-effort
    error-handling style used throughout this script.
    """
    try:
        touch()

        # The connection context manager wraps the DDL in a transaction and
        # commits on success; sqlite3 connections can execute SQL directly,
        # so the old `with con as cur:` (which re-bound the connection under
        # a misleading cursor name) is dropped.
        with sql.connect('complaint-scraper.db') as con:
            con.execute('''
                CREATE TABLE IF NOT EXISTS Comments(
                    ID INTEGER PRIMARY KEY AUTOINCREMENT,
                    Area_Code INTEGER,
                    Full_Number INTEGER,
                    Comment TEXT,
                    Num_Comments INTEGER)
                ''')
    except sql.Error as e:  # was py2-only `except sql.Error, e:`
        print("Error %s:" % e.args[0])
def purge():
    """Drop the Comments table and recreate it empty via init()."""
    try:
        touch()

        # Drop inside its own transaction; the context manager commits the
        # DROP before init() opens a fresh connection to rebuild the table.
        with sql.connect('complaint-scraper.db') as con:
            con.execute('''DROP TABLE IF EXISTS Comments''')
        init()
    except sql.Error as e:  # was py2-only `except sql.Error, e:`
        print("Error %s:" % e.args[0])
def scrape():
    """Scrape phone-number complaints from 800notes.com into the database.

    Wipes any previously stored rows first (purge), then parses each complaint
    preview on the front page and inserts area code, full number, comment text,
    and the displayed comment count. Errors are printed, not raised.
    """
    try:
        purge()

        # cfscrape solves Cloudflare's anti-bot challenge before we fetch.
        scraper = cfscrape.create_scraper()
        page = scraper.get('http://800notes.com/').content
        soup = BeautifulSoup(page, 'lxml')

        # Single connection/transaction for all inserts; committed on exit.
        with sql.connect('complaint-scraper.db') as con:
            for div in soup.findAll('div', class_='oos_preview'):
                cnt = div.find('div', class_='oos_previewSide')
                wrp = div.find('div', class_='oos_previewMain')
                num = wrp.find('div', class_='oos_previewHeader')
                lnk = num.find('a', class_='oos_previewTitle')
                txt = wrp.find('div', class_='oos_previewBody')
                # Link text is formatted like "123-456-7890"; slice out the
                # digits and drop the separator characters.
                areaCode = lnk.text[:3]
                fullNmbr = areaCode + lnk.text[4:7] + lnk.text[8:]
                con.execute('''
                    INSERT INTO Comments(
                        Area_Code, Full_Number, Comment, Num_Comments)
                    VALUES(?,?,?,?)
                    ''', (areaCode, fullNmbr, txt.text, cnt.text))
    except sql.IntegrityError as e:  # was py2-only `except ..., e:` syntax
        print("Error: %s" % e.args[0])
    except sql.Error as e:
        print("Error: %s" % e.args[0])
@app.route('/favicon.ico')
def favicon():
    """Serve an empty body for favicon requests so they stop logging errors."""
    return ""
@app.route('/', method='GET')
@app.route('/<ac>', method='GET')
@app.route('/<ac>/', method='GET')
def get(ac=''):
    """Return scraped complaints as a JSON array, optionally filtered.

    Args:
        ac: optional area-code path segment; the empty string (the bare `/`
            route) returns every stored complaint.

    Re-scrapes the source site on every request before querying, so results
    are always fresh (and every request pays the scrape cost).
    """
    scrape()
    response.content_type = 'application/json'
    result = []
    try:
        with sql.connect('complaint-scraper.db') as con:
            cmd = '''SELECT * FROM Comments'''
            args = ()
            if ac:
                # Parameterized filter: the previous string concatenation of
                # the URL segment into the SQL text allowed SQL injection.
                cmd += ''' WHERE Area_Code = ?'''
                args = (ac,)
            for row in con.execute(cmd, args):
                result.append({
                    'Area Code': row[1],
                    'Full Number': row[2],
                    'Comment': row[3],
                    'Number of Comments': row[4],
                })
        return json.dumps(result)
    except sql.IntegrityError as e:  # was py2-only `except ..., e:` syntax
        print("Error: %s" % e.args[0])
    except sql.Error as e:
        print("Error: %s" % e.args[0])
- app.run(host='localhost', port=8080, debug=True)