'''
deadseeker.py
Seeking out your 404s in around 100 lines of vanilla Python.
'''
import sys
import urllib.error
from urllib import request
from urllib.parse import urlparse, urljoin
from urllib.request import Request
from html.parser import HTMLParser
from collections import deque

search_attrs = set(['href', 'src'])
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'

def is_file(string):
    '''Treat anything that isn't an http(s) link as a local file path'''
    return 'http' not in string
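
# For illustration only (not part of the original script), the heuristic above
# classifies links like so:
#   is_file('images/logo.png')       -> True   (checked on disk)
#   is_file('https://example.com/')  -> False  (checked over HTTP)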

class LinkParser(HTMLParser):
    def __init__(self, home, verbose):
        ''':home: a homepage, e.g. 'http://www.example.com/'
        :verbose: boolean for verbose mode'''
        super().__init__()
        self.home = home
        self.verbose = verbose
        self.file = True
        self.checked_links = set()
        self.pages_to_check = deque()
        self.pages_to_check.appendleft(home)
        self.source = home
        self.scanner()

    def scanner(self):
        '''Loop through remaining pages, looking for HTML responses'''
        while self.pages_to_check:
            page = self.pages_to_check.pop()
            self.source = page
            self.file = is_file(page)
            if not self.file:
                req = Request(page, headers={'User-Agent': agent})
                try:
                    res = request.urlopen(req)
                    # only parse HTML responses for further links
                    if res.headers['content-type'] is not None:
                        if 'html' in res.headers['content-type']:
                            with res as f:
                                body = f.read().decode('utf-8', errors='ignore')
                                self.feed(body)
                except urllib.error.HTTPError as e:
                    print(f'SOURCE: {self.source}')
                    print(f'HTTPError: {e.code} - {self.source}')  # (e.g. 404, 501, etc)
            else:
                if page.endswith('/'):
                    page += 'index.html'
                try:
                    with open(page, 'r', encoding='utf8', errors='ignore') as read_file:
                        body = read_file.read()
                        self.feed(body)
                except FileNotFoundError:
                    print(f'SOURCE: {self.source}')
                    print(f'FileError: {page}')

    def handle_starttag(self, tag, attrs):
        '''Override parent method and check tag for our attributes'''
        for attr in attrs:
            # e.g. ('href', 'http://www.example.com/')
            encoded_link = ''
            if attr[1] is not None:
                encoded_link = attr[1].replace(' ', '%20')
            if attr[0] in search_attrs and encoded_link not in self.checked_links:
                self.checked_links.add(encoded_link)
                self.handle_link(encoded_link)

    def handle_link(self, link):
        '''Send a HEAD request to the link, catch any pesky errors'''
        if not bool(urlparse(link).netloc):
            # relative link, resolve it against the page it came from
            link = urljoin(self.source, link)

        self.file = is_file(link)
        if self.file:
            if link.endswith('/'):
                link += 'index.html'
            try:
                # open the file only to confirm it exists and is readable;
                # scanner() parses it once it is popped off the deque
                with open(link, 'r', encoding='utf8', errors='ignore'):
                    if self.home in link:
                        self.pages_to_check.appendleft(link)
                    if self.verbose:
                        print(f'File found: {link}')
            except FileNotFoundError:
                print(f'SOURCE: {self.source}')
                print(f'FileError: {link}')
        else:
            try:
                req = Request(link, headers={'User-Agent': agent}, method='HEAD')
                status = request.urlopen(req).getcode()
                if self.home in link:
                    self.pages_to_check.appendleft(link)
                if self.verbose:
                    print(f'{status} - {link}')
            except urllib.error.HTTPError as e:
                print(f'SOURCE: {self.source}')
                print(f'HTTPError: {e.code} - {link}')  # (e.g. 404, 501, etc)
            except urllib.error.URLError as e:
                print(f'SOURCE: {self.source}')
                print(f'URLError: {e.reason} - {link}')  # (e.g. conn. refused)
            except ValueError as e:
                print(f'SOURCE: {self.source}')
                print(f'ValueError {e} - {link}')  # (e.g. missing protocol http)

# check for the verbose flag
verbose = len(sys.argv) > 2 and sys.argv[2] == 'v'
# run as a script, e.g. python deadseeker.py 'http://www.example.com/' v
LinkParser(sys.argv[1], verbose)
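
# A minimal usage sketch (assumptions: the script is saved as deadseeker.py and
# run with Python 3.6+ for the f-strings; the URL below is hypothetical):
#
#   $ python deadseeker.py 'https://example.com/' v
#
# The second argument 'v' is optional and enables verbose mode, printing the
# status of every link checked instead of only the failures.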