'''
deadseeker.py
Seeking out your 404s in around 100 lines of vanilla Python.
'''
import sys
import urllib.error  # explicit: HTTPError/URLError are caught below
from collections import deque
from html.parser import HTMLParser
from urllib import request
from urllib.parse import urlparse, urljoin
from urllib.request import Request

# Tag attributes whose values are treated as links worth checking.
search_attrs = {'href', 'src'}
# Desktop-browser User-Agent: some servers reject Python's default UA.
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
  14. def is_file(string):
  15. is_a_file = True
  16. if 'http' in string or 'https' in string:
  17. is_a_file = False
  18. return is_a_file
  19. class LinkParser(HTMLParser):
  20. def __init__(self, home, verbose):
  21. ''':home: a homepage, e.g. 'http://www.example.com/'
  22. :verbose: boolean for for verbose mode'''
  23. super().__init__()
  24. self.home = home
  25. self.verbose = verbose
  26. self.file = True
  27. self.checked_links = set()
  28. self.pages_to_check = deque()
  29. self.pages_to_check.appendleft(home)
  30. self.source = home
  31. self.scanner()
  32. def scanner(self):
  33. '''Loop through remaining pages, looking for HTML responses'''
  34. while self.pages_to_check:
  35. page = self.pages_to_check.pop()
  36. self.source = page
  37. self.file = is_file(page)
  38. if(not self.file):
  39. req = Request(page, headers={'User-Agent': agent})
  40. try:
  41. res = request.urlopen(req)
  42. if res.headers['content-type'] is not None:
  43. if 'html' in res.headers['content-type']:
  44. with res as f:
  45. body = f.read().decode('utf-8', errors='ignore')
  46. self.feed(body)
  47. except urllib.error.HTTPError as e:
  48. print(f'SOURCE: {self.source}')
  49. print(f'HTTPError: {e.code} - {self.source}') # (e.g. 404, 501, etc)
  50. else:
  51. if page[len(page) - 1] == '/':
  52. page += "index.html"
  53. try:
  54. read_file = open(page, "r", encoding="utf8", errors='ignore')
  55. if read_file:
  56. body = read_file.read()
  57. self.feed(body)
  58. else:
  59. print(f'SOURCE: {self.source}')
  60. print(f'FileError: {link}')
  61. except FileNotFoundError as e:
  62. print(f'SOURCE: {self.source}')
  63. print(f'FileError: {link}')
  64. def handle_starttag(self, tag, attrs):
  65. '''Override parent method and check tag for our attributes'''
  66. for attr in attrs:
  67. # ('href', 'http://www.example.com/')
  68. encoded_link = ""
  69. if type(attr[1]) != type(None):
  70. encoded_link = attr[1].replace(' ','%20')
  71. if attr[0] in search_attrs and encoded_link not in self.checked_links:
  72. self.checked_links.add(encoded_link)
  73. self.handle_link(encoded_link)
  74. def handle_link(self, link):
  75. '''Send a HEAD request to the link, catch any pesky errors'''
  76. if not bool(urlparse(link).netloc):
  77. link = urljoin(self.source, link)
  78. self.file = is_file(link)
  79. if(self.file):
  80. if link[len(link) - 1] == '/':
  81. link += "index.html"
  82. try:
  83. read_file = open(link, "r", encoding="utf8", errors='ignore')
  84. if read_file:
  85. page = read_file.read()
  86. if self.home in link:
  87. self.pages_to_check.appendleft(link)
  88. if self.verbose:
  89. print(f'File found: {link}')
  90. except FileNotFoundError as e:
  91. print(f'SOURCE: {self.source}')
  92. print(f'FileError: {link}')
  93. else:
  94. try:
  95. req = Request(link, headers={'User-Agent': agent}, method='HEAD')
  96. status = request.urlopen(req).getcode()
  97. if self.home in link:
  98. self.pages_to_check.appendleft(link)
  99. if self.verbose:
  100. print(f'{status} - {link}')
  101. except urllib.error.HTTPError as e:
  102. print(f'SOURCE: {self.source}')
  103. print(f'HTTPError: {e.code} - {link}') # (e.g. 404, 501, etc)
  104. except urllib.error.URLError as e:
  105. print(f'SOURCE: {self.source}')
  106. print(f'URLError: {e.reason} - {link}') # (e.g. conn. refused)
  107. except ValueError as e:
  108. print(f'SOURCE: {self.source}')
  109. print(f'ValueError {e} - {link}') # (e.g. missing protocol http)
  110. # check for verbose tag
  111. verbose = len(sys.argv) > 2 and sys.argv[2] == 'v'
  112. # enable this as a script, e.g., 'http://www.example.com/ v'
  113. LinkParser(sys.argv[1], verbose)