'''
deadseeker.py
Seeking out your 404s in around 100 lines of vanilla Python.
'''
import sys
import urllib.error
from urllib import request
from urllib.parse import urlparse, urljoin
from urllib.request import Request
from html.parser import HTMLParser
from collections import deque

search_attrs = set(['href', 'src'])
agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'

def is_file(string):
    '''Treat anything that isn't an http(s) link as a local file path'''
    return 'http' not in string
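
# For illustration only (not part of the original script), the heuristic above
# classifies links like so:
#   is_file('images/logo.png')       -> True   (checked on disk)
#   is_file('https://example.com/')  -> False  (checked over HTTP)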

class LinkParser(HTMLParser):
    def __init__(self, home, verbose):
        ''':home: a homepage, e.g. 'http://www.example.com/'
        :verbose: boolean for verbose mode'''
        super().__init__()
        self.home = home
        self.verbose = verbose
        self.file = True
        self.checked_links = set()
        self.pages_to_check = deque()
        self.pages_to_check.appendleft(home)
        self.source = home
        self.scanner()

    def scanner(self):
        '''Loop through remaining pages, looking for HTML responses'''
        while self.pages_to_check:
            page = self.pages_to_check.pop()
            self.source = page
            self.file = is_file(page)
            if not self.file:
                req = Request(page, headers={'User-Agent': agent})
                try:
                    res = request.urlopen(req)
                    # only parse HTML responses for further links
                    if res.headers['content-type'] is not None:
                        if 'html' in res.headers['content-type']:
                            with res as f:
                                body = f.read().decode('utf-8', errors='ignore')
                                self.feed(body)
                except urllib.error.HTTPError as e:
                    print(f'SOURCE: {self.source}')
                    print(f'HTTPError: {e.code} - {self.source}')  # (e.g. 404, 501, etc)
            else:
                if page.endswith('/'):
                    page += 'index.html'
                try:
                    with open(page, 'r', encoding='utf8', errors='ignore') as read_file:
                        body = read_file.read()
                        self.feed(body)
                except FileNotFoundError:
                    print(f'SOURCE: {self.source}')
                    print(f'FileError: {page}')

    def handle_starttag(self, tag, attrs):
        '''Override parent method and check tag for our attributes'''
        for attr in attrs:
            # e.g. ('href', 'http://www.example.com/')
            encoded_link = ''
            if attr[1] is not None:
                encoded_link = attr[1].replace(' ', '%20')
            if attr[0] in search_attrs and encoded_link not in self.checked_links:
                self.checked_links.add(encoded_link)
                self.handle_link(encoded_link)

    def handle_link(self, link):
        '''Send a HEAD request to the link, catch any pesky errors'''
        if not bool(urlparse(link).netloc):
            # relative link, resolve it against the page it came from
            link = urljoin(self.source, link)

        self.file = is_file(link)
        if self.file:
            if link.endswith('/'):
                link += 'index.html'
            try:
                # open the file only to confirm it exists and is readable;
                # scanner() parses it once it is popped off the deque
                with open(link, 'r', encoding='utf8', errors='ignore'):
                    if self.home in link:
                        self.pages_to_check.appendleft(link)
                    if self.verbose:
                        print(f'File found: {link}')
            except FileNotFoundError:
                print(f'SOURCE: {self.source}')
                print(f'FileError: {link}')
        else:
            try:
                req = Request(link, headers={'User-Agent': agent}, method='HEAD')
                status = request.urlopen(req).getcode()
                if self.home in link:
                    self.pages_to_check.appendleft(link)
                if self.verbose:
                    print(f'{status} - {link}')
            except urllib.error.HTTPError as e:
                print(f'SOURCE: {self.source}')
                print(f'HTTPError: {e.code} - {link}')  # (e.g. 404, 501, etc)
            except urllib.error.URLError as e:
                print(f'SOURCE: {self.source}')
                print(f'URLError: {e.reason} - {link}')  # (e.g. conn. refused)
            except ValueError as e:
                print(f'SOURCE: {self.source}')
                print(f'ValueError {e} - {link}')  # (e.g. missing protocol http)

# check for the verbose flag
verbose = len(sys.argv) > 2 and sys.argv[2] == 'v'
# run as a script, e.g. python deadseeker.py 'http://www.example.com/' v
LinkParser(sys.argv[1], verbose)
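
# A minimal usage sketch (assumptions: the script is saved as deadseeker.py and
# run with Python 3.6+ for the f-strings; the URL below is hypothetical):
#
#   $ python deadseeker.py 'https://example.com/' v
#
# The second argument 'v' is optional and enables verbose mode, printing the
# status of every link checked instead of only the failures.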