#!/usr/bin/env python
#
# "THE CRAWLER"
#
# In this three-part search engine, the crawler is responsible for retrieving
# information from the filesystem and assembling it into a form that can be
# queried later on.
#
# Note that we overwrite the index file every time we crawl. For this release
# there is no "delta checking" on the filesystem (at least not yet).
#
# VERSIONING:
# - Mechanism that allows the filesystem tree information to be stored in a
#   database for querying afterwards. Most, if not all, of the filesystem can
#   be crawled now.
# - The crawler is now a Python module, allowing it to be imported from
#   another Python program.
#
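# A minimal import-and-run sketch (assuming this file is saved as crawler.py;
# the module name and the path below are illustrations, not fixed by this
# script):
#
#     from crawler import Crawler
#
#     c = Crawler()
#     c.crawl('/home/user')
#     c.close()
#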
import os
import sys
import sqlite3 as sql
import time

# All the "index-searchable" filetypes (a set, for fast membership tests):
indexable = {
    'txt', 'csv', 'html', 'xml', 'xhtml', 'conf',
    'sh', 'py', 'pl', 'abw', 'cpp', 'h',
}

conn = sql.connect('index.db')
cursor = conn.cursor()

CLEAN_SLATE_QUERY = "DROP TABLE IF EXISTS fileindex"
GENESIS_QUERY = """
CREATE TABLE fileindex (
    path TEXT,
    name TEXT,
    extension TEXT,
    contents TEXT
)
"""
ADD_QUERY = """
INSERT INTO fileindex (path, name, extension, contents)
VALUES (?, ?, ?, ?)
"""


class Crawler:
    def __init__(self):
        # Recreate the database table from scratch, every time you crawl.
        cursor.execute(CLEAN_SLATE_QUERY)
        cursor.execute(GENESIS_QUERY)
        conn.commit()

    def crawl(self, start_dir='.'):
        print("Now crawling, starting from %s..." % start_dir)
        print("It may take a few minutes!")
        self.begin = time.time()

        # The crawling cycle (an empty filenames list simply loops zero times):
        for dirpath, dirnames, filenames in os.walk(os.path.realpath(start_dir)):
            for filename in filenames:
                # splitext copes with dotless filenames, unlike split('.'):
                extension = os.path.splitext(filename)[1].lstrip('.')
                # Instead of printing to stdout, let's write it down:
                filepath = os.path.join(dirpath, filename)

                # Grab the contents if the file is plain text:
                if extension in indexable:
                    # We may have issues reading some files.
                    try:
                        with open(filepath, 'r', errors='replace') as plaintext:
                            content = plaintext.read()
                    except OSError:
                        content = None
                else:
                    content = "binary"

                try:
                    cursor.execute(ADD_QUERY,
                                   (filepath, filename, extension, content))
                # For some reason, even when you crawl as root, some parts of
                # the operating system fail to get indexed =P
                except sql.ProgrammingError:
                    pass

        conn.commit()
        self.elapsed = time.time() - self.begin
        print("The operation took %d seconds" % self.elapsed)

    def close(self):
        conn.close()

if __name__ == "__main__":
    google = Crawler()
    if len(sys.argv) == 2:
        google.crawl(sys.argv[1])
    else:
        google.crawl()
    google.close()
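
# Typical invocation from a shell (assuming the crawler.py filename from the
# earlier sketch; the path is illustrative, and the two output lines come
# from the prints in crawl()):
#
#     $ python crawler.py /home/user
#     Now crawling, starting from /home/user...
#     It may take a few minutes!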