#!/usr/bin/env python3
#
# THE QUERIER
#
# The querier acts as the intermediary between the user interface and the
# crawler's database. Although it can act as a user interface itself, that
# is not its goal. Rather, it focuses on presenting the data gathered by
# the crawler in as many ways as possible.
#
# VERSIONING:
# - A text-based user interface allowing multiple queries per session, plus
#   an option to recrawl the filesystem.
# - A text-based user interface allowing index searching within plain-text
#   files.
# - A module that allows the querying and preparation of results to be
#   displayed by another interface module.
#
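# This module leans on the companion crawl module, which (judging from the
# calls below; the real crawl.py may differ) is assumed to expose:
#
#   crawl.cursor   - an sqlite3 cursor over a 'fileindex' table that has at
#                    least 'name', 'path' and 'contents' columns
#   crawl.conn     - the sqlite3 connection behind that cursor
#   crawl.Crawler  - a class whose crawl(path) method (re)builds the index
#                    from 'path' downwards
#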
import crawl  # the crawler provides the database connection and cursor

# Session variables:
query = "nothing"
results = "nothing still"
total = 0

# For the sake of modularity, we expose these methods for compatibility.

# Searching by file name only:
def simple_search(token):
    if token == "":
        return
    result_set = crawl.cursor.execute(
        "SELECT path FROM fileindex WHERE name LIKE ?",
        ('%{}%'.format(token),))
    return result_set
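
# Example (hypothetical session): the returned sqlite3 cursor is iterable
# and yields one-element tuples, and an empty token returns None, so a
# caller might write:
#
#   for (path,) in simple_search("notes") or []:
#       print(path)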

# Searching within file contents:
def index_search(token):
    if token == "":
        return
    result_set = crawl.cursor.execute(
        "SELECT path, contents FROM fileindex WHERE contents LIKE ?",
        ('%{}%'.format(token),))
    return result_set
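
# index_search behaves the same way but yields (path, contents) pairs, so a
# caller could, for instance, count the matches (hypothetical sketch):
#
#   hits = sum(1 for _ in index_search("TODO") or [])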

def refresh():
    # Re-crawl the filesystem from the root to rebuild the index:
    google = crawl.Crawler()
    google.crawl('/')
    return True
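
# When this file is used as a module by another interface (see the third
# VERSIONING entry above), a caller might do something like the following
# (hypothetical sketch; assumes this file is saved as querier.py):
#
#   import querier
#   for (path,) in querier.simple_search("report") or []:
#       display(path)  # display() would be the caller's own routine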

if __name__ == "__main__":
    # Start of program:
    print("=========")
    print("Linux and Unix file locating utility")
    print("=========")

    # Querying session:
    while True:
        total = 0
        query = input("\nType the name of the file you're looking for, or '.com' for a list of commands:\n>_ ")
- if query == ".com":
- print """.com - shows a list of commands
- .quit - exits the program
- .index - searches within files
- .refresh - refreshes the database by crawling again"""
- continue
- if query == ".quit":
- break
-
- if query == ".index":
- # Perform index searching and show a snippet of the results:
- query = raw_input("Type a query to search for: ")
- results = index_search(query)
- for result in results:
- print "=========\nIn %s, we found: " % result[0]
-
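                # Worked example (hypothetical data): if the stored contents
                # were "the quick brown fox jumps over the lazy dog" and the
                # query were "fox", the code below would print the snippet
                # "the quick brown fox jumps over the lazy" (the empty
                # strings produced by splitting on the spaces next to the
                # match count toward the five-word limit on each side).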
                # Build a snippet of up to five words on each side of the
                # match. The part before the first occurrence always exists,
                # but the part after it may not: LIKE matches
                # case-insensitively while str.split does not, so the query
                # may not literally appear in the contents.
                before = result[1].split(query)[0].split(" ")
                before.reverse()  # so we can walk outward from the match
                try:
                    after = result[1].split(query)[1].split(" ")
                except IndexError:
                    after = ['']

                snippet_before = []
                snippet_after = []

                word_count = 0
                snippet_size = 5  # word limit on each side
                while word_count < snippet_size:
                    try:
                        snippet_before.append(before[word_count])
                    except IndexError:
                        # If we are on either "margin" of the text content,
                        # we may not be able to extract all the required
                        # words:
                        pass
                    try:
                        snippet_after.append(after[word_count])
                    except IndexError:
                        pass
                    word_count += 1

                # Reverse the "before" words back into reading order:
                snippet_before.reverse()
                snippet = (" ".join(snippet_before) + query +
                           " ".join(snippet_after))
                print(snippet + "\n")
                total += 1
            print("======\nFound %d files containing '%s'" % (total, query))
            continue

        if query == ".refresh":
            print("Refreshing database... this may take a while.")
            refresh()
            print("=======\nRefreshing complete. Please try again.")
            continue

        if query == "":
            continue  # avoids the huge overhead of listing everything!

        # Standard file-name search:
        results = simple_search(query)
        for result in results:
            print(result[0])
            total += 1
        crawl.conn.commit()  # harmless here: the search only reads
        print("======\n%d files found matching '%s'" % (total, query))

    # Finish everything beautifully
    crawl.conn.close()
    print("Bye!")