#! /usr/bin/env python3
# analysis.py [(-l | --library) LIBRARY] [QUERY ...]
#
# Search the library archive (a .csv file produced by index-data) for the
# words in the query, and print the matching entries with their scores.

import argparse
import csv
import sys

try:
    from fuzzywuzzy import fuzz
except ImportError:
    # Deferred failure: --help and argument errors keep working without the
    # third-party package; query() reports the problem when scoring is needed.
    fuzz = None


def makeCsvFieldLimitHuge():
    """Raise the csv module's field size limit as far as the platform allows.

    Starts at sys.maxsize and halves the candidate every time
    csv.field_size_limit() rejects it with OverflowError.
    """
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
        except OverflowError:
            # Integer halving keeps the candidate exact (no float round-trip).
            limit //= 2
        else:
            return


def parseArgs(argv=None):
    """Define the options, parse the command line, and return the options.

    argv: list of argument strings to parse; None (the default) makes
        argparse read sys.argv[1:].

    Returns a namespace with two string attributes:
        library -- path of the library file to search
        query   -- the search text; several words given on the command line
                   are joined with single spaces
    """
    optionsParser = argparse.ArgumentParser()
    # No nargs here: nargs=1 would wrap the path in a one-element list, which
    # open() cannot accept.
    optionsParser.add_argument('-l', '--library',
                               type=str,
                               default='articles.csv',
                               help="specify the library to search")
    optionsParser.add_argument('query',
                               type=str,
                               nargs='*',
                               default='South Kensington, London',
                               help="strings to search in the library")
    options = optionsParser.parse_args(argv)
    # nargs='*' yields a list when words are supplied but leaves the default
    # as a plain string; normalise so callers always receive one string.
    if not isinstance(options.query, str):
        options.query = ' '.join(options.query)
    return options


def readArticles(path: str) -> list:
    """Read the library file and return its rows as a list of lists."""
    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends for files passed to csv.reader.
    with open(path, 'r', newline='') as csvfile:
        return list(csv.reader(csvfile))


def rankArticles(articles: list, search: str, scorer, threshold: int = 30) -> list:
    """Score every article against the search text and keep the good ones.

    articles: rows whose first column is the value reported for a match and
        whose second column is the text that gets scored.
    search: the text to look for.
    scorer: callable (text, search) -> score.
    threshold: minimum score a row needs to be kept.

    Returns (score, first column) tuples in descending order. Rows with
    fewer than two columns (e.g. blank lines in the CSV) are skipped.
    """
    scored = [(scorer(row[1], search), row[0])
              for row in articles
              if len(row) >= 2]
    scored.sort(reverse=True)
    return [entry for entry in scored if entry[0] >= threshold]


def query(articles: list, search: str, threshold: int = 30):
    """Search all the indexed documents for the given words, sort them by
    how well they match the search, and print every document that scores at
    least `threshold` percent (30 by default).

    Raises ImportError if the fuzzywuzzy package is not installed.
    """
    if fuzz is None:
        raise ImportError("the 'fuzzywuzzy' package is required to search the library")
    for match in rankArticles(articles, search, fuzz.token_set_ratio, threshold):
        print("Match: %d%% in URL: %s" % match)


def main():
    """Parse the command line, load the library and print the matches."""
    options = parseArgs()
    makeCsvFieldLimitHuge()
    query(readArticles(options.library), options.query)


if __name__ == "__main__":
    main()