50 lines
1.6 KiB
Python
50 lines
1.6 KiB
Python
|
#! /usr/bin/env python3
|
||
|
|
||
|
# analysis.py [(-l | --library) <library-path>] [<query> ...]
|
||
|
#
|
||
|
# Search the library archive (a .csv file produced by index-data) for the words in the query, and print the respective file paths.
|
||
|
|
||
|
import argparse
|
||
|
import csv
|
||
|
from fuzzywuzzy import fuzz
|
||
|
import sys
|
||
|
|
||
|
def makeCsvFieldLimitHuge():
    """Raise the csv module's per-field size limit as far as it will accept.

    Starts with ``sys.maxsize`` and halves the candidate every time
    ``csv.field_size_limit`` rejects it with ``OverflowError``. The first
    value that is accepted stays in effect. Returns nothing.
    """
    limit = sys.maxsize
    while True:
        try:
            csv.field_size_limit(limit)
            return
        except OverflowError:
            # Floor division keeps the candidate an exact integer. True
            # division rounds through a float and overshoots
            # (int(sys.maxsize / 2) is 2**62, not 2**62 - 1 on 64-bit
            # builds), which makes the search skip the largest value that
            # would have been accepted.
            limit //= 2
|
||
|
|
||
|
def parseArgs():
    """Define the options, parse the command line, and return the options object.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()`` with:

        * ``library`` -- path of the library file as a plain string
          (``'articles.csv'`` when ``-l``/``--library`` is not given).
        * ``query`` -- a list of the words given on the command line, or
          the default string ``'South Kensington, London'`` when none are
          given.
    """
    optionsParser = argparse.ArgumentParser()
    # Deliberately no nargs here: nargs=1 wraps a user-supplied path in a
    # one-element list while the default stays a bare string, so the value
    # could not be handed to open() when -l was actually used.
    optionsParser.add_argument(
        '-l', '--library',
        type=str,
        default='articles.csv',
        help="specify the library to search",
    )
    optionsParser.add_argument(
        'query',
        type=str,
        nargs='*',
        default='South Kensington, London',
        help="strings to search in the library",
    )

    return optionsParser.parse_args()
|
||
|
|
||
|
def readArticles(path: str) -> list:
    """Read the library file.

    Args:
        path: Path of the CSV library file.

    Returns:
        A list with one entry per CSV record, each entry being the list of
        that record's field strings.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires. Without it, line breaks embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(path, 'r', newline='') as csvfile:
        return list(csv.reader(csvfile))
|
||
|
|
||
|
def query(articles: list, search: str, threshold: int = 30):
    """Print every indexed document that matches the search well enough.

    The second column of each row is scored against ``search`` with
    ``fuzz.token_set_ratio``. The results are sorted in descending order of
    ``(score, first column)`` and every result scoring at least
    ``threshold`` is printed.

    Args:
        articles: Rows of the library; column 0 is printed as the URL and
            column 1 is the text that is matched.
        search: The text to look for.
        threshold: Minimum score (in percent) a row needs to be listed.
            Defaults to 30.
    """
    # csv.reader yields an empty list for a blank line. Rows without both
    # columns cannot be scored, so skip them instead of failing with
    # IndexError.
    scored = [
        (fuzz.token_set_ratio(row[1], search), row[0])
        for row in articles
        if len(row) >= 2
    ]
    scored.sort(reverse=True)

    for score, url in scored:
        if score >= threshold:
            print("Match: %d%% in URL: %s" % (score, url))
|
||
|
|
||
|
def main():
    """Parse the command line, load the library, and print the matches."""
    options = parseArgs()
    makeCsvFieldLimitHuge()

    # An argparse argument declared with nargs comes back as a list when it
    # is given on the command line, but as the bare default string when it
    # is not. Normalise both shapes to the plain strings that open() and
    # the fuzzy matcher need.
    library = options.library
    if isinstance(library, list):
        library = library[0]

    search = options.query
    if not isinstance(search, str):
        search = ' '.join(search)

    query(readArticles(library), search)
|
||
|
|
||
|
# Run the search only when executed as a script, so that importing this
# module does not parse sys.argv or read the library as a side effect.
if __name__ == "__main__":
    main()
|