www-cluster/crawler/analysis.py

50 lines
1.6 KiB
Python
Executable File

#! /usr/bin/env python3
# analysis.py [(-l | --library) <library-path>] [<query> ...]
#
# Search the library archive (a .csv file produced by index-data) for the words in the query, and print the respective file paths.
import argparse
import csv
from fuzzywuzzy import fuzz
import sys
def makeCsvFieldLimitHuge():
    """Raise the csv module's per-field size cap as high as the platform allows.

    csv.field_size_limit rejects values that overflow a C long on some
    platforms, so probe downward from sys.maxsize until one is accepted.
    """
    candidate = sys.maxsize
    accepted = False
    while not accepted:
        try:
            csv.field_size_limit(candidate)
            accepted = True
        except OverflowError:
            # Too big for the underlying C type on this platform; halve and retry.
            candidate = int(candidate / 2)
def parseArgs():
    """Define the options, parse the command line, and return the options object.

    Returns:
        argparse.Namespace with:
          - library: str, path to the library .csv (default 'articles.csv')
          - query: str, the search text (positional words joined with spaces;
            defaults to 'South Kensington, London')
    """
    optionsParser = argparse.ArgumentParser()
    # No nargs=1 here: with nargs=1 argparse stores a one-element LIST when the
    # flag is given but the plain default string otherwise, so downstream
    # open(options.library) would crash. A bare optional stores a str either way.
    optionsParser.add_argument('-l', '--library', type = str, default = 'articles.csv', help = "specify the library to search")
    optionsParser.add_argument('query', type = str, nargs='*', default = 'South Kensington, London', help="strings to search in the library")
    options = optionsParser.parse_args()
    # nargs='*' yields a list of words when any are given, but the default is a
    # plain string; normalize to a single search string for consistent typing.
    if isinstance(options.query, list):
        options.query = ' '.join(options.query)
    return options
def readArticles(path: str) -> list:
    """Read the library file.

    Args:
        path: filesystem path to a .csv produced by index-data.

    Returns:
        A list of rows, each a list of string fields.
    """
    # The csv docs require newline='' so the reader handles embedded newlines
    # inside quoted fields correctly (the text layer must not translate them).
    # NOTE(review): encoding is left at the platform default to preserve the
    # original behavior — confirm whether index-data writes UTF-8.
    with open(path, 'r', newline='') as csvfile:
        return list(csv.reader(csvfile))
def query(articles: list, search, threshold: int = 30):
    """Search all the indexed documents for the given words and print matches.

    Scores every article against *search* with fuzzywuzzy's token_set_ratio,
    sorts best-first, and prints each document scoring at least *threshold*.

    Args:
        articles: rows of [url, text] as returned by readArticles.
        search: the search text — a string, or a list of words (as argparse
            nargs='*' produces), which is joined with spaces.
        threshold: minimum match percentage to print (default 30, matching
            the original hard-coded cutoff).
    """
    # argparse with nargs='*' can hand us a list of words; normalize it.
    if isinstance(search, (list, tuple)):
        search = ' '.join(search)
    ratio = [(fuzz.token_set_ratio(row[1], search), row[0]) for row in articles]
    ratio.sort(reverse=True)
    for score, url in ratio:
        if score >= threshold:
            print("Match: %d%% in URL: %s" % (score, url))
        else:
            # Sorted descending, so nothing later can reach the threshold.
            break
def main():
    """Entry point: parse options, lift the csv field cap, and run the search."""
    options = parseArgs()
    # Must run before reading: indexed articles can carry very large fields.
    makeCsvFieldLimitHuge()
    query(readArticles(options.library), options.query)

# Guard the entry point so importing this module does not trigger a search.
if __name__ == "__main__":
    main()