from collections import Counter
import re
import string

from controllers import Config

# Translation tables that delete punctuation / digit characters in one pass.
punctuation = str.maketrans('', '', string.punctuation)
digits = str.maketrans('', '', string.digits)
# Matches 1-3 character words (plus any leading non-word chars) for removal.
shortword = re.compile(r'\W*\b\w{1,3}\b')
# Joins per-document titles so the number of documents can be recovered later.
SEPARATOR = " $$$ "


def generate_keywords(data, cluster, depth, unique):
    """Build a word-cloud feature list for each cluster of an
    Elasticsearch multi-search response.

    :param data: msearch result; iterates ``data["responses"]`` and reads
        ``hits.hits[*]._source.title_cleaned`` and ``hits.total``.
    :param cluster: cluster-size string, copied verbatim into each link.
    :param depth: current depth; links carry ``depth + 1``.
    :param unique: "false" -> top 5000 most common words; anything else ->
        the 50 least common words (a "delta" cloud).
    :return: list of ``{"name", "total", "feature"}`` dicts, one per cluster.
    """
    response = []
    cluster_id = 0
    for res in data["responses"]:
        # O(n) join instead of quadratic "+=" concatenation in a loop.
        titles = " ".join(
            doc["_source"]["title_cleaned"] for doc in res["hits"]["hits"]
        )
        # TODO: better split and clean words, e.g. "TrobriandsCandidate".
        # Split CamelCase runs into separate capitalized words.
        titles = " ".join(re.findall('[A-Z][^A-Z]*', titles))
        titles = shortword.sub('', titles)
        words = titles.translate({**punctuation, **digits}).split(" ")
        freq = Counter(words)

        if unique == "false":
            tmp = dict(freq.most_common()[:5000])
        else:
            # Rarest 50 words, rarest first.
            tmp = dict(reversed(freq.most_common()[-50:]))

        keys = []
        for key, weight in tmp.items():
            # translate() can shorten tokens again, so re-check the length.
            if len(key) > 2:
                keys.append({
                    "text": key,
                    "weight": weight,
                    "link": "/?query=" + key
                            + "&clustersize=" + cluster
                            + "&cluster=" + str(cluster_id)
                            + "&weight=" + str(weight)
                            + "&depth=" + str(int(depth) + 1),
                })

        cluster_id += 1
        # NOTE: the display name is 1-based while the links above carry the
        # 0-based cluster id — preserved from the original behavior.
        response.append({
            "name": "Cluster " + str(cluster_id),
            "total": res["hits"]["total"],
            "feature": keys,
        })
    return response


def generate_keywords_filtered(keywords, data, cluster, depth, unique):
    """Build word-cloud feature lists grouped by the ``Config.DEPTH`` field
    for the (clamped) depth, filtering against already-used *keywords*.

    :param keywords: comma-joined query string of keywords used so far;
        candidate words already present in it are skipped.
    :param data: msearch result (same shape as in :func:`generate_keywords`).
    :param cluster: cluster-size string, copied verbatim into each link.
    :param depth: requested depth; clamped via :func:`get_cluster_depth`.
    :param unique: "false" -> top 5000 words; anything else -> rarest 50.
    :return: list of ``{"name", "total", "feature"}`` dicts, one per group.
    """
    depth = get_cluster_depth(depth)
    depth_field = Config.DEPTH[depth]

    # Group title strings by the depth field value (insertion order preserved)
    # and collect every title once for the global frequency filter.
    grouped = {}
    all_titles = []
    for res in data["responses"]:
        for document in res["hits"]["hits"]:
            src = document["_source"]
            # SEPARATOR suffix lets us count documents per group later.
            grouped.setdefault(src[depth_field], []).append(
                src["title_cleaned"] + SEPARATOR
            )
            all_titles.append(src["title_cleaned"])

    # Words occurring at most once across ALL groups; a set gives O(1)
    # membership tests (the original used a list — same result, O(n) lookups).
    keywords_filter = " " + " ".join(all_titles)
    word_counts = Counter(keywords_filter.split(" "))
    allowed_keywords = {w for w, c in word_counts.items() if c < 2}

    response = []
    for name, parts in grouped.items():
        joined = "".join(parts)
        # Each document contributed exactly one SEPARATOR.
        total = len(joined.split(SEPARATOR)) - 1
        words = (shortword.sub('', joined)
                 .translate({**punctuation, **digits})
                 .lower()
                 .split(" "))
        freq = Counter(words)

        if unique == "false":
            tmp = dict(freq.most_common()[:5000])
        else:
            tmp = dict(reversed(freq.most_common()[-50:]))

        keys = []
        for key, weight in tmp.items():
            if total < 2:
                # Single document in this group: link straight to the doc view.
                keys.append({
                    "text": key,
                    "weight": weight,
                    "link": "/docs?query=" + keywords + "," + key
                            + "&depth=" + str(depth + 1)
                            + "&cluster=" + cluster
                            + "&delta_cloud=" + unique
                            + "&name=" + name
                            + "&cluster_size=3",
                })
            else:
                # At the second level also require the word to be globally
                # rare; at deeper levels only exclude already-used keywords.
                if (depth + 1) == 2:
                    include = (key in allowed_keywords
                               and key.lower() not in keywords)
                else:
                    include = key.lower() not in keywords
                if include:
                    keys.append({
                        "text": key,
                        "weight": weight,
                        "link": "/?query=" + keywords + "," + key
                                + "&depth=" + str(depth + 1)
                                + "&cluster=" + cluster
                                + "&delta_cloud=" + unique
                                + "&name=" + name,
                    })

        response.append({
            "name": name,
            "total": total,
            "feature": keys,
        })
    return response


def get_cluster_depth(depth):
    """Clamp *depth* to a valid ``Config.DEPTH`` index, with a minimum of 1.

    :param depth: requested depth (int or numeric string).
    :return: ``len(Config.DEPTH) - 1`` if too large, ``1`` if zero,
        otherwise ``int(depth)`` unchanged.
    """
    depth = int(depth)
    if depth >= len(Config.DEPTH):
        return len(Config.DEPTH) - 1
    if depth == 0:
        return 1
    return depth


def filter_documents(data, filters):
    """Return all hit documents whose value at any ``Config.DEPTH`` field
    equals *filters*.

    :param data: msearch result (same shape as in :func:`generate_keywords`).
    :param filters: value to match against each depth field.
    :return: list of matching raw hit dicts.
    """
    docs = []
    for res in data["responses"]:
        for document in res["hits"]["hits"]:
            for attribute in Config.DEPTH:
                # NOTE(review): indexing Config.DEPTH with its own iterated
                # values only works if Config.DEPTH is a dict keyed by those
                # values; if it is a list this raises TypeError — confirm
                # whether `document["_source"][attribute]` was intended.
                if document["_source"][Config.DEPTH[attribute]] == filters:
                    docs.append(document)
    return docs