"""Build scatter-plot clusters, word clouds and top-document lists.

Every public function takes ``data`` shaped as
``{"responses": [{"hits": {"total": <int>, "hits": [<document>, ...]}}, ...]}``
where each document carries ``"_id"`` and a ``"_source"`` mapping.
NOTE(review): this looks like a search-engine multi-search result; confirm
the exact schema against the caller.
"""

import random
import re
import string
from collections import Counter

# ``str.translate`` tables that delete ASCII punctuation / ASCII digits.
punctuation = str.maketrans('', '', string.punctuation)
digits = str.maketrans('', '', string.digits)
# A word of one to three characters together with the non-word run before it.
shortword = re.compile(r'\W*\b\w{1,3}\b')
SEPARATOR = " $$$ "

# Merged once here rather than on every iteration of generate_clouds.
_STRIP_TABLE = {**punctuation, **digits}
# A capital letter followed by everything up to the next capital letter.
_CAPITALISED_CHUNK = re.compile('[A-Z][^A-Z]*')


def generate_clusters(data):
    """Place every document of every response at a random plot position.

    Clusters are numbered from 1 in response order. For each document the
    cluster receives an x/y coordinate, a hover text (the title plus a
    trailing space), the document id and either an annotation label or
    ``None``.

    The label is the document's discipline when that is not "Not stated"
    and more than two responses have more than one hit; otherwise it is the
    title. At most 5 labels per cluster are emitted when fewer than three
    responses have more than one hit, otherwise at most 1.

    Args:
        data: mapping with a "responses" list (see the module docstring);
            each document's "_source" must provide "title" and "discipline".

    Returns:
        dict mapping cluster id to a dict with the parallel lists "X", "Y",
        "TEXT", "IDS", "HOVERS" and the "SIGN" multiplier applied to y.
    """
    responses = data["responses"]
    # Number of responses that contain more than one hit.
    cluster_size = sum(1 for res in responses if res["hits"]["total"] > 1)
    annotations_max = 5 if cluster_size < 3 else 1

    clusters = {}
    for cluster_id, res in enumerate(responses, start=1):
        cluster = {
            "X": [],
            "Y": [],
            "TEXT": [],
            "IDS": [],
            "HOVERS": [],
            "SIGN": random.choice([-1, 1, -2, 2]),
        }
        clusters[cluster_id] = cluster
        annotations_counter = 0

        for document in res["hits"]["hits"]:
            source = document["_source"]
            x = random.uniform((cluster_id * 10) + 30, (cluster_id * 30) + 20)
            y = cluster["SIGN"] * random.uniform(cluster_id * 10,
                                                 (cluster_id * 10) + 100)
            hover = source["title"] + " "
            if source["discipline"] != "Not stated" and cluster_size > 2:
                label = source["discipline"]
            else:
                label = source["title"]

            cluster["X"].append(x)
            cluster["Y"].append(y)
            cluster["HOVERS"].append(hover)
            cluster["IDS"].append(document["_id"])

            # NOTE(review): 1, 7 and 9 are all odd, so the random part of this
            # test is always truthy. The draw is kept so that the sequence of
            # random numbers consumed (and thus seeded output) is unchanged.
            if annotations_counter < annotations_max and (random.choice([1, 7, 9]) % 2):
                annotations_counter += 1
                cluster["TEXT"].append(label)
            else:
                cluster["TEXT"].append(None)
    return clusters


def generate_clouds(keywords, data, unique, uniq_keys):
    """Compute word-cloud entries for every response.

    The "title_cleaned" values of a response are concatenated, cut into
    chunks that start at a capital letter, lower-cased, stripped of words
    with one to three characters, ASCII punctuation and digits, and counted.

    Args:
        keywords: container tested with ``in``; tokens whose lower-case form
            is found in it are left out.
        data: mapping with a "responses" list (see the module docstring);
            each document's "_source" must provide "title_cleaned".
        unique: flag passed as a string. Only the combination
            ``unique == "false"`` and ``uniq_keys == "1"`` selects the (up to
            5000) most frequent tokens, most frequent first.
        uniq_keys: flag passed as a string, see ``unique``. For every other
            combination the 50 least frequent tokens are used, rarest first.

    Returns:
        list with one dict per response: "name" ("Cluster 1", "Cluster 2",
        ...), "total" and "feature", a list of
        ``{"text", "weight", "link"}`` dicts. The link carries the zero-based
        position of the response as ``clustersize``.
    """
    response = []
    for cluster_id, res in enumerate(data["responses"]):
        total = res["hits"]["total"]
        titles = "".join(
            " " + document["_source"]["title_cleaned"]
            for document in res["hits"]["hits"]
        )
        # TODO: better split and clean words, e.g. "TrobriandsCandidate".
        # Anything before the first capital letter is dropped by this split.
        titles = " ".join(_CAPITALISED_CHUNK.findall(titles)).lower()
        words = shortword.sub('', titles).translate(_STRIP_TABLE).split(" ")

        ranked = Counter(words).most_common()
        if unique == "false" and uniq_keys == "1":
            selected = ranked[:5000]
        else:
            selected = ranked[-50:][::-1]

        keys = []
        for key, weight in selected:
            if len(key) <= 2 or key.lower() in keywords:
                continue
            # Skip tokens counted exactly as often as the response has hits
            # (only when the response has more than one hit).
            if total > 1 and weight == total:
                continue
            keys.append({
                "text": key,
                "weight": weight,
                "link": "/?query=" + key + "&clustersize=" + str(cluster_id),
            })

        response.append({
            "name": "Cluster " + str(cluster_id + 1),
            "total": total,
            "feature": keys,
        })
    return response


def top_documents(data):
    """Pick up to three representative documents for every response.

    A document is accepted when its title is longer than 15 characters and
    the last four space-separated words of the title have not been accepted
    before, in this or in any earlier response of the same call.

    The original check ``max_counter > MAX_DOCS`` let a fourth document
    through although the limit is 3; the limit is now enforced exactly.

    Args:
        data: mapping with a "responses" list (see the module docstring);
            each document's "_source" must provide "title".

    Returns:
        list with one ``{"total": ..., "docs": [{"id", "title"}, ...]}`` dict
        per response, in response order.
    """
    max_docs = 3
    # Title tails already used; a set keeps the duplicate test O(1).
    seen_tails = set()
    clusters = []
    for res in data["responses"]:
        docs = []
        for document in res["hits"]["hits"]:
            if len(docs) >= max_docs:
                break
            title = document["_source"]["title"]
            tail = " ".join(title.split(" ")[-4:])
            if tail in seen_tails or len(title) <= 15:
                continue
            seen_tails.add(tail)
            docs.append({
                "id": document["_id"],
                "title": title,
            })
        clusters.append({
            "total": res["hits"]["total"],
            "docs": docs,
        })
    return clusters