# textnavi/helper/QueryParser.py
from collections import Counter
import string
import re
import random
# Translation table deleting every ASCII punctuation character.
punctuation = str.maketrans('', '', string.punctuation)
# Translation table deleting every decimal digit character.
digits = str.maketrans('', '', string.digits)
# Matches a word of 1-3 word-characters plus any preceding non-word chars,
# used to strip short/noise words from title text.
shortword = re.compile(r'\W*\b\w{1,3}\b')
# Multi-part query separator token — not used in this part of the file;
# presumably consumed by callers elsewhere (TODO confirm).
SEPARATOR = " $$$ "
def generate_clusters(data):
    """Turn an Elasticsearch msearch-style payload into plot-ready clusters.

    Args:
        data: dict with ``data["responses"]``; each response's ``hits`` holds
            ``total`` (int) and ``hits`` (documents with ``_id`` and a
            ``_source`` containing ``title`` and ``discipline``).

    Returns:
        dict mapping 1-based cluster id -> {"X", "Y", "TEXT", "IDS",
        "HOVERS", "SIGN"}: random scatter coordinates, hover titles carrying
        a hidden document-id marker, and annotation labels (``TEXT`` entry is
        the label for annotated points, ``None`` for the rest).
    """
    clusters = {}
    # Number of clusters that contain more than one document; when few do,
    # there is room to show more annotation labels per cluster.
    cluster_size = sum(1 for res in data["responses"] if res["hits"]["total"] > 1)
    annotations_max = 5 if cluster_size < 3 else 1
    for cluster_id, res in enumerate(data["responses"], start=1):
        annotations_counter = 0
        cluster = {"X": [], "Y": [], "TEXT": [], "IDS": [], "HOVERS": [],
                   # SIGN flips/stretches the y-axis so clusters spread both ways.
                   "SIGN": random.choice([-1, 1, -2, 2])}
        clusters[cluster_id] = cluster
        for document in res["hits"]["hits"]:
            # Random scatter position; the spread grows with the cluster id.
            x = random.uniform((cluster_id * 10) + 30, (cluster_id * 30) + 20)
            y = cluster["SIGN"] * random.uniform((cluster_id * 10), (cluster_id * 10) + 100)
            # Hover text: visible title plus a hidden element holding the doc id.
            title = document["_source"]["title"] + " <i style='display:none'>" + document["_id"] + "</i>"
            # Annotation label: discipline when meaningful, else the title.
            cls = document["_source"]["discipline"] if (
                document["_source"]["discipline"] != "Not stated" and cluster_size > 2) else \
                document["_source"]["title"]
            cluster["X"].append(x)
            cluster["Y"].append(y)
            cluster["HOVERS"].append(title)
            cluster["IDS"].append(document["_id"])
            # NOTE(review): choice([1, 7, 9]) % 2 is always truthy (all odd),
            # so effectively the first annotations_max documents get labels.
            # Kept as-is to preserve the RNG call sequence — confirm intent.
            if annotations_counter < annotations_max and (random.choice([1, 7, 9]) % 2):
                annotations_counter += 1
                cluster["TEXT"].append(cls)
            else:
                cluster["TEXT"].append(None)
    return clusters
def generate_clouds(keywords, data, unique, uniq_keys):
    """Build one word-cloud descriptor per response cluster.

    Args:
        keywords: collection of lowercase terms to exclude from the clouds
            (typically the user's current query terms).
        data: msearch-style payload; each hit must carry
            ``_source["title_cleaned"]`` (CamelCase-concatenated title words).
        unique: string flag; with ``uniq_keys`` selects the most-common
            slice ("false"/"1") versus the least-common slice of words.
        uniq_keys: string flag, see ``unique``.

    Returns:
        list of {"name", "total", "feature"} dicts, one per response, where
        "feature" lists {"text", "weight", "link"} cloud entries.
    """
    # Deletion table for punctuation and digits, and a short-word pattern —
    # built once here instead of being rebuilt on every response.
    noise = str.maketrans('', '', string.punctuation + string.digits)
    short = re.compile(r'\W*\b\w{1,3}\b')
    response = []
    cluster_id = 0
    for res in data["responses"]:
        # todo: better split and clean words. expl: TrobriandsCandidate
        # Linear-time join instead of quadratic += concatenation.
        titles = "".join(" " + document["_source"]["title_cleaned"]
                         for document in res["hits"]["hits"])
        # Split CamelCase runs into words, lowercase, then drop short words,
        # punctuation and digits before tokenizing on spaces.
        titles = " ".join(re.findall('[A-Z][^A-Z]*', titles)).lower()
        titles = short.sub('', titles).translate(noise).split(" ")
        freq = Counter(titles)
        keys = []
        if unique == "false" and uniq_keys == "1":
            tmp = dict(freq.most_common()[:5000])  # most frequent words
        else:
            tmp = dict(reversed(freq.most_common()[-50:]))  # rarest words
        for key in tmp:
            if len(key) > 2 and (key.lower() not in keywords):
                # Words appearing in every document of a multi-doc cluster
                # carry no discriminating information — skip them.
                if tmp[key] == res["hits"]["total"] and res["hits"]["total"] > 1:
                    continue
                keys.append({
                    "text": key,
                    "weight": tmp[key],
                    # NOTE(review): `key` is inserted without URL-encoding and
                    # cluster_id here is still the 0-based index (the display
                    # name below is 1-based) — confirm both are intended.
                    "link": "/?query=" + key + "&clustersize=" + str(cluster_id)
                })
        cluster_id = cluster_id + 1
        response.append({
            "name": "Cluster " + str(cluster_id),
            "total": res["hits"]["total"],
            "feature": keys
        })
    return response
def top_documents(data):
    """Pick up to three representative documents for every response cluster.

    Args:
        data: msearch-style payload; each hit needs ``_id`` and
            ``_source["title"]``.

    Returns:
        list of {"total", "docs"} dicts, one per response; "docs" holds at
        most MAX_DOCS entries of {"id", "title"}.
    """
    clusters = []
    MAX_DOCS = 3
    # Last-four-words fingerprints of titles already selected, shared across
    # clusters so near-duplicate titles are listed only once overall.
    # Set instead of list for O(1) membership tests.
    seen_fingerprints = set()
    for res in data["responses"]:
        docs = []
        for document in res["hits"]["hits"]:
            # Bug fix: the original tested `> MAX_DOCS`, which let a fourth
            # document slip through before the break.
            if len(docs) >= MAX_DOCS:
                break
            title = document["_source"]["title"]
            fingerprint = " ".join(title.split(" ")[-4:])
            # Skip very short titles and titles ending in the same words.
            if fingerprint not in seen_fingerprints and len(title) > 15:
                seen_fingerprints.add(fingerprint)
                docs.append({
                    "id": document["_id"],
                    "title": title,
                })
        clusters.append({
            "total": res["hits"]["total"],
            "docs": docs
        })
    return clusters