118 lines
3.9 KiB
Python
118 lines
3.9 KiB
Python
from collections import Counter
|
|
import string
|
|
import re
|
|
import random
|
|
|
|
# Translation table that deletes every ASCII punctuation character.
punctuation = {ord(ch): None for ch in string.punctuation}

# Translation table that deletes every decimal digit.
digits = {ord(ch): None for ch in string.digits}

# Matches a word of one to three characters (plus any non-word prefix),
# used to strip short/noise words from concatenated titles.
shortword = re.compile(r'\W*\b\w{1,3}\b')

# Field separator token — NOTE(review): not referenced in this chunk;
# presumably used by callers elsewhere in the project.
SEPARATOR = " $$$ "
|
|
|
|
|
|
def generate_clusters(data):
    """Build scatter-plot data for each cluster in an ES msearch response.

    Each response in ``data["responses"]`` becomes one cluster keyed by a
    1-based integer id.  Every document in the cluster gets a random (x, y)
    position whose range scales with the cluster id, a hover label, its
    document id, and — for at most ``annotations_max`` documents — a text
    annotation (the discipline, or the title as a fallback).

    Args:
        data: Elasticsearch msearch-style dict with a "responses" list;
            each response holds ``hits.total`` and ``hits.hits`` documents
            whose ``_source`` carries "title" and "discipline".

    Returns:
        dict mapping cluster id -> {"X", "Y", "TEXT", "IDS", "HOVERS", "SIGN"}.
        X/Y/TEXT/IDS/HOVERS are parallel lists (TEXT entries are None for
        unannotated documents); SIGN is a random factor flipping/stretching
        the y-axis for visual spread.

    Note:
        Positions are random on every call — output is not deterministic.
    """

    clusters = {}

    cluster_id = 0

    cluster_size = 0

    annotations_max = 1

    # Count how many clusters actually contain more than one document.
    for res in data["responses"]:

        if res["hits"]["total"] > 1:

            cluster_size = cluster_size + 1

    # With few multi-document clusters, allow more annotations per cluster
    # so the plot does not look empty.
    if cluster_size < 3:

        annotations_max = 5

    for res in data["responses"]:

        annotations_counter = 0

        cluster_id = cluster_id + 1

        clusters[cluster_id] = {}

        clusters[cluster_id] = {"X": [], "Y": [], "TEXT": [], "IDS": [], "HOVERS": [],
                                "SIGN": random.choice([-1, 1, -2, 2])}

        for document in res["hits"]["hits"]:

            # x/y ranges grow with the cluster id so clusters spread apart.
            x = random.uniform((cluster_id * 10) + 30, (cluster_id * 30) + 20)

            y = clusters[cluster_id]["SIGN"] * random.uniform((cluster_id * 10), (cluster_id * 10) + 100)

            # Hidden <i> tag smuggles the document id into the hover label.
            title = document["_source"]["title"] + " <i style='display:none'>" + document["_id"] + "</i>"

            # Annotation text: the discipline when stated (and there are
            # enough multi-document clusters), otherwise the raw title.
            cls = document["_source"]["discipline"] if (
                document["_source"]["discipline"] != "Not stated" and cluster_size > 2) else \
                document["_source"]["title"]

            clusters[cluster_id]["X"].append(x)

            clusters[cluster_id]["Y"].append(y)

            clusters[cluster_id]["HOVERS"].append(title)

            clusters[cluster_id]["IDS"].append(document["_id"])

            # NOTE(review): 1, 7 and 9 are all odd, so the modulo test is
            # always true — the first annotations_max documents are always
            # annotated. Confirm whether a coin flip was intended.
            if annotations_counter < annotations_max and (random.choice([1, 7, 9]) % 2):

                annotations_counter = annotations_counter + 1

                clusters[cluster_id]["TEXT"].append(cls)

            else:

                clusters[cluster_id]["TEXT"].append(None)

    return clusters
|
|
|
|
|
|
def generate_clouds(keywords, data, unique, uniq_keys):
    """Turn each msearch response into a word-cloud feature list.

    Concatenates the cleaned titles of every hit in a cluster, re-splits
    them on capitalised runs, strips short words / punctuation / digits,
    and ranks the surviving words by frequency.

    Args:
        keywords: collection of already-searched words to exclude
            (membership is tested on the lowercased word).
        data: Elasticsearch msearch-style dict with a "responses" list.
        unique: string flag; "false" (with uniq_keys "1") selects the
            most-common ranking, anything else the rarest-words ranking.
        uniq_keys: string flag, see above.

    Returns:
        A list with one dict per cluster: {"name", "total", "feature"},
        where "feature" holds {"text", "weight", "link"} entries.
    """
    response = []
    cluster_id = 0

    for res in data["responses"]:
        total = res["hits"]["total"]

        # Glue every cleaned title together, each prefixed with a space.
        joined = "".join(
            " " + hit["_source"]["title_cleaned"] for hit in res["hits"]["hits"]
        )

        # todo: better split and clean words. expl: TrobriandsCandidate
        joined = " ".join(re.findall('[A-Z][^A-Z]*', joined)).lower()
        words = shortword.sub('', joined).translate({**punctuation, **digits}).split(" ")
        freq = Counter(words)

        # Either the 5000 most frequent words, or the 50 rarest words
        # (rarest first), depending on the caller's flags.
        if unique == "false" and uniq_keys == "1":
            selected = dict(freq.most_common()[:5000])
        else:
            selected = dict(reversed(freq.most_common()[-50:]))

        features = []
        for word, count in selected.items():
            # Drop very short words and words the user already searched for.
            if len(word) <= 2 or word.lower() in keywords:
                continue
            # Drop words present in every document of a multi-doc cluster —
            # they carry no discriminating information.
            if count == total and total > 1:
                continue
            features.append({
                "text": word,
                "weight": count,
                # NOTE(review): the link uses the 0-based cluster index while
                # the display name below is 1-based — confirm this is intended.
                "link": "/?query=" + word + "&clustersize=" + str(cluster_id)
            })

        cluster_id = cluster_id + 1
        response.append({
            "name": "Cluster " + str(cluster_id),
            "total": total,
            "feature": features
        })

    return response
|
|
|
|
|
|
def top_documents(data):
    """Pick up to three representative documents for every cluster.

    Walks each response's hits in order and keeps the first MAX_DOCS
    documents whose title is reasonably long and whose trailing four
    words have not been seen before.  The seen-suffix set is shared
    across clusters, so near-identical titles are de-duplicated both
    within and between clusters.

    Args:
        data: Elasticsearch msearch-style dict with a "responses" list;
            each hit's ``_source`` must carry a "title".

    Returns:
        A list with one dict per cluster: {"total", "docs"} where "docs"
        holds at most MAX_DOCS {"id", "title"} entries.
    """
    MAX_DOCS = 3
    MIN_TITLE_LEN = 15
    clusters = []
    # Trailing-four-word title keys already emitted (set for O(1) lookup;
    # the original used a list, which was O(n) per membership test).
    seen_suffixes = set()

    for res in data["responses"]:
        docs = []
        for document in res["hits"]["hits"]:
            # Stop once this cluster has MAX_DOCS documents.
            # (Bug fix: the original tested `> MAX_DOCS`, which let a
            # fourth document through.)
            if len(docs) >= MAX_DOCS:
                break

            title = document["_source"]["title"]
            # De-duplicate on the last four words of the title.
            suffix = " ".join(title.split(" ")[-4:])
            if suffix not in seen_suffixes and len(title) > MIN_TITLE_LEN:
                seen_suffixes.add(suffix)
                docs.append({
                    "id": document["_id"],
                    "title": title,
                })

        clusters.append({
            "total": res["hits"]["total"],
            "docs": docs
        })

    return clusters
|