136 lines
4.7 KiB
136 lines
4.7 KiB
from collections import Counter
import string
from controllers import Config
import re
# Deletion tables for str.translate: every listed code point maps to None,
# which removes that character from the translated string.
punctuation = {ord(char): None for char in string.punctuation}
digits = {ord(char): None for char in string.digits}

# A whole word of one to three word characters, together with any run of
# non-word characters directly in front of it.
shortword = re.compile(r'\W*\b\w{1,3}\b')

# Appended after every title when the titles of one group are concatenated;
# counting its occurrences gives the number of titles in the group.
SEPARATOR = " $$$ "
def generate_keywords(data, cluster, depth, unique):
    """Build one keyword cloud per response of a multi-search result.

    For every entry of ``data["responses"]`` the ``title_cleaned`` values of
    its hits are concatenated, cut into chunks that start at an upper-case
    letter, stripped of short words, punctuation and digits, split on single
    spaces and counted.

    Args:
        data: Mapping with a ``"responses"`` list. Each response must provide
            ``["hits"]["hits"]`` (documents carrying
            ``["_source"]["title_cleaned"]``) and ``["hits"]["total"]``.
        cluster: Written into every link as ``clustersize``; it is
            concatenated directly, so it has to be a ``str``.
        depth: Current depth; ``int(depth) + 1`` is written into every link.
        unique: The string ``"false"`` selects the most frequent words (at
            most 5000). Any other value selects the 50 least frequent words,
            least frequent first.

    Returns:
        A list with one dict per response holding ``"name"`` ("Cluster N",
        counted from 1), ``"total"`` (the response's ``hits.total``) and
        ``"feature"``, a list of ``{"text", "weight", "link"}`` dicts for
        every selected word longer than two characters.
    """
    # NOTE(review): the ``else:`` branch and the ``append({...})`` wrappers
    # were restored from context; please confirm against the repository copy.
    response = []
    for cluster_id, res in enumerate(data["responses"]):
        titles = "".join(
            " " + document["_source"]["title_cleaned"]
            for document in res["hits"]["hits"]
        )
        # todo: better split and clean words. expl: TrobriandsCandidate
        # Only chunks beginning with an upper-case letter survive findall, so
        # any text in front of the first capital letter is discarded.
        titles = " ".join(re.findall('[A-Z][^A-Z]*', titles))
        words = shortword.sub('', titles).translate({**punctuation, **digits}).split(" ")

        ranked = Counter(words).most_common()
        if unique == "false":
            selected = ranked[:5000]
        else:
            # Tail of the descending ranking, flipped so the rarest word leads.
            selected = ranked[-50:][::-1]

        keys = []
        for key, weight in selected:
            # Also drops the empty strings produced by split(" ").
            if len(key) > 2:
                keys.append({
                    "text": key,
                    "weight": weight,
                    "link": "/?query=" + key
                            + "&clustersize=" + cluster
                            + "&cluster=" + str(cluster_id)
                            + "&weight=" + str(weight)
                            + "&depth=" + str(int(depth) + 1),
                })

        response.append({
            # Links carry the zero-based id, the display name is one-based.
            "name": "Cluster " + str(cluster_id + 1),
            "total": res["hits"]["total"],
            "feature": keys,
        })
    return response
def generate_keywords_filtered(keywords, data, cluster, depth, unique):
    """Build one keyword cloud per distinct value of the current depth field.

    Hits from all responses are grouped by
    ``_source[Config.DEPTH[depth]]``, where ``depth`` is first normalised by
    ``get_cluster_depth``. The ``title_cleaned`` values of each group are
    concatenated, stripped of short words, punctuation and digits,
    lower-cased, split on single spaces and counted.

    Args:
        keywords: Keywords already in the query. Prefixed to every link and
            used to skip words via ``key.lower() not in keywords`` (a
            substring test when this is a ``str``).
        data: Mapping with a ``"responses"`` list; each response must provide
            ``["hits"]["hits"]`` with ``["_source"]`` documents.
        cluster: Written into every link as ``cluster`` (must be a ``str``).
        depth: Requested depth, normalised through ``get_cluster_depth``.
        unique: The string ``"false"`` selects the most frequent words (at
            most 5000), any other value the 50 least frequent ones. Also
            written into every link as ``delta_cloud`` (must be a ``str``).

    Returns:
        A list with one dict per group holding ``"name"`` (the group value),
        ``"total"`` (number of titles in the group) and ``"feature"``, a list
        of ``{"text", "weight", "link"}`` dicts.
    """
    # NOTE(review): the ``else:`` branches and ``append({...})`` wrappers were
    # restored from context. In particular the nesting "total < 2 / else
    # (depth + 1 == 2 / else)" is inferred; confirm against the repository.
    depth = get_cluster_depth(depth)

    grouped = {}      # group value -> titles of that group, in hit order
    all_titles = []   # every title, in hit order
    for res in data["responses"]:
        for hit in res["hits"]["hits"]:
            source = hit["_source"]
            grouped.setdefault(source[Config.DEPTH[depth]], []).append(source["title_cleaned"])
            all_titles.append(source["title_cleaned"])

    # Tokens that occur exactly once across all titles. A set keeps the
    # per-word membership test below constant-time.
    # NOTE(review): these tokens keep their original case and punctuation
    # while the cloud words below are cleaned and lower-cased; confirm that
    # this asymmetry is intended.
    token_counts = Counter("".join(" " + title for title in all_titles).split(" "))
    allowed_keywords = {token for token, count in token_counts.items() if count < 2}

    response = []
    for document, group_titles in grouped.items():
        joined = "".join(title + SEPARATOR for title in group_titles)
        # One separator is appended per title, so this is the group size.
        total = joined.count(SEPARATOR)
        words = shortword.sub('', joined).translate({**punctuation, **digits}).lower().split(" ")

        ranked = Counter(words).most_common()
        if unique == "false":
            selected = ranked[:5000]
        else:
            # Tail of the descending ranking, flipped so the rarest word leads.
            selected = ranked[-50:][::-1]

        keys = []
        for key, weight in selected:
            if total < 2:
                # A single-title group links straight to the document list.
                prefix, suffix = "/docs?query=", "&cluster_size=3"
            elif (depth + 1 == 2 and key not in allowed_keywords) or key.lower() in keywords:
                # At the second level only once-occurring tokens are offered;
                # words already contained in the query are never repeated.
                continue
            else:
                prefix, suffix = "/?query=", ""
            keys.append({
                "text": key,
                "weight": weight,
                "link": prefix + keywords + "," + key
                        + "&depth=" + str(depth + 1)
                        + "&cluster=" + cluster
                        + "&delta_cloud=" + unique
                        + "&name=" + document
                        + suffix,
            })

        response.append({
            "name": document,
            "total": total,
            "feature": keys,
        })
    return response
def get_cluster_depth(depth):
    """Normalise a requested depth against the size of ``Config.DEPTH``.

    Args:
        depth: Anything ``int()`` accepts.

    Returns:
        ``len(Config.DEPTH) - 1`` when the requested depth is at or beyond
        ``len(Config.DEPTH)``; otherwise ``1`` when it is ``0``; otherwise
        the depth itself as an ``int``.
    """
    level = int(depth)
    deepest = len(Config.DEPTH) - 1
    if level > deepest:
        return deepest
    return 1 if level == 0 else level
def filter_documents(data, filters):
    """Collect the hits that carry ``filters`` as the value of a depth field.

    Args:
        data: Mapping with a ``"responses"`` list; each response must provide
            ``["hits"]["hits"]`` with ``["_source"]`` documents containing
            every field named in ``Config.DEPTH``.
        filters: Value compared with ``==`` against each depth field.

    Returns:
        A list of matching hits, in response and hit order.
    """
    docs = []
    for res in data["responses"]:
        for document in res["hits"]["hits"]:
            for attribute in Config.DEPTH:
                if document["_source"][Config.DEPTH[attribute]] == filters:
                    # NOTE(review): the body of this ``if`` was missing from
                    # the reviewed text; appending the whole hit is the
                    # presumed original. Confirm against the repository. A
                    # hit matching on several fields is appended once per
                    # matching field.
                    docs.append(document)
    return docs