"""Helpers that turn clustered search results into word-cloud keyword lists."""

from collections import Counter
import re
import string

from controllers import Config

# Translation tables used to strip punctuation and digits from titles.
punctuation = str.maketrans('', '', string.punctuation)
digits = str.maketrans('', '', string.digits)
# Matches words of one to three characters, together with any non-word
# characters directly in front of them, so short noise words can be dropped.
shortword = re.compile(r'\W*\b\w{1,3}\b')
# Marker placed between concatenated titles so they can be counted per group.
SEPARATOR = " $$$ "
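
# A minimal sketch of what the cleaning pipeline above does to one raw
# title. The sample string is illustrative only, not taken from real data:
#
#   raw = "TrobriandsCandidate and 2021"
#   spaced = " ".join(re.findall('[A-Z][^A-Z]*', raw))
#   # spaced == "Trobriands Candidate and 2021"  (camel case split apart)
#   cleaned = shortword.sub('', spaced).translate({**punctuation, **digits})
#   # cleaned == "Trobriands Candidate "  ("and" and "2021" stripped)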


def generate_keywords(data, cluster, depth, unique):
    """Build one word cloud per cluster from a multi-search result.

    ``data`` is expected to look like an Elasticsearch ``_msearch`` response:
    one entry in ``data["responses"]`` per cluster, each hit carrying a
    ``title_cleaned`` field in its ``_source``. ``cluster`` is the cluster
    size (as a string), ``depth`` the current drill-down level, and
    ``unique`` switches between the most common words ("false") and the
    rarest ones.
    """
    response = []
    cluster_id = 0
    for res in data["responses"]:
        # Concatenate all titles of this cluster into a single string.
        titles = ""
        for document in res["hits"]["hits"]:
            titles = titles + " " + document["_source"]["title_cleaned"]
        # todo: better split and clean words. expl: TrobriandsCandidate
        # Split camel-cased tokens apart, then drop short words, punctuation
        # and digits before counting word frequencies.
        titles = " ".join(re.findall('[A-Z][^A-Z]*', titles))
        titles = shortword.sub('', titles).translate({**punctuation, **digits}).split(" ")
        freq = Counter(titles)

        keys = []
        if unique == "false":
            # The most common words, capped at 5000 entries.
            tmp = dict(freq.most_common()[:5000])
        else:
            # The 50 rarest words, least frequent first.
            tmp = dict(reversed(freq.most_common()[-50:]))

        for key in tmp:
            if len(key) > 2:
                keys.append({
                    "text": key,
                    "weight": tmp[key],
                    "link": "/?query=" + key + "&clustersize=" + cluster
                            + "&cluster=" + str(cluster_id)
                            + "&weight=" + str(tmp[key])
                            + "&depth=" + str(int(depth) + 1)
                })

        cluster_id = cluster_id + 1
        response.append({
            "name": "Cluster " + str(cluster_id),
            "total": res["hits"]["total"],
            "feature": keys
        })

    return response
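
# A minimal usage sketch for generate_keywords, assuming the
# Elasticsearch-style response shape described in its docstring. The field
# values are made up for illustration:
#
#   data = {"responses": [
#       {"hits": {"total": 2, "hits": [
#           {"_source": {"title_cleaned": "TrobriandsCandidate Review"}},
#           {"_source": {"title_cleaned": "TrobriandsCandidate Notes"}},
#       ]}},
#   ]}
#   clouds = generate_keywords(data, cluster="3", depth="0", unique="false")
#   # clouds == [{"name": "Cluster 1", "total": 2, "feature": [...]}]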


def generate_keywords_filtered(keywords, data, cluster, depth, unique):
    """Build one word cloud per cluster label at the current depth.

    ``keywords`` is the comma-separated list of terms already selected by
    the user; words already contained in it are never suggested again.
    """
    titles = {}
    keywords_filter = ""
    response = []

    depth = get_cluster_depth(depth)

    # First pass: create one (initially empty) title buffer per group,
    # keyed by the document's cluster label at the current depth.
    for res in data["responses"]:
        for document in res["hits"]["hits"]:
            titles[document["_source"][Config.DEPTH[depth]]] = ""

    # Second pass: concatenate the titles of each group, separated by
    # SEPARATOR so the title count per group can be recovered later, and
    # collect every title into one string for the rarity filter below.
    for res in data["responses"]:
        for document in res["hits"]["hits"]:
            group = document["_source"][Config.DEPTH[depth]]
            titles[group] = titles[group] + document["_source"]["title_cleaned"] + SEPARATOR
            keywords_filter = keywords_filter + " " + document["_source"]["title_cleaned"]

    # Words that occur exactly once across all titles. Note these are the
    # raw tokens, counted before the lower-casing applied to groups below.
    allowed_keywords = Counter(keywords_filter.split(" "))
    allowed_keywords = [x for x in allowed_keywords if allowed_keywords[x] < 2]

    for document in titles:
        # Number of titles in this group (one SEPARATOR per title).
        total = len(titles[document].split(SEPARATOR)) - 1

        titles[document] = shortword.sub('', titles[document]).translate(
            {**punctuation, **digits}).lower().split(" ")

        freq = Counter(titles[document])

        keys = []
        if unique == "false":
            tmp = dict(freq.most_common()[:5000])
        else:
            tmp = dict(reversed(freq.most_common()[-50:]))

        for key in tmp:
            if total < 2:
                # Groups with a single title link straight to the document view.
                keys.append({
                    "text": key,
                    "weight": tmp[key],
                    "link": "/docs?query=" + keywords + "," + key
                            + "&depth=" + str(depth + 1)
                            + "&cluster=" + cluster
                            + "&delta_cloud=" + unique
                            + "&name=" + document
                            + "&cluster_size=3"
                })
            else:
                # At the second depth level only words that occur exactly
                # once overall are offered; already selected keywords are
                # always skipped.
                if (depth + 1) == 2 and key not in allowed_keywords:
                    continue
                if key.lower() in keywords:
                    continue
                keys.append({
                    "text": key,
                    "weight": tmp[key],
                    "link": "/?query=" + keywords + "," + key
                            + "&depth=" + str(depth + 1)
                            + "&cluster=" + cluster
                            + "&delta_cloud=" + unique
                            + "&name=" + document
                })

        response.append({
            "name": document,
            "total": total,
            "feature": keys
        })

    return response
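
# Config.DEPTH is indexed by int above (Config.DEPTH[depth]) and iterated
# by key in filter_documents below, so it is assumed to be an int-keyed
# mapping from depth level to the document field carrying that level's
# cluster label. A purely hypothetical shape, for illustration only:
#
#   DEPTH = {0: "cluster_root", 1: "cluster_level_1", 2: "cluster_level_2"}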


def get_cluster_depth(depth):
    """Clamp the requested depth to the range of levels in Config.DEPTH."""
    if int(depth) >= len(Config.DEPTH):
        return len(Config.DEPTH) - 1
    if int(depth) == 0:
        return 1
    return int(depth)
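
# With the hypothetical three-entry DEPTH mapping sketched above,
# get_cluster_depth would behave as follows:
#
#   get_cluster_depth("0")  # -> 1, depth 0 is promoted to the first level
#   get_cluster_depth("2")  # -> 2, in-range values pass through
#   get_cluster_depth("9")  # -> 2, clamped to the deepest level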


def filter_documents(data, filters):
    """Return every hit whose cluster label at any depth equals ``filters``.

    A document matching on several depth fields is appended once per match.
    """
    docs = []
    for res in data["responses"]:
        for document in res["hits"]["hits"]:
            for attribute in Config.DEPTH:
                if document["_source"][Config.DEPTH[attribute]] == filters:
                    docs.append(document)
    return docs