"""Helpers that turn clustered search results into word-cloud keyword lists."""

from collections import Counter
import re
import string

from controllers import Config

# Translation tables used to strip punctuation and digits from titles.
punctuation = str.maketrans('', '', string.punctuation)
digits = str.maketrans('', '', string.digits)
# Matches words of one to three characters, together with any non-word
# characters directly in front of them, so short noise words can be dropped.
shortword = re.compile(r'\W*\b\w{1,3}\b')
# Marker placed between concatenated titles so they can be counted per group.
SEPARATOR = " $$$ "
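
# A minimal sketch of what the cleaning pipeline above does to one raw
# title. The sample string is illustrative only, not taken from real data:
#
#   raw = "TrobriandsCandidate and 2021"
#   spaced = " ".join(re.findall('[A-Z][^A-Z]*', raw))
#   # spaced == "Trobriands Candidate and 2021"  (camel case split apart)
#   cleaned = shortword.sub('', spaced).translate({**punctuation, **digits})
#   # cleaned == "Trobriands Candidate "  ("and" and "2021" stripped)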


def generate_keywords(data, cluster, depth, unique):
    """Build one word cloud per cluster from a multi-search result.

    ``data`` is expected to look like an Elasticsearch ``_msearch`` response:
    one entry in ``data["responses"]`` per cluster, each hit carrying a
    ``title_cleaned`` field in its ``_source``. ``cluster`` is the cluster
    size (as a string), ``depth`` the current drill-down level, and
    ``unique`` switches between the most common words ("false") and the
    rarest ones.
    """
    response = []
    cluster_id = 0
    for res in data["responses"]:
        # Concatenate all titles of this cluster into a single string.
        titles = ""
        for document in res["hits"]["hits"]:
            titles = titles + " " + document["_source"]["title_cleaned"]
        # todo: better split and clean words. expl: TrobriandsCandidate
        # Split camel-cased tokens apart, then drop short words, punctuation
        # and digits before counting word frequencies.
        titles = " ".join(re.findall('[A-Z][^A-Z]*', titles))
        titles = shortword.sub('', titles).translate({**punctuation, **digits}).split(" ")
        freq = Counter(titles)

        keys = []
        if unique == "false":
            # The most common words, capped at 5000 entries.
            tmp = dict(freq.most_common()[:5000])
        else:
            # The 50 rarest words, least frequent first.
            tmp = dict(reversed(freq.most_common()[-50:]))

        for key in tmp:
            if len(key) > 2:
                keys.append({
                    "text": key,
                    "weight": tmp[key],
                    "link": "/?query=" + key + "&clustersize=" + cluster
                            + "&cluster=" + str(cluster_id)
                            + "&weight=" + str(tmp[key])
                            + "&depth=" + str(int(depth) + 1)
                })

        cluster_id = cluster_id + 1
        response.append({
            "name": "Cluster " + str(cluster_id),
            "total": res["hits"]["total"],
            "feature": keys
        })

    return response
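
# A minimal usage sketch for generate_keywords, assuming the
# Elasticsearch-style response shape described in its docstring. The field
# values are made up for illustration:
#
#   data = {"responses": [
#       {"hits": {"total": 2, "hits": [
#           {"_source": {"title_cleaned": "TrobriandsCandidate Review"}},
#           {"_source": {"title_cleaned": "TrobriandsCandidate Notes"}},
#       ]}},
#   ]}
#   clouds = generate_keywords(data, cluster="3", depth="0", unique="false")
#   # clouds == [{"name": "Cluster 1", "total": 2, "feature": [...]}]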


def generate_keywords_filtered(keywords, data, cluster, depth, unique):
    """Build one word cloud per cluster label at the current depth.

    ``keywords`` is the comma-separated list of terms already selected by
    the user; words already contained in it are never suggested again.
    """
    titles = {}
    keywords_filter = ""
    response = []

    depth = get_cluster_depth(depth)

    # First pass: create one (initially empty) title buffer per group,
    # keyed by the document's cluster label at the current depth.
    for res in data["responses"]:
        for document in res["hits"]["hits"]:
            titles[document["_source"][Config.DEPTH[depth]]] = ""

    # Second pass: concatenate the titles of each group, separated by
    # SEPARATOR so the title count per group can be recovered later, and
    # collect every title into one string for the rarity filter below.
    for res in data["responses"]:
        for document in res["hits"]["hits"]:
            group = document["_source"][Config.DEPTH[depth]]
            titles[group] = titles[group] + document["_source"]["title_cleaned"] + SEPARATOR
            keywords_filter = keywords_filter + " " + document["_source"]["title_cleaned"]

    # Words that occur exactly once across all titles. Note these are the
    # raw tokens, counted before the lower-casing applied to groups below.
    allowed_keywords = Counter(keywords_filter.split(" "))
    allowed_keywords = [x for x in allowed_keywords if allowed_keywords[x] < 2]

    for document in titles:
        # Number of titles in this group (one SEPARATOR per title).
        total = len(titles[document].split(SEPARATOR)) - 1

        titles[document] = shortword.sub('', titles[document]).translate(
            {**punctuation, **digits}).lower().split(" ")

        freq = Counter(titles[document])

        keys = []
        if unique == "false":
            tmp = dict(freq.most_common()[:5000])
        else:
            tmp = dict(reversed(freq.most_common()[-50:]))

        for key in tmp:
            if total < 2:
                # Groups with a single title link straight to the document view.
                keys.append({
                    "text": key,
                    "weight": tmp[key],
                    "link": "/docs?query=" + keywords + "," + key
                            + "&depth=" + str(depth + 1)
                            + "&cluster=" + cluster
                            + "&delta_cloud=" + unique
                            + "&name=" + document
                            + "&cluster_size=3"
                })
            else:
                # At the second depth level only words that occur exactly
                # once overall are offered; already selected keywords are
                # always skipped.
                if (depth + 1) == 2 and key not in allowed_keywords:
                    continue
                if key.lower() in keywords:
                    continue
                keys.append({
                    "text": key,
                    "weight": tmp[key],
                    "link": "/?query=" + keywords + "," + key
                            + "&depth=" + str(depth + 1)
                            + "&cluster=" + cluster
                            + "&delta_cloud=" + unique
                            + "&name=" + document
                })

        response.append({
            "name": document,
            "total": total,
            "feature": keys
        })

    return response
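
# Config.DEPTH is indexed by int above (Config.DEPTH[depth]) and iterated
# by key in filter_documents below, so it is assumed to be an int-keyed
# mapping from depth level to the document field carrying that level's
# cluster label. A purely hypothetical shape, for illustration only:
#
#   DEPTH = {0: "cluster_root", 1: "cluster_level_1", 2: "cluster_level_2"}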


def get_cluster_depth(depth):
    """Clamp the requested depth to the range of levels in Config.DEPTH."""
    if int(depth) >= len(Config.DEPTH):
        return len(Config.DEPTH) - 1
    if int(depth) == 0:
        return 1
    return int(depth)
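
# With the hypothetical three-entry DEPTH mapping sketched above,
# get_cluster_depth would behave as follows:
#
#   get_cluster_depth("0")  # -> 1, depth 0 is promoted to the first level
#   get_cluster_depth("2")  # -> 2, in-range values pass through
#   get_cluster_depth("9")  # -> 2, clamped to the deepest level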


def filter_documents(data, filters):
    """Return every hit whose cluster label at any depth equals ``filters``.

    A document matching on several depth fields is appended once per match.
    """
    docs = []
    for res in data["responses"]:
        for document in res["hits"]["hits"]:
            for attribute in Config.DEPTH:
                if document["_source"][Config.DEPTH[attribute]] == filters:
                    docs.append(document)
    return docs