118 lines
3.9 KiB
Python
118 lines
3.9 KiB
Python
from collections import Counter
|
|
import string
|
|
import re
|
|
import random
|
|
|
|
# Translation table that deletes every ASCII punctuation character.
punctuation = {ord(ch): None for ch in string.punctuation}

# Translation table that deletes every decimal digit.
digits = {ord(ch): None for ch in string.digits}

# Matches a word of one to three characters (plus any non-word prefix),
# used to strip short/noise words from concatenated titles.
shortword = re.compile(r'\W*\b\w{1,3}\b')

# Field separator token — NOTE(review): not referenced in this chunk;
# presumably used by callers elsewhere in the project.
SEPARATOR = " $$$ "
|
|
|
|
|
|
def generate_clusters(data):
    """Build scatter-plot data for each cluster in an ES msearch response.

    Each response in ``data["responses"]`` becomes one cluster keyed by a
    1-based integer id.  Every document in the cluster gets a random (x, y)
    position whose range scales with the cluster id, a hover label, its
    document id, and — for at most ``annotations_max`` documents — a text
    annotation (the discipline, or the title as a fallback).

    Args:
        data: Elasticsearch msearch-style dict with a "responses" list;
            each response holds ``hits.total`` and ``hits.hits`` documents
            whose ``_source`` carries "title" and "discipline".

    Returns:
        dict mapping cluster id -> {"X", "Y", "TEXT", "IDS", "HOVERS", "SIGN"}.
        X/Y/TEXT/IDS/HOVERS are parallel lists (TEXT entries are None for
        unannotated documents); SIGN is a random factor flipping/stretching
        the y-axis for visual spread.

    Note:
        Positions are random on every call — output is not deterministic.
    """

    clusters = {}

    cluster_id = 0

    cluster_size = 0

    annotations_max = 1

    # Count how many clusters actually contain more than one document.
    for res in data["responses"]:

        if res["hits"]["total"] > 1:

            cluster_size = cluster_size + 1

    # With few multi-document clusters, allow more annotations per cluster
    # so the plot does not look empty.
    if cluster_size < 3:

        annotations_max = 5

    for res in data["responses"]:

        annotations_counter = 0

        cluster_id = cluster_id + 1

        clusters[cluster_id] = {}

        clusters[cluster_id] = {"X": [], "Y": [], "TEXT": [], "IDS": [], "HOVERS": [],
                                "SIGN": random.choice([-1, 1, -2, 2])}

        for document in res["hits"]["hits"]:

            # x/y ranges grow with the cluster id so clusters spread apart.
            x = random.uniform((cluster_id * 10) + 30, (cluster_id * 30) + 20)

            y = clusters[cluster_id]["SIGN"] * random.uniform((cluster_id * 10), (cluster_id * 10) + 100)

            # Hidden <i> tag smuggles the document id into the hover label.
            title = document["_source"]["title"] + " <i style='display:none'>" + document["_id"] + "</i>"

            # Annotation text: the discipline when stated (and there are
            # enough multi-document clusters), otherwise the raw title.
            cls = document["_source"]["discipline"] if (
                document["_source"]["discipline"] != "Not stated" and cluster_size > 2) else \
                document["_source"]["title"]

            clusters[cluster_id]["X"].append(x)

            clusters[cluster_id]["Y"].append(y)

            clusters[cluster_id]["HOVERS"].append(title)

            clusters[cluster_id]["IDS"].append(document["_id"])

            # NOTE(review): 1, 7 and 9 are all odd, so the modulo test is
            # always true — the first annotations_max documents are always
            # annotated. Confirm whether a coin flip was intended.
            if annotations_counter < annotations_max and (random.choice([1, 7, 9]) % 2):

                annotations_counter = annotations_counter + 1

                clusters[cluster_id]["TEXT"].append(cls)

            else:

                clusters[cluster_id]["TEXT"].append(None)

    return clusters
|
|
|
|
|
|
def generate_clouds(keywords, data, unique, uniq_keys):
    """Turn each msearch response into a word-cloud feature list.

    Concatenates the cleaned titles of every hit in a cluster, re-splits
    them on capitalised runs, strips short words / punctuation / digits,
    and ranks the surviving words by frequency.

    Args:
        keywords: collection of already-searched words to exclude
            (membership is tested on the lowercased word).
        data: Elasticsearch msearch-style dict with a "responses" list.
        unique: string flag; "false" (with uniq_keys "1") selects the
            most-common ranking, anything else the rarest-words ranking.
        uniq_keys: string flag, see above.

    Returns:
        A list with one dict per cluster: {"name", "total", "feature"},
        where "feature" holds {"text", "weight", "link"} entries.
    """
    response = []
    cluster_id = 0

    for res in data["responses"]:
        total = res["hits"]["total"]

        # Glue every cleaned title together, each prefixed with a space.
        joined = "".join(
            " " + hit["_source"]["title_cleaned"] for hit in res["hits"]["hits"]
        )

        # todo: better split and clean words. expl: TrobriandsCandidate
        joined = " ".join(re.findall('[A-Z][^A-Z]*', joined)).lower()
        words = shortword.sub('', joined).translate({**punctuation, **digits}).split(" ")
        freq = Counter(words)

        # Either the 5000 most frequent words, or the 50 rarest words
        # (rarest first), depending on the caller's flags.
        if unique == "false" and uniq_keys == "1":
            selected = dict(freq.most_common()[:5000])
        else:
            selected = dict(reversed(freq.most_common()[-50:]))

        features = []
        for word, count in selected.items():
            # Drop very short words and words the user already searched for.
            if len(word) <= 2 or word.lower() in keywords:
                continue
            # Drop words present in every document of a multi-doc cluster —
            # they carry no discriminating information.
            if count == total and total > 1:
                continue
            features.append({
                "text": word,
                "weight": count,
                # NOTE(review): the link uses the 0-based cluster index while
                # the display name below is 1-based — confirm this is intended.
                "link": "/?query=" + word + "&clustersize=" + str(cluster_id)
            })

        cluster_id = cluster_id + 1
        response.append({
            "name": "Cluster " + str(cluster_id),
            "total": total,
            "feature": features
        })

    return response
|
|
|
|
|
|
def top_documents(data):
    """Pick up to three representative documents for every cluster.

    Walks each response's hits in order and keeps the first MAX_DOCS
    documents whose title is reasonably long and whose trailing four
    words have not been seen before.  The seen-suffix set is shared
    across clusters, so near-identical titles are de-duplicated both
    within and between clusters.

    Args:
        data: Elasticsearch msearch-style dict with a "responses" list;
            each hit's ``_source`` must carry a "title".

    Returns:
        A list with one dict per cluster: {"total", "docs"} where "docs"
        holds at most MAX_DOCS {"id", "title"} entries.
    """
    MAX_DOCS = 3
    MIN_TITLE_LEN = 15
    clusters = []
    # Trailing-four-word title keys already emitted (set for O(1) lookup;
    # the original used a list, which was O(n) per membership test).
    seen_suffixes = set()

    for res in data["responses"]:
        docs = []
        for document in res["hits"]["hits"]:
            # Stop once this cluster has MAX_DOCS documents.
            # (Bug fix: the original tested `> MAX_DOCS`, which let a
            # fourth document through.)
            if len(docs) >= MAX_DOCS:
                break

            title = document["_source"]["title"]
            # De-duplicate on the last four words of the title.
            suffix = " ".join(title.split(" ")[-4:])
            if suffix not in seen_suffixes and len(title) > MIN_TITLE_LEN:
                seen_suffixes.add(suffix)
                docs.append({
                    "id": document["_id"],
                    "title": title,
                })

        clusters.append({
            "total": res["hits"]["total"],
            "docs": docs
        })

    return clusters
|