136 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			136 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from collections import Counter
 | |
| import string
 | |
| from controllers import Config
 | |
| import re
 | |
| 
 | |
# Marker placed after each title when several titles are concatenated into
# a single string.
SEPARATOR = " $$$ "

# Matches a word of one to three word-characters together with any run of
# non-word characters directly in front of it.
shortword = re.compile(r'\W*\b\w{1,3}\b')

# Tables for str.translate(): every listed character maps to None, i.e. it
# is deleted from the translated string.
punctuation = {ord(char): None for char in string.punctuation}
digits = {ord(char): None for char in string.digits}
 | |
| 
 | |
| 
 | |
def generate_keywords(data, cluster, depth, unique):
    """Build one keyword cloud per entry of ``data["responses"]``.

    For every response the ``title_cleaned`` values of its hits are joined,
    a space is inserted in front of each uppercase letter (so run-together
    words such as "TrobriandsCandidate" are split), short words, punctuation
    and digits are removed, and the remaining words are counted.

    Args:
        data: Mapping with a ``"responses"`` list; each response is read as
            ``res["hits"]["hits"][i]["_source"]["title_cleaned"]`` and
            ``res["hits"]["total"]`` (looks like an Elasticsearch
            multi-search result - confirm against the caller).
        cluster: String placed in the ``clustersize`` query parameter of
            every generated link.
        depth: Value convertible with ``int()``; links carry ``depth + 1``.
        unique: String flag. ``"false"`` keeps up to 5000 of the most
            frequent words; any other value keeps the 50 least frequent
            words, rarest first.

    Returns:
        A list with one dict per response, holding ``"name"``
        ("Cluster 1", "Cluster 2", ...), ``"total"`` and ``"feature"``
        (a list of ``{"text", "weight", "link"}`` dicts).
    """
    strip_table = {**punctuation, **digits}
    response = []

    for cluster_id, res in enumerate(data["responses"]):
        titles = " ".join(
            document["_source"]["title_cleaned"]
            for document in res["hits"]["hits"]
        )
        # todo: better split and clean words. expl: TrobriandsCandidate
        # Put a space in front of every capital letter. The previous
        # re.findall('[A-Z][^A-Z]*', ...) approach produced the same split
        # but threw away all text before the first capital letter.
        titles = re.sub(r'([A-Z])', r' \1', titles)
        words = shortword.sub('', titles).translate(strip_table).split(" ")
        freq = Counter(words)

        if unique == "false":
            tmp = dict(freq.most_common()[:5000])
        else:
            # Tail of most_common() = least frequent words; reversed so the
            # rarest word comes first.
            tmp = dict(reversed(freq.most_common()[-50:]))

        # Words of one or two characters (including the empty strings that
        # split(" ") yields for consecutive spaces) are not offered.
        keys = [
            {
                "text": key,
                "weight": weight,
                "link": "/?query=" + key
                        + "&clustersize=" + cluster
                        + "&cluster=" + str(cluster_id)
                        + "&weight=" + str(weight)
                        + "&depth=" + str(int(depth) + 1),
            }
            for key, weight in tmp.items()
            if len(key) > 2
        ]

        # Links use the zero-based cluster index, display names are one-based.
        response.append({
            "name": "Cluster " + str(cluster_id + 1),
            "total": res["hits"]["total"],
            "feature": keys
        })

    return response
 | |
| 
 | |
| 
 | |
def generate_keywords_filtered(keywords, data, cluster, depth, unique):
    """Build one keyword cloud per distinct value of the current depth field.

    Hits from all responses are grouped by
    ``document["_source"][Config.DEPTH[depth]]`` (``depth`` is first
    normalised with ``get_cluster_depth``). For each group the titles are
    cleaned (short words, punctuation and digits removed, lowercased) and
    the remaining words are counted.

    Args:
        keywords: String of the keywords already selected; it is prepended
            to every link as ``keywords + "," + key`` and used for a
            substring check that hides words already contained in it.
        data: Mapping with a ``"responses"`` list whose entries are read as
            ``res["hits"]["hits"][i]["_source"]`` (looks like an
            Elasticsearch multi-search result - confirm against the caller).
        cluster: String placed in the ``cluster`` query parameter.
        depth: Value convertible with ``int()``; links carry the normalised
            depth plus one.
        unique: String flag, also echoed as ``delta_cloud`` in the links.
            ``"false"`` keeps up to 5000 of the most frequent words; any
            other value keeps the 50 least frequent words, rarest first.

    Returns:
        A list with one dict per group, in order of first appearance:
        ``"name"`` (the group value), ``"total"`` (number of titles in the
        group) and ``"feature"`` (list of ``{"text", "weight", "link"}``).
    """
    depth = get_cluster_depth(depth)
    group_field = Config.DEPTH[depth]

    # Single pass: collect the titles per group and, in parallel, all titles
    # for the corpus-wide word count below.
    grouped_titles = {}
    all_titles = []
    for res in data["responses"]:
        for document in res["hits"]["hits"]:
            source = document["_source"]
            title = source["title_cleaned"]
            grouped_titles.setdefault(source[group_field], []).append(title)
            all_titles.append(title)

    # Words that occur exactly once across all raw titles. A set keeps the
    # membership test in the loop below constant-time.
    # NOTE(review): these words keep their original case and punctuation,
    # while the keys tested against them are cleaned and lowercased, so
    # capitalised words never match - confirm whether that is intended.
    word_counts = Counter(" ".join(all_titles).split(" "))
    allowed_keywords = {word for word, count in word_counts.items() if count < 2}

    strip_table = {**punctuation, **digits}
    at_second_level = (depth + 1) == 2
    response = []

    for document, group in grouped_titles.items():
        # Count the titles directly instead of counting SEPARATOR
        # occurrences, which miscounts when a title contains the separator.
        total = len(group)

        text = "".join(title + SEPARATOR for title in group)
        cleaned = shortword.sub('', text).translate(strip_table).lower()
        # Consecutive spaces make split(" ") yield empty strings; drop them
        # so an empty keyword is never emitted.
        freq = Counter(word for word in cleaned.split(" ") if word)

        if unique == "false":
            tmp = dict(freq.most_common()[:5000])
        else:
            # Tail of most_common() = least frequent words; reversed so the
            # rarest word comes first.
            tmp = dict(reversed(freq.most_common()[-50:]))

        keys = []
        for key, weight in tmp.items():
            if total < 2:
                # A group with a single title links to the document view.
                prefix, tail = "/docs?query=", "&cluster_size=3"
            elif key in keywords or (at_second_level and key not in allowed_keywords):
                # `key` is already lowercase. Skip words contained in the
                # current query string and, at link depth 2, words that are
                # not on the allow-list.
                # NOTE(review): `key in keywords` is a substring test on the
                # whole query string - confirm that is the intent.
                continue
            else:
                prefix, tail = "/?query=", ""

            keys.append({
                "text": key,
                "weight": weight,
                "link": prefix + keywords + "," + key
                        + "&depth=" + str(depth + 1)
                        + "&cluster=" + cluster
                        + "&delta_cloud=" + unique
                        + "&name=" + document
                        + tail
            })

        response.append({
            "name": document,
            "total": total,
            "feature": keys
        })

    return response
 | |
| 
 | |
| 
 | |
def get_cluster_depth(depth):
    """Normalise ``depth`` to an integer level usable with ``Config.DEPTH``.

    ``depth`` is converted with ``int()``. A value at or beyond
    ``len(Config.DEPTH)`` is capped to ``len(Config.DEPTH) - 1``, a value of
    0 is promoted to 1, and every other value is returned as converted.
    """
    level = int(depth)
    highest = len(Config.DEPTH) - 1

    # The cap is checked first, so it wins over the 0 -> 1 promotion.
    if level > highest:
        return highest
    return 1 if level == 0 else level
 | |
| 
 | |
| 
 | |
def filter_documents(data, filters):
    """Return the hits that have ``filters`` as the value of a depth field.

    Every hit in ``data["responses"][i]["hits"]["hits"]`` is checked: for
    each ``attribute`` obtained by iterating ``Config.DEPTH``, the value
    ``document["_source"][Config.DEPTH[attribute]]`` is compared with
    ``filters`` (``Config.DEPTH`` is presumably a mapping from level to
    field name - confirm against ``controllers.Config``).

    A hit is returned once even when several of its depth fields equal
    ``filters``; previously it was appended once per matching field.

    Args:
        data: Mapping with a ``"responses"`` list of search results.
        filters: Value to compare the depth fields against.

    Returns:
        The matching hit dicts, in their original order.
    """
    docs = []
    for res in data["responses"]:
        for document in res["hits"]["hits"]:
            source = document["_source"]
            # any() stops at the first matching field, so each document is
            # added at most once.
            if any(source[Config.DEPTH[attribute]] == filters
                   for attribute in Config.DEPTH):
                docs.append(document)
    return docs
 |