"""Elasticsearch query helpers: cluster fetching, facet aggregation and document retrieval."""
from elasticsearch import Elasticsearch
|
|
from controllers import Config
|
|
import json
|
|
|
|
# Shared Elasticsearch client used by every helper in this module.
# SECURITY NOTE(review): host, port and credentials are hard-coded in source
# control — move them into configuration/environment variables.
client = Elasticsearch(
    ['54.37.31.100'],
    http_auth=('oaboss', 'master'),
    port=9201,
)
|
|
|
|
# Max hits returned per sub-cluster query. NOTE(review): "SIE" looks like a
# typo for "SIZE"; names kept as-is because other modules may reference them.
RESULT_SIE = 300
# Larger page size used when searching inside clusters (search_query_by_cluster).
RESULT_SIE_CLUSTERED = 2000
|
|
|
|
|
|
def fetch_clusters(size, group, disipline, author, pub_period):
    """Fetch documents for every sub-cluster of a clustering of ``size``.

    Issues a single msearch containing ``size`` sub-queries, one per
    cluster id ``cluster_<size>_<idx>``, each restricted by the optional
    facet filters below.

    Args:
        size: Number of clusters (int or numeric string).
        group: Optional group name; adds a ``group_name`` term filter.
        disipline: Optional discipline term filter (parameter name kept,
            typo and all, for caller compatibility).
        author: Optional author term filter.
        pub_period: Optional ``(start, end)`` pair of ``dd/MM/yyyy`` strings.

    Returns:
        The raw Elasticsearch msearch response dict.
    """
    filters = []
    if group:
        filters.append({"term": {"group_name": group}})
    if disipline:
        filters.append({"term": {"discipline": disipline}})
    if author:
        filters.append({"term": {"author": author}})
    if pub_period and len(pub_period) > 0:
        filters.append({
            "range": {
                "published_at": {
                    "gte": pub_period[0],
                    "lte": pub_period[1],
                    "format": "dd/MM/yyyy",
                }
            }
        })

    queries = []
    for idx in range(int(size)):
        cluster = "cluster_%s_%s" % (size, idx)
        queries.append({
            "_source": ["title", "title_cleaned", "discipline"],
            "size": RESULT_SIE,
            "query": {
                "bool": {
                    "must": [{"term": {"cluster.id": cluster}}],
                    "filter": filters,
                }
            },
        })

    # msearch body is NDJSON: an (empty) header line followed by the query
    # line, per sub-query. str.join replaces the original quadratic
    # string concatenation loop.
    request = ''.join('{} \n%s \n' % json.dumps(q) for q in queries)

    return client.msearch(body=request, index="app", doc_type="document")
|
|
|
|
|
|
def get_facets(keywords, cluster_size, group, discipline, author, pub_period):
    """Return facet counts (group_name / discipline / author) for a keyword search.

    Runs a size-0 search so only the aggregations come back; the optional
    facet arguments narrow the document set being aggregated.
    """
    must_terms = [{"term": {"title": keyword.lower()}} for keyword in keywords]

    filter_clauses = []
    for field, value in (("group_name", group),
                         ("discipline", discipline),
                         ("author", author)):
        if value:
            filter_clauses.append({"term": {field: value}})

    if pub_period and len(pub_period) > 0:
        filter_clauses.append({
            "range": {
                "published_at": {
                    "gte": pub_period[0],
                    "lte": pub_period[1],
                    "format": "dd/MM/yyyy",
                }
            }
        })

    # One bucket aggregation per facet field, keyed by the field name.
    aggregations = {
        field: {"terms": {"field": field}}
        for field in ("group_name", "discipline", "author")
    }

    body = {
        "size": 0,
        "query": {
            "bool": {
                "must": must_terms,
                "filter": filter_clauses,
            }
        },
        "aggregations": aggregations,
    }

    return client.search(body=body, index="app", doc_type="document")
|
|
|
|
|
|
def search_query_filter(keywords, subcluster, size):
    """Search by keywords across all sub-clusters, or within one sub-cluster.

    Args:
        keywords: Iterable of keyword strings, matched lowercased on ``title``.
        subcluster: Optional sub-cluster index. Anything not convertible to
            int selects all ``size`` sub-clusters.
        size: Number of clusters in the clustering (int or numeric string).

    Returns:
        The raw Elasticsearch msearch response dict.
    """
    terms = [{"term": {"title": keyword.lower()}} for keyword in keywords]

    # Narrowed from a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit.
    try:
        subcluster = int(subcluster)
    except (TypeError, ValueError):
        subcluster = None

    # NOTE: `not subcluster` also treats index 0 as "all clusters" —
    # preserved from the original behaviour.
    indices = [subcluster] if subcluster else range(int(size))

    # Single construction path replaces the duplicated query dicts the
    # original built in each branch.
    queries = []
    for idx in indices:
        cluster = "cluster_%s_%s" % (size, idx)
        queries.append({
            "_source": ["title", "title_cleaned"] + list(Config.DEPTH.values()),
            "size": RESULT_SIE,
            "query": {
                "bool": {
                    "must": [{"term": {"cluster.id": cluster}}] + terms
                }
            },
        })

    # NDJSON msearch body: empty header + query per sub-query; join avoids
    # quadratic string concatenation.
    request = ''.join('{} \n%s \n' % json.dumps(q) for q in queries)

    return client.msearch(body=request, index="app", doc_type="document")
|
|
|
|
|
|
def search_query_by_cluster(keywords, size, group, discipline, author, pub_period):
    """Search every sub-cluster for the given keywords, with facet filters.

    Issues one msearch with ``size`` sub-queries (one per cluster id
    ``cluster_<size>_<idx>``), each combining the keyword terms with the
    optional facet filters. Uses the larger RESULT_SIE_CLUSTERED page size.

    Args:
        keywords: Optional iterable of keywords, matched lowercased on ``title``.
        size: Number of clusters (int or numeric string).
        group: Optional ``group_name`` term filter.
        discipline: Optional ``discipline`` term filter.
        author: Optional ``author`` term filter.
        pub_period: Optional ``(start, end)`` pair of ``dd/MM/yyyy`` strings.

    Returns:
        The raw Elasticsearch msearch response dict.
    """
    filters = []
    if group:
        filters.append({"term": {"group_name": group}})
    if discipline:
        filters.append({"term": {"discipline": discipline}})
    if author:
        filters.append({"term": {"author": author}})
    if pub_period and len(pub_period) > 0:
        filters.append({
            "range": {
                "published_at": {
                    "gte": pub_period[0],
                    "lte": pub_period[1],
                    "format": "dd/MM/yyyy",
                }
            }
        })

    terms = []
    if keywords:
        terms = [{"term": {"title": keyword.lower()}} for keyword in keywords]

    queries = []
    for idx in range(int(size)):
        cluster = "cluster_%s_%s" % (size, idx)
        queries.append({
            "_source": ["title", "title_cleaned", "discipline"],
            "size": RESULT_SIE_CLUSTERED,
            "query": {
                "bool": {
                    "must": [{"term": {"cluster.id": cluster}}] + terms,
                    "filter": filters,
                }
            },
        })

    # NDJSON msearch body; join avoids quadratic string concatenation.
    request = ''.join('{} \n%s \n' % json.dumps(q) for q in queries)

    # Removed the stray debug `print(request)` the original left in
    # (it ran after the request had already been sent).
    return client.msearch(body=request, index="app", doc_type="document")
|
|
|
|
|
|
def get(id):
    """Retrieve a single document from the ``app`` index by its id.

    NOTE(review): the parameter shadows the built-in ``id``; the name is
    kept so keyword callers (``get(id=...)``) keep working.
    """
    return client.get(index="app", id=id, doc_type='document')
|