Julian M. Kunkel 2019-05-14 17:19:32 +01:00
commit a92a0810f4
4 changed files with 179 additions and 0 deletions

2
.gitignore vendored

@@ -1 +1,3 @@
*.pyc
.idea/
indexer/data/

24
indexer/README 100644

@@ -0,0 +1,24 @@
Author: Imad Hamoumi

1- Put your data into the data/ directory (indexer/data/ when running from the repository root).
2- Start the script with python run.py
3- Follow the instructions.

Note:
CSV:
+ Only two file extensions are currently supported. The first is .csv, which is read using pandas.
+ You have to provide the name of the column from which the script can read the text data.
PDF:
+ In some cases, reading a PDF file is not permitted.
+ Some PDF files are not well encoded.

You can add your own training model to the pipeline or change the cleaning parameters, such as the n-gram size (see the sketch below).
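
A minimal sketch (not part of the shipped script; the document list and parameter values are placeholders) of how the scikit-learn pipeline used in run.py could be customised, e.g. with larger n-grams, a capped vocabulary, or a different number of clusters:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

docs = ["storage benchmark results", "parallel file system tuning",
        "weather model output", "climate simulation data"]  # toy stand-in documents

text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', lowercase=True,
                             ngram_range=(1, 3),    # up to trigrams instead of bigrams
                             max_features=20000)),  # cap the vocabulary size
    ('tfidf', TfidfTransformer()),
    ('clf', KMeans(n_clusters=2, random_state=0)),  # swap in another clustering model here
])
labels = text_clf.fit_predict(docs)  # one cluster label per document
print(labels)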

153
indexer/run.py 100644

@@ -0,0 +1,153 @@
from __future__ import print_function, unicode_literals
from PyInquirer import prompt
import os
import PyPDF2
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from elasticsearch import Elasticsearch
import time
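# Configuration: directory holding the input files, a log of files that could
# not be read, and the DataFrame that the selected reader fills below.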
DATA_DIR = "./indexer/data"
FILE_READER_PROTOCOL = {}
df = None
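# ANSI escape sequences for coloured console output.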
class MC:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
print(MC.OKBLUE + MC.BOLD + "1. Step: reading textual data ..." + MC.ENDC)
questions = [
    {
        'type': 'input',
        'name': 'extension',
        'message': 'Select one of these extensions [.csv, .pdf]',
    }
]
answers = prompt(questions)
documents = {}
if answers["extension"] == ".pdf":
for file_name in os.listdir(DATA_DIR)[:2]:
if file_name.endswith(answers["extension"]):
file = open(DATA_DIR + "/" + file_name, 'rb')
doc = ""
try:
fileReader = PyPDF2.PdfFileReader(file)
info = fileReader.getDocumentInfo()
for page_number in range(1, fileReader.getNumPages() + 1):
page = fileReader.getPage(page_number)
doc = doc + page.extractText() + " "
except:
FILE_READER_PROTOCOL[file_name] = "file can not be read"
if len(doc) > 0:
documents[file_name] = doc
else:
print(MC.WARNING + "File (" + file_name + ") is ignored." + MC.ENDC)
df = pd.DataFrame.from_dict(documents, columns=['data'])
if answers["extension"] == ".csv":
questions2 = [{
'type': 'input',
'name': 'file_name',
'message': 'Provide the name of the file',
},
{
'type': 'input',
'name': 'column_name',
'message': 'Provide the name column that contains text data',
}]
answers2 = prompt(questions2)
df = pd.read_csv(DATA_DIR + "/" + answers2["file_name"] + ".csv",
sep=';', error_bad_lines=False).rename(columns={
answers2["column_name"]: "data"
})
print(MC.OKBLUE + MC.BOLD + "2. Step: train model ..." + MC.ENDC)
nltk.download('stopwords')
nltk.download('punkt')
en_stop_words = stopwords.words('english')
en_stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
if df is not None:
    training = df["data"]
    # Cluster the documents for every cluster size from 2 to 15 and store the
    # resulting labels in one column per size (CLUSTER_2 ... CLUSTER_15).
    for cluster_size in range(2, 16):
        CNAME = "CLUSTER_" + str(cluster_size)
        text_clf = Pipeline([('vect', CountVectorizer(stop_words=en_stop_words,
                                                      lowercase=True,
                                                      ngram_range=(1, 2),
                                                      max_features=None)),
                             ('tfidf', TfidfTransformer()),
                             ('clf', KMeans(n_clusters=cluster_size, random_state=0))
                             ])
        # fit_predict trains the pipeline and returns the cluster label of each document.
        cluster_labels = text_clf.fit_predict(training)
        df[CNAME] = cluster_labels
print(MC.OKBLUE + MC.BOLD + "3. Step: indexing data ..." + MC.ENDC)
es = Elasticsearch(
    ['54.37.31.100'],
    http_auth=('oaboss', 'master'),
    port=9201,
)
size = df["TITLE"].count()
for index, row in df.ix[:].iterrows():
word_tokens = word_tokenize(row.TITLE)
filtered_title = [w for w in word_tokens if not w in en_stop_words]
    doc = {
        "title": row.data,
        # "title_cleaned": " ".join(filtered_title),
        # "author": str(row.AUTHOR),
        # "tags": str(row.TAGS),
        # "discipline": str(row.DISCIPLINE),
        # "url": str(row.URL),
        # "notes": str(row.NOTES),
        # "group_name": str(row.GROUP_TITLE),
        # "group_description": str(row.GROUP_DESCRIPTION),
        # "group_image": str(row.GROUP_IMG_URL),
        # "language": str(row.LANG),
        # "extras": str(row.EXTRAS),
        # "published_at": row.PUBLICATION_DATE.strftime("%Y%m%d") if str(row.PUBLICATION_DATE) != 'NaT' else "",
        "created_at": time.strftime("%Y%m%d"),
        # One entry per cluster size computed above (CLUSTER_2 ... CLUSTER_15).
        "cluster": [
            {"id": "cluster_" + str(k) + "_" + str(row["CLUSTER_" + str(k)])}
            for k in range(2, 16)
        ],
    }
    try:
        res = es.index(index="app", doc_type='document', body=doc)
    except Exception:
        # Skip documents that fail to index instead of aborting the whole run.
        continue
    # sys.stdout.write('\r Progress: '+ str(index) + ' ' + str(( index / size) * 100) + ' %' + ' Error:' + str(row.AUTHOR))