4 changed files with 179 additions and 0 deletions
@@ -1 +1,3 @@
*.pyc
.idea/
indexer/data/
@@ -0,0 +1,24 @@
Author: Imad Hamoumi

1- Put your data into the directory /data.
2- Start the script with python run.py
3- Follow the instructions.

Note:
CSV:
+ Only two extensions are currently supported. The first is .csv, which is read using pandas.
+ You have to provide the name of the column from which the script can read the text data (see the sketch after these notes).

PDF:
+ In some cases, reading a PDF file is not allowed (for example, when the file is encrypted).
+ Some PDF files are not well encoded.

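For illustration, a minimal sketch of what the CSV path does, assuming a hypothetical file indexer/data/articles.csv whose text sits in a column named "abstract" (both names are invented for this example); run.py reads with sep=';' and renames the chosen column to "data":

    import pandas as pd

    # Hypothetical file and column names, for illustration only.
    df = pd.read_csv("./indexer/data/articles.csv", sep=';')
    df = df.rename(columns={"abstract": "data"})
    print(df["data"].head())
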
You can add your own training model to the pipeline or change the cleaning parameters, such as the n-gram size. A minimal sketch follows.

@@ -0,0 +1,153 @@
from __future__ import print_function, unicode_literals
from PyInquirer import prompt
import os
import PyPDF2
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from elasticsearch import Elasticsearch
import time

DATA_DIR = "./indexer/data"
FILE_READER_PROTOCOL = {}
df = None

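# ANSI escape sequences for coloring terminal output.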
class MC:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


print(MC.OKBLUE + MC.BOLD + "1. Step: reading textual data ..." + MC.ENDC)

questions = [
    {
        'type': 'input',
        'name': 'extension',
        'message': 'Select one of these extensions [.csv, .pdf]',
    }
]
answers = prompt(questions)

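# Step 1: collect the raw text of each input; PDF and CSV are handled separately.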
documents = {}
if answers["extension"] == ".pdf":
    for file_name in os.listdir(DATA_DIR):
        if file_name.endswith(answers["extension"]):
            doc = ""
            with open(DATA_DIR + "/" + file_name, 'rb') as file:
                try:
                    fileReader = PyPDF2.PdfFileReader(file)
                    info = fileReader.getDocumentInfo()
                    # PyPDF2 pages are zero-indexed.
                    for page_number in range(fileReader.getNumPages()):
                        page = fileReader.getPage(page_number)
                        doc = doc + page.extractText() + " "
                except Exception:
                    FILE_READER_PROTOCOL[file_name] = "file cannot be read"
            if len(doc) > 0:
                documents[file_name] = doc
            else:
                print(MC.WARNING + "File (" + file_name + ") is ignored." + MC.ENDC)
    # orient='index' makes each file name a row with its text in the "data" column.
    df = pd.DataFrame.from_dict(documents, orient='index', columns=['data'])
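
# CSV input: ask for the file and the text column, then normalize that column's name to "data".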
if answers["extension"] == ".csv":
    questions2 = [{
        'type': 'input',
        'name': 'file_name',
        'message': 'Provide the name of the file (without the .csv extension)',
    },
    {
        'type': 'input',
        'name': 'column_name',
        'message': 'Provide the name of the column that contains the text data',
    }]

    answers2 = prompt(questions2)
    df = pd.read_csv(DATA_DIR + "/" + answers2["file_name"] + ".csv",
                     sep=';', error_bad_lines=False).rename(columns={
                         answers2["column_name"]: "data"
                     })

print(MC.OKBLUE + MC.BOLD + "2. Step: train model ..." + MC.ENDC)

nltk.download('stopwords')
nltk.download('punkt')
en_stop_words = stopwords.words('english')
en_stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# A DataFrame has no unambiguous truth value, so compare against None explicitly.
if df is None:
    raise SystemExit(MC.FAIL + "No data could be read." + MC.ENDC)
training = df["data"]

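# Step 2: fit one KMeans clustering per candidate cluster count (2 to 15)
# and store each labeling as a new DataFrame column.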
for cluster_size in range(2, 16):
    CNAME = "CLUSTER_" + str(cluster_size)
    text_clf = Pipeline([('vect', CountVectorizer(stop_words=en_stop_words,
                                                  lowercase=True,
                                                  ngram_range=(1, 2),
                                                  max_features=None)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', KMeans(n_clusters=cluster_size, random_state=0))
                         ])

    # fit_predict trains the pipeline and returns one cluster label per document.
    cluster_labels = text_clf.fit_predict(training)
    df[CNAME] = cluster_labels

print(MC.OKBLUE + MC.BOLD + "3. Step: indexing data ..." + MC.ENDC)

es = Elasticsearch(
    ['54.37.31.100'],
    http_auth=('oaboss', 'master'),
    port=9201,
)

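# Step 3: index each row as one Elasticsearch document, carrying its cluster memberships.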
size = df["data"].count()
for index, row in df.iterrows():

    word_tokens = word_tokenize(row.data)
    filtered_title = [w for w in word_tokens if w not in en_stop_words]

    doc = {
        "title": row.data,
        # "title_cleaned": " ".join(filtered_title),
        # "author": str(row.AUTHOR),
        # "tags": str(row.TAGS),
        # "discipline": str(row.DISCIPLINE),
        # "url": str(row.URL),
        # "notes": str(row.NOTES),
        # "group_name": str(row.GROUP_TITLE),
        # "group_description": str(row.GROUP_DESCRIPTION),
        # "group_image": str(row.GROUP_IMG_URL),
        # "language": str(row.LANG),
        # "extras": str(row.EXTRAS),
        # "published_at": row.PUBLICATION_DATE.strftime("%Y%m%d") if str(row.PUBLICATION_DATE) != 'NaT' else "",
        "created_at": time.strftime("%Y%m%d"),
        # One membership entry per clustering computed in step 2.
        "cluster": [{"id": "cluster_" + str(k) + "_" + str(getattr(row, "CLUSTER_" + str(k)))}
                    for k in range(2, 16)]
    }
    try:
        res = es.index(index="app", doc_type='document', body=doc)
    except Exception:
        continue
    # sys.stdout.write('\r Progress: ' + str(index) + ' ' + str((index / size) * 100) + ' %' + ' Error:' + str(row.AUTHOR))