From c9176ebe3ee050d94db8e62078fcbfcc440a86a2 Mon Sep 17 00:00:00 2001
From: Imad
Date: Mon, 28 Jan 2019 17:08:35 +0100
Subject: [PATCH] indexer impl.

---
 .gitignore          |   3 +
 indexer/README      |  19 ++++
 indexer/__init__.py |   0
 indexer/run.py      | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 168 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 indexer/README
 create mode 100644 indexer/__init__.py
 create mode 100644 indexer/run.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d5ddb89
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+
+.idea/
+indexer/data/
diff --git a/indexer/README b/indexer/README
new file mode 100644
index 0000000..7048056
--- /dev/null
+++ b/indexer/README
@@ -0,0 +1,19 @@
+Author: Imad Hamoumi
+
+
+1. Put your data into the directory /data.
+2. Start the script with: python run.py
+3. Follow the instructions.
+
+
+Note:
+    CSV:
+    + Only two extensions are currently supported. The first is .csv, which is read using pandas.
+    + You have to provide the name of the column from which the script can read the text data.
+
+    PDF:
+    + In some cases, reading a PDF file is not permitted.
+    + Some PDF files are not well encoded.
+
+
+You can add your own training model to the pipeline, or change the cleaning parameters such as the n-gram size.
diff --git a/indexer/__init__.py b/indexer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/indexer/run.py b/indexer/run.py
new file mode 100644
index 0000000..d35941e
--- /dev/null
+++ b/indexer/run.py
@@ -0,0 +1,146 @@
+from __future__ import print_function, unicode_literals
+
+import os
+import time
+
+import nltk
+import pandas as pd
+import PyPDF2
+from elasticsearch import Elasticsearch
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+from PyInquirer import prompt
+from sklearn.cluster import KMeans
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from sklearn.pipeline import Pipeline
+
+DATA_DIR = "./indexer/data"
+FILE_READER_PROTOCOL = {}
+df = None
+
+
+class MC:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+
+print(MC.OKBLUE + MC.BOLD + "Step 1: reading textual data ..." + MC.ENDC)
+
+questions = [
+    {
+        'type': 'input',
+        'name': 'extension',
+        'message': 'Select one of these extensions [.csv, .pdf]',
+    }
+]
+answers = prompt(questions)
+
+documents = {}
+if answers["extension"] == ".pdf":
+    for file_name in os.listdir(DATA_DIR):
+        if file_name.endswith(answers["extension"]):
+            doc = ""
+            try:
+                with open(os.path.join(DATA_DIR, file_name), 'rb') as pdf_file:
+                    file_reader = PyPDF2.PdfFileReader(pdf_file)
+                    # PyPDF2 page numbers are zero-based.
+                    for page_number in range(file_reader.getNumPages()):
+                        page = file_reader.getPage(page_number)
+                        doc = doc + page.extractText() + " "
+            except Exception:
+                FILE_READER_PROTOCOL[file_name] = "file can not be read"
+            if len(doc) > 0:
+                documents[file_name] = doc
+            else:
+                print(MC.WARNING + "File (" + file_name + ") is ignored."
+                      + MC.ENDC)
+    df = pd.DataFrame.from_dict(documents, orient='index', columns=['data'])
+if answers["extension"] == ".csv":
+    questions2 = [
+        {
+            'type': 'input',
+            'name': 'file_name',
+            'message': 'Provide the name of the file (without the .csv extension)',
+        },
+        {
+            'type': 'input',
+            'name': 'column_name',
+            'message': 'Provide the name of the column that contains the text data',
+        },
+    ]
+
+    answers2 = prompt(questions2)
+    df = pd.read_csv(os.path.join(DATA_DIR, answers2["file_name"] + ".csv"),
+                     sep=';', error_bad_lines=False).rename(columns={
+                         answers2["column_name"]: "data"
+                     })
+
+print(MC.OKBLUE + MC.BOLD + "Step 2: training model ..." + MC.ENDC)
+
+nltk.download('stopwords')
+nltk.download('punkt')
+en_stop_words = stopwords.words('english')
+en_stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
+
+if df is not None:
+    training = df["data"]
+
+    for cluster_size in range(2, 16):
+        cname = "CLUSTER_" + str(cluster_size)
+        text_clf = Pipeline([('vect', CountVectorizer(stop_words=en_stop_words,
+                                                      lowercase=True,
+                                                      ngram_range=(1, 2),
+                                                      max_features=None)),
+                             ('tfidf', TfidfTransformer()),
+                             ('clf', KMeans(n_clusters=cluster_size,
+                                            random_state=0))])
+
+        # fit_predict trains the pipeline and returns the cluster labels
+        # in one pass, so a separate fit() call is not needed.
+        df[cname] = text_clf.fit_predict(training)
+
+print(MC.OKBLUE + MC.BOLD + "Step 3: indexing data ..." + MC.ENDC)
+
+es = Elasticsearch(
+    ['54.37.31.100'],
+    http_auth=('oaboss', 'master'),
+    port=9201,
+)
+
+size = df["data"].count()
+for index, row in df.iterrows():
+
+    word_tokens = word_tokenize(row.data)
+    filtered_text = [w for w in word_tokens if w not in en_stop_words]
+
+    doc = {
+        "title": row.data,
+        # "title_cleaned": " ".join(filtered_text),
+        # "author": str(row.AUTHOR),
+        # "tags": str(row.TAGS),
+        # "discipline": str(row.DISCIPLINE),
+        # "url": str(row.URL),
+        # "notes": str(row.NOTES),
+        # "group_name": str(row.GROUP_TITLE),
+        # "group_description": str(row.GROUP_DESCRIPTION),
+        # "group_image": str(row.GROUP_IMG_URL),
+        # "language": str(row.LANG),
+        # "extras": str(row.EXTRAS),
+        # "published_at": row.PUBLICATION_DATE.strftime("%Y%m%d") if str(row.PUBLICATION_DATE) != 'NaT' else "",
+        "created_at": time.strftime("%Y%m%d"),
+        # One entry per CLUSTER_2 ... CLUSTER_15 column created in step 2.
+        "cluster": [{"id": "cluster_%d_%d" % (n, row["CLUSTER_%d" % n])}
+                    for n in range(2, 16)],
+    }
+    try:
+        res = es.index(index="app", doc_type='document', body=doc)
+    except Exception:
+        continue
+    # sys.stdout.write('\r Progress: ' + str(index) + ' '
+    #                  + str((index / size) * 100) + ' %')
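
Note on the CSV path in step 1 (a sketch, not part of the diff above; load_csv is a hypothetical helper): DataFrame.rename() silently ignores unknown column names, so a typo in the column prompt only surfaces later as a KeyError on df["data"]. Validating the user's input right after reading fails earlier and more readably:

    import pandas as pd

    def load_csv(path, column_name):
        df = pd.read_csv(path, sep=';', error_bad_lines=False)
        # Fail early with a readable message instead of a later KeyError.
        if column_name not in df.columns:
            raise ValueError("column %r not found; available columns: %s"
                             % (column_name, ", ".join(df.columns)))
        return df.rename(columns={column_name: "data"})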
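Note on step 2 (a sketch under assumptions, not what the patch does): run.py keeps all fourteen labelings, CLUSTER_2 through CLUSTER_15. If only a single clustering is wanted, one common way to pick the number of clusters is scikit-learn's silhouette score; best_k below is a hypothetical helper, and TfidfVectorizer stands in for the CountVectorizer + TfidfTransformer pair used in the pipeline:

    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import silhouette_score

    def best_k(texts, k_range=range(2, 16)):
        # TfidfVectorizer combines CountVectorizer and TfidfTransformer.
        vectors = TfidfVectorizer(stop_words='english').fit_transform(texts)
        scores = {}
        for k in k_range:
            labels = KMeans(n_clusters=k, random_state=0).fit_predict(vectors)
            # Silhouette lies in [-1, 1]; higher means denser,
            # better-separated clusters.
            scores[k] = silhouette_score(vectors, labels)
        return max(scores, key=scores.get)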
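Note on step 3 (a sketch, not part of the diff above): indexing one document per es.index() call costs a network round trip per row. The elasticsearch-py bulk helper can send the same doc dicts in batches; this assumes the es client and doc construction from run.py, and a hypothetical wrapper name index_bulk:

    from elasticsearch import helpers

    def index_bulk(es, docs, chunk_size=500):
        # Wrap each doc in the action format expected by helpers.bulk().
        actions = ({"_index": "app", "_type": "document", "_source": doc}
                   for doc in docs)
        # Returns a (number_of_successes, errors) tuple.
        return helpers.bulk(es, actions, chunk_size=chunk_size)

helpers.bulk() reports failures in its return value, so errors could be logged rather than silently skipped as in the current try/except/continue.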