from __future__ import print_function, unicode_literals

import os
import time

import nltk
import pandas as pd
import PyPDF2
from elasticsearch import Elasticsearch
from nltk import word_tokenize
from nltk.corpus import stopwords
from PyInquirer import prompt
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

DATA_DIR = "./indexer/data"
FILE_READER_PROTOCOL = {}
df = None


class MC:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


print(MC.OKBLUE + MC.BOLD + "1. Step: reading textual data ..." + MC.ENDC)

questions = [
    {
        'type': 'input',
        'name': 'extension',
        'message': 'Select one of these extensions [.csv, .pdf]',
    }
]
answers = prompt(questions)

documents = {}

if answers["extension"] == ".pdf":
    # note: only the first two directory entries are scanned
    for file_name in os.listdir(DATA_DIR)[:2]:
        if file_name.endswith(answers["extension"]):
            doc = ""
            with open(DATA_DIR + "/" + file_name, 'rb') as pdf_file:
                try:
                    fileReader = PyPDF2.PdfFileReader(pdf_file)
                    info = fileReader.getDocumentInfo()
                    # getPage() is zero-indexed
                    for page_number in range(fileReader.getNumPages()):
                        page = fileReader.getPage(page_number)
                        doc = doc + page.extractText() + " "
                except Exception:
                    FILE_READER_PROTOCOL[file_name] = "file cannot be read"
            if len(doc) > 0:
                documents[file_name] = doc
            else:
                print(MC.WARNING + "File (" + file_name + ") is ignored." + MC.ENDC)
    # orient='index' makes each file name a row and its text the 'data' column
    df = pd.DataFrame.from_dict(documents, orient='index', columns=['data'])

if answers["extension"] == ".csv":
    questions2 = [{
        'type': 'input',
        'name': 'file_name',
        'message': 'Provide the name of the file',
    }, {
        'type': 'input',
        'name': 'column_name',
        'message': 'Provide the name of the column that contains text data',
    }]
    answers2 = prompt(questions2)
    df = pd.read_csv(DATA_DIR + "/" + answers2["file_name"] + ".csv",
                     sep=';',
                     error_bad_lines=False).rename(
                         columns={answers2["column_name"]: "data"})

print(MC.OKBLUE + MC.BOLD + "2. Step: train model ..." + MC.ENDC)

nltk.download('stopwords')
nltk.download('punkt')
en_stop_words = stopwords.words('english')
en_stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

if df is not None:
    training = df["data"]
    # fit one k-means model per cluster size and store each document's label
    # in a CLUSTER_<k> column
    for cluster_size in range(2, 16):
        CNAME = "CLUSTER_" + str(cluster_size)
        text_clf = Pipeline([
            ('vect', CountVectorizer(stop_words=en_stop_words,
                                     lowercase=True,
                                     ngram_range=(1, 2),
                                     max_features=None)),
            ('tfidf', TfidfTransformer()),
            ('clf', KMeans(n_clusters=cluster_size, random_state=0)),
        ])
        # fit_predict fits the pipeline once and returns the cluster labels
        cluster_labels = text_clf.fit_predict(training)
        df[CNAME] = cluster_labels
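# Not part of the original pipeline: a minimal sketch for inspecting what the
# last fitted pipeline (cluster_size == 15) learned, by printing the five
# highest-weighted vocabulary terms of each cluster centre. Assumes the older
# scikit-learn API used throughout this script; get_feature_names() was
# replaced by get_feature_names_out() in scikit-learn 1.0.
if df is not None:
    terms = text_clf.named_steps['vect'].get_feature_names()
    centers = text_clf.named_steps['clf'].cluster_centers_
    for cluster_id, center in enumerate(centers):
        top_terms = [terms[i] for i in center.argsort()[::-1][:5]]
        print("cluster_15_%d: %s" % (cluster_id, ", ".join(top_terms)))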
print(MC.OKBLUE + MC.BOLD + "3. Step: indexing data ..." + MC.ENDC)

es = Elasticsearch(
    ['54.37.31.100'],
    http_auth=('oaboss', 'master'),
    port=9201,
)

size = df["data"].count()
for index, row in df.iterrows():
    # stop-word-filtered tokens, kept for the commented-out
    # "title_cleaned" field below
    word_tokens = word_tokenize(row.data)
    filtered_title = [w for w in word_tokens if w not in en_stop_words]
    doc = {
        "title": row.data,
        # "title_cleaned": " ".join(filtered_title),
        # "author": str(row.AUTHOR),
        # "tags": str(row.TAGS),
        # "discipline": str(row.DISCIPLINE),
        # "url": str(row.URL),
        # "notes": str(row.NOTES),
        # "group_name": str(row.GROUP_TITLE),
        # "group_description": str(row.GROUP_DESCRIPTION),
        # "group_image": str(row.GROUP_IMG_URL),
        # "language": str(row.LANG),
        # "extras": str(row.EXTRAS),
        # "published_at": row.PUBLICATION_DATE.strftime("%Y%m%d") if str(row.PUBLICATION_DATE) != 'NaT' else "",
        "created_at": time.strftime("%Y%m%d"),
        # one id per cluster size: "cluster_2_<label>" ... "cluster_15_<label>"
        "cluster": [
            {"id": "cluster_%d_%s" % (k, getattr(row, "CLUSTER_%d" % k))}
            for k in range(2, 16)
        ],
    }
    try:
        res = es.index(index="app", doc_type='document', body=doc)
    except Exception:
        continue
    # sys.stdout.write('\r Progress: ' + str(index) + ' ' + str((index / size) * 100) + ' %' + ' Error:' + str(row.AUTHOR))
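# Not part of the original script: a hedged sanity check that documents were
# indexed. "cluster_5_3" is a hypothetical example id; any "cluster_<k>_<label>"
# value produced above would match. Note that res["hits"]["total"] is a plain
# integer on Elasticsearch < 7 and a dict ({"value": ..., "relation": ...}) on 7+.
res = es.search(index="app", body={
    "query": {"match": {"cluster.id": "cluster_5_3"}}
})
print("documents matching cluster_5_3:", res["hits"]["total"])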