From c9176ebe3ee050d94db8e62078fcbfcc440a86a2 Mon Sep 17 00:00:00 2001
From: Imad
Date: Mon, 28 Jan 2019 17:08:35 +0100
Subject: [PATCH] indexer impl.

---
 .gitignore          |   3 +
 indexer/README      |  19 ++++
 indexer/__init__.py |   0
 indexer/run.py      | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 168 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 indexer/README
 create mode 100644 indexer/__init__.py
 create mode 100644 indexer/run.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d5ddb89
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+
+.idea/
+indexer/data/
diff --git a/indexer/README b/indexer/README
new file mode 100644
index 0000000..7048056
--- /dev/null
+++ b/indexer/README
@@ -0,0 +1,19 @@
+Author: Imad Hamoumi
+
+
+1. Put your data into the directory /data.
+2. Start the script with: python run.py
+3. Follow the instructions.
+
+
+Note:
+    CSV:
+    + Only two extensions are currently supported. The first is .csv, which is read using pandas.
+    + You have to provide the name of the column from which the script can read the text data.
+
+    PDF:
+    + In some cases, reading a PDF file is not permitted.
+    + Some PDF files are not well encoded.
+
+
+You can add your own training model to the pipeline, or change the cleaning parameters such as the n-gram size.
diff --git a/indexer/__init__.py b/indexer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/indexer/run.py b/indexer/run.py
new file mode 100644
index 0000000..d35941e
--- /dev/null
+++ b/indexer/run.py
@@ -0,0 +1,146 @@
+from __future__ import print_function, unicode_literals
+
+import os
+import time
+
+import nltk
+import pandas as pd
+import PyPDF2
+from elasticsearch import Elasticsearch
+from nltk import word_tokenize
+from nltk.corpus import stopwords
+from PyInquirer import prompt
+from sklearn.cluster import KMeans
+from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
+from sklearn.pipeline import Pipeline
+
+DATA_DIR = "./indexer/data"
+FILE_READER_PROTOCOL = {}
+df = None
+
+
+class MC:
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+
+print(MC.OKBLUE + MC.BOLD + "Step 1: reading textual data ..." + MC.ENDC)
+
+questions = [
+    {
+        'type': 'input',
+        'name': 'extension',
+        'message': 'Select one of these extensions [.csv, .pdf]',
+    }
+]
+answers = prompt(questions)
+
+documents = {}
+if answers["extension"] == ".pdf":
+    for file_name in os.listdir(DATA_DIR):
+        if file_name.endswith(answers["extension"]):
+            doc = ""
+            try:
+                with open(os.path.join(DATA_DIR, file_name), 'rb') as pdf_file:
+                    file_reader = PyPDF2.PdfFileReader(pdf_file)
+                    # PyPDF2 page numbers are zero-based.
+                    for page_number in range(file_reader.getNumPages()):
+                        page = file_reader.getPage(page_number)
+                        doc = doc + page.extractText() + " "
+            except Exception:
+                FILE_READER_PROTOCOL[file_name] = "file can not be read"
+            if len(doc) > 0:
+                documents[file_name] = doc
+            else:
+                print(MC.WARNING + "File (" + file_name + ") is ignored."
+                      + MC.ENDC)
+    df = pd.DataFrame.from_dict(documents, orient='index', columns=['data'])
+if answers["extension"] == ".csv":
+    questions2 = [
+        {
+            'type': 'input',
+            'name': 'file_name',
+            'message': 'Provide the name of the file (without the .csv extension)',
+        },
+        {
+            'type': 'input',
+            'name': 'column_name',
+            'message': 'Provide the name of the column that contains the text data',
+        },
+    ]
+
+    answers2 = prompt(questions2)
+    df = pd.read_csv(os.path.join(DATA_DIR, answers2["file_name"] + ".csv"),
+                     sep=';', error_bad_lines=False).rename(columns={
+                         answers2["column_name"]: "data"
+                     })
+
+print(MC.OKBLUE + MC.BOLD + "Step 2: training model ..." + MC.ENDC)
+
+nltk.download('stopwords')
+nltk.download('punkt')
+en_stop_words = stopwords.words('english')
+en_stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
+
+if df is not None:
+    training = df["data"]
+
+    for cluster_size in range(2, 16):
+        cname = "CLUSTER_" + str(cluster_size)
+        text_clf = Pipeline([('vect', CountVectorizer(stop_words=en_stop_words,
+                                                      lowercase=True,
+                                                      ngram_range=(1, 2),
+                                                      max_features=None)),
+                             ('tfidf', TfidfTransformer()),
+                             ('clf', KMeans(n_clusters=cluster_size,
+                                            random_state=0))])
+
+        # fit_predict trains the pipeline and returns the cluster labels
+        # in one pass, so a separate fit() call is not needed.
+        df[cname] = text_clf.fit_predict(training)
+
+print(MC.OKBLUE + MC.BOLD + "Step 3: indexing data ..." + MC.ENDC)
+
+es = Elasticsearch(
+    ['54.37.31.100'],
+    http_auth=('oaboss', 'master'),
+    port=9201,
+)
+
+size = df["data"].count()
+for index, row in df.iterrows():
+
+    word_tokens = word_tokenize(row.data)
+    filtered_text = [w for w in word_tokens if w not in en_stop_words]
+
+    doc = {
+        "title": row.data,
+        # "title_cleaned": " ".join(filtered_text),
+        # "author": str(row.AUTHOR),
+        # "tags": str(row.TAGS),
+        # "discipline": str(row.DISCIPLINE),
+        # "url": str(row.URL),
+        # "notes": str(row.NOTES),
+        # "group_name": str(row.GROUP_TITLE),
+        # "group_description": str(row.GROUP_DESCRIPTION),
+        # "group_image": str(row.GROUP_IMG_URL),
+        # "language": str(row.LANG),
+        # "extras": str(row.EXTRAS),
+        # "published_at": row.PUBLICATION_DATE.strftime("%Y%m%d") if str(row.PUBLICATION_DATE) != 'NaT' else "",
+        "created_at": time.strftime("%Y%m%d"),
+        # One entry per CLUSTER_2 ... CLUSTER_15 column created in step 2.
+        "cluster": [{"id": "cluster_%d_%d" % (n, row["CLUSTER_%d" % n])}
+                    for n in range(2, 16)],
+    }
+    try:
+        res = es.index(index="app", doc_type='document', body=doc)
+    except Exception:
+        continue
+    # sys.stdout.write('\r Progress: ' + str(index) + ' '
+    #                  + str((index / size) * 100) + ' %')
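
Note on the CSV path in step 1 (a sketch, not part of the diff above; load_csv is a hypothetical helper): DataFrame.rename() silently ignores unknown column names, so a typo in the column prompt only surfaces later as a KeyError on df["data"]. Validating the user's input right after reading fails earlier and more readably:

    import pandas as pd

    def load_csv(path, column_name):
        df = pd.read_csv(path, sep=';', error_bad_lines=False)
        # Fail early with a readable message instead of a later KeyError.
        if column_name not in df.columns:
            raise ValueError("column %r not found; available columns: %s"
                             % (column_name, ", ".join(df.columns)))
        return df.rename(columns={column_name: "data"})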
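Note on step 2 (a sketch under assumptions, not what the patch does): run.py keeps all fourteen labelings, CLUSTER_2 through CLUSTER_15. If only a single clustering is wanted, one common way to pick the number of clusters is scikit-learn's silhouette score; best_k below is a hypothetical helper, and TfidfVectorizer stands in for the CountVectorizer + TfidfTransformer pair used in the pipeline:

    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics import silhouette_score

    def best_k(texts, k_range=range(2, 16)):
        # TfidfVectorizer combines CountVectorizer and TfidfTransformer.
        vectors = TfidfVectorizer(stop_words='english').fit_transform(texts)
        scores = {}
        for k in k_range:
            labels = KMeans(n_clusters=k, random_state=0).fit_predict(vectors)
            # Silhouette lies in [-1, 1]; higher means denser,
            # better-separated clusters.
            scores[k] = silhouette_score(vectors, labels)
        return max(scores, key=scores.get)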
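Note on step 3 (a sketch, not part of the diff above): indexing one document per es.index() call costs a network round trip per row. The elasticsearch-py bulk helper can send the same doc dicts in batches; this assumes the es client and doc construction from run.py, and a hypothetical wrapper name index_bulk:

    from elasticsearch import helpers

    def index_bulk(es, docs, chunk_size=500):
        # Wrap each doc in the action format expected by helpers.bulk().
        actions = ({"_index": "app", "_type": "document", "_source": doc}
                   for doc in docs)
        # Returns a (number_of_successes, errors) tuple.
        return helpers.bulk(es, actions, chunk_size=chunk_size)

helpers.bulk() reports failures in its return value, so errors could be logged rather than silently skipped as in the current try/except/continue.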