Merge branch 'master' of https://bitbucket.org/textnavigation/textnavi
commit a92a0810f4
@@ -1 +1,3 @@
*.pyc
.idea/
indexer/data/
@@ -0,0 +1,24 @@
Author: Imad Hamoumi

1- Put your data into the data directory (./indexer/data, the DATA_DIR used by run.py).
2- Start the script with python run.py
3- Follow the instructions.
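
For orientation, a session on a CSV corpus might look roughly like this; the file name articles.csv, the column name TITLE, and the exact prompt rendering by PyInquirer are only illustrative, while the numbered step messages come from run.py below. The file itself would live at ./indexer/data/articles.csv.

$ python run.py
1. Step: reading textual data ...
? Select one of these extensions [.csv, .pdf]  .csv
? Provide the name of the file  articles
? Provide the name of the column that contains text data  TITLE
2. Step: train model ...
3. Step: indexing data ...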

Note:
CSV:
+ Only two file extensions are currently supported. The first is .csv, which is read with pandas.
+ You have to provide the name of the column that contains the text data (see the example below).
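
For illustration, a minimal semicolon-separated file (run.py reads CSVs with sep=';') could look like the sketch below; the TITLE and AUTHOR columns are placeholders, and whichever column you name at the prompt is renamed to "data" internally:

TITLE;AUTHOR
Deep learning for text;Jane Doe
Clustering scientific abstracts;John Roe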

PDF:
+ In some cases, reading a PDF file is not allowed, for example when it is encrypted.
+ Some PDF files are not well encoded, so little or no text can be extracted (a pre-check sketch follows below).
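
A minimal pre-check sketch, not part of this commit; it only assumes the PyPDF2 dependency and the ./indexer/data directory used by run.py, and reports PDFs that are likely to be skipped:

# hypothetical helper: list PDFs that PyPDF2 cannot read or extract text from
import os
import PyPDF2

DATA_DIR = "./indexer/data"

for name in os.listdir(DATA_DIR):
    if not name.endswith(".pdf"):
        continue
    try:
        reader = PyPDF2.PdfFileReader(open(os.path.join(DATA_DIR, name), "rb"))
        if reader.isEncrypted:
            print(name, "-> encrypted, reading is not allowed")
        elif not reader.getPage(0).extractText().strip():
            print(name, "-> no extractable text (scanned or badly encoded)")
    except Exception as exc:
        print(name, "-> cannot be read:", exc)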

You can add your own training model to the pipeline or change the cleaning parameters, such as the n-gram size.
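
For instance, a minimal sketch of such a variation; MiniBatchKMeans, the (1, 3) n-gram range, the capped vocabulary, and the tiny stand-in corpus are illustrative assumptions, not part of this commit (in run.py the corpus is df["data"]):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.pipeline import Pipeline

# tiny stand-in corpus; in run.py this is df["data"]
training = [
    "text clustering with tf-idf",
    "kmeans groups similar documents",
    "pdf and csv corpora are supported",
    "elasticsearch indexes the result",
]

# same structure as the Pipeline in run.py, with a wider n-gram range and
# MiniBatchKMeans swapped in for KMeans
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', lowercase=True,
                             ngram_range=(1, 3), max_features=50000)),
    ('tfidf', TfidfTransformer()),
    ('clf', MiniBatchKMeans(n_clusters=2, random_state=0)),
])
labels = text_clf.fit_predict(training)
print(labels)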
@@ -0,0 +1,153 @@
from __future__ import print_function, unicode_literals
from PyInquirer import prompt
import os
import PyPDF2
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from elasticsearch import Elasticsearch
import time

DATA_DIR = "./indexer/data"
FILE_READER_PROTOCOL = {}  # collects files that could not be read
df = None


class MC:
    # ANSI escape codes for coloured console output
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


print(MC.OKBLUE + MC.BOLD + "1. Step: reading textual data ..." + MC.ENDC)

questions = [
    {
        'type': 'input',
        'name': 'extension',
        'message': 'Select one of these extensions [.csv, .pdf]',
    }
]
answers = prompt(questions)

documents = {}
if answers["extension"] == ".pdf":
    # note: only the first two entries of the data directory are considered here
    for file_name in os.listdir(DATA_DIR)[:2]:
        if file_name.endswith(answers["extension"]):
            file = open(DATA_DIR + "/" + file_name, 'rb')
            doc = ""
            try:
                fileReader = PyPDF2.PdfFileReader(file)
                info = fileReader.getDocumentInfo()
                # PyPDF2 pages are 0-indexed
                for page_number in range(fileReader.getNumPages()):
                    page = fileReader.getPage(page_number)
                    doc = doc + page.extractText() + " "
            except Exception:
                FILE_READER_PROTOCOL[file_name] = "file can not be read"
            if len(doc) > 0:
                documents[file_name] = doc
            else:
                print(MC.WARNING + "File (" + file_name + ") is ignored." + MC.ENDC)
    # orient='index' turns {file_name: text} into one row per document
    df = pd.DataFrame.from_dict(documents, orient='index', columns=['data'])

if answers["extension"] == ".csv":
    questions2 = [{
        'type': 'input',
        'name': 'file_name',
        'message': 'Provide the name of the file',
    },
        {
            'type': 'input',
            'name': 'column_name',
            'message': 'Provide the name of the column that contains text data',
        }]

    answers2 = prompt(questions2)
    # the user-supplied text column is renamed to "data" so the rest of the
    # script works with one column name for both CSV and PDF input
    df = pd.read_csv(DATA_DIR + "/" + answers2["file_name"] + ".csv",
                     sep=';', error_bad_lines=False).rename(columns={
                         answers2["column_name"]: "data"
                     })

print(MC.OKBLUE + MC.BOLD + "2. Step: train model ..." + MC.ENDC)

nltk.download('stopwords')
nltk.download('punkt')
en_stop_words = stopwords.words('english')
en_stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

if df is not None:
    training = df["data"]

    # fit one bag-of-words + tf-idf + KMeans pipeline per cluster size
    for cluster_size in range(2, 16):
        CNAME = "CLUSTER_" + str(cluster_size)
        text_clf = Pipeline([('vect', CountVectorizer(stop_words=en_stop_words,
                                                      lowercase=True,
                                                      ngram_range=(1, 2),
                                                      max_features=None)),
                             ('tfidf', TfidfTransformer()),
                             ('clf', KMeans(n_clusters=cluster_size, random_state=0))
                             ])

        classifier = text_clf.fit(training)
        cluster_labels = classifier.fit_predict(training)
        df[CNAME] = cluster_labels

print(MC.OKBLUE + MC.BOLD + "3. Step: indexing data ..." + MC.ENDC)

es = Elasticsearch(
    ['54.37.31.100'],
    http_auth=('oaboss', 'master'),
    port=9201,
)

size = df["data"].count()
for index, row in df.iterrows():

    # the text column is named "data" for both CSV and PDF input
    word_tokens = word_tokenize(row.data)
    filtered_title = [w for w in word_tokens if w not in en_stop_words]

    doc = {
        "title": row.data,
        # "title_cleaned": " ".join(filtered_title),
        # "author": str(row.AUTHOR),
        # "tags": str(row.TAGS),
        # "discipline": str(row.DISCIPLINE),
        # "url": str(row.URL),
        # "notes": str(row.NOTES),
        # "group_name": str(row.GROUP_TITLE),
        # "group_description": str(row.GROUP_DESCRIPTION),
        # "group_image": str(row.GROUP_IMG_URL),
        # "language": str(row.LANG),
        # "extras": str(row.EXTRAS),
        # "published_at": row.PUBLICATION_DATE.strftime("%Y%m%d") if str(row.PUBLICATION_DATE) != 'NaT' else "",
        "created_at": time.strftime("%Y%m%d"),
        "cluster": [
            {"id": "cluster_2_" + str(row.CLUSTER_2)},
            {"id": "cluster_3_" + str(row.CLUSTER_3)},
            {"id": "cluster_4_" + str(row.CLUSTER_4)},
            {"id": "cluster_5_" + str(row.CLUSTER_5)},
            {"id": "cluster_6_" + str(row.CLUSTER_6)},
            {"id": "cluster_7_" + str(row.CLUSTER_7)},
            {"id": "cluster_8_" + str(row.CLUSTER_8)},
            {"id": "cluster_9_" + str(row.CLUSTER_9)},
            {"id": "cluster_10_" + str(row.CLUSTER_10)},
            {"id": "cluster_11_" + str(row.CLUSTER_11)},
            {"id": "cluster_12_" + str(row.CLUSTER_12)},
            {"id": "cluster_13_" + str(row.CLUSTER_13)},
            {"id": "cluster_14_" + str(row.CLUSTER_14)},
            {"id": "cluster_15_" + str(row.CLUSTER_15)},
        ]
    }
    try:
        res = es.index(index="app", doc_type='document', body=doc)
    except Exception:
        continue
    # sys.stdout.write('\r Progress: ' + str(index) + ' ' + str((index / size) * 100) + ' %' + ' Error:' + str(row.AUTHOR))