Merge branch 'master' of https://bitbucket.org/textnavigation/textnavi
commit a92a0810f4
@@ -1 +1,3 @@
*.pyc
.idea/
indexer/data/
@@ -0,0 +1,24 @@
Author: Imad Hamoumi

1- Put your data into the data directory (./indexer/data, the DATA_DIR used by run.py).
2- Start the script with python run.py
3- Follow the instructions.
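
For orientation, a session on a CSV corpus might look roughly like this; the file name articles.csv, the column name TITLE, and the exact prompt rendering by PyInquirer are only illustrative, while the numbered step messages come from run.py below. The file itself would live at ./indexer/data/articles.csv.

$ python run.py
1. Step: reading textual data ...
? Select one of these extensions [.csv, .pdf]  .csv
? Provide the name of the file  articles
? Provide the name of the column that contains text data  TITLE
2. Step: train model ...
3. Step: indexing data ...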

Note:
CSV:
+ Only two file extensions are currently supported. The first is .csv, which is read with pandas.
+ You have to provide the name of the column that contains the text data (see the example below).
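
For illustration, a minimal semicolon-separated file (run.py reads CSVs with sep=';') could look like the sketch below; the TITLE and AUTHOR columns are placeholders, and whichever column you name at the prompt is renamed to "data" internally:

TITLE;AUTHOR
Deep learning for text;Jane Doe
Clustering scientific abstracts;John Roe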

PDF:
+ In some cases, reading a PDF file is not allowed, for example when it is encrypted.
+ Some PDF files are not well encoded, so little or no text can be extracted (a pre-check sketch follows below).
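
A minimal pre-check sketch, not part of this commit; it only assumes the PyPDF2 dependency and the ./indexer/data directory used by run.py, and reports PDFs that are likely to be skipped:

# hypothetical helper: list PDFs that PyPDF2 cannot read or extract text from
import os
import PyPDF2

DATA_DIR = "./indexer/data"

for name in os.listdir(DATA_DIR):
    if not name.endswith(".pdf"):
        continue
    try:
        reader = PyPDF2.PdfFileReader(open(os.path.join(DATA_DIR, name), "rb"))
        if reader.isEncrypted:
            print(name, "-> encrypted, reading is not allowed")
        elif not reader.getPage(0).extractText().strip():
            print(name, "-> no extractable text (scanned or badly encoded)")
    except Exception as exc:
        print(name, "-> cannot be read:", exc)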

You can add your own training model to the pipeline or change the cleaning parameters, such as the n-gram size.
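
For instance, a minimal sketch of such a variation; MiniBatchKMeans, the (1, 3) n-gram range, the capped vocabulary, and the tiny stand-in corpus are illustrative assumptions, not part of this commit (in run.py the corpus is df["data"]):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import MiniBatchKMeans
from sklearn.pipeline import Pipeline

# tiny stand-in corpus; in run.py this is df["data"]
training = [
    "text clustering with tf-idf",
    "kmeans groups similar documents",
    "pdf and csv corpora are supported",
    "elasticsearch indexes the result",
]

# same structure as the Pipeline in run.py, with a wider n-gram range and
# MiniBatchKMeans swapped in for KMeans
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', lowercase=True,
                             ngram_range=(1, 3), max_features=50000)),
    ('tfidf', TfidfTransformer()),
    ('clf', MiniBatchKMeans(n_clusters=2, random_state=0)),
])
labels = text_clf.fit_predict(training)
print(labels)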
@@ -0,0 +1,153 @@
from __future__ import print_function, unicode_literals
from PyInquirer import prompt
import os
import PyPDF2
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from elasticsearch import Elasticsearch
import time

DATA_DIR = "./indexer/data"
FILE_READER_PROTOCOL = {}  # collects files that could not be read
df = None


class MC:
    # ANSI escape codes for coloured console output
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


print(MC.OKBLUE + MC.BOLD + "1. Step: reading textual data ..." + MC.ENDC)

questions = [
    {
        'type': 'input',
        'name': 'extension',
        'message': 'Select one of these extensions [.csv, .pdf]',
    }
]
answers = prompt(questions)

documents = {}
if answers["extension"] == ".pdf":
    # note: only the first two entries of the data directory are considered here
    for file_name in os.listdir(DATA_DIR)[:2]:
        if file_name.endswith(answers["extension"]):
            file = open(DATA_DIR + "/" + file_name, 'rb')
            doc = ""
            try:
                fileReader = PyPDF2.PdfFileReader(file)
                info = fileReader.getDocumentInfo()
                # PyPDF2 pages are 0-indexed
                for page_number in range(fileReader.getNumPages()):
                    page = fileReader.getPage(page_number)
                    doc = doc + page.extractText() + " "
            except Exception:
                FILE_READER_PROTOCOL[file_name] = "file can not be read"
            if len(doc) > 0:
                documents[file_name] = doc
            else:
                print(MC.WARNING + "File (" + file_name + ") is ignored." + MC.ENDC)
    # orient='index' turns {file_name: text} into one row per document
    df = pd.DataFrame.from_dict(documents, orient='index', columns=['data'])

if answers["extension"] == ".csv":
    questions2 = [{
        'type': 'input',
        'name': 'file_name',
        'message': 'Provide the name of the file',
    },
        {
            'type': 'input',
            'name': 'column_name',
            'message': 'Provide the name of the column that contains text data',
        }]

    answers2 = prompt(questions2)
    # the user-supplied text column is renamed to "data" so the rest of the
    # script works with one column name for both CSV and PDF input
    df = pd.read_csv(DATA_DIR + "/" + answers2["file_name"] + ".csv",
                     sep=';', error_bad_lines=False).rename(columns={
                         answers2["column_name"]: "data"
                     })

print(MC.OKBLUE + MC.BOLD + "2. Step: train model ..." + MC.ENDC)

nltk.download('stopwords')
nltk.download('punkt')
en_stop_words = stopwords.words('english')
en_stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

if df is not None:
    training = df["data"]

    # fit one bag-of-words + tf-idf + KMeans pipeline per cluster size
    for cluster_size in range(2, 16):
        CNAME = "CLUSTER_" + str(cluster_size)
        text_clf = Pipeline([('vect', CountVectorizer(stop_words=en_stop_words,
                                                      lowercase=True,
                                                      ngram_range=(1, 2),
                                                      max_features=None)),
                             ('tfidf', TfidfTransformer()),
                             ('clf', KMeans(n_clusters=cluster_size, random_state=0))
                             ])

        classifier = text_clf.fit(training)
        cluster_labels = classifier.fit_predict(training)
        df[CNAME] = cluster_labels

print(MC.OKBLUE + MC.BOLD + "3. Step: indexing data ..." + MC.ENDC)

es = Elasticsearch(
    ['54.37.31.100'],
    http_auth=('oaboss', 'master'),
    port=9201,
)

size = df["data"].count()
for index, row in df.iterrows():

    # the text column is named "data" for both CSV and PDF input
    word_tokens = word_tokenize(row.data)
    filtered_title = [w for w in word_tokens if w not in en_stop_words]

    doc = {
        "title": row.data,
        # "title_cleaned": " ".join(filtered_title),
        # "author": str(row.AUTHOR),
        # "tags": str(row.TAGS),
        # "discipline": str(row.DISCIPLINE),
        # "url": str(row.URL),
        # "notes": str(row.NOTES),
        # "group_name": str(row.GROUP_TITLE),
        # "group_description": str(row.GROUP_DESCRIPTION),
        # "group_image": str(row.GROUP_IMG_URL),
        # "language": str(row.LANG),
        # "extras": str(row.EXTRAS),
        # "published_at": row.PUBLICATION_DATE.strftime("%Y%m%d") if str(row.PUBLICATION_DATE) != 'NaT' else "",
        "created_at": time.strftime("%Y%m%d"),
        "cluster": [
            {"id": "cluster_2_" + str(row.CLUSTER_2)},
            {"id": "cluster_3_" + str(row.CLUSTER_3)},
            {"id": "cluster_4_" + str(row.CLUSTER_4)},
            {"id": "cluster_5_" + str(row.CLUSTER_5)},
            {"id": "cluster_6_" + str(row.CLUSTER_6)},
            {"id": "cluster_7_" + str(row.CLUSTER_7)},
            {"id": "cluster_8_" + str(row.CLUSTER_8)},
            {"id": "cluster_9_" + str(row.CLUSTER_9)},
            {"id": "cluster_10_" + str(row.CLUSTER_10)},
            {"id": "cluster_11_" + str(row.CLUSTER_11)},
            {"id": "cluster_12_" + str(row.CLUSTER_12)},
            {"id": "cluster_13_" + str(row.CLUSTER_13)},
            {"id": "cluster_14_" + str(row.CLUSTER_14)},
            {"id": "cluster_15_" + str(row.CLUSTER_15)},
        ]
    }
    try:
        res = es.index(index="app", doc_type='document', body=doc)
    except Exception:
        continue
    # sys.stdout.write('\r Progress: ' + str(index) + ' ' + str((index / size) * 100) + ' %' + ' Error:' + str(row.AUTHOR))