Doc2Vec test
This commit is contained in:
parent
86e34de8ab
commit
94f501ab6d
|
@ -73,13 +73,16 @@ class MultinomialNaiveBayes_Word2Vec:
|
||||||
all_data = read_corpus(X, tokens_only=False)
|
all_data = read_corpus(X, tokens_only=False)
|
||||||
|
|
||||||
# instantiate a Doc2Vec object
|
# instantiate a Doc2Vec object
|
||||||
doc2vec_model = Doc2Vec(training_data, vector_size=100, window=2, min_count=1, workers=4)
|
doc2vec_model = Doc2Vec(training_data, vector_size=100, window=2, min_count=2, epochs = 40)
|
||||||
|
|
||||||
|
# Frage: hier dürfen keine negativen Werte drin sein für Naive Bayes?
|
||||||
print(doc2vec_model.docvecs[0])
|
print(doc2vec_model.docvecs[0])
|
||||||
print(doc2vec_model.docvecs[1])
|
print(doc2vec_model.docvecs[1])
|
||||||
print(doc2vec_model.docvecs[2])
|
print(doc2vec_model.docvecs[2])
|
||||||
|
|
||||||
training_data = [doc2vec_model.docvecs[i] for i in range(len(training_data))]
|
training_data = [doc2vec_model.docvecs[i] for i in range(len(training_data))]
|
||||||
|
|
||||||
|
# Frage: muss man bei den testing daten auch einen tag mit machen?
|
||||||
testing_data = [doc2vec_model.infer_vector(vector) for vector in testing_data]
|
testing_data = [doc2vec_model.infer_vector(vector) for vector in testing_data]
|
||||||
|
|
||||||
#fit classifier
|
#fit classifier
|
||||||
|
|
|
@ -0,0 +1,128 @@
|
||||||
|
from BagOfWords import BagOfWords
|
||||||
|
|
||||||
|
import csv
|
||||||
|
|
||||||
|
import gensim
|
||||||
|
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_selection import SelectPercentile
|
||||||
|
from sklearn.metrics import recall_score, precision_score
|
||||||
|
import sklearn
|
||||||
|
from sklearn.model_selection import StratifiedKFold
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
# Load the current labeling round from its pipe-separated CSV export.
df = pd.read_csv(
    '../data/interactive_labeling_round_11.csv',
    sep='|',
    usecols=range(1,13),  # columns 1..12 only: skips the first, unnamed column
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
    quotechar='\'',
)

# Keep the first 100 rows whose 'Label' differs from -1
# (presumably -1 marks unlabeled rows -- TODO confirm).
dataset = df[df['Label'] != -1].iloc[:100].reset_index(drop=True)

# Tiny fixed split for this experiment: rows 0-14 train, rows 15-19 test.
train = dataset.iloc[:15]
test = dataset.iloc[15:20].reset_index(drop=True)

# Naive Bayes with practically no smoothing and uniform class priors.
classifier = MultinomialNB(alpha=1.0e-10, fit_prior=False, class_prior=None)
|
||||||
|
|
||||||
|
def make_tagged_document(row):
    """Build a gensim ``TaggedDocument`` from one row of the data set.

    The row's 'Text' value is tokenised with ``BagOfWords.extract_words`` and
    the row's 'Label' value becomes the document's single tag.

    NOTE(review): the gensim documentation says tags may be one or more
    unicode string tokens, but that typical (and most memory-efficient)
    practice is a unique integer id as the only tag. Here the class label is
    used as the tag instead -- open question from the author whether the
    label belongs there at all.
    """
    tokens = BagOfWords.extract_words(row['Text'])
    label_tag = [row['Label']]
    return TaggedDocument(words=tokens, tags=label_tag)
|
||||||
|
|
||||||
|
# Wrap every train/test row in a TaggedDocument (tokens plus label tag).
tagged_train_data = train.apply(make_tagged_document, axis=1)
print(tagged_train_data[0])

tagged_test_data = test.apply(make_tagged_document, axis=1)
print(tagged_test_data[0])

# negative=0 switches negative sampling off; with gensim's default hs=0 that
# leaves the model without any training objective, so the vectors would stay
# at their random initialisation. hs=1 enables hierarchical softmax so that
# model.train() actually learns something.
# NOTE(review): min_count=20 is high for a 15-row training slice and may
# prune most of the vocabulary -- confirm this is intended.
model = Doc2Vec(vector_size=100,
                min_count=20,
                epochs=40,
                hs=1,
                negative=0)

model.build_vocab(tagged_train_data)

model.train(tagged_train_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

# The class label was stored as the only tag of each document.
y_train = np.array([doc.tags[0] for doc in tagged_train_data])
y_test = np.array([doc.tags[0] for doc in tagged_test_data])

# Infer a fresh vector for every document (train and test alike).
X_train = [model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
X_test = [model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]

print(X_test)
print(y_test)
print(X_train)
print(y_train)

# Stack the per-document vectors into 2-D arrays (n_documents, vector_size).
X_train = np.array(X_train)
X_test = np.array(X_test)
|
||||||
|
|
||||||
|
|
||||||
|
# Metric accumulators. They were never defined in this script, so the
# .append() calls below raised NameError.
recall_scores = []
precision_scores = []
f1_scores = []

# MultinomialNB only accepts non-negative features, but Doc2Vec vectors are
# real-valued and may be negative. Min-max scale every feature with the
# training statistics; test values that fall below the training minimum are
# clipped to 0 so they stay valid input.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
feature_min = X_train.min(axis=0)
feature_span = X_train.max(axis=0) - feature_min
feature_span[feature_span == 0] = 1.0  # constant feature: avoid dividing by 0
X_train_scaled = (X_train - feature_min) / feature_span
X_test_scaled = np.clip((X_test - feature_min) / feature_span, 0.0, None)

# fit classifier
classifier.fit(X_train_scaled, y_train)

# predict class
predictions_train = classifier.predict(X_train_scaled)
predictions_test = classifier.predict(X_test_scaled)

# print and store metrics
rec = recall_score(y_test, predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)

prec = precision_score(y_test, predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)

# F1 is the harmonic mean of precision and recall; it is defined as 0 when
# both are 0 (the plain formula would divide by zero).
if prec + rec > 0:
    f1_scores.append(2 * (prec * rec) / (prec + rec))
else:
    f1_scores.append(0.0)

##########################
# probability estimates for the test vectors (the original referenced an
# undefined name `testing_data` here)
class_probs = classifier.predict_proba(X_test_scaled)

# number of samples encountered for each class during fitting;
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_

# classes in the order used by the classifier
classes = classifier.classes_

# report the collected scores and the class probability estimates
print (recall_scores, precision_scores, f1_scores, class_probs)
|
|
@ -0,0 +1,131 @@
|
||||||
|
from BagOfWords import BagOfWords
|
||||||
|
|
||||||
|
import csv
|
||||||
|
|
||||||
|
import gensim
|
||||||
|
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_selection import SelectPercentile
|
||||||
|
from sklearn.metrics import recall_score, precision_score
|
||||||
|
import sklearn
|
||||||
|
from sklearn.model_selection import StratifiedKFold
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
|
||||||
|
# Load the current labeling round from its pipe-separated CSV export.
df = pd.read_csv(
    '../data/interactive_labeling_round_11.csv',
    sep='|',
    usecols=range(1,13),  # columns 1..12 only: skips the first, unnamed column
    encoding='utf-8',
    quoting=csv.QUOTE_NONNUMERIC,
    quotechar='\'',
)

# Keep only rows whose 'Label' differs from -1
# (presumably -1 marks unlabeled rows -- TODO confirm).
dataset = df[df['Label'] != -1].reset_index(drop=True)

# One text per article: title and body joined by '. '.
X = dataset['Title'] + '. ' + dataset['Text']
y = dataset['Label']

# Naive Bayes with practically no smoothing and uniform class priors.
classifier = MultinomialNB(alpha=1.0e-10, fit_prior=False, class_prior=None)

# Hold out 10 % of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=5)
|
||||||
|
|
||||||
|
def read_corpus(data, tokens_only=False):
    """Tokenise every text in ``data`` with ``BagOfWords.extract_words``.

    Args:
        data: iterable of texts.
        tokens_only: when true, return the plain token lists; otherwise wrap
            each token list in a ``TaggedDocument`` whose only tag is the
            text's position in ``data``.

    Returns:
        A list with one entry per text, in input order.
    """
    token_lists = (BagOfWords.extract_words(text) for text in data)
    if tokens_only:
        return list(token_lists)
    # For training data, add tags
    return [TaggedDocument(tokens, [position])
            for position, tokens in enumerate(token_lists)]
|
||||||
|
|
||||||
|
# Tokenise both splits; every document is tagged with its position in its
# own split.
tagged_train_data = read_corpus(X_train, tokens_only=False)

print('tagged_train_data[0]:')
print(tagged_train_data[0])

tagged_test_data = read_corpus(X_test, tokens_only=False)

print('tagged_test_data[0]:')
print(tagged_test_data[0])

# negative=0 switches negative sampling off; with gensim's default hs=0 that
# leaves the model without any training objective, so the vectors would stay
# at their random initialisation. hs=1 enables hierarchical softmax so that
# model.train() actually learns something.
model = Doc2Vec(vector_size=100,
                min_count=20,
                epochs=40,
                hs=1,
                negative=0)

model.build_vocab(tagged_train_data)

model.train(tagged_train_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

# Infer a fresh vector for every document and stack the vectors into 2-D
# arrays (n_documents, vector_size). The labels y_train / y_test come from
# the train/test split and are left untouched.
X_train = np.vstack(
    [model.infer_vector(doc.words, steps=20) for doc in tagged_train_data])
X_test = np.vstack(
    [model.infer_vector(doc.words, steps=20) for doc in tagged_test_data])

print('X_test:')
print(X_test)

print('y_test:')
print(y_test)

print('X_train:')
print(X_train)

print('y_train:')
print(y_train)
|
||||||
|
|
||||||
|
# Metric accumulators. They were never defined in this script, so the
# .append() calls below raised NameError.
recall_scores = []
precision_scores = []
f1_scores = []

# Fitting on the raw vectors failed here with
# "ValueError: Input X must be non-negative": MultinomialNB only accepts
# non-negative features, but Doc2Vec vectors may be negative. Min-max scale
# every feature with the training statistics; test values that fall below
# the training minimum are clipped to 0 so they stay valid input.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
feature_min = X_train.min(axis=0)
feature_span = X_train.max(axis=0) - feature_min
feature_span[feature_span == 0] = 1.0  # constant feature: avoid dividing by 0
X_train_scaled = (X_train - feature_min) / feature_span
X_test_scaled = np.clip((X_test - feature_min) / feature_span, 0.0, None)

# fit classifier
classifier.fit(X_train_scaled, y_train)

# predict class
predictions_train = classifier.predict(X_train_scaled)
predictions_test = classifier.predict(X_test_scaled)

# print and store metrics
rec = recall_score(y_test, predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)

prec = precision_score(y_test, predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)

# F1 is the harmonic mean of precision and recall; it is defined as 0 when
# both are 0 (the plain formula would divide by zero).
if prec + rec > 0:
    f1_scores.append(2 * (prec * rec) / (prec + rec))
else:
    f1_scores.append(0.0)

##########################
# probability estimates for the test vectors (the original referenced an
# undefined name `testing_data` here)
class_probs = classifier.predict_proba(X_test_scaled)

# number of samples encountered for each class during fitting;
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_

# classes in the order used by the classifier
classes = classifier.classes_

# report the collected scores and the class probability estimates
print (recall_scores, precision_scores, f1_scores, class_probs)
|
Loading…
Reference in New Issue