From 94f501ab6d9476f885a48d932c52452a3af30e10 Mon Sep 17 00:00:00 2001
From: annealias
Date: Mon, 25 Mar 2019 21:44:32 +0100
Subject: [PATCH] Doc2Vec test

---
 src/MultinomialNaiveBayes_Word2Vec.py |   5 +-
 src/test.py                           | 128 +++++++++++++++++++++++++
 src/test_2.py                         | 131 ++++++++++++++++++++++++++
 3 files changed, 263 insertions(+), 1 deletion(-)
 create mode 100644 src/test.py
 create mode 100644 src/test_2.py

diff --git a/src/MultinomialNaiveBayes_Word2Vec.py b/src/MultinomialNaiveBayes_Word2Vec.py
index 95c56a9..318440f 100644
--- a/src/MultinomialNaiveBayes_Word2Vec.py
+++ b/src/MultinomialNaiveBayes_Word2Vec.py
@@ -73,13 +73,16 @@ class MultinomialNaiveBayes_Word2Vec:
         all_data = read_corpus(X, tokens_only=False)

         # instantiate a Doc2Vec object
-        doc2vec_model = Doc2Vec(training_data, vector_size=100, window=2, min_count=1, workers=4)
+        doc2vec_model = Doc2Vec(training_data, vector_size=100, window=2, min_count=2, epochs=40)

+        # Question: are negative values disallowed here for Naive Bayes?
         print(doc2vec_model.docvecs[0])
         print(doc2vec_model.docvecs[1])
         print(doc2vec_model.docvecs[2])

         training_data = [doc2vec_model.docvecs[i] for i in range(len(training_data))]
+
+        # Question: do the testing data also need a tag?
         testing_data = [doc2vec_model.infer_vector(vector) for vector in testing_data]

         #fit classifier
diff --git a/src/test.py b/src/test.py
new file mode 100644
index 0000000..1da91ea
--- /dev/null
+++ b/src/test.py
@@ -0,0 +1,128 @@
+from BagOfWords import BagOfWords
+
+import csv
+
+import gensim
+from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectPercentile
+from sklearn.metrics import recall_score, precision_score
+import sklearn
+from sklearn.model_selection import StratifiedKFold
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.model_selection import train_test_split
+
+# read current data set from csv
+df = pd.read_csv('../data/interactive_labeling_round_11.csv',
+                 sep='|',
+                 usecols=range(1, 13),    # drop first column 'unnamed'
+                 encoding='utf-8',
+                 quoting=csv.QUOTE_NONNUMERIC,
+                 quotechar='\'')
+
+# keep the first 100 labeled samples
+dataset = df.loc[df['Label'] != -1][:100].reset_index(drop=True)
+
+train = dataset[:15]
+test = dataset[15:20].reset_index(drop=True)
+
+classifier = MultinomialNB(alpha=1.0e-10,
+                           fit_prior=False,
+                           class_prior=None)
+
+def make_tagged_document(row):
+    # How exactly does TaggedDocument work?
+    # tags (a list of tokens). Tags may be one or more unicode string tokens,
+    # but typical practice (which will also be the most memory-efficient) is
+    # for the tags list to include a unique integer id as the only tag.
+    # So no label here?
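+    # Editor's note (assumption, based on gensim's documented behaviour):
+    # Doc2Vec learns one vector per unique tag, so tags=[row['Label']] makes
+    # every document with the same label share a single trained vector, and
+    # model.docvecs then holds only as many vectors as there are distinct
+    # labels. A unique id per document would give one vector each, e.g.:
+    #
+    #     TaggedDocument(words=['some', 'tokens'], tags=[row.name])
+    #
+    # Tags are only consulted during training; infer_vector() further down
+    # takes a plain token list, so the test documents need no tag at all.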
+
+    return TaggedDocument(words=BagOfWords.extract_words(row['Text']),
+                          tags=[row['Label']])
+
+tagged_train_data = train.apply(lambda row: make_tagged_document(row), axis=1)
+print(tagged_train_data[0])
+
+tagged_test_data = test.apply(lambda row: make_tagged_document(row), axis=1)
+print(tagged_test_data[0])
+
+# note: negative=0 turns off negative sampling (and, with the default hs=0,
+# may leave the model without a training objective); it does not make the
+# learned vectors non-negative
+model = Doc2Vec(vector_size=100,
+                min_count=20,
+                epochs=40,
+                negative=0)
+
+model.build_vocab(tagged_train_data)
+
+model.train(tagged_train_data,
+            total_examples=model.corpus_count,
+            epochs=model.epochs)
+
+# number of learned document vectors (one per unique tag)
+print(model.docvecs.count)
+
+y_train = np.array([doc.tags[0] for doc in tagged_train_data])
+y_test = np.array([doc.tags[0] for doc in tagged_test_data])
+
+X_train = [model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
+X_test = [model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]
+
+# X_train = np.vstack(X_train)
+# X_test = np.vstack(X_test)
+# X_test.shape
+# y_test.shape
+# X_train.shape
+# y_train.shape
+
+print(X_test)
+print(y_test)
+print(X_train)
+print(y_train)
+
+# reshape data
+X_train = np.array(X_train)
+X_test = np.array(X_test)
+
+#X_train = X_train.reshape((X_train.shape[0],1,X_train.shape[1]))
+#X_test = X_test.reshape((X_test.shape[0],1,X_test.shape[1]))
+print(X_train.shape)
+print(X_test.shape)
+
+#fit classifier
+classifier.fit(X_train, y_train)
+#predict class
+predictions_train = classifier.predict(X_train)
+predictions_test = classifier.predict(X_test)
+
+#print and store metrics
+recall_scores = []
+precision_scores = []
+f1_scores = []
+rec = recall_score(y_test, predictions_test, average='weighted')
+print('rec: ' + str(rec))
+recall_scores.append(rec)
+prec = precision_score(y_test, predictions_test, average='weighted')
+print('prec: ' + str(prec))
+print('#')
+precision_scores.append(prec)
+# equation for f1 score
+f1_scores.append(2 * (prec * rec) / (prec + rec))
+
+##########################
+# probability estimates for the test vectors (X_test)
+class_probs = classifier.predict_proba(X_test)
+
+# number of samples encountered for each class during fitting
+# this value is weighted by the sample weight when provided
+class_count = classifier.class_count_
+
+# classes in order used
+classes = classifier.classes_
+
+# return classes and vector of class estimates
+print(recall_scores, precision_scores, f1_scores, class_probs)
\ No newline at end of file
diff --git a/src/test_2.py b/src/test_2.py
new file mode 100644
index 0000000..2b49dc9
--- /dev/null
+++ b/src/test_2.py
@@ -0,0 +1,131 @@
+from BagOfWords import BagOfWords
+
+import csv
+
+import gensim
+from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+import numpy as np
+import pandas as pd
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectPercentile
+from sklearn.metrics import recall_score, precision_score
+import sklearn
+from sklearn.model_selection import StratifiedKFold
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.model_selection import train_test_split
+
+# read current data set from csv
+df = pd.read_csv('../data/interactive_labeling_round_11.csv',
+                 sep='|',
+                 usecols=range(1, 13),    # drop first column 'unnamed'
+                 encoding='utf-8',
+                 quoting=csv.QUOTE_NONNUMERIC,
+                 quotechar='\'')
+
+# keep all labeled samples
+dataset = df.loc[df['Label'] != -1].reset_index(drop=True)
+
+X = dataset['Title'] + '. ' + dataset['Text']
+y = dataset['Label']
+
+classifier = MultinomialNB(alpha=1.0e-10,
+                           fit_prior=False,
+                           class_prior=None)
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)
+
+def read_corpus(data, tokens_only=False):
+    list_of_lists = []
+    for i, text in enumerate(data):
+        if tokens_only:
+            list_of_lists.append(BagOfWords.extract_words(text))
+        else:
+            # for training data, add tags
+            list_of_lists.append(gensim.models.doc2vec.TaggedDocument(BagOfWords.extract_words(text), [i]))
+    return list_of_lists
+
+tagged_train_data = read_corpus(X_train, tokens_only=False)
+
+print('tagged_train_data[0]:')
+print(tagged_train_data[0])
+
+tagged_test_data = read_corpus(X_test, tokens_only=False)
+
+print('tagged_test_data[0]:')
+print(tagged_test_data[0])
+
+# note: negative=0 turns off negative sampling; it does not make the
+# learned vectors non-negative
+model = Doc2Vec(vector_size=100,
+                min_count=20,
+                epochs=40,
+                negative=0)
+
+model.build_vocab(tagged_train_data)
+
+model.train(tagged_train_data,
+            total_examples=model.corpus_count,
+            epochs=model.epochs)
+
+# number of learned document vectors (one per unique tag)
+print(model.docvecs.count)
+
+#y_train = np.array([doc.tags[0] for doc in tagged_train_data])
+#y_test = np.array([doc.tags[0] for doc in tagged_test_data])
+
+X_train = [model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
+X_test = [model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]
+
+X_train = np.vstack(X_train)
+X_test = np.vstack(X_test)
+
+print(X_train.shape, y_train.shape)
+print(X_test.shape, y_test.shape)
+
+print('X_test:')
+print(X_test)
+
+print('y_test:')
+print(y_test)
+
+print('X_train:')
+print(X_train)
+
+print('y_train:')
+print(y_train)
+
+# here: ValueError: Input X must be non-negative
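+# Editor's sketch (not part of the original commit): MultinomialNB models
+# count-like data and rejects negative feature values, which is why fitting
+# it on raw Doc2Vec vectors raises the ValueError noted above. Assuming
+# scikit-learn's MinMaxScaler is acceptable here, one workaround is to
+# rescale every feature into [0, 1] using training-set statistics only:
+from sklearn.preprocessing import MinMaxScaler
+
+scaler = MinMaxScaler()                   # maps each feature to [0, 1]
+X_train = scaler.fit_transform(X_train)   # fit the scaling on training data
+# test values below the training minimum would come out negative, so clip
+X_test = np.clip(scaler.transform(X_test), 0.0, 1.0)
+# (alternatively, GaussianNB accepts real-valued features directly)
+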
+#fit classifier
+classifier.fit(X_train, y_train)
+#predict class
+predictions_train = classifier.predict(X_train)
+predictions_test = classifier.predict(X_test)
+
+#print and store metrics
+recall_scores = []
+precision_scores = []
+f1_scores = []
+rec = recall_score(y_test, predictions_test, average='weighted')
+print('rec: ' + str(rec))
+recall_scores.append(rec)
+prec = precision_score(y_test, predictions_test, average='weighted')
+print('prec: ' + str(prec))
+print('#')
+precision_scores.append(prec)
+# equation for f1 score
+f1_scores.append(2 * (prec * rec) / (prec + rec))
+
+##########################
+# probability estimates for the test vectors (X_test)
+class_probs = classifier.predict_proba(X_test)
+
+# number of samples encountered for each class during fitting
+# this value is weighted by the sample weight when provided
+class_count = classifier.class_count_
+
+# classes in order used
+classes = classifier.classes_
+
+# return classes and vector of class estimates
+print(recall_scores, precision_scores, f1_scores, class_probs)
\ No newline at end of file