update tri-model and try word2vec

annealias 2019-03-25 12:41:10 +01:00
parent c87a85b818
commit ea0a132bd6
10 changed files with 21173 additions and 199 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -209,7 +209,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -218,7 +218,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -233,7 +233,7 @@
],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_round_{}_temp.csv'.format(m),\n",
"df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
@@ -725,27 +725,15 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'pd' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-1-a51d7411db70>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;31m# read current data set from csv\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mm\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m9\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'|'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0musecols\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m13\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;31m# drop first column 'unnamed'\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mNameError\u001b[0m: name 'pd' is not defined"
]
}
],
"outputs": [],
"source": [
"# THIS CELL IS OPTIONAL\n",
"\n",
"# read current data set from csv\n",
"m = 9\n",
"df = pd.read_csv('../data/interactive_labeling_round_{}_corrected.csv'.format(m),\n",
"m = 11\n",
"df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",

View File

@@ -0,0 +1,744 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import operator\n",
"import pickle\n",
"import random\n",
"\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"from IPython.core.interactiveshell import InteractiveShell\n",
"from IPython.display import display\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
"from MNBInteractive import MNBInteractive\n",
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
"from NaiveBayes import NaiveBayes"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# initialize random => reproducible sequence\n",
"random.seed(5)\n",
"\n",
"# set up wider display area\n",
"pd.set_option('display.max_colwidth', -1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Last round number: 11\n",
"Number of manually labeled articles: 1082\n",
"Number of manually unlabeled articles: 8918\n"
]
}
],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_round_11.csv',\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')\n",
"\n",
"# find current iteration/round number\n",
"m = int(df['Round'].max())\n",
"print('Last round number: {}'.format(m))\n",
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def show_next(index):\n",
" ''' this method displays an article's text and an interactive slider to set its label manually\n",
" '''\n",
" print('News article no. {}:'.format(index))\n",
" print()\n",
" print('HEADLINE:')\n",
" print(df.loc[df['Index'] == index, 'Title'])\n",
" print()\n",
" print('TEXT:')\n",
" print(df.loc[df['Index'] == index, 'Text'])\n",
" print()\n",
" print('ESTIMATED_0:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_0'])\n",
" print()\n",
" print('ESTIMATED_1:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_1'])\n",
" print()\n",
" print('ESTIMATED_2:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_2'])\n",
" \n",
" def f(x):\n",
" # save user input\n",
" df.loc[df['Index'] == index, 'Label'] = x\n",
"\n",
" # create slider widget for labels\n",
" interact(f, x = widgets.IntSlider(min=-1, max=2, step=1, value=df.loc[df['Index'] == index, 'Label']))\n",
" print('0: Other/Unrelated news, 1: Merger,')\n",
" print('2: Topics related to deals, investments and mergers')\n",
" print('___________________________________________________________________________________________________________')\n",
" print()\n",
" print()\n",
"\n",
"# list of article indices that will be shown next\n",
"label_next = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How to find a better model:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A) Multinomial Naive Bayes Algorithm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# toDo: läuft noch nicht\n",
"\n",
"# series of indices of recently estimated articles \n",
"indices_estimated = df.loc[df['Label'] != -1, 'Index'].tolist()\n",
"\n",
"# annotate probability\n",
"n = 0\n",
"for row in class_probs[0]:\n",
" index = indices_estimated[n]\n",
" # save estimated label\n",
" df.loc[index, 'Estimated_2'] = row[1]\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
"print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n",
"print()\n",
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
"print(\"Precision (Average): {}\".format(sum(precision_scores)/10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('confusion matrix:')\n",
"print('###############')\n",
"zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n",
"zero_0\n",
"zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])\n",
"zero_1\n",
"zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])\n",
"zero_2\n",
"print('/')\n",
"one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])\n",
"one_0\n",
"one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])\n",
"one_1\n",
"one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])\n",
"one_2\n",
"print('/')\n",
"\n",
"two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])\n",
"two_0\n",
"two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n",
"two_1\n",
"two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n",
"two_2"
]
},
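{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same counts can be computed in one call with sklearn; a minimal sketch, assuming `testing_data` is a DataFrame holding the `Estimated` and `Label` columns used above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import confusion_matrix\n",
"\n",
"# rows: true label, columns: estimated label\n",
"print(confusion_matrix(testing_data['Label'], testing_data['Estimated'], labels=[0, 1, 2]))"
]
},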
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Building three separate models:\n",
"\n",
"B) One model per class: Funktioniert es besser wenn man 3 Modelle hat.\n",
"Begründung: wir sind interessiert an Klasse 1\n",
"Pro Klasse 1 Modell bauen (Hier ist das ziel das beste Modell zu finden. Dafür nehmen wir 1082 gelabelte Daten.)\n",
"3 Modelle => Ergebnis für 1 Sample: (70%, 40%, 80%) unklar => überprüfen\n",
"=> (90%, 90%, 90%) => überprüfen\n",
"liefert das bessere ambiguity Samples als oben\n",
"Stratified sample: (50 + 50 (1/2 von der anderen Klasse 1/2 der dritten Klasse))"
]
},
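{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the ambiguity check described above, assuming the per-model probabilities end up in the `Estimated_*` columns: flag articles where more than one one-vs-rest model is confident at the same time, e.g. (0.7, 0.4, 0.8)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# count per article how many of the three models exceed the threshold\n",
"threshold = 0.5\n",
"confident_models = (df[['Estimated_0', 'Estimated_1', 'Estimated_2']] > threshold).sum(axis=1)\n",
"# articles flagged by more than one model need manual review\n",
"len(df.loc[(df['Label'] != -1) & (confident_models > 1)])"
]
},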
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"847"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_0)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"50"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"185"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_2)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# add three new columns for the three models, initialize with nans\n",
"df['Estimated_0'] = np.nan\n",
"df['Estimated_1'] = np.nan\n",
"df['Estimated_2'] = np.nan"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0_0 = labeled_pos_0.sample(n=100, random_state=5)\n",
"sampling_class0_1 = labeled_pos_1.sample(n=50, random_state=5)\n",
"sampling_class0_2 = labeled_pos_2.sample(n=50, random_state=5)\n",
"\n",
"sampling_class1_0 = labeled_pos_0.sample(n=25, random_state=5)\n",
"sampling_class1_1 = sampling_class0_1\n",
"sampling_class1_2 = labeled_pos_2.sample(n=25, random_state=5)\n",
"\n",
"sampling_class2_0 = labeled_pos_0.sample(n=50, random_state=5)\n",
"sampling_class2_1 = sampling_class0_1\n",
"sampling_class2_2 = labeled_pos_2.sample(n=100, random_state=5)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0_complement = pd.concat([sampling_class0_1, sampling_class0_2])\n",
"sampling_class1_complement = pd.concat([sampling_class1_0, sampling_class1_2])\n",
"sampling_class2_complement = pd.concat([sampling_class2_0, sampling_class2_1])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# prepare for binary classification:\n",
"# pos_label = 3\n",
"sampling_class0_0['Label'] = 3\n",
"sampling_class1_1['Label'] = 3\n",
"sampling_class2_2['Label'] = 3\n",
"# neg_label = 4\n",
"sampling_class0_complement['Label'] = 4\n",
"sampling_class1_complement['Label'] = 4\n",
"sampling_class2_complement['Label'] = 4"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0 = pd.concat([sampling_class0_0, sampling_class0_complement]).reset_index(drop=True)\n",
"sampling_class1 = pd.concat([sampling_class1_1, sampling_class1_complement]).reset_index(drop=True)\n",
"sampling_class2 = pd.concat([sampling_class2_2, sampling_class2_complement]).reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Apply Naive Bayes Model to estimate all labeled articles (1082 samples):"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data = sampling_class2\n",
"indices_train = train_data['Index'].tolist()\n",
"len(indices_train)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"882"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_train))].reset_index(drop=True)\n",
"len(test_data)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"test_data.loc[(test_data['Label'] == 0), 'Label'] = 3\n",
"test_data.loc[(test_data['Label'] == 1) | (test_data['Label'] == 2), 'Label'] = 4"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# split training data into text and label set\n",
"# join title and text\n",
"X = train_data['Title'] + '. ' + train_data['Text']\n",
"y = train_data['Label']"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# split testing data into text and label set\n",
"U = test_data['Title'] + '. ' + test_data['Text']\n",
"v = test_data['Label']"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"recall: 0.19949811794228356\n",
"precision: 0.803030303030303\n"
]
}
],
"source": [
"classifier = GaussianNB()\n",
"\n",
"cv = CountVectorizer()\n",
"\n",
"# probabilities of each class\n",
"class_probs = []\n",
"\n",
"# use sklearn CountVectorizer\n",
"# fit the training data and then return the matrix\n",
"training_data = cv.fit_transform(X, y).toarray()\n",
"# transform testing data and return the matrix\n",
"testing_data = cv.transform(U).toarray()\n",
"\n",
"#fit classifier\n",
"classifier.fit(training_data, y)\n",
"\n",
"#predict class\n",
"predictions_test = classifier.predict(testing_data)\n",
"\n",
"class_probs = classifier.predict_proba(testing_data)\n",
"\n",
"#print and store metrics\n",
"rec = recall_score(v, predictions_test, pos_label=3)\n",
"print('recall: ' + str(rec))\n",
"prec = precision_score(v, predictions_test, pos_label=3)\n",
"print('precision: ' + str(prec))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class_probs[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# series of indices of recently estimated articles \n",
"indices_estimated_2 = test_data['Index'].tolist()\n",
"\n",
"# annotate probability\n",
"n = 0\n",
"for row in class_probs:\n",
" index = indices_estimated_2[n]\n",
" # save estimated label\n",
" df.loc[index, 'Estimated_2'] = row[0]\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Apply Naive Bayes Model (10-fold-cross validation):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = sampling_class0\n",
"\n",
"X = dataset['Title'] + '. ' + dataset['Text']\n",
"y = dataset['Label']\n",
"\n",
"cv = CountVectorizer()\n",
"\n",
"# use stratified k-fold cross-validation as split method\n",
"skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)\n",
"\n",
"classifier = GaussianNB()\n",
"\n",
"# metrics\n",
"recall_scores = []\n",
"precision_scores = []\n",
"\n",
"# probabilities of each class (of each fold)\n",
"class_probs = []\n",
"# counts number of training samples observed in each class \n",
"class_counts = []\n",
"\n",
"# for each fold\n",
"for train, test in skf.split(X,y):\n",
" \n",
" # fit the training data and then return the matrix\n",
" training_data = cv.fit_transform(X[train], y[train]).toarray()\n",
" # transform testing data and return the matrix\n",
" testing_data = cv.transform(X[test]).toarray()\n",
"\n",
" #fit classifier\n",
" classifier.fit(training_data, y[train])\n",
" #predict class\n",
" predictions_train = classifier.predict(training_data)\n",
" predictions_test = classifier.predict(testing_data)\n",
"\n",
" #print and store metrics\n",
" rec = recall_score(y[test], predictions_test)\n",
" recall_scores.append(rec)\n",
" prec = precision_score(y[test], predictions_test)\n",
" precision_scores.append(prec)\n",
"\n",
" class_probs.append(classifier.class_prior_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# series of indices of recently estimated articles \n",
"indices_estimated_0 = sampling_class0['Index'].tolist()\n",
"\n",
"# annotate probability\n",
"n = 0\n",
"for row in class_probs:\n",
" index = indices_estimated_0[n]\n",
" # save estimated label\n",
" df.loc[index, 'Estimated_0'] = row[1]\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
"print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n",
"print()\n",
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
"print(\"Precision (Average): {}\".format(sum(precision_scores)/10))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of used samples:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"indices_all_samples = set((indices_estimated_0 + indices_estimated_1) + indices_estimated_2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(indices_all_samples)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check if there are samples where more than one class was marked with 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"indices = df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0), 'Index'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# save tri-model to csv \n",
"df.to_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n",
" sep='|',\n",
" mode='w',\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n",
" sep='|',\n",
" usecols=range(1,16), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for index in indices:\n",
" show_next(index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@@ -53,7 +53,7 @@ class BagOfWords:
for word in words:
word = word.lower()
# check if alphabetic and not stop word
if (word.isalpha()):# and word not in stop_words):
if (word.isalpha() and word not in stop_words):
if stemming:
# reduce word to its stem
word = stemmer.stem(word)

View File

@@ -1,6 +1,6 @@
'''
Multinomial Naive Bayes Classifier
======================
==================================
'''
from BagOfWords import BagOfWords
@@ -11,6 +11,7 @@ import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
@@ -19,19 +20,13 @@ class MultinomialNaiveBayes:
def make_mnb(dataset, sklearn_cv=True, percentile=100):
'''fits naive bayes model with StratifiedKFold
'''
print('# starting classical multinomial naive bayes')
print('# starting multinomial naive bayes')
print('# ...')
# split data into text and label set
# join title and text
X = dataset['Title'] + '. ' + dataset['Text']
print(X[:12])
y = dataset['Label']
print(y[:12])
# everything is still fine up to this point...
if sklearn_cv:
cv = CountVectorizer()
@@ -63,14 +58,6 @@ class MultinomialNaiveBayes:
if sklearn_cv:
# use sklearn CountVectorizer
# fit the training data and then return the matrix
print('Title + Text of train')
# but from here on something is wrong. somehow NaNs slip in...
print(X[train])
print('Label of train')
print(y[train])
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
@@ -136,8 +123,12 @@ class MultinomialNaiveBayes:
# classes in order used
classes = classifier.classes_
print('average: recall, precision, f1 score')
print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
# return classes and vector of class estimates
return recall_scores, precision_scores, f1_scores
return recall_scores, precision_scores, f1_scores, class_probs
######## only needed for resubstitution error ########
def analyze_errors(training, testing):
@@ -204,7 +195,4 @@ if __name__ == '__main__':
quotechar='\'')
# select only labeled articles
#print('Number of all labeled articles:')
#print(len(df.loc[df['Label'] != -1]))
#print(df.loc[df['Label'] != -1][:5])
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reindex(), sklearn_cv=True, percentile=100)
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=True, percentile=100)

View File

@@ -0,0 +1,135 @@
'''
Multinomial Naive Bayes Classifier with Doc2Vec Features
========================================================
'''
from BagOfWords import BagOfWords
import csv
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
class MultinomialNaiveBayes:
def make_mnb(dataset, sklearn_cv=True, percentile=100):
'''fits naive bayes model with StratifiedKFold
'''
vector_size=150
def read_corpus(data, tokens_only=False):
list_of_lists = []
for i, text in enumerate(data):
if tokens_only:
list_of_lists.append(BagOfWords.extract_words(text))
else:
# For training data, add tags
list_of_lists.append(gensim.models.doc2vec.TaggedDocument(BagOfWords.extract_words(text), [i]))
return list_of_lists
print('# starting multinomial naive bayes')
print('# ...')
# split data into text and label set
# join title and text
X = dataset['Title'] + '. ' + dataset['Text']
y = dataset['Label']
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
classifier = MultinomialNB(alpha=1.0e-10,
fit_prior=False,
class_prior=None)
# metrics
recall_scores = []
precision_scores = []
f1_scores = []
# probabilities of each class (of each fold)
#class_prob = []
# counts number of training samples observed in each class
#class_counts = []
# for each fold
n = 0
for train, test in skf.split(X,y):
n += 1
print('# split no. ' + str(n))
# train model with gensim
training_data = read_corpus(X[train], tokens_only=False)
testing_data = read_corpus(X[test], tokens_only=True)
all_data = read_corpus(X, tokens_only=False)
# instantiate a Doc2Vec object
doc2vec_model = Doc2Vec(training_data, vector_size=vector_size, window=2, min_count=1, workers=4)
print(doc2vec_model.docvecs[0])
print(doc2vec_model.docvecs[1])
print(doc2vec_model.docvecs[2])
training_data = [doc2vec_model.docvecs[i] for i in range(len(training_data))]
testing_data = [doc2vec_model.infer_vector(vector) for vector in testing_data]
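# note: Doc2Vec vectors generally contain negative components, while
# MultinomialNB only accepts non-negative features, so the fit below may
# raise a ValueError; shifting the vectors to be non-negative or switching
# to GaussianNB is one possible workaround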
#fit classifier
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
#print and store metrics
rec = recall_score(y[test], predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y[test], predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
##########################
# probability estimates for the test vector (testing_data)
class_probs = classifier.predict_proba(testing_data)
# number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_
# classes in order used
classes = classifier.classes_
print('average: recall, precision, f1 score')
print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
# return classes and vector of class estimates
return recall_scores, precision_scores, f1_scores, class_probs
if __name__ == '__main__':
# read csv file
print('# reading dataset')
print('# ...')
# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
sep='|',
usecols=range(1,13), # drop first column 'unnamed'
encoding='utf-8',
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
# select only labeled articles
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1][:100].reset_index(drop=True), sklearn_cv=False, percentile=100)

View File

@@ -48,7 +48,7 @@ class NaiveBayes:
# metrics
recall_scores = []
precision_scores = []
f1_scores = []
#f1_scores = []
# probabilities of each class (of each fold)
class_prob = []
@@ -106,37 +106,39 @@ class NaiveBayes:
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
#f1_scores.append(2 * (prec * rec)/(prec + rec))
class_prob.append(classifier.class_prior_)
class_counts.append(classifier.class_count_)
##########################
#print metrics of test set
print('-------------------------')
print('prediction of testing set:')
print('Precision score: min = {}, max = {}, average = {}'
.format(min(precision_scores),
max(precision_scores),
sum(precision_scores)/float(len(precision_scores))))
print('Recall score: min = {}, max = {}, average = {}'
.format(min(recall_scores),
max(recall_scores),
sum(recall_scores)/float(len(recall_scores))))
print('F1 score: min = {}, max = {}, average = {}'
.format(min(f1_scores),
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
# print probability of each class
print('probability of each class:')
print()
print(class_prob)
print()
print('number of samples of each class:')
print()
print(class_counts)
print()
# print('-------------------------')
# print('prediction of testing set:')
# print('Precision score: min = {}, max = {}, average = {}'
# .format(min(precision_scores),
# max(precision_scores),
# sum(precision_scores)/float(len(precision_scores))))
# print('Recall score: min = {}, max = {}, average = {}'
# .format(min(recall_scores),
# max(recall_scores),
# sum(recall_scores)/float(len(recall_scores))))
# print('F1 score: min = {}, max = {}, average = {}'
# .format(min(f1_scores),
# max(f1_scores),
# sum(f1_scores)/float(len(f1_scores))))
# print()
# # print probability of each class
# print('probability of each class:')
# print()
# print(class_prob)
# print()
# print('number of samples of each class:')
# print()
# print(class_counts)
# print()
return class_prob, class_counts, recall_scores, precision_scores#, f1_scores
##### nur für overfit testing ###########
#print('overfit testing: prediction of training set')

121 src/SVM_multiclass.py Normal file
View File

@@ -0,0 +1,121 @@
'''
Support Vector Machines (SVM) Classifier
========================================
The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
from BagOfWords import BagOfWords
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
class SVM:
def make_svm(dataset, sklearn_cv=True):
print('# fitting model')
print('# ...')
# split data into text and label set
# articles' text (title + text)
X = dataset['Title'] + '. ' + dataset['Text']
# articles' labels
y = dataset['Label']
matrix = pd.DataFrame()
# fit the training data and then return the matrix
if sklearn_cv:
# use sklearn CountVectorizer
matrix = CountVectorizer().fit_transform(X).toarray()
else:
# use own BOW implementation
matrix = BagOfWords.fit_transform(X)
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
# use only most important features
selector = SelectPercentile()
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75, 100],
'SVC__kernel': ['linear'],
'SVC__gamma': [0.00001, 0.0001],
'SVC__C': [0.1, 1]},
cv=skf,
scoring=make_scorer(f1_score, average='micro'))
print('# fit classifier')
print('# ...')
grid.fit(matrix,y)
# results of the grid search (a dict of arrays)
df_results = grid.cv_results_
# print results
######################
print('RESULTS:')
print('')
print('mean_test_score:')
print(df_results['mean_test_score'])
print('')
print('mean of means:')
print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
print('')
print('best score:')
print(grid.best_score_)
print()
print('best parameters set found on development set:')
print(grid.best_params_)
print()
if __name__ == '__main__':
print('# starting svm')
print('# ...')
#file = '..\\data\\classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('# ...')
# data = pd.read_csv(file,
# sep='|',
# engine='python',
# decimal='.',
# quotechar='\'',
# quoting=csv.QUOTE_NONE)
# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
sep='|',
usecols=range(1,13), # drop first column 'unnamed'
encoding='utf-8',
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
data = df.loc[df['Label'] != -1].reset_index(drop=True)
use_count_vectorizer = True
SVM.make_svm(data, use_count_vectorizer)
print('# ending svm')
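# note: make_svm only reports cross-validated scores; to label new articles,
# one could return `grid` from make_svm and reuse its refitted best estimator
# (a sketch, names assumed):
# best_pipeline = grid.best_estimator_
# predicted_labels = best_pipeline.predict(new_matrix)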

View File

@@ -1,6 +0,0 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}