update tri-model and try word2vec

This commit is contained in:
annealias 2019-03-25 12:41:10 +01:00
parent c87a85b818
commit ea0a132bd6
10 changed files with 21173 additions and 199 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -24,7 +24,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 20,
+    "execution_count": 1,
     "metadata": {},
     "outputs": [],
     "source": [
@ -209,7 +209,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 7,
+    "execution_count": 4,
     "metadata": {},
     "outputs": [],
     "source": [
@ -218,7 +218,7 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 8,
+    "execution_count": 6,
     "metadata": {},
     "outputs": [
     {
@ -233,7 +233,7 @@
    ],
    "source": [
     "# read current data set from csv\n",
-    "df = pd.read_csv('../data/interactive_labeling_round_{}_temp.csv'.format(m),\n",
+    "df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n",
     "                 sep='|',\n",
     "                 usecols=range(1,13), # drop first column 'unnamed'\n",
     "                 encoding='utf-8',\n",
@ -725,27 +725,15 @@
    },
    {
     "cell_type": "code",
-    "execution_count": 1,
+    "execution_count": 4,
     "metadata": {},
-    "outputs": [
-     {
-      "ename": "NameError",
-      "evalue": "name 'pd' is not defined",
-      "output_type": "error",
-      "traceback": [
-       "---------------------------------------------------------------------------",
-       "NameError                                 Traceback (most recent call last)",
-       "<ipython-input-1-a51d7411db70> in <module>",
-       "      3 # read current data set from csv",
-       "      4 m = 9",
-       "----> 5 df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),",
-       "      6                  sep='|',",
-       "      7                  usecols=range(1,13), # drop first column 'unnamed'",
-       "NameError: name 'pd' is not defined"
-      ]
-     }
-    ],
+    "outputs": [],
     "source": [
     "# THIS CELL IS OPTIONAL\n",
     "\n",
     "# read current data set from csv\n",
-    "m = 9\n",
-    "df = pd.read_csv('../data/interactive_labeling_round_{}_corrected.csv'.format(m),\n",
+    "m = 11\n",
+    "df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n",
     "                 sep='|',\n",
     "                 usecols=range(1,13), # drop first column 'unnamed'\n",
     "                 encoding='utf-8',\n",

View File

@ -0,0 +1,744 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model Evaluation"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import operator\n",
"import pickle\n",
"import random\n",
"\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"from IPython.core.interactiveshell import InteractiveShell\n",
"from IPython.display import display\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"\n",
"from MNBInteractive import MNBInteractive\n",
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
"from NaiveBayes import NaiveBayes"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# initialize random => reproducible sequence\n",
"random.seed(5)\n",
"\n",
"# set up wider display area\n",
"pd.set_option('display.max_colwidth', -1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Last round number: 11\n",
"Number of manually labeled articles: 1082\n",
"Number of manually unlabeled articles: 8918\n"
]
}
],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_round_11.csv',\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')\n",
"\n",
"# find current iteration/round number\n",
"m = int(df['Round'].max())\n",
"print('Last round number: {}'.format(m))\n",
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def show_next(index):\n",
" ''' this method displays an article's text and an interactive slider to set its label manually\n",
" '''\n",
" print('News article no. {}:'.format(index))\n",
" print()\n",
" print('HEADLINE:')\n",
" print(df.loc[df['Index'] == index, 'Title'])\n",
" print()\n",
" print('TEXT:')\n",
" print(df.loc[df['Index'] == index, 'Text'])\n",
" print()\n",
" print('ESTIMATED_0:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_0'])\n",
" print()\n",
" print('ESTIMATED_1:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_1'])\n",
" print()\n",
" print('ESTIMATED_2:')\n",
" print(df.loc[df['Index'] == index, 'Estimated_2'])\n",
" \n",
" def f(x):\n",
" # save user input\n",
" df.loc[df['Index'] == index, 'Label'] = x\n",
"\n",
" # create slider widget for labels\n",
" interact(f, x = widgets.IntSlider(min=-1, max=2, step=1, value=df.loc[df['Index'] == index, 'Label']))\n",
" print('0: Other/Unrelated news, 1: Merger,')\n",
" print('2: Topics related to deals, investments and mergers')\n",
" print('___________________________________________________________________________________________________________')\n",
" print()\n",
" print()\n",
"\n",
"# list of article indices that will be shown next\n",
"label_next = []"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## How to find a better model:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A) Multinomial Naive Bayes Algorithm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# toDo: läuft noch nicht\n",
"\n",
"# series of indices of recently estimated articles \n",
"indices_estimated = df.loc[df['Label'] != -1, 'Index'].tolist()\n",
"\n",
"# annotate probability\n",
"n = 0\n",
"for row in class_probs[0]:\n",
" index = indices_estimated[n]\n",
" # save estimated label\n",
" df.loc[index, 'Estimated_2'] = row[1]\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
"print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n",
"print()\n",
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
"print(\"Precision (Average): {}\".format(sum(precision_scores)/10))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('confusion matrix:')\n",
"print('###############')\n",
"zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n",
"zero_0\n",
"zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])\n",
"zero_1\n",
"zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])\n",
"zero_2\n",
"print('/')\n",
"one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])\n",
"one_0\n",
"one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])\n",
"one_1\n",
"one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])\n",
"one_2\n",
"print('/')\n",
"\n",
"two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])\n",
"two_0\n",
"two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n",
"two_1\n",
"two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n",
"two_2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Building three separate models:\n",
"\n",
"B) One model per class: Funktioniert es besser wenn man 3 Modelle hat.\n",
"Begründung: wir sind interessiert an Klasse 1\n",
"Pro Klasse 1 Modell bauen (Hier ist das ziel das beste Modell zu finden. Dafür nehmen wir 1082 gelabelte Daten.)\n",
"3 Modelle => Ergebnis für 1 Sample: (70%, 40%, 80%) unklar => überprüfen\n",
"=> (90%, 90%, 90%) => überprüfen\n",
"liefert das bessere ambiguity Samples als oben\n",
"Stratified sample: (50 + 50 (1/2 von der anderen Klasse 1/2 der dritten Klasse))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"847"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_0)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"50"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_1)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"185"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(labeled_pos_2)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# add three new columns for the three models, initialize with nans\n",
"df['Estimated_0'] = np.nan\n",
"df['Estimated_1'] = np.nan\n",
"df['Estimated_2'] = np.nan"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0_0 = labeled_pos_0.sample(n=100, random_state=5)\n",
"sampling_class0_1 = labeled_pos_1.sample(n=50, random_state=5)\n",
"sampling_class0_2 = labeled_pos_2.sample(n=50, random_state=5)\n",
"\n",
"sampling_class1_0 = labeled_pos_0.sample(n=25, random_state=5)\n",
"sampling_class1_1 = sampling_class0_1\n",
"sampling_class1_2 = labeled_pos_2.sample(n=25, random_state=5)\n",
"\n",
"sampling_class2_0 = labeled_pos_0.sample(n=50, random_state=5)\n",
"sampling_class2_1 = sampling_class0_1\n",
"sampling_class2_2 = labeled_pos_2.sample(n=100, random_state=5)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0_complement = pd.concat([sampling_class0_1, sampling_class0_2])\n",
"sampling_class1_complement = pd.concat([sampling_class1_0, sampling_class1_2])\n",
"sampling_class2_complement = pd.concat([sampling_class2_0, sampling_class2_1])"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# prepare for binary classification:\n",
"# pos_label = 3\n",
"sampling_class0_0['Label'] = 3\n",
"sampling_class1_1['Label'] = 3\n",
"sampling_class2_2['Label'] = 3\n",
"# neg_label = 4\n",
"sampling_class0_complement['Label'] = 4\n",
"sampling_class1_complement['Label'] = 4\n",
"sampling_class2_complement['Label'] = 4"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"sampling_class0 = pd.concat([sampling_class0_0, sampling_class0_complement]).reset_index(drop=True)\n",
"sampling_class1 = pd.concat([sampling_class1_1, sampling_class1_complement]).reset_index(drop=True)\n",
"sampling_class2 = pd.concat([sampling_class2_2, sampling_class2_complement]).reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Apply Naive Bayes Model to estimate all labeled articles (1082 samples):"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"200"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data = sampling_class2\n",
"indices_train = train_data['Index'].tolist()\n",
"len(indices_train)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"882"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_train))].reset_index(drop=True)\n",
"len(test_data)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"test_data.loc[(test_data['Label'] == 0), 'Label'] = 3\n",
"test_data.loc[(test_data['Label'] == 1) | (test_data['Label'] == 2), 'Label'] = 4"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# split training data into text and label set\n",
"# join title and text\n",
"X = train_data['Title'] + '. ' + train_data['Text']\n",
"y = train_data['Label']"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"# split testing data into text and label set\n",
"U = test_data['Title'] + '. ' + test_data['Text']\n",
"v = test_data['Label']"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"recall: 0.19949811794228356\n",
"precision: 0.803030303030303\n"
]
}
],
"source": [
"classifier = GaussianNB()\n",
"\n",
"cv = CountVectorizer()\n",
"\n",
"# probabilities of each class\n",
"class_probs = []\n",
"\n",
"# use sklearn CountVectorizer\n",
"# fit the training data and then return the matrix\n",
"training_data = cv.fit_transform(X, y).toarray()\n",
"# transform testing data and return the matrix\n",
"testing_data = cv.transform(U).toarray()\n",
"\n",
"#fit classifier\n",
"classifier.fit(training_data, y)\n",
"\n",
"#predict class\n",
"predictions_test = classifier.predict(testing_data)\n",
"\n",
"class_probs = classifier.predict_proba(testing_data)\n",
"\n",
"#print and store metrics\n",
"rec = recall_score(v, predictions_test, pos_label=3)\n",
"print('recall: ' + str(rec))\n",
"prec = precision_score(v, predictions_test, pos_label=3)\n",
"print('precision: ' + str(prec))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class_probs[:10]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# series of indices of recently estimated articles \n",
"indices_estimated_2 = test_data['Index'].tolist()\n",
"\n",
"# annotate probability\n",
"n = 0\n",
"for row in class_probs:\n",
" index = indices_estimated_2[n]\n",
" # save estimated label\n",
" df.loc[index, 'Estimated_2'] = row[0]\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Apply Naive Bayes Model (10-fold-cross validation):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset = sampling_class0\n",
"\n",
"X = dataset['Title'] + '. ' + dataset['Text']\n",
"y = dataset['Label']\n",
"\n",
"cv = CountVectorizer()\n",
"\n",
"# use stratified k-fold cross-validation as split method\n",
"skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)\n",
"\n",
"classifier = GaussianNB()\n",
"\n",
"# metrics\n",
"recall_scores = []\n",
"precision_scores = []\n",
"\n",
"# probabilities of each class (of each fold)\n",
"class_probs = []\n",
"# counts number of training samples observed in each class \n",
"class_counts = []\n",
"\n",
"# for each fold\n",
"for train, test in skf.split(X,y):\n",
" \n",
" # fit the training data and then return the matrix\n",
" training_data = cv.fit_transform(X[train], y[train]).toarray()\n",
" # transform testing data and return the matrix\n",
" testing_data = cv.transform(X[test]).toarray()\n",
"\n",
" #fit classifier\n",
" classifier.fit(training_data, y[train])\n",
" #predict class\n",
" predictions_train = classifier.predict(training_data)\n",
" predictions_test = classifier.predict(testing_data)\n",
"\n",
" #print and store metrics\n",
" rec = recall_score(y[test], predictions_test)\n",
" recall_scores.append(rec)\n",
" prec = precision_score(y[test], predictions_test)\n",
" precision_scores.append(prec)\n",
"\n",
" class_probs.append(classifier.class_prior_)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# series of indices of recently estimated articles \n",
"indices_estimated_0 = sampling_class0['Index'].tolist()\n",
"\n",
"# annotate probability\n",
"n = 0\n",
"for row in class_probs:\n",
" index = indices_estimated_0[n]\n",
" # save estimated label\n",
" df.loc[index, 'Estimated_0'] = row[1]\n",
" n += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
"print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n",
"print()\n",
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
"print(\"Precision (Average): {}\".format(sum(precision_scores)/10))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of used samples:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"indices_all_samples = set((indices_estimated_0 + indices_estimated_1) + indices_estimated_2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(indices_all_samples)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check if there are samples where more than one class was marked with 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"indices = df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0), 'Index'].tolist()"
]
},
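{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (illustration only) of an alternative ambiguity criterion over the three probability columns created above: flag a labeled sample when no single model is confident, or when not exactly one model claims it. The 0.9 and 0.5 thresholds are arbitrary choices for demonstration, not tuned values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# illustration only: flag a sample as ambiguous if no model is confident,\n",
"# or if not exactly one model claims it (thresholds are arbitrary)\n",
"est = df.loc[df['Label'] != -1, ['Estimated_0', 'Estimated_1', 'Estimated_2']]\n",
"ambiguous = est[(est.max(axis=1) < 0.9) | ((est > 0.5).sum(axis=1) != 1)]\n",
"print('{} ambiguous samples to check manually'.format(len(ambiguous)))"
]
},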
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# save tri-model to csv \n",
"df.to_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n",
" sep='|',\n",
" mode='w',\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n",
" sep='|',\n",
" usecols=range(1,16), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for index in indices:\n",
" show_next(index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -53,7 +53,7 @@ class BagOfWords:
             for word in words:
                 word = word.lower()
                 # check if alphabetic and not stop word
-                if (word.isalpha()):# and word not in stop_words):
+                if (word.isalpha() and word not in stop_words):
                     if stemming:
                         # reduce word to its stem
                         word = stemmer.stem(word)

View File

@ -1,6 +1,6 @@
 '''
 Multinomial Naive Bayes Classifier
-======================
+==================================
 '''
 from BagOfWords import BagOfWords
@ -11,6 +11,7 @@ import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import recall_score, precision_score
+import sklearn
 from sklearn.model_selection import StratifiedKFold
 from sklearn.naive_bayes import MultinomialNB
@ -19,19 +20,13 @@ class MultinomialNaiveBayes:
     def make_mnb(dataset, sklearn_cv=True, percentile=100):
         '''fits naive bayes model with StratifiedKFold
         '''
-        print('# starting classical multinomial naive bayes')
+        print('# starting multinomial naive bayes')
         print('# ...')
         # split data into text and label set
         # join title and text
         X = dataset['Title'] + '. ' + dataset['Text']
-        print(X[:12])
         y = dataset['Label']
-        print(y[:12])
-        # everything is still correct up to this point...
         if sklearn_cv:
             cv = CountVectorizer()
@ -63,14 +58,6 @@ class MultinomialNaiveBayes:
         if sklearn_cv:
             # use sklearn CountVectorizer
             # fit the training data and then return the matrix
-            print('Title + Text von train')
-            # but something goes wrong from here on: somehow NaNs slip in...
-            print(X[train])
-            print('Label von train')
-            print(y[train])
             training_data = cv.fit_transform(X[train], y[train]).toarray()
             # transform testing data and return the matrix
@ -136,8 +123,12 @@ class MultinomialNaiveBayes:
         # classes in order used
         classes = classifier.classes_
+        print('average: recall, precision, f1 score')
+        print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
         # return classes and vector of class estimates
-        return recall_scores, precision_scores, f1_scores
+        return recall_scores, precision_scores, f1_scores, class_probs
     ######## only needed for resubstitution error ########
     def analyze_errors(training, testing):
@ -204,7 +195,4 @@ if __name__ == '__main__':
                      quotechar='\'')
     # select only labeled articles
-    #print('Anzahl aller gelabelten:')
-    #print(len(df.loc[df['Label'] != -1]))
-    #print(df.loc[df['Label'] != -1][:5])
-    MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reindex(), sklearn_cv=True, percentile=100)
+    MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=True, percentile=100)

View File

@ -0,0 +1,135 @@
'''
Multinomial Naive Bayes Classifier (doc2vec features)
======================================================
'''
from BagOfWords import BagOfWords

import csv
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

class MultinomialNaiveBayes:

    def make_mnb(dataset, sklearn_cv=True, percentile=100):
        '''fits naive bayes model with StratifiedKFold
        '''
        vector_size = 150

        def read_corpus(data, tokens_only=False):
            list_of_lists = []
            for i, text in enumerate(data):
                if tokens_only:
                    list_of_lists.append(BagOfWords.extract_words(text))
                else:
                    # for training data, add tags
                    list_of_lists.append(gensim.models.doc2vec.TaggedDocument(BagOfWords.extract_words(text), [i]))
            return list_of_lists

        print('# starting multinomial naive bayes')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)

        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        #class_prob = []
        # counts number of training samples observed in each class
        #class_counts = []

        # for each fold
        n = 0
        for train, test in skf.split(X,y):
            n += 1
            print('# split no. ' + str(n))

            # train document vectors with gensim
            training_data = read_corpus(X[train], tokens_only=False)
            testing_data = read_corpus(X[test], tokens_only=True)

            # instantiate a Doc2Vec object and embed both sets
            doc2vec_model = Doc2Vec(training_data, vector_size=vector_size, window=2, min_count=1, workers=4)
            training_data = [doc2vec_model.docvecs[i] for i in range(len(training_data))]
            testing_data = [doc2vec_model.infer_vector(vector) for vector in testing_data]

            # added fix: MultinomialNB requires non-negative features, but
            # doc2vec components can be negative => rescale to [0, 1]
            scaler = MinMaxScaler()
            training_data = scaler.fit_transform(training_data)
            testing_data = np.clip(scaler.transform(testing_data), 0, 1)

            #fit classifier
            classifier.fit(training_data, y[train])
            #predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            #print and store metrics
            rec = recall_score(y[test], predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test, average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))

        ##########################
        # probability estimates for the test vectors of the last fold
        class_probs = classifier.predict_proba(testing_data)

        # number of samples encountered for each class during fitting
        # this value is weighted by the sample weight when provided
        class_count = classifier.class_count_

        # classes in order used
        classes = classifier.classes_

        print('average: recall, precision, f1 score')
        print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)

        # return metrics and vector of class estimates
        return recall_scores, precision_scores, f1_scores, class_probs

if __name__ == '__main__':

    # read csv file
    print('# reading dataset')
    print('# ...')

    # read current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1,13), # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # select only labeled articles
    MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1][:100].reset_index(drop=True), sklearn_cv=False, percentile=100)

View File

@ -25,182 +25,184 @@ from sklearn.naive_bayes import GaussianNB
 class NaiveBayes:
     def make_naive_bayes(dataset, sklearn_cv=True, percentile=100):
         '''fits naive bayes model with StratifiedKFold,
         uses my BOW
         '''
         print('# fitting model')
         print('# ...')
         # split data into text and label set
         # join title and text
         X = dataset['Title'] + '. ' + dataset['Text']
         y = dataset['Label']
         if sklearn_cv:
             cv = CountVectorizer()
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
         classifier = GaussianNB()
         # metrics
         recall_scores = []
         precision_scores = []
-        f1_scores = []
+        #f1_scores = []
         # probabilities of each class (of each fold)
         class_prob = []
         # counts number of training samples observed in each class
         class_counts = []
         # for each fold
         n = 0
         for train, test in skf.split(X,y):
             n += 1
             print('# split no. ' + str(n))
             if sklearn_cv:
                 # use sklearn CountVectorizer
                 # fit the training data and then return the matrix
                 training_data = cv.fit_transform(X[train], y[train]).toarray()
                 # transform testing data and return the matrix
                 testing_data = cv.transform(X[test]).toarray()
             else:
                 # use my own BagOfWords python implementation
                 stemming = True
                 rel_freq = True
                 extracted_words = BagOfWords.extract_all_words(X[train])
                 vocab = BagOfWords.make_vocab(extracted_words)
                 # fit the training data and then return the matrix
                 training_data = BagOfWords.make_matrix(extracted_words,
                                                        vocab, rel_freq, stemming)
                 # transform testing data and return the matrix
                 extracted_words = BagOfWords.extract_all_words(X[test])
                 testing_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq, stemming)
             # apply select percentile
             selector = SelectPercentile(percentile=percentile)
             selector.fit(training_data, y[train])
             # new reduced data sets
             training_data_r = selector.transform(training_data)
             testing_data_r = selector.transform(testing_data)
             #fit classifier
             classifier.fit(training_data_r, y[train])
             #predict class
             predictions_train = classifier.predict(training_data_r)
             predictions_test = classifier.predict(testing_data_r)
             #print and store metrics
             rec = recall_score(y[test], predictions_test)
             print('rec: ' + str(rec))
             recall_scores.append(rec)
             prec = precision_score(y[test], predictions_test)
             print('prec: ' + str(prec))
             print('#')
             precision_scores.append(prec)
             # equation for f1 score
-            f1_scores.append(2 * (prec * rec)/(prec + rec))
+            #f1_scores.append(2 * (prec * rec)/(prec + rec))
             class_prob.append(classifier.class_prior_)
             class_counts.append(classifier.class_count_)
         ##########################
         #print metrics of test set
-        print('-------------------------')
-        print('prediction of testing set:')
-        print('Precision score: min = {}, max = {}, average = {}'
-              .format(min(precision_scores),
-                      max(precision_scores),
-                      sum(precision_scores)/float(len(precision_scores))))
-        print('Recall score: min = {}, max = {}, average = {}'
-              .format(min(recall_scores),
-                      max(recall_scores),
-                      sum(recall_scores)/float(len(recall_scores))))
-        print('F1 score: min = {}, max = {}, average = {}'
-              .format(min(f1_scores),
-                      max(f1_scores),
-                      sum(f1_scores)/float(len(f1_scores))))
-        print()
-        # print probability of each class
-        print('probability of each class:')
-        print()
-        print(class_prob)
-        print()
-        print('number of samples of each class:')
-        print()
-        print(class_counts)
-        print()
+        # print('-------------------------')
+        # print('prediction of testing set:')
+        # print('Precision score: min = {}, max = {}, average = {}'
+        #       .format(min(precision_scores),
+        #               max(precision_scores),
+        #               sum(precision_scores)/float(len(precision_scores))))
+        # print('Recall score: min = {}, max = {}, average = {}'
+        #       .format(min(recall_scores),
+        #               max(recall_scores),
+        #               sum(recall_scores)/float(len(recall_scores))))
+        # print('F1 score: min = {}, max = {}, average = {}'
+        #       .format(min(f1_scores),
+        #               max(f1_scores),
+        #               sum(f1_scores)/float(len(f1_scores))))
+        # print()
+        # # print probability of each class
+        # print('probability of each class:')
+        # print()
+        # print(class_prob)
+        # print()
+        # print('number of samples of each class:')
+        # print()
+        # print(class_counts)
+        # print()
+        return class_prob, class_counts, recall_scores, precision_scores#, f1_scores
 
         ##### only needed for overfit testing ###########
         #print('overfit testing: prediction of training set')
         #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
         #format(min(f1_scores_train), max(f1_scores_train),
         #sum(f1_scores_train)/float(len(f1_scores_train))))
         #print()
 
     ######## only needed for resubstitution error ########
     def analyze_errors(dataset):
         '''calculates resubstitution error
         shows indices of false classified articles
         uses Gaussian Bayes with train test split
         '''
         X_train_test = dataset['Title'] + ' ' + dataset['Text']
         y_train_test = dataset['Label']
         count_vector = CountVectorizer()
         # fit the training data and then return the matrix
         training_data = count_vector.fit_transform(X_train_test).toarray()
         # transform testing data and return the matrix
         testing_data = count_vector.transform(X_train_test).toarray()
         # Naive Bayes
         classifier = GaussianNB()
         # fit classifier
         classifier.fit(training_data, y_train_test)
         # Predict class
         predictions = classifier.predict(testing_data)
         print('Errors at index:')
         print()
         n = 0
         for i in range(len(y_train_test)):
             if y_train_test[i] != predictions[i]:
                 n += 1
                 print('error no.{}'.format(n))
                 print('prediction at index {} is: {}, but actual is: {}'
                       .format(i, predictions[i], y_train_test[i]))
                 print(X_train_test[i])
                 print(y_train_test[i])
                 print()
         #print metrics
         print('F1 score: ', format(f1_score(y_train_test, predictions)))
 
 if __name__ == '__main__':
 
     print('# starting naive bayes')
     print('# ...')
 
     file = '..\\data\\classification_labelled_corrected.csv'
 
     # read csv file
     print('# reading dataset')
     print('# ...')
 
     data = pd.read_csv(file,
                        sep='|',
                        engine='python',
                        decimal='.',
                        quotechar='\'',
                        quoting=csv.QUOTE_NONE)
 
     make_naive_bayes(data)
 
     print('#')
     print('# ending naive bayes')

src/SVM_multiclass.py (new file, 121 lines)
View File

@ -0,0 +1,121 @@
'''
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
from BagOfWords import BagOfWords

import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

class SVM:

    def make_svm(dataset, sklearn_cv=True):

        print('# fitting model')
        print('# ...')

        # split data into text and label set
        # articles' text (title + text)
        X = dataset['Title'] + '. ' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        matrix = pd.DataFrame()

        # fit the training data and then return the matrix
        if sklearn_cv:
            # use sklearn CountVectorizer
            matrix = CountVectorizer().fit_transform(X).toarray()
        else:
            # use own BOW implementation
            matrix = BagOfWords.fit_transform(X)

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True)

        # use only most important features; the percentile itself
        # is tuned via the grid below
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

        grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75, 100],
                                       'SVC__kernel': ['linear'],
                                       'SVC__gamma': [0.00001, 0.0001],
                                       'SVC__C': [0.1, 1]},
                            cv=skf,
                            scoring=make_scorer(f1_score, average='micro'))

        print('# fit classifier')
        print('# ...')

        grid.fit(matrix, y)

        # dict of results
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()

if __name__ == '__main__':

    print('# starting svm')
    print('# ...')

    #file = '..\\data\\classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')

    # data = pd.read_csv(file,
    #                    sep='|',
    #                    engine='python',
    #                    decimal='.',
    #                    quotechar='\'',
    #                    quoting=csv.QUOTE_NONE)

    # read current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1,13), # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    data = df.loc[df['Label'] != -1].reset_index(drop=True)

    use_count_vectorizer = True
    # call via the class, since make_svm is defined inside SVM
    SVM.make_svm(data, use_count_vectorizer)

    print('# ending svm')

View File

@ -1,6 +0,0 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}