update tri-model and try word2vec
This commit is contained in:
parent
c87a85b818
commit
ea0a132bd6
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -24,7 +24,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -209,7 +209,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
@ -218,7 +218,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -233,7 +233,7 @@
|
|||
],
|
||||
"source": [
|
||||
"# read current data set from csv\n",
|
||||
"df = pd.read_csv('../data/interactive_labeling_round_{}_temp.csv'.format(m),\n",
|
||||
"df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n",
|
||||
" sep='|',\n",
|
||||
" usecols=range(1,13), # drop first column 'unnamed'\n",
|
||||
" encoding='utf-8',\n",
|
||||
|
@ -725,27 +725,15 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'pd' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[1;32m<ipython-input-1-a51d7411db70>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;31m# read current data set from csv\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mm\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m9\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[0msep\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'|'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[0musecols\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m13\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;31m# drop first column 'unnamed'\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
|
||||
"\u001b[1;31mNameError\u001b[0m: name 'pd' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# THIS CELL IS OPTIONAL\n",
|
||||
"\n",
|
||||
"# read current data set from csv\n",
|
||||
"m = 9\n",
|
||||
"df = pd.read_csv('../data/interactive_labeling_round_{}_corrected.csv'.format(m),\n",
|
||||
"m = 11\n",
|
||||
"df = pd.read_csv('../data/interactive_labeling_round_{}.csv'.format(m),\n",
|
||||
" sep='|',\n",
|
||||
" usecols=range(1,13), # drop first column 'unnamed'\n",
|
||||
" encoding='utf-8',\n",
|
||||
|
|
|
@ -0,0 +1,744 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Model Evaluation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import csv\n",
|
||||
"import operator\n",
|
||||
"import pickle\n",
|
||||
"import random\n",
|
||||
"\n",
|
||||
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
|
||||
"import ipywidgets as widgets\n",
|
||||
"from IPython.core.interactiveshell import InteractiveShell\n",
|
||||
"from IPython.display import display\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
|
||||
"from sklearn.model_selection import GridSearchCV\n",
|
||||
"from sklearn.model_selection import StratifiedKFold\n",
|
||||
"from sklearn.pipeline import Pipeline\n",
|
||||
"from sklearn.naive_bayes import GaussianNB\n",
|
||||
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||
"\n",
|
||||
"from MNBInteractive import MNBInteractive\n",
|
||||
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
|
||||
"from NaiveBayes import NaiveBayes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# initialize random => reproducible sequence\n",
|
||||
"random.seed(5)\n",
|
||||
"\n",
|
||||
"# set up wider display area\n",
|
||||
"pd.set_option('display.max_colwidth', -1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Last round number: 11\n",
|
||||
"Number of manually labeled articles: 1082\n",
|
||||
"Number of manually unlabeled articles: 8918\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# read current data set from csv\n",
|
||||
"df = pd.read_csv('../data/interactive_labeling_round_11.csv',\n",
|
||||
" sep='|',\n",
|
||||
" usecols=range(1,13), # drop first column 'unnamed'\n",
|
||||
" encoding='utf-8',\n",
|
||||
" quoting=csv.QUOTE_NONNUMERIC,\n",
|
||||
" quotechar='\\'')\n",
|
||||
"\n",
|
||||
"# find current iteration/round number\n",
|
||||
"m = int(df['Round'].max())\n",
|
||||
"print('Last round number: {}'.format(m))\n",
|
||||
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
|
||||
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def show_next(index):\n",
|
||||
" ''' this method displays an article's text and an interactive slider to set its label manually\n",
|
||||
" '''\n",
|
||||
" print('News article no. {}:'.format(index))\n",
|
||||
" print()\n",
|
||||
" print('HEADLINE:')\n",
|
||||
" print(df.loc[df['Index'] == index, 'Title'])\n",
|
||||
" print()\n",
|
||||
" print('TEXT:')\n",
|
||||
" print(df.loc[df['Index'] == index, 'Text'])\n",
|
||||
" print()\n",
|
||||
" print('ESTIMATED_0:')\n",
|
||||
" print(df.loc[df['Index'] == index, 'Estimated_0'])\n",
|
||||
" print()\n",
|
||||
" print('ESTIMATED_1:')\n",
|
||||
" print(df.loc[df['Index'] == index, 'Estimated_1'])\n",
|
||||
" print()\n",
|
||||
" print('ESTIMATED_2:')\n",
|
||||
" print(df.loc[df['Index'] == index, 'Estimated_2'])\n",
|
||||
" \n",
|
||||
" def f(x):\n",
|
||||
" # save user input\n",
|
||||
" df.loc[df['Index'] == index, 'Label'] = x\n",
|
||||
"\n",
|
||||
" # create slider widget for labels\n",
|
||||
" interact(f, x = widgets.IntSlider(min=-1, max=2, step=1, value=df.loc[df['Index'] == index, 'Label']))\n",
|
||||
" print('0: Other/Unrelated news, 1: Merger,')\n",
|
||||
" print('2: Topics related to deals, investments and mergers')\n",
|
||||
" print('___________________________________________________________________________________________________________')\n",
|
||||
" print()\n",
|
||||
" print()\n",
|
||||
"\n",
|
||||
"# list of article indices that will be shown next\n",
|
||||
"label_next = []"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## How to find a better model:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"A) Multinomial Naive Bayes Algorithm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"recall_scores, precision_scores, f1_scores, class_probs = MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# toDo: läuft noch nicht\n",
|
||||
"\n",
|
||||
"# series of indices of recently estimated articles \n",
|
||||
"indices_estimated = df.loc[df['Label'] != -1, 'Index'].tolist()\n",
|
||||
"\n",
|
||||
"# annotate probability\n",
|
||||
"n = 0\n",
|
||||
"for row in class_probs[0]:\n",
|
||||
" index = indices_estimated[n]\n",
|
||||
" # save estimated label\n",
|
||||
" df.loc[index, 'Estimated_2'] = row[1]\n",
|
||||
" n += 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
|
||||
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
|
||||
"print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n",
|
||||
"print()\n",
|
||||
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
|
||||
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
|
||||
"print(\"Precision (Average): {}\".format(sum(precision_scores)/10))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print('confusion matrix:')\n",
|
||||
"print('###############')\n",
|
||||
"zero_0 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 0)])\n",
|
||||
"zero_0\n",
|
||||
"zero_1 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 1)])\n",
|
||||
"zero_1\n",
|
||||
"zero_2 = len(testing_data.loc[(testing_data['Estimated'] == 0) & (testing_data['Label'] == 2)])\n",
|
||||
"zero_2\n",
|
||||
"print('/')\n",
|
||||
"one_0 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 0)])\n",
|
||||
"one_0\n",
|
||||
"one_1 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 1)])\n",
|
||||
"one_1\n",
|
||||
"one_2 = len(testing_data.loc[(testing_data['Estimated'] == 1) & (testing_data['Label'] == 2)])\n",
|
||||
"one_2\n",
|
||||
"print('/')\n",
|
||||
"\n",
|
||||
"two_0 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 0)])\n",
|
||||
"two_0\n",
|
||||
"two_1 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 1)])\n",
|
||||
"two_1\n",
|
||||
"two_2 = len(testing_data.loc[(testing_data['Estimated'] == 2) & (testing_data['Label'] == 2)])\n",
|
||||
"two_2"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Building three separate models:\n",
|
||||
"\n",
|
||||
"B) One model per class: Funktioniert es besser wenn man 3 Modelle hat.\n",
|
||||
"Begründung: wir sind interessiert an Klasse 1\n",
|
||||
"Pro Klasse 1 Modell bauen (Hier ist das ziel das beste Modell zu finden. Dafür nehmen wir 1082 gelabelte Daten.)\n",
|
||||
"3 Modelle => Ergebnis für 1 Sample: (70%, 40%, 80%) unklar => überprüfen\n",
|
||||
"=> (90%, 90%, 90%) => überprüfen\n",
|
||||
"liefert das bessere ambiguity Samples als oben\n",
|
||||
"Stratified sample: (50 + 50 (1/2 von der anderen Klasse 1/2 der dritten Klasse))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
|
||||
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
|
||||
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"847"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(labeled_pos_0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"50"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(labeled_pos_1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"185"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(labeled_pos_2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# add three new columns for the three models, initialize with nans\n",
|
||||
"df['Estimated_0'] = np.nan\n",
|
||||
"df['Estimated_1'] = np.nan\n",
|
||||
"df['Estimated_2'] = np.nan"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sampling_class0_0 = labeled_pos_0.sample(n=100, random_state=5)\n",
|
||||
"sampling_class0_1 = labeled_pos_1.sample(n=50, random_state=5)\n",
|
||||
"sampling_class0_2 = labeled_pos_2.sample(n=50, random_state=5)\n",
|
||||
"\n",
|
||||
"sampling_class1_0 = labeled_pos_0.sample(n=25, random_state=5)\n",
|
||||
"sampling_class1_1 = sampling_class0_1\n",
|
||||
"sampling_class1_2 = labeled_pos_2.sample(n=25, random_state=5)\n",
|
||||
"\n",
|
||||
"sampling_class2_0 = labeled_pos_0.sample(n=50, random_state=5)\n",
|
||||
"sampling_class2_1 = sampling_class0_1\n",
|
||||
"sampling_class2_2 = labeled_pos_2.sample(n=100, random_state=5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sampling_class0_complement = pd.concat([sampling_class0_1, sampling_class0_2])\n",
|
||||
"sampling_class1_complement = pd.concat([sampling_class1_0, sampling_class1_2])\n",
|
||||
"sampling_class2_complement = pd.concat([sampling_class2_0, sampling_class2_1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# prepare for binary classification:\n",
|
||||
"# pos_label = 3\n",
|
||||
"sampling_class0_0['Label'] = 3\n",
|
||||
"sampling_class1_1['Label'] = 3\n",
|
||||
"sampling_class2_2['Label'] = 3\n",
|
||||
"# neg_label = 4\n",
|
||||
"sampling_class0_complement['Label'] = 4\n",
|
||||
"sampling_class1_complement['Label'] = 4\n",
|
||||
"sampling_class2_complement['Label'] = 4"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"sampling_class0 = pd.concat([sampling_class0_0, sampling_class0_complement]).reset_index(drop=True)\n",
|
||||
"sampling_class1 = pd.concat([sampling_class1_1, sampling_class1_complement]).reset_index(drop=True)\n",
|
||||
"sampling_class2 = pd.concat([sampling_class2_2, sampling_class2_complement]).reset_index(drop=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Apply Naive Bayes Model to estimate all labeled articles (1082 samples):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"200"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train_data = sampling_class2\n",
|
||||
"indices_train = train_data['Index'].tolist()\n",
|
||||
"len(indices_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"882"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"test_data = df.loc[(df['Label'] != -1) & (~df['Index'].isin(indices_train))].reset_index(drop=True)\n",
|
||||
"len(test_data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_data.loc[(test_data['Label'] == 0), 'Label'] = 3\n",
|
||||
"test_data.loc[(test_data['Label'] == 1) | (test_data['Label'] == 2), 'Label'] = 4"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# split training data into text and label set\n",
|
||||
"# join title and text\n",
|
||||
"X = train_data['Title'] + '. ' + train_data['Text']\n",
|
||||
"y = train_data['Label']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# split testing data into text and label set\n",
|
||||
"U = test_data['Title'] + '. ' + test_data['Text']\n",
|
||||
"v = test_data['Label']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"recall: 0.19949811794228356\n",
|
||||
"precision: 0.803030303030303\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"classifier = GaussianNB()\n",
|
||||
"\n",
|
||||
"cv = CountVectorizer()\n",
|
||||
"\n",
|
||||
"# probabilities of each class\n",
|
||||
"class_probs = []\n",
|
||||
"\n",
|
||||
"# use sklearn CountVectorizer\n",
|
||||
"# fit the training data and then return the matrix\n",
|
||||
"training_data = cv.fit_transform(X, y).toarray()\n",
|
||||
"# transform testing data and return the matrix\n",
|
||||
"testing_data = cv.transform(U).toarray()\n",
|
||||
"\n",
|
||||
"#fit classifier\n",
|
||||
"classifier.fit(training_data, y)\n",
|
||||
"\n",
|
||||
"#predict class\n",
|
||||
"predictions_test = classifier.predict(testing_data)\n",
|
||||
"\n",
|
||||
"class_probs = classifier.predict_proba(testing_data)\n",
|
||||
"\n",
|
||||
"#print and store metrics\n",
|
||||
"rec = recall_score(v, predictions_test, pos_label=3)\n",
|
||||
"print('recall: ' + str(rec))\n",
|
||||
"prec = precision_score(v, predictions_test, pos_label=3)\n",
|
||||
"print('precision: ' + str(prec))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class_probs[:10]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# series of indices of recently estimated articles \n",
|
||||
"indices_estimated_2 = test_data['Index'].tolist()\n",
|
||||
"\n",
|
||||
"# annotate probability\n",
|
||||
"n = 0\n",
|
||||
"for row in class_probs:\n",
|
||||
" index = indices_estimated_2[n]\n",
|
||||
" # save estimated label\n",
|
||||
" df.loc[index, 'Estimated_2'] = row[0]\n",
|
||||
" n += 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Apply Naive Bayes Model (10-fold-cross validation):"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = sampling_class0\n",
|
||||
"\n",
|
||||
"X = dataset['Title'] + '. ' + dataset['Text']\n",
|
||||
"y = dataset['Label']\n",
|
||||
"\n",
|
||||
"cv = CountVectorizer()\n",
|
||||
"\n",
|
||||
"# use stratified k-fold cross-validation as split method\n",
|
||||
"skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)\n",
|
||||
"\n",
|
||||
"classifier = GaussianNB()\n",
|
||||
"\n",
|
||||
"# metrics\n",
|
||||
"recall_scores = []\n",
|
||||
"precision_scores = []\n",
|
||||
"\n",
|
||||
"# probabilities of each class (of each fold)\n",
|
||||
"class_probs = []\n",
|
||||
"# counts number of training samples observed in each class \n",
|
||||
"class_counts = []\n",
|
||||
"\n",
|
||||
"# for each fold\n",
|
||||
"for train, test in skf.split(X,y):\n",
|
||||
" \n",
|
||||
" # fit the training data and then return the matrix\n",
|
||||
" training_data = cv.fit_transform(X[train], y[train]).toarray()\n",
|
||||
" # transform testing data and return the matrix\n",
|
||||
" testing_data = cv.transform(X[test]).toarray()\n",
|
||||
"\n",
|
||||
" #fit classifier\n",
|
||||
" classifier.fit(training_data, y[train])\n",
|
||||
" #predict class\n",
|
||||
" predictions_train = classifier.predict(training_data)\n",
|
||||
" predictions_test = classifier.predict(testing_data)\n",
|
||||
"\n",
|
||||
" #print and store metrics\n",
|
||||
" rec = recall_score(y[test], predictions_test)\n",
|
||||
" recall_scores.append(rec)\n",
|
||||
" prec = precision_score(y[test], predictions_test)\n",
|
||||
" precision_scores.append(prec)\n",
|
||||
"\n",
|
||||
" class_probs.append(classifier.class_prior_)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# series of indices of recently estimated articles \n",
|
||||
"indices_estimated_0 = sampling_class0['Index'].tolist()\n",
|
||||
"\n",
|
||||
"# annotate probability\n",
|
||||
"n = 0\n",
|
||||
"for row in class_probs:\n",
|
||||
" index = indices_estimated_0[n]\n",
|
||||
" # save estimated label\n",
|
||||
" df.loc[index, 'Estimated_0'] = row[1]\n",
|
||||
" n += 1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(\"Recall (Min): {}\".format(min(recall_scores)))\n",
|
||||
"print(\"Recall (Max): {}\".format(max(recall_scores)))\n",
|
||||
"print(\"Recall (Average): {}\".format(sum(recall_scores)/10))\n",
|
||||
"print()\n",
|
||||
"print(\"Precision (Min): {}\".format(min(precision_scores)))\n",
|
||||
"print(\"Precision (Max): {}\".format(max(precision_scores)))\n",
|
||||
"print(\"Precision (Average): {}\".format(sum(precision_scores)/10))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Number of used samples:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"indices_all_samples = set((indices_estimated_0 + indices_estimated_1) + indices_estimated_2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(indices_all_samples)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Check if there are samples where more than one class was marked with 1."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0)])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"indices = df.loc[(df['Label'] != -1) & ((df['Estimated_0'] + df['Estimated_1'] + df['Estimated_2']) > 1.0), 'Index'].tolist()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# save tri-model to csv \n",
|
||||
"df.to_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n",
|
||||
" sep='|',\n",
|
||||
" mode='w',\n",
|
||||
" encoding='utf-8',\n",
|
||||
" quoting=csv.QUOTE_NONNUMERIC,\n",
|
||||
" quotechar='\\'')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# read current data set from csv\n",
|
||||
"df = pd.read_csv('../data/interactive_labeling_triple_model_auf_1082.csv',\n",
|
||||
" sep='|',\n",
|
||||
" usecols=range(1,16), # drop first column 'unnamed'\n",
|
||||
" encoding='utf-8',\n",
|
||||
" quoting=csv.QUOTE_NONNUMERIC,\n",
|
||||
" quotechar='\\'')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for index in indices:\n",
|
||||
" show_next(index)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
|
@ -53,7 +53,7 @@ class BagOfWords:
|
|||
for word in words:
|
||||
word = word.lower()
|
||||
# check if alphabetic and not stop word
|
||||
if (word.isalpha()):# and word not in stop_words):
|
||||
if (word.isalpha() and word not in stop_words):
|
||||
if stemming:
|
||||
# reduce word to its stem
|
||||
word = stemmer.stem(word)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
'''
|
||||
Multinomial Naive Bayes Classifier
|
||||
======================
|
||||
==================================
|
||||
'''
|
||||
|
||||
from BagOfWords import BagOfWords
|
||||
|
@ -11,6 +11,7 @@ import pandas as pd
|
|||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_selection import SelectPercentile
|
||||
from sklearn.metrics import recall_score, precision_score
|
||||
import sklearn
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
|
||||
|
@ -19,19 +20,13 @@ class MultinomialNaiveBayes:
|
|||
def make_mnb(dataset, sklearn_cv=True, percentile=100):
|
||||
'''fits naive bayes model with StratifiedKFold
|
||||
'''
|
||||
print('# starting classical multinomial naive bayes')
|
||||
print('# starting multinomial naive bayes')
|
||||
print('# ...')
|
||||
|
||||
# split data into text and label set
|
||||
# join title and text
|
||||
X = dataset['Title'] + '. ' + dataset['Text']
|
||||
|
||||
print(X[:12])
|
||||
|
||||
y = dataset['Label']
|
||||
print(y[:12])
|
||||
|
||||
# bis hierhin stimmt noch alles...
|
||||
|
||||
if sklearn_cv:
|
||||
cv = CountVectorizer()
|
||||
|
@ -63,14 +58,6 @@ class MultinomialNaiveBayes:
|
|||
if sklearn_cv:
|
||||
# use sklearn CountVectorizer
|
||||
# fit the training data and then return the matrix
|
||||
print('Title + Text von train')
|
||||
|
||||
# aber ab hier stimmt was nicht. irgendwie rutschen da NaNs mit rein...
|
||||
|
||||
print(X[train])
|
||||
|
||||
print('Label von train')
|
||||
print(y[train])
|
||||
|
||||
training_data = cv.fit_transform(X[train], y[train]).toarray()
|
||||
# transform testing data and return the matrix
|
||||
|
@ -136,8 +123,12 @@ class MultinomialNaiveBayes:
|
|||
# classes in order used
|
||||
classes = classifier.classes_
|
||||
|
||||
print('average: recall, precision, f1 score')
|
||||
print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
|
||||
|
||||
|
||||
# return classes and vector of class estimates
|
||||
return recall_scores, precision_scores, f1_scores
|
||||
return recall_scores, precision_scores, f1_scores, class_probs
|
||||
|
||||
######## nur für resubstitutionsfehler benötigt ########
|
||||
def analyze_errors(training, testing):
|
||||
|
@ -204,7 +195,4 @@ if __name__ == '__main__':
|
|||
quotechar='\'')
|
||||
|
||||
# select only labeled articles
|
||||
#print('Anzahl aller gelabelten:')
|
||||
#print(len(df.loc[df['Label'] != -1]))
|
||||
#print(df.loc[df['Label'] != -1][:5])
|
||||
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reindex(), sklearn_cv=True, percentile=100)
|
||||
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=True, percentile=100)
|
|
@ -0,0 +1,135 @@
|
|||
'''
|
||||
Multinomial Naive Bayes Classifier
|
||||
==================================
|
||||
'''
|
||||
|
||||
from BagOfWords import BagOfWords
|
||||
|
||||
import csv
|
||||
|
||||
import gensim
|
||||
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_selection import SelectPercentile
|
||||
from sklearn.metrics import recall_score, precision_score
|
||||
import sklearn
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.naive_bayes import MultinomialNB
|
||||
|
||||
class MultinomialNaiveBayes:
    '''Multinomial naive Bayes evaluated on Doc2Vec document embeddings.
    '''

    @staticmethod
    def make_mnb(dataset, sklearn_cv=True, percentile=100, vector_size=5,
                 n_splits=10):
        '''fits naive bayes model with StratifiedKFold

        For every fold a Doc2Vec model is trained on the training documents
        only; the test documents are embedded with infer_vector. The
        embeddings are min-max scaled before they reach the classifier,
        because MultinomialNB rejects negative feature values.

        Parameters
        ----------
        dataset : pandas.DataFrame
            Must provide the columns 'Title', 'Text' and 'Label'.
        sklearn_cv : bool
            Not used by this variant; accepted so existing calls keep working.
        percentile : int
            Not used by this variant; accepted so existing calls keep working.
        vector_size : int
            Dimensionality of the Doc2Vec vectors.
        n_splits : int
            Number of stratified folds.

        Returns
        -------
        tuple
            (recall_scores, precision_scores, f1_scores, class_probs).
            The three score lists hold one weighted score per fold;
            class_probs is the predict_proba output for the test part of
            the LAST fold only.
        '''
        # local import: sklearn is already a dependency of this module
        from sklearn.preprocessing import MinMaxScaler

        def read_corpus(data, tokens_only=False):
            '''Tokenize each text; wrap it in a TaggedDocument unless
            tokens_only is set.
            '''
            if tokens_only:
                return [BagOfWords.extract_words(text) for text in data]
            # training documents are tagged with their position, so that
            # docvecs[i] is the vector of the i-th training document
            return [TaggedDocument(BagOfWords.extract_words(text), [i])
                    for i, text in enumerate(data)]

        print('# starting multinomial naive bayes')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)

        # metrics, one entry per fold
        recall_scores = []
        precision_scores = []
        f1_scores = []

        for fold, (train, test) in enumerate(skf.split(X, y), start=1):
            print('# split no. ' + str(fold))

            # skf.split yields POSITIONS, so select with iloc; plain X[train]
            # is a label lookup and breaks if the index is not 0..n-1
            X_train, X_test = X.iloc[train], X.iloc[test]
            y_train, y_test = y.iloc[train], y.iloc[test]

            # train model with gensim on the training documents of this fold
            tagged_docs = read_corpus(X_train, tokens_only=False)
            test_tokens = read_corpus(X_test, tokens_only=True)

            doc2vec_model = Doc2Vec(tagged_docs,
                                    vector_size=vector_size,
                                    window=2,
                                    min_count=1,
                                    workers=4)

            train_vectors = np.array([doc2vec_model.docvecs[i]
                                      for i in range(len(tagged_docs))])
            test_vectors = np.array([doc2vec_model.infer_vector(tokens)
                                     for tokens in test_tokens])

            # embeddings contain negative components, which MultinomialNB
            # does not accept: map every dimension to [0, 1] using the
            # training range only (no information leaks from the test fold)
            scaler = MinMaxScaler()
            train_features = scaler.fit_transform(train_vectors)
            # inferred test vectors may lie outside the training range and
            # would then be negative (or > 1) after scaling, hence the clip
            test_features = np.clip(scaler.transform(test_vectors), 0.0, 1.0)

            # fit classifier
            classifier.fit(train_features, y_train)
            # predict class
            predictions_test = classifier.predict(test_features)

            # print and store metrics
            rec = recall_score(y_test, predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y_test, predictions_test,
                                   average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score; defined as 0 when both inputs are 0
            if prec + rec > 0:
                f1_scores.append(2 * (prec * rec)/(prec + rec))
            else:
                f1_scores.append(0.0)

            # probability estimates for the test vectors of this fold
            # (overwritten each round, so the last fold is what is returned)
            class_probs = classifier.predict_proba(test_features)

        print('average: recall, precision, f1 score')
        print(sum(recall_scores)/len(recall_scores),
              sum(precision_scores)/len(precision_scores),
              sum(f1_scores)/len(f1_scores))

        # return per-fold metrics and the class estimates of the last fold
        return recall_scores, precision_scores, f1_scores, class_probs
|
||||
|
||||
if __name__ == '__main__':

    # read csv file
    print('# reading dataset')
    print('# ...')

    # options for reading the current data set of the labeling round
    csv_path = '../data/interactive_labeling_round_11.csv'
    read_options = {
        'sep': '|',
        'usecols': range(1,13), # drop first column 'unnamed'
        'encoding': 'utf-8',
        'quoting': csv.QUOTE_NONNUMERIC,
        'quotechar': '\'',
    }
    df = pd.read_csv(csv_path, **read_options)

    # select only labeled articles (Label != -1) and keep the first 100 of them
    labeled_articles = df.loc[df['Label'] != -1]
    sample = labeled_articles[:100].reset_index(drop=True)

    MultinomialNaiveBayes.make_mnb(sample, sklearn_cv=False, percentile=100)
|
|
@ -48,7 +48,7 @@ class NaiveBayes:
|
|||
# metrics
|
||||
recall_scores = []
|
||||
precision_scores = []
|
||||
f1_scores = []
|
||||
#f1_scores = []
|
||||
|
||||
# probabilities of each class (of each fold)
|
||||
class_prob = []
|
||||
|
@ -106,37 +106,39 @@ class NaiveBayes:
|
|||
print('#')
|
||||
precision_scores.append(prec)
|
||||
# equation for f1 score
|
||||
f1_scores.append(2 * (prec * rec)/(prec + rec))
|
||||
#f1_scores.append(2 * (prec * rec)/(prec + rec))
|
||||
|
||||
class_prob.append(classifier.class_prior_)
|
||||
class_counts.append(classifier.class_count_)
|
||||
|
||||
##########################
|
||||
#print metrics of test set
|
||||
print('-------------------------')
|
||||
print('prediction of testing set:')
|
||||
print('Precision score: min = {}, max = {}, average = {}'
|
||||
.format(min(precision_scores),
|
||||
max(precision_scores),
|
||||
sum(precision_scores)/float(len(precision_scores))))
|
||||
print('Recall score: min = {}, max = {}, average = {}'
|
||||
.format(min(recall_scores),
|
||||
max(recall_scores),
|
||||
sum(recall_scores)/float(len(recall_scores))))
|
||||
print('F1 score: min = {}, max = {}, average = {}'
|
||||
.format(min(f1_scores),
|
||||
max(f1_scores),
|
||||
sum(f1_scores)/float(len(f1_scores))))
|
||||
print()
|
||||
# print probability of each class
|
||||
print('probability of each class:')
|
||||
print()
|
||||
print(class_prob)
|
||||
print()
|
||||
print('number of samples of each class:')
|
||||
print()
|
||||
print(class_counts)
|
||||
print()
|
||||
# print('-------------------------')
|
||||
# print('prediction of testing set:')
|
||||
# print('Precision score: min = {}, max = {}, average = {}'
|
||||
# .format(min(precision_scores),
|
||||
# max(precision_scores),
|
||||
# sum(precision_scores)/float(len(precision_scores))))
|
||||
# print('Recall score: min = {}, max = {}, average = {}'
|
||||
# .format(min(recall_scores),
|
||||
# max(recall_scores),
|
||||
# sum(recall_scores)/float(len(recall_scores))))
|
||||
# print('F1 score: min = {}, max = {}, average = {}'
|
||||
# .format(min(f1_scores),
|
||||
# max(f1_scores),
|
||||
# sum(f1_scores)/float(len(f1_scores))))
|
||||
# print()
|
||||
# # print probability of each class
|
||||
# print('probability of each class:')
|
||||
# print()
|
||||
# print(class_prob)
|
||||
# print()
|
||||
# print('number of samples of each class:')
|
||||
# print()
|
||||
# print(class_counts)
|
||||
# print()
|
||||
|
||||
return class_prob, class_counts, recall_scores, precision_scores#, f1_scores
|
||||
|
||||
##### only for overfit testing ###########
|
||||
#print('overfit testing: prediction of training set')
|
||||
|
|
|
@ -0,0 +1,121 @@
|
|||
'''
|
||||
Support Vector Machines (SVM) Classifier
|
||||
========================================
|
||||
|
||||
The SVM training algorithm builds a model from the training data that assigns
|
||||
the test samples to one category ('merger' or 'not merger'),
|
||||
making it a non-probabilistic binary linear classifier.
|
||||
An SVM model is a representation of the samples as points in space,
|
||||
mapped so that the examples of the separate categories are divided
|
||||
by a clear gap that is as wide as possible.
|
||||
New samples are then mapped into that same space and predicted
|
||||
to belong to a category based on which side of the gap they fall.
|
||||
'''
|
||||
|
||||
from BagOfWords import BagOfWords
|
||||
|
||||
import csv
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_selection import SelectPercentile
|
||||
from sklearn.metrics import f1_score, make_scorer
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.svm import SVC
|
||||
|
||||
class SVM:
    """Grid-searched support vector classifier on bag-of-words features."""

    @staticmethod
    def make_svm(dataset, sklearn_cv=True):
        """Fit an SVC pipeline with a grid search and print the results.

        The title and text of every article are joined, turned into a
        bag-of-words matrix and fed into a SelectPercentile -> SVC pipeline.
        The hyperparameters are tuned by GridSearchCV with stratified
        10-fold cross-validation, scored by micro-averaged F1.

        Args:
            dataset: table indexable by the columns 'Title', 'Text' and
                'Label' (the script below passes a pandas DataFrame).
            sklearn_cv: if True, build the matrix with sklearn's
                CountVectorizer; otherwise use the project's own
                BagOfWords.fit_transform.

        Returns:
            None. All results are printed to stdout.
        """
        print('# fitting model')
        print('# ...')

        # split data into text and label set
        # articles' text (title + text)
        X = dataset['Title'] + '. ' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        # fit the training data and then return the matrix
        if sklearn_cv:
            # use sklearn CountVectorizer
            matrix = CountVectorizer().fit_transform(X).toarray()
        else:
            # use own BOW implementation
            matrix = BagOfWords.fit_transform(X)

        # use stratified k-fold cross-validation as split method
        # (no random_state is set, so the folds differ from run to run)
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # use only most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

        # NOTE(review): per the sklearn SVC docs, gamma only applies to the
        # 'rbf', 'poly' and 'sigmoid' kernels, so with kernel='linear' the
        # two gamma values presumably just repeat the same fit - confirm
        # before extending the grid.
        param_grid = {'perc__percentile': [50, 75, 100],
                      'SVC__kernel': ['linear'],
                      'SVC__gamma': [0.00001, 0.0001],
                      'SVC__C': [0.1, 1]}

        grid = GridSearchCV(pipeline,
                            param_grid,
                            cv=skf,
                            scoring=make_scorer(f1_score, average='micro'))

        print('# fit classifier')
        print('# ...')

        grid.fit(matrix, y)

        # cv_results_ is a dict of arrays (one entry per parameter combination)
        cv_results = grid.cv_results_
        mean_test_scores = cv_results['mean_test_score']

        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(mean_test_scores)
        print('')
        print('mean of means:')
        print(sum(mean_test_scores)/len(mean_test_scores))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()
||||
|
||||
if __name__ == '__main__':

    print('# starting svm')
    print('# ...')

    # read csv file
    print('# reading dataset')
    print('# ...')

    # read current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1,13), # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # keep only the rows whose Label is not -1 and renumber them from 0
    data = df.loc[df['Label'] != -1].reset_index(drop=True)

    use_count_vectorizer = True
    # make_svm is defined inside class SVM, so it has to be called through
    # the class; the bare name make_svm does not exist at module level
    SVM.make_svm(data, use_count_vectorizer)

    print('# ending svm')
|
|
@ -1,6 +0,0 @@
|
|||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
Loading…
Reference in New Issue