update labeling and documentation
|
@ -96,25 +96,25 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 8,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"m=11"
|
"m=16"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 9,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"This round number: 11\n",
|
"This round number: 16\n",
|
||||||
"Number of manually labeled articles: 1082\n",
|
"Number of manually labeled articles: 1132\n",
|
||||||
"Number of manually unlabeled articles: 8918\n"
|
"Number of manually unlabeled articles: 8868\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@ -842,8 +842,425 @@
|
||||||
" df.loc[index, 'Estimated'] = classes[i]\n",
|
" df.loc[index, 'Estimated'] = classes[i]\n",
|
||||||
" # annotate probability\n",
|
" # annotate probability\n",
|
||||||
" df.loc[index, 'Probability'] = row[i]\n",
|
" df.loc[index, 'Probability'] = row[i]\n",
|
||||||
" n += 1\n",
|
" n += 1"
|
||||||
"\n",
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"m = 16"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"7"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"8"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"83.33333333333334"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"62.5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"60.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"33.33333333333333"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"100.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"80.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"80.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"38.88888888888889"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"54.166666666666664"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"73.33333333333333"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
"print('###############')\n",
|
"print('###############')\n",
|
||||||
"zero_0 = len(df.loc[(df['Round'] == m) & (df['Estimated'] == 0) & (df['Label'] == 0)])\n",
|
"zero_0 = len(df.loc[(df['Round'] == m) & (df['Estimated'] == 0) & (df['Label'] == 0)])\n",
|
||||||
"zero_0\n",
|
"zero_0\n",
|
||||||
|
@ -910,7 +1327,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
|
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
|
||||||
"prec_1\n",
|
"prec_1\n",
|
||||||
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
|
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
|
||||||
"rec_1\n",
|
"rec_1\n",
|
||||||
"acc_1 = (tp_1 + tn_1) / total * 100\n",
|
"acc_1 = (tp_1 + tn_1) / total * 100\n",
|
||||||
"acc_1\n",
|
"acc_1\n",
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -42,6 +42,7 @@
|
||||||
"from sklearn.pipeline import Pipeline\n",
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
"from sklearn.naive_bayes import MultinomialNB\n",
|
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||||
"from sklearn.semi_supervised import label_propagation\n",
|
"from sklearn.semi_supervised import label_propagation\n",
|
||||||
|
"from sklearn.svm import LinearSVC\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from BagOfWords import BagOfWords\n",
|
"from BagOfWords import BagOfWords\n",
|
||||||
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
|
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
|
||||||
|
@ -50,7 +51,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 6,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -66,7 +67,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 7,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -105,16 +106,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 70,
|
"execution_count": 117,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"9"
|
"8"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 70,
|
"execution_count": 117,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
@ -126,7 +127,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 71,
|
"execution_count": 118,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -138,16 +139,16 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 72,
|
"execution_count": 119,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"number of labeled samples by class (0/1/2): 80/2/18\n",
|
"number of labeled samples by class (0/1/2): 79/8/13\n",
|
||||||
"minimum of new labeled samples: 2\n",
|
"minimum of new labeled samples: 8\n",
|
||||||
"length of current data set for resubstitution error: 6\n"
|
"length of current data set for resubstitution error: 24\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@ -162,7 +163,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 73,
|
"execution_count": 120,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -174,7 +175,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 74,
|
"execution_count": 121,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -187,62 +188,67 @@
|
||||||
"#training_data_5 = pd.concat([selec_0, selec_1, selec_2])\n",
|
"#training_data_5 = pd.concat([selec_0, selec_1, selec_2])\n",
|
||||||
"#training_data_6 = pd.concat([selec_0, selec_1, selec_2])\n",
|
"#training_data_6 = pd.concat([selec_0, selec_1, selec_2])\n",
|
||||||
"#training_data_7 = pd.concat([selec_0, selec_1, selec_2])\n",
|
"#training_data_7 = pd.concat([selec_0, selec_1, selec_2])\n",
|
||||||
"#training_data_8 = pd.concat([selec_0, selec_1, selec_2])\n",
|
"training_data_8 = pd.concat([selec_0, selec_1, selec_2])\n",
|
||||||
"training_data_9 = pd.concat([selec_0, selec_1, selec_2])"
|
"#training_data_9 = pd.concat([selec_0, selec_1, selec_2])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 75,
|
"execution_count": 122,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# indices of training samples\n",
|
"# indices of training samples\n",
|
||||||
"# idx_0 = training_data_0['Index'].tolist()\n",
|
"#idx_0 = training_data_0['Index'].tolist()\n",
|
||||||
"# idx_1 = training_data_1['Index'].tolist()\n",
|
"#idx_1 = training_data_1['Index'].tolist()\n",
|
||||||
"# idx_2 = training_data_2['Index'].tolist()\n",
|
"#idx_2 = training_data_2['Index'].tolist()\n",
|
||||||
"# idx_3 = training_data_3['Index'].tolist()\n",
|
"#idx_3 = training_data_3['Index'].tolist()\n",
|
||||||
"# idx_4 = training_data_4['Index'].tolist()\n",
|
"#idx_4 = training_data_4['Index'].tolist()\n",
|
||||||
"# idx_5 = training_data_5['Index'].tolist()\n",
|
"#idx_5 = training_data_5['Index'].tolist()\n",
|
||||||
"# idx_6 = training_data_6['Index'].tolist()\n",
|
"#idx_6 = training_data_6['Index'].tolist()\n",
|
||||||
"# idx_7 = training_data_7['Index'].tolist()\n",
|
"#idx_7 = training_data_7['Index'].tolist()\n",
|
||||||
"# idx_8 = training_data_8['Index'].tolist()\n",
|
"idx_8 = training_data_8['Index'].tolist()\n",
|
||||||
"idx_9 = training_data_9['Index'].tolist()"
|
"#idx_9 = training_data_9['Index'].tolist()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 103,
|
"execution_count": 123,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#train_all = training_data_0\n",
|
"#train_0_1 = training_data_0.append([training_data_1])\n",
|
||||||
"train_0_8 = training_data_0.append([training_data_1, training_data_2, training_data_3, training_data_4, training_data_5, training_data_6, training_data_7, training_data_8])"
|
"#train_0_2 = train_0_1.append([training_data_2])\n",
|
||||||
|
"#train_0_3 = train_0_2.append([training_data_3])\n",
|
||||||
|
"#train_0_4 = train_0_3.append([training_data_4])\n",
|
||||||
|
"#train_0_5 = train_0_4.append([training_data_5])\n",
|
||||||
|
"#train_0_6 = train_0_5.append([training_data_6])\n",
|
||||||
|
"#train_0_7 = train_0_6.append([training_data_7])\n",
|
||||||
|
"train_0_8 = train_0_7.append([training_data_8])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 91,
|
"execution_count": 124,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#idx_all = idx_0\n",
|
"train_all = train_0_8\n",
|
||||||
"idx_all = train_all['Index'].tolist()\n",
|
"idx_all = train_all['Index'].tolist()"
|
||||||
"#idx_9"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 92,
|
"execution_count": 125,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"117"
|
"111"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 92,
|
"execution_count": 125,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
@ -257,26 +263,35 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"train_0_9 = train_0_2.append(training_data_3)\n",
|
"#train_0_9 = train_0_2.append(training_data_3)\n",
|
||||||
"len(train_0_3)"
|
"#len(train_0_3)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 86,
|
"execution_count": 59,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#m = 4"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 111,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"stratified number in round 9: 6\n",
|
"stratified number in round 7: 18\n",
|
||||||
"stratified number in total: 138\n"
|
"stratified number in total: 87\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"print('stratified number in round {}: {}'.format(m, len(idx_9)))\n",
|
"print('stratified number in round {}: {}'.format(m, len(idx_7)))\n",
|
||||||
"print('stratified number in total: {}'.format(len(idx_all)))"
|
"print('stratified number in total: {}'.format(len(idx_all)))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -288,22 +303,22 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# STEP 1:\n",
|
"# STEP 1:\n",
|
||||||
"# resubstitution error round\n",
|
"# resubstitution error round\n",
|
||||||
"training_data = train_0_8\n",
|
"#training_data = train_0_8\n",
|
||||||
"testing_data = training_data_9"
|
"#testing_data = training_data_9"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 115,
|
"execution_count": 64,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"9"
|
"4"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 115,
|
"execution_count": 64,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
@ -314,16 +329,26 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 160,
|
"execution_count": 126,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"1082"
|
"111"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 160,
|
"execution_count": 126,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"100"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 126,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
@ -331,10 +356,13 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# STEP 2: \n",
|
"# STEP 2: \n",
|
||||||
"# resubstitution error all labeled articles in round\n",
|
"# resubstitution error all labeled articles in round\n",
|
||||||
"training_data = train_all\n",
|
"training_data = train_0_8\n",
|
||||||
"testing_data = df.loc[(df['Round'] <= 11)]# & (~df['Index'].isin(idx_all))]\n",
|
"testing_data = df.loc[(df['Round'] == (m+1))]\n",
|
||||||
|
"\n",
|
||||||
|
"# & (~df['Index'].isin(idx_all))]\n",
|
||||||
"#df[~df['Index'].isin(idx_all)]\n",
|
"#df[~df['Index'].isin(idx_all)]\n",
|
||||||
"#df.loc[(df['Label'] == -1) | (df['Round'] >= 10)]\n",
|
"#df.loc[(df['Label'] == -1) | (df['Round'] >= 10)]\n",
|
||||||
|
"len(training_data)\n",
|
||||||
"len(testing_data)"
|
"len(testing_data)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -345,24 +373,44 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# STEP 3:\n",
|
"# STEP 3:\n",
|
||||||
"training_data = train_all\n",
|
"#training_data = train_all\n",
|
||||||
"testing_data = train_all"
|
"#testing_data = train_all"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 6,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# STEP 4:\n",
|
"# STEP 4:\n",
|
||||||
"training_data = train_all\n",
|
"#training_data = df.loc[df['Label'] != -1].reset_index(drop=True)\n",
|
||||||
"testing_data = train_all"
|
"#testing_data = df.loc[df['Label'] == -1].reset_index(drop=True)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 161,
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"8918"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"#len(testing_data)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 127,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -385,7 +433,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 162,
|
"execution_count": 128,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -425,7 +473,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 131,
|
"execution_count": 57,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -438,10 +486,10 @@
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"7140"
|
"65"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 131,
|
"execution_count": 57,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
},
|
},
|
||||||
|
@ -455,10 +503,10 @@
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"2007"
|
"26"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 131,
|
"execution_count": 57,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
},
|
},
|
||||||
|
@ -472,10 +520,10 @@
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"736"
|
"9"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 131,
|
"execution_count": 57,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
@ -570,27 +618,27 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 181,
|
"execution_count": 158,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Nachberechnung fürs Latex:\n",
|
"# Nachberechnung fürs Latex:\n",
|
||||||
"zero_0 = 1\n",
|
"zero_0 = 80\n",
|
||||||
"zero_1 = 1\n",
|
"zero_1 = 2\n",
|
||||||
"zero_2 = 0\n",
|
"zero_2 = 14\n",
|
||||||
"\n",
|
"\n",
|
||||||
"one_0 = 4\n",
|
"one_0 = 0\n",
|
||||||
"one_1 = 3\n",
|
"one_1 = 0\n",
|
||||||
"one_2 = 4\n",
|
"one_2 = 1\n",
|
||||||
"\n",
|
"\n",
|
||||||
"two_0 = 0\n",
|
"two_0 = 0\n",
|
||||||
"two_1 = 1\n",
|
"two_1 = 0\n",
|
||||||
"two_2 = 1"
|
"two_2 = 3"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 163,
|
"execution_count": 129,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -604,10 +652,10 @@
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"701"
|
"68"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 163,
|
"execution_count": 129,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
},
|
},
|
||||||
|
@ -617,17 +665,17 @@
|
||||||
"0"
|
"0"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 163,
|
"execution_count": 129,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"41"
|
"6"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 163,
|
"execution_count": 129,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
},
|
},
|
||||||
|
@ -641,47 +689,10 @@
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"99"
|
"8"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 163,
|
"execution_count": 129,
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"49"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 163,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"74"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 163,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"/\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"47"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 163,
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
},
|
},
|
||||||
|
@ -691,17 +702,54 @@
|
||||||
"1"
|
"1"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 163,
|
"execution_count": 129,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"70"
|
"11"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 163,
|
"execution_count": 129,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"4"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 129,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 129,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 129,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
|
@ -734,7 +782,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 182,
|
"execution_count": 159,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": false
|
"scrolled": false
|
||||||
},
|
},
|
||||||
|
@ -747,51 +795,51 @@
|
||||||
"\n",
|
"\n",
|
||||||
"class 0:\n",
|
"class 0:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"TP: 1\n",
|
"TP: 80\n",
|
||||||
"TN: 9\n",
|
"TN: 4\n",
|
||||||
"FP: 1\n",
|
"FP: 16\n",
|
||||||
"FN: 4\n",
|
"FN: 0\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class 1:\n",
|
"class 1:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"TP: 3\n",
|
"TP: 0\n",
|
||||||
"TN: 2\n",
|
"TN: 97\n",
|
||||||
"FP: 8\n",
|
"FP: 1\n",
|
||||||
"FN: 2\n",
|
"FN: 2\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class 2:\n",
|
"class 2:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"TP: 1\n",
|
"TP: 3\n",
|
||||||
"TN: 9\n",
|
"TN: 82\n",
|
||||||
"FP: 1\n",
|
"FP: 0\n",
|
||||||
"FN: 4\n",
|
"FN: 15\n",
|
||||||
"###############\n",
|
"###############\n",
|
||||||
"\n",
|
"\n",
|
||||||
"METRICS:\n",
|
"METRICS:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class 0:\n",
|
"class 0:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"precision: 50.0\n",
|
"precision: 83.33\n",
|
||||||
"recall: 20.0\n",
|
"recall: 100.0\n",
|
||||||
"accuracy: 66.67\n",
|
"accuracy: 84.0\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class 1:\n",
|
"class 1:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"precision: 27.27\n",
|
"precision: 0.0\n",
|
||||||
"recall: 60.0\n",
|
"recall: 0.0\n",
|
||||||
"accuracy: 33.33\n",
|
"accuracy: 97.0\n",
|
||||||
"\n",
|
"\n",
|
||||||
"class 2:\n",
|
"class 2:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"precision: 50.0\n",
|
"precision: 100.0\n",
|
||||||
"recall: 20.0\n",
|
"recall: 16.67\n",
|
||||||
"accuracy: 66.67\n",
|
"accuracy: 85.0\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Average Metrics:\n",
|
"Average Metrics:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"precision: 42.42424242424242\n",
|
"precision: 61.111111111111114\n",
|
||||||
"recall: 33.333333333333336\n",
|
"recall: 38.888888888888886\n",
|
||||||
"accuracy: 55.55555555555554\n"
|
"accuracy: 88.66666666666667\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
|
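The 'Average Metrics' block printed above is a plain macro average over the three classes. As a quick hand check (not part of the notebook, using the per-class values shown in the new output):

# macro averages over the three classes, from the per-class values above
prec_avg = (83.33 + 0.0 + 100.0) / 3    # 61.11
rec_avg  = (100.0 + 0.0 + 16.67) / 3    # 38.89
acc_avg  = (84.0 + 97.0 + 85.0) / 3     # 88.67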
@ -0,0 +1,374 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 112,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import csv\n",
|
||||||
|
"import operator\n",
|
||||||
|
"import pickle\n",
|
||||||
|
"import random\n",
|
||||||
|
"\n",
|
||||||
|
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
|
||||||
|
"import ipywidgets as widgets\n",
|
||||||
|
"from IPython.core.interactiveshell import InteractiveShell\n",
|
||||||
|
"from IPython.display import display\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||||
|
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
|
||||||
|
"from sklearn.model_selection import GridSearchCV\n",
|
||||||
|
"from sklearn.model_selection import StratifiedKFold\n",
|
||||||
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
|
"from sklearn.naive_bayes import GaussianNB\n",
|
||||||
|
"from sklearn.naive_bayes import MultinomialNB\n",
|
||||||
|
"from sklearn.svm import SVC\n",
|
||||||
|
"from sklearn.svm import LinearSVC\n",
|
||||||
|
"\n",
|
||||||
|
"from BagOfWords import BagOfWords\n",
|
||||||
|
"from MNBInteractive import MNBInteractive\n",
|
||||||
|
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
|
||||||
|
"from NaiveBayes import NaiveBayes"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 115,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Last round number: 15\n",
|
||||||
|
"Number of manually labeled articles: 1122\n",
|
||||||
|
"Number of manually unlabeled articles: 8878\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# initialize random => reproducible sequence\n",
|
||||||
|
"random.seed(5)\n",
|
||||||
|
"random_state=5\n",
|
||||||
|
"\n",
|
||||||
|
"# set up wider display area\n",
|
||||||
|
"pd.set_option('display.max_colwidth', -1)\n",
|
||||||
|
"\n",
|
||||||
|
"# read current data set from csv\n",
|
||||||
|
"df = pd.read_csv('../data/interactive_labeling_round_15_temp.csv',\n",
|
||||||
|
" sep='|',\n",
|
||||||
|
" usecols=range(1,13), # drop first column 'unnamed'\n",
|
||||||
|
" encoding='utf-8',\n",
|
||||||
|
" quoting=csv.QUOTE_NONNUMERIC,\n",
|
||||||
|
" quotechar='\\'')\n",
|
||||||
|
"\n",
|
||||||
|
"# find current iteration/round number\n",
|
||||||
|
"m = int(df['Round'].max())\n",
|
||||||
|
"print('Last round number: {}'.format(m))\n",
|
||||||
|
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
|
||||||
|
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 116,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"52\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
|
||||||
|
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)\n",
|
||||||
|
"\n",
|
||||||
|
"max_sample = min(len(labeled_pos_0), len(labeled_pos_1), len(labeled_pos_2))\n",
|
||||||
|
"print(max_sample)\n",
|
||||||
|
"\n",
|
||||||
|
"sampling_class0 = labeled_pos_0.sample(n=max_sample, random_state=random_state)\n",
|
||||||
|
"sampling_class1 = labeled_pos_1.sample(n=max_sample, random_state=random_state)\n",
|
||||||
|
"sampling_class2 = labeled_pos_2.sample(n=max_sample, random_state=random_state)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 56,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# nur für subset EINDEUTIG\n",
|
||||||
|
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
|
||||||
|
"testing_data = df.loc[(df['Label'] != -1) & (df['Index'].isin(subset_indices))].reset_index(drop=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 117,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
|
||||||
|
"testing_data = df.loc[(df['Label'] != -1)].reset_index(drop=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 118,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"len(testing_data)\n",
|
||||||
|
"indices_predicted = df.loc[(df['Label'] != -1), 'Index'].tolist()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 119,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# split training data into text and label set\n",
|
||||||
|
"# join title and text\n",
|
||||||
|
"X = training_data['Title'] + '. ' + training_data['Text']\n",
|
||||||
|
"y = training_data['Label']\n",
|
||||||
|
"\n",
|
||||||
|
"# split testing data into text and label set\n",
|
||||||
|
"U = testing_data['Title'] + '. ' + testing_data['Text']\n",
|
||||||
|
"v = testing_data['Label']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 120,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#classifier = MultinomialNB(alpha=1.0e-10,\n",
|
||||||
|
"# fit_prior=False,\n",
|
||||||
|
"# class_prior=None)\n",
|
||||||
|
"#classifier = SVC()\n",
|
||||||
|
"classifier = LinearSVC()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 92,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"cv = CountVectorizer()\n",
|
||||||
|
"\n",
|
||||||
|
"# fit the training data and then return the matrix\n",
|
||||||
|
"training_data = cv.fit_transform(X, y).toarray()\n",
|
||||||
|
"# transform testing data and return the matrix\n",
|
||||||
|
"testing_data = cv.transform(U).toarray()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 93,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#fit classifier\n",
|
||||||
|
"classifier.fit(training_data, y)\n",
|
||||||
|
"\n",
|
||||||
|
"#predict class\n",
|
||||||
|
"predictions_test = classifier.predict(testing_data)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 94,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# annotate estimated labels\n",
|
||||||
|
"df['Estimated'] = np.nan\n",
|
||||||
|
"\n",
|
||||||
|
"for i, value in enumerate(indices_predicted):\n",
|
||||||
|
" df.loc[(df['Index'] == value), 'Estimated'] = predictions_test[i]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 95,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"###############\n",
|
||||||
|
"642\n",
|
||||||
|
"0\n",
|
||||||
|
"19\n",
|
||||||
|
"###############\n",
|
||||||
|
"55\n",
|
||||||
|
"50\n",
|
||||||
|
"36\n",
|
||||||
|
"###############\n",
|
||||||
|
"150\n",
|
||||||
|
"0\n",
|
||||||
|
"130\n",
|
||||||
|
"###############\n",
|
||||||
|
"metrics:\n",
|
||||||
|
"\n",
|
||||||
|
"642\n",
|
||||||
|
"216\n",
|
||||||
|
"19\n",
|
||||||
|
"205\n",
|
||||||
|
"###############\n",
|
||||||
|
"50\n",
|
||||||
|
"941\n",
|
||||||
|
"91\n",
|
||||||
|
"0\n",
|
||||||
|
"###############\n",
|
||||||
|
"130\n",
|
||||||
|
"747\n",
|
||||||
|
"150\n",
|
||||||
|
"55\n",
|
||||||
|
"###############\n",
|
||||||
|
"97.12556732223904\n",
|
||||||
|
"75.79693034238488\n",
|
||||||
|
"79.29759704251387\n",
|
||||||
|
"###############\n",
|
||||||
|
"35.46099290780142\n",
|
||||||
|
"100.0\n",
|
||||||
|
"91.58964879852127\n",
|
||||||
|
"###############\n",
|
||||||
|
"46.42857142857143\n",
|
||||||
|
"70.27027027027027\n",
|
||||||
|
"81.05360443622921\n",
|
||||||
|
"###############\n",
|
||||||
|
"59.67171055287063\n",
|
||||||
|
"82.02240020421839\n",
|
||||||
|
"83.98028342575479\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print('###############')\n",
|
||||||
|
"zero_0 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 0)])\n",
|
||||||
|
"print(zero_0)\n",
|
||||||
|
"zero_1 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 1)])\n",
|
||||||
|
"print(zero_1)\n",
|
||||||
|
"zero_2 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 2)])\n",
|
||||||
|
"print(zero_2)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"one_0 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 0)])\n",
|
||||||
|
"print(one_0)\n",
|
||||||
|
"one_1 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 1)])\n",
|
||||||
|
"print(one_1)\n",
|
||||||
|
"one_2 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 2)])\n",
|
||||||
|
"print(one_2)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"two_0 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 0)])\n",
|
||||||
|
"print(two_0)\n",
|
||||||
|
"two_1 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 1)])\n",
|
||||||
|
"print(two_1)\n",
|
||||||
|
"two_2 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 2)])\n",
|
||||||
|
"print(two_2)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"print('metrics:')\n",
|
||||||
|
"print()\n",
|
||||||
|
"\n",
|
||||||
|
"total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n",
|
||||||
|
"\n",
|
||||||
|
"tp_0 = zero_0\n",
|
||||||
|
"print(tp_0)\n",
|
||||||
|
"tn_0 = one_1 + one_2 + two_1 + two_2\n",
|
||||||
|
"print(tn_0)\n",
|
||||||
|
"fp_0 = zero_1 + zero_2\n",
|
||||||
|
"print(fp_0)\n",
|
||||||
|
"fn_0 = one_0 + two_0\n",
|
||||||
|
"print(fn_0)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"tp_1 = one_1\n",
|
||||||
|
"print(tp_1)\n",
|
||||||
|
"tn_1 = zero_0 + zero_2 + two_0 + two_2\n",
|
||||||
|
"print(tn_1)\n",
|
||||||
|
"fp_1 = one_0 + one_2\n",
|
||||||
|
"print(fp_1)\n",
|
||||||
|
"fn_1 = zero_1 + two_1\n",
|
||||||
|
"print(fn_1)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"tp_2 = two_2\n",
|
||||||
|
"print(tp_2)\n",
|
||||||
|
"tn_2 = zero_0 + zero_1 + one_0 + one_1\n",
|
||||||
|
"print(tn_2)\n",
|
||||||
|
"fp_2 = two_0 + two_1\n",
|
||||||
|
"print(fp_2)\n",
|
||||||
|
"fn_2 = zero_2 + one_2\n",
|
||||||
|
"print(fn_2)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
|
||||||
|
"print(prec_0)\n",
|
||||||
|
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
|
||||||
|
"print(rec_0)\n",
|
||||||
|
"acc_0 = (tp_0 + tn_0) / total * 100\n",
|
||||||
|
"print(acc_0)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
|
||||||
|
"print(prec_1)\n",
|
||||||
|
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
|
||||||
|
"print(rec_1)\n",
|
||||||
|
"acc_1 = (tp_1 + tn_1) / total * 100\n",
|
||||||
|
"print(acc_1)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
|
||||||
|
"print(prec_2)\n",
|
||||||
|
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
|
||||||
|
"print(rec_2)\n",
|
||||||
|
"acc_2 = (tp_2 + tn_2) / total * 100\n",
|
||||||
|
"print(acc_2)\n",
|
||||||
|
"print('###############')\n",
|
||||||
|
"\n",
|
||||||
|
"print((prec_1 + prec_2 + prec_0) / 3)\n",
|
||||||
|
"print((rec_1 + rec_2 + rec_0) / 3)\n",
|
||||||
|
"print((acc_1 + acc_2 + acc_0) / 3)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
|
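The metric cell in the notebook above rebuilds the per-class TP/TN/FP/FN counts by hand from nine separate df.loc filters. A more compact route to the same numbers is to build the 3x3 confusion matrix once and read the counts off it; the helper below is only an illustrative sketch (the function name is hypothetical) and uses scikit-learn's convention of rows = true label, columns = predicted label.

from sklearn.metrics import confusion_matrix

def per_class_metrics(y_true, y_pred, labels=(0, 1, 2)):
    # 3x3 confusion matrix: rows = true label, columns = predicted label
    cm = confusion_matrix(y_true, y_pred, labels=list(labels))
    total = cm.sum()
    out = {}
    for i, label in enumerate(labels):
        tp = cm[i, i]
        fp = cm[:, i].sum() - tp   # predicted as this class, true class differs
        fn = cm[i, :].sum() - tp   # true class, predicted as another class
        tn = total - tp - fp - fn
        out[label] = {'precision': tp / (tp + fp) * 100 if (tp + fp) else 0.0,
                      'recall':    tp / (tp + fn) * 100 if (tp + fn) else 0.0,
                      'accuracy':  (tp + tn) / total * 100}
    return out

# e.g. on the manually labeled rows, once 'Estimated' has been annotated:
# per_class_metrics(df.loc[df['Label'] != -1, 'Label'], df.loc[df['Label'] != -1, 'Estimated'])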
@ -11,52 +11,135 @@ class LabelingPlotter():
|
||||||
# round numbers
|
# round numbers
|
||||||
round = [0,1,2,3,4,5,6,7,8,9]
|
round = [0,1,2,3,4,5,6,7,8,9]
|
||||||
|
|
||||||
# number of wrong estimated labels per round
|
# # number of wrong estimated labels per round
|
||||||
wrong = [0/100, 19/100, 17/100, 16/100, 20/100, 12/100, 10/100, 20/100, 14/100, 12/100]
|
# wrong = [0/100, 19/100, 17/100, 16/100, 20/100, 12/100, 10/100, 20/100, 14/100, 12/100]
|
||||||
|
|
||||||
# number of manual classified articles per class and round
|
# # number of manual classified articles per class and round
|
||||||
man_0 = [84/100, 165/200, 247/300, 329/400, 410/500, 498/600, 586/700, 662/800, 741/900, 821/1000]
|
# man_0 = [84/100, 165/200, 247/300, 329/400, 410/500, 498/600, 586/700, 662/800, 741/900, 821/1000]
|
||||||
man_1 = [3/100, 7/200, 12/300, 16/400, 20/500, 22/600, 23/700, 29/800, 37/900, 39/1000]
|
# man_1 = [3/100, 7/200, 12/300, 16/400, 20/500, 22/600, 23/700, 29/800, 37/900, 39/1000]
|
||||||
man_2 = [13/100, 28/200, 41/300, 55/400, 70/500, 80/600, 91/700, 109/800, 122/900, 140/1000]
|
# man_2 = [13/100, 28/200, 41/300, 55/400, 70/500, 80/600, 91/700, 109/800, 122/900, 140/1000]
|
||||||
|
|
||||||
# number of estimated labels per class and round
|
# # number of estimated labels per class and round
|
||||||
est_0 = [9873/9900, 9757/9800, 9603/9700, 9470/9600, 9735/9500, 9238/9400, 9107/9300, 8007/9200, 8064/9100, 7641/9000]
|
# est_0 = [9873/9900, 9757/9800, 9603/9700, 9470/9600, 9735/9500, 9238/9400, 9107/9300, 8007/9200, 8064/9100, 7641/9000]
|
||||||
est_1 = [14/9900, 15/9800, 11/9700, 11/9600, 16/9500, 17/9400, 18/9300, 19/9200, 18/9100, 20/9000]
|
# est_1 = [14/9900, 15/9800, 11/9700, 11/9600, 16/9500, 17/9400, 18/9300, 19/9200, 18/9100, 20/9000]
|
||||||
est_2 = [12/9900, 26/9800, 77/9700, 94/9600, 380/9500, 123/9400, 147/9300, 676/9200, 595/9100, 837/9000]
|
# est_2 = [12/9900, 26/9800, 77/9700, 94/9600, 380/9500, 123/9400, 147/9300, 676/9200, 595/9100, 837/9000]
|
||||||
|
|
||||||
fig, ax = plt.subplots(3, 1)
|
# naive study
|
||||||
|
rec_av_n = [np.nan, 33.3, 35.9, 38.1, 37.4, 33.3, 39.4, 40.7, 40.1, 38.9]
|
||||||
|
rec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 12.5, 0]
|
||||||
|
|
||||||
|
prec_av_n = [np.nan, 26.3, 44.56, 61.22, 49.7, 29.3, 63.3, 59.7, 77.1, 61.1]
|
||||||
|
prec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 100, 0]
|
||||||
|
|
||||||
|
acc_av_n = [np.nan, 86, 88.7, 89.3, 88, 92, 93.3, 86.7, 87.3, 88.7]
|
||||||
|
acc_1_n = [np.nan, 96, 95, 96, 96, 98, 99, 94, 93, 97.0]
|
||||||
|
|
||||||
|
# stratified
|
||||||
|
rec_av_s = [np.nan, 44.53, 47.85, 56.45, 56.36, 58.71, 57.20, 62.13, 55.41, 46.85]
|
||||||
|
rec_1_s = [np.nan, 75.00, 50, 100, 75.00, 100, 100, 100, 75.00, 50.00]
|
||||||
|
|
||||||
|
prec_av_s = [np.nan, 36.8, 46.63, 41.42, 45.73, 33.69, 33.01, 52.68 , 44.68, 37.85]
|
||||||
|
prec_1_s = [np.nan, 6.67, 8.33, 9.52, 11.54, 8, 3.57, 16.67, 28.57, 5.00]
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(4, 1)
|
||||||
|
|
||||||
ax[0].plot(round, wrong)
|
ax[0].plot(round, rec_av_n, round, rec_av_s)
|
||||||
ax[2].set_xlabel('Iteration number')
|
ax[0].set_ylabel('Recall (Average)')
|
||||||
ax[0].set_ylabel('Error rate')
|
ax[0].legend(('Naive Sampling', 'Stratified Sampling'))
|
||||||
|
|
||||||
|
ax[1].plot(round, prec_av_n, round, prec_av_s)
|
||||||
|
ax[1].set_ylabel('Precision (Average)')
|
||||||
|
ax[1].legend(('Naive Sampling', 'Stratified Sampling'))
|
||||||
|
|
||||||
ax[1].plot(round, man_0, round, man_1, round, man_2)
|
ax[2].plot(round, rec_1_n, round, rec_1_s)
|
||||||
ax[1].set_ylabel('Fraction of manual labels')
|
ax[2].set_ylabel('Recall (Class 1)')
|
||||||
|
ax[2].legend(('Naive Sampling', 'Stratified Sampling'))
|
||||||
ax[2].plot(round, est_0, round, est_1, round, est_2)
|
|
||||||
ax[2].set_ylabel('Fraction of estimated labels')
|
|
||||||
|
|
||||||
|
ax[3].plot(round, prec_1_n, round, prec_1_s)
|
||||||
|
ax[3].set_ylabel('Precision (Class 1)')
|
||||||
|
ax[3].legend(('Naive Sampling', 'Stratified Sampling'))
|
||||||
|
|
||||||
|
ax[3].set_xlabel('Iteration number')
|
||||||
|
|
||||||
# limit x axis
|
# limit x axis
|
||||||
ax[0].set_xbound(lower=1, upper=9)
|
ax[0].set_xbound(lower=1, upper=9)
|
||||||
ax[1].set_xbound(lower=1, upper=9)
|
ax[1].set_xbound(lower=1, upper=9)
|
||||||
ax[2].set_xbound(lower=1, upper=9)
|
ax[2].set_xbound(lower=1, upper=9)
|
||||||
|
ax[3].set_xbound(lower=1, upper=9)
|
||||||
|
|
||||||
ax[0].set_ybound(lower=0)
|
ax[0].set_ybound(lower=0)
|
||||||
ax[1].set_ybound(lower=0)
|
ax[1].set_ybound(lower=0)
|
||||||
#ax[2].set_ybound(lower=0)
|
ax[2].set_ybound(lower=0)
|
||||||
|
ax[3].set_ybound(lower=0)
|
||||||
|
|
||||||
# insert legend
|
# ax[0].plot(round, rec_av_n)
|
||||||
ax[1].legend(('class 0', 'class 1', 'class 2'))
|
# ax[2].set_xlabel('Iteration number')
|
||||||
ax[2].legend(('class 0', 'class 1', 'class 2'))
|
# ax[0].set_ylabel('Metrics without stratified sampling')
|
||||||
|
|
||||||
fig.tight_layout()
|
# ax[1].plot(round, man_0, round, man_1, round, man_2)
|
||||||
|
# ax[1].set_ylabel('Fraction of manual labels')
|
||||||
|
|
||||||
plt.savefig('..\\visualization\\Labeling_Grafik_070219.png')
|
# ax[2].plot(round, est_0, round, est_1, round, est_2)
|
||||||
|
# ax[2].set_ylabel('Fraction of estimated labels')
|
||||||
|
|
||||||
|
# # limit x axis
|
||||||
|
# ax[0].set_xbound(lower=1, upper=9)
|
||||||
|
# ax[1].set_xbound(lower=1, upper=9)
|
||||||
|
# ax[2].set_xbound(lower=1, upper=9)
|
||||||
|
|
||||||
|
# ax[0].set_ybound(lower=0)
|
||||||
|
# ax[1].set_ybound(lower=0)
|
||||||
|
# #ax[2].set_ybound(lower=0)
|
||||||
|
|
||||||
|
# # insert legend
|
||||||
|
# ax[1].legend(('class 0', 'class 1', 'class 2'))
|
||||||
|
# ax[2].legend(('class 0', 'class 1', 'class 2'))
|
||||||
|
|
||||||
|
plt.savefig('..\\visualization\\Labeling_plot_190404.png')
|
||||||
|
plt.savefig('..\\visualization\\Labeling_plot_190404.eps')
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
def plot_labeling_rounds_naive():
|
||||||
|
# round numbers
|
||||||
|
round = [0,1,2,3,4,5,6,7,8,9]
|
||||||
|
|
||||||
|
# naive study
|
||||||
|
rec_av_n = [np.nan, 33.3, 35.9, 38.1, 37.4, 33.3, 39.4, 40.7, 40.1, 38.9]
|
||||||
|
rec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 12.5, 0]
|
||||||
|
|
||||||
|
prec_av_n = [np.nan, 26.3, 44.56, 61.22, 49.7, 29.3, 63.3, 59.7, 77.1, 61.1]
|
||||||
|
prec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 100, 0]
|
||||||
|
|
||||||
|
acc_av_n = [np.nan, 86, 88.7, 89.3, 88, 92, 93.3, 86.7, 87.3, 88.7]
|
||||||
|
acc_1_n = [np.nan, 96, 95, 96, 96, 98, 99, 94, 93, 97.0]
|
||||||
|
|
||||||
|
fig, ax = plt.subplots(2, 1)
|
||||||
|
|
||||||
|
ax[0].plot(round, rec_av_n, round, prec_av_n, round, acc_av_n)
|
||||||
|
ax[0].set_ylabel('Average metrics')
|
||||||
|
ax[0].legend(('Recall', 'Precision', 'Accuracy'))
|
||||||
|
|
||||||
|
ax[1].plot(round, rec_1_n, round, prec_1_n, round, acc_1_n)
|
||||||
|
ax[1].set_ylabel('Class 1 metrics')
|
||||||
|
ax[1].legend(('Recall', 'Precision', 'Accuracy'))
|
||||||
|
|
||||||
|
ax[1].set_xlabel('Iteration number')
|
||||||
|
|
||||||
|
# limit x axis
|
||||||
|
ax[0].set_xbound(lower=1, upper=9)
|
||||||
|
ax[1].set_xbound(lower=1, upper=9)
|
||||||
|
|
||||||
|
# y axis
|
||||||
|
ax[1].set_ybound(lower=-5)
|
||||||
|
ax[0].set_ybound(lower=-5)
|
||||||
|
|
||||||
|
plt.savefig('..\\visualization\\Labeling_plot_190411.png')
|
||||||
|
plt.savefig('..\\visualization\\Labeling_plot_190411.eps')
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
def plot_cumulative():
|
def plot_cumulative():
|
||||||
# load pickle object
|
# load pickle object
|
||||||
with open('../obj/array_class_probs_stratified_round_9.pkl', 'rb') as input:
|
with open('../obj/array_3model_svm_class2.pkl', 'rb') as input:
|
||||||
list = pickle.load(input)
|
list = pickle.load(input)
|
||||||
|
|
||||||
# sort list in descending order
|
# sort list in descending order
|
||||||
|
@ -80,18 +163,25 @@ class LabelingPlotter():
|
||||||
#ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])
|
#ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])
|
||||||
|
|
||||||
|
|
||||||
ax.grid(True)
|
#ax.grid(True)
|
||||||
#ax.legend(loc='right')
|
#ax.legend(loc='right')
|
||||||
#ax.set_title('Cumulative distribution of highest estimated probability')
|
ax.set_title('Predictions class 2 (SVM)')
|
||||||
ax.set_xlabel('Highest estimated probability')
|
# for iterations
|
||||||
ax.set_ylabel('Fraction of articles with this highest estimated probability')
|
#ax.set_xlabel('Highest estimated probability')
|
||||||
|
#ax.set_ylabel('Fraction of articles with this highest estimated probability')
|
||||||
|
# for 3-models
|
||||||
|
ax.set_xlabel('Estimated probability for class 2')
|
||||||
|
ax.set_ylabel('Fraction of articles with this probability')
|
||||||
|
#plt.axis([0.97, 1, 0.95, 1.01])
|
||||||
#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
|
#plt.axis([0.5, 0.99, 0, 0.006]) #round 9
|
||||||
plt.axis([0.5, 1, 0, 0.015]) # round 9 stratified
|
#plt.axis([0.5, 1, 0, 0.015]) # round 9 stratified
|
||||||
#plt.axis([0.65, 1, 0, 0.003]) # round 10
|
#plt.axis([0.65, 1, 0, 0.003]) # round 10
|
||||||
#plt.axis([0.7, 1, 0, 0.002]) # round 11
|
#plt.axis([0.7, 1, 0, 0.002]) # round 11
|
||||||
#ax.set_xbound(lower=0.5, upper=0.99)
|
#ax.set_xbound(lower=0.5, upper=0.99)
|
||||||
plt.savefig('..\\visualization\\proba_stratified_round_9.png')
|
#plt.savefig('..\\visualization\\proba_stratified_round_9.png')
|
||||||
plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
|
#plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
|
||||||
|
plt.savefig('..\\visualization\\3model_svm_class2.png')
|
||||||
|
plt.savefig('..\\visualization\\3model_svm_class2.eps')
|
||||||
|
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
@ -121,4 +211,5 @@ class LabelingPlotter():
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
#LabelingPlotter.plot_correlation()
|
#LabelingPlotter.plot_correlation()
|
||||||
LabelingPlotter.plot_cumulative()
|
#LabelingPlotter.plot_cumulative()
|
||||||
|
LabelingPlotter.plot_labeling_rounds_naive()
|
|
@ -20,7 +20,7 @@ class MNBInteractive:
|
||||||
However, in practice, fractional counts such as tf-idf may also work.
|
However, in practice, fractional counts such as tf-idf may also work.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
def estimate_mnb(labeled_data, unlabeled_data, sklearn_cv=False):
|
def estimate_mnb(labeled_data, unlabeled_data, sklearn_cv=True):
|
||||||
'''fits naive bayes model
|
'''fits naive bayes model
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,7 @@ from sklearn.naive_bayes import MultinomialNB
|
||||||
|
|
||||||
class MultinomialNaiveBayes:
|
class MultinomialNaiveBayes:
|
||||||
|
|
||||||
def make_mnb(dataset, sklearn_cv=True, percentile=100):
|
def make_mnb(dataset, sklearn_cv=True, percentile=100, bigram=False):
|
||||||
'''fits naive bayes model with StratifiedKFold
|
'''fits naive bayes model with StratifiedKFold
|
||||||
'''
|
'''
|
||||||
print('# starting multinomial naive bayes')
|
print('# starting multinomial naive bayes')
|
||||||
|
@ -29,7 +29,13 @@ class MultinomialNaiveBayes:
|
||||||
y = dataset['Label']
|
y = dataset['Label']
|
||||||
|
|
||||||
if sklearn_cv:
|
if sklearn_cv:
|
||||||
cv = CountVectorizer()
|
if bigram:
|
||||||
|
cv = CountVectorizer(ngram_range=(2,2))
|
||||||
|
else:
|
||||||
|
# ignore company names
|
||||||
|
company_names_list = BagOfWords.load_company_names()
|
||||||
|
stopwords = list(BagOfWords.set_stop_words()) + company_names_list
|
||||||
|
cv = CountVectorizer(stop_words = stopwords)
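A brief note on why the two stop-word lists are concatenated with + rather than chained through list.extend(): extend() mutates the list in place and returns None, so a chained call would hand CountVectorizer an empty stop-word argument. A small sketch with invented stand-ins for the BagOfWords helpers:

from sklearn.feature_extraction.text import CountVectorizer

base_stopwords = ['the', 'and', 'of']        # stand-in for BagOfWords.set_stop_words()
company_names = ['acme', 'globex']           # stand-in for BagOfWords.load_company_names()
stopwords = base_stopwords + company_names   # .extend() would return None here

cv_unigram = CountVectorizer(stop_words=stopwords)
print(cv_unigram.fit(['acme and globex announce merger']).get_feature_names())
# ['announce', 'merger']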
|
||||||
|
|
||||||
# use stratified k-fold cross-validation as split method
|
# use stratified k-fold cross-validation as split method
|
||||||
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
|
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
|
||||||
|
@ -43,11 +49,6 @@ class MultinomialNaiveBayes:
|
||||||
precision_scores = []
|
precision_scores = []
|
||||||
f1_scores = []
|
f1_scores = []
|
||||||
|
|
||||||
# probabilities of each class (of each fold)
|
|
||||||
#class_prob = []
|
|
||||||
# counts number of training samples observed in each class
|
|
||||||
#class_counts = []
|
|
||||||
|
|
||||||
# for each fold
|
# for each fold
|
||||||
n = 0
|
n = 0
|
||||||
for train, test in skf.split(X,y):
|
for train, test in skf.split(X,y):
|
||||||
|
@ -90,13 +91,6 @@ class MultinomialNaiveBayes:
|
||||||
#predict class
|
#predict class
|
||||||
predictions_train = classifier.predict(training_data_r)
|
predictions_train = classifier.predict(training_data_r)
|
||||||
predictions_test = classifier.predict(testing_data_r)
|
predictions_test = classifier.predict(testing_data_r)
|
||||||
# print('train:')
|
|
||||||
# print(y[train])
|
|
||||||
# print('test:')
|
|
||||||
# print(y[test])
|
|
||||||
# print()
|
|
||||||
# print('pred')
|
|
||||||
# print(predictions_test)
|
|
||||||
|
|
||||||
#print and store metrics
|
#print and store metrics
|
||||||
rec = recall_score(y[test], predictions_test, average='weighted')
|
rec = recall_score(y[test], predictions_test, average='weighted')
|
||||||
|
@ -113,22 +107,19 @@ class MultinomialNaiveBayes:
|
||||||
#class_counts.append(classifier.class_count_)
|
#class_counts.append(classifier.class_count_)
|
||||||
|
|
||||||
##########################
|
##########################
|
||||||
# probability estimates for the test vector (testing_data)
|
|
||||||
class_probs = classifier.predict_proba(testing_data)
|
|
||||||
|
|
||||||
# number of samples encountered for each class during fitting
|
|
||||||
# this value is weighted by the sample weight when provided
|
|
||||||
class_count = classifier.class_count_
|
|
||||||
|
|
||||||
# classes in order used
|
# classes in order used
|
||||||
classes = classifier.classes_
|
classes = classifier.classes_
|
||||||
|
|
||||||
print('average: recall, precision, f1 score')
|
print('Recall (Min): ' + str(min(recall_scores)))
|
||||||
print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
|
print('Recall (Max): ' + str(max(recall_scores)))
|
||||||
|
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
|
||||||
|
print()
|
||||||
|
print('Precision (Min): ' + str(min(precision_scores)))
|
||||||
|
print('Precision (Max): ' + str(max(precision_scores)))
|
||||||
|
print('Precision (Average): ' + str(sum(precision_scores)/len(precision_scores)))
|
||||||
|
|
||||||
# return classes and vector of class estimates
|
# return classes and vector of class estimates
|
||||||
return recall_scores, precision_scores, f1_scores, class_probs
|
return recall_scores, precision_scores
|
||||||
|
|
||||||
######## only needed for the resubstitution error ########
|
######## only needed for the resubstitution error ########
|
||||||
def analyze_errors(training, testing):
|
def analyze_errors(training, testing):
|
||||||
|
@ -195,4 +186,4 @@ if __name__ == '__main__':
|
||||||
quotechar='\'')
|
quotechar='\'')
|
||||||
|
|
||||||
# select only labeled articles
|
# select only labeled articles
|
||||||
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=True, percentile=100)
|
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=False)
|
|
@ -17,13 +17,14 @@ from sklearn.metrics import recall_score, precision_score
|
||||||
import sklearn
|
import sklearn
|
||||||
from sklearn.model_selection import StratifiedKFold
|
from sklearn.model_selection import StratifiedKFold
|
||||||
from sklearn.naive_bayes import MultinomialNB
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.svm import LinearSVC
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
|
||||||
class MultinomialNaiveBayes_Word2Vec:
|
class MultinomialNaiveBayes_Word2Vec:
|
||||||
|
|
||||||
def make_mnb(dataset, sklearn_cv=True, percentile=100):
|
def make_mnb(dataset):
|
||||||
'''fits naive bayes model with StratifiedKFold
|
'''fits naive bayes model with StratifiedKFold
|
||||||
'''
|
'''
|
||||||
vector_size=150
|
|
||||||
|
|
||||||
def read_corpus(data, tokens_only=False):
|
def read_corpus(data, tokens_only=False):
|
||||||
list_of_lists = []
|
list_of_lists = []
|
||||||
|
@ -35,7 +36,13 @@ class MultinomialNaiveBayes_Word2Vec:
|
||||||
list_of_lists.append(gensim.models.doc2vec.TaggedDocument(BagOfWords.extract_words(text), [i]))
|
list_of_lists.append(gensim.models.doc2vec.TaggedDocument(BagOfWords.extract_words(text), [i]))
|
||||||
return list_of_lists
|
return list_of_lists
|
||||||
|
|
||||||
print('# starting multinomial naive bayes')
|
def normalize_vector(two_dim_array, min, max):
|
||||||
|
norm_array = two_dim_array
|
||||||
|
for (x,y), value in np.ndenumerate(two_dim_array):
|
||||||
|
norm_array[x][y] = ((value - min) / (max - min))
|
||||||
|
return norm_array
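The normalize_vector helper above rescales the doc2vec vectors into [0, 1] so they can be fed to estimators that reject negative feature values (e.g. MultinomialNB). The same min-max scaling can be written as one vectorized numpy expression; a sketch on placeholder data:

import numpy as np

vectors = np.array([[-0.3, 0.7], [0.1, -0.9]])   # placeholder doc2vec output
lo, hi = vectors.min(), vectors.max()
scaled = (vectors - lo) / (hi - lo)              # every entry now lies in [0, 1]
print(scaled)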
|
||||||
|
|
||||||
|
print('# starting multinomial naive bayes with Word2Vec')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
# split data into text and label set
|
# split data into text and label set
|
||||||
|
@ -46,20 +53,19 @@ class MultinomialNaiveBayes_Word2Vec:
|
||||||
# use stratified k-fold cross-validation as split method
|
# use stratified k-fold cross-validation as split method
|
||||||
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
|
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
|
||||||
|
|
||||||
classifier = MultinomialNB(alpha=1.0e-10,
|
#classifier = MultinomialNB(alpha=1.0e-10,
|
||||||
fit_prior=False,
|
# fit_prior=False,
|
||||||
class_prior=None)
|
# class_prior=None)
|
||||||
|
|
||||||
|
# classifier = SVC(probability=True,
|
||||||
|
# gamma='auto')
|
||||||
|
classifier = LinearSVC()
|
||||||
|
|
||||||
# metrics
|
# metrics
|
||||||
recall_scores = []
|
recall_scores = []
|
||||||
precision_scores = []
|
precision_scores = []
|
||||||
f1_scores = []
|
f1_scores = []
|
||||||
|
|
||||||
# probabilities of each class (of each fold)
|
|
||||||
#class_prob = []
|
|
||||||
# counts number of training samples observed in each class
|
|
||||||
#class_counts = []
|
|
||||||
|
|
||||||
# for each fold
|
# for each fold
|
||||||
n = 0
|
n = 0
|
||||||
for train, test in skf.split(X,y):
|
for train, test in skf.split(X,y):
|
||||||
|
@ -68,28 +74,51 @@ class MultinomialNaiveBayes_Word2Vec:
|
||||||
print('# split no. ' + str(n))
|
print('# split no. ' + str(n))
|
||||||
|
|
||||||
# train model with gensim
|
# train model with gensim
|
||||||
training_data = read_corpus(X[train], tokens_only=False)
|
tagged_train_data = read_corpus(X[train], tokens_only=False)
|
||||||
testing_data = read_corpus(X[test], tokens_only=True)
|
tagged_test_data = read_corpus(X[test], tokens_only=False)
|
||||||
all_data = read_corpus(X, tokens_only=False)
|
|
||||||
|
|
||||||
# instantiate a Doc2Vec object
|
# instantiate a Doc2Vec object
|
||||||
doc2vec_model = Doc2Vec(training_data, vector_size=100, window=2, min_count=2, epochs = 40)
|
model = Doc2Vec(vector_size=100,
|
||||||
|
min_count=20,
|
||||||
|
epochs=40,
|
||||||
|
negative=0,
|
||||||
|
workers=1,
|
||||||
|
seed=5,
|
||||||
|
hs=1)
|
||||||
|
|
||||||
# Question: negative values are not allowed in here for Naive Bayes, are they?
|
model.build_vocab(tagged_train_data)
|
||||||
print(doc2vec_model.docvecs[0])
|
|
||||||
print(doc2vec_model.docvecs[1])
|
model.train(tagged_train_data,
|
||||||
print(doc2vec_model.docvecs[2])
|
total_examples=model.corpus_count,
|
||||||
|
epochs=model.epochs)
|
||||||
training_data = [doc2vec_model.docvecs[i] for i in range(len(training_data))]
|
|
||||||
|
model.docvecs.count
|
||||||
# Question: do the testing data also need to be tagged?
|
|
||||||
testing_data = [doc2vec_model.infer_vector(vector) for vector in testing_data]
|
X_train=[model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
|
||||||
|
X_test=[model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]
|
||||||
|
|
||||||
|
# convert matrix
|
||||||
|
X_train=np.vstack(X_train)
|
||||||
|
X_test=np.vstack(X_test)
|
||||||
|
|
||||||
|
# min max for normalization
|
||||||
|
minimum = min(X_train.min(), X_test.min())
|
||||||
|
maximum = max(X_train.max(), X_test.max())
|
||||||
|
|
||||||
|
X_test_norm = normalize_vector(X_test, minimum, maximum)
|
||||||
|
X_train_norm = normalize_vector(X_train, minimum, maximum)
|
||||||
|
|
||||||
|
# shape vectors
|
||||||
|
X_test_norm.shape
|
||||||
|
y[test].shape
|
||||||
|
X_train_norm.shape
|
||||||
|
y[train].shape
|
||||||
|
|
||||||
#fit classifier
|
#fit classifier
|
||||||
classifier.fit(training_data, y[train])
|
classifier.fit(X_train_norm, y[train])
|
||||||
#predict class
|
#predict class
|
||||||
predictions_train = classifier.predict(training_data)
|
predictions_train = classifier.predict(X_train_norm)
|
||||||
predictions_test = classifier.predict(testing_data)
|
predictions_test = classifier.predict(X_test_norm)
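Pulled together, the Doc2Vec steps of this fold are: tag the tokenized articles, build the vocabulary, train, infer a fixed-size vector per document, then fit the linear SVM on those vectors. A compact, self-contained sketch on toy tokens, assuming gensim 3.x (where infer_vector still accepts steps=) and using default toy-friendly parameters rather than the ones above:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.svm import LinearSVC

train_texts = [['merger', 'announced'], ['quarterly', 'results'],
               ['merger', 'talks'], ['new', 'ceo']]
train_labels = [1, 0, 1, 0]
tagged = [TaggedDocument(words, [i]) for i, words in enumerate(train_texts)]

model = Doc2Vec(vector_size=100, min_count=1, epochs=40, workers=1, seed=5)
model.build_vocab(tagged)
model.train(tagged, total_examples=model.corpus_count, epochs=model.epochs)

X_train = [model.infer_vector(doc.words, steps=20) for doc in tagged]
clf = LinearSVC().fit(X_train, train_labels)
print(clf.predict([model.infer_vector(['merger', 'deal'], steps=20)]))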
|
||||||
|
|
||||||
#print and store metrics
|
#print and store metrics
|
||||||
rec = recall_score(y[test], predictions_test, average='weighted')
|
rec = recall_score(y[test], predictions_test, average='weighted')
|
||||||
|
@ -104,21 +133,25 @@ class MultinomialNaiveBayes_Word2Vec:
|
||||||
|
|
||||||
##########################
|
##########################
|
||||||
# probability estimates for the test vector (testing_data)
|
# probability estimates for the test vector (testing_data)
|
||||||
class_probs = classifier.predict_proba(testing_data)
|
#class_probs = classifier.predict_proba(X_test_norm)
|
||||||
|
|
||||||
# number of samples encountered for each class during fitting
|
# number of samples encountered for each class during fitting
|
||||||
# this value is weighted by the sample weight when provided
|
# this value is weighted by the sample weight when provided
|
||||||
class_count = classifier.class_count_
|
#class_count = classifier.class_count_
|
||||||
|
|
||||||
# classes in order used
|
# classes in order used
|
||||||
classes = classifier.classes_
|
#classes = classifier.classes_
|
||||||
|
|
||||||
print('average: recall, precision, f1 score')
|
|
||||||
print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
|
|
||||||
|
|
||||||
|
print('Recall (Min): ' + str(min(recall_scores)))
|
||||||
|
print('Recall (Max): ' + str(max(recall_scores)))
|
||||||
|
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
|
||||||
|
print()
|
||||||
|
print('Precision (Min): ' + str(min(precision_scores)))
|
||||||
|
print('Precision (Max): ' + str(max(precision_scores)))
|
||||||
|
print('Precision (Average): ' + str(sum(precision_scores)/len(precision_scores)))
|
||||||
|
|
||||||
# return classes and vector of class estimates
|
# return classes and vector of class estimates
|
||||||
return recall_scores, precision_scores, f1_scores, class_probs
|
return recall_scores, precision_scores, f1_scores#, class_probs
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
@ -135,4 +168,4 @@ if __name__ == '__main__':
|
||||||
quotechar='\'')
|
quotechar='\'')
|
||||||
|
|
||||||
# select only labeled articles
|
# select only labeled articles
|
||||||
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=False, percentile=100)
|
MultinomialNaiveBayes_Word2Vec.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))
|
|
@ -0,0 +1,198 @@
|
||||||
|
'''
|
||||||
|
Multinomial Naive Bayes Classifier
|
||||||
|
==================================
|
||||||
|
'''
|
||||||
|
|
||||||
|
from BagOfWords import BagOfWords
|
||||||
|
|
||||||
|
import csv
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_selection import SelectPercentile
|
||||||
|
from sklearn.metrics import recall_score, precision_score, f1_score
|
||||||
|
import sklearn
|
||||||
|
from sklearn.model_selection import StratifiedKFold
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
|
||||||
|
class MultinomialNaiveBayes:
|
||||||
|
|
||||||
|
def make_mnb(dataset, sklearn_cv=True, percentile=100):
|
||||||
|
'''fits naive bayes model with StratifiedKFold
|
||||||
|
'''
|
||||||
|
print('# starting multinomial naive bayes')
|
||||||
|
print('# ...')
|
||||||
|
|
||||||
|
# split data into text and label set
|
||||||
|
# join title and text
|
||||||
|
X = dataset['Title'] + '. ' + dataset['Text']
|
||||||
|
y = dataset['Label']
|
||||||
|
|
||||||
|
if sklearn_cv:
|
||||||
|
cv = CountVectorizer(ngram_range = (1,2))
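For reference on the ngram_range settings used across these files: (1,2) keeps unigrams and bigrams together, while (2,2) (the bigram-only branch in the other classifier) keeps bigrams alone. A quick illustration:

from sklearn.feature_extraction.text import CountVectorizer

docs = ['merger announced today']
print(CountVectorizer(ngram_range=(1, 2)).fit(docs).get_feature_names())
# ['announced', 'announced today', 'merger', 'merger announced', 'today']
print(CountVectorizer(ngram_range=(2, 2)).fit(docs).get_feature_names())
# ['announced today', 'merger announced']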
|
||||||
|
|
||||||
|
# use stratified k-fold cross-validation as split method
|
||||||
|
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
|
||||||
|
|
||||||
|
classifier = MultinomialNB(alpha=1.0e-10,
|
||||||
|
fit_prior=False,
|
||||||
|
class_prior=None)
|
||||||
|
|
||||||
|
# metrics
|
||||||
|
recall_scores = []
|
||||||
|
precision_scores = []
|
||||||
|
f1_scores = []
|
||||||
|
|
||||||
|
# probabilities of each class (of each fold)
|
||||||
|
#class_prob = []
|
||||||
|
# counts number of training samples observed in each class
|
||||||
|
#class_counts = []
|
||||||
|
|
||||||
|
# for each fold
|
||||||
|
n = 0
|
||||||
|
for train, test in skf.split(X,y):
|
||||||
|
|
||||||
|
n += 1
|
||||||
|
print('# split no. ' + str(n))
|
||||||
|
|
||||||
|
if sklearn_cv:
|
||||||
|
# use sklearn CountVectorizer
|
||||||
|
# fit the training data and then return the matrix
|
||||||
|
|
||||||
|
training_data = cv.fit_transform(X[train], y[train]).toarray()
|
||||||
|
# transform testing data and return the matrix
|
||||||
|
testing_data = cv.transform(X[test]).toarray()
|
||||||
|
else:
|
||||||
|
# use my own BagOfWords python implementation
|
||||||
|
stemming = True
|
||||||
|
rel_freq = True
|
||||||
|
extracted_words = BagOfWords.extract_all_words(X[train])
|
||||||
|
vocab = BagOfWords.make_vocab(extracted_words)
|
||||||
|
|
||||||
|
# fit the training data and then return the matrix
|
||||||
|
training_data = BagOfWords.make_matrix(extracted_words,
|
||||||
|
vocab, rel_freq, stemming)
|
||||||
|
# transform testing data and return the matrix
|
||||||
|
extracted_words = BagOfWords.extract_all_words(X[test])
|
||||||
|
testing_data = BagOfWords.make_matrix(extracted_words,
|
||||||
|
vocab, rel_freq, stemming)
|
||||||
|
|
||||||
|
# apply select percentile
|
||||||
|
selector = SelectPercentile(percentile=percentile)
|
||||||
|
selector.fit(training_data, y[train])
|
||||||
|
|
||||||
|
# new reduced data sets
|
||||||
|
training_data_r = selector.transform(training_data)
|
||||||
|
testing_data_r = selector.transform(testing_data)
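SelectPercentile keeps only the highest-scoring fraction of features, so with the percentile=100 passed in at the bottom of the file the transform keeps everything and is effectively a pass-through. A toy sketch (chi2 is chosen here only for illustration; the code above relies on the default score function):

from sklearn.feature_selection import SelectPercentile, chi2
import numpy as np

X_toy = np.array([[1, 0, 3], [0, 2, 1], [1, 1, 0], [0, 3, 2]])
y_toy = np.array([0, 1, 0, 1])
selector = SelectPercentile(chi2, percentile=50).fit(X_toy, y_toy)
print(selector.transform(X_toy).shape)   # roughly half of the columns survive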
|
||||||
|
|
||||||
|
#fit classifier
|
||||||
|
classifier.fit(training_data_r, y[train])
|
||||||
|
#predict class
|
||||||
|
predictions_train = classifier.predict(training_data_r)
|
||||||
|
predictions_test = classifier.predict(testing_data_r)
|
||||||
|
# print('train:')
|
||||||
|
# print(y[train])
|
||||||
|
# print('test:')
|
||||||
|
# print(y[test])
|
||||||
|
# print()
|
||||||
|
# print('pred')
|
||||||
|
# print(predictions_test)
|
||||||
|
|
||||||
|
#print and store metrics
|
||||||
|
rec = recall_score(y[test], predictions_test, average='weighted')
|
||||||
|
print('rec: ' + str(rec))
|
||||||
|
recall_scores.append(rec)
|
||||||
|
prec = precision_score(y[test], predictions_test, average='weighted')
|
||||||
|
print('prec: ' + str(prec))
|
||||||
|
print('#')
|
||||||
|
precision_scores.append(prec)
|
||||||
|
# equation for f1 score
|
||||||
|
f1_scores.append(2 * (prec * rec)/(prec + rec))
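The expression above is simply the harmonic mean of precision and recall, F1 = 2·P·R / (P + R); for example:

prec, rec = 0.8, 0.6
f1 = 2 * (prec * rec) / (prec + rec)   # = 0.6857..., never above the arithmetic mean
print(round(f1, 4))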
|
||||||
|
|
||||||
|
#class_prob.append(classifier.class_prior_)
|
||||||
|
#class_counts.append(classifier.class_count_)
|
||||||
|
|
||||||
|
##########################
|
||||||
|
# probability estimates for the test vector (testing_data)
|
||||||
|
class_probs = classifier.predict_proba(testing_data)
|
||||||
|
|
||||||
|
# number of samples encountered for each class during fitting
|
||||||
|
# this value is weighted by the sample weight when provided
|
||||||
|
class_count = classifier.class_count_
|
||||||
|
|
||||||
|
# classes in order used
|
||||||
|
classes = classifier.classes_
|
||||||
|
|
||||||
|
print('average: recall, precision, f1 score')
|
||||||
|
print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
|
||||||
|
|
||||||
|
|
||||||
|
# return classes and vector of class estimates
|
||||||
|
return recall_scores, precision_scores, f1_scores, class_probs
|
||||||
|
|
||||||
|
######## only needed for the resubstitution error ########
|
||||||
|
def analyze_errors(training, testing):
|
||||||
|
'''calculates resubstitution error
|
||||||
|
shows indices of false classified articles
|
||||||
|
uses Gaussian Bayes with train test split
|
||||||
|
'''
|
||||||
|
X_train = training['Title'] + ' ' + training['Text']
|
||||||
|
y_train = training['Label']
|
||||||
|
|
||||||
|
X_test = testing['Title'] + ' ' + testing['Text']
|
||||||
|
y_test = testing['Label']
|
||||||
|
|
||||||
|
count_vector = CountVectorizer()
|
||||||
|
|
||||||
|
# fit the training data and then return the matrix
|
||||||
|
training_data = count_vector.fit_transform(X_train).toarray()
|
||||||
|
|
||||||
|
# transform testing data and return the matrix
|
||||||
|
testing_data = count_vector.transform(X_test).toarray()
|
||||||
|
|
||||||
|
# Naive Bayes
|
||||||
|
classifier = MultinomialNB(alpha=1.0e-10,
|
||||||
|
fit_prior=False,
|
||||||
|
class_prior=None)
|
||||||
|
# fit classifier
|
||||||
|
classifier.fit(training_data, y_train)
|
||||||
|
|
||||||
|
# Predict class
|
||||||
|
predictions = classifier.predict(testing_data)
|
||||||
|
|
||||||
|
print(type(y_test))
|
||||||
|
print(len(y_test))
|
||||||
|
print(type(predictions))
|
||||||
|
print(len(predictions))
|
||||||
|
|
||||||
|
print('Errors at index:')
|
||||||
|
print()
|
||||||
|
n = 0
|
||||||
|
for i in range(len(y_test)):
|
||||||
|
if y_test[i] != predictions[i]:
|
||||||
|
n += 1
|
||||||
|
print('error no.{}'.format(n))
|
||||||
|
print('prediction at index {} is: {}, but actual is: {}'
|
||||||
|
.format(i, predictions[i], y_test[i]))
|
||||||
|
print(X_test[i])
|
||||||
|
print(y_test[i])
|
||||||
|
print()
|
||||||
|
#print metrics
|
||||||
|
print('F1 score: ', format(f1_score(y_test, predictions)))
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
# read csv file
|
||||||
|
print('# reading dataset')
|
||||||
|
print('# ...')
|
||||||
|
|
||||||
|
# read current data set from csv
|
||||||
|
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
|
||||||
|
sep='|',
|
||||||
|
usecols=range(1,13), # drop first column 'unnamed'
|
||||||
|
encoding='utf-8',
|
||||||
|
quoting=csv.QUOTE_NONNUMERIC,
|
||||||
|
quotechar='\'')
|
||||||
|
|
||||||
|
# select only labeled articles
|
||||||
|
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=True, percentile=100)
|
|
@ -48,7 +48,6 @@ class NaiveBayes:
|
||||||
# metrics
|
# metrics
|
||||||
recall_scores = []
|
recall_scores = []
|
||||||
precision_scores = []
|
precision_scores = []
|
||||||
#f1_scores = []
|
|
||||||
|
|
||||||
# probabilities of each class (of each fold)
|
# probabilities of each class (of each fold)
|
||||||
class_prob = []
|
class_prob = []
|
||||||
|
@ -113,32 +112,15 @@ class NaiveBayes:
|
||||||
|
|
||||||
##########################
|
##########################
|
||||||
#print metrics of test set
|
#print metrics of test set
|
||||||
# print('-------------------------')
|
print('Recall (Min): ' + str(min(recall_scores)))
|
||||||
# print('prediction of testing set:')
|
print('Recall (Max): ' + str(max(recall_scores)))
|
||||||
# print('Precision score: min = {}, max = {}, average = {}'
|
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
|
||||||
# .format(min(precision_scores),
|
print()
|
||||||
# max(precision_scores),
|
print('Precision (Min): ' + str(min(precision_scores)))
|
||||||
# sum(precision_scores)/float(len(precision_scores))))
|
print('Precision (Max): ' + str(max(precision_scores)))
|
||||||
# print('Recall score: min = {}, max = {}, average = {}'
|
print('Precision (Average): ' + str(sum(precision_scores)/len(precision_scores)))
|
||||||
# .format(min(recall_scores),
|
|
||||||
# max(recall_scores),
|
|
||||||
# sum(recall_scores)/float(len(recall_scores))))
|
|
||||||
# print('F1 score: min = {}, max = {}, average = {}'
|
|
||||||
# .format(min(f1_scores),
|
|
||||||
# max(f1_scores),
|
|
||||||
# sum(f1_scores)/float(len(f1_scores))))
|
|
||||||
# print()
|
|
||||||
# # print probability of each class
|
|
||||||
# print('probability of each class:')
|
|
||||||
# print()
|
|
||||||
# print(class_prob)
|
|
||||||
# print()
|
|
||||||
# print('number of samples of each class:')
|
|
||||||
# print()
|
|
||||||
# print(class_counts)
|
|
||||||
# print()
|
|
||||||
|
|
||||||
return class_prob, class_counts, recall_scores, precision_scores#, f1_scores
|
return class_prob, class_counts, recall_scores, precision_scores
|
||||||
|
|
||||||
##### only needed for overfit testing ###########
|
##### only needed for overfit testing ###########
|
||||||
#print('overfit testing: prediction of training set')
|
#print('overfit testing: prediction of training set')
|
||||||
|
|
|
@ -0,0 +1,83 @@
|
||||||
|
'''
|
||||||
|
SVM Classifier for Interactive Labeling
|
||||||
|
=======================================
|
||||||
|
|
||||||
|
returns probabilities for classes needed for interactive labeling.
|
||||||
|
'''
|
||||||
|
from BagOfWords import BagOfWords
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_selection import SelectPercentile
|
||||||
|
from sklearn.metrics import recall_score, precision_score
|
||||||
|
from sklearn.model_selection import StratifiedKFold
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
|
||||||
|
class SVMInteractive:
|
||||||
|
|
||||||
|
def estimate_svm(labeled_data, unlabeled_data, sklearn_cv=True):
|
||||||
|
|
||||||
|
print('# SVM: starting interactive SVM...')
|
||||||
|
print()
|
||||||
|
|
||||||
|
# split labeled data into text and label set
|
||||||
|
# join title and text
|
||||||
|
X = labeled_data['Title'] + '. ' + labeled_data['Text']
|
||||||
|
y = labeled_data['Label']
|
||||||
|
|
||||||
|
# split unlabeled data into text and label set
|
||||||
|
# join title and text
|
||||||
|
U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']
|
||||||
|
l = unlabeled_data['Label']
|
||||||
|
|
||||||
|
if sklearn_cv:
|
||||||
|
cv = CountVectorizer()
|
||||||
|
|
||||||
|
# probability=True: enables probability estimates
|
||||||
|
# via an internal cross-validated Platt scaling
|
||||||
|
classifier = SVC(probability=True,
|
||||||
|
gamma='auto')
|
||||||
|
|
||||||
|
# probabilities of each class (of each fold)
|
||||||
|
class_probs = []
|
||||||
|
|
||||||
|
if sklearn_cv:
|
||||||
|
# use sklearn CountVectorizer
|
||||||
|
# fit the training data and then return the matrix
|
||||||
|
training_data = cv.fit_transform(X, y).toarray()
|
||||||
|
# transform testing data and return the matrix
|
||||||
|
testing_data = cv.transform(U).toarray()
|
||||||
|
else:
|
||||||
|
# use my own BagOfWords python implementation
|
||||||
|
stemming = True
|
||||||
|
rel_freq = False
|
||||||
|
extracted_words = BagOfWords.extract_all_words(X)
|
||||||
|
vocab = BagOfWords.make_vocab(extracted_words)
|
||||||
|
|
||||||
|
# fit the training data and then return the matrix
|
||||||
|
print('# SVM: fit training data and calculate matrix...')
|
||||||
|
print()
|
||||||
|
training_data = BagOfWords.make_matrix(extracted_words,
|
||||||
|
vocab, rel_freq, stemming)
|
||||||
|
|
||||||
|
# transform testing data and return the matrix
|
||||||
|
print('# SVM: transform testing data to matrix...')
|
||||||
|
print()
|
||||||
|
extracted_words = BagOfWords.extract_all_words(U)
|
||||||
|
testing_data = BagOfWords.make_matrix(extracted_words,
|
||||||
|
vocab, rel_freq, stemming)
|
||||||
|
|
||||||
|
#fit classifier
|
||||||
|
classifier.fit(training_data, y)
|
||||||
|
|
||||||
|
# probability estimates for the test vector (testing_data)
|
||||||
|
class_probs = classifier.predict_proba(testing_data)
|
||||||
|
|
||||||
|
# classes in order used
|
||||||
|
classes = classifier.classes_
|
||||||
|
|
||||||
|
print('# ending SVM')
|
||||||
|
|
||||||
|
# return classes and vector of class estimates
|
||||||
|
return classes, class_probs
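A hedged usage sketch of how the returned pair would typically be consumed in the interactive-labeling notebook; labeled_df and unlabeled_df are hypothetical DataFrames in the expected format, and the arrays below are invented placeholders standing in for the real output:

import numpy as np

# classes, class_probs = SVMInteractive.estimate_svm(labeled_df, unlabeled_df)
classes = np.array([0, 1, 2])                       # placeholder
class_probs = np.array([[0.1, 0.7, 0.2],
                        [0.6, 0.3, 0.1]])           # placeholder, one row per unlabeled article
best = class_probs.argmax(axis=1)
for row, col in enumerate(best):
    print('article', row, '-> estimated class', classes[col],
          'with probability', class_probs[row, col])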
|
|
@ -0,0 +1,81 @@
|
||||||
|
'''
|
||||||
|
SVM Classifier for Interactive Labeling
|
||||||
|
=======================================
|
||||||
|
|
||||||
|
returns class predictions (no probabilities, since LinearSVC is used) for interactive labeling.
|
||||||
|
'''
|
||||||
|
from BagOfWords import BagOfWords
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_selection import SelectPercentile
|
||||||
|
from sklearn.metrics import recall_score, precision_score
|
||||||
|
from sklearn.model_selection import StratifiedKFold
|
||||||
|
from sklearn.naive_bayes import MultinomialNB
|
||||||
|
from sklearn.svm import LinearSVC
|
||||||
|
|
||||||
|
class SVMInteractive_wp:
|
||||||
|
|
||||||
|
def estimate_svm(labeled_data, unlabeled_data, sklearn_cv=True):
|
||||||
|
|
||||||
|
print('# SVM: starting interactive SVM...')
|
||||||
|
print()
|
||||||
|
|
||||||
|
# split labeled data into text and label set
|
||||||
|
# join title and text
|
||||||
|
X = labeled_data['Title'] + '. ' + labeled_data['Text']
|
||||||
|
y = labeled_data['Label']
|
||||||
|
|
||||||
|
# split unlabeled data into text and label set
|
||||||
|
# join title and text
|
||||||
|
U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']
|
||||||
|
l = unlabeled_data['Label']
|
||||||
|
|
||||||
|
if sklearn_cv:
|
||||||
|
cv = CountVectorizer()
|
||||||
|
|
||||||
|
# LinearSVC: linear SVM without probability estimates,
|
||||||
|
# only hard class predictions are returned below
|
||||||
|
classifier = LinearSVC()
|
||||||
|
|
||||||
|
# probabilities of each class (of each fold)
|
||||||
|
class_probs = []
|
||||||
|
|
||||||
|
if sklearn_cv:
|
||||||
|
# use sklearn CountVectorizer
|
||||||
|
# fit the training data and then return the matrix
|
||||||
|
training_data = cv.fit_transform(X, y).toarray()
|
||||||
|
# transform testing data and return the matrix
|
||||||
|
testing_data = cv.transform(U).toarray()
|
||||||
|
else:
|
||||||
|
# use my own BagOfWords python implementation
|
||||||
|
stemming = True
|
||||||
|
rel_freq = False
|
||||||
|
extracted_words = BagOfWords.extract_all_words(X)
|
||||||
|
vocab = BagOfWords.make_vocab(extracted_words)
|
||||||
|
|
||||||
|
# fit the training data and then return the matrix
|
||||||
|
print('# SVM: fit training data and calculate matrix...')
|
||||||
|
print()
|
||||||
|
training_data = BagOfWords.make_matrix(extracted_words,
|
||||||
|
vocab, rel_freq, stemming)
|
||||||
|
|
||||||
|
# transform testing data and return the matrix
|
||||||
|
print('# SVM: transform testing data to matrix...')
|
||||||
|
print()
|
||||||
|
extracted_words = BagOfWords.extract_all_words(U)
|
||||||
|
testing_data = BagOfWords.make_matrix(extracted_words,
|
||||||
|
vocab, rel_freq, stemming)
|
||||||
|
|
||||||
|
#fit classifier
|
||||||
|
classifier.fit(training_data, y)
|
||||||
|
|
||||||
|
predictions_test = classifier.predict(testing_data)
|
||||||
|
|
||||||
|
# classes in order used
|
||||||
|
classes = classifier.classes_
|
||||||
|
|
||||||
|
print('# ending SVM')
|
||||||
|
|
||||||
|
# return classes and vector of class estimates
|
||||||
|
return classes, predictions_test
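Unlike SVC(probability=True), LinearSVC exposes no predict_proba, which is why this variant returns hard predictions only. If calibrated probabilities were wanted with a linear SVM, one option (not part of this commit) is to wrap it in sklearn's CalibratedClassifierCV; a toy sketch on synthetic points:

from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
import numpy as np

rng = np.random.RandomState(0)
X_toy = np.vstack([rng.normal(0, 1, (10, 2)), rng.normal(4, 1, (10, 2))])
y_toy = np.array([0] * 10 + [1] * 10)

clf = CalibratedClassifierCV(LinearSVC(), cv=3).fit(X_toy, y_toy)
print(clf.predict_proba(X_toy[:3]))   # calibrated class probabilities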
|
|
@ -19,103 +19,143 @@ import csv
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from sklearn.feature_extraction.text import CountVectorizer
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
from sklearn.feature_selection import SelectPercentile
|
from sklearn.feature_selection import SelectPercentile
|
||||||
from sklearn.metrics import f1_score, make_scorer
|
from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer, accuracy_score
|
||||||
from sklearn.model_selection import StratifiedKFold
|
from sklearn.model_selection import StratifiedKFold
|
||||||
from sklearn.model_selection import GridSearchCV
|
from sklearn.model_selection import GridSearchCV
|
||||||
from sklearn.pipeline import Pipeline
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.svm import LinearSVC
|
||||||
from sklearn.svm import SVC
|
from sklearn.svm import SVC
|
||||||
|
from sklearn.svm import NuSVC
|
||||||
|
|
||||||
class SVM:
|
class SVM_multiclass:
|
||||||
|
|
||||||
def make_svm(dataset, sklearn_cv=True):
|
def make_svm(dataset, sklearn_cv=True, percentile=100):
|
||||||
|
|
||||||
print('# fitting model')
|
print('# starting multinomial svm')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
# split data into text and label set
|
# split data into text and label set
|
||||||
|
# join title and text
|
||||||
# articles' text (title + text)
|
|
||||||
X = dataset['Title'] + '. ' + dataset['Text']
|
X = dataset['Title'] + '. ' + dataset['Text']
|
||||||
# articles' labels
|
|
||||||
y = dataset['Label']
|
y = dataset['Label']
|
||||||
matrix = pd.DataFrame()
|
|
||||||
|
|
||||||
# fit the training data and then return the matrix
|
|
||||||
if sklearn_cv:
|
if sklearn_cv:
|
||||||
# use sklearn CountVectorizer
|
|
||||||
matrix = CountVectorizer().fit_transform(X).toarray()
|
# ignore company names
|
||||||
else:
|
company_names_list = BagOfWords.load_company_names()
|
||||||
# use own BOW implementation
|
stopwords = list(BagOfWords.set_stop_words()) + company_names_list
|
||||||
matrix = BagOfWords.fit_transform(X)
|
cv = CountVectorizer(stop_words = stopwords)
|
||||||
|
|
||||||
# use stratified k-fold cross-validation as split method
|
# use stratified k-fold cross-validation as split method
|
||||||
skf = StratifiedKFold(n_splits = 10, shuffle=True)
|
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
|
||||||
|
|
||||||
# use only most important features
|
classifier = LinearSVC()
|
||||||
selector = SelectPercentile()
|
|
||||||
|
# for predict proba:
|
||||||
|
#classifier = SVC(probability=True,
|
||||||
|
# gamma='auto')
|
||||||
|
|
||||||
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
|
# metrics
|
||||||
|
recall_scores = []
|
||||||
|
precision_scores = []
|
||||||
|
accuracy_scores = []
|
||||||
|
f1_scores = []
|
||||||
|
|
||||||
grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75, 100],
|
# for each fold
|
||||||
'SVC__kernel': ['linear'],
|
n = 0
|
||||||
'SVC__gamma': [0.00001, 0.0001],
|
for train, test in skf.split(X,y):
|
||||||
'SVC__C': [0.1, 1]},
|
|
||||||
cv=skf,
|
|
||||||
scoring=make_scorer(f1_score, average='micro'))
|
|
||||||
|
|
||||||
print('# fit classifier')
|
n += 1
|
||||||
print('# ...')
|
print('# split no. ' + str(n))
|
||||||
|
|
||||||
grid.fit(matrix,y)
|
if sklearn_cv:
|
||||||
|
# use sklearn CountVectorizer
|
||||||
|
# fit the training data and then return the matrix
|
||||||
|
|
||||||
|
training_data = cv.fit_transform(X[train], y[train]).toarray()
|
||||||
|
# transform testing data and return the matrix
|
||||||
|
testing_data = cv.transform(X[test]).toarray()
|
||||||
|
else:
|
||||||
|
# use my own BagOfWords python implementation
|
||||||
|
stemming = True
|
||||||
|
rel_freq = True
|
||||||
|
extracted_words = BagOfWords.extract_all_words(X[train])
|
||||||
|
vocab = BagOfWords.make_vocab(extracted_words)
|
||||||
|
|
||||||
# DataFrame of results
|
# fit the training data and then return the matrix
|
||||||
df_results = grid.cv_results_
|
training_data = BagOfWords.make_matrix(extracted_words,
|
||||||
|
vocab, rel_freq, stemming)
|
||||||
|
# transform testing data and return the matrix
|
||||||
|
extracted_words = BagOfWords.extract_all_words(X[test])
|
||||||
|
testing_data = BagOfWords.make_matrix(extracted_words,
|
||||||
|
vocab, rel_freq, stemming)
|
||||||
|
|
||||||
# print results
|
# apply select percentile
|
||||||
######################
|
selector = SelectPercentile(percentile=percentile)
|
||||||
print('RESULTS:')
|
selector.fit(training_data, y[train])
|
||||||
print('')
|
|
||||||
print('mean_test_score:')
|
# new reduced data sets
|
||||||
print(df_results['mean_test_score'])
|
training_data_r = selector.transform(training_data)
|
||||||
print('')
|
testing_data_r = selector.transform(testing_data)
|
||||||
print('mean of means:')
|
|
||||||
print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
|
#fit classifier
|
||||||
print('')
|
classifier.fit(training_data_r, y[train])
|
||||||
print('best score:')
|
#predict class
|
||||||
print(grid.best_score_)
|
predictions_train = classifier.predict(training_data_r)
|
||||||
|
predictions_test = classifier.predict(testing_data_r)
|
||||||
|
|
||||||
|
#print and store metrics
|
||||||
|
rec = recall_score(y[test], predictions_test, average='weighted')
|
||||||
|
print('rec: ' + str(rec))
|
||||||
|
recall_scores.append(rec)
|
||||||
|
prec = precision_score(y[test], predictions_test, average='weighted')
|
||||||
|
print('prec: ' + str(prec))
|
||||||
|
print('#')
|
||||||
|
precision_scores.append(prec)
|
||||||
|
acc = accuracy_score(y[test], predictions_test)
|
||||||
|
accuracy_scores.append(acc)
|
||||||
|
print('acc: ' + str(acc))
|
||||||
|
print('#')
|
||||||
|
# equation for f1 score
|
||||||
|
f1_scores.append(2 * (prec * rec)/(prec + rec))
|
||||||
|
|
||||||
|
#class_prob.append(classifier.class_prior_)
|
||||||
|
#class_counts.append(classifier.class_count_)
|
||||||
|
#print(classifier.predict_proba(testing_data_r))
|
||||||
|
|
||||||
|
##########################
|
||||||
|
# classes in order used
|
||||||
|
classes = classifier.classes_
|
||||||
|
|
||||||
|
print('Recall (Min): ' + str(min(recall_scores)))
|
||||||
|
print('Recall (Max): ' + str(max(recall_scores)))
|
||||||
|
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
|
||||||
print()
|
print()
|
||||||
print('best parameters set found on development set:')
|
print('Precision (Min): ' + str(min(precision_scores)))
|
||||||
print(grid.best_params_)
|
print('Precision (Max): ' + str(max(precision_scores)))
|
||||||
|
print('Precision (Average): ' + str(sum(precision_scores)/len(precision_scores)))
|
||||||
print()
|
print()
|
||||||
|
print('Accuracy (Min): ' + str(min(accuracy_scores)))
|
||||||
|
print('Accuracy (Max): ' + str(max(accuracy_scores)))
|
||||||
|
print('Accuracy (Average): ' + str(sum(accuracy_scores)/len(accuracy_scores)))
|
||||||
|
|
||||||
if __name__ == '__main__':
|
# return classes and vector of class estimates
|
||||||
|
return recall_scores, precision_scores
|
||||||
|
|
||||||
print('# starting svm')
|
if __name__ == '__main__':
|
||||||
print('# ...')
|
|
||||||
|
|
||||||
#file = '..\\data\\classification_labelled_corrected.csv'
|
# read csv file
|
||||||
|
print('# reading dataset')
|
||||||
|
print('# ...')
|
||||||
|
|
||||||
# read csv file
|
# read current data set from csv
|
||||||
print('# reading dataset')
|
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
|
||||||
print('# ...')
|
sep='|',
|
||||||
|
usecols=range(1,13), # drop first column 'unnamed'
|
||||||
|
encoding='utf-8',
|
||||||
|
quoting=csv.QUOTE_NONNUMERIC,
|
||||||
|
quotechar='\'')
|
||||||
|
|
||||||
# data = pd.read_csv(file,
|
# select only labeled articles
|
||||||
# sep='|',
|
SVM_multiclass.make_svm(df.loc[df['Label'] != -1].reset_index(drop=True),
|
||||||
# engine='python',
|
sklearn_cv=True)
|
||||||
# decimal='.',
|
|
||||||
# quotechar='\'',
|
|
||||||
# quoting=csv.QUOTE_NONE)
|
|
||||||
# read current data set from csv
|
|
||||||
|
|
||||||
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
|
|
||||||
sep='|',
|
|
||||||
usecols=range(1,13), # drop first column 'unnamed'
|
|
||||||
encoding='utf-8',
|
|
||||||
quoting=csv.QUOTE_NONNUMERIC,
|
|
||||||
quotechar='\'')
|
|
||||||
data = df.loc[df['Label'] != -1].reset_index(drop=True)
|
|
||||||
|
|
||||||
use_count_vectorizer = True
|
|
||||||
make_svm(data, use_count_vectorizer)
|
|
||||||
|
|
||||||
print('# ending svm')
|
|
|
@ -0,0 +1,123 @@
|
||||||
|
'''
|
||||||
|
Support Vector Machines (SVM) Classifier
|
||||||
|
========================================
|
||||||
|
|
||||||
|
The SVM training algorithm builds a model from the training data that assigns
|
||||||
|
the test samples to one category ('merger' or 'not merger'),
|
||||||
|
making it a non-probabilistic binary linear classifier.
|
||||||
|
An SVM model is a representation of the samples as points in space,
|
||||||
|
mapped so that the examples of the separate categories are divided
|
||||||
|
by a clear gap that is as wide as possible.
|
||||||
|
New samples are then mapped into that same space and predicted
|
||||||
|
to belong to a category based on which side of the gap they fall.
|
||||||
|
'''
|
||||||
|
|
||||||
|
from BagOfWords import BagOfWords
|
||||||
|
|
||||||
|
import csv
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_selection import SelectPercentile
|
||||||
|
from sklearn.metrics import recall_score, f1_score, make_scorer
|
||||||
|
from sklearn.model_selection import StratifiedKFold
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
|
||||||
|
class SVM_multiclass_grid:
|
||||||
|
|
||||||
|
def make_svm(dataset, sklearn_cv=True):
|
||||||
|
|
||||||
|
print('# fitting model')
|
||||||
|
print('# ...')
|
||||||
|
|
||||||
|
# split data into text and label set
|
||||||
|
|
||||||
|
# articles' text (title + text)
|
||||||
|
X = dataset['Title'] + '. ' + dataset['Text']
|
||||||
|
# articles' labels
|
||||||
|
y = dataset['Label']
|
||||||
|
matrix = pd.DataFrame()
|
||||||
|
|
||||||
|
# fit the training data and then return the matrix
|
||||||
|
if sklearn_cv:
|
||||||
|
# use sklearn CountVectorizer
|
||||||
|
company_names_list = BagOfWords.load_company_names()
|
||||||
|
stopwords = list(BagOfWords.set_stop_words()) + company_names_list
|
||||||
|
matrix = CountVectorizer(stop_words = stopwords).fit_transform(X).toarray()
|
||||||
|
else:
|
||||||
|
# use own BOW implementation
|
||||||
|
matrix = BagOfWords.fit_transform(X)
|
||||||
|
|
||||||
|
# use stratified k-fold cross-validation as split method
|
||||||
|
skf = StratifiedKFold(n_splits = 10, shuffle=True)
|
||||||
|
|
||||||
|
# use only most important features
|
||||||
|
selector = SelectPercentile()
|
||||||
|
|
||||||
|
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
|
||||||
|
|
||||||
|
grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75, 100],
|
||||||
|
'SVC__kernel': ['linear'],
|
||||||
|
'SVC__gamma': [0.000001, 0.00001],
|
||||||
|
'SVC__C': [0.01, 0.1]},
|
||||||
|
cv=skf,
|
||||||
|
scoring=make_scorer(recall_score, average='micro'))
|
||||||
|
|
||||||
|
print('# fit classifier')
|
||||||
|
print('# ...')
|
||||||
|
|
||||||
|
grid.fit(matrix,y)
|
||||||
|
|
||||||
|
# DataFrame of results
|
||||||
|
df_results = grid.cv_results_
|
||||||
|
|
||||||
|
# print results
|
||||||
|
######################
|
||||||
|
print('RESULTS:')
|
||||||
|
print('')
|
||||||
|
print('mean_test_score:')
|
||||||
|
print(df_results['mean_test_score'])
|
||||||
|
print('')
|
||||||
|
print('mean of means:')
|
||||||
|
print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
|
||||||
|
print('')
|
||||||
|
print('best score:')
|
||||||
|
print(grid.best_score_)
|
||||||
|
print()
|
||||||
|
print('best parameters set found on development set:')
|
||||||
|
print(grid.best_params_)
|
||||||
|
print()
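To make the grid keys easier to trace back: each 'step__param' key refers to the name given to the pipeline step ('perc' for SelectPercentile, 'SVC' for the classifier). A self-contained toy run of the same pattern, on invented example texts:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

texts = ['merger announced today', 'companies agree to merge',
         'merger talks continue', 'acquisition deal signed',
         'quarterly results released', 'new ceo hired',
         'stock price falls sharply', 'dividend payout raised']
labels = [1, 1, 1, 1, 0, 0, 0, 0]
matrix = CountVectorizer().fit_transform(texts).toarray()

pipeline = Pipeline([('perc', SelectPercentile()), ('SVC', SVC())])
grid = GridSearchCV(pipeline,
                    {'perc__percentile': [50, 100],
                     'SVC__kernel': ['linear'],
                     'SVC__C': [0.1, 1]},
                    cv=2)
grid.fit(matrix, labels)
print(grid.best_params_, grid.best_score_)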
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
print('# starting svm')
|
||||||
|
print('# ...')
|
||||||
|
|
||||||
|
#file = '..\\data\\classification_labelled_corrected.csv'
|
||||||
|
|
||||||
|
# read csv file
|
||||||
|
print('# reading dataset')
|
||||||
|
print('# ...')
|
||||||
|
|
||||||
|
# data = pd.read_csv(file,
|
||||||
|
# sep='|',
|
||||||
|
# engine='python',
|
||||||
|
# decimal='.',
|
||||||
|
# quotechar='\'',
|
||||||
|
# quoting=csv.QUOTE_NONE)
|
||||||
|
# read current data set from csv
|
||||||
|
|
||||||
|
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
|
||||||
|
sep='|',
|
||||||
|
usecols=range(1,13), # drop first column 'unnamed'
|
||||||
|
encoding='utf-8',
|
||||||
|
quoting=csv.QUOTE_NONNUMERIC,
|
||||||
|
quotechar='\'')
|
||||||
|
data = df.loc[df['Label'] != -1].reset_index(drop=True)
|
||||||
|
|
||||||
|
use_count_vectorizer = True
|
||||||
|
make_svm(data, use_count_vectorizer)
|
||||||
|
|
||||||
|
print('# ending svm')
|
|
@ -0,0 +1,152 @@
|
||||||
|
'''
|
||||||
|
Support Vector Machines (SVM) Classifier
|
||||||
|
========================================
|
||||||
|
|
||||||
|
The SVM training algorithm builds a model from the training data that assigns
|
||||||
|
the test samples to one category ('merger' or 'not merger'),
|
||||||
|
making it a non-probabilistic binary linear classifier.
|
||||||
|
An SVM model is a representation of the samples as points in space,
|
||||||
|
mapped so that the examples of the separate categories are divided
|
||||||
|
by a clear gap that is as wide as possible.
|
||||||
|
New samples are then mapped into that same space and predicted
|
||||||
|
to belong to a category based on which side of the gap they fall.
|
||||||
|
'''
|
||||||
|
|
||||||
|
from BagOfWords import BagOfWords
|
||||||
|
|
||||||
|
import csv
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.feature_extraction.text import CountVectorizer
|
||||||
|
from sklearn.feature_selection import SelectPercentile
|
||||||
|
from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer
|
||||||
|
from sklearn.model_selection import StratifiedKFold
|
||||||
|
from sklearn.model_selection import GridSearchCV
|
||||||
|
from sklearn.pipeline import Pipeline
|
||||||
|
from sklearn.svm import LinearSVC
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
from sklearn.svm import NuSVC
|
||||||
|
|
||||||
|
class SVM_multiclass:
|
||||||
|
|
||||||
|
def make_svm(dataset, sklearn_cv=True, percentile=100):
|
||||||
|
|
||||||
|
print('# starting multinomial svm')
|
||||||
|
print('# ...')
|
||||||
|
|
||||||
|
# split data into text and label set
|
||||||
|
# join title and text
|
||||||
|
X = dataset['Title'] + '. ' + dataset['Text']
|
||||||
|
y = dataset['Label']
|
||||||
|
|
||||||
|
if sklearn_cv:
|
||||||
|
|
||||||
|
# ignore company names
|
||||||
|
company_names_list = BagOfWords.load_company_names()
|
||||||
|
stopwords = list(BagOfWords.set_stop_words()) + company_names_list
|
||||||
|
cv = CountVectorizer(stop_words = stopwords)
|
||||||
|
|
||||||
|
# use stratified k-fold cross-validation as split method
|
||||||
|
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
|
||||||
|
|
||||||
|
#classifier = LinearSVC()
|
||||||
|
|
||||||
|
# for predict proba:
|
||||||
|
classifier = SVC(probability=True,
|
||||||
|
gamma='auto')
|
||||||
|
|
||||||
|
# metrics
|
||||||
|
recall_scores = []
|
||||||
|
precision_scores = []
|
||||||
|
f1_scores = []
|
||||||
|
|
||||||
|
# for each fold
|
||||||
|
n = 0
|
||||||
|
for train, test in skf.split(X,y):
|
||||||
|
|
||||||
|
n += 1
|
||||||
|
print('# split no. ' + str(n))
|
||||||
|
|
||||||
|
if sklearn_cv:
|
||||||
|
# use sklearn CountVectorizer
|
||||||
|
# fit the training data and then return the matrix
|
||||||
|
|
||||||
|
training_data = cv.fit_transform(X[train], y[train]).toarray()
|
||||||
|
# transform testing data and return the matrix
|
||||||
|
testing_data = cv.transform(X[test]).toarray()
|
||||||
|
else:
|
||||||
|
# use my own BagOfWords python implementation
|
||||||
|
stemming = True
|
||||||
|
rel_freq = True
|
||||||
|
extracted_words = BagOfWords.extract_all_words(X[train])
|
||||||
|
vocab = BagOfWords.make_vocab(extracted_words)
|
||||||
|
|
||||||
|
# fit the training data and then return the matrix
|
||||||
|
training_data = BagOfWords.make_matrix(extracted_words,
|
||||||
|
vocab, rel_freq, stemming)
|
||||||
|
# transform testing data and return the matrix
|
||||||
|
extracted_words = BagOfWords.extract_all_words(X[test])
|
||||||
|
testing_data = BagOfWords.make_matrix(extracted_words,
|
||||||
|
vocab, rel_freq, stemming)
|
||||||
|
|
||||||
|
# apply select percentile
|
||||||
|
selector = SelectPercentile(percentile=percentile)
|
||||||
|
selector.fit(training_data, y[train])
|
||||||
|
|
||||||
|
# new reduced data sets
|
||||||
|
training_data_r = selector.transform(training_data)
|
||||||
|
testing_data_r = selector.transform(testing_data)
|
||||||
|
|
||||||
|
#fit classifier
|
||||||
|
classifier.fit(training_data_r, y[train])
|
||||||
|
#predict class
|
||||||
|
predictions_train = classifier.predict(training_data_r)
|
||||||
|
predictions_test = classifier.predict(testing_data_r)
|
||||||
|
|
||||||
|
#print and store metrics
|
||||||
|
rec = recall_score(y[test], predictions_test, average='weighted')
|
||||||
|
print('rec: ' + str(rec))
|
||||||
|
recall_scores.append(rec)
|
||||||
|
prec = precision_score(y[test], predictions_test, average='weighted')
|
||||||
|
print('prec: ' + str(prec))
|
||||||
|
print('#')
|
||||||
|
precision_scores.append(prec)
|
||||||
|
# equation for f1 score
|
||||||
|
f1_scores.append(2 * (prec * rec)/(prec + rec))
|
||||||
|
|
||||||
|
#class_prob.append(classifier.class_prior_)
|
||||||
|
#class_counts.append(classifier.class_count_)
|
||||||
|
print(classifier.predict_proba(testing_data_r))
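A side note on the predict_proba call above: SVC only exposes it when constructed with probability=True, which fits an extra Platt-scaling step via internal cross-validation and makes training noticeably slower. A minimal toy illustration on synthetic points:

from sklearn.svm import SVC
import numpy as np

rng = np.random.RandomState(0)
X_toy = np.vstack([rng.normal(0, 1, (10, 2)), rng.normal(4, 1, (10, 2))])
y_toy = np.array([0] * 10 + [1] * 10)

clf = SVC(probability=True, gamma='auto').fit(X_toy, y_toy)
print(clf.predict_proba([[0.5, 0.5], [3.5, 3.5]]))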
|
||||||
|
|
||||||
|
##########################
|
||||||
|
# classes in order used
|
||||||
|
classes = classifier.classes_
|
||||||
|
|
||||||
|
print('Recall (Min): ' + str(min(recall_scores)))
|
||||||
|
print('Recall (Max): ' + str(max(recall_scores)))
|
||||||
|
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
|
||||||
|
print()
|
||||||
|
print('Precision (Min): ' + str(min(precision_scores)))
|
||||||
|
print('Precision (Max): ' + str(max(precision_scores)))
|
||||||
|
print('Precision (Average): ' + str(sum(precision_scores)/len(precision_scores)))
|
||||||
|
|
||||||
|
# return classes and vector of class estimates
|
||||||
|
return recall_scores, precision_scores
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
# read csv file
|
||||||
|
print('# reading dataset')
|
||||||
|
print('# ...')
|
||||||
|
|
||||||
|
# read current data set from csv
|
||||||
|
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
|
||||||
|
sep='|',
|
||||||
|
usecols=range(1,13), # drop first column 'unnamed'
|
||||||
|
encoding='utf-8',
|
||||||
|
quoting=csv.QUOTE_NONNUMERIC,
|
||||||
|
quotechar='\'')
|
||||||
|
|
||||||
|
# select only labeled articles
|
||||||
|
SVM_multiclass.make_svm(df.loc[df['Label'] != -1].reset_index(drop=True),
|
||||||
|
sklearn_cv=True)
|
|
@ -22,314 +22,315 @@ from wordcloud import WordCloud
|
||||||
|
|
||||||
class VisualizerNews:
|
class VisualizerNews:
|
||||||
|
|
||||||
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
|
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
|
||||||
|
|
||||||
def plot_wordcloud_dataset():
|
def plot_wordcloud_dataset():
|
||||||
'''plots word cloud image of most common words in dataset.
|
'''plots word cloud image of most common words in dataset.
|
||||||
'''
|
'''
|
||||||
print('# preparing word cloud of 200 most common words...')
|
print('# preparing word cloud of 200 most common words...')
|
||||||
print()
|
print()
|
||||||
# load new data set
|
# load new data set
|
||||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
df_dataset = pd.read_csv(file,
|
df_dataset = pd.read_csv(file,
|
||||||
delimiter='|',
|
delimiter='|',
|
||||||
header=None,
|
header=None,
|
||||||
index_col=None,
|
index_col=None,
|
||||||
engine='python',
|
engine='python',
|
||||||
usecols=[1,2],
|
usecols=[1,2],
|
||||||
#nrows=100,
|
#nrows=100,
|
||||||
quoting=csv.QUOTE_NONNUMERIC,
|
quoting=csv.QUOTE_NONNUMERIC,
|
||||||
quotechar='\'')
|
quotechar='\'')
|
||||||
|
|
||||||
corpus = df_dataset[1] + '. ' + df_dataset[2]
|
corpus = df_dataset[1] + '. ' + df_dataset[2]
|
||||||
stemming = True
|
stemming = True
|
||||||
rel_freq = True
|
rel_freq = True
|
||||||
|
|
||||||
# find most common words in dataset
|
# find most common words in dataset
|
||||||
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
|
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
|
||||||
vocab = BagOfWords.make_vocab(extracted_words, stemming)
|
vocab = BagOfWords.make_vocab(extracted_words, stemming)
|
||||||
matrix = BagOfWords.make_matrix(extracted_words, vocab,
|
matrix = BagOfWords.make_matrix(extracted_words, vocab,
|
||||||
rel_freq, stemming)
|
rel_freq, stemming)
|
||||||
dict = BagOfWords.make_dict_common_words(matrix, 200,
|
dict = BagOfWords.make_dict_common_words(matrix, 200,
|
||||||
rel_freq, stemming)
|
rel_freq, stemming)
|
||||||
# save dict object
|
# save dict object
|
||||||
with open('../obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
|
with open('../obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
|
||||||
pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
|
pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
wordcloud = WordCloud(background_color='white',
|
wordcloud = WordCloud(background_color='white',
|
||||||
width=2400,
|
width=2400,
|
||||||
height=1200,
|
height=1200,
|
||||||
scale=2,
|
scale=2,
|
||||||
# true if bigram:
|
# true if bigram:
|
||||||
collocations=False)\
|
collocations=False)\
|
||||||
.generate_from_frequencies(dict)
|
.generate_from_frequencies(dict)
|
||||||
|
|
||||||
# display generated image
|
# display generated image
|
||||||
plt.imshow(wordcloud, interpolation='bilinear')
|
plt.imshow(wordcloud, interpolation='bilinear')
|
||||||
plt.axis("off")
|
plt.axis("off")
|
||||||
plt.savefig('visualization\\WordCloud_{}.eps'
|
plt.savefig('visualization\\WordCloud_{}.eps'
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.savefig('visualization\\WordCloud_{}.png'
|
plt.savefig('visualization\\WordCloud_{}.png'
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.show()
|
plt.show()
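generate_from_frequencies only needs a word-to-weight mapping, so the pickled dict of the 200 most common words can be re-rendered later without recomputing the matrix. A minimal sketch with an invented frequency dict:

from wordcloud import WordCloud
import matplotlib.pyplot as plt

freqs = {'merger': 0.9, 'acquisition': 0.7, 'company': 0.5, 'share': 0.3}
wc = WordCloud(background_color='white', collocations=False)\
     .generate_from_frequencies(freqs)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()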
|
||||||
|
|
||||||
def plot_histogram_companies():
|
def plot_histogram_companies():
|
||||||
'''plots diagram of company names distribution
|
'''plots diagram of company names distribution
|
||||||
count_names: list of company counts(int)
|
count_names: list of company counts(int)
|
||||||
x-axis: number of mentions of the company
|
x-axis: number of mentions of the company
|
||||||
y-axis: frequency
|
y-axis: frequency
|
||||||
'''
|
'''
|
||||||
print('# preparing histogram of company mentions...')
|
print('# preparing histogram of company mentions...')
|
||||||
print()
|
print()
|
||||||
# # read data set
|
# # read data set
|
||||||
# file = '..\\data\\cleaned_data_set_without_header.csv'
|
# file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
# df = pd.read_csv(file,
|
# df = pd.read_csv(file,
|
||||||
# delimiter='|',
|
# delimiter='|',
|
||||||
# header=None,
|
# header=None,
|
||||||
# index_col=None,
|
# index_col=None,
|
||||||
# engine='python',
|
# engine='python',
|
||||||
# usecols=[1,2],
|
# usecols=[1,2],
|
||||||
# #nrows=10,
|
# #nrows=10,
|
||||||
# quoting=csv.QUOTE_NONNUMERIC,
|
# quoting=csv.QUOTE_NONNUMERIC,
|
||||||
# quotechar='\'')
|
# quotechar='\'')
|
||||||
|
|
||||||
# # # only articles with label==1
|
# # # only articles with label==1
|
||||||
# # df_hits = df[df['Label'] == 1]
|
# # df_hits = df[df['Label'] == 1]
|
||||||
# # texts = df_hits['Title'] + '. ' + df_hits['Text']
|
# # texts = df_hits['Title'] + '. ' + df_hits['Text']
|
||||||
# texts = df[1] + '. ' + df[2]
|
# texts = df[1] + '. ' + df[2]
|
||||||
|
|
||||||
# # list: count articles with company names
|
# # list: count articles with company names
|
||||||
# count_names = NER.count_companies(texts)
|
# count_names = NER.count_companies(texts)
|
||||||
|
|
||||||
# # sort list in descending order
|
# # sort list in descending order
|
||||||
# count_names.sort(reverse=True)
|
# count_names.sort(reverse=True)
|
||||||
# # convert list to array
|
# # convert list to array
|
||||||
# names = np.asarray(count_names)
|
# names = np.asarray(count_names)
|
||||||
|
|
||||||
# load pickle object
|
# load pickle object
|
||||||
with open('../obj/dict_organizations.pkl', 'rb') as input:
|
with open('../obj/dict_organizations.pkl', 'rb') as input:
|
||||||
dict = pickle.load(input)
|
dict = pickle.load(input)
|
||||||
# make list of dict's values
|
# make list of dict's values
|
||||||
count_companies = list(dict.values())
|
count_companies = list(dict.values())
|
||||||
# sort list in descending order
|
# sort list in descending order
|
||||||
count_companies.sort(reverse=True)
|
count_companies.sort(reverse=True)
|
||||||
# convert list to array
|
# convert list to array
|
||||||
names = np.asarray(count_companies)
|
names = np.asarray(count_companies)
|
||||||
|
|
||||||
plt.xlabel('Count of articles that mention a company')
|
plt.xlabel('Count of articles that mention a specific company')
|
||||||
# Number of companies with this number of mentions
|
# Number of companies with this number of mentions
|
||||||
plt.ylabel('Number of companies with this number of articles')
|
plt.ylabel('Number of companies with this number of articles')
|
||||||
num_bins = 400
|
num_bins = 300
|
||||||
n, bins, patches = plt.hist(names, num_bins,
|
n, bins, patches = plt.hist(names, num_bins,
|
||||||
facecolor='darkred', alpha=0.5)
|
color='darkred', alpha=1)
|
||||||
plt.axis([1, 14, 0, 14000])
|
plt.axis([1, 14, 0, 14000])
|
||||||
|
|
||||||
# format axis labels for thousands (e.g. '10,000')
|
# format axis labels for thousands (e.g. '10,000')
|
||||||
plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
|
plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
|
||||||
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
||||||
|
|
||||||
# save to file
|
# save to file
|
||||||
plt.savefig('..\\visualization\\NER_{}.eps'
|
plt.savefig('..\\visualization\\NER_{}.eps'
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.savefig('..\\visualization\\NER_{}.png'
|
plt.savefig('..\\visualization\\NER_{}.png'
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
    def plot_histogram_text_lengths():
        '''plot histogram of article length
        x-axis: number of characters in article (without headline)
        y-axis: frequency
        '''
        print('# preparing histogram of text lengths...')
        print()
        # read data set
        filepath = '..\\data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(filepath,
                                 delimiter='|',
                                 header=None,
                                 index_col=None,
                                 engine='python',
                                 usecols=[2],
                                 #nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')
        # consider only Text, not Headline
        texts = df_dataset[2]

        # count characters in articles
        print('# counting characters in articles...')
        print()
        count_chars = []
        for text in texts:
            count_chars.append(len(text))
        # average of number of characters
        av = int(sum(count_chars) / len(count_chars))
        print('# average length of news articles is {} characters'.format(av))
        print()
        # sort list in descending order
        count_chars.sort(reverse=True)
        # convert list to array
        names = np.asarray(count_chars)
        # plt.title('Length of News Articles')
-       plt.xlabel('Number of characters in article')
+       plt.xlabel('Number of characters in the article')
        plt.ylabel('Frequency')
        # number of vertical bins
        num_bins = 200
        n, bins, patches = plt.hist(names, num_bins,
                                    facecolor='darkslategrey', alpha=0.5)
        # [xmin, xmax, ymin, ymax] of axis
        plt.axis([300,10000,0,500])
        # format axis labels for thousends (e.g. '10,000')
        plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
            .FuncFormatter(lambda x, p: format(int(x), ',')))
        # save plot
        plt.savefig('..\\visualization\\TextLength_{}.eps'\
                    .format(VisualizerNews.datestring))
        plt.savefig('..\\visualization\\TextLength_{}.png'\
                    .format(VisualizerNews.datestring))
        plt.show()

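As a side note, the character-counting loop in plot_histogram_text_lengths could also be expressed with pandas' vectorized string methods; a small sketch with a toy stand-in for column 2 (the article body):

    import pandas as pd

    # toy stand-in for column 2 of the cleaned data set
    df_dataset = pd.DataFrame({2: ['First article text.', 'A somewhat longer second article text.']})

    # equivalent of the explicit counting loop above
    count_chars = df_dataset[2].str.len()
    print('# average length of news articles is {} characters'.format(int(count_chars.mean())))
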
    def plot_pie_chart_of_sites():

        print('# preparing pie chart of news article sites...')
        print()

        # load data set
        filepath = '..\\data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(filepath,
                                 delimiter='|',
                                 header=None,
                                 #usecols=[3], #column 'Site'
                                 index_col=None,
                                 engine='python',
                                 #nrows=10,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')
        # find all different sites, group by 'Site'
        df_counts = df_dataset.groupby(3).count()
        # count occurences of each site, count different 'Url's
        df_counts = df_counts.sort_values([5], ascending=False)

        fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))

        data = list(df_counts[5])
        # legend labels
        labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
                  'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']

        wedges, texts, autotexts = ax.pie(data, autopct='%1.0f%%', pctdistance=2.0,
                                          startangle=90, textprops=dict(color="w"))

        ax.legend(wedges, labels,
                  #title="News Article Sources",
                  loc="center left",
                  bbox_to_anchor=(1, 0, 0.5, 1),
                  prop={'size': 10},
                  fontsize=10)

        plt.setp(autotexts, size=8, weight="bold")
        plt.show()
        plt.savefig('..\\visualization\\Sites_{}.pdf'.format(VisualizerNews.datestring))
        plt.savefig('..\\visualization\\Sites_{}.pgf'.format(VisualizerNews.datestring))

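One detail worth flagging in plot_pie_chart_of_sites: plt.savefig is called after plt.show, which on non-interactive matplotlib backends usually writes an empty image because show closes the figure. A minimal sketch of the safer order, with illustrative data and an illustrative file name:

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))
    ax.pie([94, 3, 2, 1], startangle=90)
    # save first, then show: show() may close the figure on non-interactive backends
    fig.savefig('sites_example.png')
    plt.show()
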
    def plot_hist_most_common_words(n_commons = 10):
        print('# preparing histogram of most common words...')
        print()
-       # # load data set
-       # filepath = '..\\data\\cleaned_data_set_without_header.csv'
-       # df_dataset = pd.read_csv(filepath,
-       # delimiter='|',
-       # header=None,
-       # usecols=[1,2],
-       # index_col=None,
-       # engine='python',
-       # #nrows=1000,
-       # quoting=csv.QUOTE_NONNUMERIC,
-       # quotechar='\'')
-
-       # corpus = df_dataset[1] + '. ' + df_dataset[2]
-
-       # stemming = False
-       # rel_freq = True
-
-       # # find most common words in dataset
-       # extracted_words = BagOfWords.extract_all_words(corpus, stemming)
-       # vocab = BagOfWords.make_vocab(extracted_words, stemming)
-       # matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
-       # stemming)
-       # dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
-       # stemming)
-       # # save dict object
-       # with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
-       # pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
-
-       # load pickle object
-       with open ('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
-           dict = pickle.load(i)
-       # sort dict by value
-       o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
-           reverse=True))
-       # return n higest values as dict (word => count)
-       n_dict = {}
-
-       for i in range(n_commons):
-           # next highest score
-           next_highest = o_dict.popitem(last=False)
-           n_dict[next_highest[0]] = next_highest[1]
-
-       #plt.xlabel('Most common words in textual corpus')
-       plt.ylabel('Relative frequency')
-
-       labels = list(n_dict.keys())
-       numbers = list(n_dict.values())
-       nbars = n_commons
-       plt.bar(np.arange(nbars),
-               height=numbers,
-               tick_label=labels,
-               facecolor='royalblue')
-       plt.savefig('..\\visualization\\10_most_common_words_{}.eps'
-                   .format(VisualizerNews.datestring))
-       plt.savefig('..\\visualization\\10_most_common_words_{}.png'
-                   .format(VisualizerNews.datestring))
-       plt.show()
+       # load data set
+       df = pd.read_csv('../data/interactive_labeling_round_16_temp.csv',
+                        sep='|',
+                        usecols=range(1,13), # drop first column 'unnamed'
+                        encoding='utf-8',
+                        quoting=csv.QUOTE_NONNUMERIC,
+                        quotechar='\'')
+
+       # select only labeled articles
+       df = df.loc[df['Label'] != -1].reset_index(drop=True)
+
+       corpus = df['Title'] + '. ' + df['Text']
+
+       stemming = False
+       rel_freq = True
+
+       # find most common words in dataset
+       extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+       vocab = BagOfWords.make_vocab(extracted_words, stemming)
+       matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+                                       stemming)
+       dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
+                                                stemming)
+       # save dict object
+       #with open('obj/'+ 'dict_10_most_common_words_merger' + '.pkl', 'wb') as f:
+       #    pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+
+       # load pickle object
+       #with open ('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
+       #    dict = pickle.load(i)
+
+       # sort dict by value
+       o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
+           reverse=True))
+       # return n higest values as dict (word => count)
+       n_dict = {}
+
+       for i in range(n_commons):
+           # next highest score
+           next_highest = o_dict.popitem(last=False)
+           n_dict[next_highest[0]] = next_highest[1]
+
+       #plt.xlabel('Most common words in textual corpus')
+       plt.ylabel('Relative frequency')
+
+       labels = list(n_dict.keys())
+       numbers = list(n_dict.values())
+       nbars = n_commons
+       plt.bar(np.arange(nbars),
+               height=numbers,
+               tick_label=labels,
+               facecolor='royalblue')
+
+       plt.savefig('..\\visualization\\10_most_common_words_mergers_{}.eps'
+                   .format(VisualizerNews.datestring))
+       plt.savefig('..\\visualization\\10_most_common_words_mergers_{}.png'
+                   .format(VisualizerNews.datestring))
+       plt.show()

    def plot_hist_num_comp_per_art():
        ''' open pkl file of dict, plot histogram of number of different
        company names per article.
        '''
        # list of number of different companies per article (int)
        list = []
        with open('../obj/num_mentions_companies.pkl', 'rb') as input:
            list = pickle.load(input)

        # sort list in descending order
        list.sort(reverse=True)

        # convert list to array
        names = np.asarray(list)

-       plt.xlabel('Number of different company names in news article')
+       plt.xlabel('Number of different company names in the news article')
        plt.ylabel('Number of articles with this number of company names')
        num_bins = 100
        n, bins, patches = plt.hist(names, num_bins,
                                    facecolor='darkgreen', alpha=0.5)
        plt.axis([0, 30, 0, 1500])

        # format axis labels for thousends (e.g. '10,000')
        plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
            .FuncFormatter(lambda x, p: format(int(x), ',')))

        # save to file
        plt.savefig('..\\visualization\\NER_2_{}.eps'
                    .format(VisualizerNews.datestring))
        plt.savefig('..\\visualization\\NER_2_{}.png'
                    .format(VisualizerNews.datestring))
        plt.show()

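For reference, the OrderedDict/popitem loop above (present in both the old and the new version) just picks the n words with the highest frequency. The same selection can be sketched with collections.Counter, here with made-up relative frequencies standing in for the BagOfWords dictionary:

    from collections import Counter

    # made-up relative frequencies in place of the loaded/computed dict
    word_freq = {'percent': 0.031, 'company': 0.027, 'million': 0.022, 'share': 0.018}

    n_commons = 2
    # n highest values as a dict (word => count), like the popitem loop
    n_dict = dict(Counter(word_freq).most_common(n_commons))
    print(n_dict)  # {'percent': 0.031, 'company': 0.027}
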
if __name__ == '__main__':
-   VisualizerNews.plot_wordcloud_dataset()
+   # VisualizerNews.plot_wordcloud_dataset()
-   # VisualizerNews.plot_histogram_companies()
+   VisualizerNews.plot_histogram_companies()
    # VisualizerNews.plot_hist_num_comp_per_art()
    # VisualizerNews.plot_histogram_text_lengths()
    # VisualizerNews.plot_pie_chart_of_sites()
    # VisualizerNews.plot_hist_most_common_words(10)

src/test.py  (128 lines deleted)
@ -1,128 +0,0 @@
from BagOfWords import BagOfWords

import csv

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                 sep='|',
                 usecols=range(1,13), # drop first column 'unnamed'
                 encoding='utf-8',
                 quoting=csv.QUOTE_NONNUMERIC,
                 quotechar='\'')

dataset = df.loc[df['Label'] != -1][:100].reset_index(drop=True)

train = dataset[:15]
test = dataset[15:20].reset_index(drop=True)

classifier = MultinomialNB(alpha=1.0e-10,
                           fit_prior=False,
                           class_prior=None)

def make_tagged_document(row):
    # TaggedDocument: how, where and what exactly?
    # tags (a list of tokens). Tags may be one or more unicode string tokens,
    # but typical practice (which will also be the most memory-efficient) is
    # for the tags list to include a unique integer id as the only tag.
    # so no label here?

    return TaggedDocument(words=BagOfWords.extract_words(row['Text']),
                          tags=[row['Label']])

tagged_train_data=train.apply(lambda row: make_tagged_document(row), axis=1)
print(tagged_train_data[0])

tagged_test_data=test.apply(lambda row: make_tagged_document(row), axis=1)
print(tagged_test_data[0])

model = Doc2Vec(vector_size=100,
                min_count=20,
                epochs=40,
                negative=0)

model.build_vocab(tagged_train_data)

model.train(tagged_train_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

model.docvecs.count

y_train=np.array([doc.tags[0] for doc in tagged_train_data])
y_test=np.array([doc.tags[0] for doc in tagged_test_data])

X_train=[model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
X_test=[model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]

# X_train=np.vstack(X_train)
# X_test=np.vstack(X_test)
# X_test.shape
# y_test.shape
# X_train.shape
# y_train.shape

print(X_test)
print(y_test)
print(X_train)
print(y_train)

# reshape data
X_train = np.array(X_train)
X_test = np.array(X_test)

#X_train = X_train.reshape((X_train.shape[0],1,X_train.shape[1]))
#X_test = X_test.reshape((X_test.shape[0],1,X_test.shape[1]))
X_train.shape
X_test.shape

#fit classifier
classifier.fit(X_train, y_train)
#predict class
predictions_train = classifier.predict(X_train)
predictions_test = classifier.predict(X_test)

#print and store metrics
rec = recall_score(y_test, predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y_test, predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))

##########################
# probability estimates for the test vector (testing_data)
class_probs = classifier.predict_proba(testing_data)

# number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_

# classes in order used
classes = classifier.classes_

# return classes and vector of class estimates
print (recall_scores, precision_scores, f1_scores, class_probs)

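The question left in the deleted make_tagged_document helper (whether the class label belongs in tags) is addressed by the gensim convention quoted in its own comment: tags usually hold a unique id per document, while the label is kept separately for the downstream classifier. A minimal sketch of that convention, with made-up tokens and labels:

    from gensim.models.doc2vec import TaggedDocument

    texts = [['merger', 'announced', 'today'], ['shares', 'fall', 'sharply']]
    labels = [1, 0]

    # one TaggedDocument per article, tagged with its integer position;
    # the class labels stay outside the Doc2Vec corpus
    tagged_docs = [TaggedDocument(words=tokens, tags=[i]) for i, tokens in enumerate(texts)]
    print(tagged_docs[0], labels[0])
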
src/test_2.py  (131 lines deleted)
@ -1,131 +0,0 @@
from BagOfWords import BagOfWords

import csv

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                 sep='|',
                 usecols=range(1,13), # drop first column 'unnamed'
                 encoding='utf-8',
                 quoting=csv.QUOTE_NONNUMERIC,
                 quotechar='\'')

dataset = df.loc[df['Label'] != -1].reset_index(drop=True)

X = dataset['Title'] + '. ' + dataset['Text']
y = dataset['Label']

classifier = MultinomialNB(alpha=1.0e-10,
                           fit_prior=False,
                           class_prior=None)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)

def read_corpus(data, tokens_only=False):
    list_of_lists = []
    for i, text in enumerate(data):
        if tokens_only:
            list_of_lists.append(BagOfWords.extract_words(text))
        else:
            # For training data, add tags
            list_of_lists.append(gensim.models.doc2vec.TaggedDocument(BagOfWords.extract_words(text), [i]))
    return list_of_lists

tagged_train_data = read_corpus(X_train, tokens_only=False)

print('tagged_train_data[0]:')
print(tagged_train_data[0])

tagged_test_data = read_corpus(X_test, tokens_only=False)

print('tagged_test_data[0]:')
print(tagged_test_data[0])

model = Doc2Vec(vector_size=100,
                min_count=20,
                epochs=40,
                negative=0)

model.build_vocab(tagged_train_data)

model.train(tagged_train_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

model.docvecs.count

#y_train=np.array([doc.tags[0] for doc in tagged_train_data])
#y_test=np.array([doc.tags[0] for doc in tagged_test_data])

X_train=[model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
X_test=[model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]

X_train=np.vstack(X_train)
X_test=np.vstack(X_test)

X_test.shape
y_test.shape
X_train.shape
y_train.shape

print('X_test:')
print(X_test)

print('y_test:')
print(y_test)

print('X_train:')
print(X_train)

print('y_train:')
print(y_train)

# here: ValueError: Input X must be non-negative

#fit classifier
classifier.fit(X_train, y_train)
#predict class
predictions_train = classifier.predict(X_train)
predictions_test = classifier.predict(X_test)

#print and store metrics
rec = recall_score(y_test, predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y_test, predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))

##########################
# probability estimates for the test vector (testing_data)
class_probs = classifier.predict_proba(testing_data)

# number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_

# classes in order used
classes = classifier.classes_

# return classes and vector of class estimates
print (recall_scores, precision_scores, f1_scores, class_probs)

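The ValueError noted in the deleted script is expected: MultinomialNB accepts only non-negative features, while Doc2Vec vectors contain negative components. One possible workaround (not part of this commit) is to rescale the inferred vectors into [0, 1], or to switch to a classifier that accepts signed features such as GaussianNB; a minimal sketch with made-up vectors:

    import numpy as np
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.preprocessing import MinMaxScaler

    # stand-ins for inferred Doc2Vec vectors (they may contain negative values)
    X_train = np.array([[-0.2, 0.7], [0.4, -0.1], [0.9, 0.3]])
    y_train = np.array([0, 1, 1])

    scaler = MinMaxScaler()                      # maps each feature into [0, 1]
    X_train_scaled = scaler.fit_transform(X_train)

    clf = MultinomialNB().fit(X_train_scaled, y_train)
    print(clf.predict(scaler.transform(np.array([[0.1, 0.2]]))))
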
(Binary image changes: 18 plot images added and 1 replaced under visualization/; the diff records only their sizes, ranging from 1.2 KiB to 83 KiB.)