update labeling and documentation

This commit is contained in:
annealias 2019-04-17 13:20:46 +02:00
parent 94f501ab6d
commit 8ddf23d801
68 changed files with 39651 additions and 30404 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -96,25 +96,25 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"m=11" "m=16"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"This round number: 11\n", "This round number: 16\n",
"Number of manually labeled articles: 1082\n", "Number of manually labeled articles: 1132\n",
"Number of manually unlabeled articles: 8918\n" "Number of manually unlabeled articles: 8868\n"
] ]
} }
], ],
@ -842,8 +842,425 @@
" df.loc[index, 'Estimated'] = classes[i]\n", " df.loc[index, 'Estimated'] = classes[i]\n",
" # annotate probability\n", " # annotate probability\n",
" df.loc[index, 'Probability'] = row[i]\n", " df.loc[index, 'Probability'] = row[i]\n",
" n += 1\n", " n += 1"
"\n", ]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"m = 16"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"5"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"3"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"7"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"83.33333333333334"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"62.5"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"60.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"33.33333333333333"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"100.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"80.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"0.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"0.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"80.0"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n"
]
},
{
"data": {
"text/plain": [
"38.88888888888889"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"54.166666666666664"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"73.33333333333333"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print('###############')\n", "print('###############')\n",
"zero_0 = len(df.loc[(df['Round'] == m) & (df['Estimated'] == 0) & (df['Label'] == 0)])\n", "zero_0 = len(df.loc[(df['Round'] == m) & (df['Estimated'] == 0) & (df['Label'] == 0)])\n",
"zero_0\n", "zero_0\n",
@ -910,7 +1327,7 @@
"\n", "\n",
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n", "prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
"prec_1\n", "prec_1\n",
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n", "rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
"rec_1\n", "rec_1\n",
"acc_1 = (tp_1 + tn_1) / total * 100\n", "acc_1 = (tp_1 + tn_1) / total * 100\n",
"acc_1\n", "acc_1\n",

View File

@ -17,7 +17,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -42,6 +42,7 @@
"from sklearn.pipeline import Pipeline\n", "from sklearn.pipeline import Pipeline\n",
"from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.semi_supervised import label_propagation\n", "from sklearn.semi_supervised import label_propagation\n",
"from sklearn.svm import LinearSVC\n",
"\n", "\n",
"from BagOfWords import BagOfWords\n", "from BagOfWords import BagOfWords\n",
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n", "from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
@ -50,7 +51,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -66,7 +67,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -105,16 +106,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 70, "execution_count": 117,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"9" "8"
] ]
}, },
"execution_count": 70, "execution_count": 117,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -126,7 +127,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 71, "execution_count": 118,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -138,16 +139,16 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 72, "execution_count": 119,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"number of labeled samples by class (0/1/2): 80/2/18\n", "number of labeled samples by class (0/1/2): 79/8/13\n",
"minimum of new labeled samples: 2\n", "minimum of new labeled samples: 8\n",
"length of current data set for resubstitution error: 6\n" "length of current data set for resubstitution error: 24\n"
] ]
} }
], ],
@ -162,7 +163,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 73, "execution_count": 120,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -174,7 +175,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 74, "execution_count": 121,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -187,62 +188,67 @@
"#training_data_5 = pd.concat([selec_0, selec_1, selec_2])\n", "#training_data_5 = pd.concat([selec_0, selec_1, selec_2])\n",
"#training_data_6 = pd.concat([selec_0, selec_1, selec_2])\n", "#training_data_6 = pd.concat([selec_0, selec_1, selec_2])\n",
"#training_data_7 = pd.concat([selec_0, selec_1, selec_2])\n", "#training_data_7 = pd.concat([selec_0, selec_1, selec_2])\n",
"#training_data_8 = pd.concat([selec_0, selec_1, selec_2])\n", "training_data_8 = pd.concat([selec_0, selec_1, selec_2])\n",
"training_data_9 = pd.concat([selec_0, selec_1, selec_2])" "#training_data_9 = pd.concat([selec_0, selec_1, selec_2])"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 75, "execution_count": 122,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# indices of training samples\n", "# indices of training samples\n",
"# idx_0 = training_data_0['Index'].tolist()\n", "#idx_0 = training_data_0['Index'].tolist()\n",
"# idx_1 = training_data_1['Index'].tolist()\n", "#idx_1 = training_data_1['Index'].tolist()\n",
"# idx_2 = training_data_2['Index'].tolist()\n", "#idx_2 = training_data_2['Index'].tolist()\n",
"# idx_3 = training_data_3['Index'].tolist()\n", "#idx_3 = training_data_3['Index'].tolist()\n",
"# idx_4 = training_data_4['Index'].tolist()\n", "#idx_4 = training_data_4['Index'].tolist()\n",
"# idx_5 = training_data_5['Index'].tolist()\n", "#idx_5 = training_data_5['Index'].tolist()\n",
"# idx_6 = training_data_6['Index'].tolist()\n", "#idx_6 = training_data_6['Index'].tolist()\n",
"# idx_7 = training_data_7['Index'].tolist()\n", "#idx_7 = training_data_7['Index'].tolist()\n",
"# idx_8 = training_data_8['Index'].tolist()\n", "idx_8 = training_data_8['Index'].tolist()\n",
"idx_9 = training_data_9['Index'].tolist()" "#idx_9 = training_data_9['Index'].tolist()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 103, "execution_count": 123,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"#train_all = training_data_0\n", "#train_0_1 = training_data_0.append([training_data_1])\n",
"train_0_8 = training_data_0.append([training_data_1, training_data_2, training_data_3, training_data_4, training_data_5, training_data_6, training_data_7, training_data_8])" "#train_0_2 = train_0_1.append([training_data_2])\n",
"#train_0_3 = train_0_2.append([training_data_3])\n",
"#train_0_4 = train_0_3.append([training_data_4])\n",
"#train_0_5 = train_0_4.append([training_data_5])\n",
"#train_0_6 = train_0_5.append([training_data_6])\n",
"#train_0_7 = train_0_6.append([training_data_7])\n",
"train_0_8 = train_0_7.append([training_data_8])"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 91, "execution_count": 124,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"#idx_all = idx_0\n", "train_all = train_0_8\n",
"idx_all = train_all['Index'].tolist()\n", "idx_all = train_all['Index'].tolist()"
"#idx_9"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 92, "execution_count": 125,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"117" "111"
] ]
}, },
"execution_count": 92, "execution_count": 125,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -257,26 +263,35 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"train_0_9 = train_0_2.append(training_data_3)\n", "#train_0_9 = train_0_2.append(training_data_3)\n",
"len(train_0_3)" "#len(train_0_3)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 86, "execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"#m = 4"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"stratified number in round 9: 6\n", "stratified number in round 7: 18\n",
"stratified number in total: 138\n" "stratified number in total: 87\n"
] ]
} }
], ],
"source": [ "source": [
"print('stratified number in round {}: {}'.format(m, len(idx_9)))\n", "print('stratified number in round {}: {}'.format(m, len(idx_7)))\n",
"print('stratified number in total: {}'.format(len(idx_all)))" "print('stratified number in total: {}'.format(len(idx_all)))"
] ]
}, },
@ -288,22 +303,22 @@
"source": [ "source": [
"# STEP 1:\n", "# STEP 1:\n",
"# resubstitution error round\n", "# resubstitution error round\n",
"training_data = train_0_8\n", "#training_data = train_0_8\n",
"testing_data = training_data_9" "#testing_data = training_data_9"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 115, "execution_count": 64,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"9" "4"
] ]
}, },
"execution_count": 115, "execution_count": 64,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -314,16 +329,26 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 160, "execution_count": 126,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"1082" "111"
] ]
}, },
"execution_count": 160, "execution_count": 126,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"100"
]
},
"execution_count": 126,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -331,10 +356,13 @@
"source": [ "source": [
"# STEP 2: \n", "# STEP 2: \n",
"# resubstitution error all labeled articles in round\n", "# resubstitution error all labeled articles in round\n",
"training_data = train_all\n", "training_data = train_0_8\n",
"testing_data = df.loc[(df['Round'] <= 11)]# & (~df['Index'].isin(idx_all))]\n", "testing_data = df.loc[(df['Round'] == (m+1))]\n",
"\n",
"# & (~df['Index'].isin(idx_all))]\n",
"#df[~df['Index'].isin(idx_all)]\n", "#df[~df['Index'].isin(idx_all)]\n",
"#df.loc[(df['Label'] == -1) | (df['Round'] >= 10)]\n", "#df.loc[(df['Label'] == -1) | (df['Round'] >= 10)]\n",
"len(training_data)\n",
"len(testing_data)" "len(testing_data)"
] ]
}, },
@ -345,24 +373,44 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# STEP 3:\n", "# STEP 3:\n",
"training_data = train_all\n", "#training_data = train_all\n",
"testing_data = train_all" "#testing_data = train_all"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# STEP 4:\n", "# STEP 4:\n",
"training_data = train_all\n", "#training_data = df.loc[df['Label'] != -1].reset_index(drop=True)\n",
"testing_data = train_all" "#testing_data = df.loc[df['Label'] == -1].reset_index(drop=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 161, "execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8918"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#len(testing_data)"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -385,7 +433,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 162, "execution_count": 128,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -425,7 +473,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 131, "execution_count": 57,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -438,10 +486,10 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"7140" "65"
] ]
}, },
"execution_count": 131, "execution_count": 57,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
}, },
@ -455,10 +503,10 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"2007" "26"
] ]
}, },
"execution_count": 131, "execution_count": 57,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
}, },
@ -472,10 +520,10 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"736" "9"
] ]
}, },
"execution_count": 131, "execution_count": 57,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -570,27 +618,27 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 181, "execution_count": 158,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Nachberechnung fürs Latex:\n", "# Nachberechnung fürs Latex:\n",
"zero_0 = 1\n", "zero_0 = 80\n",
"zero_1 = 1\n", "zero_1 = 2\n",
"zero_2 = 0\n", "zero_2 = 14\n",
"\n", "\n",
"one_0 = 4\n", "one_0 = 0\n",
"one_1 = 3\n", "one_1 = 0\n",
"one_2 = 4\n", "one_2 = 1\n",
"\n", "\n",
"two_0 = 0\n", "two_0 = 0\n",
"two_1 = 1\n", "two_1 = 0\n",
"two_2 = 1" "two_2 = 3"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 163, "execution_count": 129,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -604,10 +652,10 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"701" "68"
] ]
}, },
"execution_count": 163, "execution_count": 129,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
}, },
@ -617,17 +665,17 @@
"0" "0"
] ]
}, },
"execution_count": 163, "execution_count": 129,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
}, },
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"41" "6"
] ]
}, },
"execution_count": 163, "execution_count": 129,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
}, },
@ -641,47 +689,10 @@
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"99" "8"
] ]
}, },
"execution_count": 163, "execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"49"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"74"
]
},
"execution_count": 163,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"/\n"
]
},
{
"data": {
"text/plain": [
"47"
]
},
"execution_count": 163,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
}, },
@ -691,17 +702,54 @@
"1" "1"
] ]
}, },
"execution_count": 163, "execution_count": 129,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
}, },
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"70" "11"
] ]
}, },
"execution_count": 163, "execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"/\n"
]
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 129,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 129,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -734,7 +782,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 182, "execution_count": 159,
"metadata": { "metadata": {
"scrolled": false "scrolled": false
}, },
@ -747,51 +795,51 @@
"\n", "\n",
"class 0:\n", "class 0:\n",
"\n", "\n",
"TP: 1\n", "TP: 80\n",
"TN: 9\n", "TN: 4\n",
"FP: 1\n", "FP: 16\n",
"FN: 4\n", "FN: 0\n",
"\n", "\n",
"class 1:\n", "class 1:\n",
"\n", "\n",
"TP: 3\n", "TP: 0\n",
"TN: 2\n", "TN: 97\n",
"FP: 8\n", "FP: 1\n",
"FN: 2\n", "FN: 2\n",
"\n", "\n",
"class 2:\n", "class 2:\n",
"\n", "\n",
"TP: 1\n", "TP: 3\n",
"TN: 9\n", "TN: 82\n",
"FP: 1\n", "FP: 0\n",
"FN: 4\n", "FN: 15\n",
"###############\n", "###############\n",
"\n", "\n",
"METRICS:\n", "METRICS:\n",
"\n", "\n",
"class 0:\n", "class 0:\n",
"\n", "\n",
"precision: 50.0\n", "precision: 83.33\n",
"recall: 20.0\n", "recall: 100.0\n",
"accuracy: 66.67\n", "accuracy: 84.0\n",
"\n", "\n",
"class 1:\n", "class 1:\n",
"\n", "\n",
"precision: 27.27\n", "precision: 0.0\n",
"recall: 60.0\n", "recall: 0.0\n",
"accuracy: 33.33\n", "accuracy: 97.0\n",
"\n", "\n",
"class 2:\n", "class 2:\n",
"\n", "\n",
"precision: 50.0\n", "precision: 100.0\n",
"recall: 20.0\n", "recall: 16.67\n",
"accuracy: 66.67\n", "accuracy: 85.0\n",
"\n", "\n",
"Average Metrics:\n", "Average Metrics:\n",
"\n", "\n",
"precision: 42.42424242424242\n", "precision: 61.111111111111114\n",
"recall: 33.333333333333336\n", "recall: 38.888888888888886\n",
"accuracy: 55.55555555555554\n" "accuracy: 88.66666666666667\n"
] ]
} }
], ],

View File

@ -0,0 +1,374 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
"import csv\n",
"import operator\n",
"import pickle\n",
"import random\n",
"\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"from IPython.core.interactiveshell import InteractiveShell\n",
"from IPython.display import display\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics import recall_score, precision_score, precision_recall_fscore_support\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.svm import SVC\n",
"from sklearn.svm import LinearSVC\n",
"\n",
"from BagOfWords import BagOfWords\n",
"from MNBInteractive import MNBInteractive\n",
"from MultinomialNaiveBayes import MultinomialNaiveBayes\n",
"from NaiveBayes import NaiveBayes"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Last round number: 15\n",
"Number of manually labeled articles: 1122\n",
"Number of manually unlabeled articles: 8878\n"
]
}
],
"source": [
"# initialize random => reproducible sequence\n",
"random.seed(5)\n",
"random_state=5\n",
"\n",
"# set up wider display area\n",
"pd.set_option('display.max_colwidth', -1)\n",
"\n",
"# read current data set from csv\n",
"df = pd.read_csv('../data/interactive_labeling_round_15_temp.csv',\n",
" sep='|',\n",
" usecols=range(1,13), # drop first column 'unnamed'\n",
" encoding='utf-8',\n",
" quoting=csv.QUOTE_NONNUMERIC,\n",
" quotechar='\\'')\n",
"\n",
"# find current iteration/round number\n",
"m = int(df['Round'].max())\n",
"print('Last round number: {}'.format(m))\n",
"print('Number of manually labeled articles: {}'.format(len(df.loc[df['Label'] != -1])))\n",
"print('Number of manually unlabeled articles: {}'.format(len(df.loc[df['Label'] == -1])))"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"52\n"
]
}
],
"source": [
"labeled_pos_0 = df.loc[df['Label'] == 0].reset_index(drop=True)\n",
"labeled_pos_1 = df.loc[df['Label'] == 1].reset_index(drop=True)\n",
"labeled_pos_2 = df.loc[df['Label'] == 2].reset_index(drop=True)\n",
"\n",
"max_sample = min(len(labeled_pos_0), len(labeled_pos_1), len(labeled_pos_2))\n",
"print(max_sample)\n",
"\n",
"sampling_class0 = labeled_pos_0.sample(n=max_sample, random_state=random_state)\n",
"sampling_class1 = labeled_pos_1.sample(n=max_sample, random_state=random_state)\n",
"sampling_class2 = labeled_pos_2.sample(n=max_sample, random_state=random_state)"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"# nur für subset EINDEUTIG\n",
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"testing_data = df.loc[(df['Label'] != -1) & (df['Index'].isin(subset_indices))].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"training_data = pd.concat([sampling_class0, sampling_class1, sampling_class2])\n",
"testing_data = df.loc[(df['Label'] != -1)].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"len(testing_data)\n",
"indices_predicted = df.loc[(df['Label'] != -1), 'Index'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"# split training data into text and label set\n",
"# join title and text\n",
"X = training_data['Title'] + '. ' + training_data['Text']\n",
"y = training_data['Label']\n",
"\n",
"# split testing data into text and label set\n",
"U = testing_data['Title'] + '. ' + testing_data['Text']\n",
"v = testing_data['Label']"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"#classifier = MultinomialNB(alpha=1.0e-10,\n",
"# fit_prior=False,\n",
"# class_prior=None)\n",
"#classifier = SVC()\n",
"classifier = LinearSVC()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
"cv = CountVectorizer()\n",
"\n",
"# fit the training data and then return the matrix\n",
"training_data = cv.fit_transform(X, y).toarray()\n",
"# transform testing data and return the matrix\n",
"testing_data = cv.transform(U).toarray()"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"#fit classifier\n",
"classifier.fit(training_data, y)\n",
"\n",
"#predict class\n",
"predictions_test = classifier.predict(testing_data)"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"# annotate estimated labels\n",
"df['Estimated'] = np.nan\n",
"\n",
"for i, value in enumerate(indices_predicted):\n",
" df.loc[(df['Index'] == value), 'Estimated'] = predictions_test[i]"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"###############\n",
"642\n",
"0\n",
"19\n",
"###############\n",
"55\n",
"50\n",
"36\n",
"###############\n",
"150\n",
"0\n",
"130\n",
"###############\n",
"metrics:\n",
"\n",
"642\n",
"216\n",
"19\n",
"205\n",
"###############\n",
"50\n",
"941\n",
"91\n",
"0\n",
"###############\n",
"130\n",
"747\n",
"150\n",
"55\n",
"###############\n",
"97.12556732223904\n",
"75.79693034238488\n",
"79.29759704251387\n",
"###############\n",
"35.46099290780142\n",
"100.0\n",
"91.58964879852127\n",
"###############\n",
"46.42857142857143\n",
"70.27027027027027\n",
"81.05360443622921\n",
"###############\n",
"59.67171055287063\n",
"82.02240020421839\n",
"83.98028342575479\n"
]
}
],
"source": [
"print('###############')\n",
"zero_0 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 0)])\n",
"print(zero_0)\n",
"zero_1 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 1)])\n",
"print(zero_1)\n",
"zero_2 = len(df.loc[(df['Estimated'] == 0) & (df['Label'] == 2)])\n",
"print(zero_2)\n",
"print('###############')\n",
"one_0 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 0)])\n",
"print(one_0)\n",
"one_1 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 1)])\n",
"print(one_1)\n",
"one_2 = len(df.loc[(df['Estimated'] == 1) & (df['Label'] == 2)])\n",
"print(one_2)\n",
"print('###############')\n",
"\n",
"two_0 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 0)])\n",
"print(two_0)\n",
"two_1 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 1)])\n",
"print(two_1)\n",
"two_2 = len(df.loc[(df['Estimated'] == 2) & (df['Label'] == 2)])\n",
"print(two_2)\n",
"print('###############')\n",
"print('metrics:')\n",
"print()\n",
"\n",
"total = zero_0 + zero_1 + zero_2 + one_0 + one_1 + one_2 + two_0 + two_1 + two_2\n",
"\n",
"tp_0 = zero_0\n",
"print(tp_0)\n",
"tn_0 = one_1 + one_2 + two_1 + two_2\n",
"print(tn_0)\n",
"fp_0 = zero_1 + zero_2\n",
"print(fp_0)\n",
"fn_0 = one_0 + two_0\n",
"print(fn_0)\n",
"print('###############')\n",
"\n",
"tp_1 = one_1\n",
"print(tp_1)\n",
"tn_1 = zero_0 + zero_2 + two_0 + two_2\n",
"print(tn_1)\n",
"fp_1 = one_0 + one_2\n",
"print(fp_1)\n",
"fn_1 = zero_1 + two_1\n",
"print(fn_1)\n",
"print('###############')\n",
"\n",
"tp_2 = two_2\n",
"print(tp_2)\n",
"tn_2 = zero_0 + zero_1 + one_0 + one_1\n",
"print(tn_2)\n",
"fp_2 = two_0 + two_1\n",
"print(fp_2)\n",
"fn_2 = zero_2 + one_2\n",
"print(fn_2)\n",
"print('###############')\n",
"\n",
"prec_0 = tp_0 / (tp_0 + fp_0) * 100\n",
"print(prec_0)\n",
"rec_0 = tp_0 / (tp_0 + fn_0) * 100\n",
"print(rec_0)\n",
"acc_0 = (tp_0 + tn_0) / total * 100\n",
"print(acc_0)\n",
"print('###############')\n",
"\n",
"prec_1 = tp_1 / (tp_1 + fp_1) * 100\n",
"print(prec_1)\n",
"rec_1 = tp_1 / (tp_1 + fn_1) * 100\n",
"print(rec_1)\n",
"acc_1 = (tp_1 + tn_1) / total * 100\n",
"print(acc_1)\n",
"print('###############')\n",
"\n",
"prec_2 = tp_2 / (tp_2 + fp_2) * 100\n",
"print(prec_2)\n",
"rec_2 = tp_2 / (tp_2 + fn_2) * 100\n",
"print(rec_2)\n",
"acc_2 = (tp_2 + tn_2) / total * 100\n",
"print(acc_2)\n",
"print('###############')\n",
"\n",
"print((prec_1 + prec_2 + prec_0) / 3)\n",
"print((rec_1 + rec_2 + rec_0) / 3)\n",
"print((acc_1 + acc_2 + acc_0) / 3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -11,52 +11,135 @@ class LabelingPlotter():
# round numbers # round numbers
round = [0,1,2,3,4,5,6,7,8,9] round = [0,1,2,3,4,5,6,7,8,9]
# number of wrong estimated labels per round # # number of wrong estimated labels per round
wrong = [0/100, 19/100, 17/100, 16/100, 20/100, 12/100, 10/100, 20/100, 14/100, 12/100] # wrong = [0/100, 19/100, 17/100, 16/100, 20/100, 12/100, 10/100, 20/100, 14/100, 12/100]
# number of manual classified articles per class and round # # number of manual classified articles per class and round
man_0 = [84/100, 165/200, 247/300, 329/400, 410/500, 498/600, 586/700, 662/800, 741/900, 821/1000] # man_0 = [84/100, 165/200, 247/300, 329/400, 410/500, 498/600, 586/700, 662/800, 741/900, 821/1000]
man_1 = [3/100, 7/200, 12/300, 16/400, 20/500, 22/600, 23/700, 29/800, 37/900, 39/1000] # man_1 = [3/100, 7/200, 12/300, 16/400, 20/500, 22/600, 23/700, 29/800, 37/900, 39/1000]
man_2 = [13/100, 28/200, 41/300, 55/400, 70/500, 80/600, 91/700, 109/800, 122/900, 140/1000] # man_2 = [13/100, 28/200, 41/300, 55/400, 70/500, 80/600, 91/700, 109/800, 122/900, 140/1000]
# number of estimated labels per class and round # # number of estimated labels per class and round
est_0 = [9873/9900, 9757/9800, 9603/9700, 9470/9600, 9735/9500, 9238/9400, 9107/9300, 8007/9200, 8064/9100, 7641/9000] # est_0 = [9873/9900, 9757/9800, 9603/9700, 9470/9600, 9735/9500, 9238/9400, 9107/9300, 8007/9200, 8064/9100, 7641/9000]
est_1 = [14/9900, 15/9800, 11/9700, 11/9600, 16/9500, 17/9400, 18/9300, 19/9200, 18/9100, 20/9000] # est_1 = [14/9900, 15/9800, 11/9700, 11/9600, 16/9500, 17/9400, 18/9300, 19/9200, 18/9100, 20/9000]
est_2 = [12/9900, 26/9800, 77/9700, 94/9600, 380/9500, 123/9400, 147/9300, 676/9200, 595/9100, 837/9000] # est_2 = [12/9900, 26/9800, 77/9700, 94/9600, 380/9500, 123/9400, 147/9300, 676/9200, 595/9100, 837/9000]
fig, ax = plt.subplots(3, 1) # naive study
rec_av_n = [np.nan, 33.3, 35.9, 38.1, 37.4, 33.3, 39.4, 40.7, 40.1, 38.9]
rec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 12.5, 0]
prec_av_n = [np.nan, 26.3, 44.56, 61.22, 49.7, 29.3, 63.3, 59.7, 77.1, 61.1]
prec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 100, 0]
acc_av_n = [np.nan,86,88.7,89.3,88,92,93.3,86.7,87.3,88.7]
acc_1_n = [np.nan,96,95, 96, 96 ,98,99, 94,93 ,97.0]
# stratified
rec_av_s = [np.nan, 44.53, 47.85, 56.45, 56.36, 58.71, 57.20, 62.13, 55.41, 46.85]
rec_1_s = [np.nan, 75.00, 50, 100, 75.00, 100, 100, 100, 75.00, 50.00]
prec_av_s = [np.nan, 36.8, 46.63, 41.42, 45.73, 33.69, 33.01, 52.68 , 44.68, 37.85]
prec_1_s = [np.nan, 6.67, 8.33, 9.52, 11.54, 8, 3.57, 16.67, 28.57, 5.00]
fig, ax = plt.subplots(4, 1)
ax[0].plot(round, wrong) ax[0].plot(round, rec_av_n, round, rec_av_s)
ax[2].set_xlabel('Iteration number') ax[0].set_ylabel('Recall (Average)')
ax[0].set_ylabel('Error rate') ax[0].legend(('Naive Sampling', 'Stratified Sampling'))
ax[1].plot(round, prec_av_n, round, prec_av_s)
ax[1].set_ylabel('Precision (Average)')
ax[1].legend(('Naive Sampling', 'Stratified Sampling'))
ax[1].plot(round, man_0, round, man_1, round, man_2) ax[2].plot(round, rec_1_n, round, rec_1_s)
ax[1].set_ylabel('Fraction of manual labels') ax[2].set_ylabel('Recall (Class 1)')
ax[2].legend(('Naive Sampling', 'Stratified Sampling'))
ax[2].plot(round, est_0, round, est_1, round, est_2)
ax[2].set_ylabel('Fraction of estimated labels')
ax[3].plot(round, prec_1_n, round, prec_1_s)
ax[3].set_ylabel('Precision (Class 1)')
ax[3].legend(('Naive Sampling', 'Stratified Sampling'))
ax[3].set_xlabel('Iteration number')
# limit x axis # limit x axis
ax[0].set_xbound(lower=1, upper=9) ax[0].set_xbound(lower=1, upper=9)
ax[1].set_xbound(lower=1, upper=9) ax[1].set_xbound(lower=1, upper=9)
ax[2].set_xbound(lower=1, upper=9) ax[2].set_xbound(lower=1, upper=9)
ax[3].set_xbound(lower=1, upper=9)
ax[0].set_ybound(lower=0) ax[0].set_ybound(lower=0)
ax[1].set_ybound(lower=0) ax[1].set_ybound(lower=0)
#ax[2].set_ybound(lower=0) ax[2].set_ybound(lower=0)
ax[3].set_ybound(lower=0)
# insert legend # ax[0].plot(round, rec_av_n)
ax[1].legend(('class 0', 'class 1', 'class 2')) # ax[2].set_xlabel('Iteration number')
ax[2].legend(('class 0', 'class 1', 'class 2')) # ax[0].set_ylabel('Metrics without stratified sampling')
fig.tight_layout() # ax[1].plot(round, man_0, round, man_1, round, man_2)
# ax[1].set_ylabel('Fraction of manual labels')
plt.savefig('..\\visualization\\Labeling_Grafik_070219.png') # ax[2].plot(round, est_0, round, est_1, round, est_2)
# ax[2].set_ylabel('Fraction of estimated labels')
# # limit x axis
# ax[0].set_xbound(lower=1, upper=9)
# ax[1].set_xbound(lower=1, upper=9)
# ax[2].set_xbound(lower=1, upper=9)
# ax[0].set_ybound(lower=0)
# ax[1].set_ybound(lower=0)
# #ax[2].set_ybound(lower=0)
# # insert legend
# ax[1].legend(('class 0', 'class 1', 'class 2'))
# ax[2].legend(('class 0', 'class 1', 'class 2'))
plt.savefig('..\\visualization\\Labeling_plot_190404.png')
plt.savefig('..\\visualization\\Labeling_plot_190404.eps')
plt.show()
def plot_labeling_rounds_naive():
# round numbers
round = [0,1,2,3,4,5,6,7,8,9]
# naive study
rec_av_n = [np.nan, 33.3, 35.9, 38.1, 37.4, 33.3, 39.4, 40.7, 40.1, 38.9]
rec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 12.5, 0]
prec_av_n = [np.nan, 26.3, 44.56, 61.22, 49.7, 29.3, 63.3, 59.7, 77.1, 61.1]
prec_1_n = [np.nan, 0, 0, 0, 0, 0, 0, 0, 100, 0]
acc_av_n = [np.nan, 86,88.7,89.3,88,92,93.3,86.7,87.3,88.7]
acc_1_n = [np.nan, 96,95, 96, 96 ,98,99, 94,93 ,97.0]
fig, ax = plt.subplots(2, 1)
ax[0].plot(round, rec_av_n, round, prec_av_n, round, acc_av_n)
ax[0].set_ylabel('Average metrics')
ax[0].legend(('Recall', 'Precision', 'Accuracy'))
ax[1].plot(round, rec_1_n, round, prec_1_n, round, acc_1_n)
ax[1].set_ylabel('Class 1 metrics')
ax[1].legend(('Recall', 'Precision', 'Accuracy'))
ax[1].set_xlabel('Iteration number')
# limit x axis
ax[0].set_xbound(lower=1, upper=9)
ax[1].set_xbound(lower=1, upper=9)
# y axis
ax[1].set_ybound(lower=-5)
ax[0].set_ybound(lower=-5)
plt.savefig('..\\visualization\\Labeling_plot_190411.png')
plt.savefig('..\\visualization\\Labeling_plot_190411.eps')
plt.show() plt.show()
def plot_cumulative(): def plot_cumulative():
# load pickle object # load pickle object
with open('../obj/array_class_probs_stratified_round_9.pkl', 'rb') as input: with open('../obj/array_3model_svm_class2.pkl', 'rb') as input:
list = pickle.load(input) list = pickle.load(input)
# sort list in descending order # sort list in descending order
@ -80,18 +163,25 @@ class LabelingPlotter():
#ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals]) #ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])
ax.grid(True) #ax.grid(True)
#ax.legend(loc='right') #ax.legend(loc='right')
#ax.set_title('Cumulative distribution of highest estimated probability') ax.set_title('Predictions class 2 (SVM)')
ax.set_xlabel('Highest estimated probability') # for iterations
ax.set_ylabel('Fraction of articles with this highest estimated probability') #ax.set_xlabel('Highest estimated probability')
#ax.set_ylabel('Fraction of articles with this highest estimated probability')
# for 3-models
ax.set_xlabel('Estimated probability for class 2')
ax.set_ylabel('Fraction of articles with this probability')
#plt.axis([0.97, 1, 0.95, 1.01])
#plt.axis([0.5, 0.99, 0, 0.006]) #round 9 #plt.axis([0.5, 0.99, 0, 0.006]) #round 9
plt.axis([0.5, 1, 0, 0.015]) # round 9 stratified #plt.axis([0.5, 1, 0, 0.015]) # round 9 stratified
#plt.axis([0.65, 1, 0, 0.003]) # round 10 #plt.axis([0.65, 1, 0, 0.003]) # round 10
#plt.axis([0.7, 1, 0, 0.002]) # round 11 #plt.axis([0.7, 1, 0, 0.002]) # round 11
#ax.set_xbound(lower=0.5, upper=0.99) #ax.set_xbound(lower=0.5, upper=0.99)
plt.savefig('..\\visualization\\proba_stratified_round_9.png') #plt.savefig('..\\visualization\\proba_stratified_round_9.png')
plt.savefig('..\\visualization\\proba_stratified_round_9.eps') #plt.savefig('..\\visualization\\proba_stratified_round_9.eps')
plt.savefig('..\\visualization\\3model_svm_class2.png')
plt.savefig('..\\visualization\\3model_svm_class2.eps')
plt.show() plt.show()
@ -121,4 +211,5 @@ class LabelingPlotter():
if __name__ == '__main__': if __name__ == '__main__':
#LabelingPlotter.plot_correlation() #LabelingPlotter.plot_correlation()
LabelingPlotter.plot_cumulative() #LabelingPlotter.plot_cumulative()
LabelingPlotter.plot_labeling_rounds_naive()

View File

@ -20,7 +20,7 @@ class MNBInteractive:
However, in practice, fractional counts such as tf-idf may also work. However, in practice, fractional counts such as tf-idf may also work.
''' '''
def estimate_mnb(labeled_data, unlabeled_data, sklearn_cv=False): def estimate_mnb(labeled_data, unlabeled_data, sklearn_cv=True):
'''fits naive bayes model '''fits naive bayes model
''' '''

View File

@ -17,7 +17,7 @@ from sklearn.naive_bayes import MultinomialNB
class MultinomialNaiveBayes: class MultinomialNaiveBayes:
def make_mnb(dataset, sklearn_cv=True, percentile=100): def make_mnb(dataset, sklearn_cv=True, percentile=100, bigram=False):
'''fits naive bayes model with StratifiedKFold '''fits naive bayes model with StratifiedKFold
''' '''
print('# starting multinomial naive bayes') print('# starting multinomial naive bayes')
@ -29,7 +29,13 @@ class MultinomialNaiveBayes:
y = dataset['Label'] y = dataset['Label']
if sklearn_cv: if sklearn_cv:
cv = CountVectorizer() if bigram:
cv = CountVectorizer(ngram_range=(2,2))
else:
# ignore company names
company_names_list = BagOfWords.load_company_names()
stopwords = list(BagOfWords.set_stop_words()).extend(company_names_list)
cv = CountVectorizer(stop_words = stopwords)
# use stratified k-fold cross-validation as split method # use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5) skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
@ -43,11 +49,6 @@ class MultinomialNaiveBayes:
precision_scores = [] precision_scores = []
f1_scores = [] f1_scores = []
# probabilities of each class (of each fold)
#class_prob = []
# counts number of training samples observed in each class
#class_counts = []
# for each fold # for each fold
n = 0 n = 0
for train, test in skf.split(X,y): for train, test in skf.split(X,y):
@ -90,13 +91,6 @@ class MultinomialNaiveBayes:
#predict class #predict class
predictions_train = classifier.predict(training_data_r) predictions_train = classifier.predict(training_data_r)
predictions_test = classifier.predict(testing_data_r) predictions_test = classifier.predict(testing_data_r)
# print('train:')
# print(y[train])
# print('test:')
# print(y[test])
# print()
# print('pred')
# print(predictions_test)
#print and store metrics #print and store metrics
rec = recall_score(y[test], predictions_test, average='weighted') rec = recall_score(y[test], predictions_test, average='weighted')
@ -113,22 +107,19 @@ class MultinomialNaiveBayes:
#class_counts.append(classifier.class_count_) #class_counts.append(classifier.class_count_)
########################## ##########################
# probability estimates for the test vector (testing_data)
class_probs = classifier.predict_proba(testing_data)
# number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_
# classes in order used # classes in order used
classes = classifier.classes_ classes = classifier.classes_
print('average: recall, precision, f1 score') print('Recall (Min): ' + str(min(recall_scores)))
print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10) print('Recall (Max): ' + str(max(recall_scores)))
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
print()
print('Precision (Min): ' + str(min(precision_scores)))
print('Precision (Max): ' + str(max(precision_scores)))
print('Precision (Average) :' + str(sum(precision_scores)/len(precision_scores)))
# return classes and vector of class estimates # return classes and vector of class estimates
return recall_scores, precision_scores, f1_scores, class_probs return recall_scores, precision_scores
######## nur für resubstitutionsfehler benötigt ######## ######## nur für resubstitutionsfehler benötigt ########
def analyze_errors(training, testing): def analyze_errors(training, testing):
@ -195,4 +186,4 @@ if __name__ == '__main__':
quotechar='\'') quotechar='\'')
# select only labeled articles # select only labeled articles
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=True, percentile=100) MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=False)

View File

@ -17,13 +17,14 @@ from sklearn.metrics import recall_score, precision_score
import sklearn import sklearn
from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
class MultinomialNaiveBayes_Word2Vec: class MultinomialNaiveBayes_Word2Vec:
def make_mnb(dataset, sklearn_cv=True, percentile=100): def make_mnb(dataset):
'''fits naive bayes model with StratifiedKFold '''fits naive bayes model with StratifiedKFold
''' '''
vector_size=150
def read_corpus(data, tokens_only=False): def read_corpus(data, tokens_only=False):
list_of_lists = [] list_of_lists = []
@ -35,7 +36,13 @@ class MultinomialNaiveBayes_Word2Vec:
list_of_lists.append(gensim.models.doc2vec.TaggedDocument(BagOfWords.extract_words(text), [i])) list_of_lists.append(gensim.models.doc2vec.TaggedDocument(BagOfWords.extract_words(text), [i]))
return list_of_lists return list_of_lists
print('# starting multinomial naive bayes') def normalize_vector(two_dim_array, min, max):
norm_array = two_dim_array
for (x,y), value in np.ndenumerate(two_dim_array):
norm_array[x][y] = ((value - min) / (max - min))
return norm_array
print('# starting multinomial naive bayes with Word2Vec')
print('# ...') print('# ...')
# split data into text and label set # split data into text and label set
@ -46,20 +53,19 @@ class MultinomialNaiveBayes_Word2Vec:
# use stratified k-fold cross-validation as split method # use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5) skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
classifier = MultinomialNB(alpha=1.0e-10, #classifier = MultinomialNB(alpha=1.0e-10,
fit_prior=False, # fit_prior=False,
class_prior=None) # class_prior=None)
# classifier = SVC(probability=True,
# gamma='auto')
classifier = LinearSVC()
# metrics # metrics
recall_scores = [] recall_scores = []
precision_scores = [] precision_scores = []
f1_scores = [] f1_scores = []
# probabilities of each class (of each fold)
#class_prob = []
# counts number of training samples observed in each class
#class_counts = []
# for each fold # for each fold
n = 0 n = 0
for train, test in skf.split(X,y): for train, test in skf.split(X,y):
@ -68,28 +74,51 @@ class MultinomialNaiveBayes_Word2Vec:
print('# split no. ' + str(n)) print('# split no. ' + str(n))
# train model with gensim # train model with gensim
training_data = read_corpus(X[train], tokens_only=False) tagged_train_data = read_corpus(X[train], tokens_only=False)
testing_data = read_corpus(X[test], tokens_only=True) tagged_test_data = read_corpus(X[test], tokens_only=False)
all_data = read_corpus(X, tokens_only=False)
# instantiate a Doc2Vec object # instantiate a Doc2Vec object
doc2vec_model = Doc2Vec(training_data, vector_size=100, window=2, min_count=2, epochs = 40) model = Doc2Vec(vector_size=100,
min_count=20,
epochs=40,
negative=0,
workers=1,
seed=5,
hs=1)
# Frage: hier dürfen keine negativen Werte drin sein für Naive Bayes? model.build_vocab(tagged_train_data)
print(doc2vec_model.docvecs[0])
print(doc2vec_model.docvecs[1]) model.train(tagged_train_data,
print(doc2vec_model.docvecs[2]) total_examples=model.corpus_count,
epochs=model.epochs)
training_data = [doc2vec_model.docvecs[i] for i in range(len(training_data))]
model.docvecs.count
# Frage: muss man bei den testing daten auch einen tag mit machen?
testing_data = [doc2vec_model.infer_vector(vector) for vector in testing_data] X_train=[model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
X_test=[model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]
# convert matrix
X_train=np.vstack(X_train)
X_test=np.vstack(X_test)
# min max for normalization
minimum = min(X_train.min(), X_test.min())
maximum = max(X_train.max(), X_test.max())
X_test_norm = normalize_vector(X_test, minimum, maximum)
X_train_norm = normalize_vector(X_train, minimum, maximum)
# shape vectors
X_test_norm.shape
y[test].shape
X_train_norm.shape
y[train].shape
#fit classifier #fit classifier
classifier.fit(training_data, y[train]) classifier.fit(X_train_norm, y[train])
#predict class #predict class
predictions_train = classifier.predict(training_data) predictions_train = classifier.predict(X_train_norm)
predictions_test = classifier.predict(testing_data) predictions_test = classifier.predict(X_test_norm)
#print and store metrics #print and store metrics
rec = recall_score(y[test], predictions_test, average='weighted') rec = recall_score(y[test], predictions_test, average='weighted')
@ -104,21 +133,25 @@ class MultinomialNaiveBayes_Word2Vec:
########################## ##########################
# probability estimates for the test vector (testing_data) # probability estimates for the test vector (testing_data)
class_probs = classifier.predict_proba(testing_data) #class_probs = classifier.predict_proba(X_test_norm)
# number of samples encountered for each class during fitting # number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided # this value is weighted by the sample weight when provided
class_count = classifier.class_count_ #class_count = classifier.class_count_
# classes in order used # classes in order used
classes = classifier.classes_ #classes = classifier.classes_
print('average: recall, precision, f1 score')
print(sum(recall_scores)/10, sum(precision_scores)/10, sum(f1_scores)/10)
print('Recall (Min): ' + str(min(recall_scores)))
print('Recall (Max): ' + str(max(recall_scores)))
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
print()
print('Precision (Min): ' + str(min(precision_scores)))
print('Precision (Max): ' + str(max(precision_scores)))
print('Precision (Average) :' + str(sum(precision_scores)/len(precision_scores)))
# return classes and vector of class estimates # return classes and vector of class estimates
return recall_scores, precision_scores, f1_scores, class_probs return recall_scores, precision_scores, f1_scores#, class_probs
if __name__ == '__main__': if __name__ == '__main__':
@ -135,4 +168,4 @@ if __name__ == '__main__':
quotechar='\'') quotechar='\'')
# select only labeled articles # select only labeled articles
MultinomialNaiveBayes.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True), sklearn_cv=False, percentile=100) MultinomialNaiveBayes_Word2Vec.make_mnb(df.loc[df['Label'] != -1].reset_index(drop=True))

View File

@ -0,0 +1,198 @@
'''
Multinomial Naive Bayes Classifier
==================================
'''
from BagOfWords import BagOfWords

import csv

import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
class MultinomialNaiveBayes:

    def make_mnb(dataset, sklearn_cv=True, percentile=100):
        '''fits a multinomial naive bayes model with stratified
        10-fold cross-validation.

        :param dataset:    DataFrame with 'Title', 'Text' and 'Label' columns
        :param sklearn_cv: if True use sklearn's CountVectorizer (unigrams
                           and bigrams), otherwise the project's own
                           BagOfWords implementation
        :param percentile: percentage of highest-scoring features kept by
                           SelectPercentile (100 keeps all features)
        :return: (recall_scores, precision_scores, f1_scores, class_probs)
                 per-fold metric lists plus the class probability estimates
                 of the last fold's test set
        '''
        print('# starting multinomial naive bayes')
        print('# ...')

        # split data into text and label set:
        # articles' text (title + text) and the labels
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            # count unigrams and bigrams
            cv = CountVectorizer(ngram_range = (1,2))

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)

        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)

        # per-fold metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # for each fold
        n = 0
        for train, test in skf.split(X,y):

            n += 1
            print('# split no. ' + str(n))

            if sklearn_cv:
                # use sklearn CountVectorizer
                # fit the training data and then return the matrix
                training_data = cv.fit_transform(X[train], y[train]).toarray()
                # transform testing data and return the matrix
                testing_data = cv.transform(X[test]).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X[train])
                vocab = BagOfWords.make_vocab(extracted_words)
                # fit the training data and then return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                    vocab, rel_freq, stemming)
                # transform testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                    vocab, rel_freq, stemming)

            # apply select percentile to keep the best features
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])

            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # print and store metrics
            rec = recall_score(y[test], predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test, average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # f1 = harmonic mean of precision and recall;
            # guard against division by zero when both are 0
            f1_scores.append(0.0 if (prec + rec) == 0
                             else 2 * (prec * rec) / (prec + rec))

        ##########################
        # probability estimates for the last fold's test set;
        # note: must use the feature-selected matrix (testing_data_r),
        # i.e. the same feature space the classifier was fitted on
        class_probs = classifier.predict_proba(testing_data_r)

        # number of samples encountered for each class during fitting
        # (this value is weighted by the sample weight when provided)
        class_count = classifier.class_count_

        # classes in the order used by the classifier
        classes = classifier.classes_

        # average metrics over the actual number of folds
        n_folds = len(recall_scores)
        print('average: recall, precision, f1 score')
        print(sum(recall_scores)/n_folds,
              sum(precision_scores)/n_folds,
              sum(f1_scores)/n_folds)

        # return classes and vector of class estimates
        return recall_scores, precision_scores, f1_scores, class_probs

    ######## nur für resubstitutionsfehler benötigt ########
    def analyze_errors(training, testing):
        '''calculates resubstitution error
        shows indices of false classified articles
        uses multinomial naive bayes with a train/test split

        :param training: DataFrame with 'Title', 'Text' and 'Label' columns
        :param testing:  DataFrame with 'Title', 'Text' and 'Label' columns
        '''
        X_train = training['Title'] + ' ' + training['Text']
        y_train = training['Label']
        X_test = testing['Title'] + ' ' + testing['Text']
        y_test = testing['Label']

        count_vector = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train).toarray()
        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_test).toarray()

        # Naive Bayes
        classifier = MultinomialNB(alpha=1.0e-10,
                                   fit_prior=False,
                                   class_prior=None)
        # fit classifier
        classifier.fit(training_data, y_train)

        # predict class
        predictions = classifier.predict(testing_data)

        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_test)):
            if y_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predictions[i], y_test[i]))
                print(X_test[i])
                print(y_test[i])
                print()

        # print metrics
        # 'weighted' average: the labels are multiclass (0, 1, 2),
        # so the binary default of f1_score would raise a ValueError
        print('F1 score: ', format(f1_score(y_test, predictions,
                                            average='weighted')))
if __name__ == '__main__':

    # load the current data set from csv
    print('# reading dataset')
    print('# ...')

    read_args = {
        'sep': '|',
        'usecols': range(1,13), # drop first column 'unnamed'
        'encoding': 'utf-8',
        'quoting': csv.QUOTE_NONNUMERIC,
        'quotechar': '\'',
    }
    df = pd.read_csv('../data/interactive_labeling_round_11.csv', **read_args)

    # keep only the manually labeled articles (label != -1)
    labeled = df.loc[df['Label'] != -1].reset_index(drop=True)
    MultinomialNaiveBayes.make_mnb(labeled, sklearn_cv=True, percentile=100)

View File

@ -48,7 +48,6 @@ class NaiveBayes:
# metrics # metrics
recall_scores = [] recall_scores = []
precision_scores = [] precision_scores = []
#f1_scores = []
# probabilities of each class (of each fold) # probabilities of each class (of each fold)
class_prob = [] class_prob = []
@ -113,32 +112,15 @@ class NaiveBayes:
########################## ##########################
#print metrics of test set #print metrics of test set
# print('-------------------------') print('Recall (Min): ' + str(min(recall_scores)))
# print('prediction of testing set:') print('Recall (Max): ' + str(max(recall_scores)))
# print('Precision score: min = {}, max = {}, average = {}' print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
# .format(min(precision_scores), print()
# max(precision_scores), print('Precision (Min): ' + str(min(precision_scores)))
# sum(precision_scores)/float(len(precision_scores)))) print('Precision (Max): ' + str(max(precision_scores)))
# print('Recall score: min = {}, max = {}, average = {}' print('Precision (Average) :' + str(sum(precision_scores)/len(precision_scores)))
# .format(min(recall_scores),
# max(recall_scores),
# sum(recall_scores)/float(len(recall_scores))))
# print('F1 score: min = {}, max = {}, average = {}'
# .format(min(f1_scores),
# max(f1_scores),
# sum(f1_scores)/float(len(f1_scores))))
# print()
# # print probability of each class
# print('probability of each class:')
# print()
# print(class_prob)
# print()
# print('number of samples of each class:')
# print()
# print(class_counts)
# print()
return class_prob, class_counts, recall_scores, precision_scores#, f1_scores return class_prob, class_counts, recall_scores, precision_scores
##### nur für overfit testing ########### ##### nur für overfit testing ###########
#print('overfit testing: prediction of training set') #print('overfit testing: prediction of training set')

83
src/SVMInteractive.py Normal file
View File

@ -0,0 +1,83 @@
'''
SVM Classifier for Interactive Labeling
=======================================
returns probabilities for classes needed for interactive labeling.
'''
from BagOfWords import BagOfWords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
class SVMInteractive:

    def estimate_svm(labeled_data, unlabeled_data, sklearn_cv=True):
        '''trains an SVM on the labeled articles and estimates class
        probabilities for the unlabeled articles.

        :param labeled_data:   DataFrame with 'Title', 'Text' and 'Label'
                               columns, used as the training set
        :param unlabeled_data: DataFrame with 'Title' and 'Text' columns,
                               the articles to classify
        :param sklearn_cv:     if True use sklearn's CountVectorizer,
                               otherwise the project's own BagOfWords
        :return: (classes, class_probs) — the class labels in the order
                 used by the classifier and one row of probability
                 estimates per unlabeled article
        '''
        print('# SVM: starting interactive SVM...')
        print()

        # training set: articles' text (title + text) and labels
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        # unlabeled set: articles' text (title + text)
        U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']

        if sklearn_cv:
            cv = CountVectorizer()

        # probability=True enables predict_proba on the fitted model
        classifier = SVC(probability=True,
                         gamma='auto')

        if sklearn_cv:
            # use sklearn CountVectorizer
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X, y).toarray()
            # transform testing data and return the matrix
            testing_data = cv.transform(U).toarray()
        else:
            # use my own BagOfWords python implementation
            stemming = True
            rel_freq = False
            extracted_words = BagOfWords.extract_all_words(X)
            vocab = BagOfWords.make_vocab(extracted_words)

            # fit the training data and then return the matrix
            print('# SVM: fit training data and calculate matrix...')
            print()
            training_data = BagOfWords.make_matrix(extracted_words,
                vocab, rel_freq, stemming)

            # transform testing data and return the matrix
            print('# SVM: transform testing data to matrix...')
            print()
            extracted_words = BagOfWords.extract_all_words(U)
            testing_data = BagOfWords.make_matrix(extracted_words,
                vocab, rel_freq, stemming)

        # fit classifier on all labeled articles
        classifier.fit(training_data, y)

        # probability estimates for the unlabeled articles
        class_probs = classifier.predict_proba(testing_data)

        # classes in the order used by the classifier
        classes = classifier.classes_

        print('# ending SVM')

        # return classes and vector of class estimates
        return classes, class_probs

81
src/SVMInteractive_wp.py Normal file
View File

@ -0,0 +1,81 @@
'''
SVM Classifier for Interactive Labeling
=======================================
returns probabilities for classes needed for interactive labeling.
'''
from BagOfWords import BagOfWords
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
class SVMInteractive_wp:

    def estimate_svm(labeled_data, unlabeled_data, sklearn_cv=True):
        '''trains a linear SVM on the labeled articles and predicts a label
        for each unlabeled article.

        unlike SVMInteractive, this variant returns hard predictions
        instead of probabilities (LinearSVC provides no predict_proba).

        :param labeled_data:   DataFrame with 'Title', 'Text' and 'Label'
                               columns, used as the training set
        :param unlabeled_data: DataFrame with 'Title' and 'Text' columns,
                               the articles to classify
        :param sklearn_cv:     if True use sklearn's CountVectorizer,
                               otherwise the project's own BagOfWords
        :return: (classes, predictions_test) — the class labels in the
                 order used by the classifier and one predicted label per
                 unlabeled article
        '''
        print('# SVM: starting interactive SVM...')
        print()

        # training set: articles' text (title + text) and labels
        X = labeled_data['Title'] + '. ' + labeled_data['Text']
        y = labeled_data['Label']

        # unlabeled set: articles' text (title + text)
        U = unlabeled_data['Title'] + '. ' + unlabeled_data['Text']

        if sklearn_cv:
            cv = CountVectorizer()

        classifier = LinearSVC()

        if sklearn_cv:
            # use sklearn CountVectorizer
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X, y).toarray()
            # transform testing data and return the matrix
            testing_data = cv.transform(U).toarray()
        else:
            # use my own BagOfWords python implementation
            stemming = True
            rel_freq = False
            extracted_words = BagOfWords.extract_all_words(X)
            vocab = BagOfWords.make_vocab(extracted_words)

            # fit the training data and then return the matrix
            print('# SVM: fit training data and calculate matrix...')
            print()
            training_data = BagOfWords.make_matrix(extracted_words,
                vocab, rel_freq, stemming)

            # transform testing data and return the matrix
            print('# SVM: transform testing data to matrix...')
            print()
            extracted_words = BagOfWords.extract_all_words(U)
            testing_data = BagOfWords.make_matrix(extracted_words,
                vocab, rel_freq, stemming)

        # fit classifier on all labeled articles
        classifier.fit(training_data, y)

        # hard label predictions for the unlabeled articles
        predictions_test = classifier.predict(testing_data)

        # classes in the order used by the classifier
        classes = classifier.classes_

        print('# ending SVM')

        # return classes and vector of predicted labels
        return classes, predictions_test

View File

@ -19,103 +19,143 @@ import csv
import pandas as pd import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer, accuracy_score
from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC from sklearn.svm import SVC
from sklearn.svm import NuSVC
class SVM: class SVM_multiclass:
def make_svm(dataset, sklearn_cv=True): def make_svm(dataset, sklearn_cv=True, percentile=100):
print('# fitting model') print('# starting multinomial svm')
print('# ...') print('# ...')
# split data into text and label set # split data into text and label set
# join title and text
# articles' text (title + text)
X = dataset['Title'] + '. ' + dataset['Text'] X = dataset['Title'] + '. ' + dataset['Text']
# articles' labels
y = dataset['Label'] y = dataset['Label']
matrix = pd.DataFrame()
# fit the training data and then return the matrix
if sklearn_cv: if sklearn_cv:
# use sklearn CountVectorizer
matrix = CountVectorizer().fit_transform(X).toarray() # ignore company names
else: company_names_list = BagOfWords.load_company_names()
# use own BOW implementation stopwords = list(BagOfWords.set_stop_words()).extend(company_names_list)
matrix = BagOfWords.fit_transform(X) cv = CountVectorizer(stop_words = stopwords)
# use stratified k-fold cross-validation as split method # use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True) skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
# use only most important features classifier = LinearSVC()
selector = SelectPercentile()
# for predict proba:
#classifier = SVC(probability=True,
# gamma='auto')
pipeline = Pipeline([('perc', selector), ('SVC', SVC())]) # metrics
recall_scores = []
precision_scores = []
accuracy_scores = []
f1_scores = []
grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75, 100], # for each fold
'SVC__kernel': ['linear'], n = 0
'SVC__gamma': [0.00001, 0.0001], for train, test in skf.split(X,y):
'SVC__C': [0.1, 1]},
cv=skf,
scoring=make_scorer(f1_score, average='micro'))
print('# fit classifier') n += 1
print('# ...') print('# split no. ' + str(n))
grid.fit(matrix,y) if sklearn_cv:
# use sklearn CountVectorizer
# fit the training data and then return the matrix
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()
else:
# use my own BagOfWords python implementation
stemming = True
rel_freq = True
extracted_words = BagOfWords.extract_all_words(X[train])
vocab = BagOfWords.make_vocab(extracted_words)
# DataFrame of results # fit the training data and then return the matrix
df_results = grid.cv_results_ training_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
# transform testing data and return the matrix
extracted_words = BagOfWords.extract_all_words(X[test])
testing_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
# print results # apply select percentile
###################### selector = SelectPercentile(percentile=percentile)
print('RESULTS:') selector.fit(training_data, y[train])
print('')
print('mean_test_score:') # new reduced data sets
print(df_results['mean_test_score']) training_data_r = selector.transform(training_data)
print('') testing_data_r = selector.transform(testing_data)
print('mean of means:')
print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score'])) #fit classifier
print('') classifier.fit(training_data_r, y[train])
print('best score:') #predict class
print(grid.best_score_) predictions_train = classifier.predict(training_data_r)
predictions_test = classifier.predict(testing_data_r)
#print and store metrics
rec = recall_score(y[test], predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y[test], predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
acc = recall_score(y[test], predictions_test, average='weighted')
accuracy_scores.append(acc)
print('acc: ' + str(acc))
print('#')
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
#class_prob.append(classifier.class_prior_)
#class_counts.append(classifier.class_count_)
#print(classifier.predict_proba(testing_data_r))
##########################
# classes in order used
classes = classifier.classes_
print('Recall (Min): ' + str(min(recall_scores)))
print('Recall (Max): ' + str(max(recall_scores)))
print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
print() print()
print('best parameters set found on development set:') print('Precision (Min): ' + str(min(precision_scores)))
print(grid.best_params_) print('Precision (Max): ' + str(max(precision_scores)))
print('Precision (Average) :' + str(sum(precision_scores)/len(precision_scores)))
print() print()
print('Accuracy (Min): ' + str(min(accuracy_scores)))
print('Accuracy (Max): ' + str(max(accuracy_scores)))
print('Accuracy (Average) :' + str(sum(accuracy_scores)/len(accuracy_scores)))
if __name__ == '__main__': # return classes and vector of class estimates
return recall_scores, precision_scores
print('# starting svm') if __name__ == '__main__':
print('# ...')
#file = '..\\data\\classification_labelled_corrected.csv' # read csv file
print('# reading dataset')
print('# ...')
# read csv file # read current data set from csv
print('# reading dataset') df = pd.read_csv('../data/interactive_labeling_round_11.csv',
print('# ...') sep='|',
usecols=range(1,13), # drop first column 'unnamed'
encoding='utf-8',
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
# data = pd.read_csv(file, # select only labeled articles
# sep='|', SVM_multiclass.make_svm(df.loc[df['Label'] != -1].reset_index(drop=True),
# engine='python', sklearn_cv=True)
# decimal='.',
# quotechar='\'',
# quoting=csv.QUOTE_NONE)
# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
sep='|',
usecols=range(1,13), # drop first column 'unnamed'
encoding='utf-8',
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
data = df.loc[df['Label'] != -1].reset_index(drop=True)
use_count_vectorizer = True
make_svm(data, use_count_vectorizer)
print('# ending svm')

123
src/SVM_multiclass_grid.py Normal file
View File

@ -0,0 +1,123 @@
'''
Support Vector Machines (SVM) Classifier with Grid Search
=========================================================
The SVM training algorithm builds a model from the training data that
assigns test samples to one of several classes, making it a
non-probabilistic linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate classes are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a class based on which side of the gap they fall.
This variant tunes the feature-selection percentile and the SVC
hyperparameters (kernel, gamma, C) via cross-validated grid search.
'''
from BagOfWords import BagOfWords
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
class SVM_multiclass_grid:
    '''Grid-searched SVM classifier for multiclass news-article labeling.'''

    def make_svm(dataset, sklearn_cv=True):
        '''Fit an SVC inside a SelectPercentile pipeline via grid search.

        dataset    -- DataFrame with 'Title', 'Text' and 'Label' columns
        sklearn_cv -- True: vectorize with sklearn's CountVectorizer;
                      False: use the project's own BagOfWords implementation
        Prints the cross-validated scores and the best parameter set.
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set
        # articles' text (title + text)
        X = dataset['Title'] + '. ' + dataset['Text']
        # articles' labels
        y = dataset['Label']

        matrix = pd.DataFrame()
        # fit the training data and then return the matrix
        if sklearn_cv:
            # use sklearn CountVectorizer, ignoring company names as well
            # as the usual stop words
            company_names_list = BagOfWords.load_company_names()
            # BUG FIX: list.extend() mutates in place and returns None, so
            # 'list(...).extend(names)' silently produced stopwords=None.
            # Concatenate instead so the stop-word list is actually used.
            stopwords = list(BagOfWords.set_stop_words()) + company_names_list
            matrix = CountVectorizer(stop_words=stopwords)\
                .fit_transform(X).toarray()
        else:
            # use own BOW implementation
            matrix = BagOfWords.fit_transform(X)

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # use only most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

        # grid over feature percentile and SVC hyperparameters,
        # scored by micro-averaged recall
        grid = GridSearchCV(pipeline, {'perc__percentile': [50, 75, 100],
                                       'SVC__kernel': ['linear'],
                                       'SVC__gamma': [0.000001, 0.00001],
                                       'SVC__C': [0.01, 0.1]},
                            cv=skf,
                            scoring=make_scorer(recall_score, average='micro'))

        print('# fit classifier')
        print('# ...')
        grid.fit(matrix, y)

        # DataFrame of results
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
        print('')
        print('mean_test_score:')
        print(df_results['mean_test_score'])
        print('')
        print('mean of means:')
        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()
if __name__ == '__main__':

    print('# starting svm')
    print('# ...')

    # read csv file
    print('# reading dataset')
    print('# ...')

    # read current data set from csv
    df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                     sep='|',
                     usecols=range(1, 13),  # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # keep only manually labeled articles (Label == -1 means unlabeled)
    data = df.loc[df['Label'] != -1].reset_index(drop=True)

    use_count_vectorizer = True
    # BUG FIX: make_svm is an attribute of SVM_multiclass_grid, not a
    # module-level function; the bare call raised NameError.
    SVM_multiclass_grid.make_svm(data, use_count_vectorizer)

    print('# ending svm')

View File

@ -0,0 +1,152 @@
'''
Support Vector Machines (SVM) Classifier
========================================
The SVM training algorithm builds a model from the training data that
assigns test samples to one of several classes, making it a
non-probabilistic linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate classes are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a class based on which side of the gap they fall.
This multiclass variant reports per-fold recall and precision under
stratified 10-fold cross-validation.
'''
from BagOfWords import BagOfWords
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.svm import NuSVC
class SVM_multiclass:
    '''Multiclass SVM evaluated with stratified 10-fold cross-validation.'''

    def make_svm(dataset, sklearn_cv=True, percentile=100):
        '''Cross-validate an SVC on the labeled articles.

        dataset    -- DataFrame with 'Title', 'Text' and 'Label' columns
        sklearn_cv -- True: vectorize with sklearn's CountVectorizer;
                      False: use the project's own BagOfWords implementation
        percentile -- SelectPercentile feature-selection percentile
        Returns (recall_scores, precision_scores): per-fold metric lists.
        '''
        print('# starting multinomial svm')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + '. ' + dataset['Text']
        y = dataset['Label']

        if sklearn_cv:
            # ignore company names in addition to the usual stop words.
            # BUG FIX: list.extend() mutates in place and returns None, so
            # 'list(...).extend(names)' silently produced stopwords=None.
            # Concatenate instead so the stop-word list is actually used.
            company_names_list = BagOfWords.load_company_names()
            stopwords = list(BagOfWords.set_stop_words()) + company_names_list
            cv = CountVectorizer(stop_words=stopwords)

        # use stratified k-fold cross-validation as split method
        # (fixed random_state for reproducible splits)
        skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        #classifier = LinearSVC()
        # for predict proba:
        classifier = SVC(probability=True,
                         gamma='auto')

        # per-fold metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # for each fold
        n = 0
        for train, test in skf.split(X, y):

            n += 1
            print('# split no. ' + str(n))

            if sklearn_cv:
                # use sklearn CountVectorizer
                # fit the training data and then return the matrix
                training_data = cv.fit_transform(X[train], y[train]).toarray()
                # transform testing data and return the matrix
                testing_data = cv.transform(X[test]).toarray()
            else:
                # use my own BagOfWords python implementation
                stemming = True
                rel_freq = True
                extracted_words = BagOfWords.extract_all_words(X[train])
                vocab = BagOfWords.make_vocab(extracted_words)

                # fit the training data and then return the matrix
                training_data = BagOfWords.make_matrix(extracted_words,
                                                       vocab, rel_freq, stemming)
                # transform testing data and return the matrix
                extracted_words = BagOfWords.extract_all_words(X[test])
                testing_data = BagOfWords.make_matrix(extracted_words,
                                                      vocab, rel_freq, stemming)

            # apply select percentile (fit on training fold only)
            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])

            # new reduced data sets
            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])
            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # print and store metrics
            rec = recall_score(y[test], predictions_test, average='weighted')
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test, average='weighted')
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # harmonic mean of precision and recall;
            # guard against division by zero when both are 0
            if (prec + rec) > 0:
                f1_scores.append(2 * (prec * rec) / (prec + rec))
            else:
                f1_scores.append(0.0)
            #class_prob.append(classifier.class_prior_)
            #class_counts.append(classifier.class_count_)
            print(classifier.predict_proba(testing_data_r))

        ##########################
        # classes in order used
        classes = classifier.classes_

        print('Recall (Min): ' + str(min(recall_scores)))
        print('Recall (Max): ' + str(max(recall_scores)))
        print('Recall (Average): ' + str(sum(recall_scores)/len(recall_scores)))
        print()
        print('Precision (Min): ' + str(min(precision_scores)))
        print('Precision (Max): ' + str(max(precision_scores)))
        print('Precision (Average) :' + str(sum(precision_scores)/len(precision_scores)))

        # return per-fold recall and precision
        return recall_scores, precision_scores
if __name__ == '__main__':

    # read csv file
    print('# reading dataset')
    print('# ...')

    # load the current labeling round from disk
    csv_path = '../data/interactive_labeling_round_11.csv'
    df = pd.read_csv(csv_path,
                     sep='|',
                     usecols=range(1, 13),  # drop first column 'unnamed'
                     encoding='utf-8',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # restrict to articles that already carry a manual label
    labeled = df.loc[df['Label'] != -1].reset_index(drop=True)
    SVM_multiclass.make_svm(labeled, sklearn_cv=True)

View File

@ -22,314 +22,315 @@ from wordcloud import WordCloud
class VisualizerNews: class VisualizerNews:
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d') datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
def plot_wordcloud_dataset(): def plot_wordcloud_dataset():
'''plots word cloud image of most common words in dataset. '''plots word cloud image of most common words in dataset.
''' '''
print('# preparing word cloud of 200 most common words...') print('# preparing word cloud of 200 most common words...')
print() print()
# load new data set # load new data set
file = '..\\data\\cleaned_data_set_without_header.csv' file = '..\\data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(file, df_dataset = pd.read_csv(file,
delimiter='|', delimiter='|',
header=None, header=None,
index_col=None, index_col=None,
engine='python', engine='python',
usecols=[1,2], usecols=[1,2],
#nrows=100, #nrows=100,
quoting=csv.QUOTE_NONNUMERIC, quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'') quotechar='\'')
corpus = df_dataset[1] + '. ' + df_dataset[2] corpus = df_dataset[1] + '. ' + df_dataset[2]
stemming = True stemming = True
rel_freq = True rel_freq = True
# find most common words in dataset # find most common words in dataset
extracted_words = BagOfWords.extract_all_words(corpus, stemming) extracted_words = BagOfWords.extract_all_words(corpus, stemming)
vocab = BagOfWords.make_vocab(extracted_words, stemming) vocab = BagOfWords.make_vocab(extracted_words, stemming)
matrix = BagOfWords.make_matrix(extracted_words, vocab, matrix = BagOfWords.make_matrix(extracted_words, vocab,
rel_freq, stemming) rel_freq, stemming)
dict = BagOfWords.make_dict_common_words(matrix, 200, dict = BagOfWords.make_dict_common_words(matrix, 200,
rel_freq, stemming) rel_freq, stemming)
# save dict object # save dict object
with open('../obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f: with open('../obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL) pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
wordcloud = WordCloud(background_color='white', wordcloud = WordCloud(background_color='white',
width=2400, width=2400,
height=1200, height=1200,
scale=2, scale=2,
# true if bigram: # true if bigram:
collocations=False)\ collocations=False)\
.generate_from_frequencies(dict) .generate_from_frequencies(dict)
# display generated image # display generated image
plt.imshow(wordcloud, interpolation='bilinear') plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off") plt.axis("off")
plt.savefig('visualization\\WordCloud_{}.eps' plt.savefig('visualization\\WordCloud_{}.eps'
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.savefig('visualization\\WordCloud_{}.png' plt.savefig('visualization\\WordCloud_{}.png'
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.show() plt.show()
def plot_histogram_companies(): def plot_histogram_companies():
'''plots diagram of company names distribution '''plots diagram of company names distribution
count_names: list of company counts(int) count_names: list of company counts(int)
x-axis: number of mentions of the company x-axis: number of mentions of the company
y-axis: frequency y-axis: frequency
''' '''
print('# preparing histogram of company mentions...') print('# preparing histogram of company mentions...')
print() print()
# # read data set # # read data set
# file = '..\\data\\cleaned_data_set_without_header.csv' # file = '..\\data\\cleaned_data_set_without_header.csv'
# df = pd.read_csv(file, # df = pd.read_csv(file,
# delimiter='|', # delimiter='|',
# header=None, # header=None,
# index_col=None, # index_col=None,
# engine='python', # engine='python',
# usecols=[1,2], # usecols=[1,2],
# #nrows=10, # #nrows=10,
# quoting=csv.QUOTE_NONNUMERIC, # quoting=csv.QUOTE_NONNUMERIC,
# quotechar='\'') # quotechar='\'')
# # # only articles with label==1 # # # only articles with label==1
# # df_hits = df[df['Label'] == 1] # # df_hits = df[df['Label'] == 1]
# # texts = df_hits['Title'] + '. ' + df_hits['Text'] # # texts = df_hits['Title'] + '. ' + df_hits['Text']
# texts = df[1] + '. ' + df[2] # texts = df[1] + '. ' + df[2]
# # list: count articles with company names # # list: count articles with company names
# count_names = NER.count_companies(texts) # count_names = NER.count_companies(texts)
# # sort list in descending order # # sort list in descending order
# count_names.sort(reverse=True) # count_names.sort(reverse=True)
# # convert list to array # # convert list to array
# names = np.asarray(count_names) # names = np.asarray(count_names)
# load pickle object # load pickle object
with open('../obj/dict_organizations.pkl', 'rb') as input: with open('../obj/dict_organizations.pkl', 'rb') as input:
dict = pickle.load(input) dict = pickle.load(input)
# make list of dict's values # make list of dict's values
count_companies = list(dict.values()) count_companies = list(dict.values())
# sort list in descending order # sort list in descending order
count_companies.sort(reverse=True) count_companies.sort(reverse=True)
# convert list to array # convert list to array
names = np.asarray(count_companies) names = np.asarray(count_companies)
plt.xlabel('Count of articles that mention a company') plt.xlabel('Count of articles that mention a special company')
# Number of companies with this number of mentions # Number of companies with this number of mentions
plt.ylabel('Number of companies with this number of articles') plt.ylabel('Number of companies with this number of articles')
num_bins = 400 num_bins = 300
n, bins, patches = plt.hist(names, num_bins, n, bins, patches = plt.hist(names, num_bins,
facecolor='darkred', alpha=0.5) color='darkred', alpha=1)
plt.axis([1, 14, 0, 14000]) plt.axis([1, 14, 0, 14000])
# format axis labels for thousends (e.g. '10,000') # format axis labels for thousends (e.g. '10,000')
plt.gca().yaxis.set_major_formatter(matplotlib.ticker\ plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
.FuncFormatter(lambda x, p: format(int(x), ','))) .FuncFormatter(lambda x, p: format(int(x), ',')))
# save to file # save to file
plt.savefig('..\\visualization\\NER_{}.eps' plt.savefig('..\\visualization\\NER_{}.eps'
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.savefig('..\\visualization\\NER_{}.png' plt.savefig('..\\visualization\\NER_{}.png'
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.show() plt.show()
def plot_histogram_text_lengths(): def plot_histogram_text_lengths():
'''plot histogram of article length '''plot histogram of article length
x-axis: number of characters in article (without headline) x-axis: number of characters in article (without headline)
y-axis: frequency y-axis: frequency
''' '''
print('# preparing histogram of text lengths...') print('# preparing histogram of text lengths...')
print() print()
# read data set # read data set
filepath = '..\\data\\cleaned_data_set_without_header.csv' filepath = '..\\data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(filepath, df_dataset = pd.read_csv(filepath,
delimiter='|', delimiter='|',
header=None, header=None,
index_col=None, index_col=None,
engine='python', engine='python',
usecols=[2], usecols=[2],
#nrows=100, #nrows=100,
quoting=csv.QUOTE_NONNUMERIC, quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'') quotechar='\'')
# consider only Text, not Headline # consider only Text, not Headline
texts = df_dataset[2] texts = df_dataset[2]
# count characters in articles # count characters in articles
print('# counting characters in articles...') print('# counting characters in articles...')
print() print()
count_chars = [] count_chars = []
for text in texts: for text in texts:
count_chars.append(len(text)) count_chars.append(len(text))
# average of number of characters # average of number of characters
av = int(sum(count_chars) / len(count_chars)) av = int(sum(count_chars) / len(count_chars))
print('# average length of news articles is {} characters'.format(av)) print('# average length of news articles is {} characters'.format(av))
print() print()
# sort list in descending order # sort list in descending order
count_chars.sort(reverse=True) count_chars.sort(reverse=True)
# convert list to array # convert list to array
names = np.asarray(count_chars) names = np.asarray(count_chars)
# plt.title('Length of News Articles') # plt.title('Length of News Articles')
plt.xlabel('Number of characters in article') plt.xlabel('Number of characters in the article')
plt.ylabel('Frequency') plt.ylabel('Frequency')
# number of vertical bins # number of vertical bins
num_bins = 200 num_bins = 200
n, bins, patches = plt.hist(names, num_bins, n, bins, patches = plt.hist(names, num_bins,
facecolor='darkslategrey', alpha=0.5) facecolor='darkslategrey', alpha=0.5)
# [xmin, xmax, ymin, ymax] of axis # [xmin, xmax, ymin, ymax] of axis
plt.axis([300,10000,0,500]) plt.axis([300,10000,0,500])
# format axis labels for thousends (e.g. '10,000') # format axis labels for thousends (e.g. '10,000')
plt.gca().xaxis.set_major_formatter(matplotlib.ticker\ plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
.FuncFormatter(lambda x, p: format(int(x), ','))) .FuncFormatter(lambda x, p: format(int(x), ',')))
# save plot # save plot
plt.savefig('..\\visualization\\TextLength_{}.eps'\ plt.savefig('..\\visualization\\TextLength_{}.eps'\
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.savefig('..\\visualization\\TextLength_{}.png'\ plt.savefig('..\\visualization\\TextLength_{}.png'\
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.show() plt.show()
def plot_pie_chart_of_sites(): def plot_pie_chart_of_sites():
print('# preparing pie chart of news article sites...') print('# preparing pie chart of news article sites...')
print() print()
# load data set # load data set
filepath = '..\\data\\cleaned_data_set_without_header.csv' filepath = '..\\data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(filepath, df_dataset = pd.read_csv(filepath,
delimiter='|', delimiter='|',
header=None, header=None,
#usecols=[3], #column 'Site' #usecols=[3], #column 'Site'
index_col=None, index_col=None,
engine='python', engine='python',
#nrows=10, #nrows=10,
quoting=csv.QUOTE_NONNUMERIC, quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'') quotechar='\'')
# find all different sites, group by 'Site' # find all different sites, group by 'Site'
df_counts = df_dataset.groupby(3).count() df_counts = df_dataset.groupby(3).count()
# count occurences of each site, count different 'Url's # count occurences of each site, count different 'Url's
df_counts = df_counts.sort_values([5], ascending=False) df_counts = df_counts.sort_values([5], ascending=False)
fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal")) fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))
data = list(df_counts[5]) data = list(df_counts[5])
# legend labels # legend labels
labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)', labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)'] 'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']
wedges, texts, autotexts = ax.pie(data, autopct='%1.0f%%', pctdistance=2.0, wedges, texts, autotexts = ax.pie(data, autopct='%1.0f%%', pctdistance=2.0,
startangle=90, textprops=dict(color="w")) startangle=90, textprops=dict(color="w"))
ax.legend(wedges, labels, ax.legend(wedges, labels,
#title="News Article Sources", #title="News Article Sources",
loc="center left", loc="center left",
bbox_to_anchor=(1, 0, 0.5, 1), bbox_to_anchor=(1, 0, 0.5, 1),
prop={'size': 10}, prop={'size': 10},
fontsize=10) fontsize=10)
plt.setp(autotexts, size=8, weight="bold") plt.setp(autotexts, size=8, weight="bold")
plt.show() plt.show()
plt.savefig('..\\visualization\\Sites_{}.pdf'.format(VisualizerNews.datestring)) plt.savefig('..\\visualization\\Sites_{}.pdf'.format(VisualizerNews.datestring))
plt.savefig('..\\visualization\\Sites_{}.pgf'.format(VisualizerNews.datestring)) plt.savefig('..\\visualization\\Sites_{}.pgf'.format(VisualizerNews.datestring))
def plot_hist_most_common_words(n_commons = 10): def plot_hist_most_common_words(n_commons = 10):
print('# preparing histogram of most common words...') print('# preparing histogram of most common words...')
print() print()
# # load data set # load data set
# filepath = '..\\data\\cleaned_data_set_without_header.csv' df = pd.read_csv('../data/interactive_labeling_round_16_temp.csv',
# df_dataset = pd.read_csv(filepath, sep='|',
# delimiter='|', usecols=range(1,13), # drop first column 'unnamed'
# header=None, encoding='utf-8',
# usecols=[1,2], quoting=csv.QUOTE_NONNUMERIC,
# index_col=None, quotechar='\'')
# engine='python',
# #nrows=1000,
# quoting=csv.QUOTE_NONNUMERIC,
# quotechar='\'')
# corpus = df_dataset[1] + '. ' + df_dataset[2] # select only labeled articles
df = df.loc[df['Label'] != -1].reset_index(drop=True)
# stemming = False corpus = df['Title'] + '. ' + df['Text']
# rel_freq = True
# # find most common words in dataset stemming = False
# extracted_words = BagOfWords.extract_all_words(corpus, stemming) rel_freq = True
# vocab = BagOfWords.make_vocab(extracted_words, stemming)
# matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
# stemming)
# dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
# stemming)
# # save dict object
# with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
# pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
# load pickle object # find most common words in dataset
with open ('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i: extracted_words = BagOfWords.extract_all_words(corpus, stemming)
dict = pickle.load(i) vocab = BagOfWords.make_vocab(extracted_words, stemming)
# sort dict by value matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\ stemming)
reverse=True)) dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
# return n higest values as dict (word => count) stemming)
n_dict = {} # save dict object
#with open('obj/'+ 'dict_10_most_common_words_merger' + '.pkl', 'wb') as f:
# pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
for i in range(n_commons): # load pickle object
# next highest score #with open ('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
next_highest = o_dict.popitem(last=False) # dict = pickle.load(i)
n_dict[next_highest[0]] = next_highest[1]
# sort dict by value
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
reverse=True))
# return n higest values as dict (word => count)
n_dict = {}
#plt.xlabel('Most common words in textual corpus') for i in range(n_commons):
plt.ylabel('Relative frequency') # next highest score
next_highest = o_dict.popitem(last=False)
n_dict[next_highest[0]] = next_highest[1]
labels = list(n_dict.keys()) #plt.xlabel('Most common words in textual corpus')
numbers = list(n_dict.values()) plt.ylabel('Relative frequency')
nbars = n_commons
plt.bar(np.arange(nbars),
height=numbers,
tick_label=labels,
facecolor='royalblue')
plt.savefig('..\\visualization\\10_most_common_words_{}.eps'
.format(VisualizerNews.datestring))
plt.savefig('..\\visualization\\10_most_common_words_{}.png'
.format(VisualizerNews.datestring))
plt.show()
def plot_hist_num_comp_per_art(): labels = list(n_dict.keys())
''' open pkl file of dict, plot histogram of number of different numbers = list(n_dict.values())
company names per article. nbars = n_commons
''' plt.bar(np.arange(nbars),
# list of number of different companies per article (int) height=numbers,
list = [] tick_label=labels,
with open('../obj/num_mentions_companies.pkl', 'rb') as input: facecolor='royalblue')
list = pickle.load(input)
# sort list in descending order plt.savefig('..\\visualization\\10_most_common_words_mergers_{}.eps'
list.sort(reverse=True) .format(VisualizerNews.datestring))
plt.savefig('..\\visualization\\10_most_common_words_mergers_{}.png'
.format(VisualizerNews.datestring))
plt.show()
# convert list to array def plot_hist_num_comp_per_art():
names = np.asarray(list) ''' open pkl file of dict, plot histogram of number of different
company names per article.
'''
# list of number of different companies per article (int)
list = []
with open('../obj/num_mentions_companies.pkl', 'rb') as input:
list = pickle.load(input)
plt.xlabel('Number of different company names in news article') # sort list in descending order
plt.ylabel('Number of articles with this number of company names') list.sort(reverse=True)
num_bins = 100
n, bins, patches = plt.hist(names, num_bins,
facecolor='darkgreen', alpha=0.5)
plt.axis([0, 30, 0, 1500])
# format axis labels for thousends (e.g. '10,000') # convert list to array
plt.gca().yaxis.set_major_formatter(matplotlib.ticker\ names = np.asarray(list)
.FuncFormatter(lambda x, p: format(int(x), ',')))
# save to file plt.xlabel('Number of different company names in the news article')
plt.savefig('..\\visualization\\NER_2_{}.eps' plt.ylabel('Number of articles with this number of company names')
.format(VisualizerNews.datestring)) num_bins = 100
plt.savefig('..\\visualization\\NER_2_{}.png' n, bins, patches = plt.hist(names, num_bins,
.format(VisualizerNews.datestring)) facecolor='darkgreen', alpha=0.5)
plt.show() plt.axis([0, 30, 0, 1500])
# format axis labels for thousends (e.g. '10,000')
plt.gca().yaxis.set_major_formatter(matplotlib.ticker\
.FuncFormatter(lambda x, p: format(int(x), ',')))
# save to file
plt.savefig('..\\visualization\\NER_2_{}.eps'
.format(VisualizerNews.datestring))
plt.savefig('..\\visualization\\NER_2_{}.png'
.format(VisualizerNews.datestring))
plt.show()
if __name__ == '__main__': if __name__ == '__main__':
VisualizerNews.plot_wordcloud_dataset() # VisualizerNews.plot_wordcloud_dataset()
# VisualizerNews.plot_histogram_companies() VisualizerNews.plot_histogram_companies()
# VisualizerNews.plot_hist_num_comp_per_art() # VisualizerNews.plot_hist_num_comp_per_art()
# VisualizerNews.plot_histogram_text_lengths() # VisualizerNews.plot_histogram_text_lengths()
# VisualizerNews.plot_pie_chart_of_sites() # VisualizerNews.plot_pie_chart_of_sites()
# VisualizerNews.plot_hist_most_common_words(10) # VisualizerNews.plot_hist_most_common_words(10)

View File

@ -1,128 +0,0 @@
from BagOfWords import BagOfWords
import csv
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Experiment: embed a small sample of manually labeled news articles with
# Doc2Vec and classify the inferred document vectors with a Multinomial
# Naive Bayes classifier. Uses only the first 20 labeled articles as a
# quick smoke test of the pipeline (15 train / 5 test).

# metric accumulators (previously appended to without ever being
# initialized, which raised a NameError)
recall_scores = []
precision_scores = []
f1_scores = []

# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                 sep='|',
                 usecols=range(1,13), # drop first column 'unnamed'
                 encoding='utf-8',
                 quoting=csv.QUOTE_NONNUMERIC,
                 quotechar='\'')

# keep only manually labeled articles (Label != -1), first 100 rows
dataset = df.loc[df['Label'] != -1][:100].reset_index(drop=True)

# tiny fixed train/test split
train = dataset[:15]
test = dataset[15:20].reset_index(drop=True)

classifier = MultinomialNB(alpha=1.0e-10,
                           fit_prior=False,
                           class_prior=None)

def make_tagged_document(row):
    """Wrap one article row in a gensim TaggedDocument.

    `words` is the tokenized article text; the tag list holds the article's
    manual label. (Typical gensim practice is to use a unique integer id as
    the only tag, which is also the most memory-efficient; here the label is
    used as the tag so it can be recovered from the tagged documents below.)
    """
    return TaggedDocument(words=BagOfWords.extract_words(row['Text']),
                          tags=[row['Label']])

tagged_train_data = train.apply(lambda row: make_tagged_document(row), axis=1)
print(tagged_train_data[0])
tagged_test_data = test.apply(lambda row: make_tagged_document(row), axis=1)
print(tagged_test_data[0])

# negative=0 disables negative sampling
model = Doc2Vec(vector_size=100,
                min_count=20,
                epochs=40,
                negative=0)
model.build_vocab(tagged_train_data)
model.train(tagged_train_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

# recover the labels from the document tags
y_train = np.array([doc.tags[0] for doc in tagged_train_data])
y_test = np.array([doc.tags[0] for doc in tagged_test_data])

# infer one fixed-size vector per document
X_train = [model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
X_test = [model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]

print(X_test)
print(y_test)
print(X_train)
print(y_train)

# reshape data to 2-d arrays of shape (n_samples, vector_size)
X_train = np.array(X_train)
X_test = np.array(X_test)
X_train.shape
X_test.shape

# MultinomialNB requires non-negative input, but Doc2Vec vectors contain
# negative components -> shift both sets by the global minimum so that
# fit()/predict() do not raise "ValueError: Input X must be non-negative"
offset = min(X_train.min(), X_test.min())
if offset < 0:
    X_train = X_train - offset
    X_test = X_test - offset

# fit classifier
classifier.fit(X_train, y_train)

# predict class
predictions_train = classifier.predict(X_train)
predictions_test = classifier.predict(X_test)

# print and store metrics
rec = recall_score(y_test, predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y_test, predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))

##########################

# probability estimates for the test vectors
# (previously passed the undefined name 'testing_data')
class_probs = classifier.predict_proba(X_test)

# number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_

# classes in order used
classes = classifier.classes_

# print classes and vector of class estimates
print (recall_scores, precision_scores, f1_scores, class_probs)

View File

@ -1,131 +0,0 @@
from BagOfWords import BagOfWords
import csv
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Experiment: embed all manually labeled news articles (title + text) with
# Doc2Vec and classify the inferred vectors with Multinomial Naive Bayes,
# using a 90/10 train/test split.

# metric accumulators (previously appended to without ever being
# initialized, which raised a NameError)
recall_scores = []
precision_scores = []
f1_scores = []

# read current data set from csv
df = pd.read_csv('../data/interactive_labeling_round_11.csv',
                 sep='|',
                 usecols=range(1,13), # drop first column 'unnamed'
                 encoding='utf-8',
                 quoting=csv.QUOTE_NONNUMERIC,
                 quotechar='\'')

# keep only manually labeled articles (Label != -1)
dataset = df.loc[df['Label'] != -1].reset_index(drop=True)

# input: concatenated title and text; target: manual label
X = dataset['Title'] + '. ' + dataset['Text']
y = dataset['Label']

classifier = MultinomialNB(alpha=1.0e-10,
                           fit_prior=False,
                           class_prior=None)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)

def read_corpus(data, tokens_only=False):
    """Tokenize each text in `data`.

    Returns plain token lists when `tokens_only` is True; otherwise wraps
    each token list in a TaggedDocument tagged with its position index, as
    required for Doc2Vec training.
    """
    list_of_lists = []
    for i, text in enumerate(data):
        if tokens_only:
            list_of_lists.append(BagOfWords.extract_words(text))
        else:
            # For training data, add tags
            list_of_lists.append(gensim.models.doc2vec.TaggedDocument(BagOfWords.extract_words(text), [i]))
    return list_of_lists

tagged_train_data = read_corpus(X_train, tokens_only=False)
print('tagged_train_data[0]:')
print(tagged_train_data[0])
tagged_test_data = read_corpus(X_test, tokens_only=False)
print('tagged_test_data[0]:')
print(tagged_test_data[0])

# negative=0 disables negative sampling
model = Doc2Vec(vector_size=100,
                min_count=20,
                epochs=40,
                negative=0)
model.build_vocab(tagged_train_data)
model.train(tagged_train_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

# infer one fixed-size vector per document; stack into 2-d arrays of
# shape (n_samples, vector_size)
X_train = [model.infer_vector(doc.words, steps=20) for doc in tagged_train_data]
X_test = [model.infer_vector(doc.words, steps=20) for doc in tagged_test_data]
X_train = np.vstack(X_train)
X_test = np.vstack(X_test)

print('X_test:')
print(X_test)
print('y_test:')
print(y_test)
print('X_train:')
print(X_train)
print('y_train:')
print(y_train)

# MultinomialNB requires non-negative input, but Doc2Vec vectors contain
# negative components; previously this raised
# "ValueError: Input X must be non-negative" in fit().
# Shift both sets by the global minimum to make all features non-negative.
offset = min(X_train.min(), X_test.min())
if offset < 0:
    X_train = X_train - offset
    X_test = X_test - offset

# fit classifier
classifier.fit(X_train, y_train)

# predict class
predictions_train = classifier.predict(X_train)
predictions_test = classifier.predict(X_test)

# print and store metrics
rec = recall_score(y_test, predictions_test, average='weighted')
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y_test, predictions_test, average='weighted')
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))

##########################

# probability estimates for the test vectors
# (previously passed the undefined name 'testing_data')
class_probs = classifier.predict_proba(X_test)

# number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_

# classes in order used
classes = classifier.classes_

# print classes and vector of class estimates
print (recall_scores, precision_scores, f1_scores, class_probs)

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 61 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 83 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

Before

Width:  |  Height:  |  Size: 26 KiB

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB