interactive labeling round 10

This commit is contained in:
annealias 2019-02-19 14:46:12 +01:00
parent 943c24cef0
commit 213bb148de
6 changed files with 22714 additions and 41 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

View File

@ -15,7 +15,7 @@
"In each iteration we...\n", "In each iteration we...\n",
"- check/correct the next 100 article labels manually.\n", "- check/correct the next 100 article labels manually.\n",
" \n", " \n",
"- apply the Multinomial Naive Bayes classification algorithm which returns a vector class_probs $(K_1, K_2, ... , K_6)$ per sample with the probabilities $K_i$ per class $i$. Estimated class labels are adopted automatically if the estimated probability $K_x > 0.99$ with $x \\in \\{1,...,6\\}$.\n", "- apply the Multinomial Naive Bayes classification algorithm which returns a vector class_probs $(K_1, K_2, ... , K_6)$ per sample with the probabilities $K_i$ per class $i$. Estimated class labels are checked if the estimated probability $K_x < 0.99$ with $x \\in \\{1,...,6\\}$.\n",
" \n", " \n",
"Please note: User instructions are written in upper-case.\n", "Please note: User instructions are written in upper-case.\n",
"__________\n", "__________\n",
@ -24,7 +24,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -65,7 +65,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -89,7 +89,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -140,7 +140,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -176,7 +176,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -207,7 +207,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 15,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -216,7 +216,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -247,7 +247,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -265,11 +265,135 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"metadata": { "metadata": {
"scrolled": true "scrolled": true
}, },
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Audi\n",
"China Development Bank\n",
"FCC\n",
"Commercial Bank of China\n",
"Commonwealth Bank of Australia\n",
"Westpac Banking\n",
"BT\n",
"Cabinet\n",
"Carlyle\n",
"United Technologies\n",
"World Trade Organization\n",
"Ford\n",
"Alitalia\n",
"Singapore Airlines\n",
"Hitachi\n",
"Chevron\n",
"CBI\n",
"BMO Capital Markets\n",
"Anheuser-Busch\n",
"Prudential\n",
"Snap\n",
"Sistema\n",
"BBC\n",
"PSA\n",
"GM\n",
"RBS\n",
"Barrick Gold\n",
"Potash\n",
"CFIUS\n",
"Noble\n",
"Saint-Gobain\n",
"Anthem\n",
"Aetna\n",
"Prudential Financial\n",
"Daiwa Securities\n",
"Volvo\n",
"Raiffeisen\n",
"Aviva\n",
"Asahi\n",
"PPG Industries\n",
"Vale\n",
"Mitsui &\n",
"Hudson 's Bay\n",
"Unicredit\n",
"Pioneer\n",
"Infosys\n",
"Hiscox\n",
"Westinghouse\n",
"ExxonMobil\n",
"Viacom\n",
"Paramount\n",
"Canada Pension Plan Investment Board\n",
"Permira\n",
"RBC\n",
"AGM\n",
"PPG\n",
"Akzo\n",
"Anglo American\n",
"National Guard\n",
"Reckitt Benckiser\n",
"LVMH\n",
"Centrica\n",
"Gucci\n",
"United Airlines\n",
"Alphabet\n",
"HNA\n",
"FCA\n",
"WTO\n",
"Meredith\n",
"Netflix\n",
"CIC\n",
"Exxon\n",
"WSJ\n",
"Honda Motor\n",
"FDA\n",
"Banco BPM\n",
"BlackBerry\n",
"Ford Motor\n",
"BC Partners\n",
"Kinder Morgan Canada\n",
"McDonalds\n",
"Coca-Cola\n",
"Mitsubishi\n",
"Baker Hughes\n",
"CFTC\n",
"Home Capital\n",
"Nestle\n",
"ConocoPhillips\n",
"Cenovus Energy\n",
"Diageo\n",
"Le Maire\n",
"Bayer\n",
"Cardinal Health\n",
"Time\n",
"ArcelorMittal\n",
"Communist Party\n",
"G Crosse\n",
"Bundesbank\n",
"PNC Financial Services\n",
"KBC\n",
"FOMC\n",
"McKinsey\n",
"RBC Capital Markets\n",
"Elliott Management\n",
"Danone\n",
"Uber\n",
"Pfizer\n",
"Enel\n",
"Western Digital\n",
"Delta Air Lines\n",
"Toyota Motor\n",
"ING\n",
"Ahold Delhaize\n",
"ADP\n",
"Moodys Investors Service\n",
"Moodys Analytics\n",
"WPP\n"
]
}
],
"source": [ "source": [
"# OPTIONAL:\n", "# OPTIONAL:\n",
"# print organizations that are mentioned 3 times and therefore limited\n", "# print organizations that are mentioned 3 times and therefore limited\n",
@ -287,7 +411,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -300,9 +424,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 11,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"This round number: 10\n"
]
}
],
"source": [ "source": [
"# increment round number\n", "# increment round number\n",
"m += 1\n", "m += 1\n",
@ -341,6 +473,27 @@
" return list_arts" " return list_arts"
] ]
}, },
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8108"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pick articles with P < 0.99:\n",
"len(df.loc[(df['Label'] == -1) & (df['Estimated'] < 0.99)])"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -373,9 +526,19 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of manual labels in round no. 10:\n",
"0:0, 1:0, 2:0\n",
"Number of articles to be corrected in this round: 0\n"
]
}
],
"source": [ "source": [
"print('Number of manual labels in round no. {}:'.format(m))\n", "print('Number of manual labels in round no. {}:'.format(m))\n",
"print('0:{}, 1:{}, 2:{}'.format(len(df.loc[(df['Label'] == 0) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 1) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 2) & (df['Round'] == m)])))\n", "print('0:{}, 1:{}, 2:{}'.format(len(df.loc[(df['Label'] == 0) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 1) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 2) & (df['Round'] == m)])))\n",
@ -823,7 +986,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": null,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -847,13 +1010,7 @@
"# MNB: transform testing data to matrix...\n", "# MNB: transform testing data to matrix...\n",
"\n", "\n",
"# BOW: extracting all words from articles...\n", "# BOW: extracting all words from articles...\n",
"\n", "\n"
"# BOW: calculating matrix...\n",
"\n",
"# BOW: calculating frequencies...\n",
"\n",
"# MNB: ending multinomial naive bayes\n",
"Wall time: 1h 35min 18s\n"
] ]
} }
], ],
@ -965,7 +1122,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"We label each article with class $j$, if its estimated probability for class $j$ is higher than our threshold:" "We annotate each article's estimated class with its probability in columns 'Estimated' and 'Probability':"
] ]
}, },
{ {
@ -974,24 +1131,22 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# only labels with this minimum probability are adopted\n",
"threshold = ?\n",
"# dict for counting estimated labels\n", "# dict for counting estimated labels\n",
"estimated_labels = {0:0, 1:0, 2:0}\n", "#estimated_labels = {0:0, 1:0, 2:0}\n",
"\n", "\n",
"# series of indices of recently estimated articles \n", "# series of indices of recently estimated articles \n",
"indices_estimated = df.loc[df['Label'] == -1, 'Index'].tolist()\n", "#indices_estimated = df.loc[df['Label'] == -1, 'Index'].tolist()\n",
"\n", "\n",
"# annotate estimated probability for every instance\n", "# annotate estimated probability for every instance\n",
"# for every row i and every element j in row i\n", "# for every row i and every element j in row i\n",
"for (i,j), value in np.ndenumerate(class_probs):\n", "for (i,j), value in np.ndenumerate(class_probs):\n",
" index = indices_estimated[i]\n", " #index = indices_estimated[i]\n",
" # save estimated label\n", " # save estimated label\n",
" df.loc[index, 'Estimated'] = classes[j]\n", " df.loc[index, 'Estimated'] = classes[j]\n",
" # annotate probability\n", " # annotate probability\n",
" df.loc[index, 'Probability'] = value\n", " df.loc[index, 'Probability'] = value\n",
" # count labels\n", " # count labels\n",
" estimated_labels[int(classes[j])] += 1" " #estimated_labels[int(classes[j])] += 1"
] ]
}, },
{ {
@ -1000,8 +1155,8 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"print('Number of auto-labeled samples in round {}: {}'.format(m, sum(estimated_labels.values())))\n", "#print('Number of auto-labeled samples in round {}: {}'.format(m, sum(estimated_labels.values())))\n",
"print('Estimated labels: {}'.format(estimated_labels))" "#print('Estimated labels: {}'.format(estimated_labels))"
] ]
}, },
{ {

File diff suppressed because one or more lines are too long

View File

@ -54,7 +54,7 @@ class LabelingPlotter():
def plot_cumulative(): def plot_cumulative():
# load pickle object # load pickle object
with open('../obj/array_class_probs.pkl', 'rb') as input: with open('../obj/array_class_probs_round_9.pkl', 'rb') as input:
list = pickle.load(input) list = pickle.load(input)
# sort list in descending order # sort list in descending order
@ -70,23 +70,23 @@ class LabelingPlotter():
fig, ax = plt.subplots(figsize=(8, 4)) fig, ax = plt.subplots(figsize=(8, 4))
# plot the cumulative histogram # plot the cumulative histogram
n, bins, patches = ax.hist(probas, n_bins, normed=1, histtype='step', n, bins, patches = ax.hist(probas, n_bins, density=1, histtype='step',
cumulative=True, facecolor='darkred') cumulative=True, facecolor='darkred')
# manipulate # manipulate
vals = ax.get_yticks() #vals = ax.get_yticks()
ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals]) #ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])
ax.grid(True) ax.grid(True)
ax.legend(loc='right') ax.legend(loc='right')
#ax.set_title('Cumulative distribution of highest estimated probability') #ax.set_title('Cumulative distribution of highest estimated probability')
ax.set_xlabel('Highest estimated probability') ax.set_xlabel('Highest estimated probability')
ax.set_ylabel('Percentage of articles with this highest estimated probability') ax.set_ylabel('Fraction of articles with this highest estimated probability')
plt.axis([0.5, 0.99, 0, 0.006]) #plt.axis([0.5, 0.99, 0, 0.006])
ax.set_xbound(lower=0.5, upper=0.99) #ax.set_xbound(lower=0.5, upper=0.99)
plt.show() plt.show()
if __name__ == '__main__': if __name__ == '__main__':
LabelingPlotter.plot_labeling_rounds() LabelingPlotter.plot_cumulative()