interactive labeling round 10
parent 943c24cef0
commit 213bb148de
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -15,7 +15,7 @@
 "In each iteration we...\n",
 "- check/correct the next 100 article labels manually.\n",
 " \n",
-"- apply the Multinomial Naive Bayes classification algorithm which returns a vector class_probs $(K_1, K_2, ... , K_6)$ per sample with the probabilities $K_i$ per class $i$. Estimated class labels are adopted automatically, if the estimated probability $K_x > 0.99$ with $x \\in {1,...,6}$.\n",
+"- apply the Multinomial Naive Bayes classification algorithm which returns a vector class_probs $(K_1, K_2, ... , K_6)$ per sample with the probabilities $K_i$ per class $i$. Estimated class labels are checked, if the estimated probability $K_x < 0.99$ with $x \\in {1,...,6}$.\n",
 " \n",
 "Please note: User instructions are written in upper-case.\n",
 "__________\n",
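The markdown cell edited above describes the per-round procedure: a Multinomial Naive Bayes model produces class_probs for each article, and after this change only articles whose highest class probability stays below 0.99 are sent back for manual checking. The training code itself is not part of this hunk; the following is a minimal sketch of that thresholding step with scikit-learn, where `texts_labeled`, `labels`, `texts_unlabeled` and the helper name are illustrative placeholders rather than the notebook's own identifiers.

```python
# Minimal sketch (assumption: a scikit-learn bag-of-words + MultinomialNB setup;
# all variable and function names here are placeholders, not the notebook's).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


def split_by_confidence(texts_labeled, labels, texts_unlabeled, threshold=0.99):
    """Return estimated labels plus a mask of samples confident enough to skip review."""
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(texts_labeled)   # bag-of-words matrix of labeled articles
    X_new = vectorizer.transform(texts_unlabeled)        # same vocabulary for unlabeled articles

    clf = MultinomialNB().fit(X_train, labels)
    class_probs = clf.predict_proba(X_new)               # one row (K_1, ..., K_6) per article

    estimated = clf.classes_[class_probs.argmax(axis=1)]  # most probable class per article
    confident = class_probs.max(axis=1) > threshold       # below 0.99 -> manual check next round
    return estimated, confident
```

Articles for which `confident` is False would be the ones queued for the next 100-article manual pass.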
@@ -24,7 +24,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 1,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -65,7 +65,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -89,7 +89,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": 3,
 "metadata": {},
 "outputs": [
 {
@@ -140,7 +140,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 4,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -176,7 +176,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 5,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -207,7 +207,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 6,
+"execution_count": 15,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -216,7 +216,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 16,
 "metadata": {},
 "outputs": [
 {
@@ -247,7 +247,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 8,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -265,11 +265,135 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 9,
 "metadata": {
 "scrolled": true
 },
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Audi\n",
+"China Development Bank\n",
+"FCC\n",
+"Commercial Bank of China\n",
+"Commonwealth Bank of Australia\n",
+"Westpac Banking\n",
+"BT\n",
+"Cabinet\n",
+"Carlyle\n",
+"United Technologies\n",
+"World Trade Organization\n",
+"Ford\n",
+"Alitalia\n",
+"Singapore Airlines\n",
+"Hitachi\n",
+"Chevron\n",
+"CBI\n",
+"BMO Capital Markets\n",
+"Anheuser-Busch\n",
+"Prudential\n",
+"Snap\n",
+"Sistema\n",
+"BBC\n",
+"PSA\n",
+"GM\n",
+"RBS\n",
+"Barrick Gold\n",
+"Potash\n",
+"CFIUS\n",
+"Noble\n",
+"Saint-Gobain\n",
+"Anthem\n",
+"Aetna\n",
+"Prudential Financial\n",
+"Daiwa Securities\n",
+"Volvo\n",
+"Raiffeisen\n",
+"Aviva\n",
+"Asahi\n",
+"PPG Industries\n",
+"Vale\n",
+"Mitsui &\n",
+"Hudson 's Bay\n",
+"Unicredit\n",
+"Pioneer\n",
+"Infosys\n",
+"Hiscox\n",
+"Westinghouse\n",
+"ExxonMobil\n",
+"Viacom\n",
+"Paramount\n",
+"Canada Pension Plan Investment Board\n",
+"Permira\n",
+"RBC\n",
+"AGM\n",
+"PPG\n",
+"Akzo\n",
+"Anglo American\n",
+"National Guard\n",
+"Reckitt Benckiser\n",
+"LVMH\n",
+"Centrica\n",
+"Gucci\n",
+"United Airlines\n",
+"Alphabet\n",
+"HNA\n",
+"FCA\n",
+"WTO\n",
+"Meredith\n",
+"Netflix\n",
+"CIC\n",
+"Exxon\n",
+"WSJ\n",
+"Honda Motor\n",
+"FDA\n",
+"Banco BPM\n",
+"BlackBerry\n",
+"Ford Motor\n",
+"BC Partners\n",
+"Kinder Morgan Canada\n",
+"McDonalds\n",
+"Coca-Cola\n",
+"Mitsubishi\n",
+"Baker Hughes\n",
+"CFTC\n",
+"Home Capital\n",
+"Nestle\n",
+"ConocoPhillips\n",
+"Cenovus Energy\n",
+"Diageo\n",
+"Le Maire\n",
+"Bayer\n",
+"Cardinal Health\n",
+"Time\n",
+"ArcelorMittal\n",
+"Communist Party\n",
+"G Crosse\n",
+"Bundesbank\n",
+"PNC Financial Services\n",
+"KBC\n",
+"FOMC\n",
+"McKinsey\n",
+"RBC Capital Markets\n",
+"Elliott Management\n",
+"Danone\n",
+"Uber\n",
+"Pfizer\n",
+"Enel\n",
+"Western Digital\n",
+"Delta Air Lines\n",
+"Toyota Motor\n",
+"ING\n",
+"Ahold Delhaize\n",
+"ADP\n",
+"Moodys Investors Service\n",
+"Moodys Analytics\n",
+"WPP\n"
+]
+}
+],
 "source": [
 "# OPTIONAL:\n",
 "# print organizations that are mentioned 3 times and therefore limited\n",
@@ -287,7 +411,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 10,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -300,9 +424,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 11,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"This round number: 10\n"
+]
+}
+],
 "source": [
 "# increment round number\n",
 "m += 1\n",
@@ -341,6 +473,27 @@
 " return list_arts"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 20,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"8108"
+]
+},
+"execution_count": 20,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"# pick articles with P < 0.99:\n",
+"len(df.loc[(df['Label'] == -1) & (df['Estimated'] < 0.99)])"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -373,9 +526,19 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 13,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Number of manual labels in round no. 10:\n",
+"0:0, 1:0, 2:0\n",
+"Number of articles to be corrected in this round: 0\n"
+]
+}
+],
 "source": [
 "print('Number of manual labels in round no. {}:'.format(m))\n",
 "print('0:{}, 1:{}, 2:{}'.format(len(df.loc[(df['Label'] == 0) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 1) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 2) & (df['Round'] == m)])))\n",
@@ -823,7 +986,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -847,13 +1010,7 @@
 "# MNB: transform testing data to matrix...\n",
 "\n",
-"# BOW: extracting all words from articles...\n",
-"\n",
-"# BOW: calculating matrix...\n",
-"\n",
-"# BOW: calculating frequencies...\n",
-"\n",
 "# MNB: ending multinomial naive bayes\n",
-"Wall time: 1h 35min 18s\n"
+"\n"
 ]
 }
 ],
@@ -965,7 +1122,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"We label each article with class $j$, if its estimated probability for class $j$ is higher than our threshold:"
+"We annotate each article's estimated class with its probability in columns 'Estimated' and 'Probability':"
 ]
 },
 {
@@ -974,24 +1131,22 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# only labels with this minimum probability are adopted\n",
-"threshold = ?\n",
 "# dict for counting estimated labels\n",
-"estimated_labels = {0:0, 1:0, 2:0}\n",
+"#estimated_labels = {0:0, 1:0, 2:0}\n",
 "\n",
 "# series of indices of recently estimated articles \n",
-"indices_estimated = df.loc[df['Label'] == -1, 'Index'].tolist()\n",
+"#indices_estimated = df.loc[df['Label'] == -1, 'Index'].tolist()\n",
 "\n",
 "# annotate estimated probability for every instance\n",
 "# for every row i and every element j in row i\n",
 "for (i,j), value in np.ndenumerate(class_probs):\n",
-" index = indices_estimated[i]\n",
+" #index = indices_estimated[i]\n",
 " # save estimated label\n",
 " df.loc[index, 'Estimated'] = classes[j]\n",
 " # annotate probability\n",
 " df.loc[index, 'Probability'] = value\n",
 " # count labels\n",
-" estimated_labels[int(classes[j])] += 1"
+" #estimated_labels[int(classes[j])] += 1"
 ]
 },
 {
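The cell above keeps the `np.ndenumerate` loop that writes estimated classes and probabilities into the 'Estimated' and 'Probability' columns while commenting out the old threshold bookkeeping. If the intent is simply to store each article's most probable class together with that probability, a vectorized sketch could look like the following (an alternative formulation, not the notebook's code, and it assumes row i of class_probs lines up with indices_estimated[i]):

```python
# Sketch of a vectorized equivalent; assumes class_probs rows are aligned with
# indices_estimated and that `classes` holds the label for each column.
import numpy as np

best_col = class_probs.argmax(axis=1)                              # most probable class per row
df.loc[indices_estimated, 'Estimated'] = np.asarray(classes)[best_col]
df.loc[indices_estimated, 'Probability'] = class_probs.max(axis=1)  # its probability
```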
@@ -1000,8 +1155,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print('Number of auto-labeled samples in round {}: {}'.format(m, sum(estimated_labels.values())))\n",
-"print('Estimated labels: {}'.format(estimated_labels))"
+"#print('Number of auto-labeled samples in round {}: {}'.format(m, sum(estimated_labels.values())))\n",
+"#print('Estimated labels: {}'.format(estimated_labels))"
 ]
 },
 {
File diff suppressed because one or more lines are too long
@@ -54,7 +54,7 @@ class LabelingPlotter():

     def plot_cumulative():
         # load pickle object
-        with open('../obj/array_class_probs.pkl', 'rb') as input:
+        with open('../obj/array_class_probs_round_9.pkl', 'rb') as input:
             list = pickle.load(input)

         # sort list in descending order
@@ -70,23 +70,23 @@ class LabelingPlotter():
         fig, ax = plt.subplots(figsize=(8, 4))

         # plot the cumulative histogram
-        n, bins, patches = ax.hist(probas, n_bins, normed=1, histtype='step',
+        n, bins, patches = ax.hist(probas, n_bins, density=1, histtype='step',
                                    cumulative=True, facecolor='darkred')

         # manipulate
-        vals = ax.get_yticks()
-        ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])
+        #vals = ax.get_yticks()
+        #ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])


         ax.grid(True)
         ax.legend(loc='right')
         #ax.set_title('Cumulative distribution of highest estimated probability')
         ax.set_xlabel('Highest estimated probability')
-        ax.set_ylabel('Percentage of articles with this highest estimated probability')
-        plt.axis([0.5, 0.99, 0, 0.006])
-        ax.set_xbound(lower=0.5, upper=0.99)
+        ax.set_ylabel('Fraction of articles with this highest estimated probability')
+        #plt.axis([0.5, 0.99, 0, 0.006])
+        #ax.set_xbound(lower=0.5, upper=0.99)

         plt.show()

 if __name__ == '__main__':
-    LabelingPlotter.plot_labeling_rounds()
+    LabelingPlotter.plot_cumulative()
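In LabelingPlotter the retired `normed` argument of `ax.hist` is swapped for `density`, and the hard-coded axis manipulation is commented out. A self-contained sketch of the same kind of cumulative plot is shown below; the probabilities are synthetic stand-ins, since the pickled `array_class_probs_round_9.pkl` data is not part of this diff:

```python
# Standalone sketch of the cumulative-distribution plot; synthetic probabilities
# stand in for the pickled class-probability array used by LabelingPlotter.
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
probas = rng.beta(8, 2, size=5000)   # fake "highest estimated probability" values

fig, ax = plt.subplots(figsize=(8, 4))
# density=True replaces the removed `normed=1`; together with cumulative=True
# the curve reads as the fraction of articles at or below each probability.
ax.hist(probas, bins=200, density=True, histtype='step', cumulative=True)
ax.grid(True)
ax.set_xlabel('Highest estimated probability')
ax.set_ylabel('Fraction of articles with this highest estimated probability')
plt.show()
```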