interactive labeling round 10
This commit is contained in:
parent
943c24cef0
commit
213bb148de
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
|
@ -15,7 +15,7 @@
|
||||||
"In each iteration we...\n",
|
"In each iteration we...\n",
|
||||||
"- check/correct the next 100 article labels manually.\n",
|
"- check/correct the next 100 article labels manually.\n",
|
||||||
" \n",
|
" \n",
|
||||||
"- apply the Multinomial Naive Bayes classification algorithm which returns a vector class_probs $(K_1, K_2, ... , K_6)$ per sample with the probabilities $K_i$ per class $i$. Estimated class labels are adopted automatically, if the estimated probability $K_x > 0.99$ with $x \\in {1,...,6}$.\n",
|
"- apply the Multinomial Naive Bayes classification algorithm which returns a vector class_probs $(K_1, K_2, ... , K_6)$ per sample with the probabilities $K_i$ per class $i$. Estimated class labels are checked, if the estimated probability $K_x < 0.99$ with $x \\in {1,...,6}$.\n",
|
||||||
" \n",
|
" \n",
|
||||||
"Please note: User instructions are written in upper-case.\n",
|
"Please note: User instructions are written in upper-case.\n",
|
||||||
"__________\n",
|
"__________\n",
|
||||||
|
@ -24,7 +24,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -65,7 +65,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 2,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -89,7 +89,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -140,7 +140,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 4,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -176,7 +176,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 5,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -207,7 +207,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": 15,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -216,7 +216,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 7,
|
"execution_count": 16,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -247,7 +247,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 8,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -265,11 +265,135 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 9,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"scrolled": true
|
"scrolled": true
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Audi\n",
|
||||||
|
"China Development Bank\n",
|
||||||
|
"FCC\n",
|
||||||
|
"Commercial Bank of China\n",
|
||||||
|
"Commonwealth Bank of Australia\n",
|
||||||
|
"Westpac Banking\n",
|
||||||
|
"BT\n",
|
||||||
|
"Cabinet\n",
|
||||||
|
"Carlyle\n",
|
||||||
|
"United Technologies\n",
|
||||||
|
"World Trade Organization\n",
|
||||||
|
"Ford\n",
|
||||||
|
"Alitalia\n",
|
||||||
|
"Singapore Airlines\n",
|
||||||
|
"Hitachi\n",
|
||||||
|
"Chevron\n",
|
||||||
|
"CBI\n",
|
||||||
|
"BMO Capital Markets\n",
|
||||||
|
"Anheuser-Busch\n",
|
||||||
|
"Prudential\n",
|
||||||
|
"Snap\n",
|
||||||
|
"Sistema\n",
|
||||||
|
"BBC\n",
|
||||||
|
"PSA\n",
|
||||||
|
"GM\n",
|
||||||
|
"RBS\n",
|
||||||
|
"Barrick Gold\n",
|
||||||
|
"Potash\n",
|
||||||
|
"CFIUS\n",
|
||||||
|
"Noble\n",
|
||||||
|
"Saint-Gobain\n",
|
||||||
|
"Anthem\n",
|
||||||
|
"Aetna\n",
|
||||||
|
"Prudential Financial\n",
|
||||||
|
"Daiwa Securities\n",
|
||||||
|
"Volvo\n",
|
||||||
|
"Raiffeisen\n",
|
||||||
|
"Aviva\n",
|
||||||
|
"Asahi\n",
|
||||||
|
"PPG Industries\n",
|
||||||
|
"Vale\n",
|
||||||
|
"Mitsui &\n",
|
||||||
|
"Hudson 's Bay\n",
|
||||||
|
"Unicredit\n",
|
||||||
|
"Pioneer\n",
|
||||||
|
"Infosys\n",
|
||||||
|
"Hiscox\n",
|
||||||
|
"Westinghouse\n",
|
||||||
|
"ExxonMobil\n",
|
||||||
|
"Viacom\n",
|
||||||
|
"Paramount\n",
|
||||||
|
"Canada Pension Plan Investment Board\n",
|
||||||
|
"Permira\n",
|
||||||
|
"RBC\n",
|
||||||
|
"AGM\n",
|
||||||
|
"PPG\n",
|
||||||
|
"Akzo\n",
|
||||||
|
"Anglo American\n",
|
||||||
|
"National Guard\n",
|
||||||
|
"Reckitt Benckiser\n",
|
||||||
|
"LVMH\n",
|
||||||
|
"Centrica\n",
|
||||||
|
"Gucci\n",
|
||||||
|
"United Airlines\n",
|
||||||
|
"Alphabet\n",
|
||||||
|
"HNA\n",
|
||||||
|
"FCA\n",
|
||||||
|
"WTO\n",
|
||||||
|
"Meredith\n",
|
||||||
|
"Netflix\n",
|
||||||
|
"CIC\n",
|
||||||
|
"Exxon\n",
|
||||||
|
"WSJ\n",
|
||||||
|
"Honda Motor\n",
|
||||||
|
"FDA\n",
|
||||||
|
"Banco BPM\n",
|
||||||
|
"BlackBerry\n",
|
||||||
|
"Ford Motor\n",
|
||||||
|
"BC Partners\n",
|
||||||
|
"Kinder Morgan Canada\n",
|
||||||
|
"McDonalds\n",
|
||||||
|
"Coca-Cola\n",
|
||||||
|
"Mitsubishi\n",
|
||||||
|
"Baker Hughes\n",
|
||||||
|
"CFTC\n",
|
||||||
|
"Home Capital\n",
|
||||||
|
"Nestle\n",
|
||||||
|
"ConocoPhillips\n",
|
||||||
|
"Cenovus Energy\n",
|
||||||
|
"Diageo\n",
|
||||||
|
"Le Maire\n",
|
||||||
|
"Bayer\n",
|
||||||
|
"Cardinal Health\n",
|
||||||
|
"Time\n",
|
||||||
|
"ArcelorMittal\n",
|
||||||
|
"Communist Party\n",
|
||||||
|
"G Crosse\n",
|
||||||
|
"Bundesbank\n",
|
||||||
|
"PNC Financial Services\n",
|
||||||
|
"KBC\n",
|
||||||
|
"FOMC\n",
|
||||||
|
"McKinsey\n",
|
||||||
|
"RBC Capital Markets\n",
|
||||||
|
"Elliott Management\n",
|
||||||
|
"Danone\n",
|
||||||
|
"Uber\n",
|
||||||
|
"Pfizer\n",
|
||||||
|
"Enel\n",
|
||||||
|
"Western Digital\n",
|
||||||
|
"Delta Air Lines\n",
|
||||||
|
"Toyota Motor\n",
|
||||||
|
"ING\n",
|
||||||
|
"Ahold Delhaize\n",
|
||||||
|
"ADP\n",
|
||||||
|
"Moodys Investors Service\n",
|
||||||
|
"Moodys Analytics\n",
|
||||||
|
"WPP\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# OPTIONAL:\n",
|
"# OPTIONAL:\n",
|
||||||
"# print organizations that are mentioned 3 times and therefore limited\n",
|
"# print organizations that are mentioned 3 times and therefore limited\n",
|
||||||
|
@ -287,7 +411,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 10,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -300,9 +424,17 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 11,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"This round number: 10\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# increment round number\n",
|
"# increment round number\n",
|
||||||
"m += 1\n",
|
"m += 1\n",
|
||||||
|
@ -341,6 +473,27 @@
|
||||||
" return list_arts"
|
" return list_arts"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"8108"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# pick articles with P < 0.99:\n",
|
||||||
|
"len(df.loc[(df['Label'] == -1) & (df['Estimated'] < 0.99)])"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
@ -373,9 +526,19 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": 13,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Number of manual labels in round no. 10:\n",
|
||||||
|
"0:0, 1:0, 2:0\n",
|
||||||
|
"Number of articles to be corrected in this round: 0\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"print('Number of manual labels in round no. {}:'.format(m))\n",
|
"print('Number of manual labels in round no. {}:'.format(m))\n",
|
||||||
"print('0:{}, 1:{}, 2:{}'.format(len(df.loc[(df['Label'] == 0) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 1) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 2) & (df['Round'] == m)])))\n",
|
"print('0:{}, 1:{}, 2:{}'.format(len(df.loc[(df['Label'] == 0) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 1) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 2) & (df['Round'] == m)])))\n",
|
||||||
|
@ -823,7 +986,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 8,
|
"execution_count": null,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
|
@ -847,13 +1010,7 @@
|
||||||
"# MNB: transform testing data to matrix...\n",
|
"# MNB: transform testing data to matrix...\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# BOW: extracting all words from articles...\n",
|
"# BOW: extracting all words from articles...\n",
|
||||||
"\n",
|
"\n"
|
||||||
"# BOW: calculating matrix...\n",
|
|
||||||
"\n",
|
|
||||||
"# BOW: calculating frequencies...\n",
|
|
||||||
"\n",
|
|
||||||
"# MNB: ending multinomial naive bayes\n",
|
|
||||||
"Wall time: 1h 35min 18s\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
@ -965,7 +1122,7 @@
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"We label each article with class $j$, if its estimated probability for class $j$ is higher than our threshold:"
|
"We annotate each article's estimated class with its probability in columns 'Estimated' and 'Probability':"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -974,24 +1131,22 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# only labels with this minimum probability are adopted\n",
|
|
||||||
"threshold = ?\n",
|
|
||||||
"# dict for counting estimated labels\n",
|
"# dict for counting estimated labels\n",
|
||||||
"estimated_labels = {0:0, 1:0, 2:0}\n",
|
"#estimated_labels = {0:0, 1:0, 2:0}\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# series of indices of recently estimated articles \n",
|
"# series of indices of recently estimated articles \n",
|
||||||
"indices_estimated = df.loc[df['Label'] == -1, 'Index'].tolist()\n",
|
"#indices_estimated = df.loc[df['Label'] == -1, 'Index'].tolist()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# annotate estimated probability for every instance\n",
|
"# annotate estimated probability for every instance\n",
|
||||||
"# for every row i and every element j in row i\n",
|
"# for every row i and every element j in row i\n",
|
||||||
"for (i,j), value in np.ndenumerate(class_probs):\n",
|
"for (i,j), value in np.ndenumerate(class_probs):\n",
|
||||||
" index = indices_estimated[i]\n",
|
" #index = indices_estimated[i]\n",
|
||||||
" # save estimated label\n",
|
" # save estimated label\n",
|
||||||
" df.loc[index, 'Estimated'] = classes[j]\n",
|
" df.loc[index, 'Estimated'] = classes[j]\n",
|
||||||
" # annotate probability\n",
|
" # annotate probability\n",
|
||||||
" df.loc[index, 'Probability'] = value\n",
|
" df.loc[index, 'Probability'] = value\n",
|
||||||
" # count labels\n",
|
" # count labels\n",
|
||||||
" estimated_labels[int(classes[j])] += 1"
|
" #estimated_labels[int(classes[j])] += 1"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -1000,8 +1155,8 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"print('Number of auto-labeled samples in round {}: {}'.format(m, sum(estimated_labels.values())))\n",
|
"#print('Number of auto-labeled samples in round {}: {}'.format(m, sum(estimated_labels.values())))\n",
|
||||||
"print('Estimated labels: {}'.format(estimated_labels))"
|
"#print('Estimated labels: {}'.format(estimated_labels))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -54,7 +54,7 @@ class LabelingPlotter():
|
||||||
|
|
||||||
def plot_cumulative():
|
def plot_cumulative():
|
||||||
# load pickle object
|
# load pickle object
|
||||||
with open('../obj/array_class_probs.pkl', 'rb') as input:
|
with open('../obj/array_class_probs_round_9.pkl', 'rb') as input:
|
||||||
list = pickle.load(input)
|
list = pickle.load(input)
|
||||||
|
|
||||||
# sort list in descending order
|
# sort list in descending order
|
||||||
|
@ -70,23 +70,23 @@ class LabelingPlotter():
|
||||||
fig, ax = plt.subplots(figsize=(8, 4))
|
fig, ax = plt.subplots(figsize=(8, 4))
|
||||||
|
|
||||||
# plot the cumulative histogram
|
# plot the cumulative histogram
|
||||||
n, bins, patches = ax.hist(probas, n_bins, normed=1, histtype='step',
|
n, bins, patches = ax.hist(probas, n_bins, density=1, histtype='step',
|
||||||
cumulative=True, facecolor='darkred')
|
cumulative=True, facecolor='darkred')
|
||||||
|
|
||||||
# manipulate
|
# manipulate
|
||||||
vals = ax.get_yticks()
|
#vals = ax.get_yticks()
|
||||||
ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])
|
#ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])
|
||||||
|
|
||||||
|
|
||||||
ax.grid(True)
|
ax.grid(True)
|
||||||
ax.legend(loc='right')
|
ax.legend(loc='right')
|
||||||
#ax.set_title('Cumulative distribution of highest estimated probability')
|
#ax.set_title('Cumulative distribution of highest estimated probability')
|
||||||
ax.set_xlabel('Highest estimated probability')
|
ax.set_xlabel('Highest estimated probability')
|
||||||
ax.set_ylabel('Percentage of articles with this highest estimated probability')
|
ax.set_ylabel('Fraction of articles with this highest estimated probability')
|
||||||
plt.axis([0.5, 0.99, 0, 0.006])
|
#plt.axis([0.5, 0.99, 0, 0.006])
|
||||||
ax.set_xbound(lower=0.5, upper=0.99)
|
#ax.set_xbound(lower=0.5, upper=0.99)
|
||||||
|
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
LabelingPlotter.plot_labeling_rounds()
|
LabelingPlotter.plot_cumulative()
|
Loading…
Reference in New Issue