interactive labeling round 10
parent 943c24cef0
commit 213bb148de
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
@@ -15,7 +15,7 @@
 "In each iteration we...\n",
 "- check/correct the next 100 article labels manually.\n",
 " \n",
-"- apply the Multinomial Naive Bayes classification algorithm which returns a vector class_probs $(K_1, K_2, ... , K_6)$ per sample with the probabilities $K_i$ per class $i$. Estimated class labels are adopted automatically, if the estimated probability $K_x > 0.99$ with $x \\in {1,...,6}$.\n",
+"- apply the Multinomial Naive Bayes classification algorithm which returns a vector class_probs $(K_1, K_2, ... , K_6)$ per sample with the probabilities $K_i$ per class $i$. Estimated class labels are checked, if the estimated probability $K_x < 0.99$ with $x \\in {1,...,6}$.\n",
 " \n",
 "Please note: User instructions are written in upper-case.\n",
 "__________\n",
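The markdown cell edited above describes the per-round procedure: a Multinomial Naive Bayes model produces class_probs for each article, and after this change only articles whose highest class probability stays below 0.99 are sent back for manual checking. The training code itself is not part of this hunk; the following is a minimal sketch of that thresholding step with scikit-learn, where `texts_labeled`, `labels`, `texts_unlabeled` and the helper name are illustrative placeholders rather than the notebook's own identifiers.

```python
# Minimal sketch (assumption: a scikit-learn bag-of-words + MultinomialNB setup;
# all variable and function names here are placeholders, not the notebook's).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


def split_by_confidence(texts_labeled, labels, texts_unlabeled, threshold=0.99):
    """Return estimated labels plus a mask of samples confident enough to skip review."""
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(texts_labeled)   # bag-of-words matrix of labeled articles
    X_new = vectorizer.transform(texts_unlabeled)        # same vocabulary for unlabeled articles

    clf = MultinomialNB().fit(X_train, labels)
    class_probs = clf.predict_proba(X_new)               # one row (K_1, ..., K_6) per article

    estimated = clf.classes_[class_probs.argmax(axis=1)]  # most probable class per article
    confident = class_probs.max(axis=1) > threshold       # below 0.99 -> manual check next round
    return estimated, confident
```

Articles for which `confident` is False would be the ones queued for the next 100-article manual pass.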
@@ -24,7 +24,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 1,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -65,7 +65,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 2,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -89,7 +89,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": 3,
 "metadata": {},
 "outputs": [
 {
@@ -140,7 +140,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 4,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -176,7 +176,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 5,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -207,7 +207,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 6,
+"execution_count": 15,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -216,7 +216,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": 16,
 "metadata": {},
 "outputs": [
 {
@@ -247,7 +247,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 8,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -265,11 +265,135 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 9,
 "metadata": {
 "scrolled": true
 },
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Audi\n",
+"China Development Bank\n",
+"FCC\n",
+"Commercial Bank of China\n",
+"Commonwealth Bank of Australia\n",
+"Westpac Banking\n",
+"BT\n",
+"Cabinet\n",
+"Carlyle\n",
+"United Technologies\n",
+"World Trade Organization\n",
+"Ford\n",
+"Alitalia\n",
+"Singapore Airlines\n",
+"Hitachi\n",
+"Chevron\n",
+"CBI\n",
+"BMO Capital Markets\n",
+"Anheuser-Busch\n",
+"Prudential\n",
+"Snap\n",
+"Sistema\n",
+"BBC\n",
+"PSA\n",
+"GM\n",
+"RBS\n",
+"Barrick Gold\n",
+"Potash\n",
+"CFIUS\n",
+"Noble\n",
+"Saint-Gobain\n",
+"Anthem\n",
+"Aetna\n",
+"Prudential Financial\n",
+"Daiwa Securities\n",
+"Volvo\n",
+"Raiffeisen\n",
+"Aviva\n",
+"Asahi\n",
+"PPG Industries\n",
+"Vale\n",
+"Mitsui &\n",
+"Hudson 's Bay\n",
+"Unicredit\n",
+"Pioneer\n",
+"Infosys\n",
+"Hiscox\n",
+"Westinghouse\n",
+"ExxonMobil\n",
+"Viacom\n",
+"Paramount\n",
+"Canada Pension Plan Investment Board\n",
+"Permira\n",
+"RBC\n",
+"AGM\n",
+"PPG\n",
+"Akzo\n",
+"Anglo American\n",
+"National Guard\n",
+"Reckitt Benckiser\n",
+"LVMH\n",
+"Centrica\n",
+"Gucci\n",
+"United Airlines\n",
+"Alphabet\n",
+"HNA\n",
+"FCA\n",
+"WTO\n",
+"Meredith\n",
+"Netflix\n",
+"CIC\n",
+"Exxon\n",
+"WSJ\n",
+"Honda Motor\n",
+"FDA\n",
+"Banco BPM\n",
+"BlackBerry\n",
+"Ford Motor\n",
+"BC Partners\n",
+"Kinder Morgan Canada\n",
+"McDonalds\n",
+"Coca-Cola\n",
+"Mitsubishi\n",
+"Baker Hughes\n",
+"CFTC\n",
+"Home Capital\n",
+"Nestle\n",
+"ConocoPhillips\n",
+"Cenovus Energy\n",
+"Diageo\n",
+"Le Maire\n",
+"Bayer\n",
+"Cardinal Health\n",
+"Time\n",
+"ArcelorMittal\n",
+"Communist Party\n",
+"G Crosse\n",
+"Bundesbank\n",
+"PNC Financial Services\n",
+"KBC\n",
+"FOMC\n",
+"McKinsey\n",
+"RBC Capital Markets\n",
+"Elliott Management\n",
+"Danone\n",
+"Uber\n",
+"Pfizer\n",
+"Enel\n",
+"Western Digital\n",
+"Delta Air Lines\n",
+"Toyota Motor\n",
+"ING\n",
+"Ahold Delhaize\n",
+"ADP\n",
+"Moodys Investors Service\n",
+"Moodys Analytics\n",
+"WPP\n"
+]
+}
+],
 "source": [
 "# OPTIONAL:\n",
 "# print organizations that are mentioned 3 times and therefore limited\n",
@@ -287,7 +411,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 10,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -300,9 +424,17 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 11,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"This round number: 10\n"
+]
+}
+],
 "source": [
 "# increment round number\n",
 "m += 1\n",
@@ -341,6 +473,27 @@
 " return list_arts"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 20,
+"metadata": {},
+"outputs": [
+{
+"data": {
+"text/plain": [
+"8108"
+]
+},
+"execution_count": 20,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"# pick articles with P < 0.99:\n",
+"len(df.loc[(df['Label'] == -1) & (df['Estimated'] < 0.99)])"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -373,9 +526,19 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 13,
 "metadata": {},
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Number of manual labels in round no. 10:\n",
+"0:0, 1:0, 2:0\n",
+"Number of articles to be corrected in this round: 0\n"
+]
+}
+],
 "source": [
 "print('Number of manual labels in round no. {}:'.format(m))\n",
 "print('0:{}, 1:{}, 2:{}'.format(len(df.loc[(df['Label'] == 0) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 1) & (df['Round'] == m)]), len(df.loc[(df['Label'] == 2) & (df['Round'] == m)])))\n",
@@ -823,7 +986,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {
@@ -847,13 +1010,7 @@
 "# MNB: transform testing data to matrix...\n",
 "\n",
-"# BOW: extracting all words from articles...\n",
-"\n",
-"# BOW: calculating matrix...\n",
-"\n",
-"# BOW: calculating frequencies...\n",
-"\n",
 "# MNB: ending multinomial naive bayes\n",
-"Wall time: 1h 35min 18s\n"
+"\n"
 ]
 }
 ],
@@ -965,7 +1122,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"We label each article with class $j$, if its estimated probability for class $j$ is higher than our threshold:"
+"We annotate each article's estimated class with its probability in columns 'Estimated' and 'Probability':"
 ]
 },
 {
@@ -974,24 +1131,22 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# only labels with this minimum probability are adopted\n",
-"threshold = ?\n",
 "# dict for counting estimated labels\n",
-"estimated_labels = {0:0, 1:0, 2:0}\n",
+"#estimated_labels = {0:0, 1:0, 2:0}\n",
 "\n",
 "# series of indices of recently estimated articles \n",
-"indices_estimated = df.loc[df['Label'] == -1, 'Index'].tolist()\n",
+"#indices_estimated = df.loc[df['Label'] == -1, 'Index'].tolist()\n",
 "\n",
 "# annotate estimated probability for every instance\n",
 "# for every row i and every element j in row i\n",
 "for (i,j), value in np.ndenumerate(class_probs):\n",
-" index = indices_estimated[i]\n",
+" #index = indices_estimated[i]\n",
 " # save estimated label\n",
 " df.loc[index, 'Estimated'] = classes[j]\n",
 " # annotate probability\n",
 " df.loc[index, 'Probability'] = value\n",
 " # count labels\n",
-" estimated_labels[int(classes[j])] += 1"
+" #estimated_labels[int(classes[j])] += 1"
 ]
 },
 {
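The cell above keeps the `np.ndenumerate` loop that writes estimated classes and probabilities into the 'Estimated' and 'Probability' columns while commenting out the old threshold bookkeeping. If the intent is simply to store each article's most probable class together with that probability, a vectorized sketch could look like the following (an alternative formulation, not the notebook's code, and it assumes row i of class_probs lines up with indices_estimated[i]):

```python
# Sketch of a vectorized equivalent; assumes class_probs rows are aligned with
# indices_estimated and that `classes` holds the label for each column.
import numpy as np

best_col = class_probs.argmax(axis=1)                              # most probable class per row
df.loc[indices_estimated, 'Estimated'] = np.asarray(classes)[best_col]
df.loc[indices_estimated, 'Probability'] = class_probs.max(axis=1)  # its probability
```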
@@ -1000,8 +1155,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"print('Number of auto-labeled samples in round {}: {}'.format(m, sum(estimated_labels.values())))\n",
-"print('Estimated labels: {}'.format(estimated_labels))"
+"#print('Number of auto-labeled samples in round {}: {}'.format(m, sum(estimated_labels.values())))\n",
+"#print('Estimated labels: {}'.format(estimated_labels))"
 ]
 },
 {
File diff suppressed because one or more lines are too long
@@ -54,7 +54,7 @@ class LabelingPlotter():

     def plot_cumulative():
         # load pickle object
-        with open('../obj/array_class_probs.pkl', 'rb') as input:
+        with open('../obj/array_class_probs_round_9.pkl', 'rb') as input:
             list = pickle.load(input)

         # sort list in descending order
@@ -70,23 +70,23 @@ class LabelingPlotter():
         fig, ax = plt.subplots(figsize=(8, 4))

         # plot the cumulative histogram
-        n, bins, patches = ax.hist(probas, n_bins, normed=1, histtype='step',
+        n, bins, patches = ax.hist(probas, n_bins, density=1, histtype='step',
                                    cumulative=True, facecolor='darkred')

         # manipulate
-        vals = ax.get_yticks()
-        ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])
+        #vals = ax.get_yticks()
+        #ax.set_yticklabels(['{:,.1%}'.format(x / 200) for x in vals])


         ax.grid(True)
         ax.legend(loc='right')
         #ax.set_title('Cumulative distribution of highest estimated probability')
         ax.set_xlabel('Highest estimated probability')
-        ax.set_ylabel('Percentage of articles with this highest estimated probability')
-        plt.axis([0.5, 0.99, 0, 0.006])
-        ax.set_xbound(lower=0.5, upper=0.99)
+        ax.set_ylabel('Fraction of articles with this highest estimated probability')
+        #plt.axis([0.5, 0.99, 0, 0.006])
+        #ax.set_xbound(lower=0.5, upper=0.99)

         plt.show()

 if __name__ == '__main__':
-    LabelingPlotter.plot_labeling_rounds()
+    LabelingPlotter.plot_cumulative()
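In LabelingPlotter the retired `normed` argument of `ax.hist` is swapped for `density`, and the hard-coded axis manipulation is commented out. A self-contained sketch of the same kind of cumulative plot is shown below; the probabilities are synthetic stand-ins, since the pickled `array_class_probs_round_9.pkl` data is not part of this diff:

```python
# Standalone sketch of the cumulative-distribution plot; synthetic probabilities
# stand in for the pickled class-probability array used by LabelingPlotter.
import numpy as np
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
probas = rng.beta(8, 2, size=5000)   # fake "highest estimated probability" values

fig, ax = plt.subplots(figsize=(8, 4))
# density=True replaces the removed `normed=1`; together with cumulative=True
# the curve reads as the fraction of articles at or below each probability.
ax.hist(probas, bins=200, density=True, histtype='step', cumulative=True)
ax.grid(True)
ax.set_xlabel('Highest estimated probability')
ax.set_ylabel('Fraction of articles with this highest estimated probability')
plt.show()
```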