new folder
This commit is contained in:
parent
29dabecb9e
commit
417a26d114
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,717 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Interactive Labeling using Naive Bayes Classifier\n",
|
||||
"\n",
|
||||
"This Jupyter Notebook combines a manual and automated labeling technique.\n",
|
||||
"It includes a basic implementation of Naive Bayes Classifier.\n",
|
||||
"By calculating class probabilities, we decide wheather a news article has to be labeled manually or automatically.\n",
|
||||
"\n",
|
||||
"For the multi-class classification we use the following 6 classes:\n",
|
||||
"\n",
|
||||
"* 1: merger of company A and B\n",
|
||||
"* 2: merger is pending\n",
|
||||
"* 3: merger is aborted\n",
|
||||
"* 4: sale of shares\n",
|
||||
"* 5: merger as incidental remark, not main topic\n",
|
||||
"* 6: other / irrelevant news"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from collections import OrderedDict\n",
|
||||
"import csv\n",
|
||||
"import pickle\n",
|
||||
"import random\n",
|
||||
"\n",
|
||||
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
|
||||
"import ipywidgets as widgets\n",
|
||||
"from IPython.display import display\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||
"from sklearn.feature_selection import SelectPercentile\n",
|
||||
"from sklearn.metrics import recall_score, precision_score\n",
|
||||
"from sklearn.model_selection import StratifiedKFold\n",
|
||||
"from sklearn.naive_bayes import GaussianNB\n",
|
||||
"\n",
|
||||
"from FileHandler import FileHandler\n",
|
||||
"from NaiveBayesInteractive import NaiveBayesInteractive"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, we import our data set of 10 000 business news articles from a csv file.\n",
|
||||
"It contains 833/834 articles of each month of the year 2017.\n",
|
||||
"For detailed information regarding the data set, please read the full documentation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style>\n",
|
||||
" .dataframe thead tr:only-child th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: left;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Timestamp</th>\n",
|
||||
" <th>Title</th>\n",
|
||||
" <th>Text</th>\n",
|
||||
" <th>Index</th>\n",
|
||||
" <th>Label</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>'7f565b5746ba33be1784c7d3d6a7ede0c9cd81a2'</td>\n",
|
||||
" <td>'Toshiba to sell less than 20 pct of chip unit...</td>\n",
|
||||
" <td>'Industrials 25am EST Toshiba to sell less th...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>'64e474522a8fbcdbb86a829a9c5708d3dd76e04b'</td>\n",
|
||||
" <td>'Alaska Air to record $82 million as merger-re...</td>\n",
|
||||
" <td>'Alaska Air Group Inc ( ALK.N ) said on Wednes...</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>'244f708215c689f2fb7fa502434743a5410a254b'</td>\n",
|
||||
" <td>'Delta Air Lines forecasts smaller drop in key...</td>\n",
|
||||
" <td>' 20am EST Delta Air Lines forecasts smaller d...</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>'4a55c5a8cbbf3ff0b62d19127b664cb5ce483bca'</td>\n",
|
||||
" <td>'Water utility Severn Trent sees FY rewards be...</td>\n",
|
||||
" <td>'Business News - Tue Jan 31, 2017 - 8:26am GMT...</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>'4f21e2d67d3b1dce026c874c2ae69f6792eb30ae'</td>\n",
|
||||
" <td>'German industry orders fall more than expecte...</td>\n",
|
||||
" <td>'Business News - Fri Jan 6, 2017 - 2:09am EST ...</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Timestamp \\\n",
|
||||
"0 '7f565b5746ba33be1784c7d3d6a7ede0c9cd81a2' \n",
|
||||
"1 '64e474522a8fbcdbb86a829a9c5708d3dd76e04b' \n",
|
||||
"2 '244f708215c689f2fb7fa502434743a5410a254b' \n",
|
||||
"3 '4a55c5a8cbbf3ff0b62d19127b664cb5ce483bca' \n",
|
||||
"4 '4f21e2d67d3b1dce026c874c2ae69f6792eb30ae' \n",
|
||||
"\n",
|
||||
" Title \\\n",
|
||||
"0 'Toshiba to sell less than 20 pct of chip unit... \n",
|
||||
"1 'Alaska Air to record $82 million as merger-re... \n",
|
||||
"2 'Delta Air Lines forecasts smaller drop in key... \n",
|
||||
"3 'Water utility Severn Trent sees FY rewards be... \n",
|
||||
"4 'German industry orders fall more than expecte... \n",
|
||||
"\n",
|
||||
" Text Index Label \n",
|
||||
"0 'Industrials 25am EST Toshiba to sell less th... 0 -1 \n",
|
||||
"1 'Alaska Air Group Inc ( ALK.N ) said on Wednes... 1 -1 \n",
|
||||
"2 ' 20am EST Delta Air Lines forecasts smaller d... 2 -1 \n",
|
||||
"3 'Business News - Tue Jan 31, 2017 - 8:26am GMT... 3 -1 \n",
|
||||
"4 'Business News - Fri Jan 6, 2017 - 2:09am EST ... 4 -1 "
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"filepath = '../data/cleaned_data_set_without_header.csv'\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(filepath,\n",
|
||||
" header=None,\n",
|
||||
" sep='|',\n",
|
||||
" engine='python',\n",
|
||||
" usecols=[0,1,2],\n",
|
||||
" names = [\"Timestamp\", \"Title\", \"Text\"],\n",
|
||||
" decimal='.',\n",
|
||||
" quotechar='\\'',\n",
|
||||
" quoting=csv.QUOTE_NONE)\n",
|
||||
"\n",
|
||||
"n = len(df)\n",
|
||||
"\n",
|
||||
"# create new column with indices\n",
|
||||
"df['Index'] = df.index.values\n",
|
||||
"\n",
|
||||
"# create new column and initialize with -1 for unlabeled samples\n",
|
||||
"df['Label'] = np.full((n), -1)\n",
|
||||
"\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we load the previously created dictionary 'article_indices_mentions_companies.pkl'. It is a dictionary of all different organizations in the data set (keys) with the list of article indices where a organization was mentioned."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# load pickle object of dict (company => [article numbers])\n",
|
||||
"with open('../obj/article_indices_mentions_companies.pkl', 'rb') as input:\n",
|
||||
" dict = pickle.load(input)\n",
|
||||
"\n",
|
||||
"# list of companies in insertion order\n",
|
||||
"comp_list = list(dict)\n",
|
||||
"\n",
|
||||
"# number of companies\n",
|
||||
"len_dict = len(dict)\n",
|
||||
"\n",
|
||||
"# list of indices of next articles\n",
|
||||
"labeled = []\n",
|
||||
"\n",
|
||||
"# indices of articles that mention the already picked companies\n",
|
||||
"black_list = []\n",
|
||||
"\n",
|
||||
"def pick_random_articles(n):\n",
|
||||
" ''' returns list of n indices of the articles we can label next\n",
|
||||
" '''\n",
|
||||
" # pick n random articles about n different companies\n",
|
||||
" i = 0\n",
|
||||
" # list of chosen articles' indices\n",
|
||||
" list_arts = []\n",
|
||||
" while i < n:\n",
|
||||
" # random company\n",
|
||||
" rand_c = random.randint(0, len_dict)\n",
|
||||
" # random article\n",
|
||||
" rand_i = random.choice(dict[comp_list[rand_c]])\n",
|
||||
" if rand_i not in (black_list or list_arts):\n",
|
||||
" list_arts.append(rand_i)\n",
|
||||
" black_list.extend(dict[comp_list[rand_c]])\n",
|
||||
" i += 1\n",
|
||||
" return list_arts\n",
|
||||
"\n",
|
||||
"def f(x):\n",
|
||||
" # store user input\n",
|
||||
" current_label = x\n",
|
||||
"\n",
|
||||
"# first round\n",
|
||||
"label_next = pick_random_articles(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## - Here starts user computer interaction: -\n",
|
||||
"### *** Please enter correct label manually: ***\n",
|
||||
"- 1: merger of companies A and B\n",
|
||||
"- 2: merger is pending\n",
|
||||
"- 3: merger is aborted\n",
|
||||
"- 4: sale of shares\n",
|
||||
"- 5: merger as incidental remark, not main topic\n",
|
||||
"- 6: other/irrelevant news"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"616 'Close Brothers sees strong first half, report...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"616 'Business News - Fri Jan 20, 2017 - 7:57am GMT...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "586ec1797e9b4111a441c64e16c8326f",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"8227 'Britain''s financial watchdog fines Merrill L...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"8227 ' 30 AM / in 14 minutes Britain''s financial w...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\anne.lorenz\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:7: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
|
||||
" import sys\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "219feab0bae845a5993b1194f6f2107c",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"4495 'Takata decides to file for bankruptcy - Japan...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"4495 'Bonds News - Sun Jun 25, 2017 - 6:56pm EDT Ta...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "fdd5e247d2764f9d99966567f959cbf2",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"7665 'Bombardier eyes Asian markets amid U.S. trade...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"7665 'October 5, 2017 / 1:22 PM / Updated 2 hours a...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "d9f08a572e83449db5bfbb63981dc810",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"9076 'Canadian regulator denies request to suspend ...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"9076 'TORONTO, Nov 23 (Reuters) - Canadas biggest s...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "eac7e0d9e6264a3f99011742e7d50023",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"8955 '.'\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"8955 '(Corrects to make clear the comparison of dai...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "46e776d0ae38403da38b471f8da28179",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"5345 'European banks struggle to solve toxic shippi...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"5345 'July 24, 2017 / 6:07 AM / 34 minutes ago Euro...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "3f1e6b3bf95549f9b26fae754e7cc4e6",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"603 'PRESS DIGEST- New York Times business news - ...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"603 ' 17am EST PRESS DIGEST- New York Times busine...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "6fdd9d92a04044c2b757c74c7205775f",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"536 'UPDATE 2-Viacom names global entertainment gr...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"536 '(Adds detail from internal memo, changes sour...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "02a38605cfd743428b34424428ef3b0e",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"9396 'Fiat Chrysler in talks over potential diesel ...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"9396 'December 19, 2017 / 10:31 PM / Updated 20 min...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "551f6fa037f44acc967165c56b03ae93",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for index in label_next:\n",
|
||||
" print(df.loc[df['Index'] == index]['Title'])\n",
|
||||
" print()\n",
|
||||
" print(df.loc[df['Index'] == index]['Text'])\n",
|
||||
" # create widget\n",
|
||||
" current_label = interact(f, x=6)\n",
|
||||
" df.loc[df['Index'] == index]['Label'] = current_label\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
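The notebook's introduction describes deciding between manual and automatic labeling by looking at the predicted class probabilities, but the committed cells only implement the manual pass. A minimal sketch of such a decision rule is given below; it assumes a fitted scikit-learn classifier, a feature matrix X_unlabeled and a confidence threshold of 0.8, none of which appear under these names in the notebook.

# Sketch only: split unlabeled articles into "label automatically" and
# "queue for manual labeling" based on the classifier's confidence.
import numpy as np

def split_by_confidence(model, X_unlabeled, threshold=0.8):
    '''returns (auto_indices, auto_labels, manual_indices)'''
    proba = model.predict_proba(X_unlabeled)        # class probabilities per article
    best = proba.max(axis=1)                        # confidence of the most likely class
    labels = model.classes_[proba.argmax(axis=1)]   # predicted class (1..6)
    auto = np.where(best >= threshold)[0]           # confident enough -> take the prediction
    manual = np.where(best < threshold)[0]          # uncertain -> show to the user
    return auto, labels[auto], manual

In the labeling loop above, only the articles in manual would then be displayed with the widget, while the remaining articles receive their predicted label directly.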
|
|
@ -15,6 +15,7 @@ from collections import OrderedDict
|
|||
import csv
|
||||
import pickle
|
||||
import re
|
||||
import string
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
@ -37,6 +38,12 @@ class BagOfWords:
|
|||
'''
|
||||
stemmer = PorterStemmer()
|
||||
stop_words = BagOfWords.set_stop_words(stemming)
|
||||
|
||||
# ignore company names
|
||||
company_names_list = BagOfWords.load_company_names()
|
||||
for company in company_names_list:
|
||||
text = text.replace(company, '')
|
||||
|
||||
# replace punctuation marks with spaces
|
||||
words = re.sub(r'\W', ' ', text)
|
||||
# split str into list of single words
|
||||
|
@ -138,8 +145,18 @@ class BagOfWords:
|
|||
# transform set to list
|
||||
return list(vocab)
|
||||
|
||||
def load_company_names():
|
||||
# load pickle object of organizations
|
||||
with open('../obj/dict_organizations.pkl', 'rb') as input:
|
||||
dict = pickle.load(input)
|
||||
list = []
|
||||
for key in dict.keys():
|
||||
list.append(key)
|
||||
return list
|
||||
|
||||
def set_stop_words(stemming=True):
|
||||
'''creates list of all words that will be ignored
|
||||
'''creates list of all words that will be ignored:
|
||||
stopwords, company names and other disturbing terms
|
||||
'''
|
||||
# stopwords
|
||||
stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
|
||||
|
@ -232,7 +249,7 @@ class BagOfWords:
|
|||
n_dict[next_highest[0]] = next_highest[1]
|
||||
|
||||
# save n_dict object
|
||||
with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
|
||||
with open('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
|
||||
pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
return n_dict
|
||||
|
@ -254,7 +271,7 @@ class BagOfWords:
|
|||
return sum
|
||||
|
||||
def test():
|
||||
file = 'data\\cleaned_data_set_without_header.csv'
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df_dataset = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
|
@ -62,7 +62,7 @@ class CosineSimilarity:
|
|||
|
||||
if __name__ == '__main__':
|
||||
# read data set
|
||||
file = 'data\\cleaned_data_set_without_header.csv'
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
|
@ -131,7 +131,7 @@ class DecisionTree:
|
|||
print('# starting decision tree')
|
||||
print('# ...')
|
||||
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
file = '..\\data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
|
@ -40,7 +40,7 @@ class FileHandler:
|
|||
|
||||
def create_labeling_dataset():
|
||||
# output file
|
||||
o_file = 'data\\cleaned_data_set_without_header.csv'
|
||||
o_file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
# create file and write header
|
||||
with open(o_file, 'w', newline='') as csvfile:
|
||||
writer = csv.writer(csvfile,
|
||||
|
@ -57,7 +57,7 @@ class FileHandler:
|
|||
# number of articles to select from each month (10000/12=833,33)
|
||||
n_select = 833
|
||||
for m in FileHandler.months:
|
||||
df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
|
||||
df = pd.read_csv('..\\data\\articles\\all_{}.csv'.format(m),
|
||||
delimiter='|',
|
||||
header=0,
|
||||
index_col=None,
|
||||
|
@ -82,7 +82,7 @@ class FileHandler:
|
|||
'''clean articles in data set: filter out all non-printable characters
|
||||
'''
|
||||
# read data set
|
||||
file = 'data\\cleaned_data_set_without_header.csv'
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
|
@ -114,7 +114,7 @@ class FileHandler:
|
|||
'''remove articles with exactly same headline
|
||||
'''
|
||||
# read data set
|
||||
file = 'data\\cleaned_data_set_without_header.csv'
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
|
@ -137,7 +137,7 @@ class FileHandler:
|
|||
i += 1
|
||||
|
||||
# save cleaned dataframe
|
||||
df.to_csv('data\\cleaned_data_set_without_header.csv',
|
||||
df.to_csv('..\\data\\cleaned_data_set_without_header.csv',
|
||||
header=False,
|
||||
index=False,
|
||||
sep='|',
|
||||
|
@ -152,14 +152,14 @@ class FileHandler:
|
|||
# reliable sources (site_sections)
|
||||
site_sections = []
|
||||
# read list from 'sections.txt' file
|
||||
with open('data\\sections.txt', 'r') as s_list:
|
||||
with open('..\\data\\sections.txt', 'r') as s_list:
|
||||
site_sections = s_list.read().split('\n')
|
||||
|
||||
# article counter
|
||||
a = 0
|
||||
for m in FileHandler.months:
|
||||
# 1 output file per month
|
||||
output_file = 'data\\articles\\all_{}.csv'.format(m)
|
||||
output_file = '..\\data\\articles\\all_{}.csv'.format(m)
|
||||
# path of input JSON files per month
|
||||
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
|
||||
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
|
|
@ -40,10 +40,10 @@ class NER:
|
|||
|
||||
def tag_words(text):
|
||||
# path to Stanford NER
|
||||
stanford_classifier = 'stanford-ner-2018-02-27'\
|
||||
stanford_classifier = '..\\stanford-ner-2018-02-27'\
|
||||
'\\classifiers'\
|
||||
'\\english.all.3class.distsim.crf.ser.gz'
|
||||
stanford_ner_path = 'stanford-ner-2018-02-27'\
|
||||
stanford_ner_path = '..\\stanford-ner-2018-02-27'\
|
||||
'\\stanford-ner.jar'
|
||||
# create tagger object
|
||||
st = StanfordNERTagger(stanford_classifier, stanford_ner_path,
|
||||
|
@ -100,36 +100,55 @@ class NER:
|
|||
'''
|
||||
print('# counting company names...')
|
||||
print()
|
||||
|
||||
# dictionary of companies with their count
|
||||
dict_com = {}
|
||||
|
||||
# list of company lists (one per article)
|
||||
coms_list = []
|
||||
|
||||
# dict of articles per company name
|
||||
# (company name => list of indices of articles mentioning the company)
|
||||
dict_mentions = {}
|
||||
|
||||
for i, text in enumerate(texts):
|
||||
# list of found companies in article
|
||||
print('# article no. {}:'.format(i))
|
||||
coms = NER.find_companies(text)
|
||||
coms_list.append(coms)
|
||||
|
||||
# annotate article number in dict
|
||||
for com in coms:
|
||||
if com in dict_mentions.keys():
|
||||
dict_mentions[com].append(i)
|
||||
else:
|
||||
dict_mentions[com] = [i]
|
||||
|
||||
for com in coms:
|
||||
if com in dict_com.keys():
|
||||
dict_com[com] += 1
|
||||
else:
|
||||
dict_com[com] = 1
|
||||
# print(coms_list)
|
||||
# print()
|
||||
|
||||
# calculate number of company mentions per article
|
||||
num_companies = []
|
||||
for l in coms_list:
|
||||
num_companies.append(len(l))
|
||||
|
||||
# print(num_companies)
|
||||
print('# average number of different companies mentioned per article:')
|
||||
print(sum(num_companies)/len(num_companies))
|
||||
print()
|
||||
|
||||
# save dict_mentions of article indices per company name
|
||||
with open('../obj/'+ 'article_indices_mentions_companies' + '.pkl', 'wb') as f:
|
||||
pickle.dump(dict_mentions, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
# save num_companies object in file (for plotting)
|
||||
with open('obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
|
||||
with open('../obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
|
||||
pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL)
|
||||
# save dict_com object in file (for plotting)
|
||||
with open('obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
|
||||
with open('../obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
|
||||
pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
#print(dict_com)
|
||||
|
@ -139,7 +158,7 @@ class NER:
|
|||
|
||||
def show_most_common_companies(n_commons=50):
|
||||
# load pickle object
|
||||
with open('obj/dict_organizations.pkl', 'rb') as input:
|
||||
with open('../obj/dict_organizations.pkl', 'rb') as input:
|
||||
dict = pickle.load(input)
|
||||
# sort dict by value
|
||||
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
|
||||
|
@ -154,21 +173,22 @@ class NER:
|
|||
print(n_dict)
|
||||
|
||||
if __name__ == '__main__':
|
||||
# print('# starting NER...')
|
||||
# print()
|
||||
# # read data set
|
||||
# file = 'data\\cleaned_data_set_without_header.csv'
|
||||
# df = pd.read_csv(file,
|
||||
# delimiter='|',
|
||||
# header=None,
|
||||
# index_col=None,
|
||||
# engine='python',
|
||||
# # usecols=[1,2],
|
||||
# # nrows=100,
|
||||
# quoting=csv.QUOTE_NONNUMERIC,
|
||||
# quotechar='\'')
|
||||
# #print(df)
|
||||
# texts = df[1] + '. ' + df[2]
|
||||
# NER.count_companies(texts)
|
||||
# # NER.show_most_common_companies()
|
||||
|
||||
print('# starting NER...')
|
||||
print()
|
||||
# read data set
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
index_col=None,
|
||||
engine='python',
|
||||
# usecols=[1,2],
|
||||
# nrows=100,
|
||||
quoting=csv.QUOTE_NONNUMERIC,
|
||||
quotechar='\'')
|
||||
#print(df)
|
||||
texts = df[1] + '. ' + df[2]
|
||||
NER.count_companies(texts)
|
||||
# NER.show_most_common_companies()
|
||||
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
|
|
@ -187,7 +187,7 @@ class NaiveBayes:
|
|||
print('# starting naive bayes')
|
||||
print('# ...')
|
||||
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
file = '..\\data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
|
@ -5,6 +5,7 @@ Naive Bayes Classifier
|
|||
basic implementation of naive bayes.
|
||||
prints out probabilities for classes needed for interactive labeling.
|
||||
'''
|
||||
from BagOfWords import BagOfWords
|
||||
|
||||
import csv
|
||||
|
||||
|
@ -15,9 +16,9 @@ from sklearn.metrics import recall_score, precision_score
|
|||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
|
||||
class NaiveBayes_Interactive:
|
||||
class NaiveBayesInteractive:
|
||||
|
||||
def make_naive_bayes(dataset, sklearn_cv=True, percentile=100):
|
||||
def make_naive_bayes(dataset, sklearn_cv=False, percentile=100):
|
||||
'''fits naive bayes model
|
||||
'''
|
||||
print('# fitting model')
|
||||
|
@ -178,7 +179,7 @@ class NaiveBayes_Interactive:
|
|||
print('# starting naive bayes')
|
||||
print('# ...')
|
||||
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
file = '..\\data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
||||
|
@ -191,8 +192,10 @@ class NaiveBayes_Interactive:
|
|||
quotechar='\'',
|
||||
quoting=csv.QUOTE_NONE)
|
||||
|
||||
use_count_vectorizer = True
|
||||
# training options
|
||||
use_count_vectorizer = False
|
||||
select_percentile = 100
|
||||
|
||||
make_naive_bayes(data, use_count_vectorizer, select_percentile)
|
||||
|
||||
print('#')
|
|
@ -93,7 +93,7 @@ class SVM:
|
|||
print('# starting svm')
|
||||
print('# ...')
|
||||
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
file = '..\\data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
|
@ -30,7 +30,7 @@ class VisualizerNews:
|
|||
print('# preparing word cloud of 200 most common words...')
|
||||
print()
|
||||
# load new data set
|
||||
file = 'data\\cleaned_data_set_without_header.csv'
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df_dataset = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
|
@ -53,7 +53,7 @@ class VisualizerNews:
|
|||
dict = BagOfWords.make_dict_common_words(matrix, 200,
|
||||
rel_freq, stemming)
|
||||
# save dict object
|
||||
with open('obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
|
||||
with open('../obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
|
||||
pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
wordcloud = WordCloud(background_color='white',
|
||||
|
@ -82,7 +82,7 @@ class VisualizerNews:
|
|||
print('# preparing histogram of company mentions...')
|
||||
print()
|
||||
# # read data set
|
||||
# file = 'data\\cleaned_data_set_without_header.csv'
|
||||
# file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
# df = pd.read_csv(file,
|
||||
# delimiter='|',
|
||||
# header=None,
|
||||
|
@ -107,7 +107,7 @@ class VisualizerNews:
|
|||
# names = np.asarray(count_names)
|
||||
|
||||
# load pickle object
|
||||
with open('obj/dict_organizations.pkl', 'rb') as input:
|
||||
with open('../obj/dict_organizations.pkl', 'rb') as input:
|
||||
dict = pickle.load(input)
|
||||
# make list of dict's values
|
||||
count_companies = list(dict.values())
|
||||
|
@ -129,9 +129,9 @@ class VisualizerNews:
|
|||
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
||||
|
||||
# save to file
|
||||
plt.savefig('visualization\\NER_{}.eps'
|
||||
plt.savefig('..\\visualization\\NER_{}.eps'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.savefig('visualization\\NER_{}.png'
|
||||
plt.savefig('..\\visualization\\NER_{}.png'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.show()
|
||||
|
||||
|
@ -143,7 +143,7 @@ class VisualizerNews:
|
|||
print('# preparing histogram of text lengths...')
|
||||
print()
|
||||
# read data set
|
||||
filepath = 'data\\cleaned_data_set_without_header.csv'
|
||||
filepath = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df_dataset = pd.read_csv(filepath,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
|
@ -183,9 +183,9 @@ class VisualizerNews:
|
|||
plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
|
||||
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
||||
# save plot
|
||||
plt.savefig('visualization\\TextLength_{}.eps'\
|
||||
plt.savefig('..\\visualization\\TextLength_{}.eps'\
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.savefig('visualization\\TextLength_{}.png'\
|
||||
plt.savefig('..\\visualization\\TextLength_{}.png'\
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.show()
|
||||
|
||||
|
@ -195,7 +195,7 @@ class VisualizerNews:
|
|||
print()
|
||||
|
||||
# load data set
|
||||
filepath = 'data\\cleaned_data_set_without_header.csv'
|
||||
filepath = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df_dataset = pd.read_csv(filepath,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
|
@ -229,14 +229,14 @@ class VisualizerNews:
|
|||
|
||||
plt.setp(autotexts, size=8, weight="bold")
|
||||
plt.show()
|
||||
plt.savefig('Sites_{}.pdf'.format(VisualizerNews.datestring))
|
||||
plt.savefig('Sites_{}.pgf'.format(VisualizerNews.datestring))
|
||||
plt.savefig('..\\visualization\\Sites_{}.pdf'.format(VisualizerNews.datestring))
|
||||
plt.savefig('..\\visualization\\Sites_{}.pgf'.format(VisualizerNews.datestring))
|
||||
|
||||
def plot_hist_most_common_words(n_commons = 10):
|
||||
print('# preparing histogram of most common words...')
|
||||
print()
|
||||
# # load data set
|
||||
# filepath = 'data\\cleaned_data_set_without_header.csv'
|
||||
# filepath = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
# df_dataset = pd.read_csv(filepath,
|
||||
# delimiter='|',
|
||||
# header=None,
|
||||
|
@ -264,7 +264,7 @@ class VisualizerNews:
|
|||
# pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
# load pickle object
|
||||
with open ('obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
|
||||
with open ('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
|
||||
dict = pickle.load(i)
|
||||
# sort dict by value
|
||||
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
|
||||
|
@ -287,9 +287,9 @@ class VisualizerNews:
|
|||
height=numbers,
|
||||
tick_label=labels,
|
||||
facecolor='royalblue')
|
||||
plt.savefig('visualization\\10_most_common_words_{}.eps'
|
||||
plt.savefig('..\\visualization\\10_most_common_words_{}.eps'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.savefig('visualization\\10_most_common_words_{}.png'
|
||||
plt.savefig('..\\visualization\\10_most_common_words_{}.png'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.show()
|
||||
|
||||
|
@ -299,7 +299,7 @@ class VisualizerNews:
|
|||
'''
|
||||
# list of number of different companies per article (int)
|
||||
list = []
|
||||
with open('obj/num_mentions_companies.pkl', 'rb') as input:
|
||||
with open('../obj/num_mentions_companies.pkl', 'rb') as input:
|
||||
list = pickle.load(input)
|
||||
|
||||
# sort list in descending order
|
||||
|
@ -320,9 +320,9 @@ class VisualizerNews:
|
|||
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
||||
|
||||
# save to file
|
||||
plt.savefig('visualization\\NER_2_{}.eps'
|
||||
plt.savefig('..\\visualization\\NER_2_{}.eps'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.savefig('visualization\\NER_2_{}.png'
|
||||
plt.savefig('..\\visualization\\NER_2_{}.png'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.show()
|
||||
|