new folder

This commit is contained in:
Anne Lorenz 2018-12-05 10:15:04 +01:00
parent 29dabecb9e
commit 417a26d114
17 changed files with 10819 additions and 62 deletions

File diff suppressed because one or more lines are too long

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,717 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Interactive Labeling using Naive Bayes Classifier\n",
"\n",
"This Jupyter Notebook combines a manual and automated labeling technique.\n",
"It includes a basic implementation of Naive Bayes Classifier.\n",
"By calculating class probabilities, we decide whether a news article has to be labeled manually or automatically.\n",
"\n",
"For the multi-class classification we use the following 6 classes:\n",
"\n",
"* 1: merger of company A and B\n",
"* 2: merger is pending\n",
"* 3: merger is aborted\n",
"* 4: sale of shares\n",
"* 5: merger as incidental remark, not main topic\n",
"* 6: other / irrelevant news"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from collections import OrderedDict\n",
"import csv\n",
"import pickle\n",
"import random\n",
"\n",
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
"import ipywidgets as widgets\n",
"from IPython.display import display\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.feature_selection import SelectPercentile\n",
"from sklearn.metrics import recall_score, precision_score\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from sklearn.naive_bayes import GaussianNB\n",
"\n",
"from FileHandler import FileHandler\n",
"from NaiveBayesInteractive import NaiveBayesInteractive"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, we import our data set of 10 000 business news articles from a csv file.\n",
"It contains 833/834 articles of each month of the year 2017.\n",
"For detailed information regarding the data set, please read the full documentation."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Timestamp</th>\n",
" <th>Title</th>\n",
" <th>Text</th>\n",
" <th>Index</th>\n",
" <th>Label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>'7f565b5746ba33be1784c7d3d6a7ede0c9cd81a2'</td>\n",
" <td>'Toshiba to sell less than 20 pct of chip unit...</td>\n",
" <td>'Industrials 25am EST Toshiba to sell less th...</td>\n",
" <td>0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>'64e474522a8fbcdbb86a829a9c5708d3dd76e04b'</td>\n",
" <td>'Alaska Air to record $82 million as merger-re...</td>\n",
" <td>'Alaska Air Group Inc ( ALK.N ) said on Wednes...</td>\n",
" <td>1</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>'244f708215c689f2fb7fa502434743a5410a254b'</td>\n",
" <td>'Delta Air Lines forecasts smaller drop in key...</td>\n",
" <td>' 20am EST Delta Air Lines forecasts smaller d...</td>\n",
" <td>2</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>'4a55c5a8cbbf3ff0b62d19127b664cb5ce483bca'</td>\n",
" <td>'Water utility Severn Trent sees FY rewards be...</td>\n",
" <td>'Business News - Tue Jan 31, 2017 - 8:26am GMT...</td>\n",
" <td>3</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>'4f21e2d67d3b1dce026c874c2ae69f6792eb30ae'</td>\n",
" <td>'German industry orders fall more than expecte...</td>\n",
" <td>'Business News - Fri Jan 6, 2017 - 2:09am EST ...</td>\n",
" <td>4</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Timestamp \\\n",
"0 '7f565b5746ba33be1784c7d3d6a7ede0c9cd81a2' \n",
"1 '64e474522a8fbcdbb86a829a9c5708d3dd76e04b' \n",
"2 '244f708215c689f2fb7fa502434743a5410a254b' \n",
"3 '4a55c5a8cbbf3ff0b62d19127b664cb5ce483bca' \n",
"4 '4f21e2d67d3b1dce026c874c2ae69f6792eb30ae' \n",
"\n",
" Title \\\n",
"0 'Toshiba to sell less than 20 pct of chip unit... \n",
"1 'Alaska Air to record $82 million as merger-re... \n",
"2 'Delta Air Lines forecasts smaller drop in key... \n",
"3 'Water utility Severn Trent sees FY rewards be... \n",
"4 'German industry orders fall more than expecte... \n",
"\n",
" Text Index Label \n",
"0 'Industrials 25am EST Toshiba to sell less th... 0 -1 \n",
"1 'Alaska Air Group Inc ( ALK.N ) said on Wednes... 1 -1 \n",
"2 ' 20am EST Delta Air Lines forecasts smaller d... 2 -1 \n",
"3 'Business News - Tue Jan 31, 2017 - 8:26am GMT... 3 -1 \n",
"4 'Business News - Fri Jan 6, 2017 - 2:09am EST ... 4 -1 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# path to the cleaned data set (pipe-separated csv without a header row)\n",
"filepath = '../data/cleaned_data_set_without_header.csv'\n",
"\n",
"# read columns 0-2 as Timestamp, Title, Text;\n",
"# QUOTE_NONE keeps the surrounding single quotes as literal text\n",
"df = pd.read_csv(filepath,\n",
" header=None,\n",
" sep='|',\n",
" engine='python',\n",
" usecols=[0,1,2],\n",
" names = [\"Timestamp\", \"Title\", \"Text\"],\n",
" decimal='.',\n",
" quotechar='\\'',\n",
" quoting=csv.QUOTE_NONE)\n",
"\n",
"# total number of articles\n",
"n = len(df)\n",
"\n",
"# create new column with indices\n",
"df['Index'] = df.index.values\n",
"\n",
"# create new column and initialize with -1 for unlabeled samples\n",
"df['Label'] = np.full((n), -1)\n",
"\n",
"# show the first rows to verify parsing\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we load the previously created dictionary 'article_indices_mentions_companies.pkl'. It is a dictionary of all different organizations in the data set (keys) with the list of article indices where an organization was mentioned."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# load pickle object of dict (company name => [article indices])\n",
"# avoid shadowing the builtins 'dict' and 'input'\n",
"with open('../obj/article_indices_mentions_companies.pkl', 'rb') as infile:\n",
"    comp_dict = pickle.load(infile)\n",
"\n",
"# list of companies in insertion order\n",
"comp_list = list(comp_dict)\n",
"\n",
"# number of companies\n",
"len_dict = len(comp_dict)\n",
"\n",
"# list of indices of next articles\n",
"labeled = []\n",
"\n",
"# indices of articles that mention the already picked companies\n",
"black_list = []\n",
"\n",
"def pick_random_articles(n):\n",
"    ''' returns a list of n indices of the articles we can label next,\n",
"        each about a different, not yet blacklisted company\n",
"    '''\n",
"    i = 0\n",
"    # list of chosen articles' indices\n",
"    list_arts = []\n",
"    while i < n:\n",
"        # random company; randint is inclusive on both ends, so the\n",
"        # upper bound must be len_dict - 1 to avoid an IndexError\n",
"        rand_c = random.randint(0, len_dict - 1)\n",
"        # random article mentioning that company\n",
"        rand_i = random.choice(comp_dict[comp_list[rand_c]])\n",
"        # test membership in BOTH lists explicitly:\n",
"        # 'rand_i not in (black_list or list_arts)' only ever checked\n",
"        # one of the two lists\n",
"        if rand_i not in black_list and rand_i not in list_arts:\n",
"            list_arts.append(rand_i)\n",
"            # blacklist every article of the picked company\n",
"            black_list.extend(comp_dict[comp_list[rand_c]])\n",
"            i += 1\n",
"    return list_arts\n",
"\n",
"def f(x):\n",
"    # store and return the user input so interact() can display it;\n",
"    # the bare local assignment alone was discarded on return\n",
"    current_label = x\n",
"    return x\n",
"\n",
"# first round\n",
"label_next = pick_random_articles(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## - Here starts user computer interaction: -\n",
"### *** Please enter correct label manually: ***\n",
"- 1: merger of companies A and B\n",
"- 2: merger is pending\n",
"- 3: merger is aborted\n",
"- 4: sale of shares\n",
"- 5: merger as incidental remark, not main topic\n",
"- 6: other/irrelevant news"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"616 'Close Brothers sees strong first half, report...\n",
"Name: Title, dtype: object\n",
"\n",
"616 'Business News - Fri Jan 20, 2017 - 7:57am GMT...\n",
"Name: Text, dtype: object\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "586ec1797e9b4111a441c64e16c8326f",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"8227 'Britain''s financial watchdog fines Merrill L...\n",
"Name: Title, dtype: object\n",
"\n",
"8227 ' 30 AM / in 14 minutes Britain''s financial w...\n",
"Name: Text, dtype: object\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\anne.lorenz\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" import sys\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "219feab0bae845a5993b1194f6f2107c",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"4495 'Takata decides to file for bankruptcy - Japan...\n",
"Name: Title, dtype: object\n",
"\n",
"4495 'Bonds News - Sun Jun 25, 2017 - 6:56pm EDT Ta...\n",
"Name: Text, dtype: object\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "fdd5e247d2764f9d99966567f959cbf2",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"7665 'Bombardier eyes Asian markets amid U.S. trade...\n",
"Name: Title, dtype: object\n",
"\n",
"7665 'October 5, 2017 / 1:22 PM / Updated 2 hours a...\n",
"Name: Text, dtype: object\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "d9f08a572e83449db5bfbb63981dc810",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"9076 'Canadian regulator denies request to suspend ...\n",
"Name: Title, dtype: object\n",
"\n",
"9076 'TORONTO, Nov 23 (Reuters) - Canadas biggest s...\n",
"Name: Text, dtype: object\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "eac7e0d9e6264a3f99011742e7d50023",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"8955 '.'\n",
"Name: Title, dtype: object\n",
"\n",
"8955 '(Corrects to make clear the comparison of dai...\n",
"Name: Text, dtype: object\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "46e776d0ae38403da38b471f8da28179",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"5345 'European banks struggle to solve toxic shippi...\n",
"Name: Title, dtype: object\n",
"\n",
"5345 'July 24, 2017 / 6:07 AM / 34 minutes ago Euro...\n",
"Name: Text, dtype: object\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3f1e6b3bf95549f9b26fae754e7cc4e6",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"603 'PRESS DIGEST- New York Times business news - ...\n",
"Name: Title, dtype: object\n",
"\n",
"603 ' 17am EST PRESS DIGEST- New York Times busine...\n",
"Name: Text, dtype: object\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6fdd9d92a04044c2b757c74c7205775f",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"536 'UPDATE 2-Viacom names global entertainment gr...\n",
"Name: Title, dtype: object\n",
"\n",
"536 '(Adds detail from internal memo, changes sour...\n",
"Name: Text, dtype: object\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "02a38605cfd743428b34424428ef3b0e",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"9396 'Fiat Chrysler in talks over potential diesel ...\n",
"Name: Title, dtype: object\n",
"\n",
"9396 'December 19, 2017 / 10:31 PM / Updated 20 min...\n",
"Name: Text, dtype: object\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "551f6fa037f44acc967165c56b03ae93",
"version_major": 2,
"version_minor": 0
},
"text/html": [
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
"<p>\n",
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
" that the widgets JavaScript is still loading. If this message persists, it\n",
" likely means that the widgets JavaScript library is either not installed or\n",
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
" Widgets Documentation</a> for setup instructions.\n",
"</p>\n",
"<p>\n",
" If you're reading this message in another frontend (for example, a static\n",
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
" it may mean that your frontend doesn't currently support widgets.\n",
"</p>\n"
],
"text/plain": [
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for index in label_next:\n",
"    # show title and text of the article to be labeled\n",
"    print(df.loc[df['Index'] == index]['Title'])\n",
"    print()\n",
"    print(df.loc[df['Index'] == index]['Text'])\n",
"    # create widget for manual label input\n",
"    current_label = interact(f, x=6)\n",
"    # NOTE(review): interact() returns the decorated function, not the\n",
"    # slider value -- the chosen label may not be captured here; verify.\n",
"    # write via a single .loc[row_mask, column] call: the previous\n",
"    # chained indexing df.loc[...]['Label'] = ... assigned to a copy\n",
"    # (see the SettingWithCopyWarning in this cell's output)\n",
"    df.loc[df['Index'] == index, 'Label'] = current_label\n",
"    "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -15,6 +15,7 @@ from collections import OrderedDict
import csv import csv
import pickle import pickle
import re import re
import string
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -37,6 +38,12 @@ class BagOfWords:
''' '''
stemmer = PorterStemmer() stemmer = PorterStemmer()
stop_words = BagOfWords.set_stop_words(stemming) stop_words = BagOfWords.set_stop_words(stemming)
# ignore company names
company_names_list = BagOfWords.load_company_names()
for company in company_names_list:
text = text.replace(company, '')
# replace punctuation marks with spaces # replace punctuation marks with spaces
words = re.sub(r'\W', ' ', text) words = re.sub(r'\W', ' ', text)
# split str into list of single words # split str into list of single words
@ -138,8 +145,18 @@ class BagOfWords:
# transform set to list # transform set to list
return list(vocab) return list(vocab)
def load_company_names():
# load pickle object of organizations
with open('../obj/dict_organizations.pkl', 'rb') as input:
dict = pickle.load(input)
list = []
for key in dict.keys():
list.append(key)
return list
def set_stop_words(stemming=True): def set_stop_words(stemming=True):
'''creates list of all words that will be ignored '''creates list of all words that will be ignored:
stopwords, company names and other disturbing terms
''' '''
# stopwords # stopwords
stop_words = ['a', 'about', 'above', 'after', 'again', 'against', stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
@ -232,7 +249,7 @@ class BagOfWords:
n_dict[next_highest[0]] = next_highest[1] n_dict[next_highest[0]] = next_highest[1]
# save n_dict object # save n_dict object
with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f: with open('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL) pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
return n_dict return n_dict
@ -254,7 +271,7 @@ class BagOfWords:
return sum return sum
def test(): def test():
file = 'data\\cleaned_data_set_without_header.csv' file = '..\\data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(file, df_dataset = pd.read_csv(file,
delimiter='|', delimiter='|',
header=None, header=None,

View File

@ -62,7 +62,7 @@ class CosineSimilarity:
if __name__ == '__main__': if __name__ == '__main__':
# read data set # read data set
file = 'data\\cleaned_data_set_without_header.csv' file = '..\\data\\cleaned_data_set_without_header.csv'
df = pd.read_csv(file, df = pd.read_csv(file,
delimiter='|', delimiter='|',
header=None, header=None,

View File

@ -131,7 +131,7 @@ class DecisionTree:
print('# starting decision tree') print('# starting decision tree')
print('# ...') print('# ...')
file = 'data\\classification_labelled_corrected.csv' file = '..\\data\\classification_labelled_corrected.csv'
# read csv file # read csv file
print('# reading dataset') print('# reading dataset')

View File

@ -40,7 +40,7 @@ class FileHandler:
def create_labeling_dataset(): def create_labeling_dataset():
# output file # output file
o_file = 'data\\cleaned_data_set_without_header.csv' o_file = '..\\data\\cleaned_data_set_without_header.csv'
# create file and write header # create file and write header
with open(o_file, 'w', newline='') as csvfile: with open(o_file, 'w', newline='') as csvfile:
writer = csv.writer(csvfile, writer = csv.writer(csvfile,
@ -57,7 +57,7 @@ class FileHandler:
# number of articles to select from each month (10000/12=833,33) # number of articles to select from each month (10000/12=833,33)
n_select = 833 n_select = 833
for m in FileHandler.months: for m in FileHandler.months:
df = pd.read_csv('data\\articles\\all_{}.csv'.format(m), df = pd.read_csv('..\\data\\articles\\all_{}.csv'.format(m),
delimiter='|', delimiter='|',
header=0, header=0,
index_col=None, index_col=None,
@ -82,7 +82,7 @@ class FileHandler:
'''clean articles in data set: filter out all non-printable characters '''clean articles in data set: filter out all non-printable characters
''' '''
# read data set # read data set
file = 'data\\cleaned_data_set_without_header.csv' file = '..\\data\\cleaned_data_set_without_header.csv'
df = pd.read_csv(file, df = pd.read_csv(file,
delimiter='|', delimiter='|',
header=None, header=None,
@ -114,7 +114,7 @@ class FileHandler:
'''remove articles with exactly same headline '''remove articles with exactly same headline
''' '''
# read data set # read data set
file = 'data\\cleaned_data_set_without_header.csv' file = '..\\data\\cleaned_data_set_without_header.csv'
df = pd.read_csv(file, df = pd.read_csv(file,
delimiter='|', delimiter='|',
header=None, header=None,
@ -137,7 +137,7 @@ class FileHandler:
i += 1 i += 1
# save cleaned dataframe # save cleaned dataframe
df.to_csv('data\\cleaned_data_set_without_header.csv', df.to_csv('..\\data\\cleaned_data_set_without_header.csv',
header=False, header=False,
index=False, index=False,
sep='|', sep='|',
@ -152,14 +152,14 @@ class FileHandler:
# reliable sources (site_sections) # reliable sources (site_sections)
site_sections = [] site_sections = []
# read list from 'sections.txt' file # read list from 'sections.txt' file
with open('data\\sections.txt', 'r') as s_list: with open('..\\data\\sections.txt', 'r') as s_list:
site_sections = s_list.read().split('\n') site_sections = s_list.read().split('\n')
# article counter # article counter
a = 0 a = 0
for m in FileHandler.months: for m in FileHandler.months:
# 1 output file per month # 1 output file per month
output_file = 'data\\articles\\all_{}.csv'.format(m) output_file = '..\\data\\articles\\all_{}.csv'.format(m)
# path of input JSON files per month # path of input JSON files per month
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\ path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\ '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\

View File

@ -40,10 +40,10 @@ class NER:
def tag_words(text): def tag_words(text):
# path to Stanford NER # path to Stanford NER
stanford_classifier = 'stanford-ner-2018-02-27'\ stanford_classifier = '..\stanford-ner-2018-02-27'\
'\\classifiers'\ '\\classifiers'\
'\\english.all.3class.distsim.crf.ser.gz' '\\english.all.3class.distsim.crf.ser.gz'
stanford_ner_path = 'stanford-ner-2018-02-27'\ stanford_ner_path = '..\stanford-ner-2018-02-27'\
'\\stanford-ner.jar' '\\stanford-ner.jar'
# create tagger object # create tagger object
st = StanfordNERTagger(stanford_classifier, stanford_ner_path, st = StanfordNERTagger(stanford_classifier, stanford_ner_path,
@ -100,36 +100,55 @@ class NER:
''' '''
print('# counting company names...') print('# counting company names...')
print() print()
# dictionary of companies with their count # dictionary of companies with their count
dict_com = {} dict_com = {}
# list of company lists (one per article) # list of company lists (one per article)
coms_list = [] coms_list = []
# dict of articles per company name
# (company name => list of indices of articles mentioning the company)
dict_mentions = {}
for i, text in enumerate(texts): for i, text in enumerate(texts):
# list of found companies in article # list of found companies in article
print('# article no. {}:'.format(i)) print('# article no. {}:'.format(i))
coms = NER.find_companies(text) coms = NER.find_companies(text)
coms_list.append(coms) coms_list.append(coms)
# annotate article number in dict
for com in coms:
if com in dict_mentions.keys():
dict_mentions[com].append(i)
else:
dict_mentions[com] = [i]
for com in coms: for com in coms:
if com in dict_com.keys(): if com in dict_com.keys():
dict_com[com] += 1 dict_com[com] += 1
else: else:
dict_com[com] = 1 dict_com[com] = 1
# print(coms_list)
# print()
# calculate number of company mentions per article # calculate number of company mentions per article
num_companies = [] num_companies = []
for l in coms_list: for l in coms_list:
num_companies.append(len(l)) num_companies.append(len(l))
# print(num_companies) # print(num_companies)
print('# average number of different companies mentioned per article:') print('# average number of different companies mentioned per article:')
print(sum(num_companies)/len(num_companies)) print(sum(num_companies)/len(num_companies))
print() print()
# save dict_mentions of article indices per company name
with open('../obj/'+ 'article_indices_mentions_companies' + '.pkl', 'wb') as f:
pickle.dump(dict_mentions, f, pickle.HIGHEST_PROTOCOL)
# save num_companies object in file (for plotting) # save num_companies object in file (for plotting)
with open('obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f: with open('../obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL) pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL)
# save dict_com object in file (for plotting) # save dict_com object in file (for plotting)
with open('obj/'+ 'dict_organizations' + '.pkl', 'wb') as f: with open('../obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL) pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL)
#print(dict_com) #print(dict_com)
@ -139,7 +158,7 @@ class NER:
def show_most_common_companies(n_commons=50): def show_most_common_companies(n_commons=50):
# load pickle object # load pickle object
with open('obj/dict_organizations.pkl', 'rb') as input: with open('../obj/dict_organizations.pkl', 'rb') as input:
dict = pickle.load(input) dict = pickle.load(input)
# sort dict by value # sort dict by value
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\ o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
@ -154,21 +173,22 @@ class NER:
print(n_dict) print(n_dict)
if __name__ == '__main__': if __name__ == '__main__':
# print('# starting NER...')
# print() print('# starting NER...')
# # read data set print()
# file = 'data\\cleaned_data_set_without_header.csv' # read data set
# df = pd.read_csv(file, file = '..\\data\\cleaned_data_set_without_header.csv'
# delimiter='|', df = pd.read_csv(file,
# header=None, delimiter='|',
# index_col=None, header=None,
# engine='python', index_col=None,
# # usecols=[1,2], engine='python',
# # nrows=100, # usecols=[1,2],
# quoting=csv.QUOTE_NONNUMERIC, # nrows=100,
# quotechar='\'') quoting=csv.QUOTE_NONNUMERIC,
# #print(df) quotechar='\'')
# texts = df[1] + '. ' + df[2] #print(df)
# NER.count_companies(texts) texts = df[1] + '. ' + df[2]
# # NER.show_most_common_companies() NER.count_companies(texts)
#print(NER.tag_words('On Monday, Github and Microsoft announced their merger.')) # NER.show_most_common_companies()
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))

View File

@ -187,7 +187,7 @@ class NaiveBayes:
print('# starting naive bayes') print('# starting naive bayes')
print('# ...') print('# ...')
file = 'data\\classification_labelled_corrected.csv' file = '..\\data\\classification_labelled_corrected.csv'
# read csv file # read csv file
print('# reading dataset') print('# reading dataset')

View File

@ -5,6 +5,7 @@ Naive Bayes Classifier
basic implementation of naive bayes. basic implementation of naive bayes.
prints out probabilities for classes needed for interactive labeling. prints out probabilities for classes needed for interactive labeling.
''' '''
from BagOfWords import BagOfWords
import csv import csv
@ -15,9 +16,9 @@ from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB from sklearn.naive_bayes import GaussianNB
class NaiveBayes_Interactive: class NaiveBayesInteractive:
def make_naive_bayes(dataset, sklearn_cv=True, percentile=100): def make_naive_bayes(dataset, sklearn_cv=False, percentile=100):
'''fits naive bayes model '''fits naive bayes model
''' '''
print('# fitting model') print('# fitting model')
@ -178,7 +179,7 @@ class NaiveBayes_Interactive:
print('# starting naive bayes') print('# starting naive bayes')
print('# ...') print('# ...')
file = 'data\\classification_labelled_corrected.csv' file = '..\data\\classification_labelled_corrected.csv'
# read csv file # read csv file
print('# reading dataset') print('# reading dataset')
@ -191,8 +192,10 @@ class NaiveBayes_Interactive:
quotechar='\'', quotechar='\'',
quoting=csv.QUOTE_NONE) quoting=csv.QUOTE_NONE)
use_count_vectorizer = True # training options
use_count_vectorizer = False
select_percentile = 100 select_percentile = 100
make_naive_bayes(data, use_count_vectorizer, select_percentile) make_naive_bayes(data, use_count_vectorizer, select_percentile)
print('#') print('#')

View File

@ -93,7 +93,7 @@ class SVM:
print('# starting svm') print('# starting svm')
print('# ...') print('# ...')
file = 'data\\classification_labelled_corrected.csv' file = '..\\data\\classification_labelled_corrected.csv'
# read csv file # read csv file
print('# reading dataset') print('# reading dataset')

View File

@ -30,7 +30,7 @@ class VisualizerNews:
print('# preparing word cloud of 200 most common words...') print('# preparing word cloud of 200 most common words...')
print() print()
# load new data set # load new data set
file = 'data\\cleaned_data_set_without_header.csv' file = '..\\data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(file, df_dataset = pd.read_csv(file,
delimiter='|', delimiter='|',
header=None, header=None,
@ -53,7 +53,7 @@ class VisualizerNews:
dict = BagOfWords.make_dict_common_words(matrix, 200, dict = BagOfWords.make_dict_common_words(matrix, 200,
rel_freq, stemming) rel_freq, stemming)
# save dict object # save dict object
with open('obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f: with open('../obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL) pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
wordcloud = WordCloud(background_color='white', wordcloud = WordCloud(background_color='white',
@ -82,7 +82,7 @@ class VisualizerNews:
print('# preparing histogram of company mentions...') print('# preparing histogram of company mentions...')
print() print()
# # read data set # # read data set
# file = 'data\\cleaned_data_set_without_header.csv' # file = '..\\data\\cleaned_data_set_without_header.csv'
# df = pd.read_csv(file, # df = pd.read_csv(file,
# delimiter='|', # delimiter='|',
# header=None, # header=None,
@ -107,7 +107,7 @@ class VisualizerNews:
# names = np.asarray(count_names) # names = np.asarray(count_names)
# load pickle object # load pickle object
with open('obj/dict_organizations.pkl', 'rb') as input: with open('../obj/dict_organizations.pkl', 'rb') as input:
dict = pickle.load(input) dict = pickle.load(input)
# make list of dict's values # make list of dict's values
count_companies = list(dict.values()) count_companies = list(dict.values())
@ -129,9 +129,9 @@ class VisualizerNews:
.FuncFormatter(lambda x, p: format(int(x), ','))) .FuncFormatter(lambda x, p: format(int(x), ',')))
# save to file # save to file
plt.savefig('visualization\\NER_{}.eps' plt.savefig('..\\visualization\\NER_{}.eps'
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.savefig('visualization\\NER_{}.png' plt.savefig('..\\visualization\\NER_{}.png'
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.show() plt.show()
@ -143,7 +143,7 @@ class VisualizerNews:
print('# preparing histogram of text lengths...') print('# preparing histogram of text lengths...')
print() print()
# read data set # read data set
filepath = 'data\\cleaned_data_set_without_header.csv' filepath = '..\\data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(filepath, df_dataset = pd.read_csv(filepath,
delimiter='|', delimiter='|',
header=None, header=None,
@ -183,9 +183,9 @@ class VisualizerNews:
plt.gca().xaxis.set_major_formatter(matplotlib.ticker\ plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
.FuncFormatter(lambda x, p: format(int(x), ','))) .FuncFormatter(lambda x, p: format(int(x), ',')))
# save plot # save plot
plt.savefig('visualization\\TextLength_{}.eps'\ plt.savefig('..\\visualization\\TextLength_{}.eps'\
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.savefig('visualization\\TextLength_{}.png'\ plt.savefig('..\\visualization\\TextLength_{}.png'\
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.show() plt.show()
@ -195,7 +195,7 @@ class VisualizerNews:
print() print()
# load data set # load data set
filepath = 'data\\cleaned_data_set_without_header.csv' filepath = '..\\data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(filepath, df_dataset = pd.read_csv(filepath,
delimiter='|', delimiter='|',
header=None, header=None,
@ -229,14 +229,14 @@ class VisualizerNews:
plt.setp(autotexts, size=8, weight="bold") plt.setp(autotexts, size=8, weight="bold")
plt.show() plt.show()
plt.savefig('Sites_{}.pdf'.format(VisualizerNews.datestring)) plt.savefig('..\\visualization\\Sites_{}.pdf'.format(VisualizerNews.datestring))
plt.savefig('Sites_{}.pgf'.format(VisualizerNews.datestring)) plt.savefig('..\\visualization\\Sites_{}.pgf'.format(VisualizerNews.datestring))
def plot_hist_most_common_words(n_commons = 10): def plot_hist_most_common_words(n_commons = 10):
print('# preparing histogram of most common words...') print('# preparing histogram of most common words...')
print() print()
# # load data set # # load data set
# filepath = 'data\\cleaned_data_set_without_header.csv' # filepath = '..\\data\\cleaned_data_set_without_header.csv'
# df_dataset = pd.read_csv(filepath, # df_dataset = pd.read_csv(filepath,
# delimiter='|', # delimiter='|',
# header=None, # header=None,
@ -264,7 +264,7 @@ class VisualizerNews:
# pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL) # pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
# load pickle object # load pickle object
with open ('obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i: with open ('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
dict = pickle.load(i) dict = pickle.load(i)
# sort dict by value # sort dict by value
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\ o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
@ -287,9 +287,9 @@ class VisualizerNews:
height=numbers, height=numbers,
tick_label=labels, tick_label=labels,
facecolor='royalblue') facecolor='royalblue')
plt.savefig('visualization\\10_most_common_words_{}.eps' plt.savefig('..\\visualization\\10_most_common_words_{}.eps'
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.savefig('visualization\\10_most_common_words_{}.png' plt.savefig('..\\visualization\\10_most_common_words_{}.png'
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.show() plt.show()
@ -299,7 +299,7 @@ class VisualizerNews:
''' '''
# list of number of different companies per article (int) # list of number of different companies per article (int)
list = [] list = []
with open('obj/num_mentions_companies.pkl', 'rb') as input: with open('../obj/num_mentions_companies.pkl', 'rb') as input:
list = pickle.load(input) list = pickle.load(input)
# sort list in descending order # sort list in descending order
@ -320,9 +320,9 @@ class VisualizerNews:
.FuncFormatter(lambda x, p: format(int(x), ','))) .FuncFormatter(lambda x, p: format(int(x), ',')))
# save to file # save to file
plt.savefig('visualization\\NER_2_{}.eps' plt.savefig('..\\visualization\\NER_2_{}.eps'
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.savefig('visualization\\NER_2_{}.png' plt.savefig('..\\visualization\\NER_2_{}.png'
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.show() plt.show()