new folder
This commit is contained in:
parent
29dabecb9e
commit
417a26d114
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,717 @@
|
||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Interactive Labeling using Naive Bayes Classifier\n",
|
||||||
|
"\n",
|
||||||
|
"This Jupyter Notebook combines a manual and automated labeling technique.\n",
|
||||||
|
"It includes a basic implementation of a Naive Bayes classifier.\n",
|
||||||
|
"By calculating class probabilities, we decide whether a news article has to be labeled manually or automatically.\n",
|
||||||
|
"\n",
|
||||||
|
"For the multi-class classification we use the following 6 classes:\n",
|
||||||
|
"\n",
|
||||||
|
"* 1: merger of companies A and B\n",
|
||||||
|
"* 2: merger is pending\n",
|
||||||
|
"* 3: merger is aborted\n",
|
||||||
|
"* 4: sale of shares\n",
|
||||||
|
"* 5: merger as incidental remark, not main topic\n",
|
||||||
|
"* 6: other / irrelevant news"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from collections import OrderedDict\n",
|
||||||
|
"import csv\n",
|
||||||
|
"import pickle\n",
|
||||||
|
"import random\n",
|
||||||
|
"\n",
|
||||||
|
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
|
||||||
|
"import ipywidgets as widgets\n",
|
||||||
|
"from IPython.display import display\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||||
|
"from sklearn.feature_selection import SelectPercentile\n",
|
||||||
|
"from sklearn.metrics import recall_score, precision_score\n",
|
||||||
|
"from sklearn.model_selection import StratifiedKFold\n",
|
||||||
|
"from sklearn.naive_bayes import GaussianNB\n",
|
||||||
|
"\n",
|
||||||
|
"from FileHandler import FileHandler\n",
|
||||||
|
"from NaiveBayesInteractive import NaiveBayesInteractive"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"First, we import our data set of 10 000 business news articles from a csv file.\n",
|
||||||
|
"It contains 833/834 articles of each month of the year 2017.\n",
|
||||||
|
"For detailed information regarding the data set, please read the full documentation."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style>\n",
|
||||||
|
" .dataframe thead tr:only-child th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: left;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>Timestamp</th>\n",
|
||||||
|
" <th>Title</th>\n",
|
||||||
|
" <th>Text</th>\n",
|
||||||
|
" <th>Index</th>\n",
|
||||||
|
" <th>Label</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>'7f565b5746ba33be1784c7d3d6a7ede0c9cd81a2'</td>\n",
|
||||||
|
" <td>'Toshiba to sell less than 20 pct of chip unit...</td>\n",
|
||||||
|
" <td>'Industrials 25am EST Toshiba to sell less th...</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" <td>-1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>'64e474522a8fbcdbb86a829a9c5708d3dd76e04b'</td>\n",
|
||||||
|
" <td>'Alaska Air to record $82 million as merger-re...</td>\n",
|
||||||
|
" <td>'Alaska Air Group Inc ( ALK.N ) said on Wednes...</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" <td>-1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>'244f708215c689f2fb7fa502434743a5410a254b'</td>\n",
|
||||||
|
" <td>'Delta Air Lines forecasts smaller drop in key...</td>\n",
|
||||||
|
" <td>' 20am EST Delta Air Lines forecasts smaller d...</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" <td>-1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>'4a55c5a8cbbf3ff0b62d19127b664cb5ce483bca'</td>\n",
|
||||||
|
" <td>'Water utility Severn Trent sees FY rewards be...</td>\n",
|
||||||
|
" <td>'Business News - Tue Jan 31, 2017 - 8:26am GMT...</td>\n",
|
||||||
|
" <td>3</td>\n",
|
||||||
|
" <td>-1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>'4f21e2d67d3b1dce026c874c2ae69f6792eb30ae'</td>\n",
|
||||||
|
" <td>'German industry orders fall more than expecte...</td>\n",
|
||||||
|
" <td>'Business News - Fri Jan 6, 2017 - 2:09am EST ...</td>\n",
|
||||||
|
" <td>4</td>\n",
|
||||||
|
" <td>-1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" Timestamp \\\n",
|
||||||
|
"0 '7f565b5746ba33be1784c7d3d6a7ede0c9cd81a2' \n",
|
||||||
|
"1 '64e474522a8fbcdbb86a829a9c5708d3dd76e04b' \n",
|
||||||
|
"2 '244f708215c689f2fb7fa502434743a5410a254b' \n",
|
||||||
|
"3 '4a55c5a8cbbf3ff0b62d19127b664cb5ce483bca' \n",
|
||||||
|
"4 '4f21e2d67d3b1dce026c874c2ae69f6792eb30ae' \n",
|
||||||
|
"\n",
|
||||||
|
" Title \\\n",
|
||||||
|
"0 'Toshiba to sell less than 20 pct of chip unit... \n",
|
||||||
|
"1 'Alaska Air to record $82 million as merger-re... \n",
|
||||||
|
"2 'Delta Air Lines forecasts smaller drop in key... \n",
|
||||||
|
"3 'Water utility Severn Trent sees FY rewards be... \n",
|
||||||
|
"4 'German industry orders fall more than expecte... \n",
|
||||||
|
"\n",
|
||||||
|
" Text Index Label \n",
|
||||||
|
"0 'Industrials 25am EST Toshiba to sell less th... 0 -1 \n",
|
||||||
|
"1 'Alaska Air Group Inc ( ALK.N ) said on Wednes... 1 -1 \n",
|
||||||
|
"2 ' 20am EST Delta Air Lines forecasts smaller d... 2 -1 \n",
|
||||||
|
"3 'Business News - Tue Jan 31, 2017 - 8:26am GMT... 3 -1 \n",
|
||||||
|
"4 'Business News - Fri Jan 6, 2017 - 2:09am EST ... 4 -1 "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"filepath = '../data/cleaned_data_set_without_header.csv'\n",
|
||||||
|
"\n",
|
||||||
|
"df = pd.read_csv(filepath,\n",
|
||||||
|
" header=None,\n",
|
||||||
|
" sep='|',\n",
|
||||||
|
" engine='python',\n",
|
||||||
|
" usecols=[0,1,2],\n",
|
||||||
|
" names = [\"Timestamp\", \"Title\", \"Text\"],\n",
|
||||||
|
" decimal='.',\n",
|
||||||
|
" quotechar='\\'',\n",
|
||||||
|
" quoting=csv.QUOTE_NONE)\n",
|
||||||
|
"\n",
|
||||||
|
"n = len(df)\n",
|
||||||
|
"\n",
|
||||||
|
"# create new column with indices\n",
|
||||||
|
"df['Index'] = df.index.values\n",
|
||||||
|
"\n",
|
||||||
|
"# create new column and initialize with -1 for unlabeled samples\n",
|
||||||
|
"df['Label'] = np.full((n), -1)\n",
|
||||||
|
"\n",
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Now we load the previously created dictionary 'article_indices_mentions_companies.pkl'. It maps each distinct organization in the data set (key) to the list of indices of the articles in which that organization is mentioned."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# load pickle object of dict (company => [article numbers])\n",
|
||||||
|
"with open('../obj/article_indices_mentions_companies.pkl', 'rb') as input:\n",
|
||||||
|
" dict = pickle.load(input)\n",
|
||||||
|
"\n",
|
||||||
|
"# list of companies in insertion order\n",
|
||||||
|
"comp_list = list(dict)\n",
|
||||||
|
"\n",
|
||||||
|
"# number of companies\n",
|
||||||
|
"len_dict = len(dict)\n",
|
||||||
|
"\n",
|
||||||
|
"# list of indices of next articles\n",
|
||||||
|
"labeled = []\n",
|
||||||
|
"\n",
|
||||||
|
"# indices of articles that mention the already picked companies\n",
|
||||||
|
"black_list = []\n",
|
||||||
|
"\n",
|
||||||
|
"def pick_random_articles(n):\n",
|
||||||
|
" ''' returns list of n indices of the articles we can label next\n",
|
||||||
|
" '''\n",
|
||||||
|
" # pick n random articles about n different companies\n",
|
||||||
|
" i = 0\n",
|
||||||
|
" # list of chosen articles' indices\n",
|
||||||
|
" list_arts = []\n",
|
||||||
|
" while i < n:\n",
|
||||||
|
" # random company\n",
|
||||||
|
" rand_c = random.randint(0, len_dict)\n",
|
||||||
|
" # random article\n",
|
||||||
|
" rand_i = random.choice(dict[comp_list[rand_c]])\n",
|
||||||
|
" if rand_i not in (black_list or list_arts):\n",
|
||||||
|
" list_arts.append(rand_i)\n",
|
||||||
|
" black_list.extend(dict[comp_list[rand_c]])\n",
|
||||||
|
" i += 1\n",
|
||||||
|
" return list_arts\n",
|
||||||
|
"\n",
|
||||||
|
"def f(x):\n",
|
||||||
|
" # store user input\n",
|
||||||
|
" current_label = x\n",
|
||||||
|
"\n",
|
||||||
|
"# first round\n",
|
||||||
|
"label_next = pick_random_articles(10)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## - User-computer interaction starts here: -\n",
|
||||||
|
"### *** Please enter correct label manually: ***\n",
|
||||||
|
"- 1: merger of companies A and B\n",
|
||||||
|
"- 2: merger is pending\n",
|
||||||
|
"- 3: merger is aborted\n",
|
||||||
|
"- 4: sale of shares\n",
|
||||||
|
"- 5: merger as incidental remark, not main topic\n",
|
||||||
|
"- 6: other/irrelevant news"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": false
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"616 'Close Brothers sees strong first half, report...\n",
|
||||||
|
"Name: Title, dtype: object\n",
|
||||||
|
"\n",
|
||||||
|
"616 'Business News - Fri Jan 20, 2017 - 7:57am GMT...\n",
|
||||||
|
"Name: Text, dtype: object\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "586ec1797e9b4111a441c64e16c8326f",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/html": [
|
||||||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||||
|
" Widgets Documentation</a> for setup instructions.\n",
|
||||||
|
"</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in another frontend (for example, a static\n",
|
||||||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||||
|
"</p>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"8227 'Britain''s financial watchdog fines Merrill L...\n",
|
||||||
|
"Name: Title, dtype: object\n",
|
||||||
|
"\n",
|
||||||
|
"8227 ' 30 AM / in 14 minutes Britain''s financial w...\n",
|
||||||
|
"Name: Text, dtype: object\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"C:\\Users\\anne.lorenz\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:7: SettingWithCopyWarning: \n",
|
||||||
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||||
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||||
|
"\n",
|
||||||
|
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
|
||||||
|
" import sys\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "219feab0bae845a5993b1194f6f2107c",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/html": [
|
||||||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||||
|
" Widgets Documentation</a> for setup instructions.\n",
|
||||||
|
"</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in another frontend (for example, a static\n",
|
||||||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||||
|
"</p>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"4495 'Takata decides to file for bankruptcy - Japan...\n",
|
||||||
|
"Name: Title, dtype: object\n",
|
||||||
|
"\n",
|
||||||
|
"4495 'Bonds News - Sun Jun 25, 2017 - 6:56pm EDT Ta...\n",
|
||||||
|
"Name: Text, dtype: object\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "fdd5e247d2764f9d99966567f959cbf2",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/html": [
|
||||||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||||
|
" Widgets Documentation</a> for setup instructions.\n",
|
||||||
|
"</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in another frontend (for example, a static\n",
|
||||||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||||
|
"</p>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"7665 'Bombardier eyes Asian markets amid U.S. trade...\n",
|
||||||
|
"Name: Title, dtype: object\n",
|
||||||
|
"\n",
|
||||||
|
"7665 'October 5, 2017 / 1:22 PM / Updated 2 hours a...\n",
|
||||||
|
"Name: Text, dtype: object\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "d9f08a572e83449db5bfbb63981dc810",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/html": [
|
||||||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||||
|
" Widgets Documentation</a> for setup instructions.\n",
|
||||||
|
"</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in another frontend (for example, a static\n",
|
||||||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||||
|
"</p>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"9076 'Canadian regulator denies request to suspend ...\n",
|
||||||
|
"Name: Title, dtype: object\n",
|
||||||
|
"\n",
|
||||||
|
"9076 'TORONTO, Nov 23 (Reuters) - Canadas biggest s...\n",
|
||||||
|
"Name: Text, dtype: object\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "eac7e0d9e6264a3f99011742e7d50023",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/html": [
|
||||||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||||
|
" Widgets Documentation</a> for setup instructions.\n",
|
||||||
|
"</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in another frontend (for example, a static\n",
|
||||||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||||
|
"</p>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"8955 '.'\n",
|
||||||
|
"Name: Title, dtype: object\n",
|
||||||
|
"\n",
|
||||||
|
"8955 '(Corrects to make clear the comparison of dai...\n",
|
||||||
|
"Name: Text, dtype: object\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "46e776d0ae38403da38b471f8da28179",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/html": [
|
||||||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||||
|
" Widgets Documentation</a> for setup instructions.\n",
|
||||||
|
"</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in another frontend (for example, a static\n",
|
||||||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||||
|
"</p>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"5345 'European banks struggle to solve toxic shippi...\n",
|
||||||
|
"Name: Title, dtype: object\n",
|
||||||
|
"\n",
|
||||||
|
"5345 'July 24, 2017 / 6:07 AM / 34 minutes ago Euro...\n",
|
||||||
|
"Name: Text, dtype: object\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "3f1e6b3bf95549f9b26fae754e7cc4e6",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/html": [
|
||||||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||||
|
" Widgets Documentation</a> for setup instructions.\n",
|
||||||
|
"</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in another frontend (for example, a static\n",
|
||||||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||||
|
"</p>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"603 'PRESS DIGEST- New York Times business news - ...\n",
|
||||||
|
"Name: Title, dtype: object\n",
|
||||||
|
"\n",
|
||||||
|
"603 ' 17am EST PRESS DIGEST- New York Times busine...\n",
|
||||||
|
"Name: Text, dtype: object\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "6fdd9d92a04044c2b757c74c7205775f",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/html": [
|
||||||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||||
|
" Widgets Documentation</a> for setup instructions.\n",
|
||||||
|
"</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in another frontend (for example, a static\n",
|
||||||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||||
|
"</p>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"536 'UPDATE 2-Viacom names global entertainment gr...\n",
|
||||||
|
"Name: Title, dtype: object\n",
|
||||||
|
"\n",
|
||||||
|
"536 '(Adds detail from internal memo, changes sour...\n",
|
||||||
|
"Name: Text, dtype: object\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "02a38605cfd743428b34424428ef3b0e",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/html": [
|
||||||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||||
|
" Widgets Documentation</a> for setup instructions.\n",
|
||||||
|
"</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in another frontend (for example, a static\n",
|
||||||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||||
|
"</p>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"9396 'Fiat Chrysler in talks over potential diesel ...\n",
|
||||||
|
"Name: Title, dtype: object\n",
|
||||||
|
"\n",
|
||||||
|
"9396 'December 19, 2017 / 10:31 PM / Updated 20 min...\n",
|
||||||
|
"Name: Text, dtype: object\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"application/vnd.jupyter.widget-view+json": {
|
||||||
|
"model_id": "551f6fa037f44acc967165c56b03ae93",
|
||||||
|
"version_major": 2,
|
||||||
|
"version_minor": 0
|
||||||
|
},
|
||||||
|
"text/html": [
|
||||||
|
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||||
|
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||||
|
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||||
|
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||||
|
" Widgets Documentation</a> for setup instructions.\n",
|
||||||
|
"</p>\n",
|
||||||
|
"<p>\n",
|
||||||
|
" If you're reading this message in another frontend (for example, a static\n",
|
||||||
|
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||||
|
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||||
|
"</p>\n"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for index in label_next:\n",
|
||||||
|
" print(df.loc[df['Index'] == index]['Title'])\n",
|
||||||
|
" print()\n",
|
||||||
|
" print(df.loc[df['Index'] == index]['Text'])\n",
|
||||||
|
" # create widget\n",
|
||||||
|
" current_label = interact(f, x=6)\n",
|
||||||
|
" df.loc[df['Index'] == index]['Label'] = current_label\n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
|
@ -15,6 +15,7 @@ from collections import OrderedDict
|
||||||
import csv
|
import csv
|
||||||
import pickle
|
import pickle
|
||||||
import re
|
import re
|
||||||
|
import string
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
@ -37,6 +38,12 @@ class BagOfWords:
|
||||||
'''
|
'''
|
||||||
stemmer = PorterStemmer()
|
stemmer = PorterStemmer()
|
||||||
stop_words = BagOfWords.set_stop_words(stemming)
|
stop_words = BagOfWords.set_stop_words(stemming)
|
||||||
|
|
||||||
|
# ignore company names
|
||||||
|
company_names_list = BagOfWords.load_company_names()
|
||||||
|
for company in company_names_list:
|
||||||
|
text = text.replace(company, '')
|
||||||
|
|
||||||
# replace punctuation marks with spaces
|
# replace punctuation marks with spaces
|
||||||
words = re.sub(r'\W', ' ', text)
|
words = re.sub(r'\W', ' ', text)
|
||||||
# split str into list of single words
|
# split str into list of single words
|
||||||
|
@ -138,8 +145,18 @@ class BagOfWords:
|
||||||
# transform set to list
|
# transform set to list
|
||||||
return list(vocab)
|
return list(vocab)
|
||||||
|
|
||||||
|
def load_company_names():
    '''loads the pickled organizations dictionary and returns its keys
    as a list (the caller treats these keys as company names)
    '''
    # NOTE(review): pickle.load can execute arbitrary code while
    # unpickling; only use this with a trusted, locally generated file.
    with open('../obj/dict_organizations.pkl', 'rb') as pkl_file:
        organizations = pickle.load(pkl_file)
    # copy the keys into a new list without shadowing the builtins
    # `dict` / `list` / `input`
    return list(organizations.keys())
|
||||||
|
|
||||||
def set_stop_words(stemming=True):
|
def set_stop_words(stemming=True):
|
||||||
'''creates list of all words that will be ignored
|
'''creates list of all words that will be ignored:
|
||||||
|
stopwords, company names and other disturbing terms
|
||||||
'''
|
'''
|
||||||
# stopwords
|
# stopwords
|
||||||
stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
|
stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
|
||||||
|
@ -232,7 +249,7 @@ class BagOfWords:
|
||||||
n_dict[next_highest[0]] = next_highest[1]
|
n_dict[next_highest[0]] = next_highest[1]
|
||||||
|
|
||||||
# save n_dict object
|
# save n_dict object
|
||||||
with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
|
with open('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
|
||||||
pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
|
pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
return n_dict
|
return n_dict
|
||||||
|
@ -254,7 +271,7 @@ class BagOfWords:
|
||||||
return sum
|
return sum
|
||||||
|
|
||||||
def test():
|
def test():
|
||||||
file = 'data\\cleaned_data_set_without_header.csv'
|
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
df_dataset = pd.read_csv(file,
|
df_dataset = pd.read_csv(file,
|
||||||
delimiter='|',
|
delimiter='|',
|
||||||
header=None,
|
header=None,
|
|
@ -62,7 +62,7 @@ class CosineSimilarity:
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# read data set
|
# read data set
|
||||||
file = 'data\\cleaned_data_set_without_header.csv'
|
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
df = pd.read_csv(file,
|
df = pd.read_csv(file,
|
||||||
delimiter='|',
|
delimiter='|',
|
||||||
header=None,
|
header=None,
|
|
@ -131,7 +131,7 @@ class DecisionTree:
|
||||||
print('# starting decision tree')
|
print('# starting decision tree')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
file = 'data\\classification_labelled_corrected.csv'
|
file = '..\\data\\classification_labelled_corrected.csv'
|
||||||
|
|
||||||
# read csv file
|
# read csv file
|
||||||
print('# reading dataset')
|
print('# reading dataset')
|
|
@ -40,7 +40,7 @@ class FileHandler:
|
||||||
|
|
||||||
def create_labeling_dataset():
|
def create_labeling_dataset():
|
||||||
# output file
|
# output file
|
||||||
o_file = 'data\\cleaned_data_set_without_header.csv'
|
o_file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
# create file and write header
|
# create file and write header
|
||||||
with open(o_file, 'w', newline='') as csvfile:
|
with open(o_file, 'w', newline='') as csvfile:
|
||||||
writer = csv.writer(csvfile,
|
writer = csv.writer(csvfile,
|
||||||
|
@ -57,7 +57,7 @@ class FileHandler:
|
||||||
# number of articles to select from each month (10000/12=833,33)
|
# number of articles to select from each month (10000/12=833,33)
|
||||||
n_select = 833
|
n_select = 833
|
||||||
for m in FileHandler.months:
|
for m in FileHandler.months:
|
||||||
df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
|
df = pd.read_csv('..\\data\\articles\\all_{}.csv'.format(m),
|
||||||
delimiter='|',
|
delimiter='|',
|
||||||
header=0,
|
header=0,
|
||||||
index_col=None,
|
index_col=None,
|
||||||
|
@ -82,7 +82,7 @@ class FileHandler:
|
||||||
'''clean articles in data set: filter out all non-printable characters
|
'''clean articles in data set: filter out all non-printable characters
|
||||||
'''
|
'''
|
||||||
# read data set
|
# read data set
|
||||||
file = 'data\\cleaned_data_set_without_header.csv'
|
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
df = pd.read_csv(file,
|
df = pd.read_csv(file,
|
||||||
delimiter='|',
|
delimiter='|',
|
||||||
header=None,
|
header=None,
|
||||||
|
@ -114,7 +114,7 @@ class FileHandler:
|
||||||
'''remove articles with exactly same headline
|
'''remove articles with exactly same headline
|
||||||
'''
|
'''
|
||||||
# read data set
|
# read data set
|
||||||
file = 'data\\cleaned_data_set_without_header.csv'
|
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
df = pd.read_csv(file,
|
df = pd.read_csv(file,
|
||||||
delimiter='|',
|
delimiter='|',
|
||||||
header=None,
|
header=None,
|
||||||
|
@ -137,7 +137,7 @@ class FileHandler:
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
# save cleaned dataframe
|
# save cleaned dataframe
|
||||||
df.to_csv('data\\cleaned_data_set_without_header.csv',
|
df.to_csv('..\\data\\cleaned_data_set_without_header.csv',
|
||||||
header=False,
|
header=False,
|
||||||
index=False,
|
index=False,
|
||||||
sep='|',
|
sep='|',
|
||||||
|
@ -152,14 +152,14 @@ class FileHandler:
|
||||||
# reliable sources (site_sections)
|
# reliable sources (site_sections)
|
||||||
site_sections = []
|
site_sections = []
|
||||||
# read list from 'sections.txt' file
|
# read list from 'sections.txt' file
|
||||||
with open('data\\sections.txt', 'r') as s_list:
|
with open('..\\data\\sections.txt', 'r') as s_list:
|
||||||
site_sections = s_list.read().split('\n')
|
site_sections = s_list.read().split('\n')
|
||||||
|
|
||||||
# article counter
|
# article counter
|
||||||
a = 0
|
a = 0
|
||||||
for m in FileHandler.months:
|
for m in FileHandler.months:
|
||||||
# 1 output file per month
|
# 1 output file per month
|
||||||
output_file = 'data\\articles\\all_{}.csv'.format(m)
|
output_file = '..\\data\\articles\\all_{}.csv'.format(m)
|
||||||
# path of input JSON files per month
|
# path of input JSON files per month
|
||||||
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
|
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
|
||||||
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
|
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
|
|
@ -40,10 +40,10 @@ class NER:
|
||||||
|
|
||||||
def tag_words(text):
|
def tag_words(text):
|
||||||
# path to Stanford NER
|
# path to Stanford NER
|
||||||
stanford_classifier = 'stanford-ner-2018-02-27'\
|
stanford_classifier = '..\stanford-ner-2018-02-27'\
|
||||||
'\\classifiers'\
|
'\\classifiers'\
|
||||||
'\\english.all.3class.distsim.crf.ser.gz'
|
'\\english.all.3class.distsim.crf.ser.gz'
|
||||||
stanford_ner_path = 'stanford-ner-2018-02-27'\
|
stanford_ner_path = '..\stanford-ner-2018-02-27'\
|
||||||
'\\stanford-ner.jar'
|
'\\stanford-ner.jar'
|
||||||
# create tagger object
|
# create tagger object
|
||||||
st = StanfordNERTagger(stanford_classifier, stanford_ner_path,
|
st = StanfordNERTagger(stanford_classifier, stanford_ner_path,
|
||||||
|
@ -100,36 +100,55 @@ class NER:
|
||||||
'''
|
'''
|
||||||
print('# counting company names...')
|
print('# counting company names...')
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# dictionary of companies with their count
|
# dictionary of companies with their count
|
||||||
dict_com = {}
|
dict_com = {}
|
||||||
|
|
||||||
# list of company lists (one per article)
|
# list of company lists (one per article)
|
||||||
coms_list = []
|
coms_list = []
|
||||||
|
|
||||||
|
# dict of articles per company name
|
||||||
|
# (company name => list of indices of articles mentioning the company)
|
||||||
|
dict_mentions = {}
|
||||||
|
|
||||||
for i, text in enumerate(texts):
|
for i, text in enumerate(texts):
|
||||||
# list of found companies in article
|
# list of found companies in article
|
||||||
print('# article no. {}:'.format(i))
|
print('# article no. {}:'.format(i))
|
||||||
coms = NER.find_companies(text)
|
coms = NER.find_companies(text)
|
||||||
coms_list.append(coms)
|
coms_list.append(coms)
|
||||||
|
|
||||||
|
# annotate article number in dict
|
||||||
|
for com in coms:
|
||||||
|
if com in dict_mentions.keys():
|
||||||
|
dict_mentions[com].append(i)
|
||||||
|
else:
|
||||||
|
dict_mentions[com] = [i]
|
||||||
|
|
||||||
for com in coms:
|
for com in coms:
|
||||||
if com in dict_com.keys():
|
if com in dict_com.keys():
|
||||||
dict_com[com] += 1
|
dict_com[com] += 1
|
||||||
else:
|
else:
|
||||||
dict_com[com] = 1
|
dict_com[com] = 1
|
||||||
# print(coms_list)
|
|
||||||
# print()
|
|
||||||
# calculate number of company mentions per article
|
# calculate number of company mentions per article
|
||||||
num_companies = []
|
num_companies = []
|
||||||
for l in coms_list:
|
for l in coms_list:
|
||||||
num_companies.append(len(l))
|
num_companies.append(len(l))
|
||||||
|
|
||||||
# print(num_companies)
|
# print(num_companies)
|
||||||
print('# average number of different companies mentioned per article:')
|
print('# average number of different companies mentioned per article:')
|
||||||
print(sum(num_companies)/len(num_companies))
|
print(sum(num_companies)/len(num_companies))
|
||||||
print()
|
print()
|
||||||
|
|
||||||
|
# save dict_mentions of article indices per company name
|
||||||
|
with open('../obj/'+ 'article_indices_mentions_companies' + '.pkl', 'wb') as f:
|
||||||
|
pickle.dump(dict_mentions, f, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
# save num_companies object in file (for plotting)
|
# save num_companies object in file (for plotting)
|
||||||
with open('obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
|
with open('../obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
|
||||||
pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL)
|
pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL)
|
||||||
# save dict_com object in file (for plotting)
|
# save dict_com object in file (for plotting)
|
||||||
with open('obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
|
with open('../obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
|
||||||
pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL)
|
pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
#print(dict_com)
|
#print(dict_com)
|
||||||
|
@ -139,7 +158,7 @@ class NER:
|
||||||
|
|
||||||
def show_most_common_companies(n_commons=50):
|
def show_most_common_companies(n_commons=50):
|
||||||
# load pickle object
|
# load pickle object
|
||||||
with open('obj/dict_organizations.pkl', 'rb') as input:
|
with open('../obj/dict_organizations.pkl', 'rb') as input:
|
||||||
dict = pickle.load(input)
|
dict = pickle.load(input)
|
||||||
# sort dict by value
|
# sort dict by value
|
||||||
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
|
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
|
||||||
|
@ -154,21 +173,22 @@ class NER:
|
||||||
print(n_dict)
|
print(n_dict)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
# print('# starting NER...')
|
|
||||||
# print()
|
print('# starting NER...')
|
||||||
# # read data set
|
print()
|
||||||
# file = 'data\\cleaned_data_set_without_header.csv'
|
# read data set
|
||||||
# df = pd.read_csv(file,
|
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
# delimiter='|',
|
df = pd.read_csv(file,
|
||||||
# header=None,
|
delimiter='|',
|
||||||
# index_col=None,
|
header=None,
|
||||||
# engine='python',
|
index_col=None,
|
||||||
# # usecols=[1,2],
|
engine='python',
|
||||||
# # nrows=100,
|
# usecols=[1,2],
|
||||||
# quoting=csv.QUOTE_NONNUMERIC,
|
# nrows=100,
|
||||||
# quotechar='\'')
|
quoting=csv.QUOTE_NONNUMERIC,
|
||||||
# #print(df)
|
quotechar='\'')
|
||||||
# texts = df[1] + '. ' + df[2]
|
#print(df)
|
||||||
# NER.count_companies(texts)
|
texts = df[1] + '. ' + df[2]
|
||||||
# # NER.show_most_common_companies()
|
NER.count_companies(texts)
|
||||||
#print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
|
# NER.show_most_common_companies()
|
||||||
|
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
|
|
@ -187,7 +187,7 @@ class NaiveBayes:
|
||||||
print('# starting naive bayes')
|
print('# starting naive bayes')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
file = 'data\\classification_labelled_corrected.csv'
|
file = '..\\data\\classification_labelled_corrected.csv'
|
||||||
|
|
||||||
# read csv file
|
# read csv file
|
||||||
print('# reading dataset')
|
print('# reading dataset')
|
|
@ -5,6 +5,7 @@ Naive Bayes Classifier
|
||||||
basic implementation of naive bayes.
|
basic implementation of naive bayes.
|
||||||
prints out probabilities for classes needed for interactive labeling.
|
prints out probabilities for classes needed for interactive labeling.
|
||||||
'''
|
'''
|
||||||
|
from BagOfWords import BagOfWords
|
||||||
|
|
||||||
import csv
|
import csv
|
||||||
|
|
||||||
|
@ -15,9 +16,9 @@ from sklearn.metrics import recall_score, precision_score
|
||||||
from sklearn.model_selection import StratifiedKFold
|
from sklearn.model_selection import StratifiedKFold
|
||||||
from sklearn.naive_bayes import GaussianNB
|
from sklearn.naive_bayes import GaussianNB
|
||||||
|
|
||||||
class NaiveBayes_Interactive:
|
class NaiveBayesInteractive:
|
||||||
|
|
||||||
def make_naive_bayes(dataset, sklearn_cv=True, percentile=100):
|
def make_naive_bayes(dataset, sklearn_cv=False, percentile=100):
|
||||||
'''fits naive bayes model
|
'''fits naive bayes model
|
||||||
'''
|
'''
|
||||||
print('# fitting model')
|
print('# fitting model')
|
||||||
|
@ -178,7 +179,7 @@ class NaiveBayes_Interactive:
|
||||||
print('# starting naive bayes')
|
print('# starting naive bayes')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
file = 'data\\classification_labelled_corrected.csv'
|
file = '..\data\\classification_labelled_corrected.csv'
|
||||||
|
|
||||||
# read csv file
|
# read csv file
|
||||||
print('# reading dataset')
|
print('# reading dataset')
|
||||||
|
@ -191,8 +192,10 @@ class NaiveBayes_Interactive:
|
||||||
quotechar='\'',
|
quotechar='\'',
|
||||||
quoting=csv.QUOTE_NONE)
|
quoting=csv.QUOTE_NONE)
|
||||||
|
|
||||||
use_count_vectorizer = True
|
# training options
|
||||||
|
use_count_vectorizer = False
|
||||||
select_percentile = 100
|
select_percentile = 100
|
||||||
|
|
||||||
make_naive_bayes(data, use_count_vectorizer, select_percentile)
|
make_naive_bayes(data, use_count_vectorizer, select_percentile)
|
||||||
|
|
||||||
print('#')
|
print('#')
|
|
@ -93,7 +93,7 @@ class SVM:
|
||||||
print('# starting svm')
|
print('# starting svm')
|
||||||
print('# ...')
|
print('# ...')
|
||||||
|
|
||||||
file = 'data\\classification_labelled_corrected.csv'
|
file = '..\\data\\classification_labelled_corrected.csv'
|
||||||
|
|
||||||
# read csv file
|
# read csv file
|
||||||
print('# reading dataset')
|
print('# reading dataset')
|
|
@ -30,7 +30,7 @@ class VisualizerNews:
|
||||||
print('# preparing word cloud of 200 most common words...')
|
print('# preparing word cloud of 200 most common words...')
|
||||||
print()
|
print()
|
||||||
# load new data set
|
# load new data set
|
||||||
file = 'data\\cleaned_data_set_without_header.csv'
|
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
df_dataset = pd.read_csv(file,
|
df_dataset = pd.read_csv(file,
|
||||||
delimiter='|',
|
delimiter='|',
|
||||||
header=None,
|
header=None,
|
||||||
|
@ -53,7 +53,7 @@ class VisualizerNews:
|
||||||
dict = BagOfWords.make_dict_common_words(matrix, 200,
|
dict = BagOfWords.make_dict_common_words(matrix, 200,
|
||||||
rel_freq, stemming)
|
rel_freq, stemming)
|
||||||
# save dict object
|
# save dict object
|
||||||
with open('obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
|
with open('../obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
|
||||||
pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
|
pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
wordcloud = WordCloud(background_color='white',
|
wordcloud = WordCloud(background_color='white',
|
||||||
|
@ -82,7 +82,7 @@ class VisualizerNews:
|
||||||
print('# preparing histogram of company mentions...')
|
print('# preparing histogram of company mentions...')
|
||||||
print()
|
print()
|
||||||
# # read data set
|
# # read data set
|
||||||
# file = 'data\\cleaned_data_set_without_header.csv'
|
# file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
# df = pd.read_csv(file,
|
# df = pd.read_csv(file,
|
||||||
# delimiter='|',
|
# delimiter='|',
|
||||||
# header=None,
|
# header=None,
|
||||||
|
@ -107,7 +107,7 @@ class VisualizerNews:
|
||||||
# names = np.asarray(count_names)
|
# names = np.asarray(count_names)
|
||||||
|
|
||||||
# load pickle object
|
# load pickle object
|
||||||
with open('obj/dict_organizations.pkl', 'rb') as input:
|
with open('../obj/dict_organizations.pkl', 'rb') as input:
|
||||||
dict = pickle.load(input)
|
dict = pickle.load(input)
|
||||||
# make list of dict's values
|
# make list of dict's values
|
||||||
count_companies = list(dict.values())
|
count_companies = list(dict.values())
|
||||||
|
@ -129,9 +129,9 @@ class VisualizerNews:
|
||||||
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
||||||
|
|
||||||
# save to file
|
# save to file
|
||||||
plt.savefig('visualization\\NER_{}.eps'
|
plt.savefig('..\\visualization\\NER_{}.eps'
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.savefig('visualization\\NER_{}.png'
|
plt.savefig('..\\visualization\\NER_{}.png'
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
@ -143,7 +143,7 @@ class VisualizerNews:
|
||||||
print('# preparing histogram of text lengths...')
|
print('# preparing histogram of text lengths...')
|
||||||
print()
|
print()
|
||||||
# read data set
|
# read data set
|
||||||
filepath = 'data\\cleaned_data_set_without_header.csv'
|
filepath = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
df_dataset = pd.read_csv(filepath,
|
df_dataset = pd.read_csv(filepath,
|
||||||
delimiter='|',
|
delimiter='|',
|
||||||
header=None,
|
header=None,
|
||||||
|
@ -183,9 +183,9 @@ class VisualizerNews:
|
||||||
plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
|
plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
|
||||||
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
||||||
# save plot
|
# save plot
|
||||||
plt.savefig('visualization\\TextLength_{}.eps'\
|
plt.savefig('..\\visualization\\TextLength_{}.eps'\
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.savefig('visualization\\TextLength_{}.png'\
|
plt.savefig('..\\visualization\\TextLength_{}.png'\
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
@ -195,7 +195,7 @@ class VisualizerNews:
|
||||||
print()
|
print()
|
||||||
|
|
||||||
# load data set
|
# load data set
|
||||||
filepath = 'data\\cleaned_data_set_without_header.csv'
|
filepath = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
df_dataset = pd.read_csv(filepath,
|
df_dataset = pd.read_csv(filepath,
|
||||||
delimiter='|',
|
delimiter='|',
|
||||||
header=None,
|
header=None,
|
||||||
|
@ -229,14 +229,14 @@ class VisualizerNews:
|
||||||
|
|
||||||
plt.setp(autotexts, size=8, weight="bold")
|
plt.setp(autotexts, size=8, weight="bold")
|
||||||
plt.show()
|
plt.show()
|
||||||
plt.savefig('Sites_{}.pdf'.format(VisualizerNews.datestring))
|
plt.savefig('..\\visualization\\Sites_{}.pdf'.format(VisualizerNews.datestring))
|
||||||
plt.savefig('Sites_{}.pgf'.format(VisualizerNews.datestring))
|
plt.savefig('..\\visualization\\Sites_{}.pgf'.format(VisualizerNews.datestring))
|
||||||
|
|
||||||
def plot_hist_most_common_words(n_commons = 10):
|
def plot_hist_most_common_words(n_commons = 10):
|
||||||
print('# preparing histogram of most common words...')
|
print('# preparing histogram of most common words...')
|
||||||
print()
|
print()
|
||||||
# # load data set
|
# # load data set
|
||||||
# filepath = 'data\\cleaned_data_set_without_header.csv'
|
# filepath = '..\\data\\cleaned_data_set_without_header.csv'
|
||||||
# df_dataset = pd.read_csv(filepath,
|
# df_dataset = pd.read_csv(filepath,
|
||||||
# delimiter='|',
|
# delimiter='|',
|
||||||
# header=None,
|
# header=None,
|
||||||
|
@ -264,7 +264,7 @@ class VisualizerNews:
|
||||||
# pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
|
# pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
# load pickle object
|
# load pickle object
|
||||||
with open ('obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
|
with open ('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
|
||||||
dict = pickle.load(i)
|
dict = pickle.load(i)
|
||||||
# sort dict by value
|
# sort dict by value
|
||||||
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
|
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
|
||||||
|
@ -287,9 +287,9 @@ class VisualizerNews:
|
||||||
height=numbers,
|
height=numbers,
|
||||||
tick_label=labels,
|
tick_label=labels,
|
||||||
facecolor='royalblue')
|
facecolor='royalblue')
|
||||||
plt.savefig('visualization\\10_most_common_words_{}.eps'
|
plt.savefig('..\\visualization\\10_most_common_words_{}.eps'
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.savefig('visualization\\10_most_common_words_{}.png'
|
plt.savefig('..\\visualization\\10_most_common_words_{}.png'
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
@ -299,7 +299,7 @@ class VisualizerNews:
|
||||||
'''
|
'''
|
||||||
# list of number of different companies per article (int)
|
# list of number of different companies per article (int)
|
||||||
list = []
|
list = []
|
||||||
with open('obj/num_mentions_companies.pkl', 'rb') as input:
|
with open('../obj/num_mentions_companies.pkl', 'rb') as input:
|
||||||
list = pickle.load(input)
|
list = pickle.load(input)
|
||||||
|
|
||||||
# sort list in descending order
|
# sort list in descending order
|
||||||
|
@ -320,9 +320,9 @@ class VisualizerNews:
|
||||||
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
||||||
|
|
||||||
# save to file
|
# save to file
|
||||||
plt.savefig('visualization\\NER_2_{}.eps'
|
plt.savefig('..\\visualization\\NER_2_{}.eps'
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.savefig('visualization\\NER_2_{}.png'
|
plt.savefig('..\\visualization\\NER_2_{}.png'
|
||||||
.format(VisualizerNews.datestring))
|
.format(VisualizerNews.datestring))
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
Loading…
Reference in New Issue