new folder
This commit is contained in:
parent
29dabecb9e
commit
417a26d114
10000
data/cleaned_data_set_without_duplicates.csv
Normal file
10000
data/cleaned_data_set_without_duplicates.csv
Normal file
File diff suppressed because one or more lines are too long
BIN
obj/article_indices_mentions_companies.pkl
Normal file
BIN
obj/article_indices_mentions_companies.pkl
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
717
src/2018-12-01-al-interactive-labeling.ipynb
Normal file
717
src/2018-12-01-al-interactive-labeling.ipynb
Normal file
@ -0,0 +1,717 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Interactive Labeling using Naive Bayes Classifier\n",
|
||||
"\n",
|
||||
"This Jupyter Notebook combines a manual and automated labeling technique.\n",
|
||||
"It includes a basic implementation of Naive Bayes Classifier.\n",
|
||||
"By calculating class probabilities, we decide whether a news article has to be labeled manually or automatically.\n",
|
||||
"\n",
|
||||
"For the multi-class classification we use the following 6 classes:\n",
|
||||
"\n",
|
||||
"* 1: merger of company A and B\n",
|
||||
"* 2: merger is pending\n",
|
||||
"* 3: merger is aborted\n",
|
||||
"* 4: sale of shares\n",
|
||||
"* 5: merger as incidental remark, not main topic\n",
|
||||
"* 6: other / irrelevant news"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from collections import OrderedDict\n",
|
||||
"import csv\n",
|
||||
"import pickle\n",
|
||||
"import random\n",
|
||||
"\n",
|
||||
"from ipywidgets import interact, interactive, fixed, interact_manual\n",
|
||||
"import ipywidgets as widgets\n",
|
||||
"from IPython.display import display\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||
"from sklearn.feature_selection import SelectPercentile\n",
|
||||
"from sklearn.metrics import recall_score, precision_score\n",
|
||||
"from sklearn.model_selection import StratifiedKFold\n",
|
||||
"from sklearn.naive_bayes import GaussianNB\n",
|
||||
"\n",
|
||||
"from FileHandler import FileHandler\n",
|
||||
"from NaiveBayesInteractive import NaiveBayesInteractive"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"First, we import our data set of 10 000 business news articles from a csv file.\n",
|
||||
"It contains 833/834 articles of each month of the year 2017.\n",
|
||||
"For detailed information regarding the data set, please read the full documentation."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style>\n",
|
||||
" .dataframe thead tr:only-child th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: left;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Timestamp</th>\n",
|
||||
" <th>Title</th>\n",
|
||||
" <th>Text</th>\n",
|
||||
" <th>Index</th>\n",
|
||||
" <th>Label</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>'7f565b5746ba33be1784c7d3d6a7ede0c9cd81a2'</td>\n",
|
||||
" <td>'Toshiba to sell less than 20 pct of chip unit...</td>\n",
|
||||
" <td>'Industrials 25am EST Toshiba to sell less th...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>'64e474522a8fbcdbb86a829a9c5708d3dd76e04b'</td>\n",
|
||||
" <td>'Alaska Air to record $82 million as merger-re...</td>\n",
|
||||
" <td>'Alaska Air Group Inc ( ALK.N ) said on Wednes...</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>'244f708215c689f2fb7fa502434743a5410a254b'</td>\n",
|
||||
" <td>'Delta Air Lines forecasts smaller drop in key...</td>\n",
|
||||
" <td>' 20am EST Delta Air Lines forecasts smaller d...</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>'4a55c5a8cbbf3ff0b62d19127b664cb5ce483bca'</td>\n",
|
||||
" <td>'Water utility Severn Trent sees FY rewards be...</td>\n",
|
||||
" <td>'Business News - Tue Jan 31, 2017 - 8:26am GMT...</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>'4f21e2d67d3b1dce026c874c2ae69f6792eb30ae'</td>\n",
|
||||
" <td>'German industry orders fall more than expecte...</td>\n",
|
||||
" <td>'Business News - Fri Jan 6, 2017 - 2:09am EST ...</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Timestamp \\\n",
|
||||
"0 '7f565b5746ba33be1784c7d3d6a7ede0c9cd81a2' \n",
|
||||
"1 '64e474522a8fbcdbb86a829a9c5708d3dd76e04b' \n",
|
||||
"2 '244f708215c689f2fb7fa502434743a5410a254b' \n",
|
||||
"3 '4a55c5a8cbbf3ff0b62d19127b664cb5ce483bca' \n",
|
||||
"4 '4f21e2d67d3b1dce026c874c2ae69f6792eb30ae' \n",
|
||||
"\n",
|
||||
" Title \\\n",
|
||||
"0 'Toshiba to sell less than 20 pct of chip unit... \n",
|
||||
"1 'Alaska Air to record $82 million as merger-re... \n",
|
||||
"2 'Delta Air Lines forecasts smaller drop in key... \n",
|
||||
"3 'Water utility Severn Trent sees FY rewards be... \n",
|
||||
"4 'German industry orders fall more than expecte... \n",
|
||||
"\n",
|
||||
" Text Index Label \n",
|
||||
"0 'Industrials 25am EST Toshiba to sell less th... 0 -1 \n",
|
||||
"1 'Alaska Air Group Inc ( ALK.N ) said on Wednes... 1 -1 \n",
|
||||
"2 ' 20am EST Delta Air Lines forecasts smaller d... 2 -1 \n",
|
||||
"3 'Business News - Tue Jan 31, 2017 - 8:26am GMT... 3 -1 \n",
|
||||
"4 'Business News - Fri Jan 6, 2017 - 2:09am EST ... 4 -1 "
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"filepath = '../data/cleaned_data_set_without_header.csv'\n",
|
||||
"\n",
|
||||
"df = pd.read_csv(filepath,\n",
|
||||
" header=None,\n",
|
||||
" sep='|',\n",
|
||||
" engine='python',\n",
|
||||
" usecols=[0,1,2],\n",
|
||||
" names = [\"Timestamp\", \"Title\", \"Text\"],\n",
|
||||
" decimal='.',\n",
|
||||
" quotechar='\\'',\n",
|
||||
" quoting=csv.QUOTE_NONE)\n",
|
||||
"\n",
|
||||
"n = len(df)\n",
|
||||
"\n",
|
||||
"# create new column with indices\n",
|
||||
"df['Index'] = df.index.values\n",
|
||||
"\n",
|
||||
"# create new column and initialize with -1 for unlabeled samples\n",
|
||||
"df['Label'] = np.full((n), -1)\n",
|
||||
"\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Now we load the previously created dictionary 'article_indices_mentions_companies.pkl'. It is a dictionary of all the different organizations in the data set (keys), each mapped to the list of indices of the articles in which that organization is mentioned."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# load pickle object of dict (company => [article numbers])\n",
|
||||
"with open('../obj/article_indices_mentions_companies.pkl', 'rb') as input:\n",
|
||||
" dict = pickle.load(input)\n",
|
||||
"\n",
|
||||
"# list of companies in insertion order\n",
|
||||
"comp_list = list(dict)\n",
|
||||
"\n",
|
||||
"# number of companies\n",
|
||||
"len_dict = len(dict)\n",
|
||||
"\n",
|
||||
"# list of indices of next articles\n",
|
||||
"labeled = []\n",
|
||||
"\n",
|
||||
"# indices of articles that mention the already picked companies\n",
|
||||
"black_list = []\n",
|
||||
"\n",
|
||||
"def pick_random_articles(n):\n",
|
||||
"    ''' returns list of n indices of the articles we can label next\n",
|
||||
"\n",
|
||||
"    Each round draws a random company and then a random article that\n",
|
||||
"    mentions it. The article is accepted only if it is neither in the\n",
|
||||
"    module-level black_list nor already chosen in this call. On\n",
|
||||
"    acceptance, all articles mentioning the drawn company are appended\n",
|
||||
"    to black_list.\n",
|
||||
"    '''\n",
|
||||
"    # list of chosen articles' indices\n",
|
||||
"    list_arts = []\n",
|
||||
"    while len(list_arts) < n:\n",
|
||||
"        # random company; randrange excludes len_dict itself, whereas\n",
|
||||
"        # randint(0, len_dict) could return an index one past the end\n",
|
||||
"        rand_c = random.randrange(len_dict)\n",
|
||||
"        # random article\n",
|
||||
"        rand_i = random.choice(dict[comp_list[rand_c]])\n",
|
||||
"        # test both lists explicitly: (black_list or list_arts) evaluates\n",
|
||||
"        # to only one of the two lists\n",
|
||||
"        if rand_i not in black_list and rand_i not in list_arts:\n",
|
||||
"            list_arts.append(rand_i)\n",
|
||||
"            black_list.extend(dict[comp_list[rand_c]])\n",
|
||||
"    return list_arts\n",
|
||||
"\n",
|
||||
"def f(x):\n",
|
||||
" # store user input\n",
|
||||
" current_label = x\n",
|
||||
"\n",
|
||||
"# first round\n",
|
||||
"label_next = pick_random_articles(10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## - Here starts user computer interaction: -\n",
|
||||
"### *** Please enter correct label manually: ***\n",
|
||||
"- 1: merger of companies A and B\n",
|
||||
"- 2: merger is pending\n",
|
||||
"- 3: merger is aborted\n",
|
||||
"- 4: sale of shares\n",
|
||||
"- 5: merger as incidental remark, not main topic\n",
|
||||
"- 6: other/irrelevant news"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"616 'Close Brothers sees strong first half, report...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"616 'Business News - Fri Jan 20, 2017 - 7:57am GMT...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "586ec1797e9b4111a441c64e16c8326f",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"8227 'Britain''s financial watchdog fines Merrill L...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"8227 ' 30 AM / in 14 minutes Britain''s financial w...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\anne.lorenz\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:7: SettingWithCopyWarning: \n",
|
||||
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
||||
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
||||
"\n",
|
||||
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
|
||||
" import sys\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "219feab0bae845a5993b1194f6f2107c",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"4495 'Takata decides to file for bankruptcy - Japan...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"4495 'Bonds News - Sun Jun 25, 2017 - 6:56pm EDT Ta...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "fdd5e247d2764f9d99966567f959cbf2",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"7665 'Bombardier eyes Asian markets amid U.S. trade...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"7665 'October 5, 2017 / 1:22 PM / Updated 2 hours a...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "d9f08a572e83449db5bfbb63981dc810",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"9076 'Canadian regulator denies request to suspend ...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"9076 'TORONTO, Nov 23 (Reuters) - Canadas biggest s...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "eac7e0d9e6264a3f99011742e7d50023",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"8955 '.'\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"8955 '(Corrects to make clear the comparison of dai...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "46e776d0ae38403da38b471f8da28179",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"5345 'European banks struggle to solve toxic shippi...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"5345 'July 24, 2017 / 6:07 AM / 34 minutes ago Euro...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "3f1e6b3bf95549f9b26fae754e7cc4e6",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"603 'PRESS DIGEST- New York Times business news - ...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"603 ' 17am EST PRESS DIGEST- New York Times busine...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "6fdd9d92a04044c2b757c74c7205775f",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"536 'UPDATE 2-Viacom names global entertainment gr...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"536 '(Adds detail from internal memo, changes sour...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "02a38605cfd743428b34424428ef3b0e",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"9396 'Fiat Chrysler in talks over potential diesel ...\n",
|
||||
"Name: Title, dtype: object\n",
|
||||
"\n",
|
||||
"9396 'December 19, 2017 / 10:31 PM / Updated 20 min...\n",
|
||||
"Name: Text, dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "551f6fa037f44acc967165c56b03ae93",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/html": [
|
||||
"<p>Failed to display Jupyter Widget of type <code>interactive</code>.</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in the Jupyter Notebook or JupyterLab Notebook, it may mean\n",
|
||||
" that the widgets JavaScript is still loading. If this message persists, it\n",
|
||||
" likely means that the widgets JavaScript library is either not installed or\n",
|
||||
" not enabled. See the <a href=\"https://ipywidgets.readthedocs.io/en/stable/user_install.html\">Jupyter\n",
|
||||
" Widgets Documentation</a> for setup instructions.\n",
|
||||
"</p>\n",
|
||||
"<p>\n",
|
||||
" If you're reading this message in another frontend (for example, a static\n",
|
||||
" rendering on GitHub or <a href=\"https://nbviewer.jupyter.org/\">NBViewer</a>),\n",
|
||||
" it may mean that your frontend doesn't currently support widgets.\n",
|
||||
"</p>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"interactive(children=(IntSlider(value=6, description='x', max=18, min=-6), Output()), _dom_classes=('widget-interact',))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def make_label_setter(index):\n",
|
||||
"    '''returns a callback that stores the chosen label for article index\n",
|
||||
"    '''\n",
|
||||
"    def set_label(x):\n",
|
||||
"        # one .loc call with row mask and column label writes into df\n",
|
||||
"        # itself; df.loc[mask]['Label'] = x only modifies a temporary copy\n",
|
||||
"        df.loc[df['Index'] == index, 'Label'] = x\n",
|
||||
"    return set_label\n",
|
||||
"\n",
|
||||
"for index in label_next:\n",
|
||||
"    # take the single matching row so the full title and text are printed\n",
|
||||
"    # instead of the truncated Series representation\n",
|
||||
"    article = df.loc[df['Index'] == index].iloc[0]\n",
|
||||
"    print(article['Title'])\n",
|
||||
"    print()\n",
|
||||
"    print(article['Text'])\n",
|
||||
"    # create widget: slider limited to the six valid classes; the label is\n",
|
||||
"    # only stored once the user confirms with the button\n",
|
||||
"    interact_manual(make_label_setter(index),\n",
|
||||
"                    x=widgets.IntSlider(min=1, max=6, value=6))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -15,6 +15,7 @@ from collections import OrderedDict
|
||||
import csv
|
||||
import pickle
|
||||
import re
|
||||
import string
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
@ -37,6 +38,12 @@ class BagOfWords:
|
||||
'''
|
||||
stemmer = PorterStemmer()
|
||||
stop_words = BagOfWords.set_stop_words(stemming)
|
||||
|
||||
# ignore company names
|
||||
company_names_list = BagOfWords.load_company_names()
|
||||
for company in company_names_list:
|
||||
text = text.replace(company, '')
|
||||
|
||||
# replace punctuation marks with spaces
|
||||
words = re.sub(r'\W', ' ', text)
|
||||
# split str into list of single words
|
||||
@ -138,8 +145,18 @@ class BagOfWords:
|
||||
# transform set to list
|
||||
return list(vocab)
|
||||
|
||||
def load_company_names():
|
||||
# load pickle object of organizations
|
||||
with open('../obj/dict_organizations.pkl', 'rb') as input:
|
||||
dict = pickle.load(input)
|
||||
list = []
|
||||
for key in dict.keys():
|
||||
list.append(key)
|
||||
return list
|
||||
|
||||
def set_stop_words(stemming=True):
|
||||
'''creates list of all words that will be ignored
|
||||
'''creates list of all words that will be ignored:
|
||||
stopwords, company names and other disturbing terms
|
||||
'''
|
||||
# stopwords
|
||||
stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
|
||||
@ -232,7 +249,7 @@ class BagOfWords:
|
||||
n_dict[next_highest[0]] = next_highest[1]
|
||||
|
||||
# save n_dict object
|
||||
with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
|
||||
with open('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
|
||||
pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
return n_dict
|
||||
@ -254,7 +271,7 @@ class BagOfWords:
|
||||
return sum
|
||||
|
||||
def test():
|
||||
file = 'data\\cleaned_data_set_without_header.csv'
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df_dataset = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
@ -62,7 +62,7 @@ class CosineSimilarity:
|
||||
|
||||
if __name__ == '__main__':
|
||||
# read data set
|
||||
file = 'data\\cleaned_data_set_without_header.csv'
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
@ -131,7 +131,7 @@ class DecisionTree:
|
||||
print('# starting decision tree')
|
||||
print('# ...')
|
||||
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
file = '..\\data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
@ -40,7 +40,7 @@ class FileHandler:
|
||||
|
||||
def create_labeling_dataset():
|
||||
# output file
|
||||
o_file = 'data\\cleaned_data_set_without_header.csv'
|
||||
o_file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
# create file and write header
|
||||
with open(o_file, 'w', newline='') as csvfile:
|
||||
writer = csv.writer(csvfile,
|
||||
@ -57,7 +57,7 @@ class FileHandler:
|
||||
# number of articles to select from each month (10000/12=833,33)
|
||||
n_select = 833
|
||||
for m in FileHandler.months:
|
||||
df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
|
||||
df = pd.read_csv('..\\data\\articles\\all_{}.csv'.format(m),
|
||||
delimiter='|',
|
||||
header=0,
|
||||
index_col=None,
|
||||
@ -82,7 +82,7 @@ class FileHandler:
|
||||
'''clean articles in data set: filter out all non-printable characters
|
||||
'''
|
||||
# read data set
|
||||
file = 'data\\cleaned_data_set_without_header.csv'
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
@ -114,7 +114,7 @@ class FileHandler:
|
||||
'''remove articles with exactly same headline
|
||||
'''
|
||||
# read data set
|
||||
file = 'data\\cleaned_data_set_without_header.csv'
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
@ -137,7 +137,7 @@ class FileHandler:
|
||||
i += 1
|
||||
|
||||
# save cleaned dataframe
|
||||
df.to_csv('data\\cleaned_data_set_without_header.csv',
|
||||
df.to_csv('..\\data\\cleaned_data_set_without_header.csv',
|
||||
header=False,
|
||||
index=False,
|
||||
sep='|',
|
||||
@ -152,14 +152,14 @@ class FileHandler:
|
||||
# reliable sources (site_sections)
|
||||
site_sections = []
|
||||
# read list from 'sections.txt' file
|
||||
with open('data\\sections.txt', 'r') as s_list:
|
||||
with open('..\\data\\sections.txt', 'r') as s_list:
|
||||
site_sections = s_list.read().split('\n')
|
||||
|
||||
# article counter
|
||||
a = 0
|
||||
for m in FileHandler.months:
|
||||
# 1 output file per month
|
||||
output_file = 'data\\articles\\all_{}.csv'.format(m)
|
||||
output_file = '..\\data\\articles\\all_{}.csv'.format(m)
|
||||
# path of input JSON files per month
|
||||
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
|
||||
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
|
@ -40,10 +40,10 @@ class NER:
|
||||
|
||||
def tag_words(text):
|
||||
# path to Stanford NER
|
||||
stanford_classifier = 'stanford-ner-2018-02-27'\
|
||||
stanford_classifier = '..\stanford-ner-2018-02-27'\
|
||||
'\\classifiers'\
|
||||
'\\english.all.3class.distsim.crf.ser.gz'
|
||||
stanford_ner_path = 'stanford-ner-2018-02-27'\
|
||||
stanford_ner_path = '..\stanford-ner-2018-02-27'\
|
||||
'\\stanford-ner.jar'
|
||||
# create tagger object
|
||||
st = StanfordNERTagger(stanford_classifier, stanford_ner_path,
|
||||
@ -100,36 +100,55 @@ class NER:
|
||||
'''
|
||||
print('# counting company names...')
|
||||
print()
|
||||
|
||||
# dictionary of companies with their count
|
||||
dict_com = {}
|
||||
|
||||
# list of company lists (one per article)
|
||||
coms_list = []
|
||||
|
||||
# dict of articles per company name
|
||||
# (company name => list of indices of articles mentioning the company)
|
||||
dict_mentions = {}
|
||||
|
||||
for i, text in enumerate(texts):
|
||||
# list of found companies in article
|
||||
print('# article no. {}:'.format(i))
|
||||
coms = NER.find_companies(text)
|
||||
coms_list.append(coms)
|
||||
|
||||
# annotate article number in dict
|
||||
for com in coms:
|
||||
if com in dict_mentions.keys():
|
||||
dict_mentions[com].append(i)
|
||||
else:
|
||||
dict_mentions[com] = [i]
|
||||
|
||||
for com in coms:
|
||||
if com in dict_com.keys():
|
||||
dict_com[com] += 1
|
||||
else:
|
||||
dict_com[com] = 1
|
||||
# print(coms_list)
|
||||
# print()
|
||||
|
||||
# calculate number of company mentions per article
|
||||
num_companies = []
|
||||
for l in coms_list:
|
||||
num_companies.append(len(l))
|
||||
|
||||
# print(num_companies)
|
||||
print('# average number of different companies mentioned per article:')
|
||||
print(sum(num_companies)/len(num_companies))
|
||||
print()
|
||||
|
||||
# save dict_mentions of article indices per company name
|
||||
with open('../obj/'+ 'article_indices_mentions_companies' + '.pkl', 'wb') as f:
|
||||
pickle.dump(dict_mentions, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
# save num_companies object in file (for plotting)
|
||||
with open('obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
|
||||
with open('../obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
|
||||
pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL)
|
||||
# save dict_com object in file (for plotting)
|
||||
with open('obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
|
||||
with open('../obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
|
||||
pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
#print(dict_com)
|
||||
@ -139,7 +158,7 @@ class NER:
|
||||
|
||||
def show_most_common_companies(n_commons=50):
|
||||
# load pickle object
|
||||
with open('obj/dict_organizations.pkl', 'rb') as input:
|
||||
with open('../obj/dict_organizations.pkl', 'rb') as input:
|
||||
dict = pickle.load(input)
|
||||
# sort dict by value
|
||||
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
|
||||
@ -154,21 +173,22 @@ class NER:
|
||||
print(n_dict)
|
||||
|
||||
if __name__ == '__main__':
|
||||
# print('# starting NER...')
|
||||
# print()
|
||||
# # read data set
|
||||
# file = 'data\\cleaned_data_set_without_header.csv'
|
||||
# df = pd.read_csv(file,
|
||||
# delimiter='|',
|
||||
# header=None,
|
||||
# index_col=None,
|
||||
# engine='python',
|
||||
# # usecols=[1,2],
|
||||
# # nrows=100,
|
||||
# quoting=csv.QUOTE_NONNUMERIC,
|
||||
# quotechar='\'')
|
||||
# #print(df)
|
||||
# texts = df[1] + '. ' + df[2]
|
||||
# NER.count_companies(texts)
|
||||
# # NER.show_most_common_companies()
|
||||
#print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
|
||||
|
||||
print('# starting NER...')
|
||||
print()
|
||||
# read data set
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
index_col=None,
|
||||
engine='python',
|
||||
# usecols=[1,2],
|
||||
# nrows=100,
|
||||
quoting=csv.QUOTE_NONNUMERIC,
|
||||
quotechar='\'')
|
||||
#print(df)
|
||||
texts = df[1] + '. ' + df[2]
|
||||
NER.count_companies(texts)
|
||||
# NER.show_most_common_companies()
|
||||
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
|
@ -187,7 +187,7 @@ class NaiveBayes:
|
||||
print('# starting naive bayes')
|
||||
print('# ...')
|
||||
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
file = '..\\data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
@ -5,6 +5,7 @@ Naive Bayes Classifier
|
||||
basic implementation of naive bayes.
|
||||
prints out probabilities for classes needed for interactive labeling.
|
||||
'''
|
||||
from BagOfWords import BagOfWords
|
||||
|
||||
import csv
|
||||
|
||||
@ -15,9 +16,9 @@ from sklearn.metrics import recall_score, precision_score
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
|
||||
class NaiveBayes_Interactive:
|
||||
class NaiveBayesInteractive:
|
||||
|
||||
def make_naive_bayes(dataset, sklearn_cv=True, percentile=100):
|
||||
def make_naive_bayes(dataset, sklearn_cv=False, percentile=100):
|
||||
'''fits naive bayes model
|
||||
'''
|
||||
print('# fitting model')
|
||||
@ -178,7 +179,7 @@ class NaiveBayes_Interactive:
|
||||
print('# starting naive bayes')
|
||||
print('# ...')
|
||||
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
file = '..\data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
||||
@ -191,8 +192,10 @@ class NaiveBayes_Interactive:
|
||||
quotechar='\'',
|
||||
quoting=csv.QUOTE_NONE)
|
||||
|
||||
use_count_vectorizer = True
|
||||
# training options
|
||||
use_count_vectorizer = False
|
||||
select_percentile = 100
|
||||
|
||||
make_naive_bayes(data, use_count_vectorizer, select_percentile)
|
||||
|
||||
print('#')
|
@ -93,7 +93,7 @@ class SVM:
|
||||
print('# starting svm')
|
||||
print('# ...')
|
||||
|
||||
file = 'data\\classification_labelled_corrected.csv'
|
||||
file = '..\\data\\classification_labelled_corrected.csv'
|
||||
|
||||
# read csv file
|
||||
print('# reading dataset')
|
@ -30,7 +30,7 @@ class VisualizerNews:
|
||||
print('# preparing word cloud of 200 most common words...')
|
||||
print()
|
||||
# load new data set
|
||||
file = 'data\\cleaned_data_set_without_header.csv'
|
||||
file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df_dataset = pd.read_csv(file,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
@ -53,7 +53,7 @@ class VisualizerNews:
|
||||
dict = BagOfWords.make_dict_common_words(matrix, 200,
|
||||
rel_freq, stemming)
|
||||
# save dict object
|
||||
with open('obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
|
||||
with open('../obj/'+ 'dict_200_most_common_words_stemmed' + '.pkl', 'wb') as f:
|
||||
pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
wordcloud = WordCloud(background_color='white',
|
||||
@ -82,7 +82,7 @@ class VisualizerNews:
|
||||
print('# preparing histogram of company mentions...')
|
||||
print()
|
||||
# # read data set
|
||||
# file = 'data\\cleaned_data_set_without_header.csv'
|
||||
# file = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
# df = pd.read_csv(file,
|
||||
# delimiter='|',
|
||||
# header=None,
|
||||
@ -107,7 +107,7 @@ class VisualizerNews:
|
||||
# names = np.asarray(count_names)
|
||||
|
||||
# load pickle object
|
||||
with open('obj/dict_organizations.pkl', 'rb') as input:
|
||||
with open('../obj/dict_organizations.pkl', 'rb') as input:
|
||||
dict = pickle.load(input)
|
||||
# make list of dict's values
|
||||
count_companies = list(dict.values())
|
||||
@ -129,9 +129,9 @@ class VisualizerNews:
|
||||
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
||||
|
||||
# save to file
|
||||
plt.savefig('visualization\\NER_{}.eps'
|
||||
plt.savefig('..\\visualization\\NER_{}.eps'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.savefig('visualization\\NER_{}.png'
|
||||
plt.savefig('..\\visualization\\NER_{}.png'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.show()
|
||||
|
||||
@ -143,7 +143,7 @@ class VisualizerNews:
|
||||
print('# preparing histogram of text lengths...')
|
||||
print()
|
||||
# read data set
|
||||
filepath = 'data\\cleaned_data_set_without_header.csv'
|
||||
filepath = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df_dataset = pd.read_csv(filepath,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
@ -183,9 +183,9 @@ class VisualizerNews:
|
||||
plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
|
||||
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
||||
# save plot
|
||||
plt.savefig('visualization\\TextLength_{}.eps'\
|
||||
plt.savefig('..\\visualization\\TextLength_{}.eps'\
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.savefig('visualization\\TextLength_{}.png'\
|
||||
plt.savefig('..\\visualization\\TextLength_{}.png'\
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.show()
|
||||
|
||||
@ -195,7 +195,7 @@ class VisualizerNews:
|
||||
print()
|
||||
|
||||
# load data set
|
||||
filepath = 'data\\cleaned_data_set_without_header.csv'
|
||||
filepath = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
df_dataset = pd.read_csv(filepath,
|
||||
delimiter='|',
|
||||
header=None,
|
||||
@ -229,14 +229,14 @@ class VisualizerNews:
|
||||
|
||||
plt.setp(autotexts, size=8, weight="bold")
|
||||
plt.show()
|
||||
plt.savefig('Sites_{}.pdf'.format(VisualizerNews.datestring))
|
||||
plt.savefig('Sites_{}.pgf'.format(VisualizerNews.datestring))
|
||||
plt.savefig('..\\visualization\\Sites_{}.pdf'.format(VisualizerNews.datestring))
|
||||
plt.savefig('..\\visualization\\Sites_{}.pgf'.format(VisualizerNews.datestring))
|
||||
|
||||
def plot_hist_most_common_words(n_commons = 10):
|
||||
print('# preparing histogram of most common words...')
|
||||
print()
|
||||
# # load data set
|
||||
# filepath = 'data\\cleaned_data_set_without_header.csv'
|
||||
# filepath = '..\\data\\cleaned_data_set_without_header.csv'
|
||||
# df_dataset = pd.read_csv(filepath,
|
||||
# delimiter='|',
|
||||
# header=None,
|
||||
@ -264,7 +264,7 @@ class VisualizerNews:
|
||||
# pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
# load pickle object
|
||||
with open ('obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
|
||||
with open ('../obj/'+ 'dict_200_most_common_words' + '.pkl', 'rb') as i:
|
||||
dict = pickle.load(i)
|
||||
# sort dict by value
|
||||
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
|
||||
@ -287,9 +287,9 @@ class VisualizerNews:
|
||||
height=numbers,
|
||||
tick_label=labels,
|
||||
facecolor='royalblue')
|
||||
plt.savefig('visualization\\10_most_common_words_{}.eps'
|
||||
plt.savefig('..\\visualization\\10_most_common_words_{}.eps'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.savefig('visualization\\10_most_common_words_{}.png'
|
||||
plt.savefig('..\\visualization\\10_most_common_words_{}.png'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.show()
|
||||
|
||||
@ -299,7 +299,7 @@ class VisualizerNews:
|
||||
'''
|
||||
# list of number of different companies per article (int)
|
||||
list = []
|
||||
with open('obj/num_mentions_companies.pkl', 'rb') as input:
|
||||
with open('../obj/num_mentions_companies.pkl', 'rb') as input:
|
||||
list = pickle.load(input)
|
||||
|
||||
# sort list in descending order
|
||||
@ -320,9 +320,9 @@ class VisualizerNews:
|
||||
.FuncFormatter(lambda x, p: format(int(x), ',')))
|
||||
|
||||
# save to file
|
||||
plt.savefig('visualization\\NER_2_{}.eps'
|
||||
plt.savefig('..\\visualization\\NER_2_{}.eps'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.savefig('visualization\\NER_2_{}.png'
|
||||
plt.savefig('..\\visualization\\NER_2_{}.png'
|
||||
.format(VisualizerNews.datestring))
|
||||
plt.show()
|
||||
|
Loading…
x
Reference in New Issue
Block a user