saving objects as pickles

parent 7e037a1621
commit b7d1f546e4
@@ -18,6 +18,7 @@ import re
 import numpy as np
 import pandas as pd
 from nltk.stem.porter import PorterStemmer
+import pickle
 
 class BagOfWords:
 
@@ -114,6 +115,9 @@ class BagOfWords:
 else:
 # absolute word frequency
 df_matrix.loc[i][v] += 1
+# save df_matrix object
+with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
+pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)
 
 return df_matrix
 
@@ -170,7 +174,7 @@ class BagOfWords:
 
 #add unwanted terms
 stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
-'file', 'photo', 'min', 'read', 'staff', 'left',
+'file', 'photo', 'min', 'read', 'staff', 'left', 'â',
 'right', 'updated', 'minutes', 'brief', 'editing',
 'reporting', 'ago', 'also', 'would', 'could',
 'bit', 'ly', 'fy', 'economist', 'u', 'guardian'])
@@ -202,9 +206,9 @@ class BagOfWords:
 
 # words under that rel_freq limit are not included
 # set limit
-limit = 0.001
+limit = 0.0001
 if not rel_freq:
-limit = len(df_matrix) * 0.001
+limit = len(df_matrix) * 0.0001
 
 # word => count
 dict = {}
@@ -214,7 +218,8 @@ class BagOfWords:
 # count word mentions in total
 if (df_matrix[column].sum() > limit):
 dict[column] = df_matrix[column].sum()
-# sort dict by value and
+
+# sort dict by value
 o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
 reverse=True))
 print(o_dict)
@@ -226,6 +231,10 @@ class BagOfWords:
 next_highest = o_dict.popitem(last=False)
 n_dict[next_highest[0]] = next_highest[1]
 
+# save n_dict object
+with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
+pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+
 return n_dict
 
 def count_features(texts, stemming=True):
@@ -245,36 +254,38 @@ class BagOfWords:
 return sum
 
 def test():
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(file,
 delimiter='|',
 header=None,
 index_col=None,
 engine='python',
 usecols=[1,2],
-nrows=100,
+#nrows=100,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')
 
 corpus = df_dataset[1] + '. ' + df_dataset[2]
 stemming = True
 rel_freq = True
+#print(BagOfWords.count_features(corpus))
 extracted_words = BagOfWords.extract_all_words(corpus, stemming)
 vocab = BagOfWords.make_vocab(extracted_words, stemming)
-#print(vocab)
-for text in corpus:
-print(text)
-print()
-print()
-# ab hier ValueError bei nrows=10000...
-matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
-print(dict)
+print(len(vocab))
+
+# for text in corpus:
+# print(text)
+# print()
+# print()
+# # ab hier ValueError bei nrows=10000...
+# matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+# dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
+# print(dict)
 
 if __name__ == '__main__':
-for word in sorted(BagOfWords.set_stop_words(False)):
-print(word)
-print()
-print(PorterStemmer().stem(word))
-print()
-# BagOfWords.test()
+# for word in sorted(BagOfWords.set_stop_words(False)):
+# print(word)
+# print()
+# print(PorterStemmer().stem(word))
+# print()
+BagOfWords.test()
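Note: the two objects pickled above can later be read back with pickle.load. A minimal sketch, assuming the same relative obj/ directory and the file names used in this commit:

    import pickle

    with open('obj/document_term_matrix.pkl', 'rb') as f:
        df_matrix = pickle.load(f)
    with open('obj/dict_200_most_common_words.pkl', 'rb') as f:
        n_dict = pickle.load(f)
    # n_dict maps each of the stored common words to its frequency
    print(len(n_dict))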
@@ -62,7 +62,7 @@ class CosineSimilarity:
 
 if __name__ == '__main__':
 # read data set
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df = pd.read_csv(file,
 delimiter='|',
 header=None,
@@ -12,6 +12,7 @@ writes it to a csv file.
 import csv
 import glob
 import json
+import string
 
 import numpy as np
 import pandas as pd
@@ -39,7 +40,7 @@ class FileHandler:
 
 def create_labeling_dataset():
 # output file
-o_file = 'data\\interactive_labeling_dataset.csv'
+o_file = 'data\\cleaned_data_set_without_header.csv'
 # create file and write header
 with open(o_file, 'w', newline='') as csvfile:
 writer = csv.writer(csvfile,
@@ -77,6 +78,38 @@ class FileHandler:
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')
 
+def clean_articles():
+'''clean articles in data set: filter out all non-printable characters
+'''
+# read data set
+file = 'data\\cleaned_data_set_without_header.csv'
+df = pd.read_csv(file,
+delimiter='|',
+header=None,
+index_col=None,
+engine='python',
+#usecols=[1,2],
+#nrows=100,
+quoting=csv.QUOTE_NONNUMERIC,
+quotechar='\'')
+
+# for each article(row)
+for i in range (len(df)):
+# filter headline
+df.iloc[i][1] = ''.join(x for x in df.iloc[i][1] if x in string.printable)
+# filter text
+df.iloc[i][2] = ''.join(x for x in df.iloc[i][2] if x in string.printable)
+print(df)
+# save cleaned dataframe
+df.to_csv('data\\cleaned_data_set_without_header.csv',
+header=False,
+index=False,
+sep='|',
+mode='a',
+encoding='utf-8',
+quoting=csv.QUOTE_NONNUMERIC,
+quotechar='\'')
+
 def write_articles_to_csv_files():
 '''read JSON files, select articles and write them to csv.
 '''
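Note on the new clean_articles(): assigning through chained indexing (df.iloc[i][1] = ...) may not write back into the DataFrame in current pandas, and to_csv with mode='a' appends the cleaned rows to the very file that was just read. A minimal column-wise sketch of the same non-printable-character filtering (the helper name keep_printable is illustrative, not part of the commit):

    import csv
    import string

    import pandas as pd

    def keep_printable(s):
        # keep only characters contained in string.printable
        return ''.join(x for x in s if x in string.printable)

    df = pd.read_csv('data\\cleaned_data_set_without_header.csv', delimiter='|',
                     header=None, index_col=None, engine='python',
                     quoting=csv.QUOTE_NONNUMERIC, quotechar='\'')
    df[1] = df[1].apply(keep_printable)   # headlines
    df[2] = df[2].apply(keep_printable)   # article texts
    df.to_csv('data\\cleaned_data_set_without_header.csv', header=False,
              index=False, sep='|', mode='w', encoding='utf-8',
              quoting=csv.QUOTE_NONNUMERIC, quotechar='\'')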
@@ -160,8 +193,8 @@ class FileHandler:
 print('#')
 print('# saved {} articles in total'.format(a))
 print('#')
-def join_all_csv_files():
 
 if __name__ == '__main__':
 # FileHandler.write_articles_to_csv_files()
 # FileHandler.create_labeling_dataset()
+FileHandler.clean_articles()
99 NER.py
@@ -16,17 +16,31 @@ import numpy as np
 import pandas as pd
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize
+import pickle
+import re
 
 class NER:
 
-company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', 'Plc', 'LLC',
-'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB']
+company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
+'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
+'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
+'s.r.l.', 'Holding', 'Holdings']
 
 # some entities and misc that are not companies
 misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
-'European Commission', 'EU', 'Staff', 'Min', 'Read',
-'Thomson Reuters Trust Principles', 'New York Stock Exchange',
-'NYSE']
+'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
+'NYSE', 'DAX' 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
+'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
+'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
+'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
+'Autonomous Community of Asturias', 'Fitch Ratings Espana',
+'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
+'National Federation of Independent Business', 'Barclays',
+'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
 
+regex = r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*|\
+.*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*|\
+.*Republican.*|Goldman( Sachs)?|.*Chamber.*|.*Department.*'
+
 def tag_words(text):
 # path to Stanford NER
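For reference, the regex added above is used in find_companies (see the next hunk) to skip named entities whose names match the pattern. A quick illustrative check with an abbreviated pattern (not the full NER.regex):

    import re

    pattern = r'.*(B|b)ank.*|.*Ministry.*|Goldman( Sachs)?'
    for name in ['Deutsche Bank', 'Siemens', 'Goldman Sachs']:
        # entities that match the pattern would be filtered out
        print(name, bool(re.search(pattern, name)))
    # Deutsche Bank True, Siemens False, Goldman Sachs True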
@@ -61,6 +75,10 @@ class NER:
 '''param: article text where organizations must be indentified
 returns: list of identified organisations as strings
 '''
+# print(text)
+# print()
+# print('# examining article...')
+# print()
 # set paths
 java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
 os.environ['JAVAHOME'] = java_path
@@ -75,9 +93,15 @@ class NER:
 #print(nes_coherent)
 for tuple in nes_coherent:
 # check if company and not already in list
-if (tuple[0] not in NER.misc) and (tuple[0] not in seen):
+if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
+and (not re.search(NER.regex, tuple[0])):
 organizations.append(tuple[0])
 seen.add(tuple[0])
+print('# recognized the following organizations:')
+print()
+print(organizations)
+print()
+print()
 return organizations
 
 def count_companies(texts):
@@ -88,14 +112,37 @@ class NER:
 print()
 # dictionary of companies with their count
 dict_com = {}
-for text in texts:
+# list of company lists (one per article)
+coms_list = []
+for i, text in enumerate(texts):
 # list of found companies in article
+print('# article no. {}:'.format(i))
 coms = NER.find_companies(text)
+coms_list.append(coms)
+
 for com in coms:
 if com in dict_com.keys():
 dict_com[com] += 1
 else:
 dict_com[com] = 1
+# print(coms_list)
+# print()
+# calculate number of company mentions per article
+num_companies = []
+for l in coms_list:
+num_companies.append(len(l))
+# print(num_companies)
+print('# average number of different companies mentioned per article:')
+print(sum(num_companies)/len(num_companies))
+print()
+# save num_companies object in file (for plotting)
+with open('obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
+pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL)
+# save dict_com object in file (for plotting)
+with open('obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
+pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL)
+
+#print(dict_com)
 # # print outlier
 # print(max(dict_com, key=dict_com.get))
 return list(dict_com.values())
@@ -103,27 +150,17 @@ class NER:
 if __name__ == '__main__':
 print('# starting NER...')
 print()
-test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
-EU approval - sources. BRUSSELS (Reuters) - U.S. software
-giant Microsoft (MSFT.O) is set to win unconditional EU
-antitrust approval for its $7.5 billion purchase of
-privately held coding website GitHub, two people familiar
-with the matter said on Monday. Microsoft announced the
-deal in June, its largest acquisition since it bought
-LinkedIn for $26 billion in 2016. The GitHub deal is
-expected to boost the U.S. software giant’s cloud
-computing business and challenge market leader Amazon
-(AMZN.O). GitHub, the world’s largest code host, has
-more than 28 million developers using its platform. It
-will become a part of Microsoft’s Intelligent Cloud unit
-once the acquisition is completed. Microsoft Chief
-Executive Satya Nadella has tried to assuage users’
-worries that GitHub might favor Microsoft products
-over competitors after the deal, saying GitHub would
-continue to be an open platform that works with all
-public clouds. The European Commission, which is set to
-decide on the deal by Oct. 19, did not respond to a
-request for immediate comment. Microsoft declined to
-comment. Reporting by Foo Yun Chee; editing by Jason
-Neely'''
-print(NER.find_companies(test_article))
+# read data set
+file = 'data\\cleaned_data_set_without_header.csv'
+df = pd.read_csv(file,
+delimiter='|',
+header=None,
+index_col=None,
+engine='python',
+#usecols=[1,2],
+nrows=100,
+quoting=csv.QUOTE_NONNUMERIC,
+quotechar='\'')
+#print(df)
+texts = df[1] + '. ' + df[2]
+NER.count_companies(texts)
@@ -8,7 +9,9 @@ from BagOfWords import BagOfWords
 from NER import NER
 
 import csv
+from datetime import datetime
 from os import path
+import pickle
 
 import matplotlib
 import matplotlib.pyplot as plt
@@ -19,42 +21,55 @@ from wordcloud import WordCloud
 
 class VisualizerNews:
 
+datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
+
 def plot_wordcloud_dataset():
 '''plots word cloud image of most common words in dataset.
 '''
 print('# preparing word cloud of 200 most common words...')
 print()
 # load new data set
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(file,
 delimiter='|',
 header=None,
 index_col=None,
 engine='python',
 usecols=[1,2],
+#nrows=100,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')
 
 corpus = df_dataset[1] + '. ' + df_dataset[2]
 stemming = False
-rel_freq = False
+rel_freq = True
 
 # find most common words in dataset
 extracted_words = BagOfWords.extract_all_words(corpus, stemming)
 vocab = BagOfWords.make_vocab(extracted_words, stemming)
-matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-dict = BagOfWords.make_dict_common_words(matrix, 200, rel_freq, stemming)
+matrix = BagOfWords.make_matrix(extracted_words, vocab,
+rel_freq, stemming)
+dict = BagOfWords.make_dict_common_words(matrix, 200,
+rel_freq, stemming)
+# save dict object
+with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
+pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
 
 wordcloud = WordCloud(background_color='white',
 width=2400,
 height=1200,
 scale=2,
 # true if bigram:
-collocations=False).generate_from_frequencies(dict)
+collocations=False)\
+.generate_from_frequencies(dict)
 
 # display generated image
 plt.imshow(wordcloud, interpolation='bilinear')
 plt.axis("off")
+plt.savefig('visualization\\WordCloud_{}.eps'
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\WordCloud_{}.png'
+.format(VisualizerNews.datestring))
 plt.show()
 
 def plot_histogram_companies():
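A caveat on the object saving added here (and likewise in plot_hist_most_common_words further down): the frequencies are stored in the local variable dict, but pickle.dump is called on n_dict, which does not appear to be defined in this method and would raise a NameError; presumably the local dict was meant, e.g.:

    with open('obj/' + 'dict_200_most_common_words' + '.pkl', 'wb') as f:
        pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)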
@@ -66,13 +81,14 @@ class VisualizerNews:
 print('# preparing histogram of company mentions...')
 print()
 # read data set
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df = pd.read_csv(file,
 delimiter='|',
 header=None,
 index_col=None,
 engine='python',
 usecols=[1,2],
+#nrows=10,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')
 
@@ -93,8 +109,15 @@ class VisualizerNews:
 # Number of companies with this number of mentions
 plt.ylabel('Number of companies with this number of articles')
 num_bins = 50
-n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
+n, bins, patches = plt.hist(names, num_bins,
+facecolor='darkred', alpha=0.5)
 plt.axis([0, 50, 0, 1000])
+
+# save to file
+plt.savefig('visualization\\NER_{}.eps'
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\NER_{}.png'
+.format(VisualizerNews.datestring))
 plt.show()
 
 def plot_histogram_text_lengths():
|
|||||||
print('# preparing histogram of text lengths...')
|
print('# preparing histogram of text lengths...')
|
||||||
print()
|
print()
|
||||||
# read data set
|
# read data set
|
||||||
filepath = 'data\\interactive_labeling_dataset.csv'
|
filepath = 'data\\cleaned_data_set_without_header.csv'
|
||||||
df_dataset = pd.read_csv(filepath,
|
df_dataset = pd.read_csv(filepath,
|
||||||
delimiter='|',
|
delimiter='|',
|
||||||
header=0,
|
header=None,
|
||||||
index_col=None,
|
index_col=None,
|
||||||
engine='python',
|
engine='python',
|
||||||
usecols=[2],
|
usecols=[2],
|
||||||
@@ -126,23 +149,30 @@ class VisualizerNews:
 count_chars.append(len(text))
 # average of number of characters
 av = int(sum(count_chars) / len(count_chars))
-print('# average length of news articles is: {} characters'.format(av))
+print('# average length of news articles is {} characters'.format(av))
 print()
 # sort list in descending order
 count_chars.sort(reverse=True)
 # convert list to array
 names = np.asarray(count_chars)
 # plt.title('Length of News Articles')
-plt.xlabel('Number of characters in an article')
+plt.xlabel('Number of characters in article')
 plt.ylabel('Frequency')
 # number of vertical bins
 num_bins = 200
-n, bins, patches = plt.hist(names, num_bins, facecolor='darkslategrey', alpha=0.5)
+n, bins, patches = plt.hist(names, num_bins,
+facecolor='darkslategrey', alpha=0.5)
 # [xmin, xmax, ymin, ymax] of axis
 #plt.axis([format(300, ','),format(10000, ','), 0, 500])
 plt.axis([300,10000,0,500])
 # format axis labels for thousends (e.g. '10,000')
-plt.gca().xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
+plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
+.FuncFormatter(lambda x, p: format(int(x), ',')))
+# save plot
+plt.savefig('visualization\\TextLength_{}.eps'\
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\TextLength_{}.png'\
+.format(VisualizerNews.datestring))
 plt.show()
 
 def plot_pie_chart_of_sites():
@@ -151,24 +181,24 @@ class VisualizerNews:
 print()
 
 # load data set
-filepath = 'data\\interactive_labeling_dataset.csv'
+filepath = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(filepath,
 delimiter='|',
-header=0,
+header=None,
 #usecols=[3], #column 'Site'
 index_col=None,
 engine='python',
-#nrows=100,
+nrows=10,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')
-# find all different sites
-df_counts = df_dataset.groupby('Site').count()
-# count occurences of each site
-df_counts = df_counts.sort_values(['Url'], ascending=False)
+# find all different sites, group by 'Site'
+df_counts = df_dataset.groupby(3).count()
+# count occurences of each site, count different 'Url's
+df_counts = df_counts.sort_values([5], ascending=False)
 
 fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))
 
-data = list(df_counts['Url'])
+data = list(df_counts[5])
 # legend labels
 labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
 'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']
@@ -185,12 +215,14 @@ class VisualizerNews:
 
 plt.setp(autotexts, size=8, weight="bold")
 plt.show()
+plt.savefig('Sites_{}.pdf'.format(VisualizerNews.datestring))
+plt.savefig('Sites_{}.pgf'.format(VisualizerNews.datestring))
 
 def plot_hist_most_common_words(n_commons = 10):
 print('# preparing histogram of most common words...')
 print()
 # load data set
-filepath = 'data\\interactive_labeling_dataset_without_header.csv'
+filepath = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(filepath,
 delimiter='|',
 header=None,
@@ -209,8 +241,13 @@ class VisualizerNews:
 # find most common words in dataset
 extracted_words = BagOfWords.extract_all_words(corpus, stemming)
 vocab = BagOfWords.make_vocab(extracted_words, stemming)
-matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq, stemming)
+matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+stemming)
+dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
+stemming)
+# save dict object
+with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
+pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
 
 plt.xlabel('Most common words in textual corpus')
 plt.ylabel('Relative frequency')
@@ -222,11 +259,15 @@ class VisualizerNews:
 height=numbers,
 tick_label=labels,
 facecolor='darkorange')
+plt.savefig('visualization\\10_most_common_words_{}.eps'
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\10_most_common_words_{}.png'
+.format(VisualizerNews.datestring))
 plt.show()
 
 if __name__ == '__main__':
+VisualizerNews.plot_wordcloud_dataset()
 # VisualizerNews.plot_histogram_companies()
-# VisualizerNews.plot_wordcloud_dataset()
 # VisualizerNews.plot_histogram_text_lengths()
 # VisualizerNews.plot_pie_chart_of_sites()
 VisualizerNews.plot_hist_most_common_words()
10000 data/cleaned_data_set_without_header.csv (new file; diff suppressed because one or more lines are too long)
BIN obj/dict_organizations.pkl (new file; binary file not shown)
0 obj/list_organizations.pkl (new file)
BIN obj/num_mentions_companies.pkl (new file; binary file not shown)
3082 visualization/TextLength_2018-11-05.eps (new file; diff suppressed because it is too large)
BIN visualization/TextLength_2018-11-05.pdf (new file; binary file not shown)
4354 visualization/TextLength_2018-11-05.pgf (new file; diff suppressed because it is too large)
BIN visualization/TextLength_2018-11-05.png (new file; binary file not shown)