saving objects as pickles

parent 7e037a1621
commit b7d1f546e4
@@ -18,6 +18,7 @@ import re
 import numpy as np
 import pandas as pd
 from nltk.stem.porter import PorterStemmer
+import pickle

 class BagOfWords:

@@ -114,6 +115,9 @@ class BagOfWords:
 else:
 # absolute word frequency
 df_matrix.loc[i][v] += 1
+# save df_matrix object
+with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
+pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)

 return df_matrix

@@ -170,7 +174,7 @@ class BagOfWords:

 #add unwanted terms
 stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
-'file', 'photo', 'min', 'read', 'staff', 'left',
+'file', 'photo', 'min', 'read', 'staff', 'left', 'â',
 'right', 'updated', 'minutes', 'brief', 'editing',
 'reporting', 'ago', 'also', 'would', 'could',
 'bit', 'ly', 'fy', 'economist', 'u', 'guardian'])
@@ -202,9 +206,9 @@ class BagOfWords:

 # words under that rel_freq limit are not included
 # set limit
-limit = 0.001
+limit = 0.0001
 if not rel_freq:
-limit = len(df_matrix) * 0.001
+limit = len(df_matrix) * 0.0001

 # word => count
 dict = {}
@@ -214,7 +218,8 @@ class BagOfWords:
 # count word mentions in total
 if (df_matrix[column].sum() > limit):
 dict[column] = df_matrix[column].sum()
-# sort dict by value and
+
+# sort dict by value
 o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
 reverse=True))
 print(o_dict)
@@ -226,6 +231,10 @@ class BagOfWords:
 next_highest = o_dict.popitem(last=False)
 n_dict[next_highest[0]] = next_highest[1]

+# save n_dict object
+with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
+pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+
 return n_dict

 def count_features(texts, stemming=True):
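Note: the two pickle dumps above can later be read back with pickle.load. A minimal sketch of that round trip (the load_object helper is hypothetical; only the obj/ paths come from this commit):

    import pickle

    def load_object(name):
        # read a previously pickled object from the obj/ folder
        with open('obj/' + name + '.pkl', 'rb') as f:
            return pickle.load(f)

    # reuse the saved document-term matrix and the common-words dict
    df_matrix = load_object('document_term_matrix')
    common_words = load_object('dict_200_most_common_words')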
@@ -245,36 +254,38 @@ class BagOfWords:
 return sum

 def test():
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(file,
 delimiter='|',
 header=None,
 index_col=None,
 engine='python',
 usecols=[1,2],
-nrows=100,
+#nrows=100,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')

 corpus = df_dataset[1] + '. ' + df_dataset[2]
 stemming = True
 rel_freq = True
+#print(BagOfWords.count_features(corpus))
 extracted_words = BagOfWords.extract_all_words(corpus, stemming)
 vocab = BagOfWords.make_vocab(extracted_words, stemming)
-#print(vocab)
-for text in corpus:
-print(text)
-print()
-print()
-# from here on: ValueError at nrows=10000...
-matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
-print(dict)
+print(len(vocab))
+
+# for text in corpus:
+# print(text)
+# print()
+# print()
+# # from here on: ValueError at nrows=10000...
+# matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+# dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
+# print(dict)

 if __name__ == '__main__':
-for word in sorted(BagOfWords.set_stop_words(False)):
-print(word)
-print()
-print(PorterStemmer().stem(word))
-print()
-# BagOfWords.test()
+# for word in sorted(BagOfWords.set_stop_words(False)):
+# print(word)
+# print()
+# print(PorterStemmer().stem(word))
+# print()
+BagOfWords.test()
@@ -62,7 +62,7 @@ class CosineSimilarity:

 if __name__ == '__main__':
 # read data set
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df = pd.read_csv(file,
 delimiter='|',
 header=None,
@@ -12,6 +12,7 @@ writes it to a csv file.
 import csv
 import glob
 import json
+import string

 import numpy as np
 import pandas as pd
@@ -39,7 +40,7 @@ class FileHandler:

 def create_labeling_dataset():
 # output file
-o_file = 'data\\interactive_labeling_dataset.csv'
+o_file = 'data\\cleaned_data_set_without_header.csv'
 # create file and write header
 with open(o_file, 'w', newline='') as csvfile:
 writer = csv.writer(csvfile,
@@ -77,6 +78,38 @@ class FileHandler:
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')

+def clean_articles():
+'''clean articles in data set: filter out all non-printable characters
+'''
+# read data set
+file = 'data\\cleaned_data_set_without_header.csv'
+df = pd.read_csv(file,
+delimiter='|',
+header=None,
+index_col=None,
+engine='python',
+#usecols=[1,2],
+#nrows=100,
+quoting=csv.QUOTE_NONNUMERIC,
+quotechar='\'')
+
+# for each article(row)
+for i in range (len(df)):
+# filter headline
+df.iloc[i][1] = ''.join(x for x in df.iloc[i][1] if x in string.printable)
+# filter text
+df.iloc[i][2] = ''.join(x for x in df.iloc[i][2] if x in string.printable)
+print(df)
+# save cleaned dataframe
+df.to_csv('data\\cleaned_data_set_without_header.csv',
+header=False,
+index=False,
+sep='|',
+mode='a',
+encoding='utf-8',
+quoting=csv.QUOTE_NONNUMERIC,
+quotechar='\'')
+
 def write_articles_to_csv_files():
 '''read JSON files, select articles and write them to csv.
 '''
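Note: clean_articles() drops every character that is not in string.printable. A small standalone sketch of the same filter (function name and sample text are only illustrative):

    import string

    def strip_unprintable(text):
        # keep only characters Python considers printable
        # (ASCII letters, digits, punctuation, whitespace)
        return ''.join(x for x in text if x in string.printable)

    print(strip_unprintable('Societe\u00a0Generale\x00 raises outlook'))
    # -> 'SocieteGenerale raises outlook' (the NBSP and NUL are removed)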
@@ -160,8 +193,8 @@ class FileHandler:
 print('#')
 print('# saved {} articles in total'.format(a))
 print('#')
-def join_all_csv_files():

 if __name__ == '__main__':
 # FileHandler.write_articles_to_csv_files()
 # FileHandler.create_labeling_dataset()
+FileHandler.clean_articles()
NER.py
@@ -16,17 +16,31 @@ import numpy as np
 import pandas as pd
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize
+import pickle
+import re

 class NER:

-company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', 'Plc', 'LLC',
-'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB']
+company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
+'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
+'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
+'s.r.l.', 'Holding', 'Holdings']

 # some entities and misc that are not companies
 misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
-'European Commission', 'EU', 'Staff', 'Min', 'Read',
-'Thomson Reuters Trust Principles', 'New York Stock Exchange',
-'NYSE']
+'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
+'NYSE', 'DAX' 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
+'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
+'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
+'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
+'Autonomous Community of Asturias', 'Fitch Ratings Espana',
+'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
+'National Federation of Independent Business', 'Barclays',
+'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
+
+regex = r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*|\
+.*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*|\
+.*Republican.*|Goldman( Sachs)?|.*Chamber.*|.*Department.*'

 def tag_words(text):
 # path to Stanford NER
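Note: the new NER.regex acts as a blacklist on top of NER.misc — an entity is kept as a company only if it is not in the misc list and does not match the pattern. A rough sketch of that check (the pattern is joined over several string literals here for clarity; the helper name and sample strings are illustrative):

    import re

    regex = (r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*'
             r'|.*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*'
             r'|.*Republican.*|Goldman( Sachs)?|.*Chamber.*')

    def is_company_candidate(name, misc=()):
        # reject anything on the misc blacklist or matching the regex blacklist
        return (name not in misc) and (not re.search(regex, name))

    print(is_company_candidate('Thomson Reuters'))  # False, matches .*Reuters.*
    print(is_company_candidate('GitHub'))           # True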
@@ -61,6 +75,10 @@ class NER:
 '''param: article text where organizations must be indentified
 returns: list of identified organisations as strings
 '''
+# print(text)
+# print()
+# print('# examining article...')
+# print()
 # set paths
 java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
 os.environ['JAVAHOME'] = java_path
@@ -75,9 +93,15 @@ class NER:
 #print(nes_coherent)
 for tuple in nes_coherent:
 # check if company and not already in list
-if (tuple[0] not in NER.misc) and (tuple[0] not in seen):
+if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
+and (not re.search(NER.regex, tuple[0])):
 organizations.append(tuple[0])
 seen.add(tuple[0])
+print('# recognized the following organizations:')
+print()
+print(organizations)
+print()
+print()
 return organizations

 def count_companies(texts):
@@ -88,14 +112,37 @@ class NER:
 print()
 # dictionary of companies with their count
 dict_com = {}
-for text in texts:
+# list of company lists (one per article)
+coms_list = []
+for i, text in enumerate(texts):
 # list of found companies in article
+print('# article no. {}:'.format(i))
 coms = NER.find_companies(text)
+coms_list.append(coms)

 for com in coms:
 if com in dict_com.keys():
 dict_com[com] += 1
 else:
 dict_com[com] = 1
+# print(coms_list)
+# print()
+# calculate number of company mentions per article
+num_companies = []
+for l in coms_list:
+num_companies.append(len(l))
+# print(num_companies)
+print('# average number of different companies mentioned per article:')
+print(sum(num_companies)/len(num_companies))
+print()
+# save num_companies object in file (for plotting)
+with open('obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
+pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL)
+# save dict_com object in file (for plotting)
+with open('obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
+pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL)

+#print(dict_com)
 # # print outlier
 # print(max(dict_com, key=dict_com.get))
 return list(dict_com.values())
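Note: count_companies() now pickles num_companies and dict_com explicitly "for plotting". A minimal sketch of how a plotting script could read them back (only the obj/ paths come from this commit; the histogram call itself is illustrative):

    import pickle
    import matplotlib.pyplot as plt

    with open('obj/num_mentions_companies.pkl', 'rb') as f:
        num_companies = pickle.load(f)   # list: companies mentioned per article

    with open('obj/dict_organizations.pkl', 'rb') as f:
        dict_com = pickle.load(f)        # dict: organization -> total mentions

    plt.hist(num_companies, bins=50, facecolor='darkred', alpha=0.5)
    plt.xlabel('Different companies mentioned in an article')
    plt.ylabel('Number of articles')
    plt.show()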
@@ -103,27 +150,17 @@ class NER:
 if __name__ == '__main__':
 print('# starting NER...')
 print()
-test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
-EU approval - sources. BRUSSELS (Reuters) - U.S. software
-giant Microsoft (MSFT.O) is set to win unconditional EU
-antitrust approval for its $7.5 billion purchase of
-privately held coding website GitHub, two people familiar
-with the matter said on Monday. Microsoft announced the
-deal in June, its largest acquisition since it bought
-LinkedIn for $26 billion in 2016. The GitHub deal is
-expected to boost the U.S. software giant’s cloud
-computing business and challenge market leader Amazon
-(AMZN.O). GitHub, the world’s largest code host, has
-more than 28 million developers using its platform. It
-will become a part of Microsoft’s Intelligent Cloud unit
-once the acquisition is completed. Microsoft Chief
-Executive Satya Nadella has tried to assuage users’
-worries that GitHub might favor Microsoft products
-over competitors after the deal, saying GitHub would
-continue to be an open platform that works with all
-public clouds. The European Commission, which is set to
-decide on the deal by Oct. 19, did not respond to a
-request for immediate comment. Microsoft declined to
-comment. Reporting by Foo Yun Chee; editing by Jason
-Neely'''
-print(NER.find_companies(test_article))
+# read data set
+file = 'data\\cleaned_data_set_without_header.csv'
+df = pd.read_csv(file,
+delimiter='|',
+header=None,
+index_col=None,
+engine='python',
+#usecols=[1,2],
+nrows=100,
+quoting=csv.QUOTE_NONNUMERIC,
+quotechar='\'')
+#print(df)
+texts = df[1] + '. ' + df[2]
+NER.count_companies(texts)
@@ -8,7 +8,9 @@ from BagOfWords import BagOfWords
 from NER import NER

 import csv
+from datetime import datetime
 from os import path
+import pickle

 import matplotlib
 import matplotlib.pyplot as plt
@@ -19,42 +21,55 @@ from wordcloud import WordCloud

 class VisualizerNews:

+datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
+
 def plot_wordcloud_dataset():
 '''plots word cloud image of most common words in dataset.
 '''
 print('# preparing word cloud of 200 most common words...')
 print()
 # load new data set
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(file,
 delimiter='|',
 header=None,
 index_col=None,
 engine='python',
 usecols=[1,2],
+#nrows=100,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')

 corpus = df_dataset[1] + '. ' + df_dataset[2]
 stemming = False
-rel_freq = False
+rel_freq = True

 # find most common words in dataset
 extracted_words = BagOfWords.extract_all_words(corpus, stemming)
 vocab = BagOfWords.make_vocab(extracted_words, stemming)
-matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-dict = BagOfWords.make_dict_common_words(matrix, 200, rel_freq, stemming)
+matrix = BagOfWords.make_matrix(extracted_words, vocab,
+rel_freq, stemming)
+dict = BagOfWords.make_dict_common_words(matrix, 200,
+rel_freq, stemming)
+# save dict object
+with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
+pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)

 wordcloud = WordCloud(background_color='white',
 width=2400,
 height=1200,
 scale=2,
 # true if bigram:
-collocations=False).generate_from_frequencies(dict)
+collocations=False)\
+.generate_from_frequencies(dict)

 # display generated image
 plt.imshow(wordcloud, interpolation='bilinear')
 plt.axis("off")
+plt.savefig('visualization\\WordCloud_{}.eps'
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\WordCloud_{}.png'
+.format(VisualizerNews.datestring))
 plt.show()

 def plot_histogram_companies():
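Note: because the 200-most-common-words dictionary is now pickled as well, the word cloud could later be regenerated without recomputing the term matrix. A rough sketch (assumes the pickle holds the word-to-frequency dict used above):

    import pickle
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    with open('obj/dict_200_most_common_words.pkl', 'rb') as f:
        freq_dict = pickle.load(f)   # word -> (relative) frequency

    wordcloud = WordCloud(background_color='white', width=2400, height=1200,
                          scale=2, collocations=False)\
                          .generate_from_frequencies(freq_dict)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()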
@@ -66,13 +81,14 @@ class VisualizerNews:
 print('# preparing histogram of company mentions...')
 print()
 # read data set
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df = pd.read_csv(file,
 delimiter='|',
 header=None,
 index_col=None,
 engine='python',
 usecols=[1,2],
+#nrows=10,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')

@@ -93,8 +109,15 @@ class VisualizerNews:
 # Number of companies with this number of mentions
 plt.ylabel('Number of companies with this number of articles')
 num_bins = 50
-n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
+n, bins, patches = plt.hist(names, num_bins,
+facecolor='darkred', alpha=0.5)
 plt.axis([0, 50, 0, 1000])

+# save to file
+plt.savefig('visualization\\NER_{}.eps'
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\NER_{}.png'
+.format(VisualizerNews.datestring))
 plt.show()

 def plot_histogram_text_lengths():
@@ -105,10 +128,10 @@ class VisualizerNews:
 print('# preparing histogram of text lengths...')
 print()
 # read data set
-filepath = 'data\\interactive_labeling_dataset.csv'
+filepath = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(filepath,
 delimiter='|',
-header=0,
+header=None,
 index_col=None,
 engine='python',
 usecols=[2],
@@ -126,23 +149,30 @@ class VisualizerNews:
 count_chars.append(len(text))
 # average of number of characters
 av = int(sum(count_chars) / len(count_chars))
-print('# average length of news articles is: {} characters'.format(av))
+print('# average length of news articles is {} characters'.format(av))
 print()
 # sort list in descending order
 count_chars.sort(reverse=True)
 # convert list to array
 names = np.asarray(count_chars)
 # plt.title('Length of News Articles')
-plt.xlabel('Number of characters in an article')
+plt.xlabel('Number of characters in article')
 plt.ylabel('Frequency')
 # number of vertical bins
 num_bins = 200
-n, bins, patches = plt.hist(names, num_bins, facecolor='darkslategrey', alpha=0.5)
+n, bins, patches = plt.hist(names, num_bins,
+facecolor='darkslategrey', alpha=0.5)
 # [xmin, xmax, ymin, ymax] of axis
 #plt.axis([format(300, ','),format(10000, ','), 0, 500])
 plt.axis([300,10000,0,500])
 # format axis labels for thousends (e.g. '10,000')
-plt.gca().xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
+plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
+.FuncFormatter(lambda x, p: format(int(x), ',')))
+# save plot
+plt.savefig('visualization\\TextLength_{}.eps'\
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\TextLength_{}.png'\
+.format(VisualizerNews.datestring))
 plt.show()

 def plot_pie_chart_of_sites():
@@ -151,24 +181,24 @@ class VisualizerNews:
 print()

 # load data set
-filepath = 'data\\interactive_labeling_dataset.csv'
+filepath = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(filepath,
 delimiter='|',
-header=0,
+header=None,
 #usecols=[3], #column 'Site'
 index_col=None,
 engine='python',
-#nrows=100,
+nrows=10,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')
-# find all different sites
-df_counts = df_dataset.groupby('Site').count()
-# count occurences of each site
-df_counts = df_counts.sort_values(['Url'], ascending=False)
+# find all different sites, group by 'Site'
+df_counts = df_dataset.groupby(3).count()
+# count occurences of each site, count different 'Url's
+df_counts = df_counts.sort_values([5], ascending=False)

 fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))

-data = list(df_counts['Url'])
+data = list(df_counts[5])
 # legend labels
 labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
 'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']
@@ -185,12 +215,14 @@ class VisualizerNews:

 plt.setp(autotexts, size=8, weight="bold")
 plt.show()
+plt.savefig('Sites_{}.pdf'.format(VisualizerNews.datestring))
+plt.savefig('Sites_{}.pgf'.format(VisualizerNews.datestring))

 def plot_hist_most_common_words(n_commons = 10):
 print('# preparing histogram of most common words...')
 print()
 # load data set
-filepath = 'data\\interactive_labeling_dataset_without_header.csv'
+filepath = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(filepath,
 delimiter='|',
 header=None,
@@ -209,8 +241,13 @@ class VisualizerNews:
 # find most common words in dataset
 extracted_words = BagOfWords.extract_all_words(corpus, stemming)
 vocab = BagOfWords.make_vocab(extracted_words, stemming)
-matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq, stemming)
+matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+stemming)
+dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
+stemming)
+# save dict object
+with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
+pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)

 plt.xlabel('Most common words in textual corpus')
 plt.ylabel('Relative frequency')
@@ -222,11 +259,15 @@ class VisualizerNews:
 height=numbers,
 tick_label=labels,
 facecolor='darkorange')
+plt.savefig('visualization\\10_most_common_words_{}.eps'
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\10_most_common_words_{}.png'
+.format(VisualizerNews.datestring))
 plt.show()

 if __name__ == '__main__':
+VisualizerNews.plot_wordcloud_dataset()
 # VisualizerNews.plot_histogram_companies()
-# VisualizerNews.plot_wordcloud_dataset()
 # VisualizerNews.plot_histogram_text_lengths()
 # VisualizerNews.plot_pie_chart_of_sites()
 VisualizerNews.plot_hist_most_common_words()
(Further changes in this commit are not shown here: four binary files, two file diffs suppressed because they are too large, one diff suppressed because of overly long lines, and one new image of 14 KiB.)