saving objects as pickles

Anne Lorenz 2018-11-05 13:18:03 +01:00
parent 7e037a1621
commit b7d1f546e4
13 changed files with 17639 additions and 81 deletions

BagOfWords.py

@@ -18,6 +18,7 @@ import re
 import numpy as np
 import pandas as pd
 from nltk.stem.porter import PorterStemmer
+import pickle

 class BagOfWords:
@@ -114,6 +115,9 @@ class BagOfWords:
                 else:
                     # absolute word frequency
                     df_matrix.loc[i][v] += 1
+        # save df_matrix object
+        with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
+            pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)
         return df_matrix
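Note: make_matrix() now also writes the document-term matrix to obj/document_term_matrix.pkl. A minimal sketch of reading it back (assumed later usage, not part of the commit; df_matrix is the pandas DataFrame built above):

import pickle

# reload the pickled document-term matrix written by BagOfWords.make_matrix()
with open('obj/document_term_matrix.pkl', 'rb') as f:
    df_matrix = pickle.load(f)
# rows = articles, columns = vocabulary terms
print(df_matrix.shape)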
@@ -170,7 +174,7 @@ class BagOfWords:
         #add unwanted terms
         stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
-                           'file', 'photo', 'min', 'read', 'staff', 'left',
+                           'file', 'photo', 'min', 'read', 'staff', 'left', 'â',
                            'right', 'updated', 'minutes', 'brief', 'editing',
                            'reporting', 'ago', 'also', 'would', 'could',
                            'bit', 'ly', 'fy', 'economist', 'u', 'guardian'])
@@ -202,9 +206,9 @@ class BagOfWords:
         # words under that rel_freq limit are not included
         # set limit
-        limit = 0.001
+        limit = 0.0001
         if not rel_freq:
-            limit = len(df_matrix) * 0.001
+            limit = len(df_matrix) * 0.0001

         # word => count
         dict = {}
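The cut-off is applied to column sums of the matrix, so its meaning depends on rel_freq. A tiny illustration with hypothetical numbers (not from the commit): with relative frequencies the 0.0001 threshold is used directly, with absolute counts it is scaled by the number of articles.

# hypothetical: 10,000 articles, absolute word counts (rel_freq=False)
rel_freq = False
limit = 0.0001
if not rel_freq:
    limit = 10000 * 0.0001   # -> 1.0
# a word whose total count is not greater than 1 is dropped from the dict
print(limit)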
@@ -214,7 +218,8 @@ class BagOfWords:
             # count word mentions in total
             if (df_matrix[column].sum() > limit):
                 dict[column] = df_matrix[column].sum()
-        # sort dict by value and
+        # sort dict by value
         o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
                              reverse=True))
         print(o_dict)
@@ -226,6 +231,10 @@ class BagOfWords:
             next_highest = o_dict.popitem(last=False)
             n_dict[next_highest[0]] = next_highest[1]
+
+        # save n_dict object
+        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
+            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
         return n_dict

     def count_features(texts, stemming=True):
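For reference, the top-n selection above relies on the OrderedDict keeping the descending sort order, so popitem(last=False) always returns the remaining word with the highest count. A standalone sketch with made-up counts:

from collections import OrderedDict

counts = {'trump': 12, 'deal': 9, 'bank': 7, 'rare': 1}
o_dict = OrderedDict(sorted(counts.items(), key=lambda t: t[1], reverse=True))

n_dict = {}
for _ in range(3):                               # keep the 3 most frequent words
    word, count = o_dict.popitem(last=False)     # pops from the front, i.e. the highest count
    n_dict[word] = count
print(n_dict)                                    # {'trump': 12, 'deal': 9, 'bank': 7}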
@@ -245,36 +254,38 @@ class BagOfWords:
         return sum

     def test():
-        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        file = 'data\\cleaned_data_set_without_header.csv'
         df_dataset = pd.read_csv(file,
                                  delimiter='|',
                                  header=None,
                                  index_col=None,
                                  engine='python',
                                  usecols=[1,2],
-                                 nrows=100,
+                                 #nrows=100,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
         corpus = df_dataset[1] + '. ' + df_dataset[2]
         stemming = True
         rel_freq = True
+        #print(BagOfWords.count_features(corpus))
         extracted_words = BagOfWords.extract_all_words(corpus, stemming)
         vocab = BagOfWords.make_vocab(extracted_words, stemming)
-        #print(vocab)
-        for text in corpus:
-            print(text)
-            print()
-            print()
-        # ValueError from here on at nrows=10000...
-        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-        dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
-        print(dict)
+        print(len(vocab))
+
+        # for text in corpus:
+        #     print(text)
+        #     print()
+        #     print()
+        # # ValueError from here on at nrows=10000...
+        # matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+        # dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
+        # print(dict)

 if __name__ == '__main__':
-    for word in sorted(BagOfWords.set_stop_words(False)):
-        print(word)
-        print()
-        print(PorterStemmer().stem(word))
-        print()
-    # BagOfWords.test()
+    # for word in sorted(BagOfWords.set_stop_words(False)):
+    #     print(word)
+    #     print()
+    #     print(PorterStemmer().stem(word))
+    #     print()
+    BagOfWords.test()

CosineSimilarity.py

@@ -62,7 +62,7 @@ class CosineSimilarity:
 if __name__ == '__main__':
     # read data set
-    file = 'data\\interactive_labeling_dataset_without_header.csv'
+    file = 'data\\cleaned_data_set_without_header.csv'
     df = pd.read_csv(file,
                      delimiter='|',
                      header=None,

FileHandler.py

@@ -12,6 +12,7 @@ writes it to a csv file.
 import csv
 import glob
 import json
+import string

 import numpy as np
 import pandas as pd
@@ -39,7 +40,7 @@ class FileHandler:
     def create_labeling_dataset():
         # output file
-        o_file = 'data\\interactive_labeling_dataset.csv'
+        o_file = 'data\\cleaned_data_set_without_header.csv'
         # create file and write header
         with open(o_file, 'w', newline='') as csvfile:
             writer = csv.writer(csvfile,
@@ -77,6 +78,38 @@ class FileHandler:
                            quoting=csv.QUOTE_NONNUMERIC,
                            quotechar='\'')

+    def clean_articles():
+        '''clean articles in data set: filter out all non-printable characters
+        '''
+        # read data set
+        file = 'data\\cleaned_data_set_without_header.csv'
+        df = pd.read_csv(file,
+                         delimiter='|',
+                         header=None,
+                         index_col=None,
+                         engine='python',
+                         #usecols=[1,2],
+                         #nrows=100,
+                         quoting=csv.QUOTE_NONNUMERIC,
+                         quotechar='\'')
+        # for each article (row)
+        for i in range (len(df)):
+            # filter headline
+            df.iloc[i][1] = ''.join(x for x in df.iloc[i][1] if x in string.printable)
+            # filter text
+            df.iloc[i][2] = ''.join(x for x in df.iloc[i][2] if x in string.printable)
+        print(df)
+        # save cleaned dataframe
+        df.to_csv('data\\cleaned_data_set_without_header.csv',
+                  header=False,
+                  index=False,
+                  sep='|',
+                  mode='a',
+                  encoding='utf-8',
+                  quoting=csv.QUOTE_NONNUMERIC,
+                  quotechar='\'')
+
     def write_articles_to_csv_files():
         '''read JSON files, select articles and write them to csv.
         '''
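Note on clean_articles(): string.printable contains only ASCII letters, digits, punctuation and whitespace, so the filter drops every non-ASCII character (including mojibake such as the 'â' added to the stop words above). Two details worth flagging: to_csv() is called with mode='a' on the same file that was just read, which appends the cleaned rows after the original ones, and df.iloc[i][1] = ... is chained indexing that pandas may silently not write back. A hedged sketch of the same step that overwrites the file and assigns via .at (assumptions: default integer index and the CSV layout shown in the diff):

import csv
import string
import pandas as pd

printable = set(string.printable)
file = 'data\\cleaned_data_set_without_header.csv'
df = pd.read_csv(file, delimiter='|', header=None, index_col=None,
                 engine='python', quoting=csv.QUOTE_NONNUMERIC, quotechar='\'')
for i in range(len(df)):
    df.at[i, 1] = ''.join(c for c in df.at[i, 1] if c in printable)   # headline
    df.at[i, 2] = ''.join(c for c in df.at[i, 2] if c in printable)   # article text
df.to_csv(file, header=False, index=False, sep='|', mode='w',
          encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC, quotechar='\'')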
@@ -160,8 +193,8 @@ class FileHandler:
         print('#')
         print('# saved {} articles in total'.format(a))
         print('#')

+    def join_all_csv_files():

 if __name__ == '__main__':
     # FileHandler.write_articles_to_csv_files()
     # FileHandler.create_labeling_dataset()
+    FileHandler.clean_articles()

NER.py

@@ -16,17 +16,31 @@ import numpy as np
 import pandas as pd
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize
+import pickle
+import re

 class NER:

-    company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', 'Plc', 'LLC',
-                       'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB']
+    company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
+                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
+                       'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
+                       's.r.l.', 'Holding', 'Holdings']

     # some entities and misc that are not companies
     misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
-            'European Commission', 'EU', 'Staff', 'Min', 'Read',
-            'Thomson Reuters Trust Principles', 'New York Stock Exchange',
-            'NYSE']
+            'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
+            'NYSE', 'DAX', 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
+            'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
+            'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
+            'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
+            'Autonomous Community of Asturias', 'Fitch Ratings Espana',
+            'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
+            'National Federation of Independent Business', 'Barclays',
+            'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
+
+    regex = r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*|\
+            .*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*|\
+            .*Republican.*|Goldman( Sachs)?|.*Chamber.*|.*Department.*'

     def tag_words(text):
         # path to Stanford NER
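Note on NER.regex: inside a raw string, a backslash before the line break stays in the pattern together with the newline and the indentation of the following line, so the alternatives that start right after each break ('.*Department.*' on the second line, '.*Republican.*' on the third) end up in branches that can only match a literal newline and will practically never fire on an organization name; '.*Department.*' is still covered by its duplicate at the end, '.*Republican.*' is not. A hedged rewrite using implicit string concatenation, with the duplicate branch dropped:

import re

regex = (r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*'
         r'|.*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*'
         r'|.*Republican.*|Goldman( Sachs)?|.*Chamber.*')
print(bool(re.search(regex, 'House Republicans')))   # True
print(bool(re.search(regex, 'Siemens')))             # False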
@@ -61,6 +75,10 @@ class NER:
         '''param: article text where organizations must be identified
         returns: list of identified organisations as strings
         '''
+        # print(text)
+        # print()
+        # print('# examining article...')
+        # print()
         # set paths
         java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
         os.environ['JAVAHOME'] = java_path
@@ -75,9 +93,15 @@ class NER:
         #print(nes_coherent)
         for tuple in nes_coherent:
             # check if company and not already in list
-            if (tuple[0] not in NER.misc) and (tuple[0] not in seen):
+            if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
+                and (not re.search(NER.regex, tuple[0])):
                 organizations.append(tuple[0])
                 seen.add(tuple[0])
+        print('# recognized the following organizations:')
+        print()
+        print(organizations)
+        print()
+        print()
         return organizations
     def count_companies(texts):

@@ -88,14 +112,37 @@ class NER:
         print()
         # dictionary of companies with their count
         dict_com = {}
-        for text in texts:
+        # list of company lists (one per article)
+        coms_list = []
+        for i, text in enumerate(texts):
             # list of found companies in article
+            print('# article no. {}:'.format(i))
             coms = NER.find_companies(text)
+            coms_list.append(coms)
             for com in coms:
                 if com in dict_com.keys():
                     dict_com[com] += 1
                 else:
                     dict_com[com] = 1
+        # print(coms_list)
+        # print()
+        # calculate number of company mentions per article
+        num_companies = []
+        for l in coms_list:
+            num_companies.append(len(l))
+        # print(num_companies)
+        print('# average number of different companies mentioned per article:')
+        print(sum(num_companies)/len(num_companies))
+        print()
+        # save num_companies object in file (for plotting)
+        with open('obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
+            pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL)
+        # save dict_com object in file (for plotting)
+        with open('obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
+            pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL)
+        #print(dict_com)
         # # print outlier
         # print(max(dict_com, key=dict_com.get))
         return list(dict_com.values())
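For the later plots, the two pickled objects can be reloaded without re-running the Stanford tagger. A minimal sketch of that assumed usage (paths as written by count_companies() above):

import pickle

with open('obj/num_mentions_companies.pkl', 'rb') as f:
    num_companies = pickle.load(f)   # list: distinct companies found per article
with open('obj/dict_organizations.pkl', 'rb') as f:
    dict_com = pickle.load(f)        # dict: organization -> number of articles it appears in
print(len(num_companies), len(dict_com))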
@@ -103,27 +150,17 @@ class NER:
 if __name__ == '__main__':
     print('# starting NER...')
     print()
-    test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
-                   EU approval - sources. BRUSSELS (Reuters) - U.S. software
-                   giant Microsoft (MSFT.O) is set to win unconditional EU
-                   antitrust approval for its $7.5 billion purchase of
-                   privately held coding website GitHub, two people familiar
-                   with the matter said on Monday. Microsoft announced the
-                   deal in June, its largest acquisition since it bought
-                   LinkedIn for $26 billion in 2016. The GitHub deal is
-                   expected to boost the U.S. software giants cloud
-                   computing business and challenge market leader Amazon
-                   (AMZN.O). GitHub, the worlds largest code host, has
-                   more than 28 million developers using its platform. It
-                   will become a part of Microsofts Intelligent Cloud unit
-                   once the acquisition is completed. Microsoft Chief
-                   Executive Satya Nadella has tried to assuage users
-                   worries that GitHub might favor Microsoft products
-                   over competitors after the deal, saying GitHub would
-                   continue to be an open platform that works with all
-                   public clouds. The European Commission, which is set to
-                   decide on the deal by Oct. 19, did not respond to a
-                   request for immediate comment. Microsoft declined to
-                   comment. Reporting by Foo Yun Chee; editing by Jason
-                   Neely'''
-    print(NER.find_companies(test_article))
+    # read data set
+    file = 'data\\cleaned_data_set_without_header.csv'
+    df = pd.read_csv(file,
+                     delimiter='|',
+                     header=None,
+                     index_col=None,
+                     engine='python',
+                     #usecols=[1,2],
+                     nrows=100,
+                     quoting=csv.QUOTE_NONNUMERIC,
+                     quotechar='\'')
+    #print(df)
+    texts = df[1] + '. ' + df[2]
+    NER.count_companies(texts)

VisualizerNews.py

@@ -8,7 +8,9 @@ from BagOfWords import BagOfWords
 from NER import NER

 import csv
+from datetime import datetime
 from os import path
+import pickle
 import matplotlib
 import matplotlib.pyplot as plt
@@ -19,42 +21,55 @@ from wordcloud import WordCloud

 class VisualizerNews:

+    datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
+
     def plot_wordcloud_dataset():
         '''plots word cloud image of most common words in dataset.
         '''
         print('# preparing word cloud of 200 most common words...')
         print()
         # load new data set
-        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        file = 'data\\cleaned_data_set_without_header.csv'
         df_dataset = pd.read_csv(file,
                                  delimiter='|',
                                  header=None,
                                  index_col=None,
                                  engine='python',
                                  usecols=[1,2],
+                                 #nrows=100,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
         corpus = df_dataset[1] + '. ' + df_dataset[2]
         stemming = False
-        rel_freq = False
+        rel_freq = True
         # find most common words in dataset
         extracted_words = BagOfWords.extract_all_words(corpus, stemming)
         vocab = BagOfWords.make_vocab(extracted_words, stemming)
-        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-        dict = BagOfWords.make_dict_common_words(matrix, 200, rel_freq, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab,
+                                        rel_freq, stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, 200,
+                                                 rel_freq, stemming)
+        # save dict object
+        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
+            pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
         wordcloud = WordCloud(background_color='white',
                               width=2400,
                               height=1200,
                               scale=2,
                               # true if bigram:
-                              collocations=False).generate_from_frequencies(dict)
+                              collocations=False)\
+                              .generate_from_frequencies(dict)
         # display generated image
         plt.imshow(wordcloud, interpolation='bilinear')
         plt.axis("off")
+        plt.savefig('visualization\\WordCloud_{}.eps'
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\WordCloud_{}.png'
+                    .format(VisualizerNews.datestring))
         plt.show()
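Since the 200-word frequency dict is now pickled, the word cloud can be rebuilt without recomputing the document-term matrix. A minimal sketch of that assumed usage (same WordCloud settings as above):

import pickle
import matplotlib.pyplot as plt
from wordcloud import WordCloud

with open('obj/dict_200_most_common_words.pkl', 'rb') as f:
    freqs = pickle.load(f)   # word -> frequency, as saved above

wc = WordCloud(background_color='white', width=2400, height=1200, scale=2,
               collocations=False).generate_from_frequencies(freqs)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()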
     def plot_histogram_companies():

@@ -66,13 +81,14 @@ class VisualizerNews:
         print('# preparing histogram of company mentions...')
         print()
         # read data set
-        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        file = 'data\\cleaned_data_set_without_header.csv'
         df = pd.read_csv(file,
                          delimiter='|',
                          header=None,
                          index_col=None,
                          engine='python',
                          usecols=[1,2],
+                         #nrows=10,
                          quoting=csv.QUOTE_NONNUMERIC,
                          quotechar='\'')

@@ -93,8 +109,15 @@ class VisualizerNews:
         # Number of companies with this number of mentions
         plt.ylabel('Number of companies with this number of articles')
         num_bins = 50
-        n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
+        n, bins, patches = plt.hist(names, num_bins,
+                                    facecolor='darkred', alpha=0.5)
         plt.axis([0, 50, 0, 1000])
+        # save to file
+        plt.savefig('visualization\\NER_{}.eps'
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\NER_{}.png'
+                    .format(VisualizerNews.datestring))
         plt.show()
     def plot_histogram_text_lengths():

@@ -105,10 +128,10 @@ class VisualizerNews:
         print('# preparing histogram of text lengths...')
         print()
         # read data set
-        filepath = 'data\\interactive_labeling_dataset.csv'
+        filepath = 'data\\cleaned_data_set_without_header.csv'
         df_dataset = pd.read_csv(filepath,
                                  delimiter='|',
-                                 header=0,
+                                 header=None,
                                  index_col=None,
                                  engine='python',
                                  usecols=[2],

@@ -126,23 +149,30 @@ class VisualizerNews:
             count_chars.append(len(text))
         # average of number of characters
         av = int(sum(count_chars) / len(count_chars))
-        print('# average length of news articles is: {} characters'.format(av))
+        print('# average length of news articles is {} characters'.format(av))
         print()
         # sort list in descending order
         count_chars.sort(reverse=True)
         # convert list to array
         names = np.asarray(count_chars)
         # plt.title('Length of News Articles')
-        plt.xlabel('Number of characters in an article')
+        plt.xlabel('Number of characters in article')
         plt.ylabel('Frequency')
         # number of vertical bins
         num_bins = 200
-        n, bins, patches = plt.hist(names, num_bins, facecolor='darkslategrey', alpha=0.5)
+        n, bins, patches = plt.hist(names, num_bins,
+                                    facecolor='darkslategrey', alpha=0.5)
         # [xmin, xmax, ymin, ymax] of axis
         #plt.axis([format(300, ','),format(10000, ','), 0, 500])
         plt.axis([300,10000,0,500])
         # format axis labels for thousands (e.g. '10,000')
-        plt.gca().xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
+        plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
+            .FuncFormatter(lambda x, p: format(int(x), ',')))
+        # save plot
+        plt.savefig('visualization\\TextLength_{}.eps'\
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\TextLength_{}.png'\
+                    .format(VisualizerNews.datestring))
         plt.show()
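The FuncFormatter call above only adds thousands separators to the x-axis tick labels. A standalone sketch of the same idea on hypothetical data:

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

fig, ax = plt.subplots()
ax.hist([400, 1200, 5000, 9000], bins=20)
ax.xaxis.set_major_formatter(
    ticker.FuncFormatter(lambda x, pos: format(int(x), ',')))   # 10000 -> '10,000'
plt.show()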
     def plot_pie_chart_of_sites():

@@ -151,24 +181,24 @@ class VisualizerNews:
         print()
         # load data set
-        filepath = 'data\\interactive_labeling_dataset.csv'
+        filepath = 'data\\cleaned_data_set_without_header.csv'
         df_dataset = pd.read_csv(filepath,
                                  delimiter='|',
-                                 header=0,
+                                 header=None,
                                  #usecols=[3], #column 'Site'
                                  index_col=None,
                                  engine='python',
-                                 #nrows=100,
+                                 nrows=10,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
-        # find all different sites
-        df_counts = df_dataset.groupby('Site').count()
-        # count occurrences of each site
-        df_counts = df_counts.sort_values(['Url'], ascending=False)
+        # find all different sites, group by 'Site'
+        df_counts = df_dataset.groupby(3).count()
+        # count occurrences of each site, count different 'Url's
+        df_counts = df_counts.sort_values([5], ascending=False)
         fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))
-        data = list(df_counts['Url'])
+        data = list(df_counts[5])
         # legend labels
         labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
                   'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']

@@ -185,12 +215,14 @@ class VisualizerNews:
         plt.setp(autotexts, size=8, weight="bold")
         plt.show()
+        plt.savefig('Sites_{}.pdf'.format(VisualizerNews.datestring))
+        plt.savefig('Sites_{}.pgf'.format(VisualizerNews.datestring))
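Because the cleaned CSV has no header row, the pie-chart code now addresses the former 'Site' and 'Url' columns by their positions 3 and 5. A minimal sketch of that grouping on hypothetical rows:

import pandas as pd

# columns 0..5 stand in for the original header; 3 = 'Site', 5 = 'Url' (hypothetical data)
df = pd.DataFrame([['t1', 'h1', 'x', 'reuters.com',     'd', 'u1'],
                   ['t2', 'h2', 'x', 'reuters.com',     'd', 'u2'],
                   ['t3', 'h3', 'x', 'theguardian.com', 'd', 'u3']])
df_counts = df.groupby(3).count().sort_values([5], ascending=False)
print(list(df_counts[5]))   # [2, 1]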
     def plot_hist_most_common_words(n_commons = 10):
         print('# preparing histogram of most common words...')
         print()
         # load data set
-        filepath = 'data\\interactive_labeling_dataset_without_header.csv'
+        filepath = 'data\\cleaned_data_set_without_header.csv'
         df_dataset = pd.read_csv(filepath,
                                  delimiter='|',
                                  header=None,

@@ -209,8 +241,13 @@ class VisualizerNews:
         # find most common words in dataset
         extracted_words = BagOfWords.extract_all_words(corpus, stemming)
         vocab = BagOfWords.make_vocab(extracted_words, stemming)
-        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+                                        stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
+                                                 stemming)
+        # save dict object
+        with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
+            pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)

         plt.xlabel('Most common words in textual corpus')
         plt.ylabel('Relative frequency')

@@ -222,11 +259,15 @@ class VisualizerNews:
                 height=numbers,
                 tick_label=labels,
                 facecolor='darkorange')
+        plt.savefig('visualization\\10_most_common_words_{}.eps'
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\10_most_common_words_{}.png'
+                    .format(VisualizerNews.datestring))
         plt.show()
 if __name__ == '__main__':
+    VisualizerNews.plot_wordcloud_dataset()
     # VisualizerNews.plot_histogram_companies()
-    # VisualizerNews.plot_wordcloud_dataset()
     # VisualizerNews.plot_histogram_text_lengths()
     # VisualizerNews.plot_pie_chart_of_sites()
     VisualizerNews.plot_hist_most_common_words()

The remaining eight changed files are not rendered on this page: obj/dict_organizations.pkl and three further binary files (new, binary content not shown), three data files whose diffs were suppressed as too large, and one new image (14 KiB).