saving objects as pickles

master
Anne Lorenz 2018-11-05 13:18:03 +01:00
parent 7e037a1621
commit b7d1f546e4
13 changed files with 17639 additions and 81 deletions

BagOfWords.py

@@ -18,6 +18,7 @@ import re
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
import pickle
class BagOfWords:
@@ -114,6 +115,9 @@ class BagOfWords:
else:
# absolute word frequency
df_matrix.loc[i][v] += 1
# save df_matrix object
with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)
return df_matrix
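The matrix is persisted with pickle.dump at the highest protocol. A minimal sketch of reading it back in a later session (assuming the obj/ directory and the file written above exist):

import pickle

# load the document-term matrix saved by make_matrix()
with open('obj/document_term_matrix.pkl', 'rb') as f:
    df_matrix = pickle.load(f)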
@@ -170,7 +174,7 @@ class BagOfWords:
#add unwanted terms
stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
'file', 'photo', 'min', 'read', 'staff', 'left',
'file', 'photo', 'min', 'read', 'staff', 'left', 'â',
'right', 'updated', 'minutes', 'brief', 'editing',
'reporting', 'ago', 'also', 'would', 'could',
'bit', 'ly', 'fy', 'economist', 'u', 'guardian'])
@@ -202,9 +206,9 @@ class BagOfWords:
# words under that rel_freq limit are not included
# set limit
limit = 0.001
limit = 0.0001
if not rel_freq:
limit = len(df_matrix) * 0.001
limit = len(df_matrix) * 0.0001
# word => count
dict = {}
@@ -214,7 +218,8 @@ class BagOfWords:
# count word mentions in total
if (df_matrix[column].sum() > limit):
dict[column] = df_matrix[column].sum()
# sort dict by value and
# sort dict by value
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
reverse=True))
print(o_dict)
@@ -226,6 +231,10 @@ class BagOfWords:
next_highest = o_dict.popitem(last=False)
n_dict[next_highest[0]] = next_highest[1]
# save n_dict object
with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
return n_dict
def count_features(texts, stemming=True):
@@ -245,36 +254,38 @@ class BagOfWords:
return sum
def test():
file = 'data\\interactive_labeling_dataset_without_header.csv'
file = 'data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(file,
delimiter='|',
header=None,
index_col=None,
engine='python',
usecols=[1,2],
nrows=100,
#nrows=100,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
corpus = df_dataset[1] + '. ' + df_dataset[2]
stemming = True
rel_freq = True
#print(BagOfWords.count_features(corpus))
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
vocab = BagOfWords.make_vocab(extracted_words, stemming)
#print(vocab)
for text in corpus:
print(text)
print()
print()
# ValueError occurs from here on with nrows=10000...
matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
print(dict)
print(len(vocab))
# for text in corpus:
# print(text)
# print()
# print()
# # ValueError occurs from here on with nrows=10000...
# matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
# dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
# print(dict)
if __name__ == '__main__':
for word in sorted(BagOfWords.set_stop_words(False)):
print(word)
print()
print(PorterStemmer().stem(word))
print()
# BagOfWords.test()
# for word in sorted(BagOfWords.set_stop_words(False)):
# print(word)
# print()
# print(PorterStemmer().stem(word))
# print()
BagOfWords.test()

CosineSimilarity.py

@@ -62,7 +62,7 @@ class CosineSimilarity:
if __name__ == '__main__':
# read data set
file = 'data\\interactive_labeling_dataset_without_header.csv'
file = 'data\\cleaned_data_set_without_header.csv'
df = pd.read_csv(file,
delimiter='|',
header=None,

FileHandler.py

@@ -12,6 +12,7 @@ writes it to a csv file.
import csv
import glob
import json
import string
import numpy as np
import pandas as pd
@@ -39,7 +40,7 @@ class FileHandler:
def create_labeling_dataset():
# output file
o_file = 'data\\interactive_labeling_dataset.csv'
o_file = 'data\\cleaned_data_set_without_header.csv'
# create file and write header
with open(o_file, 'w', newline='') as csvfile:
writer = csv.writer(csvfile,
@@ -77,6 +78,38 @@ class FileHandler:
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
def clean_articles():
'''clean articles in data set: filter out all non-printable characters
'''
# read data set
file = 'data\\cleaned_data_set_without_header.csv'
df = pd.read_csv(file,
delimiter='|',
header=None,
index_col=None,
engine='python',
#usecols=[1,2],
#nrows=100,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
# for each article (row)
for i in range(len(df)):
# filter headline
df.iloc[i][1] = ''.join(x for x in df.iloc[i][1] if x in string.printable)
# filter text
df.iloc[i][2] = ''.join(x for x in df.iloc[i][2] if x in string.printable)
print(df)
# save cleaned dataframe
df.to_csv('data\\cleaned_data_set_without_header.csv',
header=False,
index=False,
sep='|',
mode='a',
encoding='utf-8',
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
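A note on the loop above: assigning through chained indexing (df.iloc[i][1] = ...) can trigger pandas' SettingWithCopyWarning and may not write the cleaned string back into the frame. A hedged sketch of an equivalent cleaning step that writes back explicitly, using a hypothetical helper clean_printable and the integer column labels produced by the read above (headline in column 1, text in column 2):

import string
import pandas as pd

def clean_printable(df: pd.DataFrame) -> pd.DataFrame:
    # keep only printable characters in headline (column 1) and text (column 2)
    printable = set(string.printable)
    for col in (1, 2):
        df[col] = df[col].map(lambda s: ''.join(c for c in s if c in printable))
    return df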
def write_articles_to_csv_files():
'''read JSON files, select articles and write them to csv.
'''
@@ -160,8 +193,8 @@ class FileHandler:
print('#')
print('# saved {} articles in total'.format(a))
print('#')
def join_all_csv_files():
if __name__ == '__main__':
# FileHandler.write_articles_to_csv_files()
# FileHandler.create_labeling_dataset()
# FileHandler.create_labeling_dataset()
FileHandler.clean_articles()

NER.py

@@ -16,17 +16,31 @@ import numpy as np
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
import pickle
import re
class NER:
company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', 'Plc', 'LLC',
'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB']
company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
's.r.l.', 'Holding', 'Holdings']
# some entities and misc that are not companies
misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
'European Commission', 'EU', 'Staff', 'Min', 'Read',
'Thomson Reuters Trust Principles', 'New York Stock Exchange',
'NYSE']
'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
'NYSE', 'DAX', 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
'Autonomous Community of Asturias', 'Fitch Ratings Espana',
'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
'National Federation of Independent Business', 'Barclays',
'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
regex = (r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*|'
r'.*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*|'
r'.*Republican.*|Goldman( Sachs)?|.*Chamber.*|.*Department.*')
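As an illustrative check of the filter (assuming the alternation is assembled into a single pattern string as above), names containing terms such as 'Bank', 'Reuters' or 'Ministry' are dropped, while plain company names pass through:

import re
from NER import NER

print(re.search(NER.regex, 'Deutsche Bank') is not None)   # True: matched by '.*(B|b)ank.*', filtered out
print(re.search(NER.regex, 'Siemens') is not None)         # False: kept as an organization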
def tag_words(text):
# path to Stanford NER
@@ -61,6 +75,10 @@ class NER:
'''param: article text where organizations must be identified
returns: list of identified organizations as strings
'''
# print(text)
# print()
# print('# examining article...')
# print()
# set paths
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
os.environ['JAVAHOME'] = java_path
@@ -75,9 +93,15 @@ class NER:
#print(nes_coherent)
for tuple in nes_coherent:
# check if company and not already in list
if (tuple[0] not in NER.misc) and (tuple[0] not in seen):
if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
and (not re.search(NER.regex, tuple[0])):
organizations.append(tuple[0])
seen.add(tuple[0])
print('# recognized the following organizations:')
print()
print(organizations)
print()
print()
return organizations
def count_companies(texts):
@@ -88,14 +112,37 @@ class NER:
print()
# dictionary of companies with their count
dict_com = {}
for text in texts:
# list of company lists (one per article)
coms_list = []
for i, text in enumerate(texts):
# list of found companies in article
print('# article no. {}:'.format(i))
coms = NER.find_companies(text)
coms_list.append(coms)
for com in coms:
if com in dict_com.keys():
dict_com[com] += 1
else:
dict_com[com] = 1
# print(coms_list)
# print()
# calculate number of company mentions per article
num_companies = []
for l in coms_list:
num_companies.append(len(l))
# print(num_companies)
print('# average number of different companies mentioned per article:')
print(sum(num_companies)/len(num_companies))
print()
# save num_companies object in file (for plotting)
with open('obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL)
# save dict_com object in file (for plotting)
with open('obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL)
#print(dict_com)
# # print outlier
# print(max(dict_com, key=dict_com.get))
return list(dict_com.values())
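Because the per-article counts are now pickled, a later plotting run can reload them instead of re-running the Stanford tagger. An illustrative sketch (not the repository's plotting code) of reading the saved list back and drawing a histogram:

import pickle
import matplotlib.pyplot as plt

with open('obj/num_mentions_companies.pkl', 'rb') as f:
    num_companies = pickle.load(f)
plt.hist(num_companies, bins=20, facecolor='darkred', alpha=0.5)
plt.xlabel('Different companies mentioned in an article')
plt.ylabel('Number of articles')
plt.show()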
@@ -103,27 +150,17 @@ class NER:
if __name__ == '__main__':
print('# starting NER...')
print()
test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
EU approval - sources. BRUSSELS (Reuters) - U.S. software
giant Microsoft (MSFT.O) is set to win unconditional EU
antitrust approval for its $7.5 billion purchase of
privately held coding website GitHub, two people familiar
with the matter said on Monday. Microsoft announced the
deal in June, its largest acquisition since it bought
LinkedIn for $26 billion in 2016. The GitHub deal is
expected to boost the U.S. software giants cloud
computing business and challenge market leader Amazon
(AMZN.O). GitHub, the worlds largest code host, has
more than 28 million developers using its platform. It
will become a part of Microsofts Intelligent Cloud unit
once the acquisition is completed. Microsoft Chief
Executive Satya Nadella has tried to assuage users
worries that GitHub might favor Microsoft products
over competitors after the deal, saying GitHub would
continue to be an open platform that works with all
public clouds. The European Commission, which is set to
decide on the deal by Oct. 19, did not respond to a
request for immediate comment. Microsoft declined to
comment. Reporting by Foo Yun Chee; editing by Jason
Neely'''
print(NER.find_companies(test_article))
# read data set
file = 'data\\cleaned_data_set_without_header.csv'
df = pd.read_csv(file,
delimiter='|',
header=None,
index_col=None,
engine='python',
#usecols=[1,2],
nrows=100,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
#print(df)
texts = df[1] + '. ' + df[2]
NER.count_companies(texts)

VisualizerNews.py

@@ -8,7 +8,9 @@ from BagOfWords import BagOfWords
from NER import NER
import csv
from datetime import datetime
from os import path
import pickle
import matplotlib
import matplotlib.pyplot as plt
@@ -19,42 +21,55 @@ from wordcloud import WordCloud
class VisualizerNews:
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
def plot_wordcloud_dataset():
'''plots word cloud image of most common words in dataset.
'''
print('# preparing word cloud of 200 most common words...')
print()
# load new data set
file = 'data\\interactive_labeling_dataset_without_header.csv'
file = 'data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(file,
delimiter='|',
header=None,
index_col=None,
engine='python',
usecols=[1,2],
#nrows=100,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
corpus = df_dataset[1] + '. ' + df_dataset[2]
stemming = False
rel_freq = False
rel_freq = True
# find most common words in dataset
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
vocab = BagOfWords.make_vocab(extracted_words, stemming)
matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
dict = BagOfWords.make_dict_common_words(matrix, 200, rel_freq, stemming)
matrix = BagOfWords.make_matrix(extracted_words, vocab,
rel_freq, stemming)
dict = BagOfWords.make_dict_common_words(matrix, 200,
rel_freq, stemming)
# save dict object
with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
wordcloud = WordCloud(background_color='white',
width=2400,
height=1200,
scale=2,
# true if bigram:
collocations=False).generate_from_frequencies(dict)
collocations=False)\
.generate_from_frequencies(dict)
# display generated image
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.savefig('visualization\\WordCloud_{}.eps'
.format(VisualizerNews.datestring))
plt.savefig('visualization\\WordCloud_{}.png'
.format(VisualizerNews.datestring))
plt.show()
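With the frequencies persisted, the word cloud can later be rebuilt from the pickle without recomputing the term matrix. A minimal sketch, assuming the file written above exists:

import pickle
import matplotlib.pyplot as plt
from wordcloud import WordCloud

with open('obj/dict_200_most_common_words.pkl', 'rb') as f:
    freqs = pickle.load(f)
wordcloud = WordCloud(background_color='white', width=2400, height=1200, scale=2,
                      collocations=False).generate_from_frequencies(freqs)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()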
def plot_histogram_companies():
@@ -66,13 +81,14 @@ class VisualizerNews:
print('# preparing histogram of company mentions...')
print()
# read data set
file = 'data\\interactive_labeling_dataset_without_header.csv'
file = 'data\\cleaned_data_set_without_header.csv'
df = pd.read_csv(file,
delimiter='|',
header=None,
index_col=None,
engine='python',
usecols=[1,2],
#nrows=10,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
@@ -93,8 +109,15 @@ class VisualizerNews:
# Number of companies with this number of mentions
plt.ylabel('Number of companies with this number of articles')
num_bins = 50
n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
n, bins, patches = plt.hist(names, num_bins,
facecolor='darkred', alpha=0.5)
plt.axis([0, 50, 0, 1000])
# save to file
plt.savefig('visualization\\NER_{}.eps'
.format(VisualizerNews.datestring))
plt.savefig('visualization\\NER_{}.png'
.format(VisualizerNews.datestring))
plt.show()
def plot_histogram_text_lengths():
@@ -105,10 +128,10 @@ class VisualizerNews:
print('# preparing histogram of text lengths...')
print()
# read data set
filepath = 'data\\interactive_labeling_dataset.csv'
filepath = 'data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(filepath,
delimiter='|',
header=0,
header=None,
index_col=None,
engine='python',
usecols=[2],
@@ -126,23 +149,30 @@ class VisualizerNews:
count_chars.append(len(text))
# average of number of characters
av = int(sum(count_chars) / len(count_chars))
print('# average length of news articles is: {} characters'.format(av))
print('# average length of news articles is {} characters'.format(av))
print()
# sort list in descending order
count_chars.sort(reverse=True)
# convert list to array
names = np.asarray(count_chars)
# plt.title('Length of News Articles')
plt.xlabel('Number of characters in an article')
plt.xlabel('Number of characters in article')
plt.ylabel('Frequency')
# number of vertical bins
num_bins = 200
n, bins, patches = plt.hist(names, num_bins, facecolor='darkslategrey', alpha=0.5)
n, bins, patches = plt.hist(names, num_bins,
facecolor='darkslategrey', alpha=0.5)
# [xmin, xmax, ymin, ymax] of axis
#plt.axis([format(300, ','),format(10000, ','), 0, 500])
plt.axis([300,10000,0,500])
# format axis labels for thousands (e.g. '10,000')
plt.gca().xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
.FuncFormatter(lambda x, p: format(int(x), ',')))
# save plot
plt.savefig('visualization\\TextLength_{}.eps'\
.format(VisualizerNews.datestring))
plt.savefig('visualization\\TextLength_{}.png'\
.format(VisualizerNews.datestring))
plt.show()
def plot_pie_chart_of_sites():
@@ -151,24 +181,24 @@ class VisualizerNews:
print()
# load data set
filepath = 'data\\interactive_labeling_dataset.csv'
filepath = 'data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(filepath,
delimiter='|',
header=0,
header=None,
#usecols=[3], #column 'Site'
index_col=None,
engine='python',
#nrows=100,
nrows=10,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
# find all different sites
df_counts = df_dataset.groupby('Site').count()
# count occurrences of each site
df_counts = df_counts.sort_values(['Url'], ascending=False)
# find all different sites, group by 'Site'
df_counts = df_dataset.groupby(3).count()
# count occurrences of each site, count different 'Url's
df_counts = df_counts.sort_values([5], ascending=False)
fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))
data = list(df_counts['Url'])
data = list(df_counts[5])
# legend labels
labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']
@@ -185,12 +215,14 @@ class VisualizerNews:
plt.setp(autotexts, size=8, weight="bold")
plt.show()
plt.savefig('Sites_{}.pdf'.format(VisualizerNews.datestring))
plt.savefig('Sites_{}.pgf'.format(VisualizerNews.datestring))
def plot_hist_most_common_words(n_commons = 10):
print('# preparing histogram of most common words...')
print()
# load data set
filepath = 'data\\interactive_labeling_dataset_without_header.csv'
filepath = 'data\\cleaned_data_set_without_header.csv'
df_dataset = pd.read_csv(filepath,
delimiter='|',
header=None,
@@ -209,8 +241,13 @@ class VisualizerNews:
# find most common words in dataset
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
vocab = BagOfWords.make_vocab(extracted_words, stemming)
matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq, stemming)
matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
stemming)
dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
stemming)
# save dict object
with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)
plt.xlabel('Most common words in textual corpus')
plt.ylabel('Relative frequency')
@@ -222,11 +259,15 @@ class VisualizerNews:
height=numbers,
tick_label=labels,
facecolor='darkorange')
plt.savefig('visualization\\10_most_common_words_{}.eps'
.format(VisualizerNews.datestring))
plt.savefig('visualization\\10_most_common_words_{}.png'
.format(VisualizerNews.datestring))
plt.show()
if __name__ == '__main__':
VisualizerNews.plot_wordcloud_dataset()
# VisualizerNews.plot_histogram_companies()
# VisualizerNews.plot_wordcloud_dataset()
# VisualizerNews.plot_histogram_text_lengths()
# VisualizerNews.plot_pie_chart_of_sites()
VisualizerNews.plot_hist_most_common_words()

The remaining changed files are binary or too large to display in the diff (one added image, 14 KiB).