saving objects as pickles

2018-11-05 13:18:03 +01:00 · 2018-11-05 13:18:03 +01:00 · b7d1f546e4
commit b7d1f546e4
parent 7e037a1621
13 changed files with 17639 additions and 81 deletions
--- a/BagOfWords.py
+++ b/BagOfWords.py
@ -18,6 +18,7 @@ import re
 import numpy as np
 import pandas as pd
 from nltk.stem.porter import PorterStemmer
+import pickle

 class BagOfWords:

@ -114,6 +115,9 @@ class BagOfWords:
                        else:
                            # absolute word frequency
                            df_matrix.loc[i][v] += 1
+        # save df_matrix object
+        with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
+            pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)

        return df_matrix

@ -170,7 +174,7 @@ class BagOfWords:

        #add unwanted terms
        stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
-                           'file', 'photo', 'min', 'read', 'staff', 'left',
+                           'file', 'photo', 'min', 'read', 'staff', 'left', 'â',
                           'right', 'updated', 'minutes', 'brief', 'editing',
                           'reporting', 'ago', 'also', 'would', 'could',
                           'bit', 'ly', 'fy', 'economist', 'u', 'guardian'])
@ -202,9 +206,9 @@ class BagOfWords:

        # words under that rel_freq limit are not included
        # set limit
-        limit = 0.001
+        limit = 0.0001
        if not rel_freq:
-            limit = len(df_matrix) * 0.001
+            limit = len(df_matrix) * 0.0001

        # word => count
        dict = {}
@ -214,7 +218,8 @@ class BagOfWords:
            # count word mentions in total
            if (df_matrix[column].sum() > limit):
                dict[column] = df_matrix[column].sum()
-        # sort dict by value and 
+
+        # sort dict by value
        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
                             reverse=True))
        print(o_dict)
@ -226,6 +231,10 @@ class BagOfWords:
            next_highest = o_dict.popitem(last=False)
            n_dict[next_highest[0]] = next_highest[1]

+        # save n_dict object
+        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
+            pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+
        return n_dict

    def count_features(texts, stemming=True):
@ -245,36 +254,38 @@ class BagOfWords:
        return sum

    def test():
-        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        file = 'data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(file,
                                 delimiter='|',
                                 header=None,
                                 index_col=None,
                                 engine='python',
                                 usecols=[1,2],
-                                 nrows=100,
+                                 #nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')

        corpus = df_dataset[1] + '. ' + df_dataset[2]
        stemming = True
        rel_freq = True
+        #print(BagOfWords.count_features(corpus))
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
-        #print(vocab)
-        for text in corpus:
-            print(text)
-            print()
-            print()
-        # ab hier ValueError bei nrows=10000...
-        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-        dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
-        print(dict)
+        print(len(vocab))
+
+        # for text in corpus:
+            # print(text)
+            # print()
+            # print()
+        # # from here on: ValueError with nrows=10000...
+        # matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+        # dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
+        # print(dict)

 if __name__ == '__main__':
-    for word in sorted(BagOfWords.set_stop_words(False)):
-        print(word)
-        print()
-        print(PorterStemmer().stem(word))
-        print()
-    # BagOfWords.test()
+    # for word in sorted(BagOfWords.set_stop_words(False)):
+        # print(word)
+        # print()
+        # print(PorterStemmer().stem(word))
+        # print()
+    BagOfWords.test()
--- a/CosineSimilarity.py
+++ b/CosineSimilarity.py
@ -62,7 +62,7 @@ class CosineSimilarity:

 if __name__ == '__main__':
        # read data set
-        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        file = 'data\\cleaned_data_set_without_header.csv'
        df = pd.read_csv(file,
                         delimiter='|',
                         header=None,
--- a/FileHandler.py
+++ b/FileHandler.py
@ -12,6 +12,7 @@ writes it to a csv file.
 import csv
 import glob
 import json
+import string

 import numpy as np
 import pandas as pd
@ -39,7 +40,7 @@ class FileHandler:

    def create_labeling_dataset():
        # output file
-        o_file = 'data\\interactive_labeling_dataset.csv'
+        o_file = 'data\\cleaned_data_set_without_header.csv'
        # create file and write header
        with open(o_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, 
@ -77,6 +78,38 @@ class FileHandler:
                                   quoting=csv.QUOTE_NONNUMERIC,
                                   quotechar='\'')

+    def clean_articles():
+        '''clean articles in data set: remove all characters that are not
+        in string.printable from headline (column 1) and text (column 2)
+        of every row and write the result back to the same file.
+        '''
+        # read data set
+        file = 'data\\cleaned_data_set_without_header.csv'
+        df = pd.read_csv(file,
+                         delimiter='|',
+                         header=None,
+                         index_col=None,
+                         engine='python',
+                         quoting=csv.QUOTE_NONNUMERIC,
+                         quotechar='\'')
+
+        printable = set(string.printable)
+        # filter headline and text of each article (row); assign whole
+        # columns, because df.iloc[i][1] = ... may only change a copy
+        for col in (1, 2):
+            df[col] = df[col].map(
+                lambda s: ''.join(x for x in s if x in printable))
+        print(df)
+        # save cleaned dataframe; overwrite the file instead of appending,
+        # otherwise every article ends up in it twice
+        df.to_csv(file,
+                  header=False,
+                  index=False,
+                  sep='|',
+                  encoding='utf-8',
+                  quoting=csv.QUOTE_NONNUMERIC,
+                  quotechar='\'')
+
    def write_articles_to_csv_files():
        '''read JSON files, select articles and write them to csv.
        '''
@ -160,8 +193,8 @@ class FileHandler:
            print('#')
        print('# saved {} articles in total'.format(a))
        print('#')
-    def join_all_csv_files():

 if __name__ == '__main__':
    # FileHandler.write_articles_to_csv_files()
-    # FileHandler.create_labeling_dataset()
+    # FileHandler.create_labeling_dataset()
+    FileHandler.clean_articles()
--- a/NER.py
+++ b/NER.py
@ -16,17 +16,31 @@ import numpy as np
 import pandas as pd
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize
+import pickle
+import re

 class NER:

-    company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', 'Plc', 'LLC',
-                       'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB']
+    company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
+                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
+                       'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
+                       's.r.l.', 'Holding', 'Holdings']

    # some entities and misc that are not companies
    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
-            'European Commission', 'EU', 'Staff', 'Min', 'Read', 
-            'Thomson Reuters Trust Principles', 'New York Stock Exchange',
-            'NYSE']
+            'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
+            'NYSE', 'DAX', 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
+            'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
+            'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
+            'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
+            'Autonomous Community of Asturias', 'Fitch Ratings Espana',
+            'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
+            'National Federation of Independent Business', 'Barclays',
+            'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
+
+    # pattern for organization names that are not companies; built from
+    # adjacent raw strings, because a backslash-newline inside a raw string
+    # is kept literally and would become part of the pattern
+    regex = (r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*'
+             r'|.*Banca.*|.*Department.*|.*House.*|Wall (Street|Str).*'
+             r'|.*Congress.*|.*Republican.*|Goldman( Sachs)?|.*Chamber.*')

    def tag_words(text):
        # path to Stanford NER
@ -61,6 +75,10 @@ class NER:
        '''param: article text where organizations must be indentified
        returns: list of identified organisations as strings
        '''
+        # print(text)
+        # print()
+        # print('# examining article...')
+        # print()
        # set paths
        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
        os.environ['JAVAHOME'] = java_path
@ -75,9 +93,15 @@ class NER:
        #print(nes_coherent)
        for tuple in nes_coherent:
            # check if company and not already in list
-            if (tuple[0] not in NER.misc) and (tuple[0] not in seen):
+            if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
+                and (not re.search(NER.regex, tuple[0])):
                organizations.append(tuple[0])
                seen.add(tuple[0])
+        print('# recognized the following organizations:')
+        print()
+        print(organizations)
+        print()
+        print()
        return organizations

    def count_companies(texts):
@ -88,14 +112,37 @@ class NER:
        print()
        # dictionary of companies with their count
        dict_com = {}
-        for text in texts:
+        # list of company lists (one per article)
+        coms_list = []
+        for i, text in enumerate(texts):
            # list of found companies in article
+            print('# article no. {}:'.format(i))
            coms = NER.find_companies(text)
+            coms_list.append(coms)
+
            for com in coms:
                if com in dict_com.keys():
                    dict_com[com] += 1
                else:
                    dict_com[com] = 1
+        # print(coms_list)
+        # print()
+        # calculate number of company mentions per article
+        num_companies = []
+        for l in coms_list:
+            num_companies.append(len(l))
+        # print(num_companies)
+        print('# average number of different companies mentioned per article:')
+        print(sum(num_companies)/len(num_companies))
+        print()
+        # save num_companies object in file (for plotting)
+        with open('obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
+            pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL)
+        # save dict_com object in file (for plotting)
+        with open('obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
+            pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL)
+
+        #print(dict_com)
        # # print outlier
        # print(max(dict_com, key=dict_com.get))
        return list(dict_com.values())
@ -103,27 +150,17 @@ class NER:
 if __name__ == '__main__':
    print('# starting NER...')
    print()
-    test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
-                    EU approval - sources. BRUSSELS (Reuters) - U.S. software
-                    giant Microsoft (MSFT.O) is set to win unconditional EU
-                    antitrust approval for its $7.5 billion purchase of
-                    privately held coding website GitHub, two people familiar
-                    with the matter said on Monday. Microsoft announced the
-                    deal in June, its largest acquisition since it bought
-                    LinkedIn for $26 billion in 2016. The GitHub deal is
-                    expected to boost the U.S. software giant’s cloud
-                    computing business and challenge market leader Amazon
-                    (AMZN.O). GitHub, the world’s largest code host, has
-                    more than 28 million developers using its platform. It
-                    will become a part of Microsoft’s Intelligent Cloud unit
-                    once the acquisition is completed. Microsoft Chief
-                    Executive Satya Nadella has tried to assuage users’
-                    worries that GitHub might favor Microsoft products
-                    over competitors after the deal, saying GitHub would
-                    continue to be an open platform that works with all
-                    public clouds. The European Commission, which is set to
-                    decide on the deal by Oct. 19, did not respond to a
-                    request for immediate comment. Microsoft declined to
-                    comment. Reporting by Foo Yun Chee; editing by Jason
-                    Neely'''
-    print(NER.find_companies(test_article))
+    # read data set
+    file = 'data\\cleaned_data_set_without_header.csv'
+    df = pd.read_csv(file,
+                     delimiter='|',
+                     header=None,
+                     index_col=None,
+                     engine='python',
+                     #usecols=[1,2],
+                     nrows=100,
+                     quoting=csv.QUOTE_NONNUMERIC,
+                     quotechar='\'')
+    #print(df)
+    texts = df[1] + '. ' + df[2]
+    NER.count_companies(texts)
--- a/VisualizerNews.py
+++ b/VisualizerNews.py
@ -8,7 +8,9 @@ from BagOfWords import BagOfWords
 from NER import NER

 import csv
+from datetime import datetime
 from os import path
+import pickle

 import matplotlib
 import matplotlib.pyplot as plt
@ -19,42 +21,55 @@ from wordcloud import WordCloud

 class VisualizerNews:

+    datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
+
    def plot_wordcloud_dataset():
        '''plots word cloud image of most common words in dataset.
        '''
        print('# preparing word cloud of 200 most common words...')
        print()
        # load new data set
-        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        file = 'data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(file,
                                 delimiter='|',
                                 header=None,
                                 index_col=None,
                                 engine='python',
                                 usecols=[1,2],
+                                 #nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')

        corpus = df_dataset[1] + '. ' + df_dataset[2]
        stemming = False
-        rel_freq = False
+        rel_freq = True

        # find most common words in dataset
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
-        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-        dict = BagOfWords.make_dict_common_words(matrix, 200, rel_freq, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab,
+                                        rel_freq, stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, 200,
+                                                 rel_freq, stemming)
+        # save dict object (the local is named dict; n_dict is not defined here)
+        with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
+            pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)

        wordcloud = WordCloud(background_color='white',
                              width=2400, 
                              height=1200, 
                              scale=2,
                              # true if bigram:
-                              collocations=False).generate_from_frequencies(dict)
+                              collocations=False)\
+                              .generate_from_frequencies(dict)

        # display generated image
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
+        plt.savefig('visualization\\WordCloud_{}.eps'
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\WordCloud_{}.png'
+                    .format(VisualizerNews.datestring))
        plt.show()

    def plot_histogram_companies():
@ -66,13 +81,14 @@ class VisualizerNews:
        print('# preparing histogram of company mentions...')
        print()
        # read data set
-        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        file = 'data\\cleaned_data_set_without_header.csv'
        df = pd.read_csv(file,
                         delimiter='|',
                         header=None,
                         index_col=None,
                         engine='python',
                         usecols=[1,2],
+                         #nrows=10,
                         quoting=csv.QUOTE_NONNUMERIC,
                         quotechar='\'')

@ -93,8 +109,15 @@ class VisualizerNews:
        # Number of companies with this number of mentions
        plt.ylabel('Number of companies with this number of articles')
        num_bins = 50
-        n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
+        n, bins, patches = plt.hist(names, num_bins,
+                                    facecolor='darkred', alpha=0.5)
        plt.axis([0, 50, 0, 1000])
+
+        # save to file
+        plt.savefig('visualization\\NER_{}.eps'
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\NER_{}.png'
+                    .format(VisualizerNews.datestring))
        plt.show()

    def plot_histogram_text_lengths():
@ -105,10 +128,10 @@ class VisualizerNews:
        print('# preparing histogram of text lengths...')
        print()
        # read data set
-        filepath = 'data\\interactive_labeling_dataset.csv'
+        filepath = 'data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(filepath,
                                 delimiter='|',
-                                 header=0,
+                                 header=None,
                                 index_col=None,
                                 engine='python',
                                 usecols=[2],
@ -126,23 +149,30 @@ class VisualizerNews:
            count_chars.append(len(text))
        # average of number of characters
        av = int(sum(count_chars) / len(count_chars))
-        print('# average length of news articles is: {} characters'.format(av))
+        print('# average length of news articles is {} characters'.format(av))
        print()
        # sort list in descending order
        count_chars.sort(reverse=True)
        # convert list to array
        names = np.asarray(count_chars)
        # plt.title('Length of News Articles')
-        plt.xlabel('Number of characters in an article')
+        plt.xlabel('Number of characters in article')
        plt.ylabel('Frequency')
        # number of vertical bins
        num_bins = 200
-        n, bins, patches = plt.hist(names, num_bins, facecolor='darkslategrey', alpha=0.5)
+        n, bins, patches = plt.hist(names, num_bins,
+                                    facecolor='darkslategrey', alpha=0.5)
        # [xmin, xmax, ymin, ymax] of axis
        #plt.axis([format(300, ','),format(10000, ','), 0, 500])
        plt.axis([300,10000,0,500])
        # format axis labels for thousends (e.g. '10,000')
-        plt.gca().xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
+        plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
+            .FuncFormatter(lambda x, p: format(int(x), ',')))
+        # save plot
+        plt.savefig('visualization\\TextLength_{}.eps'\
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\TextLength_{}.png'\
+                    .format(VisualizerNews.datestring))
        plt.show()

    def plot_pie_chart_of_sites():
@ -151,24 +181,24 @@ class VisualizerNews:
        print()

        # load data set
-        filepath = 'data\\interactive_labeling_dataset.csv'
+        filepath = 'data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(filepath,
                                 delimiter='|',
-                                 header=0,
+                                 header=None,
                                 #usecols=[3], #column 'Site'
                                 index_col=None,
                                 engine='python',
-                                 #nrows=100,
+                                 nrows=10,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')
-        # find all different sites
-        df_counts = df_dataset.groupby('Site').count()
-        # count occurences of each site
-        df_counts = df_counts.sort_values(['Url'], ascending=False)
+        # find all different sites, group by 'Site'
+        df_counts = df_dataset.groupby(3).count()
+        # count occurrences of each site, count different 'Url's
+        df_counts = df_counts.sort_values([5], ascending=False)

        fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))

-        data = list(df_counts['Url'])
+        data = list(df_counts[5])
        # legend labels
        labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)', 
                  'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']
@ -185,12 +215,14 @@ class VisualizerNews:

        plt.setp(autotexts, size=8, weight="bold")
        plt.show()
+        plt.savefig('Sites_{}.pdf'.format(VisualizerNews.datestring))
+        plt.savefig('Sites_{}.pgf'.format(VisualizerNews.datestring))

    def plot_hist_most_common_words(n_commons = 10):
        print('# preparing histogram of most common words...')
        print()
        # load data set
-        filepath = 'data\\interactive_labeling_dataset_without_header.csv'
+        filepath = 'data\\cleaned_data_set_without_header.csv'
        df_dataset = pd.read_csv(filepath,
                                 delimiter='|',
                                 header=None,
@ -209,8 +241,13 @@ class VisualizerNews:
        # find most common words in dataset
        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
        vocab = BagOfWords.make_vocab(extracted_words, stemming)
-        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+                                        stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
+                                                 stemming)
+        # save dict object (the local is named dict; n_dict is not defined here)
+        with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
+            pickle.dump(dict, f, pickle.HIGHEST_PROTOCOL)

        plt.xlabel('Most common words in textual corpus')
        plt.ylabel('Relative frequency')
@ -222,11 +259,15 @@ class VisualizerNews:
                height=numbers, 
                tick_label=labels, 
                facecolor='darkorange')
+        plt.savefig('visualization\\10_most_common_words_{}.eps'
+                    .format(VisualizerNews.datestring))
+        plt.savefig('visualization\\10_most_common_words_{}.png'
+                    .format(VisualizerNews.datestring))
        plt.show()

 if __name__ == '__main__':
+    VisualizerNews.plot_wordcloud_dataset()
    # VisualizerNews.plot_histogram_companies()
-    # VisualizerNews.plot_wordcloud_dataset()
    # VisualizerNews.plot_histogram_text_lengths()
    # VisualizerNews.plot_pie_chart_of_sites()
    VisualizerNews.plot_hist_most_common_words()
--- a/data/cleaned_data_set_without_header.csv
+++ b/data/cleaned_data_set_without_header.csv
--- a/obj/dict_organizations.pkl
+++ b/obj/dict_organizations.pkl
--- a/obj/list_organizations.pkl
+++ b/obj/list_organizations.pkl
--- a/obj/num_mentions_companies.pkl
+++ b/obj/num_mentions_companies.pkl
--- a/visualization/TextLength_2018-11-05.eps
+++ b/visualization/TextLength_2018-11-05.eps
--- a/visualization/TextLength_2018-11-05.pdf
+++ b/visualization/TextLength_2018-11-05.pdf
--- a/visualization/TextLength_2018-11-05.pgf
+++ b/visualization/TextLength_2018-11-05.pgf
--- a/visualization/TextLength_2018-11-05.png
+++ b/visualization/TextLength_2018-11-05.png