saving objects as pickles

parent 7e037a1621
commit b7d1f546e4
@@ -18,6 +18,7 @@ import re
 import numpy as np
 import pandas as pd
 from nltk.stem.porter import PorterStemmer
+import pickle

 class BagOfWords:

@@ -114,6 +115,9 @@ class BagOfWords:
 else:
 # absolute word frequency
 df_matrix.loc[i][v] += 1
+# save df_matrix object
+with open('obj/'+ 'document_term_matrix' + '.pkl', 'wb') as f:
+pickle.dump(df_matrix, f, pickle.HIGHEST_PROTOCOL)

 return df_matrix

@@ -170,7 +174,7 @@ class BagOfWords:

 #add unwanted terms
 stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
-'file', 'photo', 'min', 'read', 'staff', 'left',
+'file', 'photo', 'min', 'read', 'staff', 'left', 'â',
 'right', 'updated', 'minutes', 'brief', 'editing',
 'reporting', 'ago', 'also', 'would', 'could',
 'bit', 'ly', 'fy', 'economist', 'u', 'guardian'])
@@ -202,9 +206,9 @@ class BagOfWords:

 # words under that rel_freq limit are not included
 # set limit
-limit = 0.001
+limit = 0.0001
 if not rel_freq:
-limit = len(df_matrix) * 0.001
+limit = len(df_matrix) * 0.0001

 # word => count
 dict = {}
@@ -214,7 +218,8 @@ class BagOfWords:
 # count word mentions in total
 if (df_matrix[column].sum() > limit):
 dict[column] = df_matrix[column].sum()
-# sort dict by value and
+
+# sort dict by value
 o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
 reverse=True))
 print(o_dict)
@@ -226,6 +231,10 @@ class BagOfWords:
 next_highest = o_dict.popitem(last=False)
 n_dict[next_highest[0]] = next_highest[1]

+# save n_dict object
+with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
+pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)
+
 return n_dict

 def count_features(texts, stemming=True):
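Note: the two pickle dumps above can later be read back with pickle.load. A minimal sketch of that round trip (the load_object helper is hypothetical; only the obj/ paths come from this commit):

    import pickle

    def load_object(name):
        # read a previously pickled object from the obj/ folder
        with open('obj/' + name + '.pkl', 'rb') as f:
            return pickle.load(f)

    # reuse the saved document-term matrix and the common-words dict
    df_matrix = load_object('document_term_matrix')
    common_words = load_object('dict_200_most_common_words')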
@@ -245,36 +254,38 @@ class BagOfWords:
 return sum

 def test():
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(file,
 delimiter='|',
 header=None,
 index_col=None,
 engine='python',
 usecols=[1,2],
-nrows=100,
+#nrows=100,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')

 corpus = df_dataset[1] + '. ' + df_dataset[2]
 stemming = True
 rel_freq = True
+#print(BagOfWords.count_features(corpus))
 extracted_words = BagOfWords.extract_all_words(corpus, stemming)
 vocab = BagOfWords.make_vocab(extracted_words, stemming)
-#print(vocab)
-for text in corpus:
-print(text)
-print()
-print()
-# from here on: ValueError at nrows=10000...
-matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
-print(dict)
+print(len(vocab))
+
+# for text in corpus:
+# print(text)
+# print()
+# print()
+# # from here on: ValueError at nrows=10000...
+# matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+# dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
+# print(dict)

 if __name__ == '__main__':
-for word in sorted(BagOfWords.set_stop_words(False)):
-print(word)
-print()
-print(PorterStemmer().stem(word))
-print()
-# BagOfWords.test()
+# for word in sorted(BagOfWords.set_stop_words(False)):
+# print(word)
+# print()
+# print(PorterStemmer().stem(word))
+# print()
+BagOfWords.test()
@@ -62,7 +62,7 @@ class CosineSimilarity:

 if __name__ == '__main__':
 # read data set
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df = pd.read_csv(file,
 delimiter='|',
 header=None,
@@ -12,6 +12,7 @@ writes it to a csv file.
 import csv
 import glob
 import json
+import string

 import numpy as np
 import pandas as pd
@@ -39,7 +40,7 @@ class FileHandler:

 def create_labeling_dataset():
 # output file
-o_file = 'data\\interactive_labeling_dataset.csv'
+o_file = 'data\\cleaned_data_set_without_header.csv'
 # create file and write header
 with open(o_file, 'w', newline='') as csvfile:
 writer = csv.writer(csvfile,
@@ -77,6 +78,38 @@ class FileHandler:
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')

+def clean_articles():
+'''clean articles in data set: filter out all non-printable characters
+'''
+# read data set
+file = 'data\\cleaned_data_set_without_header.csv'
+df = pd.read_csv(file,
+delimiter='|',
+header=None,
+index_col=None,
+engine='python',
+#usecols=[1,2],
+#nrows=100,
+quoting=csv.QUOTE_NONNUMERIC,
+quotechar='\'')
+
+# for each article(row)
+for i in range (len(df)):
+# filter headline
+df.iloc[i][1] = ''.join(x for x in df.iloc[i][1] if x in string.printable)
+# filter text
+df.iloc[i][2] = ''.join(x for x in df.iloc[i][2] if x in string.printable)
+print(df)
+# save cleaned dataframe
+df.to_csv('data\\cleaned_data_set_without_header.csv',
+header=False,
+index=False,
+sep='|',
+mode='a',
+encoding='utf-8',
+quoting=csv.QUOTE_NONNUMERIC,
+quotechar='\'')
+
 def write_articles_to_csv_files():
 '''read JSON files, select articles and write them to csv.
 '''
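Note: clean_articles() drops every character that is not in string.printable. A small standalone sketch of the same filter (function name and sample text are only illustrative):

    import string

    def strip_unprintable(text):
        # keep only characters Python considers printable
        # (ASCII letters, digits, punctuation, whitespace)
        return ''.join(x for x in text if x in string.printable)

    print(strip_unprintable('Societe\u00a0Generale\x00 raises outlook'))
    # -> 'SocieteGenerale raises outlook' (the NBSP and NUL are removed)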
@@ -160,8 +193,8 @@ class FileHandler:
 print('#')
 print('# saved {} articles in total'.format(a))
 print('#')
-def join_all_csv_files():

 if __name__ == '__main__':
 # FileHandler.write_articles_to_csv_files()
 # FileHandler.create_labeling_dataset()
+FileHandler.clean_articles()
NER.py
@@ -16,17 +16,31 @@ import numpy as np
 import pandas as pd
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize
+import pickle
+import re

 class NER:

-company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', 'Plc', 'LLC',
-'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB']
+company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
+'AG', 'LP', 'Limited', 'Tbk', 'Group', 'U.S.', 'BRIEF-',
+'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
+'s.r.l.', 'Holding', 'Holdings']

 # some entities and misc that are not companies
 misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
-'European Commission', 'EU', 'Staff', 'Min', 'Read',
-'Thomson Reuters Trust Principles', 'New York Stock Exchange',
-'NYSE']
+'EU', 'Staff', 'Min', 'Read', 'SRF', 'New York Stock Exchange',
+'NYSE', 'DAX' 'ECB', 'Federal Reserve', 'Muslim', 'JPMorgan',
+'Standard & Poor', 'International Monetary Fund', 'Morgan Stanley',
+'Hongkong', 'Whitehall Street', 'Fitch Australia Pty', 'AFS',
+'FT House & Home', 'Fitch Rates Autonomous Community of Asturias',
+'Autonomous Community of Asturias', 'Fitch Ratings Espana',
+'Barcelona', 'Fitch Ratings ', 'Congress', 'Fed', 'OPEC', 'U.N.',
+'National Federation of Independent Business', 'Barclays',
+'McKinsey', 'Moody', 'Fitch Ratings Ltd.']
+
+regex = r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*|\
+.*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*|\
+.*Republican.*|Goldman( Sachs)?|.*Chamber.*|.*Department.*'

 def tag_words(text):
 # path to Stanford NER
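Note: the new NER.regex acts as a blacklist on top of NER.misc — an entity is kept as a company only if it is not in the misc list and does not match the pattern. A rough sketch of that check (the pattern is joined over several string literals here for clarity; the helper name and sample strings are illustrative):

    import re

    regex = (r'European.*|.*Reuters.*|.*(B|b)ank.*|.*Ministry.*|.*Trump.*|.*Banca.*'
             r'|.*Department.*|.*House.*|Wall (Street|Str).*|.*Congress.*'
             r'|.*Republican.*|Goldman( Sachs)?|.*Chamber.*')

    def is_company_candidate(name, misc=()):
        # reject anything on the misc blacklist or matching the regex blacklist
        return (name not in misc) and (not re.search(regex, name))

    print(is_company_candidate('Thomson Reuters'))  # False, matches .*Reuters.*
    print(is_company_candidate('GitHub'))           # True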
@@ -61,6 +75,10 @@ class NER:
 '''param: article text where organizations must be indentified
 returns: list of identified organisations as strings
 '''
+# print(text)
+# print()
+# print('# examining article...')
+# print()
 # set paths
 java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
 os.environ['JAVAHOME'] = java_path
@@ -75,9 +93,15 @@ class NER:
 #print(nes_coherent)
 for tuple in nes_coherent:
 # check if company and not already in list
-if (tuple[0] not in NER.misc) and (tuple[0] not in seen):
+if (tuple[0] not in NER.misc) and (tuple[0] not in seen)\
+and (not re.search(NER.regex, tuple[0])):
 organizations.append(tuple[0])
 seen.add(tuple[0])
+print('# recognized the following organizations:')
+print()
+print(organizations)
+print()
+print()
 return organizations

 def count_companies(texts):
@@ -88,14 +112,37 @@ class NER:
 print()
 # dictionary of companies with their count
 dict_com = {}
-for text in texts:
+# list of company lists (one per article)
+coms_list = []
+for i, text in enumerate(texts):
 # list of found companies in article
+print('# article no. {}:'.format(i))
 coms = NER.find_companies(text)
+coms_list.append(coms)

 for com in coms:
 if com in dict_com.keys():
 dict_com[com] += 1
 else:
 dict_com[com] = 1
+# print(coms_list)
+# print()
+# calculate number of company mentions per article
+num_companies = []
+for l in coms_list:
+num_companies.append(len(l))
+# print(num_companies)
+print('# average number of different companies mentioned per article:')
+print(sum(num_companies)/len(num_companies))
+print()
+# save num_companies object in file (for plotting)
+with open('obj/'+ 'num_mentions_companies' + '.pkl', 'wb') as f:
+pickle.dump(num_companies, f, pickle.HIGHEST_PROTOCOL)
+# save dict_com object in file (for plotting)
+with open('obj/'+ 'dict_organizations' + '.pkl', 'wb') as f:
+pickle.dump(dict_com, f, pickle.HIGHEST_PROTOCOL)

+#print(dict_com)
 # # print outlier
 # print(max(dict_com, key=dict_com.get))
 return list(dict_com.values())
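Note: count_companies() now pickles num_companies and dict_com explicitly "for plotting". A minimal sketch of how a plotting script could read them back (only the obj/ paths come from this commit; the histogram call itself is illustrative):

    import pickle
    import matplotlib.pyplot as plt

    with open('obj/num_mentions_companies.pkl', 'rb') as f:
        num_companies = pickle.load(f)   # list: companies mentioned per article

    with open('obj/dict_organizations.pkl', 'rb') as f:
        dict_com = pickle.load(f)        # dict: organization -> total mentions

    plt.hist(num_companies, bins=50, facecolor='darkred', alpha=0.5)
    plt.xlabel('Different companies mentioned in an article')
    plt.ylabel('Number of articles')
    plt.show()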
@@ -103,27 +150,17 @@ class NER:
 if __name__ == '__main__':
 print('# starting NER...')
 print()
-test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
-EU approval - sources. BRUSSELS (Reuters) - U.S. software
-giant Microsoft (MSFT.O) is set to win unconditional EU
-antitrust approval for its $7.5 billion purchase of
-privately held coding website GitHub, two people familiar
-with the matter said on Monday. Microsoft announced the
-deal in June, its largest acquisition since it bought
-LinkedIn for $26 billion in 2016. The GitHub deal is
-expected to boost the U.S. software giant’s cloud
-computing business and challenge market leader Amazon
-(AMZN.O). GitHub, the world’s largest code host, has
-more than 28 million developers using its platform. It
-will become a part of Microsoft’s Intelligent Cloud unit
-once the acquisition is completed. Microsoft Chief
-Executive Satya Nadella has tried to assuage users’
-worries that GitHub might favor Microsoft products
-over competitors after the deal, saying GitHub would
-continue to be an open platform that works with all
-public clouds. The European Commission, which is set to
-decide on the deal by Oct. 19, did not respond to a
-request for immediate comment. Microsoft declined to
-comment. Reporting by Foo Yun Chee; editing by Jason
-Neely'''
-print(NER.find_companies(test_article))
+# read data set
+file = 'data\\cleaned_data_set_without_header.csv'
+df = pd.read_csv(file,
+delimiter='|',
+header=None,
+index_col=None,
+engine='python',
+#usecols=[1,2],
+nrows=100,
+quoting=csv.QUOTE_NONNUMERIC,
+quotechar='\'')
+#print(df)
+texts = df[1] + '. ' + df[2]
+NER.count_companies(texts)
@@ -8,7 +8,9 @@ from BagOfWords import BagOfWords
 from NER import NER

 import csv
+from datetime import datetime
 from os import path
+import pickle

 import matplotlib
 import matplotlib.pyplot as plt
@@ -19,42 +21,55 @@ from wordcloud import WordCloud

 class VisualizerNews:

+datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
+
 def plot_wordcloud_dataset():
 '''plots word cloud image of most common words in dataset.
 '''
 print('# preparing word cloud of 200 most common words...')
 print()
 # load new data set
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(file,
 delimiter='|',
 header=None,
 index_col=None,
 engine='python',
 usecols=[1,2],
+#nrows=100,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')

 corpus = df_dataset[1] + '. ' + df_dataset[2]
 stemming = False
-rel_freq = False
+rel_freq = True

 # find most common words in dataset
 extracted_words = BagOfWords.extract_all_words(corpus, stemming)
 vocab = BagOfWords.make_vocab(extracted_words, stemming)
-matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-dict = BagOfWords.make_dict_common_words(matrix, 200, rel_freq, stemming)
+matrix = BagOfWords.make_matrix(extracted_words, vocab,
+rel_freq, stemming)
+dict = BagOfWords.make_dict_common_words(matrix, 200,
+rel_freq, stemming)
+# save dict object
+with open('obj/'+ 'dict_200_most_common_words' + '.pkl', 'wb') as f:
+pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)

 wordcloud = WordCloud(background_color='white',
 width=2400,
 height=1200,
 scale=2,
 # true if bigram:
-collocations=False).generate_from_frequencies(dict)
+collocations=False)\
+.generate_from_frequencies(dict)

 # display generated image
 plt.imshow(wordcloud, interpolation='bilinear')
 plt.axis("off")
+plt.savefig('visualization\\WordCloud_{}.eps'
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\WordCloud_{}.png'
+.format(VisualizerNews.datestring))
 plt.show()

 def plot_histogram_companies():
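Note: because the 200-most-common-words dictionary is now pickled as well, the word cloud could later be regenerated without recomputing the term matrix. A rough sketch (assumes the pickle holds the word-to-frequency dict used above):

    import pickle
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    with open('obj/dict_200_most_common_words.pkl', 'rb') as f:
        freq_dict = pickle.load(f)   # word -> (relative) frequency

    wordcloud = WordCloud(background_color='white', width=2400, height=1200,
                          scale=2, collocations=False)\
                          .generate_from_frequencies(freq_dict)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()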
@@ -66,13 +81,14 @@ class VisualizerNews:
 print('# preparing histogram of company mentions...')
 print()
 # read data set
-file = 'data\\interactive_labeling_dataset_without_header.csv'
+file = 'data\\cleaned_data_set_without_header.csv'
 df = pd.read_csv(file,
 delimiter='|',
 header=None,
 index_col=None,
 engine='python',
 usecols=[1,2],
+#nrows=10,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')

@@ -93,8 +109,15 @@ class VisualizerNews:
 # Number of companies with this number of mentions
 plt.ylabel('Number of companies with this number of articles')
 num_bins = 50
-n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
+n, bins, patches = plt.hist(names, num_bins,
+facecolor='darkred', alpha=0.5)
 plt.axis([0, 50, 0, 1000])

+# save to file
+plt.savefig('visualization\\NER_{}.eps'
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\NER_{}.png'
+.format(VisualizerNews.datestring))
 plt.show()

 def plot_histogram_text_lengths():
@@ -105,10 +128,10 @@ class VisualizerNews:
 print('# preparing histogram of text lengths...')
 print()
 # read data set
-filepath = 'data\\interactive_labeling_dataset.csv'
+filepath = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(filepath,
 delimiter='|',
-header=0,
+header=None,
 index_col=None,
 engine='python',
 usecols=[2],
@@ -126,23 +149,30 @@ class VisualizerNews:
 count_chars.append(len(text))
 # average of number of characters
 av = int(sum(count_chars) / len(count_chars))
-print('# average length of news articles is: {} characters'.format(av))
+print('# average length of news articles is {} characters'.format(av))
 print()
 # sort list in descending order
 count_chars.sort(reverse=True)
 # convert list to array
 names = np.asarray(count_chars)
 # plt.title('Length of News Articles')
-plt.xlabel('Number of characters in an article')
+plt.xlabel('Number of characters in article')
 plt.ylabel('Frequency')
 # number of vertical bins
 num_bins = 200
-n, bins, patches = plt.hist(names, num_bins, facecolor='darkslategrey', alpha=0.5)
+n, bins, patches = plt.hist(names, num_bins,
+facecolor='darkslategrey', alpha=0.5)
 # [xmin, xmax, ymin, ymax] of axis
 #plt.axis([format(300, ','),format(10000, ','), 0, 500])
 plt.axis([300,10000,0,500])
 # format axis labels for thousends (e.g. '10,000')
-plt.gca().xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
+plt.gca().xaxis.set_major_formatter(matplotlib.ticker\
+.FuncFormatter(lambda x, p: format(int(x), ',')))
+# save plot
+plt.savefig('visualization\\TextLength_{}.eps'\
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\TextLength_{}.png'\
+.format(VisualizerNews.datestring))
 plt.show()

 def plot_pie_chart_of_sites():
@@ -151,24 +181,24 @@ class VisualizerNews:
 print()

 # load data set
-filepath = 'data\\interactive_labeling_dataset.csv'
+filepath = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(filepath,
 delimiter='|',
-header=0,
+header=None,
 #usecols=[3], #column 'Site'
 index_col=None,
 engine='python',
-#nrows=100,
+nrows=10,
 quoting=csv.QUOTE_NONNUMERIC,
 quotechar='\'')
-# find all different sites
-df_counts = df_dataset.groupby('Site').count()
-# count occurences of each site
-df_counts = df_counts.sort_values(['Url'], ascending=False)
+# find all different sites, group by 'Site'
+df_counts = df_dataset.groupby(3).count()
+# count occurences of each site, count different 'Url's
+df_counts = df_counts.sort_values([5], ascending=False)

 fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))

-data = list(df_counts['Url'])
+data = list(df_counts[5])
 # legend labels
 labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
 'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']
@@ -185,12 +215,14 @@ class VisualizerNews:

 plt.setp(autotexts, size=8, weight="bold")
 plt.show()
+plt.savefig('Sites_{}.pdf'.format(VisualizerNews.datestring))
+plt.savefig('Sites_{}.pgf'.format(VisualizerNews.datestring))

 def plot_hist_most_common_words(n_commons = 10):
 print('# preparing histogram of most common words...')
 print()
 # load data set
-filepath = 'data\\interactive_labeling_dataset_without_header.csv'
+filepath = 'data\\cleaned_data_set_without_header.csv'
 df_dataset = pd.read_csv(filepath,
 delimiter='|',
 header=None,
@@ -209,8 +241,13 @@ class VisualizerNews:
 # find most common words in dataset
 extracted_words = BagOfWords.extract_all_words(corpus, stemming)
 vocab = BagOfWords.make_vocab(extracted_words, stemming)
-matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
-dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq, stemming)
+matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+stemming)
+dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq,
+stemming)
+# save dict object
+with open('obj/'+ 'dict_10_most_common_words' + '.pkl', 'wb') as f:
+pickle.dump(n_dict, f, pickle.HIGHEST_PROTOCOL)

 plt.xlabel('Most common words in textual corpus')
 plt.ylabel('Relative frequency')
@@ -222,11 +259,15 @@ class VisualizerNews:
 height=numbers,
 tick_label=labels,
 facecolor='darkorange')
+plt.savefig('visualization\\10_most_common_words_{}.eps'
+.format(VisualizerNews.datestring))
+plt.savefig('visualization\\10_most_common_words_{}.png'
+.format(VisualizerNews.datestring))
 plt.show()

 if __name__ == '__main__':
+VisualizerNews.plot_wordcloud_dataset()
 # VisualizerNews.plot_histogram_companies()
-# VisualizerNews.plot_wordcloud_dataset()
 # VisualizerNews.plot_histogram_text_lengths()
 # VisualizerNews.plot_pie_chart_of_sites()
 VisualizerNews.plot_hist_most_common_words()
(Further changes in this commit are not shown here: four binary files, two file diffs suppressed because they are too large, one diff suppressed because of overly long lines, and one new image of 14 KiB.)