changes document term matrix

2018-10-29 13:47:39 +01:00 · 2018-10-29 13:47:39 +01:00 · 2d5368e283
parent 6d15207da9
commit 2d5368e283
2 changed files with 106 additions and 37 deletions
--- a/BagOfWords.py
+++ b/BagOfWords.py
@ -15,6 +15,7 @@ from collections import OrderedDict
 import csv
 import re

+import numpy as np
 import pandas as pd
 from nltk.stem.porter import PorterStemmer

@ -48,6 +49,48 @@ class BagOfWords:
                words_cleaned.append(word)
        return words_cleaned

+    # def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
+        # '''calculates word stem frequencies in input articles. returns
+        # document term matrix(DataFrame) with relative word frequencies
+        # (0 <= values < 1) if relative_word_frequencies=True or absolute
+        # word frequencies (int) if relative_word_frequencies=False.
+        # (rows: different articles, columns: different words in vocab)
+        # returns matrix as DataFrame
+        # '''
+        # print('# BOW: calculating matrix...')
+        # print()
+        # # create list of tuples
+        # vectors = []
+        # # for every text in series
+        # for i in range(len(series)):
+            # # extract text of single article
+            # text = series.iloc[i]
+            # # extract its words
+            # words = BagOfWords.extract_words(text, stemming)
+            # # count words in single article
+            # word_count = len(words)
+            # vector = []
+            # for i, v in enumerate(vocab):
+                # vector.append(0)
+                # for w in words:
+                    # if w == v:
+                        # if relative_word_frequencies:
+                            # # relative word frequency
+                            # vector[i] += 1/word_count
+                        # else:
+                            # # absolute word frequency
+                            # vector[i] += 1
+
+            # # !!! the MemoryError always occurs here: !!!
+
+            # # add single vector as tuple
+            # vectors.append(tuple(vector))
+        # df_vectors = pd.DataFrame.from_records(vectors,
+                                               # index=None,
+                                               # #header=vocab,
+                                               # columns=vocab)
+        # return df_vectors
+
    def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
        '''calculates word stem frequencies in input articles. returns
        document term matrix(DataFrame) with relative word frequencies
@ -58,34 +101,35 @@ class BagOfWords:
        '''
        print('# BOW: calculating matrix...')
        print()
-        # create list of tuples
-        vectors = []
+        # create zero-filled dataframe
+        array = np.zeros(shape=(len(series),len(vocab)))
+        df_matrix = pd.DataFrame(array, columns=vocab)
+
        # for every text in series
        for i in range(len(series)):
+
            # extract text of single article
            text = series.iloc[i]
+
            # extract its words
            words = BagOfWords.extract_words(text, stemming)
-            # count words in single article
+            # count words in article
            word_count = len(words)
-            vector = []
-            for i, v in enumerate(vocab):
-                vector.append(0)
+
+            # for every word in global vocab
+            for v in vocab:
+                # for every word in article
                for w in words:
+                    # find right position
                    if w == v:
                        if relative_word_frequencies:
                            # relative word frequency
-                            vector[i] += 1/word_count
+                            df_matrix.loc[i][v] += 1/word_count
                        else:
                            # absolute word frequency
-                            vector[i] += 1
-            # add single vector as tuple
-            vectors.append(tuple(vector))
-        df_vectors = pd.DataFrame.from_records(vectors,
-                                               index=None,
-                                               #header=vocab,
-                                               columns=vocab)
-        return df_vectors
+                            df_matrix.loc[i][v] += 1
+
+        return df_matrix

    def make_vocab(series, stemming=True):
        '''adds words of input articles to a global vocabulary.
@ -158,10 +202,14 @@ class BagOfWords:
            # transform list to set to eliminate duplicates
        return set(stop_words)

-    def make_dict_common_words(texts, rel_freq=False, stemming=True, n=200):
+    def make_dict_common_words(texts, rel_freq=True, stemming=True, n=200):
        '''texts: df of article texts of complete data set as series,
        return dict of words with their count.
        '''
+        # words under that rel_freq limit are not included
+        limit = 0.0005
+        if not rel_freq:
+            limit = 25
        # word => count
        dict = {}
        vocab = BagOfWords.make_vocab(texts, stemming)
@ -171,6 +219,7 @@ class BagOfWords:
        # iterate over words
        for column in df_matrix:
            # count word mentions in total
+            if (df_matrix[column].sum() > limit):
                dict[column] = df_matrix[column].sum()
        # sort dict by value and 
        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
@ -182,9 +231,19 @@ class BagOfWords:
        return n_dict

    def count_features(texts, stemming=True):
+        print('# counting all features in corpus...')
+        print()
        vocab = BagOfWords.make_vocab(texts, True)
        return len(vocab)

+    def count_all_words(texts):
+        print('# counting all words in corpus...')
+        print()
+        sum = 0
+        for text in texts:
+            sum += len(text.split())
+        return sum
+
 if __name__ == '__main__':

    # load new data set
@ -195,16 +254,16 @@ if __name__ == '__main__':
                             index_col=None,
                             engine='python',
                             usecols=[1,2],
-                             #nrows=10,
+                             nrows=3000,
                             quoting=csv.QUOTE_NONNUMERIC,
                             quotechar='\'')

    # find most common words in dataset
    corpus = df_dataset[1] + '. ' + df_dataset[2]
-    # stemming = False
-    # vocab = BagOfWords.make_vocab(corpus, stemming)
-    # print(vocab)
-    # print()
+    stemming = False
+    rel_freq = False
+    vocab = BagOfWords.make_vocab(corpus, stemming)
+
    # print(BagOfWords.make_matrix(corpus, vocab, False, stemming))
-    # print(BagOfWords.make_dict_common_words(corpus, False, stemming, 200))
-    print(BagOfWords.count_features(corpus))
+    print(BagOfWords.make_dict_common_words(corpus, rel_freq, stemming, 200))
+    # print(BagOfWords.count_features(corpus))
--- a/VisualizerNews.py
+++ b/VisualizerNews.py
@ -10,6 +10,7 @@ from NER import NER
 import csv
 from os import path

+import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@ -43,7 +44,10 @@ class VisualizerNews:
                                                 stemming=False,
                                                 n=200)

-        wordcloud = WordCloud(width=2400, height=1200, scale=2,
+        wordcloud = WordCloud(background_color='white',
+                              width=2400, 
+                              height=1200, 
+                              scale=2,
                              # true if bigram:
                              collocations=False).generate_from_frequencies(dict)

@ -72,7 +76,7 @@ class VisualizerNews:
        # only articles with label==1
        df_hits = df[df['Label'] == 1]

-        texts = df_hits['Title'] + ' ' + df_hits['Text']
+        texts = df_hits['Title'] + '. ' + df_hits['Text']

        # # zum prüfen lesen
        # for text in texts[10:20]:
@ -93,7 +97,7 @@ class VisualizerNews:
        # Number of companies with this number of mentions
        plt.ylabel('Number of companies with this number of articles')
        num_bins = 50
-        n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
+        n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
        # plt.grid(True)
        plt.show()

@ -132,13 +136,16 @@ class VisualizerNews:
        # convert list to array
        names = np.asarray(count_chars)
        # plt.title('Length of News Articles')
-        plt.xlabel('Number of Characters in an Article')
+        plt.xlabel('Number of characters in an article')
        plt.ylabel('Frequency')
        # number of vertical bins
        num_bins = 200
-        n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
+        n, bins, patches = plt.hist(names, num_bins, facecolor='darkslategrey', alpha=0.5)
        # [xmin, xmax, ymin, ymax] of axis
+        #plt.axis([format(300, ','),format(10000, ','), 0, 500])
        plt.axis([300,10000,0,500])
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
        plt.show()

    def plot_pie_chart_of_sites():
@ -191,7 +198,7 @@ class VisualizerNews:
                                 #usecols=[1,2],
                                 index_col=None,
                                 engine='python',
-                                 #nrows=100,
+                                 #nrows=1000,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')

@ -199,22 +206,25 @@ class VisualizerNews:

        # find most common words in dataset
        dict = BagOfWords.make_dict_common_words(corpus,
-                                                 rel_freq=False,
+                                                 rel_freq=True,
                                                 stemming=False,
                                                 n=n_commons)

-        plt.xlabel('Most Common Words in News Articles')
-        plt.ylabel('Frequency')
+        plt.xlabel('Most common words in textual corpus')
+        plt.ylabel('Relative frequency')

        labels = list(dict.keys())
        numbers = list(dict.values())
        nbars = n_commons
-        plt.bar(np.arange(nbars), height=numbers, tick_label=labels)
+        plt.bar(np.arange(nbars), 
+                height=numbers, 
+                tick_label=labels, 
+                facecolor='darkorange')
        plt.show()

 if __name__ == '__main__':
    # VisualizerNews.plot_histogram_companies()
    # VisualizerNews.plot_wordcloud_dataset()
    # VisualizerNews.plot_histogram_text_lengths()
-    VisualizerNews.plot_pie_chart_of_sites()
-    # VisualizerNews.plot_hist_most_common_words()
+    # VisualizerNews.plot_pie_chart_of_sites()
+    VisualizerNews.plot_hist_most_common_words()