diff --git a/BagOfWords.py b/BagOfWords.py
index d5b5720..b98bc6f 100644
--- a/BagOfWords.py
+++ b/BagOfWords.py
@@ -15,6 +15,7 @@ from collections import OrderedDict
 import csv
 import re
+import numpy as np
 import pandas as pd
 from nltk.stem.porter import PorterStemmer
@@ -48,6 +49,48 @@ class BagOfWords:
             words_cleaned.append(word)
         return words_cleaned
 
+    # def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
+    #     '''calculates word stem frequencies in input articles. returns
+    #     document term matrix (DataFrame) with relative word frequencies
+    #     (0 <= values < 1) if relative_word_frequencies=True or absolute
+    #     word frequencies (int) if relative_word_frequencies=False.
+    #     (rows: different articles, columns: different words in vocab)
+    #     returns matrix as DataFrame
+    #     '''
+    #     print('# BOW: calculating matrix...')
+    #     print()
+    #     # create list of tuples
+    #     vectors = []
+    #     # for every text in series
+    #     for i in range(len(series)):
+    #         # extract text of single article
+    #         text = series.iloc[i]
+    #         # extract its words
+    #         words = BagOfWords.extract_words(text, stemming)
+    #         # count words in single article
+    #         word_count = len(words)
+    #         vector = []
+    #         for i, v in enumerate(vocab):
+    #             vector.append(0)
+    #             for w in words:
+    #                 if w == v:
+    #                     if relative_word_frequencies:
+    #                         # relative word frequency
+    #                         vector[i] += 1/word_count
+    #                     else:
+    #                         # absolute word frequency
+    #                         vector[i] += 1
+
+    #         # !!! the MemoryError always happens here: !!!
+
+    #         # add single vector as tuple
+    #         vectors.append(tuple(vector))
+    #     df_vectors = pd.DataFrame.from_records(vectors,
+    #                                            index=None,
+    #                                            #header=vocab,
+    #                                            columns=vocab)
+    #     return df_vectors
+
     def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
         '''calculates word stem frequencies in input articles. returns
         document term matrix (DataFrame) with relative word frequencies
@@ -58,34 +101,35 @@ class BagOfWords:
         '''
         print('# BOW: calculating matrix...')
         print()
-        # create list of tuples
-        vectors = []
+        # create zero-filled dataframe
+        array = np.zeros(shape=(len(series), len(vocab)))
+        df_matrix = pd.DataFrame(array, columns=vocab)
+
         # for every text in series
         for i in range(len(series)):
+            # extract text of single article
             text = series.iloc[i]
+            # extract its words
             words = BagOfWords.extract_words(text, stemming)
-            # count words in single article
+            # count words in article
             word_count = len(words)
-            vector = []
-            for i, v in enumerate(vocab):
-                vector.append(0)
+
+            # for every word in global vocab
+            for v in vocab:
+                # for every word in article
                 for w in words:
+                    # find right position
                     if w == v:
                         if relative_word_frequencies:
                             # relative word frequency
-                            vector[i] += 1/word_count
+                            df_matrix.loc[i, v] += 1/word_count
                         else:
                             # absolute word frequency
-                            vector[i] += 1
-            # add single vector as tuple
-            vectors.append(tuple(vector))
-        df_vectors = pd.DataFrame.from_records(vectors,
-                                               index=None,
-                                               #header=vocab,
-                                               columns=vocab)
-        return df_vectors
+                            df_matrix.loc[i, v] += 1
+
+        return df_matrix
 
     def make_vocab(series, stemming=True):
         '''adds words of input articles to a global vocabulary.
@@ -158,10 +202,14 @@ class BagOfWords:
         # transform list to set to eliminate duplicates
         return set(stop_words)
 
-    def make_dict_common_words(texts, rel_freq=False, stemming=True, n=200):
+    def make_dict_common_words(texts, rel_freq=True, stemming=True, n=200):
         '''texts: df of article texts of complete data set as series,
         return dict of words with their count.
         '''
+        # words with total frequency below this limit are not included
+        limit = 0.0005
+        if not rel_freq:
+            limit = 25
         # word => count
         dict = {}
         vocab = BagOfWords.make_vocab(texts, stemming)
@@ -171,7 +219,8 @@ class BagOfWords:
         # iterate over words
         for column in df_matrix:
             # count word mentions in total
-            dict[column] = df_matrix[column].sum()
+            if df_matrix[column].sum() > limit:
+                dict[column] = df_matrix[column].sum()
         # sort dict by value, descending
         o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
                              reverse=True))
@@ -182,9 +231,19 @@ class BagOfWords:
         return n_dict
 
     def count_features(texts, stemming=True):
+        print('# counting all features in corpus...')
+        print()
         vocab = BagOfWords.make_vocab(texts, stemming)
         return len(vocab)
 
+    def count_all_words(texts):
+        print('# counting all words in corpus...')
+        print()
+        total = 0
+        for text in texts:
+            total += len(text.split())
+        return total
+
 if __name__ == '__main__':
     # load new data set
@@ -195,16 +254,16 @@
                              index_col=None,
                              engine='python',
                              usecols=[1,2],
-                             #nrows=10,
+                             nrows=3000,
                              quoting=csv.QUOTE_NONNUMERIC,
                              quotechar='\'')
 
     # find most common words in dataset
     corpus = df_dataset[1] + '. ' + df_dataset[2]
-    # stemming = False
-    # vocab = BagOfWords.make_vocab(corpus, stemming)
-    # print(vocab)
-    # print()
+    stemming = False
+    rel_freq = False
+    vocab = BagOfWords.make_vocab(corpus, stemming)
+
     # print(BagOfWords.make_matrix(corpus, vocab, False, stemming))
-    print(BagOfWords.make_dict_common_words(corpus, False, stemming, 200))
-    print(BagOfWords.count_features(corpus))
\ No newline at end of file
+    print(BagOfWords.make_dict_common_words(corpus, rel_freq, stemming, 200))
+    # print(BagOfWords.count_features(corpus))
\ No newline at end of file
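
Review note, not part of the patch: the rewritten make_matrix above avoids the MemoryError by preallocating a dense numpy array, but it still compares every article word against every vocab word and updates cells one at a time through DataFrame indexing. Below is a minimal sketch of a faster variant (hypothetical name make_matrix_counter) that does a single Counter lookup per cell and fills the numpy array directly; it assumes vocab is an ordered sequence, such as a sorted list produced by make_vocab:

```python
from collections import Counter

import numpy as np
import pandas as pd

from BagOfWords import BagOfWords

def make_matrix_counter(series, vocab, relative_word_frequencies=True,
                        stemming=True):
    # hypothetical alternative, not what the patch implements
    matrix = np.zeros(shape=(len(series), len(vocab)))
    for i in range(len(series)):
        words = BagOfWords.extract_words(series.iloc[i], stemming)
        counts = Counter(words)  # word -> absolute count, 0 if absent
        denom = len(words) if relative_word_frequencies else 1
        for j, v in enumerate(vocab):
            if counts[v]:
                matrix[i, j] = counts[v] / denom
    return pd.DataFrame(matrix, columns=vocab)
```

Writing into the array and wrapping it in a DataFrame once at the end also sidesteps the repeated df_matrix.loc writes, which are the slowest part of the patched loop.
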
diff --git a/VisualizerNews.py b/VisualizerNews.py
index 97fe238..31724ab 100644
--- a/VisualizerNews.py
+++ b/VisualizerNews.py
@@ -10,6 +10,7 @@ from NER import NER
 import csv
 from os import path
+import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -43,7 +44,10 @@ class VisualizerNews:
                                                  stemming=False,
                                                  n=200)
 
-        wordcloud = WordCloud(width=2400, height=1200, scale=2,
+        wordcloud = WordCloud(background_color='white',
+                              width=2400,
+                              height=1200,
+                              scale=2,
                               # true if bigram:
                               collocations=False).generate_from_frequencies(dict)
@@ -72,7 +76,7 @@ class VisualizerNews:
         # only articles with label==1
         df_hits = df[df['Label'] == 1]
 
-        texts = df_hits['Title'] + ' ' + df_hits['Text']
+        texts = df_hits['Title'] + '. ' + df_hits['Text']
 
         # # read a few to check
         # for text in texts[10:20]:
@@ -93,7 +97,7 @@ class VisualizerNews:
         # Number of companies with this number of mentions
         plt.ylabel('Number of companies with this number of articles')
         num_bins = 50
-        n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
+        n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
         # plt.grid(True)
         plt.show()
@@ -132,13 +136,16 @@ class VisualizerNews:
         # convert list to array
         names = np.asarray(count_chars)
         # plt.title('Length of News Articles')
-        plt.xlabel('Number of Characters in an Article')
+        plt.xlabel('Number of characters in an article')
         plt.ylabel('Frequency')
         # number of vertical bins
         num_bins = 200
-        n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
+        n, bins, patches = plt.hist(names, num_bins, facecolor='darkslategrey', alpha=0.5)
         # [xmin, xmax, ymin, ymax] of axis
-        plt.axis([300, 10000, 0, 500])
+        #plt.axis([format(300, ','),format(10000, ','), 0, 500])
+        plt.axis([300, 10000, 0, 500])
+        # format x axis labels for thousands (e.g. '10,000')
+        plt.gca().xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
         plt.show()
 
     def plot_pie_chart_of_sites():
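
Aside on the tick formatter added in the hunk above: matplotlib.ticker.FuncFormatter wraps any callable taking (value, position). A self-contained sketch with made-up sample data, for reference only:

```python
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

fig, ax = plt.subplots()
ax.hist([1200, 3300, 5400, 7600, 9800], bins=5)
# render x tick labels with a thousands separator, e.g. 10000 -> '10,000'
ax.xaxis.set_major_formatter(
    ticker.FuncFormatter(lambda x, pos: format(int(x), ',')))
plt.show()
```

ticker.StrMethodFormatter('{x:,.0f}') gives the same result without a lambda.
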
@@ -191,7 +198,7 @@ class VisualizerNews:
                                  #usecols=[1,2],
                                  index_col=None,
                                  engine='python',
-                                 #nrows=100,
+                                 #nrows=1000,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
@@ -199,22 +206,25 @@ class VisualizerNews:
         # find most common words in dataset
         dict = BagOfWords.make_dict_common_words(corpus,
-                                                 rel_freq=False,
+                                                 rel_freq=True,
                                                  stemming=False,
                                                  n=n_commons)
 
-        plt.xlabel('Most Common Words in News Articles')
-        plt.ylabel('Frequency')
+        plt.xlabel('Most common words in textual corpus')
+        plt.ylabel('Relative frequency')
 
         labels = list(dict.keys())
         numbers = list(dict.values())
         nbars = n_commons
-        plt.bar(np.arange(nbars), height=numbers, tick_label=labels)
+        plt.bar(np.arange(nbars),
+                height=numbers,
+                tick_label=labels,
+                facecolor='darkorange')
         plt.show()
 
 if __name__ == '__main__':
     # VisualizerNews.plot_histogram_companies()
     # VisualizerNews.plot_wordcloud_dataset()
     # VisualizerNews.plot_histogram_text_lengths()
-    VisualizerNews.plot_pie_chart_of_sites()
-    # VisualizerNews.plot_hist_most_common_words()
\ No newline at end of file
+    # VisualizerNews.plot_pie_chart_of_sites()
+    VisualizerNews.plot_hist_most_common_words()
\ No newline at end of file
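
Closing note, not part of the patch: both make_matrix versions build a dense len(series) x len(vocab) float64 matrix (8 bytes per cell), which is what originally ran out of memory. The standard escape is a sparse document-term matrix. Below is a sketch using scikit-learn's CountVectorizer; note that its default tokenizer is not BagOfWords.extract_words (no Porter stemming, different stop word handling), so the counts would differ:

```python
from sklearn.feature_extraction.text import CountVectorizer

def make_sparse_matrix(texts, relative_word_frequencies=True):
    # hypothetical helper, not part of this patch
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(texts)  # scipy sparse, shape (n_docs, n_terms)
    if relative_word_frequencies:
        row_sums = X.sum(axis=1)         # words per document
        row_sums[row_sums == 0] = 1      # avoid division by zero
        X = X.multiply(1.0 / row_sums).tocsr()
    return X, vectorizer
```

The term-to-column mapping is available afterwards as vectorizer.vocabulary_, so per-word totals for make_dict_common_words-style rankings come from X.sum(axis=0) without ever densifying the matrix.
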