changes document term matrix

This commit is contained in:
Anne Lorenz 2018-10-29 13:47:39 +01:00
parent 6d15207da9
commit 2d5368e283
2 changed files with 106 additions and 37 deletions

View File

@ -15,6 +15,7 @@ from collections import OrderedDict
import csv import csv
import re import re
import numpy as np
import pandas as pd import pandas as pd
from nltk.stem.porter import PorterStemmer from nltk.stem.porter import PorterStemmer
@ -48,6 +49,48 @@ class BagOfWords:
words_cleaned.append(word) words_cleaned.append(word)
return words_cleaned return words_cleaned
# def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
# '''calculates word stem frequencies in input articles. returns
# document term matrix(DataFrame) with relative word frequencies
# (0 <= values < 1) if relative_word_frequencies=True or absolute
# word frequencies (int) if relative_word_frequencies=False.
# (rows: different articles, columns: different words in vocab)
# returns matrix as DataFrame
# '''
# print('# BOW: calculating matrix...')
# print()
# # create list of tuples
# vectors = []
# # for every text in series
# for i in range(len(series)):
# # extract text of single article
# text = series.iloc[i]
# # extract its words
# words = BagOfWords.extract_words(text, stemming)
# # count words in single article
# word_count = len(words)
# vector = []
# for i, v in enumerate(vocab):
# vector.append(0)
# for w in words:
# if w == v:
# if relative_word_frequencies:
# # relative word frequency
# vector[i] += 1/word_count
# else:
# # absolute word frequency
# vector[i] += 1
# # !!! the MemoryError always occurs here: !!!
# # add single vector as tuple
# vectors.append(tuple(vector))
# df_vectors = pd.DataFrame.from_records(vectors,
# index=None,
# #header=vocab,
# columns=vocab)
# return df_vectors
def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True): def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
'''calculates word stem frequencies in input articles. returns '''calculates word stem frequencies in input articles. returns
document term matrix(DataFrame) with relative word frequencies document term matrix(DataFrame) with relative word frequencies
@ -58,34 +101,35 @@ class BagOfWords:
''' '''
print('# BOW: calculating matrix...') print('# BOW: calculating matrix...')
print() print()
# create list of tuples # create zero-filled dataframe
vectors = [] array = np.zeros(shape=(len(series),len(vocab)))
df_matrix = pd.DataFrame(array, columns=vocab)
# for every text in series # for every text in series
for i in range(len(series)): for i in range(len(series)):
# extract text of single article # extract text of single article
text = series.iloc[i] text = series.iloc[i]
# extract its words # extract its words
words = BagOfWords.extract_words(text, stemming) words = BagOfWords.extract_words(text, stemming)
# count words in single article # count words in article
word_count = len(words) word_count = len(words)
vector = []
for i, v in enumerate(vocab): # for every word in global vocab
vector.append(0) for v in vocab:
# for every word in article
for w in words: for w in words:
# find right position
if w == v: if w == v:
if relative_word_frequencies: if relative_word_frequencies:
# relative word frequency # relative word frequency
vector[i] += 1/word_count df_matrix.loc[i][v] += 1/word_count
else: else:
# absolute word frequency # absolute word frequency
vector[i] += 1 df_matrix.loc[i][v] += 1
# add single vector as tuple
vectors.append(tuple(vector)) return df_matrix
df_vectors = pd.DataFrame.from_records(vectors,
index=None,
#header=vocab,
columns=vocab)
return df_vectors
def make_vocab(series, stemming=True): def make_vocab(series, stemming=True):
'''adds words of input articles to a global vocabulary. '''adds words of input articles to a global vocabulary.
@ -158,10 +202,14 @@ class BagOfWords:
# transform list to set to eliminate duplicates # transform list to set to eliminate duplicates
return set(stop_words) return set(stop_words)
def make_dict_common_words(texts, rel_freq=False, stemming=True, n=200): def make_dict_common_words(texts, rel_freq=True, stemming=True, n=200):
'''texts: df of article texts of complete data set as series, '''texts: df of article texts of complete data set as series,
return dict of words with their count. return dict of words with their count.
''' '''
# words under that rel_freq limit are not included
limit = 0.0005
if not rel_freq:
limit = 25
# word => count # word => count
dict = {} dict = {}
vocab = BagOfWords.make_vocab(texts, stemming) vocab = BagOfWords.make_vocab(texts, stemming)
@ -171,7 +219,8 @@ class BagOfWords:
# iterate over words # iterate over words
for column in df_matrix: for column in df_matrix:
# count word mentions in total # count word mentions in total
dict[column] = df_matrix[column].sum() if (df_matrix[column].sum() > limit):
dict[column] = df_matrix[column].sum()
# sort dict by value and # sort dict by value and
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\ o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
reverse=True)) reverse=True))
@ -182,9 +231,19 @@ class BagOfWords:
return n_dict return n_dict
def count_features(texts, stemming=True):
    '''counts the features of the corpus, i.e. the entries of the
    vocabulary built from all input articles.
    texts: article texts, passed on to BagOfWords.make_vocab.
    stemming: passed on to BagOfWords.make_vocab.
    returns the size of that vocabulary (int).
    '''
    print('# counting all features in corpus...')
    print()
    # hand the caller's stemming flag through; it used to be ignored
    # in favour of a hard-coded True
    vocab = BagOfWords.make_vocab(texts, stemming)
    return len(vocab)
def count_all_words(texts):
    '''counts the whitespace-separated words of all input texts.
    texts: iterable of article texts (strings).
    returns the total number of words (int), 0 for an empty input.
    '''
    print('# counting all words in corpus...')
    print()
    # use the built-in sum() over a generator instead of a local
    # accumulator named 'sum', which shadowed the built-in
    return sum(len(text.split()) for text in texts)
if __name__ == '__main__': if __name__ == '__main__':
# load new data set # load new data set
@ -195,16 +254,16 @@ if __name__ == '__main__':
index_col=None, index_col=None,
engine='python', engine='python',
usecols=[1,2], usecols=[1,2],
#nrows=10, nrows=3000,
quoting=csv.QUOTE_NONNUMERIC, quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'') quotechar='\'')
# find most common words in dataset # find most common words in dataset
corpus = df_dataset[1] + '. ' + df_dataset[2] corpus = df_dataset[1] + '. ' + df_dataset[2]
# stemming = False stemming = False
# vocab = BagOfWords.make_vocab(corpus, stemming) rel_freq = False
# print(vocab) vocab = BagOfWords.make_vocab(corpus, stemming)
# print()
# print(BagOfWords.make_matrix(corpus, vocab, False, stemming)) # print(BagOfWords.make_matrix(corpus, vocab, False, stemming))
# print(BagOfWords.make_dict_common_words(corpus, False, stemming, 200)) print(BagOfWords.make_dict_common_words(corpus, rel_freq, stemming, 200))
print(BagOfWords.count_features(corpus)) # print(BagOfWords.count_features(corpus))

View File

@ -10,6 +10,7 @@ from NER import NER
import csv import csv
from os import path from os import path
import matplotlib
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
import pandas as pd import pandas as pd
@ -43,7 +44,10 @@ class VisualizerNews:
stemming=False, stemming=False,
n=200) n=200)
wordcloud = WordCloud(width=2400, height=1200, scale=2, wordcloud = WordCloud(background_color='white',
width=2400,
height=1200,
scale=2,
# true if bigram: # true if bigram:
collocations=False).generate_from_frequencies(dict) collocations=False).generate_from_frequencies(dict)
@ -72,7 +76,7 @@ class VisualizerNews:
# only articles with label==1 # only articles with label==1
df_hits = df[df['Label'] == 1] df_hits = df[df['Label'] == 1]
texts = df_hits['Title'] + ' ' + df_hits['Text'] texts = df_hits['Title'] + '. ' + df_hits['Text']
# # zum prüfen lesen # # zum prüfen lesen
# for text in texts[10:20]: # for text in texts[10:20]:
@ -93,7 +97,7 @@ class VisualizerNews:
# Number of companies with this number of mentions # Number of companies with this number of mentions
plt.ylabel('Number of companies with this number of articles') plt.ylabel('Number of companies with this number of articles')
num_bins = 50 num_bins = 50
n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5) n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
# plt.grid(True) # plt.grid(True)
plt.show() plt.show()
@ -132,13 +136,16 @@ class VisualizerNews:
# convert list to array # convert list to array
names = np.asarray(count_chars) names = np.asarray(count_chars)
# plt.title('Length of News Articles') # plt.title('Length of News Articles')
plt.xlabel('Number of Characters in an Article') plt.xlabel('Number of characters in an article')
plt.ylabel('Frequency') plt.ylabel('Frequency')
# number of vertical bins # number of vertical bins
num_bins = 200 num_bins = 200
n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5) n, bins, patches = plt.hist(names, num_bins, facecolor='darkslategrey', alpha=0.5)
# [xmin, xmax, ymin, ymax] of axis # [xmin, xmax, ymin, ymax] of axis
plt.axis([300, 10000, 0, 500]) #plt.axis([format(300, ','),format(10000, ','), 0, 500])
plt.axis([300,10000,0,500])
# format axis labels for thousands (e.g. '10,000')
plt.gca().xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.show() plt.show()
def plot_pie_chart_of_sites(): def plot_pie_chart_of_sites():
@ -191,7 +198,7 @@ class VisualizerNews:
#usecols=[1,2], #usecols=[1,2],
index_col=None, index_col=None,
engine='python', engine='python',
#nrows=100, #nrows=1000,
quoting=csv.QUOTE_NONNUMERIC, quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'') quotechar='\'')
@ -199,22 +206,25 @@ class VisualizerNews:
# find most common words in dataset # find most common words in dataset
dict = BagOfWords.make_dict_common_words(corpus, dict = BagOfWords.make_dict_common_words(corpus,
rel_freq=False, rel_freq=True,
stemming=False, stemming=False,
n=n_commons) n=n_commons)
plt.xlabel('Most Common Words in News Articles') plt.xlabel('Most common words in textual corpus')
plt.ylabel('Frequency') plt.ylabel('Relative frequency')
labels = list(dict.keys()) labels = list(dict.keys())
numbers = list(dict.values()) numbers = list(dict.values())
nbars = n_commons nbars = n_commons
plt.bar(np.arange(nbars), height=numbers, tick_label=labels) plt.bar(np.arange(nbars),
height=numbers,
tick_label=labels,
facecolor='darkorange')
plt.show() plt.show()
if __name__ == '__main__': if __name__ == '__main__':
# VisualizerNews.plot_histogram_companies() # VisualizerNews.plot_histogram_companies()
# VisualizerNews.plot_wordcloud_dataset() # VisualizerNews.plot_wordcloud_dataset()
# VisualizerNews.plot_histogram_text_lengths() # VisualizerNews.plot_histogram_text_lengths()
VisualizerNews.plot_pie_chart_of_sites() # VisualizerNews.plot_pie_chart_of_sites()
# VisualizerNews.plot_hist_most_common_words() VisualizerNews.plot_hist_most_common_words()