changes document term matrix

This commit is contained in:
Anne Lorenz 2018-10-29 13:47:39 +01:00
parent 6d15207da9
commit 2d5368e283
2 changed files with 106 additions and 37 deletions
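The gist of the change: the document term matrix is now filled into a preallocated zero matrix instead of being accumulated as a list of tuples. A minimal, self-contained sketch of that approach follows; the whitespace tokenizer and the toy texts/vocab are stand-ins for the project's BagOfWords.extract_words, series and vocab.

import numpy as np
import pandas as pd

def make_term_matrix(texts, vocab, relative=True):
    # preallocate an articles x vocabulary matrix of float64 zeros
    matrix = pd.DataFrame(np.zeros((len(texts), len(vocab))), columns=vocab)
    for i, text in enumerate(texts):
        words = text.split()  # stand-in for BagOfWords.extract_words
        for w in words:
            if w in matrix.columns:
                # relative (0 <= value < 1) or absolute word frequency
                matrix.at[i, w] += 1 / len(words) if relative else 1
    return matrix

# toy usage
print(make_term_matrix(['a b a', 'b c'], ['a', 'b', 'c']))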


@@ -15,6 +15,7 @@ from collections import OrderedDict
import csv
import re
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
@@ -48,6 +49,48 @@ class BagOfWords:
words_cleaned.append(word)
return words_cleaned
# def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
# '''calculates word stem frequencies in input articles. returns
# document term matrix (DataFrame) with relative word frequencies
# (0 <= values < 1) if relative_word_frequencies=True or absolute
# word frequencies (int) if relative_word_frequencies=False.
# (rows: different articles, columns: different words in vocab)
# returns matrix as DataFrame
# '''
# print('# BOW: calculating matrix...')
# print()
# # create list of tuples
# vectors = []
# # for every text in series
# for i in range(len(series)):
# # extract text of single article
# text = series.iloc[i]
# # extract its words
# words = BagOfWords.extract_words(text, stemming)
# # count words in single article
# word_count = len(words)
# vector = []
# for i, v in enumerate(vocab):
# vector.append(0)
# for w in words:
# if w == v:
# if relative_word_frequencies:
# # relative word frequency
# vector[i] += 1/word_count
# else:
# # absolute word frequency
# vector[i] += 1
# # !!! this is where the MemoryError always occurs: !!!
# # add single vector as tuple
# vectors.append(tuple(vector))
# df_vectors = pd.DataFrame.from_records(vectors,
# index=None,
# #header=vocab,
# columns=vocab)
# return df_vectors
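A rough estimate of why the commented-out version hits the MemoryError while the replacement below does not (the 3,000 articles come from nrows=3000 further down; the ~50,000-stem vocabulary is only an assumption for illustration): a list of 3,000 tuples with 50,000 entries each holds 150 million Python object references, roughly 1.2 GB of pointers alone before counting the float objects created for non-zero frequencies, and pd.DataFrame.from_records then materialises a second full copy while the tuples are still alive. np.zeros((3000, 50000)), by contrast, is a single contiguous float64 block of about 3,000 × 50,000 × 8 B ≈ 1.2 GB that is updated in place.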
def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
'''calculates word stem frequencies in input articles. returns
document term matrix (DataFrame) with relative word frequencies
@@ -58,34 +101,35 @@
'''
print('# BOW: calculating matrix...')
print()
# create list of tuples
vectors = []
# create zero-filled dataframe
array = np.zeros(shape=(len(series),len(vocab)))
df_matrix = pd.DataFrame(array, columns=vocab)
# for every text in series
for i in range(len(series)):
# extract text of single article
text = series.iloc[i]
# extract its words
words = BagOfWords.extract_words(text, stemming)
# count words in single article
# count words in article
word_count = len(words)
vector = []
for i, v in enumerate(vocab):
vector.append(0)
# for every word in global vocab
for v in vocab:
# for every word in article
for w in words:
# find right position
if w == v:
if relative_word_frequencies:
# relative word frequency
vector[i] += 1/word_count
df_matrix.loc[i][v] += 1/word_count
else:
# absolute word frequency
vector[i] += 1
# add single vector as tuple
vectors.append(tuple(vector))
df_vectors = pd.DataFrame.from_records(vectors,
index=None,
#header=vocab,
columns=vocab)
return df_vectors
df_matrix.loc[i][v] += 1
return df_matrix
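One caveat, not part of this commit: df_matrix.loc[i][v] += ... is chained indexing, which the pandas docs advise against and which stops updating the frame altogether once copy-on-write is enabled. A single-step write along these lines (same names as in the function above) would be the idiomatic alternative:

if relative_word_frequencies:
    # relative word frequency, written in a single indexing step
    df_matrix.at[i, v] += 1 / word_count
else:
    # absolute word frequency
    df_matrix.at[i, v] += 1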
def make_vocab(series, stemming=True):
'''adds words of input articles to a global vocabulary.
@@ -158,10 +202,14 @@ class BagOfWords:
# transform list to set to eliminate duplicates
return set(stop_words)
def make_dict_common_words(texts, rel_freq=False, stemming=True, n=200):
def make_dict_common_words(texts, rel_freq=True, stemming=True, n=200):
'''texts: article texts of the complete data set as a pandas Series,
returns dict of words with their counts.
'''
# words whose total frequency is below this limit are not included
limit = 0.0005
if not rel_freq:
limit = 25
# word => count
dict = {}
vocab = BagOfWords.make_vocab(texts, stemming)
@@ -171,6 +219,7 @@ class BagOfWords:
# iterate over words
for column in df_matrix:
# count word mentions in total
if (df_matrix[column].sum() > limit):
dict[column] = df_matrix[column].sum()
# sort dict by value and
o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
@@ -182,9 +231,19 @@
return n_dict
def count_features(texts, stemming=True):
print('# counting all features in corpus...')
print()
vocab = BagOfWords.make_vocab(texts, True)
return len(vocab)
def count_all_words(texts):
print('# counting all words in corpus...')
print()
sum = 0
for text in texts:
sum += len(text.split())
return sum
if __name__ == '__main__':
# load new data set
@@ -195,16 +254,16 @@ if __name__ == '__main__':
index_col=None,
engine='python',
usecols=[1,2],
#nrows=10,
nrows=3000,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
# find most common words in dataset
corpus = df_dataset[1] + '. ' + df_dataset[2]
# stemming = False
# vocab = BagOfWords.make_vocab(corpus, stemming)
# print(vocab)
# print()
stemming = False
rel_freq = False
vocab = BagOfWords.make_vocab(corpus, stemming)
# print(BagOfWords.make_matrix(corpus, vocab, False, stemming))
# print(BagOfWords.make_dict_common_words(corpus, False, stemming, 200))
print(BagOfWords.count_features(corpus))
print(BagOfWords.make_dict_common_words(corpus, rel_freq, stemming, 200))
# print(BagOfWords.count_features(corpus))


@@ -10,6 +10,7 @@ from NER import NER
import csv
from os import path
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
@@ -43,7 +44,10 @@ class VisualizerNews:
stemming=False,
n=200)
wordcloud = WordCloud(width=2400, height=1200, scale=2,
wordcloud = WordCloud(background_color='white',
width=2400,
height=1200,
scale=2,
# set collocations=True to include bigrams:
collocations=False).generate_from_frequencies(dict)
@@ -72,7 +76,7 @@ class VisualizerNews:
# only articles with label==1
df_hits = df[df['Label'] == 1]
texts = df_hits['Title'] + ' ' + df_hits['Text']
texts = df_hits['Title'] + '. ' + df_hits['Text']
# # read through to check
# for text in texts[10:20]:
@@ -93,7 +97,7 @@ class VisualizerNews:
# Number of companies with this number of mentions
plt.ylabel('Number of companies with this number of articles')
num_bins = 50
n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
# plt.grid(True)
plt.show()
@@ -132,13 +136,16 @@ class VisualizerNews:
# convert list to array
names = np.asarray(count_chars)
# plt.title('Length of News Articles')
plt.xlabel('Number of Characters in an Article')
plt.xlabel('Number of characters in an article')
plt.ylabel('Frequency')
# number of histogram bins
num_bins = 200
n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
n, bins, patches = plt.hist(names, num_bins, facecolor='darkslategrey', alpha=0.5)
# [xmin, xmax, ymin, ymax] of axis
#plt.axis([format(300, ','),format(10000, ','), 0, 500])
plt.axis([300,10000,0,500])
# format axis labels with thousands separators (e.g. '10,000')
plt.gca().xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
plt.show()
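For reference, the thousands-separator formatting used above works the same way on any axis. A minimal standalone sketch with made-up sample data (nothing here comes from the project):

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

fig, ax = plt.subplots()
ax.hist([1200, 4500, 8300, 9900, 12000], bins=10)
# render x ticks as '1,000', '10,000', ... instead of raw numbers
ax.xaxis.set_major_formatter(
    ticker.FuncFormatter(lambda x, pos: format(int(x), ',')))
plt.show()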
def plot_pie_chart_of_sites():
@@ -191,7 +198,7 @@ class VisualizerNews:
#usecols=[1,2],
index_col=None,
engine='python',
#nrows=100,
#nrows=1000,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
@@ -199,22 +206,25 @@ class VisualizerNews:
# find most common words in dataset
dict = BagOfWords.make_dict_common_words(corpus,
rel_freq=False,
rel_freq=True,
stemming=False,
n=n_commons)
plt.xlabel('Most Common Words in News Articles')
plt.ylabel('Frequency')
plt.xlabel('Most common words in textual corpus')
plt.ylabel('Relative frequency')
labels = list(dict.keys())
numbers = list(dict.values())
nbars = n_commons
plt.bar(np.arange(nbars), height=numbers, tick_label=labels)
plt.bar(np.arange(nbars),
height=numbers,
tick_label=labels,
facecolor='darkorange')
plt.show()
if __name__ == '__main__':
# VisualizerNews.plot_histogram_companies()
# VisualizerNews.plot_wordcloud_dataset()
# VisualizerNews.plot_histogram_text_lengths()
VisualizerNews.plot_pie_chart_of_sites()
# VisualizerNews.plot_hist_most_common_words()
# VisualizerNews.plot_pie_chart_of_sites()
VisualizerNews.plot_hist_most_common_words()