changes document term matrix

parent 6d15207da9
commit 2d5368e283

BagOfWords.py (107 changed lines)
@@ -15,6 +15,7 @@ from collections import OrderedDict
 import csv
 import re
 
+import numpy as np
 import pandas as pd
 from nltk.stem.porter import PorterStemmer
 
@@ -48,6 +49,48 @@ class BagOfWords:
             words_cleaned.append(word)
         return words_cleaned
 
+    # def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
+    #     '''calculates word stem frequencies in input articles. returns
+    #     document term matrix(DataFrame) with relative word frequencies
+    #     (0 <= values < 1) if relative_word_frequencies=True or absolute
+    #     word frequencies (int) if relative_word_frequencies=False.
+    #     (rows: different articles, columns: different words in vocab)
+    #     returns matrix as DataFrame
+    #     '''
+    #     print('# BOW: calculating matrix...')
+    #     print()
+    #     # create list of tuples
+    #     vectors = []
+    #     # for every text in series
+    #     for i in range(len(series)):
+    #         # extract text of single article
+    #         text = series.iloc[i]
+    #         # extract its words
+    #         words = BagOfWords.extract_words(text, stemming)
+    #         # count words in single article
+    #         word_count = len(words)
+    #         vector = []
+    #         for i, v in enumerate(vocab):
+    #             vector.append(0)
+    #             for w in words:
+    #                 if w == v:
+    #                     if relative_word_frequencies:
+    #                         # relative word frequency
+    #                         vector[i] += 1/word_count
+    #                     else:
+    #                         # absolute word frequency
+    #                         vector[i] += 1
+
+    #         # !!! the MemoryError always happens here: !!!
+
+    #         # add single vector as tuple
+    #         vectors.append(tuple(vector))
+    #     df_vectors = pd.DataFrame.from_records(vectors,
+    #                                            index=None,
+    #                                            #header=vocab,
+    #                                            columns=vocab)
+    #     return df_vectors
+
     def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
         '''calculates word stem frequencies in input articles. returns
         document term matrix(DataFrame) with relative word frequencies
@@ -58,34 +101,35 @@ class BagOfWords:
         '''
         print('# BOW: calculating matrix...')
         print()
-        # create list of tuples
-        vectors = []
+        # create zero-filled dataframe
+        array = np.zeros(shape=(len(series),len(vocab)))
+        df_matrix = pd.DataFrame(array, columns=vocab)
+
         # for every text in series
         for i in range(len(series)):
+
             # extract text of single article
             text = series.iloc[i]
+
             # extract its words
             words = BagOfWords.extract_words(text, stemming)
-            # count words in single article
+            # count words in article
             word_count = len(words)
-            vector = []
-            for i, v in enumerate(vocab):
-                vector.append(0)
+
+            # for every word in global vocab
+            for v in vocab:
+                # for every word in article
                 for w in words:
+                    # find right position
                     if w == v:
                         if relative_word_frequencies:
                             # relative word frequency
-                            vector[i] += 1/word_count
+                            df_matrix.loc[i][v] += 1/word_count
                         else:
                             # absolute word frequency
-                            vector[i] += 1
-            # add single vector as tuple
-            vectors.append(tuple(vector))
-        df_vectors = pd.DataFrame.from_records(vectors,
-                                               index=None,
-                                               #header=vocab,
-                                               columns=vocab)
-        return df_vectors
+
+        return df_matrix
 
     def make_vocab(series, stemming=True):
         '''adds words of input articles to a global vocabulary.
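Note on the replacement: `df_matrix.loc[i][v] += ...` is chained indexing, which pandas resolves in two steps and can apply to a temporary copy; the single-step form `df_matrix.loc[i, v] += ...` is the documented way to write the same update. As a rough sketch (not part of this commit), the matrix can also be filled with one Counter pass per article, avoiding both the vocab × words nested loop and the per-cell DataFrame writes. The name make_matrix_sketch is illustrative, and BagOfWords.extract_words is assumed importable from this file:

    from collections import Counter

    import pandas as pd

    from BagOfWords import BagOfWords  # assumes this repo's BagOfWords.py

    def make_matrix_sketch(series, vocab, relative_word_frequencies=True, stemming=True):
        # map each vocab word to its column index once, for O(1) lookups
        col = {v: j for j, v in enumerate(vocab)}
        rows = []
        for text in series:
            words = BagOfWords.extract_words(text, stemming)
            word_count = len(words)
            row = [0.0] * len(vocab)
            # one pass over each article's word counts instead of vocab x words
            for w, n in Counter(words).items():
                if w in col:
                    row[col[w]] = n / word_count if relative_word_frequencies else n
            rows.append(row)
        return pd.DataFrame(rows, columns=vocab)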
@@ -158,10 +202,14 @@ class BagOfWords:
         # transform list to set to eliminate duplicates
         return set(stop_words)
 
-    def make_dict_common_words(texts, rel_freq=False, stemming=True, n=200):
+    def make_dict_common_words(texts, rel_freq=True, stemming=True, n=200):
         '''texts: df of article texts of complete data set as series,
         return dict of words with their count.
         '''
+        # words under that rel_freq limit are not included
+        limit = 0.0005
+        if not rel_freq:
+            limit = 25
         # word => count
         dict = {}
         vocab = BagOfWords.make_vocab(texts, stemming)
@@ -171,7 +219,8 @@ class BagOfWords:
         # iterate over words
         for column in df_matrix:
             # count word mentions in total
-            dict[column] = df_matrix[column].sum()
+            if (df_matrix[column].sum() > limit):
+                dict[column] = df_matrix[column].sum()
         # sort dict by value and
         o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
                              reverse=True))
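The threshold test added here calls df_matrix[column].sum() twice per kept column; a sketch of the same filter with each total computed once (keeping the source's dict name, which shadows the built-in, for continuity; assumes df_matrix and limit exactly as defined above):

    # per-column totals as a pandas Series, computed once
    sums = df_matrix.sum()
    # keep only words whose total frequency clears the limit
    dict = {word: total for word, total in sums.items() if total > limit}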
@@ -182,9 +231,19 @@ class BagOfWords:
         return n_dict
 
     def count_features(texts, stemming=True):
+        print('# counting all features in corpus...')
+        print()
         vocab = BagOfWords.make_vocab(texts, True)
         return len(vocab)
 
+    def count_all_words(texts):
+        print('# counting all words in corpus...')
+        print()
+        sum = 0
+        for text in texts:
+            sum += len(text.split())
+        return sum
+
 if __name__ == '__main__':
 
     # load new data set
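The new count_all_words uses sum as a variable, shadowing the built-in of the same name; a minimal equivalent sketch (not in the commit) that avoids the shadowing:

    def count_all_words(texts):
        print('# counting all words in corpus...')
        print()
        # built-in sum over a generator: same total, no shadowed name
        return sum(len(text.split()) for text in texts)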
@@ -195,16 +254,16 @@ if __name__ == '__main__':
                              index_col=None,
                              engine='python',
                              usecols=[1,2],
-                             #nrows=10,
+                             nrows=3000,
                              quoting=csv.QUOTE_NONNUMERIC,
                              quotechar='\'')
 
     # find most common words in dataset
     corpus = df_dataset[1] + '. ' + df_dataset[2]
-    # stemming = False
-    # vocab = BagOfWords.make_vocab(corpus, stemming)
-    # print(vocab)
-    # print()
+    stemming = False
+    rel_freq = False
+    vocab = BagOfWords.make_vocab(corpus, stemming)
+
     # print(BagOfWords.make_matrix(corpus, vocab, False, stemming))
-    # print(BagOfWords.make_dict_common_words(corpus, False, stemming, 200))
-    print(BagOfWords.count_features(corpus))
+    print(BagOfWords.make_dict_common_words(corpus, rel_freq, stemming, 200))
+    # print(BagOfWords.count_features(corpus))
VisualizerNews.py

@@ -10,6 +10,7 @@ from NER import NER
 import csv
 from os import path
 
+import matplotlib
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -43,7 +44,10 @@ class VisualizerNews:
                                                    stemming=False,
                                                    n=200)
 
-        wordcloud = WordCloud(width=2400, height=1200, scale=2,
+        wordcloud = WordCloud(background_color='white',
+                              width=2400,
+                              height=1200,
+                              scale=2,
                               # true if bigram:
                               collocations=False).generate_from_frequencies(dict)
 
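For reference, a minimal way to render and keep the cloud configured above; the figure size and output filename are illustrative assumptions, not part of the commit. With scale=2, the drawn image is doubled, so the saved file comes out at 4800×2400 pixels:

    import matplotlib.pyplot as plt

    plt.figure(figsize=(24, 12))
    plt.imshow(wordcloud, interpolation='bilinear')  # the WordCloud built above
    plt.axis('off')
    plt.show()
    wordcloud.to_file('wordcloud.png')  # hypothetical output path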
@@ -72,7 +76,7 @@ class VisualizerNews:
         # only articles with label==1
         df_hits = df[df['Label'] == 1]
 
-        texts = df_hits['Title'] + ' ' + df_hits['Text']
+        texts = df_hits['Title'] + '. ' + df_hits['Text']
 
         # # read them to check
         # for text in texts[10:20]:
@@ -93,7 +97,7 @@ class VisualizerNews:
         # Number of companies with this number of mentions
         plt.ylabel('Number of companies with this number of articles')
         num_bins = 50
-        n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
+        n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
         # plt.grid(True)
         plt.show()
 
@@ -132,13 +136,16 @@ class VisualizerNews:
         # convert list to array
         names = np.asarray(count_chars)
         # plt.title('Length of News Articles')
-        plt.xlabel('Number of Characters in an Article')
+        plt.xlabel('Number of characters in an article')
         plt.ylabel('Frequency')
         # number of vertical bins
         num_bins = 200
-        n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
+        n, bins, patches = plt.hist(names, num_bins, facecolor='darkslategrey', alpha=0.5)
         # [xmin, xmax, ymin, ymax] of axis
-        plt.axis([300, 10000, 0, 500])
+        #plt.axis([format(300, ','),format(10000, ','), 0, 500])
+        plt.axis([300,10000,0,500])
+        # format axis labels for thousands (e.g. '10,000')
+        plt.gca().xaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
         plt.show()
 
     def plot_pie_chart_of_sites():
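The added FuncFormatter builds the comma-grouped tick labels by hand; matplotlib's StrMethodFormatter expresses the same formatting more compactly (an equivalent, not what the commit uses):

    import matplotlib.ticker as mticker

    # same '10,000'-style labels via format-string syntax
    plt.gca().xaxis.set_major_formatter(mticker.StrMethodFormatter('{x:,.0f}'))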
@@ -191,7 +198,7 @@ class VisualizerNews:
                                  #usecols=[1,2],
                                  index_col=None,
                                  engine='python',
-                                 #nrows=100,
+                                 #nrows=1000,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
 
@@ -199,22 +206,25 @@ class VisualizerNews:
 
         # find most common words in dataset
         dict = BagOfWords.make_dict_common_words(corpus,
-                                                 rel_freq=False,
+                                                 rel_freq=True,
                                                  stemming=False,
                                                  n=n_commons)
 
-        plt.xlabel('Most Common Words in News Articles')
-        plt.ylabel('Frequency')
+        plt.xlabel('Most common words in textual corpus')
+        plt.ylabel('Relative frequency')
 
         labels = list(dict.keys())
         numbers = list(dict.values())
         nbars = n_commons
-        plt.bar(np.arange(nbars), height=numbers, tick_label=labels)
+        plt.bar(np.arange(nbars),
+                height=numbers,
+                tick_label=labels,
+                facecolor='darkorange')
         plt.show()
 
 if __name__ == '__main__':
     # VisualizerNews.plot_histogram_companies()
     # VisualizerNews.plot_wordcloud_dataset()
     # VisualizerNews.plot_histogram_text_lengths()
-    VisualizerNews.plot_pie_chart_of_sites()
-    # VisualizerNews.plot_hist_most_common_words()
+    # VisualizerNews.plot_pie_chart_of_sites()
+    VisualizerNews.plot_hist_most_common_words()