changed CountVectorizer optional and other things

This commit is contained in:
parent 2d5368e283
commit 7e037a1621

BagOfWords.py (205 lines changed)
@@ -9,7 +9,7 @@ vocabulary. As the multinomial Naive Bayes classifier is suitable for
 classification with discrete features (e.g., word counts for text
 classification). The multinomial distribution normally requires integer
 feature counts. However, in practice, fractional counts such as tf-idf may
-also work. => considered by 'relative_word_frequencies' as parameter.
+also work => considered by 'rel_freq'(relative word frequencies) as parameter.
 '''
 from collections import OrderedDict
 import csv
@@ -21,11 +21,14 @@ from nltk.stem.porter import PorterStemmer

 class BagOfWords:

-    def fit_transform(X, relative_word_frequencies=True):
+    def fit_transform(corpus, rel_freq=True, stemming=True):
         ''' similar to CountVectorizer's fit_transform method
         '''
-        vocab = BagOfWords.make_vocab(X)
-        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+                                        stemming)
+        return matrix

     def extract_words(text, stemming=True):
         '''takes article as argument, removes numbers,
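For orientation, a minimal sketch of how the refactored pipeline is meant to be called (the toy corpus and variable names are illustrative, not part of the commit):

    import pandas as pd
    from BagOfWords import BagOfWords

    # hypothetical two-article corpus standing in for the CSV data
    corpus = pd.Series(['Company A acquires Company B.',
                        'Quarterly results beat expectations.'])

    # extract words once, then reuse them for vocabulary and matrix,
    # which is exactly what the new fit_transform does in one call
    extracted_words = BagOfWords.extract_all_words(corpus, stemming=True)
    vocab = BagOfWords.make_vocab(extracted_words, stemming=True)
    matrix = BagOfWords.make_matrix(extracted_words, vocab,
                                    rel_freq=True, stemming=True)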
@@ -46,52 +49,25 @@ class BagOfWords:
             if stemming:
                 # reduce word to its stem
                 word = stemmer.stem(word)
                 # filter out spam chars
                 word = word.replace('â', '').replace('œ', '')\
                            .replace('ã', '')
             words_cleaned.append(word)
         return words_cleaned

-    # def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
-    #     '''calculates word stem frequencies in input articles. returns
-    #     document term matrix(DataFrame) with relative word frequencies
-    #     (0 <= values < 1) if relative_word_frequencies=True or absolute
-    #     word frequencies (int) if relative_word_frequencies=False.
-    #     (rows: different articles, colums: different words in vocab)
-    #     returns matrix as DataFrame
-    #     '''
-    #     print('# BOW: calculating matrix...')
-    #     print()
-    #     # create list of tuples
-    #     vectors = []
-    #     # for every text in series
-    #     for i in range(len(series)):
-    #         # extract text of single article
-    #         text = series.iloc[i]
-    #         # extract its words
-    #         words = BagOfWords.extract_words(text, stemming)
-    #         # count words in single article
-    #         word_count = len(words)
-    #         vector = []
-    #         for i, v in enumerate(vocab):
-    #             vector.append(0)
-    #             for w in words:
-    #                 if w == v:
-    #                     if relative_word_frequencies:
-    #                         # relative word frequency
-    #                         vector[i] += 1/word_count
-    #                     else:
-    #                         # absolute word frequency
-    #                         vector[i] += 1
+    def extract_all_words(corpus, stemming=True):
+        '''param: all articles of corpus
+        returns list of lists of all extracted words, one row per article
+        '''
+        extracted_words = []
+        print('# extracting all words from articles...')
+        print()
+        for text in corpus:
+            extracted_words.append(BagOfWords.extract_words(text, stemming))
+        # # !!! the MemoryError always happens here: !!!
+        return extracted_words

-    #         # add single vector as tuple
-    #         vectors.append(tuple(vector))
-    #     df_vectors = pd.DataFrame.from_records(vectors,
-    #                                            index=None,
-    #                                            #header=vocab,
-    #                                            columns=vocab)
-    #     return df_vectors

-    def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
+    def make_matrix(extracted_words, vocab, rel_freq=True, stemming=True):
         '''calculates word stem frequencies in input articles. returns
         document term matrix(DataFrame) with relative word frequencies
         (0 <= values < 1) if relative_word_frequencies=True or absolute
@@ -101,28 +77,38 @@ class BagOfWords:
         '''
         print('# BOW: calculating matrix...')
         print()

+        # total number of words in bag of words
+        word_count = 0
+        print('# counting number of features in corpus...')
+        print()
+        for list in extracted_words:
+            word_count += len(list)
+
+        # number of articles
+        n_articles = len(extracted_words)
+        # number of words in vocab
+        l_vocab = len(vocab)
+
         # create zero-filled dataframe
-        array = np.zeros(shape=(len(series),len(vocab)))
+        array = np.zeros(shape=(n_articles, l_vocab))
         df_matrix = pd.DataFrame(array, columns=vocab)

+        print('# calculating frequencies...')
+        print()
+
         # for every text in series
-        for i in range(len(series)):
+        for i in range(len(extracted_words)):

-            # extract text of single article
-            text = series.iloc[i]
+            # extract words of single article
+            words = extracted_words[i]

-            # extract its words
-            words = BagOfWords.extract_words(text, stemming)
-            # count words in article
-            word_count = len(words)

             # for every word in global vocab
             for v in vocab:
                 # for every word in article
                 for w in words:
                     # find right position
                     if w == v:
-                        if relative_word_frequencies:
+                        if rel_freq:
                             # relative word frequency
                             df_matrix.loc[i][v] += 1/word_count
                         else:
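An aside on the cell update above: assignment through chained indexing like df_matrix.loc[i][v] += ... can run into pandas' copy-versus-view ambiguity; the unambiguous single-cell accessor is .at. A minimal sketch, not part of the commit:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.zeros((2, 2)), columns=['a', 'b'])
    # update one cell by row label and column name
    df.at[0, 'a'] += 0.5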
@@ -131,18 +117,22 @@ class BagOfWords:

         return df_matrix

-    def make_vocab(series, stemming=True):
-        '''adds words of input articles to a global vocabulary.
-        input: dataframe of all articles, return value: list of words
+    def make_vocab(extracted_words, stemming=True):
+        '''adds all words to a global vocabulary.
+        input: list of lists of all extracted words, returns: list of words
         '''
         print('# BOW: making vocabulary of data set...')
         print()
         vocab = set()
-        # for every article's text
-        for text in series:
-            # add single article's text to total vocabulary
-            vocab |= set(BagOfWords.extract_words(text, stemming))
-        return vocab
+        for e_list in extracted_words:
+            for word in e_list:
+                # add every single word to vocabulary
+                vocab.add(word)
+        print('# vocabulary consists of {} features.'.format(len(vocab)))
+        print()
+        # transform set to list
+        return list(vocab)

     def set_stop_words(stemming=True):
         '''creates list of all words that will be ignored
@@ -179,7 +169,7 @@ class BagOfWords:
                       'yourselves']

         #add unwanted terms
-        stop_words.extend(['reuters', 'bloomberg', 'cnn', 'n', 'l', 'â',
+        stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
                            'file', 'photo', 'min', 'read', 'staff', 'left',
                            'right', 'updated', 'minutes', 'brief', 'editing',
                            'reporting', 'ago', 'also', 'would', 'could',
@@ -202,20 +192,23 @@ class BagOfWords:
         # transform list to set to eliminate duplicates
         return set(stop_words)

-    def make_dict_common_words(texts, rel_freq=True, stemming=True, n=200):
-        '''texts: df of article texts of complete data set as series,
-        return dict of words with their count.
+    def make_dict_common_words(df_matrix, n=200, rel_freq=True, stemming=True):
+        '''params: DataFrame document term matrix of complete data set,
+        number of n most common words.
+        returns: dict of words with their count.
         '''
         print('# making dictionary of most common words...')
         print()

-        # words under that rel_freq limit are not included
-        limit = 0.0005
+        # set limit
+        limit = 0.001
         if not rel_freq:
-            limit = 25
+            limit = len(df_matrix) * 0.001

         # word => count
         dict = {}
-        vocab = BagOfWords.make_vocab(texts, stemming)
-        # calculate document term matrix
-        df_matrix = BagOfWords.make_matrix(texts, vocab, rel_freq, stemming)
-        print(df_matrix.shape)

         # iterate over words
         for column in df_matrix:
             # count word mentions in total
@@ -224,16 +217,23 @@ class BagOfWords:
         # sort dict by value and
         o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
                              reverse=True))
+        print(o_dict)
         # return n higest values as dict (word => count)
         n_dict = {}

         for i in range(n):
-            n_dict[o_dict.popitem(last=False)[0]] = o_dict.popitem(last=False)[1]
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]

         return n_dict

     def count_features(texts, stemming=True):
         ''' count total number of features in textual corpus
         '''
         print('# counting all features in corpus...')
         print()
-        vocab = BagOfWords.make_vocab(texts, True)
+        vocab = BagOfWords.make_vocab(texts, stemming)
         return len(vocab)

     def count_all_words(texts):
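The loop fix above is easy to miss: the old one-liner called o_dict.popitem(last=False) twice per iteration, so each stored key received the count of a different entry and every other word was dropped. A quick sketch of the corrected behaviour (toy counts, not from the data set):

    from collections import OrderedDict

    o_dict = OrderedDict([('merger', 30), ('deal', 25), ('share', 20)])
    n_dict = {}
    for i in range(2):
        # popitem(last=False) removes and returns the current highest entry
        next_highest = o_dict.popitem(last=False)
        n_dict[next_highest[0]] = next_highest[1]
    # n_dict == {'merger': 30, 'deal': 25}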
@@ -244,26 +244,37 @@ class BagOfWords:
             sum += len(text.split())
         return sum

+    def test():
+        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        df_dataset = pd.read_csv(file,
+                                 delimiter='|',
+                                 header=None,
+                                 index_col=None,
+                                 engine='python',
+                                 usecols=[1,2],
+                                 nrows=100,
+                                 quoting=csv.QUOTE_NONNUMERIC,
+                                 quotechar='\'')
+
+        corpus = df_dataset[1] + '. ' + df_dataset[2]
+        stemming = True
+        rel_freq = True
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        #print(vocab)
+        for text in corpus:
+            print(text)
+            print()
+            print()
+        # from here on, a ValueError occurs at nrows=10000...
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
+        print(dict)

 if __name__ == '__main__':

     # load new data set
     file = 'data\\interactive_labeling_dataset_without_header.csv'
     df_dataset = pd.read_csv(file,
                              delimiter='|',
                              header=None,
                              index_col=None,
                              engine='python',
                              usecols=[1,2],
                              nrows=3000,
                              quoting=csv.QUOTE_NONNUMERIC,
                              quotechar='\'')

     # find most common words in dataset
     corpus = df_dataset[1] + '. ' + df_dataset[2]
     stemming = False
     rel_freq = False
-    vocab = BagOfWords.make_vocab(corpus, stemming)

-    # print(BagOfWords.make_matrix(corpus, vocab, False, stemming))
-    print(BagOfWords.make_dict_common_words(corpus, rel_freq, stemming, 200))
-    # print(BagOfWords.count_features(corpus))
     for word in sorted(BagOfWords.set_stop_words(False)):
         print(word)
         print()
         print(PorterStemmer().stem(word))
         print()
+    # BagOfWords.test()
CosineSimilarity.py

@@ -4,74 +4,77 @@ Cosine Similarity

 CosineSimilarity measures the similarity between to articles.
 It calculates c: the cosine of the angle between the articles
-vectors dict_1 and dict_2.
-c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
+vectors text_1 and text_2.
+c = (text_1 * text_2) / (|text_1| * |text_2|).
 c = 1, if articles are equal => identicalness is 100%
 0 > c > 1, else => identicalness is (c*100)%
 (The greater c, the more similar two articles are.)
 '''
-from BagOfWords import BagOfWords

-#TODO:uses dictionaries of each article
-#=>ToDo:has to be changed as we are now using vectors

 import csv
 import math

+from BagOfWords import BagOfWords
 import pandas as pd

 class CosineSimilarity:

-    def cos_sim(dict_1, dict_2):
+    def calc_similarity(text_1, text_2, rel_freq=True, stemming=True):
         ''' calculates cosine similarity of two input articles
         '''
         print('# calculating cosine similarity...')
         print()

-        # list of all different words
-        vocab = []
+        # extract words from articles
+        extracted_words_1 = BagOfWords.extract_words(text_1, stemming)
+        extracted_words_2 = BagOfWords.extract_words(text_2, stemming)
+        print(extracted_words_1)
+        print(extracted_words_2)

-        # insert words of 1st article into vocab
-        for key in dict_1.keys():
-            if key not in vocab:
-                vocab.append(key)

-        # insert words of 2nd article into vocab
-        for key in dict_2.keys():
-            if key not in vocab:
-                vocab.append(key)

-        # delete first entry ('sum_words')
-        vocab.pop(0)
+        # insert words into vocab
+        both_extracted = []
+        both_extracted.append(extracted_words_1)
+        both_extracted.append(extracted_words_2)
+        vocab = BagOfWords.make_vocab(both_extracted, stemming)

         # create vectors
-        vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
-        vector_2 = CosineSimilarity.create_vector(dict_2, vocab)
+        matrix = BagOfWords.make_matrix(both_extracted, vocab,\
+                                        rel_freq, stemming)

         # start calculation
         # calculate numerator of formula
         sum_1 = 0

-        for i in range (0,len(vector_1)):
-            sum_1 += vector_1[i] * vector_2[i]
+        for i in range (0,len(matrix.iloc[0])):
+            sum_1 += matrix.iloc[0][i] * matrix.iloc[1][i]

         # calculate denominator of formula
         sum_2 = 0

-        for entry in vector_1:
+        for entry in matrix.iloc[0]:
             sum_2 += entry ** 2

         sum_3 = 0
-        for entry in vector_2:
+        for entry in matrix.iloc[1]:
             sum_3 += entry ** 2

         return sum_1 / (math.sqrt(sum_2) * math.sqrt(sum_3))

-    def create_vector(dict, vocab):
-        # word frequency vector
-        vector = []
-        for word in vocab:
-            # check if word occurs in article
-            if word in dict:
-                # insert word count
-                vector.append(dict[word])
-            else:
-                # insert zero
-                vector.append(0)
-        # delete first entry ('sum_words')
-        vector.pop(0)
-        return vector
+if __name__ == '__main__':
+    # read data set
+    file = 'data\\interactive_labeling_dataset_without_header.csv'
+    df = pd.read_csv(file,
+                     delimiter='|',
+                     header=None,
+                     index_col=None,
+                     engine='python',
+                     usecols=[1,2],
+                     nrows=100,
+                     quoting=csv.QUOTE_NONNUMERIC,
+                     quotechar='\'')
+
+    texts = df[1] + '. ' + df[2]
+
+    # compare first and second article in data set
+    print(CosineSimilarity.calc_similarity(texts.iloc[0], texts.iloc[1],\
+          rel_freq=True, stemming=True))
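The loop-based computation in calc_similarity is the usual vector formula spelled out; for reference, an equivalent compact NumPy sketch (illustrative only, not part of the commit):

    import numpy as np

    def cos_sim(v1, v2):
        # c = (v1 . v2) / (|v1| * |v2|)
        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

    # identical vectors => c == 1.0
    print(cos_sim(np.array([1.0, 2.0, 0.0]), np.array([1.0, 2.0, 0.0])))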
DecisionTree.py

@@ -7,6 +7,9 @@ array X of size [n_samples, n_features], holding the training samples,
 and array y of integer values, size [n_samples],
 holding the class labels for the training samples.
 '''
+# toDo: replace old dataset!!!
+# CountVectorizer is not working yet
+
 from BagOfWords import BagOfWords

 import csv
@@ -16,21 +19,22 @@ import graphviz
 import numpy as np
 import pandas as pd
 from sklearn import tree
-#from sklearn.feature_extraction.text import CountVectorizer
+# from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import f1_score
 from sklearn.model_selection import StratifiedKFold

 class DecisionTree:

-    def make_tree(dataset):
+    def make_tree(dataset, sklearn_cv=False, stemming=False, percentile=100):
         print('# fitting model')
         print('# ...')

         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']

-        #count_vector = CountVectorizer()
+        if sklearn_cv:
+            cv = CountVectorizer()

         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)
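For reference, a toy illustration of the StratifiedKFold splitter used throughout these classifiers; each fold preserves the class ratio of y (illustrative data only):

    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    X = np.arange(8).reshape(8, 1)
    y = np.array([0, 0, 0, 0, 1, 1, 1, 1])

    skf = StratifiedKFold(n_splits=2, shuffle=True)
    for train, test in skf.split(X, y):
        # train/test are index arrays; each test fold holds two
        # samples of every class
        print(train, test)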
@@ -45,33 +49,48 @@ class DecisionTree:
         important_words = {}

         # for each fold
+        n = 0
         for train, test in skf.split(X,y):

-            # BOW
-            vocab = BagOfWords.make_vocab(X[train])
-            # fit the training data and then return the matrix
-            training_data = BagOfWords.make_matrix(X[train], vocab)
-            # transform testing data and return the matrix
-            testing_data = BagOfWords.make_matrix(X[test], vocab)
+            n += 1
+            vocab = []
+            print('# split no. ' + str(n))

-            # #fit the training data and then return the matrix
-            # training_data = count_vector.fit_transform(X[train], y[train]).toarray()
-            # #transform testing data and return the matrix
-            # testing_data = count_vector.transform(X[test]).toarray()
+            if sklearn_cv:
+                # use sklearn CountVectorizer
+                # fit the training data and then return the matrix
+                training_data = cv.fit_transform(X[train], y[train]).toarray()
+                # transform testing data and return the matrix
+                testing_data = cv.transform(X[test]).toarray()
+            else:
+                # use my own BagOfWords python implementation
+                rel_freq = True
+                extracted_words = BagOfWords.extract_all_words(X[train], stemming)
+                vocab = BagOfWords.make_vocab(extracted_words, stemming)
+                print(vocab)

-            # # apply select percentile
-            # selector = SelectPercentile(percentile=25)
-            # selector.fit(training_data, y[train])
+                # fit the training data and then return the matrix
+                training_data = BagOfWords.make_matrix(extracted_words,
+                                                       vocab, rel_freq, stemming)
+                # transform testing data and return the matrix
+                extracted_words = BagOfWords.extract_all_words(X[test], stemming)
+                testing_data = BagOfWords.make_matrix(extracted_words,
+                                                      vocab, rel_freq, stemming)

-            # training_data_r = selector.transform(training_data)
-            # testing_data_r = selector.transform(testing_data)
+            # apply select percentile
+            selector = SelectPercentile(percentile=percentile)
+            selector.fit(training_data, y[train])
+
+            # new reduced data sets
+            training_data_r = selector.transform(training_data)
+            testing_data_r = selector.transform(testing_data)

             # fit classifier
-            classifier.fit(training_data, y[train])
+            classifier.fit(training_data_r, y[train])

             #predict class
-            predictions_train = classifier.predict(training_data)
-            predictions_test = classifier.predict(testing_data)
+            predictions_train = classifier.predict(training_data_r)
+            predictions_test = classifier.predict(testing_data_r)

             #store metrics predicted on test/train set
             f1_scores.append(f1_score(y[test], predictions_test))
@@ -80,6 +99,7 @@ class DecisionTree:
             # search for important features
             feature_importances = np.array(classifier.feature_importances_)
             important_indices = feature_importances.argsort()[-50:][::-1]
+            print(important_indices)

             for i in important_indices:
                 if vocab[i] in important_words:
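Both code paths now funnel their document-term matrices through sklearn's SelectPercentile before fitting; a minimal sketch of that feature-selection step in isolation (toy data, default ANOVA F-score):

    import numpy as np
    from sklearn.feature_selection import SelectPercentile

    # toy matrix: 4 samples x 5 features, binary labels
    X = np.array([[1, 0, 3, 0, 2],
                  [0, 1, 2, 0, 1],
                  [1, 0, 0, 1, 4],
                  [0, 1, 1, 1, 3]])
    y = np.array([0, 0, 1, 1])

    # keep the top 40% of features (here: 2 of 5)
    selector = SelectPercentile(percentile=40)
    selector.fit(X, y)
    X_r = selector.transform(X)  # reduced matrix of shape (4, 2)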
FilterKeywords.py

@@ -6,6 +6,8 @@ FilterKeywords searches for merger specific keywords
 in an article and counts them.
 '''

+# toDo: replace dict by vector/matrix
+
 from collections import defaultdict
 import re
@@ -18,14 +20,6 @@ class FilterKeywords:
     output are the contained keywords and their count.
     '''

-    # # list of regular expressions that match merger specific keywords
-    # regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',
-    #               r'business combinations?', r'combined compan(y|ies)',
-    #               r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up',
-    #               r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?',
-    #               r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?',
-    #               r'purchase', r'(sell(s|ers?|ing)?|sold)']

     keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
                     'acquisition', 'acquire', 'acquisitions', 'acquires',
                     'combine', 'combines', 'combination', 'combined',
@@ -44,22 +38,22 @@ class FilterKeywords:
         # remove duplicates
         keywords = set(keyword_list)

-        # counts keywords in article (default value: 0)
-        dict_keywords = defaultdict(int)
+        # # counts keywords in article (default value: 0)
+        # dict_keywords = defaultdict(int)

-        # search for matchings in dictionary of input article
-        for key in dict_input.keys():
-            # iterate over all regular expressions
-            for kword in keywords:
-                if re.match(kword, key):
-                    # if match, increase value of matching key
-                    if str(kword) in dict_keywords:
-                        dict_keywords[str(kword)] += dict_input[key]
-                    else:
-                        dict_keywords[str(kword)] = dict_input[key]
+        # # search for matchings in dictionary of input article
+        # for key in dict_input.keys():
+        #     # iterate over all regular expressions
+        #     for kword in keywords:
+        #         if re.match(kword, key):
+        #             # if match, increase value of matching key
+        #             if str(kword) in dict_keywords:
+        #                 dict_keywords[str(kword)] += dict_input[key]
+        #             else:
+        #                 dict_keywords[str(kword)] = dict_input[key]

-        return dict_keywords
+        # return dict_keywords

 if __name__ == '__main__':
-    dict_test={'example':2, 'combined':5, 'sells':3}
-    print(FilterKeywords.search_keywords(dict_test))
+    # dict_test={'example':2, 'combined':5, 'sells':3}
+    # print(FilterKeywords.search_keywords(dict_test))
NaiveBayes.py

@@ -25,7 +25,7 @@ from sklearn.naive_bayes import GaussianNB

 class NaiveBayes:

-    def make_naive_bayes(dataset):
+    def make_naive_bayes(dataset, sklearn_cv=True, percentile=100):
         '''fits naive bayes model with StratifiedKFold,
         uses my BOW
         '''
@@ -34,10 +34,11 @@ class NaiveBayes:

         # split data into text and label set
         # join title and text
-        X = dataset['Title'] + ' ' + dataset['Text']
+        X = dataset['Title'] + '. ' + dataset['Text']
         y = dataset['Label']

-        cv = CountVectorizer()
+        if sklearn_cv:
+            cv = CountVectorizer()

         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
@@ -61,23 +62,32 @@ class NaiveBayes:
             n += 1
             print('# split no. ' + str(n))

-            # # own BOW
-            # vocab = BagOfWords.make_vocab(X[train])
-            # # fit the training data and then return the matrix
-            # training_data = BagOfWords.make_matrix(X[train], vocab)
-            # # transform testing data and return the matrix
-            # testing_data = BagOfWords.make_matrix(X[test], vocab)
+            if sklearn_cv:
+                # use sklearn CountVectorizer
+                # fit the training data and then return the matrix
+                training_data = cv.fit_transform(X[train], y[train]).toarray()
+                # transform testing data and return the matrix
+                testing_data = cv.transform(X[test]).toarray()
+            else:
+                # use my own BagOfWords python implementation
+                stemming = True
+                rel_freq = True
+                extracted_words = BagOfWords.extract_all_words(X[train])
+                vocab = BagOfWords.make_vocab(extracted_words)

-            # using CountVectorizer:
-            # fit the training data and then return the matrix
-            training_data = cv.fit_transform(X[train], y[train]).toarray()
-            # transform testing data and return the matrix
-            testing_data = cv.transform(X[test]).toarray()
+                # fit the training data and then return the matrix
+                training_data = BagOfWords.make_matrix(extracted_words,
+                                                       vocab, rel_freq, stemming)
+                # transform testing data and return the matrix
+                extracted_words = BagOfWords.extract_all_words(X[test])
+                testing_data = BagOfWords.make_matrix(extracted_words,
+                                                      vocab, rel_freq, stemming)

             # apply select percentile
-            selector = SelectPercentile(percentile=100)
+            selector = SelectPercentile(percentile=percentile)
             selector.fit(training_data, y[train])

             # new reduced data sets
             training_data_r = selector.transform(training_data)
             testing_data_r = selector.transform(testing_data)
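A side note on the .toarray() calls kept in the sklearn branch: CountVectorizer returns a scipy sparse matrix, while GaussianNB requires dense input, hence the conversion. Minimal sketch:

    from sklearn.feature_extraction.text import CountVectorizer

    cv = CountVectorizer()
    docs = ['merger announced today', 'quarterly results today']
    sparse = cv.fit_transform(docs)  # scipy.sparse matrix
    dense = sparse.toarray()         # dense numpy array, here of shape (2, 5)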
NaiveBayes_Interactive.py

@@ -10,13 +10,14 @@ import csv

 import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import recall_score, precision_score
 from sklearn.model_selection import StratifiedKFold
 from sklearn.naive_bayes import GaussianNB

 class NaiveBayes_Interactive:

-    def make_naive_bayes(dataset):
+    def make_naive_bayes(dataset, sklearn_cv=True, percentile=100):
         '''fits naive bayes model
         '''
         print('# fitting model')
@@ -24,10 +25,11 @@ class NaiveBayes_Interactive:

         # split data into text and label set
         # join title and text
-        X = dataset['Title'] + ' ' + dataset['Text']
+        X = dataset['Title'] + '. ' + dataset['Text']
         y = dataset['Label']

-        cv = CountVectorizer()
+        if sklearn_cv:
+            cv = CountVectorizer()

         # stratified k-fold cross-validation as split method
         kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)
@@ -51,17 +53,40 @@ class NaiveBayes_Interactive:
             n += 1
             print('# split no. ' + str(n))

-            # using CountVectorizer:
-            # fit the training data and then return the matrix
-            training_data = cv.fit_transform(X[train], y[train]).toarray()
-            # transform testing data and return the matrix
-            testing_data = cv.transform(X[test]).toarray()
+            if sklearn_cv:
+                # use sklearn CountVectorizer
+                # fit the training data and then return the matrix
+                training_data = cv.fit_transform(X[train], y[train]).toarray()
+                # transform testing data and return the matrix
+                testing_data = cv.transform(X[test]).toarray()
+            else:
+                # use my own BagOfWords python implementation
+                stemming = True
+                rel_freq = True
+                extracted_words = BagOfWords.extract_all_words(X[train])
+                vocab = BagOfWords.make_vocab(extracted_words)
+
+                # fit the training data and then return the matrix
+                training_data = BagOfWords.make_matrix(extracted_words,
+                                                       vocab, rel_freq, stemming)
+                # transform testing data and return the matrix
+                extracted_words = BagOfWords.extract_all_words(X[test])
+                testing_data = BagOfWords.make_matrix(extracted_words,
+                                                      vocab, rel_freq, stemming)
+
+            # apply select percentile
+            selector = SelectPercentile(percentile=percentile)
+            selector.fit(training_data, y[train])
+
+            # new reduced data sets
+            training_data_r = selector.transform(training_data)
+            testing_data_r = selector.transform(testing_data)

             #fit classifier
-            classifier.fit(training_data, y[train])
+            classifier.fit(training_data_r, y[train])
             #predict class
-            predictions_train = classifier.predict(training_data)
-            predictions_test = classifier.predict(testing_data)
+            predictions_train = classifier.predict(training_data_r)
+            predictions_test = classifier.predict(testing_data_r)

             #print and store metrics
             rec = recall_score(y[test], predictions_test)
@@ -166,7 +191,9 @@ class NaiveBayes_Interactive:
                        quotechar='\'',
                        quoting=csv.QUOTE_NONE)

-    make_naive_bayes(data)
+    use_count_vectorizer = True
+    select_percentile = 100
+    make_naive_bayes(data, use_count_vectorizer, select_percentile)

     print('#')
     print('# ending naive bayes')
SVM.py (21 lines changed)
@@ -27,7 +27,7 @@ from sklearn.svm import SVC

 class SVM:

-    def make_svm(dataset):
+    def make_svm(dataset, sklearn_cv=True):

         print('# fitting model')
         print('# ...')
@@ -35,16 +35,18 @@ class SVM:
         # split data into text and label set

         # articles' text (title + text)
-        X = dataset['Title'] + ' ' + dataset['Text']
+        X = dataset['Title'] + '. ' + dataset['Text']
         # articles' labels
         y = dataset['Label']
+        matrix = pd.DataFrame()

         # Bag of Words
         print('# calculating bag of words')
         print('# ...')
-        # fit the training data and then return the matrix
-        #X = BagOfWords.fit_transform(X)
-        X = CountVectorizer().fit_transform(X).toarray()
+        if sklearn_cv:
+            # use sklearn CountVectorizer
+            matrix = CountVectorizer().fit_transform(X).toarray()
+        else:
+            # use own BOW implementation
+            matrix = BagOfWords.fit_transform(X)

         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)
@@ -64,7 +66,7 @@ class SVM:
         print('# fit classifier')
         print('# ...')

-        grid.fit(X,y)
+        grid.fit(matrix,y)

         # DataFrame of results
         df_results = grid.cv_results_
@@ -104,6 +106,7 @@ class SVM:
                        quotechar='\'',
                        quoting=csv.QUOTE_NONE)

-    make_svm(data)
+    use_count_vectorizer = True
+    make_svm(data, use_count_vectorizer)

     print('# ending svm')
VisualizerNews.py

@@ -22,7 +22,7 @@ class VisualizerNews:
     def plot_wordcloud_dataset():
         '''plots word cloud image of most common words in dataset.
         '''
-        print('# preparing word cloud...')
+        print('# preparing word cloud of 200 most common words...')
         print()
         # load new data set
         file = 'data\\interactive_labeling_dataset_without_header.csv'
@@ -32,17 +32,18 @@ class VisualizerNews:
                                  index_col=None,
                                  engine='python',
                                  usecols=[1,2],
                                  #nrows=100,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')

-        corpus = df_dataset[1] + ' ' + df_dataset[2]
+        corpus = df_dataset[1] + '. ' + df_dataset[2]
+        stemming = False
+        rel_freq = False

         # find most common words in dataset
-        dict = BagOfWords.make_dict_common_words(corpus,
-                                                 rel_freq=True,
-                                                 stemming=False,
-                                                 n=200)
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, 200, rel_freq, stemming)

         wordcloud = WordCloud(background_color='white',
                               width=2400,
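The word cloud is then rendered from that dict of the 200 most common words; a minimal sketch of this step with the wordcloud package (toy frequencies, illustrative parameters):

    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    # hypothetical frequency dict as produced by make_dict_common_words
    frequencies = {'merger': 120, 'deal': 95, 'share': 80}

    wc = WordCloud(background_color='white', width=2400, height=1200)
    wc.generate_from_frequencies(frequencies)

    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()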
@@ -62,30 +63,25 @@ class VisualizerNews:
         x-axis: number of mentions of the company
         y-axis: frequency
         '''
-        print('# preparing histogram...')
+        print('# preparing histogram of company mentions...')
         print()
-        # old data set
-        filepath = 'data\\classification_labelled_corrected.csv'
-        df = pd.read_csv(filepath,
-                         sep='|',
+        # read data set
+        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        df = pd.read_csv(file,
+                         delimiter='|',
                          header=None,
                          index_col=None,
                          engine='python',
-                         decimal='.',
-                         quotechar='\'',
-                         quoting=csv.QUOTE_NONE)
+                         usecols=[1,2],
+                         quoting=csv.QUOTE_NONNUMERIC,
+                         quotechar='\'')

-        # only articles with label==1
-        df_hits = df[df['Label'] == 1]
-        texts = df_hits['Title'] + '. ' + df_hits['Text']
+        # # only articles with label==1
+        # df_hits = df[df['Label'] == 1]
+        # texts = df_hits['Title'] + '. ' + df_hits['Text']
+        texts = df[1] + '. ' + df[2]

-        # # read for checking
-        # for text in texts[10:20]:
-        #     print(text)
-        #     print()
-        #     print(NER.find_companies(text))
-        #     print()

-        # count names in hit articles
+        # dict: count articles with company names
         count_names = NER.count_companies(texts)

         # sort list in descending order
@@ -98,7 +94,7 @@ class VisualizerNews:
         plt.ylabel('Number of companies with this number of articles')
         num_bins = 50
         n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
-        # plt.grid(True)
+        plt.axis([0, 50, 0, 1000])
         plt.show()

     def plot_histogram_text_lengths():
@@ -106,20 +102,21 @@ class VisualizerNews:
         x-axis: number of characters in article (without headline)
         y-axis: frequency
         '''
-        print('# preparing histogram...')
+        print('# preparing histogram of text lengths...')
         print()
-        # new data set
+        # read data set
         filepath = 'data\\interactive_labeling_dataset.csv'
         df_dataset = pd.read_csv(filepath,
                                  delimiter='|',
                                  header=0,
                                  index_col=None,
                                  engine='python',
                                  usecols=[2],
                                  #nrows=100,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
         # consider only Text, not Headline
-        texts = df_dataset['Text']
+        texts = df_dataset[2]

         # count characters in articles
         print('# counting characters in articles...')
@@ -150,7 +147,7 @@ class VisualizerNews:

     def plot_pie_chart_of_sites():

-        print('# preparing pie chart...')
+        print('# preparing pie chart of news article sites...')
         print()

         # load data set
@@ -164,13 +161,15 @@ class VisualizerNews:
                                  #nrows=100,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')

         # find all different sites
         df_counts = df_dataset.groupby('Site').count()
+        # count occurences of each site
+        df_counts = df_counts.sort_values(['Url'], ascending=False)

         fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))

         data = list(df_counts['Url'])
         # legend labels
         labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
                   'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']
@@ -188,14 +187,14 @@ class VisualizerNews:
         plt.show()

     def plot_hist_most_common_words(n_commons = 10):
-        print('# preparing histogram...')
+        print('# preparing histogram of most common words...')
         print()
         # load data set
         filepath = 'data\\interactive_labeling_dataset_without_header.csv'
         df_dataset = pd.read_csv(filepath,
                                  delimiter='|',
                                  header=None,
-                                 #usecols=[1,2],
+                                 usecols=[1,2],
                                  index_col=None,
                                  engine='python',
                                  #nrows=1000,
@@ -204,11 +203,14 @@ class VisualizerNews:

         corpus = df_dataset[1] + '. ' + df_dataset[2]

+        stemming = False
+        rel_freq = True
+
         # find most common words in dataset
-        dict = BagOfWords.make_dict_common_words(corpus,
-                                                 rel_freq=True,
-                                                 stemming=False,
-                                                 n=n_commons)
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq, stemming)

         plt.xlabel('Most common words in textual corpus')
         plt.ylabel('Relative frequency')