changed CountVectorizer optional and other things

Anne Lorenz 2018-11-01 14:03:17 +01:00
parent 2d5368e283
commit 7e037a1621
8 changed files with 327 additions and 257 deletions

BagOfWords.py

@@ -9,7 +9,7 @@ vocabulary. As the multinomial Naive Bayes classifier is suitable for
 classification with discrete features (e.g., word counts for text
 classification). The multinomial distribution normally requires integer
 feature counts. However, in practice, fractional counts such as tf-idf may
-also work. => considered by 'relative_word_frequencies' as parameter.
+also work => considered by 'rel_freq' (relative word frequencies) as parameter.
 '''
 from collections import OrderedDict
 import csv
@@ -21,11 +21,14 @@ from nltk.stem.porter import PorterStemmer
 class BagOfWords:
 
-    def fit_transform(X, relative_word_frequencies=True):
+    def fit_transform(corpus, rel_freq=True, stemming=True):
         ''' similar to CountVectorizer's fit_transform method
         '''
-        vocab = BagOfWords.make_vocab(X)
-        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+                                        stemming)
+        return matrix
 
     def extract_words(text, stemming=True):
         '''takes article as argument, removes numbers,
@@ -46,52 +49,25 @@ class BagOfWords:
                 if stemming:
                     # reduce word to its stem
                     word = stemmer.stem(word)
+                # filter out spam chars
+                word = word.replace('â', '').replace('œ', '')\
+                           .replace('ã', '')
                 words_cleaned.append(word)
         return words_cleaned
 
-    # def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
-    #     '''calculates word stem frequencies in input articles. returns
-    #     document term matrix(DataFrame) with relative word frequencies
-    #     (0 <= values < 1) if relative_word_frequencies=True or absolute
-    #     word frequencies (int) if relative_word_frequencies=False.
-    #     (rows: different articles, colums: different words in vocab)
-    #     returns matrix as DataFrame
-    #     '''
-    #     print('# BOW: calculating matrix...')
-    #     print()
-    #     # create list of tuples
-    #     vectors = []
-    #     # for every text in series
-    #     for i in range(len(series)):
-    #         # extract text of single article
-    #         text = series.iloc[i]
-    #         # extract its words
-    #         words = BagOfWords.extract_words(text, stemming)
-    #         # count words in single article
-    #         word_count = len(words)
-    #         vector = []
-    #         for i, v in enumerate(vocab):
-    #             vector.append(0)
-    #             for w in words:
-    #                 if w == v:
-    #                     if relative_word_frequencies:
-    #                         # relative word frequency
-    #                         vector[i] += 1/word_count
-    #                     else:
-    #                         # absolute word frequency
-    #                         vector[i] += 1
-    #         # !!! the MemoryError always happens here: !!!
-    #         # add single vector as tuple
-    #         vectors.append(tuple(vector))
-    #     df_vectors = pd.DataFrame.from_records(vectors,
-    #                                            index=None,
-    #                                            #header=vocab,
-    #                                            columns=vocab)
-    #     return df_vectors
-
-    def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
+    def extract_all_words(corpus, stemming=True):
+        '''param: all articles of corpus
+        returns list of lists of all extracted words, one row per article
+        '''
+        extracted_words = []
+        print('# extracting all words from articles...')
+        print()
+        for text in corpus:
+            extracted_words.append(BagOfWords.extract_words(text, stemming))
+        return extracted_words
+
+    def make_matrix(extracted_words, vocab, rel_freq=True, stemming=True):
         '''calculates word stem frequencies in input articles. returns
         document term matrix(DataFrame) with relative word frequencies
         (0 <= values < 1) if relative_word_frequencies=True or absolute
         word frequencies (int) if relative_word_frequencies=False.
@@ -101,28 +77,38 @@ class BagOfWords:
         '''
         print('# BOW: calculating matrix...')
         print()
+        # total number of words in bag of words
+        word_count = 0
+        print('# counting number of features in corpus...')
+        print()
+        for list in extracted_words:
+            word_count += len(list)
+        # number of articles
+        n_articles = len(extracted_words)
+        # number of words in vocab
+        l_vocab = len(vocab)
         # create zero-filled dataframe
-        array = np.zeros(shape=(len(series),len(vocab)))
+        array = np.zeros(shape=(n_articles, l_vocab))
         df_matrix = pd.DataFrame(array, columns=vocab)
+        print('# calculating frequencies...')
+        print()
         # for every text in series
-        for i in range(len(series)):
-            # extract text of single article
-            text = series.iloc[i]
-            # extract its words
-            words = BagOfWords.extract_words(text, stemming)
-            # count words in article
-            word_count = len(words)
+        for i in range(len(extracted_words)):
+            # extract words of single article
+            words = extracted_words[i]
+            # for every word in global vocab
             for v in vocab:
                 # for every word in article
                 for w in words:
                     # find right position
                     if w == v:
-                        if relative_word_frequencies:
+                        if rel_freq:
                             # relative word frequency
                             df_matrix.loc[i][v] += 1/word_count
                         else:
@@ -131,18 +117,22 @@ class BagOfWords:
         return df_matrix
 
-    def make_vocab(series, stemming=True):
-        '''adds words of input articles to a global vocabulary.
-        input: dataframe of all articles, return value: list of words
+    def make_vocab(extracted_words, stemming=True):
+        '''adds all words to a global vocabulary.
+        input: list of lists of all extracted words, returns: list of words
         '''
         print('# BOW: making vocabulary of data set...')
         print()
         vocab = set()
         # for every article's text
-        for text in series:
-            # add single article's text to total vocabulary
-            vocab |= set(BagOfWords.extract_words(text, stemming))
-        return vocab
+        for e_list in extracted_words:
+            for word in e_list:
+                # add every single word to vocabulary
+                vocab.add(word)
+        print('# vocabulary consists of {} features.'.format(len(vocab)))
+        print()
+        # transform set to list
+        return list(vocab)
 
     def set_stop_words(stemming=True):
         '''creates list of all words that will be ignored
@@ -179,7 +169,7 @@ class BagOfWords:
                       'yourselves']
         #add unwanted terms
-        stop_words.extend(['reuters', 'bloomberg', 'cnn', 'n', 'l', 'â',
+        stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
                            'file', 'photo', 'min', 'read', 'staff', 'left',
                            'right', 'updated', 'minutes', 'brief', 'editing',
                            'reporting', 'ago', 'also', 'would', 'could',
@@ -202,20 +192,23 @@ class BagOfWords:
         # transform list to set to eliminate duplicates
         return set(stop_words)
 
-    def make_dict_common_words(texts, rel_freq=True, stemming=True, n=200):
-        '''texts: df of article texts of complete data set as series,
-        return dict of words with their count.
+    def make_dict_common_words(df_matrix, n=200, rel_freq=True, stemming=True):
+        '''params: DataFrame document term matrix of complete data set,
+        number of n most common words.
+        returns: dict of words with their count.
         '''
+        print('# making dictionary of most common words...')
+        print()
         # words under that rel_freq limit are not included
-        limit = 0.0005
+        # set limit
+        limit = 0.001
         if not rel_freq:
-            limit = 25
+            limit = len(df_matrix) * 0.001
         # word => count
         dict = {}
-        vocab = BagOfWords.make_vocab(texts, stemming)
-        # calculate document term matrix
-        df_matrix = BagOfWords.make_matrix(texts, vocab, rel_freq, stemming)
-        print(df_matrix.shape)
         # iterate over words
         for column in df_matrix:
             # count word mentions in total
@@ -224,16 +217,23 @@ class BagOfWords:
         # sort dict by value and
         o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
                              reverse=True))
-        print(o_dict)
         # return n highest values as dict (word => count)
         n_dict = {}
         for i in range(n):
-            n_dict[o_dict.popitem(last=False)[0]] = o_dict.popitem(last=False)[1]
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]
         return n_dict
 
     def count_features(texts, stemming=True):
+        ''' count total number of features in textual corpus
+        '''
         print('# counting all features in corpus...')
         print()
-        vocab = BagOfWords.make_vocab(texts, True)
+        vocab = BagOfWords.make_vocab(texts, stemming)
         return len(vocab)
 
     def count_all_words(texts):
@@ -244,26 +244,37 @@ class BagOfWords:
             sum += len(text.split())
         return sum
 
+    def test():
+        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        df_dataset = pd.read_csv(file,
+                                 delimiter='|',
+                                 header=None,
+                                 index_col=None,
+                                 engine='python',
+                                 usecols=[1,2],
+                                 nrows=100,
+                                 quoting=csv.QUOTE_NONNUMERIC,
+                                 quotechar='\'')
+        corpus = df_dataset[1] + '. ' + df_dataset[2]
+        stemming = True
+        rel_freq = True
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        #print(vocab)
+        for text in corpus:
+            print(text)
+            print()
+            print()
+        # from here on, a ValueError occurs with nrows=10000...
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
+        print(dict)
+
 if __name__ == '__main__':
-    # load new data set
-    file = 'data\\interactive_labeling_dataset_without_header.csv'
-    df_dataset = pd.read_csv(file,
-                             delimiter='|',
-                             header=None,
-                             index_col=None,
-                             engine='python',
-                             usecols=[1,2],
-                             nrows=3000,
-                             quoting=csv.QUOTE_NONNUMERIC,
-                             quotechar='\'')
-    # find most common words in dataset
-    corpus = df_dataset[1] + '. ' + df_dataset[2]
-    stemming = False
-    rel_freq = False
-    vocab = BagOfWords.make_vocab(corpus, stemming)
-    # print(BagOfWords.make_matrix(corpus, vocab, False, stemming))
-    print(BagOfWords.make_dict_common_words(corpus, rel_freq, stemming, 200))
-    # print(BagOfWords.count_features(corpus))
+    for word in sorted(BagOfWords.set_stop_words(False)):
+        print(word)
+        print()
+        print(PorterStemmer().stem(word))
+        print()
+    # BagOfWords.test()
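As a quick orientation for the reworked interface above, here is a minimal usage sketch. It assumes BagOfWords.py as changed in this commit is importable; the two-sentence corpus and the value n=5 are made up for illustration, no CSV file is needed.

from BagOfWords import BagOfWords

corpus = ['Company A acquires company B.',
          'Company C announces its quarterly results.']
stemming = True
rel_freq = True

# step by step: extract stems once, build the vocabulary, then the matrix
extracted_words = BagOfWords.extract_all_words(corpus, stemming)
vocab = BagOfWords.make_vocab(extracted_words, stemming)
matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)

# or in one call, mirroring CountVectorizer's fit_transform
matrix = BagOfWords.fit_transform(corpus, rel_freq, stemming)

# five most frequent stems taken from the document term matrix
print(BagOfWords.make_dict_common_words(matrix, 5, rel_freq, stemming))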

CosineSimilarity.py

@@ -4,74 +4,77 @@ Cosine Similarity
 CosineSimilarity measures the similarity between two articles.
 It calculates c: the cosine of the angle between the articles
-vectors dict_1 and dict_2.
-c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
+vectors text_1 and text_2.
+c = (text_1 * text_2) / (|text_1| * |text_2|).
 c = 1, if articles are equal => identicalness is 100%
 0 < c < 1, else => identicalness is (c*100)%
 (The greater c, the more similar two articles are.)
 '''
 
-#TODO:uses dictionaries of each article
-#=>ToDo:has to be changed as we are now using vectors
+from BagOfWords import BagOfWords
+
+import csv
 import math
-
-from BagOfWords import BagOfWords
+import pandas as pd
 
 class CosineSimilarity:
 
-    def cos_sim(dict_1, dict_2):
+    def calc_similarity(text_1, text_2, rel_freq=True, stemming=True):
+        ''' calculates cosine similarity of two input articles
+        '''
+        print('# calculating cosine similarity...')
+        print()
 
-        # list of all different words
-        vocab = []
-
-        # insert words of 1st article into vocab
-        for key in dict_1.keys():
-            if key not in vocab:
-                vocab.append(key)
-
-        # insert words of 2nd article into vocab
-        for key in dict_2.keys():
-            if key not in vocab:
-                vocab.append(key)
-
-        # delete first entry ('sum_words')
-        vocab.pop(0)
+        # extract words from articles
+        extracted_words_1 = BagOfWords.extract_words(text_1, stemming)
+        extracted_words_2 = BagOfWords.extract_words(text_2, stemming)
+        print(extracted_words_1)
+        print(extracted_words_2)
+
+        # insert words into vocab
+        both_extracted = []
+        both_extracted.append(extracted_words_1)
+        both_extracted.append(extracted_words_2)
+        vocab = BagOfWords.make_vocab(both_extracted, stemming)
 
         # create vectors
-        vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
-        vector_2 = CosineSimilarity.create_vector(dict_2, vocab)
+        matrix = BagOfWords.make_matrix(both_extracted, vocab,\
+                                        rel_freq, stemming)
 
         # start calculation
         # calculate numerator of formula
         sum_1 = 0
-        for i in range (0,len(vector_1)):
-            sum_1 += vector_1[i] * vector_2[i]
+        for i in range (0,len(matrix.iloc[0])):
+            sum_1 += matrix.iloc[0][i] * matrix.iloc[1][i]
 
         # calculate denominator of formula
         sum_2 = 0
-        for entry in vector_1:
+        for entry in matrix.iloc[0]:
             sum_2 += entry ** 2
 
         sum_3 = 0
-        for entry in vector_2:
+        for entry in matrix.iloc[1]:
             sum_3 += entry ** 2
 
         return sum_1 / (math.sqrt(sum_2) * math.sqrt(sum_3))
 
-    def create_vector(dict, vocab):
-        # word frequency vector
-        vector = []
-        for word in vocab:
-            # check if word occurs in article
-            if word in dict:
-                # insert word count
-                vector.append(dict[word])
-            else:
-                # insert zero
-                vector.append(0)
-        # delete first entry ('sum_words')
-        vector.pop(0)
-        return vector
+if __name__ == '__main__':
+    # read data set
+    file = 'data\\interactive_labeling_dataset_without_header.csv'
+    df = pd.read_csv(file,
+                     delimiter='|',
+                     header=None,
+                     index_col=None,
+                     engine='python',
+                     usecols=[1,2],
+                     nrows=100,
+                     quoting=csv.QUOTE_NONNUMERIC,
+                     quotechar='\'')
+
+    texts = df[1] + '. ' + df[2]
+
+    # compare first and second article in data set
+    print(CosineSimilarity.calc_similarity(texts.iloc[0], texts.iloc[1],\
+                                           rel_freq=True, stemming=True))
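The arithmetic calc_similarity performs on the two matrix rows is the plain cosine formula from the module docstring. As a sanity check, here it is on two hypothetical four-word count vectors (values chosen only for illustration):

import math

vector_1 = [2, 1, 0, 1]   # hypothetical word counts of article 1
vector_2 = [1, 1, 1, 0]   # hypothetical word counts of article 2

numerator = sum(a * b for a, b in zip(vector_1, vector_2))   # 3
norm_1 = math.sqrt(sum(a ** 2 for a in vector_1))            # sqrt(6)
norm_2 = math.sqrt(sum(b ** 2 for b in vector_2))            # sqrt(3)

print(numerator / (norm_1 * norm_2))                         # ~0.7071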

DecisionTree.py

@@ -7,6 +7,9 @@ array X of size [n_samples, n_features], holding the training samples,
 and array y of integer values, size [n_samples],
 holding the class labels for the training samples.
 '''
+# toDo: replace old dataset!!!
+# CountVectorizer does not work yet
+
 from BagOfWords import BagOfWords
 
 import csv
@@ -16,21 +19,22 @@ import graphviz
 import numpy as np
 import pandas as pd
 from sklearn import tree
-#from sklearn.feature_extraction.text import CountVectorizer
+# from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import f1_score
 from sklearn.model_selection import StratifiedKFold
 
 class DecisionTree:
 
-    def make_tree(dataset):
+    def make_tree(dataset, sklearn_cv=False, stemming=False, percentile=100):
         print('# fitting model')
         print('# ...')
 
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
 
-        #count_vector = CountVectorizer()
+        if sklearn_cv:
+            cv = CountVectorizer()
 
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)
@@ -45,33 +49,48 @@ class DecisionTree:
         important_words = {}
 
         # for each fold
+        n = 0
         for train, test in skf.split(X,y):
-            # BOW
-            vocab = BagOfWords.make_vocab(X[train])
-            # fit the training data and then return the matrix
-            training_data = BagOfWords.make_matrix(X[train], vocab)
-            # transform testing data and return the matrix
-            testing_data = BagOfWords.make_matrix(X[test], vocab)
-
-            # #fit the training data and then return the matrix
-            # training_data = count_vector.fit_transform(X[train], y[train]).toarray()
-            # #transform testing data and return the matrix
-            # testing_data = count_vector.transform(X[test]).toarray()
-
-            # # apply select percentile
-            # selector = SelectPercentile(percentile=25)
-            # selector.fit(training_data, y[train])
-
-            # training_data_r = selector.transform(training_data)
-            # testing_data_r = selector.transform(testing_data)
+            n += 1
+            vocab = []
+            print('# split no. ' + str(n))
+
+            if sklearn_cv:
+                # use sklearn CountVectorizer
+                # fit the training data and then return the matrix
+                training_data = cv.fit_transform(X[train], y[train]).toarray()
+                # transform testing data and return the matrix
+                testing_data = cv.transform(X[test]).toarray()
+            else:
+                # use my own BagOfWords python implementation
+                rel_freq = True
+                extracted_words = BagOfWords.extract_all_words(X[train], stemming)
+                vocab = BagOfWords.make_vocab(extracted_words, stemming)
+                print(vocab)
+
+                # fit the training data and then return the matrix
+                training_data = BagOfWords.make_matrix(extracted_words,
+                                                       vocab, rel_freq, stemming)
+                # transform testing data and return the matrix
+                extracted_words = BagOfWords.extract_all_words(X[test], stemming)
+                testing_data = BagOfWords.make_matrix(extracted_words,
+                                                      vocab, rel_freq, stemming)
+
+            # apply select percentile
+            selector = SelectPercentile(percentile=percentile)
+            selector.fit(training_data, y[train])
+
+            # new reduced data sets
+            training_data_r = selector.transform(training_data)
+            testing_data_r = selector.transform(testing_data)
 
             # fit classifier
-            classifier.fit(training_data, y[train])
+            classifier.fit(training_data_r, y[train])
             #predict class
-            predictions_train = classifier.predict(training_data)
-            predictions_test = classifier.predict(testing_data)
+            predictions_train = classifier.predict(training_data_r)
+            predictions_test = classifier.predict(testing_data_r)
 
             #store metrics predicted on test/train set
             f1_scores.append(f1_score(y[test], predictions_test))
@@ -80,6 +99,7 @@ class DecisionTree:
             # search for important features
             feature_importances = np.array(classifier.feature_importances_)
             important_indices = feature_importances.argsort()[-50:][::-1]
+            print(important_indices)
 
             for i in important_indices:
                 if vocab[i] in important_words:
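The important-words bookkeeping at the end of this loop relies on feature_importances_ lining up with the vocabulary order. A small standalone sketch of that argsort idiom, with a hypothetical vocabulary and importance vector instead of a fitted tree:

import numpy as np

# hypothetical vocabulary and feature importances from a fitted tree
vocab = ['merger', 'weather', 'acquire', 'football', 'deal']
feature_importances = np.array([0.40, 0.05, 0.30, 0.00, 0.25])

# indices of the highest-scoring features, largest first (same idiom as above)
important_indices = feature_importances.argsort()[-3:][::-1]
print([vocab[i] for i in important_indices])   # ['merger', 'acquire', 'deal']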

FilterKeywords.py

@@ -6,6 +6,8 @@ FilterKeywords searches for merger specific keywords
 in an article and counts them.
 '''
 
+# toDo: replace dict by vector/matrix
+
 from collections import defaultdict
 
 import re
@@ -18,14 +20,6 @@ class FilterKeywords:
         output are the contained keywords and their count.
         '''
 
-        # # list of regular expressions that match merger specific keywords
-        # regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',
-        #               r'business combinations?', r'combined compan(y|ies)',
-        #               r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up',
-        #               r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?',
-        #               r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?',
-        #               r'purchase', r'(sell(s|ers?|ing)?|sold)']
-
         keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
                         'acquisition', 'acquire', 'acquisitions', 'acquires',
                         'combine', 'combines', 'combination', 'combined',
@@ -44,22 +38,22 @@ class FilterKeywords:
         # remove duplicates
         keywords = set(keyword_list)
 
-        # counts keywords in article (default value: 0)
-        dict_keywords = defaultdict(int)
-
-        # search for matchings in dictionary of input article
-        for key in dict_input.keys():
-            # iterate over all regular expressions
-            for kword in keywords:
-                if re.match(kword, key):
-                    # if match, increase value of matching key
-                    if str(kword) in dict_keywords:
-                        dict_keywords[str(kword)] += dict_input[key]
-                    else:
-                        dict_keywords[str(kword)] = dict_input[key]
-
-        return dict_keywords
+        # # counts keywords in article (default value: 0)
+        # dict_keywords = defaultdict(int)
+
+        # # search for matchings in dictionary of input article
+        # for key in dict_input.keys():
+        #     # iterate over all regular expressions
+        #     for kword in keywords:
+        #         if re.match(kword, key):
+        #             # if match, increase value of matching key
+        #             if str(kword) in dict_keywords:
+        #                 dict_keywords[str(kword)] += dict_input[key]
+        #             else:
+        #                 dict_keywords[str(kword)] = dict_input[key]
+
+        # return dict_keywords
 
 if __name__ == '__main__':
 
-    dict_test={'example':2, 'combined':5, 'sells':3}
-    print(FilterKeywords.search_keywords(dict_test))
+    # dict_test={'example':2, 'combined':5, 'sells':3}
+    # print(FilterKeywords.search_keywords(dict_test))

NaiveBayes.py

@@ -25,7 +25,7 @@ from sklearn.naive_bayes import GaussianNB
 class NaiveBayes:
 
-    def make_naive_bayes(dataset):
+    def make_naive_bayes(dataset, sklearn_cv=True, percentile=100):
         '''fits naive bayes model with StratifiedKFold,
         uses my BOW
         '''
@@ -34,10 +34,11 @@ class NaiveBayes:
         # split data into text and label set
         # join title and text
-        X = dataset['Title'] + ' ' + dataset['Text']
+        X = dataset['Title'] + '. ' + dataset['Text']
         y = dataset['Label']
 
-        cv = CountVectorizer()
+        if sklearn_cv:
+            cv = CountVectorizer()
 
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
@@ -61,23 +62,32 @@ class NaiveBayes:
             n += 1
             print('# split no. ' + str(n))
 
-            # # my own BOW
-            # vocab = BagOfWords.make_vocab(X[train])
-            # # fit the training data and then return the matrix
-            # training_data = BagOfWords.make_matrix(X[train], vocab)
-            # # transform testing data and return the matrix
-            # testing_data = BagOfWords.make_matrix(X[test], vocab)
-
-            # using CountVectorizer:
-            # fit the training data and then return the matrix
-            training_data = cv.fit_transform(X[train], y[train]).toarray()
-            # transform testing data and return the matrix
-            testing_data = cv.transform(X[test]).toarray()
+            if sklearn_cv:
+                # use sklearn CountVectorizer
+                # fit the training data and then return the matrix
+                training_data = cv.fit_transform(X[train], y[train]).toarray()
+                # transform testing data and return the matrix
+                testing_data = cv.transform(X[test]).toarray()
+            else:
+                # use my own BagOfWords python implementation
+                stemming = True
+                rel_freq = True
+                extracted_words = BagOfWords.extract_all_words(X[train])
+                vocab = BagOfWords.make_vocab(extracted_words)
+
+                # fit the training data and then return the matrix
+                training_data = BagOfWords.make_matrix(extracted_words,
+                                                       vocab, rel_freq, stemming)
+                # transform testing data and return the matrix
+                extracted_words = BagOfWords.extract_all_words(X[test])
+                testing_data = BagOfWords.make_matrix(extracted_words,
+                                                      vocab, rel_freq, stemming)
 
             # apply select percentile
-            selector = SelectPercentile(percentile=100)
+            selector = SelectPercentile(percentile=percentile)
             selector.fit(training_data, y[train])
 
+            # new reduced data sets
             training_data_r = selector.transform(training_data)
             testing_data_r = selector.transform(testing_data)
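A self-contained sketch of the per-fold pattern used by the sklearn_cv branch (vectorizer fitted on the training split only, SelectPercentile applied before GaussianNB). The toy texts, labels and fold count are invented for illustration and stand in for the real data set:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB

X = np.array(['merger approved', 'quarterly results out',
              'company acquires rival', 'weather was sunny'] * 5)
y = np.array([1, 0, 1, 0] * 5)

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=5)
for train, test in skf.split(X, y):
    cv = CountVectorizer()
    # fit the vectorizer on the training fold, only transform the test fold
    training_data = cv.fit_transform(X[train]).toarray()
    testing_data = cv.transform(X[test]).toarray()
    # keep the top percentile of features (100 keeps everything)
    selector = SelectPercentile(percentile=100)
    selector.fit(training_data, y[train])
    classifier = GaussianNB()
    classifier.fit(selector.transform(training_data), y[train])
    print(classifier.predict(selector.transform(testing_data)))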

NaiveBayes_Interactive.py

@@ -10,13 +10,14 @@ import csv
 import pandas as pd
 
 from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import recall_score, precision_score
 from sklearn.model_selection import StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 
 class NaiveBayes_Interactive:
 
-    def make_naive_bayes(dataset):
+    def make_naive_bayes(dataset, sklearn_cv=True, percentile=100):
         '''fits naive bayes model
         '''
         print('# fitting model')
@@ -24,10 +25,11 @@ class NaiveBayes_Interactive:
         # split data into text and label set
         # join title and text
-        X = dataset['Title'] + ' ' + dataset['Text']
+        X = dataset['Title'] + '. ' + dataset['Text']
         y = dataset['Label']
 
-        cv = CountVectorizer()
+        if sklearn_cv:
+            cv = CountVectorizer()
 
         # stratified k-fold cross-validation as split method
         kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)
@@ -51,17 +53,40 @@ class NaiveBayes_Interactive:
             n += 1
             print('# split no. ' + str(n))
 
-            # using CountVectorizer:
-            # fit the training data and then return the matrix
-            training_data = cv.fit_transform(X[train], y[train]).toarray()
-            # transform testing data and return the matrix
-            testing_data = cv.transform(X[test]).toarray()
+            if sklearn_cv:
+                # use sklearn CountVectorizer
+                # fit the training data and then return the matrix
+                training_data = cv.fit_transform(X[train], y[train]).toarray()
+                # transform testing data and return the matrix
+                testing_data = cv.transform(X[test]).toarray()
+            else:
+                # use my own BagOfWords python implementation
+                stemming = True
+                rel_freq = True
+                extracted_words = BagOfWords.extract_all_words(X[train])
+                vocab = BagOfWords.make_vocab(extracted_words)
+
+                # fit the training data and then return the matrix
+                training_data = BagOfWords.make_matrix(extracted_words,
+                                                       vocab, rel_freq, stemming)
+                # transform testing data and return the matrix
+                extracted_words = BagOfWords.extract_all_words(X[test])
+                testing_data = BagOfWords.make_matrix(extracted_words,
+                                                      vocab, rel_freq, stemming)
+
+            # apply select percentile
+            selector = SelectPercentile(percentile=percentile)
+            selector.fit(training_data, y[train])
+
+            # new reduced data sets
+            training_data_r = selector.transform(training_data)
+            testing_data_r = selector.transform(testing_data)
 
             #fit classifier
-            classifier.fit(training_data, y[train])
+            classifier.fit(training_data_r, y[train])
             #predict class
-            predictions_train = classifier.predict(training_data)
-            predictions_test = classifier.predict(testing_data)
+            predictions_train = classifier.predict(training_data_r)
+            predictions_test = classifier.predict(testing_data_r)
 
             #print and store metrics
             rec = recall_score(y[test], predictions_test)
@@ -166,7 +191,9 @@ class NaiveBayes_Interactive:
                        quotechar='\'',
                        quoting=csv.QUOTE_NONE)
 
-    make_naive_bayes(data)
+    use_count_vectorizer = True
+    select_percentile = 100
+    make_naive_bayes(data, use_count_vectorizer, select_percentile)
 
     print('#')
     print('# ending naive bayes')

SVM.py

@@ -27,7 +27,7 @@ from sklearn.svm import SVC
 class SVM:
 
-    def make_svm(dataset):
+    def make_svm(dataset, sklearn_cv=True):
         print('# fitting model')
         print('# ...')
@@ -35,16 +35,18 @@ class SVM:
         # split data into text and label set
         # articles' text (title + text)
-        X = dataset['Title'] + ' ' + dataset['Text']
+        X = dataset['Title'] + '. ' + dataset['Text']
         # articles' labels
         y = dataset['Label']
+        matrix = pd.DataFrame()
 
         # Bag of Words
         print('# calculating bag of words')
         print('# ...')
         # fit the training data and then return the matrix
-        #X = BagOfWords.fit_transform(X)
-        X = CountVectorizer().fit_transform(X).toarray()
+        if sklearn_cv:
+            # use sklearn CountVectorizer
+            matrix = CountVectorizer().fit_transform(X).toarray()
+        else:
+            # use own BOW implementation
+            matrix = BagOfWords.fit_transform(X)
 
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)
@@ -64,7 +66,7 @@ class SVM:
         print('# fit classifier')
         print('# ...')
-        grid.fit(X,y)
+        grid.fit(matrix,y)
 
         # DataFrame of results
         df_results = grid.cv_results_
@@ -104,6 +106,7 @@ class SVM:
                        quotechar='\'',
                        quoting=csv.QUOTE_NONE)
 
-    make_svm(data)
+    use_count_vectorizer = True
+    make_svm(data, use_count_vectorizer)
 
     print('# ending svm')

VisualizerNews.py

@@ -22,7 +22,7 @@ class VisualizerNews:
     def plot_wordcloud_dataset():
         '''plots word cloud image of most common words in dataset.
         '''
-        print('# preparing word cloud...')
+        print('# preparing word cloud of 200 most common words...')
         print()
 
         # load new data set
         file = 'data\\interactive_labeling_dataset_without_header.csv'
@@ -32,17 +32,18 @@ class VisualizerNews:
                                  index_col=None,
                                  engine='python',
                                  usecols=[1,2],
-                                 #nrows=100,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
 
-        corpus = df_dataset[1] + ' ' + df_dataset[2]
+        corpus = df_dataset[1] + '. ' + df_dataset[2]
+        stemming = False
+        rel_freq = False
 
         # find most common words in dataset
-        dict = BagOfWords.make_dict_common_words(corpus,
-                                                 rel_freq=True,
-                                                 stemming=False,
-                                                 n=200)
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, 200, rel_freq, stemming)
 
         wordcloud = WordCloud(background_color='white',
                               width=2400,
@@ -62,30 +63,25 @@ class VisualizerNews:
         x-axis: number of mentions of the company
         y-axis: frequency
         '''
-        print('# preparing histogram...')
+        print('# preparing histogram of company mentions...')
         print()
 
-        # old data set
-        filepath = 'data\\classification_labelled_corrected.csv'
-        df = pd.read_csv(filepath,
-                         sep='|',
+        # read data set
+        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        df = pd.read_csv(file,
+                         delimiter='|',
+                         header=None,
+                         index_col=None,
                          engine='python',
-                         decimal='.',
-                         quotechar='\'',
-                         quoting=csv.QUOTE_NONE)
-
-        # only articles with label==1
-        df_hits = df[df['Label'] == 1]
-
-        texts = df_hits['Title'] + '. ' + df_hits['Text']
-
-        # # read for checking
-        # for text in texts[10:20]:
-        #     print(text)
-        #     print()
-        #     print(NER.find_companies(text))
-        #     print()
-
-        # count names in hit articles
+                         usecols=[1,2],
+                         quoting=csv.QUOTE_NONNUMERIC,
+                         quotechar='\'')
+
+        # # only articles with label==1
+        # df_hits = df[df['Label'] == 1]
+        # texts = df_hits['Title'] + '. ' + df_hits['Text']
+        texts = df[1] + '. ' + df[2]
+
+        # dict: count articles with company names
         count_names = NER.count_companies(texts)
 
         # sort list in descending order
@@ -98,7 +94,7 @@ class VisualizerNews:
         plt.ylabel('Number of companies with this number of articles')
         num_bins = 50
         n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
-        # plt.grid(True)
+        plt.axis([0, 50, 0, 1000])
         plt.show()
 
     def plot_histogram_text_lengths():
@@ -106,20 +102,21 @@ class VisualizerNews:
         x-axis: number of characters in article (without headline)
         y-axis: frequency
         '''
-        print('# preparing histogram...')
+        print('# preparing histogram of text lengths...')
         print()
 
-        # new data set
+        # read data set
         filepath = 'data\\interactive_labeling_dataset.csv'
         df_dataset = pd.read_csv(filepath,
                                  delimiter='|',
                                  header=0,
                                  index_col=None,
                                  engine='python',
+                                 usecols=[2],
                                  #nrows=100,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
 
         # consider only Text, not Headline
-        texts = df_dataset['Text']
+        texts = df_dataset[2]
 
         # count characters in articles
         print('# counting characters in articles...')
@@ -150,7 +147,7 @@ class VisualizerNews:
     def plot_pie_chart_of_sites():
 
-        print('# preparing pie chart...')
+        print('# preparing pie chart of news article sites...')
         print()
 
         # load data set
@@ -164,13 +161,15 @@ class VisualizerNews:
                                  #nrows=100,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
 
+        # find all different sites
         df_counts = df_dataset.groupby('Site').count()
+        # count occurrences of each site
         df_counts = df_counts.sort_values(['Url'], ascending=False)
 
         fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))
 
         data = list(df_counts['Url'])
+        # legend labels
         labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
                   'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']
@@ -188,14 +187,14 @@ class VisualizerNews:
         plt.show()
 
     def plot_hist_most_common_words(n_commons = 10):
-        print('# preparing histogram...')
+        print('# preparing histogram of most common words...')
         print()
 
         # load data set
         filepath = 'data\\interactive_labeling_dataset_without_header.csv'
         df_dataset = pd.read_csv(filepath,
                                  delimiter='|',
                                  header=None,
-                                 #usecols=[1,2],
+                                 usecols=[1,2],
                                  index_col=None,
                                  engine='python',
                                  #nrows=1000,
@@ -204,11 +203,14 @@ class VisualizerNews:
         corpus = df_dataset[1] + '. ' + df_dataset[2]
 
+        stemming = False
+        rel_freq = True
+
         # find most common words in dataset
-        dict = BagOfWords.make_dict_common_words(corpus,
-                                                 rel_freq=True,
-                                                 stemming=False,
-                                                 n=n_commons)
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq, stemming)
 
         plt.xlabel('Most common words in textual corpus')
         plt.ylabel('Relative frequency')
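For context, the dictionary returned by make_dict_common_words can be handed straight to the word cloud used in this file. A minimal sketch, assuming the usual wordcloud/matplotlib APIs; the frequency dict and the width/height values here are invented for illustration:

import matplotlib.pyplot as plt
from wordcloud import WordCloud

# hypothetical output of BagOfWords.make_dict_common_words(matrix, 4, ...)
dict_common = {'trade': 0.012, 'merger': 0.010, 'share': 0.008, 'deal': 0.007}

wordcloud = WordCloud(background_color='white',
                      width=2400,
                      height=1200).generate_from_frequencies(dict_common)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()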