initial project version
commit ecb629e16c
BagOfWords.py
@ -0,0 +1,130 @@
'''
Bag Of Words
============

BagOfWords counts word stems in an article
and adds new words to the global vocabulary.
'''

import re

import pandas as pd

from nltk.stem.porter import PorterStemmer


class BagOfWords():

    def extract_words(text):
        '''takes article as argument, removes numbers and stop words,
        returns list of single word stems, recurrences included.
        '''
        stop_words = BagOfWords.set_stop_words()
        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split str into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            # remove numbers
            if word.isalpha():
                # reduce word to stem
                word = BagOfWords.reduce_word_to_stem(word)
                # check if not stop word
                if word.lower() not in stop_words:
                    # add every word in lowercase
                    words_cleaned.append(word.lower())
        return words_cleaned

    def reduce_word_to_stem(word):
        '''takes normal word as input, returns the word's stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
        word = stemmer.stem(word)
        return word

    def make_matrix(series, vocab):
        '''calculates word stem frequencies in input articles.
        returns matrix (DataFrame) with relative word frequencies (0 <= values <= 1)
        (rows: different articles, columns: different words in vocab)
        '''
        # create list of tuples
        vectors = []
        for i in range(len(series)):
            # extract text of single article
            text = series.iloc[i]
            # extract its words
            words = BagOfWords.extract_words(text)
            # count words in single article
            word_count = len(words)
            vector = []
            # use a separate index (j) so the outer loop variable is not shadowed
            for j, v in enumerate(vocab):
                vector.append(0)
                for w in words:
                    if w == v:
                        # add relative word frequency
                        vector[j] += 1/word_count
            # add single vector as tuple
            vectors.append(tuple(vector))
        df_vectors = pd.DataFrame.from_records(vectors, index=None, columns=vocab)
        return df_vectors

    def make_vocab(series):
        '''adds words of input articles to a global vocabulary.
        input: series of all articles, return value: sorted list of word stems
        '''
        vocab = set()
        for text in series:
            vocab |= set(BagOfWords.extract_words(text))
        # transform to list
        vocab = list(vocab)
        # sort list
        vocab.sort()
        return vocab

    def set_stop_words():
        '''creates set of all words that will be ignored
        '''
        # standard stopwords from nltk.corpus stopwords('english')
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'ain',
                      'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'aren\'t',
                      'as', 'at', 'be', 'because', 'been', 'before', 'being',
                      'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
                      'couldn\'t', 'd', 'did', 'didn', 'didn\'t', 'do', 'does',
                      'doesn', 'doesn\'t', 'doing', 'don', 'don\'t', 'down',
                      'during', 'each', 'few', 'for', 'from', 'further', 'had',
                      'hadn', 'hadn\'t', 'has', 'hasn', 'hasn\'t', 'have', 'haven',
                      'haven\'t', 'having', 'he', 'her', 'here', 'hers', 'herself',
                      'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just', 'll',
                      'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more', 'most',
                      'mustn', 'mustn\'t', 'my', 'myself', 'needn', 'needn\'t',
                      'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once',
                      'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
                      'over', 'own', 're', 's', 'same', 'shan', 'shan\'t', 'she',
                      'she\'s', 'should', 'should\'ve', 'shouldn', 'shouldn\'t',
                      'so', 'some', 'such', 't', 'than', 'that', 'that\'ll', 'the',
                      'their', 'theirs', 'them', 'themselves', 'then', 'there',
                      'these', 'they', 'this', 'those', 'through', 'to', 'too',
                      'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'wasn\'t',
                      'we', 'were', 'weren', 'weren\'t', 'what', 'when', 'where',
                      'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
                      'won\'t', 'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves']

        # add domain specific words
        stop_words.extend(['reuters', 'also', 'monday', 'tuesday', 'wednesday',
                           'thursday', 'friday'])

        # remove the word 'not' from stop words
        stop_words.remove('not')

        for i in range(len(stop_words)):
            # remove punctuation marks and strip endings from abbreviations
            #stop_words[i] = re.split(r'\W', stop_words[i])[0]
            # reduce word to stem
            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
        # transform list to set to eliminate duplicates
        stop_words = set(stop_words)

        return stop_words
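A minimal usage sketch of the BagOfWords interface (illustrative only, not part of this commit; the two sample articles are assumptions):

# sketch: build a vocabulary and a relative-frequency matrix from two sample articles
import pandas as pd
from BagOfWords import BagOfWords

articles = pd.Series([
    'Company A agrees to buy Company B in a takeover deal.',
    'Markets were quiet and shares traded sideways.',
])
vocab = BagOfWords.make_vocab(articles)            # sorted list of word stems
matrix = BagOfWords.make_matrix(articles, vocab)   # one row per article, one column per stem
print(matrix.shape)                                # (2, len(vocab))
# each row sums to 1 here because the vocabulary was built from the same articles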
CsvHandler.py
@ -0,0 +1,28 @@
'''
Csv Handler
===========

CsvHandler writes articles' information to a csv file and reads it.
'''

import csv

import pandas as pd


class CsvHandler():

    def read_csv(csv_file):
        df = pd.read_csv(csv_file,
                         sep='|',
                         header=0,
                         engine='python',
                         usecols=[1,2,4], # use only 'Title', 'Text' and 'Label'
                         decimal='.',
                         quotechar='\'',
                         #nrows = 200,
                         quoting=csv.QUOTE_NONE)
        return df

    def write_csv(df, file_name):
        df.to_csv(file_name, sep='|')
        print('### saved {} articles in {}'.format(len(df), file_name))
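A hedged round-trip sketch for CsvHandler; the file name, column layout, and sample rows are assumptions for illustration. Since write_csv stores the DataFrame index as column 0, 'Title', 'Text' and 'Label' land at positions 1, 2 and 4 in this layout, matching usecols=[1,2,4]:

# sketch: write a small '|'-separated file and read back Title, Text, Label
import pandas as pd
from CsvHandler import CsvHandler

df = pd.DataFrame({'Title': ['Deal announced', 'Quiet session'],
                   'Text': ['Company A buys Company B.', 'Shares barely moved.'],
                   'Site': ['example.com', 'example.com'],
                   'Label': [1, 0]})
CsvHandler.write_csv(df, 'articles.csv')           # hypothetical file name
subset = CsvHandler.read_csv('articles.csv')       # -> columns Title, Text, Label
print(list(subset.columns))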
DecisionTree.py
@ -0,0 +1,112 @@
'''
Decision Tree Classifier
========================

Decision Tree Classifier takes as input two arrays:
array X of size [n_samples, n_features], holding the training samples,
and array y of integer values, size [n_samples],
holding the class labels for the training samples.
'''
import operator

from BagOfWords import BagOfWords
from CsvHandler import CsvHandler

import graphviz
import numpy as np
from sklearn import tree
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold


class DecisionTree():

    def make_tree(dataset):

        print('# starting decision tree')
        print()

        # note: using only the title gives better results, but different important words
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        #count_vector = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        # lists for metrics predicted on test/train set
        f1_scores = []
        f1_scores_train = []

        classifier = tree.DecisionTreeClassifier()

        # dict of most important words of each fold
        important_words = {}

        # for each fold
        for train, test in skf.split(X, y):

            # BOW
            vocab = BagOfWords.make_vocab(X[train])
            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(X[train], vocab)
            # transform testing data and return the matrix
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            # #fit the training data and then return the matrix
            # training_data = count_vector.fit_transform(X[train], y[train]).toarray()
            # #transform testing data and return the matrix
            # testing_data = count_vector.transform(X[test]).toarray()

            # # apply select percentile
            # selector = SelectPercentile(percentile=25)
            # selector.fit(training_data, y[train])

            # training_data_r = selector.transform(training_data)
            # testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data, y[train])

            # predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            # store metrics predicted on test/train set
            f1_scores.append(f1_score(y[test], predictions_test))
            f1_scores_train.append(f1_score(y[train], predictions_train))

            # search for important features
            feature_importances = np.array(classifier.feature_importances_)
            important_indices = feature_importances.argsort()[-50:][::-1]

            for i in important_indices:
                if vocab[i] in important_words:
                    important_words[vocab[i]] += feature_importances[i]
                else:
                    important_words[vocab[i]] = feature_importances[i]

        print('20 most important words in training set:')
        print()
        # sort by accumulated importance, descending, so the first 20 are the most important
        sorted_i_w = sorted(important_words.items(), key=operator.itemgetter(1), reverse=True)
        #print(sorted_i_w[:20])
        i_w = [x[0] for x in sorted_i_w]
        print(i_w[:20])

        print()

        # print metrics of test set
        print('prediction of testing set:')
        print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
              format(min(f1_scores), max(f1_scores),
                     sum(f1_scores)/float(len(f1_scores))))
        print()
        # print('overfit testing: prediction of training set')
        # print('F1 score: min = {}, max = {}, average = {}'.
        #       format(min(f1_scores_train), max(f1_scores_train),
        #              sum(f1_scores_train)/float(len(f1_scores_train))))
        # print()

        print('# ending decision tree')
        print()
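A hypothetical driver sketch showing how make_tree is meant to be called; the csv file name is an assumption, and the file must provide 'Title', 'Text' and 'Label' columns:

# sketch: load labelled articles and run the decision tree experiment
from CsvHandler import CsvHandler
from DecisionTree import DecisionTree

dataset = CsvHandler.read_csv('labelled_articles.csv')   # hypothetical file name
DecisionTree.make_tree(dataset)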
FilterKeywords.py
@ -0,0 +1,59 @@
'''
Filter Keywords
===============

FilterKeywords searches for merger-specific keywords
in an article and counts them.
'''

import re

from nltk.stem.porter import PorterStemmer


class FilterKeywords():

    def search_keywords(dict_input):
        '''extracts relevant key-value pairs from an article's input dictionary.
        output are the contained keywords and their count.
        '''

        keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers', 'acquisition',
                        'acquire', 'acquisitions', 'acquires', 'combine', 'combines',
                        'combination', 'combined', 'joint', 'venture', 'JV', 'takeover',
                        'take-over', 'tie-up', 'deal', 'deals', 'transaction', 'transactions',
                        'approve', 'approves', 'approved', 'approving', 'approval',
                        'approvals', 'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',
                        'purchase', 'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']

        # reduce words to stem
        stemmer = PorterStemmer()
        for i in range(len(keyword_list)):
            keyword_list[i] = stemmer.stem(keyword_list[i])

        # remove duplicates
        keywords = set(keyword_list)

        # counts of keywords in article
        dict_keywords = {}

        # search for matches in dictionary of input article
        for key in dict_input.keys():
            # iterate over all keyword stems, used as regular expressions
            for kword in keywords:
                if re.match(kword, key):
                    # if match, increase value of matching key
                    if str(kword) in dict_keywords:
                        dict_keywords[str(kword)] += dict_input[key]
                    else:
                        dict_keywords[str(kword)] = dict_input[key]

        return dict_keywords

    def count_keywords(dict_keywords):
        '''input: dict with article's keywords (key) and their count (value).
        returns number of keywords that are found.
        '''
        return sum(dict_keywords.values())
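A small illustration of search_keywords on an assumed word-count dictionary (the counts are made up); keys are matched against the stemmed keyword list, so e.g. 'acquire' and 'acquisitions' reduce to the stems 'acquir' and 'acquisit':

# sketch: count merger keywords in a word-frequency dict of one article
from FilterKeywords import FilterKeywords

word_counts = {'acquir': 3, 'market': 5, 'buy': 1, 'stock': 2}
found = FilterKeywords.search_keywords(word_counts)
print(found)                                    # e.g. {'acquir': 3, 'buy': 1}
print(FilterKeywords.count_keywords(found))     # 4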
NaiveBayes.py
@ -0,0 +1,191 @@
'''
Naive Bayes Classifier
======================

Naive Bayes is a probabilistic classifier that is able to predict,
given an observation of an input, a probability distribution over a set of classes,
rather than only outputting the most likely class that the observation should belong to.
'Naive' means that it assumes that the value of a particular feature
(word in an article) is independent of the value of any other feature,
given the class variable (label). Each feature is considered
to contribute independently to the probability that an article belongs to its category,
regardless of any possible correlations between these features.
'''
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


# toDo: for Julian, try it without SelectPercentile first


class NaiveBayes():

    def make_naive_bayes(dataset):
        '''fits naive bayes model with StratifiedKFold, uses my BOW
        '''

        print('# starting naive bayes')
        print()

        # alternative: use only articles' header => may give better results
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        classifier = GaussianNB()

        # lists for metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # for each fold
        for train, test in skf.split(X, y):
            # BOW
            vocab = BagOfWords.make_vocab(X[train])
            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(X[train], vocab)
            # transform testing data and return the matrix
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            # apply select percentile
            selector = SelectPercentile(percentile=25)
            selector.fit(training_data, y[train])

            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])
            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # store metrics predicted on test set
            rec = recall_score(y[test], predictions_test)
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test)
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))

        # print metrics of test set
        print('prediction of testing set:')
        print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'
              .format(min(f1_scores), max(f1_scores),
                      sum(f1_scores)/float(len(f1_scores))))
        print()
        #print('overfit testing: prediction of training set')
        #print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
        #      format(min(f1_scores_train), max(f1_scores_train),
        #             sum(f1_scores_train)/float(len(f1_scores_train))))
        #print()

        print('# ending naive bayes')
        print()

    def make_naive_bayes_CV(dataset):
        '''alternative: uses CountVectorizer (faster)
        '''
        # alternative: use only articles' header => may give better results
        X = dataset['Title'] + '.' + dataset['Text'] + '.'
        y = dataset['Label']

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits=10, shuffle=True)

        count_vector = CountVectorizer()

        classifier = GaussianNB()

        # lists for metrics predicted on test/train set
        f1_scores = []
        f1_scores_train = []

        # for each fold (10 times)
        for train, test in skf.split(X, y):

            # fit the training data and then return the matrix
            training_data = count_vector.fit_transform(X[train], y[train]).toarray()
            # transform testing data and return the matrix
            testing_data = count_vector.transform(X[test]).toarray()

            # apply select percentile
            selector = SelectPercentile(percentile=25)
            selector.fit(training_data, y[train])

            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data_r, y[train])

            # predict class
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # store metrics predicted on test set
            f1_scores.append(f1_score(y[test], predictions_test))

            # store metrics predicted on train set
            f1_scores_train.append(f1_score(y[train], predictions_train))

        # print metrics of test set
        print('--------------------')
        print('prediction of testing set:')
        print('F1 score: min = {}, max = {}, average = {}'
              .format(min(f1_scores), max(f1_scores),
                      sum(f1_scores)/float(len(f1_scores))))

        print()
        print('prediction of training set:')
        print('F1 score: min = {}, max = {}, average = {}'
              .format(min(f1_scores_train), max(f1_scores_train),
                      sum(f1_scores_train)/float(len(f1_scores_train))))
        print()

    # def analyze_errors_cv(dataset):
    #     '''calculates resubstitution error
    #     shows indices of false classified articles
    #     uses Gaussian Bayes with train test split
    #     '''

    #     X_train_test = dataset['Text']
    #     y_train_test = dataset['Label']

    #     count_vector = CountVectorizer()

    #     # fit the training data and then return the matrix
    #     training_data = count_vector.fit_transform(X_train_test).toarray()

    #     # transform testing data and return the matrix
    #     testing_data = count_vector.transform(X_train_test).toarray()

    #     # Naive Bayes
    #     classifier = GaussianNB()

    #     # fit classifier
    #     classifier.fit(training_data, y_train_test)

    #     # predict class
    #     predictions = classifier.predict(testing_data)

    #     print()
    #     print('errors at index:')
    #     n = 0
    #     for i in range(len(y_train_test)):
    #         if y_train_test[i] != predictions[i]:
    #             n += 1
    #             print('error no.{}'.format(n))
    #             print('prediction at index {} is: {}, but actual is: {}'
    #                   .format(i, predictions[i], y_train_test[i]))
    #             print(X_train_test[i])
    #             print(y_train_test[i])
    #             print()

    #     print()
    #     # print metrics
    #     print('F1 score: ', format(f1_score(y_train_test, predictions)))
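A toy illustration of the 'naive' independence assumption described in the module docstring; the class priors and per-word likelihoods are made-up numbers, and note that GaussianNB above models word frequencies with per-class Gaussians rather than the discrete probabilities used here:

# sketch: class scores multiply per-word likelihoods under the independence assumption
priors = {'merger': 0.3, 'other': 0.7}                 # assumed class priors
likelihoods = {                                        # assumed P(word | class)
    'merger': {'acquir': 0.20, 'market': 0.05},
    'other':  {'acquir': 0.01, 'market': 0.10},
}
article = ['acquir', 'market']
scores = {}
for label in priors:
    score = priors[label]
    for word in article:
        score *= likelihoods[label][word]              # features contribute independently
    scores[label] = score
total = sum(scores.values())
print({label: round(s / total, 2) for label, s in scores.items()})
# {'merger': 0.81, 'other': 0.19}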