added .gitignore file

Anne Lorenz 2018-09-10 10:38:24 +02:00
parent 3f98aff635
commit 1195a161d6
6 changed files with 419 additions and 112 deletions

.gitignore (new file, vendored)

@@ -0,0 +1,221 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
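A side note on the '*.py[cod]' entry above: it is a glob character class matching .pyc, .pyo and .pyd byte-code files in one pattern. The following rough Python illustration uses fnmatch, which approximates (but is not identical to) gitignore glob semantics; gitignore additionally supports negation and directory anchors, which this sketch ignores:

    import fnmatch

    # a few of the patterns from the file above; file names are invented
    patterns = ['*.py[cod]', '*.so', '*.egg']
    for name in ['module.pyc', 'module.pyd', 'lib.so', 'keep.py']:
        matched = any(fnmatch.fnmatch(name, p) for p in patterns)
        print(name, '->', 'ignored' if matched else 'kept')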

BagOfWords.py

@@ -12,7 +12,7 @@ import pandas as pd
 from nltk.stem.porter import PorterStemmer

-class BagOfWords():
+class BagOfWords:

     def extract_words(text):
         '''takes article as argument, removes numbers,
@@ -46,7 +46,8 @@ class BagOfWords():
     def make_matrix(series, vocab):
         '''calculates word stem frequencies in input articles.
-        returns matrix (DataFrame) with relative word frequencies (0 <= values < 1)
+        returns matrix (DataFrame) with relative word frequencies
+        (0 <= values < 1)
         (rows: different articles, colums: different words in vocab)
         '''
         # create list of tuples
@@ -67,7 +68,9 @@ class BagOfWords():
                 vector[i] += 1/word_count
             # add single vector as tuple
             vectors.append(tuple(vector))
-        df_vectors = pd.DataFrame.from_records(vectors, index=None, columns=vocab)
+        df_vectors = pd.DataFrame.from_records(vectors,
+                                               index=None,
+                                               columns=vocab)
         return df_vectors

     def make_vocab(series):
@@ -87,41 +90,49 @@ class BagOfWords():
         '''creates list of all words that will be ignored
         '''
         # standard stopwords from nltk.corpus stopwords('english')
-        stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'ain',
-                      'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'aren\'t',
-                      'as', 'at', 'be', 'because', 'been', 'before', 'being',
-                      'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
-                      'couldn\'t', 'd', 'did', 'didn', 'didn\'t', 'do', 'does',
-                      'doesn', 'doesn\'t', 'doing', 'don', 'don\'t', 'down',
-                      'during', 'each', 'few', 'for', 'from', 'further', 'had',
-                      'hadn', 'hadn\'t', 'has', 'hasn', 'hasn\'t', 'have', 'haven',
-                      'haven\'t', 'having', 'he', 'her', 'here', 'hers', 'herself',
-                      'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
-                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just', 'll',
-                      'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more', 'most',
-                      'mustn', 'mustn\'t', 'my', 'myself', 'needn', 'needn\'t',
-                      'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once',
-                      'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
-                      'over', 'own', 're', 's', 'same', 'shan', 'shan\'t', 'she',
-                      'she\'s', 'should', 'should\'ve', 'shouldn', 'shouldn\'t',
-                      'so', 'some', 'such', 't', 'than', 'that', 'that\'ll', 'the',
-                      'their', 'theirs', 'them', 'themselves', 'then', 'there',
-                      'these', 'they', 'this', 'those', 'through', 'to', 'too',
-                      'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'wasn\'t',
-                      'we', 'were', 'weren', 'weren\'t', 'what', 'when', 'where',
-                      'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
-                      'won\'t', 'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
-                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves']
+        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
+                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
+                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
+                      'before', 'being', 'below', 'between', 'both', 'but',
+                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
+                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
+                      'don', 'don\'t', 'down', 'during', 'each', 'few',
+                      'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
+                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
+                      'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
+                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
+                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
+                      'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
+                      'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
+                      'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
+                      'on', 'once', 'only', 'or', 'other', 'our', 'ours',
+                      'ourselves', 'out', 'over', 'own', 're', 's', 'same',
+                      'shan', 'shan\'t', 'she', 'she\'s', 'should',
+                      'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
+                      'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
+                      'theirs', 'them', 'themselves', 'then', 'there',
+                      'these', 'they', 'this', 'those', 'through', 'to',
+                      'too', 'under', 'until', 'up', 've', 'very', 'was',
+                      'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
+                      'what', 'when', 'where', 'which', 'while', 'who',
+                      'whom', 'why', 'will', 'with', 'won', 'won\'t',
+                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
+                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
+                      'yourselves']
         # add specific words
-        stop_words.extend(['reuters', 'also', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday'])
+        stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
+                           'wednesday', 'thursday', 'friday'])
+        # => does this make sense?:
         # remove the word 'not' from stop words
-        stop_words.remove('not')
+        #stop_words.remove('not')

         for i in range(len(stop_words)):
             # remove punctuation marks and strip endings from abbreviations
             #stop_words[i] = re.split(r'\W', stop_words[i])[0]
             # reduce word to stem
             stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])

         # transform list to set to eliminate duplicates
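For context, the make_matrix method reflowed above builds a DataFrame of relative word frequencies. A minimal self-contained sketch of the same idea, with invented articles and vocabulary, and without the stemming and stop-word handling the real class applies:

    import pandas as pd

    def relative_frequencies(articles, vocab):
        # one row per article, one column per vocab word;
        # each value is the word's count divided by the article's length
        vectors = []
        for text in articles:
            words = text.lower().split()
            vectors.append(tuple(words.count(v) / len(words) for v in vocab))
        return pd.DataFrame.from_records(vectors, columns=vocab)

    vocab = ['merger', 'deal', 'approved']
    articles = ['the merger was approved today', 'no deal was agreed']
    print(relative_frequencies(articles, vocab))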

CsvHandler.py

@@ -9,7 +9,7 @@ import csv
 import pandas as pd

-class CsvHandler():
+class CsvHandler:

     def read_csv(csv_file):
         df = pd.read_csv(csv_file,
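The read_csv call is cut off at the end of this hunk, so its actual arguments are not visible here. A generic pandas read with invented, stand-in data, just to show the shape of such a call (the column names match the Title/Text/Label fields used elsewhere in this commit):

    import io
    import pandas as pd

    # invented two-row sample standing in for the project's real CSV file;
    # the real call's parameters are truncated in the hunk above
    csv_file = io.StringIO('Title,Text,Label\nDeal done,Firm A buys B,1\n')
    df = pd.read_csv(csv_file, sep=',', quotechar='"')
    print(df)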

DecisionTree.py

@@ -20,7 +20,7 @@ from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import f1_score
 from sklearn.model_selection import StratifiedKFold

-class DecisionTree():
+class DecisionTree:

     def make_tree(dataset):
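The body of make_tree lies outside this hunk; only the imports and the signature are visible. A sketch of what a stratified-k-fold decision-tree fit could look like using these imports plus sklearn's DecisionTreeClassifier, on toy stand-in data (names and numbers invented, not the project's actual implementation):

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.metrics import f1_score
    from sklearn.model_selection import StratifiedKFold

    # toy stand-ins for the project's BOW matrix and labels
    X = np.array([[0.2, 0.0], [0.0, 0.5], [0.3, 0.1], [0.0, 0.4]] * 3)
    y = np.array([1, 0, 1, 0] * 3)

    skf = StratifiedKFold(n_splits=3, shuffle=True)
    classifier = DecisionTreeClassifier()
    for train, test in skf.split(X, y):
        classifier.fit(X[train], y[train])
        print(f1_score(y[test], classifier.predict(X[test])))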

FilterKeywords.py

@@ -10,20 +10,30 @@ import re
 from nltk.stem.porter import PorterStemmer

-class FilterKeywords():
+class FilterKeywords:

     def search_keywords(dict_input):
-        '''extracts relevant key-value pairs of in article's input dictionary.
+        '''extracts relevant key-value pairs of in article's input dictionary,
         output are the contained keywords and their count.
         '''

+        # # list of regular expressions that match merger specific keywords
+        # regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',
+        #               r'business combinations?', r'combined compan(y|ies)',
+        #               r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up',
+        #               r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?',
+        #               r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?',
+        #               r'purchase', r'(sell(s|ers?|ing)?|sold)']
+
-        keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers', 'acquisition',
-                        'acquire', 'acquisitions', 'acquires', 'combine', 'combines',
-                        'combination', 'combined', 'joint', 'venture', 'JV', 'takeover',
-                        'take-over', 'tie-up', 'deal', 'deals', 'transaction', 'transactions',
-                        'approve', 'approves', 'approved', 'approving', 'approval',
-                        'approvals', 'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',
-                        'purchase', 'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']
+        keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
+                        'acquisition', 'acquire', 'acquisitions', 'acquires',
+                        'combine', 'combines', 'combination', 'combined',
+                        'joint', 'venture', 'JV', 'takeover', 'take-over',
+                        'tie-up', 'deal', 'deals', 'transaction',
+                        'transactions', 'approve', 'approves', 'approved',
+                        'approving', 'approval', 'approvals', 'buy', 'buys',
+                        'buying', 'bought', 'buyout', 'buy-out', 'purchase',
+                        'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']

         # reduce words to stem
         stemmer = PorterStemmer()
@@ -50,7 +60,7 @@ class FilterKeywords():
         return dict_keywords

     def count_keywords(dict_keywords):
-        '''input: dict with article's keywords (key) and their count (value).
+        '''input: dict with article's keywords (key) and their count (value),
         returns number of keywords that are found.
         '''
         return sum(dict_keywords.values())
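search_keywords compares stemmed article words against the stemmed keyword list; the "reduce words to stem" step above applies the same PorterStemmer to each keyword. A small stand-alone illustration of that step (keyword subset taken from the list above):

    from nltk.stem.porter import PorterStemmer

    stemmer = PorterStemmer()
    keywords = ['merger', 'acquisitions', 'takeover', 'buying']
    stems = {stemmer.stem(k) for k in keywords}
    # plural and inflected forms collapse onto one stem,
    # e.g. 'buying' stems to 'buy', so matching becomes form-insensitive
    print(stems)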

NaiveBayes.py

@@ -2,37 +2,35 @@
 Naive Bayes Classifier
 ======================

-Naive Bayes is a probabilistic classifier that is able to predict,
-given an observation of an input, a probability distribution over a set of classes,
-rather than only outputting the most likely class that the observation should belong to.
+Naive Bayes is a probabilistic classifier that is able to predict a
+probability distribution over a set of classes, rather than only
+outputting the most likely class that the observation should belong to.
 'Naive' means, that it assumes that the value of a particular feature
 (word in an article) is independent of the value of any other feature,
-given the class variable (label). It considers each of these features
-to contribute independently to the probability that it belongs to its category,
+given the label. It considers each of these features to contribute
+independently to the probability that it belongs to its category,
 regardless of any possible correlations between these features.
 '''

 from BagOfWords import BagOfWords
 from CsvHandler import CsvHandler

-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.feature_selection import SelectPercentile
+#from sklearn.feature_extraction.text import CountVectorizer
+#from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import recall_score, precision_score
 from sklearn.model_selection import StratifiedKFold
-from sklearn.model_selection import train_test_split
+#from sklearn.model_selection import train_test_split
 from sklearn.naive_bayes import GaussianNB

-# toDo: for Julian, do this without SelectPercentile for now
-class NaiveBayes():
+class NaiveBayes:

     def make_naive_bayes(dataset):
-        '''fits naive bayes model with StratifiedKFold, uses my BOW
-        '''
+        '''fits naive bayes model with StratifiedKFold,
+        uses my BOW
+        '''

         print('# starting naive bayes')
         print()

-        # alternative: use only articles' header => may give better results
+        # join title and text
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
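The reworded docstring stresses that the classifier yields a probability distribution over classes rather than a single label. With sklearn's GaussianNB, which this module imports, that distribution is what predict_proba returns; a toy illustration with invented numbers:

    import numpy as np
    from sklearn.naive_bayes import GaussianNB

    X = np.array([[0.0, 0.1], [0.9, 0.0], [0.1, 0.2], [0.8, 0.1]])  # toy features
    y = np.array([0, 1, 0, 1])

    clf = GaussianNB().fit(X, y)
    print(clf.predict_proba([[0.7, 0.05]]))  # one probability per class
    print(clf.predict([[0.7, 0.05]]))        # only the most likely class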
@@ -56,18 +54,11 @@ class NaiveBayes():
             # transform testing data and return the matrix
             testing_data = BagOfWords.make_matrix(X[test], vocab)

-            # apply select percentile
-            selector = SelectPercentile(percentile=25)
-            selector.fit(training_data, y[train])
-            training_data_r = selector.transform(training_data)
-            testing_data_r = selector.transform(testing_data)
-
             #fit classifier
-            classifier.fit(training_data_r, y[train])
+            classifier.fit(training_data, y[train])
             #predict class
-            predictions_train = classifier.predict(training_data_r)
-            predictions_test = classifier.predict(testing_data_r)
+            predictions_train = classifier.predict(training_data)
+            predictions_test = classifier.predict(testing_data)

             #store metrics
             rec = recall_score(y[test], predictions_test)
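This hunk removes the SelectPercentile step, so the classifier now trains on the full BOW matrix. For reference, a self-contained sketch of what the removed selector does, on random stand-in data (names and numbers invented):

    import numpy as np
    from sklearn.feature_selection import SelectPercentile

    rng = np.random.default_rng(0)
    X = rng.random((20, 8))            # stand-in for the BOW training matrix
    y = np.array([0, 1] * 10)
    X[y == 1, 0] += 1.0                # make feature 0 clearly informative

    selector = SelectPercentile(percentile=25)
    X_reduced = selector.fit_transform(X, y)
    print(X_reduced.shape)             # (20, 2): top 25% of 8 features kept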
@@ -80,73 +71,146 @@ class NaiveBayes():
         #print metrics of test set
         print('prediction of testing set:')
         print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
-            .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
+            .format(min(f1_scores), max(f1_scores),
+            sum(f1_scores)/float(len(f1_scores))))
         print()
         #print('overfit testing: prediction of training set')
         #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
-        #format(min(f1_scores_train), max(f1_scores_train),sum(f1_scores_train)/float(len(f1_scores_train))))
+        #format(min(f1_scores_train), max(f1_scores_train),
+        #sum(f1_scores_train)/float(len(f1_scores_train))))
         #print()

         print('# ending naive bayes')
         print()
+    # def make_naive_bayes_selectpercentile(dataset):
+    #     '''fits naive bayes model with StratifiedKFold, uses my BOW
+    #     feature selection: select 0.25-percentile
+    #     '''
+    #     print('# starting naive bayes')
+    #     print()
+    #     # alternative: use only articles' header => may give better results
+    #     X = dataset['Title'] + ' ' + dataset['Text']
+    #     y = dataset['Label']
+    #     # use stratified k-fold cross-validation as split method
+    #     skf = StratifiedKFold(n_splits = 10, shuffle=True)
+    #     classifier = GaussianNB()
+    #     # lists for metrics
+    #     recall_scores = []
+    #     precision_scores = []
+    #     f1_scores = []
+    #     # for each fold
+    #     n = 0
+    #     for train, test in skf.split(X,y):
+    #         # BOW
+    #         vocab = BagOfWords.make_vocab(X[train])
+    #         # fit the training data and then return the matrix
+    #         training_data = BagOfWords.make_matrix(X[train], vocab)
+    #         # transform testing data and return the matrix
+    #         testing_data = BagOfWords.make_matrix(X[test], vocab)
+    #         # apply select percentile
+    #         selector = SelectPercentile(percentile=25)
+    #         selector.fit(training_data, y[train])
+    #         training_data_r = selector.transform(training_data)
+    #         testing_data_r = selector.transform(testing_data)
+    #         #fit classifier
+    #         classifier.fit(training_data_r, y[train])
+    #         #predict class
+    #         predictions_train = classifier.predict(training_data_r)
+    #         predictions_test = classifier.predict(testing_data_r)
+    #         #store metrics
+    #         rec = recall_score(y[test], predictions_test)
+    #         recall_scores.append(rec)
+    #         prec = precision_score(y[train], predictions_train)
+    #         precision_scores.append(prec)
+    #         # equation for f1 score
+    #         f1_scores.append(2 * (prec * rec)/(prec + rec))
+    #     #print metrics of test set
+    #     print('prediction of testing set:')
+    #     print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'
+    #         .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
+    #     print()
+    #     #print('overfit testing: prediction of training set')
+    #     #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
+    #     #format(min(f1_scores_train), max(f1_scores_train),
+    #     sum(f1_scores_train)/float(len(f1_scores_train))))
+    #     #print()
+    #     print('# ending naive bayes')
+    #     print()

-    def make_naive_bayes_CV(dataset):
-        '''alternative: uses CountVectorizer (faster)
-        '''
-        # alternative: use only articles' header => may give better results
-        X = dataset['Title'] + '.' + dataset['Text'] + '.'
-        y = dataset['Label']
-
-        # use stratified k-fold cross-validation as split method
-        skf = StratifiedKFold(n_splits = 10, shuffle=True)
-
-        count_vector = CountVectorizer()
-
-        classifier = GaussianNB()
-
-        # lists for metrics predicted on test/train set
-        f1_scores, f1_scores_train = []
-
-        # for each fold (10 times)
-        # fold number
-        n = 0
-
-        for train, test in skf.split(X,y):
-            # fit the training data and then return the matrix
-            training_data = count_vector.fit_transform(X[train], y[train]).toarray()
-            # transform testing data and return the matrix
-            testing_data = count_vector.transform(X[test]).toarray()
-
-            # apply select percentile
-            selector = SelectPercentile(percentile=25)
-            selector.fit(training_data, y[train])
-            training_data_r = selector.transform(training_data)
-            testing_data_r = selector.transform(testing_data)
-
-            #fit classifier
-            classifier.fit(training_data_r, y[train])
-            #predict class
-            predictions_train = classifier.predict(training_data_r)
-            predictions_test = classifier.predict(testing_data_r)
-
-            #store metrics predicted on test set
-            f1_scores.append(f1_score(y[test], predictions_test))
-            #store metrics predicted on train set
-            f1_scores_train.append(f1_score(y[train], predictions_train))
-
-        #print metrics of test set
-        print('--------------------')
-        print('prediction of testing set:')
-        print('F1 score: min = {}, max = {}, average = {}'.format(min(f1_scores), max(f1_scores),sum(f1_scores)/float(len(f1_scores))))
-        print()
-        print('prediction of training set:')
-        print('F1 score: min = {}, max = {}, average = {}'.format(min(f1_scores_train), max(f1_scores_train),sum(f1_scores_train)/float(len(f1_scores_train))))
-        print()
+    # def make_naive_bayes_CV(dataset):
+    #     '''alternative: uses CountVectorizer (faster)
+    #     '''
+    #     # alternative: use only articles' header => may give better results
+    #     X = dataset['Title'] + '.' + dataset['Text'] + '.'
+    #     y = dataset['Label']
+    #     # use stratified k-fold cross-validation as split method
+    #     skf = StratifiedKFold(n_splits = 10, shuffle=True)
+    #     count_vector = CountVectorizer()
+    #     classifier = GaussianNB()
+    #     # lists for metrics predicted on test/train set
+    #     f1_scores, f1_scores_train = []
+    #     # for each fold (10 times)
+    #     # fold number
+    #     n = 0
+    #     for train, test in skf.split(X,y):
+    #         # fit the training data and then return the matrix
+    #         training_data = count_vector.fit_transform(X[train], y[train]).toarray()
+    #         # transform testing data and return the matrix
+    #         testing_data = count_vector.transform(X[test]).toarray()
+    #         # apply select percentile
+    #         selector = SelectPercentile(percentile=25)
+    #         selector.fit(training_data, y[train])
+    #         training_data_r = selector.transform(training_data)
+    #         testing_data_r = selector.transform(testing_data)
+    #         #fit classifier
+    #         classifier.fit(training_data_r, y[train])
+    #         #predict class
+    #         predictions_train = classifier.predict(training_data_r)
+    #         predictions_test = classifier.predict(testing_data_r)
+    #         #store metrics predicted on test set
+    #         f1_scores.append(f1_score(y[test], predictions_test))
+    #         #store metrics predicted on train set
+    #         f1_scores_train.append(f1_score(y[train], predictions_train))
+    #     #print metrics of test set
+    #     print('--------------------')
+    #     print('prediction of testing set:')
+    #     print('F1 score: min = {}, max = {}, average = {}'
+    #         .format(min(f1_scores), max(f1_scores),
+    #         sum(f1_scores)/float(len(f1_scores))))
+    #     print()
+    #     print('prediction of training set:')
+    #     print('F1 score: min = {}, max = {}, average = {}'
+    #         .format(min(f1_scores_train), max(f1_scores_train),
+    #         sum(f1_scores_train)/float(len(f1_scores_train))))
+    #     print()

     # def analyze_errors_cv(dataset):
     #     '''calculates resubstitution error
@@ -181,7 +245,8 @@ class NaiveBayes():
     #             if y_train_test[i] != predictions[i]:
     #                 n += 1
     #                 print('error no.{}'.format(n))
-    #                 print('prediction at index {} is: {}, but actual is: {}'.format(i, predictions[i], y_train_test[i]))
+    #                 print('prediction at index {} is: {}, but actual is: {}'
+    #                 .format(i, predictions[i], y_train_test[i]))
     #                 print(X_train_test[i])
     #                 print(y_train_test[i])
     #                 print()
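The commented-out make_naive_bayes_CV variant above swaps the hand-written BOW for sklearn's CountVectorizer. A minimal stand-alone illustration of that vectorizer on invented text:

    from sklearn.feature_extraction.text import CountVectorizer

    texts = ['firm a buys firm b', 'quarterly results are out']  # made-up articles
    count_vector = CountVectorizer()
    matrix = count_vector.fit_transform(texts).toarray()
    print(matrix.shape)  # (2 articles, size of the induced vocabulary)
    print(matrix)        # raw word counts, unlike make_matrix's relative ones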