added .gitignore file

Anne Lorenz 2018-09-10 10:38:24 +02:00
parent 3f98aff635
commit 1195a161d6
6 changed files with 419 additions and 112 deletions

.gitignore

@@ -0,0 +1,221 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
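
A note on the patterns above: entries like *.py[cod] use ordinary glob
character classes, so one rule covers .pyc, .pyo and .pyd at once. A quick,
illustrative way to test such a pattern (outside git, which adds its own
directory rules on top) is Python's fnmatch module; this snippet is not part
of the commit:

from fnmatch import fnmatch

# '*.py[cod]' matches any name ending in .pyc, .pyo or .pyd
for name in ['module.pyc', 'module.pyo', 'module.pyd', 'module.py']:
    print(name, fnmatch(name, '*.py[cod]'))
# module.py prints False: the character class requires one more letter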

BagOfWords.py

@@ -12,7 +12,7 @@ import pandas as pd
from nltk.stem.porter import PorterStemmer
class BagOfWords():
class BagOfWords:
def extract_words(text):
'''takes article as argument, removes numbers,
@@ -46,7 +46,8 @@ class BagOfWords():
def make_matrix(series, vocab):
'''calculates word stem frequencies in input articles.
returns matrix (DataFrame) with relative word frequencies (0 <= values < 1)
returns matrix (DataFrame) with relative word frequencies
(0 <= values < 1)
(rows: different articles, columns: different words in vocab)
'''
# create list of tuples
@@ -67,7 +68,9 @@ class BagOfWords():
vector[i] += 1/word_count
# add single vector as tuple
vectors.append(tuple(vector))
df_vectors = pd.DataFrame.from_records(vectors, index=None, columns=vocab)
df_vectors = pd.DataFrame.from_records(vectors,
index=None,
columns=vocab)
return df_vectors
def make_vocab(series):
@@ -87,41 +90,49 @@ class BagOfWords():
'''creates list of all words that will be ignored
'''
# standard stopwords from nltk.corpus stopwords('english')
stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'ain',
'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'aren\'t',
'as', 'at', 'be', 'because', 'been', 'before', 'being',
'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
'couldn\'t', 'd', 'did', 'didn', 'didn\'t', 'do', 'does',
'doesn', 'doesn\'t', 'doing', 'don', 'don\'t', 'down',
'during', 'each', 'few', 'for', 'from', 'further', 'had',
'hadn', 'hadn\'t', 'has', 'hasn', 'hasn\'t', 'have', 'haven',
'haven\'t', 'having', 'he', 'her', 'here', 'hers', 'herself',
'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just', 'll',
'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more', 'most',
'mustn', 'mustn\'t', 'my', 'myself', 'needn', 'needn\'t',
'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once',
'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
'over', 'own', 're', 's', 'same', 'shan', 'shan\'t', 'she',
'she\'s', 'should', 'should\'ve', 'shouldn', 'shouldn\'t',
'so', 'some', 'such', 't', 'than', 'that', 'that\'ll', 'the',
'their', 'theirs', 'them', 'themselves', 'then', 'there',
'these', 'they', 'this', 'those', 'through', 'to', 'too',
'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'wasn\'t',
'we', 'were', 'weren', 'weren\'t', 'what', 'when', 'where',
'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
'won\'t', 'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves']
stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
'aren\'t', 'as', 'at', 'be', 'because', 'been',
'before', 'being', 'below', 'between', 'both', 'but',
'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
'don', 'don\'t', 'down', 'during', 'each', 'few',
'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
'on', 'once', 'only', 'or', 'other', 'our', 'ours',
'ourselves', 'out', 'over', 'own', 're', 's', 'same',
'shan', 'shan\'t', 'she', 'she\'s', 'should',
'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
'theirs', 'them', 'themselves', 'then', 'there',
'these', 'they', 'this', 'those', 'through', 'to',
'too', 'under', 'until', 'up', 've', 'very', 'was',
'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
'what', 'when', 'where', 'which', 'while', 'who',
'whom', 'why', 'will', 'with', 'won', 'won\'t',
'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
'yourselves']
# add specific words
stop_words.extend(['reuters', 'also', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday'])
stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
'wednesday', 'thursday', 'friday'])
# => does this make sense?:
# remove the word 'not' from stop words
stop_words.remove('not')
#stop_words.remove('not')
for i in range(len(stop_words)):
# remove punctuation marks and strip endings from abbreviations
#stop_words[i] = re.split(r'\W', stop_words[i])[0]
# reduce word to stem
stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
# transform list to set to eliminate duplicates
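
To make the hunks above easier to follow: make_vocab collects the word stems
of all articles, and make_matrix builds one row per article and one column
per vocabulary word, where each occurrence of a word adds 1/word_count, so
the cells hold relative frequencies. A minimal, self-contained sketch of
that idea (the names and the plain whitespace tokenizer are illustrative,
not the repo's exact code, which also stems words and removes stop words):

import pandas as pd

def toy_bow_matrix(articles):
    '''illustration only: lower-cases and splits on whitespace'''
    tokenized = [article.lower().split() for article in articles]
    vocab = sorted(set(word for words in tokenized for word in words))
    vectors = []
    for words in tokenized:
        # every occurrence of a word contributes 1/word_count
        vectors.append(tuple(words.count(v) / len(words) for v in vocab))
    return pd.DataFrame.from_records(vectors, columns=vocab)

print(toy_bow_matrix(['firm a buys firm b', 'firm b rejects the deal']))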

CsvHandler.py

@@ -9,7 +9,7 @@ import csv
import pandas as pd
class CsvHandler():
class CsvHandler:
def read_csv(csv_file):
df = pd.read_csv(csv_file,
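
The read_csv call is cut off in this hunk, so its actual arguments are not
visible here. Purely as a hypothetical illustration of the kind of call
CsvHandler.read_csv wraps (the column names match those used elsewhere in
the repo; the file name and separator are assumptions):

import pandas as pd

# hypothetical arguments; the real ones are truncated in the diff above
df = pd.read_csv('articles.csv',
                 sep=',',
                 usecols=['Title', 'Text', 'Label'])
print(df.head())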

DecisionTree.py

@@ -20,7 +20,7 @@ from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
class DecisionTree():
class DecisionTree:
def make_tree(dataset):
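
For context on the SelectPercentile import kept here: it scores each feature
with a univariate statistic and keeps only the top given percentile of them.
A small self-contained sketch with synthetic data (not the repo's pipeline):

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile

X, y = make_classification(n_samples=100, n_features=40, random_state=0)
# keep the 25% highest-scoring features (default score: ANOVA F-value)
selector = SelectPercentile(percentile=25)
X_reduced = selector.fit_transform(X, y)
print(X.shape, '->', X_reduced.shape)   # (100, 40) -> (100, 10)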

FilterKeywords.py

@@ -10,20 +10,30 @@ import re
from nltk.stem.porter import PorterStemmer
class FilterKeywords():
class FilterKeywords:
def search_keywords(dict_input):
'''extracts relevant key-value pairs of in article's input dictionary.
'''extracts relevant key-value pairs from an article's input dictionary;
output is the contained keywords and their counts.
'''
# # list of regular expressions that match merger specific keywords
# regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',
# r'business combinations?', r'combined compan(y|ies)',
# r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up',
# r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?',
# r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?',
# r'purchase', r'(sell(s|ers?|ing)?|sold)']
keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers', 'acquisition',
'acquire', 'acquisitions', 'acquires', 'combine', 'combines',
'combination', 'combined', 'joint', 'venture', 'JV', 'takeover',
'take-over', 'tie-up', 'deal', 'deals', 'transaction', 'transactions',
'approve', 'approves', 'approved', 'approving', 'approval',
'approvals', 'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',
'purchase', 'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']
keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
'acquisition', 'acquire', 'acquisitions', 'acquires',
'combine', 'combines', 'combination', 'combined',
'joint', 'venture', 'JV', 'takeover', 'take-over',
'tie-up', 'deal', 'deals', 'transaction',
'transactions', 'approve', 'approves', 'approved',
'approving', 'approval', 'approvals', 'buy', 'buys',
'buying', 'bought', 'buyout', 'buy-out', 'purchase',
'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']
# reduce words to stem
stemmer = PorterStemmer()
@@ -50,7 +60,7 @@ class FilterKeywords():
return dict_keywords
def count_keywords(dict_keywords):
'''input: dict with article's keywords (key) and their count (value).
'''input: dict with article's keywords (key) and their count (value),
returns the total number of keyword occurrences found.
'''
return sum(dict_keywords.values())
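
Summarizing the two methods above: search_keywords stems the keyword list
with PorterStemmer and returns a dict mapping each keyword stem found in an
article to its count; count_keywords then sums those counts. A compact
sketch of that flow (illustrative names, not the repo's exact code):

from collections import Counter
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
keywords = {stemmer.stem(k) for k in ['merger', 'acquires', 'deal', 'buyout']}

words = 'the merger deal was approved after the buyout'.split()
stems = [stemmer.stem(w) for w in words]

# keyword stem -> number of occurrences in the article
dict_keywords = {s: n for s, n in Counter(stems).items() if s in keywords}
print(dict_keywords)               # {'merger': 1, 'deal': 1, 'buyout': 1}
print(sum(dict_keywords.values())) # 3, cf. count_keywords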

NaiveBayes.py

@@ -2,37 +2,35 @@
Naive Bayes Classifier
======================
Naive Bayes is a probabilistic classifier that is able to predict,
given an observation of an input, a probability distribution over a set of classes,
rather than only outputting the most likely class that the observation should belong to.
Naive Bayes is a probabilistic classifier that predicts, given an input
observation, a probability distribution over a set of classes rather than
only the single most likely class the observation could belong to.
'Naive' means that it assumes that the value of a particular feature
(word in an article) is independent of the value of any other feature,
given the class variable (label). It considers each of these features
to contribute independently to the probability that it belongs to its category,
given the label. It considers each of these features to contribute
independently to the probability that it belongs to its category,
regardless of any possible correlations between these features.
'''
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
#from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
# toDo: for Julian, leave out SelectPercentile for now
class NaiveBayes():
class NaiveBayes:
def make_naive_bayes(dataset):
'''fits naive bayes model with StratifiedKFold, uses my BOW
'''
'''fits naive bayes model with StratifiedKFold,
uses my BOW
'''
print('# starting naive bayes')
print()
# alternative: use only articles' header => may give better results
# join title and text
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
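
The overall shape of make_naive_bayes after this commit, reduced to a
runnable toy: a stratified k-fold split, a bag-of-words matrix built per
fold, GaussianNB, and per-fold recall and precision folded into F1.
CountVectorizer stands in for the custom BagOfWords purely to keep the
sketch short, the data is invented, and, as the next hunk shows, the real
method no longer applies SelectPercentile:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB

# toy stand-in for dataset['Title'] + ' ' + dataset['Text'] and its labels
X = pd.Series(['merger approved', 'firm announces merger', 'rival seeks merger',
               'profits fall', 'profits rise slightly', 'quarterly profits stable'] * 4)
y = pd.Series([1, 1, 1, 0, 0, 0] * 4)

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)
classifier = GaussianNB()
f1_scores = []
for train, test in skf.split(X, y):
    vectorizer = CountVectorizer()   # stand-in for the BOW matrix
    training_data = vectorizer.fit_transform(X[train]).toarray()
    testing_data = vectorizer.transform(X[test]).toarray()
    classifier.fit(training_data, y[train])
    predictions_test = classifier.predict(testing_data)
    rec = recall_score(y[test], predictions_test)
    prec = precision_score(y[test], predictions_test)
    f1_scores.append(2 * (prec * rec) / (prec + rec))
# expect a high score on this trivially separable toy data
print('average F1: {0:.2f}'.format(sum(f1_scores) / len(f1_scores)))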
@@ -56,18 +54,11 @@ class NaiveBayes():
# transform testing data and return the matrix
testing_data = BagOfWords.make_matrix(X[test], vocab)
# apply select percentile
selector = SelectPercentile(percentile=25)
selector.fit(training_data, y[train])
training_data_r = selector.transform(training_data)
testing_data_r = selector.transform(testing_data)
#fit classifier
classifier.fit(training_data_r, y[train])
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data_r)
predictions_test = classifier.predict(testing_data_r)
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
#store metrics
rec = recall_score(y[test], predictions_test)
@@ -80,73 +71,146 @@ class NaiveBayes():
#print metrics of test set
print('prediction of testing set:')
print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'
.format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
.format(min(f1_scores), max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
#print('overfit testing: prediction of training set')
#print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
#format(min(f1_scores_train), max(f1_scores_train),sum(f1_scores_train)/float(len(f1_scores_train))))
#format(min(f1_scores_train), max(f1_scores_train),
#sum(f1_scores_train)/float(len(f1_scores_train))))
#print()
print('# ending naive bayes')
print()
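
The F1 values printed above are computed from per-fold precision and recall
as 2 * (prec * rec) / (prec + rec), visible in the commented-out variant
below. That expression is the harmonic mean of precision and recall and
should agree with sklearn's f1_score; a quick check on made-up labels:

from sklearn.metrics import f1_score, precision_score, recall_score

y_true = [1, 1, 0, 0, 1, 0, 1, 0]
y_pred = [1, 0, 0, 1, 1, 0, 1, 0]

prec = precision_score(y_true, y_pred)       # 0.75
rec = recall_score(y_true, y_pred)           # 0.75
print(2 * (prec * rec) / (prec + rec))       # 0.75
print(f1_score(y_true, y_pred))              # 0.75, same value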
# def make_naive_bayes_selectpercentile(dataset):
# '''fits naive bayes model with StratifiedKFold, uses my BOW
# feature selection: select 0.25-percentile
# '''
# print('# starting naive bayes')
# print()
# # alternative: use only articles' header => may give better results
# X = dataset['Title'] + ' ' + dataset['Text']
# y = dataset['Label']
# # use stratified k-fold cross-validation as split method
# skf = StratifiedKFold(n_splits = 10, shuffle=True)
# classifier = GaussianNB()
# # lists for metrics
# recall_scores = []
# precision_scores = []
# f1_scores = []
# # for each fold
# n = 0
# for train, test in skf.split(X,y):
# # BOW
# vocab = BagOfWords.make_vocab(X[train])
# # fit the training data and then return the matrix
# training_data = BagOfWords.make_matrix(X[train], vocab)
# # transform testing data and return the matrix
# testing_data = BagOfWords.make_matrix(X[test], vocab)
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
# #fit classifier
# classifier.fit(training_data_r, y[train])
# #predict class
# predictions_train = classifier.predict(training_data_r)
# predictions_test = classifier.predict(testing_data_r)
# #store metrics
# rec = recall_score(y[test], predictions_test)
# recall_scores.append(rec)
# prec = precision_score(y[train], predictions_train)
# precision_scores.append(prec)
# # equation for f1 score
# f1_scores.append(2 * (prec * rec)/(prec + rec))
# #print metrics of test set
# print('prediction of testing set:')
# print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'
# .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
# print()
# #print('overfit testing: prediction of training set')
# #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
# #format(min(f1_scores_train), max(f1_scores_train),
# #sum(f1_scores_train)/float(len(f1_scores_train))))
# #print()
def make_naive_bayes_CV(dataset):
'''alternative: uses CountVectorizer (faster)
'''
# alternative: use only articles' header => may give better results
X = dataset['Title'] + '.' + dataset['Text'] + '.'
y = dataset['Label']
# print('# ending naive bayes')
# print()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
# def make_naive_bayes_CV(dataset):
# '''alternative: uses CountVectorizer (faster)
# '''
# # alternative: use only articles' header => may give better results
# X = dataset['Title'] + '.' + dataset['Text'] + '.'
# y = dataset['Label']
count_vector = CountVectorizer()
# # use stratified k-fold cross-validation as split method
# skf = StratifiedKFold(n_splits = 10, shuffle=True)
# count_vector = CountVectorizer()
classifier = GaussianNB()
# classifier = GaussianNB()
# lists for metrics predicted on test/train set
f1_scores, f1_scores_train = []
# # lists for metrics predicted on test/train set
# f1_scores, f1_scores_train = []
# for each fold (10 times)
# fold number
n = 0
for train, test in skf.split(X,y):
# # for each fold (10 times)
# # fold number
# n = 0
# for train, test in skf.split(X,y):
# fit the training data and then return the matrix
training_data = count_vector.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = count_vector.transform(X[test]).toarray()
# # fit the training data and then return the matrix
# training_data = count_vector.fit_transform(X[train], y[train]).toarray()
# # transform testing data and return the matrix
# testing_data = count_vector.transform(X[test]).toarray()
# apply select percentile
selector = SelectPercentile(percentile=25)
selector.fit(training_data, y[train])
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
training_data_r = selector.transform(training_data)
testing_data_r = selector.transform(testing_data)
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
#fit classifier
classifier.fit(training_data_r, y[train])
# #fit classifier
# classifier.fit(training_data_r, y[train])
#predict class
predictions_train = classifier.predict(training_data_r)
predictions_test = classifier.predict(testing_data_r)
# #predict class
# predictions_train = classifier.predict(training_data_r)
# predictions_test = classifier.predict(testing_data_r)
#store metrics predicted on test set
f1_scores.append(f1_score(y[test], predictions_test))
# #store metrics predicted on test set
# f1_scores.append(f1_score(y[test], predictions_test))
#store metrics predicted on train set
f1_scores_train.append(f1_score(y[train], predictions_train))
# #store metrics predicted on train set
# f1_scores_train.append(f1_score(y[train], predictions_train))
#print metrics of test set
print('--------------------')
print('prediction of testing set:')
print('F1 score: min = {}, max = {}, average = {}'.format(min(f1_scores), max(f1_scores),sum(f1_scores)/float(len(f1_scores))))
# #print metrics of test set
# print('--------------------')
# print('prediction of testing set:')
# print('F1 score: min = {}, max = {}, average = {}'
# .format(min(f1_scores), max(f1_scores),
# sum(f1_scores)/float(len(f1_scores))))
print()
print('prediction of training set:')
print('F1 score: min = {}, max = {}, average = {}'.format(min(f1_scores_train), max(f1_scores_train),sum(f1_scores_train)/float(len(f1_scores_train))))
print()
# print()
# print('prediction of training set:')
# print('F1 score: min = {}, max = {}, average = {}'
# .format(min(f1_scores_train), max(f1_scores_train),
# sum(f1_scores_train)/float(len(f1_scores_train))))
# print()
# def analyze_errors_cv(dataset):
# '''calculates resubstitution error
@@ -181,7 +245,8 @@ class NaiveBayes():
# if y_train_test[i] != predictions[i]:
# n += 1
# print('error no.{}'.format(n))
# print('prediction at index {} is: {}, but actual is: {}'.format(i, predictions[i], y_train_test[i]))
# print('prediction at index {} is: {}, but actual is: {}'
# .format(i, predictions[i], y_train_test[i]))
# print(X_train_test[i])
# print(y_train_test[i])
# print()
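
analyze_errors_cv above (left commented out) measures resubstitution error:
fit on a set, predict that same set, and report each index where prediction
and actual label disagree. A minimal runnable sketch of that idea, with
invented data (the labels deliberately conflict so at least one error
appears):

from sklearn.naive_bayes import GaussianNB

X_train_test = [[0.0], [1.0], [0.0], [1.0], [0.5]]
y_train_test = [0, 1, 1, 0, 1]

classifier = GaussianNB()
classifier.fit(X_train_test, y_train_test)
predictions = classifier.predict(X_train_test)

# count and report the misclassified training samples
n = 0
for i in range(len(y_train_test)):
    if y_train_test[i] != predictions[i]:
        n += 1
        print('error no.{}'.format(n))
        print('prediction at index {} is: {}, but actual is: {}'
              .format(i, predictions[i], y_train_test[i]))
print('resubstitution error: {} of {}'.format(n, len(y_train_test)))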