deleted .gitignore

This commit is contained in:
Anne Lorenz 2018-09-14 09:19:12 +02:00
parent 52146158e2
commit 0b424835d8
5 changed files with 108 additions and 292 deletions

221
.gitignore vendored
View File

@ -1,221 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json

View File

@ -4,6 +4,15 @@ Bag Of Words
BagOfWords counts word stems in an article
and adds new words to the global vocabulary.
Anm.:
The multinomial Naive Bayes classifier is suitable
for classification with discrete features (e.g.,
word counts for text classification).
The multinomial distribution normally requires
integer feature counts. However, in practice,
fractional counts such as tf-idf may also work.
=> durch 'relative_word_frequencies' als Parameter berücksichtigt
'''
import re
@ -50,12 +59,15 @@ class BagOfWords:
word = stemmer.stem(word)
return word
def make_matrix(series, vocab, relative_word_frequencies):
def make_matrix(series, vocab, relative_word_frequencies=True):
'''calculates word stem frequencies in input articles.
returns matrix (DataFrame) with relative word frequencies
(0 <= values < 1) or absolute word frequencies (int).
(0 <= values < 1) if relative_word_frequencies=True or absolute
word frequencies (int) if relative_word_frequencies=False.
(rows: different articles, columns: different words in vocab)
'''
print('# BOW: calculating matrix')
print('#')
# create list of tuples
vectors = []
for i in range(len(series)):
@ -88,6 +100,8 @@ class BagOfWords:
'''adds words of input articles to a global vocabulary.
input: dataframe of all articles, return value: list of words
'''
print('# BOW: making vocabulary of data set')
print('#')
vocab = set()
for text in series:
vocab |= set(BagOfWords.extract_words(text))

View File

@ -12,96 +12,119 @@ independently to the probability that it belongs to its category,
regardless of any possible correlations between these features.
'''
#!!
# The multinomial Naive Bayes classifier is suitable
#for classification with discrete features (e.g.,
#word counts for text classification).
#The multinomial distribution normally requires
#integer feature counts. However, in practice,
#fractional counts such as tf-idf may also work.
# => nur bei eigenem BOW berücksichtigt
from BagOfWords import BagOfWords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
# MultinomialNB statt GaussianNB benutzt => OK?
#from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import GaussianNB
class NaiveBayes:
def make_naive_bayes(dataset):
'''fits naive bayes model
'''fits naive bayes model with StratifiedKFold,
uses my BOW
'''
print('# starting naive bayes')
print('#')
# split data into text and label set
# join title and text
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
# Bag of Words
print('# calculating bag of words')
print('#')
# fit the training data and then return the matrix
# TODO: warum so andere (schlechte) Werte mit meinem BOW?
#X = BagOfWords.fit_transform(X, False)
X = CountVectorizer().fit_transform(X).toarray()
cv = CountVectorizer()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
# use only most important features
selector = SelectPercentile()
skf = StratifiedKFold(n_splits = 10, shuffle=True)
pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])
classifier = GaussianNB()
grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
'NB__alpha': [0.00000001, 0.0000001,
0.000001, 0.00001,
0.0001, 0.001, 0.01,
0.1]},
cv=skf,
scoring=make_scorer(f1_score))
# lists for metrics
recall_scores = []
precision_scores = []
f1_scores = []
# for each fold
n = 0
for train, test in skf.split(X,y):
n += 1
print('# split no. ' + str(n))
print('# fit classifier')
print('#')
grid.fit(X,y)
# DataFrame of results
df_results = grid.cv_results_
# print results
######################
print('RESULTS:')
print('#')
print('mean_test_score:')
print(df_results['mean_test_score'])
print('#')
print('mean of means:')
print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
print('#')
print('best score:')
print(grid.best_score_)
print('#')
print('best parameters set found on development set:')
print(grid.best_params_)
print('#')
# # eigenes BOW => schlechtere ergebnisse
# vocab = BagOfWords.make_vocab(X[train])
# # fit the training data and then return the matrix
# training_data = BagOfWords.make_matrix(X[train], vocab)
# # transform testing data and return the matrix
# testing_data = BagOfWords.make_matrix(X[test], vocab)
# # using CountVectorizer:
# fit the training data and then return the matrix
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
# #fit classifier
# classifier.fit(training_data_r, y[train])
# #predict class
# predictions_train = classifier.predict(training_data_r)
# predictions_test = classifier.predict(testing_data_r)
#fit classifier
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
#print and store metrics
rec = recall_score(y[test], predictions_test)
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y[train], predictions_train)
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
##########################
#print metrics of test set
print('-------------------------')
print('prediction of testing set:')
print('Precision score: min = {}, max = {}, average = {}'
.format(min(precision_scores),
max(precision_scores),
sum(precision_scores)/float(len(precision_scores))))
print('Recall score: min = {}, max = {}, average = {}'
.format(min(recall_scores),
max(recall_scores),
sum(recall_scores)/float(len(recall_scores))))
print('F1 score: min = {}, max = {}, average = {}'
.format(min(f1_scores),
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
##### nur für overfit testing ###########
#print('overfit testing: prediction of training set')
#print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
#format(min(f1_scores_train), max(f1_scores_train),
#sum(f1_scores_train)/float(len(f1_scores_train))))
#print()
print('# ending naive bayes')
print('#')
print('#')
######## nur für resubstitutionsfehler benötigt ########
def analyze_errors(dataset):
'''calculates resubstitution error
shows indices of false classified articles

6
SVM.py
View File

@ -51,10 +51,10 @@ class SVM:
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
grid = GridSearchCV(pipeline, {'perc__percentile': [50, 100],
'SVC__kernel': ['linear','poly','rbf','sigmoid'],
'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1],
'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]},
'SVC__gamma': [0.01, 0.1],
'SVC__C': [0.01, 0.1]},
cv=skf,
scoring=make_scorer(f1_score))

View File

@ -27,6 +27,6 @@ dataset = CsvHandler.read_csv(file)
# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
SVM.make_svm(dataset)
# SVM.make_svm(dataset)
print('# ending program')