deleted .gitignore

Anne Lorenz 2018-09-14 09:19:12 +02:00
parent 52146158e2
commit 0b424835d8
5 changed files with 108 additions and 292 deletions

.gitignore (vendored, 221 deletions)

@@ -1,221 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json

BagOfWords.py

@@ -4,6 +4,15 @@ Bag Of Words
 BagOfWords counts word stems in an article
 and adds new words to the global vocabulary.
+
+Note:
+The multinomial Naive Bayes classifier is suitable
+for classification with discrete features (e.g.,
+word counts for text classification).
+The multinomial distribution normally requires
+integer feature counts. However, in practice,
+fractional counts such as tf-idf may also work.
+=> taken into account via the 'relative_word_frequencies' parameter
 '''

 import re
@@ -50,12 +59,15 @@ class BagOfWords:
         word = stemmer.stem(word)
         return word

-    def make_matrix(series, vocab, relative_word_frequencies):
+    def make_matrix(series, vocab, relative_word_frequencies=True):
         '''calculates word stem frequencies in input articles.
         returns matrix (DataFrame) with relative word frequencies
-        (0 <= values < 1) or absolute word frequencies (int).
+        (0 <= values < 1) if relative_word_frequencies=True or absolute
+        word frequencies (int) if relative_word_frequencies=False.
         (rows: different articles, columns: different words in vocab)
         '''
+        print('# BOW: calculating matrix')
+        print('#')
         # create list of tuples
         vectors = []
         for i in range(len(series)):
@@ -88,6 +100,8 @@ class BagOfWords:
         '''adds words of input articles to a global vocabulary.
         input: dataframe of all articles, return value: list of words
         '''
+        print('# BOW: making vocabulary of data set')
+        print('#')
         vocab = set()
         for text in series:
             vocab |= set(BagOfWords.extract_words(text))
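A minimal usage sketch of the new default, assuming only the BagOfWords interface visible in this diff (make_vocab and make_matrix); the two article strings and the pandas Series wrapper are invented for illustration:

import pandas as pd
from BagOfWords import BagOfWords

# invented example articles, not part of the repository's data set
articles = pd.Series([
    'Chipmaker Foo agrees takeover by Bar',
    'Foo shares rise after takeover rumours',
])

vocab = BagOfWords.make_vocab(articles)

# absolute word-stem counts (int), the form a multinomial model expects
abs_matrix = BagOfWords.make_matrix(articles, vocab, relative_word_frequencies=False)

# relative frequencies (0 <= value < 1), i.e. fractional counts
rel_matrix = BagOfWords.make_matrix(articles, vocab, relative_word_frequencies=True)

print(abs_matrix.shape, rel_matrix.shape)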

NaiveBayes.py

@@ -12,96 +12,119 @@ independently to the probability that it belongs to its category,
 regardless of any possible correlations between these features.
 '''
-#!!
-# The multinomial Naive Bayes classifier is suitable
-#for classification with discrete features (e.g.,
-#word counts for text classification).
-#The multinomial distribution normally requires
-#integer feature counts. However, in practice,
-#fractional counts such as tf-idf may also work.
-# => only taken into account with my own BOW

 from BagOfWords import BagOfWords

 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
-from sklearn.metrics import f1_score, make_scorer
+from sklearn.metrics import recall_score, precision_score
 from sklearn.model_selection import StratifiedKFold
-from sklearn.model_selection import GridSearchCV
-from sklearn.pipeline import Pipeline
-from sklearn.naive_bayes import MultinomialNB
-# MultinomialNB used instead of GaussianNB => OK?
-#from sklearn.naive_bayes import GaussianNB
+from sklearn.naive_bayes import GaussianNB

 class NaiveBayes:

     def make_naive_bayes(dataset):
-        '''fits naive bayes model
+        '''fits naive bayes model with StratifiedKFold,
+        uses my BOW
         '''
         print('# starting naive bayes')
         print('#')

         # split data into text and label set
+        # join title and text
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']

-        # Bag of Words
-        print('# calculating bag of words')
-        print('#')
-        # fit the training data and then return the matrix
-        # toDO: why such different (bad) values with my BOW?
-        #X = BagOfWords.fit_transform(X, False)
-        X = CountVectorizer().fit_transform(X).toarray()
+        cv = CountVectorizer()

         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)

-        # use only most important features
-        selector = SelectPercentile()
-
-        pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])
-
-        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
-                                       'NB__alpha': [0.00000001, 0.0000001,
-                                                     0.000001, 0.00001,
-                                                     0.0001, 0.001, 0.01,
-                                                     0.1]},
-                            cv=skf,
-                            scoring=make_scorer(f1_score))
-
-        print('# fit classifier')
-        print('#')
-
-        grid.fit(X,y)
-
-        # DataFrame of results
-        df_results = grid.cv_results_
-
-        # print results
-        ######################
-        print('RESULTS:')
-        print('#')
-        print('mean_test_score:')
-        print(df_results['mean_test_score'])
-        print('#')
-        print('mean of means:')
-        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
-        print('#')
-        print('best score:')
-        print(grid.best_score_)
-        print('#')
-        print('best parameters set found on development set:')
-        print(grid.best_params_)
-        print('#')
+        classifier = GaussianNB()
+
+        # lists for metrics
+        recall_scores = []
+        precision_scores = []
+        f1_scores = []
+
+        # for each fold
+        n = 0
+        for train, test in skf.split(X,y):
+            n += 1
+            print('# split no. ' + str(n))
+
+            # # own BOW => worse results
+            # vocab = BagOfWords.make_vocab(X[train])
+            # # fit the training data and then return the matrix
+            # training_data = BagOfWords.make_matrix(X[train], vocab)
+            # # transform testing data and return the matrix
+            # testing_data = BagOfWords.make_matrix(X[test], vocab)
+
+            # # using CountVectorizer:
+            # fit the training data and then return the matrix
+            training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # transform testing data and return the matrix
+            testing_data = cv.transform(X[test]).toarray()
+
+            # # apply select percentile
+            # selector = SelectPercentile(percentile=25)
+            # selector.fit(training_data, y[train])
+
+            # training_data_r = selector.transform(training_data)
+            # testing_data_r = selector.transform(testing_data)
+
+            # #fit classifier
+            # classifier.fit(training_data_r, y[train])
+            # #predict class
+            # predictions_train = classifier.predict(training_data_r)
+            # predictions_test = classifier.predict(testing_data_r)
+
+            #fit classifier
+            classifier.fit(training_data, y[train])
+            #predict class
+            predictions_train = classifier.predict(training_data)
+            predictions_test = classifier.predict(testing_data)
+
+            #print and store metrics
+            rec = recall_score(y[test], predictions_test)
+            print('rec: ' + str(rec))
+            recall_scores.append(rec)
+            prec = precision_score(y[train], predictions_train)
+            print('prec: ' + str(prec))
+            print('#')
+            precision_scores.append(prec)
+            # equation for f1 score
+            f1_scores.append(2 * (prec * rec)/(prec + rec))
+
+        ##########################
+        # print metrics of test set
+        print('-------------------------')
+        print('prediction of testing set:')
+        print('Precision score: min = {}, max = {}, average = {}'
+              .format(min(precision_scores),
+                      max(precision_scores),
+                      sum(precision_scores)/float(len(precision_scores))))
+        print('Recall score: min = {}, max = {}, average = {}'
+              .format(min(recall_scores),
+                      max(recall_scores),
+                      sum(recall_scores)/float(len(recall_scores))))
+        print('F1 score: min = {}, max = {}, average = {}'
+              .format(min(f1_scores),
+                      max(f1_scores),
+                      sum(f1_scores)/float(len(f1_scores))))
+        print()
+
+        ##### only for overfit testing ###########
+        #print('overfit testing: prediction of training set')
+        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
+        #format(min(f1_scores_train), max(f1_scores_train),
+        #sum(f1_scores_train)/float(len(f1_scores_train))))
+        #print()

         print('# ending naive bayes')
         print('#')

+    ######## only needed for resubstitution error ########
     def analyze_errors(dataset):
         '''calculates resubstitution error
         shows indices of false classified articles
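The rewritten make_naive_bayes evaluates one GaussianNB model per stratified fold instead of running a single grid search. Below is a condensed, self-contained sketch of that per-fold loop; the toy corpus, the binary 0/1 labels, and random_state are invented, and, unlike the committed code (which scores precision against the training fold), both recall and precision here are computed on the held-out fold, with sklearn's f1_score replacing the manual formula:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import recall_score, precision_score, f1_score

# invented toy corpus with binary labels (1 = merger-related, 0 = other)
X = pd.Series(['merger agreed', 'quarterly results', 'takeover bid', 'weather report'] * 5)
y = pd.Series([1, 0, 1, 0] * 5)

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=0)
classifier = GaussianNB()
recalls, precisions, f1s = [], [], []

for train, test in skf.split(X, y):
    cv = CountVectorizer()
    # fit the vectorizer on the training fold only, then transform both folds
    training_data = cv.fit_transform(X.iloc[train]).toarray()
    testing_data = cv.transform(X.iloc[test]).toarray()

    classifier.fit(training_data, y.iloc[train])
    predictions_test = classifier.predict(testing_data)

    # all three metrics scored on the held-out fold
    recalls.append(recall_score(y.iloc[test], predictions_test))
    precisions.append(precision_score(y.iloc[test], predictions_test))
    f1s.append(f1_score(y.iloc[test], predictions_test))

print('recall avg:', sum(recalls) / len(recalls))
print('precision avg:', sum(precisions) / len(precisions))
print('F1 avg:', sum(f1s) / len(f1s))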

SVM.py (6 changed lines)

@@ -51,10 +51,10 @@ class SVM:
         pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

-        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
+        grid = GridSearchCV(pipeline, {'perc__percentile': [50, 100],
                                        'SVC__kernel': ['linear','poly','rbf','sigmoid'],
-                                       'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1],
-                                       'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]},
+                                       'SVC__gamma': [0.01, 0.1],
+                                       'SVC__C': [0.01, 0.1]},
                             cv=skf,
                             scoring=make_scorer(f1_score))
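For comparison, a self-contained sketch of the reduced grid wired into the same SelectPercentile + SVC pipeline; the make_classification toy data, the fold count, and random_state are stand-ins for the real Title/Text features that SVM.py builds elsewhere:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# invented binary-classification data in place of the article features
X, y = make_classification(n_samples=60, n_features=20, random_state=0)

skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
pipeline = Pipeline([('perc', SelectPercentile()), ('SVC', SVC())])

# reduced search space from this commit: 2 * 4 * 2 * 2 = 32 candidates
grid = GridSearchCV(pipeline,
                    {'perc__percentile': [50, 100],
                     'SVC__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                     'SVC__gamma': [0.01, 0.1],
                     'SVC__C': [0.01, 0.1]},
                    cv=skf,
                    scoring=make_scorer(f1_score))

grid.fit(X, y)
print(grid.best_params_, grid.best_score_)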


@@ -27,6 +27,6 @@ dataset = CsvHandler.read_csv(file)
 # DecisionTree.make_tree(dataset)
 NaiveBayes.make_naive_bayes(dataset)
-SVM.make_svm(dataset)
+# SVM.make_svm(dataset)
 print('# ending program')