deleted .gitignore
This commit is contained in:
parent 52146158e2
commit 0b424835d8
@@ -1,221 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json
BagOfWords.py
@@ -4,6 +4,15 @@ Bag Of Words
BagOfWords counts word stems in an article
and adds new words to the global vocabulary.

Note:
The multinomial Naive Bayes classifier is suitable
for classification with discrete features (e.g.,
word counts for text classification).
The multinomial distribution normally requires
integer feature counts. However, in practice,
fractional counts such as tf-idf may also work.
=> taken into account via the 'relative_word_frequencies' parameter
'''

import re
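The note above is the rationale for the new 'relative_word_frequencies' switch: a multinomial Naive Bayes model formally expects integer counts, but scikit-learn's MultinomialNB also accepts fractional counts such as tf-idf weights. A minimal sketch of that idea (the documents and labels below are made up for illustration and are not part of this commit):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# toy corpus; any fractional (tf-idf style) counts work as features
docs = ['stocks rally after merger news', 'sunny weather expected today',
        'merger talks continue', 'light rain in the afternoon']
labels = [1, 0, 1, 0]

vectors = TfidfVectorizer().fit_transform(docs)   # fractional counts, not integers
model = MultinomialNB().fit(vectors, labels)
print(model.predict(vectors))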
@@ -50,12 +59,15 @@ class BagOfWords:
word = stemmer.stem(word)
return word

def make_matrix(series, vocab, relative_word_frequencies):
def make_matrix(series, vocab, relative_word_frequencies=True):
'''calculates word stem frequencies in input articles.
returns matrix (DataFrame) with relative word frequencies
(0 <= values < 1) or absolute word frequencies (int).
(0 <= values < 1) if relative_word_frequencies=True or absolute
word frequencies (int) if relative_word_frequencies=False.
(rows: different articles, columns: different words in vocab)
'''
print('# BOW: calculating matrix')
print('#')
# create list of tuples
vectors = []
for i in range(len(series)):
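For orientation, this is the shape of result that make_matrix describes: one row per article, one column per vocabulary word, holding either absolute counts or counts divided by the article length. A small hand-rolled illustration (not the committed implementation, which works on word stems and a pandas Series):

import pandas as pd

vocab = ['merger', 'stock', 'earning']
articles = [['stock', 'merger', 'merger'], ['earning', 'beat']]

rows = []
for words in articles:
    counts = [words.count(v) for v in vocab]        # absolute word frequencies (int)
    rows.append([c / len(words) for c in counts])   # relative word frequencies

matrix = pd.DataFrame(rows, columns=vocab)          # rows: articles, columns: words in vocab
print(matrix)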
@@ -88,6 +100,8 @@ class BagOfWords:
'''adds words of input articles to a global vocabulary.
input: dataframe of all articles, return value: list of words
'''
print('# BOW: making vocabulary of data set')
print('#')
vocab = set()
for text in series:
vocab |= set(BagOfWords.extract_words(text))
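The vocabulary is simply the union of the word sets of all articles. A compact stand-in for the loop above (text.lower().split() replaces BagOfWords.extract_words, which additionally stems the words):

articles = ['Stocks rally after merger', 'Earnings beat estimates']

vocab = set()
for text in articles:
    vocab |= set(text.lower().split())   # stand-in for BagOfWords.extract_words

print(sorted(vocab))   # make_vocab returns the words as a list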
153 NaiveBayes.py
@@ -12,96 +12,119 @@ independently to the probability that it belongs to its category,
regardless of any possible correlations between these features.
'''

#!!
# The multinomial Naive Bayes classifier is suitable
#for classification with discrete features (e.g.,
#word counts for text classification).
#The multinomial distribution normally requires
#integer feature counts. However, in practice,
#fractional counts such as tf-idf may also work.

# => only taken into account in my own BOW

from BagOfWords import BagOfWords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# MultinomialNB used instead of GaussianNB => OK?
#from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import GaussianNB

class NaiveBayes:

def make_naive_bayes(dataset):
'''fits naive bayes model
'''fits naive bayes model with StratifiedKFold,
uses my BOW
'''
print('# starting naive bayes')
print('#')

# split data into text and label set
# join title and text
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']

# Bag of Words
print('# calculating bag of words')
print('#')

# fit the training data and then return the matrix

# TODO: why such different (bad) values with my own BOW?
#X = BagOfWords.fit_transform(X, False)

X = CountVectorizer().fit_transform(X).toarray()
cv = CountVectorizer()

# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)

# use only most important features
selector = SelectPercentile()
skf = StratifiedKFold(n_splits = 10, shuffle=True)

pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])
classifier = GaussianNB()

grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
'NB__alpha': [0.00000001, 0.0000001,
0.000001, 0.00001,
0.0001, 0.001, 0.01,
0.1]},
cv=skf,
scoring=make_scorer(f1_score))

# lists for metrics
recall_scores = []
precision_scores = []
f1_scores = []

# for each fold
n = 0
for train, test in skf.split(X,y):

n += 1
print('# split no. ' + str(n))

print('# fit classifier')
print('#')

grid.fit(X,y)

# DataFrame of results
df_results = grid.cv_results_

# print results
######################
print('RESULTS:')
print('#')
print('mean_test_score:')
print(df_results['mean_test_score'])
print('#')
print('mean of means:')
print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
print('#')
print('best score:')
print(grid.best_score_)
print('#')
print('best parameters set found on development set:')
print(grid.best_params_)
print('#')

# # own BOW => worse results
# vocab = BagOfWords.make_vocab(X[train])
# # fit the training data and then return the matrix
# training_data = BagOfWords.make_matrix(X[train], vocab)
# # transform testing data and return the matrix
# testing_data = BagOfWords.make_matrix(X[test], vocab)

# # using CountVectorizer:
# fit the training data and then return the matrix
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()

# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])

# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)

# #fit classifier
# classifier.fit(training_data_r, y[train])
# #predict class
# predictions_train = classifier.predict(training_data_r)
# predictions_test = classifier.predict(testing_data_r)

#fit classifier
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)

#print and store metrics
rec = recall_score(y[test], predictions_test)
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y[train], predictions_train)
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))

##########################
#print metrics of test set
print('-------------------------')
print('prediction of testing set:')
print('Precision score: min = {}, max = {}, average = {}'
.format(min(precision_scores),
max(precision_scores),
sum(precision_scores)/float(len(precision_scores))))
print('Recall score: min = {}, max = {}, average = {}'
.format(min(recall_scores),
max(recall_scores),
sum(recall_scores)/float(len(recall_scores))))
print('F1 score: min = {}, max = {}, average = {}'
.format(min(f1_scores),
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()

##### only for overfit testing ###########
#print('overfit testing: prediction of training set')
#print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
#format(min(f1_scores_train), max(f1_scores_train),
#sum(f1_scores_train)/float(len(f1_scores_train))))
#print()

print('# ending naive bayes')
print('#')
print('#')

######## only needed for resubstitution error ########
def analyze_errors(dataset):
'''calculates resubstitution error
shows indices of false classified articles
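The new version wraps SelectPercentile and MultinomialNB in a Pipeline and tunes both over a StratifiedKFold split with GridSearchCV, scored by F1. A reduced, self-contained sketch of that setup (toy corpus and narrowed parameter grid for illustration; not the committed NaiveBayes.py):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# toy corpus standing in for dataset['Title'] + ' ' + dataset['Text']
texts = ['merger announced today', 'sunny afternoon walk',
         'big merger deal closed', 'rain expected tomorrow'] * 5
labels = [1, 0, 1, 0] * 5

X = CountVectorizer().fit_transform(texts).toarray()
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# feature selection and classifier tuned together in one pipeline
pipeline = Pipeline([('perc', SelectPercentile()), ('NB', MultinomialNB())])
grid = GridSearchCV(pipeline,
                    {'perc__percentile': [50, 100],
                     'NB__alpha': [0.001, 0.01, 0.1]},
                    cv=skf,
                    scoring=make_scorer(f1_score))
grid.fit(X, labels)
print(grid.best_score_, grid.best_params_)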
6 SVM.py
@@ -51,10 +51,10 @@ class SVM:

pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
grid = GridSearchCV(pipeline, {'perc__percentile': [50, 100],
'SVC__kernel': ['linear','poly','rbf','sigmoid'],
'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1],
'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]},
'SVC__gamma': [0.01, 0.1],
'SVC__C': [0.01, 0.1]},
cv=skf,
scoring=make_scorer(f1_score))
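This narrows the SVC search space considerably: with the four kernels kept as before, the grid drops from 4 * 4 * 5 * 5 = 400 to 2 * 4 * 2 * 2 = 32 candidate settings per cross-validation fold. A quick check of those counts (the parameter lists are copied from the diff; the arithmetic is only illustrative):

# candidate combinations before and after this change (4 kernels in both cases)
old = len([25, 50, 75, 100]) * 4 * len([0.0001, 0.001, 0.01, 0.1, 1]) ** 2
new = len([50, 100]) * 4 * len([0.01, 0.1]) ** 2
print(old, new)   # 400 32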
@@ -27,6 +27,6 @@ dataset = CsvHandler.read_csv(file)
# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
SVM.make_svm(dataset)
# SVM.make_svm(dataset)

print('# ending program')