deleted .gitignore
This commit is contained in:
parent 52146158e2
commit 0b424835d8

.gitignore
@@ -1,221 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-.hypothesis/
-.pytest_cache/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-.python-version
-
-# celery beat schedule file
-celerybeat-schedule
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json

BagOfWords.py
@@ -4,6 +4,15 @@ Bag Of Words

 BagOfWords counts word stems in an article
 and adds new words to the global vocabulary.
+
+Note:
+The multinomial Naive Bayes classifier is suitable
+for classification with discrete features (e.g.,
+word counts for text classification).
+The multinomial distribution normally requires
+integer feature counts. However, in practice,
+fractional counts such as tf-idf may also work.
+=> taken into account via the 'relative_word_frequencies' parameter
 '''

 import re
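
The docstring note added above mirrors scikit-learn's MultinomialNB documentation. As a minimal, self-contained illustration (toy data, not part of this commit) of why fractional counts such as relative word frequencies still work in practice:

    import numpy as np
    from sklearn.naive_bayes import MultinomialNB

    # two toy documents as absolute word counts over a three-word vocabulary
    X_counts = np.array([[2, 0, 1],
                         [0, 3, 1]])
    # normalize rows to relative word frequencies (what the new flag produces)
    X_rel = X_counts / X_counts.sum(axis=1, keepdims=True)
    y = np.array([0, 1])

    clf = MultinomialNB()
    clf.fit(X_rel, y)            # fractional features are accepted in practice
    print(clf.predict(X_rel))    # -> [0 1]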

@@ -50,12 +59,15 @@ class BagOfWords:
         word = stemmer.stem(word)
         return word

-    def make_matrix(series, vocab, relative_word_frequencies):
+    def make_matrix(series, vocab, relative_word_frequencies=True):
         '''calculates word stem frequencies in input articles.
         returns matrix (DataFrame) with relative word frequencies
-        (0 <= values < 1) or absolute word frequencies (int).
+        (0 <= values < 1) if relative_word_frequencies=True or absolute
+        word frequencies (int) if relative_word_frequencies=False.
         (rows: different articles, columns: different words in vocab)
         '''
+        print('# BOW: calculating matrix')
+        print('#')
         # create list of tuples
         vectors = []
         for i in range(len(series)):
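
The body of make_matrix lies outside this hunk; the following hypothetical sketch (illustrative names, a plain split() standing in for the stemming pipeline above) shows the behavior the updated docstring describes:

    import pandas as pd

    def make_matrix_sketch(articles, vocab, relative_word_frequencies=True):
        # rows: different articles, columns: different words in vocab
        rows = []
        for text in articles:
            words = text.lower().split()              # stand-in for extract_words()
            counts = [words.count(v) for v in vocab]  # absolute frequencies (int)
            if relative_word_frequencies:
                total = len(words) or 1               # avoid division by zero
                counts = [c / total for c in counts]  # relative frequencies
            rows.append(counts)
        return pd.DataFrame(rows, columns=list(vocab))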

@@ -88,6 +100,8 @@ class BagOfWords:
         '''adds words of input articles to a global vocabulary.
         input: dataframe of all articles, return value: list of words
         '''
+        print('# BOW: making vocabulary of data set')
+        print('#')
         vocab = set()
         for text in series:
             vocab |= set(BagOfWords.extract_words(text))

NaiveBayes.py
@@ -12,96 +12,119 @@ independently to the probability that it belongs to its category,
 regardless of any possible correlations between these features.
 '''

-#!!
-# The multinomial Naive Bayes classifier is suitable
-#for classification with discrete features (e.g.,
-#word counts for text classification).
-#The multinomial distribution normally requires
-#integer feature counts. However, in practice,
-#fractional counts such as tf-idf may also work.
-
-# => only taken into account in my own BOW
-
 from BagOfWords import BagOfWords

 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
-from sklearn.metrics import f1_score, make_scorer
+from sklearn.metrics import recall_score, precision_score
 from sklearn.model_selection import StratifiedKFold
-from sklearn.model_selection import GridSearchCV
-from sklearn.pipeline import Pipeline
-from sklearn.naive_bayes import MultinomialNB
+from sklearn.naive_bayes import GaussianNB

-# MultinomialNB used instead of GaussianNB => OK?
-#from sklearn.naive_bayes import GaussianNB
-
 class NaiveBayes:

     def make_naive_bayes(dataset):
-        '''fits naive bayes model
+        '''fits naive bayes model with StratifiedKFold,
+        uses my BOW
         '''
         print('# starting naive bayes')
         print('#')

         # split data into text and label set
+        # join title and text
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']

-        # Bag of Words
-        print('# calculating bag of words')
-        print('#')
-
-        # fit the training data and then return the matrix
-
-        # TODO: why such different (bad) values with my BOW?
-        #X = BagOfWords.fit_transform(X, False)
-
-        X = CountVectorizer().fit_transform(X).toarray()
+        cv = CountVectorizer()

         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)

-        # use only most important features
-        selector = SelectPercentile()
-
-        pipeline = Pipeline([('perc', selector), ('NB', MultinomialNB())])
+        classifier = GaussianNB()

-        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
-                            'NB__alpha': [0.00000001, 0.0000001,
-                                          0.000001, 0.00001,
-                                          0.0001, 0.001, 0.01,
-                                          0.1]},
-                            cv=skf,
-                            scoring=make_scorer(f1_score))
-
-        print('# fit classifier')
-        print('#')
-
-        grid.fit(X,y)
-
-        # DataFrame of results
-        df_results = grid.cv_results_
-
-        # print results
-        ######################
-        print('RESULTS:')
-        print('#')
-        print('mean_test_score:')
-        print(df_results['mean_test_score'])
-        print('#')
-        print('mean of means:')
-        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
-        print('#')
-        print('best score:')
-        print(grid.best_score_)
-        print('#')
-        print('best parameters set found on development set:')
-        print(grid.best_params_)
-        print('#')
+        # lists for metrics
+        recall_scores = []
+        precision_scores = []
+        f1_scores = []
+
+        # for each fold
+        n = 0
+        for train, test in skf.split(X,y):
+
+            n += 1
+            print('# split no. ' + str(n))
+
+            # # own BOW => worse results
+            # vocab = BagOfWords.make_vocab(X[train])
+            # # fit the training data and then return the matrix
+            # training_data = BagOfWords.make_matrix(X[train], vocab)
+            # # transform testing data and return the matrix
+            # testing_data = BagOfWords.make_matrix(X[test], vocab)
+
+            # # using CountVectorizer:
+            # fit the training data and then return the matrix
+            training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # transform testing data and return the matrix
+            testing_data = cv.transform(X[test]).toarray()
+
+            # # apply select percentile
+            # selector = SelectPercentile(percentile=25)
+            # selector.fit(training_data, y[train])
+
+            # training_data_r = selector.transform(training_data)
+            # testing_data_r = selector.transform(testing_data)
+
+            # #fit classifier
+            # classifier.fit(training_data_r, y[train])
+            # #predict class
+            # predictions_train = classifier.predict(training_data_r)
+            # predictions_test = classifier.predict(testing_data_r)
+
+            #fit classifier
+            classifier.fit(training_data, y[train])
+            #predict class
+            predictions_train = classifier.predict(training_data)
+            predictions_test = classifier.predict(testing_data)
+
+            #print and store metrics
+            rec = recall_score(y[test], predictions_test)
+            print('rec: ' + str(rec))
+            recall_scores.append(rec)
+            prec = precision_score(y[train], predictions_train)
+            print('prec: ' + str(prec))
+            print('#')
+            precision_scores.append(prec)
+            # equation for f1 score
+            f1_scores.append(2 * (prec * rec)/(prec + rec))
+
+        ##########################
+        #print metrics of test set
+        print('-------------------------')
+        print('prediction of testing set:')
+        print('Precision score: min = {}, max = {}, average = {}'
+              .format(min(precision_scores),
+                      max(precision_scores),
+                      sum(precision_scores)/float(len(precision_scores))))
+        print('Recall score: min = {}, max = {}, average = {}'
+              .format(min(recall_scores),
+                      max(recall_scores),
+                      sum(recall_scores)/float(len(recall_scores))))
+        print('F1 score: min = {}, max = {}, average = {}'
+              .format(min(f1_scores),
+                      max(f1_scores),
+                      sum(f1_scores)/float(len(f1_scores))))
+        print()
+
+        ##### only for overfit testing ###########
+        #print('overfit testing: prediction of training set')
+        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
+        #format(min(f1_scores_train), max(f1_scores_train),
+        #sum(f1_scores_train)/float(len(f1_scores_train))))
+        #print()

         print('# ending naive bayes')
         print('#')

+    ######## only needed for resubstitution error ########
     def analyze_errors(dataset):
         '''calculates resubstitution error
         shows indices of false classified articles
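
For reference, a self-contained sketch of the new fold loop on synthetic data. One deliberate difference is flagged in the comments: precision is computed here on the held-out fold, while the hunk above passes y[train] and predictions_train to precision_score, so its reported precision is a resubstitution figure rather than a test-set one:

    import numpy as np
    from sklearn.metrics import precision_score, recall_score
    from sklearn.model_selection import StratifiedKFold
    from sklearn.naive_bayes import GaussianNB

    rng = np.random.RandomState(0)
    X = rng.rand(100, 5)             # stand-in for the bag-of-words matrix
    y = rng.randint(0, 2, size=100)  # stand-in for the labels

    skf = StratifiedKFold(n_splits=10, shuffle=True)
    classifier = GaussianNB()
    recall_scores, precision_scores, f1_scores = [], [], []

    for train, test in skf.split(X, y):
        classifier.fit(X[train], y[train])
        predictions_test = classifier.predict(X[test])
        rec = recall_score(y[test], predictions_test)
        # the diff uses y[train] and predictions_train here instead
        prec = precision_score(y[test], predictions_test)
        recall_scores.append(rec)
        precision_scores.append(prec)
        # harmonic mean, as in the diff; guard against an all-zero fold
        f1_scores.append(2 * (prec * rec) / (prec + rec) if prec + rec else 0.0)

    print('average F1:', sum(f1_scores) / len(f1_scores))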

SVM.py
@@ -51,10 +51,10 @@ class SVM:

         pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

-        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
+        grid = GridSearchCV(pipeline, {'perc__percentile': [50, 100],
                             'SVC__kernel': ['linear','poly','rbf','sigmoid'],
-                            'SVC__gamma': [0.0001, 0.001, 0.01, 0.1, 1],
-                            'SVC__C': [0.0001, 0.001, 0.01, 0.1, 1]},
+                            'SVC__gamma': [0.01, 0.1],
+                            'SVC__C': [0.01, 0.1]},
                             cv=skf,
                             scoring=make_scorer(f1_score))

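The narrowed grid shrinks the search space from 4 × 4 × 5 × 5 = 400 parameter combinations to 2 × 4 × 2 × 2 = 32; with the 10-split StratifiedKFold that is 320 model fits instead of 4,000.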

@@ -27,6 +27,6 @@ dataset = CsvHandler.read_csv(file)

 # DecisionTree.make_tree(dataset)
 NaiveBayes.make_naive_bayes(dataset)
-SVM.make_svm(dataset)
+# SVM.make_svm(dataset)

 print('# ending program')