added .gitignore file

parent 3f98aff635
commit 1195a161d6

.gitignore
@@ -0,0 +1,221 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
BagOfWords.py
@@ -12,7 +12,7 @@ import pandas as pd

 from nltk.stem.porter import PorterStemmer

-class BagOfWords():
+class BagOfWords:

     def extract_words(text):
         '''takes article as argument, removes numbers,
@@ -46,7 +46,8 @@ class BagOfWords():

     def make_matrix(series, vocab):
         '''calculates word stem frequencies in input articles.
-        returns matrix (DataFrame) with relative word frequencies (0 <= values < 1)
+        returns matrix (DataFrame) with relative word frequencies
+        (0 <= values < 1)
         (rows: different articles, columns: different words in vocab)
         '''
         # create list of tuples
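The make_matrix docstring above describes relative word-stem frequencies: each article becomes one row whose entries are word counts divided by the article's total word count. A minimal self-contained sketch of that idea, using made-up tokens and plain pandas rather than the repo's helpers:

    import pandas as pd

    vocab = ['merger', 'bank', 'approv']        # hypothetical stemmed vocabulary
    articles = [['merger', 'bank', 'merger'],   # already tokenized and stemmed
                ['approv', 'bank']]

    vectors = []
    for words in articles:
        # relative frequency: occurrences of each vocab word / article length
        vectors.append(tuple(words.count(w) / len(words) for w in vocab))

    df_vectors = pd.DataFrame.from_records(vectors, columns=vocab)
    print(df_vectors)   # row 0: merger = 2/3, bank = 1/3, approv = 0.0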
@@ -67,7 +68,9 @@ class BagOfWords():
                 vector[i] += 1/word_count
             # add single vector as tuple
             vectors.append(tuple(vector))
-        df_vectors = pd.DataFrame.from_records(vectors, index=None, columns=vocab)
+        df_vectors = pd.DataFrame.from_records(vectors,
+                                               index=None,
+                                               columns=vocab)
         return df_vectors

     def make_vocab(series):
@@ -87,41 +90,49 @@ class BagOfWords():
         '''creates list of all words that will be ignored
         '''
         # standard stopwords from nltk.corpus stopwords('english')
-        stop_words = ['a', 'about', 'above', 'after', 'again', 'against', 'ain',
-                      'all', 'am', 'an', 'and', 'any', 'are', 'aren', 'aren\'t',
-                      'as', 'at', 'be', 'because', 'been', 'before', 'being',
-                      'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
-                      'couldn\'t', 'd', 'did', 'didn', 'didn\'t', 'do', 'does',
-                      'doesn', 'doesn\'t', 'doing', 'don', 'don\'t', 'down',
-                      'during', 'each', 'few', 'for', 'from', 'further', 'had',
-                      'hadn', 'hadn\'t', 'has', 'hasn', 'hasn\'t', 'have', 'haven',
-                      'haven\'t', 'having', 'he', 'her', 'here', 'hers', 'herself',
-                      'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
-                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just', 'll',
-                      'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more', 'most',
-                      'mustn', 'mustn\'t', 'my', 'myself', 'needn', 'needn\'t',
-                      'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once',
-                      'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
-                      'over', 'own', 're', 's', 'same', 'shan', 'shan\'t', 'she',
-                      'she\'s', 'should', 'should\'ve', 'shouldn', 'shouldn\'t',
-                      'so', 'some', 'such', 't', 'than', 'that', 'that\'ll', 'the',
-                      'their', 'theirs', 'them', 'themselves', 'then', 'there',
-                      'these', 'they', 'this', 'those', 'through', 'to', 'too',
-                      'under', 'until', 'up', 've', 'very', 'was', 'wasn', 'wasn\'t',
-                      'we', 'were', 'weren', 'weren\'t', 'what', 'when', 'where',
-                      'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
-                      'won\'t', 'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
-                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself', 'yourselves']
+        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
+                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
+                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
+                      'before', 'being', 'below', 'between', 'both', 'but',
+                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
+                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
+                      'don', 'don\'t', 'down', 'during', 'each', 'few',
+                      'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
+                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
+                      'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
+                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
+                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
+                      'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
+                      'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
+                      'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
+                      'on', 'once', 'only', 'or', 'other', 'our', 'ours',
+                      'ourselves', 'out', 'over', 'own', 're', 's', 'same',
+                      'shan', 'shan\'t', 'she', 'she\'s', 'should',
+                      'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
+                      'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
+                      'theirs', 'them', 'themselves', 'then', 'there',
+                      'these', 'they', 'this', 'those', 'through', 'to',
+                      'too', 'under', 'until', 'up', 've', 'very', 'was',
+                      'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
+                      'what', 'when', 'where', 'which', 'while', 'who',
+                      'whom', 'why', 'will', 'with', 'won', 'won\'t',
+                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
+                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
+                      'yourselves']

         # add specific words
-        stop_words.extend(['reuters', 'also', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday'])
+        stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
+                           'wednesday', 'thursday', 'friday'])

+        # => does this make sense?:
         # remove the word 'not' from stop words
-        stop_words.remove('not')
+        #stop_words.remove('not')

         for i in range(len(stop_words)):

             # remove punctuation marks and strip endings from abbreviations
             #stop_words[i] = re.split(r'\W', stop_words[i])[0]

             # reduce word to stem
             stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
         # transform list to set to eliminate duplicates
CsvHandler.py
@@ -9,7 +9,7 @@ import csv

 import pandas as pd

-class CsvHandler():
+class CsvHandler:

     def read_csv(csv_file):
         df = pd.read_csv(csv_file,
DecisionTree.py
@@ -20,7 +20,7 @@ from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import f1_score
 from sklearn.model_selection import StratifiedKFold

-class DecisionTree():
+class DecisionTree:

     def make_tree(dataset):
FilterKeywords.py
@@ -10,20 +10,30 @@ import re

 from nltk.stem.porter import PorterStemmer

-class FilterKeywords():
+class FilterKeywords:

     def search_keywords(dict_input):
-        '''extracts relevant key-value pairs of an article's input dictionary.
+        '''extracts relevant key-value pairs of an article's input dictionary,
         output are the contained keywords and their count.
         '''

-        keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers', 'acquisition',
-                        'acquire', 'acquisitions', 'acquires', 'combine', 'combines',
-                        'combination', 'combined', 'joint', 'venture', 'JV', 'takeover',
-                        'take-over', 'tie-up', 'deal', 'deals', 'transaction', 'transactions',
-                        'approve', 'approves', 'approved', 'approving', 'approval',
-                        'approvals', 'buy', 'buys', 'buying', 'bought', 'buyout', 'buy-out',
-                        'purchase', 'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']
+        # # list of regular expressions that match merger specific keywords
+        # regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',
+        #               r'business combinations?', r'combined compan(y|ies)',
+        #               r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up',
+        #               r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?',
+        #               r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?',
+        #               r'purchase', r'(sell(s|ers?|ing)?|sold)']
+
+        keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
+                        'acquisition', 'acquire', 'acquisitions', 'acquires',
+                        'combine', 'combines', 'combination', 'combined',
+                        'joint', 'venture', 'JV', 'takeover', 'take-over',
+                        'tie-up', 'deal', 'deals', 'transaction',
+                        'transactions', 'approve', 'approves', 'approved',
+                        'approving', 'approval', 'approvals', 'buy', 'buys',
+                        'buying', 'bought', 'buyout', 'buy-out', 'purchase',
+                        'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']

         # reduce words to stem
         stemmer = PorterStemmer()

@@ -50,7 +60,7 @@ class FilterKeywords():
         return dict_keywords

     def count_keywords(dict_keywords):
-        '''input: dict with article's keywords (key) and their count (value).
+        '''input: dict with article's keywords (key) and their count (value),
         returns number of keywords that are found.
         '''
         return sum(dict_keywords.values())
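Both the stop-word list above and this keyword list are reduced to stems before matching, which is why so many surface forms ('merge', 'merges', 'merged', ...) are listed: after stemming, several of them collapse onto the same entry and can be deduplicated. A standalone sketch of what NLTK's PorterStemmer does to a few of these words (it only prints the stems rather than asserting exact outputs, since those depend on the Porter implementation):

    from nltk.stem.porter import PorterStemmer  # requires nltk

    stemmer = PorterStemmer()
    for word in ['merges', 'merger', 'acquisitions', 'approving', 'buying']:
        # many inflected forms map onto one stem
        print(word, '->', stemmer.stem(word))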
NaiveBayes.py
@@ -2,37 +2,35 @@
 Naive Bayes Classifier
 ======================

-Naive Bayes is a probabilistic classifier that is able to predict,
-given an observation of an input, a probability distribution over a set of classes,
-rather than only outputting the most likely class that the observation should belong to.
+Naive Bayes is a probabilistic classifier that is able to predict a
+probability distribution over a set of classes, rather than only
+outputting the most likely class that the observation should belong to.
 'Naive' means that it assumes that the value of a particular feature
 (word in an article) is independent of the value of any other feature,
-given the class variable (label). It considers each of these features
-to contribute independently to the probability that it belongs to its category,
+given the label. It considers each of these features to contribute
+independently to the probability that it belongs to its category,
 regardless of any possible correlations between these features.
 '''
 from BagOfWords import BagOfWords
 from CsvHandler import CsvHandler

-from sklearn.feature_extraction.text import CountVectorizer
-from sklearn.feature_selection import SelectPercentile
+#from sklearn.feature_extraction.text import CountVectorizer
+#from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import recall_score, precision_score
 from sklearn.model_selection import StratifiedKFold
-from sklearn.model_selection import train_test_split
+#from sklearn.model_selection import train_test_split
 from sklearn.naive_bayes import GaussianNB

-# toDo: for Julian, try it without SelectPercentile for now
-
-class NaiveBayes():
+class NaiveBayes:

     def make_naive_bayes(dataset):
-        '''fits naive bayes model with StratifiedKFold, uses my BOW
-        '''
+        '''fits naive bayes model with StratifiedKFold,
+        uses my BOW
+        '''
         print('# starting naive bayes')
         print()

-        # alternative: use only articles' header => may give better results
+        # join title and text
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
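The docstring's point that Naive Bayes predicts a probability distribution over classes, not just a label, is visible directly through scikit-learn's predict_proba. A minimal sketch on fabricated feature vectors (not the repo's data; labels and values are invented for illustration):

    import numpy as np
    from sklearn.naive_bayes import GaussianNB

    # toy relative-frequency vectors (rows: articles, columns: word stems)
    X = np.array([[0.6, 0.2, 0.0],
                  [0.5, 0.3, 0.1],
                  [0.0, 0.1, 0.7],
                  [0.1, 0.0, 0.8]])
    y = np.array([1, 1, 0, 0])                   # 1 = merger article, 0 = other

    clf = GaussianNB().fit(X, y)
    print(clf.predict([[0.4, 0.2, 0.1]]))        # most likely class only
    print(clf.predict_proba([[0.4, 0.2, 0.1]]))  # full distribution over classes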
@@ -56,18 +54,11 @@ class NaiveBayes():
             # transform testing data and return the matrix
             testing_data = BagOfWords.make_matrix(X[test], vocab)

-            # apply select percentile
-            selector = SelectPercentile(percentile=25)
-            selector.fit(training_data, y[train])
-
-            training_data_r = selector.transform(training_data)
-            testing_data_r = selector.transform(testing_data)
-
             #fit classifier
-            classifier.fit(training_data_r, y[train])
+            classifier.fit(training_data, y[train])
             #predict class
-            predictions_train = classifier.predict(training_data_r)
-            predictions_test = classifier.predict(testing_data_r)
+            predictions_train = classifier.predict(training_data)
+            predictions_test = classifier.predict(testing_data)

             #store metrics
             rec = recall_score(y[test], predictions_test)
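Between this hunk and the next, the unchanged code turns each fold's precision and recall into an F1 score via the harmonic mean, f1 = 2 * (prec * rec) / (prec + rec) — the same formula that appears in the commented-out copy below. A quick worked check with invented scores:

    prec, rec = 0.8, 0.6

    # the harmonic mean punishes imbalance: F1 sits closer to the weaker score
    f1 = 2 * (prec * rec) / (prec + rec)
    print(round(f1, 4))   # 0.6857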
@@ -80,73 +71,146 @@ class NaiveBayes():
             #print metrics of test set
             print('prediction of testing set:')
             print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'
-                  .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
+                  .format(min(f1_scores), max(f1_scores),
+                          sum(f1_scores)/float(len(f1_scores))))
             print()
             #print('overfit testing: prediction of training set')
             #print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
-            #format(min(f1_scores_train), max(f1_scores_train),sum(f1_scores_train)/float(len(f1_scores_train))))
+            #format(min(f1_scores_train), max(f1_scores_train),
+            #sum(f1_scores_train)/float(len(f1_scores_train))))
             #print()

             print('# ending naive bayes')
             print()

+    # def make_naive_bayes_selectpercentile(dataset):
+        # '''fits naive bayes model with StratifiedKFold, uses my BOW
+        # feature selection: select 0.25-percentile
+        # '''
+
+        # print('# starting naive bayes')
+        # print()
+
+        # # alternative: use only articles' header => may give better results
+        # X = dataset['Title'] + ' ' + dataset['Text']
+        # y = dataset['Label']
+
+        # # use stratified k-fold cross-validation as split method
+        # skf = StratifiedKFold(n_splits = 10, shuffle=True)
+
+        # classifier = GaussianNB()
+
+        # # lists for metrics
+        # recall_scores = []
+        # precision_scores = []
+        # f1_scores = []
+
+        # # for each fold
+        # n = 0
+        # for train, test in skf.split(X,y):
+            # # BOW
+            # vocab = BagOfWords.make_vocab(X[train])
+            # # fit the training data and then return the matrix
+            # training_data = BagOfWords.make_matrix(X[train], vocab)
+            # # transform testing data and return the matrix
+            # testing_data = BagOfWords.make_matrix(X[test], vocab)
+
+            # # apply select percentile
+            # selector = SelectPercentile(percentile=25)
+            # selector.fit(training_data, y[train])
+
+            # training_data_r = selector.transform(training_data)
+            # testing_data_r = selector.transform(testing_data)
+
+            # #fit classifier
+            # classifier.fit(training_data_r, y[train])
+            # #predict class
+            # predictions_train = classifier.predict(training_data_r)
+            # predictions_test = classifier.predict(testing_data_r)
+
+            # #store metrics
+            # rec = recall_score(y[test], predictions_test)
+            # recall_scores.append(rec)
+            # prec = precision_score(y[train], predictions_train)
+            # precision_scores.append(prec)
+            # # equation for f1 score
+            # f1_scores.append(2 * (prec * rec)/(prec + rec))
+
+        # #print metrics of test set
+        # print('prediction of testing set:')
+        # print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'
+            # .format(min(f1_scores), max(f1_scores), sum(f1_scores)/float(len(f1_scores))))
+        # print()
+        # #print('overfit testing: prediction of training set')
+        # #print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
+        # #format(min(f1_scores_train), max(f1_scores_train),
+        # #sum(f1_scores_train)/float(len(f1_scores_train))))
+        # #print()
+
+        # print('# ending naive bayes')
+        # print()

-    def make_naive_bayes_CV(dataset):
-        '''alternative: uses CountVectorizer (faster)
-        '''
-        # alternative: use only articles' header => may give better results
-        X = dataset['Title'] + '.' + dataset['Text'] + '.'
-        y = dataset['Label']
-
-        # use stratified k-fold cross-validation as split method
-        skf = StratifiedKFold(n_splits = 10, shuffle=True)
-
-        count_vector = CountVectorizer()
-
-        classifier = GaussianNB()
-
-        # lists for metrics predicted on test/train set
-        f1_scores, f1_scores_train = []
-
-        # for each fold (10 times)
-        # fold number
-        n = 0
-        for train, test in skf.split(X,y):
-
-            # fit the training data and then return the matrix
-            training_data = count_vector.fit_transform(X[train], y[train]).toarray()
-            # transform testing data and return the matrix
-            testing_data = count_vector.transform(X[test]).toarray()
-
-            # apply select percentile
-            selector = SelectPercentile(percentile=25)
-            selector.fit(training_data, y[train])
-
-            training_data_r = selector.transform(training_data)
-            testing_data_r = selector.transform(testing_data)
-
-            #fit classifier
-            classifier.fit(training_data_r, y[train])
-
-            #predict class
-            predictions_train = classifier.predict(training_data_r)
-            predictions_test = classifier.predict(testing_data_r)
-
-            #store metrics predicted on test set
-            f1_scores.append(f1_score(y[test], predictions_test))
-
-            #store metrics predicted on train set
-            f1_scores_train.append(f1_score(y[train], predictions_train))
-
-        #print metrics of test set
-        print('--------------------')
-        print('prediction of testing set:')
-        print('F1 score: min = {}, max = {}, average = {}'.format(min(f1_scores), max(f1_scores),sum(f1_scores)/float(len(f1_scores))))
-        print()
-        print('prediction of training set:')
-        print('F1 score: min = {}, max = {}, average = {}'.format(min(f1_scores_train), max(f1_scores_train),sum(f1_scores_train)/float(len(f1_scores_train))))
-        print()
+    # def make_naive_bayes_CV(dataset):
+        # '''alternative: uses CountVectorizer (faster)
+        # '''
+        # # alternative: use only articles' header => may give better results
+        # X = dataset['Title'] + '.' + dataset['Text'] + '.'
+        # y = dataset['Label']
+
+        # # use stratified k-fold cross-validation as split method
+        # skf = StratifiedKFold(n_splits = 10, shuffle=True)
+
+        # count_vector = CountVectorizer()
+
+        # classifier = GaussianNB()
+
+        # # lists for metrics predicted on test/train set
+        # f1_scores, f1_scores_train = []
+
+        # # for each fold (10 times)
+        # # fold number
+        # n = 0
+        # for train, test in skf.split(X,y):
+
+            # # fit the training data and then return the matrix
+            # training_data = count_vector.fit_transform(X[train], y[train]).toarray()
+            # # transform testing data and return the matrix
+            # testing_data = count_vector.transform(X[test]).toarray()
+
+            # # apply select percentile
+            # selector = SelectPercentile(percentile=25)
+            # selector.fit(training_data, y[train])
+
+            # training_data_r = selector.transform(training_data)
+            # testing_data_r = selector.transform(testing_data)
+
+            # #fit classifier
+            # classifier.fit(training_data_r, y[train])
+
+            # #predict class
+            # predictions_train = classifier.predict(training_data_r)
+            # predictions_test = classifier.predict(testing_data_r)
+
+            # #store metrics predicted on test set
+            # f1_scores.append(f1_score(y[test], predictions_test))
+
+            # #store metrics predicted on train set
+            # f1_scores_train.append(f1_score(y[train], predictions_train))
+
+        # #print metrics of test set
+        # print('--------------------')
+        # print('prediction of testing set:')
+        # print('F1 score: min = {}, max = {}, average = {}'
+            # .format(min(f1_scores), max(f1_scores),
+                # sum(f1_scores)/float(len(f1_scores))))
+        # print()
+        # print('prediction of training set:')
+        # print('F1 score: min = {}, max = {}, average = {}'
+            # .format(min(f1_scores_train), max(f1_scores_train),
+                # sum(f1_scores_train)/float(len(f1_scores_train))))
+        # print()

     # def analyze_errors_cv(dataset):
         # '''calculates resubstitution error

@@ -181,7 +245,8 @@ class NaiveBayes():
             # if y_train_test[i] != predictions[i]:
                 # n += 1
                 # print('error no.{}'.format(n))
-                # print('prediction at index {} is: {}, but actual is: {}'.format(i, predictions[i], y_train_test[i]))
+                # print('prediction at index {} is: {}, but actual is: {}'
+                    # .format(i, predictions[i], y_train_test[i]))
                 # print(X_train_test[i])
                 # print(y_train_test[i])
                 # print()
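For reference, the deleted make_naive_bayes_CV built its document-term matrix with scikit-learn's CountVectorizer instead of the hand-rolled BagOfWords. A minimal self-contained sketch of that building block on toy strings (get_feature_names_out assumes scikit-learn >= 1.0):

    from sklearn.feature_extraction.text import CountVectorizer

    docs = ['Bank A acquires Bank B.', 'The weather was sunny today.']

    count_vector = CountVectorizer()
    # fit_transform learns the vocabulary and returns a sparse document-term matrix
    training_data = count_vector.fit_transform(docs).toarray()

    print(count_vector.get_feature_names_out())
    print(training_data)   # raw token counts, one row per document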