callable scripts

Anne Lorenz 2018-09-17 21:16:19 +02:00
parent ab578ae0c6
commit f934b5a1a0
8 changed files with 131 additions and 105 deletions
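The pattern is the same in every file below: the dataset loading and the call to the model-building function move from the top of the class body to a driver block at the end of the file, so each script can be run on its own. A minimal sketch of the resulting shape, using DecisionTree as the example. This is simplified, not the exact file contents; indentation is lost in this view, but the bare make_tree(dataset) call suggests the driver block sits at the bottom of the class body, where that name is directly visible.

# Minimal sketch of the "callable scripts" pattern this commit applies.
# Simplified: the real class contains the full model-fitting code, and
# CsvHandler is the project's own CSV helper (used throughout the diffs).
from CsvHandler import CsvHandler

class DecisionTree:

    def make_tree(dataset):
        print('# fitting model')
        print('# ...')
        # ... fit and evaluate a decision tree on `dataset` ...

    #################################
    # driver block: executes when the class body runs, i.e. both for
    # `python DecisionTree.py` and on import
    print('# starting decision tree')
    print('# ...')
    file = 'classification_labelled_corrected.csv'
    dataset = CsvHandler.read_csv(file)
    make_tree(dataset)
    print('# ending decision tree')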

BagOfWords.py
View File

@@ -67,7 +67,7 @@ class BagOfWords:
         (rows: different articles, columns: different words in vocab)
         '''
         print('# BOW: calculating matrix')
-        print('#')
+        print('# ...')
         # create list of tuples
         vectors = []
         for i in range(len(series)):
@@ -101,7 +101,7 @@ class BagOfWords:
         input: dataframe of all articles, return value: list of words
         '''
         print('# BOW: making vocabulary of data set')
-        print('#')
+        print('# ...')
         vocab = set()
         for text in series:
             vocab |= set(BagOfWords.extract_words(text))
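For context, the two methods touched here build the vocabulary and the document-term matrix. A rough standalone sketch of what such a bag-of-words pass computes, with hypothetical helper names standing in for the class's own extract_words and matrix code (rows are articles, columns are vocabulary words):

# Hypothetical, self-contained bag-of-words sketch; not the project's code.
import re

def extract_words(text):
    # naive tokenizer standing in for BagOfWords.extract_words
    return re.findall(r'[a-z]+', text.lower())

def make_vocab(series):
    # union of all words over all articles, as in the method above
    vocab = set()
    for text in series:
        vocab |= set(extract_words(text))
    return sorted(vocab)

def make_matrix(series, vocab):
    # one row per article, one column per vocab word, values are counts
    index = {word: i for i, word in enumerate(vocab)}
    matrix = [[0] * len(vocab) for _ in series]
    for row, text in zip(matrix, series):
        for word in extract_words(text):
            row[index[word]] += 1
    return matrix

texts = ['merger announced today', 'no merger today']
print(make_matrix(texts, make_vocab(texts)))
# [[1, 1, 0, 1], [0, 1, 1, 1]]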

DecisionTree.py
View File

@@ -22,19 +22,9 @@ from sklearn.model_selection import StratifiedKFold
 class DecisionTree:
-    print('# starting program')
-    print('#')
-    file = 'classification_labelled_corrected.csv'
-    # read csv file
-    print('# reading dataset')
-    print('#')
-    dataset = CsvHandler.read_csv(file)
     def make_tree(dataset):
-        print('# starting decision tree')
-        print('#')
+        print('# fitting model')
+        print('# ...')
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
@@ -114,8 +104,19 @@ class DecisionTree:
         # format(min(f1_scores_train), max(f1_scores_train),
         # sum(f1_scores_train)/float(len(f1_scores_train))))
         # print()
-        print('# ending decision tree')
-        print('#')
-    DecisionTree.make_tree(dataset)
-    print('# ending program')
+    #################################
+    print('# starting decision tree')
+    print('# ...')
+    file = 'classification_labelled_corrected.csv'
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+    dataset = CsvHandler.read_csv(file)
+    make_tree(dataset)
+    print('# ending decision tree')

FilterKeywords.py
View File

@@ -6,6 +6,8 @@ FilterKeywords searches for merger specific keywords
 in an article and counts them.
 '''
+# TODO: change the dict!
 import re
 from nltk.stem.porter import PorterStemmer
@@ -64,6 +66,3 @@ class FilterKeywords:
         returns number of keywords that are found.
         '''
         return sum(dict_keywords.values())
-
-
-
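The docstring and the closing return sum(dict_keywords.values()) outline the approach: record per-keyword hit counts in a dict, then sum them. A hedged sketch of that idea follows; the keyword list and helper name are made up, and the real class also stems words with PorterStemmer:

# Hypothetical sketch of the counting scheme described above;
# KEYWORDS and count_keywords are illustrative names, not the class's.
import re

KEYWORDS = ['merger', 'acquisition', 'takeover']  # stand-in keyword list

def count_keywords(text):
    words = re.findall(r'[a-z]+', text.lower())
    dict_keywords = {kw: words.count(kw) for kw in KEYWORDS}
    # same aggregation as the method shown above
    return sum(dict_keywords.values())

print(count_keywords('Takeover talks: a merger may follow the merger wave.'))  # 3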

NaiveBayes.py
View File

@@ -13,7 +13,7 @@ regardless of any possible correlations between these features.
 '''
 from BagOfWords import BagOfWords
-from CsvReader import CsvReader
+from CsvHandler import CsvHandler
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
@@ -23,22 +23,12 @@ from sklearn.naive_bayes import GaussianNB
 class NaiveBayes:
-    print('# starting program')
-    print('#')
-    file = 'classification_labelled_corrected.csv'
-    # read csv file
-    print('# reading dataset')
-    print('#')
-    dataset = CsvHandler.read_csv(file)
     def make_naive_bayes(dataset):
         '''fits naive bayes model with StratifiedKFold,
         uses my BOW
         '''
-        print('# starting naive bayes')
-        print('#')
+        print('# fitting model')
+        print('# ...')
         # split data into text and label set
         # join title and text
@@ -132,9 +122,6 @@ class NaiveBayes:
         #sum(f1_scores_train)/float(len(f1_scores_train))))
         #print()
-        print('# ending naive bayes')
-        print('#')
     ######## only needed for resubstitution error ########
     def analyze_errors(dataset):
         '''calculates resubstitution error
@@ -172,5 +159,19 @@ class NaiveBayes:
         #print metrics
         print('F1 score: ', format(f1_score(y_train_test, predictions)))
+    #################################
+    print('# starting naive bayes')
+    print('# ...')
+    file = 'classification_labelled_corrected.csv'
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+    dataset = CsvHandler.read_csv(file)
+    make_naive_bayes(dataset)
-    print('#')
-    print('# ending program')
+    print('# ending naive bayes')

Requester.py
View File

@@ -28,6 +28,7 @@ class Requester:
         # print message
         print('# retrieving articles from webhose.io')
+        print('# ...')
         # personal API key
         webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
@@ -57,6 +58,7 @@ class Requester:
         num_downloads = int(sum_posts / 100)
         print('# collecting first {} articles'.format(num_downloads * 100))
         print('# sorting out other sources than reuters')
+        print('# ...')
         # two-dimensional list of all articles
         list_articles = []
@@ -91,3 +93,8 @@ class Requester:
             columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
         # save csv
         CsvHandler.write_csv(df, filestring)
+    print('# starting requester')
+    print('# ...')
+    save_articles_from_webhoseio()
+    print('# ending requester')
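Requester drives the webhose.io Python client: config(token=...) authenticates, a query returns posts in pages of 100, and the download loop walks num_downloads pages. A rough sketch of that paging flow follows; the query parameters are illustrative, and the project's real query, the reuters-only filtering, and the CSV writing are omitted:

# Rough sketch of the webhoseio paging flow used here; the query string
# is a placeholder, not the project's actual query.
import webhoseio

webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")  # personal API key
output = webhoseio.query("filterWebContent", {"q": "merger language:english"})

sum_posts = output['totalResults']
num_downloads = int(sum_posts / 100)   # the API serves 100 posts per page
for _ in range(num_downloads):
    for post in output['posts']:
        pass  # collect post fields (title, text, ...) into a list here
    output = webhoseio.get_next()      # fetch the next page of 100 posts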

SVM.py
View File

@@ -13,6 +13,7 @@ to belong to a category based on which side of the gap they fall.
 '''
 from BagOfWords import BagOfWords
+from CsvHandler import CsvHandler
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
@@ -26,8 +27,8 @@ class SVM:
     def make_svm(dataset):
-        print('# starting SVM')
-        print('#')
+        print('# fitting model')
+        print('# ...')
         # split data into text and label set
@@ -38,7 +39,7 @@ class SVM:
         # Bag of Words
         print('# calculating bag of words')
-        print('#')
+        print('# ...')
         # fit the training data and then return the matrix
         #X = BagOfWords.fit_transform(X)
         X = CountVectorizer().fit_transform(X).toarray()
@@ -59,7 +60,7 @@ class SVM:
             scoring=make_scorer(f1_score))
         print('# fit classifier')
-        print('#')
+        print('# ...')
         grid.fit(X,y)
@@ -83,5 +84,18 @@ class SVM:
         print(grid.best_params_)
         print()
-        print('# ending SVM')
-        print('#')
+    ########################
+    print('# starting svm')
+    print('# ...')
+    file = 'classification_labelled_corrected.csv'
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+    dataset = CsvHandler.read_csv(file)
+    make_svm(dataset)
+    print('# ending svm')
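The fragments visible above (scoring=make_scorer(f1_score), grid.fit(X,y), grid.best_params_) show the model-selection pattern: an exhaustive grid search scored by F1. A minimal self-contained sketch of that pattern follows; the parameter grid and the toy data are illustrative, since the project's real grid is not part of this diff:

# Minimal grid-search sketch matching the calls visible in make_svm;
# the param_grid and the synthetic data are illustrative only.
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)
grid = GridSearchCV(SVC(),
                    param_grid={'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
                    scoring=make_scorer(f1_score))
grid.fit(X, y)
print(grid.best_params_)  # best hyper-parameter combination found
print(grid.best_score_)   # its cross-validated F1 score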

View File

@@ -13,15 +13,19 @@ from NaiveBayes import NaiveBayes
 from SVM import SVM
 print('# starting program')
-print('#')
+print('# ...')
 # only if new unlabeled(!) data set is required:
 # Requester.save_articles_from_webhoseio()
 file = 'classification_labelled_corrected.csv'
 # read csv file
 print('# reading dataset')
-print('#')
+print('# ...')
 dataset = CsvHandler.read_csv(file)
 # DecisionTree.make_tree(dataset)
 NaiveBayes.make_naive_bayes(dataset)
 # SVM.make_svm(dataset)
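One design note: because the new driver blocks run as part of the class body (or module), each script also executes its pipeline when it is merely imported, so the from NaiveBayes import NaiveBayes above triggers NaiveBayes' own driver before this program calls it again. The usual Python idiom for a callable-but-importable script is a main guard, sketched here as an alternative, not as part of this commit:

# Sketch of the standard main-guard idiom (not part of this commit):
# the pipeline runs only when the file is executed directly, not on import.
from CsvHandler import CsvHandler
from NaiveBayes import NaiveBayes

if __name__ == '__main__':
    print('# starting program')
    print('# ...')
    file = 'classification_labelled_corrected.csv'
    dataset = CsvHandler.read_csv(file)
    NaiveBayes.make_naive_bayes(dataset)
    print('# ending program')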