callable scripts
parent ab578ae0c6
commit f934b5a1a0
@@ -67,7 +67,7 @@ class BagOfWords:
        (rows: different articles, columns: different words in vocab)
        '''
        print('# BOW: calculating matrix')
        print('#')
        print('# ...')
        # create list of tuples
        vectors = []
        for i in range(len(series)):
@@ -101,7 +101,7 @@ class BagOfWords:
        input: dataframe of all articles, return value: list of words
        '''
        print('# BOW: making vocabulary of data set')
        print('#')
        print('# ...')
        vocab = set()
        for text in series:
            vocab |= set(BagOfWords.extract_words(text))
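A minimal standalone sketch of the set-union vocabulary idiom in this hunk; extract_words is not shown here, so a simple lowercase letter-run split is assumed in its place:

# sketch only: extract_words is assumed to lowercase and split on non-letters
import re

def extract_words(text):
    return re.findall(r'[a-z]+', text.lower())

series = ['Company A buys Company B', 'B rejects the offer']
vocab = set()
for text in series:
    vocab |= set(extract_words(text))   # same union idiom as the diff
print(sorted(vocab))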
@@ -22,19 +22,9 @@ from sklearn.model_selection import StratifiedKFold

class DecisionTree:

    print('# starting program')
    print('#')

    file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('#')
    dataset = CsvHandler.read_csv(file)

    def make_tree(dataset):
        print('# starting decision tree')
        print('#')
        print('# fitting model')
        print('# ...')

        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']
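make_tree() joins title and text into one feature string before fitting. A hedged sketch of that setup with an assumed toy frame and a plain sklearn DecisionTreeClassifier (the diff does not show which estimator the class wraps):

# sketch: the toy frame and the estimator choice are assumptions
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

dataset = pd.DataFrame({'Title': ['Merger ahead', 'New phone'],
                        'Text': ['Company A buys B.', 'Specs leaked.'],
                        'Label': [1, 0]})

X = dataset['Title'] + ' ' + dataset['Text']   # same join as the diff
y = dataset['Label']

X_bow = CountVectorizer().fit_transform(X).toarray()
tree = DecisionTreeClassifier(random_state=0).fit(X_bow, y)
print(tree.predict(X_bow))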
@@ -114,8 +104,19 @@ class DecisionTree:
        # format(min(f1_scores_train), max(f1_scores_train),
        # sum(f1_scores_train)/float(len(f1_scores_train))))
        # print()
        print('# ending decision tree')
        print('#')

    DecisionTree.make_tree(dataset)
    print('# ending program')

#################################
print('# starting decision tree')
print('# ...')

file = 'classification_labelled_corrected.csv'

# read csv file
print('# reading dataset')
print('# ...')

dataset = CsvHandler.read_csv(file)

make_tree(dataset)

print('# ending decision tree')
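The driver block added after the '#################################' separator is what makes the module callable as a script, per the commit title. A common alternative idiom for the same goal (hypothetical here, not what the commit uses) is a __main__ guard, which skips the driver when the module is imported:

# hypothetical alternative, assuming the module's own imports are in scope
if __name__ == '__main__':
    print('# starting decision tree')
    dataset = CsvHandler.read_csv('classification_labelled_corrected.csv')
    make_tree(dataset)
    print('# ending decision tree')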
@@ -6,6 +6,8 @@ FilterKeywords searches for merger specific keywords
in an article and counts them.
'''

# toDo: change dict!

import re

from nltk.stem.porter import PorterStemmer
@@ -64,6 +66,3 @@ class FilterKeywords:
        returns number of keywords that are found.
        '''
        return sum(dict_keywords.values())
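FilterKeywords' counting step above reduces to one aggregation over a per-keyword tally. A hedged sketch of that idea; the keyword list and the count_keywords helper are illustrative assumptions:

# sketch: the keyword list and helper name are assumptions
import re
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
keywords = ['merger', 'acquisition', 'takeover']   # assumed example terms

def count_keywords(text):
    # stem every token, then tally occurrences of each stemmed keyword
    tokens = [stemmer.stem(w) for w in re.findall(r'[a-z]+', text.lower())]
    dict_keywords = {k: tokens.count(stemmer.stem(k)) for k in keywords}
    # same aggregation as the diff: total number of keyword hits
    return sum(dict_keywords.values())

print(count_keywords('Takeover talks: the merger was announced today.'))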
@@ -13,7 +13,7 @@ regardless of any possible correlations between these features.
'''

from BagOfWords import BagOfWords
from CsvReader import CsvReader
from CsvHandler import CsvHandler

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
@@ -23,22 +23,12 @@ from sklearn.naive_bayes import GaussianNB

class NaiveBayes:

    print('# starting program')
    print('#')

    file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('#')
    dataset = CsvHandler.read_csv(file)

    def make_naive_bayes(dataset):
        '''fits naive bayes model with StratifiedKFold,
        uses my BOW
        '''
        print('# starting naive bayes')
        print('#')
        print('# fitting model')
        print('# ...')

        # split data into text and label set
        # join title and text
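The docstring above ("fits naive bayes model with StratifiedKFold") maps onto a short runnable sketch; the toy articles and the fold count are assumptions:

# sketch of the StratifiedKFold + GaussianNB loop; data is assumed
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

texts = np.array(['merger announced', 'quarterly earnings',
                  'takeover bid', 'new product'])
y = np.array([1, 0, 1, 0])

# bag of words: rows are articles, columns are words in the vocab
X = CountVectorizer().fit_transform(texts).toarray()

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=5)
for train_idx, test_idx in skf.split(X, y):
    clf = GaussianNB().fit(X[train_idx], y[train_idx])
    print('F1:', f1_score(y[test_idx], clf.predict(X[test_idx])))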
@@ -132,9 +122,6 @@ class NaiveBayes:
        #sum(f1_scores_train)/float(len(f1_scores_train))))
        #print()

        print('# ending naive bayes')
        print('#')

    ######## only needed for resubstitution error ########
    def analyze_errors(dataset):
        '''calculates resubstitution error
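analyze_errors() computes the resubstitution error: the classifier is scored on the very data it was fitted on, so the reported F1 is an optimistic bound. A minimal sketch with assumed toy features:

# resubstitution: fit and evaluate on the same set (toy data assumed)
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score

X = np.array([[2, 0], [0, 2], [3, 1], [1, 3]])
y = np.array([1, 0, 1, 0])

clf = GaussianNB().fit(X, y)
predictions = clf.predict(X)   # predict on the training set itself
print('F1 score: ', format(f1_score(y, predictions)))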
@@ -172,5 +159,19 @@ class NaiveBayes:
        #print metrics
        print('F1 score: ', format(f1_score(y_train_test, predictions)))

#################################
print('# starting naive bayes')
print('# ...')

file = 'classification_labelled_corrected.csv'

# read csv file
print('# reading dataset')
print('# ...')

dataset = CsvHandler.read_csv(file)

make_naive_bayes(dataset)

print('#')
print('# ending program')
print('# ending naive bayes')
@@ -28,6 +28,7 @@ class Requester:

        # print message
        print('# retrieving articles from webhose.io')
        print('# ...')

        # personal API key
        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
@@ -57,6 +58,7 @@ class Requester:
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
        print('# sorting out other sources than reuters')
        print('# ...')

        # two-dimensional list of all articles
        list_articles = []
@@ -91,3 +93,8 @@ class Requester:
                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
        # save csv
        CsvHandler.write_csv(df, filestring)

print('# starting requester')
print('# ...')
save_articles_from_webhoseio()
print('# ending requester')
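A hedged sketch of the retrieval flow this class wraps, written against the webhoseio SDK as I understand it; the query string and the post field names are assumptions, and the API token stays a placeholder:

# assumed webhoseio usage; verify params and fields against the webhose.io docs
import webhoseio

webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
output = webhoseio.query("filterWebContent",
                         {"q": "site:reuters.com language:english"})
sum_posts = output['totalResults']
num_downloads = int(sum_posts / 100)   # the API pages 100 posts at a time

for post in output['posts']:
    # assumed fields matching the DataFrame columns in the diff
    print(post['published'], post['title'])

output = webhoseio.get_next()          # fetch the next page of results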
SVM.py

@@ -13,6 +13,7 @@ to belong to a category based on which side of the gap they fall.
'''

from BagOfWords import BagOfWords
from CsvHandler import CsvHandler

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
@@ -26,8 +27,8 @@ class SVM:

    def make_svm(dataset):

        print('# starting SVM')
        print('#')
        print('# fitting model')
        print('# ...')

        # split data into text and label set
@@ -38,7 +39,7 @@ class SVM:

        # Bag of Words
        print('# calculating bag of words')
        print('#')
        print('# ...')
        # fit the training data and then return the matrix
        #X = BagOfWords.fit_transform(X)
        X = CountVectorizer().fit_transform(X).toarray()
@@ -59,7 +60,7 @@ class SVM:
                            scoring=make_scorer(f1_score))

        print('# fit classifier')
        print('#')
        print('# ...')

        grid.fit(X,y)
@@ -83,5 +84,18 @@ class SVM:
        print(grid.best_params_)
        print()

        print('# ending SVM')
        print('#')

########################
print('# starting svm')
print('# ...')

file = 'classification_labelled_corrected.csv'

# read csv file
print('# reading dataset')
print('# ...')

dataset = CsvHandler.read_csv(file)

make_svm(dataset)

print('# ending svm')
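The grid search above keys its model selection to make_scorer(f1_score). A runnable sketch of that configuration; the parameter grid and the toy data are assumptions, not the commit's actual values:

# sketch of GridSearchCV scored by F1; grid values are assumed
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score
from sklearn.feature_extraction.text import CountVectorizer

texts = ['merger deal', 'earnings call', 'takeover offer', 'product launch']
y = np.array([1, 0, 1, 0])
X = CountVectorizer().fit_transform(texts).toarray()

param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear']}
grid = GridSearchCV(SVC(), param_grid,
                    cv=StratifiedKFold(n_splits=2),
                    scoring=make_scorer(f1_score))
grid.fit(X, y)
print(grid.best_params_)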
@@ -13,15 +13,19 @@ from NaiveBayes import NaiveBayes
from SVM import SVM

print('# starting program')
print('#')
print('# ...')

# only if a new unlabeled(!) data set is required:
# Requester.save_articles_from_webhoseio()

file = 'classification_labelled_corrected.csv'

# read csv file
print('# reading dataset')
print('#')
print('# ...')
dataset = CsvHandler.read_csv(file)

# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)