callable scripts

Anne Lorenz, 2018-09-17 21:16:19 +02:00
commit f934b5a1a0, parent ab578ae0c6
8 changed files with 131 additions and 105 deletions

BagOfWords.py

@@ -3,7 +3,7 @@ Bag Of Words
 ============
 BagOfWords counts word stems in an article
 and adds new words to the global vocabulary.
 Note:
 The multinomial Naive Bayes classifier is suitable
@@ -67,7 +67,7 @@ class BagOfWords:
         (rows: different articles, columns: different words in vocab)
         '''
         print('# BOW: calculating matrix')
-        print('#')
+        print('# ...')
         # create list of tuples
         vectors = []
         for i in range(len(series)):
@@ -101,7 +101,7 @@ class BagOfWords:
         input: dataframe of all articles, return value: list of words
         '''
         print('# BOW: making vocabulary of data set')
-        print('#')
+        print('# ...')
         vocab = set()
         for text in series:
             vocab |= set(BagOfWords.extract_words(text))
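
For reference, the vocabulary pass above amounts to a set union over the word
stems of every article. A minimal standalone sketch (the tokenizer is a
simplification; the real extract_words in BagOfWords.py may differ):

    import re
    from nltk.stem.porter import PorterStemmer

    def extract_words(text):
        # lowercase, keep letter runs only, reduce each token to its Porter stem
        stemmer = PorterStemmer()
        return [stemmer.stem(w) for w in re.findall(r'[a-z]+', text.lower())]

    def make_vocab(series):
        # union of all word stems across the article series
        vocab = set()
        for text in series:
            vocab |= set(extract_words(text))
        return sorted(vocab)

    # usage: vocab = make_vocab(dataset['Title'] + ' ' + dataset['Text'])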

DecisionTree.py

@@ -22,19 +22,9 @@ from sklearn.model_selection import StratifiedKFold
 class DecisionTree:

-    print('# starting program')
-    print('#')
-
-    file = 'classification_labelled_corrected.csv'
-
-    # read csv file
-    print('# reading dataset')
-    print('#')
-    dataset = CsvHandler.read_csv(file)

     def make_tree(dataset):

-        print('# starting decision tree')
-        print('#')
+        print('# fitting model')
+        print('# ...')

         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
@@ -42,9 +32,9 @@ class DecisionTree:
         #count_vector = CountVectorizer()

         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)

         # lists for metrics predicted on test/train set
         f1_scores = []
         f1_scores_train = []
@@ -114,8 +104,19 @@ class DecisionTree:
         # format(min(f1_scores_train), max(f1_scores_train),
         # sum(f1_scores_train)/float(len(f1_scores_train))))
         # print()

-        print('# ending decision tree')
-        print('#')
-
-DecisionTree.make_tree(dataset)
-print('# ending program')
+    #################################
+    print('# starting decision tree')
+    print('# ...')
+
+    file = 'classification_labelled_corrected.csv'
+
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+    dataset = CsvHandler.read_csv(file)
+
+    make_tree(dataset)
+
+    print('# ending decision tree')
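
Note that the new driver block sits at class scope, so it also runs on import.
A sketch of the more common idiom for callable scripts, a module-level
__main__ guard (a hypothetical alternative, not what the commit does):

    from CsvHandler import CsvHandler

    if __name__ == '__main__':
        print('# starting decision tree')
        print('# ...')
        dataset = CsvHandler.read_csv('classification_labelled_corrected.csv')
        DecisionTree.make_tree(dataset)   # class defined above in this module
        print('# ending decision tree')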

FilterKeywords.py

@@ -2,68 +2,67 @@
 Filter Keywords
 ===============
 FilterKeywords searches for merger specific keywords
 in an article and counts them.
 '''
-# toDo: change dict!

 import re
 from nltk.stem.porter import PorterStemmer

 class FilterKeywords:

     def search_keywords(dict_input):
         '''extracts relevant key-value pairs of an article's input dictionary,
         output are the contained keywords and their count.
         '''
         # # list of regular expressions that match merger specific keywords
         # regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',
         #               r'business combinations?', r'combined compan(y|ies)',
         #               r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up',
         #               r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?',
         #               r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?',
         #               r'purchase', r'(sell(s|ers?|ing)?|sold)']

         keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
                         'acquisition', 'acquire', 'acquisitions', 'acquires',
                         'combine', 'combines', 'combination', 'combined',
                         'joint', 'venture', 'JV', 'takeover', 'take-over',
                         'tie-up', 'deal', 'deals', 'transaction',
                         'transactions', 'approve', 'approves', 'approved',
                         'approving', 'approval', 'approvals', 'buy', 'buys',
                         'buying', 'bought', 'buyout', 'buy-out', 'purchase',
                         'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']

         # reduce words to stem
         stemmer = PorterStemmer()
         for i in range(len(keyword_list)):
             keyword_list[i] = stemmer.stem(keyword_list[i])

         # remove duplicates
         keywords = set(keyword_list)

         # counts keywords in article
         dict_keywords = {}

         # search for matches in dictionary of input article
         for key in dict_input.keys():
             # iterate over all regular expressions
             for kword in keywords:
                 if re.match(kword, key):
                     # if match, increase value of matching key
                     if str(kword) in dict_keywords:
                         dict_keywords[str(kword)] += dict_input[key]
                     else:
                         dict_keywords[str(kword)] = dict_input[key]

         return dict_keywords

     def count_keywords(dict_keywords):
         '''input: dict with article's keywords (key) and their count (value),
         returns number of keywords that are found.
         '''
         return sum(dict_keywords.values())
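
A short usage sketch: search_keywords expects a dict that maps an article's
word stems to their counts and returns only the merger-related entries. The
sample input below is made up:

    # hypothetical word-stem counts for one article
    dict_input = {'merger': 2, 'approv': 1, 'market': 5, 'bank': 3}

    dict_keywords = FilterKeywords.search_keywords(dict_input)
    print(dict_keywords)                                 # matched keywords only
    print(FilterKeywords.count_keywords(dict_keywords))  # total number of hits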

NER.py

@@ -3,10 +3,10 @@ Named Entity Recognition (NER)
 ==============================
 NER takes a text as input and searches for names of persons, companies
 and countries.
 '''

 from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
 from nltk.tree import Tree

 ''' TODO: misclassified:
 [('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
@@ -16,7 +16,7 @@ from nltk.tree import Tree
 '''

 class NER:

     def get_ne_with_label(text):
         labels = []
         names = []
@@ -32,29 +32,29 @@ class NER:
             #print(chunk.label(), ' '.join(c[0] for c in chunk))
         return list(zip(labels, names))

 test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
 \nmostly fell in light volumes on Tuesday as energy shares
 tracked \nfalls in global oil prices, while weaknesses in banking shares
 \namid concerns about loans to an ailing steel firm sent the Thai
 \nindex to a one-week closing low. \nBangkok's SET index shed nearly
 1 percent after four \nsessions of gains. The index closed at 1,379.32,
 its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
 the most actively \ntraded by turnover, dropped 2.8 percent to a near
 one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
 \nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
 downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
 to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
 lower than 130 percent, the \ndesired level we think and hence the need for
 more provisioning \nin the following quarters,\" the broker said in a report.
 \nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
 creditors, dropped 1 percent. The steel firm \nand its three creditors
 agreed on Monday to consider options to \nrestructure debt worth over
 50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
 slides for a third \nsession, Singapore gave up early gains and Indonesia
 \nhit a near one-week low, all with trading volumes below \nthe 30-day
 average ahead of a public holiday on Thursday. \nAmong top losers in the
 region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
 Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
 \namid uncertainty over global demand. \nFor Asian Companies click.'''

 print(NER.get_ne_with_label(test_article))
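
The commented print in the last hunk points at the standard nltk pattern
behind get_ne_with_label; a condensed sketch (the real method may differ in
detail):

    from nltk import ne_chunk, pos_tag, word_tokenize
    from nltk.tree import Tree

    def get_ne_with_label(text):
        labels, names = [], []
        for chunk in ne_chunk(pos_tag(word_tokenize(text))):
            if isinstance(chunk, Tree):   # subtrees are the named entities
                labels.append(chunk.label())   # e.g. PERSON, GPE, ORGANIZATION
                names.append(' '.join(c[0] for c in chunk))
        return list(zip(labels, names))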

NaiveBayes.py

@@ -1,6 +1,6 @@
 '''
 Naive Bayes Classifier
 ======================
 Naive Bayes is a probabilistic classifier that is able to predict a
 probability distribution over a set of classes, rather than only
@@ -13,7 +13,7 @@ regardless of any possible correlations between these features.
 '''

 from BagOfWords import BagOfWords
-from CsvReader import CsvReader
+from CsvHandler import CsvHandler

 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
@@ -23,22 +23,12 @@ from sklearn.naive_bayes import GaussianNB

 class NaiveBayes:

-    print('# starting program')
-    print('#')
-
-    file = 'classification_labelled_corrected.csv'
-
-    # read csv file
-    print('# reading dataset')
-    print('#')
-    dataset = CsvHandler.read_csv(file)

     def make_naive_bayes(dataset):
         '''fits naive bayes model with StratifiedKFold,
         uses my BOW
         '''
-        print('# starting naive bayes')
-        print('#')
+        print('# fitting model')
+        print('# ...')

         # split data into text and label set
         # join title and text
@@ -120,7 +110,7 @@ class NaiveBayes:
                       max(recall_scores),
                       sum(recall_scores)/float(len(recall_scores))))
         print('F1 score: min = {}, max = {}, average = {}'
               .format(min(f1_scores),
                       max(f1_scores),
                       sum(f1_scores)/float(len(f1_scores))))
         print()
@@ -130,11 +120,8 @@ class NaiveBayes:
         #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
         #format(min(f1_scores_train), max(f1_scores_train),
         #sum(f1_scores_train)/float(len(f1_scores_train))))
         #print()

-        print('# ending naive bayes')
-        print('#')

     ######## only needed for resubstitution error ########
     def analyze_errors(dataset):
         '''calculates resubstitution error
@@ -143,7 +130,7 @@ class NaiveBayes:
         '''
         X_train_test = dataset['Title'] + ' ' + dataset['Text']
         y_train_test = dataset['Label']

         count_vector = CountVectorizer()
         # fit the training data and then return the matrix
         training_data = count_vector.fit_transform(X_train_test).toarray()
@@ -172,5 +159,19 @@ class NaiveBayes:
         #print metrics
         print('F1 score: ', format(f1_score(y_train_test, predictions)))

+    #################################
+    print('# starting naive bayes')
+    print('# ...')
+
+    file = 'classification_labelled_corrected.csv'
+
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+    dataset = CsvHandler.read_csv(file)
+
+    make_naive_bayes(dataset)
+
 print('#')
-print('# ending program')
+print('# ending naive bayes')
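
The docstring's 'fits naive bayes model with StratifiedKFold' corresponds to
the usual scikit-learn loop; a condensed sketch with the feature extraction
left out (assumes X and y are already numeric arrays):

    from sklearn.metrics import f1_score
    from sklearn.model_selection import StratifiedKFold
    from sklearn.naive_bayes import GaussianNB

    def cross_validate(X, y, n_splits=10):
        f1_scores = []
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True)
        for train, test in skf.split(X, y):
            classifier = GaussianNB()
            classifier.fit(X[train], y[train])
            f1_scores.append(f1_score(y[test], classifier.predict(X[test])))
        return min(f1_scores), max(f1_scores), sum(f1_scores) / len(f1_scores)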

Requester.py

@@ -28,7 +28,8 @@ class Requester:
         # print message
         print('# retrieving articles from webhose.io')
+        print('# ...')

         # personal API key
         webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
@@ -57,6 +58,7 @@ class Requester:
         num_downloads = int(sum_posts / 100)
         print('# collecting first {} articles'.format(num_downloads * 100))
         print('# sorting out other sources than reuters')
+        print('# ...')

         # twodimensional list of all articles
         list_articles = []
@@ -90,4 +92,9 @@ class Requester:
         df = pd.DataFrame(data=list_articles,
                           columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
         # save csv
         CsvHandler.write_csv(df, filestring)
+
+    print('# starting requester')
+    print('# ...')
+    save_articles_from_webhoseio()
+    print('# ending requester')
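
The sum_posts / 100 arithmetic reflects webhose.io's paging: each query
returns at most 100 posts, so the loop fetches num_downloads batches. A rough
sketch of that pattern, assuming the webhoseio client's config/query/get_next
calls (the query string here is illustrative, not from the diff):

    import webhoseio

    webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
    output = webhoseio.query("filterWebContent", {"q": "merger"})

    sum_posts = output['totalResults']
    num_downloads = int(sum_posts / 100)   # 100 posts per batch

    posts = []
    for _ in range(num_downloads):
        posts.extend(output['posts'])
        output = webhoseio.get_next()      # next batch of up to 100 posts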

SVM.py

@@ -13,6 +13,7 @@ to belong to a category based on which side of the gap they fall.
 '''

 from BagOfWords import BagOfWords
+from CsvHandler import CsvHandler

 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
@@ -26,8 +27,8 @@ class SVM:
     def make_svm(dataset):

-        print('# starting SVM')
-        print('#')
+        print('# fitting model')
+        print('# ...')

         # split data into text and label set
@@ -38,7 +39,7 @@ class SVM:
         # Bag of Words
         print('# calculating bag of words')
-        print('#')
+        print('# ...')

         # fit the training data and then return the matrix
         #X = BagOfWords.fit_transform(X)
         X = CountVectorizer().fit_transform(X).toarray()
@@ -59,7 +60,7 @@ class SVM:
                             scoring=make_scorer(f1_score))

         print('# fit classifier')
-        print('#')
+        print('# ...')

         grid.fit(X,y)
@@ -83,5 +84,18 @@ class SVM:
         print(grid.best_params_)
         print()

-        print('# ending SVM')
-        print('#')
+    ########################
+    print('# starting svm')
+    print('# ...')
+
+    file = 'classification_labelled_corrected.csv'
+
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+    dataset = CsvHandler.read_csv(file)
+
+    make_svm(dataset)
+
+    print('# ending svm')
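
The grid-search pieces visible above (make_scorer(f1_score), grid.fit(X, y),
grid.best_params_) fit together as in this sketch; the toy corpus and the
parameter grid are assumptions, since the diff does not show them:

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import f1_score, make_scorer
    from sklearn.model_selection import GridSearchCV, StratifiedKFold
    from sklearn.svm import SVC

    # toy corpus; the real script builds X from Title + Text
    texts = ['firm agrees merger deal', 'firm acquires rival',
             'rain expected on Tuesday', 'sunny weather ahead'] * 5
    y = [1, 1, 0, 0] * 5
    X = CountVectorizer().fit_transform(texts).toarray()

    grid = GridSearchCV(SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
                        cv=StratifiedKFold(n_splits=10, shuffle=True),
                        scoring=make_scorer(f1_score))
    grid.fit(X, y)
    print(grid.best_params_)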

main script (filename not shown)

@@ -13,15 +13,19 @@ from NaiveBayes import NaiveBayes
 from SVM import SVM

 print('# starting program')
-print('#')
+print('# ...')

+# only if a new unlabeled(!) data set is required:
+# Requester.save_articles_from_webhoseio()

 file = 'classification_labelled_corrected.csv'

 # read csv file
 print('# reading dataset')
-print('#')
+print('# ...')
 dataset = CsvHandler.read_csv(file)

+# DecisionTree.make_tree(dataset)
 NaiveBayes.make_naive_bayes(dataset)
 # SVM.make_svm(dataset)