callable scripts

This commit is contained in:
Anne Lorenz 2018-09-17 21:16:19 +02:00
parent ab578ae0c6
commit f934b5a1a0
8 changed files with 131 additions and 105 deletions

View File

@ -3,7 +3,7 @@ Bag Of Words
============
BagOfWords counts word stems in an article
and adds new words to the global vocabulary.
and adds new words to the global vocabulary.
Note:
The multinomial Naive Bayes classifier is suitable
@ -67,7 +67,7 @@ class BagOfWords:
(rows: different articles, columns: different words in vocab)
'''
print('# BOW: calculating matrix')
print('#')
print('# ...')
# create list of tuples
vectors = []
for i in range(len(series)):
@ -101,7 +101,7 @@ class BagOfWords:
input: dataframe of all articles, return value: list of words
'''
print('# BOW: making vocabulary of data set')
print('#')
print('# ...')
vocab = set()
for text in series:
vocab |= set(BagOfWords.extract_words(text))

View File

@ -22,19 +22,9 @@ from sklearn.model_selection import StratifiedKFold
class DecisionTree:
print('# starting program')
print('#')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('#')
dataset = CsvHandler.read_csv(file)
def make_tree(dataset):
print('# starting decision tree')
print('#')
print('# fitting model')
print('# ...')
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
@ -42,9 +32,9 @@ class DecisionTree:
#count_vector = CountVectorizer()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
skf = StratifiedKFold(n_splits = 10, shuffle=True)
# lists for metrics predicted on test/train set
# lists for metrics predicted on test/train set
f1_scores = []
f1_scores_train = []
@ -114,8 +104,19 @@ class DecisionTree:
# format(min(f1_scores_train), max(f1_scores_train),
# sum(f1_scores_train)/float(len(f1_scores_train))))
# print()
print('# ending decision tree')
print('#')
DecisionTree.make_tree(dataset)
print('# ending program')
#################################
print('# starting decision tree')
print('# ...')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('# ...')
dataset = CsvHandler.read_csv(file)
make_tree(dataset)
print('# ending decision tree')

View File

@ -2,68 +2,67 @@
Filter Keywords
===============
FilterKeywords searches for merger specific keywords
FilterKeywords searches for merger specific keywords
in an article and counts them.
'''
# TODO: change dict!
import re
from nltk.stem.porter import PorterStemmer
class FilterKeywords:
    '''Finds and counts merger specific keywords in an article's
    word counts.
    '''

    # Earlier, regex based variant of the keyword list (kept for reference):
    # regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',
    #               r'business combinations?', r'combined compan(y|ies)',
    #               r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up',
    #               r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?',
    #               r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?',
    #               r'purchase', r'(sell(s|ers?|ing)?|sold)']

    # unstemmed merger specific keywords; they are reduced to their stems
    # in search_keywords before being compared with the article's words
    _KEYWORDS = ('merge', 'merges', 'merged', 'merger', 'mergers',
                 'acquisition', 'acquire', 'acquisitions', 'acquires',
                 'combine', 'combines', 'combination', 'combined',
                 'joint', 'venture', 'JV', 'takeover', 'take-over',
                 'tie-up', 'deal', 'deals', 'transaction',
                 'transactions', 'approve', 'approves', 'approved',
                 'approving', 'approval', 'approvals', 'buy', 'buys',
                 'buying', 'bought', 'buyout', 'buy-out', 'purchase',
                 'sell', 'sells', 'selling', 'sold', 'seller', 'buyer')

    @staticmethod
    def search_keywords(dict_input, stemmer=None):
        '''extracts the relevant key-value pairs of an article's input
        dictionary.

        input: dict_input maps a word to its count,
               stemmer (optional) is any object with a stem(word) method;
               a PorterStemmer is created if none is given.
        return value: dict that maps each keyword stem found to the summed
                      count of all input words starting with that stem.
        '''
        if stemmer is None:
            stemmer = PorterStemmer()

        # reduce words to stem, the set removes duplicate stems
        keywords = {stemmer.stem(word) for word in FilterKeywords._KEYWORDS}

        # counts keywords in article
        dict_keywords = {}

        # search for matchings in dictionary of input article
        for key, count in dict_input.items():
            for kword in keywords:
                # stems are plain strings, so a prefix test is all that
                # is needed (no regular expression involved)
                # NOTE(review): a word is counted once for EVERY stem it
                # starts with (e.g. 'merger' for 'merge' and 'merger');
                # confirm that this double counting is intended
                if key.startswith(kword):
                    dict_keywords[kword] = dict_keywords.get(kword, 0) + count

        return dict_keywords

    @staticmethod
    def count_keywords(dict_keywords):
        '''input: dict with article's keywords (key) and their count (value),
        returns number of keywords that are found.
        '''
        return sum(dict_keywords.values())

48
NER.py
View File

@ -3,10 +3,10 @@ Named Entity Recognition (NER)
==============================
NER takes a text as input and searches for names of persons, companies
and countries.
and countries.
'''
from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
from nltk.tree import Tree
from nltk.tree import Tree
''' TODO: misclassified:
[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
@ -16,7 +16,7 @@ from nltk.tree import Tree
'''
class NER:
def get_ne_with_label(text):
labels = []
names = []
@ -32,29 +32,29 @@ class NER:
#print(chunk.label(), ' '.join(c[0] for c in chunk))
return list(zip(labels, names))
test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
the most actively \ntraded by turnover, dropped 2.8 percent to a near
one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
\nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
lower than 130 percent, the \ndesired level we think and hence the need for
more provisioning \nin the following quarters,\" the broker said in a report.
\nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
creditors, dropped 1 percent. The steel firm \nand its three creditors
agreed on Monday to consider options to \nrestructure debt worth over
50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
slides for a third \nsession, Singapore gave up early gains and Indonesia
\nhit a near one-week low, all with trading volumes below \nthe 30-day
average ahead of a public holiday on Thursday. \nAmong top losers in the
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
the most actively \ntraded by turnover, dropped 2.8 percent to a near
one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
\nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
lower than 130 percent, the \ndesired level we think and hence the need for
more provisioning \nin the following quarters,\" the broker said in a report.
\nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
creditors, dropped 1 percent. The steel firm \nand its three creditors
agreed on Monday to consider options to \nrestructure debt worth over
50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
slides for a third \nsession, Singapore gave up early gains and Indonesia
\nhit a near one-week low, all with trading volumes below \nthe 30-day
average ahead of a public holiday on Thursday. \nAmong top losers in the
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''
print(NER.get_ne_with_label(test_article))

View File

@ -1,6 +1,6 @@
'''
Naive Bayes Classifier
======================
======================
Naive Bayes is a probabilistic classifier that is able to predict a
probability distribution over a set of classes, rather than only
@ -13,7 +13,7 @@ regardless of any possible correlations between these features.
'''
from BagOfWords import BagOfWords
from CsvReader import CsvReader
from CsvHandler import CsvHandler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
@ -23,22 +23,12 @@ from sklearn.naive_bayes import GaussianNB
class NaiveBayes:
print('# starting program')
print('#')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('#')
dataset = CsvHandler.read_csv(file)
def make_naive_bayes(dataset):
'''fits naive bayes model with StratifiedKFold,
uses my BOW
'''
print('# starting naive bayes')
print('#')
print('# fitting model')
print('# ...')
# split data into text and label set
# join title and text
@ -120,7 +110,7 @@ class NaiveBayes:
max(recall_scores),
sum(recall_scores)/float(len(recall_scores))))
print('F1 score: min = {}, max = {}, average = {}'
.format(min(f1_scores),
.format(min(f1_scores),
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
@ -130,11 +120,8 @@ class NaiveBayes:
#print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
#format(min(f1_scores_train), max(f1_scores_train),
#sum(f1_scores_train)/float(len(f1_scores_train))))
#print()
#print()
print('# ending naive bayes')
print('#')
######## only needed for resubstitution error ########
def analyze_errors(dataset):
'''calculates resubstitution error
@ -143,7 +130,7 @@ class NaiveBayes:
'''
X_train_test = dataset['Title'] + ' ' + dataset['Text']
y_train_test = dataset['Label']
count_vector = CountVectorizer()
# fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train_test).toarray()
@ -172,5 +159,19 @@ class NaiveBayes:
#print metrics
print('F1 score: ', format(f1_score(y_train_test, predictions)))
#################################
print('# starting naive bayes')
print('# ...')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('# ...')
dataset = CsvHandler.read_csv(file)
make_naive_bayes(dataset)
print('#')
print('# ending program')
print('# ending naive bayes')

View File

@ -28,7 +28,8 @@ class Requester:
# print message
print('# retrieving articles from webhose.io')
print('# ...')
# personal API key
webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
@ -57,6 +58,7 @@ class Requester:
num_downloads = int(sum_posts / 100)
print('# collecting first {} articles'.format(num_downloads * 100))
print('# sorting out other sources than reuters')
print('# ...')
# twodimensional list of all articles
list_articles = []
@ -90,4 +92,9 @@ class Requester:
df = pd.DataFrame(data=list_articles,
columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
# save csv
CsvHandler.write_csv(df, filestring)
CsvHandler.write_csv(df, filestring)
print('# starting requester')
print('# ...')
save_articles_from_webhoseio()
print('# ending requester')

26
SVM.py
View File

@ -13,6 +13,7 @@ to belong to a category based on which side of the gap they fall.
'''
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
@ -26,8 +27,8 @@ class SVM:
def make_svm(dataset):
print('# starting SVM')
print('#')
print('# fitting model')
print('# ...')
# split data into text and label set
@ -38,7 +39,7 @@ class SVM:
# Bag of Words
print('# calculating bag of words')
print('#')
print('# ...')
# fit the training data and then return the matrix
#X = BagOfWords.fit_transform(X)
X = CountVectorizer().fit_transform(X).toarray()
@ -59,7 +60,7 @@ class SVM:
scoring=make_scorer(f1_score))
print('# fit classifier')
print('#')
print('# ...')
grid.fit(X,y)
@ -83,5 +84,18 @@ class SVM:
print(grid.best_params_)
print()
print('# ending SVM')
print('#')
########################
print('# starting svm')
print('# ...')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('# ...')
dataset = CsvHandler.read_csv(file)
make_svm(dataset)
print('# ending svm')

View File

@ -13,15 +13,19 @@ from NaiveBayes import NaiveBayes
from SVM import SVM
print('# starting program')
print('#')
print('# ...')
# only if new unlabeled(!) data set is required:
# Requester.save_articles_from_webhoseio()
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('#')
print('# ...')
dataset = CsvHandler.read_csv(file)
# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)