callable scripts
parent ab578ae0c6 · commit f934b5a1a0
@@ -3,7 +3,7 @@ Bag Of Words
============

BagOfWords counts word stems in an article
and adds new words to the global vocabulary.

Note:
The multinomial Naive Bayes classifier is suitable
@@ -67,7 +67,7 @@ class BagOfWords:
(rows: different articles, columns: different words in vocab)
'''
print('# BOW: calculating matrix')
-print('#')
+print('# ...')
# create list of tuples
vectors = []
for i in range(len(series)):
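Only the opening lines of the matrix routine are visible in this hunk. A hypothetical reconstruction of the counting loop it plausibly runs — the method name, extract_words and vocab are assumptions borrowed from the surrounding class, not code from this commit:

import numpy as np

def make_matrix(series, vocab):
    # one row per article, one column per vocabulary stem
    matrix = []
    for text in series:
        words = BagOfWords.extract_words(text)      # stems of one article
        matrix.append([words.count(v) for v in vocab])
    return np.array(matrix)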
@@ -101,7 +101,7 @@ class BagOfWords:
input: dataframe of all articles, return value: list of words
'''
print('# BOW: making vocabulary of data set')
-print('#')
+print('# ...')
vocab = set()
for text in series:
vocab |= set(BagOfWords.extract_words(text))
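The set-union line above is the entire vocabulary builder. In isolation, with str.split standing in for BagOfWords.extract_words:

vocab = set()
for text in ['bank approves merger', 'merger talks']:
    vocab |= set(text.split())     # stand-in for BagOfWords.extract_words
print(sorted(vocab))               # ['approves', 'bank', 'merger', 'talks']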
@@ -22,19 +22,9 @@ from sklearn.model_selection import StratifiedKFold

class DecisionTree:

-print('# starting program')
-print('#')
-
-file = 'classification_labelled_corrected.csv'
-
-# read csv file
-print('# reading dataset')
-print('#')
-dataset = CsvHandler.read_csv(file)
-
def make_tree(dataset):
print('# starting decision tree')
print('#')
print('# fitting model')
print('# ...')

X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
@@ -42,9 +32,9 @@ class DecisionTree:
#count_vector = CountVectorizer()

# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits=10, shuffle=True)

# lists for metrics predicted on test/train set
f1_scores = []
f1_scores_train = []
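How skf is consumed falls outside this hunk; a self-contained sketch of the usual loop that fills f1_scores, with make_classification supplying stand-in data instead of the repository's corpus:

from sklearn.datasets import make_classification
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=100, random_state=0)   # stand-in data
skf = StratifiedKFold(n_splits=10, shuffle=True)
f1_scores = []
for train_idx, test_idx in skf.split(X, y):
    model = DecisionTreeClassifier().fit(X[train_idx], y[train_idx])
    f1_scores.append(f1_score(y[test_idx], model.predict(X[test_idx])))
print(sum(f1_scores) / len(f1_scores))                      # fold average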
@@ -114,8 +104,19 @@ class DecisionTree:
# format(min(f1_scores_train), max(f1_scores_train),
# sum(f1_scores_train)/float(len(f1_scores_train))))
# print()
print('# ending decision tree')
print('#')

-DecisionTree.make_tree(dataset)
-print('# ending program')
+#################################
+print('# starting decision tree')
+print('# ...')
+
+file = 'classification_labelled_corrected.csv'
+
+# read csv file
+print('# reading dataset')
+print('# ...')
+
+dataset = CsvHandler.read_csv(file)
+
+make_tree(dataset)
+
+print('# ending decision tree')
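The appended block makes the module runnable as a script, but it also executes on a plain import of DecisionTree. The usual guard for that — shown as a sketch, not what this commit does — is:

if __name__ == '__main__':     # runs only when executed as a script
    print('# starting decision tree')
    dataset = CsvHandler.read_csv('classification_labelled_corrected.csv')
    make_tree(dataset)
    print('# ending decision tree')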
@@ -2,68 +2,67 @@
Filter Keywords
===============

FilterKeywords searches for merger specific keywords
in an article and counts them.
'''

# TODO: change dict!

import re

from nltk.stem.porter import PorterStemmer

class FilterKeywords:

def search_keywords(dict_input):
'''extracts relevant key-value pairs of an article's input dictionary,
output are the contained keywords and their count.
'''

# # list of regular expressions that match merger specific keywords
# regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',
# r'business combinations?', r'combined compan(y|ies)',
# r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up',
# r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?',
# r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?',
# r'purchase', r'(sell(s|ers?|ing)?|sold)']

keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
'acquisition', 'acquire', 'acquisitions', 'acquires',
'combine', 'combines', 'combination', 'combined',
'joint', 'venture', 'JV', 'takeover', 'take-over',
'tie-up', 'deal', 'deals', 'transaction',
'transactions', 'approve', 'approves', 'approved',
'approving', 'approval', 'approvals', 'buy', 'buys',
'buying', 'bought', 'buyout', 'buy-out', 'purchase',
'sell', 'sells', 'selling', 'sold', 'seller', 'buyer']

# reduce words to stem
stemmer = PorterStemmer()
for i in range(len(keyword_list)):
keyword_list[i] = stemmer.stem(keyword_list[i])

# remove duplicates
keywords = set(keyword_list)

# counts keywords in article
dict_keywords = {}

# search for matches in dictionary of input article
for key in dict_input.keys():
# iterate over all keyword stems
for kword in keywords:
if re.match(kword, key):
# if match, increase value of matching key
if str(kword) in dict_keywords:
dict_keywords[str(kword)] += dict_input[key]
else:
dict_keywords[str(kword)] = dict_input[key]

return dict_keywords

def count_keywords(dict_keywords):
'''input: dict with article's keywords (key) and their count (value),
returns number of keywords that are found.
'''
return sum(dict_keywords.values())
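A hypothetical round trip through both methods; the input dict is assumed to map word stems to counts (as the BagOfWords module suggests) and the values are illustrative:

article_words = {'merger': 2, 'bank': 5, 'acquisit': 1}    # hypothetical input
found = FilterKeywords.search_keywords(article_words)
# note: re.match does prefix matching, so several stems can hit one key
# ('merg' and 'merger' both match 'merger'), inflating the result
print(FilterKeywords.count_keywords(found))                 # sum of all counts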
NER.py (48 changed lines)
@@ -3,10 +3,10 @@ Named Entity Recognition (NER)
==============================

NER takes a text as input and searches for names of persons, companies
and countries.
'''
from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
from nltk.tree import Tree

''' TODO: misclassified:
[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
@@ -16,7 +16,7 @@ from nltk.tree import Tree
'''

class NER:

def get_ne_with_label(text):
labels = []
names = []
@@ -32,29 +32,29 @@ class NER:
#print(chunk.label(), ' '.join(c[0] for c in chunk))
return list(zip(labels, names))

test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
the most actively \ntraded by turnover, dropped 2.8 percent to a near
one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
\nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
lower than 130 percent, the \ndesired level we think and hence the need for
more provisioning \nin the following quarters,\" the broker said in a report.
\nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
creditors, dropped 1 percent. The steel firm \nand its three creditors
agreed on Monday to consider options to \nrestructure debt worth over
50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
slides for a third \nsession, Singapore gave up early gains and Indonesia
\nhit a near one-week low, all with trading volumes below \nthe 30-day
average ahead of a public holiday on Thursday. \nAmong top losers in the
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''

print(NER.get_ne_with_label(test_article))
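Only the tail of get_ne_with_label appears in these hunks. A self-contained sketch of the standard nltk traversal it almost certainly performs (reconstructed, not copied from the repository):

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_ne_with_label(text):
    labels = []
    names = []
    # ne_chunk marks named-entity subtrees PERSON, ORGANIZATION, GPE, ...
    for chunk in ne_chunk(pos_tag(word_tokenize(text))):
        if isinstance(chunk, Tree):                   # a named entity
            labels.append(chunk.label())
            names.append(' '.join(token for token, tag in chunk))
    return list(zip(labels, names))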
@@ -1,6 +1,6 @@
'''
Naive Bayes Classifier
======================

Naive Bayes is a probabilistic classifier that is able to predict a
probability distribution over a set of classes, rather than only
@@ -13,7 +13,7 @@ regardless of any possible correlations between these features.
'''

from BagOfWords import BagOfWords
-from CsvReader import CsvReader
+from CsvHandler import CsvHandler

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
@@ -23,22 +23,12 @@ from sklearn.naive_bayes import GaussianNB

class NaiveBayes:

-print('# starting program')
-print('#')
-
-file = 'classification_labelled_corrected.csv'
-
-# read csv file
-print('# reading dataset')
-print('#')
-dataset = CsvHandler.read_csv(file)
-
def make_naive_bayes(dataset):
'''fits naive bayes model with StratifiedKFold,
uses my BOW
'''
print('# starting naive bayes')
print('#')
print('# fitting model')
print('# ...')

# split data into text and label set
# join title and text
@@ -120,7 +110,7 @@ class NaiveBayes:
max(recall_scores),
sum(recall_scores)/float(len(recall_scores))))
print('F1 score: min = {}, max = {}, average = {}'
.format(min(f1_scores),
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
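A sketch of how the per-fold lists feeding these printouts are typically filled; toy data again, with GaussianNB as in the imports above:

from sklearn.datasets import make_classification
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=100, random_state=0)
precision_scores, recall_scores, f1_scores = [], [], []
for train, test in StratifiedKFold(n_splits=10, shuffle=True).split(X, y):
    predictions = GaussianNB().fit(X[train], y[train]).predict(X[test])
    precision_scores.append(precision_score(y[test], predictions))
    recall_scores.append(recall_score(y[test], predictions))
    f1_scores.append(f1_score(y[test], predictions))
print('F1 score: min = {}, max = {}, average = {}'.format(
    min(f1_scores), max(f1_scores), sum(f1_scores) / len(f1_scores)))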
@@ -130,11 +120,8 @@ class NaiveBayes:
#print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
#format(min(f1_scores_train), max(f1_scores_train),
#sum(f1_scores_train)/float(len(f1_scores_train))))
#print()

print('# ending naive bayes')
print('#')

######## only needed for the resubstitution error ########
def analyze_errors(dataset):
'''calculates resubstitution error
@@ -143,7 +130,7 @@ class NaiveBayes:
'''
X_train_test = dataset['Title'] + ' ' + dataset['Text']
y_train_test = dataset['Label']

count_vector = CountVectorizer()
# fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train_test).toarray()
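analyze_errors trains and tests on the same data, so the resubstitution error it reports bounds the training fit only, not generalization. For orientation, CountVectorizer's fit_transform on a toy corpus:

from sklearn.feature_extraction.text import CountVectorizer

docs = ['bank approves merger', 'merger talks fail']    # toy corpus
matrix = CountVectorizer().fit_transform(docs).toarray()
# rows = documents, columns = sorted vocabulary:
# ['approves', 'bank', 'fail', 'merger', 'talks']
print(matrix)    # [[1 1 0 1 0]
                 #  [0 0 1 1 1]]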
@@ -172,5 +159,19 @@ class NaiveBayes:
#print metrics
print('F1 score: ', format(f1_score(y_train_test, predictions)))

-print('#')
-print('# ending program')
+#################################
+print('# starting naive bayes')
+print('# ...')
+
+file = 'classification_labelled_corrected.csv'
+
+# read csv file
+print('# reading dataset')
+print('# ...')
+
+dataset = CsvHandler.read_csv(file)
+
+make_naive_bayes(dataset)
+
+print('# ending naive bayes')
Requester.py (11 changed lines)
@@ -28,7 +28,8 @@ class Requester:

# print message
print('# retrieving articles from webhose.io')
+print('# ...')

# personal API key
webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
@@ -57,6 +58,7 @@ class Requester:
num_downloads = int(sum_posts / 100)
print('# collecting first {} articles'.format(num_downloads * 100))
+print('# sorting out other sources than reuters')
print('# ...')

# two-dimensional list of all articles
list_articles = []
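The arithmetic above follows webhose.io's batching of 100 posts per call. A sketch of the request loop around it; the query string and post field names are assumptions, as they sit outside these hunks:

import webhoseio

webhoseio.config(token='XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX')  # personal key
output = webhoseio.query('filterWebContent', {'q': 'merger language:english'})
sum_posts = output['totalResults']        # total matching posts
num_downloads = int(sum_posts / 100)      # 100 posts per API call
for _ in range(num_downloads):
    for post in output['posts']:
        print(post['title'])              # assumed field name
    output = webhoseio.get_next()         # next batch of 100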
@@ -90,4 +92,9 @@ class Requester:
df = pd.DataFrame(data=list_articles,
columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
# save csv
CsvHandler.write_csv(df, filestring)

+print('# starting requester')
+print('# ...')
+save_articles_from_webhoseio()
+print('# ending requester')
SVM.py (26 changed lines)
@@ -13,6 +13,7 @@ to belong to a category based on which side of the gap they fall.
'''

+from BagOfWords import BagOfWords
from CsvHandler import CsvHandler

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
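SelectPercentile, imported here and in NaiveBayes, keeps only the highest-scoring fraction of features. A minimal sketch with chi2 as the (assumed) scoring function:

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectPercentile, chi2

X, y = make_classification(n_samples=100, n_features=20, random_state=0)
selector = SelectPercentile(chi2, percentile=25)    # keep best 25% of features
X_reduced = selector.fit_transform(abs(X), y)       # chi2 needs non-negative X
print(X_reduced.shape)                              # (100, 5)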
@@ -26,8 +27,8 @@ class SVM:

def make_svm(dataset):

print('# starting SVM')
print('#')
print('# fitting model')
print('# ...')

# split data into text and label set
@@ -38,7 +39,7 @@ class SVM:

# Bag of Words
print('# calculating bag of words')
-print('#')
+print('# ...')
# fit the training data and then return the matrix
#X = BagOfWords.fit_transform(X)
X = CountVectorizer().fit_transform(X).toarray()
@@ -59,7 +60,7 @@ class SVM:
scoring=make_scorer(f1_score))

print('# fit classifier')
-print('#')
+print('# ...')

grid.fit(X,y)
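A sketch of how such a grid object is typically assembled; the parameter grid below is illustrative, not the one in this repository:

from sklearn.datasets import make_classification
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, random_state=0)   # stand-in data
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid = GridSearchCV(SVC(), param_grid,
                    cv=StratifiedKFold(n_splits=10, shuffle=True),
                    scoring=make_scorer(f1_score))
grid.fit(X, y)
print(grid.best_params_)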
@@ -83,5 +84,18 @@ class SVM:
print(grid.best_params_)
print()

print('# ending SVM')
-print('#')
+########################
+print('# starting svm')
+print('# ...')
+
+file = 'classification_labelled_corrected.csv'
+
+# read csv file
+print('# reading dataset')
+print('# ...')
+
+dataset = CsvHandler.read_csv(file)
+
+make_svm(dataset)
+
+print('# ending svm')
@@ -13,15 +13,19 @@ from NaiveBayes import NaiveBayes
from SVM import SVM

print('# starting program')
print('#')
+print('# ...')

# only if new unlabeled(!) data set is required:
# Requester.save_articles_from_webhoseio()

file = 'classification_labelled_corrected.csv'

# read csv file
print('# reading dataset')
print('#')
+print('# ...')
dataset = CsvHandler.read_csv(file)

# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)