added requirements and some things

Anne Lorenz 2018-09-17 14:47:50 +02:00
parent c2066d6adb
commit ab578ae0c6
13 changed files with 726 additions and 237 deletions


@ -6,11 +6,11 @@ BagOfWords counts word stems in an article
and adds new words to the global vocabulary.
Note:
The multinomial Naive Bayes classifier is suitable
for classification with discrete features (e.g.,
word counts for text classification).
The multinomial distribution normally requires
integer feature counts. However, in practice,
fractional counts such as tf-idf may also work.
=> taken into account via the 'relative_word_frequencies' parameter
'''
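As a side note to the remark above (illustration only, not part of this module): scikit-learn's MultinomialNB accepts integer counts and, in practice, fractional counts as well, so both output modes of this class could be fed to it. The toy data below is made up.

# illustration only: MultinomialNB with absolute vs. relative (fractional) counts
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# toy document-term counts: 3 articles, 3 vocabulary words
counts = np.array([[3, 0, 1],
                   [0, 2, 2],
                   [4, 1, 0]])
labels = np.array([1, 0, 1])

# absolute word frequencies (integers)
print(MultinomialNB().fit(counts, labels).predict(counts))
# relative word frequencies (each row sums to 1), analogous to relative_word_frequencies=True
rel = counts / counts.sum(axis=1, keepdims=True)
print(MultinomialNB().fit(rel, labels).predict(rel))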
@ -32,14 +32,14 @@ class BagOfWords:
def extract_words(text):
'''takes article as argument, removes numbers,
returns list of single words, recurrences included.
'''
stop_words = BagOfWords.set_stop_words()
# replace punctuation marks with spaces
words = re.sub(r'\W', ' ', text)
# split str into list of single words
words = words.split()
# list of all words to return
words_cleaned = []
for word in words:
# remove numbers
if word.isalpha():
@ -50,18 +50,18 @@ class BagOfWords:
# add every word in lowercase
words_cleaned.append(word.lower())
return words_cleaned
def reduce_word_to_stem(word):
'''takes normal word as input, returns the word's stem
'''
stemmer = PorterStemmer()
# replace word by its stem
word = stemmer.stem(word)
return word
def make_matrix(series, vocab, relative_word_frequencies=True):
'''calculates word stem frequencies in input articles.
returns matrix (DataFrame) with relative word frequencies
(0 <= values < 1) if relative_word_frequencies=True or absolute
word frequencies (int) if relative_word_frequencies=False.
(rows: different articles, columns: different words in vocab)
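A minimal usage sketch of make_vocab and make_matrix (the two example articles are made up; a pandas Series of article texts is assumed, as produced elsewhere in the repository by dataset['Title'] + ' ' + dataset['Text']):

# usage sketch with made-up articles
import pandas as pd
from BagOfWords import BagOfWords

articles = pd.Series(['Company A agrees to buy company B',
                      'Company B reports quarterly results'])
vocab = BagOfWords.make_vocab(articles)
# relative word frequencies (default) ...
df_rel = BagOfWords.make_matrix(articles, vocab)
# ... or absolute word frequencies
df_abs = BagOfWords.make_matrix(articles, vocab, relative_word_frequencies=False)
print(df_abs)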
@ -69,14 +69,14 @@ class BagOfWords:
print('# BOW: calculating matrix')
print('#')
# create list of tuples
vectors = []
for i in range(len(series)):
# extract text of single article
text = series.iloc[i]
# extract its words
words = BagOfWords.extract_words(text)
# count words in single article
word_count = len(words)
vector = []
for i, v in enumerate(vocab):
vector.append(0)
@ -88,14 +88,14 @@ class BagOfWords:
else:
# absolute word frequency
vector[i] += 1
# add single vector as tuple
vectors.append(tuple(vector))
df_vectors = pd.DataFrame.from_records(vectors,
index=None,
columns=vocab)
return df_vectors
def make_vocab(series):
'''adds words of input articles to a global vocabulary.
input: dataframe of all articles, return value: list of words
@ -110,56 +110,56 @@ class BagOfWords:
# sort list
vocab.sort()
return vocab
def set_stop_words():
'''creates list of all words that will be ignored
'''
# stopwords
stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
'aren\'t', 'as', 'at', 'be', 'because', 'been',
'before', 'being', 'below', 'between', 'both', 'but',
'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
'don', 'don\'t', 'down', 'during', 'each', 'few',
'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
'on', 'once', 'only', 'or', 'other', 'our', 'ours',
'ourselves', 'out', 'over', 'own', 're', 's', 'same',
'shan', 'shan\'t', 'she', 'she\'s', 'should',
'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
'theirs', 'them', 'themselves', 'then', 'there',
'these', 'they', 'this', 'those', 'through', 'to',
'too', 'under', 'until', 'up', 've', 'very', 'was',
'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
'what', 'when', 'where', 'which', 'while', 'who',
'whom', 'why', 'will', 'with', 'won', 'won\'t',
'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
'yourselves']
##=> does this make sense?:
#add specific words
#stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
# 'wednesday', 'thursday', 'friday'])
#remove the word 'not' from stop words
#stop_words.remove('not')
for i in range(len(stop_words)):
# remove punctuation marks and strip endings from abbreviations
#stop_words[i] = re.split(r'\W', stop_words[i])[0]
# reduce word to stem
stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
# transform list to set to eliminate duplicates
stop_words = set(stop_words)
return stop_words


@ -3,16 +3,16 @@ Cosine Similarity
=================
CosineSimilarity measures the similarity between two articles.
It calculates c: the cosine of the angle between the articles'
vectors dict_1 and dict_2.
c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
c = 1, if articles are equal => identicalness is 100%
0 <= c < 1, else => identicalness is (c*100)%
(The greater c is, the more similar the two articles are.)
'''
#TODO: uses dictionaries of each article
# => ToDo: has to be changed as we are now using vectors
import math
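In line with the ToDo above (switching from dictionaries to vectors), the formula from the docstring can be computed directly on two equally long word-frequency vectors; a minimal sketch, assuming numpy and made-up vectors:

# sketch only: cosine similarity of two equally long word-frequency vectors
import numpy as np

def cos_sim_vectors(vec_1, vec_2):
    v1 = np.asarray(vec_1, dtype=float)
    v2 = np.asarray(vec_2, dtype=float)
    # c = (v1 * v2) / (|v1| * |v2|)
    return v1.dot(v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

print(cos_sim_vectors([1, 0, 2], [1, 1, 2]))  # ~0.91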
@ -23,47 +23,47 @@ class CosineSimilarity:
def cos_sim(dict_1, dict_2):
# list of all different words
vocab = []
# insert words of 1st article into vocab
for key in dict_1.keys():
if key not in vocab:
vocab.append(key)
# insert words of 2nd article into vocab
for key in dict_2.keys():
if key not in vocab:
vocab.append(key)
# delete first entry ('sum_words')
vocab.pop(0)
# create vectors
vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
vector_2 = CosineSimilarity.create_vector(dict_2, vocab)
# start calculation
# calculate numerator of formula
sum_1 = 0
for i in range (0,len(vector_1)):
sum_1 += vector_1[i] * vector_2[i]
# calculate denominator of formula
sum_2 = 0
for entry in vector_1:
sum_2 += entry ** 2
sum_3 = 0
for entry in vector_2:
sum_3 += entry ** 2
return sum_1 / (math.sqrt(sum_2) * math.sqrt(sum_3))
def create_vector(dict, vocab):
# word frequency vector
vector = []
for word in vocab:
# check if word occurs in article
if word in dict:


@ -12,17 +12,17 @@ import pandas as pd
class CsvHandler:
def read_csv(csv_file):
df = pd.read_csv(csv_file,
sep='|',
header=0,
engine='python',
usecols=[1,2,4], #use only 'Title', 'Text' and 'Label'
decimal='.',
quotechar='\'',
#nrows = 200,
quoting=csv.QUOTE_NONE)
return df
def write_csv(df, file_name):
df.to_csv(file_name, sep='|')
print('### saved {} articles in {}'.format(len(df), file_name))


@ -2,14 +2,15 @@
Decision Tree Classifier
========================
Decision Tree Classifier takes as input two arrays:
array X of size [n_samples, n_features], holding the training samples,
and array y of integer values, size [n_samples],
holding the class labels for the training samples.
'''
import operator
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
import graphviz
import numpy as np
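A minimal, self-contained sketch of the input format described in the docstring (toy numbers, not the project's data):

# toy example: X has shape [n_samples, n_features], y holds integer class labels
import numpy as np
from sklearn import tree

X = np.array([[0, 1], [1, 1], [1, 0], [0, 0]])
y = np.array([1, 1, 0, 0])
clf = tree.DecisionTreeClassifier()
clf.fit(X, y)
print(clf.predict([[1, 1]]))  # -> [1]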
@ -21,71 +22,80 @@ from sklearn.model_selection import StratifiedKFold
class DecisionTree:
print('# starting program')
print('#')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('#')
dataset = CsvHandler.read_csv(file)
def make_tree(dataset):
print('# starting decision tree')
print('#')
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
#count_vector = CountVectorizer()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
# lists for metrics predicted on test/train set
f1_scores = []
f1_scores_train = []
classifier = tree.DecisionTreeClassifier()
# dict of most important words of each fold
important_words = {}
# for each fold
for train, test in skf.split(X,y):
# BOW
vocab = BagOfWords.make_vocab(X[train])
# fit the training data and then return the matrix
training_data = BagOfWords.make_matrix(X[train], vocab)
# transform testing data and return the matrix
testing_data = BagOfWords.make_matrix(X[test], vocab)
# #fit the training data and then return the matrix
# training_data = count_vector.fit_transform(X[train], y[train]).toarray()
# #transform testing data and return the matrix
# testing_data = count_vector.transform(X[test]).toarray()
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
# fit classifier
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
#store metrics predicted on test/train set
f1_scores.append(f1_score(y[test], predictions_test))
f1_scores_train.append(f1_score(y[train], predictions_train))
# search for important features
feature_importances = np.array(classifier.feature_importances_)
important_indices = feature_importances.argsort()[-50:][::-1]
for i in important_indices:
if vocab[i] in important_words:
important_words[vocab[i]] += feature_importances[i]
else:
important_words[vocab[i]] = feature_importances[i]
print('20 most important words in training set:')
print()
sorted_i_w = sorted(important_words.items(), key=operator.itemgetter(1))
@ -93,17 +103,19 @@ class DecisionTree:
i_w = [x[0] for x in sorted_i_w]
print(i_w[:20])
print()
#print metrics of test set
print('prediction of testing set:')
print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
format(min(f1_scores), max(f1_scores),sum(f1_scores)/float(len(f1_scores))))
print()
# print('overfit testing: prediction of training set')
# print('F1 score: min = {}, max = {}, average = {}'.
# format(min(f1_scores_train), max(f1_scores_train),
# sum(f1_scores_train)/float(len(f1_scores_train))))
# print()
print('# ending decision tree')
print('#')
DecisionTree.make_tree(dataset)
print('# ending program')

NER.py

@ -9,32 +9,32 @@ from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
from nltk.tree import Tree
''' TODO: misclassified:
[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
('PERSON', 'Maybank Kim Eng Securities '), ('PERSON', 'Krung Thai Bank '),
('PERSON', 'Siam Commercial Bank '), ('PERSON', 'Singapore '),
('PERSON', 'Keppel Corp '), ('ORGANIZATION', 'Companies ')]
'''
class NER:
def get_ne_with_label(text):
labels = []
names = []
# TODO: the last word is not recognized
for chunk in ne_chunk(pos_tag(word_tokenize(text + 'lastword.'))):
if hasattr(chunk, 'label'):
name = ''
for c in chunk:
name += c[0] + ' '
if name not in names:
names.append(name.strip())
labels.append(chunk.label())
#print(chunk.label(), ' '.join(c[0] for c in chunk))
return list(zip(labels, names))
test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
@ -56,5 +56,5 @@ test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''
print(NER.get_ne_with_label(test_article))


@ -2,17 +2,18 @@
Naive Bayes Classifier
======================
Naive Bayes is a probabilistic classifier that is able to predict a
probability distribution over a set of classes, rather than only
outputting the most likely class that the observation should belong to.
'Naive' means that it assumes that the value of a particular feature
(word in an article) is independent of the value of any other feature,
given the label. It considers each of these features to contribute
independently to the probability that the observation belongs to its category,
regardless of any possible correlations between these features.
'''
from BagOfWords import BagOfWords
from CsvReader import CsvReader
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
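As a small illustration of the probability distribution over classes mentioned in the docstring (toy data only; GaussianNB as used in this module):

# toy example: predict_proba returns one probability per class, predict the most likely class
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[1.0, 0.0], [0.9, 0.1], [0.1, 0.9], [0.0, 1.0]])
y = np.array([0, 0, 1, 1])
clf = GaussianNB().fit(X, y)
print(clf.predict_proba([[0.8, 0.2]]))  # probabilities close to [1, 0]
print(clf.predict([[0.8, 0.2]]))        # -> [0]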
@ -22,98 +23,108 @@ from sklearn.naive_bayes import GaussianNB
class NaiveBayes:
print('# starting program')
print('#')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('#')
dataset = CsvHandler.read_csv(file)
def make_naive_bayes(dataset):
'''fits naive bayes model with StratifiedKFold,
uses my BOW
'''
print('# starting naive bayes')
print('#')
# split data into text and label set
# join title and text
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
cv = CountVectorizer()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
classifier = GaussianNB()
# lists for metrics
recall_scores = []
precision_scores = []
f1_scores = []
# for each fold
n = 0
for train, test in skf.split(X,y):
n += 1
print('# split no. ' + str(n))
# own BOW => worse results
vocab = BagOfWords.make_vocab(X[train])
# fit the training data and then return the matrix
training_data = BagOfWords.make_matrix(X[train], vocab)
# transform testing data and return the matrix
testing_data = BagOfWords.make_matrix(X[test], vocab)
# # # using CountVectorizer:
# # fit the training data and then return the matrix
# training_data = cv.fit_transform(X[train], y[train]).toarray()
# # transform testing data and return the matrix
# testing_data = cv.transform(X[test]).toarray()
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
# #fit classifier
# classifier.fit(training_data_r, y[train])
# #predict class
# predictions_train = classifier.predict(training_data_r)
# predictions_test = classifier.predict(testing_data_r)
#fit classifier
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
#print and store metrics
rec = recall_score(y[test], predictions_test)
print('rec: ' + str(rec))
recall_scores.append(rec)
# precision on the test predictions, consistent with recall and the F1 below
prec = precision_score(y[test], predictions_test)
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
##########################
#print metrics of test set
print('-------------------------')
print('prediction of testing set:')
print('Precision score: min = {}, max = {}, average = {}'
.format(min(precision_scores),
max(precision_scores),
sum(precision_scores)/float(len(precision_scores))))
print('Recall score: min = {}, max = {}, average = {}'
.format(min(recall_scores),
max(recall_scores),
sum(recall_scores)/float(len(recall_scores))))
print('F1 score: min = {}, max = {}, average = {}'
.format(min(f1_scores),
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
##### only needed for overfit testing ###########
#print('overfit testing: prediction of training set')
#print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
@ -124,28 +135,28 @@ class NaiveBayes:
print('# ending naive bayes')
print('#')
######## only needed for the resubstitution error ########
def analyze_errors(dataset):
'''calculates resubstitution error
shows indices of false classified articles
uses Gaussian Bayes with train test split
'''
X_train_test = dataset['Title'] + ' ' + dataset['Text']
y_train_test = dataset['Label']
count_vector = CountVectorizer()
# fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train_test).toarray()
# transform testing data and return the matrix
testing_data = count_vector.transform(X_train_test).toarray()
# Naive Bayes
classifier = GaussianNB()
# fit classifier
classifier.fit(training_data, y_train_test)
# Predict class
predictions = classifier.predict(testing_data)
print('Errors at index:')
print()
n = 0
@ -157,6 +168,9 @@ class NaiveBayes:
.format(i, predictions[i], y_train_test[i]))
print(X_train_test[i])
print(y_train_test[i])
print()
#print metrics
print('F1 score: ', format(f1_score(y_train_test, predictions)))
print('#')
print('# ending program')


@ -1,3 +1,13 @@
# thesis-anne
my Python classes for text mining, machine learning models, …
# Requirements
pandas==0.20.1
nltk==3.2.5
webhoseio==0.5
numpy==1.14.0
graphviz==0.9
scikit_learn==0.19.2
# Installation under (UBUNTU?)
apt-get install XX
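A possible way to install the pinned Python packages (assuming pip for Python 3 is available; the exact apt-get packages above still have to be filled in):
pip install pandas==0.20.1 nltk==3.2.5 webhoseio==0.5 numpy==1.14.0 graphviz==0.9 scikit_learn==0.19.2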


@ -12,12 +12,12 @@ import re
from datetime import datetime
import pandas as pd
import webhoseio
from CsvHandler import CsvHandler
class Requester:
def save_articles_from_webhoseio():
''' create DataFrame of articles with
Timestamp, Title, Text, SiteSection
@ -25,14 +25,14 @@ class Requester:
'''
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
filestring = 'download_articles_{}.csv'.format(datestring)
# print message
print('# retrieving articles from webhose.io')
# personal API key
webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")
# webhose.io query
# suboptimal: usage of search terms :-(
query_params = {
"q": "thread.title:(merger OR merges OR merge OR merged OR "
@ -47,25 +47,25 @@ class Requester:
"has_video:false",
"ts": "1527411742661",
"sort": "crawled"}
output = webhoseio.query("filterWebContent", query_params)
sum_posts = output['totalResults']
print('# total sum of posts: ' + str(sum_posts))
# 100 articles per batch (download)
num_downloads = int(sum_posts / 100)
print('# collecting first {} articles'.format(num_downloads * 100))
print('# sorting out other sources than reuters')
# two-dimensional list of all articles
list_articles = []
for n in range(num_downloads):
# save next 100 articles
for i in range(100):
# check if correct source 'reuters'
if not re.search(r'reuters',
output['posts'][i]['thread']['site_section']):
continue
else:
@ -73,21 +73,21 @@ class Requester:
article.append(output['posts'][i]['published'])
article.append(output['posts'][i]['title'].replace('|', ' '))
# remove white spaces and separators
text = output['posts'][i]['text'].replace('\n', ' ')\
.replace('\r', ' ').replace('|', ' ')
section = output['posts'][i]['thread']['site_section']
article.append(text)
# remove '\r' at end of some urls
section = section.replace('\r', '')
article.append(section)
# add article to list
list_articles.append(article)
# Get the next batch of 100 posts
output = webhoseio.get_next()
# create DataFrame
df = pd.DataFrame(data=list_articles,
columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
# save csv
CsvHandler.write_csv(df, filestring)

SVM.py

@ -2,14 +2,14 @@
Support Vector Machines (SVM) Classifier
========================================
The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''
from BagOfWords import BagOfWords
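A minimal, self-contained sketch of the idea described above (toy 2D points; SVC with a linear kernel, as also tried in the grid search below):

# toy example: two separable point clouds, the SVC learns the separating gap
import numpy as np
from sklearn.svm import SVC

X = np.array([[0.0, 0.0], [0.2, 0.1], [1.0, 1.0], [0.9, 1.1]])
y = np.array([0, 0, 1, 1])  # e.g. 0 = 'not merger', 1 = 'merger'
clf = SVC(kernel='linear').fit(X, y)
print(clf.predict([[0.1, 0.0], [1.0, 0.9]]))  # -> [0 1]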
@ -25,12 +25,12 @@ from sklearn.svm import SVC
class SVM:
def make_svm(dataset):
print('# starting SVM')
print('#')
# split data into text and label set
# articles' text (title + text)
X = dataset['Title'] + ' ' + dataset['Text']
# articles' labels
@ -44,28 +44,28 @@ class SVM:
X = CountVectorizer().fit_transform(X).toarray()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
# use only most important features
selector = SelectPercentile()
pipeline = Pipeline([('perc', selector), ('SVC', SVC())])
grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
'SVC__kernel': ['linear','poly'],
'SVC__gamma': [0.001, 0.01],
'SVC__C': [0.1, 1]},
cv=skf,
scoring=make_scorer(f1_score))
print('# fit classifier')
print('#')
grid.fit(X,y)
# DataFrame of results
df_results = grid.cv_results_
# print results
######################
print('RESULTS:')
@ -76,12 +76,12 @@ class SVM:
print('mean of means:')
print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
print('')
print('best score:')
print(grid.best_score_)
print()
print('best parameters set found on development set:')
print(grid.best_params_)
print()
print('# ending SVM')
print('#')


@ -15,9 +15,6 @@ from SVM import SVM
print('# starting program')
print('#')
# only if new unlabeled(!) data set is required:
# Requester.save_articles_from_webhoseio()
file = 'classification_labelled_corrected.csv'
# read csv file
@ -25,7 +22,6 @@ print('# reading dataset')
print('#')
dataset = CsvHandler.read_csv(file)
# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)

thesis/LV.bib (new file)

@ -0,0 +1,7 @@
@BOOK{pierson2016,
AUTHOR="Lillian Pierson",
TITLE="Data Science für Dummies",
PUBLISHER="WILEY-VCH Verlag GmbH \& Co. KGaA",
YEAR=2016,
ADDRESS="Weinheim"
}

Binary file not shown.

thesis/thesis.tex (new file)

@ -0,0 +1,450 @@
\documentclass[11pt,a4paper]{scrbook}
\usepackage{geometry}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[pdftex]{graphicx}
%\usepackage[ngerman]{babel}
\usepackage{colortbl}
\usepackage{xcolor}
\usepackage{soul}
\usepackage{cleveref}
\usepackage{todonotes}
\AtBeginDocument{\renewcommand{\chaptername}{}}
% Comments from Julian
\newcommand{\jk}[1]{\todo[inline]{JK: #1}}
\renewcommand{\familydefault}{\sfdefault}
% Comments from Anne
\definecolor{comments}{cmyk}{1,0,1,0}
\newcommand{\al}[1]{\todo[inline]{\color{comments}{AL: #1}}}
\definecolor{uhhred}{cmyk}{0,100,100,0}
\begin{document}
\frontmatter
\newgeometry{centering,left=2cm,right=2cm,top=2cm,bottom=2cm}
\begin{titlepage}
\includegraphics[scale=0.3]{UHH-Logo_2010_Farbe_CMYK.pdf}
\vspace*{2cm}
\Large
\begin{center}
{\color{uhhred}\textbf{\so{BACHELORTHESIS}}}
\vspace*{2.0cm}\\
{\LARGE \textbf{Interactive Labeling of Unclassified Data\\Using the Example of Recognition of Company Mergers}}
%or: Incremental labeling of an unknown data set using the example of classification of news articles
\vspace*{2.0cm}\\
vorgelegt von
\vspace*{0.4cm}\\
Anne Lorenz
\end{center}
\vspace*{3.5cm}
\noindent
MIN-Fakultät \vspace*{0.4cm} \\
Fachbereich Informatik \vspace*{0.4cm} \\
%Ggf. Professur/Institut \vspace*{0.4cm} \\
Studiengang: Software-System-Entwicklung \vspace*{0.4cm} \\
Matrikelnummer: 6434073 \vspace*{0.8cm} \\
Erstgutachter: Dr. Julian Kunkel \vspace*{0.4cm} \\
Zweitgutachter: Eugen Betke
\vspace*{0.8cm} \\
Betreuer: Dr. Julian Kunkel, Doris Birkefeld
\end{titlepage}
\restoregeometry
\chapter*{Abstract}
BLABLA ABSTRACT
%As objective, concise, understandable, complete and precise as possible :-)
\tableofcontents
\mainmatter
%Chapter Introduction
%####################
\chapter{Introduction}
\label{chap:introduction}
\textit{
In this chapter...In \cref{sec:motivation} the motivation, then in \cref{sec:goals} the goals, blablabla...
}
\section{Motivation}
\label{sec:motivation}
Given a classification problem, a labeled data set is always needed first in order to apply a machine learning model and make predictions possible. The larger the labeled data set, the better the predictions generally are. However, to get there, each single data element must first be classified manually. Depending on the type of data, this procedure can be very time-consuming, for example if longer texts have to be read.
In this thesis we want to present an alternative data labeling method that allows labeling a larger amount of data in a shorter time.
\section{Goals}
\label{sec:goals}
\jk{One sentence that describes the problem, then broken down into subtasks}
We want to compare a conventional method of data labeling with an alternative, incremental method using the following example: The aim is to investigate news articles about recent mergers ('mergers and acquisitions') and to classify them accordingly. With the help of the labeled data set, different classification models will be applied and optimized so that a prediction about future news articles will be possible.
\section{Outline}
about the outline...
\bigskip
\paragraph{Summary:}
\textit{\newline In this chapter we discussed ... The following chapter deals with blabla.}
%Chapter State of the Art
%##########################
\chapter{State of the Art}
\label{state_of_the_art}
\textit{In this chapter the current state of research in the field of... will be presented.
}
\section{State of Research}
\al{What is supposed to go in here?}
\bigskip
\paragraph{Summary:}
\textit{\newline In this chapter we have described ... are described in the next chapter. In the next chapter we describe...
}
%Chapter Background
%####################
\chapter{Background and Related Work}
\label{chap:background}
\textit{
In this chapter...In \cref{sec:news} news sources are introduced, then blablabla...
}
\section{Business News about Mergers}
\label{sec:news}
\subsection{Company Mergers}
When two companies merge, ... When shares of a company are sold, ... Blabla...
\subsection{Webhose.io as Source for News Articles}
As a source for our initial data set, RSS feeds from established business news agencies such as Reuters or Bloomberg come into consideration. However, when crawling RSS feeds, it is not possible to retrieve news from a longer period in the past. Since we want to analyze news of the last 12 months, we obtain the data set from the provider webhose.io. It offers access to English news articles from the sections 'Financial News', 'Finance' and 'Business', among others. As we are only interested in reliable sources, we limit our request to the websites of Reuters, Bloomberg, Financial Times, The Economist and ...
\section{Supervised Machine Learning Problems}
\subsubsection{Structured / Unstructured Data}
\subsection{Classification Problems}
\subsubsection{Binary Classification}
Comparable to spam filtering...
\subsubsection{Multiple Classification}
\subsection{Balanced / Unbalanced Data Set}
\section{Text Analysis}
\subsection{Natural Language Processing (NLP)}
\subsection{Tokenization}
\subsection{Unigram, Bigram}
\subsection{Stemming}
\subsection{Feature Vectors}
\subsubsection{Word Frequencies}
\subsection{Bag of Words (BOW)}
\subsection{Stop Words}
\subsection{Named Entity Recognition (NER)}
\section{Machine Learning Models}
\subsection{Naive Bayes Classifier}
\subsection{Support Vector Machines (SVM)}
\subsection{Decision Trees}
\subsection{Hyperparameters}
\subsection{Feature Selection}
\section{Split Methods}
\subsection{Test-Train-Split}
\subsection{Shuffle Split}
\subsection{(K-fold) Cross-Validation}
\section{Metrics}
\subsection{Accuracy, Error Rate, Sensitivity, Specificity}
Sensitivity (= true positive rate) and Specificity (= true negative rate)
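In terms of the confusion matrix (notation $TP$, $TN$, $FP$, $FN$ added here for clarity), these can be written as
\[
\mathrm{sensitivity} = \frac{TP}{TP + FN}, \qquad
\mathrm{specificity} = \frac{TN}{TN + FP}, \qquad
\mathrm{accuracy} = \frac{TP + TN}{TP + TN + FP + FN}.
\]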
\subsection{Recall, Precision, F1-score}
\subsection{Robustness}
\subsection{Overfit, Underfit}
\subsection{Bias, Variance}
\subsection{Resubstitution Error}
\bigskip
\paragraph{Summary:}
\textit{\newline
In this chapter we ... blabla are described in section bla.
In the next chapter we describe...
}
%Chapter Design
%###########################
\chapter{Design}
\label{chap:design}
\textit{
In this chapter... In \cref{sec:overview} we give an overview of all, then in \cref{sec:pipeline} the data processing pipeline, blablabla...
}
\section{Overview}
\label{sec:overview}
\jk{What has to be done overall, which subproblems have to be addressed}
\jk{Discuss alternatives, make decisions based on criteria}
\jk{There may still be some material in here that will be moved to the 'Background' chapter. This is where your own work goes, no related work or methods that already exist. Only relevant if you compare against them.}
\section{Data Processing Pipeline}
\label{sec:pipeline}
\section{Preprocessing}
Tokenization, Stemming, Stop Words, Leaving Out Numbers
\section{Data Labeling}
\subsection{Conventional Method}
\subsubsection{Top-Down / Waterfall}
1) Data Labeling \\
2) Data Cleaning\\
3) Model Building\\
4) Analysis of wrongly predicted instances
=> possibly relabel, usually not done\\
5) New hypotheses => 3); possibly back to 2)\\
\subsection{Incremental Method}
\subsubsection{Visual Analytics, Agile Model Development}
\subsubsection{Unbalanced Data Set}
\section{Model Selection}
\subsection{Naive Bayes}
GaussianNB vs MultinomialNB
\subsection{SVM}
\subsection{Decision Tree}
\section{Recognition of Merger Partners}
\subsubsection{Named Entity Recognition (NER)}
\bigskip
\paragraph{Summary:}
\textit{\newline
In this chapter we... In the next chapter...
}
% Chapter Labeling
%###########################
\chapter{Data Labeling}
\label{chap:labeling}
\textit{
This chapter describes the procedure for labeling. blabla
}
\section{Conventional Method}
\subsection{Data Set}
1497 articles\\
Time period: 1 month\\
Source: Reuters.com\\
\subsection{Classification}
Data labeled with binary classes, time required approx. 30 hours
\subsection{Difficulties}
Here are a few text examples that were difficult to classify:\\
- how should sales of shares > 50 \% be handled? => effectively means a change of ownership\\
- "X will buy Y", "X wants to buy Y" => does it definitely take place? => the whole article has to be read\\
- merger only mentioned in passing ("last year X and Y merged..., now new business units are emerging blabla"), but otherwise an irrelevant article
\\
=> out of these problems the idea arose to use several classes
\section{Incremental Method}
\subsection{Data Set}
10,000 articles out of 130,000\\
Time period: 12 months\\
Sources: Reuters.com, Bloomberg.com, ...\\
\subsection{Classification}
Data classified multiple times with 6 classes:\\
\\
1: Merger \\
2: Merger Pending\\
3: Merger Aborted\\
4: Sale of Shares\\
5: Incidental \\
6: Irrelevant \\
\subsection{Selection of Articles}
\subsection{Procedure}
Randomly select 10 articles from each month.
It is then likely that one only gets mergers with many articles
=> This could be minimized by doing “stratified” sampling
=> First do NER, then randomize fairly over the classes
=> select 10 articles from 100 categories => choose 10 categories => from these, one article at random
Labeling of 1\% of all articles:
1) Build first models, e.g. Bayes
Apply them to all articles => probability per class, vector: (K1, K2, … , K6)
Clear cases: Kx > 80\% and all other Ky < 10\% (with x in {1-6}, y != x; see the rule written out below)
=> adopt the label => how many cases are unambiguous?
Claim: 10\% of all articles are unambiguous
Check by sampling => select 10 articles at random from each class
Identification of highly unclear cases:
more than one class has a similar probability
(5\%, 5\%, 5\%, …) => (80\%, 80\%, 0\%, 0\%, …)
e.g. look at 100 articles and label them manually
=> Repeat this 3-4 times, then go back to step 1) (build model)
=> 95\% of all cases are now clear.
=> why do the remaining 5\% not work? Look at sample articles
If that does not work, improve the models or the preprocessing (e.g. NER)
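Written out (the notation $P(K_i \mid d)$ for the model's estimated probability of class $K_i$ given article $d$ is introduced here for clarity), the acceptance rule for clear cases reads:
\[
\text{accept label } x \text{ for article } d \quad\Longleftrightarrow\quad
P(K_x \mid d) > 0.8 \;\text{ and }\; P(K_y \mid d) < 0.1 \;\;\text{for all } y \neq x.
\]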
\subsection{Tagging of Named Entities}
Histogram: X: authors/persons, companies; Y: number of mentions
\bigskip
\paragraph{Summary:}
\textit{\newline
In this chapter...in the next chapter...
}
% Chapter Implementation
%##########################
\chapter{Implementation}
\label{chap:implementation}
\textit{
This chapter deals with the most relevant parts of the implementation.
}
\section{Data Download}
Query webhose.io:\\
% replace!
query\_params = \{'q':'site:(reuters.com OR ft.com OR cnn.com OR economist.com OR bloomberg.com OR theguardian.com) site\_category:(financial\_news OR finance OR business)',
'ts': '1533634070282',
'sort': 'crawled'\}
\section{Python Modules}
\subsection{nltk}
\subsection{pandas}
\subsection{sklearn}
\subsection{webhoseio}
\section{Own Implementation}
\subsection{Examples}
\bigskip
\paragraph{Summary:}
\textit{\newline
In this chapter, we...In the next chapter...
}
% Chapter Evaluation
%##########################
\chapter{Evaluation}
\label{chap:evaluation}
\textit{
In this chapter we want to evaluate the different methods. blabla.
}
\section{News Articles Exploration}
\subsection{Length of Articles}
Or whatever else is interesting.
\subsection{Most Common Words}
With regard to the articles about mergers.
\subsubsection{Word Cloud}
e.g. a word cloud for the Microsoft-GitHub merger article.
\section{Model Fitting}
keep in mind: vary hyperparameters SEPARATELY
\subsection{Naive Bayes Model}
Grid-Search
\subsection{SVM}
\subsection{Decision Tree}
\section{Performance}
\bigskip
\paragraph{Summary:}
\textit{\newline
In this chapter we have described ... In the last chapter we describe...
}
\chapter{Discussion (?)}
\al{Is this needed? The thesis should be critically questioned, e.g. 'was the data set well chosen?' etc.}
% Chapter Summary
%#############################
\chapter{Summary}
\label{chap:summary}
\section{Comparison of Labeling Methods}
\section{Quality of Predictions}
\section{Conclusions}
\section{Future Work}
Neural network
\bigskip
\paragraph{Summary:}
\textit{\newline
In the last chapter we have described ....
}
% Bibliography should appear in the table of contents
\nocite{*}
\addcontentsline{toc}{chapter}{Bibliography}
% Show bibliography
\bibliography{LV}
\backmatter
\thispagestyle{empty}
\vspace*{\fill}
\pagestyle{empty}
{\normalsize
\begin{center}\textbf{Eidesstattliche Erklärung}\end{center}
Hiermit versichere ich an Eides statt, dass ich die vorliegende Arbeit im Bachelorstudiengang Wirtschaftsinformatik selbstständig verfasst und keine anderen als die angegebenen Hilfsmittel insbesondere keine im Quellenverzeichnis nicht benannten Internet-Quellen benutzt habe. Alle Stellen, die wörtlich oder sinngemäß aus Veröffentlichungen entnommen wurden, sind als solche kenntlich gemacht. Ich versichere weiterhin, dass ich die Arbeit vorher nicht in einem anderen Prüfungsverfahren eingereicht habe und die eingereichte schriftliche Fassung der auf dem elektronischen Speichermedium entspricht.
\vspace*{1cm}\\
Hamburg, den 01.02.2019
\hspace*{\fill}\begin{tabular}{@{}l@{}}\hline
\makebox[5cm]{Anne Lorenz}
\end{tabular}
\vspace*{3cm}
%This is optional, delete if not needed!
\begin{center}\textbf{Veröffentlichung}\end{center}
Ich stimme der Einstellung der Arbeit in die Bibliothek des Fachbereichs Informatik zu.
\vspace*{1cm}\\
Hamburg, den 01.02.2019
\hspace*{\fill}\begin{tabular}{@{}l@{}}\hline
\makebox[5cm]{Anne Lorenz}
\end{tabular}
}
\vspace*{\fill}
\end{document}