added requirements and some things

parent c2066d6adb
commit ab578ae0c6

BagOfWords.py (114 changed lines)
@@ -6,11 +6,11 @@ BagOfWords counts word stems in an article
and adds new words to the global vocabulary.

Note:
The multinomial Naive Bayes classifier is suitable
for classification with discrete features (e.g.,
word counts for text classification).
The multinomial distribution normally requires
integer feature counts. However, in practice,
fractional counts such as tf-idf may also work.
=> taken into account via the 'relative_word_frequencies' parameter
'''
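The note above says the multinomial distribution formally expects integer counts, while fractional counts such as tf-idf can still work in practice. A minimal sketch of that point (not part of this commit; the toy matrix and labels are invented for illustration):

# Sketch only: MultinomialNB trained on fractional (relative-frequency) counts.
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# two articles, three vocabulary words, relative word frequencies per row
X_train = np.array([[0.5, 0.5, 0.0],
                    [0.0, 0.2, 0.8]])
y_train = np.array([1, 0])

clf = MultinomialNB()
clf.fit(X_train, y_train)
print(clf.predict(np.array([[0.4, 0.6, 0.0]])))  # -> [1]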
@@ -32,14 +32,14 @@ class BagOfWords:
    def extract_words(text):
        '''takes article as argument, removes numbers,
        returns list of single words, recurrences included.
        '''
        stop_words = BagOfWords.set_stop_words()
        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split str into list of single words
        words = words.split()
        # list of all words to return
        words_cleaned = []
        for word in words:
            # remove numbers
            if word.isalpha():
@@ -50,18 +50,18 @@ class BagOfWords:
                # add every word in lowercase
                words_cleaned.append(word.lower())
        return words_cleaned

    def reduce_word_to_stem(word):
        '''takes normal word as input, returns the word's stem
        '''
        stemmer = PorterStemmer()
        # replace word by its stem
        word = stemmer.stem(word)
        return word

    def make_matrix(series, vocab, relative_word_frequencies=True):
        '''calculates word stem frequencies in input articles.
        returns matrix (DataFrame) with relative word frequencies
        (0 <= values < 1) if relative_word_frequencies=True or absolute
        word frequencies (int) if relative_word_frequencies=False.
        (rows: different articles, columns: different words in vocab)
@@ -69,14 +69,14 @@ class BagOfWords:
        print('# BOW: calculating matrix')
        print('#')
        # create list of tuples
        vectors = []
        for i in range(len(series)):
            # extract text of single article
            text = series.iloc[i]
            # extract its words
            words = BagOfWords.extract_words(text)
            # count words in single article
            word_count = len(words)
            vector = []
            for i, v in enumerate(vocab):
                vector.append(0)
@@ -88,14 +88,14 @@ class BagOfWords:
                else:
                    # absolute word frequency
                    vector[i] += 1

            # add single vector as tuple
            vectors.append(tuple(vector))
        df_vectors = pd.DataFrame.from_records(vectors,
                                               index=None,
                                               columns=vocab)
        return df_vectors

    def make_vocab(series):
        '''adds words of input articles to a global vocabulary.
        input: dataframe of all articles, return value: list of words
@@ -110,56 +110,56 @@ class BagOfWords:
        # sort list
        vocab.sort()
        return vocab

    def set_stop_words():
        '''creates list of all words that will be ignored
        '''
        # stopwords
        stop_words = ['a', 'about', 'above', 'after', 'again', 'against',
                      'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren',
                      'aren\'t', 'as', 'at', 'be', 'because', 'been',
                      'before', 'being', 'below', 'between', 'both', 'but',
                      'by', 'can', 'couldn', 'couldn\'t', 'd', 'did', 'didn',
                      'didn\'t', 'do', 'does', 'doesn', 'doesn\'t', 'doing',
                      'don', 'don\'t', 'down', 'during', 'each', 'few',
                      'for', 'from', 'further', 'had', 'hadn', 'hadn\'t',
                      'has', 'hasn', 'hasn\'t', 'have', 'haven', 'haven\'t',
                      'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
                      'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is',
                      'isn', 'isn\'t', 'it', 'it\'s', 'its', 'itself', 'just',
                      'll', 'm', 'ma', 'me', 'mightn', 'mightn\'t', 'more',
                      'most', 'mustn', 'mustn\'t', 'my', 'myself', 'needn',
                      'needn\'t', 'no', 'nor', 'not', 'now', 'o', 'of', 'off',
                      'on', 'once', 'only', 'or', 'other', 'our', 'ours',
                      'ourselves', 'out', 'over', 'own', 're', 's', 'same',
                      'shan', 'shan\'t', 'she', 'she\'s', 'should',
                      'should\'ve', 'shouldn', 'shouldn\'t', 'so', 'some',
                      'such', 't', 'than', 'that', 'that\'ll', 'the', 'their',
                      'theirs', 'them', 'themselves', 'then', 'there',
                      'these', 'they', 'this', 'those', 'through', 'to',
                      'too', 'under', 'until', 'up', 've', 'very', 'was',
                      'wasn', 'wasn\'t', 'we', 'were', 'weren', 'weren\'t',
                      'what', 'when', 'where', 'which', 'while', 'who',
                      'whom', 'why', 'will', 'with', 'won', 'won\'t',
                      'wouldn', 'wouldn\'t', 'y', 'you', 'you\'d', 'you\'ll',
                      'you\'re', 'you\'ve', 'your', 'yours', 'yourself',
                      'yourselves']

        ## => is this useful?:
        # add specific words
        # stop_words.extend(['reuters', 'also', 'monday', 'tuesday',
        #                    'wednesday', 'thursday', 'friday'])

        # remove the word 'not' from stop words
        # stop_words.remove('not')

        for i in range(len(stop_words)):

            # remove punctuation marks and strip endings from abbreviations
            # stop_words[i] = re.split(r'\W', stop_words[i])[0]

            # reduce word to stem
            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])

        # transform list to set to eliminate duplicates
        stop_words = set(stop_words)

        return stop_words
@@ -3,16 +3,16 @@ Cosine Similarity
=================

CosineSimilarity measures the similarity between two articles.
It calculates c: the cosine of the angle between the article
vectors dict_1 and dict_2.
c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
c = 1, if articles are equal => identicalness is 100%
0 < c < 1, else => identicalness is (c*100)%
(The greater c, the more similar two articles are.)
'''

# TODO: uses dictionaries of each article
# => ToDo: has to be changed as we are now using vectors

import math
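A quick standalone check of the formula above (not part of this commit), using numpy arrays in place of the vectors built from dict_1 and dict_2 below:

# Sketch only: cosine of the angle between two word-frequency vectors.
import numpy as np

vector_1 = np.array([1.0, 2.0, 0.0])
vector_2 = np.array([2.0, 4.0, 0.0])

c = np.dot(vector_1, vector_2) / (np.linalg.norm(vector_1) * np.linalg.norm(vector_2))
print(c)  # 1.0 -> parallel vectors, i.e. the articles count as identical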
@@ -23,47 +23,47 @@ class CosineSimilarity:
    def cos_sim(dict_1, dict_2):

        # list of all different words
        vocab = []

        # insert words of 1st article into vocab
        for key in dict_1.keys():
            if key not in vocab:
                vocab.append(key)

        # insert words of 2nd article into vocab
        for key in dict_2.keys():
            if key not in vocab:
                vocab.append(key)

        # delete first entry ('sum_words')
        vocab.pop(0)

        # create vectors
        vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
        vector_2 = CosineSimilarity.create_vector(dict_2, vocab)

        # start calculation
        # calculate numerator of formula
        sum_1 = 0
        for i in range(0, len(vector_1)):
            sum_1 += vector_1[i] * vector_2[i]

        # calculate denominator of formula
        sum_2 = 0
        for entry in vector_1:
            sum_2 += entry ** 2

        sum_3 = 0
        for entry in vector_2:
            sum_3 += entry ** 2

        return sum_1 / (math.sqrt(sum_2) * math.sqrt(sum_3))

    def create_vector(dict, vocab):
        # word frequency vector
        vector = []
        for word in vocab:
            # check if word occurs in article
            if word in dict:
@@ -12,17 +12,17 @@ import pandas as pd
class CsvHandler:

    def read_csv(csv_file):
        df = pd.read_csv(csv_file,
                         sep='|',
                         header=0,
                         engine='python',
                         usecols=[1,2,4], # use only 'Title', 'Text' and 'Label'
                         decimal='.',
                         quotechar='\'',
                         #nrows = 200,
                         quoting=csv.QUOTE_NONE)
        return df

    def write_csv(df, file_name):
        df.to_csv(file_name, sep='|')
        print('### saved {} articles in {}'.format(len(df), file_name))
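A small usage sketch for the two methods above (not part of this commit; the output file name is a placeholder):

# Sketch only: round trip through CsvHandler; 'copy.csv' is a made-up name.
from CsvHandler import CsvHandler

df = CsvHandler.read_csv('classification_labelled_corrected.csv')
print(len(df))                        # number of articles read
CsvHandler.write_csv(df, 'copy.csv')  # writes a '|'-separated csv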
@@ -2,14 +2,15 @@
Decision Tree Classifier
========================

Decision Tree Classifier takes as input two arrays:
array X of size [n_samples, n_features], holding the training samples,
and array y of integer values, size [n_samples],
holding the class labels for the training samples.
'''
import operator

from BagOfWords import BagOfWords
from CsvHandler import CsvHandler

import graphviz
import numpy as np
@@ -21,71 +22,80 @@ from sklearn.model_selection import StratifiedKFold

class DecisionTree:

    print('# starting program')
    print('#')

    file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('#')
    dataset = CsvHandler.read_csv(file)

    def make_tree(dataset):

        print('# starting decision tree')
        print('#')

        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        #count_vector = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True)

        # lists for metrics predicted on test/train set
        f1_scores = []
        f1_scores_train = []

        classifier = tree.DecisionTreeClassifier()

        # dict of most important words of each fold
        important_words = {}

        # for each fold
        for train, test in skf.split(X,y):

            # BOW
            vocab = BagOfWords.make_vocab(X[train])
            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(X[train], vocab)
            # transform testing data and return the matrix
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            # # fit the training data and then return the matrix
            # training_data = count_vector.fit_transform(X[train], y[train]).toarray()
            # # transform testing data and return the matrix
            # testing_data = count_vector.transform(X[test]).toarray()

            # # apply select percentile
            # selector = SelectPercentile(percentile=25)
            # selector.fit(training_data, y[train])

            # training_data_r = selector.transform(training_data)
            # testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data, y[train])

            # predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            # store metrics predicted on test/train set
            f1_scores.append(f1_score(y[test], predictions_test))
            f1_scores_train.append(f1_score(y[train], predictions_train))

            # search for important features
            feature_importances = np.array(classifier.feature_importances_)
            important_indices = feature_importances.argsort()[-50:][::-1]

            for i in important_indices:
                if vocab[i] in important_words:
                    important_words[vocab[i]] += feature_importances[i]
                else:
                    important_words[vocab[i]] = feature_importances[i]

        print('20 most important words in training set:')
        print()
        sorted_i_w = sorted(important_words.items(), key=operator.itemgetter(1))
@@ -93,17 +103,19 @@ class DecisionTree:
        i_w = [x[0] for x in sorted_i_w]
        print(i_w[:20])
        print()

        # print metrics of test set
        print('prediction of testing set:')
        print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
              format(min(f1_scores), max(f1_scores),
                     sum(f1_scores)/float(len(f1_scores))))
        print()
        # print('overfit testing: prediction of training set')
        # print('F1 score: min = {}, max = {}, average = {}'.
        #       format(min(f1_scores_train), max(f1_scores_train),
        #              sum(f1_scores_train)/float(len(f1_scores_train))))
        # print()

        print('# ending decision tree')
        print('#')

    DecisionTree.make_tree(dataset)
    print('# ending program')
NER.py (20 changed lines)
@@ -9,32 +9,32 @@ from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
from nltk.tree import Tree

''' TODO: wrongly classified:
[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
('PERSON', 'Maybank Kim Eng Securities '), ('PERSON', 'Krung Thai Bank '),
('PERSON', 'Siam Commercial Bank '), ('PERSON', 'Singapore '),
('PERSON', 'Keppel Corp '), ('ORGANIZATION', 'Companies ')]
'''

class NER:

    def get_ne_with_label(text):
        labels = []
        names = []
        # TODO: the last word is not recognized
        for chunk in ne_chunk(pos_tag(word_tokenize(text + 'lastword.'))):
            if hasattr(chunk, 'label'):
                name = ''
                for c in chunk:
                    name += c[0] + ' '
                if name not in names:
                    names.append(name.strip())
                    labels.append(chunk.label())
                    # print(chunk.label(), ' '.join(c[0] for c in chunk))
        return list(zip(labels, names))

test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
@@ -56,5 +56,5 @@ test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''

print(NER.get_ne_with_label(test_article))
NaiveBayes.py (120 changed lines)
@@ -2,17 +2,18 @@
Naive Bayes Classifier
======================

Naive Bayes is a probabilistic classifier that is able to predict a
probability distribution over a set of classes, rather than only
outputting the most likely class that the observation should belong to.
'Naive' means that it assumes that the value of a particular feature
(word in an article) is independent of the value of any other feature,
given the label. It considers each of these features to contribute
independently to the probability that it belongs to its category,
regardless of any possible correlations between these features.
'''

from BagOfWords import BagOfWords
from CsvReader import CsvReader

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
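A tiny numeric sketch of the independence assumption described above (numbers invented, not part of this commit): the unnormalized class score is the prior times the product of the per-word likelihoods.

# Sketch only: naive independence assumption with made-up probabilities.
prior = {'merger': 0.3, 'other': 0.7}
# P(word | class), estimated independently for each word
likelihood = {'merger': {'acquire': 0.08, 'shares': 0.05},
              'other':  {'acquire': 0.01, 'shares': 0.02}}

article = ['acquire', 'shares']
for label in prior:
    score = prior[label]
    for word in article:
        score *= likelihood[label][word]  # each feature contributes independently
    print(label, score)
# 'merger': 0.3 * 0.08 * 0.05 = 0.0012 beats 'other': 0.7 * 0.01 * 0.02 = 0.00014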
@@ -22,98 +23,108 @@ from sklearn.naive_bayes import GaussianNB

class NaiveBayes:

    print('# starting program')
    print('#')

    file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('#')
    dataset = CsvHandler.read_csv(file)

    def make_naive_bayes(dataset):
        '''fits naive bayes model with StratifiedKFold,
        uses my BOW
        '''
        print('# starting naive bayes')
        print('#')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        cv = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True)

        classifier = GaussianNB()

        # lists for metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # for each fold
        n = 0
        for train, test in skf.split(X,y):

            n += 1
            print('# split no. ' + str(n))

            # own BOW => worse results
            vocab = BagOfWords.make_vocab(X[train])
            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(X[train], vocab)
            # transform testing data and return the matrix
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            # # # using CountVectorizer:
            # # fit the training data and then return the matrix
            # training_data = cv.fit_transform(X[train], y[train]).toarray()
            # # transform testing data and return the matrix
            # testing_data = cv.transform(X[test]).toarray()

            # # apply select percentile
            # selector = SelectPercentile(percentile=25)
            # selector.fit(training_data, y[train])

            # training_data_r = selector.transform(training_data)
            # testing_data_r = selector.transform(testing_data)

            # # fit classifier
            # classifier.fit(training_data_r, y[train])
            # # predict class
            # predictions_train = classifier.predict(training_data_r)
            # predictions_test = classifier.predict(testing_data_r)

            # fit classifier
            classifier.fit(training_data, y[train])
            # predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            # print and store metrics
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[train], predictions_train)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))

        ##########################
        # print metrics of test set
        print('-------------------------')
        print('prediction of testing set:')
        print('Precision score: min = {}, max = {}, average = {}'
              .format(min(precision_scores),
                      max(precision_scores),
                      sum(precision_scores)/float(len(precision_scores))))
        print('Recall score: min = {}, max = {}, average = {}'
              .format(min(recall_scores),
                      max(recall_scores),
                      sum(recall_scores)/float(len(recall_scores))))
        print('F1 score: min = {}, max = {}, average = {}'
              .format(min(f1_scores),
                      max(f1_scores),
                      sum(f1_scores)/float(len(f1_scores))))
        print()

        ##### only for overfit testing ###########
        # print('overfit testing: prediction of training set')
        # print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
|
@ -124,28 +135,28 @@ class NaiveBayes:
|
|||
print('# ending naive bayes')
|
||||
print('#')
|
||||
|
||||
######## nur für resubstitutionsfehler benötigt ########
|
||||
######## nur für resubstitutionsfehler benötigt ########
|
||||
def analyze_errors(dataset):
|
||||
'''calculates resubstitution error
|
||||
shows indices of false classified articles
|
||||
uses Gaussian Bayes with train test split
|
||||
'''
|
||||
X_train_test = dataset['Title'] + ' ' + dataset['Text']
|
||||
'''
|
||||
X_train_test = dataset['Title'] + ' ' + dataset['Text']
|
||||
y_train_test = dataset['Label']
|
||||
|
||||
count_vector = CountVectorizer()
|
||||
count_vector = CountVectorizer()
|
||||
# fit the training data and then return the matrix
|
||||
training_data = count_vector.fit_transform(X_train_test).toarray()
|
||||
# transform testing data and return the matrix
|
||||
testing_data = count_vector.transform(X_train_test).toarray()
|
||||
|
||||
# Naive Bayes
|
||||
classifier = GaussianNB()
|
||||
classifier = GaussianNB()
|
||||
# fit classifier
|
||||
classifier.fit(training_data, y_train_test)
|
||||
|
||||
|
||||
# Predict class
|
||||
predictions = classifier.predict(testing_data)
|
||||
predictions = classifier.predict(testing_data)
|
||||
print('Errors at index:')
|
||||
print()
|
||||
n = 0
|
||||
|
@@ -157,6 +168,9 @@ class NaiveBayes:
                      .format(i, predictions[i], y_train_test[i]))
                print(X_train_test[i])
                print(y_train_test[i])
                print()
        # print metrics
        print('F1 score: ', format(f1_score(y_train_test, predictions)))
        print()

        print('#')
        print('# ending program')
README.md (12 changed lines)
@@ -1,3 +1,13 @@
# thesis-anne
my python classes for text mining, machine learning models, …

# Requirements
pandas==0.20.1
nltk==3.2.5
webhoseio==0.5
numpy==1.14.0
graphviz==0.9
scikit_learn==0.19.2

# Installation under (UBUNTU?)
apt-get install XX
Requester.py (38 changed lines)
@@ -12,12 +12,12 @@ import re
from datetime import datetime

import pandas as pd
import webhoseio

from CsvHandler import CsvHandler

class Requester:

    def save_articles_from_webhoseio():
        ''' create DataFrame of articles with
        Timestamp, Title, Text, SiteSection
@@ -25,14 +25,14 @@ class Requester:
        '''
        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
        filestring = 'download_articles_{}.csv'.format(datestring)

        # print message
        print('# retrieving articles from webhose.io')

        # personal API key
        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

        # webhose.io query
        # suboptimal: usage of search terms :-(
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
@@ -47,25 +47,25 @@ class Requester:
            "has_video:false",
            "ts": "1527411742661",
            "sort": "crawled"}

        output = webhoseio.query("filterWebContent", query_params)

        sum_posts = output['totalResults']
        print('# total sum of posts: ' + str(sum_posts))

        # 100 articles per batch (download)
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
        print('# sorting out other sources than reuters')

        # two-dimensional list of all articles
        list_articles = []

        for n in range(num_downloads):
            # save next 100 articles
            for i in range(100):
                # check if correct source 'reuters'
                if not re.search(r'reuters',
                                 output['posts'][i]['thread']['site_section']):
                    continue
                else:
@@ -73,21 +73,21 @@ class Requester:
                    article.append(output['posts'][i]['published'])
                    article.append(output['posts'][i]['title'].replace('|', ' '))
                    # remove white spaces and separators
                    text = output['posts'][i]['text'].replace('\n', ' ')\
                        .replace('\r', ' ').replace('|', ' ')
                    section = output['posts'][i]['thread']['site_section']
                    article.append(text)
                    # remove '\r' at end of some urls
                    section = section.replace('\r', '')
                    article.append(section)
                    # add article to list
                    list_articles.append(article)

            # Get the next batch of 100 posts
            output = webhoseio.get_next()

        # create DataFrame
        df = pd.DataFrame(data=list_articles,
                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
        # save csv
        CsvHandler.write_csv(df, filestring)
SVM.py (50 changed lines)
@@ -2,14 +2,14 @@
Support Vector Machines (SVM) Classifier
========================================

The SVM training algorithm builds a model from the training data that assigns
the test samples to one category ('merger' or 'not merger'),
making it a non-probabilistic binary linear classifier.
An SVM model is a representation of the samples as points in space,
mapped so that the examples of the separate categories are divided
by a clear gap that is as wide as possible.
New samples are then mapped into that same space and predicted
to belong to a category based on which side of the gap they fall.
'''

from BagOfWords import BagOfWords
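A minimal sketch of the 'which side of the gap' idea from the docstring above (toy 2-D points, not part of this commit; the class below works on bag-of-words vectors instead):

# Sketch only: linear SVM on toy points; the label depends on the side of the gap.
import numpy as np
from sklearn.svm import SVC

X = np.array([[0.0, 0.0], [0.0, 1.0], [3.0, 0.0], [3.0, 1.0]])
y = np.array([0, 0, 1, 1])  # 0 = 'not merger', 1 = 'merger'

clf = SVC(kernel='linear')
clf.fit(X, y)
print(clf.predict([[0.5, 0.5], [2.5, 0.5]]))  # -> [0 1]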
@@ -25,12 +25,12 @@ from sklearn.svm import SVC
class SVM:

    def make_svm(dataset):

        print('# starting SVM')
        print('#')

        # split data into text and label set

        # articles' text (title + text)
        X = dataset['Title'] + ' ' + dataset['Text']
        # articles' labels
@@ -44,28 +44,28 @@ class SVM:
        X = CountVectorizer().fit_transform(X).toarray()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True)

        # use only most important features
        selector = SelectPercentile()

        pipeline = Pipeline([('perc', selector), ('SVC', SVC())])

        grid = GridSearchCV(pipeline, {'perc__percentile': [25, 50, 75, 100],
                                       'SVC__kernel': ['linear','poly'],
                                       'SVC__gamma': [0.001, 0.01],
                                       'SVC__C': [0.1, 1]},
                            cv=skf,
                            scoring=make_scorer(f1_score))

        print('# fit classifier')
        print('#')

        grid.fit(X,y)

        # DataFrame of results
        df_results = grid.cv_results_

        # print results
        ######################
        print('RESULTS:')
@@ -76,12 +76,12 @@ class SVM:
        print('mean of means:')
        print(sum(df_results['mean_test_score'])/len(df_results['mean_test_score']))
        print('')
        print('best score:')
        print(grid.best_score_)
        print()
        print('best parameters set found on development set:')
        print(grid.best_params_)
        print()

        print('# ending SVM')
        print('#')
@@ -15,9 +15,6 @@ from SVM import SVM
print('# starting program')
print('#')

# only if new unlabeled(!) data set is required:
# Requester.save_articles_from_webhoseio()

file = 'classification_labelled_corrected.csv'

# read csv file
@@ -25,7 +22,6 @@ print('# reading dataset')
print('#')
dataset = CsvHandler.read_csv(file)

# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)
@@ -0,0 +1,7 @@
@BOOK{pierson2016,
AUTHOR="Lillian Pierson",
TITLE="Data Science für Dummies",
PUBLISHER="WILEY-VCH Verlag GmbH \& Co. KGaA",
YEAR=2016,
ADDRESS="Weinheim"
}
Binary file not shown.
@@ -0,0 +1,450 @@
\documentclass[11pt,a4paper]{scrbook}
\usepackage{geometry}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[pdftex]{graphicx}
%\usepackage[ngerman]{babel}
\usepackage{colortbl}
\usepackage{xcolor}
\usepackage{soul}
\usepackage{cleveref}
\usepackage{todonotes}

\AtBeginDocument{\renewcommand{\chaptername}{}}

% Julian's comments
\newcommand{\jk}[1]{\todo[inline]{JK: #1}}
\renewcommand{\familydefault}{\sfdefault}

% Anne's comments
\definecolor{comments}{cmyk}{1,0,1,0}
\newcommand{\al}[1]{\todo[inline]{\color{comments}{AL: #1}}}


\definecolor{uhhred}{cmyk}{0,100,100,0}

\begin{document}

\frontmatter
\newgeometry{centering,left=2cm,right=2cm,top=2cm,bottom=2cm}
\begin{titlepage}
\includegraphics[scale=0.3]{UHH-Logo_2010_Farbe_CMYK.pdf}
\vspace*{2cm}
\Large
\begin{center}
{\color{uhhred}\textbf{\so{BACHELORTHESIS}}}
\vspace*{2.0cm}\\
{\LARGE \textbf{Interactive Labeling of Unclassified Data\\Using the Example of Recognition of Company Mergers}}
%or: Incremental labeling of an unknown data set using the example of classification of news articles
\vspace*{2.0cm}\\
vorgelegt von
\vspace*{0.4cm}\\
Anne Lorenz
\end{center}
\vspace*{3.5cm}

\noindent
MIN-Fakultät \vspace*{0.4cm} \\
Fachbereich Informatik \vspace*{0.4cm} \\
%Ggf. Professur/Institut \vspace*{0.4cm} \\
Studiengang: Software-System-Entwicklung \vspace*{0.4cm} \\
Matrikelnummer: 6434073 \vspace*{0.8cm} \\
Erstgutachter: Dr. Julian Kunkel \vspace*{0.4cm} \\
Zweitgutachter: Eugen Betke
\vspace*{0.8cm} \\
Betreuer: Dr. Julian Kunkel, Doris Birkefeld
\end{titlepage}

\restoregeometry

\chapter*{Abstract}
BLABLA ABSTRACT
%As objective, short, comprehensible, complete and precise as possible :-)

\tableofcontents

\mainmatter

%Chapter Introduction
%####################
\chapter{Introduction}
\label{chap:introduction}

\textit{
In this chapter...In \cref{sec:motivation} the motivation, then in \cref{sec:goals} the goals, blablabla...
}

\section{Motivation}
\label{sec:motivation}
Given a classification problem, there is always a labeled data set needed first to apply a machine learning model and make predictions possible. The larger the labeled data set is, the better are generally the predictions. However, to get there, each single data element must first be classified manually. Depending on the type of data, this procedure can be very time-consuming, for example if longer texts have to be read.

In this thesis we want to present an alternative data labeling method that allows to label a larger amount of data in a shorter time.

\section{Goals}
\label{sec:goals}

\jk{One sentence that describes the problem, then broken down into subtasks}

We want to compare a conventional method of data labeling with an alternative, incremental method using the following example: The aim is to investigate news articles about recent mergers ('mergers and acquisitions') and to classify them accordingly. With the help of the labeled data set, different classification models will be applied and optimized so that a prediction about future news articles will be possible.

\section{Outline}
about the outline...

\bigskip
\paragraph{Summary:}

\textit{\newline In this chapter we discussed ... The following chapter deals with blabla.}

%Chapter State of the Art
%##########################
\chapter{State of the Art}
\label{state_of_the_art}

\textit{In this chapter the current state of research in the field of... will be presented.
}

\section{State of Research}
\al{What should go in here?}

\bigskip
\paragraph{Summary:}

\textit{\newline In this chapter we have described ... are described in the next chapter. In the next chapter we describe...
}

%Chapter Background
%####################
\chapter{Background and Related Work}
\label{chap:background}

\textit{
In this chapter...In \cref{sec:news} news sources are introduced, then blablabla...
}

\section{Business News about Mergers}
\label{sec:news}

\subsection{Company Mergers}
When two companies merge, ... When shares of a company are sold, ... Blabla...

\subsection{Webhose.io as Source for News Articles}
As a source for our initial data set, RSS feeds from established business news agencies such as Reuters or Bloomberg come into consideration. However, when crawling RSS feeds, it is not possible to retrieve news from a longer period in the past. Since we want to analyze news of the last 12 months, we obtain the data set from the provider webhose.io. It offers access to English news articles from the sections 'Financial News', 'Finance' and 'Business', among others. As we are only interested in reliable sources, we limit our request to the websites of Reuters, Bloomberg, Financial Times, The Economist and ...


\section{Supervised Machine Learning Problems}

\subsubsection{Structured / Unstructured Data}

\subsection{Classification Problems}
\subsubsection{Binary Classification}
Comparable to spam filtering...
\subsubsection{Multiple Classification}

\subsection{Balanced / Unbalanced Data Set}


\section{Text Analysis}
\subsection{Natural Language Processing (NLP)}
\subsection{Tokenization}
\subsection{Unigram, Bigram}
\subsection{Stemming}
\subsection{Feature Vectors}
\subsubsection{Word Frequencies}
\subsection{Bag of Words (BOW)}
\subsection{Stop Words}
\subsection{Named Entity Recognition (NER)}

\section{Machine Learning Models}
\subsection{Naive Bayes Classifier}
\subsection{Support Vector Machines (SVM)}
\subsection{Decision Trees}
\subsection{Hyperparameters}
\subsection{Feature Selection}

\section{Split Methods}
\subsection{Test-Train-Split}
\subsection{Shuffle Split}
\subsection{(K-fold) Cross-Validation}

\section{Metrics}
\subsection{Accuracy, Error Rate, Sensitivity, Specificity}
Sensitivity (=true positive rate) and specificity (=true negative rate)
\subsection{Recall, Precision, F1-score}
\subsection{Robustness}
\subsection{Overfit, Underfit}
\subsection{Bias, Variance}
\subsection{Resubstitution Error}

\bigskip
\paragraph{Summary:}

\textit{\newline
In this chapter we ... blabla are described in section bla.
In the next chapter we describe...
}

%Chapter Design
%###########################
\chapter{Design}
\label{chap:design}

\textit{
In this chapter... In \cref{sec:overview} we give an overview of all, then in \cref{sec:pipeline} the data processing pipeline, blablabla...
}

\section{Overview}
\label{sec:overview}

\jk{What has to be done overall, which subproblems have to be addressed}

\jk{Discuss alternatives, make decisions based on criteria}

\jk{Some of this may still be moved to the 'Background' chapter. Your own work goes here, no related work or methods that already exist. Only relevant if you compare against them.}

\section{Data Processing Pipeline}
\label{sec:pipeline}

\section{Preprocessing}
Tokenization, Stemming, Stop Words, Leaving Out Numbers

\section{Data Labeling}

\subsection{Conventional Method}

\subsubsection{Top-Down / Waterfall}
1) Data Labeling \\
2) Data Cleaning\\
3) Model Building\\
4) Analysis of wrongly predicted instances
=> possibly relabel, usually not done\\
5) New hypotheses => 3); possibly back to 2)\\

\subsection{Incremental Method}

\subsubsection{Visual Analytics, Agile Model Development}

\subsubsection{Unbalanced Data Set}

\section{Model Selection}
\subsection{Naive Bayes}
GaussianNB vs MultinomialNB
\subsection{SVM}
\subsection{Decision Tree}


\section{Recognition of merger partners}
\subsubsection{Named Entity Recognition (NER)}

\bigskip
\paragraph{Summary:}

\textit{\newline
In this chapter we... In the next chapter...
}

% Chapter Labeling
%###########################
\chapter{Data Labeling}
\label{chap:labeling}

\textit{
This chapter describes the procedure for labeling. blabla
}

\section{Conventional Method}

\subsection{Data Set}
1497 articles\\
Period: 1 month\\
Source: Reuters.com\\

\subsection{Classification}
Data classified into two classes (binary), time required approx. 30 hours

\subsection{Difficulties}
A few text examples that were difficult to classify:\\
- how should sales of shares > 50 \% be handled? => actually amounts to a change of ownership\\
- "X will buy Y", "X wants to buy Y" => does it definitely take place? => the whole article has to be read\\
- merger only mentioned in passing ("last year X and Y merged..., now new business units emerge blabla"), but otherwise an irrelevant article
\\

=> these problems gave rise to the idea of using several classes

\section{Incremental Method}
\subsection{Data Set}
10,000 articles out of 130,000\\
Period: 12 months\\
Sources: Reuters.com, Bloomberg.com, ...\\
\subsection{Classification}
Data classified repeatedly with 6 classes:\\
\\
1: Merger \\
2: Merger Pending\\
3: Merger Aborted\\
4: Sale of Shares\\
5: Incidental \\
6: Irrelevant \\
\subsection{Selection of Articles}
\subsection{Procedure}
Randomly pick 10 articles from each month.
It is likely that one then only gets mergers with many articles
=> this could be minimized by doing “stratified” sampling
=> do NER first, then randomize fairly across the classes
=> pick 10 articles from 100 categories => select 10 categories => from each, one article at random
Label 1\% of all articles
1) Build first models, e.g. Bayes
Apply them to all articles => probability per class, vector: (K1, K2, … , K6)
Clear cases: Kx > 80\% and all other Ky < 10\% (with x in {1-6}, y != x); see the sketch below
=> adopt the label => how many cases are unambiguous?
Claim: 10\% of all articles are unambiguous
Spot-check => randomly pick 10 articles of each class
Identification of extremely unclear cases
More than one class has a similar probability
(5\%, 5\%, 5\%, …) => (80\%, 80\%, 0\%, 0\%, …)
e.g. look at 100 articles and label them manually
=> Repeat 3-4 times, then go back to step 1) (build model)
=> 95\% of all cases are now clear.
=> why do the remaining 5\% not work? Look at a sample of those articles
If that does not work, improve the models or the preprocessing (e.g. NER)
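
A possible implementation of the ``clear cases'' rule above could look like this (illustrative sketch only, not the final implementation):
\begin{verbatim}
# probs_row: predicted probabilities of one article for the classes K1..K6
import numpy as np

def is_clear_case(probs_row, hi=0.8, lo=0.1):
    # adopt the predicted label only if one class clearly dominates
    best = probs_row.argmax()
    others = np.delete(probs_row, best)
    return probs_row[best] > hi and (others < lo).all()
\end{verbatim}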
\subsection{Tagging of Named Entities}
Histogram: X: authors/persons, companies, Y: number of mentions

\bigskip
\paragraph{Summary:}

\textit{\newline
In this chapter...in the next chapter...
}

% Chapter Implementation
%##########################
\chapter{Implementation}
\label{chap:implementation}

\textit{
This chapter deals with the most relevant parts of the implementation.
}

\section{Data Download}
Query webhose.io:\\
% replace this!
query\_params = \{'q':'site:(reuters.com OR ft.com OR cnn.com OR economist.com OR bloomberg.com OR theguardian.com) site\_category:(financial\_news OR finance OR business)',
'ts': '1533634070282',
'sort': 'crawled'\}

\section{Python Modules}
\subsection{nltk}
\subsection{pandas}
\subsection{sklearn}
\subsection{webhoseio}
\section{Own Implementation}
\subsection{Examples}

\bigskip
\paragraph{Summary:}

\textit{\newline
In this chapter, we...In the next chapter...
}

% Chapter Evaluation
%##########################
\chapter{Evaluation}
\label{chap:evaluation}

\textit{
In this chapter we want to evaluate the different methods. blabla.
}

\section{News Articles Exploration}

\subsection{Length of Articles}
Or whatever else is interesting.

\subsection{Most Common Words}

With regard to the articles about mergers.
\subsubsection{Word Cloud}
e.g. a word cloud of the Microsoft-GitHub merger article.
\section{Model Fitting}
remember: vary hyperparameters SEPARATELY


\subsection{Naive Bayes Model}
Grid search

\subsection{SVM}
\subsection{Decision Tree}

\section{Performance}

\bigskip
\paragraph{Summary:}

\textit{\newline
In this chapter we have described ... In the last chapter we describe...
}

\chapter{Discussion (?)}
\al{Is this needed? The thesis should be critically questioned, e.g. 'was the data set well chosen?' etc.}

% Chapter Summary
%#############################
\chapter{Summary}
\label{chap:summary}

\section{Comparison of Labeling Methods}

\section{Quality of Predictions}

\section{Conclusions}

\section{Future Work}
Neural network

\bigskip
\paragraph{Summary:}

\textit{\newline
In the last chapter we have described ....
}

% the bibliography should appear in the table of contents
\nocite{*}
\addcontentsline{toc}{chapter}{Bibliography}

% show the bibliography
\bibliography{LV}

\backmatter

\thispagestyle{empty}

\vspace*{\fill}
\pagestyle{empty}

{\normalsize
\begin{center}\textbf{Eidesstattliche Erklärung}\end{center}
Hiermit versichere ich an Eides statt, dass ich die vorliegende Arbeit im Bachelorstudiengang Wirtschaftsinformatik selbstständig verfasst und keine anderen als die angegebenen Hilfsmittel – insbesondere keine im Quellenverzeichnis nicht benannten Internet-Quellen – benutzt habe. Alle Stellen, die wörtlich oder sinngemäß aus Veröffentlichungen entnommen wurden, sind als solche kenntlich gemacht. Ich versichere weiterhin, dass ich die Arbeit vorher nicht in einem anderen Prüfungsverfahren eingereicht habe und die eingereichte schriftliche Fassung der auf dem elektronischen Speichermedium entspricht.
\vspace*{1cm}\\
Hamburg, den 01.02.2019
\hspace*{\fill}\begin{tabular}{@{}l@{}}\hline
\makebox[5cm]{Anne Lorenz}
\end{tabular}
\vspace*{3cm}
%This part is optional, delete if necessary!
\begin{center}\textbf{Veröffentlichung}\end{center}
Ich stimme der Einstellung der Arbeit in die Bibliothek des Fachbereichs Informatik zu.
\vspace*{1cm}\\
Hamburg, den 01.02.2019
\hspace*{\fill}\begin{tabular}{@{}l@{}}\hline
\makebox[5cm]{Anne Lorenz}
\end{tabular}
}
\vspace*{\fill}

\end{document}