added requirements and some things
This commit is contained in:
		
							parent
							
								
									c2066d6adb
								
							
						
					
					
						commit
						ab578ae0c6
					
				| @ -11,8 +11,8 @@ c = 1, if articles are equal => identicalness is 100% | |||||||
| (The greater c, the more similar two articles are.) | (The greater c, the more similar two articles are.) | ||||||
| ''' | ''' | ||||||
| 
 | 
 | ||||||
| #TODO: uses dictionaries of each article  | #TODO:uses dictionaries of each article | ||||||
| # => ToDo: has to be changed as we are now using vectors | #=>ToDo:has to be changed as we are now using vectors | ||||||
| 
 | 
 | ||||||
| import math | import math | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -10,6 +10,7 @@ holding the class labels for the training samples. | |||||||
| import operator | import operator | ||||||
| 
 | 
 | ||||||
| from BagOfWords import BagOfWords | from BagOfWords import BagOfWords | ||||||
|  | from CsvHandler import CsvHandler | ||||||
| 
 | 
 | ||||||
| import graphviz | import graphviz | ||||||
| import numpy as np | import numpy as np | ||||||
| @ -21,8 +22,17 @@ from sklearn.model_selection import StratifiedKFold | |||||||
| 
 | 
 | ||||||
| class DecisionTree: | class DecisionTree: | ||||||
| 
 | 
 | ||||||
|     def make_tree(dataset): |     print('# starting program') | ||||||
|  |     print('#') | ||||||
| 
 | 
 | ||||||
|  |     file = 'classification_labelled_corrected.csv' | ||||||
|  | 
 | ||||||
|  |     # read csv file | ||||||
|  |     print('# reading dataset') | ||||||
|  |     print('#') | ||||||
|  |     dataset = CsvHandler.read_csv(file) | ||||||
|  | 
 | ||||||
|  |     def make_tree(dataset): | ||||||
|         print('# starting decision tree') |         print('# starting decision tree') | ||||||
|         print('#') |         print('#') | ||||||
| 
 | 
 | ||||||
| @ -104,6 +114,8 @@ class DecisionTree: | |||||||
|                 # format(min(f1_scores_train), max(f1_scores_train), |                 # format(min(f1_scores_train), max(f1_scores_train), | ||||||
|                 # sum(f1_scores_train)/float(len(f1_scores_train)))) |                 # sum(f1_scores_train)/float(len(f1_scores_train)))) | ||||||
|         # print() |         # print() | ||||||
|          |  | ||||||
|         print('# ending decision tree') |         print('# ending decision tree') | ||||||
|         print('#') |         print('#') | ||||||
|  | 
 | ||||||
|  |     DecisionTree.make_tree(dataset) | ||||||
|  |     print('# ending program') | ||||||
| @ -4,7 +4,7 @@ Naive Bayes Classifier | |||||||
| 
 | 
 | ||||||
| Naive Bayes is a probabilistic classifier that is able to predict a | Naive Bayes is a probabilistic classifier that is able to predict a | ||||||
| probability distribution over a set of classes, rather than only | probability distribution over a set of classes, rather than only | ||||||
| outputting the most likely class that the observation should belong to. | outputting the most likely class that the observation should belong to. | ||||||
| 'Naive' means, that it assumes that the value of a particular feature | 'Naive' means, that it assumes that the value of a particular feature | ||||||
| (word in an article) is independent of the value of any other feature, | (word in an article) is independent of the value of any other feature, | ||||||
| given the label. It considers each of these features to contribute | given the label. It considers each of these features to contribute | ||||||
| @ -13,6 +13,7 @@ regardless of any possible correlations between these features. | |||||||
| ''' | ''' | ||||||
| 
 | 
 | ||||||
| from BagOfWords import BagOfWords | from BagOfWords import BagOfWords | ||||||
|  | from CsvReader import CsvReader | ||||||
| 
 | 
 | ||||||
| from sklearn.feature_extraction.text import CountVectorizer | from sklearn.feature_extraction.text import CountVectorizer | ||||||
| from sklearn.feature_selection import SelectPercentile | from sklearn.feature_selection import SelectPercentile | ||||||
| @ -22,6 +23,16 @@ from sklearn.naive_bayes import GaussianNB | |||||||
| 
 | 
 | ||||||
| class NaiveBayes: | class NaiveBayes: | ||||||
| 
 | 
 | ||||||
|  |     print('# starting program') | ||||||
|  |     print('#') | ||||||
|  | 
 | ||||||
|  |     file = 'classification_labelled_corrected.csv' | ||||||
|  | 
 | ||||||
|  |     # read csv file | ||||||
|  |     print('# reading dataset') | ||||||
|  |     print('#') | ||||||
|  |     dataset = CsvHandler.read_csv(file) | ||||||
|  | 
 | ||||||
|     def make_naive_bayes(dataset): |     def make_naive_bayes(dataset): | ||||||
|         '''fits naive bayes model with StratifiedKFold, |         '''fits naive bayes model with StratifiedKFold, | ||||||
|         uses my BOW |         uses my BOW | ||||||
| @ -160,3 +171,6 @@ class NaiveBayes: | |||||||
|                 print() |                 print() | ||||||
|         #print metrics |         #print metrics | ||||||
|         print('F1 score: ', format(f1_score(y_train_test, predictions))) |         print('F1 score: ', format(f1_score(y_train_test, predictions))) | ||||||
|  | 
 | ||||||
|  |     print('#') | ||||||
|  |     print('# ending program') | ||||||
							
								
								
									
										12
									
								
								README.md
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								README.md
									
									
									
									
									
								
							| @ -1,3 +1,13 @@ | |||||||
| # thesis-anne | # thesis-anne | ||||||
| 
 |  | ||||||
| my python classes for text mining, machine learning models, … | my python classes for text mining, machine learning models, … | ||||||
|  | 
 | ||||||
|  | # Requirements | ||||||
|  | pandas==0.20.1 | ||||||
|  | nltk==3.2.5 | ||||||
|  | webhoseio==0.5 | ||||||
|  | numpy==1.14.0 | ||||||
|  | graphviz==0.9 | ||||||
|  | scikit_learn==0.19.2 | ||||||
|  | 
 | ||||||
|  | # Installation under (UBUNTU?) | ||||||
|  | apt-get install XX | ||||||
| @ -30,7 +30,7 @@ class Requester: | |||||||
|         print('# retrieving articles from webhose.io') |         print('# retrieving articles from webhose.io') | ||||||
|      |      | ||||||
|         # personal API key |         # personal API key | ||||||
|          webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX") |         webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX") | ||||||
| 
 | 
 | ||||||
|         # webhose.io query |         # webhose.io query | ||||||
|         # suboptimal: usage of search terms :-( |         # suboptimal: usage of search terms :-( | ||||||
| @ -73,7 +73,7 @@ class Requester: | |||||||
|                     article.append(output['posts'][i]['published']) |                     article.append(output['posts'][i]['published']) | ||||||
|                     article.append(output['posts'][i]['title'].replace('|', ' ')) |                     article.append(output['posts'][i]['title'].replace('|', ' ')) | ||||||
|                     # remove white spaces and separators |                     # remove white spaces and separators | ||||||
|                     text = output['posts'][i]['text'].replace('\n', ' ') |                     text = output['posts'][i]['text'].replace('\n', ' ')\ | ||||||
|                            .replace('\r', ' ').replace('|', ' ') |                            .replace('\r', ' ').replace('|', ' ') | ||||||
|                     section = output['posts'][i]['thread']['site_section'] |                     section = output['posts'][i]['thread']['site_section'] | ||||||
|                     article.append(text) |                     article.append(text) | ||||||
|  | |||||||
| @ -15,9 +15,6 @@ from SVM import SVM | |||||||
| print('# starting program') | print('# starting program') | ||||||
| print('#') | print('#') | ||||||
| 
 | 
 | ||||||
| # only if new unlabeled(!) data set is required: |  | ||||||
| # Requester.save_articles_from_webhoseio() |  | ||||||
| 
 |  | ||||||
| file = 'classification_labelled_corrected.csv' | file = 'classification_labelled_corrected.csv' | ||||||
| 
 | 
 | ||||||
| # read csv file | # read csv file | ||||||
| @ -25,7 +22,6 @@ print('# reading dataset') | |||||||
| print('#') | print('#') | ||||||
| dataset = CsvHandler.read_csv(file) | dataset = CsvHandler.read_csv(file) | ||||||
| 
 | 
 | ||||||
| # DecisionTree.make_tree(dataset) |  | ||||||
| NaiveBayes.make_naive_bayes(dataset) | NaiveBayes.make_naive_bayes(dataset) | ||||||
| # SVM.make_svm(dataset) | # SVM.make_svm(dataset) | ||||||
| 
 | 
 | ||||||
|  | |||||||
							
								
								
									
										7
									
								
								thesis/LV.bib
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										7
									
								
								thesis/LV.bib
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,7 @@ | |||||||
|  | @BOOK{pierson2016, | ||||||
|  | 	AUTHOR="Lillian Pierson", | ||||||
|  | 	TITLE="Data Science für Dummies", | ||||||
|  | 	PUBLISHER="WILEY-VCH Verlag GmbH \& Co. KGaA", | ||||||
|  | 	YEAR=2016, | ||||||
|  | 	ADDRESS="Weinheim" | ||||||
|  | } | ||||||
							
								
								
									
										
											BIN
										
									
								
								thesis/UHH-Logo_2010_Farbe_CMYK.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								thesis/UHH-Logo_2010_Farbe_CMYK.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										450
									
								
								thesis/thesis.tex
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										450
									
								
								thesis/thesis.tex
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,450 @@ | |||||||
|  | \documentclass[11pt,a4paper]{scrbook} | ||||||
|  | \usepackage{geometry}	 | ||||||
|  | \usepackage[utf8]{inputenc} | ||||||
|  | \usepackage[T1]{fontenc} | ||||||
|  | \usepackage[pdftex]{graphicx} | ||||||
|  | %\usepackage[ngerman]{babel} | ||||||
|  | \usepackage{colortbl}	 | ||||||
|  | \usepackage{xcolor} | ||||||
|  | \usepackage{soul} | ||||||
|  | \usepackage{cleveref} | ||||||
|  | \usepackage{todonotes} | ||||||
|  | 
 | ||||||
|  | \AtBeginDocument{\renewcommand{\chaptername}{}} | ||||||
|  | 
 | ||||||
|  | % Kommentare Julian | ||||||
|  | \newcommand{\jk}[1]{\todo[inline]{JK: #1}} | ||||||
|  | \renewcommand{\familydefault}{\sfdefault} | ||||||
|  | 
 | ||||||
|  | % Kommentare Anne | ||||||
|  | \definecolor{comments}{cmyk}{1,0,1,0} | ||||||
|  | \newcommand{\al}[1]{\todo[inline]{\color{comments}{AL: #1}}} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | \definecolor{uhhred}{cmyk}{0,100,100,0} | ||||||
|  | 
 | ||||||
|  | \begin{document} | ||||||
|  | 
 | ||||||
|  | \frontmatter | ||||||
|  | \newgeometry{centering,left=2cm,right=2cm,top=2cm,bottom=2cm} | ||||||
|  | \begin{titlepage} | ||||||
|  | \includegraphics[scale=0.3]{UHH-Logo_2010_Farbe_CMYK.pdf} | ||||||
|  | \vspace*{2cm} | ||||||
|  | \Large | ||||||
|  | \begin{center}  | ||||||
|  |       {\color{uhhred}\textbf{\so{BACHELORTHESIS}}} | ||||||
|  | \vspace*{2.0cm}\\ | ||||||
|  | {\LARGE \textbf{Interactive Labeling of Unclassified Data\\Using the Example of  Recognition of Company Mergers}} | ||||||
|  | %or: Incremental labeling of an unknown data set using the example of classification of news articles | ||||||
|  | \vspace*{2.0cm}\\ | ||||||
|  | vorgelegt von | ||||||
|  | \vspace*{0.4cm}\\ | ||||||
|  | Anne Lorenz | ||||||
|  | \end{center} | ||||||
|  | \vspace*{3.5cm} | ||||||
|  | 
 | ||||||
|  | \noindent  | ||||||
|  | MIN-Fakultät \vspace*{0.4cm} \\  | ||||||
|  | Fachbereich Informatik \vspace*{0.4cm} \\  | ||||||
|  | %Ggf. Professur/Institut \vspace*{0.4cm} \\ | ||||||
|  | Studiengang: Software-System-Entwicklung \vspace*{0.4cm} \\  | ||||||
|  | Matrikelnummer: 6434073 \vspace*{0.8cm} \\  | ||||||
|  | Erstgutachter: Dr. Julian Kunkel \vspace*{0.4cm} \\  | ||||||
|  | Zweitgutachter: Eugen Betke | ||||||
|  |  \vspace*{0.8cm} \\ | ||||||
|  | Betreuer: Dr. Julian Kunkel, Doris Birkefeld | ||||||
|  | \end{titlepage} | ||||||
|  | 
 | ||||||
|  | \restoregeometry | ||||||
|  | 
 | ||||||
|  | \chapter*{Abstract} | ||||||
|  | BLABLA ABSTRACT | ||||||
|  | %So objektiv, kurz, verständlich, vollständig und genau wie möglich :-) | ||||||
|  | 
 | ||||||
|  | \tableofcontents | ||||||
|  | 
 | ||||||
|  | \mainmatter  | ||||||
|  | 
 | ||||||
|  | %Kapitel Einleitung | ||||||
|  | %#################### | ||||||
|  | \chapter{Introduction}  | ||||||
|  | \label{chap:introduction} | ||||||
|  | 
 | ||||||
|  | \textit{ | ||||||
|  | In this chapter...In \cref{sec:motivation} the motivation, then in \cref{sec:goals} the goals, blablabla... | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | \section{Motivation}  | ||||||
|  | \label{sec:motivation} | ||||||
|  |  Given a classification problem, a labeled data set is always needed first in order to apply a machine learning model and make predictions possible. The larger the labeled data set is, the better the predictions generally are. However, to get there, each single data element must first be classified manually. Depending on the type of data, this procedure can be very time-consuming, for example if longer texts have to be read. | ||||||
|  | 
 | ||||||
|  | In this thesis we want to present an alternative data labeling method that makes it possible to label a larger amount of data in a shorter time.  | ||||||
|  | 
 | ||||||
|  | \section{Goals}  | ||||||
|  | \label{sec:goals} | ||||||
|  | 
 | ||||||
|  | \jk{Ein Satz, welcher das Problem beschreibt, danach dann runtergebrochen in Teilaufgaben} | ||||||
|  | 
 | ||||||
|  |  We want to compare a conventional method of data labeling with an alternative, incremental method using the following example: The aim is to investigate news articles about recent mergers ('mergers and acquisitions') and to classify them accordingly. With the help of the labeled data set, different classification models will be applied and optimized so that a prediction about future news articles will be possible.  | ||||||
|  | 
 | ||||||
|  | \section{Outline} | ||||||
|  | über die gliederung... | ||||||
|  | 
 | ||||||
|  | \bigskip | ||||||
|  | \paragraph{Summary:}  | ||||||
|  | 
 | ||||||
|  | \textit{\newline In this chapter we discussed ... The following chapter deals with blabla.} | ||||||
|  |   | ||||||
|  | %Kapitel Stand der Technik  | ||||||
|  | %########################## | ||||||
|  | \chapter{State of the Art}  | ||||||
|  | \label{state_of_the_art} | ||||||
|  | 
 | ||||||
|  | \textit{In this chapter the current state of research in the field of... will be presented. | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | \section{State of Research} | ||||||
|  | \al{Was soll hier rein?} | ||||||
|  | 
 | ||||||
|  | \bigskip | ||||||
|  | \paragraph{Summary:}  | ||||||
|  | 
 | ||||||
|  | \textit{\newline In this chapter we have described ... are described in the next chapter. In the next chapter we describe... | ||||||
|  | } | ||||||
|  |   | ||||||
|  | %Kapitel Grundlagen  | ||||||
|  | %####################  | ||||||
|  | \chapter{Background and Related Work}  | ||||||
|  | \label{chap:background} | ||||||
|  | 
 | ||||||
|  | \textit{ | ||||||
|  | In this chapter...In \cref{sec:news} news sources are introduced, then blablabla... | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | \section{Business News about Mergers}  | ||||||
|  | \label{sec:news} | ||||||
|  | 
 | ||||||
|  | \subsection{Company Mergers}  | ||||||
|  | When two companies merge, ... When shares of a company are sold, ... Blabla... | ||||||
|  | 
 | ||||||
|  | \subsection{Webhose.io as Source for News Articles}  | ||||||
|  | As a source for our initial data set, RSS feeds from established business news agencies such as Reuters or Bloomberg come into consideration. However, when crawling RSS feeds, it is not possible to retrieve news from a longer period in the past. Since we want to analyze news of the last 12 months, we obtain the data set from the provider webhose.io. It offers access to English news articles from the sections 'Financial News', 'Finance' and 'Business', among others. As we are only interested in reliable sources, we limit our request to the websites of Reuters, Bloomberg, Financial Times, The Economist and ...  | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | \section{Supervised Machine Learning Problems}  | ||||||
|  | 
 | ||||||
|  | \subsubsection{Structured / Unstructured Data}  | ||||||
|  | 
 | ||||||
|  | \subsection{Classification Problems} | ||||||
|  | \subsubsection{Binary Classification} | ||||||
|  | Vergleichbar mit Spamfilterung... | ||||||
|  | \subsubsection{Multiple Classification} | ||||||
|  | 
 | ||||||
|  | \subsection{Balanced / Unbalanced Data Set} | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | \section{Text Analysis}  | ||||||
|  | \subsection{Natural Language Processing (NLP)}  | ||||||
|  | \subsection{Tokenization} | ||||||
|  | \subsection{Unigram, Bigram}  | ||||||
|  | \subsection{Stemming}  | ||||||
|  | \subsection{Feature Vectors} | ||||||
|  | \subsubsection{Word Frequencies}  | ||||||
|  | \subsection{Bag of Words (BOW)}  | ||||||
|  | \subsection{Stop Words}  | ||||||
|  | \subsection{Named Entity Recognition (NER)}  | ||||||
|  | 
 | ||||||
|  | \section{Machine Learning Models}  | ||||||
|  | \subsection{Naive Bayes Classifier}  | ||||||
|  | \subsection{Support Vector Machines (SVM)}  | ||||||
|  | \subsection{Decision Trees}  | ||||||
|  | \subsection{Hyperparameters}  | ||||||
|  | \subsection{Feature Selection} | ||||||
|  | 
 | ||||||
|  | \section{Split Methods}  | ||||||
|  | \subsection{Test-Train-Split} | ||||||
|  | \subsection{Shuffle Split}  | ||||||
|  | \subsection{(K-fold) Cross-Validation} | ||||||
|  | 
 | ||||||
|  | \section{Metrics} | ||||||
|  | \subsection{Accuracy, Error Rate, Sensitivity, Specificity} | ||||||
|  | Sensitivity(=true positive rate) and Specificity(=true negative rate) | ||||||
|  | \subsection{Recall, Precision, F1-score}  | ||||||
|  | \subsection{Robustness} | ||||||
|  | \subsection{Overfit, Underfit} | ||||||
|  | \subsection{Bias, Variance} | ||||||
|  | \subsection{Resubstitution Error} | ||||||
|  |    | ||||||
|  | \bigskip | ||||||
|  | \paragraph{Summary:}  | ||||||
|  | 
 | ||||||
|  | \textit{\newline | ||||||
|  | In this chapter we ... blabla are described in section bla. | ||||||
|  | In the next chapter we describe... | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | %Kapitel Design | ||||||
|  | %########################### | ||||||
|  | \chapter{Design}  | ||||||
|  | \label{chap:design} | ||||||
|  | 
 | ||||||
|  | \textit{ | ||||||
|  | In this chapter... In \cref{sec:overview} we give an overview of all, then in \cref{sec:pipeline} the data processing pipeline, blablabla... | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | \section{Overview} | ||||||
|  | \label{sec:overview} | ||||||
|  | 
 | ||||||
|  | \jk{Was muss insgesamt gemacht werden, welche Teilprobleme müssen adressiert werden} | ||||||
|  | 
 | ||||||
|  | \jk{Alternativen besprechen, Entscheidungen fällen basierend auf Kriterien} | ||||||
|  | 
 | ||||||
|  | \jk{Hier ist evtl. noch einiges drin was in Kapitel 'Grundlagen' verschoben wird. Hier kommt Deine Arbeit hin, kein Related work oder Methoden die es schon gibt. Nur falls man es vergleicht, dann relevant.} | ||||||
|  | 
 | ||||||
|  | \section{Data Processing Pipeline}  | ||||||
|  | \label{sec:pipeline} | ||||||
|  | 
 | ||||||
|  | \section{Preprocessing}  | ||||||
|  | Tokenization, Stemming, Stop Words, Leaving Out Numbers | ||||||
|  | 
 | ||||||
|  | \section{Data Labeling}  | ||||||
|  | 
 | ||||||
|  | \subsection{Conventional Method}  | ||||||
|  | 
 | ||||||
|  | \subsubsection{Top-Down / Waterfall} | ||||||
|  | 1) Data Labeling \\ | ||||||
|  | 2) Data Cleaning\\ | ||||||
|  | 3) Model Building\\ | ||||||
|  | 4) Analysis of wrongly predicted instances  | ||||||
|  | => evtl. neu labeln, wird meistens nicht gemacht\\ | ||||||
|  | 5) Neue Hypothesen => 3); evtl. zu 2)\\ | ||||||
|  | 
 | ||||||
|  | \subsection{Incremental Method}  | ||||||
|  | 
 | ||||||
|  | \subsubsection{Visual Analytics, Agile Model Development} | ||||||
|  | 
 | ||||||
|  | \subsubsection{Unbalanced Data Set}  | ||||||
|  | 
 | ||||||
|  | \section{Model Selection} | ||||||
|  | \subsection{Naive Bayes}  | ||||||
|  | GaussianNB vs MultinomialNB | ||||||
|  | \subsection{SVM}  | ||||||
|  | \subsection{Decision Tree}  | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | \section{Recognition of merger partners}  | ||||||
|  | \subsubsection{Named Entity Recognition (NER)}  | ||||||
|  | 
 | ||||||
|  | \bigskip | ||||||
|  | \paragraph{Summary:}  | ||||||
|  | 
 | ||||||
|  | \textit{\newline | ||||||
|  | In this chapter we... In the next chapter... | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | % Kapitel Labeling | ||||||
|  | %########################### | ||||||
|  | \chapter{Data Labeling} | ||||||
|  | \label{chap:labeling} | ||||||
|  | 
 | ||||||
|  | \textit{ | ||||||
|  | This chapter describes the procedure for labeling. blabla | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | \section{Conventional Method} | ||||||
|  | 
 | ||||||
|  | \subsection{Data Set} | ||||||
|  | 1497 Artikel\\ | ||||||
|  | Zeitraum: 1 Monat\\ | ||||||
|  | Quelle: Reuters.com\\ | ||||||
|  | 
 | ||||||
|  | \subsection{Classification} | ||||||
|  | Daten binär klassifiziert, Zeitaufwand ca. 30 Stunden | ||||||
|  | 
 | ||||||
|  | \subsection{Difficulties} | ||||||
|  | Hier ein paar Textbeispiele, die schwierig einzuordnen waren:\\ | ||||||
|  | - wie soll mit Anteilsverkäufen > 50 \% umgegangen werden? => bedeutet eigentlich Eigentümerwechsel\\ | ||||||
|  | - "X will buy Y", "X wants to buy Y" => findet es definitiv statt? => ganzer Artikel muss gelesen werden\\ | ||||||
|  | - Fusion nur als Randbemerkung, ("letztes Jahr haben X und Y fusioniert..., jetzt entstehen neue geschäftsbereiche blabla") ansonsten aber irrelevanter Artikel | ||||||
|  | \\ | ||||||
|  | 
 | ||||||
|  | => aus diesen problemen heraus entstand die idee, verschiedene klassen zu verwenden | ||||||
|  | 
 | ||||||
|  | \section{Incremental Method} | ||||||
|  | \subsection{Data Set} | ||||||
|  | 10.000 Artikel aus 130.000\\ | ||||||
|  | Zeitraum: 12 Monate\\ | ||||||
|  | Quellen: Reuters.com, Bloomberg.com, ...\\ | ||||||
|  | \subsection{Classification} | ||||||
|  | Daten mehrfach klassifiziert mit 6 Klassen:\\ | ||||||
|  | \\ | ||||||
|  | 1: Merger \\ | ||||||
|  | 2: Merger Pending\\ | ||||||
|  | 3: Merger Aborted\\ | ||||||
|  | 4: Sale of Shares\\ | ||||||
|  | 5: Incidental \\ | ||||||
|  | 6: Irrelevant \\ | ||||||
|  | \subsection{Selection of Articles}  | ||||||
|  | \subsection{Procedure} | ||||||
|  | Wähle von jedem Monat 10 Artikel zufällig aus. | ||||||
|  | Es ist wahrscheinlich, dass man nur Merger mit vielen Artikeln hat | ||||||
|  | => Das könnte man minimieren indem man “stratified” sampling macht | ||||||
|  | => Zuerst NER machen, danach fair über Klassen randomisieren | ||||||
|  | => wähle 10 Artikel von 100 Kategorien aus => 10 Kategorien auswählen => darunter zufällig ein Artikel | ||||||
|  | Labeln von 1\% aller Artikel | ||||||
|  | 1) Erste Modelle bauen z.b. Bayes | ||||||
|  | Auf alle Artikel anwenden => Wahrscheinlichkeit pro Klasse Vektor: (K1, K2, … , K6) | ||||||
|  | Klare Fälle: Kx > 80\% und alle anderen Ky < 10\% (mit x in \{1-6\}, y != x) | ||||||
|  | => Label übernehmen => wie viele Fälle sind eindeutig? | ||||||
|  | Behauptung: 10\% aller Artikel sind eindeutig | ||||||
|  | Stichprobenartig überprüfen => 10 Artikel random auswählen von jeder Klasse | ||||||
|  | Identifikation von äußerst unklaren Fällen | ||||||
|  | Mehr als eine Klasse hat ähnliche Wahrscheinlichkeit | ||||||
|  | (5\%, 5\%, 5\%, …) => (80\%, 80\%, 0\%, 0\%, …) | ||||||
|  | z.b. 100 Artikel angucken und manuell labeln | ||||||
|  | => Wiederhole ich 3-4 mal gehe zu Schritt 1) (Modell bauen) | ||||||
|  | => 95\% aller Fälle sind jetzt klar. | ||||||
|  | => warum gehen die 5\% nicht? Stichprobenartig Artikel anschauen | ||||||
|  | Falls das nicht klappt, Modelle oder Preprocessing (z.b. NER) verbessern | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | \subsection{Tagging of Named Entities}  | ||||||
|  | Histogram: X: Autoren/Personen, Unternehmen, Y: Anzahl der Nennungen | ||||||
|  | 
 | ||||||
|  | \bigskip | ||||||
|  | \paragraph{Summary:}  | ||||||
|  | 
 | ||||||
|  | \textit{\newline | ||||||
|  | In this chapter...in the next chapter... | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | % Kapitel Implementierung | ||||||
|  | %########################## | ||||||
|  | \chapter{Implementation}  | ||||||
|  | \label{chap:implementation} | ||||||
|  | 
 | ||||||
|  | \textit{ | ||||||
|  | This chapter deals with the most relevant parts of the implementation. | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | \section{Data Download}  | ||||||
|  | Query webhose.io:\\ | ||||||
|  | % austauschen! | ||||||
|  | query\_params = \{'q':'site:(reuters.com OR ft.com OR cnn.com OR economist.com OR bloomberg.com OR theguardian.com) site\_category:(financial\_news OR finance OR business)', | ||||||
|  | 	'ts': '1533634070282', | ||||||
|  | 	'sort': 'crawled'\} | ||||||
|  | 
 | ||||||
|  | \section{Python Modules}  | ||||||
|  | \subsection{nltk}  | ||||||
|  | \subsection{pandas}  | ||||||
|  | \subsection{sklearn} | ||||||
|  | \subsection{webhoseio} | ||||||
|  | \section{Own Implementation} | ||||||
|  | \subsection{Examples}  | ||||||
|  | 
 | ||||||
|  | \bigskip | ||||||
|  | \paragraph{Summary:}  | ||||||
|  | 
 | ||||||
|  | \textit{\newline | ||||||
|  | In this chapter, we...In the next chapter... | ||||||
|  | } | ||||||
|  |    | ||||||
|  | % Kapitel Evaluation   | ||||||
|  | %########################## | ||||||
|  | \chapter{Evaluation}  | ||||||
|  | \label{chap:evaluation} | ||||||
|  | 
 | ||||||
|  | \textit{ | ||||||
|  | In this chapter we want to evaluate the different methods. blabla. | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | \section{News Articles Exploration}  | ||||||
|  | 
 | ||||||
|  | \subsection{Length of Articles} | ||||||
|  | Oder was sonst noch interessant ist. | ||||||
|  | 
 | ||||||
|  | \subsection{Most Common Words}  | ||||||
|  | 
 | ||||||
|  | Im Bezug auf die Artikel über Fusion. | ||||||
|  | \subsubsection{Word Cloud}  | ||||||
|  | z.B. Word Cloud mit Microsoft-Github-Fusion-Artikel. | ||||||
|  | \section{Model Fitting} | ||||||
|  | dran denken: Hyperparameter SEPARAT variieren | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | \subsection{Naive Bayes Model} | ||||||
|  | Grid-Search | ||||||
|  | 
 | ||||||
|  | \subsection{SVM} | ||||||
|  | \subsection{Decision Tree} | ||||||
|  | 
 | ||||||
|  | \section{Performance} | ||||||
|  | 
 | ||||||
|  | \bigskip | ||||||
|  | \paragraph{Summary:}  | ||||||
|  | 
 | ||||||
|  | \textit{\newline | ||||||
|  | In this chapter we have described ... In the last chapter we describe... | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | \chapter{Discussion (?)} | ||||||
|  | \al{Braucht man das? Arbeit soll kritisch hinterfragt werden, z.B. 'war der datensatz gut gewählt?' etc.} | ||||||
|  |    | ||||||
|  | % Kapitel Zusammenfassung   | ||||||
|  | %############################# | ||||||
|  | \chapter{Summary}  | ||||||
|  | \label{chap:summary} | ||||||
|  | 
 | ||||||
|  | \section{Comparison of Labeling Methods} | ||||||
|  | 
 | ||||||
|  | \section{Quality of Predictions}  | ||||||
|  | 
 | ||||||
|  | \section{Conclusions}  | ||||||
|  | 
 | ||||||
|  | \section{Future Work}  | ||||||
|  | Neuronales Netz | ||||||
|  | 
 | ||||||
|  | \bigskip | ||||||
|  | \paragraph{Summary:}  | ||||||
|  | 
 | ||||||
|  | \textit{\newline | ||||||
|  | In the last chapter we have described .... | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | % Literaturliste soll im Inhaltsverzeichnis auftauchen | ||||||
|  | \nocite{*} | ||||||
|  | \addcontentsline{toc}{chapter}{Bibliography} | ||||||
|  | 
 | ||||||
|  | % Literaturliste anzeigen | ||||||
|  | \bibliography{LV} | ||||||
|  | 
 | ||||||
|  | \backmatter  | ||||||
|  | 
 | ||||||
|  | \thispagestyle{empty} | ||||||
|  | 
 | ||||||
|  | \vspace*{\fill} | ||||||
|  | \pagestyle{empty} | ||||||
|  | 
 | ||||||
|  | {\normalsize | ||||||
|  | \begin{center}\textbf{Eidesstattliche Erklärung}\end{center} | ||||||
|  | Hiermit versichere ich an Eides statt, dass ich die vorliegende Arbeit im Bachelorstudiengang Wirtschaftsinformatik selbstständig verfasst und keine anderen als die angegebenen Hilfsmittel – insbesondere keine im Quellenverzeichnis nicht benannten Internet-Quellen – benutzt habe. Alle Stellen, die wörtlich oder sinngemäß aus Veröffentlichungen entnommen wurden, sind als solche kenntlich gemacht. Ich versichere weiterhin, dass ich die Arbeit vorher nicht in einem anderen Prüfungsverfahren eingereicht habe und die eingereichte schriftliche Fassung der auf dem elektronischen Speichermedium entspricht. | ||||||
|  | \vspace*{1cm}\\ | ||||||
|  | Hamburg, den 01.02.2019 | ||||||
|  | \hspace*{\fill}\begin{tabular}{@{}l@{}}\hline | ||||||
|  | \makebox[5cm]{Anne Lorenz} | ||||||
|  | \end{tabular} | ||||||
|  | \vspace*{3cm} | ||||||
|  | %Dies ist optional, ggf. löschen! | ||||||
|  | \begin{center}\textbf{Veröffentlichung}\end{center} | ||||||
|  | Ich stimme der Einstellung der Arbeit in die Bibliothek des Fachbereichs Informatik zu. | ||||||
|  | \vspace*{1cm}\\ | ||||||
|  | Hamburg, den 01.02.2019 | ||||||
|  | \hspace*{\fill}\begin{tabular}{@{}l@{}}\hline | ||||||
|  | \makebox[5cm]{Anne Lorenz} | ||||||
|  | \end{tabular} | ||||||
|  | } | ||||||
|  | \vspace*{\fill}  | ||||||
|  | 
 | ||||||
|  | \end{document} | ||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user