For a classification problem, a labeled data set is always needed before a machine learning model can be applied and predictions become possible. In general, the larger the labeled data set, the better the predictions. However, to get there, every single data element first has to be classified manually. Depending on the type of data, this procedure can be very time-consuming, for example if long texts have to be read.
In this thesis we present an alternative data labeling method that allows a larger amount of data to be labeled in a shorter time.
\section{Goals}
\label{sec:goals}
\jk{One sentence describing the problem, then broken down into subtasks}
We want to compare a conventional method of data labeling with an alternative, incremental method using the following example: the aim is to investigate news articles about recent mergers ('mergers and acquisitions') and to classify them accordingly. With the help of the labeled data set, different classification models will be applied and optimized so that predictions about future news articles become possible.
As a source for our initial data set, RSS feeds from established business news agencies such as \textit{Reuters} or \textit{Bloomberg} come into consideration. However, when crawling RSS feeds, it is not possible to retrieve news from a longer period in the past. Since we want to analyze news from a period of 12 months, we obtain the data set from the provider \textit{webhose.io}\footnote{\url{https://webhose.io/}}. It offers access to English news articles from sections like \textit{Financial News}, \textit{Finance} and \textit{Business} at affordable fees compared to the news agencies' own offers. As we are only interested in reliable sources, we limit our request to the websites of the news agencies \textit{Reuters, Bloomberg, Financial Times, CNN, The Economist} and \textit{The Guardian}.
\jk{What has to be done overall, which subproblems have to be addressed. Discuss alternatives, make decisions based on criteria. This is where your own work goes, not related work or existing methods. Those are only relevant if you compare against them.}
First, we need to select appropriate data, then label a data set manually, preprocess the articles, select and optimize suitable classification models, and finally recognize the merger partners in the classified articles.\\
\\
% Insert data processing pipeline as a figure:
Data Selection $\rightarrow$ Labeling $\rightarrow$ Preprocessing $\rightarrow$ Model Selection $\rightarrow$ Recognition of Merger Partners
\section{Data Selection}
\label{sec:data_selection}
Before we can start with the data processing, we need to identify and select appropriate data. We downloaded news articles covering 12 months (the year 2017) from \textit{webhose.io} as described in Chapter \ref{chap:implementation}, Section \ref{sec:data_download}.
As \textit{webhose.io} is a secondary source that merely crawls the news feeds itself, some RSS feeds may not be parsed correctly, or an article may be tagged with a wrong topic in its \textit{site categories}. The downloaded files also contain blog entries, user comments, videos, graphical content and other spam, which we have to filter out, as well as pages that merely quote Reuters and other agencies. Besides this, we are only interested in English news articles. \\
After we have filtered out all irrelevant data, we obtain a data set of XX.XXX news articles, which we store in a CSV file.
The CSV file contains the following nine columns:
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|c|c|c|}
\hline
SectionTitle & Title & SiteSection & Text & Uuid & Timestamp & Site & SiteFull & Url \\
\hline
\end{tabular}
\end{center}
Some of these columns are described in more detail below:
\begin{itemize}
\item\textbf{SiteSection:} The link to the section of the site where the thread was created, e.g. \textit{'http://feeds.reuters.com/reuters/financialsNews'}
\item\textbf{Uuid:} Universally unique identifier, representing the article's thread.
\item\textbf{Timestamp:} The thread's publishing date and time in ISO 8601 format including the timezone offset, e.g. \textit{'2018-09-17T20:00:00.000+03:00'}
\item\textbf{Site:} The top level domain of the article's site, e.g. \textit{'reuters.com'}
\item\textbf{SiteFull:} The complete domain of the article's site, e.g. \textit{'reuters.com'}
\item\textbf{Url:} The link to the top of the article's thread, e.g. \textit{'https://www.reuters.com/article/us-github-m-a-microsoft-eu/eu-antitrust-ruling-on-microsoft-buy-of-github-due-by-october-19-idUSKCN1LX114'}
\end{itemize}
The columns \textbf{Title} and \textbf{Text} contain our main data, whereas the remaining attributes are meta data.
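A minimal sketch of this filtering step is given below. It assumes that the downloaded articles have already been collected into a table with the columns listed above (the file name \texttt{all\_articles.csv} is hypothetical); the criteria for spam, blog entries, user comments and quoted content require additional heuristics and are only hinted at here.
\begin{verbatim}
# Simplified sketch of the filtering step (illustration, not the final code).
import pandas as pd

RELIABLE_SITES = ['reuters.com', 'bloomberg.com', 'ft.com',
                  'cnn.com', 'economist.com', 'theguardian.com']

df = pd.read_csv('all_articles.csv')              # hypothetical intermediate file

df = df[df['Site'].isin(RELIABLE_SITES)]          # keep only the selected news sites
df = df[df['Text'].notna()]                       # drop entries without article text
df = df.drop_duplicates(subset=['Title', 'Text']) # drop exact duplicates
# Further heuristics (spam, blogs, user comments, non-English texts, pages
# quoting Reuters) would be applied here before writing the final file.

df.to_csv('cleaned_articles.csv', index=False)
\end{verbatim}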
\section{Labeling}
From our data set of XX.XXX news articles, we select 10,000 articles\footnote{833 or 834 articles from each month} to proceed with the labeling process.
First, we label a slightly smaller data set in a conventional way. This data set consists of 1,497 news articles, which were also downloaded via \textit{webhose.io}. It contains news articles from different Reuters RSS feeds covering a period of one month\footnote{The timeframe is May 25 to June 25, 2018, retrieved on June 25, 2018.}. Here, we keep only those articles that contain at least one of the keywords \textit{'merger', 'acquisition', 'take over', 'deal', 'transaction'} or \textit{'buy'} in the heading.
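To illustrate this filter, the following minimal sketch applies the keyword list to article headings; the list of articles and the second heading are made up solely for this example, and in practice the filter may equally well be expressed directly in the \textit{webhose.io} query.
\begin{verbatim}
# Minimal sketch of the keyword filter on article headings (illustration only).
KEYWORDS = ['merger', 'acquisition', 'take over', 'deal', 'transaction', 'buy']

def is_candidate(title):
    """Return True if the heading contains at least one of the keywords."""
    title = title.lower()
    return any(keyword in title for keyword in KEYWORDS)

# 'articles' stands for the downloaded articles; the entries here are examples.
articles = [{'title': 'EU antitrust ruling on Microsoft buy of GitHub due by October 19'},
            {'title': 'Some unrelated business news'}]
candidates = [a for a in articles if is_candidate(a['title'])]  # keeps only the first entry
\end{verbatim}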
Some article texts were difficult to classify even when read carefully.
Here are a few examples of the difficulties that showed up:
\begin{itemize}
\item\textit{'Company A acquires more than 50\% of the shares of company B.'}\\ $\Rightarrow$ How should share deals be handled? They do mean a change of ownership, even if they are not mergers in the strict sense.
\item\textit{'Company X will buy/wants to buy company Y.'}\\ $\Rightarrow$ Will the merger definitely take place? Which circumstances does it depend on?
\item\textit{'Last year company X and company Y merged. Now company A wants to invest more in renewable energies.'}\\ $\Rightarrow$ The merger is only mentioned in an incidental remark and is not taking place right now; the main topic of the article is something completely different.
\end{itemize}
These difficulties led to the idea of using different labeling classes, which we finally implemented in the interactive labeling method.
For the interactive labeling method, we use the data set of 10,000 articles covering a whole year, as described in Section \ref{sec:data_selection}.
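To make the incremental procedure more concrete, the following sketch shows a single iteration of the labeling loop. It assumes scikit-learn, a hypothetical CSV file with a \texttt{Text} and a partially filled \texttt{Label} column, and exemplary confidence thresholds (80\,\% for the most probable class, at most 10\,\% for all others); the models and thresholds actually used are a separate design decision and may differ from this illustration.
\begin{verbatim}
# Illustrative sketch of one iteration of the incremental labeling loop.
# File name, column names and thresholds are assumptions for this example.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

df = pd.read_csv('articles_to_label.csv')   # 'Label' is empty for unlabeled articles
labeled = df[df['Label'].notna()]
unlabeled = df[df['Label'].isna()]

# Build a simple model (e.g. Naive Bayes) on the manually labeled articles.
vectorizer = TfidfVectorizer(stop_words='english')
model = MultinomialNB().fit(vectorizer.fit_transform(labeled['Text']), labeled['Label'])

# Apply the model to all unlabeled articles: one probability vector per article.
proba = model.predict_proba(vectorizer.transform(unlabeled['Text']))
ranked = np.sort(proba, axis=1)
best, second = ranked[:, -1], ranked[:, -2]

# Clear cases: highest class probability above 80 %, all others below 10 %.
clear = (best > 0.8) & (second < 0.1)
df.loc[unlabeled.index[clear], 'Label'] = model.classes_[proba.argmax(axis=1)][clear]

# Unclear cases (several classes with similar probability) are presented to
# the user for manual labeling; afterwards the next iteration starts again.
unclear = unlabeled.index[~clear][np.argsort((best - second)[~clear])][:100]
\end{verbatim}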
%It is likely that we then only have mergers with many articles => this could be minimized with ``stratified'' sampling => first apply NER, then randomize fairly across classes => select 10 articles from 100 categories => select 10 categories => among them, pick one article at random. Label 1\% of all articles
%1) Build first models, e.g. Bayes. Apply them to all articles => probability per class, vector: (K1, K2, ..., K6)
%Clear cases: Kx > 80\% and all other Ky < 10\% (with x in {1-6}, y != x)
%=> adopt the label => how many cases are unambiguous?
%Claim: 10\% of all articles are unambiguous
%Check by sampling => randomly select 10 articles from each class
%Identification of highly unclear cases
%More than one class has a similar probability
%(5\%, 5\%, 5\%, ...) => (80\%, 80\%, 0\%, 0\%, ...)
%e.g. look at 100 articles and label them manually
%=> repeat this 3-4 times, then go back to step 1) (build model)
%=> 95\% of all cases are now clear.
%=> why do the remaining 5\% not work? Inspect a sample of these articles
%If this does not work, improve the models or the preprocessing (e.g. NER)
To retrieve our data, we make the following request on the website
\url{https://webhose.io}:\\\\
\texttt{
site:(reuters.com OR ft.com OR cnn.com OR economist.com\\
\noindent\hspace*{12mm}%
OR bloomberg.com OR theguardian.com)\\
site\_category:(financial\_news OR finance OR business)\\
\\
timeframe: january 2017 - december 2017}\\
\\
The requested data was downloaded in September 2018 in JSON file format. Every news article is saved in a separate file; in total, 1,478,508 files were downloaded (4.69 GiB).
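For illustration, a minimal download sketch is shown below. The endpoint, parameter names and response fields follow the public \textit{webhose.io} REST API as far as we are aware and should be treated as assumptions; the snippet is not the exact download script used for this thesis.
\begin{verbatim}
# Illustrative sketch of the paginated download from webhose.io; endpoint,
# parameter and field names are assumptions, not the exact implementation.
import json
import os
import requests

QUERY = ('site:(reuters.com OR ft.com OR cnn.com OR economist.com '
         'OR bloomberg.com OR theguardian.com) '
         'site_category:(financial_news OR finance OR business)')

url = 'http://webhose.io/filterWebContent'        # assumed REST endpoint
params = {'token': os.environ['WEBHOSE_TOKEN'],   # personal API key (assumed env variable)
          'format': 'json',
          'q': QUERY,
          'ts': '1483228800000'}                  # January 1, 2017 in milliseconds (assumed)

os.makedirs('articles', exist_ok=True)
count = 0
while True:
    response = requests.get(url, params=params).json()
    # Save every news article (post) in a separate JSON file.
    for post in response.get('posts', []):
        with open(f'articles/article_{count}.json', 'w', encoding='utf-8') as f:
            json.dump(post, f)
        count += 1
    # Follow the pagination link until no more results are available.
    if response.get('moreResultsAvailable', 0) <= 0:
        break
    url = 'http://webhose.io' + response['next']  # 'next' already contains token and query
    params = {}
\end{verbatim}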
Among other fields, a JSON file contains the information shown in the following example:\\
"title": "EU antitrust ruling on Microsoft buy of GitHub due by October 19",
"text": "BRUSSELS (Reuters)-EU antitrust regulators will decide by Oct. 19 whether to clear U.S. software giant Microsoft's \$7.5 billion dollar acquisition of privately held coding website GitHub. Microsoft, which wants to acquire the firm to reinforce its cloud computing business against rival Amazon, requested European Union approval for the deal last Friday, a filing on the European Commission website showed on Monday. The EU competition enforcer can either give the green light with or without demanding concessions, or it can open a full-scale investigation if it has serious concerns. GitHub, the world's largest code host with more than 28 million developers using its platform, is Microsoft's largest takeover since the company bought LinkedIn for \$26 billion in 2016. Microsoft Chief Executive Satya Nadella has tried to assuage users' worries that GitHub might favor Microsoft products over competitors after the deal, saying GitHub would continue to be an open platform that works with all the public clouds. Reporting by Foo Yun Chee; Editing by Edmund Blair",