changes for data exploration

This commit is contained in:
parent b6e48feb16
commit 6d15207da9

BagOfWords.py (157 lines changed)
@@ -11,7 +11,8 @@ classification). The multinomial distribution normally requires integer
feature counts. However, in practice, fractional counts such as tf-idf may
also work. => considered by 'relative_word_frequencies' as parameter.
'''

from collections import OrderedDict
import csv
import re

import pandas as pd

@@ -25,12 +26,12 @@ class BagOfWords:
        vocab = BagOfWords.make_vocab(X)
        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)

    def extract_words(text):
    def extract_words(text, stemming=True):
        '''takes article as argument, removes numbers,
        returns list of single words, recurrences included.
        '''
        stemmer = PorterStemmer()
        stop_words = BagOfWords.set_stop_words()
        stop_words = BagOfWords.set_stop_words(stemming)
        # replace punctuation marks with spaces
        words = re.sub(r'\W', ' ', text)
        # split str into list of single words

@@ -38,31 +39,33 @@ class BagOfWords:
        # list of all words to return
        words_cleaned = []
        for word in words:
            # check if alphabetic char
            if word.isalpha():
                # reduce word in lower case to stem
                word = stemmer.stem(word.lower())
                # check if not stop word
                if word not in stop_words:
                    words_cleaned.append(word)
            word = word.lower()
            # check if alphabetic and not stop word
            if (word.isalpha() and word not in stop_words):
                if stemming:
                    # reduce word to its stem
                    word = stemmer.stem(word)
                words_cleaned.append(word)
        return words_cleaned

    def make_matrix(series, vocab, relative_word_frequencies=True):
        '''calculates word stem frequencies in input articles. returns matrix
        (DataFrame) with relative word frequencies (0 <= values < 1) if
        relative_word_frequencies=True or absolute word frequencies (int) if
        relative_word_frequencies=False.(rows: different articles, colums:
        different words in vocab)
    def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
        '''calculates word stem frequencies in input articles. returns
        document term matrix(DataFrame) with relative word frequencies
        (0 <= values < 1) if relative_word_frequencies=True or absolute
        word frequencies (int) if relative_word_frequencies=False.
        (rows: different articles, colums: different words in vocab)
        returns matrix as DataFrame
        '''
        print('# BOW: calculating matrix')
        print('# ...')
        print('# BOW: calculating matrix...')
        print()
        # create list of tuples
        vectors = []
        # for every text in series
        for i in range(len(series)):
            # extract text of single article
            text = series.iloc[i]
            # extract its words
            words = BagOfWords.extract_words(text)
            words = BagOfWords.extract_words(text, stemming)
            # count words in single article
            word_count = len(words)
            vector = []

@@ -80,27 +83,24 @@ class BagOfWords:
            vectors.append(tuple(vector))
        df_vectors = pd.DataFrame.from_records(vectors,
                                               index=None,
                                               #header=vocab,
                                               columns=vocab)
        return df_vectors

    def make_vocab(series):
    def make_vocab(series, stemming=True):
        '''adds words of input articles to a global vocabulary.
        input: dataframe of all articles, return value: list of words
        '''
        print('# BOW: making vocabulary of data set')
        print('# ...')
        print('# BOW: making vocabulary of data set...')
        print()
        vocab = set()
        # for every article's text
        for text in series:
            # add single article's text to total vocabulary
            vocab |= set(BagOfWords.extract_words(text))
            # # transform to list
            # vocab = list(vocab)
            # # sort list
            # vocab.sort()
            vocab |= set(BagOfWords.extract_words(text, stemming))
        return vocab

    def set_stop_words():
    def set_stop_words(stemming=True):
        '''creates list of all words that will be ignored
        '''
        # stopwords

@@ -135,41 +135,76 @@ class BagOfWords:
                      'yourselves']

        #add unwanted terms
        stop_words.extend(['reuters', 'bloomberg', 'cnn', 'economist'])
        stop_words.extend(['reuters', 'bloomberg', 'cnn', 'n', 'l', 'â',
                           'file', 'photo', 'min', 'read', 'staff', 'left',
                           'right', 'updated', 'minutes', 'brief', 'editing',
                           'reporting', 'ago', 'also', 'would', 'could',
                           'bit', 'ly', 'fy', 'economist', 'u', 'guardian'])

        # #remove the word 'not' from stop words?
        # stop_words.remove('not')
        stop_words.extend(['monday', 'tuesday', 'wednesday', 'thursday', 'friday',
                           'saturday', 'sunday'])

        stemmer = PorterStemmer()
        for i in range(len(stop_words)):
            # reduce stop words to stem
            stop_words[i] = stemmer.stem(stop_words[i])
        # transform list to set to eliminate duplicates
        stop_words.extend(['january', 'february', 'march', 'april', 'may',
                           'june', 'july', 'august', 'september', 'october',
                           'november', 'december', 'jan', 'feb', 'mar', 'apr',
                           'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov',
                           'dec'])

        if stemming:
            stemmer = PorterStemmer()
            for i in range(len(stop_words)):
                # reduce stop words to stem
                stop_words[i] = stemmer.stem(stop_words[i])
        # transform list to set to eliminate duplicates
        return set(stop_words)

if __name__ == '__main__':
    test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
        EU approval - sources. BRUSSELS (Reuters) - U.S. software
        giant Microsoft (MSFT.O) is set to win unconditional EU
        antitrust approval for its $7.5 billion purchase of
        privately held coding website GitHub, two people familiar
        with the matter said on Monday. Microsoft announced the
        deal in June, its largest acquisition since it bought
        LinkedIn for $26 billion in 2016. The GitHub deal is
        expected to boost the U.S. software giant’s cloud
        computing business and challenge market leader Amazon
        (AMZN.O). GitHub, the world’s largest code host, has
        more than 28 million developers using its platform. It
        will become a part of Microsoft’s Intelligent Cloud unit
        once the acquisition is completed. Microsoft Chief
        Executive Satya Nadella has tried to assuage users’
        worries that GitHub might favor Microsoft products
        over competitors after the deal, saying GitHub would
        continue to be an open platform that works with all
        public clouds. The European Commission, which is set to
        decide on the deal by Oct. 19, did not respond to a
        request for immediate comment. Microsoft declined to
        comment. Reporting by Foo Yun Chee; editing by Jason
        Neely'''
    def make_dict_common_words(texts, rel_freq=False, stemming=True, n=200):
        '''texts: df of article texts of complete data set as series,
        return dict of words with their count.
        '''
        # word => count
        dict = {}
        vocab = BagOfWords.make_vocab(texts, stemming)
        # calculate document term matrix
        df_matrix = BagOfWords.make_matrix(texts, vocab, rel_freq, stemming)
        print(df_matrix.shape)
        # iterate over words
        for column in df_matrix:
            # count word mentions in total
            dict[column] = df_matrix[column].sum()
        # sort dict by value and
        o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
                             reverse=True))
        # return n higest values as dict (word => count)
        n_dict = {}
        for i in range(n):
            n_dict[o_dict.popitem(last=False)[0]] = o_dict.popitem(last=False)[1]
        return n_dict

    print(BagOfWords.extract_words(test_article))
    def count_features(texts, stemming=True):
        vocab = BagOfWords.make_vocab(texts, True)
        return len(vocab)

if __name__ == '__main__':

    # load new data set
    file = 'data\\interactive_labeling_dataset_without_header.csv'
    df_dataset = pd.read_csv(file,
                             delimiter='|',
                             header=None,
                             index_col=None,
                             engine='python',
                             usecols=[1,2],
                             #nrows=10,
                             quoting=csv.QUOTE_NONNUMERIC,
                             quotechar='\'')

    # find most common words in dataset
    corpus = df_dataset[1] + '. ' + df_dataset[2]
    # stemming = False
    # vocab = BagOfWords.make_vocab(corpus, stemming)
    # print(vocab)
    # print()
    # print(BagOfWords.make_matrix(corpus, vocab, False, stemming))
    # print(BagOfWords.make_dict_common_words(corpus, False, stemming, 200))
    print(BagOfWords.count_features(corpus))
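A brief usage sketch of the updated BagOfWords interface (illustrative only, not part of the commit; the two sample texts are made up):

import pandas as pd
from BagOfWords import BagOfWords

# two made-up article texts standing in for the real data set
corpus = pd.Series(["Microsoft buys GitHub for 7.5 billion dollars.",
                    "EU regulators approve the GitHub acquisition."])

# tokens of the first text, with and without stemming
print(BagOfWords.extract_words(corpus.iloc[0], stemming=True))
print(BagOfWords.extract_words(corpus.iloc[0], stemming=False))

# vocabulary size of the toy corpus (count_features builds the vocabulary internally)
print(BagOfWords.count_features(corpus))

# document term matrix with absolute word counts, no stemming
vocab = BagOfWords.make_vocab(corpus, stemming=False)
df = BagOfWords.make_matrix(corpus, vocab, relative_word_frequencies=False, stemming=False)
print(df.shape)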
@@ -132,7 +132,7 @@ class FileHandler:
                # check if text parsed correctly
                ('Further company coverage:' in dict['text']) or
                (('subscription' or 'subscribe') in dict['text']) or
                (len(dict['text']) < 300)):
                (len(dict['text']) < 200)):
                continue
            else:
                try:
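One aside on the condition above: in Python the expression ('subscription' or 'subscribe') evaluates to 'subscription', so only that single word is actually tested. A hedged sketch of an explicit version of the filter (not part of the commit):

def looks_like_spam(text, min_length=200):
    '''Return True if an article text should be skipped (sketch only).'''
    if 'Further company coverage:' in text:
        return True
    if any(word in text for word in ('subscription', 'subscribe')):
        return True
    return len(text) < min_length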
NER.py (82 lines changed)
@@ -5,10 +5,13 @@ Named Entity Recognition (NER)
Stanford NER takes a text as input and returns a list of entities
like persons, organizations and countries, e.g.
'''

# toDo: complete list legal entity types
# 'Amazon' not recognized as organization

import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.tag import StanfordNERTagger

@@ -16,12 +19,10 @@ from nltk.tokenize import word_tokenize

class NER:

    # toDo: complete lists:
    # some legal entity types
    company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP', 'Plc', 'LLC',
                       'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB']

    # some entities that are not companies
    # some entities and misc that are not companies
    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 'Cnn',
            'European Commission', 'EU', 'Staff', 'Min', 'Read',
            'Thomson Reuters Trust Principles', 'New York Stock Exchange',

@@ -56,25 +57,6 @@ class NER:
                current_chunks = []
        return continuous_chunks

    def plot_histogram(count_names):
        '''pyplot diagram of company names distribution
        (probability density function)
        x-axis: number of mentions of the company
        y-axis: frequency
        '''
        # sort list in descending order
        count_names.sort(reverse=True)
        # convert list to array
        names = np.asarray(count_names)
        plt.title('Company mentions in News Articles')
        plt.xlabel('Count of articles that mention a company')
        # Number of companies with this number of mentions
        plt.ylabel('Number of companies with this number of articles')
        num_bins = 50
        n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
        # plt.grid(True)
        plt.show()

    def find_companies(text):
        '''param: article text where organizations must be indentified
        returns: list of identified organisations as strings

@@ -102,6 +84,8 @@ class NER:
        '''param: list of all article texts
        returns: list of company counts as ints
        '''
        print('# counting company names...')
        print()
        # dictionary of companies with their count
        dict_com = {}
        for text in texts:

@@ -117,29 +101,29 @@ class NER:
        return list(dict_com.values())

if __name__ == '__main__':

    filepath = 'data\\classification_labelled_corrected.csv'
    df = pd.read_csv(filepath,
                     sep='|',
                     engine='python',
                     decimal='.',
                     quotechar='\'',
                     quoting=csv.QUOTE_NONE)

    # only articles with label==1
    df_hits = df[df['Label'] == 1]

    texts = df_hits['Title'] + ' ' + df_hits['Text']

    # # read to check
    # for text in texts[10:20]:
    #     print(text)
    #     print()
    #     print(NER.find_companies(text))
    #     print()

    # count names in hit articles
    count_names = NER.count_companies(texts)

    # plot diagram
    NER.plot_histogram(count_names)
    print('# starting NER...')
    print()
    test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
        EU approval - sources. BRUSSELS (Reuters) - U.S. software
        giant Microsoft (MSFT.O) is set to win unconditional EU
        antitrust approval for its $7.5 billion purchase of
        privately held coding website GitHub, two people familiar
        with the matter said on Monday. Microsoft announced the
        deal in June, its largest acquisition since it bought
        LinkedIn for $26 billion in 2016. The GitHub deal is
        expected to boost the U.S. software giant’s cloud
        computing business and challenge market leader Amazon
        (AMZN.O). GitHub, the world’s largest code host, has
        more than 28 million developers using its platform. It
        will become a part of Microsoft’s Intelligent Cloud unit
        once the acquisition is completed. Microsoft Chief
        Executive Satya Nadella has tried to assuage users’
        worries that GitHub might favor Microsoft products
        over competitors after the deal, saying GitHub would
        continue to be an open platform that works with all
        public clouds. The European Commission, which is set to
        decide on the deal by Oct. 19, did not respond to a
        request for immediate comment. Microsoft declined to
        comment. Reporting by Foo Yun Chee; editing by Jason
        Neely'''
    print(NER.find_companies(test_article))
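For reference, a minimal sketch of the Stanford NER tagging step that find_companies builds on (not part of the commit; the model and jar paths are assumptions and must point to a local Stanford NER download):

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

MODEL = 'stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz'  # assumed path
JAR = 'stanford-ner/stanford-ner.jar'                                     # assumed path

def tag_organizations(text):
    '''returns all tokens labelled ORGANIZATION by Stanford NER (sketch only)'''
    tagger = StanfordNERTagger(MODEL, JAR, encoding='utf8')
    tagged = tagger.tag(word_tokenize(text))
    return [token for token, label in tagged if label == 'ORGANIZATION']

print(tag_organizations('Microsoft is set to buy GitHub, two people said on Monday.'))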
@@ -0,0 +1,220 @@
"""
VisualizeNews
=============

Generating a square wordcloud with most common words of input data set.
"""
from BagOfWords import BagOfWords
from NER import NER

import csv
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

class VisualizerNews:

    def plot_wordcloud_dataset():
        '''plots word cloud image of most common words in dataset.
        '''
        print('# preparing word cloud...')
        print()
        # load new data set
        file = 'data\\interactive_labeling_dataset_without_header.csv'
        df_dataset = pd.read_csv(file,
                                 delimiter='|',
                                 header=None,
                                 index_col=None,
                                 engine='python',
                                 usecols=[1,2],
                                 #nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')

        corpus = df_dataset[1] + ' ' + df_dataset[2]

        # find most common words in dataset
        dict = BagOfWords.make_dict_common_words(corpus,
                                                 rel_freq=True,
                                                 stemming=False,
                                                 n=200)

        wordcloud = WordCloud(width=2400, height=1200, scale=2,
                              # true if bigram:
                              collocations=False).generate_from_frequencies(dict)

        # display generated image
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()

    def plot_histogram_companies():
        '''plots diagram of company names distribution
        count_names: list of company counts(int)
        x-axis: number of mentions of the company
        y-axis: frequency
        '''
        print('# preparing histogram...')
        print()
        # old data set
        filepath = 'data\\classification_labelled_corrected.csv'
        df = pd.read_csv(filepath,
                         sep='|',
                         engine='python',
                         decimal='.',
                         quotechar='\'',
                         quoting=csv.QUOTE_NONE)

        # only articles with label==1
        df_hits = df[df['Label'] == 1]

        texts = df_hits['Title'] + ' ' + df_hits['Text']

        # # read to check
        # for text in texts[10:20]:
        #     print(text)
        #     print()
        #     print(NER.find_companies(text))
        #     print()

        # count names in hit articles
        count_names = NER.count_companies(texts)

        # sort list in descending order
        count_names.sort(reverse=True)
        # convert list to array
        names = np.asarray(count_names)
        #plt.title('Company mentions in News Articles')
        plt.xlabel('Count of articles that mention a company')
        # Number of companies with this number of mentions
        plt.ylabel('Number of companies with this number of articles')
        num_bins = 50
        n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
        # plt.grid(True)
        plt.show()

    def plot_histogram_text_lengths():
        '''plot histogram of article length
        x-axis: number of characters in article (without headline)
        y-axis: frequency
        '''
        print('# preparing histogram...')
        print()
        # new data set
        filepath = 'data\\interactive_labeling_dataset.csv'
        df_dataset = pd.read_csv(filepath,
                                 delimiter='|',
                                 header=0,
                                 index_col=None,
                                 engine='python',
                                 #nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')
        # consider only Text, not Headline
        texts = df_dataset['Text']

        # count characters in articles
        print('# counting characters in articles...')
        print()
        count_chars = []
        for text in texts:
            count_chars.append(len(text))
        # average of number of characters
        av = int(sum(count_chars) / len(count_chars))
        print('# average length of news articles is: {} characters'.format(av))
        print()
        # sort list in descending order
        count_chars.sort(reverse=True)
        # convert list to array
        names = np.asarray(count_chars)
        # plt.title('Length of News Articles')
        plt.xlabel('Number of Characters in an Article')
        plt.ylabel('Frequency')
        # number of vertical bins
        num_bins = 200
        n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
        # [xmin, xmax, ymin, ymax] of axis
        plt.axis([300, 10000, 0, 500])
        plt.show()

    def plot_pie_chart_of_sites():

        print('# preparing pie chart...')
        print()

        # load data set
        filepath = 'data\\interactive_labeling_dataset.csv'
        df_dataset = pd.read_csv(filepath,
                                 delimiter='|',
                                 header=0,
                                 #usecols=[3], #column 'Site'
                                 index_col=None,
                                 engine='python',
                                 #nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')

        df_counts = df_dataset.groupby('Site').count()
        df_counts = df_counts.sort_values(['Url'], ascending=False)

        fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))

        data = list(df_counts['Url'])
        labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
                  'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']

        wedges, texts, autotexts = ax.pie(data, autopct='%1.0f%%', pctdistance=2.0,
                                          startangle=90, textprops=dict(color="w"))

        ax.legend(wedges, labels,
                  #title="News Article Sources",
                  loc="center left",
                  bbox_to_anchor=(1, 0, 0.5, 1),
                  prop={'size': 10},
                  fontsize=10)

        plt.setp(autotexts, size=8, weight="bold")
        plt.show()

    def plot_hist_most_common_words(n_commons = 10):
        print('# preparing histogram...')
        print()
        # load data set
        filepath = 'data\\interactive_labeling_dataset_without_header.csv'
        df_dataset = pd.read_csv(filepath,
                                 delimiter='|',
                                 header=None,
                                 #usecols=[1,2],
                                 index_col=None,
                                 engine='python',
                                 #nrows=100,
                                 quoting=csv.QUOTE_NONNUMERIC,
                                 quotechar='\'')

        corpus = df_dataset[1] + '. ' + df_dataset[2]

        # find most common words in dataset
        dict = BagOfWords.make_dict_common_words(corpus,
                                                 rel_freq=False,
                                                 stemming=False,
                                                 n=n_commons)

        plt.xlabel('Most Common Words in News Articles')
        plt.ylabel('Frequency')

        labels = list(dict.keys())
        numbers = list(dict.values())
        nbars = n_commons
        plt.bar(np.arange(nbars), height=numbers, tick_label=labels)
        plt.show()

if __name__ == '__main__':
    # VisualizerNews.plot_histogram_companies()
    # VisualizerNews.plot_wordcloud_dataset()
    # VisualizerNews.plot_histogram_text_lengths()
    VisualizerNews.plot_pie_chart_of_sites()
    # VisualizerNews.plot_hist_most_common_words()
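A small follow-up sketch for plot_pie_chart_of_sites (not part of the commit): the hard-coded percentage labels could also be derived from the grouped counts, so the legend stays correct if the data set changes.

def pie_labels(df_counts):
    '''df_counts: result of df_dataset.groupby('Site').count(), sorted
    descending by the 'Url' column as in plot_pie_chart_of_sites().
    returns labels such as 'reuters.com (94%)' (sketch only).
    '''
    counts = df_counts['Url']
    total = counts.sum()
    labels = []
    for site, count in counts.items():
        share = 100.0 * count / total
        labels.append('{} ({}%)'.format(site, '<1' if share < 1 else int(round(share))))
    return labels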
@ -10000,8 +10000,3 @@
'8de6b38540b5c03c427f1eb51edd138d6375ebf2'|'Iraq has not reached agreement with Exxon on southern oilfields -oil minister'|'December 25, 2017 / 11:13 AM / in 3 hours Iraq has not reached agreement with Exxon on southern oilfields -oil minister Ahmed Rasheed 1 Min Read BAGHDAD, Dec 25 (Reuters) - Iraq has not yet reached an agreement with Exxon Mobil on a multibillion-dollar project to boost output from several southern oilfields, Oil Minister Jabar al-Luaibi said on Monday. If no agreement is reached by February, Luaibi told journalists at a signing ceremony for a separate deal, the project would be offered to other companies. Luaibi had said in October that Iraq was in final talks with Exxon Mobil on developing the project, which consists of building oil pipelines, storage facilities and a seawater supply project to inject water from the Gulf into reservoirs to improve production. (Reporting by Ahmed Rasheed; Writing by Ahmed Aboulenein; Editing by Hugh Lawson)'|'reuters.com'|'http://feeds.reuters.com/reuters/companyNews'|'https://www.reuters.com/article/iraq-oil-exxon-mobil/iraq-has-not-reached-agreement-with-exxon-on-southern-oilfields-oil-minister-idUSL8N1OP0DK'|'2017-12-25T13:11:00.000+02:00'
'f38935c3d50c936a0ff012d16087d46e24bba1b8'|'Activist investor Primestone urges Tennant to merge with Nilfisk'|'December 13, 2017 / 4:40 PM / Updated 11 minutes ago Activist investor Primestone urges Tennant to merge with Nilfisk Reuters Staff 2 Min Read COPENHAGEN/LONDON (Reuters) - Activist hedge fund Primestone Capital on Wednesday urged U.S. cleaning equipment company Tennant and Danish peer Nilfisk to consider merging after building up minority stakes in both firms. “Primestone Capital owns more than 5 percent of both Tennant and Nilfisk and believes a combination of the two will generate extraordinary returns for shareholders,” the London-based activist firm said in a filing with the U.S. Securities and Exchange Commission. It disclosed it had taken a 5.2 percent stake in Tenant. Nilfisk, which was spun off from Danish cable maker NKT in October, and Tennant were not immediately available to comment. Shares in Nilfisk rose 4.9 percent in Copenhagen after Primestone’s statement, while Tennant traded up 0.8 percent in New York at 1532 GMT. Primestone said it believed a merger would create earnings per share (EPS) accretion in excess of 85 percent for both companies. It is following a trend, as 91 European companies have been subject to activist campaigns this year as of the end of November, according to industry tracker Activist Insight. Primestone Capital, which does not disclose its assets under management, was set up in December 2014 by former Carlyle Group senior partners Franck Falézan, Benoît Colas and Jean-Pierre Millet. Primestone owns a 5.6 percent stake in Nilfisk. Reporting by Stine Jacobsen in Copenhagen and Maiya Keidan in London, editing by Louise Heavens and Susan Fenton'|'reuters.com'|'http://feeds.reuters.com/reuters/UKBusinessNews/'|'https://uk.reuters.com/article/uk-tennant-nilfisk-holding-hedgefunds/activist-investor-primestone-urges-tennant-to-merge-with-nilfisk-idUKKBN1E72B0'|'2017-12-13T18:39:00.000+02:00'
'1ca7e1a94514268012775d32c5306adf4aab4b73'|'Australia''s Crown Resorts faces class action'|'SYDNEY (Reuters) - Australian casino operator Crown Resorts Ltd ( CWN.AX ) was slapped with a class action lawsuit on Monday for allegedly failing to inform shareholders of a marketing campaign in China that resulted in staff arrests and a slump in the share price.FILE PHOTO: The logo of Australian casino giant Crown Resorts Ltd adorns the hotel and casino complex in Melbourne, Australia, June 13, 2017. REUTERS/Jason Reed/File Photo The company part-owned by billionaire businessman James Packer said it would defend itself against the allegations contained in the suit, filed on behalf of shareholders who invested between Feb. 6, 2015 and Oct. 16, 2016.On Oct. 17, 2016, Crown shares fell almost 14 percent on the news that almost 20 employees had been arrested in China for marketing gambling trips to Macau. While gambling is legal in the southern Chinese territory, it is illegal on the mainland.“Shareholders should have been apprised of the risks that Crown was taking in China and the threat they posed to the company’s revenue streams,” Andrew Watson, national head of class actions at Maurice Blackburn, said in a statement.The arrests triggered an abrupt reversal in Crown’s offshore ambitions and an admission from Packer that his “global strategy” had failed. Australia’s biggest casino company quit Macau and Las Vegas to focus on Australia.Trading volumes at Crown resorts have slumped in the fall-out of the arrests, as profits generated from high rollers plummeted.All of the Crown employees who were detained in China have now been released.Crown said in a statement it would “vigorously defend” itself against the shareholders’ allegations.Crown shares fell as much as 1.2 percent in midday trading on Monday while the benchmark index edged higher.Reporting by Paulina Duran in Sydney. Additional reporting by Susan Mathew in Bengaluru; Editing by Stephen Coates '|'reuters.com'|'http://feeds.reuters.com/reuters/businessNews'|'https://www.reuters.com/article/us-crown-resorts-lawsuit/australias-crown-resorts-faces-class-action-idUSKBN1DY028'|'2017-12-04T02:46:00.000+02:00'
'ce77dda10f2ac692067ef3b157eddbd762fe2893'|'Irish strike threat recedes after Ryanair puts union pledge in writing - union'|'December 21, 2017 / 1:00 PM / Updated an hour ago Irish strike threat recedes after Ryanair puts union pledge in writing - union Reuters Staff 1 Min Read DUBLIN (Reuters) - The Irish union representing Ryanair pilots on Thursday said the threat of industrial action had receded after the company confirmed in writing that it would recognise the union for collective bargaining purposes. FILE PHOTO: A pilot disembarks a Ryanair flight at Stansted airport in London, Britain September 27, 2017. REUTERS/Clodagh Kilcoyne - RC13831BF000/File Photo The IMPACT union called off planned strike action after Ryanair last week announced that it would recognise trade unions for the first time in its 32 year history, but it said it would reverse the decision unless written confirmation was supplied. “IMPACT said the company’s confirmation that it recognised the union... and would conclude a comprehensive agreement, meant the danger of industrial action had receded for the present,” the union said in a statement. Reporting by Conor Humphries; editing by Jason Neely'|'reuters.com'|'http://feeds.reuters.com/Reuters/UKBusinessNews?format=xml'|'https://uk.reuters.com/article/uk-ryanair-pilots-ireland/irish-strike-threat-recedes-after-ryanair-puts-union-pledge-in-writing-union-idUKKBN1EF1MP'|'2017-12-21T14:59:00.000+02:00'
'fa656e260c133d46a74fefe91c13b2d318052a18'|'Lufthansa scraps deal for Air Berlin''s Niki'|'December 13, 2017 / 4:13 PM / a few seconds ago Lufthansa scraps deal for Air Berlin''s Niki Victoria Bryan 4 Min Read BERLIN (Reuters) - Lufthansa ( LHAG.DE ) has abandoned plans to buy Air Berlin ( AB1.DE ) subsidiary Niki after being told by the European Commission that it would not allow the deal, meaning Niki could join the list of Europe’s collapsed airlines this year. A placard is seen in the cockpit of the plane of the AB6210, the last flight, operated by insolvent carrier Air Berlin before departing Munich''s international airport, southern Germany, October 27, 2017. Sign reads "Air Berlin says goodbye". REUTERS/Michael Dalder Tough competition and falling ticket prices have led to the demise of Monarch and Air Berlin while Alitalia has filed for insolvency protection. Lufthansa’s planned takeover of Air Berlin businesses Niki and LGW had raised concerns among rivals that Lufthansa would become too dominant in Germany, with Ryanair boss Michael O‘Leary describing the deal as a “stitch-up”. Lufthansa said on Wednesday it had offered to give up take-off and landing slots in order to get the deal approved, but that the European Commission considered that to be insufficient. The withdrawal leaves Air Berlin’s administrators scrambling to find a new buyer for Austria-based Niki, which had not filed for insolvency itself and was still flying with the help of funding from Lufthansa. The administrators said on Tuesday British Airways’ parent IAG ( ICAG.L ) was no longer interested and talks with Thomas Cook ( TCG.L ) had not yet resulted in a viable deal. Thomas Cook declined to comment on Wednesday. The German government, which stands to lose out on a loan given to Air Berlin, said it expected Niki, founded by former Formula 1 driver Niki Lauda, to file for insolvency protection and be grounded. Lufthansa, and Air Berlin’s administrators had also previously warned that would happen if the deal was not approved. GROWTH Lufthansa will likely still be able to expand its market position even without the Niki deal. It has previously said it planned to grow the Eurowings’ fleet to about 210 aircraft from 160 as a result of the Air Berlin insolvency. Lufthansa said on Wednesday it still intended to pursue growth plans for its Eurowings budget subsidiary and would apply for any Niki slots that become free in the event of an insolvency. It had been providing funding to keep Niki in the air until the deal was approved and said that money would now be used to grow on its own in Niki’s markets. “(From a financial point of view), this leads to a comparable result for the Lufthansa Group,” it said. Lufthansa said it still planned to buy Air Berlin subsidiary LGW and would submit a revised proposal, including foregoing slots, to the Commission on Wednesday. The purchase price for LGW on its own is around 18 million euros ($22 million), Air Berlin said, compared to a previous price of 210 million euros for the deal including Niki. The German government had been expecting to use the sale proceeds as repayment for a 150 million euros bridge loan it awarded to Air Berlin to keep it flying after it filed for insolvency protection. “We deeply regret the Commission decision,” it said in a statement. It added that it expected only part of the loan could now be repaid and it would take steps to minimize losses for taxpayers. 
easyJet ( EZJ.L ) is paying 40 million euros for Air Berlin’s operations at Berlin Tegel, a deal which has received Commission approval. That would leave the German government nearly 100 million euros short. ($1 = 0.8499 euros) Reporting by Victoria Bryan, Klaus Lauer; Additional reporting by Alistair Smout; Editing by Tom Sims and Elaine Hardcastle'|'reuters.com'|'http://feeds.reuters.com/Reuters/UKBusinessNews?format=xml'|'https://uk.reuters.com/article/us-air-berlin-m-a-lufthansa/lufthansa-scraps-deal-for-air-berlins-niki-idUKKBN1E7285'|'2017-12-13T18:03:00.000+02:00'
'bbc8ffc5e25727be8f283da0448f55c7b7e2410b'|'Northrop gets second request from U.S. antitrust officials on Orbital deal'|'(Reuters) - U.S. defense contractor Northrop Grumman Corp ( NOC.N ) said on Wednesday it received a second request from U.S. antitrust regulators for more information about its deal to buy Orbital ATK Inc ( OA.N ).FILE PHOTO - A UAV helicopter build by Northrop Gruman is on deck aboard the soon to be commissioned littoral combat ship USS Coronado during a media tour in Coronado, California April 3, 2014. REUTERS/Mike Blake/File Photo A second request from the Federal Trade Commission is often a burden for companies that have to provide extensive information that can drain time and resources to collect. Such requests have in the past led to concessions so a merger gains government approval.The $7.8 billion merger with Orbital, a missile and rocket maker, is still expected to close in the first half of 2018, Northrop said in a statement.Northrop’s deal to buy Orbital will give it greater access to lucrative government contracts and expand its arsenal of missile defense systems and space rockets.The all-cash transaction is the biggest in the defense sector in two years and comes as North Korea’s missile and nuclear weapons threats grow, heightening tensions with the United States and its allies.Reporting by Chris SandersEditing by Sandra Maler '|'reuters.com'|'https://in.reuters.com/finance/deals'|'https://in.reuters.com/article/us-orbital-atk-m-a-northrop-grumman/northrop-gets-second-request-from-u-s-antitrust-officials-on-orbital-deal-idINKBN1E03BL'|'2017-12-06T20:51:00.000+02:00'
'936cb8b49322119f1bdd80fdbae3f22ef6f3380a'|'Delivery Hero sees proceeds of $429 million from capital increase'|'December 6, 2017 / 7:54 AM / Updated 24 minutes ago Delivery Hero sees proceeds of $429 million from capital increase Reuters Staff 1 Min Read FRANKFURT (Reuters) - Delivery Hero, the world’s largest online takeaway food delivery group, said it expected gross proceeds of 362.25 million euros ($428.6 million) from the placement of new shares with investors, as it seeks to raise funds to expand through acquisitions. FILE PHOTO: The Delivery Hero headquarters is pictured in Berlin, Germany, June 2, 2017. REUTERS/Fabrizio Bensch/File Photo The group said in a statement on Wednesday it would issue 10.5 million new shares, to be placed at 34.50 euros apiece, an 8 percent discount to Tuesday’s closing price. In addition, Delivery Hero shareholders have placed 2 million existing shares with institutional investors in an accelerated bookbuilding, bringing the total placement to 12.5 million shares. The group had said on Tuesday its shareholders planned to place up to 7.8 million existing shares, or around 4.5 percent of the share capital. Reporting by Maria Sheahan; Editing by Ludwig Burger'|'reuters.com'|'http://feeds.reuters.com/Reuters/UKBusinessNews?format=xml'|'https://uk.reuters.com/article/uk-delivery-hero-shareissue/delivery-hero-sees-proceeds-of-429-million-from-capital-increase-idUKKBN1E00R4'|'2017-12-06T09:54:00.000+02:00'
'b95be642e6b27ff3e8011c39a1c3e25803991e8e'|'Exclusive: Ford set for China tie-up with Alibaba to test online, direct auto sales: source'|'BEIJING (Reuters) - Ford Motor Co. is expected to sign as early as Thursday a deal with Alibaba Group Holding Ltd which may allow the U.S. automaker to test selling cars to consumers in China through Alibaba’s online retail arm Tmall, as well as via a new “auto vending machine” store concept, according to a Ford source familiar with the matter.FILE PHOTO: Visitors look at Ford models at Auto Guangzhou in Guangzhou, China November 17, 2017. REUTERS/Bobby Yip/File Photo Representatives of Ford and Alibaba, including Ford Executive Chairman Bill Ford Jr. and Ford CEO Jim Hackett, are expected to be in Hangzhou on Thursday to sign a letter of intent that outlines the scope of the new partnership.According to the source, who did not want to be named because he is not authorized to speak with reporters, the deal is intended to position the Dearborn, Michigan, automaker for an emerging Chinese marketplace where more cars could be sold online.The partnership would be part of Ford’s effort to overhaul its China strategy to revive the growth momentum it has lost in recent months.Ford’s global chief spokesman Mark Truby said the company is expected to make an announcement on Thursday in Hangzhou, where Alibaba is based, but declined to comment in advance.Alibaba spokeswoman Crystal Liu declined to comment.The source said the proposal could mean that cars purchased online are delivered to buyers by franchised Ford retail stores and would be maintained and repaired by them.But Ford could also use Tmall’s new retail concept called the “Automotive Vending Machine” — a multi-storey parking garage that partly resembles a giant vending machine — to sell directly to consumers, the source said. Those cars could come directly from Ford or from its dealers but the details are still to be worked out, the source added.According to Alibaba, consumers can use their phones to browse through the cars garaged in the store and choose to either immediately buy one or test drive it. The vehicle would be delivered to them on the ground floor.CONCERN FOR DEALERS The model allows shoppers with good credit to purchase their new ride with a 10 percent down payment and then make monthly payments for the car purchase through Alibaba’s affiliate Alipay, according to Alibaba.Ford believes dealers would likely agree to this direct retailing model because they still get to service cars sold through Tmall, the Ford source said.The move, though, could be potentially problematic for dealers, some industry experts said.“When online sales and direct sales volume was small that’s one thing. But if this format gained steam, it would definitely impact dealers,” according to Yale Zhang, head of Shanghai-based consultancy Automotive Foresight. “Retail innovation is great, but it is by its nature disruptive and can’t keep everybody happy.”The danger is that the dealers lose out not only on a lot of car sales but also the potentially lucrative auto financing aspect of their traditional business.Direct selling by auto brands is not always possible in many markets around the world. 
In the United States, for example, because of franchise auto dealer operators’ political clout, except for a small number of states, direct selling is largely not possible.The source said Ford is “behind in using big data” to monitor sales trends and effectively market its cars and the move to online sales as well as the access to Tmall’s massive database of information on consumers would help it to catch up.Online auto sales volumes are currently limited in China because car buyers want to be able to see, touch and drive cars before buying them, said Zhang. The ability to test drive a car ordered online could change that.Ford’s China sales have been sluggish in recent months in part because it has failed to catch on to rapidly changing trends in the marketplace, including the rise of entry-level cars popular in smaller and less-well-known cities, where demand is booming.Ford’s sales in the first 10 months of this year were 938,570, a decline of 5 percent from the same period in 2016, against a 2.2 percent gains to 3.13 million for hometown rival General Motors.Reporting By Norihiko Shirouzu; Editing by Martin Howell '|'reuters.com'|'http://feeds.reuters.com/reuters/INbusinessNews'|'https://in.reuters.com/article/china-autos-ford-alibaba/exclusive-ford-set-for-china-tie-up-with-alibaba-to-test-online-direct-auto-sales-source-idINKBN1E016Q'|'2017-12-06T12:26:00.000+02:00'
|
The rest of this large data file's diff was not rendered (file too large, lines too long).
Binary files not shown: five image files added (6.2 KiB, 27 KiB, 24 KiB, 592 KiB, 18 KiB).
@@ -0,0 +1,23 @@
@BOOK{BOOK:1,
AUTHOR="Lillian Pierson",
TITLE="Data Science für Dummies",
PUBLISHER="Wiley-VCH Verlag GmbH \& Co. KGaA",
YEAR=2016
}
#stanford NER:
@ARTICLE{ARTICLE:1,
AUTHOR="Jenny Rose Finkel, Trond Grenager, Christopher Manning",
TITLE="Incorporating Non-local Information into Information Extraction Systems by Gibbs Sampling. Proceedings of the 43rd Annual Meeting of the Association for Computational Linguistics",
JOURNAL="ACL",
PUBLISHER="ACL",
YEAR=2005}
# pp. 363-370. #http://nlp.stanford.edu/~manning/papers/gibbscrf3.pdf

@MISC{WEBSITE:1,
HOWPUBLISHED="\url{https://docs.webhose.io/docs/output-reference}",
AUTHOR = "Intel",
TITLE = "Webhose.io",
MONTH = "Oct",
YEAR = "1999",
NOTE = "Accessed on 2018-10-19"
}
@@ -3,6 +3,11 @@
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[pdftex]{graphicx}
%package to manage images
\usepackage{graphicx}
\graphicspath{ {./images/} }
\usepackage[rightcaption]{sidecap}
\usepackage{wrapfig}
%for lists
\usepackage{listings}
\usepackage{enumitem}

@@ -20,9 +25,9 @@
\newcommand{\jk}[1]{\todo[inline]{JK: #1}}
\renewcommand{\familydefault}{\sfdefault}

% Anne's comments
% Anne's questions
\definecolor{comments}{cmyk}{1,0,1,0}
\newcommand{\al}[1]{\todo[inline]{\color{comments}{AL: #1}}}
\newcommand{\al}[1]{\todo[inline]{\color{comments}{Anne: #1}}}

\definecolor{uhhred}{cmyk}{0,100,100,0}


@@ -38,9 +43,9 @@
{\color{uhhred}\textbf{\so{BACHELORTHESIS}}}
\vspace*{2.0cm}\\
{\LARGE \textbf{Prediction of Company Mergers\\Using Interactive Labeling\\and Machine Learning Methods}}
%or: Incremental labeling of an unknown data set using the example of classification of news articles OR
%Recognizing M\&As in News Articles\\Using Interactive Labeling\\and Machine Learning Methods
%Interactive Labeling of Unclassified Data\\Using the Example of Recognition of Company Mergers
% OR: Incremental labeling of an unknown data set using the example of classification of news articles OR
% OR: Recognizing M\&As in News Articles\\Using Interactive Labeling\\and Machine Learning Methods
% OR: Interactive Labeling of Unclassified Data\\Using the Example of Recognition of Company Mergers
\vspace*{2.0cm}\\
vorgelegt von
\vspace*{0.4cm}\\

@@ -51,7 +56,6 @@ Anne Lorenz
\noindent
MIN-Fakultät \vspace*{0.4cm} \\
Fachbereich Informatik \vspace*{0.4cm} \\
%Ggf. Professur/Institut \vspace*{0.4cm} \\
Studiengang: Software-System-Entwicklung \vspace*{0.4cm} \\
Matrikelnummer: 6434073 \vspace*{0.8cm} \\
Erstgutachter: Dr. Julian Kunkel \vspace*{0.4cm} \\

@@ -109,7 +113,7 @@ In this thesis we want to present an alternative data labeling method that allow
}

\section{State of Research}
\al{What is supposed to go in here?}
\al{What is supposed to go in here? Unfortunately I cannot really picture what this section should contain.}

\bigskip
\paragraph{Summary:}

@@ -146,13 +150,12 @@ Vergleichbar mit Spamfilterung...

\subsection{Balanced / Unbalanced Data Set}


\section{Text Analysis}
\subsection{Natural Language Processing (NLP)}
\subsection{Tokenization}
\subsection{Unigram, Bigram}
\subsection{Stemming}
\subsection{Feature Vectors}
\subsection{Feature Vectors, Document Term Matrix}
\subsubsection{Word Frequencies}
\subsection{Bag of Words (BOW)}
\subsection{Stop Words}

@@ -160,16 +163,17 @@ Vergleichbar mit Spamfilterung...

\section{Machine Learning Models}
\subsection{Naive Bayes Classifier}
\subsection{Support Vector Machines (SVM)}
\subsection{Decision Trees}
\subsection{Support Vector Machines Classifier (SVM)}
\subsection{Decision Trees Classifier}
\section{Tuning Options}
\subsection{Split Methods}
\subsubsection{Test-Train-Split}
\subsubsection{Shuffle Split}
\subsubsection{Stratified Split}
\subsubsection{(K-fold) Cross-Validation}
\subsection{Hyperparameters}
\subsection{Feature Selection}

\section{Split Methods}
\subsection{Test-Train-Split}
\subsection{Shuffle Split}
\subsection{(K-fold) Cross-Validation}

\section{Metrics}
\subsection{Accuracy, Error Rate, Sensitivity, Specifity}
Sensitivity(=true positive rate) and Specificity(=true negative rate)

@@ -195,59 +199,105 @@ In the next chapter we describe...
\textit{
In this chapter... In Section \ref{sec:overview} we give an overview of all, then in Section the data processing pipeline, blablabla...
}
\jk{What has to be done overall, which subproblems have to be addressed? Discuss alternatives, make decisions based on criteria. Your own work goes here, no related work or methods that already exist. Those are only relevant if you compare against them.}

\section{Overview}
\label{sec:overview}

\jk{What has to be done overall, which subproblems have to be addressed? Discuss alternatives, make decisions based on criteria. Your own work goes here, no related work or methods that already exist. Those are only relevant if you compare against them.}
% Data Selection > Data Labeling > Data Preprocessing > Model Selection > Recognition of Merger Partners

First, we need to collect appropriate data, then label a data set manually, then, ....\\
\vspace{1.0cm}
\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{images/Data_Processing_Pipeline_251018}
\caption{Data Processing Pipeline}
\label{fig:pipeline}
\end{figure}
\vspace{1.0cm}

As shown in Figure \ref{fig:pipeline}, we first need to select appropriate data, then label a data set manually, then, ...\\
\\
% insert the data processing pipeline as a diagram:
Data Selection > Labeling > Preprocessing > Model Selection > Recognition of Merger Partners

\section{Data Selection}
\label{sec:data_selection}

Before we can start with the data processing, we need to identify and select appropriate data. We downloaded news articles of 12 months (year 2017) from the website \url{<webhose.io>} as described in Chapter \ref{chap:implementation}, Section \ref{sec:data_download}.
As webhose.io is a secondary source and only crawls the news feeds itself, it may occur that some RSS feeds are not parsed correctly or an article is tagged with a wrong topic as \textit{site categories}. The downloaded files also contain blog entries, user comments, videos or graphical content and other spam which we have to filter out. We also do not need pages quoting Reuters etc. Besides this, we are only interested in English news articles. \\
After we have filtered out all the irrelevant data, we receive a data set of XX.XXX news articles that we store in a csv file.
\subsection{Downloading the Data}

The csv file contains the following 9 columns:
Before we can start with the data processing, we have to identify and select appropriate data. We downloaded news articles of 12 months (year 2017) from the website \textit{webhose.io}.

To retrieve our data, we make the following request\footnote{On \url{https://docs.webhose.io/docs/filters-reference} you can learn more about the possible filter settings of \textit{webhose.io.}}:\\\\
\texttt{
site:(reuters.com OR ft.com OR cnn.com OR economist.com\\
\noindent\hspace*{12mm}%
OR bloomberg.com OR theguardian.com)\\
site\_category:(financial\_news OR finance OR business)\\
\\
timeframe:january2017-december2017} \\
\\
The requested data was downloaded in September 2018 with JSON as file format. Every news article is saved in a single file; in total 1.478.508 files were downloaded (4,69 GiB).
Among others, one JSON file contains the information shown in the following example:\\

\begin{lstlisting}[breaklines=true]
{
  "thread": {
    "uuid": "a931e8221a6a55fac4badd5c6992d0a525ca3e83",
    "url": "https://www.reuters.com/article/us-github-m-a-microsoft-eu/eu-antitrust-ruling-on-microsoft-buy-of-github-due-by-october-19-idUSKCN1LX114",
    "site": "reuters.com",
    "site_section": "http://feeds.reuters.com/reuters/financialsNews",
    "section_title": "Reuters | Financial News"
    "published": "2018-09-17T20:00:00.000+03:00"
    "site_type": "news",
    "spam_score": 0.0,
  },
  "title": "EU antitrust ruling on Microsoft buy of GitHub due by October 19",
  "text": "BRUSSELS (Reuters)-EU antitrust regulators will decide by Oct. 19 whether to clear U.S. software giant Microsoft's $7.5 billion dollar acquisition of privately held coding website GitHub. Microsoft, which wants to acquire the firm to reinforce its cloud computing business against rival Amazon, requested European Union approval for the deal last Friday, a filing on the European Commission website showed on Monday. The EU competition enforcer can either give the green light with or without demanding concessions, or it can open a full-scale investigation if it has serious concerns. GitHub, the world's largest code host with more than 28 million developers using its platform, is Microsoft's largest takeover since the company bought LinkedIn for $26 billion in 2016. Microsoft Chief Executive Satya Nadella has tried to assuage users' worries that GitHub might favor Microsoft products over competitors after the deal, saying GitHub would continue to be an open platform that works with all the public clouds. Reporting by Foo Yun Chee; Editing by Edmund Blair",
  "language": "english",
  "crawled": "2018-09-18T01:52:42.035+03:00"
}
\end{lstlisting}

As \textit{webhose.io} is a secondary source for news articles and only crawls the news feeds itself, it may occur that some RSS feeds are not parsed correctly or an article is tagged with a wrong topic as \textit{site categories}. The downloaded files also contain blog entries, user comments, videos or graphical content and other spam which we have to filter out. We also do not need pages quoting Reuters etc. Besides this, we are only interested in English news articles.\\


After we have filtered out all the irrelevant data, we receive a data set of \textbf{41.790} news articles that we store in multiple csv files\footnote{All csv files have a total size of 109 MB.}, one for each month.

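A minimal sketch of this filtering step is shown below (illustrative only; the folder layout and the length threshold are assumptions, the JSON fields are the ones shown in the example above):

\begin{lstlisting}[language=Python, breaklines=true]
# Illustrative sketch of the filtering step, not the exact implementation.
import glob
import json

def keep(article):
    '''True if a downloaded JSON article looks like a usable English news item.'''
    thread = article['thread']
    return (article.get('language') == 'english'
            and thread.get('site_type') == 'news'
            and thread.get('spam_score', 1.0) == 0.0
            and len(article.get('text', '')) >= 200)

selected = []
for path in glob.glob('download/2017_*/*.json'):  # assumed folder layout
    with open(path, 'r', encoding='utf-8') as f:
        article = json.load(f)
    if keep(article):
        selected.append((article['title'], article['text']))
\end{lstlisting}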
\subsection{Selecting the Working Data Set}
\label{subsec:data_selection}
We have received a different number of articles from each month. Because we want the items for our initial working data set to be fairly distributed throughout the year, we select 833 articles from each month\footnote{We select 834 from every third month: (8 * 833) + (4 * 834) = 10.000.} to create a csv file containing \textbf{10.000} articles with a total size of 27 MB.

The csv file has the following 7 columns:
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|c|c|c|}
\begin{tabular}{|c|c|c|c|c|c|c|}
\hline
SectionTitle & Title & SiteSection & Text & Uuid & Timestamp & Site & SiteFull & Url \\
Uuid & Title & Text & Site & SiteSection & Url & Timestamp\\
\hline
\end{tabular}
\end{center}
The individual columns contain:

\begin{itemize}

\item \textbf{SectionTitle:} The name of the news feed section, e.g. \textit{'Reuters | Financial News'}.

\item \textbf{Title:} The news article's headline, e.g. \textit{'EU antitrust ruling on Microsoft buy of GitHub due by October 19'}

\item \textbf{SiteSection:} The link to the section of the site where the thread was created, e.g. \textit{'http://feeds.reuters.com/reuters/financialsNews'}

\item \textbf{Text:} The article's plain text.

\item \textbf{Uuid:} Universally unique identifier, representing the article's thread.

\item \textbf{Timestamp:} The thread's publishing date/time in the format YYYY-MM-DDThh:mmGMT+3. E.g. \textit{2018-09-17T20:00:00.000+03:00'}
\item \textbf{Title:} The news article's headline.

\item \textbf{Site:} The top level domain of the article's site, e.g. \textit{'reuters.com'}
\item \textbf{Text:} The article's plain text.

\item \textbf{SiteFull:} The complete domain of the article's site, e.g. \textit{'reuters.com'}
\item \textbf{Site:} The top level domain of the article's site.

\item \textbf{Url:} The link to the top of the article's thread, e.g. \textit{'https://www.reuters.com/article/us-github-m-a-microsoft-eu/eu-antitrust-ruling-on-microsoft-buy-of-github-due-by-october-19-idUSKCN1LX114'}
\item \textbf{SiteSection:} The link to the section of the site where the thread was created.

\item \textbf{Url:} The link to the top of the article's thread.

\item \textbf{Timestamp:} The thread's publishing date and time in the format YYYY-MM-DDThh:mm (GMT+3).

\end{itemize}
The columns \textbf{Title} and \textbf{Text} contain our main data, whereas the rest of the attributes is the meta data.

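The resulting file can be loaded, for example, with pandas, using the same pipe delimiter and quoting that the scripts in this commit use (a minimal sketch; the file name is the one used elsewhere in this commit):

\begin{lstlisting}[language=Python, breaklines=true]
# Minimal sketch: loading the working data set (7 columns, pipe-delimited).
import csv
import pandas as pd

df = pd.read_csv('data\\interactive_labeling_dataset.csv',
                 delimiter='|',
                 header=0,
                 index_col=None,
                 engine='python',
                 quoting=csv.QUOTE_NONNUMERIC,
                 quotechar='\'')
corpus = df['Title'] + '. ' + df['Text']
\end{lstlisting}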
\section{Labeling}
We explore this data set in more detail in Chapter \ref{chap:exploration}.

From our dataset of XX.XXX news articles, we select 10.000 articles\footnote{833/844 articles of each month} to proceed with the labeling process.
\section{Data Labeling}

Here we explain our two different approaches of labeling data sets.

\subsection{Conventional Method}


@@ -271,7 +321,7 @@ From our dataset of XX.XXX news articles, we select 10.000 articles \footnote{83

\subsubsection{Unbalanced Data Set}

\section{Preprocessing}
\section{Data Preprocessing}
In order to use the news articles for machine learning algorithms, we must first prepare and filter the texts appropriately:

\begin{description}

|
|||
% Kapitel 5 Data Exploration
|
||||
%###########################
|
||||
\chapter{Data Exploration}
|
||||
|
||||
\label{chap:exploration}
|
||||
|
||||
\textit{
|
||||
In this chapter we explore our textual corpus, which contains of the news articles headline and plain text.
|
||||
In this chapter we explore our textual corpus of news articles.
|
||||
}
|
||||
|
||||
\section{Text Corpus Exploration}
|
||||
% Hier kommen Visualisierungen mit pyplot/seaborn rein.
|
||||
|
||||
The textual corpus\footnote{We describe the initial data set in detail in Chapter \ref{chap:design}, Section \ref{subsec:data_selection}.} contains of the news articles' headlines and plain texts, if not specified otherwise. For the sake of simplicity we use the unigram model for our analysis.
|
||||
|
||||
\subsection{Sources for News Articles}
|
||||
|
||||
As illustrated in Table \ref{table:sources}, the main source for news articles in our data set is \textit{Reuters.com}. This is due to the fact that Webhose.io does not have equal access to the desired sources and, above all, the news from Reuters.com has been parsed with the required quality.
|
||||
|
||||
|
||||
\begin{center}
|
||||
\begin{table}[h]
|
||||
\centering
|
||||
\begin{tabular}{|l|r|}
|
||||
\hline
|
||||
reuters.com & 94\% \\
|
||||
theguardian.com & 3\% \\
|
||||
economist.com & 2\% \\
|
||||
bloomberg.com & < 1\% \\
|
||||
cnn.com & < 1\% \\
|
||||
ft.com & < 1\% \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\caption{Article sources in the data set}
|
||||
\label{table:sources}
|
||||
\end{table}
|
||||
\end{center}
|
||||
|
||||
\al{Ist es ein Problem, dass Reuters die Hauptquelle ist?}
|
||||
|
||||
\subsection{Number of Features}
|
||||
% Wichtigste Features?
|
||||
The document term matrix of the entire data set has 47.545 features.
|
||||
|
||||
\subsection{Length of Articles}
|
||||
The average length of the news articles examined is [X] words.
|
||||
|
||||
The average length of the news articles examined is 2476 characters\footnote{headlines excluded}. The distribution of the article length in the dataset is shown in Figure \ref{fig:article_length}.
|
||||
|
||||
\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{images/art_length_200bins_best.png}
\caption{Histogram of article lengths}
\label{fig:article_length}
\end{figure}
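Figure \ref{fig:article_length} can be reproduced along the following lines; this is only a sketch with illustrative file names:

\begin{lstlisting}[language=Python, breaklines=true]
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('articles.csv')  # hypothetical file

# character length of every article text, headlines excluded
lengths = df['Text'].str.len()
print('average length: {:.0f} characters'.format(lengths.mean()))

# histogram with 200 bins, as in the figure above
plt.hist(lengths, bins=200)
plt.xlabel('article length (characters)')
plt.ylabel('number of articles')
plt.savefig('images/article_length_histogram.png')
\end{lstlisting}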

\subsection{Most Common Words}
% AFTER data preprocessing! (use our own BOW)

The 10 most common words in the data set are: \textit{'percent', 'fitch', 'billion', 'new', 'business', 'market', 'next', 'million', 'ratings', 'investors'}.
% toDo
\al{This is based only on the first 100 articles in the data set and still needs to be replaced.}
\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{images/Hist_10CommonWords_100rows_2.png}
\caption{Bar chart of the 10 most frequent words in the data set}
\label{fig:10_most_common}
\end{figure}
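Once a document term matrix is available, the most common words are simply the columns with the largest totals. A sketch based on the \texttt{CountVectorizer} example from above (names and dummy texts are illustrative):

\begin{lstlisting}[language=Python, breaklines=true]
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# assumed: texts is the list of article strings, as above
texts = ['Microsoft buys GitHub for 7.5 billion dollars.',
         'EU antitrust ruling on the deal is due by October 19.']

vectorizer = CountVectorizer(stop_words='english')
dtm = vectorizer.fit_transform(texts)

# total count of every vocabulary term over all articles
term_counts = np.asarray(dtm.sum(axis=0)).ravel()
vocab = np.array(vectorizer.get_feature_names())

# indices of the 10 largest counts, in descending order
top = np.argsort(term_counts)[::-1][:10]
for word, count in zip(vocab[top], term_counts[top]):
    print(word, count)
\end{lstlisting}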

% First a chart/word cloud of the whole corpus,
% then only the articles about mergers.

\subsubsection{Word Cloud}
% e.g. a word cloud of the Microsoft-GitHub merger articles.
\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{images/WordCloud_allRows_best.png}
\caption{Word cloud of the most frequent words in the data set}
\label{fig:wordcloud}
\end{figure}
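A word cloud of this kind can be generated, for example, with the third-party \texttt{wordcloud} package (an assumption; it is not among the modules used elsewhere in this work, and any comparable tool works):

\begin{lstlisting}[language=Python, breaklines=true]
import pandas as pd
from wordcloud import WordCloud, STOPWORDS

df = pd.read_csv('articles.csv')  # hypothetical file

# concatenate all article texts into one string
all_text = ' '.join(df['Text'].astype(str))

cloud = WordCloud(stopwords=STOPWORDS, background_color='white',
                  width=1600, height=800).generate(all_text)
cloud.to_file('images/wordcloud_all_articles.png')
\end{lstlisting}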

\subsection{Distribution of Company Names}
\al{This still refers to the old data set and needs to be replaced.}
'Comcast' is the most frequently used company name in the data set. Figure \ref{fig:company_names} shows that big companies dominate the reporting about mergers. In order to use a fairly distributed data set for model selection, we limit the number of articles used to 3 per company name.

\begin{figure}[h]
\centering
\includegraphics[width=\textwidth]{images/NER_old_50bins.png}
\caption{Histogram of the company name distribution}
\label{fig:company_names}
\end{figure}
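The underlying counts of company names can be obtained with a named entity recognizer. The following sketch uses nltk's built-in chunker and simply counts entities tagged as ORGANIZATION; it is a simplification, not the exact procedure used for the figure:

\begin{lstlisting}[language=Python, breaklines=true]
from collections import Counter
import nltk
import pandas as pd

# required nltk resources (downloaded once)
for pkg in ['punkt', 'averaged_perceptron_tagger',
            'maxent_ne_chunker', 'words']:
    nltk.download(pkg, quiet=True)

def organizations(text):
    '''returns all ORGANIZATION entities found in one article text'''
    tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    return [' '.join(word for word, tag in subtree.leaves())
            for subtree in tree.subtrees()
            if subtree.label() == 'ORGANIZATION']

df = pd.read_csv('articles.csv')  # hypothetical file
counts = Counter()
for text in df['Text'].astype(str):
    counts.update(organizations(text))

print(counts.most_common(10))
\end{lstlisting}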

\bigskip
\paragraph{Summary:}

\section{Conventional Method}

\subsection{Data Set}
First, we label a slightly smaller data set in a conventional way. This data set consists of 1497 news articles, which were downloaded via \textit{webhose.io}. It contains news articles from different Reuters RSS feeds covering a period of one month\footnote{The timeframe was May 25 -- June 25, 2018, retrieved on June 25, 2018.}. Here, we only keep articles that contain at least one of the keywords \textit{'merger', 'acquisition', 'take over', 'deal', 'transaction'} or \textit{'buy'} in the heading.

With the following query\footnote{Please read more about the possible filter settings on the website \url{https://docs.webhose.io/docs/filters-reference}.} we download the desired data from \textit{webhose.io}:\\\\
\texttt{
thread.title:(merger OR merges OR merge OR merged
OR acquisition
site\_type:news \\
site:reuters.com \\
language:english}
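
Programmatically, such a query can be sent with the \texttt{webhoseio} Python client. The following is only a sketch: the API token is a placeholder and the query string is abbreviated compared to the full filter shown above:

\begin{lstlisting}[language=Python, breaklines=true]
import webhoseio

webhoseio.config(token='YOUR_API_KEY')  # placeholder

params = {
    'q': 'thread.title:(merger OR acquisition OR "take over" OR deal '
         'OR transaction OR buy) site_type:news site:reuters.com '
         'language:english',
    'sort': 'crawled',
}
output = webhoseio.query('filterWebContent', params)

while output['posts']:
    for post in output['posts']:
        print(post['thread']['url'], post['title'])
    output = webhoseio.get_next()  # next page of results
\end{lstlisting}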

\subsection{Classification}
The articles are classified binary with the labels:
\begin{description}
\item[0:]{merger of company A and B}
\item[1:]{other}
\end{description}

The process of reading and labeling the 1497 news articles takes about 30 hours in total.

\subsection{Difficulties}
Some article texts are difficult to classify even when read carefully.

\subsection{Classification}
For the multi-class classification we use the following six classes:
\begin{description}
\item[1:]{merger of company A and B}
\item[2:]{merger is pending}
\item[3:]{merger is aborted}
\item[4:]{sale of shares}
\item[5:]{merger as incidental remark, not main topic}
\item[6:]{other / irrelevant news}
\end{description}

\subsection{Selection of Articles}

\section{Data Download}
\label{sec:data_download}

To retrieve our data, we make the following request on the website
\url{https://webhose.io}:\\\\
\texttt{
site:(reuters.com OR ft.com OR cnn.com OR economist.com\\
\noindent\hspace*{12mm}%
OR bloomberg.com OR theguardian.com)\\
site\_category:(financial\_news OR finance OR business)\\
\\
timeframe: january 2017 - december 2017} \\
\\
The requested data was downloaded in September 2018 in JSON file format. Every news article is saved in a single file; in total, 1.478.508 files were downloaded (4,69 GiB).
Among others, one JSON file contains the information shown in the following example:\\
\begin{lstlisting}[breaklines=true]
{
    "thread": {
        "uuid": "a931e8221a6a55fac4badd5c6992d0a525ca3e83",
        "url": "https://www.reuters.com/article/us-github-m-a-microsoft-eu/eu-antitrust-ruling-on-microsoft-buy-of-github-due-by-october-19-idUSKCN1LX114",
        "site": "reuters.com",
        "site_section": "http://feeds.reuters.com/reuters/financialsNews",
        "section_title": "Reuters | Financial News",
        "published": "2018-09-17T20:00:00.000+03:00",
        "site_type": "news",
        "spam_score": 0.0
    },
    "title": "EU antitrust ruling on Microsoft buy of GitHub due by October 19",
    "text": "BRUSSELS (Reuters)-EU antitrust regulators will decide by Oct. 19 whether to clear U.S. software giant Microsoft's $7.5 billion dollar acquisition of privately held coding website GitHub. Microsoft, which wants to acquire the firm to reinforce its cloud computing business against rival Amazon, requested European Union approval for the deal last Friday, a filing on the European Commission website showed on Monday. The EU competition enforcer can either give the green light with or without demanding concessions, or it can open a full-scale investigation if it has serious concerns. GitHub, the world's largest code host with more than 28 million developers using its platform, is Microsoft's largest takeover since the company bought LinkedIn for $26 billion in 2016. Microsoft Chief Executive Satya Nadella has tried to assuage users' worries that GitHub might favor Microsoft products over competitors after the deal, saying GitHub would continue to be an open platform that works with all the public clouds. Reporting by Foo Yun Chee; Editing by Edmund Blair",
    "language": "english",
    "crawled": "2018-09-18T01:52:42.035+03:00"
}
\end{lstlisting}
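
For further processing, the downloaded JSON files can be collected into a single pandas DataFrame with the attributes described earlier; a sketch with illustrative paths and file names:

\begin{lstlisting}[language=Python, breaklines=true]
import glob
import json
import pandas as pd

records = []
for path in glob.glob('data/webhoseio/*.json'):  # hypothetical directory
    with open(path, encoding='utf-8') as f:
        article = json.load(f)
    records.append({
        'Title': article['title'],
        'Text': article['text'],
        'Site': article['thread']['site'],
        'SiteSection': article['thread']['site_section'],
        'Url': article['thread']['url'],
        'Timestamp': article['thread']['published'],
    })

df = pd.DataFrame(records)
df.to_csv('articles.csv', index=False)
\end{lstlisting}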

\section{Python Modules}
\subsection{nltk}
\subsection{pandas}
\subsection{sklearn}
\subsection{webhoseio}

\section{Jupyter Notebook}
Used for interactive coding, labeling, visualization and documentation.

\subsection{Decision Tree}
% output the 20 most important features!
% simple train_test_split (0.25) used only on the Title column of the old data set:
\al{This is still from the old data set and needs to be replaced.}
The 20 most important words in the test set are:
['merger', 'buy', 'monsanto', 'warner', 'win', 'walmart', '2', 'billion', 'kkr', 'rival', 'uk', 'watch', 'jv', 'merg', 'get', 'non', 'anz', 'xerox', 'clear', 'deal']
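
A list like this can be derived from the feature importances of a fitted decision tree; a minimal sketch with scikit-learn, assuming a labeled file with the columns \texttt{Title} and \texttt{Label} (names are illustrative):

\begin{lstlisting}[language=Python, breaklines=true]
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

labeled = pd.read_csv('labeled_articles.csv')  # hypothetical file

# bag-of-words features on the titles only
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(labeled['Title'])
y = labeled['Label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1)

clf = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)

# the 20 features with the highest importance in the fitted tree
vocab = np.array(vectorizer.get_feature_names())
top20 = np.argsort(clf.feature_importances_)[::-1][:20]
print(vocab[top20])
\end{lstlisting}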
\textit{\newline
In the last chapter we have described ....
}

% The bibliography should appear in the table of contents,
% but not as a chapter:
\nocite{*}
% List of figures
\addcontentsline{toc}{chapter}{List of Figures}
\listoffigures
% Bibliography
\addcontentsline{toc}{chapter}{Bibliography}
% display the bibliography
\bibliographystyle{ieeetr}
\bibliography{refs}

\backmatter