added new files

Anne Lorenz 2018-09-07 14:16:47 +02:00
parent ecb629e16c
commit 3f98aff635
5 changed files with 1756 additions and 0 deletions

77
CosineSimilarity.py Normal file

@@ -0,0 +1,77 @@
'''
Cosine Similarity
=================
CosineSimilarity measures the similarity between two articles.
It calculates c, the cosine of the angle between the articles'
word-frequency vectors dict_1 and dict_2:
c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
c = 1, if the articles are equal    => similarity is 100%
0 <= c < 1, otherwise               => similarity is (c*100)%
(The greater c, the more similar the two articles are.)
'''
# TODO: currently uses one word-count dictionary per article
#       => has to be changed as we are now using vectors

import math

from BagOfWords import BagOfWords


class CosineSimilarity:

    def cos_sim(dict_1, dict_2):
        # vocab: list of all different words of both articles
        vocab = []
        # insert words of 1st article into vocab
        for key in dict_1.keys():
            if key not in vocab:
                vocab.append(key)
        # insert words of 2nd article into vocab
        for key in dict_2.keys():
            if key not in vocab:
                vocab.append(key)
        # delete first entry ('sum_words')
        vocab.pop(0)
        # create word-frequency vectors over the common vocab
        vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
        vector_2 = CosineSimilarity.create_vector(dict_2, vocab)
        # numerator of the formula: dot product of the two vectors
        sum_1 = 0
        for i in range(0, len(vector_1)):
            sum_1 += vector_1[i] * vector_2[i]
        # denominator of the formula: product of the two vector lengths
        sum_2 = 0
        for entry in vector_1:
            sum_2 += entry ** 2
        sum_3 = 0
        for entry in vector_2:
            sum_3 += entry ** 2
        return sum_1 / (math.sqrt(sum_2) * math.sqrt(sum_3))

    def create_vector(dict, vocab):
        # word frequency vector over vocab
        # ('sum_words' has already been removed from vocab in cos_sim)
        vector = []
        for word in vocab:
            # check if word occurs in article
            if word in dict:
                # insert word count
                vector.append(dict[word])
            else:
                # insert zero
                vector.append(0)
        return vector

60
NER.py Normal file

@@ -0,0 +1,60 @@
'''
Named Entity Recognition (NER)
==============================
NER takes a text as input and searches for names of persons, companies
and countries.
'''
from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
from nltk.tree import Tree
''' TODO: misclassified:
[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
('PERSON', 'Maybank Kim Eng Securities '), ('PERSON', 'Krung Thai Bank '),
('PERSON', 'Siam Commercial Bank '), ('PERSON', 'Singapore '),
('PERSON', 'Keppel Corp '), ('ORGANIZATION', 'Companies ')]
'''
class NER:

    def get_ne_with_label(text):
        labels = []
        names = []
        # TODO: the last word is not recognized, hence the appended dummy word
        for chunk in ne_chunk(pos_tag(word_tokenize(text + ' lastword.'))):
            if hasattr(chunk, 'label'):
                name = ''
                for c in chunk:
                    name += c[0] + ' '
                if name not in names:
                    names.append(name.strip())
                    labels.append(chunk.label())
                    #print(chunk.label(), ' '.join(c[0] for c in chunk))
        return list(zip(labels, names))
test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
the most actively \ntraded by turnover, dropped 2.8 percent to a near
one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
\nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
lower than 130 percent, the \ndesired level we think and hence the need for
more provisioning \nin the following quarters,\" the broker said in a report.
\nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
creditors, dropped 1 percent. The steel firm \nand its three creditors
agreed on Monday to consider options to \nrestructure debt worth over
50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
slides for a third \nsession, Singapore gave up early gains and Indonesia
\nhit a near one-week low, all with trading volumes below \nthe 30-day
average ahead of a public holiday on Thursday. \nAmong top losers in the
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''
print(NER.get_ne_with_label(test_article))
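
For clarity, a short sketch (not part of the commit) of how the returned list can be consumed; the labels are NLTK's standard entity labels, and the exact classifications vary, as the TODO above shows:

# get_ne_with_label returns (label, name) pairs, e.g.
# [('PERSON', 'Krung Thai Bank Pcl'), ('ORGANIZATION', 'Companies'), ...]
entities = NER.get_ne_with_label(test_article)
# keep only organisations and persons
companies_and_persons = [name for label, name in entities
                         if label in ('ORGANIZATION', 'PERSON')]
print(companies_and_persons)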

93
Requester.py Normal file

@@ -0,0 +1,93 @@
'''
Requester
=========
retrieves JSON files from webhose.io and
saves the articles' relevant information in a csv file
'''
# TODO: insert personal webhose.io API key below
import re
from datetime import datetime
import pandas as pd
import webhoseio
from CsvHandler import CsvHandler
class Requester:

    def save_articles_from_webhoseio():
        ''' create DataFrame of articles with
            Timestamp, Title, Text, SiteSection
            and then save it in csv target file
        '''
        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
        filestring = 'download_articles_{}.csv'.format(datestring)

        # print message
        print('# retrieving articles from webhose.io')

        # personal API key
        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

        # webhose.io query
        # suboptimal: usage of search terms :-(
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
                 "acquisition OR \"take over\" OR \"take-over\" OR "
                 "\"takeover\" OR deal OR transaction OR buy OR sell OR "
                 "approval OR approve OR \"business combination\" OR "
                 "\"combined company\") "
                 "is_first:true "
                 "site_type:news "
                 "site:reuters.com "
                 "language:english "
                 "has_video:false",
            "ts": "1527411742661",
            "sort": "crawled"}

        output = webhoseio.query("filterWebContent", query_params)

        sum_posts = output['totalResults']
        print('# total sum of posts: ' + str(sum_posts))

        # 100 articles per batch (download)
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
        print('# sorting out other sources than reuters')

        # two-dimensional list of all articles
        list_articles = []

        for n in range(num_downloads):
            # save next 100 articles
            for i in range(100):
                # check if the source is 'reuters'
                if not re.search(r'reuters',
                                 output['posts'][i]['thread']['site_section']):
                    continue
                else:
                    article = []
                    article.append(output['posts'][i]['published'])
                    article.append(output['posts'][i]['title'].replace('|', ' '))
                    # remove white spaces and separators
                    text = output['posts'][i]['text'].replace('\n', ' ')\
                        .replace('\r', ' ').replace('|', ' ')
                    section = output['posts'][i]['thread']['site_section']
                    article.append(text)
                    # remove '\r' at end of some urls
                    section = section.replace('\r', '')
                    article.append(section)
                    # add article to list
                    list_articles.append(article)
            # get the next batch of 100 posts
            output = webhoseio.get_next()

        # create DataFrame
        df = pd.DataFrame(data=list_articles,
                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
        # save csv
        CsvHandler.write_csv(df, filestring)
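
A quick sanity-check sketch (not part of the commit), assuming the csv written by CsvHandler can be read back with pandas defaults; the separator used by CsvHandler is not specified here:

# assumption: CsvHandler.write_csv produces a file readable by pd.read_csv
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
df_check = pd.read_csv('download_articles_{}.csv'.format(datestring))
print(df_check[['Timestamp', 'Title', 'Text', 'SiteSection']].head())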

28
Starter.py Normal file

@@ -0,0 +1,28 @@
'''
Starter
=============
starter program
'''
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
from DecisionTree import DecisionTree
from NaiveBayes import NaiveBayes
#from Requester import Requester
#from SVM import SVM
print('# starting program')
print()
# Requester.save_articles_from_webhoseio()
file = 'classification_labelled_corrected.csv'
# read csv file
dataset = CsvHandler.read_csv(file)
# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)
print('# ending program')

File diff suppressed because one or more lines are too long