added new files

Anne Lorenz 2018-09-07 14:16:47 +02:00
parent ecb629e16c
commit 3f98aff635
5 changed files with 1756 additions and 0 deletions

77
CosineSimilarity.py Normal file

@@ -0,0 +1,77 @@
'''
Cosine Similarity
=================
CosineSimilarity measures the similarity between two articles.
It calculates c, the cosine of the angle between the articles'
word-frequency vectors dict_1 and dict_2:
c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
c = 1, if the articles are equal    => similarity is 100%
0 <= c < 1, otherwise               => similarity is (c*100)%
(The greater c, the more similar the two articles are.)
'''
# TODO: currently uses one word-count dictionary per article
#       => has to be changed as we are now using vectors

import math

from BagOfWords import BagOfWords


class CosineSimilarity:

    def cos_sim(dict_1, dict_2):
        # vocab: list of all different words of both articles
        vocab = []
        # insert words of 1st article into vocab
        for key in dict_1.keys():
            if key not in vocab:
                vocab.append(key)
        # insert words of 2nd article into vocab
        for key in dict_2.keys():
            if key not in vocab:
                vocab.append(key)
        # delete first entry ('sum_words')
        vocab.pop(0)
        # create word-frequency vectors over the common vocab
        vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
        vector_2 = CosineSimilarity.create_vector(dict_2, vocab)
        # numerator of the formula: dot product of the two vectors
        sum_1 = 0
        for i in range(0, len(vector_1)):
            sum_1 += vector_1[i] * vector_2[i]
        # denominator of the formula: product of the two vector lengths
        sum_2 = 0
        for entry in vector_1:
            sum_2 += entry ** 2
        sum_3 = 0
        for entry in vector_2:
            sum_3 += entry ** 2
        return sum_1 / (math.sqrt(sum_2) * math.sqrt(sum_3))

    def create_vector(dict, vocab):
        # word frequency vector over vocab
        # ('sum_words' has already been removed from vocab in cos_sim)
        vector = []
        for word in vocab:
            # check if word occurs in article
            if word in dict:
                # insert word count
                vector.append(dict[word])
            else:
                # insert zero
                vector.append(0)
        return vector

60
NER.py Normal file

@@ -0,0 +1,60 @@
'''
Named Entity Recognition (NER)
==============================
NER takes a text as input and searches for names of persons, companies
and countries.
'''
from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
from nltk.tree import Tree
''' TODO: misclassified:
[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
('PERSON', 'Maybank Kim Eng Securities '), ('PERSON', 'Krung Thai Bank '),
('PERSON', 'Siam Commercial Bank '), ('PERSON', 'Singapore '),
('PERSON', 'Keppel Corp '), ('ORGANIZATION', 'Companies ')]
'''
class NER:

    def get_ne_with_label(text):
        labels = []
        names = []
        # TODO: the last word is not recognized, hence the appended dummy word
        for chunk in ne_chunk(pos_tag(word_tokenize(text + ' lastword.'))):
            if hasattr(chunk, 'label'):
                name = ''
                for c in chunk:
                    name += c[0] + ' '
                if name not in names:
                    names.append(name.strip())
                    labels.append(chunk.label())
                    #print(chunk.label(), ' '.join(c[0] for c in chunk))
        return list(zip(labels, names))
test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
the most actively \ntraded by turnover, dropped 2.8 percent to a near
one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
\nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
lower than 130 percent, the \ndesired level we think and hence the need for
more provisioning \nin the following quarters,\" the broker said in a report.
\nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
creditors, dropped 1 percent. The steel firm \nand its three creditors
agreed on Monday to consider options to \nrestructure debt worth over
50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
slides for a third \nsession, Singapore gave up early gains and Indonesia
\nhit a near one-week low, all with trading volumes below \nthe 30-day
average ahead of a public holiday on Thursday. \nAmong top losers in the
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''
print(NER.get_ne_with_label(test_article))
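
For clarity, a short sketch (not part of the commit) of how the returned list can be consumed; the labels are NLTK's standard entity labels, and the exact classifications vary, as the TODO above shows:

# get_ne_with_label returns (label, name) pairs, e.g.
# [('PERSON', 'Krung Thai Bank Pcl'), ('ORGANIZATION', 'Companies'), ...]
entities = NER.get_ne_with_label(test_article)
# keep only organisations and persons
companies_and_persons = [name for label, name in entities
                         if label in ('ORGANIZATION', 'PERSON')]
print(companies_and_persons)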

93
Requester.py Normal file

@@ -0,0 +1,93 @@
'''
Requester
=========
retrieves JSON files from webhose.io and
saves the articles' relevant information in a csv file
'''
# TODO: insert personal webhose.io API key below
import re
from datetime import datetime
import pandas as pd
import webhoseio
from CsvHandler import CsvHandler
class Requester:

    def save_articles_from_webhoseio():
        ''' create DataFrame of articles with
            Timestamp, Title, Text, SiteSection
            and then save it in csv target file
        '''
        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
        filestring = 'download_articles_{}.csv'.format(datestring)

        # print message
        print('# retrieving articles from webhose.io')

        # personal API key
        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

        # webhose.io query
        # suboptimal: usage of search terms :-(
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
                 "acquisition OR \"take over\" OR \"take-over\" OR "
                 "\"takeover\" OR deal OR transaction OR buy OR sell OR "
                 "approval OR approve OR \"business combination\" OR "
                 "\"combined company\") "
                 "is_first:true "
                 "site_type:news "
                 "site:reuters.com "
                 "language:english "
                 "has_video:false",
            "ts": "1527411742661",
            "sort": "crawled"}

        output = webhoseio.query("filterWebContent", query_params)

        sum_posts = output['totalResults']
        print('# total sum of posts: ' + str(sum_posts))

        # 100 articles per batch (download)
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
        print('# sorting out other sources than reuters')

        # two-dimensional list of all articles
        list_articles = []

        for n in range(num_downloads):
            # save next 100 articles
            for i in range(100):
                # check if the source is 'reuters'
                if not re.search(r'reuters',
                                 output['posts'][i]['thread']['site_section']):
                    continue
                else:
                    article = []
                    article.append(output['posts'][i]['published'])
                    article.append(output['posts'][i]['title'].replace('|', ' '))
                    # remove white spaces and separators
                    text = output['posts'][i]['text'].replace('\n', ' ')\
                        .replace('\r', ' ').replace('|', ' ')
                    section = output['posts'][i]['thread']['site_section']
                    article.append(text)
                    # remove '\r' at end of some urls
                    section = section.replace('\r', '')
                    article.append(section)
                    # add article to list
                    list_articles.append(article)
            # get the next batch of 100 posts
            output = webhoseio.get_next()

        # create DataFrame
        df = pd.DataFrame(data=list_articles,
                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
        # save csv
        CsvHandler.write_csv(df, filestring)
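
A quick sanity-check sketch (not part of the commit), assuming the csv written by CsvHandler can be read back with pandas defaults; the separator used by CsvHandler is not specified here:

# assumption: CsvHandler.write_csv produces a file readable by pd.read_csv
datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
df_check = pd.read_csv('download_articles_{}.csv'.format(datestring))
print(df_check[['Timestamp', 'Title', 'Text', 'SiteSection']].head())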

28
Starter.py Normal file

@@ -0,0 +1,28 @@
'''
Starter
=============
starter program
'''
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
from DecisionTree import DecisionTree
from NaiveBayes import NaiveBayes
#from Requester import Requester
#from SVM import SVM
print('# starting program')
print()
# Requester.save_articles_from_webhoseio()
file = 'classification_labelled_corrected.csv'
# read csv file
dataset = CsvHandler.read_csv(file)
# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)
print('# ending program')

File diff suppressed because one or more lines are too long