added new files
parent ecb629e16c
commit 3f98aff635
@ -0,0 +1,77 @@
'''
Cosine Similarity
=================

CosineSimilarity measures the similarity between two articles.
It calculates c, the cosine of the angle between the articles'
word-count vectors built from dict_1 and dict_2:
c = (vector_1 * vector_2) / (|vector_1| * |vector_2|).
c = 1 if the articles are identical => similarity is 100%
0 <= c < 1 otherwise => similarity is (c*100)%
(The greater c, the more similar the two articles are.)
'''

# TODO: currently uses the word-count dictionary of each article
# => has to be changed as we are now using vectors

import math

from BagOfWords import BagOfWords


class CosineSimilarity:

    @staticmethod
    def cos_sim(dict_1, dict_2):

        # list of all different words
        vocab = []

        # insert words of 1st article into vocab
        for key in dict_1.keys():
            if key not in vocab:
                vocab.append(key)

        # insert words of 2nd article into vocab
        for key in dict_2.keys():
            if key not in vocab:
                vocab.append(key)

        # delete first entry ('sum_words')
        vocab.pop(0)

        # create word frequency vectors
        vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
        vector_2 = CosineSimilarity.create_vector(dict_2, vocab)

        # start calculation
        # numerator of the formula: dot product of the two vectors
        sum_1 = 0
        for i in range(len(vector_1)):
            sum_1 += vector_1[i] * vector_2[i]

        # denominator of the formula: product of the vector norms
        sum_2 = 0
        for entry in vector_1:
            sum_2 += entry ** 2

        sum_3 = 0
        for entry in vector_2:
            sum_3 += entry ** 2

        return sum_1 / (math.sqrt(sum_2) * math.sqrt(sum_3))

    @staticmethod
    def create_vector(word_dict, vocab):
        # word frequency vector
        vector = []
        for word in vocab:
            # check if word occurs in article
            if word in word_dict:
                # insert word count
                vector.append(word_dict[word])
            else:
                # insert zero
                vector.append(0)
        # note: vocab already excludes 'sum_words',
        # so no entry has to be removed here
        return vector
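A quick usage sketch of cos_sim (illustrative only, not part of the committed file); the example dictionaries are assumed to start with a 'sum_words' entry, since that is what the code above expects:

# illustrative example data; the leading 'sum_words' entry is an assumption
dict_1 = {'sum_words': 4, 'bank': 2, 'merger': 1, 'deal': 1}
dict_2 = {'sum_words': 3, 'bank': 1, 'deal': 2}

# vocab becomes ['bank', 'merger', 'deal'], the vectors [2, 1, 1] and
# [1, 0, 2], so c = (2*1 + 1*0 + 1*2) / (sqrt(6) * sqrt(5)) ~ 0.73
print(CosineSimilarity.cos_sim(dict_1, dict_2))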
@ -0,0 +1,60 @@
'''
Named Entity Recognition (NER)
==============================

NER takes a text as input and searches for names of persons, companies
and countries.
'''
from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
from nltk.tree import Tree

''' TODO: misclassified:
[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
('PERSON', 'Maybank Kim Eng Securities '), ('PERSON', 'Krung Thai Bank '),
('PERSON', 'Siam Commercial Bank '), ('PERSON', 'Singapore '),
('PERSON', 'Keppel Corp '), ('ORGANIZATION', 'Companies ')]
'''


class NER:

    @staticmethod
    def get_ne_with_label(text):
        labels = []
        names = []
        # TODO: the last word is not recognized, hence the appended 'lastword.'
        for chunk in ne_chunk(pos_tag(word_tokenize(text + 'lastword.'))):
            if hasattr(chunk, 'label'):
                name = ''
                for c in chunk:
                    name += c[0] + ' '
                # compare the stripped name, otherwise the trailing space
                # makes the duplicate check always succeed
                if name.strip() not in names:
                    names.append(name.strip())
                    labels.append(chunk.label())
                    #print(chunk.label(), ' '.join(c[0] for c in chunk))
        return list(zip(labels, names))


test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
the most actively \ntraded by turnover, dropped 2.8 percent to a near
one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
\nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
lower than 130 percent, the \ndesired level we think and hence the need for
more provisioning \nin the following quarters,\" the broker said in a report.
\nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
creditors, dropped 1 percent. The steel firm \nand its three creditors
agreed on Monday to consider options to \nrestructure debt worth over
50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
slides for a third \nsession, Singapore gave up early gains and Indonesia
\nhit a near one-week low, all with trading volumes below \nthe 30-day
average ahead of a public holiday on Thursday. \nAmong top losers in the
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''

print(NER.get_ne_with_label(test_article))
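For context, a small sketch (illustrative only, not part of the committed file) of the structure that ne_chunk returns and that the loop above consumes: its children are either plain (word, POS tag) tuples or nltk.tree.Tree subtrees carrying an entity label, which is why the code checks hasattr(chunk, 'label').

from nltk import ne_chunk, pos_tag, word_tokenize

# illustrative sentence; requires the usual nltk data packages
tree = ne_chunk(pos_tag(word_tokenize('Krung Thai Bank fell in Bangkok.')))
for chunk in tree:
    if hasattr(chunk, 'label'):
        # a Tree whose label is the entity type (PERSON, ORGANIZATION, GPE, ...)
        print(chunk.label(), ' '.join(token for token, tag in chunk))
    else:
        # a plain (word, POS tag) tuple, e.g. ('fell', 'VBD')
        print(chunk)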
@ -0,0 +1,93 @@
'''
Requester
=========

retrieves JSON files from webhose.io and saves the articles'
relevant information in a csv file
'''

# TODO: insert personal webhose.io key

import re
from datetime import datetime

import pandas as pd
import webhoseio

from CsvHandler import CsvHandler


class Requester:

    @staticmethod
    def save_articles_from_webhoseio():
        ''' create DataFrame of articles with
        Timestamp, Title, Text, SiteSection
        and then save it in csv target file
        '''
        datestring = datetime.strftime(datetime.now(), '%Y-%m-%d')
        filestring = 'download_articles_{}.csv'.format(datestring)

        print('# retrieving articles from webhose.io')

        # personal API key
        webhoseio.config(token="XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX")

        # webhose.io query
        # suboptimal: usage of search terms :-(
        query_params = {
            "q": "thread.title:(merger OR merges OR merge OR merged OR "
                 "acquisition OR \"take over\" OR \"take-over\" OR "
                 "\"takeover\" OR deal OR transaction OR buy OR sell OR "
                 "approval OR approve OR \"business combination\" OR "
                 "\"combined company\") "
                 "is_first:true "
                 "site_type:news "
                 "site:reuters.com "
                 "language:english "
                 "has_video:false",
            "ts": "1527411742661",
            "sort": "crawled"}

        output = webhoseio.query("filterWebContent", query_params)

        sum_posts = output['totalResults']
        print('# total sum of posts: ' + str(sum_posts))

        # 100 articles per batch (download)
        num_downloads = int(sum_posts / 100)
        print('# collecting first {} articles'.format(num_downloads * 100))
        print('# sorting out sources other than reuters')

        # two-dimensional list of all articles
        list_articles = []

        for n in range(num_downloads):
            # save next 100 articles
            for i in range(100):
                # check if the source is 'reuters'
                if not re.search(r'reuters',
                                 output['posts'][i]['thread']['site_section']):
                    continue
                else:
                    article = []
                    article.append(output['posts'][i]['published'])
                    article.append(output['posts'][i]['title'].replace('|', ' '))
                    # remove white spaces and separators
                    text = (output['posts'][i]['text'].replace('\n', ' ')
                            .replace('\r', ' ').replace('|', ' '))
                    section = output['posts'][i]['thread']['site_section']
                    article.append(text)
                    # remove '\r' at the end of some urls
                    section = section.replace('\r', '')
                    article.append(section)
                    # add article to list
                    list_articles.append(article)

            # get the next batch of 100 posts
            output = webhoseio.get_next()

        # create DataFrame
        df = pd.DataFrame(data=list_articles,
                          columns=['Timestamp', 'Title', 'Text', 'SiteSection'])
        # save csv
        CsvHandler.write_csv(df, filestring)
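A minimal usage sketch (illustrative only, not part of the committed file): it assumes a valid webhose.io token has been inserted above, that CsvHandler.read_csv returns a pandas DataFrame with the columns written here, and that the dated filename below is just an example.

from CsvHandler import CsvHandler
from Requester import Requester

# downloads the posts batch by batch (webhoseio.get_next) and writes
# one row per Reuters article with Timestamp, Title, Text, SiteSection
Requester.save_articles_from_webhoseio()

# re-read the result; the date in the filename is only an example
df = CsvHandler.read_csv('download_articles_2018-05-29.csv')
print(df[['Timestamp', 'Title']].head())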
@ -0,0 +1,28 @@
'''
Starter
=======

starter program
'''

from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
from DecisionTree import DecisionTree
from NaiveBayes import NaiveBayes
#from Requester import Requester
#from SVM import SVM

print('# starting program')
print()

# Requester.save_articles_from_webhoseio()
file = 'classification_labelled_corrected.csv'

# read csv file
dataset = CsvHandler.read_csv(file)

# DecisionTree.make_tree(dataset)
NaiveBayes.make_naive_bayes(dataset)
# SVM.make_svm(dataset)

print('# ending program')
File diff suppressed because one or more lines are too long