refactoring

Anne Lorenz 2018-10-22 10:17:52 +02:00
parent cbfbdffdb7
commit 0c15d49d7e
4 changed files with 269 additions and 179 deletions

BagOfWords.py

@@ -4,23 +4,17 @@
 Bag Of Words
 ============
-BagOfWords counts word stems in an article
-and adds new words to the global vocabulary.
-
-note:
-The multinomial Naive Bayes classifier is suitable
-for classification with discrete features (e.g.,
-word counts for text classification).
-The multinomial distribution normally requires
-integer feature counts. However, in practice,
-fractional counts such as tf-idf may also work.
-=> considered by 'relative_word_frequencies' as parameter
+BagOfWords counts word stems in an article and adds new words to the global
+vocabulary. The multinomial Naive Bayes classifier is suitable for
+classification with discrete features (e.g., word counts for text
+classification). The multinomial distribution normally requires integer
+feature counts. However, in practice, fractional counts such as tf-idf may
+also work. => considered by the 'relative_word_frequencies' parameter.
 '''
 import re
 import pandas as pd
 from nltk.stem.porter import PorterStemmer

 class BagOfWords:
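
The rewritten docstring keeps the old note about why word counts (or tf-idf-style relative frequencies) suit a multinomial Naive Bayes model. As a minimal sketch of that pairing, assuming scikit-learn is available; the matrix, vocabulary, and labels below are invented for illustration, in the shape make_matrix produces with relative_word_frequencies=False:

import pandas as pd
from sklearn.naive_bayes import MultinomialNB

# toy word-count matrix: rows = articles, columns = vocabulary stems
X = pd.DataFrame([[2, 0, 1],
                  [0, 3, 0]],
                 columns=['merger', 'quarter', 'deal'])
# hypothetical labels (1 = merger-related article)
y = [1, 0]

clf = MultinomialNB()
clf.fit(X, y)
print(clf.predict(X))  # fractional (tf-idf-like) counts also work in practice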
@@ -35,6 +29,7 @@ class BagOfWords:
         '''takes article as argument, removes numbers,
         returns list of single words, recurrences included.
         '''
+        stemmer = PorterStemmer()
         stop_words = BagOfWords.set_stop_words()
         # replace punctuation marks with spaces
         words = re.sub(r'\W', ' ', text)
@@ -43,30 +38,21 @@ class BagOfWords:
         # list of all words to return
         words_cleaned = []
         for word in words:
-            # leave out numbers
+            # check if alphabetic char
             if word.isalpha():
-                # reduce word to stem
-                word = BagOfWords.reduce_word_to_stem(word)
+                # reduce word in lower case to stem
+                word = stemmer.stem(word.lower())
                 # check if not stop word
-                if word.lower() not in stop_words:
-                    # add every word in lowercase
-                    words_cleaned.append(word.lower())
+                if word not in stop_words:
+                    words_cleaned.append(word)
         return words_cleaned

-    def reduce_word_to_stem(word):
-        '''takes normal word as input, returns the word's stem
-        '''
-        stemmer = PorterStemmer()
-        # replace word by its stem
-        word = stemmer.stem(word)
-        return word
-
     def make_matrix(series, vocab, relative_word_frequencies=True):
-        '''calculates word stem frequencies in input articles.
-        returns matrix (DataFrame) with relative word frequencies
-        (0 <= values < 1) if relative_word_frequencies=True or absolute
-        word frequencies (int) if relative_word_frequencies=False.
-        (rows: different articles, colums: different words in vocab)
+        '''calculates word stem frequencies in input articles. returns matrix
+        (DataFrame) with relative word frequencies (0 <= values < 1) if
+        relative_word_frequencies=True or absolute word frequencies (int) if
+        relative_word_frequencies=False. (rows: different articles, columns:
+        different words in vocab)
         '''
         print('# BOW: calculating matrix')
         print('# ...')
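
Since extract_words now lowercases before stemming, the tokens it returns are Porter stems rather than dictionary words. A quick illustration of standard Porter behaviour, shown here only for orientation:

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem('falling'))    # fall
print(stemmer.stem('companies'))  # compani
print(stemmer.stem('was'))        # wa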
@@ -90,7 +76,6 @@ class BagOfWords:
                 else:
                     # absolute word frequency
                     vector[i] += 1
-            # add single vector as tuple
             vectors.append(tuple(vector))
         df_vectors = pd.DataFrame.from_records(vectors,
@@ -109,10 +94,10 @@ class BagOfWords:
         for text in series:
             # add single article's text to total vocabulary
             vocab |= set(BagOfWords.extract_words(text))
-        # transform to list
-        vocab = list(vocab)
-        # sort list
-        vocab.sort()
+        # # transform to list
+        # vocab = list(vocab)
+        # # sort list
+        # vocab.sort()
         return vocab

     def set_stop_words():
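
With the list conversion and sort commented out, make_vocab now returns a plain set, and set iteration order is not stable across Python runs. A caller that needs reproducible matrix columns can sort explicitly; a sketch, where series stands for a pandas Series of article texts as elsewhere in the module:

vocab = BagOfWords.make_vocab(series)
# sort once here if column order in the matrix must be reproducible
matrix = BagOfWords.make_matrix(series, sorted(vocab))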
@@ -151,20 +136,17 @@ class BagOfWords:
         #add unwanted terms
         stop_words.extend(['reuters', 'bloomberg', 'cnn', 'economist'])
-        #remove the word 'not' from stop words
-        #stop_words.remove('not')
+        # #remove the word 'not' from stop words?
+        # stop_words.remove('not')
+        stemmer = PorterStemmer()
         for i in range(len(stop_words)):
-            # remove punctuation marks and strip endings from abbreviations
-            #stop_words[i] = re.split(r'\W', stop_words[i])[0]
-            # reduce word to stem
-            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
+            # reduce stop words to stem
+            stop_words[i] = stemmer.stem(stop_words[i])
         # transform list to set to eliminate duplicates
-        stop_words = set(stop_words)
-        return stop_words
+        return set(stop_words)

 if __name__ == '__main__':
     test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
     EU approval - sources. BRUSSELS (Reuters) - U.S. software
@@ -189,4 +171,5 @@ if __name__ == '__main__':
     request for immediate comment. Microsoft declined to
     comment. Reporting by Foo Yun Chee; editing by Jason
     Neely'''
     print(BagOfWords.extract_words(test_article))
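
Note why set_stop_words now runs every stop word through the same PorterStemmer that extract_words uses: tokens are compared after stemming, so an unstemmed stop-word list would silently stop matching. A small illustration:

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
# a stemmed token 'wa' would not match the raw stop word 'was'
print(stemmer.stem('was'))      # wa
print(stemmer.stem('because'))  # becaus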

JSON Handler (deleted file, 133 lines)

@@ -1,133 +0,0 @@
'''
JSON Handler
============
JSON Handler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''
# -*- coding: utf-8 -*-
import csv
import glob
import json

import numpy as np
import pandas as pd

class JsonHandler:

    def select_randoms(df, n):
        '''selects n random samples from dataset.
        params: df DataFrame to select items from,
                n number of items to select randomly,
        returns new DataFrame with only selected items
        '''
        # initialize random => reproducible sequence
        np.random.seed(5)
        # add new column 'Random'
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by random numbers
        df = df.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return df.iloc[0:n]

    def create_csv(file_name):
        # create new csv file for each month.
        # each row contains a news article.
        with open(file_name, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # write header / column names
            writer.writerow(['Uuid',        #0
                             'Title',       #1
                             'Text',        #2
                             'Site',        #3
                             'SiteSection', #4
                             'Url',         #5
                             'Timestamp'])  #6

    def write_articles_to_csv(file_name):
        # path of JSON files
        path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
               '\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4'\
               '\\news_[0-9]*.json'
        files = glob.glob(path)
        # reliable sources (site_sections)
        site_sections = ['http://feeds.reuters.com/reuters/financialsNews',
                         'http://feeds.reuters.com/reuters/INbusinessNews',
                         'http://feeds.reuters.com/reuters/businessNews',
                         'http://feeds.reuters.com/reuters/companyNews',
                         'http://www.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/mergersNews',
                         'http://rss.cnn.com/rss/money_topstories.rss',
                         'http://rss.cnn.com/rss/money_latest.rss',
                         'http://www.economist.com/sections/business-finance/rss.xml',
                         'http://rss.cnn.com/rss/edition_business.rss',
                         'http://in.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/technologyNews',
                         'http://feeds.reuters.com/reuters/technologysectorNews',
                         'https://www.ft.com/companies/us',
                         'http://feeds.reuters.com/reuters/UKScienceNews',
                         'http://in.reuters.com/news/technology',
                         'http://in.reuters.com/finance/economy',
                         'https://www.bloomberg.com/middleeast',
                         'http://in.reuters.com/news/top-news']
        # file counter
        n = 0
        # article counter
        a = 0
        # read every JSON file in current folder
        with open(file_name, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            for file in files:
                n += 1
                with open(file, encoding='utf-8') as f:
                    # Json is converted to dict
                    dict = json.load(f)
                    #print(n)
                    # leave out comments or posts, take only reuters as source
                    if ((dict['ord_in_thread'] != 0) or
                        (dict['language'] != 'english') or
                        (dict['thread']['spam_score'] > 0.3) or
                        (dict['thread']['site_section'] not in site_sections)):
                        continue
                    # pick only relevant information of article
                    # and put it in a list
                    article = [dict['thread']['uuid'],         # 0:'Uuid'
                               dict['thread']['title'],        # 1:'Title'
                               dict['text'],                   # 2:'Text'
                               dict['thread']['site'],         # 3:'Site'
                               dict['thread']['site_section'], # 4:'SiteSection'
                               dict['url'],                    # 5:'Url'
                               dict['published']]              # 6:'Timestamp'
                    # remove newlines and delimiter chars
                    article[1] = article[1].replace('|', '-')
                    article[2] = article[2].replace('\n', ' ')\
                                           .replace('\r', ' ').replace('|', '-')
                    try:
                        writer.writerow(article)
                        a += 1
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out: {} (UnicodeEncodeError)'
                              .format(dict['thread']['site_section']))
        print()
        print('# saved {} articles in file {}'.format(a, file_name))

if __name__ == '__main__':
    file_name = 'test.csv'
    JsonHandler.create_csv(file_name)
    JsonHandler.write_articles_to_csv(file_name)

JsonHandler.py (new file, 152 lines)

@@ -0,0 +1,152 @@
'''
Json Handler
============
JsonHandler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''
# -*- coding: utf-8 -*-
import csv
import glob
import json

import numpy as np
import pandas as pd

class JsonHandler:

    # string for every month of the year
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
              '11', '12']

    def select_randoms(df, n):
        '''select n random samples from dataset.
        params: df DataFrame to select items from,
                n number of items to select randomly,
        return new DataFrame with only selected items.
        '''
        # initialize random => reproducible sequence
        np.random.seed(5)
        # add new column 'Random'
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by random numbers
        df = df.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return df.iloc[0:n]

    def create_labeling_dataset():
        # number of articles to select from each month:
        # 10,000 / 12 = 833.33
        every_third_month = ['03', '06', '09', '12']
        for m in JsonHandler.months:
            # pick one more from every third month
            n_select = 834 if m in every_third_month else 833
            df = pd.read_csv('all_{}.csv'.format(m),
                             delimiter='|',
                             header=0,
                             index_col=None,
                             engine='python',
                             quotechar='\'',
                             quoting=0,
                             encoding='utf-8')
            JsonHandler.select_randoms(df, n_select)\
                       .to_csv('labeling_dataset.csv',
                               header=True,
                               mode='a',
                               encoding='utf-8',
                               quoting=csv.QUOTE_MINIMAL,
                               quotechar='\'')

    def write_articles_to_csv_files():
        '''read JSON files, select articles and write them to csv.
        '''
        # reliable sources (site_sections)
        site_sections = []
        # read list from 'sections.txt' file
        with open('sections.txt', 'r') as s_list:
            site_sections = s_list.read().split('\n')
        # article counter
        a = 0
        for m in JsonHandler.months:
            # 1 output file per month
            output_file = 'all_{}.csv'.format(m)
            # path of input JSON files per month
            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
                   '\\news_[0-9]*.json'.format(m)
            files = glob.glob(path)
            # file counter
            n = 0
            # write separate csv file for every month
            with open(output_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter='|',
                                    quotechar='\'',
                                    quoting=csv.QUOTE_NONNUMERIC)
                # write header / column names
                writer.writerow(['Uuid',        #0
                                 'Title',       #1
                                 'Text',        #2
                                 'Site',        #3
                                 'SiteSection', #4
                                 'Url',         #5
                                 'Timestamp'])  #6
                # write articles
                for file in files:
                    n += 1
                    # read every JSON file
                    with open(file, encoding='utf-8') as f:
                        # Json is converted to dict
                        dict = json.load(f)
                        # check if comment or post
                        if ((dict['ord_in_thread'] != 0) or
                            # check if not english
                            (dict['language'] != 'english') or
                            # check if spam
                            (dict['thread']['spam_score'] > 0.3) or
                            # check if reliable source
                            (dict['thread']['site_section'] not in site_sections) or
                            # check if text parsed correctly
                            ('Further company coverage:' in dict['text']) or
                            ('subscription' in dict['text']) or
                            ('subscribe' in dict['text']) or
                            (len(dict['text']) < 300)):
                            continue
                        else:
                            try:
                                # replace whitespaces and delimiter chars
                                # and write to csv
                                writer.writerow([dict['thread']['uuid'],
                                                 dict['thread']['title']\
                                                     .replace('|', '-'),
                                                 dict['text']\
                                                     .replace('\n', '')\
                                                     .replace('\r', '')\
                                                     .replace('|', '-'),
                                                 dict['thread']['site'],
                                                 dict['thread']['site_section']\
                                                     .replace('\n', '')\
                                                     .replace('\r', ''),
                                                 dict['url'],
                                                 dict['published']])
                                a += 1
                            # handle undefined characters (videos and other spam)
                            except UnicodeEncodeError:
                                print('# filtered out: {} (UnicodeEncodeError)'
                                      .format(dict['thread']['site_section']))
            print('# saved articles in file {}, now {} in total'
                  .format(output_file, a))
        print('#')
        print('# saved {} articles in total'.format(a))
        print('#')

if __name__ == '__main__':
    JsonHandler.write_articles_to_csv_files()
    #JsonHandler.create_labeling_dataset()
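
For orientation, a sketch of how the monthly output could be read back and sampled. The read_csv parameters mirror those used in create_labeling_dataset, and pandas' built-in df.sample is roughly equivalent to select_randoms; the file name all_01.csv is just one instance of the pattern the writer produces:

import pandas as pd

df = pd.read_csv('all_01.csv',
                 delimiter='|',
                 header=0,
                 index_col=None,
                 engine='python',
                 quotechar='\'',
                 quoting=0,
                 encoding='utf-8')

# fixed random_state => reproducible selection, like np.random.seed(5)
sample = df.sample(n=833, random_state=5)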

sections.txt (new file, 88 lines)

@@ -0,0 +1,88 @@
http://feeds.reuters.com/Reuters/UKBusinessNews?format=xml
http://in.reuters.com/finance/economy
http://feeds.reuters.com/reuters/financialsNews
http://in.reuters.com/finance/deals
http://feeds.reuters.com/reuters/INbusinessNews
http://www.theguardian.com/business/rss
http://feeds.reuters.com/reuters/businessNews
http://feeds.reuters.com/reuters/mergersNews
http://feeds.reuters.com/reuters/industrialsNews
http://feeds.reuters.com/reuters/UKBusinessNews/
http://www.ft.com/rss/indepth/investmentbanking/deal
http://feeds.guardian.co.uk/theguardian/business/uk-edition/rss
http://feeds.reuters.com/reuters/companyNews
http://www.ft.com/rss/companies/us
http://rss.cnn.com/rss/edition_business.rss
http://www.ft.com/rss/lex
http://feeds.reuters.com/reuters/businessNews?format=xml
http://www.reuters.com/finance/deals
http://www.ft.com/rss/companies/chemicals
https://www.theguardian.com/uk/business
http://www.ft.com/rss/companies/asia-pacific
http://in.reuters.com/finance/markets/companyOutlooksNews
http://www.ft.com/rss/companies/financials
http://www.ft.com/rss/companies/industrials
http://www.ft.com/rss/companies/uk
http://www.ft.com/rss/companies/rail
https://www.theguardian.com/business/all
http://www.ft.com/rss/companies
http://www.ft.com/rss/companies/banks
http://feeds.reuters.com/news/deals
http://in.reuters.com/finance
http://www.ft.com/rss/companies/airlines
http://www.ft.com/rss/companies/asiapacific
http://www.ft.com/rss/companies/financial-services
http://www.ft.com/rss/companies/retail
http://www.ft.com/rss/companies/europe
http://www.ft.com/rss/companies/property
http://www.ft.com/rss/companies/utilities
http://rss.cnn.com/rss/money_news_companies.rss
http://www.ft.com/rss/world/uk/business
http://www.ft.com/rss/companies/transport
http://www.ft.com/rss/companies/retail-consumer
http://www.ft.com/rss/companies/energy
http://www.ft.com/rss/companies/mining
http://www.reuters.com/finance
http://www.ft.com/rss/companies/automobiles
http://www.ft.com/rss/companies/basic-resources
http://www.ft.com/rss/companies/technology
http://www.ft.com/rss/companies/construction
http://www.ft.com/rss/companies/health
https://www.theguardian.com/media/mediabusiness
http://www.theguardian.com/business/tesco/rss
http://www.theguardian.com/business/oil/rss
http://www.ft.com/rss/companies/aerospace-defence
http://www.ft.com/rss/companies/travel-leisure
http://www.ft.com/rss/companies/oil-gas
http://www.theguardian.com/business/morrisons/rss
http://www.ft.com/rss/companies/telecoms
http://www.ft.com/rss/companies/personal-goods
http://www.ft.com/rss/companies/pharmaceuticals
http://www.ft.com/rss/in-depth/initial-public-offering
http://rss.cnn.com/rss/money_news_economy.rss
http://www.ft.com/rss/companies/insurance
http://www.ft.com/rss/companies/support-services
http://www.guardian.co.uk/business/economics/rss
http://www.economist.com/sections/business-finance/rss.xml
http://www.guardian.co.uk/theobserver/news/business/rss
http://www.ft.com/rss/companies/healthcare
https://www.bloomberg.com/businessweek
http://www.theguardian.com/business/retail/rss
http://rss.cnn.com/rss/money_technology.rss
http://www.economist.com/rss/business_rss.xml
http://www.theguardian.com/business/unilever/rss
https://www.theguardian.com/business/eurozone
https://www.theguardian.com/business/economics
http://www.economist.com/rss/briefings_rss.xml
http://www.theguardian.com/business/euro/rss
http://www.reuters.com/finance/summits
http://rss.ft.com/rss/companies/banks
http://in.reuters.com/finance/summits
http://www.theguardian.com/business/ryanair/rss
http://www.theguardian.com/business/deloitte/rss
https://in.reuters.com/finance/deals
https://in.reuters.com/finance
https://www.reuters.com/finance/deals
https://www.reuters.com/finance
https://in.reuters.com/finance/economy
https://in.reuters.com/finance/markets/companyOutlooksNews