refactoring
This commit is contained in: parent cbfbdffdb7, commit 0c15d49d7e
@ -4,23 +4,17 @@
 Bag Of Words
 ============
 
-BagOfWords counts word stems in an article
-and adds new words to the global vocabulary.
-
-note:
-The multinomial Naive Bayes classifier is suitable
-for classification with discrete features (e.g.,
-word counts for text classification).
-The multinomial distribution normally requires
-integer feature counts. However, in practice,
-fractional counts such as tf-idf may also work.
-=> considered by 'relative_word_frequencies' as parameter
+BagOfWords counts word stems in an article and adds new words to the global
+vocabulary. The multinomial Naive Bayes classifier is suitable for
+classification with discrete features (e.g., word counts for text
+classification). The multinomial distribution normally requires integer
+feature counts; in practice, however, fractional counts such as tf-idf may
+also work, which is handled by the 'relative_word_frequencies' parameter.
 '''
 
 import re
 
 import pandas as pd
 
 from nltk.stem.porter import PorterStemmer
 
 class BagOfWords:
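Aside: a minimal sketch (not part of this commit) of the point made in the
docstring above, namely that multinomial Naive Bayes also accepts fractional
features such as relative word frequencies. It assumes scikit-learn is
installed; the toy data is made up.

import numpy as np
from sklearn.naive_bayes import MultinomialNB

counts = np.array([[3., 0., 1.],     # word counts of two toy articles
                   [0., 2., 2.]])    # over a three-word vocabulary (made up)
labels = [0, 1]

# relative word frequencies: each row sums to 1, values in [0, 1]
frequencies = counts / counts.sum(axis=1, keepdims=True)

clf = MultinomialNB()
clf.fit(frequencies, labels)    # fractional (float) features are accepted
print(clf.predict(frequencies))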
@ -35,6 +29,7 @@ class BagOfWords:
         '''takes article as argument, removes numbers,
         returns list of single words, recurrences included.
         '''
+        stemmer = PorterStemmer()
         stop_words = BagOfWords.set_stop_words()
         # replace punctuation marks with spaces
         words = re.sub(r'\W', ' ', text)
@ -43,30 +38,21 @@ class BagOfWords:
         # list of all words to return
         words_cleaned = []
         for word in words:
-            # leave out numbers
+            # check if alphabetic char
             if word.isalpha():
-                # reduce word to stem
-                word = BagOfWords.reduce_word_to_stem(word)
-                # check if not stop word
-                if word.lower() not in stop_words:
-                    # add every word in lowercase
-                    words_cleaned.append(word.lower())
+                # reduce word in lower case to stem
+                word = stemmer.stem(word.lower())
+                if word not in stop_words:
+                    words_cleaned.append(word)
         return words_cleaned
 
-    def reduce_word_to_stem(word):
-        '''takes normal word as input, returns the word's stem
-        '''
-        stemmer = PorterStemmer()
-        # replace word by its stem
-        word = stemmer.stem(word)
-        return word
-
     def make_matrix(series, vocab, relative_word_frequencies=True):
-        '''calculates word stem frequencies in input articles.
-        returns matrix (DataFrame) with relative word frequencies
-        (0 <= values < 1) if relative_word_frequencies=True or absolute
-        word frequencies (int) if relative_word_frequencies=False.
-        (rows: different articles, colums: different words in vocab)
+        '''calculates word stem frequencies in input articles. returns matrix
+        (DataFrame) with relative word frequencies (0 <= values < 1) if
+        relative_word_frequencies=True, or absolute word frequencies (int) if
+        relative_word_frequencies=False. (rows: different articles, columns:
+        different words in vocab)
         '''
         print('# BOW: calculating matrix')
         print('# ...')
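Aside: a minimal sketch (not part of this commit) of what the
relative_word_frequencies switch amounts to for a single article; the
vocabulary and stems below are made up.

import pandas as pd

vocab = ['bank', 'deal', 'merger']     # toy vocabulary (assumed)
stems = ['deal', 'bank', 'deal']       # stems extracted from one toy article

absolute = [stems.count(v) for v in vocab]       # [1, 2, 0]
relative = [c / len(stems) for c in absolute]    # [0.33..., 0.66..., 0.0]

# one row per article, one column per vocab word, as in make_matrix
df = pd.DataFrame([relative], columns=vocab)
print(df)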
@ -90,7 +76,6 @@ class BagOfWords:
                 else:
                     # absolute word frequency
                     vector[i] += 1
 
-            # add single vector as tuple
             vectors.append(tuple(vector))
         df_vectors = pd.DataFrame.from_records(vectors,
@ -109,10 +94,10 @@ class BagOfWords:
         for text in series:
             # add single article's text to total vocabulary
             vocab |= set(BagOfWords.extract_words(text))
-        # transform to list
-        vocab = list(vocab)
-        # sort list
-        vocab.sort()
+        # # transform to list
+        # vocab = list(vocab)
+        # # sort list
+        # vocab.sort()
         return vocab
 
     def set_stop_words():
@ -151,20 +136,17 @@ class BagOfWords:
 
         #add unwanted terms
         stop_words.extend(['reuters', 'bloomberg', 'cnn', 'economist'])
-        #remove the word 'not' from stop words
+
+        # #remove the word 'not' from stop words?
+        # stop_words.remove('not')
 
+        stemmer = PorterStemmer()
         for i in range(len(stop_words)):
 
             # remove punctuation marks and strip endings from abbreviations
             #stop_words[i] = re.split(r'\W', stop_words[i])[0]
 
-            # reduce word to stem
-            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
+            # reduce stop words to stem
+            stop_words[i] = stemmer.stem(stop_words[i])
         # transform list to set to eliminate duplicates
         stop_words = set(stop_words)
-        return set(stop_words)
+        return stop_words
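Aside (illustrative, not part of this commit): stemming the stop words keeps
the membership test in extract_words consistent with the already-stemmed
article words. Two examples with the PorterStemmer imported above:

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
print(stemmer.stem('having'))   # 'have'   -- matches a stemmed article token
print(stemmer.stem('because'))  # 'becaus' -- raw 'because' would never match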
 
 if __name__ == '__main__':
     test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
                       EU approval - sources. BRUSSELS (Reuters) - U.S. software
 
@ -189,4 +171,5 @@ if __name__ == '__main__':
                       request for immediate comment. Microsoft declined to
                       comment. Reporting by Foo Yun Chee; editing by Jason
                       Neely'''
 
     print(BagOfWords.extract_words(test_article))
JSONHandler.py (133 lines)
@ -1,133 +0,0 @@
'''
JSON Handler
============

JSON Handler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''

# -*- coding: utf-8 -*-

import csv
import glob
import json

import numpy as np
import pandas as pd

class JsonHandler:

    def select_randoms(df, n):
        '''selects n random samples from dataset.
        params: df DataFrame to select items from,
                n number of items to select randomly,
        returns new DataFrame with only selected items
        '''

        # initialize random => reproducible sequence
        np.random.seed(5)
        # add new column 'Random'
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by random numbers
        df = df.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return df.iloc[0:n]

    def create_csv(file_name):
        # create new csv file for each month.
        # each row contains a news article.

        with open(file_name, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # write header / column names
            writer.writerow(['Uuid',        #0
                             'Title',       #1
                             'Text',        #2
                             'Site',        #3
                             'SiteSection', #4
                             'Url',         #5
                             'Timestamp'])  #6
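Aside (illustrative, not part of this commit): the csv files written here use
'|' as delimiter and a single quote as quotechar, so they must be read back
with the same dialect. A minimal round trip; the file name is made up.

import csv
import pandas as pd

with open('dialect_demo.csv', 'w', newline='') as f:    # hypothetical file
    writer = csv.writer(f, delimiter='|', quotechar='\'',
                        quoting=csv.QUOTE_NONNUMERIC)
    writer.writerow(['Title', 'Text'])
    writer.writerow(['GitHub deal', 'U.S. software maker ...'])

# read back with the same delimiter and quotechar
df = pd.read_csv('dialect_demo.csv', delimiter='|', quotechar='\'')
print(df)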

    def write_articles_to_csv(file_name):
        # path of JSON files
        path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
               '\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4'\
               '\\news_[0-9]*.json'
        files = glob.glob(path)

        # reliable sources (site_sections)
        site_sections = ['http://feeds.reuters.com/reuters/financialsNews',
                         'http://feeds.reuters.com/reuters/INbusinessNews',
                         'http://feeds.reuters.com/reuters/businessNews',
                         'http://feeds.reuters.com/reuters/companyNews',
                         'http://www.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/mergersNews',
                         'http://rss.cnn.com/rss/money_topstories.rss',
                         'http://rss.cnn.com/rss/money_latest.rss',
                         'http://www.economist.com/sections/business-finance/rss.xml',
                         'http://rss.cnn.com/rss/edition_business.rss',
                         'http://in.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/technologyNews',
                         'http://feeds.reuters.com/reuters/technologysectorNews',
                         'https://www.ft.com/companies/us',
                         'http://feeds.reuters.com/reuters/UKScienceNews',
                         'http://in.reuters.com/news/technology',
                         'http://in.reuters.com/finance/economy',
                         'https://www.bloomberg.com/middleeast',
                         'http://in.reuters.com/news/top-news']

        # file counter
        n = 0
        # article counter
        a = 0
        # read every JSON file in current folder
        with open(file_name, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            for file in files:
                n += 1
                with open(file, encoding='utf-8') as f:
                    # JSON is converted to dict
                    dict = json.load(f)
                    #print(n)
                    # leave out comments or posts, take only reuters as source
                    if ((dict['ord_in_thread'] != 0) or
                        (dict['language'] != 'english') or
                        (dict['thread']['spam_score'] > 0.3) or
                        (dict['thread']['site_section'] not in site_sections)):
                        continue
                    # pick only relevant information of article
                    # and put it in a list
                    article = [dict['thread']['uuid'],          # 0:'Uuid'
                               dict['thread']['title'],         # 1:'Title'
                               dict['text'],                    # 2:'Text'
                               dict['thread']['site'],          # 3:'Site'
                               dict['thread']['site_section'],  # 4:'SiteSection'
                               dict['url'],                     # 5:'Url'
                               dict['published']]               # 6:'Timestamp'

                    # remove newlines and delimiter chars
                    article[1] = article[1].replace('|', '-')
                    article[2] = article[2].replace('\n', ' ')\
                                           .replace('\r', ' ').replace('|', '-')

                    try:
                        writer.writerow(article)
                        a += 1
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out: {} (UnicodeEncodeError)'
                              .format(dict['thread']['site_section']))
        print()
        print('# saved {} articles in file {}'.format(a, file_name))

if __name__ == '__main__':
    file_name = 'test.csv'
    JsonHandler.create_csv(file_name)
    JsonHandler.write_articles_to_csv(file_name)
@ -0,0 +1,152 @@
'''
Json Handler
============

JsonHandler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''

# -*- coding: utf-8 -*-

import csv
import glob
import json

import numpy as np
import pandas as pd

class JsonHandler:

    # string for every month of the year
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
              '11', '12']

    def select_randoms(df, n):
        '''select n random samples from dataset.
        params: df DataFrame to select items from,
                n number of items to select randomly,
        return new DataFrame with only selected items.
        '''

        # initialize random => reproducible sequence
        np.random.seed(5)
        # add new column 'Random'
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by random numbers
        df = df.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return df.iloc[0:n]
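Aside (illustrative, not part of this commit): because the random sequence is
seeded, select_randoms picks the same rows on every run. The toy DataFrame
below is made up; pandas' df.sample with a fixed random_state gives the same
kind of reproducible selection in one call.

import numpy as np
import pandas as pd

df = pd.DataFrame({'Title': ['a', 'b', 'c', 'd']})   # toy data (made up)

np.random.seed(5)
df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
picked = df.sort_values('Random').iloc[0:2]
print(picked)                           # identical rows on every run

# same reproducibility idea, in one call
print(df.sample(n=2, random_state=5))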

    def create_labeling_dataset():
        # number of articles to select from each month:
        # 10,000 / 12 = 833.33
        # so one more is taken in every third month:
        every_third_month = ['03', '06', '09', '12']
        for m in JsonHandler.months:
            df = pd.read_csv('all_{}.csv'.format(m),
                             delimiter='|',
                             header=0,
                             index_col=None,
                             engine='python',
                             quotechar='\'',
                             quoting=0,
                             encoding='utf-8')
            # pick one more from every third month
            if m in every_third_month:
                n_select = 834
            else:
                n_select = 833
            JsonHandler.select_randoms(df, n_select)\
                       .to_csv('labeling_dataset.csv',
                               header=True,
                               mode='a',
                               encoding='utf-8',
                               quoting=csv.QUOTE_MINIMAL,
                               quotechar='\'')
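Aside: a quick check (not part of this commit) that the per-month quotas above
add up to the intended 10,000 articles.

months = ['01', '02', '03', '04', '05', '06',
          '07', '08', '09', '10', '11', '12']
every_third_month = ['03', '06', '09', '12']

total = sum(834 if m in every_third_month else 833 for m in months)
print(total)   # 8 * 833 + 4 * 834 = 10000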

    def write_articles_to_csv_files():
        '''read JSON files, select articles and write them to csv.
        '''
        # reliable sources (site_sections)
        site_sections = []
        # read list from 'sections.txt' file
        with open('sections.txt', 'r') as s_list:
            site_sections = s_list.read().split('\n')

        # article counter
        a = 0
        for m in JsonHandler.months:
            # 1 output file per month
            output_file = 'all_{}.csv'.format(m)
            # path of input JSON files per month
            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
                   '\\news_[0-9]*.json'.format(m)
            files = glob.glob(path)

            # file counter
            n = 0
            # write separate csv file for every month
            with open(output_file, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile,
                                    delimiter='|',
                                    quotechar='\'',
                                    quoting=csv.QUOTE_NONNUMERIC)

                # write header / column names
                writer.writerow(['Uuid',        #0
                                 'Title',       #1
                                 'Text',        #2
                                 'Site',        #3
                                 'SiteSection', #4
                                 'Url',         #5
                                 'Timestamp'])  #6
                # write articles
                for file in files:
                    n += 1
                    # read every JSON file
                    with open(file, encoding='utf-8') as f:
                        # JSON is converted to dict
                        dict = json.load(f)
                        # check if comment or post
                        if ((dict['ord_in_thread'] != 0) or
                            # check if not english
                            (dict['language'] != 'english') or
                            # check if spam
                            (dict['thread']['spam_score'] > 0.3) or
                            # check if reliable source
                            (dict['thread']['site_section'] not in site_sections) or
                            # check if text parsed correctly
                            ('Further company coverage:' in dict['text']) or
                            ('subscription' in dict['text']) or
                            ('subscribe' in dict['text']) or
                            (len(dict['text']) < 300)):
                            continue
                        else:
                            try:
                                # remove newlines and delimiter chars
                                # and write to csv
                                writer.writerow([dict['thread']['uuid'],
                                                 dict['thread']['title']\
                                                     .replace('|', '-'),
                                                 dict['text']\
                                                     .replace('\n', '')\
                                                     .replace('\r', '')\
                                                     .replace('|', '-'),
                                                 dict['thread']['site'],
                                                 dict['thread']['site_section']\
                                                     .replace('\n', '')\
                                                     .replace('\r', ''),
                                                 dict['url'],
                                                 dict['published']])
                                a += 1
                            # handle undefined characters (videos and other spam)
                            except UnicodeEncodeError:
                                print('# filtered out: {} (UnicodeEncodeError)'
                                      .format(dict['thread']['site_section']))
            print('# saved articles in file {}, now {} in total'
                  .format(output_file, a))
        print('#')
        print('# saved {} articles in total'.format(a))
        print('#')

if __name__ == '__main__':
    JsonHandler.write_articles_to_csv_files()
    #JsonHandler.create_labeling_dataset()
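Aside (illustrative): a filter like the one above must spell out each
substring test separately, because in Python ('subscription' or 'subscribe')
evaluates to just 'subscription', the first truthy operand, so the second
word would silently never be tested.

text = 'click subscribe to continue'
print(('subscription' or 'subscribe') in text)          # False: tests only 'subscription'
print('subscription' in text or 'subscribe' in text)    # True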

@ -0,0 +1,88 @@
http://feeds.reuters.com/Reuters/UKBusinessNews?format=xml
http://in.reuters.com/finance/economy
http://feeds.reuters.com/reuters/financialsNews
http://in.reuters.com/finance/deals
http://feeds.reuters.com/reuters/INbusinessNews
http://www.theguardian.com/business/rss
http://feeds.reuters.com/reuters/businessNews
http://feeds.reuters.com/reuters/mergersNews
http://feeds.reuters.com/reuters/industrialsNews
http://feeds.reuters.com/reuters/UKBusinessNews/
http://www.ft.com/rss/indepth/investmentbanking/deal
http://feeds.guardian.co.uk/theguardian/business/uk-edition/rss
http://feeds.reuters.com/reuters/companyNews
http://www.ft.com/rss/companies/us
http://rss.cnn.com/rss/edition_business.rss
http://www.ft.com/rss/lex
http://feeds.reuters.com/reuters/businessNews?format=xml
http://www.reuters.com/finance/deals
http://www.ft.com/rss/companies/chemicals
https://www.theguardian.com/uk/business
http://www.ft.com/rss/companies/asia-pacific
http://in.reuters.com/finance/markets/companyOutlooksNews
http://www.ft.com/rss/companies/financials
http://www.ft.com/rss/companies/industrials
http://www.ft.com/rss/companies/uk
http://www.ft.com/rss/companies/rail
https://www.theguardian.com/business/all
http://www.ft.com/rss/companies
http://www.ft.com/rss/companies/banks
http://feeds.reuters.com/news/deals
http://in.reuters.com/finance
http://www.ft.com/rss/companies/airlines
http://www.ft.com/rss/companies/asiapacific
http://www.ft.com/rss/companies/financial-services
http://www.ft.com/rss/companies/retail
http://www.ft.com/rss/companies/europe
http://www.ft.com/rss/companies/property
http://www.ft.com/rss/companies/utilities
http://rss.cnn.com/rss/money_news_companies.rss
http://www.ft.com/rss/world/uk/business
http://www.ft.com/rss/companies/transport
http://www.ft.com/rss/companies/retail-consumer
http://www.ft.com/rss/companies/energy
http://www.ft.com/rss/companies/mining
http://www.reuters.com/finance
http://www.ft.com/rss/companies/automobiles
http://www.ft.com/rss/companies/basic-resources
http://www.ft.com/rss/companies/technology
http://www.ft.com/rss/companies/construction
http://www.ft.com/rss/companies/health
https://www.theguardian.com/media/mediabusiness
http://www.theguardian.com/business/tesco/rss
http://www.theguardian.com/business/oil/rss
http://www.ft.com/rss/companies/aerospace-defence
http://www.ft.com/rss/companies/travel-leisure
http://www.ft.com/rss/companies/oil-gas
http://www.theguardian.com/business/morrisons/rss
http://www.ft.com/rss/companies/telecoms
http://www.ft.com/rss/companies/personal-goods
http://www.ft.com/rss/companies/pharmaceuticals
http://www.ft.com/rss/in-depth/initial-public-offering
http://rss.cnn.com/rss/money_news_economy.rss
http://www.ft.com/rss/companies/insurance
http://www.ft.com/rss/companies/support-services
http://www.guardian.co.uk/business/economics/rss
http://www.economist.com/sections/business-finance/rss.xml
http://www.guardian.co.uk/theobserver/news/business/rss
http://www.ft.com/rss/companies/healthcare
https://www.bloomberg.com/businessweek
http://www.theguardian.com/business/retail/rss
http://rss.cnn.com/rss/money_technology.rss
http://www.economist.com/rss/business_rss.xml
http://www.theguardian.com/business/unilever/rss
https://www.theguardian.com/business/eurozone
https://www.theguardian.com/business/economics
http://www.economist.com/rss/briefings_rss.xml
http://www.theguardian.com/business/euro/rss
http://www.reuters.com/finance/summits
http://rss.ft.com/rss/companies/banks
http://in.reuters.com/finance/summits
http://www.theguardian.com/business/ryanair/rss
http://www.theguardian.com/business/deloitte/rss
https://in.reuters.com/finance/deals
https://in.reuters.com/finance
https://www.reuters.com/finance/deals
https://www.reuters.com/finance
https://in.reuters.com/finance/economy
https://in.reuters.com/finance/markets/companyOutlooksNews
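Aside (illustrative, not part of this commit): write_articles_to_csv_files
matches site_section against this list with an exact string comparison, which
is why http and https variants of the same feed both appear above. A minimal
check, assuming the list is saved as 'sections.txt' as the code expects:

with open('sections.txt', 'r') as s_list:
    site_sections = s_list.read().split('\n')

# exact match: the http and https variants are distinct entries
print('http://in.reuters.com/finance/deals' in site_sections)   # True
print('https://in.reuters.com/finance/deals' in site_sections)  # True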