From 0c15d49d7ecf6d85ad03eaed655439d809d0e396 Mon Sep 17 00:00:00 2001
From: Anne Lorenz
Date: Mon, 22 Oct 2018 10:17:52 +0200
Subject: [PATCH] refactoring

---
 BagOfWords.py  |  75 ++++++++++----------------
 JSONHandler.py | 133 ------------------------------------------
 JsonHandler.py | 152 +++++++++++++++++++++++++++++++++++++++++++++++++
 sections.txt   |  88 ++++++++++++++++++++++++++++
 4 files changed, 269 insertions(+), 179 deletions(-)
 delete mode 100644 JSONHandler.py
 create mode 100644 JsonHandler.py
 create mode 100644 sections.txt

diff --git a/BagOfWords.py b/BagOfWords.py
index a31822b..8a1fd2a 100644
--- a/BagOfWords.py
+++ b/BagOfWords.py
@@ -4,23 +4,17 @@
 Bag Of Words
 ============
 
-BagOfWords counts word stems in an article
-and adds new words to the global vocabulary.
-
-note:
-The multinomial Naive Bayes classifier is suitable
-for classification with discrete features (e.g.,
-word counts for text classification).
-The multinomial distribution normally requires
-integer feature counts. However, in practice,
-fractional counts such as tf-idf may also work.
-=> considered by 'relative_word_frequencies' as parameter
+BagOfWords counts word stems in an article and adds new words to the global
+vocabulary. The multinomial Naive Bayes classifier is suitable for
+classification with discrete features (e.g., word counts for text
+classification). The multinomial distribution normally requires integer
+feature counts; in practice, however, fractional counts such as tf-idf may
+also work. This is controlled by the 'relative_word_frequencies' parameter.
 '''
 import re
 
 import pandas as pd
-
 from nltk.stem.porter import PorterStemmer
 
 class BagOfWords:
@@ -35,6 +29,7 @@ class BagOfWords:
         '''takes article as argument, removes numbers,
         returns list of single words, recurrences included.
         '''
+        stemmer = PorterStemmer()
         stop_words = BagOfWords.set_stop_words()
         # replace punctuation marks with spaces
         words = re.sub(r'\W', ' ', text)
@@ -43,30 +38,21 @@ class BagOfWords:
         # list of all words to return
         words_cleaned = []
         for word in words:
-            # leave out numbers
+            # check if alphabetic char
             if word.isalpha():
-                # reduce word to stem
-                word = BagOfWords.reduce_word_to_stem(word)
+                # reduce word in lower case to stem
+                word = stemmer.stem(word.lower())
                 # check if not stop word
-                if word.lower() not in stop_words:
-                    # add every word in lowercase
-                    words_cleaned.append(word.lower())
+                if word not in stop_words:
+                    words_cleaned.append(word)
         return words_cleaned
 
-    def reduce_word_to_stem(word):
-        '''takes normal word as input, returns the word's stem
-        '''
-        stemmer = PorterStemmer()
-        # replace word by its stem
-        word = stemmer.stem(word)
-        return word
-
     def make_matrix(series, vocab, relative_word_frequencies=True):
-        '''calculates word stem frequencies in input articles.
-        returns matrix (DataFrame) with relative word frequencies
-        (0 <= values < 1) if relative_word_frequencies=True or absolute
-        word frequencies (int) if relative_word_frequencies=False.
-        (rows: different articles, colums: different words in vocab)
+        '''calculates word stem frequencies in input articles. returns
+        matrix (DataFrame) with relative word frequencies (0 <= values < 1)
+        if relative_word_frequencies=True or absolute word frequencies (int)
+        if relative_word_frequencies=False. (rows: different articles,
+        columns: different words in vocab)
         '''
         print('# BOW: calculating matrix')
         print('# ...')
@@ -90,7 +76,6 @@ class BagOfWords:
                 else:
                     # absolute word frequency
                     vector[i] += 1
-
             # add single vector as tuple
             vectors.append(tuple(vector))
         df_vectors = pd.DataFrame.from_records(vectors,
@@ -109,10 +94,10 @@ class BagOfWords:
         for text in series:
             # add single article's text to total vocabulary
             vocab |= set(BagOfWords.extract_words(text))
-        # transform to list
-        vocab = list(vocab)
-        # sort list
-        vocab.sort()
+        # # transform to list
+        # vocab = list(vocab)
+        # # sort list
+        # vocab.sort()
         return vocab
 
     def set_stop_words():
@@ -151,20 +136,17 @@ class BagOfWords:
         #add unwanted terms
         stop_words.extend(['reuters', 'bloomberg', 'cnn', 'economist'])
 
-        #remove the word 'not' from stop words
-        #stop_words.remove('not')
+        # #remove the word 'not' from stop words?
+        # stop_words.remove('not')
+
+        stemmer = PorterStemmer()
 
         for i in range(len(stop_words)):
-
-            # remove punctuation marks and strip endings from abbreviations
-            #stop_words[i] = re.split(r'\W', stop_words[i])[0]
-
-            # reduce word to stem
-            stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
+            # reduce stop words to stem
+            stop_words[i] = stemmer.stem(stop_words[i])
 
         # transform list to set to eliminate duplicates
-        stop_words = set(stop_words)
+        return set(stop_words)
-        return stop_words
 
 if __name__ == '__main__':
     test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set
        for EU approval - sources. BRUSSELS (Reuters) - U.S. software
        company Microsoft (MSFT.O) is set to win unconditional EU
        antitrust approval for its $7.5 billion purchase of privately
        held coding website GitHub, two people familiar with the matter
        said on Monday. Microsoft announced the deal in June, its
        largest acquisition since it bought LinkedIn for $26 billion in
        2016. The GitHub deal is expected to boost the U.S. software
        giant's cloud computing business and challenge market leader
        Amazon (AMZN.O). GitHub, the world's largest code host with more
        than 28 million developers using its platform, is Microsoft's
        largest takeover since the company bought LinkedIn for $26.2
        billion in 2016. Microsoft Chief Executive Satya Nadella has
        been trying to refocus the software maker into a nimbler
        cloud-first and mobile-first company, moves which have revived
        its fortunes. The European Commission, which is set to decide on
        the deal by Oct. 19, did not respond to a request for immediate
        comment. Microsoft declined to comment. Reporting by Foo Yun
        Chee; editing by Jason Neely'''
+    print(BagOfWords.extract_words(test_article))
\ No newline at end of file
diff --git a/JSONHandler.py b/JSONHandler.py
deleted file mode 100644
index dc253a4..0000000
--- a/JSONHandler.py
+++ /dev/null
@@ -1,133 +0,0 @@
-'''
-JSON Handler
-============
-
-JSON Handler reads articles from JSON files,
-extracts relevant information and
-writes it to a csv file.
-'''
-
-# -*- coding: utf-8 -*-
-
-import csv
-import glob
-import json
-
-import numpy as np
-import pandas as pd
-
-class JsonHandler:
-
-    def select_randoms(df, n):
-        '''selects n random samples from dataset.
-        params: df DataFrame to select items from,
-                n number of items to select randomly,
-        returns new DataFrame with only selected items
-        '''
-
-        # initialize random => reproducible sequence
-        np.random.seed(5)
-        # add new column 'Random'
-        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
-        # sort DataFrame by random numbers
-        df = df.sort_values('Random')
-        # return first n elements of randomly sorted dataset
-        return df.iloc[0:n]
-
-    def create_csv(file_name):
-        # create new csv file for each month.
-        # each row contains an news article.
-
-        with open(file_name, 'w', newline='') as csvfile:
-            writer = csv.writer(csvfile,
-                                delimiter='|',
-                                quotechar='\'',
-                                quoting=csv.QUOTE_NONNUMERIC)
-            # write header / column names
-            writer.writerow(['Uuid',        #0
-                             'Title',       #1
-                             'Text',        #2
-                             'Site',        #3
-                             'SiteSection', #4
-                             'Url',         #5
-                             'Timestamp'])  #6
-
-    def write_articles_to_csv(file_name):
-        # path of JSON files
-        path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
-               '\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4'\
-               '\\news_[0-9]*.json'
-        files = glob.glob(path)
-
-        # reliable sources (site_sections)
-        site_sections = ['http://feeds.reuters.com/reuters/financialsNews',
-                         'http://feeds.reuters.com/reuters/INbusinessNews',
-                         'http://feeds.reuters.com/reuters/businessNews',
-                         'http://feeds.reuters.com/reuters/companyNews',
-                         'http://www.reuters.com/finance/deals',
-                         'http://feeds.reuters.com/reuters/mergersNews',
-                         'http://rss.cnn.com/rss/money_topstories.rss',
-                         'http://rss.cnn.com/rss/money_latest.rss',
-                         'http://www.economist.com/sections/business-finance/rss.xml',
-                         'http://rss.cnn.com/rss/edition_business.rss',
-                         'http://in.reuters.com/finance/deals',
-                         'http://feeds.reuters.com/reuters/technologyNews',
-                         'http://feeds.reuters.com/reuters/technologysectorNews',
-                         'https://www.ft.com/companies/us',
-                         'http://feeds.reuters.com/reuters/UKScienceNews',
-                         'http://in.reuters.com/news/technology',
-                         'http://in.reuters.com/finance/economy',
-                         'https://www.bloomberg.com/middleeast',
-                         'http://in.reuters.com/news/top-news']
-
-        # file counter
-        n = 0
-        # article counter
-        a = 0
-        # read every JSON file in current folder
-        with open(file_name, 'a', newline='') as csvfile:
-            writer = csv.writer(csvfile,
-                                delimiter='|',
-                                quotechar='\'',
-                                quoting=csv.QUOTE_NONNUMERIC)
-            for file in files:
-                n += 1
-                with open(file, encoding='utf-8') as f:
-                    # Json is converted to dict
-                    dict = json.load(f)
-                    #print(n)
-                    # leave out comments or posts, take only reuters as source
-                    if ((dict['ord_in_thread'] != 0) or
-                        (dict['language'] != 'english') or
-                        (dict['thread']['spam_score'] > 0.3) or
-                        (dict['thread']['site_section'] not in site_sections)):
-                        continue
-                    # pick only relevant information of article
-                    # and put in in list
-                    article = [dict['thread']['uuid'],          # 0:'Uuid'
-                               dict['thread']['title'],         # 1:'Title'
-                               dict['text'],                    # 2:'Text'
-                               dict['thread']['site'],          # 3:'Site'
-                               dict['thread']['site_section'],  # 4:'SiteSection'
-                               dict['url'],                     # 5:'Url'
-                               dict['published']]               # 6:'Timestamp'
-
-                    # remove newlines and delimiter chars
-                    article[1] = article[1].replace('|', '-')
-                    article[2] = article[2].replace('\n', ' ')\
-                                           .replace('\r', ' ').replace('|', '-')
-
-                    try:
-                        writer.writerow(article)
-                        a += 1
-                    # handle undefined characters (videos and other spam)
-                    except UnicodeEncodeError:
-                        print('# filtered out: {} (UnicodeEncodeError)'
-                              .format(dict['thread']['site_section']))
-        print()
-        print('# saved {} articles in file {}'.format(a, file_name))
-
-if __name__ == '__main__':
-    file_name = 'test.csv'
-    JsonHandler.create_csv(file_name)
-    JsonHandler.write_articles_to_csv(file_name)
\ No newline at end of file
diff --git a/JsonHandler.py b/JsonHandler.py
new file mode 100644
index 0000000..ede0752
--- /dev/null
+++ b/JsonHandler.py
@@ -0,0 +1,152 @@
+'''
+Json Handler
+============
+
+JsonHandler reads articles from JSON files,
+extracts relevant information and
+writes it to a csv file.
+'''
+
+# -*- coding: utf-8 -*-
+
+import csv
+import glob
+import json
+
+import numpy as np
+import pandas as pd
+
+class JsonHandler:
+
+    # one string for every month of the year
+    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
+              '11', '12']
+
+    def select_randoms(df, n):
+        '''select n random samples from dataset.
+        params: df DataFrame to select items from,
+                n number of items to select randomly,
+        return new DataFrame with only selected items.
+        '''
+
+        # initialize random => reproducible sequence
+        np.random.seed(5)
+        # add new column 'Random'
+        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
+        # sort DataFrame by random numbers
+        df = df.sort_values('Random')
+        # return first n elements of randomly sorted dataset
+        return df.iloc[0:n]
+
+    def create_labeling_dataset():
+        # number of articles to select from each month:
+        # 10,000 / 12 = 833.33, so take 833 articles from eight months and
+        # 834 from every third month: 8 * 833 + 4 * 834 = 10,000
+        every_third_month = ['03', '06', '09', '12']
+        for m in JsonHandler.months:
+            # pick one more from every third month
+            n_select = 834 if m in every_third_month else 833
+            df = pd.read_csv('all_{}.csv'.format(m),
+                             delimiter='|',
+                             header=0,
+                             index_col=None,
+                             engine='python',
+                             quotechar='\'',
+                             quoting=csv.QUOTE_MINIMAL,
+                             encoding='utf-8')
+            # append to one file, writing the column header only once
+            JsonHandler.select_randoms(df, n_select)\
+                       .to_csv('labeling_dataset.csv',
+                               header=(m == '01'),
+                               mode='a',
+                               encoding='utf-8',
+                               quoting=csv.QUOTE_MINIMAL,
+                               quotechar='\'')
+
+    def write_articles_to_csv_files():
+        '''read JSON files, select articles and write them to csv.
+        '''
+        # reliable sources (site_sections), read from 'sections.txt':
+        # one URL per line, blank lines skipped defensively
+        with open('sections.txt', 'r') as s_list:
+            site_sections = [line.strip() for line in s_list if line.strip()]
+
+        # article counter
+        a = 0
+        for m in JsonHandler.months:
+            # 1 output file per month
+            output_file = 'all_{}.csv'.format(m)
+            # path of input JSON files per month
+            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
+                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
+                   '\\news_[0-9]*.json'.format(m)
+            files = glob.glob(path)
+
+            # file counter
+            n = 0
+            # write separate csv file for every month
+            with open(output_file, 'w', newline='') as csvfile:
+                writer = csv.writer(csvfile,
+                                    delimiter='|',
+                                    quotechar='\'',
+                                    quoting=csv.QUOTE_NONNUMERIC)
+
+                # write header / column names
+                writer.writerow(['Uuid',        #0
+                                 'Title',       #1
+                                 'Text',        #2
+                                 'Site',        #3
+                                 'SiteSection', #4
+                                 'Url',         #5
+                                 'Timestamp'])  #6
+                # write articles
+                for file in files:
+                    n += 1
+                    # read every JSON file
+                    with open(file, encoding='utf-8') as f:
+                        # JSON is converted to a dict; named 'data' to avoid
+                        # shadowing the built-in 'dict'
+                        data = json.load(f)
+                        # check if comment or post
+                        if ((data['ord_in_thread'] != 0) or
+                            # check if not english
+                            (data['language'] != 'english') or
+                            # check if spam
+                            (data['thread']['spam_score'] > 0.3) or
+                            # check if reliable source
+                            (data['thread']['site_section'] not in site_sections) or
+                            # check if text parsed correctly; note that
+                            # ('subscription' or 'subscribe') would collapse
+                            # to 'subscription', so test each substring
+                            ('Further company coverage:' in data['text']) or
+                            ('subscription' in data['text']) or
+                            ('subscribe' in data['text']) or
+                            (len(data['text']) < 300)):
+                            continue
+                        else:
+                            try:
+                                # replace whitespaces and delimiter chars
+                                # and write to csv
+                                writer.writerow([data['thread']['uuid'],
+                                                 data['thread']['title']
+                                                     .replace('|', '-'),
+                                                 data['text']
+                                                     .replace('\n', '')
+                                                     .replace('\r', '')
+                                                     .replace('|', '-'),
+                                                 data['thread']['site'],
+                                                 data['thread']['site_section']
+                                                     .replace('\n', '')
+                                                     .replace('\r', ''),
+                                                 data['url'],
+                                                 data['published']])
+                                a += 1
+                            # handle undefined characters (videos and other spam)
+                            except UnicodeEncodeError:
+                                print('# filtered out: {} (UnicodeEncodeError)'
+                                      .format(data['thread']['site_section']))
+            print('# saved articles in file {}, now {} in total'
+                  .format(output_file, a))
+            print('#')
+        print('# saved {} articles in total'.format(a))
+        print('#')
+
+if __name__ == '__main__':
+    JsonHandler.write_articles_to_csv_files()
+    #JsonHandler.create_labeling_dataset()
\ No newline at end of file
diff --git a/sections.txt b/sections.txt
new file mode 100644
index 0000000..8629eeb
--- /dev/null
+++ b/sections.txt
@@ -0,0 +1,88 @@
+http://feeds.reuters.com/Reuters/UKBusinessNews?format=xml
+http://in.reuters.com/finance/economy
+http://feeds.reuters.com/reuters/financialsNews
+http://in.reuters.com/finance/deals
+http://feeds.reuters.com/reuters/INbusinessNews
+http://www.theguardian.com/business/rss
+http://feeds.reuters.com/reuters/businessNews
+http://feeds.reuters.com/reuters/mergersNews
+http://feeds.reuters.com/reuters/industrialsNews
+http://feeds.reuters.com/reuters/UKBusinessNews/
+http://www.ft.com/rss/indepth/investmentbanking/deal
+http://feeds.guardian.co.uk/theguardian/business/uk-edition/rss
+http://feeds.reuters.com/reuters/companyNews
+http://www.ft.com/rss/companies/us
+http://rss.cnn.com/rss/edition_business.rss
+http://www.ft.com/rss/lex
+http://feeds.reuters.com/reuters/businessNews?format=xml
+http://www.reuters.com/finance/deals
+http://www.ft.com/rss/companies/chemicals
+https://www.theguardian.com/uk/business
+http://www.ft.com/rss/companies/asia-pacific
+http://in.reuters.com/finance/markets/companyOutlooksNews
+http://www.ft.com/rss/companies/financials
+http://www.ft.com/rss/companies/industrials
+http://www.ft.com/rss/companies/uk
+http://www.ft.com/rss/companies/rail
+https://www.theguardian.com/business/all
+http://www.ft.com/rss/companies
+http://www.ft.com/rss/companies/banks
+http://feeds.reuters.com/news/deals
+http://in.reuters.com/finance
+http://www.ft.com/rss/companies/airlines
+http://www.ft.com/rss/companies/asiapacific
+http://www.ft.com/rss/companies/financial-services
+http://www.ft.com/rss/companies/retail
+http://www.ft.com/rss/companies/europe
+http://www.ft.com/rss/companies/property
+http://www.ft.com/rss/companies/utilities
+http://rss.cnn.com/rss/money_news_companies.rss
+http://www.ft.com/rss/world/uk/business
+http://www.ft.com/rss/companies/transport
+http://www.ft.com/rss/companies/retail-consumer
+http://www.ft.com/rss/companies/energy
+http://www.ft.com/rss/companies/mining
+http://www.reuters.com/finance
+http://www.ft.com/rss/companies/automobiles
+http://www.ft.com/rss/companies/basic-resources
+http://www.ft.com/rss/companies/technology
+http://www.ft.com/rss/companies/construction
+http://www.ft.com/rss/companies/health
+https://www.theguardian.com/media/mediabusiness
+http://www.theguardian.com/business/tesco/rss
+http://www.theguardian.com/business/oil/rss
+http://www.ft.com/rss/companies/aerospace-defence
+http://www.ft.com/rss/companies/travel-leisure
+http://www.ft.com/rss/companies/oil-gas
+http://www.theguardian.com/business/morrisons/rss
+http://www.ft.com/rss/companies/telecoms
+http://www.ft.com/rss/companies/personal-goods
+http://www.ft.com/rss/companies/pharmaceuticals
+http://www.ft.com/rss/in-depth/initial-public-offering
+http://rss.cnn.com/rss/money_news_economy.rss
+http://www.ft.com/rss/companies/insurance
+http://www.ft.com/rss/companies/support-services
+http://www.guardian.co.uk/business/economics/rss
+http://www.economist.com/sections/business-finance/rss.xml
+http://www.guardian.co.uk/theobserver/news/business/rss
+http://www.ft.com/rss/companies/healthcare
+https://www.bloomberg.com/businessweek
+http://www.theguardian.com/business/retail/rss
+http://rss.cnn.com/rss/money_technology.rss
+http://www.economist.com/rss/business_rss.xml
+http://www.theguardian.com/business/unilever/rss
+https://www.theguardian.com/business/eurozone
+https://www.theguardian.com/business/economics
+http://www.economist.com/rss/briefings_rss.xml
+http://www.theguardian.com/business/euro/rss
+http://www.reuters.com/finance/summits
+http://rss.ft.com/rss/companies/banks
+http://in.reuters.com/finance/summits
+http://www.theguardian.com/business/ryanair/rss
+http://www.theguardian.com/business/deloitte/rss
+https://in.reuters.com/finance/deals
+https://in.reuters.com/finance
+https://www.reuters.com/finance/deals
+https://www.reuters.com/finance
+https://in.reuters.com/finance/economy
+https://in.reuters.com/finance/markets/companyOutlooksNews
\ No newline at end of file
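Note on the reworked BagOfWords module docstring: the point about integer
versus fractional feature counts is easiest to see in use. A minimal usage
sketch, assuming scikit-learn is installed next to this repository; the toy
texts and labels below are hypothetical, only make_vocab and make_matrix
come from this patch:

    import pandas as pd
    from sklearn.naive_bayes import MultinomialNB
    from BagOfWords import BagOfWords

    # hypothetical toy corpus with merger / non-merger labels
    texts = pd.Series(['Microsoft is set to buy GitHub for billions.',
                       'Oil prices slipped again on Monday.'])
    labels = [1, 0]

    # make_vocab now returns a set; fix an ordering for the matrix columns
    vocab = sorted(BagOfWords.make_vocab(texts))
    # absolute integer counts, as the multinomial distribution expects
    X = BagOfWords.make_matrix(texts, vocab,
                               relative_word_frequencies=False)
    # fractional counts (relative frequencies, tf-idf-like) also work
    # in practice: pass relative_word_frequencies=True instead
    MultinomialNB().fit(X, labels)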
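On select_randoms, which the new JsonHandler keeps unchanged: seeding numpy,
attaching a throwaway 'Random' column and sorting by it does give a
reproducible subset, but it also mutates the caller's frame and leaves the
helper column in the returned rows. pandas ships a built-in with the same
effect; a sketch of the equivalent call, assuming the same DataFrame df and
sample size n:

    # reproducible random subset of n rows, no helper column left behind
    sample = df.sample(n=n, random_state=5)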
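Why the subscription filter in write_articles_to_csv_files was split into
two membership tests: 'or' is evaluated before 'in' and returns its first
truthy operand, so ('subscription' or 'subscribe') collapses to
'subscription' and the original condition never checked for 'subscribe'.
A quick demonstration:

    text = 'subscribe here for more'
    ('subscription' or 'subscribe') in text        # False, tests 'subscription' only
    'subscription' in text or 'subscribe' in text  # True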