refactoring

This commit is contained in:
Anne Lorenz 2018-10-22 10:17:52 +02:00
parent cbfbdffdb7
commit 0c15d49d7e
4 changed files with 269 additions and 179 deletions

View File

@@ -4,23 +4,17 @@
Bag Of Words
============
BagOfWords counts word stems in an article
and adds new words to the global vocabulary.
note:
The multinomial Naive Bayes classifier is suitable
for classification with discrete features (e.g.,
word counts for text classification).
The multinomial distribution normally requires
integer feature counts. However, in practice,
fractional counts such as tf-idf may also work.
=> considered by 'relative_word_frequencies' as parameter
BagOfWords counts word stems in an article and adds new words to the global
vocabulary. The multinomial Naive Bayes classifier is suitable for
classification with discrete features (e.g., word counts for text
classification). The multinomial distribution normally requires integer
feature counts; in practice, however, fractional counts such as tf-idf may
also work. Which kind is produced is controlled by the
'relative_word_frequencies' parameter.
'''
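# Illustrative sketch of the docstring's point, assuming scikit-learn is
# installed; X_counts, X_tfidf and y are hypothetical arrays of matching
# shape, not defined in this module:
#   from sklearn.naive_bayes import MultinomialNB
#   clf = MultinomialNB()
#   clf.fit(X_counts, y)   # integer word counts (the textbook case)
#   clf.fit(X_tfidf, y)    # fractional tf-idf values also work in practice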
import re
import pandas as pd
from nltk.stem.porter import PorterStemmer
class BagOfWords:
@@ -35,6 +29,7 @@ class BagOfWords:
'''takes article as argument, removes numbers,
returns list of single words, recurrences included.
'''
stemmer = PorterStemmer()
stop_words = BagOfWords.set_stop_words()
# replace punctuation marks with spaces
words = re.sub(r'\W', ' ', text)
@@ -43,30 +38,21 @@ class BagOfWords:
# list of all words to return
words_cleaned = []
for word in words:
# leave out numbers
# check if alphabetic char
if word.isalpha():
# reduce word to stem
word = BagOfWords.reduce_word_to_stem(word)
# reduce word in lower case to stem
word = stemmer.stem(word.lower())
# check if not stop word
if word.lower() not in stop_words:
# add every word in lowercase
words_cleaned.append(word.lower())
if word not in stop_words:
words_cleaned.append(word)
return words_cleaned
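# e.g., assuming NLTK's English stop word list is used:
#   BagOfWords.extract_words('Companies are merging quickly!')
# would return something like ['compani', 'merg', 'quickli']
# (lower-cased Porter stems; stop words, numbers and punctuation removed)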
def reduce_word_to_stem(word):
'''takes normal word as input, returns the word's stem
'''
stemmer = PorterStemmer()
# replace word by its stem
word = stemmer.stem(word)
return word
def make_matrix(series, vocab, relative_word_frequencies=True):
'''calculates word stem frequencies in input articles.
returns matrix (DataFrame) with relative word frequencies
(0 <= values < 1) if relative_word_frequencies=True or absolute
word frequencies (int) if relative_word_frequencies=False.
(rows: different articles, colums: different words in vocab)
'''calculates word stem frequencies in input articles. returns matrix
(DataFrame) with relative word frequencies (0 <= values < 1) if
relative_word_frequencies=True, or absolute word frequencies (int) if
relative_word_frequencies=False. (rows: different articles, columns:
different words in vocab)
'''
print('# BOW: calculating matrix')
print('# ...')
@@ -90,7 +76,6 @@ class BagOfWords:
else:
# absolute word frequency
vector[i] += 1
# add single vector as tuple
vectors.append(tuple(vector))
df_vectors = pd.DataFrame.from_records(vectors,
@@ -109,10 +94,10 @@ class BagOfWords:
for text in series:
# add single article's text to total vocabulary
vocab |= set(BagOfWords.extract_words(text))
# transform to list
vocab = list(vocab)
# sort list
vocab.sort()
# # transform to list
# vocab = list(vocab)
# # sort list
# vocab.sort()
return vocab
def set_stop_words():
@@ -151,20 +136,17 @@ class BagOfWords:
#add unwanted terms
stop_words.extend(['reuters', 'bloomberg', 'cnn', 'economist'])
#remove the word 'not' from stop words
#stop_words.remove('not')
# #remove the word 'not' from stop words?
# stop_words.remove('not')
stemmer = PorterStemmer()
for i in range(len(stop_words)):
# remove punctuation marks and strip endings from abbreviations
#stop_words[i] = re.split(r'\W', stop_words[i])[0]
# reduce word to stem
stop_words[i] = BagOfWords.reduce_word_to_stem(stop_words[i])
# reduce stop words to stem
stop_words[i] = stemmer.stem(stop_words[i])
# transform list to set to eliminate duplicates
stop_words = set(stop_words)
return set(stop_words)
return stop_words
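# e.g., the Porter stemmer maps 'because' -> 'becaus' and 'very' -> 'veri'
# (illustrative values), so stemming the stop words above keeps them
# comparable to the already-stemmed words produced by extract_words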
if __name__ == '__main__':
test_article = '''Exclusive: Microsoft's $7.5 billion GitHub deal set for
EU approval - sources. BRUSSELS (Reuters) - U.S. software
@@ -189,4 +171,5 @@ if __name__ == '__main__':
request for immediate comment. Microsoft declined to
comment. Reporting by Foo Yun Chee; editing by Jason
Neely'''
print(BagOfWords.extract_words(test_article))
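A minimal sketch of how these pieces could fit together, assuming scikit-learn is available; the two-article series and the labels y are hypothetical, the vocabulary method's name is taken from context, and sorted() is added for a stable column order since the vocabulary is now returned as a set:

import pandas as pd
from sklearn.naive_bayes import MultinomialNB

# hypothetical two-article corpus
series = pd.Series([test_article, 'EU regulators approve merger deal.'])
vocab = sorted(BagOfWords.make_vocab(series))
# relative frequencies: floats in [0, 1), still usable by MultinomialNB
X = BagOfWords.make_matrix(series, vocab, relative_word_frequencies=True)
y = [1, 0]  # hypothetical class labels
clf = MultinomialNB().fit(X, y)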

View File

@@ -1,133 +0,0 @@
'''
JSON Handler
============
JSON Handler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''
# -*- coding: utf-8 -*-
import csv
import glob
import json
import numpy as np
import pandas as pd
class JsonHandler:
def select_randoms(df, n):
'''selects n random samples from dataset.
params: df DataFrame to select items from,
n number of items to select randomly,
returns new DataFrame with only selected items
'''
# initialize random => reproducible sequence
np.random.seed(5)
# add new column 'Random'
df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
# sort DataFrame by random numbers
df = df.sort_values('Random')
# return first n elements of randomly sorted dataset
return df.iloc[0:n]
def create_csv(file_name):
# create new csv file for each month.
# each row contains a news article.
with open(file_name, 'w', newline='') as csvfile:
writer = csv.writer(csvfile,
delimiter='|',
quotechar='\'',
quoting=csv.QUOTE_NONNUMERIC)
# write header / column names
writer.writerow(['Uuid', #0
'Title', #1
'Text', #2
'Site', #3
'SiteSection', #4
'Url', #5
'Timestamp']) #6
def write_articles_to_csv(file_name):
# path of JSON files
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
'\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4'\
'\\news_[0-9]*.json'
files = glob.glob(path)
# reliable sources (site_sections)
site_sections = ['http://feeds.reuters.com/reuters/financialsNews',
'http://feeds.reuters.com/reuters/INbusinessNews',
'http://feeds.reuters.com/reuters/businessNews',
'http://feeds.reuters.com/reuters/companyNews',
'http://www.reuters.com/finance/deals',
'http://feeds.reuters.com/reuters/mergersNews',
'http://rss.cnn.com/rss/money_topstories.rss',
'http://rss.cnn.com/rss/money_latest.rss',
'http://www.economist.com/sections/business-finance/rss.xml',
'http://rss.cnn.com/rss/edition_business.rss',
'http://in.reuters.com/finance/deals',
'http://feeds.reuters.com/reuters/technologyNews',
'http://feeds.reuters.com/reuters/technologysectorNews',
'https://www.ft.com/companies/us',
'http://feeds.reuters.com/reuters/UKScienceNews',
'http://in.reuters.com/news/technology',
'http://in.reuters.com/finance/economy',
'https://www.bloomberg.com/middleeast',
'http://in.reuters.com/news/top-news']
# file counter
n = 0
# article counter
a = 0
# read every JSON file in current folder
with open(file_name, 'a', newline='') as csvfile:
writer = csv.writer(csvfile,
delimiter='|',
quotechar='\'',
quoting=csv.QUOTE_NONNUMERIC)
for file in files:
n += 1
with open(file, encoding='utf-8') as f:
# Json is converted to dict
dict = json.load(f)
#print(n)
# leave out comments or posts, take only articles from the listed site sections
if ((dict['ord_in_thread'] != 0) or
(dict['language'] != 'english') or
(dict['thread']['spam_score'] > 0.3) or
(dict['thread']['site_section'] not in site_sections)):
continue
# pick only relevant information of the article
# and put it in a list
article = [dict['thread']['uuid'], # 0:'Uuid'
dict['thread']['title'], # 1:'Title'
dict['text'], # 2:'Text'
dict['thread']['site'], # 3:'Site'
dict['thread']['site_section'],# 4:'SiteSection'
dict['url'], # 5:'Url'
dict['published']] # 6:'Timestamp'
# remove newlines and delimiter chars
article[1] = article[1].replace('|', '-')
article[2] = article[2].replace('\n', ' ')\
.replace('\r', ' ').replace('|', '-')
try:
writer.writerow(article)
a += 1
# handle undefined characters (videos and other spam)
except UnicodeEncodeError:
print('# filtered out: {} (UnicodeEncodeError)'
.format(dict['thread']['site_section']))
print()
print('# saved {} articles in file {}'.format(a, file_name))
if __name__ == '__main__':
file_name = 'test.csv'
JsonHandler.create_csv(file_name)
JsonHandler.write_articles_to_csv(file_name)

152
JsonHandler.py Normal file
View File

@@ -0,0 +1,152 @@
'''
Json Handler
============
JsonHandler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''
# -*- coding: utf-8 -*-
import csv
import glob
import json
import numpy as np
import pandas as pd
class JsonHandler:
# string for every month of the year
months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
'11', '12']
def select_randoms(df, n):
'''select n random samples from dataset.
params: df DataFrame to select items from,
n number of items to select randomly,
return new DataFrame with only selected items.
'''
# initialize random => reproducible sequence
np.random.seed(5)
# add new column 'Random'
df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
# sort DataFrame by random numbers
df = df.sort_values('Random')
# return first n elements of randomly sorted dataset
return df.iloc[0:n]
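# note: an equivalent, more idiomatic pandas one-liner for the same
# reproducible sampling (a sketch, not what this module uses):
#   df.sample(n=n, random_state=5)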
def create_labeling_dataset():
# number of articles to select from each month:
# 10,000 / 12 = 833.33 => 833, plus one extra from every third month
# (see the sanity check after this method)
every_third_month = ['03', '06', '09', '12']
for m in JsonHandler.months:
df = pd.read_csv('all_{}.csv'.format(m),
delimiter='|',
header=0,
index_col=None,
engine='python',
quotechar='\'',
quoting=0,
encoding='utf-8')
# pick one more from every third month
n_select = 834 if m in every_third_month else 833
JsonHandler.select_randoms(df, n_select).to_csv('labeling_dataset.csv',
header=True,
mode='a',
encoding='utf-8',
quoting=csv.QUOTE_MINIMAL,
quotechar='\'')
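# sanity check for the sample sizes above: eight months contribute
# 833 articles each and every third month one extra, so
#   8 * 833 + 4 * 834 = 6664 + 3336 = 10000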
def write_articles_to_csv_files():
'''read JSON files, select articles and write them to csv.
'''
# reliable sources (site_sections)
site_sections = []
# read list from 'sections.txt' file
with open('sections.txt', 'r') as s_list:
site_sections = s_list.read().split('\n')
# article counter
a = 0
for m in JsonHandler.months:
# 1 output file per month
output_file = 'all_{}.csv'.format(m)
# path of input JSON files per month
path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
'\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
'\\news_[0-9]*.json'.format(m)
files = glob.glob(path)
# file counter
n = 0
# write separate csv file for every month
with open(output_file, 'w', newline='') as csvfile:
writer = csv.writer(csvfile,
delimiter='|',
quotechar='\'',
quoting=csv.QUOTE_NONNUMERIC)
# write header / column names
writer.writerow(['Uuid', #0
'Title', #1
'Text', #2
'Site', #3
'SiteSection', #4
'Url', #5
'Timestamp']) #6
# write articles
for file in files:
n += 1
# read every JSON file
with open(file, encoding='utf-8') as f:
# parse JSON file into a dict
dict = json.load(f)
# check if comment or post
if ((dict['ord_in_thread'] != 0) or
# check if not english
(dict['language'] != 'english') or
# check if spam
(dict['thread']['spam_score'] > 0.3) or
# check if reliable source
(dict['thread']['site_section'] not in site_sections) or
# check if text parsed correctly
('Further company coverage:' in dict['text']) or
('subscription' in dict['text']) or
('subscribe' in dict['text']) or
(len(dict['text']) < 300)):
continue
else:
try:
# replace whitespaces and delimiter chars
# and write to csv
writer.writerow([dict['thread']['uuid'],
dict['thread']['title']\
.replace('|', '-'),
dict['text']\
.replace('\n', '')\
.replace('\r', '')\
.replace('|', '-'),
dict['thread']['site'],
dict['thread']['site_section']\
.replace('\n', '')\
.replace('\r', ''),
dict['url'],
dict['published']])
a += 1
# handle undefined characters (videos and other spam)
except UnicodeEncodeError:
print('# filtered out: {} (UnicodeEncodeError)'
.format(dict['thread']['site_section']))
print('# saved articles in file {}, now {} in total'.format(output_file, a))
print('#')
print('# saved {} articles in total'.format(a))
print('#')
if __name__ == '__main__':
JsonHandler.write_articles_to_csv_files()
#JsonHandler.create_labeling_dataset()

88
sections.txt Normal file
View File

@@ -0,0 +1,88 @@
http://feeds.reuters.com/Reuters/UKBusinessNews?format=xml
http://in.reuters.com/finance/economy
http://feeds.reuters.com/reuters/financialsNews
http://in.reuters.com/finance/deals
http://feeds.reuters.com/reuters/INbusinessNews
http://www.theguardian.com/business/rss
http://feeds.reuters.com/reuters/businessNews
http://feeds.reuters.com/reuters/mergersNews
http://feeds.reuters.com/reuters/industrialsNews
http://feeds.reuters.com/reuters/UKBusinessNews/
http://www.ft.com/rss/indepth/investmentbanking/deal
http://feeds.guardian.co.uk/theguardian/business/uk-edition/rss
http://feeds.reuters.com/reuters/companyNews
http://www.ft.com/rss/companies/us
http://rss.cnn.com/rss/edition_business.rss
http://www.ft.com/rss/lex
http://feeds.reuters.com/reuters/businessNews?format=xml
http://www.reuters.com/finance/deals
http://www.ft.com/rss/companies/chemicals
https://www.theguardian.com/uk/business
http://www.ft.com/rss/companies/asia-pacific
http://in.reuters.com/finance/markets/companyOutlooksNews
http://www.ft.com/rss/companies/financials
http://www.ft.com/rss/companies/industrials
http://www.ft.com/rss/companies/uk
http://www.ft.com/rss/companies/rail
https://www.theguardian.com/business/all
http://www.ft.com/rss/companies
http://www.ft.com/rss/companies/banks
http://feeds.reuters.com/news/deals
http://in.reuters.com/finance
http://www.ft.com/rss/companies/airlines
http://www.ft.com/rss/companies/asiapacific
http://www.ft.com/rss/companies/financial-services
http://www.ft.com/rss/companies/retail
http://www.ft.com/rss/companies/europe
http://www.ft.com/rss/companies/property
http://www.ft.com/rss/companies/utilities
http://rss.cnn.com/rss/money_news_companies.rss
http://www.ft.com/rss/world/uk/business
http://www.ft.com/rss/companies/transport
http://www.ft.com/rss/companies/retail-consumer
http://www.ft.com/rss/companies/energy
http://www.ft.com/rss/companies/mining
http://www.reuters.com/finance
http://www.ft.com/rss/companies/automobiles
http://www.ft.com/rss/companies/basic-resources
http://www.ft.com/rss/companies/technology
http://www.ft.com/rss/companies/construction
http://www.ft.com/rss/companies/health
https://www.theguardian.com/media/mediabusiness
http://www.theguardian.com/business/tesco/rss
http://www.theguardian.com/business/oil/rss
http://www.ft.com/rss/companies/aerospace-defence
http://www.ft.com/rss/companies/travel-leisure
http://www.ft.com/rss/companies/oil-gas
http://www.theguardian.com/business/morrisons/rss
http://www.ft.com/rss/companies/telecoms
http://www.ft.com/rss/companies/personal-goods
http://www.ft.com/rss/companies/pharmaceuticals
http://www.ft.com/rss/in-depth/initial-public-offering
http://rss.cnn.com/rss/money_news_economy.rss
http://www.ft.com/rss/companies/insurance
http://www.ft.com/rss/companies/support-services
http://www.guardian.co.uk/business/economics/rss
http://www.economist.com/sections/business-finance/rss.xml
http://www.guardian.co.uk/theobserver/news/business/rss
http://www.ft.com/rss/companies/healthcare
https://www.bloomberg.com/businessweek
http://www.theguardian.com/business/retail/rss
http://rss.cnn.com/rss/money_technology.rss
http://www.economist.com/rss/business_rss.xml
http://www.theguardian.com/business/unilever/rss
https://www.theguardian.com/business/eurozone
https://www.theguardian.com/business/economics
http://www.economist.com/rss/briefings_rss.xml
http://www.theguardian.com/business/euro/rss
http://www.reuters.com/finance/summits
http://rss.ft.com/rss/companies/banks
http://in.reuters.com/finance/summits
http://www.theguardian.com/business/ryanair/rss
http://www.theguardian.com/business/deloitte/rss
https://in.reuters.com/finance/deals
https://in.reuters.com/finance
https://www.reuters.com/finance/deals
https://www.reuters.com/finance
https://in.reuters.com/finance/economy
https://in.reuters.com/finance/markets/companyOutlooksNews