refactoring
This commit is contained in:
		
							parent
							
								
									0c15d49d7e
								
							
						
					
					
						commit
						b6e48feb16
					
				@ -111,7 +111,7 @@ class DecisionTree:
 | 
				
			|||||||
        print('# starting decision tree')
 | 
					        print('# starting decision tree')
 | 
				
			||||||
        print('# ...')
 | 
					        print('# ...')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        file = 'classification_labelled_corrected.csv'
 | 
					        file = 'data\\classification_labelled_corrected.csv'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # read csv file
 | 
					        # read csv file
 | 
				
			||||||
        print('# reading dataset')
 | 
					        print('# reading dataset')
 | 
				
			||||||
 | 
				
			|||||||
@ -1,8 +1,8 @@
 | 
				
			|||||||
'''
 | 
					'''
 | 
				
			||||||
Json Handler
 | 
					File Handler
 | 
				
			||||||
============
 | 
					============
 | 
				
			||||||
 | 
					
 | 
				
			||||||
JsonHandler reads articles from JSON files,
 | 
					FileHandler reads articles from JSON files,
 | 
				
			||||||
extracts relevant information and
 | 
					extracts relevant information and
 | 
				
			||||||
writes it to a csv file.
 | 
					writes it to a csv file.
 | 
				
			||||||
'''
 | 
					'''
 | 
				
			||||||
@ -16,9 +16,9 @@ import json
 | 
				
			|||||||
import numpy as np
 | 
					import numpy as np
 | 
				
			||||||
import pandas as pd
 | 
					import pandas as pd
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class JsonHandler:
 | 
					class FileHandler:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # string for every month of the year
 | 
					    # strings for every month of the year
 | 
				
			||||||
    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
 | 
					    months = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10',
 | 
				
			||||||
              '11', '12']
 | 
					              '11', '12']
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -28,7 +28,6 @@ class JsonHandler:
 | 
				
			|||||||
        n number of items to select randomly,
 | 
					        n number of items to select randomly,
 | 
				
			||||||
        return new DataFrame with only selected items.
 | 
					        return new DataFrame with only selected items.
 | 
				
			||||||
        '''
 | 
					        '''
 | 
				
			||||||
 | 
					 | 
				
			||||||
        # initialize random => reproducible sequence
 | 
					        # initialize random => reproducible sequence
 | 
				
			||||||
        np.random.seed(5)
 | 
					        np.random.seed(5)
 | 
				
			||||||
        # add new column 'Random'
 | 
					        # add new column 'Random'
 | 
				
			||||||
@ -39,29 +38,44 @@ class JsonHandler:
 | 
				
			|||||||
        return df.iloc[0:n]
 | 
					        return df.iloc[0:n]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def create_labeling_dataset():
 | 
					    def create_labeling_dataset():
 | 
				
			||||||
        # number of articles to select from each month:
 | 
					        # output file
 | 
				
			||||||
        # 10.000 / 12 = 833,33
 | 
					        o_file = 'data\\interactive_labeling_dataset.csv'
 | 
				
			||||||
 | 
					        # create file and write header
 | 
				
			||||||
 | 
					        with open(o_file, 'w', newline='') as csvfile:
 | 
				
			||||||
 | 
					            writer = csv.writer(csvfile, 
 | 
				
			||||||
 | 
					                                delimiter='|',
 | 
				
			||||||
 | 
					                                quotechar='\'', 
 | 
				
			||||||
 | 
					                                quoting=csv.QUOTE_NONNUMERIC)
 | 
				
			||||||
 | 
					            writer.writerow(['Uuid',        #0
 | 
				
			||||||
 | 
					                             'Title',       #1
 | 
				
			||||||
 | 
					                             'Text',        #2
 | 
				
			||||||
 | 
					                             'Site',        #3
 | 
				
			||||||
 | 
					                             'SiteSection', #4
 | 
				
			||||||
 | 
					                             'Url',         #5
 | 
				
			||||||
 | 
					                             'Timestamp'])  #6
 | 
				
			||||||
 | 
					        # number of articles to select from each month (10000/12=833,33)
 | 
				
			||||||
        n_select = 833
 | 
					        n_select = 833
 | 
				
			||||||
        # except every third month:
 | 
					        for m in FileHandler.months:
 | 
				
			||||||
        every_third_month = ['03', '06', '09', '12']
 | 
					            df = pd.read_csv('data\\articles\\all_{}.csv'.format(m),
 | 
				
			||||||
        for m in JsonHandler.month:
 | 
					                             delimiter='|',
 | 
				
			||||||
            df = pandas.read_csv('all_{}.csv'.format(m),
 | 
					                             header=0,
 | 
				
			||||||
                                  delimiter='|',
 | 
					                             index_col=None,
 | 
				
			||||||
                                  header=0,
 | 
					                             engine='python',
 | 
				
			||||||
                                  index_col=None,
 | 
					                             quoting=csv.QUOTE_NONNUMERIC,
 | 
				
			||||||
                                  engine='python',
 | 
					                             quotechar='\'')
 | 
				
			||||||
                                  quotechar='\'',
 | 
					 | 
				
			||||||
                                  quoting=0,
 | 
					 | 
				
			||||||
                                  encoding='utf-8')
 | 
					 | 
				
			||||||
            # pick one more from every third article
 | 
					            # pick one more from every third article
 | 
				
			||||||
            if m in every_third_month:
 | 
					            if m in ['03', '06', '09', '12']:
 | 
				
			||||||
                n_select = 834
 | 
					                n_select = 834
 | 
				
			||||||
            JsonHandler.select_randoms(df, n_select).to_csv('labeling_dataset.csv', 
 | 
					            random_articles = FileHandler.select_randoms(df, n_select)
 | 
				
			||||||
                                                            header=True, 
 | 
					            del random_articles['Random']
 | 
				
			||||||
                                                            mode='a', 
 | 
					            random_articles.to_csv(o_file,
 | 
				
			||||||
                                                            encoding='python', 
 | 
					                                   header=False,
 | 
				
			||||||
                                                            quoting=QUOTE_MINIMAL, 
 | 
					                                   index=False,
 | 
				
			||||||
                                                            quotechar='\'')
 | 
					                                   sep='|',
 | 
				
			||||||
 | 
					                                   mode='a',
 | 
				
			||||||
 | 
					                                   encoding='utf-8',
 | 
				
			||||||
 | 
					                                   quoting=csv.QUOTE_NONNUMERIC,
 | 
				
			||||||
 | 
					                                   quotechar='\'')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def write_articles_to_csv_files():
 | 
					    def write_articles_to_csv_files():
 | 
				
			||||||
        '''read JSON files, select articles and write them to csv.
 | 
					        '''read JSON files, select articles and write them to csv.
 | 
				
			||||||
@ -69,14 +83,14 @@ class JsonHandler:
 | 
				
			|||||||
        # reliable sources (site_sections)
 | 
					        # reliable sources (site_sections)
 | 
				
			||||||
        site_sections = []
 | 
					        site_sections = []
 | 
				
			||||||
        # read list from 'sections.txt' file
 | 
					        # read list from 'sections.txt' file
 | 
				
			||||||
        with open('sections.txt', 'r') as s_list:
 | 
					        with open('data\\sections.txt', 'r') as s_list:
 | 
				
			||||||
            site_sections = s_list.read().split('\n')
 | 
					            site_sections = s_list.read().split('\n')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # article counter
 | 
					        # article counter
 | 
				
			||||||
        a = 0
 | 
					        a = 0
 | 
				
			||||||
        for m in JsonHandler.months:
 | 
					        for m in FileHandler.months:
 | 
				
			||||||
            # 1 output file per month
 | 
					            # 1 output file per month
 | 
				
			||||||
            output_file = 'all_{}.csv'.format(m)
 | 
					            output_file = 'data\\articles\\all_{}.csv'.format(m)
 | 
				
			||||||
            # path of input JSON files per month
 | 
					            # path of input JSON files per month
 | 
				
			||||||
            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
 | 
					            path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz'\
 | 
				
			||||||
                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
 | 
					                   '\\new_dataset\\2017_{}_ccc517fd45024a87c12318299efc50a4'\
 | 
				
			||||||
@ -146,7 +160,8 @@ class JsonHandler:
 | 
				
			|||||||
            print('#')
 | 
					            print('#')
 | 
				
			||||||
        print('# saved {} articles in total'.format(a))
 | 
					        print('# saved {} articles in total'.format(a))
 | 
				
			||||||
        print('#')
 | 
					        print('#')
 | 
				
			||||||
 | 
					    def join_all_csv_files():
 | 
				
			||||||
 | 
					
 | 
				
			||||||
if __name__ == '__main__':
 | 
					if __name__ == '__main__':
 | 
				
			||||||
    JsonHandler.write_articles_to_csv_files()
 | 
					    # FileHandler.write_articles_to_csv_files()
 | 
				
			||||||
    #JsonHandler.create_labeling_dataset()
 | 
					    # FileHandler.create_labeling_dataset()
 | 
				
			||||||
							
								
								
									
										13
									
								
								NER.py
									
									
									
									
									
								
							
							
						
						
									
										13
									
								
								NER.py
									
									
									
									
									
								
							@ -5,15 +5,15 @@ Named Entity Recognition (NER)
 | 
				
			|||||||
Stanford NER takes a text as input and returns a list of entities
 | 
					Stanford NER takes a text as input and returns a list of entities
 | 
				
			||||||
like persons, organizations and countries, e.g.
 | 
					like persons, organizations and countries, e.g.
 | 
				
			||||||
'''
 | 
					'''
 | 
				
			||||||
 | 
					import csv
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import matplotlib.pyplot as plt
 | 
					import matplotlib.pyplot as plt
 | 
				
			||||||
import numpy as np
 | 
					import numpy as np
 | 
				
			||||||
 | 
					import pandas as pd
 | 
				
			||||||
from nltk.tag import StanfordNERTagger
 | 
					from nltk.tag import StanfordNERTagger
 | 
				
			||||||
from nltk.tokenize import word_tokenize
 | 
					from nltk.tokenize import word_tokenize
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from CsvHandler import CsvHandler
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
class NER:
 | 
					class NER:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # toDo: complete lists:
 | 
					    # toDo: complete lists:
 | 
				
			||||||
@ -118,8 +118,13 @@ class NER:
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
if __name__ == '__main__':
 | 
					if __name__ == '__main__':
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    filepath = 'classification_labelled_corrected.csv'
 | 
					    filepath = 'data\\classification_labelled_corrected.csv'
 | 
				
			||||||
    df = CsvHandler.read_csv(filepath)
 | 
					    df = pd.read_csv(filepath,
 | 
				
			||||||
 | 
					                     sep='|',
 | 
				
			||||||
 | 
					                     engine='python',
 | 
				
			||||||
 | 
					                     decimal='.',
 | 
				
			||||||
 | 
					                     quotechar='\'',
 | 
				
			||||||
 | 
					                     quoting=csv.QUOTE_NONE)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # only articles with label==1
 | 
					    # only articles with label==1
 | 
				
			||||||
    df_hits = df[df['Label'] == 1]
 | 
					    df_hits = df[df['Label'] == 1]
 | 
				
			||||||
 | 
				
			|||||||
@ -177,7 +177,7 @@ class NaiveBayes:
 | 
				
			|||||||
        print('# starting naive bayes')
 | 
					        print('# starting naive bayes')
 | 
				
			||||||
        print('# ...')
 | 
					        print('# ...')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        file = 'classification_labelled_corrected.csv'
 | 
					        file = 'data\\classification_labelled_corrected.csv'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # read csv file
 | 
					        # read csv file
 | 
				
			||||||
        print('# reading dataset')
 | 
					        print('# reading dataset')
 | 
				
			||||||
 | 
				
			|||||||
@ -153,7 +153,7 @@ class NaiveBayes_Interactive:
 | 
				
			|||||||
        print('# starting naive bayes')
 | 
					        print('# starting naive bayes')
 | 
				
			||||||
        print('# ...')
 | 
					        print('# ...')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        file = 'classification_labelled_corrected.csv'
 | 
					        file = 'data\\classification_labelled_corrected.csv'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # read csv file
 | 
					        # read csv file
 | 
				
			||||||
        print('# reading dataset')
 | 
					        print('# reading dataset')
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										2
									
								
								SVM.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								SVM.py
									
									
									
									
									
								
							@ -91,7 +91,7 @@ class SVM:
 | 
				
			|||||||
        print('# starting svm')
 | 
					        print('# starting svm')
 | 
				
			||||||
        print('# ...')
 | 
					        print('# ...')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        file = 'classification_labelled_corrected.csv'
 | 
					        file = 'data\\classification_labelled_corrected.csv'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # read csv file
 | 
					        # read csv file
 | 
				
			||||||
        print('# reading dataset')
 | 
					        print('# reading dataset')
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										4514
									
								
								data/articles/all_01.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4514
									
								
								data/articles/all_01.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										3853
									
								
								data/articles/all_02.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3853
									
								
								data/articles/all_02.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										4299
									
								
								data/articles/all_03.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4299
									
								
								data/articles/all_03.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										3312
									
								
								data/articles/all_04.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3312
									
								
								data/articles/all_04.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										4127
									
								
								data/articles/all_05.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4127
									
								
								data/articles/all_05.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										3388
									
								
								data/articles/all_06.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3388
									
								
								data/articles/all_06.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										2372
									
								
								data/articles/all_07.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2372
									
								
								data/articles/all_07.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										2981
									
								
								data/articles/all_08.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2981
									
								
								data/articles/all_08.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										3296
									
								
								data/articles/all_09.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3296
									
								
								data/articles/all_09.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										3491
									
								
								data/articles/all_10.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3491
									
								
								data/articles/all_10.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										3391
									
								
								data/articles/all_11.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3391
									
								
								data/articles/all_11.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
							
								
								
									
										2777
									
								
								data/articles/all_12.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2777
									
								
								data/articles/all_12.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							| 
		
		
			 Can't render this file because it is too large. 
		
	 | 
							
								
								
									
										10007
									
								
								data/interactive_labeling_dataset.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										10007
									
								
								data/interactive_labeling_dataset.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user