From c85ce71e24ca62fc697955bd253aa64b5081f450 Mon Sep 17 00:00:00 2001 From: Anne Lorenz Date: Thu, 18 Oct 2018 13:57:46 +0200 Subject: [PATCH] removed csvHandler.py --- CsvHandler.py | 54 -------- DecisionTree.py | 16 ++- JSONHandler.py | 129 ++++++++++++++++++ NaiveBayes.py | 43 +++--- ...yes_simple.py => NaiveBayes_Interactive.py | 28 ++-- README.md | 34 ++++- SVM.py | 13 +- 7 files changed, 214 insertions(+), 103 deletions(-) delete mode 100644 CsvHandler.py create mode 100644 JSONHandler.py rename NaiveBayes_simple.py => NaiveBayes_Interactive.py (88%) diff --git a/CsvHandler.py b/CsvHandler.py deleted file mode 100644 index ca52616..0000000 --- a/CsvHandler.py +++ /dev/null @@ -1,54 +0,0 @@ -''' -Csv Handler -=========== - -CsvHandler writes articles' information to csv file and reads it. -''' - -import csv - -import numpy as np -import pandas as pd - -class CsvHandler: - - def read_csv(csv_file, usecols=None): - df = pd.read_csv(csv_file, - sep='|', - header=0, - engine='python', - usecols=usecols, - decimal='.', - quotechar='\'', - #nrows = 200, - quoting=csv.QUOTE_NONE) - return df - - def write_csv(df, file_name): - df.to_csv(file_name, - sep='|') - print('# saved {} article(s) in {}'.format(len(df), file_name)) - - def select_randoms(df, n): - '''selects n random samples from dataset. - params: df DataFrame to select items from, - n number of items to select randomly, - returns new DataFrame with only selected items - ''' - # new empty DataFrame - # df_samples = pd.DataFrame(columns=['rands','title','text','label']) - # initialize random => reproducible sequence - np.random.seed(5) - # pseudorandom float -1.0 <= x <= 1.0 for every sample - # pd.Series() - # add new column 'Random' - df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index) - # sort DataFrame by random numbers - df = df.sort_values('Random') - # return first n elements of randomly sorted dataset - return df.iloc[0:n] - -if __name__ == '__main__': - df = CsvHandler.read_csv('classification_labelled_corrected.csv') - df_new = CsvHandler.select_randoms(df, 10) - CsvHandler.write_csv(df_new, 'samples_10.csv') \ No newline at end of file diff --git a/DecisionTree.py b/DecisionTree.py index b69efaa..309aa53 100644 --- a/DecisionTree.py +++ b/DecisionTree.py @@ -7,13 +7,14 @@ array X of size [n_samples, n_features], holding the training samples, and array y of integer values, size [n_samples], holding the class labels for the training samples. ''' -import operator - from BagOfWords import BagOfWords -from CsvHandler import CsvHandler + +import csv +import operator import graphviz import numpy as np +import pandas as pd from sklearn import tree #from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_selection import SelectPercentile @@ -116,8 +117,13 @@ class DecisionTree: print('# reading dataset') print('# ...') - dataset = CsvHandler.read_csv(file) + data = pd.read_csv(file, + sep='|', + engine='python', + decimal='.', + quotechar='\'', + quoting=csv.QUOTE_NONE) - make_tree(dataset) + make_tree(data) print('# ending decision tree') \ No newline at end of file diff --git a/JSONHandler.py b/JSONHandler.py new file mode 100644 index 0000000..e1c8404 --- /dev/null +++ b/JSONHandler.py @@ -0,0 +1,129 @@ +''' +JSON Handler +============ + +JSON Handler reads articles from JSON files, +extracts relevant information and +writes it to a csv file. +''' + +# -*- coding: utf-8 -*- + +import csv +import glob +import json + +import numpy as np +import pandas as pd + +class JsonHandler: + + def select_randoms(df, n): + '''selects n random samples from dataset. + params: df DataFrame to select items from, + n number of items to select randomly, + returns new DataFrame with only selected items + ''' + # initialize random => reproducible sequence + np.random.seed(5) + # add new column 'Random' + df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index) + # sort DataFrame by random numbers + df = df.sort_values('Random') + # return first n elements of randomly sorted dataset + return df.iloc[0:n] + + def create_csv(file_name): + # create new csv file for each month. + # each row contains an news article. + + with open(file_name, 'w', newline='') as csvfile: + writer = csv.writer(csvfile, + delimiter='|', + quotechar='\'', + quoting=csv.QUOTE_NONNUMERIC) + # write header / column names + writer.writerow(['Uuid', #0 + 'Title', #1 + 'Text', #2 + 'Site', #3 + 'SiteSection', #4 + 'Url', #5 + 'Timestamp']) #6 + + def write_articles_to_csv(file_name): + # path of JSON files + path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json' + files = glob.glob(path) + + # reliable sources (site_sections) + site_sections = ['http://feeds.reuters.com/reuters/financialsNews', + 'http://feeds.reuters.com/reuters/INbusinessNews', + 'http://feeds.reuters.com/reuters/businessNews', + 'http://feeds.reuters.com/reuters/companyNews', + 'http://www.reuters.com/finance/deals', + 'http://feeds.reuters.com/reuters/mergersNews', + 'http://rss.cnn.com/rss/money_topstories.rss', + 'http://rss.cnn.com/rss/money_latest.rss', + 'http://www.economist.com/sections/business-finance/rss.xml', + 'http://rss.cnn.com/rss/edition_business.rss', + 'http://in.reuters.com/finance/deals', + 'http://feeds.reuters.com/reuters/technologyNews', + 'http://feeds.reuters.com/reuters/technologysectorNews', + 'https://www.ft.com/companies/us', + 'http://feeds.reuters.com/reuters/UKScienceNews', + 'http://in.reuters.com/news/technology', + 'http://in.reuters.com/finance/economy', + 'https://www.bloomberg.com/middleeast', + 'http://in.reuters.com/news/top-news'] + + # file counter + n = 0 + # article counter + a = 0 + # read every JSON file in current folder + with open(file_name, 'a', newline='') as csvfile: + writer = csv.writer(csvfile, + delimiter='|', + quotechar='\'', + quoting=csv.QUOTE_NONNUMERIC) + for file in files: + n += 1 + with open(file, encoding='utf-8') as f: + # Json is converted to dict + dict = json.load(f) + #print(n) + # leave out comments or posts, take only reuters as source + if ((dict['ord_in_thread'] != 0) or + (dict['language'] != 'english') or + (dict['thread']['spam_score'] > 0.3) or + (dict['thread']['site_section'] not in site_sections)): + continue + # pick only relevant information of article + # and put in in list + article = [dict['thread']['uuid'], # 0:'Uuid' + dict['thread']['title'], # 1:'Title' + dict['text'], # 2:'Text' + dict['thread']['site'], # 3:'Site' + dict['thread']['site_section'], # 4:'SiteSection' + dict['url'], # 5:'Url' + dict['published']] # 6:'Timestamp' + + # remove newlines and delimiter char + article[1] = article[1].replace('|', '-') # in 'Title' + article[2] = article[2].replace('\n', ' ').replace('\r', ' ').replace('|', '-') # in 'Text' + + try: + writer.writerow(article) + a += 1 + # handle undefined characters (videos and other spam) + except UnicodeEncodeError: + print('# filtered out site_section: {} (UnicodeEncodeError)' + .format(dict['thread']['site_section'])) + print() + print('# saved {} articles in file {}'.format(a, file_name)) + +if __name__ == '__main__': + file_name = 'test.csv' + JsonHandler.create_csv(file_name) + JsonHandler.write_articles_to_csv(file_name) \ No newline at end of file diff --git a/NaiveBayes.py b/NaiveBayes.py index f4834c6..eaf293b 100644 --- a/NaiveBayes.py +++ b/NaiveBayes.py @@ -13,8 +13,10 @@ regardless of any possible correlations between these features. ''' from BagOfWords import BagOfWords -from CsvHandler import CsvHandler +import csv + +import pandas as pd from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_selection import SelectPercentile from sklearn.metrics import recall_score, precision_score @@ -59,7 +61,7 @@ class NaiveBayes: n += 1 print('# split no. ' + str(n)) - # # eigenes BOW => schlechtere ergebnisse + # # eigenes BOW # vocab = BagOfWords.make_vocab(X[train]) # # fit the training data and then return the matrix # training_data = BagOfWords.make_matrix(X[train], vocab) @@ -72,26 +74,18 @@ class NaiveBayes: # transform testing data and return the matrix testing_data = cv.transform(X[test]).toarray() - # # apply select percentile - # selector = SelectPercentile(percentile=25) - # selector.fit(training_data, y[train]) - - ##DORIS: WIRD SELECT PERCENTILE IN DEINE ARBEIT MIT NB EINBEZOGEN? + # apply select percentile + selector = SelectPercentile(percentile=100) + selector.fit(training_data, y[train]) - # training_data_r = selector.transform(training_data) - # testing_data_r = selector.transform(testing_data) - - # #fit classifier - # classifier.fit(training_data_r, y[train]) - # #predict class - # predictions_train = classifier.predict(training_data_r) - # predictions_test = classifier.predict(testing_data_r) + training_data_r = selector.transform(training_data) + testing_data_r = selector.transform(testing_data) #fit classifier - classifier.fit(training_data, y[train]) + classifier.fit(training_data_r, y[train]) #predict class - predictions_train = classifier.predict(training_data) - predictions_test = classifier.predict(testing_data) + predictions_train = classifier.predict(training_data_r) + predictions_test = classifier.predict(testing_data_r) #print and store metrics rec = recall_score(y[test], predictions_test) @@ -189,12 +183,15 @@ class NaiveBayes: # read csv file print('# reading dataset') print('# ...') - - ## DORIS: ICH VERSTEHE NICHT, WARUM DU HIER EINE EXTRA FUNKTION SCHREIBST, PD.READ_CSV MÜSSTE DOCH AUCH SO GEHEN? - ## KOMMT VIELLEICHT NOCH, VIELLEICHT BIN ICH ZU VORSCHNELL - dataset = CsvHandler.read_csv(file) - make_naive_bayes(dataset) + data = pd.read_csv(file, + sep='|', + engine='python', + decimal='.', + quotechar='\'', + quoting=csv.QUOTE_NONE) + + make_naive_bayes(data) print('#') print('# ending naive bayes') \ No newline at end of file diff --git a/NaiveBayes_simple.py b/NaiveBayes_Interactive.py similarity index 88% rename from NaiveBayes_simple.py rename to NaiveBayes_Interactive.py index 50473fd..faeb981 100644 --- a/NaiveBayes_simple.py +++ b/NaiveBayes_Interactive.py @@ -3,23 +3,21 @@ Naive Bayes Classifier ====================== basic implementation of naive bayes. -prints out probabilities for classes. -needed for interactive labeling. +prints out probabilities for classes needed for interactive labeling. ''' -from CsvHandler import CsvHandler +import csv +import pandas as pd from sklearn.feature_extraction.text import CountVectorizer - from sklearn.metrics import recall_score, precision_score -from sklearn.model_selection import KFold +from sklearn.model_selection import StratifiedKFold from sklearn.naive_bayes import GaussianNB -class NaiveBayes_simple: +class NaiveBayes_Interactive: def make_naive_bayes(dataset): - '''fits naive bayes model with StratifiedKFold, - uses my BOW + '''fits naive bayes model ''' print('# fitting model') print('# ...') @@ -31,9 +29,8 @@ class NaiveBayes_simple: cv = CountVectorizer() - ##DORIS: DU BRAUCHST IMMER EINEN STRATIFIED SPLIT, WEIL DIEN DATASET UNBALANCED IST - # k-fold cross-validation as split method - kf = KFold(n_splits=10, shuffle=True, random_state=5) + # stratified k-fold cross-validation as split method + kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5) classifier = GaussianNB() @@ -163,9 +160,14 @@ class NaiveBayes_simple: print('# reading dataset') print('# ...') - dataset = CsvHandler.read_csv(file) + data = pd.read_csv(file, + sep='|', + engine='python', + decimal='.', + quotechar='\'', + quoting=csv.QUOTE_NONE) - make_naive_bayes(dataset) + make_naive_bayes(data) print('#') print('# ending naive bayes') \ No newline at end of file diff --git a/README.md b/README.md index d21944b..733816d 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,43 @@ # Anne's Bachelor Thesis +State: October 2018 (in progress) My python classes for text mining, machine learning models, … +The scripts can be called separately. + +Best F1 score results were: + +SVM +--- +F1 score: 0.8944166649330559 +best parameters set found on development set: +{'SVC__C': 0.1, 'SVC__gamma': 0.01, 'SVC__kernel': 'linear', 'perc__percentile': 50} + +Naive Bayes +----------- +parameters: SelectPercentile(25), own BOW implementation, 10-fold cross validation +F1 score: min = 0.7586206896551724, max = 0.8846153846153846, average = 0.8324014738144634 + +The complete documentation can be found in the latex document in the thesis folder. + +The csv file 'classification_labelled_corrected.csv' contains 1497 labeled news articles from Reuters.com and is used for the machine learning models. + +Note: +Please enter a valid webhose personal key before you call 'Requester.py'. +Also, please change the path to your JAVAHOME environment variable in 'NER.find_companies' method. + +example: +# set paths +java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181" +os.environ['JAVAHOME'] = java_path + ## Requirements pandas==0.20.1 - nltk==3.2.5 - webhoseio==0.5 - numpy==1.14.0 - graphviz==0.9 - scikit_learn==0.19.2 ## Installation under Windows diff --git a/SVM.py b/SVM.py index ff62be5..c196d03 100644 --- a/SVM.py +++ b/SVM.py @@ -13,8 +13,10 @@ to belong to a category based on which side of the gap they fall. ''' from BagOfWords import BagOfWords -from CsvHandler import CsvHandler +import csv + +import pandas as pd from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_selection import SelectPercentile from sklearn.metrics import f1_score, make_scorer @@ -95,8 +97,13 @@ class SVM: print('# reading dataset') print('# ...') - dataset = CsvHandler.read_csv(file) + data = pd.read_csv(file, + sep='|', + engine='python', + decimal='.', + quotechar='\'', + quoting=csv.QUOTE_NONE) - make_svm(dataset) + make_svm(data) print('# ending svm') \ No newline at end of file