removed csvHandler.py

This commit is contained in:
Anne Lorenz 2018-10-18 13:57:46 +02:00
parent b14798242f
commit c85ce71e24
7 changed files with 214 additions and 103 deletions


@@ -1,54 +0,0 @@
'''
Csv Handler
===========

CsvHandler writes article information to a csv file and reads it back.
'''
import csv

import numpy as np
import pandas as pd

class CsvHandler:

    def read_csv(csv_file, usecols=None):
        df = pd.read_csv(csv_file,
                         sep='|',
                         header=0,
                         engine='python',
                         usecols=usecols,
                         decimal='.',
                         quotechar='\'',
                         #nrows = 200,
                         quoting=csv.QUOTE_NONE)
        return df

    def write_csv(df, file_name):
        df.to_csv(file_name,
                  sep='|')
        print('# saved {} article(s) in {}'.format(len(df), file_name))

    def select_randoms(df, n):
        '''selects n random samples from dataset.
        params: df DataFrame to select items from,
                n number of items to select randomly,
        returns new DataFrame with only the selected items
        '''
        # new empty DataFrame
        # df_samples = pd.DataFrame(columns=['rands','title','text','label'])
        # initialize random => reproducible sequence
        np.random.seed(5)
        # pseudorandom float (standard normal) for every sample
        # add new column 'Random'
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by random numbers
        df = df.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return df.iloc[0:n]

if __name__ == '__main__':
    df = CsvHandler.read_csv('classification_labelled_corrected.csv')
    df_new = CsvHandler.select_randoms(df, 10)
    CsvHandler.write_csv(df_new, 'samples_10.csv')
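
With CsvHandler gone, each script now reads the corpus inline. A minimal sketch of the replacement pattern used throughout this commit, assuming the same '|'-separated format the removed class handled:

import csv

import pandas as pd

# read the '|'-separated corpus the way the updated scripts do
df = pd.read_csv('classification_labelled_corrected.csv',
                 sep='|',
                 engine='python',
                 decimal='.',
                 quotechar='\'',
                 quoting=csv.QUOTE_NONE)

# write a subset back with the same delimiter
df.head(10).to_csv('samples_10.csv', sep='|')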


@@ -7,13 +7,14 @@ array X of size [n_samples, n_features], holding the training samples,
and array y of integer values, size [n_samples],
holding the class labels for the training samples.
'''
import operator
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
import csv
import operator
import graphviz
import numpy as np
import pandas as pd
from sklearn import tree
#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
@@ -116,8 +117,13 @@ class DecisionTree:
    print('# reading dataset')
    print('# ...')

    dataset = CsvHandler.read_csv(file)
    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)

    make_tree(dataset)
    make_tree(data)
    print('# ending decision tree')
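
The file keeps its sklearn tree and graphviz imports. As a rough sketch of how such a tree is typically visualized (toy data, not the repo's make_tree pipeline):

import graphviz
from sklearn import tree

# toy data, purely illustrative: four samples with three numeric features
X = [[0, 1, 0], [1, 0, 1], [0, 0, 1], [1, 1, 0]]
y = [0, 1, 1, 0]

classifier = tree.DecisionTreeClassifier()
classifier.fit(X, y)

# export the fitted tree as DOT source and render it to 'tree.pdf'
dot_data = tree.export_graphviz(classifier, out_file=None)
graphviz.Source(dot_data).render('tree')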

JSONHandler.py (new file)

@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
'''
JSON Handler
============

JSON Handler reads articles from JSON files,
extracts relevant information and
writes it to a csv file.
'''
import csv
import glob
import json

import numpy as np
import pandas as pd

class JsonHandler:

    def select_randoms(df, n):
        '''selects n random samples from dataset.
        params: df DataFrame to select items from,
                n number of items to select randomly,
        returns new DataFrame with only the selected items
        '''
        # initialize random => reproducible sequence
        np.random.seed(5)
        # add new column 'Random'
        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
        # sort DataFrame by random numbers
        df = df.sort_values('Random')
        # return first n elements of randomly sorted dataset
        return df.iloc[0:n]

    def create_csv(file_name):
        # create a new csv file for each month.
        # each row contains a news article.
        with open(file_name, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # write header / column names
            writer.writerow(['Uuid',        #0
                             'Title',       #1
                             'Text',        #2
                             'Site',        #3
                             'SiteSection', #4
                             'Url',         #5
                             'Timestamp'])  #6

    def write_articles_to_csv(file_name):
        # path of JSON files
        path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'
        files = glob.glob(path)

        # reliable sources (site_sections)
        site_sections = ['http://feeds.reuters.com/reuters/financialsNews',
                         'http://feeds.reuters.com/reuters/INbusinessNews',
                         'http://feeds.reuters.com/reuters/businessNews',
                         'http://feeds.reuters.com/reuters/companyNews',
                         'http://www.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/mergersNews',
                         'http://rss.cnn.com/rss/money_topstories.rss',
                         'http://rss.cnn.com/rss/money_latest.rss',
                         'http://www.economist.com/sections/business-finance/rss.xml',
                         'http://rss.cnn.com/rss/edition_business.rss',
                         'http://in.reuters.com/finance/deals',
                         'http://feeds.reuters.com/reuters/technologyNews',
                         'http://feeds.reuters.com/reuters/technologysectorNews',
                         'https://www.ft.com/companies/us',
                         'http://feeds.reuters.com/reuters/UKScienceNews',
                         'http://in.reuters.com/news/technology',
                         'http://in.reuters.com/finance/economy',
                         'https://www.bloomberg.com/middleeast',
                         'http://in.reuters.com/news/top-news']

        # file counter
        n = 0
        # article counter
        a = 0

        # read every JSON file in current folder
        with open(file_name, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            for file in files:
                n += 1
                with open(file, encoding='utf-8') as f:
                    # JSON is converted to a dict
                    dict = json.load(f)
                    #print(n)
                    # leave out comments or posts, take only reuters as source
                    if ((dict['ord_in_thread'] != 0) or
                            (dict['language'] != 'english') or
                            (dict['thread']['spam_score'] > 0.3) or
                            (dict['thread']['site_section'] not in site_sections)):
                        continue
                    # pick only the relevant information of the article
                    # and put it in a list
                    article = [dict['thread']['uuid'],         # 0:'Uuid'
                               dict['thread']['title'],        # 1:'Title'
                               dict['text'],                   # 2:'Text'
                               dict['thread']['site'],         # 3:'Site'
                               dict['thread']['site_section'], # 4:'SiteSection'
                               dict['url'],                    # 5:'Url'
                               dict['published']]              # 6:'Timestamp'
                    # remove newlines and delimiter char
                    article[1] = article[1].replace('|', '-') # in 'Title'
                    article[2] = article[2].replace('\n', ' ').replace('\r', ' ').replace('|', '-') # in 'Text'
                    try:
                        writer.writerow(article)
                        a += 1
                    # handle undefined characters (videos and other spam)
                    except UnicodeEncodeError:
                        print('# filtered out site_section: {} (UnicodeEncodeError)'
                              .format(dict['thread']['site_section']))
        print()
        print('# saved {} articles in file {}'.format(a, file_name))

if __name__ == '__main__':
    file_name = 'test.csv'
    JsonHandler.create_csv(file_name)
    JsonHandler.write_articles_to_csv(file_name)
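
The select_randoms method is carried over verbatim from the deleted CsvHandler. For comparison, pandas covers the same intent with a one-liner; a small sketch (the drawn rows differ from the randn-sort approach, since the random numbers are used differently):

import pandas as pd

df = pd.DataFrame({'Title': ['a', 'b', 'c', 'd'],
                   'Text': ['w', 'x', 'y', 'z']})
# reproducibly draw 2 random rows
sample = df.sample(n=2, random_state=5)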


@@ -13,8 +13,10 @@ regardless of any possible correlations between these features.
'''
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import recall_score, precision_score
@@ -59,7 +61,7 @@ class NaiveBayes:
            n += 1
            print('# split no. ' + str(n))

            # # own BOW => worse results
            # # own BOW
            # vocab = BagOfWords.make_vocab(X[train])
            # # fit the training data and then return the matrix
            # training_data = BagOfWords.make_matrix(X[train], vocab)
@@ -72,26 +74,18 @@ class NaiveBayes:
            # transform testing data and return the matrix
            testing_data = cv.transform(X[test]).toarray()

            # # apply select percentile
            # selector = SelectPercentile(percentile=25)
            # selector.fit(training_data, y[train])

            ##DORIS: IS SELECT PERCENTILE INCLUDED IN YOUR WORK WITH NB?
            # apply select percentile
            selector = SelectPercentile(percentile=100)
            selector.fit(training_data, y[train])

            # training_data_r = selector.transform(training_data)
            # testing_data_r = selector.transform(testing_data)

            # #fit classifier
            # classifier.fit(training_data_r, y[train])
            # #predict class
            # predictions_train = classifier.predict(training_data_r)
            # predictions_test = classifier.predict(testing_data_r)

            training_data_r = selector.transform(training_data)
            testing_data_r = selector.transform(testing_data)

            # fit classifier
            classifier.fit(training_data, y[train])
            classifier.fit(training_data_r, y[train])
            # predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)
            predictions_train = classifier.predict(training_data_r)
            predictions_test = classifier.predict(testing_data_r)

            # print and store metrics
            rec = recall_score(y[test], predictions_test)
@@ -189,12 +183,15 @@ class NaiveBayes:
    # read csv file
    print('# reading dataset')
    print('# ...')

    ## DORIS: I DON'T UNDERSTAND WHY YOU WRITE AN EXTRA FUNCTION HERE, PD.READ_CSV SHOULD WORK ON ITS OWN?
    ## MAYBE THAT IS STILL TO COME, MAYBE I AM TOO HASTY
    dataset = CsvHandler.read_csv(file)
    make_naive_bayes(dataset)
    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)
    make_naive_bayes(data)

    print('#')
    print('# ending naive bayes')
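
Note that SelectPercentile(percentile=100) keeps every feature, so the newly activated selector is effectively a no-op until the percentile is lowered. A self-contained sketch of the fit/transform contract on toy data, assuming the default f_classif scorer:

import numpy as np
from sklearn.feature_selection import SelectPercentile

# toy data: six samples, four features
X = np.array([[1, 0, 3, 1],
              [2, 1, 2, 0],
              [1, 0, 3, 1],
              [8, 1, 0, 0],
              [9, 0, 1, 1],
              [8, 1, 0, 0]])
y = np.array([0, 0, 0, 1, 1, 1])

# percentile=100 keeps all features; e.g. percentile=25 would keep only
# the top-scoring quarter of them
selector = SelectPercentile(percentile=100)
selector.fit(X, y)
print(selector.transform(X).shape)  # (6, 4)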


@@ -3,23 +3,21 @@ Naive Bayes Classifier
======================

basic implementation of naive bayes.
prints out probabilities for classes.
needed for interactive labeling.
prints out probabilities for classes needed for interactive labeling.
'''
from CsvHandler import CsvHandler
import csv

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB

class NaiveBayes_simple:
class NaiveBayes_Interactive:

    def make_naive_bayes(dataset):
        '''fits naive bayes model with StratifiedKFold,
        uses my BOW
        '''fits naive bayes model
        '''
        print('# fitting model')
        print('# ...')
@@ -31,9 +29,8 @@ class NaiveBayes_simple:
        cv = CountVectorizer()

        ##DORIS: YOU ALWAYS NEED A STRATIFIED SPLIT BECAUSE YOUR DATASET IS UNBALANCED
        # k-fold cross-validation as split method
        kf = KFold(n_splits=10, shuffle=True, random_state=5)
        # stratified k-fold cross-validation as split method
        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)

        classifier = GaussianNB()
@@ -163,9 +160,14 @@ class NaiveBayes_simple:
    print('# reading dataset')
    print('# ...')

    dataset = CsvHandler.read_csv(file)
    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)

    make_naive_bayes(dataset)
    make_naive_bayes(data)

    print('#')
    print('# ending naive bayes')
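
The switch from KFold to StratifiedKFold addresses the reviewer comment above: on an unbalanced dataset, plain KFold can produce folds with hardly any positive samples. A small sketch of the difference, with made-up labels:

import numpy as np
from sklearn.model_selection import StratifiedKFold

# toy unbalanced labels: eight negatives, two positives
X = np.zeros((10, 1))
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])

kf = StratifiedKFold(n_splits=2, shuffle=True, random_state=5)
for train, test in kf.split(X, y):
    # every fold keeps the 4:1 class ratio; a plain KFold could put
    # both positives into the same fold
    print(np.bincount(y[test]))  # -> [4 1]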


@@ -1,19 +1,43 @@
# Anne's Bachelor Thesis
State: October 2018 (in progress)
My Python classes for text mining, machine learning models, …
The scripts can be called separately.
Best F1 score results were:
SVM
---
F1 score: 0.8944166649330559
best parameters set found on development set:
{'SVC__C': 0.1, 'SVC__gamma': 0.01, 'SVC__kernel': 'linear', 'perc__percentile': 50}
Naive Bayes
-----------
parameters: SelectPercentile(25), own BOW implementation, 10-fold cross validation
F1 score: min = 0.7586206896551724, max = 0.8846153846153846, average = 0.8324014738144634
The complete documentation can be found in the LaTeX document in the thesis folder.
The csv file 'classification_labelled_corrected.csv' contains 1497 labeled news articles from Reuters.com and is used for the machine learning models.
Note:
Please enter a valid webhose personal key before you call 'Requester.py'.
Also, please change the path assigned to the JAVAHOME environment variable in the 'NER.find_companies' method.
example:
# set paths
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
os.environ['JAVAHOME'] = java_path
## Requirements
pandas==0.20.1
nltk==3.2.5
webhoseio==0.5
numpy==1.14.0
graphviz==0.9
scikit_learn==0.19.2
## Installation under Windows

SVM.py

@@ -13,8 +13,10 @@ to belong to a category based on which side of the gap they fall.
'''
from BagOfWords import BagOfWords
from CsvHandler import CsvHandler
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
@@ -95,8 +97,13 @@ class SVM:
    print('# reading dataset')
    print('# ...')

    dataset = CsvHandler.read_csv(file)
    data = pd.read_csv(file,
                       sep='|',
                       engine='python',
                       decimal='.',
                       quotechar='\'',
                       quoting=csv.QUOTE_NONE)

    make_svm(dataset)
    make_svm(data)
    print('# ending svm')
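
The README's best parameter set ({'SVC__C': 0.1, 'SVC__gamma': 0.01, 'SVC__kernel': 'linear', 'perc__percentile': 50}) implies a Pipeline with steps named 'perc' and 'SVC' tuned by grid search. A sketch of such a setup on toy data; the step names come from the parameter keys, while the grid values and the data are assumptions:

import numpy as np
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

pipeline = Pipeline([('perc', SelectPercentile()), ('SVC', SVC())])
grid = GridSearchCV(pipeline,
                    param_grid={'SVC__C': [0.1, 1],
                                'SVC__gamma': [0.01, 0.1],
                                'SVC__kernel': ['linear'],
                                'perc__percentile': [25, 50]},
                    scoring=make_scorer(f1_score))

# toy numeric data, purely illustrative
X = np.random.RandomState(5).rand(20, 10)
y = np.array([0, 1] * 10)
grid.fit(X, y)
print(grid.best_params_)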