removed csvHandler.py
This commit is contained in:
parent
b14798242f
commit
c85ce71e24
|
@ -1,54 +0,0 @@
|
|||
'''
|
||||
Csv Handler
|
||||
===========
|
||||
|
||||
CsvHandler writes articles' information to csv file and reads it.
|
||||
'''
|
||||
|
||||
import csv
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
class CsvHandler:
    '''Collection of static helpers that read, write and sample
    article DataFrames stored as '|'-separated csv files.
    '''

    @staticmethod
    def read_csv(csv_file, usecols=None):
        '''reads a '|'-separated csv file into a DataFrame.
        params: csv_file path or file-like object to read from,
                usecols  optional subset of columns (passed on to pandas)
        returns DataFrame, the first line of the file is used as header
        '''
        # quoting is switched off (QUOTE_NONE), so quote characters in
        # the file are kept as ordinary text
        return pd.read_csv(csv_file,
                           sep='|',
                           header=0,
                           engine='python',
                           usecols=usecols,
                           decimal='.',
                           quotechar='\'',
                           quoting=csv.QUOTE_NONE)

    @staticmethod
    def write_csv(df, file_name):
        '''writes df to file_name using '|' as separator
        and prints how many rows were saved.
        '''
        df.to_csv(file_name,
                  sep='|')
        print('# saved {} article(s) in {}'.format(len(df), file_name))

    @staticmethod
    def select_randoms(df, n):
        '''selects n random samples from dataset.
        params: df DataFrame to select items from,
                n  number of items to select randomly,
        returns new DataFrame with only selected items and an additional
        column 'Random' holding the random sort key.
        The passed DataFrame itself is left untouched.
        '''
        # private generator with fixed seed => reproducible sequence.
        # RandomState(5) yields the same numbers as np.random.seed(5)
        # followed by np.random.randn, but does not reset the global
        # numpy random state of the caller.
        rng = np.random.RandomState(5)
        # standard normally distributed sort key for every sample;
        # assign() works on a copy, so df does not get a new column
        keyed = df.assign(Random=rng.randn(len(df)))
        # sort by the random key and return the first n elements
        return keyed.sort_values('Random').iloc[0:n]
|
||||
|
||||
if __name__ == '__main__':
    # draw 10 random articles from the labelled dataset
    # and store them in a separate csv file
    labelled = CsvHandler.read_csv('classification_labelled_corrected.csv')
    samples = CsvHandler.select_randoms(labelled, 10)
    CsvHandler.write_csv(samples, 'samples_10.csv')
|
|
@ -7,13 +7,14 @@ array X of size [n_samples, n_features], holding the training samples,
|
|||
and array y of integer values, size [n_samples],
|
||||
holding the class labels for the training samples.
|
||||
'''
|
||||
import operator
|
||||
|
||||
from BagOfWords import BagOfWords
|
||||
from CsvHandler import CsvHandler
|
||||
|
||||
import csv
|
||||
import operator
|
||||
|
||||
import graphviz
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn import tree
|
||||
#from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_selection import SelectPercentile
|
||||
|
@ -116,8 +117,13 @@ class DecisionTree:
|
|||
print('# reading dataset')
|
||||
print('# ...')
|
||||
|
||||
dataset = CsvHandler.read_csv(file)
|
||||
data = pd.read_csv(file,
|
||||
sep='|',
|
||||
engine='python',
|
||||
decimal='.',
|
||||
quotechar='\'',
|
||||
quoting=csv.QUOTE_NONE)
|
||||
|
||||
make_tree(dataset)
|
||||
make_tree(data)
|
||||
|
||||
print('# ending decision tree')
|
|
@ -0,0 +1,129 @@
|
|||
'''
|
||||
JSON Handler
|
||||
============
|
||||
|
||||
JSON Handler reads articles from JSON files,
|
||||
extracts relevant information and
|
||||
writes it to a csv file.
|
||||
'''
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import csv
|
||||
import glob
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
class JsonHandler:
    '''Collection of static helpers that filter news articles stored
    as JSON files and export the relevant fields to a csv file.
    '''

    # default glob pattern of the JSON files to read
    DEFAULT_JSON_PATH = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'

    # csv column names, in the order the article fields are written
    HEADER = ('Uuid',         #0
              'Title',        #1
              'Text',         #2
              'Site',         #3
              'SiteSection',  #4
              'Url',          #5
              'Timestamp')    #6

    # reliable sources (site_sections)
    SITE_SECTIONS = ('http://feeds.reuters.com/reuters/financialsNews',
                     'http://feeds.reuters.com/reuters/INbusinessNews',
                     'http://feeds.reuters.com/reuters/businessNews',
                     'http://feeds.reuters.com/reuters/companyNews',
                     'http://www.reuters.com/finance/deals',
                     'http://feeds.reuters.com/reuters/mergersNews',
                     'http://rss.cnn.com/rss/money_topstories.rss',
                     'http://rss.cnn.com/rss/money_latest.rss',
                     'http://www.economist.com/sections/business-finance/rss.xml',
                     'http://rss.cnn.com/rss/edition_business.rss',
                     'http://in.reuters.com/finance/deals',
                     'http://feeds.reuters.com/reuters/technologyNews',
                     'http://feeds.reuters.com/reuters/technologysectorNews',
                     'https://www.ft.com/companies/us',
                     'http://feeds.reuters.com/reuters/UKScienceNews',
                     'http://in.reuters.com/news/technology',
                     'http://in.reuters.com/finance/economy',
                     'https://www.bloomberg.com/middleeast',
                     'http://in.reuters.com/news/top-news')

    @staticmethod
    def select_randoms(df, n):
        '''selects n random samples from dataset.
        params: df DataFrame to select items from,
                n  number of items to select randomly,
        returns new DataFrame with only selected items and an additional
        column 'Random' holding the random sort key.
        The passed DataFrame itself is left untouched.
        '''
        # private generator with fixed seed => reproducible sequence.
        # RandomState(5) yields the same numbers as np.random.seed(5)
        # followed by np.random.randn, but does not reset the global
        # numpy random state of the caller.
        rng = np.random.RandomState(5)
        # assign() works on a copy, so df does not get a new column
        keyed = df.assign(Random=rng.randn(len(df)))
        # sort by the random key and return the first n elements
        return keyed.sort_values('Random').iloc[0:n]

    @staticmethod
    def create_csv(file_name):
        '''creates (or overwrites) the csv file file_name
        and writes the header row with the column names.
        Each following row is meant to contain one news article.
        '''
        with open(file_name, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # write header / column names
            writer.writerow(list(JsonHandler.HEADER))

    @staticmethod
    def write_articles_to_csv(file_name, path=None):
        '''appends all relevant articles found in JSON files to file_name.
        params: file_name csv file to append to (see create_csv),
                path      glob pattern of the JSON files to read,
                          defaults to DEFAULT_JSON_PATH
        An article is skipped if it is a comment or post
        (ord_in_thread != 0), is not english, has a spam score above 0.3
        or does not come from one of the SITE_SECTIONS.
        Prints the number of saved articles.
        '''
        if path is None:
            path = JsonHandler.DEFAULT_JSON_PATH
        files = glob.glob(path)

        # article counter
        saved = 0
        with open(file_name, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter='|',
                                quotechar='\'',
                                quoting=csv.QUOTE_NONNUMERIC)
            # read every JSON file matching the pattern
            for file in files:
                with open(file, encoding='utf-8') as f:
                    # JSON is converted to a python dictionary
                    post = json.load(f)

                # leave out comments or posts, take only reliable sources
                if ((post['ord_in_thread'] != 0) or
                    (post['language'] != 'english') or
                    (post['thread']['spam_score'] > 0.3) or
                    (post['thread']['site_section']
                     not in JsonHandler.SITE_SECTIONS)):
                    continue

                thread = post['thread']
                # pick only relevant information of article
                # and put it in a list
                article = [thread['uuid'],          # 0:'Uuid'
                           thread['title'],         # 1:'Title'
                           post['text'],            # 2:'Text'
                           thread['site'],          # 3:'Site'
                           thread['site_section'],  # 4:'SiteSection'
                           post['url'],             # 5:'Url'
                           post['published']]       # 6:'Timestamp'

                # remove delimiter char in 'Title'
                article[1] = article[1].replace('|', '-')
                # remove newlines and delimiter char in 'Text'
                article[2] = (article[2].replace('\n', ' ')
                                        .replace('\r', ' ')
                                        .replace('|', '-'))

                try:
                    writer.writerow(article)
                    saved += 1
                # the csv file is opened with the platform's default
                # encoding; articles with characters it cannot encode
                # (videos and other spam) are deliberately dropped
                except UnicodeEncodeError:
                    print('# filtered out site_section: {} (UnicodeEncodeError)'
                          .format(thread['site_section']))
        print()
        print('# saved {} articles in file {}'.format(saved, file_name))
|
||||
|
||||
if __name__ == '__main__':
    # start a fresh csv file containing only the header row,
    # then append every article that passes the filters
    target = 'test.csv'
    JsonHandler.create_csv(target)
    JsonHandler.write_articles_to_csv(target)
|
|
@ -13,8 +13,10 @@ regardless of any possible correlations between these features.
|
|||
'''
|
||||
|
||||
from BagOfWords import BagOfWords
|
||||
from CsvHandler import CsvHandler
|
||||
|
||||
import csv
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_selection import SelectPercentile
|
||||
from sklearn.metrics import recall_score, precision_score
|
||||
|
@ -59,7 +61,7 @@ class NaiveBayes:
|
|||
n += 1
|
||||
print('# split no. ' + str(n))
|
||||
|
||||
# # eigenes BOW => schlechtere ergebnisse
|
||||
# # eigenes BOW
|
||||
# vocab = BagOfWords.make_vocab(X[train])
|
||||
# # fit the training data and then return the matrix
|
||||
# training_data = BagOfWords.make_matrix(X[train], vocab)
|
||||
|
@ -72,26 +74,18 @@ class NaiveBayes:
|
|||
# transform testing data and return the matrix
|
||||
testing_data = cv.transform(X[test]).toarray()
|
||||
|
||||
# # apply select percentile
|
||||
# selector = SelectPercentile(percentile=25)
|
||||
# selector.fit(training_data, y[train])
|
||||
|
||||
##DORIS: WIRD SELECT PERCENTILE IN DEINE ARBEIT MIT NB EINBEZOGEN?
|
||||
# apply select percentile
|
||||
selector = SelectPercentile(percentile=100)
|
||||
selector.fit(training_data, y[train])
|
||||
|
||||
# training_data_r = selector.transform(training_data)
|
||||
# testing_data_r = selector.transform(testing_data)
|
||||
|
||||
# #fit classifier
|
||||
# classifier.fit(training_data_r, y[train])
|
||||
# #predict class
|
||||
# predictions_train = classifier.predict(training_data_r)
|
||||
# predictions_test = classifier.predict(testing_data_r)
|
||||
training_data_r = selector.transform(training_data)
|
||||
testing_data_r = selector.transform(testing_data)
|
||||
|
||||
#fit classifier
|
||||
classifier.fit(training_data, y[train])
|
||||
classifier.fit(training_data_r, y[train])
|
||||
#predict class
|
||||
predictions_train = classifier.predict(training_data)
|
||||
predictions_test = classifier.predict(testing_data)
|
||||
predictions_train = classifier.predict(training_data_r)
|
||||
predictions_test = classifier.predict(testing_data_r)
|
||||
|
||||
#print and store metrics
|
||||
rec = recall_score(y[test], predictions_test)
|
||||
|
@ -189,12 +183,15 @@ class NaiveBayes:
|
|||
# read csv file
|
||||
print('# reading dataset')
|
||||
print('# ...')
|
||||
|
||||
## DORIS: ICH VERSTEHE NICHT, WARUM DU HIER EINE EXTRA FUNKTION SCHREIBST, PD.READ_CSV MÜSSTE DOCH AUCH SO GEHEN?
|
||||
## KOMMT VIELLEICHT NOCH, VIELLEICHT BIN ICH ZU VORSCHNELL
|
||||
dataset = CsvHandler.read_csv(file)
|
||||
|
||||
make_naive_bayes(dataset)
|
||||
data = pd.read_csv(file,
|
||||
sep='|',
|
||||
engine='python',
|
||||
decimal='.',
|
||||
quotechar='\'',
|
||||
quoting=csv.QUOTE_NONE)
|
||||
|
||||
make_naive_bayes(data)
|
||||
|
||||
print('#')
|
||||
print('# ending naive bayes')
|
|
@ -3,23 +3,21 @@ Naive Bayes Classifier
|
|||
======================
|
||||
|
||||
basic implementation of naive bayes.
|
||||
prints out probabilities for classes.
|
||||
needed for interactive labeling.
|
||||
prints out probabilities for classes needed for interactive labeling.
|
||||
'''
|
||||
|
||||
from CsvHandler import CsvHandler
|
||||
import csv
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
|
||||
from sklearn.metrics import recall_score, precision_score
|
||||
from sklearn.model_selection import KFold
|
||||
from sklearn.model_selection import StratifiedKFold
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
|
||||
class NaiveBayes_simple:
|
||||
class NaiveBayes_Interactive:
|
||||
|
||||
def make_naive_bayes(dataset):
|
||||
'''fits naive bayes model with StratifiedKFold,
|
||||
uses my BOW
|
||||
'''fits naive bayes model
|
||||
'''
|
||||
print('# fitting model')
|
||||
print('# ...')
|
||||
|
@ -31,9 +29,8 @@ class NaiveBayes_simple:
|
|||
|
||||
cv = CountVectorizer()
|
||||
|
||||
##DORIS: DU BRAUCHST IMMER EINEN STRATIFIED SPLIT, WEIL DIEN DATASET UNBALANCED IST
|
||||
# k-fold cross-validation as split method
|
||||
kf = KFold(n_splits=10, shuffle=True, random_state=5)
|
||||
# stratified k-fold cross-validation as split method
|
||||
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)
|
||||
|
||||
classifier = GaussianNB()
|
||||
|
||||
|
@ -163,9 +160,14 @@ class NaiveBayes_simple:
|
|||
print('# reading dataset')
|
||||
print('# ...')
|
||||
|
||||
dataset = CsvHandler.read_csv(file)
|
||||
data = pd.read_csv(file,
|
||||
sep='|',
|
||||
engine='python',
|
||||
decimal='.',
|
||||
quotechar='\'',
|
||||
quoting=csv.QUOTE_NONE)
|
||||
|
||||
make_naive_bayes(dataset)
|
||||
make_naive_bayes(data)
|
||||
|
||||
print('#')
|
||||
print('# ending naive bayes')
|
34
README.md
34
README.md
|
@ -1,19 +1,43 @@
|
|||
# Anne's Bachelor Thesis
|
||||
State: October 2018 (in progress)
|
||||
|
||||
My Python classes for text mining, machine learning models, …
|
||||
The scripts can be called separately.
|
||||
|
||||
Best F1 score results were:
|
||||
|
||||
SVM
|
||||
---
|
||||
F1 score: 0.8944166649330559
|
||||
best parameters set found on development set:
|
||||
{'SVC__C': 0.1, 'SVC__gamma': 0.01, 'SVC__kernel': 'linear', 'perc__percentile': 50}
|
||||
|
||||
Naive Bayes
|
||||
-----------
|
||||
parameters: SelectPercentile(25), own BOW implementation, 10-fold cross validation
|
||||
F1 score: min = 0.7586206896551724, max = 0.8846153846153846, average = 0.8324014738144634
|
||||
|
||||
The complete documentation can be found in the latex document in the thesis folder.
|
||||
|
||||
The csv file 'classification_labelled_corrected.csv' contains 1497 labeled news articles from Reuters.com and is used for the machine learning models.
|
||||
|
||||
Note:
|
||||
Please enter a valid webhose personal key before you call 'Requester.py'.
|
||||
Also, please change the path to your JAVAHOME environment variable in the 'NER.find_companies' method.
|
||||
|
||||
example:
|
||||
# set paths
|
||||
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
|
||||
os.environ['JAVAHOME'] = java_path
|
||||
|
||||
|
||||
## Requirements
|
||||
|
||||
pandas==0.20.1
|
||||
|
||||
nltk==3.2.5
|
||||
|
||||
webhoseio==0.5
|
||||
|
||||
numpy==1.14.0
|
||||
|
||||
graphviz==0.9
|
||||
|
||||
scikit_learn==0.19.2
|
||||
|
||||
## Installation under Windows
|
||||
|
|
13
SVM.py
13
SVM.py
|
@ -13,8 +13,10 @@ to belong to a category based on which side of the gap they fall.
|
|||
'''
|
||||
|
||||
from BagOfWords import BagOfWords
|
||||
from CsvHandler import CsvHandler
|
||||
|
||||
import csv
|
||||
|
||||
import pandas as pd
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.feature_selection import SelectPercentile
|
||||
from sklearn.metrics import f1_score, make_scorer
|
||||
|
@ -95,8 +97,13 @@ class SVM:
|
|||
print('# reading dataset')
|
||||
print('# ...')
|
||||
|
||||
dataset = CsvHandler.read_csv(file)
|
||||
data = pd.read_csv(file,
|
||||
sep='|',
|
||||
engine='python',
|
||||
decimal='.',
|
||||
quotechar='\'',
|
||||
quoting=csv.QUOTE_NONE)
|
||||
|
||||
make_svm(dataset)
|
||||
make_svm(data)
|
||||
|
||||
print('# ending svm')
|
Loading…
Reference in New Issue