removed csvHandler.py

This commit is contained in:
parent b14798242f
commit c85ce71e24
@@ -1,54 +0,0 @@
-'''
-Csv Handler
-===========
-
-CsvHandler writes articles' information to csv file and reads it.
-'''
-
-import csv
-
-import numpy as np
-import pandas as pd
-
-class CsvHandler:
-
-    def read_csv(csv_file, usecols=None):
-        df = pd.read_csv(csv_file,
-                         sep='|',
-                         header=0,
-                         engine='python',
-                         usecols=usecols,
-                         decimal='.',
-                         quotechar='\'',
-                         #nrows = 200,
-                         quoting=csv.QUOTE_NONE)
-        return df
-
-    def write_csv(df, file_name):
-        df.to_csv(file_name,
-                  sep='|')
-        print('# saved {} article(s) in {}'.format(len(df), file_name))
-
-    def select_randoms(df, n):
-        '''selects n random samples from dataset.
-        params: df DataFrame to select items from,
-                n number of items to select randomly,
-        returns new DataFrame with only selected items
-        '''
-        # new empty DataFrame
-        # df_samples = pd.DataFrame(columns=['rands','title','text','label'])
-        # initialize random => reproducible sequence
-        np.random.seed(5)
-        # pseudorandom float from a standard normal for every sample
-        # pd.Series()
-        # add new column 'Random'
-        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
-        # sort DataFrame by random numbers
-        df = df.sort_values('Random')
-        # return first n elements of randomly sorted dataset
-        return df.iloc[0:n]
-
-if __name__ == '__main__':
-    df = CsvHandler.read_csv('classification_labelled_corrected.csv')
-    df_new = CsvHandler.select_randoms(df, 10)
-    CsvHandler.write_csv(df_new, 'samples_10.csv')
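The files below inline this read as a plain pandas call. A minimal sketch of the replacement pattern, using the dataset file named above:

import csv

import pandas as pd

# same parameters the commit inlines into DecisionTree, NaiveBayes and SVM
data = pd.read_csv('classification_labelled_corrected.csv',
                   sep='|',
                   engine='python',
                   decimal='.',
                   quotechar='\'',
                   quoting=csv.QUOTE_NONE)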
@@ -7,13 +7,14 @@ array X of size [n_samples, n_features], holding the training samples,
 and array y of integer values, size [n_samples],
 holding the class labels for the training samples.
 '''
-import operator
 
 from BagOfWords import BagOfWords
-from CsvHandler import CsvHandler
+import csv
+import operator
 
 import graphviz
 import numpy as np
+import pandas as pd
 from sklearn import tree
 #from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
@@ -116,8 +117,13 @@ class DecisionTree:
     print('# reading dataset')
     print('# ...')
 
-    dataset = CsvHandler.read_csv(file)
+    data = pd.read_csv(file,
+                       sep='|',
+                       engine='python',
+                       decimal='.',
+                       quotechar='\'',
+                       quoting=csv.QUOTE_NONE)
 
-    make_tree(dataset)
+    make_tree(data)
 
     print('# ending decision tree')
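The inlined read relies on csv.QUOTE_NONE, which treats the quote character as ordinary data, so apostrophes in the '|'-separated article text survive parsing. A toy sketch with in-memory data (not from the repository):

import csv
import io

import pandas as pd

# with QUOTE_NONE the apostrophe is kept as part of the field
raw = io.StringIO("Title|Text\nAnne's thesis|some text\n")
df = pd.read_csv(raw, sep='|', engine='python', quoting=csv.QUOTE_NONE)
print(df['Title'][0])  # Anne's thesis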
@@ -0,0 +1,129 @@
+'''
+JSON Handler
+============
+
+JSON Handler reads articles from JSON files,
+extracts relevant information and
+writes it to a csv file.
+'''
+
+# -*- coding: utf-8 -*-
+
+import csv
+import glob
+import json
+
+import numpy as np
+import pandas as pd
+
+class JsonHandler:
+
+    def select_randoms(df, n):
+        '''selects n random samples from dataset.
+        params: df DataFrame to select items from,
+                n number of items to select randomly,
+        returns new DataFrame with only selected items
+        '''
+        # initialize random => reproducible sequence
+        np.random.seed(5)
+        # add new column 'Random'
+        df['Random'] = pd.Series(np.random.randn(len(df)), index=df.index)
+        # sort DataFrame by random numbers
+        df = df.sort_values('Random')
+        # return first n elements of randomly sorted dataset
+        return df.iloc[0:n]
+
+    def create_csv(file_name):
+        # create new csv file for each month.
+        # each row contains a news article.
+
+        with open(file_name, 'w', newline='') as csvfile:
+            writer = csv.writer(csvfile,
+                                delimiter='|',
+                                quotechar='\'',
+                                quoting=csv.QUOTE_NONNUMERIC)
+            # write header / column names
+            writer.writerow(['Uuid',        #0
+                             'Title',       #1
+                             'Text',        #2
+                             'Site',        #3
+                             'SiteSection', #4
+                             'Url',         #5
+                             'Timestamp']) #6
+
+    def write_articles_to_csv(file_name):
+        # path of JSON files
+        path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\01_Datensatz\\new_dataset\\2017_12_ccc517fd45024a87c12318299efc50a4\\news_[0-9]*.json'
+        files = glob.glob(path)
+
+        # reliable sources (site_sections)
+        site_sections = ['http://feeds.reuters.com/reuters/financialsNews',
+                         'http://feeds.reuters.com/reuters/INbusinessNews',
+                         'http://feeds.reuters.com/reuters/businessNews',
+                         'http://feeds.reuters.com/reuters/companyNews',
+                         'http://www.reuters.com/finance/deals',
+                         'http://feeds.reuters.com/reuters/mergersNews',
+                         'http://rss.cnn.com/rss/money_topstories.rss',
+                         'http://rss.cnn.com/rss/money_latest.rss',
+                         'http://www.economist.com/sections/business-finance/rss.xml',
+                         'http://rss.cnn.com/rss/edition_business.rss',
+                         'http://in.reuters.com/finance/deals',
+                         'http://feeds.reuters.com/reuters/technologyNews',
+                         'http://feeds.reuters.com/reuters/technologysectorNews',
+                         'https://www.ft.com/companies/us',
+                         'http://feeds.reuters.com/reuters/UKScienceNews',
+                         'http://in.reuters.com/news/technology',
+                         'http://in.reuters.com/finance/economy',
+                         'https://www.bloomberg.com/middleeast',
+                         'http://in.reuters.com/news/top-news']
+
+        # file counter
+        n = 0
+        # article counter
+        a = 0
+        # read every JSON file in current folder
+        with open(file_name, 'a', newline='') as csvfile:
+            writer = csv.writer(csvfile,
+                                delimiter='|',
+                                quotechar='\'',
+                                quoting=csv.QUOTE_NONNUMERIC)
+            for file in files:
+                n += 1
+                with open(file, encoding='utf-8') as f:
+                    # Json is converted to dict
+                    dict = json.load(f)
+                    #print(n)
+                    # leave out comments or posts, take only reuters as source
+                    if ((dict['ord_in_thread'] != 0) or
+                        (dict['language'] != 'english') or
+                        (dict['thread']['spam_score'] > 0.3) or
+                        (dict['thread']['site_section'] not in site_sections)):
+                        continue
+                    # pick only relevant information of article
+                    # and put it in a list
+                    article = [dict['thread']['uuid'],         # 0:'Uuid'
+                               dict['thread']['title'],        # 1:'Title'
+                               dict['text'],                   # 2:'Text'
+                               dict['thread']['site'],         # 3:'Site'
+                               dict['thread']['site_section'], # 4:'SiteSection'
+                               dict['url'],                    # 5:'Url'
+                               dict['published']]              # 6:'Timestamp'
+
+                    # remove newlines and delimiter char
+                    article[1] = article[1].replace('|', '-') # in 'Title'
+                    article[2] = article[2].replace('\n', ' ').replace('\r', ' ').replace('|', '-') # in 'Text'
+
+                    try:
+                        writer.writerow(article)
+                        a += 1
+                    # handle undefined characters (videos and other spam)
+                    except UnicodeEncodeError:
+                        print('# filtered out site_section: {} (UnicodeEncodeError)'
+                              .format(dict['thread']['site_section']))
+        print()
+        print('# saved {} articles in file {}'.format(a, file_name))
+
+if __name__ == '__main__':
+    file_name = 'test.csv'
+    JsonHandler.create_csv(file_name)
+    JsonHandler.write_articles_to_csv(file_name)
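select_randoms above seeds numpy, attaches a throwaway 'Random' column and sorts by it. For comparison, a minimal sketch of the same idea using pandas' built-in DataFrame.sample (present in the pinned pandas 0.20.1); the toy frame is hypothetical, and the drawn rows are reproducible but not identical to the seed-and-sort result:

import pandas as pd

# hypothetical stand-in for the article DataFrame
df = pd.DataFrame({'Title': ['a', 'b', 'c', 'd'],
                   'Text': ['w', 'x', 'y', 'z']})
# n random rows with a fixed seed, no helper column needed
samples = df.sample(n=2, random_state=5)
print(samples)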
@@ -13,8 +13,10 @@ regardless of any possible correlations between these features.
 '''
 
 from BagOfWords import BagOfWords
-from CsvHandler import CsvHandler
 
+import csv
+
+import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import recall_score, precision_score
@@ -59,7 +61,7 @@ class NaiveBayes:
         n += 1
         print('# split no. ' + str(n))
 
-        # # own BOW => worse results
+        # # own BOW
         # vocab = BagOfWords.make_vocab(X[train])
         # # fit the training data and then return the matrix
         # training_data = BagOfWords.make_matrix(X[train], vocab)
@@ -72,26 +74,18 @@ class NaiveBayes:
         # transform testing data and return the matrix
        testing_data = cv.transform(X[test]).toarray()
 
-        # # apply select percentile
-        # selector = SelectPercentile(percentile=25)
-        # selector.fit(training_data, y[train])
+        # apply select percentile
+        selector = SelectPercentile(percentile=100)
+        selector.fit(training_data, y[train])
 
-        ##DORIS: IS SELECT PERCENTILE INCLUDED IN YOUR WORK ON NB?
-
-        # training_data_r = selector.transform(training_data)
-        # testing_data_r = selector.transform(testing_data)
+        training_data_r = selector.transform(training_data)
+        testing_data_r = selector.transform(testing_data)
 
-        # #fit classifier
-        # classifier.fit(training_data_r, y[train])
-        # #predict class
-        # predictions_train = classifier.predict(training_data_r)
-        # predictions_test = classifier.predict(testing_data_r)
-
         #fit classifier
-        classifier.fit(training_data, y[train])
+        classifier.fit(training_data_r, y[train])
         #predict class
-        predictions_train = classifier.predict(training_data)
-        predictions_test = classifier.predict(testing_data)
+        predictions_train = classifier.predict(training_data_r)
+        predictions_test = classifier.predict(testing_data_r)
 
         #print and store metrics
         rec = recall_score(y[test], predictions_test)
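In the hunk above, SelectPercentile is enabled with percentile=100, which lets every feature through, so the selector only takes effect once the percentile is lowered. A toy sketch of what a lower setting does (example arrays, not repository data):

import numpy as np
from sklearn.feature_selection import SelectPercentile

X = np.array([[1, 2, 3],
              [2, 3, 3],
              [3, 4, 4],
              [4, 5, 5]])
y = np.array([0, 0, 1, 1])

# keep only the top-scoring half of the features (default score: f_classif)
selector = SelectPercentile(percentile=50)
selector.fit(X, y)
X_reduced = selector.transform(X)
print(X_reduced.shape)  # fewer columns than X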
@@ -189,12 +183,15 @@ class NaiveBayes:
     # read csv file
     print('# reading dataset')
     print('# ...')
 
-    ## DORIS: I DON'T UNDERSTAND WHY YOU WRITE AN EXTRA FUNCTION HERE, PD.READ_CSV SHOULD WORK ON ITS OWN?
-    ## MAYBE THAT IS STILL TO COME, MAYBE I AM BEING TOO HASTY
-    dataset = CsvHandler.read_csv(file)
+    data = pd.read_csv(file,
+                       sep='|',
+                       engine='python',
+                       decimal='.',
+                       quotechar='\'',
+                       quoting=csv.QUOTE_NONE)
 
-    make_naive_bayes(dataset)
+    make_naive_bayes(data)
 
     print('#')
     print('# ending naive bayes')
@@ -3,23 +3,21 @@ Naive Bayes Classifier
 ======================
 
 basic implementation of naive bayes.
-prints out probabilities for classes.
-needed for interactive labeling.
+prints out probabilities for classes needed for interactive labeling.
 '''
 
-from CsvHandler import CsvHandler
+import csv
 
+import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
 
 from sklearn.metrics import recall_score, precision_score
-from sklearn.model_selection import KFold
+from sklearn.model_selection import StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 
-class NaiveBayes_simple:
+class NaiveBayes_Interactive:
 
     def make_naive_bayes(dataset):
-        '''fits naive bayes model with StratifiedKFold,
-        uses my BOW
+        '''fits naive bayes model
         '''
         print('# fitting model')
         print('# ...')
@@ -31,9 +29,8 @@ class NaiveBayes_simple:
 
         cv = CountVectorizer()
 
-        ##DORIS: YOU ALWAYS NEED A STRATIFIED SPLIT BECAUSE YOUR DATASET IS UNBALANCED
-        # k-fold cross-validation as split method
-        kf = KFold(n_splits=10, shuffle=True, random_state=5)
+        # stratified k-fold cross-validation as split method
+        kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)
 
         classifier = GaussianNB()
 
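The change above follows the review remark it replaces: with an unbalanced dataset, stratified folds keep the class ratio in every split, while plain KFold can produce test folds with hardly any minority samples. A toy sketch (synthetic labels, not repository data):

import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.arange(20).reshape(10, 2)
y = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])  # unbalanced: 8 vs 2

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=5)
for train, test in skf.split(X, y):
    print(y[test])  # each test fold contains exactly one minority sample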
@@ -163,9 +160,14 @@ class NaiveBayes_simple:
     print('# reading dataset')
     print('# ...')
 
-    dataset = CsvHandler.read_csv(file)
+    data = pd.read_csv(file,
+                       sep='|',
+                       engine='python',
+                       decimal='.',
+                       quotechar='\'',
+                       quoting=csv.QUOTE_NONE)
 
-    make_naive_bayes(dataset)
+    make_naive_bayes(data)
 
     print('#')
     print('# ending naive bayes')
README.md
@@ -1,19 +1,43 @@
 # Anne's Bachelor Thesis
+State: October 2018 (in progress)
 
 My python classes for text mining, machine learning models, …
+The scripts can be called separately.
+
+Best F1 score results were:
+
+SVM
+---
+F1 score: 0.8944166649330559
+best parameters set found on development set:
+{'SVC__C': 0.1, 'SVC__gamma': 0.01, 'SVC__kernel': 'linear', 'perc__percentile': 50}
+
+Naive Bayes
+-----------
+parameters: SelectPercentile(25), own BOW implementation, 10-fold cross validation
+F1 score: min = 0.7586206896551724, max = 0.8846153846153846, average = 0.8324014738144634
+
+The complete documentation can be found in the latex document in the thesis folder.
+
+The csv file 'classification_labelled_corrected.csv' contains 1497 labeled news articles from Reuters.com and is used for the machine learning models.
+
+Note:
+Please enter a valid webhose personal key before you call 'Requester.py'.
+Also, please change the path to your JAVAHOME environment variable in the 'NER.find_companies' method.
+
+example:
+# set paths
+java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
+os.environ['JAVAHOME'] = java_path
+
 ## Requirements
 
 pandas==0.20.1
 
 nltk==3.2.5
 
 webhoseio==0.5
 
 numpy==1.14.0
 
 graphviz==0.9
 
 scikit_learn==0.19.2
 
 ## Installation under Windows
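The step-prefixed parameter names in the SVM result above ('perc__percentile', 'SVC__*') suggest a Pipeline of SelectPercentile and SVC tuned by grid search. A sketch reconstructed from those names alone, on toy data; the actual training code lives in SVM.py and is not shown in this diff:

import numpy as np
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# toy features and labels standing in for the article data
X = np.random.RandomState(5).rand(20, 10)
y = np.array([0, 1] * 10)

pipeline = Pipeline([('perc', SelectPercentile()), ('SVC', SVC())])
params = {'SVC__C': [0.1, 1],
          'SVC__gamma': [0.01, 0.1],
          'SVC__kernel': ['linear'],
          'perc__percentile': [25, 50, 75]}
grid = GridSearchCV(pipeline, params, scoring='f1', cv=2)
grid.fit(X, y)
print(grid.best_params_)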
SVM.py
@@ -13,8 +13,10 @@ to belong to a category based on which side of the gap they fall.
 '''
 
 from BagOfWords import BagOfWords
-from CsvHandler import CsvHandler
 
+import csv
+
+import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import f1_score, make_scorer
@@ -95,8 +97,13 @@ class SVM:
     print('# reading dataset')
     print('# ...')
 
-    dataset = CsvHandler.read_csv(file)
+    data = pd.read_csv(file,
+                       sep='|',
+                       engine='python',
+                       decimal='.',
+                       quotechar='\'',
+                       quoting=csv.QUOTE_NONE)
 
-    make_svm(dataset)
+    make_svm(data)
 
     print('# ending svm')