changes due to NER

This commit is contained in:
Anne Lorenz 2018-09-24 13:50:11 +02:00
parent 14e5af9d7d
commit 188a2d582c
3 changed files with 225 additions and 26 deletions

41
NER.py
View File

@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.
import os
import matplotlib.pyplot as plt
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
@ -41,12 +42,37 @@ class NER:
continuous_chunk.append(current_chunk)
return continuous_chunk
if __name__ == '__main__':
def plot_barchart():
    '''plots hard-coded sample data: one red circle marker per company,
    company name on the x axis and number of mentions on the y axis.
    opens the figure window via plt.show().
    '''
    # (company, number of mentions) sample pairs
    sample_mentions = [
        ('org1', 5),
        ('org2', 2),
        ('org3', 33),
        ('org4', 12),
        ('org5', 6),
        ('org6', 10),
    ]
    names = [name for name, _ in sample_mentions]
    counts = [count for _, count in sample_mentions]

    # 'ro' = red circles, ms = marker size
    plt.plot(names, counts, 'ro', ms=10)

    # labelling
    plt.title('Company mentions in articles')
    plt.xlabel('companies')
    plt.ylabel('count')
    plt.grid(True)

    plt.show()
def find_companies(text,
                   java_path="C:\\Program Files (x86)\\Java\\jre1.8.0_181"):
    '''returns the names of all organizations found in text.

    text:      string to be searched for organization names
    java_path: value written to the JAVAHOME environment variable before
               tagging; the default is the path that was previously
               hard-coded here, pass another path on other machines
    returns:   list of organization names (strings) in the order in
               which NER.get_coherent_names delivers them; a name that
               occurs several times is listed several times
    '''
    # set paths
    os.environ['JAVAHOME'] = java_path

    # create list of (word, tag) tuples
    tagged_words = NER.tag_words(text)

    # put coherent names together
    nes = NER.get_coherent_names(tagged_words)

    # join the tokens of each chunk to one name; the tag of the first
    # token decides about the entity type of the whole chunk.
    # empty chunks are skipped, they have no first token to look at.
    return [
        " ".join(token for token, _ in ne)
        for ne in nes
        if ne and ne[0][1] == 'ORGANIZATION'
    ]
if __name__ == '__main__':
#plot_barchart()
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
@ -71,15 +97,4 @@ class NER:
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''
organizations = []
# create list of (word, tag) tuples
tagged_words = tag_words(text)
# put coherent names together
nes = get_coherent_names(tagged_words)
nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
#print(nes_coherent)
for tuple in nes_coherent:
if tuple[1] == 'ORGANIZATION':
organizations.append(tuple[0])
print(organizations)
print(NER.find_companies(text))

View File

@ -38,15 +38,20 @@ class NaiveBayes:
cv = CountVectorizer()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
classifier = GaussianNB()
# lists for metrics
# metrics
recall_scores = []
precision_scores = []
f1_scores = []
# probabilities of each class (of each fold)
class_prob = []
# counts number of training samples observed in each class
class_counts = []
# for each fold
n = 0
for train, test in skf.split(X,y):
@ -54,18 +59,18 @@ class NaiveBayes:
n += 1
print('# split no. ' + str(n))
# eigenes BOW => schlechtere ergebnisse
vocab = BagOfWords.make_vocab(X[train])
# fit the training data and then return the matrix
training_data = BagOfWords.make_matrix(X[train], vocab)
# transform testing data and return the matrix
testing_data = BagOfWords.make_matrix(X[test], vocab)
# # # using CountVectorizer:
# # eigenes BOW => schlechtere ergebnisse
# vocab = BagOfWords.make_vocab(X[train])
# # fit the training data and then return the matrix
# training_data = cv.fit_transform(X[train], y[train]).toarray()
# training_data = BagOfWords.make_matrix(X[train], vocab)
# # transform testing data and return the matrix
# testing_data = cv.transform(X[test]).toarray()
# testing_data = BagOfWords.make_matrix(X[test], vocab)
# using CountVectorizer:
# fit the training data and then return the matrix
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()
# # apply select percentile
# selector = SelectPercentile(percentile=25)
@ -97,6 +102,9 @@ class NaiveBayes:
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
class_prob.append(classifier.class_prior_)
class_counts.append(classifier.class_count_)
##########################
#print metrics of test set
print('-------------------------')
@ -114,6 +122,15 @@ class NaiveBayes:
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
# print probability of each class
print('probability of each class:')
print()
print(class_prob)
print()
print('number of samples of each class:')
print()
print(class_counts)
print()
##### nur für overfit testing ###########
#print('overfit testing: prediction of training set')

167
NaiveBayes_simple.py Normal file
View File

@ -0,0 +1,167 @@
'''
Naive Bayes Classifier
======================
basic implementation of naive bayes.
'''
from CsvHandler import CsvHandler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
class NaiveBayes_simple:
def make_naive_bayes(dataset):
    '''fits a Gaussian naive bayes model with 10-fold cross-validation
    (KFold, shuffled, random_state=5) on CountVectorizer features and
    prints precision, recall and F1 score of the test folds as well as
    the class priors and class counts of every fold.

    dataset: table with the columns 'Title', 'Text' and 'Label'
             (presumably a pandas DataFrame as returned by
             CsvHandler.read_csv -- verify against caller)
    returns: None, all results are printed
    '''

    def print_summary(name, scores):
        '''prints min, max and average of the given list of scores'''
        print('{}: min = {}, max = {}, average = {}'
              .format(name,
                      min(scores),
                      max(scores),
                      sum(scores)/float(len(scores))))

    print('# fitting model')
    print('# ...')

    # split data into text and label set
    # join title and text
    X = dataset['Title'] + ' ' + dataset['Text']
    y = dataset['Label']

    cv = CountVectorizer()

    # k-fold cross-validation as split method
    kf = KFold(n_splits=10, shuffle=True, random_state=5)

    classifier = GaussianNB()

    # metrics (one entry per fold)
    recall_scores = []
    precision_scores = []
    f1_scores = []

    # probabilities of each class (of each fold)
    class_prob = []
    # counts number of training samples observed in each class
    class_counts = []

    # for each fold
    # NOTE(review): X[train] / y[train] select by index label if X and y
    # are pandas Series; this equals the positional fold indices only
    # for a default 0..n-1 index -- confirm against CsvHandler.read_csv
    for n, (train, test) in enumerate(kf.split(X, y), start=1):
        print('# split no. ' + str(n))

        # using CountVectorizer:
        # fit the training data and then return the matrix
        training_data = cv.fit_transform(X[train], y[train]).toarray()
        # transform testing data and return the matrix
        testing_data = cv.transform(X[test]).toarray()

        # fit classifier
        classifier.fit(training_data, y[train])
        # predict class of the held-out fold
        predictions_test = classifier.predict(testing_data)

        # print and store metrics
        rec = recall_score(y[test], predictions_test)
        print('rec: ' + str(rec))
        recall_scores.append(rec)
        # precision has to be measured on the same test fold as recall,
        # otherwise the F1 score below mixes training and test results
        prec = precision_score(y[test], predictions_test)
        print('prec: ' + str(prec))
        print('#')
        precision_scores.append(prec)
        # equation for f1 score; defined as 0 if precision and recall
        # are both 0 to avoid a division by zero
        denominator = prec + rec
        if denominator:
            f1_scores.append(2 * (prec * rec)/denominator)
        else:
            f1_scores.append(0.0)

        class_prob.append(classifier.class_prior_)
        class_counts.append(classifier.class_count_)

    ##########################
    # print metrics of test set
    print('-------------------------')
    print('prediction of testing set:')
    print_summary('Precision score', precision_scores)
    print_summary('Recall score', recall_scores)
    print_summary('F1 score', f1_scores)
    print()

    # print probability of each class
    print('probability of each class:')
    print()
    print(class_prob)
    print()
    print('number of samples of each class:')
    print()
    print(class_counts)
    print()
##### only for overfit testing ###########
#print('overfit testing: prediction of training set')
#print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
#format(min(f1_scores_train), max(f1_scores_train),
#sum(f1_scores_train)/float(len(f1_scores_train))))
#print()
######## only needed for the resubstitution error ########
def analyze_errors(dataset):
    '''calculates the resubstitution error:
    fits Gaussian naive bayes on the complete dataset, predicts the
    same dataset again (no train test split) and prints every article
    whose prediction differs from its label, followed by the F1 score.

    dataset: table with the columns 'Title', 'Text' and 'Label'
             (presumably a pandas DataFrame as returned by
             CsvHandler.read_csv -- verify against caller)
    returns: None, all results are printed
    '''
    # imported here because the sklearn.metrics import at the top of
    # the file does not include f1_score
    from sklearn.metrics import f1_score

    X_train_test = dataset['Title'] + ' ' + dataset['Text']
    y_train_test = dataset['Label']

    count_vector = CountVectorizer()
    # fit the data and return the matrix; training and "testing" data
    # are the same articles, so the matrix is built only once
    features = count_vector.fit_transform(X_train_test).toarray()

    # Naive Bayes
    classifier = GaussianNB()
    # fit classifier
    classifier.fit(features, y_train_test)

    # predict class of the articles the model was fitted on
    predictions = classifier.predict(features)

    print('Errors at index:')
    print()
    n = 0
    # iterate by position so that texts, labels and predictions stay
    # aligned independent of the index labels of the dataset
    samples = zip(X_train_test, y_train_test, predictions)
    for i, (article, actual, predicted) in enumerate(samples):
        if actual != predicted:
            n += 1
            print('error no.{}'.format(n))
            print('prediction at index {} is: {}, but actual is: {}'
                  .format(i, predicted, actual))
            print(article)
            print(actual)
            print()

    # print metrics
    print('F1 score: ', format(f1_score(y_train_test, predictions)))
if __name__ == '__main__':
    # script entry: read the labelled articles and run the evaluation
    for banner_line in ('# starting naive bayes', '# ...'):
        print(banner_line)

    csv_path = 'classification_labelled_corrected.csv'

    # read csv file
    for banner_line in ('# reading dataset', '# ...'):
        print(banner_line)
    dataset = CsvHandler.read_csv(csv_path)

    # cross-validated fit, prints all metrics
    make_naive_bayes(dataset)

    for banner_line in ('#', '# ending naive bayes'):
        print(banner_line)