changes due to NER

This commit is contained in:
Anne Lorenz 2018-09-24 13:50:11 +02:00
parent 14e5af9d7d
commit 188a2d582c
3 changed files with 225 additions and 26 deletions

43
NER.py

@@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.
import os
import matplotlib.pyplot as plt
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
@@ -41,13 +42,38 @@ class NER:
continuous_chunk.append(current_chunk)
return continuous_chunk
if __name__ == '__main__':
def plot_barchart():
organizations = ['org1', 'org2', 'org3', 'org4', 'org5', 'org6']
num_mentions = [5, 2, 33, 12, 6, 10]
#n, bins, patches = plt.hist(num_mentions, 6, normed=1, facecolor='green')
plt.plot(organizations, num_mentions, 'ro', ms = 10)
plt.xlabel('companies')
plt.ylabel('count')
plt.title('Company mentions in articles')
plt.grid(True)
plt.show()
def find_companies(text):
#set paths
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
os.environ['JAVAHOME'] = java_path
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
organizations = []
# create list of (word, tag) tuples
tagged_words = NER.tag_words(text)
# put coherent names together
nes = NER.get_coherent_names(tagged_words)
nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
#print(nes_coherent)
for tuple in nes_coherent:
if tuple[1] == 'ORGANIZATION':
organizations.append(tuple[0])
return organizations
if __name__ == '__main__':
#plot_barchart()
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
@@ -71,15 +97,4 @@ class NER:
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''
organizations = []
# create list of (word, tag) tuples
tagged_words = tag_words(text)
# put coherent names together
nes = get_coherent_names(tagged_words)
nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
#print(nes_coherent)
for tuple in nes_coherent:
if tuple[1] == 'ORGANIZATION':
organizations.append(tuple[0])
print(organizations)
print(NER.find_companies(text))
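
For context, the tagging-and-grouping idea that tag_words and get_coherent_names appear to implement can be sketched in isolation: tokenize the text, tag every token with the Stanford NER model, then merge consecutive tokens that share the same non-'O' tag into one entity. The function name extract_organizations and the classifier/jar paths below are illustrative placeholders, not part of this repository, and must point to a local Stanford NER installation.

# minimal sketch (assumed paths and model name, not part of this commit):
# tag tokens with Stanford NER and merge consecutive tokens of the same entity type
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

def extract_organizations(text,
                          classifier='english.all.3class.distsim.crf.ser.gz',
                          jar='stanford-ner.jar'):
    tagger = StanfordNERTagger(classifier, jar, encoding='utf8')
    tagged = tagger.tag(word_tokenize(text))      # [(word, tag), ...]
    entities, current = [], []
    for word, tag in tagged:
        if tag != 'O' and (not current or current[0][1] == tag):
            current.append((word, tag))           # extend the running entity
        else:
            if current:
                entities.append(current)          # close the previous entity
            current = [(word, tag)] if tag != 'O' else []
    if current:
        entities.append(current)
    # same output shape as nes_coherent above: (joined name, tag)
    coherent = [(' '.join(w for w, t in ne), ne[0][1]) for ne in entities]
    return [name for name, tag in coherent if tag == 'ORGANIZATION']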

NaiveBayes.py

@@ -38,15 +38,20 @@ class NaiveBayes:
cv = CountVectorizer()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
classifier = GaussianNB()
# lists for metrics
# metrics
recall_scores = []
precision_scores = []
f1_scores = []
# probabilities of each class (of each fold)
class_prob = []
# counts number of training samples observed in each class
class_counts = []
# for each fold
n = 0
for train, test in skf.split(X,y):
@@ -54,18 +59,18 @@ class NaiveBayes:
n += 1
print('# split no. ' + str(n))
# own BOW => worse results
vocab = BagOfWords.make_vocab(X[train])
# fit the training data and then return the matrix
training_data = BagOfWords.make_matrix(X[train], vocab)
# transform testing data and return the matrix
testing_data = BagOfWords.make_matrix(X[test], vocab)
# # # using CountVectorizer:
# # own BOW => worse results
# vocab = BagOfWords.make_vocab(X[train])
# # fit the training data and then return the matrix
# training_data = cv.fit_transform(X[train], y[train]).toarray()
# training_data = BagOfWords.make_matrix(X[train], vocab)
# # transform testing data and return the matrix
# testing_data = cv.transform(X[test]).toarray()
# testing_data = BagOfWords.make_matrix(X[test], vocab)
# using CountVectorizer:
# fit the training data and then return the matrix
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()
# # apply select percentile
# selector = SelectPercentile(percentile=25)
@@ -97,6 +102,9 @@ class NaiveBayes:
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
class_prob.append(classifier.class_prior_)
class_counts.append(classifier.class_count_)
##########################
#print metrics of test set
print('-------------------------')
@@ -114,6 +122,15 @@ class NaiveBayes:
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
# print probability of each class
print('probability of each class:')
print()
print(class_prob)
print()
print('number of samples of each class:')
print()
print(class_counts)
print()
##### only for overfit testing ###########
#print('overfit testing: prediction of training set')
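
The fold-wise vectorization pattern adopted above (CountVectorizer fitted on the training fold only, then merely applied to the test fold) can be shown on its own, roughly like this; the toy texts and labels are made up for illustration:

# standalone sketch of the fold-wise CountVectorizer usage (toy data, illustrative only)
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB

X = np.array(['oil prices fall', 'merger announced today',
              'stocks slide again', 'company acquires rival',
              'markets close lower', 'takeover bid confirmed'])
y = np.array([0, 1, 0, 1, 0, 1])

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=5)
for train, test in skf.split(X, y):
    cv = CountVectorizer()
    # vocabulary is learned from the training fold only ...
    training_data = cv.fit_transform(X[train]).toarray()
    # ... and merely applied to the test fold, so no test information leaks in
    testing_data = cv.transform(X[test]).toarray()
    clf = GaussianNB().fit(training_data, y[train])
    print(clf.predict(testing_data), y[test])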

167
NaiveBayes_simple.py Normal file

@@ -0,0 +1,167 @@
'''
Naive Bayes Classifier
======================
basic implementation of naive bayes.
'''
from CsvHandler import CsvHandler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
class NaiveBayes_simple:
def make_naive_bayes(dataset):
'''fits naive bayes model with KFold cross-validation,
uses CountVectorizer
'''
print('# fitting model')
print('# ...')
# split data into text and label set
# join title and text
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
cv = CountVectorizer()
# k-fold cross-validation as split method
kf = KFold(n_splits=10, shuffle=True, random_state=5)
classifier = GaussianNB()
# metrics
recall_scores = []
precision_scores = []
f1_scores = []
# probabilities of each class (of each fold)
class_prob = []
# counts number of training samples observed in each class
class_counts = []
# for each fold
n = 0
for train, test in kf.split(X,y):
n += 1
print('# split no. ' + str(n))
# using CountVectorizer:
# fit the training data and then return the matrix
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()
#fit classifier
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
#print and store metrics
rec = recall_score(y[test], predictions_test)
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y[test], predictions_test)
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
class_prob.append(classifier.class_prior_)
class_counts.append(classifier.class_count_)
##########################
#print metrics of test set
print('-------------------------')
print('prediction of testing set:')
print('Precision score: min = {}, max = {}, average = {}'
.format(min(precision_scores),
max(precision_scores),
sum(precision_scores)/float(len(precision_scores))))
print('Recall score: min = {}, max = {}, average = {}'
.format(min(recall_scores),
max(recall_scores),
sum(recall_scores)/float(len(recall_scores))))
print('F1 score: min = {}, max = {}, average = {}'
.format(min(f1_scores),
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
# print probability of each class
print('probability of each class:')
print()
print(class_prob)
print()
print('number of samples of each class:')
print()
print(class_counts)
print()
##### only for overfit testing ###########
#print('overfit testing: prediction of training set')
#print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
#format(min(f1_scores_train), max(f1_scores_train),
#sum(f1_scores_train)/float(len(f1_scores_train))))
#print()
######## only needed for resubstitution error ########
def analyze_errors(dataset):
'''calculates resubstitution error
shows indices of false classified articles
uses Gaussian Bayes with train test split
'''
X_train_test = dataset['Title'] + ' ' + dataset['Text']
y_train_test = dataset['Label']
count_vector = CountVectorizer()
# fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train_test).toarray()
# transform testing data and return the matrix
testing_data = count_vector.transform(X_train_test).toarray()
# Naive Bayes
classifier = GaussianNB()
# fit classifier
classifier.fit(training_data, y_train_test)
# Predict class
predictions = classifier.predict(testing_data)
print('Errors at index:')
print()
n = 0
for i in range(len(y_train_test)):
if y_train_test[i] != predictions[i]:
n += 1
print('error no.{}'.format(n))
print('prediction at index {} is: {}, but actual is: {}'
.format(i, predictions[i], y_train_test[i]))
print(X_train_test[i])
print(y_train_test[i])
print()
#print metrics
print('F1 score: ', format(f1_score(y_train_test, predictions)))
if __name__ == '__main__':
print('# starting naive bayes')
print('# ...')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('# ...')
dataset = CsvHandler.read_csv(file)
NaiveBayes_simple.make_naive_bayes(dataset)
print('#')
print('# ending naive bayes')
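
As a side note, the class_prior_ and class_count_ values printed by make_naive_bayes are directly related: when no explicit priors are given, GaussianNB derives class_prior_ by normalizing class_count_. The hand-written F1 formula also agrees with sklearn's f1_score. A small sanity check, again with made-up toy data:

# quick sanity check of the quantities printed above (toy data, illustrative only)
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB

X = np.array([[0.0], [0.2], [0.9], [1.1], [1.0], [0.1]])
y = np.array([0, 0, 1, 1, 1, 0])

clf = GaussianNB().fit(X, y)
# with no explicit priors, class_prior_ is just class_count_ normalized
assert np.allclose(clf.class_prior_, clf.class_count_ / clf.class_count_.sum())

pred = clf.predict(X)
prec = precision_score(y, pred)
rec = recall_score(y, pred)
# the manual formula used in make_naive_bayes matches sklearn's f1_score
assert np.isclose(2 * (prec * rec) / (prec + rec), f1_score(y, pred))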