changes due to NER

parent 14e5af9d7d
commit 188a2d582c

NER.py | 43
NER.py

@@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.

import os

import matplotlib.pyplot as plt
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
@@ -41,13 +42,38 @@ class NER:

                continuous_chunk.append(current_chunk)
        return continuous_chunk

if __name__ == '__main__':

    def plot_barchart():
        organizations = ['org1', 'org2', 'org3', 'org4', 'org5', 'org6']
        num_mentions = [5, 2, 33, 12, 6, 10]
        #n, bins, patches = plt.hist(num_mentions, 6, normed=1, facecolor='green')
        plt.plot(organizations, num_mentions, 'ro', ms = 10)
        plt.xlabel('companies')
        plt.ylabel('count')
        plt.title('Company mentions in articles')
        plt.grid(True)
        plt.show()
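
plot_barchart above draws one red marker per company with plt.plot; if an actual bar chart over the same hard-coded sample counts is wanted, a minimal matplotlib-only sketch (not part of this commit) could look like this:

    import matplotlib.pyplot as plt

    # illustrative bar-chart variant of plot_barchart, not part of the commit
    organizations = ['org1', 'org2', 'org3', 'org4', 'org5', 'org6']
    num_mentions = [5, 2, 33, 12, 6, 10]

    plt.bar(organizations, num_mentions, color='green')  # one bar per company
    plt.xlabel('companies')
    plt.ylabel('count')
    plt.title('Company mentions in articles')
    plt.grid(True, axis='y')
    plt.show()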

    def find_companies(text):
        #set paths
        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
        os.environ['JAVAHOME'] = java_path

    text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets

        organizations = []
        # create list of (word, tag) tuples
        tagged_words = NER.tag_words(text)
        # put coherent names together
        nes = NER.get_coherent_names(tagged_words)
        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
        #print(nes_coherent)
        for tuple in nes_coherent:
            if tuple[1] == 'ORGANIZATION':
                organizations.append(tuple[0])
        return organizations
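
tag_words and get_coherent_names are defined elsewhere in NER.py and not shown in this diff; purely for context, nltk's StanfordNERTagger imported above is usually wired up roughly as below. The model and jar paths are placeholders, not paths taken from this repository:

    import os
    from nltk.tag import StanfordNERTagger
    from nltk.tokenize import word_tokenize

    os.environ['JAVAHOME'] = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"  # local Java install

    # placeholder paths -- adjust to the local Stanford NER download
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                           'stanford-ner.jar',
                           encoding='utf-8')

    tokens = word_tokenize("Keppel Corp was down 2.5 percent in Singapore.")
    print(st.tag(tokens))  # e.g. [('Keppel', 'ORGANIZATION'), ('Corp', 'ORGANIZATION'), ...]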
if __name__ == '__main__':

    #plot_barchart()
    text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai

@@ -71,15 +97,4 @@ class NER:

region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''

    organizations = []
    # create list of (word, tag) tuples
    tagged_words = tag_words(text)
    # put coherent names together
    nes = get_coherent_names(tagged_words)
    nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
    #print(nes_coherent)
    for tuple in nes_coherent:
        if tuple[1] == 'ORGANIZATION':
            organizations.append(tuple[0])
    print(organizations)

    print(NER.find_companies(text))
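
The comprehension that builds nes_coherent joins each chunk's tokens into one string and keeps the tag of the chunk's first token. Assuming get_coherent_names returns chunks shaped like the hypothetical example below, the step behaves as follows:

    # hypothetical chunk structure, for illustration only
    nes = [[('Keppel', 'ORGANIZATION'), ('Corp', 'ORGANIZATION')],
           [('BANGKOK', 'LOCATION')]]

    nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
    print(nes_coherent)  # [('Keppel Corp', 'ORGANIZATION'), ('BANGKOK', 'LOCATION')]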
@@ -38,15 +38,20 @@ class NaiveBayes:

        cv = CountVectorizer()

        # use stratified k-fold cross-validation as split method
        skf = StratifiedKFold(n_splits = 10, shuffle=True)
        skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)

        classifier = GaussianNB()

        # lists for metrics
        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        class_prob = []
        # counts number of training samples observed in each class
        class_counts = []

        # for each fold
        n = 0
        for train, test in skf.split(X,y):
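
The changed line pins random_state=5 so the shuffled stratified splits are reproducible between runs; a minimal standalone sketch on toy data (not the project's dataset):

    import numpy as np
    from sklearn.model_selection import StratifiedKFold

    X = np.arange(20).reshape(-1, 1)   # toy features
    y = np.array([0, 1] * 10)          # balanced binary labels

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
    for train, test in skf.split(X, y):
        # every test fold keeps the 50/50 class ratio, and the folds are
        # identical on every run because random_state is fixed
        print(test, y[test])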
@@ -54,18 +59,18 @@ class NaiveBayes:

            n += 1
            print('# split no. ' + str(n))

            # own BOW => worse results
            vocab = BagOfWords.make_vocab(X[train])
            # fit the training data and then return the matrix
            training_data = BagOfWords.make_matrix(X[train], vocab)
            # transform testing data and return the matrix
            testing_data = BagOfWords.make_matrix(X[test], vocab)

            # # # using CountVectorizer:
            # # own BOW => worse results
            # vocab = BagOfWords.make_vocab(X[train])
            # # fit the training data and then return the matrix
            # training_data = cv.fit_transform(X[train], y[train]).toarray()
            # training_data = BagOfWords.make_matrix(X[train], vocab)
            # # transform testing data and return the matrix
            # testing_data = cv.transform(X[test]).toarray()
            # testing_data = BagOfWords.make_matrix(X[test], vocab)

            # using CountVectorizer:
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X[train], y[train]).toarray()
            # transform testing data and return the matrix
            testing_data = cv.transform(X[test]).toarray()

            # # apply select percentile
            # selector = SelectPercentile(percentile=25)
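
The active branch fits the CountVectorizer on the training fold only and merely transforms the test fold, so no vocabulary leaks from the test data (the y argument passed to fit_transform is accepted but ignored by CountVectorizer, and .toarray() is needed because GaussianNB does not take sparse matrices). A small standalone sketch of that pattern:

    from sklearn.feature_extraction.text import CountVectorizer

    # toy documents, for illustration only
    docs_train = ["oil prices fell", "bank shares rose sharply"]
    docs_test = ["steel firm loans"]

    cv = CountVectorizer()
    training_data = cv.fit_transform(docs_train).toarray()  # vocabulary learned here
    testing_data = cv.transform(docs_test).toarray()        # unseen words are ignored
    print(training_data.shape, testing_data.shape)          # (2, 7) (1, 7)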
@@ -97,6 +102,9 @@ class NaiveBayes:

            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))

            class_prob.append(classifier.class_prior_)
            class_counts.append(classifier.class_count_)

        ##########################
        #print metrics of test set
        print('-------------------------')
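
class_prior_ and class_count_ are attributes GaussianNB exposes after fitting: the empirical probability of each class and the number of training samples seen per class. A toy illustration:

    import numpy as np
    from sklearn.naive_bayes import GaussianNB

    # toy data, for illustration only
    X = np.array([[0.0], [0.1], [0.9], [1.0], [1.1]])
    y = np.array([0, 0, 1, 1, 1])

    clf = GaussianNB().fit(X, y)
    print(clf.class_prior_)   # [0.4 0.6]
    print(clf.class_count_)   # [2. 3.]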
@@ -114,6 +122,15 @@ class NaiveBayes:

                        max(f1_scores),
                        sum(f1_scores)/float(len(f1_scores))))
        print()
        # print probability of each class
        print('probability of each class:')
        print()
        print(class_prob)
        print()
        print('number of samples of each class:')
        print()
        print(class_counts)
        print()

        ##### only for overfit testing ###########
        #print('overfit testing: prediction of training set')

@@ -0,0 +1,167 @@

'''
Naive Bayes Classifier
======================

basic implementation of naive bayes.
'''

from CsvHandler import CsvHandler

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB

class NaiveBayes_simple:

    def make_naive_bayes(dataset):
        '''fits a naive bayes model using k-fold cross-validation,
        uses sklearn's CountVectorizer as bag of words
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        cv = CountVectorizer()

        # k-fold cross-validation as split method
        kf = KFold(n_splits=10, shuffle=True, random_state=5)

        classifier = GaussianNB()

        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        class_prob = []
        # counts number of training samples observed in each class
        class_counts = []

        # for each fold
        n = 0
        for train, test in kf.split(X,y):

            n += 1
            print('# split no. ' + str(n))

            # using CountVectorizer:
            # fit the training data and then return the matrix
            training_data = cv.fit_transform(X[train], y[train]).toarray()
            # transform testing data and return the matrix
            testing_data = cv.transform(X[test]).toarray()

            #fit classifier
            classifier.fit(training_data, y[train])
            #predict class
            predictions_train = classifier.predict(training_data)
            predictions_test = classifier.predict(testing_data)

            #print and store metrics
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))

            class_prob.append(classifier.class_prior_)
            class_counts.append(classifier.class_count_)

        ##########################
        #print metrics of test set
        print('-------------------------')
        print('prediction of testing set:')
        print('Precision score: min = {}, max = {}, average = {}'
                .format(min(precision_scores),
                        max(precision_scores),
                        sum(precision_scores)/float(len(precision_scores))))
        print('Recall score: min = {}, max = {}, average = {}'
                .format(min(recall_scores),
                        max(recall_scores),
                        sum(recall_scores)/float(len(recall_scores))))
        print('F1 score: min = {}, max = {}, average = {}'
                .format(min(f1_scores),
                        max(f1_scores),
                        sum(f1_scores)/float(len(f1_scores))))
        print()
        # print probability of each class
        print('probability of each class:')
        print()
        print(class_prob)
        print()
        print('number of samples of each class:')
        print()
        print(class_counts)
        print()

        ##### only for overfit testing ###########
        #print('overfit testing: prediction of training set')
        #print('F1 score: min = {0:.2f}, max = {1:.2f}, average = {2:.2f}'.
        #        format(min(f1_scores_train), max(f1_scores_train),
        #               sum(f1_scores_train)/float(len(f1_scores_train))))
        #print()

    ######## only needed for resubstitution error ########
    def analyze_errors(dataset):
        '''calculates resubstitution error
        shows indices of false classified articles
        uses Gaussian Bayes fitted and evaluated on the same data
        '''
        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()
        # transform testing data and return the matrix
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()
        # fit classifier
        classifier.fit(training_data, y_train_test)

        # Predict class
        predictions = classifier.predict(testing_data)
        print('Errors at index:')
        print()
        n = 0
        for i in range(len(y_train_test)):
            if y_train_test[i] != predictions[i]:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                        .format(i, predictions[i], y_train_test[i]))
                print(X_train_test[i])
                print(y_train_test[i])
                print()
        #print metrics
        print('F1 score: ', format(f1_score(y_train_test, predictions)))

if __name__ == '__main__':

    print('# starting naive bayes')
    print('# ...')

    file = 'classification_labelled_corrected.csv'

    # read csv file
    print('# reading dataset')
    print('# ...')

    dataset = CsvHandler.read_csv(file)

    NaiveBayes_simple.make_naive_bayes(dataset)

    print('#')
    print('# ending naive bayes')
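
Both classes compute F1 by hand where the comment says 'equation for f1 score'; the formula 2 * (prec * rec)/(prec + rec) is the same harmonic mean that sklearn's f1_score returns, e.g.:

    from sklearn.metrics import precision_score, recall_score, f1_score

    # toy labels, for illustration only
    y_true = [1, 0, 1, 1, 0, 1]
    y_pred = [1, 0, 0, 1, 1, 1]

    prec = precision_score(y_true, y_pred)    # 0.75
    rec = recall_score(y_true, y_pred)        # 0.75
    print(2 * (prec * rec) / (prec + rec))    # 0.75
    print(f1_score(y_true, y_pred))           # 0.75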