changes due to NER
This commit is contained in:
		
							parent
							
								
									14e5af9d7d
								
							
						
					
					
						commit
						188a2d582c
					
				
							
								
								
									
										41
									
								
								NER.py
									
									
									
									
									
								
							
							
						
						
									
										41
									
								
								NER.py
									
									
									
									
									
								
							@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.
 | 
			
		||||
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
import matplotlib.pyplot as plt
 | 
			
		||||
from nltk.tag import StanfordNERTagger
 | 
			
		||||
from nltk.tokenize import word_tokenize
 | 
			
		||||
 | 
			
		||||
@ -41,12 +42,37 @@ class NER:
 | 
			
		||||
            continuous_chunk.append(current_chunk)
 | 
			
		||||
        return continuous_chunk
 | 
			
		||||
 | 
			
		||||
    if __name__ == '__main__':
 | 
			
		||||
    def plot_barchart():
        '''Show a demo plot of how often each company is mentioned.

        The company names and their mention counts are hard-coded sample
        values. Each company is drawn as a single red circle marker and the
        figure is displayed with ``plt.show()``.
        '''
        # sample data: company name -> number of mentions
        mentions = {
            'org1': 5,
            'org2': 2,
            'org3': 33,
            'org4': 12,
            'org5': 6,
            'org6': 10,
        }
        names = list(mentions.keys())
        counts = list(mentions.values())

        # 'ro' = red circle markers without a connecting line
        plt.plot(names, counts, 'ro', ms=10)
        plt.title('Company mentions in articles')
        plt.xlabel('companies')
        plt.ylabel('count')
        plt.grid(True)
        plt.show()
 | 
			
		||||
 | 
			
		||||
    def find_companies(text):
        '''Return the names of all organizations mentioned in ``text``.

        The text is tagged with ``NER.tag_words`` (a list of (word, tag)
        tuples), consecutive tokens belonging to the same entity are grouped
        with ``NER.get_coherent_names``, and every group whose first token is
        tagged ``'ORGANIZATION'`` is joined with spaces into one name.

        Args:
            text: the article text to analyze.

        Returns:
            A list of organization names in order of appearance
            (duplicates are kept).
        '''
        # NOTE(review): machine-specific JRE location; this overwrites any
        # JAVAHOME already present in the environment on every call.
        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
        os.environ['JAVAHOME'] = java_path

        # create list of (word, tag) tuples
        tagged_words = NER.tag_words(text)
        # put coherent names together
        chunks = NER.get_coherent_names(tagged_words)

        # a chunk is a list of (token, tag) tuples; its entity type is the
        # tag of the first token
        return [
            " ".join(token for token, _ in chunk)
            for chunk in chunks
            if chunk[0][1] == 'ORGANIZATION'
        ]
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
 | 
			
		||||
    #plot_barchart()
 | 
			
		||||
    text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
 | 
			
		||||
                    \nmostly fell in light volumes on Tuesday as energy shares
 | 
			
		||||
                    tracked \nfalls in global oil prices, while weaknesses in banking shares
 | 
			
		||||
@ -71,15 +97,4 @@ class NER:
 | 
			
		||||
                    region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
 | 
			
		||||
                    Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
 | 
			
		||||
                    \namid uncertainty over global demand. \nFor Asian Companies click.'''
 | 
			
		||||
 | 
			
		||||
        organizations = []
 | 
			
		||||
        # create list of (word, tag) tuples
 | 
			
		||||
        tagged_words = tag_words(text)
 | 
			
		||||
        # put coherent names together
 | 
			
		||||
        nes = get_coherent_names(tagged_words)
 | 
			
		||||
        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
 | 
			
		||||
        #print(nes_coherent)
 | 
			
		||||
        for tuple in nes_coherent:
 | 
			
		||||
            if tuple[1] == 'ORGANIZATION':
 | 
			
		||||
                organizations.append(tuple[0])
 | 
			
		||||
        print(organizations)
 | 
			
		||||
    print(NER.find_companies(text))
 | 
			
		||||
@ -38,15 +38,20 @@ class NaiveBayes:
 | 
			
		||||
        cv = CountVectorizer()
 | 
			
		||||
 | 
			
		||||
        # use stratified k-fold cross-validation as split method
 | 
			
		||||
        skf = StratifiedKFold(n_splits = 10, shuffle=True)
 | 
			
		||||
        skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
 | 
			
		||||
 | 
			
		||||
        classifier = GaussianNB()
 | 
			
		||||
 | 
			
		||||
        # lists for metrics
 | 
			
		||||
        # metrics
 | 
			
		||||
        recall_scores = []
 | 
			
		||||
        precision_scores = []
 | 
			
		||||
        f1_scores = []
 | 
			
		||||
 | 
			
		||||
        # probabilities of each class (of each fold)
 | 
			
		||||
        class_prob = []
 | 
			
		||||
        # counts number of training samples observed in each class 
 | 
			
		||||
        class_counts = []
 | 
			
		||||
 | 
			
		||||
        # for each fold
 | 
			
		||||
        n = 0
 | 
			
		||||
        for train, test in skf.split(X,y):
 | 
			
		||||
@ -54,18 +59,18 @@ class NaiveBayes:
 | 
			
		||||
            n += 1
 | 
			
		||||
            print('# split no. ' + str(n))
 | 
			
		||||
 | 
			
		||||
            # eigenes BOW => schlechtere ergebnisse
 | 
			
		||||
            vocab = BagOfWords.make_vocab(X[train])
 | 
			
		||||
            # fit the training data and then return the matrix
 | 
			
		||||
            training_data = BagOfWords.make_matrix(X[train], vocab)
 | 
			
		||||
            # transform testing data and return the matrix
 | 
			
		||||
            testing_data = BagOfWords.make_matrix(X[test], vocab)
 | 
			
		||||
 | 
			
		||||
            # # # using CountVectorizer:
 | 
			
		||||
            # # eigenes BOW => schlechtere ergebnisse
 | 
			
		||||
            # vocab = BagOfWords.make_vocab(X[train])
 | 
			
		||||
            # # fit the training data and then return the matrix
 | 
			
		||||
            # training_data = cv.fit_transform(X[train], y[train]).toarray()
 | 
			
		||||
            # training_data = BagOfWords.make_matrix(X[train], vocab)
 | 
			
		||||
            # # transform testing data and return the matrix
 | 
			
		||||
            # testing_data = cv.transform(X[test]).toarray()
 | 
			
		||||
            # testing_data = BagOfWords.make_matrix(X[test], vocab)
 | 
			
		||||
 | 
			
		||||
            # using CountVectorizer:
 | 
			
		||||
            # fit the training data and then return the matrix
 | 
			
		||||
            training_data = cv.fit_transform(X[train], y[train]).toarray()
 | 
			
		||||
            # transform testing data and return the matrix
 | 
			
		||||
            testing_data = cv.transform(X[test]).toarray()
 | 
			
		||||
 | 
			
		||||
            # # apply select percentile
 | 
			
		||||
            # selector = SelectPercentile(percentile=25)
 | 
			
		||||
@ -97,6 +102,9 @@ class NaiveBayes:
 | 
			
		||||
            # equation for f1 score
 | 
			
		||||
            f1_scores.append(2 * (prec * rec)/(prec + rec))
 | 
			
		||||
 | 
			
		||||
            class_prob.append(classifier.class_prior_)
 | 
			
		||||
            class_counts.append(classifier.class_count_)
 | 
			
		||||
 | 
			
		||||
        ##########################
 | 
			
		||||
        #print metrics of test set
 | 
			
		||||
        print('-------------------------')
 | 
			
		||||
@ -114,6 +122,15 @@ class NaiveBayes:
 | 
			
		||||
                        max(f1_scores),
 | 
			
		||||
                        sum(f1_scores)/float(len(f1_scores))))
 | 
			
		||||
        print()
 | 
			
		||||
        # print probability of each class
 | 
			
		||||
        print('probability of each class:')
 | 
			
		||||
        print()
 | 
			
		||||
        print(class_prob)
 | 
			
		||||
        print()
 | 
			
		||||
        print('number of samples of each class:')
 | 
			
		||||
        print()
 | 
			
		||||
        print(class_counts)
 | 
			
		||||
        print()
 | 
			
		||||
 | 
			
		||||
        ##### nur für overfit testing ###########
 | 
			
		||||
        #print('overfit testing: prediction of training set')
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										167
									
								
								NaiveBayes_simple.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										167
									
								
								NaiveBayes_simple.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,167 @@
 | 
			
		||||
'''
 | 
			
		||||
Naive Bayes Classifier
 | 
			
		||||
======================
 | 
			
		||||
 | 
			
		||||
basic implementation of naive bayes.
 | 
			
		||||
'''
 | 
			
		||||
 | 
			
		||||
from CsvHandler import CsvHandler
 | 
			
		||||
 | 
			
		||||
from sklearn.feature_extraction.text import CountVectorizer
 | 
			
		||||
 | 
			
		||||
from sklearn.metrics import recall_score, precision_score
 | 
			
		||||
from sklearn.model_selection import KFold
 | 
			
		||||
from sklearn.naive_bayes import GaussianNB
 | 
			
		||||
 | 
			
		||||
class NaiveBayes_simple:
    '''Basic Gaussian naive Bayes text classifier built on scikit-learn.

    The functions below take no ``self`` and are called through the class,
    e.g. ``NaiveBayes_simple.make_naive_bayes(dataset)``.
    '''

    def _f1(prec, rec):
        '''Return the F1 score (harmonic mean) of precision and recall.

        Returns 0.0 when both values are zero instead of dividing by zero.
        '''
        total = prec + rec
        if total == 0:
            return 0.0
        return 2 * (prec * rec) / total

    def _summarize(scores):
        '''Return ``(min, max, mean)`` of a non-empty list of scores.'''
        return min(scores), max(scores), sum(scores) / float(len(scores))

    def make_naive_bayes(dataset):
        '''Fit and evaluate Gaussian naive Bayes with 10-fold cross-validation.

        Title and text of each article are joined and vectorized with
        ``CountVectorizer``; the folds come from a shuffled ``KFold`` with a
        fixed ``random_state`` so that runs are reproducible. For every fold
        the recall, precision and F1 score of the *test* split are printed
        and collected; afterwards min/max/average of each metric, the class
        priors and the class counts of all folds are printed.

        Args:
            dataset: table-like object indexable by the columns ``'Title'``,
                ``'Text'`` and ``'Label'`` (presumably a pandas DataFrame
                with a default integer index -- confirm against
                ``CsvHandler.read_csv``).
        '''
        print('# fitting model')
        print('# ...')

        # split data into text and label set
        # join title and text
        X = dataset['Title'] + ' ' + dataset['Text']
        y = dataset['Label']

        cv = CountVectorizer()

        # k-fold cross-validation as split method
        kf = KFold(n_splits=10, shuffle=True, random_state=5)

        classifier = GaussianNB()

        # metrics (one entry per fold)
        recall_scores = []
        precision_scores = []
        f1_scores = []

        # probabilities of each class (of each fold)
        class_prob = []
        # number of training samples observed in each class (of each fold)
        class_counts = []

        for n, (train, test) in enumerate(kf.split(X, y), start=1):
            print('# split no. ' + str(n))

            # fit the vocabulary on the training split only, then reuse it
            # to transform the test split
            training_data = cv.fit_transform(X[train], y[train]).toarray()
            testing_data = cv.transform(X[test]).toarray()

            # fit classifier
            classifier.fit(training_data, y[train])
            # predict class of the held-out articles
            predictions_test = classifier.predict(testing_data)

            # print and store metrics; both are computed on the test split
            # so that precision, recall and F1 describe the same data
            rec = recall_score(y[test], predictions_test)
            print('rec: ' + str(rec))
            recall_scores.append(rec)
            prec = precision_score(y[test], predictions_test)
            print('prec: ' + str(prec))
            print('#')
            precision_scores.append(prec)
            f1_scores.append(NaiveBayes_simple._f1(prec, rec))

            class_prob.append(classifier.class_prior_)
            class_counts.append(classifier.class_count_)

        ##########################
        # print metrics of test set
        print('-------------------------')
        print('prediction of testing set:')
        print('Precision score: min = {}, max = {}, average = {}'
              .format(*NaiveBayes_simple._summarize(precision_scores)))
        print('Recall score: min = {}, max = {}, average = {}'
              .format(*NaiveBayes_simple._summarize(recall_scores)))
        print('F1 score: min = {}, max = {}, average = {}'
              .format(*NaiveBayes_simple._summarize(f1_scores)))
        print()
        # print probability of each class
        print('probability of each class:')
        print()
        print(class_prob)
        print()
        print('number of samples of each class:')
        print()
        print(class_counts)
        print()

    # only needed for the resubstitution error
    def analyze_errors(dataset):
        '''Print the resubstitution errors of Gaussian naive Bayes.

        The classifier is fitted on the complete dataset and then predicts
        that same dataset. Every misclassified article is printed with its
        position, predicted label, actual label and text, followed by the
        F1 score over all articles.

        Args:
            dataset: table-like object indexable by the columns ``'Title'``,
                ``'Text'`` and ``'Label'``.
        '''
        # imported locally because the module-level import only provides
        # recall_score and precision_score
        from sklearn.metrics import f1_score

        X_train_test = dataset['Title'] + ' ' + dataset['Text']
        y_train_test = dataset['Label']

        count_vector = CountVectorizer()
        # fit the training data and then return the matrix
        training_data = count_vector.fit_transform(X_train_test).toarray()
        # the "test" data is deliberately the training data again
        testing_data = count_vector.transform(X_train_test).toarray()

        # Naive Bayes
        classifier = GaussianNB()
        # fit classifier
        classifier.fit(training_data, y_train_test)

        # predict class
        predictions = classifier.predict(testing_data)
        print('Errors at index:')
        print()
        n = 0
        # iterate positionally so that texts, labels and predictions stay
        # aligned regardless of the dataset's index labels
        rows = zip(X_train_test, y_train_test, predictions)
        for i, (article, actual, predicted) in enumerate(rows):
            if actual != predicted:
                n += 1
                print('error no.{}'.format(n))
                print('prediction at index {} is: {}, but actual is: {}'
                      .format(i, predicted, actual))
                print(article)
                print(actual)
                print()
        # print metrics
        print('F1 score: ', format(f1_score(y_train_test, predictions)))

    # NOTE(review): this guard sits inside the class body, so the script part
    # runs while the class is being defined and can call make_naive_bayes by
    # its bare name.
    if __name__ == '__main__':

        print('# starting naive bayes')
        print('# ...')

        file = 'classification_labelled_corrected.csv'

        # read csv file
        print('# reading dataset')
        print('# ...')

        dataset = CsvHandler.read_csv(file)

        make_naive_bayes(dataset)

        print('#')
        print('# ending naive bayes')
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user