changes due to NER
This commit is contained in:
parent 14e5af9d7d
commit 188a2d582c

NER.py (43 changed lines)
@@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.
 
 import os
 
+import matplotlib.pyplot as plt
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize
 
@@ -41,13 +42,38 @@ class NER:
             continuous_chunk.append(current_chunk)
         return continuous_chunk
 
-    if __name__ == '__main__':
+    def plot_barchart():
+        organizations = ['org1', 'org2', 'org3', 'org4', 'org5', 'org6']
+        num_mentions = [5, 2, 33, 12, 6, 10]
+        #n, bins, patches = plt.hist(num_mentions, 6, normed=1, facecolor='green')
+        plt.plot(organizations, num_mentions, 'ro', ms = 10)
+        plt.xlabel('companies')
+        plt.ylabel('count')
+        plt.title('Company mentions in articles')
+        plt.grid(True)
+        plt.show()
+
+    def find_companies(text):
         #set paths
         java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
         os.environ['JAVAHOME'] = java_path
 
-        text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
+        organizations = []
+        # create list of (word, tag) tuples
+        tagged_words = NER.tag_words(text)
+        # put coherent names together
+        nes = NER.get_coherent_names(tagged_words)
+        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
+        #print(nes_coherent)
+        for tuple in nes_coherent:
+            if tuple[1] == 'ORGANIZATION':
+                organizations.append(tuple[0])
+        return organizations
+
+if __name__ == '__main__':
+
+    #plot_barchart()
+    text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
                     \nmostly fell in light volumes on Tuesday as energy shares
                     tracked \nfalls in global oil prices, while weaknesses in banking shares
                     \namid concerns about loans to an ailing steel firm sent the Thai
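Note on the two new methods: both are defined inside class NER without self or a @staticmethod decorator, so module-level code can only reach them through the class, which is how the new __main__ block above calls find_companies; the commented-out plot_barchart() call would need the same qualified form. A short usage sketch under that assumption (the sample text is illustrative, not from the repository):

# assumes NER.py is importable and the Stanford NER paths set in find_companies are valid
from NER import NER

text = "Keppel Corp and Perusahaan Gas Negara were mentioned in the Reuters report."
print(NER.find_companies(text))   # list of strings tagged ORGANIZATION by Stanford NER
NER.plot_barchart()               # qualified form that the commented-out #plot_barchart() would need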
@@ -71,15 +97,4 @@ class NER:
                     region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
                     Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
                     \namid uncertainty over global demand. \nFor Asian Companies click.'''
-
-        organizations = []
-        # create list of (word, tag) tuples
-        tagged_words = tag_words(text)
-        # put coherent names together
-        nes = get_coherent_names(tagged_words)
-        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
-        #print(nes_coherent)
-        for tuple in nes_coherent:
-            if tuple[1] == 'ORGANIZATION':
-                organizations.append(tuple[0])
-        print(organizations)
+    print(NER.find_companies(text))
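For orientation, a minimal sketch of the helpers find_companies relies on. NER.tag_words and NER.get_coherent_names are not touched by this diff and their code is not shown here, so the bodies below are assumptions based on typical StanfordNERTagger usage; the model and jar filenames are placeholders, not the repository's actual paths.

# Hypothetical reconstruction of the NER helpers this diff calls; the real
# tag_words / get_coherent_names in NER.py may differ.
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

class NER:

    def tag_words(text):
        # placeholder paths: point these at the Stanford NER model and jar on disk
        tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                                   'stanford-ner.jar', encoding='utf-8')
        return tagger.tag(word_tokenize(text))

    def get_coherent_names(tagged_words):
        # group consecutive tokens with the same non-'O' tag into one named entity
        entities, current = [], []
        for token, tag in tagged_words:
            if tag != 'O' and (not current or current[-1][1] == tag):
                current.append((token, tag))
            else:
                if current:
                    entities.append(current)
                current = [(token, tag)] if tag != 'O' else []
        if current:
            entities.append(current)
        return entities

find_companies then joins each grouped entity back into one string and keeps the ones tagged ORGANIZATION, exactly as the hunk above shows.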
@@ -38,15 +38,20 @@ class NaiveBayes:
         cv = CountVectorizer()
 
         # use stratified k-fold cross-validation as split method
-        skf = StratifiedKFold(n_splits = 10, shuffle=True)
+        skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
 
         classifier = GaussianNB()
 
-        # lists for metrics
+        # metrics
         recall_scores = []
         precision_scores = []
         f1_scores = []
 
+        # probabilities of each class (of each fold)
+        class_prob = []
+        # counts number of training samples observed in each class
+        class_counts = []
+
         # for each fold
         n = 0
         for train, test in skf.split(X,y):
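Besides the two new bookkeeping lists, the behavioural change in this hunk is pinning random_state=5: with shuffle=True, StratifiedKFold produces the same folds across runs only when the seed is fixed, which makes the printed metrics reproducible. A small standalone illustration (not from the commit):

# demonstrates that a fixed random_state makes the shuffled stratified folds repeatable
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.arange(20).reshape(-1, 1)   # toy feature matrix
y = np.array([0, 1] * 10)          # balanced binary labels

folds_a = [test.tolist() for _, test in
           StratifiedKFold(n_splits=10, shuffle=True, random_state=5).split(X, y)]
folds_b = [test.tolist() for _, test in
           StratifiedKFold(n_splits=10, shuffle=True, random_state=5).split(X, y)]

assert folds_a == folds_b   # identical splits on every run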
@@ -54,18 +59,18 @@ class NaiveBayes:
             n += 1
             print('# split no. ' + str(n))
 
-            # eigenes BOW => schlechtere ergebnisse
-            vocab = BagOfWords.make_vocab(X[train])
-            # fit the training data and then return the matrix
-            training_data = BagOfWords.make_matrix(X[train], vocab)
-            # transform testing data and return the matrix
-            testing_data = BagOfWords.make_matrix(X[test], vocab)
-
-            # # # using CountVectorizer:
+            # # eigenes BOW => schlechtere ergebnisse  (own BOW => worse results)
+            # vocab = BagOfWords.make_vocab(X[train])
             # # fit the training data and then return the matrix
-            # training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # training_data = BagOfWords.make_matrix(X[train], vocab)
             # # transform testing data and return the matrix
-            # testing_data = cv.transform(X[test]).toarray()
+            # testing_data = BagOfWords.make_matrix(X[test], vocab)
 
+            # using CountVectorizer:
+            # fit the training data and then return the matrix
+            training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # transform testing data and return the matrix
+            testing_data = cv.transform(X[test]).toarray()
+
             # # apply select percentile
             # selector = SelectPercentile(percentile=25)
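This hunk retires the hand-rolled BagOfWords features in favour of CountVectorizer, keeping the old calls only as comments (the German note says the own bag-of-words gave worse results). BagOfWords.make_vocab and make_matrix are defined elsewhere in the repository and are not shown in this commit, so the sketch below is only a plausible reconstruction of what such helpers usually do; the regex tokenizer and lower-casing are assumptions.

# Hypothetical sketch of the commented-out BagOfWords helpers; the repository's
# real implementation may tokenize and weight terms differently.
import re
import numpy as np

class BagOfWords:

    def make_vocab(texts):
        # sorted set of lower-cased word tokens over all training texts
        vocab = set()
        for text in texts:
            vocab.update(re.findall(r"[a-z']+", text.lower()))
        return sorted(vocab)

    def make_matrix(texts, vocab):
        # term-count matrix: one row per text, one column per vocabulary word
        index = {word: i for i, word in enumerate(vocab)}
        matrix = np.zeros((len(texts), len(vocab)))
        for row, text in enumerate(texts):
            for word in re.findall(r"[a-z']+", text.lower()):
                if word in index:
                    matrix[row, index[word]] += 1
        return matrix

Functionally, make_vocab plays the role of CountVectorizer's fit and make_matrix the role of transform, which is why the two code paths can be swapped inside the fold loop.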
@@ -97,6 +102,9 @@ class NaiveBayes:
             # equation for f1 score
             f1_scores.append(2 * (prec * rec)/(prec + rec))
 
+            class_prob.append(classifier.class_prior_)
+            class_counts.append(classifier.class_count_)
+
         ##########################
         #print metrics of test set
         print('-------------------------')
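The two appended values are fitted-model attributes collected once per fold. A tiny standalone check (not part of the commit) of what GaussianNB exposes there:

# class_count_ holds the number of training samples seen per class,
# class_prior_ the empirical class probabilities derived from those counts
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[0.0], [0.1], [0.2], [1.0], [1.1], [1.2], [1.3]])
y = np.array([0, 0, 0, 1, 1, 1, 1])

clf = GaussianNB().fit(X, y)
print(clf.class_count_)   # [3. 4.]
print(clf.class_prior_)   # [0.42857143 0.57142857]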
@@ -114,6 +122,15 @@ class NaiveBayes:
                         max(f1_scores),
                         sum(f1_scores)/float(len(f1_scores))))
         print()
+        # print probability of each class
+        print('probability of each class:')
+        print()
+        print(class_prob)
+        print()
+        print('number of samples of each class:')
+        print()
+        print(class_counts)
+        print()
 
         ##### nur für overfit testing ###########  (only for overfit testing)
         #print('overfit testing: prediction of training set')
NaiveBayes_simple.py (167 lines, new file)

@@ -0,0 +1,167 @@
+'''
+Naive Bayes Classifier
+======================
+
+basic implementation of naive bayes.
+'''
+
+from CsvHandler import CsvHandler
+
+from sklearn.feature_extraction.text import CountVectorizer
+
+from sklearn.metrics import recall_score, precision_score
+from sklearn.model_selection import KFold
+from sklearn.naive_bayes import GaussianNB
+
+class NaiveBayes_simple:
+
+    def make_naive_bayes(dataset):
+        '''fits naive bayes model with StratifiedKFold,
+        uses my BOW
+        '''
+        print('# fitting model')
+        print('# ...')
+
+        # split data into text and label set
+        # join title and text
+        X = dataset['Title'] + ' ' + dataset['Text']
+        y = dataset['Label']
+
+        cv = CountVectorizer()
+
+        # k-fold cross-validation as split method
+        kf = KFold(n_splits=10, shuffle=True, random_state=5)
+
+        classifier = GaussianNB()
+
+        # metrics
+        recall_scores = []
+        precision_scores = []
+        f1_scores = []
+
+        # probabilities of each class (of each fold)
+        class_prob = []
+        # counts number of training samples observed in each class
+        class_counts = []
+
+        # for each fold
+        n = 0
+        for train, test in kf.split(X,y):
+
+            n += 1
+            print('# split no. ' + str(n))
+
+            # using CountVectorizer:
+            # fit the training data and then return the matrix
+            training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # transform testing data and return the matrix
+            testing_data = cv.transform(X[test]).toarray()
+
+            #fit classifier
+            classifier.fit(training_data, y[train])
+            #predict class
+            predictions_train = classifier.predict(training_data)
+            predictions_test = classifier.predict(testing_data)
+
+            #print and store metrics
+            rec = recall_score(y[test], predictions_test)
+            print('rec: ' + str(rec))
+            recall_scores.append(rec)
+            prec = precision_score(y[train], predictions_train)
+            print('prec: ' + str(prec))
+            print('#')
+            precision_scores.append(prec)
+            # equation for f1 score
+            f1_scores.append(2 * (prec * rec)/(prec + rec))
+
+            class_prob.append(classifier.class_prior_)
+            class_counts.append(classifier.class_count_)
+
+        ##########################
+        #print metrics of test set
+        print('-------------------------')
+        print('prediction of testing set:')
+        print('Precision score: min = {}, max = {}, average = {}'
+                .format(min(precision_scores),
+                        max(precision_scores),
+                        sum(precision_scores)/float(len(precision_scores))))
+        print('Recall score: min = {}, max = {}, average = {}'
+                .format(min(recall_scores),
+                        max(recall_scores),
+                        sum(recall_scores)/float(len(recall_scores))))
+        print('F1 score: min = {}, max = {}, average = {}'
+                .format(min(f1_scores),
+                        max(f1_scores),
+                        sum(f1_scores)/float(len(f1_scores))))
+        print()
+        # print probability of each class
+        print('probability of each class:')
+        print()
+        print(class_prob)
+        print()
+        print('number of samples of each class:')
+        print()
+        print(class_counts)
+        print()
+
+        ##### nur für overfit testing ###########  (only for overfit testing)
+        #print('overfit testing: prediction of training set')
+        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
+        #format(min(f1_scores_train), max(f1_scores_train),
+        #sum(f1_scores_train)/float(len(f1_scores_train))))
+        #print()
+
+    ######## nur für resubstitutionsfehler benötigt ########  (only needed for the resubstitution error)
+    def analyze_errors(dataset):
+        '''calculates resubstitution error
+        shows indices of false classified articles
+        uses Gaussian Bayes with train test split
+        '''
+        X_train_test = dataset['Title'] + ' ' + dataset['Text']
+        y_train_test = dataset['Label']
+
+        count_vector = CountVectorizer()
+        # fit the training data and then return the matrix
+        training_data = count_vector.fit_transform(X_train_test).toarray()
+        # transform testing data and return the matrix
+        testing_data = count_vector.transform(X_train_test).toarray()
+
+        # Naive Bayes
+        classifier = GaussianNB()
+        # fit classifier
+        classifier.fit(training_data, y_train_test)
+
+        # Predict class
+        predictions = classifier.predict(testing_data)
+        print('Errors at index:')
+        print()
+        n = 0
+        for i in range(len(y_train_test)):
+            if y_train_test[i] != predictions[i]:
+                n += 1
+                print('error no.{}'.format(n))
+                print('prediction at index {} is: {}, but actual is: {}'
+                .format(i, predictions[i], y_train_test[i]))
+                print(X_train_test[i])
+                print(y_train_test[i])
+                print()
+        #print metrics
+        print('F1 score: ', format(f1_score(y_train_test, predictions)))
+
+    if __name__ == '__main__':
+
+        print('# starting naive bayes')
+        print('# ...')
+
+        file = 'classification_labelled_corrected.csv'
+
+        # read csv file
+        print('# reading dataset')
+        print('# ...')
+
+        dataset = CsvHandler.read_csv(file)
+
+        make_naive_bayes(dataset)
+
+        print('#')
+        print('# ending naive bayes')
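Two details in the new file are worth flagging: analyze_errors calls f1_score, which is never imported (only recall_score and precision_score are), so that method raises a NameError when used; and the if __name__ == '__main__': block sits inside the class body, so it runs while the class is being defined rather than as a conventional module-level entry point. The sketch below is not the committed code; it condenses the per-fold loop with the missing import added and, as a further assumption about the intent, computes precision on the held-out fold rather than on the training predictions (the commit mixes the two). X is assumed to be an array of article texts and y an array of 0/1 labels.

# Hedged sketch, not the committed NaiveBayes_simple: same fold loop, but with
# f1_score imported and all three metrics computed on the held-out fold.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB

def cross_validate(X, y, n_splits=10):
    precisions, recalls, f1s = [], [], []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=5)
    for train, test in kf.split(X):
        cv = CountVectorizer()
        # fit the vectorizer on the training fold only, then transform both folds
        training_data = cv.fit_transform(X[train]).toarray()
        testing_data = cv.transform(X[test]).toarray()
        classifier = GaussianNB().fit(training_data, y[train])
        predictions = classifier.predict(testing_data)
        precisions.append(precision_score(y[test], predictions))
        recalls.append(recall_score(y[test], predictions))
        f1s.append(f1_score(y[test], predictions))
    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

Called as cross_validate(np.array(texts), np.array(labels)), it mirrors the commit's 10-fold setup while keeping training and test statistics separate.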