changes due to NER
This commit is contained in:
parent 14e5af9d7d
commit 188a2d582c

NER.py (43 changed lines)
@@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.
 
 import os
 
+import matplotlib.pyplot as plt
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize
 
@@ -41,13 +42,38 @@ class NER:
             continuous_chunk.append(current_chunk)
         return continuous_chunk
 
-    if __name__ == '__main__':
+    def plot_barchart():
+        organizations = ['org1', 'org2', 'org3', 'org4', 'org5', 'org6']
+        num_mentions = [5, 2, 33, 12, 6, 10]
+        #n, bins, patches = plt.hist(num_mentions, 6, normed=1, facecolor='green')
+        plt.plot(organizations, num_mentions, 'ro', ms = 10)
+        plt.xlabel('companies')
+        plt.ylabel('count')
+        plt.title('Company mentions in articles')
+        plt.grid(True)
+        plt.show()
+
+    def find_companies(text):
         #set paths
         java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
         os.environ['JAVAHOME'] = java_path
 
-        text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
+        organizations = []
+        # create list of (word, tag) tuples
+        tagged_words = NER.tag_words(text)
+        # put coherent names together
+        nes = NER.get_coherent_names(tagged_words)
+        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
+        #print(nes_coherent)
+        for tuple in nes_coherent:
+            if tuple[1] == 'ORGANIZATION':
+                organizations.append(tuple[0])
+        return organizations
+
+if __name__ == '__main__':
+
+    #plot_barchart()
+    text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
                     \nmostly fell in light volumes on Tuesday as energy shares
                     tracked \nfalls in global oil prices, while weaknesses in banking shares
                     \namid concerns about loans to an ailing steel firm sent the Thai
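Note on the two new methods: both are defined inside class NER without self or a @staticmethod decorator, so module-level code can only reach them through the class, which is how the new __main__ block above calls find_companies; the commented-out plot_barchart() call would need the same qualified form. A short usage sketch under that assumption (the sample text is illustrative, not from the repository):

# assumes NER.py is importable and the Stanford NER paths set in find_companies are valid
from NER import NER

text = "Keppel Corp and Perusahaan Gas Negara were mentioned in the Reuters report."
print(NER.find_companies(text))   # list of strings tagged ORGANIZATION by Stanford NER
NER.plot_barchart()               # qualified form that the commented-out #plot_barchart() would need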
@@ -71,15 +97,4 @@ class NER:
                     region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
                     Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
                     \namid uncertainty over global demand. \nFor Asian Companies click.'''
-
-        organizations = []
-        # create list of (word, tag) tuples
-        tagged_words = tag_words(text)
-        # put coherent names together
-        nes = get_coherent_names(tagged_words)
-        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
-        #print(nes_coherent)
-        for tuple in nes_coherent:
-            if tuple[1] == 'ORGANIZATION':
-                organizations.append(tuple[0])
-        print(organizations)
+    print(NER.find_companies(text))
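For orientation, a minimal sketch of the helpers find_companies relies on. NER.tag_words and NER.get_coherent_names are not touched by this diff and their code is not shown here, so the bodies below are assumptions based on typical StanfordNERTagger usage; the model and jar filenames are placeholders, not the repository's actual paths.

# Hypothetical reconstruction of the NER helpers this diff calls; the real
# tag_words / get_coherent_names in NER.py may differ.
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

class NER:

    def tag_words(text):
        # placeholder paths: point these at the Stanford NER model and jar on disk
        tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                                   'stanford-ner.jar', encoding='utf-8')
        return tagger.tag(word_tokenize(text))

    def get_coherent_names(tagged_words):
        # group consecutive tokens with the same non-'O' tag into one named entity
        entities, current = [], []
        for token, tag in tagged_words:
            if tag != 'O' and (not current or current[-1][1] == tag):
                current.append((token, tag))
            else:
                if current:
                    entities.append(current)
                current = [(token, tag)] if tag != 'O' else []
        if current:
            entities.append(current)
        return entities

find_companies then joins each grouped entity back into one string and keeps the ones tagged ORGANIZATION, exactly as the hunk above shows.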
@@ -38,15 +38,20 @@ class NaiveBayes:
         cv = CountVectorizer()
 
         # use stratified k-fold cross-validation as split method
-        skf = StratifiedKFold(n_splits = 10, shuffle=True)
+        skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
 
         classifier = GaussianNB()
 
-        # lists for metrics
+        # metrics
         recall_scores = []
         precision_scores = []
         f1_scores = []
 
+        # probabilities of each class (of each fold)
+        class_prob = []
+        # counts number of training samples observed in each class
+        class_counts = []
+
         # for each fold
         n = 0
         for train, test in skf.split(X,y):
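Besides the two new bookkeeping lists, the behavioural change in this hunk is pinning random_state=5: with shuffle=True, StratifiedKFold produces the same folds across runs only when the seed is fixed, which makes the printed metrics reproducible. A small standalone illustration (not from the commit):

# demonstrates that a fixed random_state makes the shuffled stratified folds repeatable
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.arange(20).reshape(-1, 1)   # toy feature matrix
y = np.array([0, 1] * 10)          # balanced binary labels

folds_a = [test.tolist() for _, test in
           StratifiedKFold(n_splits=10, shuffle=True, random_state=5).split(X, y)]
folds_b = [test.tolist() for _, test in
           StratifiedKFold(n_splits=10, shuffle=True, random_state=5).split(X, y)]

assert folds_a == folds_b   # identical splits on every run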
@@ -54,18 +59,18 @@ class NaiveBayes:
             n += 1
             print('# split no. ' + str(n))
 
-            # eigenes BOW => schlechtere ergebnisse
-            vocab = BagOfWords.make_vocab(X[train])
-            # fit the training data and then return the matrix
-            training_data = BagOfWords.make_matrix(X[train], vocab)
-            # transform testing data and return the matrix
-            testing_data = BagOfWords.make_matrix(X[test], vocab)
-
-            # # # using CountVectorizer:
+            # # eigenes BOW => schlechtere ergebnisse  (own BOW => worse results)
+            # vocab = BagOfWords.make_vocab(X[train])
             # # fit the training data and then return the matrix
-            # training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # training_data = BagOfWords.make_matrix(X[train], vocab)
             # # transform testing data and return the matrix
-            # testing_data = cv.transform(X[test]).toarray()
+            # testing_data = BagOfWords.make_matrix(X[test], vocab)
 
+            # using CountVectorizer:
+            # fit the training data and then return the matrix
+            training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # transform testing data and return the matrix
+            testing_data = cv.transform(X[test]).toarray()
+
             # # apply select percentile
             # selector = SelectPercentile(percentile=25)
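This hunk retires the hand-rolled BagOfWords features in favour of CountVectorizer, keeping the old calls only as comments (the German note says the own bag-of-words gave worse results). BagOfWords.make_vocab and make_matrix are defined elsewhere in the repository and are not shown in this commit, so the sketch below is only a plausible reconstruction of what such helpers usually do; the regex tokenizer and lower-casing are assumptions.

# Hypothetical sketch of the commented-out BagOfWords helpers; the repository's
# real implementation may tokenize and weight terms differently.
import re
import numpy as np

class BagOfWords:

    def make_vocab(texts):
        # sorted set of lower-cased word tokens over all training texts
        vocab = set()
        for text in texts:
            vocab.update(re.findall(r"[a-z']+", text.lower()))
        return sorted(vocab)

    def make_matrix(texts, vocab):
        # term-count matrix: one row per text, one column per vocabulary word
        index = {word: i for i, word in enumerate(vocab)}
        matrix = np.zeros((len(texts), len(vocab)))
        for row, text in enumerate(texts):
            for word in re.findall(r"[a-z']+", text.lower()):
                if word in index:
                    matrix[row, index[word]] += 1
        return matrix

Functionally, make_vocab plays the role of CountVectorizer's fit and make_matrix the role of transform, which is why the two code paths can be swapped inside the fold loop.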
@@ -97,6 +102,9 @@ class NaiveBayes:
             # equation for f1 score
             f1_scores.append(2 * (prec * rec)/(prec + rec))
 
+            class_prob.append(classifier.class_prior_)
+            class_counts.append(classifier.class_count_)
+
         ##########################
         #print metrics of test set
         print('-------------------------')
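The two appended values are fitted-model attributes collected once per fold. A tiny standalone check (not part of the commit) of what GaussianNB exposes there:

# class_count_ holds the number of training samples seen per class,
# class_prior_ the empirical class probabilities derived from those counts
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[0.0], [0.1], [0.2], [1.0], [1.1], [1.2], [1.3]])
y = np.array([0, 0, 0, 1, 1, 1, 1])

clf = GaussianNB().fit(X, y)
print(clf.class_count_)   # [3. 4.]
print(clf.class_prior_)   # [0.42857143 0.57142857]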
@@ -114,6 +122,15 @@ class NaiveBayes:
                         max(f1_scores),
                         sum(f1_scores)/float(len(f1_scores))))
         print()
+        # print probability of each class
+        print('probability of each class:')
+        print()
+        print(class_prob)
+        print()
+        print('number of samples of each class:')
+        print()
+        print(class_counts)
+        print()
 
         ##### nur für overfit testing ###########  (only for overfit testing)
         #print('overfit testing: prediction of training set')
NaiveBayes_simple.py (167 lines, new file)

@@ -0,0 +1,167 @@
+'''
+Naive Bayes Classifier
+======================
+
+basic implementation of naive bayes.
+'''
+
+from CsvHandler import CsvHandler
+
+from sklearn.feature_extraction.text import CountVectorizer
+
+from sklearn.metrics import recall_score, precision_score
+from sklearn.model_selection import KFold
+from sklearn.naive_bayes import GaussianNB
+
+class NaiveBayes_simple:
+
+    def make_naive_bayes(dataset):
+        '''fits naive bayes model with StratifiedKFold,
+        uses my BOW
+        '''
+        print('# fitting model')
+        print('# ...')
+
+        # split data into text and label set
+        # join title and text
+        X = dataset['Title'] + ' ' + dataset['Text']
+        y = dataset['Label']
+
+        cv = CountVectorizer()
+
+        # k-fold cross-validation as split method
+        kf = KFold(n_splits=10, shuffle=True, random_state=5)
+
+        classifier = GaussianNB()
+
+        # metrics
+        recall_scores = []
+        precision_scores = []
+        f1_scores = []
+
+        # probabilities of each class (of each fold)
+        class_prob = []
+        # counts number of training samples observed in each class
+        class_counts = []
+
+        # for each fold
+        n = 0
+        for train, test in kf.split(X,y):
+
+            n += 1
+            print('# split no. ' + str(n))
+
+            # using CountVectorizer:
+            # fit the training data and then return the matrix
+            training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # transform testing data and return the matrix
+            testing_data = cv.transform(X[test]).toarray()
+
+            #fit classifier
+            classifier.fit(training_data, y[train])
+            #predict class
+            predictions_train = classifier.predict(training_data)
+            predictions_test = classifier.predict(testing_data)
+
+            #print and store metrics
+            rec = recall_score(y[test], predictions_test)
+            print('rec: ' + str(rec))
+            recall_scores.append(rec)
+            prec = precision_score(y[train], predictions_train)
+            print('prec: ' + str(prec))
+            print('#')
+            precision_scores.append(prec)
+            # equation for f1 score
+            f1_scores.append(2 * (prec * rec)/(prec + rec))
+
+            class_prob.append(classifier.class_prior_)
+            class_counts.append(classifier.class_count_)
+
+        ##########################
+        #print metrics of test set
+        print('-------------------------')
+        print('prediction of testing set:')
+        print('Precision score: min = {}, max = {}, average = {}'
+                .format(min(precision_scores),
+                        max(precision_scores),
+                        sum(precision_scores)/float(len(precision_scores))))
+        print('Recall score: min = {}, max = {}, average = {}'
+                .format(min(recall_scores),
+                        max(recall_scores),
+                        sum(recall_scores)/float(len(recall_scores))))
+        print('F1 score: min = {}, max = {}, average = {}'
+                .format(min(f1_scores),
+                        max(f1_scores),
+                        sum(f1_scores)/float(len(f1_scores))))
+        print()
+        # print probability of each class
+        print('probability of each class:')
+        print()
+        print(class_prob)
+        print()
+        print('number of samples of each class:')
+        print()
+        print(class_counts)
+        print()
+
+        ##### nur für overfit testing ###########  (only for overfit testing)
+        #print('overfit testing: prediction of training set')
+        #print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
+        #format(min(f1_scores_train), max(f1_scores_train),
+        #sum(f1_scores_train)/float(len(f1_scores_train))))
+        #print()
+
+    ######## nur für resubstitutionsfehler benötigt ########  (only needed for the resubstitution error)
+    def analyze_errors(dataset):
+        '''calculates resubstitution error
+        shows indices of false classified articles
+        uses Gaussian Bayes with train test split
+        '''
+        X_train_test = dataset['Title'] + ' ' + dataset['Text']
+        y_train_test = dataset['Label']
+
+        count_vector = CountVectorizer()
+        # fit the training data and then return the matrix
+        training_data = count_vector.fit_transform(X_train_test).toarray()
+        # transform testing data and return the matrix
+        testing_data = count_vector.transform(X_train_test).toarray()
+
+        # Naive Bayes
+        classifier = GaussianNB()
+        # fit classifier
+        classifier.fit(training_data, y_train_test)
+
+        # Predict class
+        predictions = classifier.predict(testing_data)
+        print('Errors at index:')
+        print()
+        n = 0
+        for i in range(len(y_train_test)):
+            if y_train_test[i] != predictions[i]:
+                n += 1
+                print('error no.{}'.format(n))
+                print('prediction at index {} is: {}, but actual is: {}'
+                .format(i, predictions[i], y_train_test[i]))
+                print(X_train_test[i])
+                print(y_train_test[i])
+                print()
+        #print metrics
+        print('F1 score: ', format(f1_score(y_train_test, predictions)))
+
+    if __name__ == '__main__':
+
+        print('# starting naive bayes')
+        print('# ...')
+
+        file = 'classification_labelled_corrected.csv'
+
+        # read csv file
+        print('# reading dataset')
+        print('# ...')
+
+        dataset = CsvHandler.read_csv(file)
+
+        make_naive_bayes(dataset)
+
+        print('#')
+        print('# ending naive bayes')
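Two details in the new file are worth flagging: analyze_errors calls f1_score, which is never imported (only recall_score and precision_score are), so that method raises a NameError when used; and the if __name__ == '__main__': block sits inside the class body, so it runs while the class is being defined rather than as a conventional module-level entry point. The sketch below is not the committed code; it condenses the per-fold loop with the missing import added and, as a further assumption about the intent, computes precision on the held-out fold rather than on the training predictions (the commit mixes the two). X is assumed to be an array of article texts and y an array of 0/1 labels.

# Hedged sketch, not the committed NaiveBayes_simple: same fold loop, but with
# f1_score imported and all three metrics computed on the held-out fold.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB

def cross_validate(X, y, n_splits=10):
    precisions, recalls, f1s = [], [], []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=5)
    for train, test in kf.split(X):
        cv = CountVectorizer()
        # fit the vectorizer on the training fold only, then transform both folds
        training_data = cv.fit_transform(X[train]).toarray()
        testing_data = cv.transform(X[test]).toarray()
        classifier = GaussianNB().fit(training_data, y[train])
        predictions = classifier.predict(testing_data)
        precisions.append(precision_score(y[test], predictions))
        recalls.append(recall_score(y[test], predictions))
        f1s.append(f1_score(y[test], predictions))
    return np.mean(precisions), np.mean(recalls), np.mean(f1s)

Called as cross_validate(np.array(texts), np.array(labels)), it mirrors the commit's 10-fold setup while keeping training and test statistics separate.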