changes due to NER

2018-09-24 13:50:11 +02:00 · 2018-09-24 13:50:11 +02:00 · 188a2d582c
commit 188a2d582c
parent 14e5af9d7d
3 changed files with 225 additions and 26 deletions
--- a/NER.py
+++ b/NER.py
@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.

 import os

+import matplotlib.pyplot as plt
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize

@ -41,12 +42,37 @@ class NER:
            continuous_chunk.append(current_chunk)
        return continuous_chunk

-    if __name__ == '__main__':
+    def plot_barchart():
+        '''Show a demo plot of how often companies are mentioned.
+
+        The data is hard-coded sample data. Despite the name, one red
+        circle marker per company is drawn (plt.plot with 'ro'), not bars.
+        Takes no arguments and returns None; the figure is displayed via
+        plt.show().
+        '''
+        # sample data: company name -> number of mentions
+        mentions = {'org1': 5, 'org2': 2, 'org3': 33,
+                    'org4': 12, 'org5': 6, 'org6': 10}
+        names = list(mentions.keys())
+        counts = list(mentions.values())
+
+        # red circles ('ro') with marker size 10
+        plt.plot(names, counts, 'ro', ms=10)
+        plt.title('Company mentions in articles')
+        plt.xlabel('companies')
+        plt.ylabel('count')
+        plt.grid(True)
+        plt.show()

+    def find_companies(text):
+        '''Return the names of all organizations found in text.
+
+        The text is tagged with NER.tag_words, the tagged words are
+        grouped into coherent names with NER.get_coherent_names, and the
+        names tagged 'ORGANIZATION' are returned.
+
+        Note: every call sets the JAVAHOME environment variable to a
+        hard-coded Windows JRE path.
+
+        Args:
+            text: the article text to search.
+
+        Returns:
+            list of str, one entry per organization chunk, each built by
+            joining the chunk's tokens with single spaces. Duplicates are
+            kept and the order of NER.get_coherent_names is preserved.
+        '''
        #set paths
        java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
        os.environ['JAVAHOME'] = java_path

+        # create list of (word, tag) tuples
+        tagged_words = NER.tag_words(text)
+        # put coherent names together
+        nes = NER.get_coherent_names(tagged_words)
+        # a chunk's tag is taken from its first token; keep organizations only
+        return [" ".join(token for token, tag in ne)
+                for ne in nes
+                if ne[0][1] == 'ORGANIZATION']
+
+if __name__ == '__main__':
+
+    #plot_barchart()
    text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
                    \nmostly fell in light volumes on Tuesday as energy shares
                    tracked \nfalls in global oil prices, while weaknesses in banking shares
@ -71,15 +97,4 @@ class NER:
                    region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
                    Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
                    \namid uncertainty over global demand. \nFor Asian Companies click.'''
-
-        organizations = []
-        # create list of (word, tag) tuples
-        tagged_words = tag_words(text)
-        # put coherent names together
-        nes = get_coherent_names(tagged_words)
-        nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
-        #print(nes_coherent)
-        for tuple in nes_coherent:
-            if tuple[1] == 'ORGANIZATION':
-                organizations.append(tuple[0])
-        print(organizations)
+    print(NER.find_companies(text))
--- a/NaiveBayes.py
+++ b/NaiveBayes.py
@ -38,15 +38,20 @@ class NaiveBayes:
        cv = CountVectorizer()

        # use stratified k-fold cross-validation as split method
-        skf = StratifiedKFold(n_splits = 10, shuffle=True)
+        skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)

        classifier = GaussianNB()

-        # lists for metrics
+        # metrics
        recall_scores = []
        precision_scores = []
        f1_scores = []

+        # probabilities of each class (of each fold)
+        class_prob = []
+        # counts number of training samples observed in each class 
+        class_counts = []
+
        # for each fold
        n = 0
        for train, test in skf.split(X,y):
@ -54,18 +59,18 @@ class NaiveBayes:
            n += 1
            print('# split no. ' + str(n))

-            # eigenes BOW => schlechtere ergebnisse
-            vocab = BagOfWords.make_vocab(X[train])
-            # fit the training data and then return the matrix
-            training_data = BagOfWords.make_matrix(X[train], vocab)
-            # transform testing data and return the matrix
-            testing_data = BagOfWords.make_matrix(X[test], vocab)
-
-            # # # using CountVectorizer:
+            # # eigenes BOW => schlechtere ergebnisse
+            # vocab = BagOfWords.make_vocab(X[train])
            # # fit the training data and then return the matrix
-            # training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # training_data = BagOfWords.make_matrix(X[train], vocab)
            # # transform testing data and return the matrix
-            # testing_data = cv.transform(X[test]).toarray()
+            # testing_data = BagOfWords.make_matrix(X[test], vocab)
+
+            # using CountVectorizer:
+            # fit the training data and then return the matrix
+            training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # transform testing data and return the matrix
+            testing_data = cv.transform(X[test]).toarray()

            # # apply select percentile
            # selector = SelectPercentile(percentile=25)
@ -97,6 +102,9 @@ class NaiveBayes:
            # equation for f1 score
            f1_scores.append(2 * (prec * rec)/(prec + rec))

+            class_prob.append(classifier.class_prior_)
+            class_counts.append(classifier.class_count_)
+
        ##########################
        #print metrics of test set
        print('-------------------------')
@ -114,6 +122,15 @@ class NaiveBayes:
                        max(f1_scores),
                        sum(f1_scores)/float(len(f1_scores))))
        print()
+        # print probability of each class
+        print('probability of each class:')
+        print()
+        print(class_prob)
+        print()
+        print('number of samples of each class:')
+        print()
+        print(class_counts)
+        print()

        ##### nur für overfit testing ###########
        #print('overfit testing: prediction of training set')
--- a/NaiveBayes_simple.py
+++ b/NaiveBayes_simple.py
@ -0,0 +1,167 @@
+'''
+Naive Bayes Classifier
+======================
+
+basic implementation of naive bayes.
+'''
+
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics import f1_score, precision_score, recall_score
+from sklearn.model_selection import KFold
+from sklearn.naive_bayes import GaussianNB
+
+from CsvHandler import CsvHandler
+
+class NaiveBayes_simple:
+    '''Basic Gaussian naive Bayes experiments on a labelled article dataset.
+
+    The functions take no ``self``; they are static methods and are called
+    on the class, e.g. ``NaiveBayes_simple.make_naive_bayes(dataset)``.
+    '''
+
+    @staticmethod
+    def make_naive_bayes(dataset):
+        '''Cross-validate a Gaussian naive Bayes classifier and print metrics.
+
+        Title and text of every row are joined into one document, turned
+        into word counts with CountVectorizer and classified with
+        GaussianNB. Splitting uses a shuffled 10-fold KFold with a fixed
+        random_state, so the folds are the same on every run.
+
+        Args:
+            dataset: table with the columns 'Title', 'Text' and 'Label'
+                (presumably the pandas DataFrame returned by
+                CsvHandler.read_csv -- TODO confirm).
+
+        Returns:
+            None. Recall and precision of each fold, the min/max/average
+            of precision, recall and F1 over all folds, and the class
+            priors and class counts of each fold are printed.
+        '''
+        print('# fitting model')
+        print('# ...')
+
+        # split data into text and label set
+        # join title and text
+        X = dataset['Title'] + ' ' + dataset['Text']
+        y = dataset['Label']
+
+        cv = CountVectorizer()
+
+        # k-fold cross-validation as split method
+        kf = KFold(n_splits=10, shuffle=True, random_state=5)
+
+        classifier = GaussianNB()
+
+        # metrics of each fold
+        recall_scores = []
+        precision_scores = []
+        f1_scores = []
+
+        # probabilities of each class (of each fold)
+        class_prob = []
+        # number of training samples observed in each class (of each fold)
+        class_counts = []
+
+        # for each fold
+        for n, (train, test) in enumerate(kf.split(X, y), start=1):
+
+            print('# split no. ' + str(n))
+
+            # fit the vocabulary on the training fold only ...
+            training_data = cv.fit_transform(X[train], y[train]).toarray()
+            # ... and reuse it to transform the test fold
+            testing_data = cv.transform(X[test]).toarray()
+
+            # fit classifier
+            classifier.fit(training_data, y[train])
+            # predict class of the held-out fold
+            predictions_test = classifier.predict(testing_data)
+
+            # print and store metrics; recall AND precision are both
+            # measured on the test fold so that F1 is consistent
+            rec = recall_score(y[test], predictions_test)
+            print('rec: ' + str(rec))
+            recall_scores.append(rec)
+            prec = precision_score(y[test], predictions_test)
+            print('prec: ' + str(prec))
+            print('#')
+            precision_scores.append(prec)
+            # F1 is the harmonic mean of precision and recall;
+            # it is defined as 0 when both of them are 0
+            if prec + rec:
+                f1_scores.append(2 * (prec * rec) / (prec + rec))
+            else:
+                f1_scores.append(0.0)
+
+            class_prob.append(classifier.class_prior_)
+            class_counts.append(classifier.class_count_)
+
+        ##########################
+        # print metrics of test set
+        print('-------------------------')
+        print('prediction of testing set:')
+        for name, scores in (('Precision', precision_scores),
+                             ('Recall', recall_scores),
+                             ('F1', f1_scores)):
+            print('{} score: min = {}, max = {}, average = {}'
+                  .format(name,
+                          min(scores),
+                          max(scores),
+                          sum(scores) / float(len(scores))))
+        print()
+        # print probability of each class
+        print('probability of each class:')
+        print()
+        print(class_prob)
+        print()
+        print('number of samples of each class:')
+        print()
+        print(class_counts)
+        print()
+
+    ######## only needed for the resubstitution error ########
+    @staticmethod
+    def analyze_errors(dataset):
+        '''Print the resubstitution errors of a Gaussian naive Bayes model.
+
+        The classifier is fitted on the complete dataset and then predicts
+        that same data (there is no train/test split). Every misclassified
+        article is printed with its position, the predicted and the actual
+        label, followed by the F1 score over the whole dataset.
+
+        Args:
+            dataset: table with the columns 'Title', 'Text' and 'Label'.
+
+        Returns:
+            None. All results are printed.
+        '''
+        X_train_test = dataset['Title'] + ' ' + dataset['Text']
+        y_train_test = dataset['Label']
+
+        count_vector = CountVectorizer()
+        # training and "testing" data are the same documents, so the
+        # count matrix is built once and used for both fit and predict
+        data = count_vector.fit_transform(X_train_test).toarray()
+
+        # Naive Bayes
+        classifier = GaussianNB()
+        # fit classifier
+        classifier.fit(data, y_train_test)
+
+        # Predict class
+        predictions = classifier.predict(data)
+        print('Errors at index:')
+        print()
+        n = 0
+        # iterate by position so texts and labels stay aligned with the
+        # (positional) prediction array
+        for i, (text, actual, predicted) in enumerate(
+                zip(X_train_test, y_train_test, predictions)):
+            if actual != predicted:
+                n += 1
+                print('error no.{}'.format(n))
+                print('prediction at index {} is: {}, but actual is: {}'
+                      .format(i, predicted, actual))
+                print(text)
+                print(actual)
+                print()
+        # print metrics
+        print('F1 score: ', format(f1_score(y_train_test, predictions)))
+
+
+if __name__ == '__main__':
+
+    print('# starting naive bayes')
+    print('# ...')
+
+    file = 'classification_labelled_corrected.csv'
+
+    # read csv file
+    print('# reading dataset')
+    print('# ...')
+
+    dataset = CsvHandler.read_csv(file)
+
+    NaiveBayes_simple.make_naive_bayes(dataset)
+
+    print('#')
+    print('# ending naive bayes')