changes due to NER

This commit is contained in:
Anne Lorenz 2018-09-24 13:50:11 +02:00
parent 14e5af9d7d
commit 188a2d582c
3 changed files with 225 additions and 26 deletions

43
NER.py

@@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.
import os
import matplotlib.pyplot as plt
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
@@ -41,13 +42,38 @@ class NER:
continuous_chunk.append(current_chunk)
return continuous_chunk
if __name__ == '__main__':
def plot_barchart():
organizations = ['org1', 'org2', 'org3', 'org4', 'org5', 'org6']
num_mentions = [5, 2, 33, 12, 6, 10]
#n, bins, patches = plt.hist(num_mentions, 6, normed=1, facecolor='green')
plt.plot(organizations, num_mentions, 'ro', ms = 10)
plt.xlabel('companies')
plt.ylabel('count')
plt.title('Company mentions in articles')
plt.grid(True)
plt.show()
def find_companies(text):
#set paths
java_path = "C:\\Program Files (x86)\\Java\\jre1.8.0_181"
os.environ['JAVAHOME'] = java_path
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
organizations = []
# create list of (word, tag) tuples
tagged_words = NER.tag_words(text)
# put coherent names together
nes = NER.get_coherent_names(tagged_words)
nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
#print(nes_coherent)
for tuple in nes_coherent:
if tuple[1] == 'ORGANIZATION':
organizations.append(tuple[0])
return organizations
if __name__ == '__main__':
#plot_barchart()
text = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
@@ -71,15 +97,4 @@ class NER:
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''
organizations = []
# create list of (word, tag) tuples
tagged_words = tag_words(text)
# put coherent names together
nes = get_coherent_names(tagged_words)
nes_coherent = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in nes]
#print(nes_coherent)
for tuple in nes_coherent:
if tuple[1] == 'ORGANIZATION':
organizations.append(tuple[0])
print(organizations)
print(NER.find_companies(text))
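
For context, the tagging-and-grouping idea that tag_words and get_coherent_names appear to implement can be sketched in isolation: tokenize the text, tag every token with the Stanford NER model, then merge consecutive tokens that share the same non-'O' tag into one entity. The function name extract_organizations and the classifier/jar paths below are illustrative placeholders, not part of this repository, and must point to a local Stanford NER installation.

# minimal sketch (assumed paths and model name, not part of this commit):
# tag tokens with Stanford NER and merge consecutive tokens of the same entity type
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

def extract_organizations(text,
                          classifier='english.all.3class.distsim.crf.ser.gz',
                          jar='stanford-ner.jar'):
    tagger = StanfordNERTagger(classifier, jar, encoding='utf8')
    tagged = tagger.tag(word_tokenize(text))      # [(word, tag), ...]
    entities, current = [], []
    for word, tag in tagged:
        if tag != 'O' and (not current or current[0][1] == tag):
            current.append((word, tag))           # extend the running entity
        else:
            if current:
                entities.append(current)          # close the previous entity
            current = [(word, tag)] if tag != 'O' else []
    if current:
        entities.append(current)
    # same output shape as nes_coherent above: (joined name, tag)
    coherent = [(' '.join(w for w, t in ne), ne[0][1]) for ne in entities]
    return [name for name, tag in coherent if tag == 'ORGANIZATION']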

NaiveBayes.py

@@ -38,15 +38,20 @@ class NaiveBayes:
cv = CountVectorizer()
# use stratified k-fold cross-validation as split method
skf = StratifiedKFold(n_splits = 10, shuffle=True)
skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
classifier = GaussianNB()
# lists for metrics
# metrics
recall_scores = []
precision_scores = []
f1_scores = []
# probabilities of each class (of each fold)
class_prob = []
# counts number of training samples observed in each class
class_counts = []
# for each fold
n = 0
for train, test in skf.split(X,y):
@@ -54,18 +59,18 @@ class NaiveBayes:
n += 1
print('# split no. ' + str(n))
# own BOW => worse results
vocab = BagOfWords.make_vocab(X[train])
# fit the training data and then return the matrix
training_data = BagOfWords.make_matrix(X[train], vocab)
# transform testing data and return the matrix
testing_data = BagOfWords.make_matrix(X[test], vocab)
# # # using CountVectorizer:
# # own BOW => worse results
# vocab = BagOfWords.make_vocab(X[train])
# # fit the training data and then return the matrix
# training_data = cv.fit_transform(X[train], y[train]).toarray()
# training_data = BagOfWords.make_matrix(X[train], vocab)
# # transform testing data and return the matrix
# testing_data = cv.transform(X[test]).toarray()
# testing_data = BagOfWords.make_matrix(X[test], vocab)
# using CountVectorizer:
# fit the training data and then return the matrix
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()
# # apply select percentile
# selector = SelectPercentile(percentile=25)
@@ -97,6 +102,9 @@ class NaiveBayes:
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
class_prob.append(classifier.class_prior_)
class_counts.append(classifier.class_count_)
##########################
#print metrics of test set
print('-------------------------')
@@ -114,6 +122,15 @@ class NaiveBayes:
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
# print probability of each class
print('probability of each class:')
print()
print(class_prob)
print()
print('number of samples of each class:')
print()
print(class_counts)
print()
##### only for overfit testing ###########
#print('overfit testing: prediction of training set')
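
The fold-wise vectorization pattern adopted above (CountVectorizer fitted on the training fold only, then merely applied to the test fold) can be shown on its own, roughly like this; the toy texts and labels are made up for illustration:

# standalone sketch of the fold-wise CountVectorizer usage (toy data, illustrative only)
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB

X = np.array(['oil prices fall', 'merger announced today',
              'stocks slide again', 'company acquires rival',
              'markets close lower', 'takeover bid confirmed'])
y = np.array([0, 1, 0, 1, 0, 1])

skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=5)
for train, test in skf.split(X, y):
    cv = CountVectorizer()
    # vocabulary is learned from the training fold only ...
    training_data = cv.fit_transform(X[train]).toarray()
    # ... and merely applied to the test fold, so no test information leaks in
    testing_data = cv.transform(X[test]).toarray()
    clf = GaussianNB().fit(training_data, y[train])
    print(clf.predict(testing_data), y[test])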

167
NaiveBayes_simple.py Normal file

@@ -0,0 +1,167 @@
'''
Naive Bayes Classifier
======================
basic implementation of naive bayes.
'''
from CsvHandler import CsvHandler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
class NaiveBayes_simple:
def make_naive_bayes(dataset):
'''fits naive bayes model with KFold cross-validation,
uses CountVectorizer
'''
print('# fitting model')
print('# ...')
# split data into text and label set
# join title and text
X = dataset['Title'] + ' ' + dataset['Text']
y = dataset['Label']
cv = CountVectorizer()
# k-fold cross-validation as split method
kf = KFold(n_splits=10, shuffle=True, random_state=5)
classifier = GaussianNB()
# metrics
recall_scores = []
precision_scores = []
f1_scores = []
# probabilities of each class (of each fold)
class_prob = []
# counts number of training samples observed in each class
class_counts = []
# for each fold
n = 0
for train, test in kf.split(X,y):
n += 1
print('# split no. ' + str(n))
# using CountVectorizer:
# fit the training data and then return the matrix
training_data = cv.fit_transform(X[train], y[train]).toarray()
# transform testing data and return the matrix
testing_data = cv.transform(X[test]).toarray()
#fit classifier
classifier.fit(training_data, y[train])
#predict class
predictions_train = classifier.predict(training_data)
predictions_test = classifier.predict(testing_data)
#print and store metrics
rec = recall_score(y[test], predictions_test)
print('rec: ' + str(rec))
recall_scores.append(rec)
prec = precision_score(y[test], predictions_test)
print('prec: ' + str(prec))
print('#')
precision_scores.append(prec)
# equation for f1 score
f1_scores.append(2 * (prec * rec)/(prec + rec))
class_prob.append(classifier.class_prior_)
class_counts.append(classifier.class_count_)
##########################
#print metrics of test set
print('-------------------------')
print('prediction of testing set:')
print('Precision score: min = {}, max = {}, average = {}'
.format(min(precision_scores),
max(precision_scores),
sum(precision_scores)/float(len(precision_scores))))
print('Recall score: min = {}, max = {}, average = {}'
.format(min(recall_scores),
max(recall_scores),
sum(recall_scores)/float(len(recall_scores))))
print('F1 score: min = {}, max = {}, average = {}'
.format(min(f1_scores),
max(f1_scores),
sum(f1_scores)/float(len(f1_scores))))
print()
# print probability of each class
print('probability of each class:')
print()
print(class_prob)
print()
print('number of samples of each class:')
print()
print(class_counts)
print()
##### only for overfit testing ###########
#print('overfit testing: prediction of training set')
#print('F1 score: min = {0:.2f}, max = {0:.2f}, average = {0:.2f}'.
#format(min(f1_scores_train), max(f1_scores_train),
#sum(f1_scores_train)/float(len(f1_scores_train))))
#print()
######## only needed for resubstitution error ########
def analyze_errors(dataset):
'''calculates resubstitution error
shows indices of false classified articles
uses Gaussian Bayes with train test split
'''
X_train_test = dataset['Title'] + ' ' + dataset['Text']
y_train_test = dataset['Label']
count_vector = CountVectorizer()
# fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train_test).toarray()
# transform testing data and return the matrix
testing_data = count_vector.transform(X_train_test).toarray()
# Naive Bayes
classifier = GaussianNB()
# fit classifier
classifier.fit(training_data, y_train_test)
# Predict class
predictions = classifier.predict(testing_data)
print('Errors at index:')
print()
n = 0
for i in range(len(y_train_test)):
if y_train_test[i] != predictions[i]:
n += 1
print('error no.{}'.format(n))
print('prediction at index {} is: {}, but actual is: {}'
.format(i, predictions[i], y_train_test[i]))
print(X_train_test[i])
print(y_train_test[i])
print()
#print metrics
print('F1 score: ', format(f1_score(y_train_test, predictions)))
if __name__ == '__main__':
print('# starting naive bayes')
print('# ...')
file = 'classification_labelled_corrected.csv'
# read csv file
print('# reading dataset')
print('# ...')
dataset = CsvHandler.read_csv(file)
NaiveBayes_simple.make_naive_bayes(dataset)
print('#')
print('# ending naive bayes')
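
As a side note, the class_prior_ and class_count_ values printed by make_naive_bayes are directly related: when no explicit priors are given, GaussianNB derives class_prior_ by normalizing class_count_. The hand-written F1 formula also agrees with sklearn's f1_score. A small sanity check, again with made-up toy data:

# quick sanity check of the quantities printed above (toy data, illustrative only)
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB

X = np.array([[0.0], [0.2], [0.9], [1.1], [1.0], [0.1]])
y = np.array([0, 0, 1, 1, 1, 0])

clf = GaussianNB().fit(X, y)
# with no explicit priors, class_prior_ is just class_count_ normalized
assert np.allclose(clf.class_prior_, clf.class_count_ / clf.class_count_.sum())

pred = clf.predict(X)
prec = precision_score(y, pred)
rec = recall_score(y, pred)
# the manual formula used in make_naive_bayes matches sklearn's f1_score
assert np.isclose(2 * (prec * rec) / (prec + rec), f1_score(y, pred))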