changed plot to histogram

2018-09-27 09:20:07 +02:00 · 2018-09-27 09:20:07 +02:00 · 549f21657c
commit 549f21657c
parent 8d6af51409
1 changed file with 20 additions and 13 deletions
--- a/NER.py
+++ b/NER.py
@@ -15,15 +15,17 @@ from CsvHandler import CsvHandler
 class NER:
-    # legal entity types
+    # toDo: complete lists:
    # some legal entity types
    company_abbrevs = ['Inc', 'Corp', 'Co', 'Ltd', 'AG', 'LP',
                      'Plc', 'LLC', 'LBO', 'IPO', 'HQ',
                      'CIO', 'NGO', 'AB']
-    # entities that are not companies
+    # some entities that are not companies
    misc = ['Reuters', 'Financial Times', 'Bloomberg', 'The Economist', 
            'Cnn', 'European Commission', 'EU', 'Staff', 'Min', 'Read',
-            'Thomson Reuters Trust Principles']
+            'Thomson Reuters Trust Principles', 'New York Stock Exchange',
            'NYSE']
    def tag_words(text):
        stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
@@ -49,17 +51,22 @@ class NER:
                current_chunks = []
        return continuous_chunks
-    def plot_barchart(count_names):
+    def plot_histogram(count_names):
        '''pyplot diagram of company names distribution
        in input news articles
        x-axis:different company names (numbered consecutively)
        y-axis:counts of company name
        '''
-        plt.plot(range(len(count_names)), count_names, 'ro', ms = 5)
+        # sorted() builds a new descending list; list.sort() returns None (and a dict view has no .sort)
-        plt.xlabel('Company Names')
+        counts = sorted(count_names, reverse=True)
-        plt.ylabel('Article Count')
+        # plt.plot(range(len(count_names)), count_names, 'ro', ms = 5)
-        plt.title('Counts of News Articles with Company Name')
+        plt.title('Company mentions in News Articles')
-        plt.grid(True)
+        plt.xlabel('Number of mentions of the company')
+        # Number of companies with this number of mentions
+        plt.ylabel('Frequency')
+        num_bins = 50
+        n, bins, patches = plt.hist(counts, num_bins, facecolor='blue', alpha=0.5)
+        # plt.grid(True)
        plt.show()
    def find_companies(text):
@@ -99,8 +106,8 @@ class NER:
                    dict_com[com] += 1
                else:
                    dict_com[com] = 1
-        # print outlier (value 38)
+        # # print outlier
-        print(max(dict_com, key=dict_com.get))
+        # print(max(dict_com, key=dict_com.get))
        return dict_com.values()
 if __name__ == '__main__':
@@ -114,7 +121,7 @@ if __name__ == '__main__':
    texts = df_hits['Title'] + ' ' + df_hits['Text']
    # # zum prüfen lesen
-    # for text in texts[5:10]:
+    # for text in texts[10:20]:
        # print(text)
        # print()
        # print(NER.find_companies(text))
@@ -124,4 +131,4 @@ if __name__ == '__main__':
    count_names = NER.count_companies(texts)
    # plot diagram
-    NER.plot_barchart(count_names)
+    NER.plot_histogram(count_names)