some updates

2018-10-02 13:56:06 +02:00 · 2018-10-02 13:56:06 +02:00 · 446df63d84
commit 446df63d84
parent 03d96987b8
2 changed files with 19 additions and 10 deletions
--- a/CsvHandler.py
+++ b/CsvHandler.py
@@ -10,6 +10,12 @@ import csv
 import numpy as np
 import pandas as pd

+# TODO: check how large a DataFrame can get ("DataFrame maximum size")
+# import sys
+# print(sys.getsizeof(OBJECT_NAME_HERE))
+
+# when selecting (833 per month), check the headlines for duplicates!
+
 class CsvHandler:

    def read_csv(csv_file):
@@ -25,7 +31,8 @@ class CsvHandler:
        return df

    def write_csv(df, file_name):
-        df.to_csv(file_name, sep='|')
+        df.to_csv(file_name, 
+                  sep='|')
        print('# saved {} article(s) in {}'.format(len(df), file_name))

    def select_randoms(df, n):
--- a/NER.py
+++ b/NER.py
@@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.
 import os

 import matplotlib.pyplot as plt
+import numpy as np
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize

@@ -53,19 +54,20 @@ class NER:

    def plot_histogram(count_names):
        '''pyplot diagram of company names distribution
-        in input news articles
-        x-axis:different company names (numbered consecutively)
-        y-axis:counts of company name
+        (histogram of raw counts, not a normalized density)
+        x-axis: count of articles that mention a company
+        y-axis: number of companies with this number of articles
        '''
        # sort list in descending order
-        sorted = count_names.sort(reverse=True)
-        # plt.plot(range(len(count_names)), count_names, 'ro', ms = 5)
+        count_names.sort(reverse=True)
+        # convert list to array
+        names = np.asarray(count_names)
        plt.title('Company mentions in News Articles')
-        plt.xlabel('Number of mentions of the company')
+        plt.xlabel('Count of articles that mention a company')
        # Number of companies with this number of mentions
-        plt.ylabel('Frequency')
+        plt.ylabel('Number of companies with this number of articles')
        num_bins = 50
-        n, bins, patches = plt.hist(sorted, num_bins, facecolor='blue', alpha=0.5)
+        n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
        # plt.grid(True)
        plt.show()

@@ -108,7 +110,7 @@ class NER:
                    dict_com[com] = 1
        # # print outlier
        # print(max(dict_com, key=dict_com.get))
-        return dict_com.values()
+        return list(dict_com.values())

 if __name__ == '__main__':