cleaned dataset

2018-11-05 13:27:52 +01:00 · 2018-11-05 13:27:52 +01:00 · 2243a50ed0
commit 2243a50ed0
parent b7d1f546e4
4 changed files with 8 additions and 20005 deletions
--- a/NER.py
+++ b/NER.py
@@ -157,8 +157,8 @@ if __name__ == '__main__':
                     header=None,
                     index_col=None,
                     engine='python',
-                     #usecols=[1,2],
-                     nrows=100,
+                     # usecols=[1,2],
+                     # nrows=100,
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')
    #print(df)
--- a/VisualizerNews.py
+++ b/VisualizerNews.py
@@ -97,7 +97,7 @@ class VisualizerNews:
        # texts = df_hits['Title'] + '. ' + df_hits['Text']
        texts = df[1] + '. ' + df[2]

-        # dict: count articles with company names
+        # list: count articles with company names
        count_names = NER.count_companies(texts)
        
        # sort list in descending order
@@ -265,6 +265,11 @@ class VisualizerNews:
                    .format(VisualizerNews.datestring))
        plt.show()

+    def plot_hist_num_comp_per_art():
+        ''' open pkl file of dict, plot histogram of number of different
+        company names per article.
+        '''
+        
 if __name__ == '__main__':
    VisualizerNews.plot_wordcloud_dataset()
    # VisualizerNews.plot_histogram_companies()
--- a/data/interactive_labeling_dataset.csv
+++ b/data/interactive_labeling_dataset.csv
--- a/data/interactive_labeling_dataset_without_header.csv
+++ b/data/interactive_labeling_dataset_without_header.csv