cleaned dataset

This commit is contained in:
Anne Lorenz 2018-11-05 13:27:52 +01:00
parent b7d1f546e4
commit 2243a50ed0
4 changed files with 8 additions and 20005 deletions

4
NER.py
View File

@ -157,8 +157,8 @@ if __name__ == '__main__':
header=None, header=None,
index_col=None, index_col=None,
engine='python', engine='python',
#usecols=[1,2], # usecols=[1,2],
nrows=100, # nrows=100,
quoting=csv.QUOTE_NONNUMERIC, quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'') quotechar='\'')
#print(df) #print(df)

View File

@ -97,7 +97,7 @@ class VisualizerNews:
# texts = df_hits['Title'] + '. ' + df_hits['Text'] # texts = df_hits['Title'] + '. ' + df_hits['Text']
texts = df[1] + '. ' + df[2] texts = df[1] + '. ' + df[2]
# dict: count articles with company names # list: count articles with company names
count_names = NER.count_companies(texts) count_names = NER.count_companies(texts)
# sort list in descending order # sort list in descending order
@ -265,6 +265,11 @@ class VisualizerNews:
.format(VisualizerNews.datestring)) .format(VisualizerNews.datestring))
plt.show() plt.show()
def plot_hist_num_comp_per_art():
''' Open the pkl file containing the dict and plot a histogram of the
number of different company names per article.
'''
if __name__ == '__main__': if __name__ == '__main__':
VisualizerNews.plot_wordcloud_dataset() VisualizerNews.plot_wordcloud_dataset()
# VisualizerNews.plot_histogram_companies() # VisualizerNews.plot_histogram_companies()

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long