some updates

This commit is contained in:
Anne Lorenz 2018-10-02 13:56:06 +02:00
parent 03d96987b8
commit 446df63d84
2 changed files with 19 additions and 10 deletions

View File

@ -10,6 +10,12 @@ import csv
import numpy as np import numpy as np
import pandas as pd import pandas as pd
# TODO: check how large the DataFrame may get ("Dataframe maximum size")
# import sys
# print(sys.getsizeof(OBJECT_NAME_HERE))
# when selecting (833 per month), check for duplicate headlines!!!
class CsvHandler: class CsvHandler:
def read_csv(csv_file): def read_csv(csv_file):
@ -25,7 +31,8 @@ class CsvHandler:
return df return df
def write_csv(df, file_name): def write_csv(df, file_name):
df.to_csv(file_name, sep='|') df.to_csv(file_name,
sep='|')
print('# saved {} article(s) in {}'.format(len(df), file_name)) print('# saved {} article(s) in {}'.format(len(df), file_name))
def select_randoms(df, n): def select_randoms(df, n):

20
NER.py
View File

@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.
import os import os
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np
from nltk.tag import StanfordNERTagger from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
@ -53,19 +54,20 @@ class NER:
def plot_histogram(count_names): def plot_histogram(count_names):
'''pyplot diagram of company names distribution '''pyplot diagram of company names distribution
in input news articles (probability density function)
x-axis:different company names (numbered consecutively) x-axis: number of mentions of the company
y-axis:counts of company name y-axis: frequency
''' '''
# sort list in descending order # sort list in descending order
sorted = count_names.sort(reverse=True) count_names.sort(reverse=True)
# plt.plot(range(len(count_names)), count_names, 'ro', ms = 5) # convert list to array
names = np.asarray(count_names)
plt.title('Company mentions in News Articles') plt.title('Company mentions in News Articles')
plt.xlabel('Number of mentions of the company') plt.xlabel('Count of articles that mention a company')
# Number of companies with this number of mentions # Number of companies with this number of mentions
plt.ylabel('Frequency') plt.ylabel('Number of companies with this number of articles')
num_bins = 50 num_bins = 50
n, bins, patches = plt.hist(sorted, num_bins, facecolor='blue', alpha=0.5) n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
# plt.grid(True) # plt.grid(True)
plt.show() plt.show()
@ -108,7 +110,7 @@ class NER:
dict_com[com] = 1 dict_com[com] = 1
# # print outlier # # print outlier
# print(max(dict_com, key=dict_com.get)) # print(max(dict_com, key=dict_com.get))
return dict_com.values() return list(dict_com.values())
if __name__ == '__main__': if __name__ == '__main__':