some updates

This commit is contained in:
Anne Lorenz 2018-10-02 13:56:06 +02:00
parent 03d96987b8
commit 446df63d84
2 changed files with 19 additions and 10 deletions

View File

@ -10,6 +10,12 @@ import csv
import numpy as np
import pandas as pd
# TODO: check how large the DataFrame gets ("DataFrame maximum size")
# import sys
# print(sys.getsizeof(OBJECT_NAME_HERE))
# TODO: when selecting (833 per month), check for duplicate headlines!
class CsvHandler:
def read_csv(csv_file):
@ -25,7 +31,8 @@ class CsvHandler:
return df
def write_csv(df, file_name):
df.to_csv(file_name, sep='|')
df.to_csv(file_name,
sep='|')
print('# saved {} article(s) in {}'.format(len(df), file_name))
def select_randoms(df, n):

20
NER.py
View File

@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.
import os
import matplotlib.pyplot as plt
import numpy as np
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
@ -53,19 +54,20 @@ class NER:
def plot_histogram(count_names):
'''pyplot diagram of company names distribution
in input news articles
x-axis:different company names (numbered consecutively)
y-axis:counts of company name
(probability density function)
x-axis: number of mentions of the company
y-axis: frequency
'''
# sort list in descending order
sorted = count_names.sort(reverse=True)
# plt.plot(range(len(count_names)), count_names, 'ro', ms = 5)
count_names.sort(reverse=True)
# convert list to array
names = np.asarray(count_names)
plt.title('Company mentions in News Articles')
plt.xlabel('Number of mentions of the company')
plt.xlabel('Count of articles that mention a company')
# Number of companies with this number of mentions
plt.ylabel('Frequency')
plt.ylabel('Number of companies with this number of articles')
num_bins = 50
n, bins, patches = plt.hist(sorted, num_bins, facecolor='blue', alpha=0.5)
n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
# plt.grid(True)
plt.show()
@ -108,7 +110,7 @@ class NER:
dict_com[com] = 1
# # print outlier
# print(max(dict_com, key=dict_com.get))
return dict_com.values()
return list(dict_com.values())
if __name__ == '__main__':