some updates
This commit is contained in:
parent
03d96987b8
commit
446df63d84
|
@ -10,6 +10,12 @@ import csv
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# TODO: check how large a DataFrame can get ("Dataframe maximum size")
|
||||
# import sys
|
||||
# print(sys.getsizeof(OBJECT_NAME_HERE))
|
||||
|
||||
# when selecting (833 per month), check for duplicates in the headline!!!
|
||||
|
||||
class CsvHandler:
|
||||
|
||||
def read_csv(csv_file):
|
||||
|
@ -25,7 +31,8 @@ class CsvHandler:
|
|||
return df
|
||||
|
||||
def write_csv(df, file_name):
|
||||
df.to_csv(file_name, sep='|')
|
||||
df.to_csv(file_name,
|
||||
sep='|')
|
||||
print('# saved {} article(s) in {}'.format(len(df), file_name))
|
||||
|
||||
def select_randoms(df, n):
|
||||
|
|
20
NER.py
20
NER.py
|
@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.
|
|||
import os
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from nltk.tag import StanfordNERTagger
|
||||
from nltk.tokenize import word_tokenize
|
||||
|
||||
|
@ -53,19 +54,20 @@ class NER:
|
|||
|
||||
def plot_histogram(count_names):
|
||||
'''pyplot diagram of company names distribution
|
||||
in input news articles
|
||||
x-axis:different company names (numbered consecutively)
|
||||
y-axis:counts of company name
|
||||
(probability density function)
|
||||
x-axis: number of mentions of the company
|
||||
y-axis: frequency
|
||||
'''
|
||||
# sort list in descending order
|
||||
sorted = count_names.sort(reverse=True)
|
||||
# plt.plot(range(len(count_names)), count_names, 'ro', ms = 5)
|
||||
count_names.sort(reverse=True)
|
||||
# convert list to array
|
||||
names = np.asarray(count_names)
|
||||
plt.title('Company mentions in News Articles')
|
||||
plt.xlabel('Number of mentions of the company')
|
||||
plt.xlabel('Count of articles that mention a company')
|
||||
# Number of companies with this number of mentions
|
||||
plt.ylabel('Frequency')
|
||||
plt.ylabel('Number of companies with this number of articles')
|
||||
num_bins = 50
|
||||
n, bins, patches = plt.hist(sorted, num_bins, facecolor='blue', alpha=0.5)
|
||||
n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
|
||||
# plt.grid(True)
|
||||
plt.show()
|
||||
|
||||
|
@ -108,7 +110,7 @@ class NER:
|
|||
dict_com[com] = 1
|
||||
# # print outlier
|
||||
# print(max(dict_com, key=dict_com.get))
|
||||
return dict_com.values()
|
||||
return list(dict_com.values())
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
|
|
Loading…
Reference in New Issue