some updates
This commit is contained in:
parent
03d96987b8
commit
446df63d84
|
@ -10,6 +10,12 @@ import csv
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
|
# TODO: check how large the DataFrame can get ("Dataframe maximum size")
|
||||||
|
# import sys
|
||||||
|
# print(sys.getsizeof(OBJECT_NAME_HERE))
|
||||||
|
|
||||||
|
# when selecting (833 per month), check for duplicate headlines!!!
|
||||||
|
|
||||||
class CsvHandler:
|
class CsvHandler:
|
||||||
|
|
||||||
def read_csv(csv_file):
|
def read_csv(csv_file):
|
||||||
|
@ -25,7 +31,8 @@ class CsvHandler:
|
||||||
return df
|
return df
|
||||||
|
|
||||||
def write_csv(df, file_name):
|
def write_csv(df, file_name):
|
||||||
df.to_csv(file_name, sep='|')
|
df.to_csv(file_name,
|
||||||
|
sep='|')
|
||||||
print('# saved {} article(s) in {}'.format(len(df), file_name))
|
print('# saved {} article(s) in {}'.format(len(df), file_name))
|
||||||
|
|
||||||
def select_randoms(df, n):
|
def select_randoms(df, n):
|
||||||
|
|
20
NER.py
20
NER.py
|
@ -8,6 +8,7 @@ like persons, organizations and countries, e.g.
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
from nltk.tag import StanfordNERTagger
|
from nltk.tag import StanfordNERTagger
|
||||||
from nltk.tokenize import word_tokenize
|
from nltk.tokenize import word_tokenize
|
||||||
|
|
||||||
|
@ -53,19 +54,20 @@ class NER:
|
||||||
|
|
||||||
def plot_histogram(count_names):
|
def plot_histogram(count_names):
|
||||||
'''pyplot diagram of company names distribution
|
'''pyplot diagram of company names distribution
|
||||||
in input news articles
|
(probability density function)
|
||||||
x-axis:different company names (numbered consecutively)
|
x-axis: number of mentions of the company
|
||||||
y-axis:counts of company name
|
y-axis: frequency
|
||||||
'''
|
'''
|
||||||
# sort list in descending order
|
# sort list in descending order
|
||||||
sorted = count_names.sort(reverse=True)
|
count_names.sort(reverse=True)
|
||||||
# plt.plot(range(len(count_names)), count_names, 'ro', ms = 5)
|
# convert list to array
|
||||||
|
names = np.asarray(count_names)
|
||||||
plt.title('Company mentions in News Articles')
|
plt.title('Company mentions in News Articles')
|
||||||
plt.xlabel('Number of mentions of the company')
|
plt.xlabel('Count of articles that mention a company')
|
||||||
# Number of companies with this number of mentions
|
# Number of companies with this number of mentions
|
||||||
plt.ylabel('Frequency')
|
plt.ylabel('Number of companies with this number of articles')
|
||||||
num_bins = 50
|
num_bins = 50
|
||||||
n, bins, patches = plt.hist(sorted, num_bins, facecolor='blue', alpha=0.5)
|
n, bins, patches = plt.hist(names, num_bins, facecolor='blue', alpha=0.5)
|
||||||
# plt.grid(True)
|
# plt.grid(True)
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
@ -108,7 +110,7 @@ class NER:
|
||||||
dict_com[com] = 1
|
dict_com[com] = 1
|
||||||
# # print outlier
|
# # print outlier
|
||||||
# print(max(dict_com, key=dict_com.get))
|
# print(max(dict_com, key=dict_com.get))
|
||||||
return dict_com.values()
|
return list(dict_com.values())
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue