some updates

This commit is contained in:
Anne Lorenz 2018-11-23 12:09:48 +01:00
parent 5e3ff00b36
commit 29dabecb9e
22 changed files with 22277 additions and 41 deletions

View File

@ -13,12 +13,12 @@ also work => considered by 'rel_freq'(relative word frequencies) as parameter.
'''
from collections import OrderedDict
import csv
import pickle
import re
import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
import pickle
class BagOfWords:

View File

@ -19,7 +19,7 @@ import graphviz
import numpy as np
import pandas as pd
from sklearn import tree
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

View File

@ -110,6 +110,42 @@ class FileHandler:
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
def remove_duplicates(file='data\\cleaned_data_set_without_header.csv'):
    '''Remove articles that share an exactly identical headline.

    Reads the '|'-separated data set csv (no header row; column 1 holds
    the headline), keeps only the first occurrence of each headline and
    writes the de-duplicated rows back to the same file, replacing it.

    :param file: path of the csv file to clean in place
                 (defaults to the project's cleaned data set)
    '''
    # read data set
    df = pd.read_csv(file,
                     delimiter='|',
                     header=None,
                     index_col=None,
                     engine='python',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')
    # Drop rows whose headline (column 1) already appeared earlier.
    # The previous hand-rolled loop was broken in three ways:
    #  * 'x in series' tests the Series *index*, not its values
    #    (and the slice 0:(i-1) skipped row i-1),
    #  * 'del df.iloc[i]' is not a valid pandas operation,
    #  * mode='a' appended the result after the old rows instead of
    #    replacing them.
    df = df.drop_duplicates(subset=1, keep='first')
    # save cleaned dataframe, overwriting the input file
    df.to_csv(file,
              header=False,
              index=False,
              sep='|',
              mode='w',
              encoding='utf-8',
              quoting=csv.QUOTE_NONNUMERIC,
              quotechar='\'')
def write_articles_to_csv_files():
'''read JSON files, select articles and write them to csv.
'''
@ -197,4 +233,4 @@ class FileHandler:
if __name__ == '__main__':
# FileHandler.write_articles_to_csv_files()
# FileHandler.create_labeling_dataset()
FileHandler.clean_articles()
FileHandler.remove_duplicates()

46
NER.py
View File

@ -8,23 +8,24 @@ like persons, organizations and countries, e.g.
from collections import OrderedDict
import csv
import os
import pickle
import re
import numpy as np
import pandas as pd
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
import pickle
import re
class NER:
# common company abbreviations to be stripped
company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups'
'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups',
'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
's.r.l.', 'Holding', 'Holdings', 'GmbH', 'plc', 'Incs',
'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC'
'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP']
'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC',
'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP',
'Corp.']
# organizations that are no companies
regex = r'.*Reuters.*|.*Ministry.*|.*Trump.*|.*Commission.*|.*BRIEF.*|\
@ -153,20 +154,21 @@ class NER:
print(n_dict)
if __name__ == '__main__':
print('# starting NER...')
print()
# read data set
file = 'data\\cleaned_data_set_without_header.csv'
df = pd.read_csv(file,
delimiter='|',
header=None,
index_col=None,
engine='python',
# usecols=[1,2],
# nrows=100,
quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'')
#print(df)
texts = df[1] + '. ' + df[2]
NER.count_companies(texts)
# NER.show_most_common_companies()
# print('# starting NER...')
# print()
# # read data set
# file = 'data\\cleaned_data_set_without_header.csv'
# df = pd.read_csv(file,
# delimiter='|',
# header=None,
# index_col=None,
# engine='python',
# # usecols=[1,2],
# # nrows=100,
# quoting=csv.QUOTE_NONNUMERIC,
# quotechar='\'')
# #print(df)
# texts = df[1] + '. ' + df[2]
# NER.count_companies(texts)
# # NER.show_most_common_companies()
#print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,16 +0,0 @@
import schedule
import time
def job():
    """Entry point for the scheduled daily pipeline run.

    Currently only announces the start; the remaining pipeline steps
    are planned but not yet wired in:
      - collecting the data (requester.py) -> csv file with data
      - model.py -> to be created, parameters of the pickled model
        should go here, to use for predictions (SVM)
      - pushed through NER, output as pdf
      - sent via email to telesales
    """
    print("Starting whole process")
# register job() to run every day at 10:30 (local time)
schedule.every().day.at("10:30").do(job)
# busy-wait scheduler loop: check for due jobs once per second, forever
while 1:
    schedule.run_pending()
    time.sleep(1)

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 180 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 173 KiB