some updates

This commit is contained in:
Anne Lorenz 2018-11-23 12:09:48 +01:00
parent 5e3ff00b36
commit 29dabecb9e
22 changed files with 22277 additions and 41 deletions

View File

@ -13,12 +13,12 @@ also work => considered by 'rel_freq'(relative word frequencies) as parameter.
''' '''
from collections import OrderedDict from collections import OrderedDict
import csv import csv
import pickle
import re import re
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from nltk.stem.porter import PorterStemmer from nltk.stem.porter import PorterStemmer
import pickle
class BagOfWords: class BagOfWords:

View File

@ -19,7 +19,7 @@ import graphviz
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from sklearn import tree from sklearn import tree
# from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import f1_score from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold from sklearn.model_selection import StratifiedKFold

View File

@ -110,6 +110,42 @@ class FileHandler:
quoting=csv.QUOTE_NONNUMERIC, quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'') quotechar='\'')
def remove_duplicates(file='data\\cleaned_data_set_without_header.csv'):
    '''remove articles with exactly same headline

    Reads the '|'-separated data set (no header row, fields quoted with
    single quotes), keeps only the first article for every headline and
    overwrites the file with the cleaned rows.

    Rows with a missing headline compare equal to each other, so only the
    first of them is kept as well.

    Parameters:
        file: path of the csv file that is cleaned in place. Defaults to
            the project's cleaned data set.
    '''
    # read data set
    df = pd.read_csv(file,
                     delimiter='|',
                     header=None,
                     index_col=None,
                     engine='python',
                     quoting=csv.QUOTE_NONNUMERIC,
                     quotechar='\'')

    # column 1 holds the headline; compare its values (not the index
    # labels) and keep the first occurrence of every headline
    df = df.drop_duplicates(subset=[1], keep='first')

    # save cleaned dataframe
    # overwrite instead of append: the file read above is the file written
    # here, so appending would keep every duplicate and add the rows again
    df.to_csv(file,
              header=False,
              index=False,
              sep='|',
              mode='w',
              encoding='utf-8',
              quoting=csv.QUOTE_NONNUMERIC,
              quotechar='\'')
def write_articles_to_csv_files(): def write_articles_to_csv_files():
'''read JSON files, select articles and write them to csv. '''read JSON files, select articles and write them to csv.
''' '''
@ -197,4 +233,4 @@ class FileHandler:
if __name__ == '__main__': if __name__ == '__main__':
# FileHandler.write_articles_to_csv_files() # FileHandler.write_articles_to_csv_files()
# FileHandler.create_labeling_dataset() # FileHandler.create_labeling_dataset()
FileHandler.clean_articles() FileHandler.remove_duplicates()

46
NER.py
View File

@ -8,23 +8,24 @@ like persons, organizations and countries, e.g.
from collections import OrderedDict from collections import OrderedDict
import csv import csv
import os import os
import pickle
import re
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from nltk.tag import StanfordNERTagger from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize from nltk.tokenize import word_tokenize
import pickle
import re
class NER: class NER:
# common company abbreviations to be stripped # common company abbreviations to be stripped
company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd', company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups' 'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups',
'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc', 'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
's.r.l.', 'Holding', 'Holdings', 'GmbH', 'plc', 'Incs', 's.r.l.', 'Holding', 'Holdings', 'GmbH', 'plc', 'Incs',
'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC' 'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC',
'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP'] 'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP',
'Corp.']
# organizations that are no companies # organizations that are no companies
regex = r'.*Reuters.*|.*Ministry.*|.*Trump.*|.*Commission.*|.*BRIEF.*|\ regex = r'.*Reuters.*|.*Ministry.*|.*Trump.*|.*Commission.*|.*BRIEF.*|\
@ -153,20 +154,21 @@ class NER:
print(n_dict) print(n_dict)
if __name__ == '__main__': if __name__ == '__main__':
print('# starting NER...') # print('# starting NER...')
print() # print()
# read data set # # read data set
file = 'data\\cleaned_data_set_without_header.csv' # file = 'data\\cleaned_data_set_without_header.csv'
df = pd.read_csv(file, # df = pd.read_csv(file,
delimiter='|', # delimiter='|',
header=None, # header=None,
index_col=None, # index_col=None,
engine='python', # engine='python',
# usecols=[1,2], # # usecols=[1,2],
# nrows=100, # # nrows=100,
quoting=csv.QUOTE_NONNUMERIC, # quoting=csv.QUOTE_NONNUMERIC,
quotechar='\'') # quotechar='\'')
#print(df) # #print(df)
texts = df[1] + '. ' + df[2] # texts = df[1] + '. ' + df[2]
NER.count_companies(texts) # NER.count_companies(texts)
# NER.show_most_common_companies() # # NER.show_most_common_companies()
#print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,16 +0,0 @@
import schedule
import time
def job():
    """Announce the start of the daily pipeline run.

    Only the start message is printed so far. Planned steps, not yet
    implemented here:
      1. collect the data (requester.py) -> csv file with data
      2. model.py (to be created): the parameters of the pickled model
         should go here, to use for predictions (SVM)
      3. push the results through NER, output as pdf
      4. send the pdf via email to telesales
    """
    print("Starting whole process")
# register job to run once a day at 10:30
daily_run = schedule.every().day.at("10:30")
daily_run.do(job)

# poll the scheduler once per second, forever
while True:
    schedule.run_pending()
    time.sleep(1)

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 180 KiB

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 173 KiB