some updates

@@ -13,12 +13,12 @@ also work => considered by 'rel_freq' (relative word frequencies) as parameter.
 '''
 from collections import OrderedDict
 import csv
+import pickle
 import re

 import numpy as np
 import pandas as pd
 from nltk.stem.porter import PorterStemmer
-import pickle


 class BagOfWords:

@@ -19,7 +19,7 @@ import graphviz
 import numpy as np
 import pandas as pd
 from sklearn import tree
-# from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import f1_score
 from sklearn.model_selection import StratifiedKFold
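
With the CountVectorizer import now active, the scikit-learn vectorizer can stand in for a hand-rolled bag of words. A minimal sketch of what it produces (the corpus and all values below are illustrative, not from this PR):

    from sklearn.feature_extraction.text import CountVectorizer

    corpus = ['Github and Microsoft announced their merger.',
              'Microsoft acquires Github.']
    vectorizer = CountVectorizer()        # lowercases and tokenizes by default
    X = vectorizer.fit_transform(corpus)  # sparse document-term count matrix
    print(vectorizer.get_feature_names_out())  # vocabulary (scikit-learn >= 1.0)
    print(X.toarray())                    # raw term counts per document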

@@ -110,6 +110,42 @@ class FileHandler:
                          quoting=csv.QUOTE_NONNUMERIC,
                          quotechar='\'')

+    def remove_duplicates():
+        '''remove articles with exactly same headline
+        '''
+        # read data set
+        file = 'data\\cleaned_data_set_without_header.csv'
+        df = pd.read_csv(file,
+                         delimiter='|',
+                         header=None,
+                         index_col=None,
+                         engine='python',
+                         #usecols=[1,2],
+                         #nrows=100,
+                         quoting=csv.QUOTE_NONNUMERIC,
+                         quotechar='\'')
+        print(type(df))
+
+        # for each article (row)
+        i = 1
+        while i < len(df):
+            # check if already in column
+            if df.iloc[i][1] in df.iloc[0:(i-1)][1]:
+                # remove duplicate
+                del df.iloc[i]
+            else:
+                i += 1
+
+        # save cleaned dataframe
+        df.to_csv('data\\cleaned_data_set_without_header.csv',
+                  header=False,
+                  index=False,
+                  sep='|',
+                  mode='a',
+                  encoding='utf-8',
+                  quoting=csv.QUOTE_NONNUMERIC,
+                  quotechar='\'')
+
     def write_articles_to_csv_files():
         '''read JSON files, select articles and write them to csv.
         '''
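
As written, the new dedup loop has pitfalls: `del df.iloc[i]` is not an operation pandas supports (it raises an error), the `in` test checks the Series index rather than the headline values, and `mode='a'` appends to the existing file instead of replacing it. A sketch of the same cleanup using pandas built-ins, assuming column 1 holds the headline as above:

    # keep the first occurrence of each headline (column 1), drop the rest,
    # then rewrite the csv in place instead of appending to it
    df = df.drop_duplicates(subset=1, keep='first')
    df.to_csv('data\\cleaned_data_set_without_header.csv',
              header=False, index=False, sep='|', mode='w',
              encoding='utf-8', quoting=csv.QUOTE_NONNUMERIC,
              quotechar='\'')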

@@ -197,4 +233,4 @@ class FileHandler:
 if __name__ == '__main__':
     # FileHandler.write_articles_to_csv_files()
     # FileHandler.create_labeling_dataset()
-    FileHandler.clean_articles()
+    FileHandler.remove_duplicates()

NER.py (46 changes)

@@ -8,23 +8,24 @@ like persons, organizations and countries, e.g.
 from collections import OrderedDict
 import csv
 import os
+import pickle
+import re

 import numpy as np
 import pandas as pd
 from nltk.tag import StanfordNERTagger
 from nltk.tokenize import word_tokenize
-import pickle
-import re

 class NER:

     # common company abbreviations to be stripped
     company_abbrevs = ['Inc.', 'Inc', 'Corp', '& Co', 'Co', 'Ltd.', 'Ltd',
-                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups'
+                       'AG', 'LP', 'Limited', 'Tbk', 'Group', 'Co.', 'Groups',
                        'LLC', 'LBO', 'IPO', 'HQ', 'CIO', 'NGO', 'AB', 'Plc',
                        's.r.l.', 'Holding', 'Holdings', 'GmbH', 'plc', 'Incs',
-                       'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC'
-                       'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP']
+                       'Plcs', 'PLC', 'Ltds', 'SA', 'Incs', 'S.A.R.L', 'LLC',
+                       'Company', '& Co.', 'Corporation', 'Pte', 'Pty', 'LLP',
+                       'Corp.']

     # organizations that are no companies
     regex = r'.*Reuters.*|.*Ministry.*|.*Trump.*|.*Commission.*|.*BRIEF.*|\
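
The commas added above are not cosmetic: adjacent Python string literals concatenate implicitly, so before this fix the list silently fused entries across line breaks. A quick illustration:

    >>> ['Group', 'Co.', 'Groups'
    ...  'LLC', 'LBO']   # missing comma after 'Groups'
    ['Group', 'Co.', 'GroupsLLC', 'LBO']

In the old list, 'Groups', 'LLC', and 'Company' were folded into 'GroupsLLC' and 'LLCCompany' and never matched on their own.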

@@ -153,20 +154,21 @@ class NER:
        print(n_dict)

-# print('# starting NER...')
-# print()
-# # read data set
-# file = 'data\\cleaned_data_set_without_header.csv'
-# df = pd.read_csv(file,
-#                  delimiter='|',
-#                  header=None,
-#                  index_col=None,
-#                  engine='python',
-#                  # usecols=[1,2],
-#                  # nrows=100,
-#                  quoting=csv.QUOTE_NONNUMERIC,
-#                  quotechar='\'')
-# #print(df)
-# texts = df[1] + '. ' + df[2]
-# NER.count_companies(texts)
-# # NER.show_most_common_companies()
+if __name__ == '__main__':
+    print('# starting NER...')
+    print()
+    # read data set
+    file = 'data\\cleaned_data_set_without_header.csv'
+    df = pd.read_csv(file,
+                     delimiter='|',
+                     header=None,
+                     index_col=None,
+                     engine='python',
+                     # usecols=[1,2],
+                     # nrows=100,
+                     quoting=csv.QUOTE_NONNUMERIC,
+                     quotechar='\'')
+    #print(df)
+    texts = df[1] + '. ' + df[2]
+    NER.count_companies(texts)
+    # NER.show_most_common_companies()
 #print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
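
For context, a minimal sketch of the Stanford tagging step that NER.tag_words presumably wraps; the classifier and jar paths below are placeholders for a local Stanford NER download, not paths from this repository:

    from nltk.tag import StanfordNERTagger
    from nltk.tokenize import word_tokenize

    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',  # assumed model path
                           'stanford-ner.jar')                       # assumed jar path
    tokens = word_tokenize('On Monday, Github and Microsoft announced their merger.')
    print(st.tag(tokens))
    # e.g. [('On', 'O'), ..., ('Github', 'ORGANIZATION'), ('Microsoft', 'ORGANIZATION'), ...]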

scheduler.py (16 changes, file deleted)

@@ -1,16 +0,0 @@
-import schedule
-import time
-
-def job():
-    print("Starting whole process")
-    #collecting the data (requester.py) -> csv file with data
-    #model.py -> to be created, parameters of the pickled model should go here, to use for predictions (SVM)
-    #pushed through NER, output as pdf
-    #sent via email to telesales
-
-
-schedule.every().day.at("10:30").do(job)
-
-while 1:
-    schedule.run_pending()
-    time.sleep(1)

(binary diffs for seven updated images omitted; new sizes: 19, 20, 23, 22, 24, 180, and 173 KiB)