updated jupyter notebook
commit 59c664fbb0
parent afe0e96efd
data/interactive_labeling_round_1.csv (new file, 10001 lines)
File diff suppressed because one or more lines are too long
obj/dict_articles_organizations.pkl (new binary file)
Binary file not shown.
src/BagOfWords.py
@@ -68,7 +68,7 @@ class BagOfWords:
         returns list of lists of all extracted words, one row per article
         '''
         extracted_words = []
-        print('# extracting all words from articles...')
+        print('# BOW: extracting all words from articles...')
         print()
         for text in corpus:
             extracted_words.append(BagOfWords.extract_words(text, stemming))
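For context: extract_all_words delegates to BagOfWords.extract_words, whose body is not part of this diff. A minimal stand-in, assuming a plain regex tokenizer with optional Porter stemming (the helper name and its internals are hypothetical, not the repository's actual implementation):

import re
from nltk.stem.porter import PorterStemmer

def extract_words_sketch(text, stemming=True):
    # lowercase and split on non-letters; the real method may differ
    words = re.findall(r'[a-z]+', text.lower())
    if stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    return words

print(extract_words_sketch('GitHub and Microsoft announced their merger.'))
# ['github', 'and', 'microsoft', 'announc', 'their', 'merger']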
@@ -88,8 +88,7 @@ class BagOfWords:
 
         # total number of words in bag of words
         word_count = 0
-        print('# counting number of features in corpus...')
-        print()
         for list in extracted_words:
             word_count += len(list)
 
@@ -102,7 +101,7 @@ class BagOfWords:
         array = np.zeros(shape=(n_articles, l_vocab))
         df_matrix = pd.DataFrame(array, columns=vocab)
 
-        print('# calculating frequencies...')
+        print('# BOW: calculating frequencies...')
         print()
 
         # for every text in series
@@ -142,7 +141,7 @@ class BagOfWords:
             for word in e_list:
                 # add every single word to vocabulary
                 vocab.add(word)
-        print('# vocabulary consists of {} features.'.format(len(vocab)))
+        print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
         print()
         # transform set to list
         return list(vocab)
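The hunk above shows the core of make_vocab: a set accumulates every extracted word, so duplicates collapse into single features. A self-contained sketch of that pattern, reconstructed from the context lines (not the full method):

def make_vocab_sketch(extracted_words):
    vocab = set()
    for e_list in extracted_words:
        for word in e_list:
            # add every single word to vocabulary
            vocab.add(word)
    print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
    # transform set to list
    return list(vocab)

print(make_vocab_sketch([['github', 'merger'], ['microsoft', 'merger']]))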
@@ -259,7 +258,7 @@ class BagOfWords:
     def count_features(texts, stemming=True):
         ''' count total number of features in textual corpus
         '''
-        print('# counting all features in corpus...')
+        print('# BOW: counting all features in corpus...')
         print()
         vocab = BagOfWords.make_vocab(texts, stemming)
         return len(vocab)

src/MNBInteractive.py
@@ -23,7 +23,7 @@ class MNBInteractive:
         # chose BagOfWords implementation (own if false)
         sklearn_cv = False
 
-        print('# starting multinomial naives bayes...')
+        print('# MNB: starting multinomial naives bayes...')
         print()
 
         # split labeled data into text and label set
@@ -40,7 +40,7 @@ class MNBInteractive:
 
         # fit_prior=False: a uniform prior will be used instead
         # of learning class prior probabilities
-        classifier = MultinomialNB(alpha=0.5,
+        classifier = MultinomialNB(alpha=1.0e-10,
                                    fit_prior=False,
                                    class_prior=None)
 
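The substantive change here is the smoothing parameter: alpha=0.5 applies heavy Lidstone smoothing, while alpha=1.0e-10 is effectively no smoothing, so a class that never saw a feature gets a near-zero probability for documents containing it. A small illustration with made-up count vectors:

import numpy as np
from sklearn.naive_bayes import MultinomialNB

# hypothetical word-count features: word 0 never occurs in class 'irrelevant'
X = np.array([[3, 0, 1], [4, 0, 0], [0, 2, 1], [0, 3, 0]])
y = np.array(['relevant', 'relevant', 'irrelevant', 'irrelevant'])

for alpha in (0.5, 1.0e-10):
    clf = MultinomialNB(alpha=alpha, fit_prior=False, class_prior=None)
    clf.fit(X, y)
    # a test document made up entirely of word 0
    print(alpha, clf.predict_proba([[5, 0, 0]]))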
@@ -69,10 +69,14 @@ class MNBInteractive:
         vocab = BagOfWords.make_vocab(extracted_words)
 
         # fit the training data and then return the matrix
+        print('# MNB: fit training data and calculate matrix...')
+        print()
         training_data = BagOfWords.make_matrix(extracted_words,
                                                vocab, rel_freq, stemming)
 
         # transform testing data and return the matrix
+        print('# MNB: transform testing data to matrix...')
+        print()
         extracted_words = BagOfWords.extract_all_words(U)
         testing_data = BagOfWords.make_matrix(extracted_words,
                                               vocab, rel_freq, stemming)
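make_matrix is only called here; going by the context lines earlier in this diff (np.zeros over n_articles x l_vocab, wrapped in a DataFrame with the vocabulary as columns), it fills a term-frequency matrix roughly like the sketch below. This is simplified, and the rel_freq handling is an assumption:

import numpy as np
import pandas as pd

def make_matrix_sketch(extracted_words, vocab, rel_freq=True):
    array = np.zeros(shape=(len(extracted_words), len(vocab)))
    df_matrix = pd.DataFrame(array, columns=vocab)
    for i, words in enumerate(extracted_words):
        for word in words:
            if word in df_matrix.columns:
                # absolute term frequency
                df_matrix.at[i, word] += 1
        if rel_freq and words:
            # normalize row to relative frequencies
            df_matrix.iloc[i] /= len(words)
    return df_matrix

print(make_matrix_sketch([['github', 'merger'], ['microsoft']],
                         ['github', 'merger', 'microsoft']))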
@@ -85,7 +89,12 @@ class MNBInteractive:
 
         # number of samples encountered for each class during fitting
         # this value is weighted by the sample weight when provided
-        class_count = classifier.class_count_
+        # class_count = classifier.class_count_
 
+        # classes in order used
+        classes = classifier.classes_
+
+        print('# MNB: ending multinomial naive bayes')
+
         # return classes and vector of class estimates
-        return class_count, class_probs
+        return classes, class_probs
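This hunk fixes an actual bug: class_count_ holds per-class sample counts from fitting, which is useless to a caller that needs to map the columns of predict_proba back to labels; classes_ provides exactly that ordering. A tiny demonstration with made-up data:

import numpy as np
from sklearn.naive_bayes import MultinomialNB

X = np.array([[1, 0], [0, 1], [1, 1]])
y = np.array(['no', 'yes', 'yes'])

clf = MultinomialNB(alpha=1.0e-10, fit_prior=False).fit(X, y)
class_probs = clf.predict_proba(X)

# column j of class_probs corresponds to clf.classes_[j]
for label, prob in zip(clf.classes_, class_probs[0]):
    print('P({}) = {:.3f}'.format(label, prob))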
src/NER.py (23 changed lines)
@@ -156,6 +156,27 @@ class NER:
         # print(max(dict_com, key=dict_com.get))
         return list(dict_com.values())
 
+    def make_article_orgs_dict(texts):
+        '''param: list of all article texts
+        returns: dict of article indices with company names in it
+        '''
+        print('# searching company names...')
+        print()
+
+        # dict of article indices with company names in it
+        dict_art_orgs = {}
+
+        for i, text in enumerate(texts):
+            # list of found companies in article
+            print('# article no. {}:'.format(i))
+            dict_art_orgs[i] = NER.find_companies(text)
+
+        # save coms_list
+        with open('../obj/'+ 'dict_articles_organizations' + '.pkl', 'wb') as f:
+            pickle.dump(dict_art_orgs, f, pickle.HIGHEST_PROTOCOL)
+
+        print(dict_art_orgs)
+
     def show_most_common_companies(n_commons=50):
         # load pickle object
         with open('../obj/dict_organizations.pkl', 'rb') as input:
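make_article_orgs_dict persists its result with pickle; the matching load step (same path convention as the dict_organizations.pkl read in the context lines) would look like:

import pickle

with open('../obj/dict_articles_organizations.pkl', 'rb') as f:
    dict_art_orgs = pickle.load(f)

# organizations found in the first article, if any
print(dict_art_orgs.get(0, []))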
@@ -189,6 +210,6 @@ if __name__ == '__main__':
                      quotechar='\'')
     #print(df)
     texts = df[1] + '. ' + df[2]
-    NER.count_companies(texts)
+    NER.make_article_orgs_dict(texts)
     # NER.show_most_common_companies()
     # print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
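With the per-article dict on disk, a show_most_common_companies-style ranking can be rebuilt from it; a sketch of that aggregation (an assumption, not code from this commit):

import pickle
from collections import Counter

with open('../obj/dict_articles_organizations.pkl', 'rb') as f:
    dict_art_orgs = pickle.load(f)

# flatten the per-article company lists and rank them
counter = Counter(org for orgs in dict_art_orgs.values() for org in orgs)
for org, count in counter.most_common(50):
    print('{}: {}'.format(org, count))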