updated jupyter notebook

commit 59c664fbb0 (parent afe0e96efd)
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -68,7 +68,7 @@ class BagOfWords:
         returns list of lists of all extracted words, one row per article
         '''
         extracted_words = []
-        print('# extracting all words from articles...')
+        print('# BOW: extracting all words from articles...')
         print()
         for text in corpus:
             extracted_words.append(BagOfWords.extract_words(text, stemming))
@@ -88,8 +88,7 @@ class BagOfWords:
 
         # total number of words in bag of words
         word_count = 0
-        print('# counting number of features in corpus...')
-        print()
         for list in extracted_words:
             word_count += len(list)
 
@@ -102,7 +101,7 @@ class BagOfWords:
         array = np.zeros(shape=(n_articles, l_vocab))
         df_matrix = pd.DataFrame(array, columns=vocab)
 
-        print('# calculating frequencies...')
+        print('# BOW: calculating frequencies...')
         print()
 
         # for every text in series
@@ -142,7 +141,7 @@ class BagOfWords:
             for word in e_list:
                 # add every single word to vocabulary
                 vocab.add(word)
-        print('# vocabulary consists of {} features.'.format(len(vocab)))
+        print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
         print()
         # transform set to list
         return list(vocab)
@@ -259,7 +258,7 @@ class BagOfWords:
     def count_features(texts, stemming=True):
         ''' count total number of features in textual corpus
         '''
-        print('# counting all features in corpus...')
+        print('# BOW: counting all features in corpus...')
         print()
         vocab = BagOfWords.make_vocab(texts, stemming)
         return len(vocab)
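For orientation, a minimal usage sketch of how these BagOfWords helpers chain together, inferred from the calls visible in this diff (the module path and the two-article corpus are assumptions, not part of the commit):

    # sketch only: assumes the class lives in src/BagOfWords.py and that
    # src/ is on sys.path; the corpus below is made-up sample data
    from BagOfWords import BagOfWords

    corpus = ['On Monday, Github and Microsoft announced their merger.',
              'The weather stayed fine all week.']
    words = BagOfWords.extract_all_words(corpus, stemming=True)  # list of word lists
    vocab = BagOfWords.make_vocab(words)                         # deduplicated feature list
    print(BagOfWords.count_features(corpus))                     # same as len(vocab)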
@@ -23,7 +23,7 @@ class MNBInteractive:
         # chose BagOfWords implementation (own if false)
         sklearn_cv = False
 
-        print('# starting multinomial naives bayes...')
+        print('# MNB: starting multinomial naives bayes...')
         print()
 
         # split labeled data into text and label set
@@ -40,7 +40,7 @@ class MNBInteractive:
 
         # fit_prior=False: a uniform prior will be used instead
         # of learning class prior probabilities
-        classifier = MultinomialNB(alpha=0.5,
+        classifier = MultinomialNB(alpha=1.0e-10,
                                    fit_prior=False,
                                    class_prior=None)
 
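The change above replaces alpha=0.5 with alpha=1.0e-10, which effectively turns off additive (Lidstone) smoothing, while fit_prior=False keeps a uniform class prior. A self-contained sketch of that configuration against scikit-learn, using made-up dummy counts:

    import numpy as np
    from sklearn.naive_bayes import MultinomialNB

    # dummy term-count matrix (3 articles x 4 vocabulary terms) and labels
    X = np.array([[2, 1, 0, 0], [0, 0, 3, 1], [1, 0, 2, 0]])
    y = np.array([1, 0, 0])

    # alpha=1.0e-10: near-zero smoothing, so a word unseen in a class drives
    # that class's likelihood toward zero; fit_prior=False -> uniform prior
    classifier = MultinomialNB(alpha=1.0e-10, fit_prior=False, class_prior=None)
    classifier.fit(X, y)
    print(classifier.predict_proba(X))  # per-class probability estimates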
@@ -69,10 +69,14 @@ class MNBInteractive:
         vocab = BagOfWords.make_vocab(extracted_words)
 
         # fit the training data and then return the matrix
+        print('# MNB: fit training data and calculate matrix...')
+        print()
         training_data = BagOfWords.make_matrix(extracted_words,
                                                vocab, rel_freq, stemming)
 
         # transform testing data and return the matrix
+        print('# MNB: transform testing data to matrix...')
+        print()
         extracted_words = BagOfWords.extract_all_words(U)
         testing_data = BagOfWords.make_matrix(extracted_words,
                                               vocab, rel_freq, stemming)
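Note the pattern in this hunk: the vocabulary is built once from the labeled articles and then reused when vectorizing the unlabeled set U, so training_data and testing_data share the same feature columns. A hedged sketch of that fit/transform split (U, rel_freq, and stemming come from the diff; labeled_texts is a hypothetical stand-in for the earlier extraction input):

    # sketch only; mirrors the calls shown in the hunk above
    extracted_words = BagOfWords.extract_all_words(labeled_texts)
    vocab = BagOfWords.make_vocab(extracted_words)
    training_data = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)

    # same vocab -> identical feature columns for the unlabeled matrix
    extracted_words = BagOfWords.extract_all_words(U)
    testing_data = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)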
@@ -85,7 +89,12 @@ class MNBInteractive:
 
         # number of samples encountered for each class during fitting
         # this value is weighted by the sample weight when provided
-        class_count = classifier.class_count_
+        # class_count = classifier.class_count_
 
+        # classes in order used
+        classes = classifier.classes_
+
+        print('# MNB: ending multinomial naive bayes')
 
         # return classes and vector of class estimates
-        return class_count, class_probs
+        return classes, class_probs
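Returning classifier.classes_ instead of classifier.class_count_ is what makes class_probs interpretable downstream: the columns of predict_proba follow the order of classes_. Reusing the dummy classifier and X from the sketch above:

    # pair each class label with its probability for the first article
    for label, p in zip(classifier.classes_, classifier.predict_proba(X)[0]):
        print('{}: {:.3f}'.format(label, p))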
src/NER.py: 23 changed lines
@@ -156,6 +156,27 @@ class NER:
         # print(max(dict_com, key=dict_com.get))
         return list(dict_com.values())
 
+    def make_article_orgs_dict(texts):
+        '''param: list of all article texts
+        returns: dict of article indices with company names in it
+        '''
+        print('# searching company names...')
+        print()
+
+        # dict of article indices with company names in it
+        dict_art_orgs = {}
+
+        for i, text in enumerate(texts):
+            # list of found companies in article
+            print('# article no. {}:'.format(i))
+            dict_art_orgs[i] = NER.find_companies(text)
+
+        # save coms_list
+        with open('../obj/'+ 'dict_articles_organizations' + '.pkl', 'wb') as f:
+            pickle.dump(dict_art_orgs, f, pickle.HIGHEST_PROTOCOL)
+
+        print(dict_art_orgs)
+
     def show_most_common_companies(n_commons=50):
         # load pickle object
         with open('../obj/dict_organizations.pkl', 'rb') as input:
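The new method pickles its result to ../obj/dict_articles_organizations.pkl. A minimal sketch for reading it back (path taken from the diff; assumes the script runs from src/ so the relative path resolves):

    import pickle

    with open('../obj/dict_articles_organizations.pkl', 'rb') as f:
        dict_art_orgs = pickle.load(f)

    # article index -> list of company names found in that article
    for index, orgs in sorted(dict_art_orgs.items())[:5]:
        print(index, orgs)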
@@ -189,6 +210,6 @@ if __name__ == '__main__':
                      quotechar='\'')
     #print(df)
     texts = df[1] + '. ' + df[2]
-    NER.count_companies(texts)
+    NER.make_article_orgs_dict(texts)
     # NER.show_most_common_companies()
     # print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))