updated jupyter notebook
This commit is contained in:
parent
afe0e96efd
commit
59c664fbb0
File diff suppressed because one or more lines are too long
10001
data/interactive_labeling_round_1.csv
Normal file
10001
data/interactive_labeling_round_1.csv
Normal file
File diff suppressed because one or more lines are too long
BIN
obj/dict_articles_organizations.pkl
Normal file
BIN
obj/dict_articles_organizations.pkl
Normal file
Binary file not shown.
File diff suppressed because one or more lines are too long
@ -68,7 +68,7 @@ class BagOfWords:
|
||||
returns list of lists of all extracted words, one row per article
|
||||
'''
|
||||
extracted_words = []
|
||||
print('# extracting all words from articles...')
|
||||
print('# BOW: extracting all words from articles...')
|
||||
print()
|
||||
for text in corpus:
|
||||
extracted_words.append(BagOfWords.extract_words(text, stemming))
|
||||
@ -88,8 +88,7 @@ class BagOfWords:
|
||||
|
||||
# total number of words in bag of words
|
||||
word_count = 0
|
||||
print('# counting number of features in corpus...')
|
||||
print()
|
||||
|
||||
for list in extracted_words:
|
||||
word_count += len(list)
|
||||
|
||||
@ -102,7 +101,7 @@ class BagOfWords:
|
||||
array = np.zeros(shape=(n_articles, l_vocab))
|
||||
df_matrix = pd.DataFrame(array, columns=vocab)
|
||||
|
||||
print('# calculating frequencies...')
|
||||
print('# BOW: calculating frequencies...')
|
||||
print()
|
||||
|
||||
# for every text in series
|
||||
@ -142,7 +141,7 @@ class BagOfWords:
|
||||
for word in e_list:
|
||||
# add every single word to vocabulary
|
||||
vocab.add(word)
|
||||
print('# vocabulary consists of {} features.'.format(len(vocab)))
|
||||
print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
|
||||
print()
|
||||
# transform set to list
|
||||
return list(vocab)
|
||||
@ -259,7 +258,7 @@ class BagOfWords:
|
||||
def count_features(texts, stemming=True):
|
||||
''' count total number of features in textual corpus
|
||||
'''
|
||||
print('# counting all features in corpus...')
|
||||
print('# BOW: counting all features in corpus...')
|
||||
print()
|
||||
vocab = BagOfWords.make_vocab(texts, stemming)
|
||||
return len(vocab)
|
||||
|
@ -23,7 +23,7 @@ class MNBInteractive:
|
||||
# choose BagOfWords implementation (own if False)
|
||||
sklearn_cv = False
|
||||
|
||||
print('# starting multinomial naives bayes...')
|
||||
print('# MNB: starting multinomial naives bayes...')
|
||||
print()
|
||||
|
||||
# split labeled data into text and label set
|
||||
@ -40,7 +40,7 @@ class MNBInteractive:
|
||||
|
||||
# fit_prior=False: a uniform prior will be used instead
|
||||
# of learning class prior probabilities
|
||||
classifier = MultinomialNB(alpha=0.5,
|
||||
classifier = MultinomialNB(alpha=1.0e-10,
|
||||
fit_prior=False,
|
||||
class_prior=None)
|
||||
|
||||
@ -69,10 +69,14 @@ class MNBInteractive:
|
||||
vocab = BagOfWords.make_vocab(extracted_words)
|
||||
|
||||
# fit the training data and then return the matrix
|
||||
print('# MNB: fit training data and calculate matrix...')
|
||||
print()
|
||||
training_data = BagOfWords.make_matrix(extracted_words,
|
||||
vocab, rel_freq, stemming)
|
||||
|
||||
# transform testing data and return the matrix
|
||||
print('# MNB: transform testing data to matrix...')
|
||||
print()
|
||||
extracted_words = BagOfWords.extract_all_words(U)
|
||||
testing_data = BagOfWords.make_matrix(extracted_words,
|
||||
vocab, rel_freq, stemming)
|
||||
@ -85,7 +89,12 @@ class MNBInteractive:
|
||||
|
||||
# number of samples encountered for each class during fitting
|
||||
# this value is weighted by the sample weight when provided
|
||||
class_count = classifier.class_count_
|
||||
# class_count = classifier.class_count_
|
||||
|
||||
# classes in order used
|
||||
classes = classifier.classes_
|
||||
|
||||
print('# MNB: ending multinomial naive bayes')
|
||||
|
||||
# return classes and vector of class estimates
|
||||
return class_count, class_probs
|
||||
return classes, class_probs
|
23
src/NER.py
23
src/NER.py
@ -156,6 +156,27 @@ class NER:
|
||||
# print(max(dict_com, key=dict_com.get))
|
||||
return list(dict_com.values())
|
||||
|
||||
def make_article_orgs_dict(texts):
    '''Map every article index to the companies found in that article.

    param: list of all article texts
    returns: dict of article indices with company names in it

    Besides being returned, the dict is pickled to
    '../obj/dict_articles_organizations.pkl' (resolved against the
    current working directory) and printed to stdout.
    '''
    print('# searching company names...')
    print()

    # dict of article indices with company names in it
    dict_art_orgs = {}

    for i, text in enumerate(texts):
        # progress output, then store whatever NER.find_companies
        # reports for this article under the article's index
        print('# article no. {}:'.format(i))
        dict_art_orgs[i] = NER.find_companies(text)

    # save dict_art_orgs
    with open('../obj/dict_articles_organizations.pkl', 'wb') as f:
        pickle.dump(dict_art_orgs, f, pickle.HIGHEST_PROTOCOL)

    print(dict_art_orgs)

    # hand the mapping back to the caller, as the docstring promises
    return dict_art_orgs
|
||||
|
||||
def show_most_common_companies(n_commons=50):
|
||||
# load pickle object
|
||||
with open('../obj/dict_organizations.pkl', 'rb') as input:
|
||||
@ -189,6 +210,6 @@ if __name__ == '__main__':
|
||||
quotechar='\'')
|
||||
#print(df)
|
||||
texts = df[1] + '. ' + df[2]
|
||||
NER.count_companies(texts)
|
||||
NER.make_article_orgs_dict(texts)
|
||||
# NER.show_most_common_companies()
|
||||
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))
|
Loading…
x
Reference in New Issue
Block a user