updated jupyter notebook

This commit is contained in:
Anne Lorenz 2018-12-19 10:12:38 +01:00
parent afe0e96efd
commit 59c664fbb0
7 changed files with 20670 additions and 10677 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -68,7 +68,7 @@ class BagOfWords:
returns list of lists of all extracted words, one row per article
'''
extracted_words = []
print('# extracting all words from articles...')
print('# BOW: extracting all words from articles...')
print()
for text in corpus:
extracted_words.append(BagOfWords.extract_words(text, stemming))
@ -88,8 +88,7 @@ class BagOfWords:
# total number of words in bag of words
word_count = 0
print('# counting number of features in corpus...')
print()
for list in extracted_words:
word_count += len(list)
@ -102,7 +101,7 @@ class BagOfWords:
array = np.zeros(shape=(n_articles, l_vocab))
df_matrix = pd.DataFrame(array, columns=vocab)
print('# calculating frequencies...')
print('# BOW: calculating frequencies...')
print()
# for every text in series
@ -142,7 +141,7 @@ class BagOfWords:
for word in e_list:
# add every single word to vocabulary
vocab.add(word)
print('# vocabulary consists of {} features.'.format(len(vocab)))
print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
print()
# transform set to list
return list(vocab)
@ -259,7 +258,7 @@ class BagOfWords:
def count_features(texts, stemming=True):
''' count total number of features in textual corpus
'''
print('# counting all features in corpus...')
print('# BOW: counting all features in corpus...')
print()
vocab = BagOfWords.make_vocab(texts, stemming)
return len(vocab)

View File

@ -23,7 +23,7 @@ class MNBInteractive:
# choose BagOfWords implementation (own if false)
sklearn_cv = False
print('# starting multinomial naives bayes...')
print('# MNB: starting multinomial naives bayes...')
print()
# split labeled data into text and label set
@ -40,7 +40,7 @@ class MNBInteractive:
# fit_prior=False: a uniform prior will be used instead
# of learning class prior probabilities
classifier = MultinomialNB(alpha=0.5,
classifier = MultinomialNB(alpha=1.0e-10,
fit_prior=False,
class_prior=None)
@ -69,10 +69,14 @@ class MNBInteractive:
vocab = BagOfWords.make_vocab(extracted_words)
# fit the training data and then return the matrix
print('# MNB: fit training data and calculate matrix...')
print()
training_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
# transform testing data and return the matrix
print('# MNB: transform testing data to matrix...')
print()
extracted_words = BagOfWords.extract_all_words(U)
testing_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming)
@ -85,7 +89,12 @@ class MNBInteractive:
# number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided
class_count = classifier.class_count_
# class_count = classifier.class_count_
# classes in order used
classes = classifier.classes_
print('# MNB: ending multinomial naive bayes')
# return classes and vector of class estimates
return class_count, class_probs
return classes, class_probs

View File

@ -156,6 +156,27 @@ class NER:
# print(max(dict_com, key=dict_com.get))
return list(dict_com.values())
def make_article_orgs_dict(texts):
    '''Build, persist and return a mapping from article index to the
    result of NER.find_companies for that article.

    param: texts -- iterable of all article texts; articles are numbered
           in iteration order starting at 0
    returns: dict {article index: result of NER.find_companies(text)}

    Side effects: prints progress to stdout, pickles the dict to
    '../obj/dict_articles_organizations.pkl' (path is relative to the
    current working directory) and prints the finished dict.
    '''
    print('# searching company names...')
    print()

    # dict of article indices with company names in it
    dict_art_orgs = {}
    for i, text in enumerate(texts):
        print('# article no. {}:'.format(i))
        # companies found in this article
        dict_art_orgs[i] = NER.find_companies(text)

    # save the article -> organizations dict for later runs
    with open('../obj/dict_articles_organizations.pkl', 'wb') as f:
        pickle.dump(dict_art_orgs, f, pickle.HIGHEST_PROTOCOL)

    print(dict_art_orgs)
    # the docstring always promised this return value; previously the
    # function fell off the end and returned None
    return dict_art_orgs
def show_most_common_companies(n_commons=50):
# load pickle object
with open('../obj/dict_organizations.pkl', 'rb') as input:
@ -189,6 +210,6 @@ if __name__ == '__main__':
quotechar='\'')
#print(df)
texts = df[1] + '. ' + df[2]
NER.count_companies(texts)
NER.make_article_orgs_dict(texts)
# NER.show_most_common_companies()
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))