updated jupyter notebook

This commit is contained in:
Anne Lorenz 2018-12-19 10:12:38 +01:00
parent afe0e96efd
commit 59c664fbb0
7 changed files with 20670 additions and 10677 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary file not shown.

File diff suppressed because one or more lines are too long

View File

@ -68,7 +68,7 @@ class BagOfWords:
returns list of lists of all extracted words, one row per article returns list of lists of all extracted words, one row per article
''' '''
extracted_words = [] extracted_words = []
print('# extracting all words from articles...') print('# BOW: extracting all words from articles...')
print() print()
for text in corpus: for text in corpus:
extracted_words.append(BagOfWords.extract_words(text, stemming)) extracted_words.append(BagOfWords.extract_words(text, stemming))
@ -88,8 +88,7 @@ class BagOfWords:
# total number of words in bag of words # total number of words in bag of words
word_count = 0 word_count = 0
print('# counting number of features in corpus...')
print()
for list in extracted_words: for list in extracted_words:
word_count += len(list) word_count += len(list)
@ -102,7 +101,7 @@ class BagOfWords:
array = np.zeros(shape=(n_articles, l_vocab)) array = np.zeros(shape=(n_articles, l_vocab))
df_matrix = pd.DataFrame(array, columns=vocab) df_matrix = pd.DataFrame(array, columns=vocab)
print('# calculating frequencies...') print('# BOW: calculating frequencies...')
print() print()
# for every text in series # for every text in series
@ -142,7 +141,7 @@ class BagOfWords:
for word in e_list: for word in e_list:
# add every single word to vocabulary # add every single word to vocabulary
vocab.add(word) vocab.add(word)
print('# vocabulary consists of {} features.'.format(len(vocab))) print('# BOW: vocabulary consists of {} features.'.format(len(vocab)))
print() print()
# transform set to list # transform set to list
return list(vocab) return list(vocab)
@ -259,7 +258,7 @@ class BagOfWords:
def count_features(texts, stemming=True): def count_features(texts, stemming=True):
''' count total number of features in textual corpus ''' count total number of features in textual corpus
''' '''
print('# counting all features in corpus...') print('# BOW: counting all features in corpus...')
print() print()
vocab = BagOfWords.make_vocab(texts, stemming) vocab = BagOfWords.make_vocab(texts, stemming)
return len(vocab) return len(vocab)

View File

@ -23,7 +23,7 @@ class MNBInteractive:
# chose BagOfWords implementation (own if false) # chose BagOfWords implementation (own if false)
sklearn_cv = False sklearn_cv = False
print('# starting multinomial naives bayes...') print('# MNB: starting multinomial naives bayes...')
print() print()
# split labeled data into text and label set # split labeled data into text and label set
@ -40,7 +40,7 @@ class MNBInteractive:
# fit_prior=False: a uniform prior will be used instead # fit_prior=False: a uniform prior will be used instead
# of learning class prior probabilities # of learning class prior probabilities
classifier = MultinomialNB(alpha=0.5, classifier = MultinomialNB(alpha=1.0e-10,
fit_prior=False, fit_prior=False,
class_prior=None) class_prior=None)
@ -69,10 +69,14 @@ class MNBInteractive:
vocab = BagOfWords.make_vocab(extracted_words) vocab = BagOfWords.make_vocab(extracted_words)
# fit the training data and then return the matrix # fit the training data and then return the matrix
print('# MNB: fit training data and calculate matrix...')
print()
training_data = BagOfWords.make_matrix(extracted_words, training_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming) vocab, rel_freq, stemming)
# transform testing data and return the matrix # transform testing data and return the matrix
print('# MNB: transform testing data to matrix...')
print()
extracted_words = BagOfWords.extract_all_words(U) extracted_words = BagOfWords.extract_all_words(U)
testing_data = BagOfWords.make_matrix(extracted_words, testing_data = BagOfWords.make_matrix(extracted_words,
vocab, rel_freq, stemming) vocab, rel_freq, stemming)
@ -85,7 +89,12 @@ class MNBInteractive:
# number of samples encountered for each class during fitting # number of samples encountered for each class during fitting
# this value is weighted by the sample weight when provided # this value is weighted by the sample weight when provided
class_count = classifier.class_count_ # class_count = classifier.class_count_
# classes in order used
classes = classifier.classes_
print('# MNB: ending multinomial naive bayes')
# return classes and vector of class estimates # return classes and vector of class estimates
return class_count, class_probs return classes, class_probs

View File

@ -156,6 +156,27 @@ class NER:
# print(max(dict_com, key=dict_com.get)) # print(max(dict_com, key=dict_com.get))
return list(dict_com.values()) return list(dict_com.values())
def make_article_orgs_dict(texts):
    '''Build, pickle and return a dict mapping article index to the
    result of NER.find_companies for that article.

    param: iterable of all article texts
    returns: dict of article indices with company names in it
    side effect: writes the dict to
                 '../obj/dict_articles_organizations.pkl' and prints it
    '''
    print('# searching company names...')
    print()

    # dict of article indices with company names in it
    dict_art_orgs = {}
    for i, text in enumerate(texts):
        # progress output for the current article
        print('# article no. {}:'.format(i))
        # presumably the list of companies found in this article;
        # find_companies is defined elsewhere in the class
        dict_art_orgs[i] = NER.find_companies(text)

    # save the dict as pickle object (path is relative to the
    # working directory)
    with open('../obj/dict_articles_organizations.pkl', 'wb') as f:
        pickle.dump(dict_art_orgs, f, pickle.HIGHEST_PROTOCOL)
    print(dict_art_orgs)

    # the docstring promises the dict; previously the function only
    # printed it and returned None
    return dict_art_orgs
def show_most_common_companies(n_commons=50): def show_most_common_companies(n_commons=50):
# load pickle object # load pickle object
with open('../obj/dict_organizations.pkl', 'rb') as input: with open('../obj/dict_organizations.pkl', 'rb') as input:
@ -189,6 +210,6 @@ if __name__ == '__main__':
quotechar='\'') quotechar='\'')
#print(df) #print(df)
texts = df[1] + '. ' + df[2] texts = df[1] + '. ' + df[2]
NER.count_companies(texts) NER.make_article_orgs_dict(texts)
# NER.show_most_common_companies() # NER.show_most_common_companies()
# print(NER.tag_words('On Monday, Github and Microsoft announced their merger.')) # print(NER.tag_words('On Monday, Github and Microsoft announced their merger.'))