Code Kommentare von Doris

This commit is contained in:
Doris Birkefeld 2018-10-18 09:22:51 +02:00
parent 446df63d84
commit b648ab70d2
6 changed files with 14 additions and 2 deletions

View File

@ -106,6 +106,7 @@ class BagOfWords:
print('# ...')
vocab = set()
for text in series:
## DORIS: DIE FOLGENDE ZEILE VERSTEH ICH NICHT
vocab |= set(BagOfWords.extract_words(text))
# transform to list
vocab = list(vocab)

View File

@ -54,6 +54,7 @@ class FilterKeywords:
for kword in keywords:
if re.match(kword, key):
# if match, increase value of matching key
## DORIS: Hier könntest du ein defaultdict verwenden, https://www.accelebrate.com/blog/using-defaultdict-python/
if str(kword) in dict_keywords:
dict_keywords[str(kword)] += dict_input[key]
else:

1
NER.py
View File

@ -29,6 +29,7 @@ class NER:
'NYSE']
def tag_words(text):
## DORIS: Kannst du die auch ins repository und den ordner schieben, dass ich das laufen lassen kann?
stanford_classifier = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\classifiers\\english.all.3class.distsim.crf.ser.gz'
stanford_ner_path = 'C:\\Users\\anne.lorenz\\Bachelorarbeit\\StanfordNER\\stanford-ner-2018-02-27\\stanford-ner.jar'
# create tagger object

View File

@ -75,6 +75,8 @@ class NaiveBayes:
# # apply select percentile
# selector = SelectPercentile(percentile=25)
# selector.fit(training_data, y[train])
##DORIS: WIRD SELECT PERCENTILE IN DEINE ARBEIT MIT NB EINBEZOGEN?
# training_data_r = selector.transform(training_data)
# testing_data_r = selector.transform(testing_data)
@ -95,6 +97,7 @@ class NaiveBayes:
rec = recall_score(y[test], predictions_test)
print('rec: ' + str(rec))
recall_scores.append(rec)
##DORIS: PRECISION MISST DU AUCH MIT DEN TEST SCORES!!!
prec = precision_score(y[train], predictions_train)
print('prec: ' + str(prec))
print('#')
@ -186,7 +189,9 @@ class NaiveBayes:
# read csv file
print('# reading dataset')
print('# ...')
## DORIS: ICH VERSTEHE NICHT, WARUM DU HIER EINE EXTRA FUNKTION SCHREIBST, PD.READ_CSV MÜSSTE DOCH AUCH SO GEHEN?
## KOMMT VIELLEICHT NOCH, VIELLEICHT BIN ICH ZU VORSCHNELL
dataset = CsvHandler.read_csv(file)
make_naive_bayes(dataset)

View File

@ -31,6 +31,7 @@ class NaiveBayes_simple:
cv = CountVectorizer()
##DORIS: DU BRAUCHST IMMER EINEN STRATIFIED SPLIT, WEIL DEIN DATASET UNBALANCED IST
# k-fold cross-validation as split method
kf = KFold(n_splits=10, shuffle=True, random_state=5)
@ -69,6 +70,7 @@ class NaiveBayes_simple:
rec = recall_score(y[test], predictions_test)
print('rec: ' + str(rec))
recall_scores.append(rec)
##DORIS: PRECISION MISST DU AUCH MIT DEN TEST SCORES!!!
prec = precision_score(y[train], predictions_train)
print('prec: ' + str(prec))
print('#')

View File

@ -84,10 +84,12 @@ class Requester:
article.append(section)
# add article to list
list_articles.append(article)
## DORIS: WARUM SCHREIBST DU ES NICHT DIREKT IN EINE CSV, SONDERN KONVERTIERST NOCHMAL?
# Get the next batch of 100 posts
output = webhoseio.get_next()
# create DataFrame
df = pd.DataFrame(data=list_articles,
columns=['Timestamp', 'Title', 'Text', 'SiteSection'])