Made CountVectorizer optional and other changes
This commit is contained in: parent 2d5368e283, commit 7e037a1621

BagOfWords.py (205 changed lines)
@@ -9,7 +9,7 @@ vocabulary. As the multinomial Naive Bayes classifier is suitable for
 classification with discrete features (e.g., word counts for text
 classification). The multinomial distribution normally requires integer
 feature counts. However, in practice, fractional counts such as tf-idf may
-also work. => considered by 'relative_word_frequencies' as parameter.
+also work => considered by 'rel_freq'(relative word frequencies) as parameter.
 '''
 from collections import OrderedDict
 import csv
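Note on the reworked docstring: the difference between absolute counts and the new 'rel_freq' relative counts can be checked with a tiny standalone sketch (the token list below is invented for illustration and is not part of the commit):

    # illustrative only: absolute vs. relative word counts for one article
    words = ['merger', 'deal', 'merger']
    absolute = {w: words.count(w) for w in set(words)}
    relative = {w: c / len(words) for w, c in absolute.items()}
    print(absolute)   # {'merger': 2, 'deal': 1}
    print(relative)   # {'merger': 0.666..., 'deal': 0.333...}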
@@ -21,11 +21,14 @@ from nltk.stem.porter import PorterStemmer
 
 class BagOfWords:
 
-    def fit_transform(X, relative_word_frequencies=True):
+    def fit_transform(corpus, rel_freq=True, stemming=True):
         ''' similar to CountVectorizer's fit_transform method
         '''
-        vocab = BagOfWords.make_vocab(X)
-        return BagOfWords.make_matrix(X, vocab, relative_word_frequencies)
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq,
+                                        stemming)
+        return matrix
 
     def extract_words(text, stemming=True):
         '''takes article as argument, removes numbers,
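The refactored fit_transform now delegates to three explicit steps. A hedged usage sketch (the two-article corpus is made up; it assumes pandas is available as pd, as in the module):

    corpus = pd.Series(['Company A acquires Company B.',
                        'Quarterly results beat expectations.'])
    extracted_words = BagOfWords.extract_all_words(corpus, stemming=True)
    vocab = BagOfWords.make_vocab(extracted_words, stemming=True)
    matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq=True, stemming=True)
    # equivalent shortcut kept by this commit:
    matrix = BagOfWords.fit_transform(corpus, rel_freq=True, stemming=True)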
@@ -46,52 +49,25 @@ class BagOfWords:
             if stemming:
                 # reduce word to its stem
                 word = stemmer.stem(word)
+                # filter out spam chars
+                word = word.replace('â', '').replace('œ', '')\
+                           .replace('ã', '')
             words_cleaned.append(word)
         return words_cleaned
 
-    # def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
-    #     '''calculates word stem frequencies in input articles. returns
-    #     document term matrix(DataFrame) with relative word frequencies
-    #     (0 <= values < 1) if relative_word_frequencies=True or absolute
-    #     word frequencies (int) if relative_word_frequencies=False.
-    #     (rows: different articles, colums: different words in vocab)
-    #     returns matrix as DataFrame
-    #     '''
-    #     print('# BOW: calculating matrix...')
-    #     print()
-    #     # create list of tuples
-    #     vectors = []
-    #     # for every text in series
-    #     for i in range(len(series)):
-    #         # extract text of single article
-    #         text = series.iloc[i]
-    #         # extract its words
-    #         words = BagOfWords.extract_words(text, stemming)
-    #         # count words in single article
-    #         word_count = len(words)
-    #         vector = []
-    #         for i, v in enumerate(vocab):
-    #             vector.append(0)
-    #             for w in words:
-    #                 if w == v:
-    #                     if relative_word_frequencies:
-    #                         # relative word frequency
-    #                         vector[i] += 1/word_count
-    #                     else:
-    #                         # absolute word frequency
-    #                         vector[i] += 1
-
-    #     # !!! hier passiert immer der MemoryError: !!!
-
-    #     # add single vector as tuple
-    #     vectors.append(tuple(vector))
-    #     df_vectors = pd.DataFrame.from_records(vectors,
-    #                                            index=None,
-    #                                            #header=vocab,
-    #                                            columns=vocab)
-    #     return df_vectors
-
-    def make_matrix(series, vocab, relative_word_frequencies=True, stemming=True):
+    def extract_all_words(corpus, stemming=True):
+        '''param: all articles of corpus
+        returns list of lists of all extracted words, one row per article
+        '''
+        extracted_words = []
+        print('# extracting all words from articles...')
+        print()
+        for text in corpus:
+            extracted_words.append(BagOfWords.extract_words(text, stemming))
+
+        return extracted_words
+
+    def make_matrix(extracted_words, vocab, rel_freq=True, stemming=True):
         '''calculates word stem frequencies in input articles. returns
         document term matrix(DataFrame) with relative word frequencies
         (0 <= values < 1) if relative_word_frequencies=True or absolute
@@ -101,28 +77,38 @@ class BagOfWords:
         '''
         print('# BOW: calculating matrix...')
         print()
 
+        # total number of words in bag of words
+        word_count = 0
+        print('# counting number of features in corpus...')
+        print()
+        for list in extracted_words:
+            word_count += len(list)
+
+        # number of articles
+        n_articles = len(extracted_words)
+        # number of words in vocab
+        l_vocab = len(vocab)
+
         # create zero-filled dataframe
-        array = np.zeros(shape=(len(series),len(vocab)))
+        array = np.zeros(shape=(n_articles, l_vocab))
         df_matrix = pd.DataFrame(array, columns=vocab)
 
+        print('# calculating frequencies...')
+        print()
+
         # for every text in series
-        for i in range(len(series)):
+        for i in range(len(extracted_words)):
 
-            # extract text of single article
-            text = series.iloc[i]
+            # extract words of single article
+            words = extracted_words[i]
 
-            # extract its words
-            words = BagOfWords.extract_words(text, stemming)
-            # count words in article
-            word_count = len(words)
-
-            # for every word in global vocab
             for v in vocab:
                 # for every word in article
                 for w in words:
                     # find right position
                     if w == v:
-                        if relative_word_frequencies:
+                        if rel_freq:
                             # relative word frequency
                             df_matrix.loc[i][v] += 1/word_count
                         else:
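The rewritten make_matrix keeps the nested loop over vocabulary and article words; the per-cell update it performs is equivalent to this small standalone sketch (toy data; word_count is the corpus-wide total, as in the new code):

    import numpy as np
    import pandas as pd

    extracted_words = [['merger', 'deal', 'merger'], ['profit', 'deal']]
    vocab = ['deal', 'merger', 'profit']
    rel_freq = True
    word_count = sum(len(ws) for ws in extracted_words)   # 5 words in total

    df_matrix = pd.DataFrame(np.zeros((len(extracted_words), len(vocab))), columns=vocab)
    for i, words in enumerate(extracted_words):
        for w in words:
            # each occurrence adds 1 (absolute) or 1/word_count (relative)
            df_matrix.loc[i, w] += 1 / word_count if rel_freq else 1
    print(df_matrix)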
@@ -131,18 +117,22 @@ class BagOfWords:
 
         return df_matrix
 
-    def make_vocab(series, stemming=True):
-        '''adds words of input articles to a global vocabulary.
-        input: dataframe of all articles, return value: list of words
+    def make_vocab(extracted_words, stemming=True):
+        '''adds all words to a global vocabulary.
+        input: list of lists of all extracted words, returns: list of words
         '''
         print('# BOW: making vocabulary of data set...')
         print()
         vocab = set()
         # for every article's text
-        for text in series:
-            # add single article's text to total vocabulary
-            vocab |= set(BagOfWords.extract_words(text, stemming))
-        return vocab
+        for e_list in extracted_words:
+            for word in e_list:
+                # add every single word to vocabulary
+                vocab.add(word)
+        print('# vocabulary consists of {} features.'.format(len(vocab)))
+        print()
+        # transform set to list
+        return list(vocab)
 
     def set_stop_words(stemming=True):
         '''creates list of all words that will be ignored
@@ -179,7 +169,7 @@ class BagOfWords:
                       'yourselves']
 
         #add unwanted terms
-        stop_words.extend(['reuters', 'bloomberg', 'cnn', 'n', 'l', 'â',
+        stop_words.extend(['reuters', 'reuter', 'bloomberg', 'cnn', 'n', 'l',
                            'file', 'photo', 'min', 'read', 'staff', 'left',
                            'right', 'updated', 'minutes', 'brief', 'editing',
                            'reporting', 'ago', 'also', 'would', 'could',
@@ -202,20 +192,23 @@ class BagOfWords:
         # transform list to set to eliminate duplicates
         return set(stop_words)
 
-    def make_dict_common_words(texts, rel_freq=True, stemming=True, n=200):
-        '''texts: df of article texts of complete data set as series,
-        return dict of words with their count.
+    def make_dict_common_words(df_matrix, n=200, rel_freq=True, stemming=True):
+        '''params: DataFrame document term matrix of complete data set,
+        number of n most common words.
+        returns: dict of words with their count.
         '''
+        print('# making dictionary of most common words...')
+        print()
+
         # words under that rel_freq limit are not included
-        limit = 0.0005
+        # set limit
+        limit = 0.001
         if not rel_freq:
-            limit = 25
+            limit = len(df_matrix) * 0.001
 
         # word => count
         dict = {}
-        vocab = BagOfWords.make_vocab(texts, stemming)
-        # calculate document term matrix
-        df_matrix = BagOfWords.make_matrix(texts, vocab, rel_freq, stemming)
-        print(df_matrix.shape)
         # iterate over words
         for column in df_matrix:
             # count word mentions in total
@@ -224,16 +217,23 @@ class BagOfWords:
         # sort dict by value and
         o_dict = OrderedDict(sorted(dict.items(), key=lambda t: t[1],\
                              reverse=True))
+        print(o_dict)
         # return n higest values as dict (word => count)
         n_dict = {}
 
         for i in range(n):
-            n_dict[o_dict.popitem(last=False)[0]] = o_dict.popitem(last=False)[1]
+            # next highest score
+            next_highest = o_dict.popitem(last=False)
+            n_dict[next_highest[0]] = next_highest[1]
 
         return n_dict
 
     def count_features(texts, stemming=True):
+        ''' count total number of features in textual corpus
+        '''
         print('# counting all features in corpus...')
         print()
-        vocab = BagOfWords.make_vocab(texts, True)
+        vocab = BagOfWords.make_vocab(texts, stemming)
         return len(vocab)
 
     def count_all_words(texts):
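The change in the n-most-common-words loop fixes a subtle bug: the old line called o_dict.popitem(last=False) twice per iteration (once for the key, once for the value), so every second entry was skipped. A minimal sketch of the corrected pattern (toy data, not from the repository):

    from collections import OrderedDict

    o_dict = OrderedDict([('merger', 12), ('deal', 9), ('profit', 7)])
    n_dict = {}
    for i in range(2):
        next_highest = o_dict.popitem(last=False)   # pops from the front
        n_dict[next_highest[0]] = next_highest[1]
    print(n_dict)   # {'merger': 12, 'deal': 9}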
@@ -244,26 +244,37 @@ class BagOfWords:
             sum += len(text.split())
         return sum
 
+    def test():
+        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        df_dataset = pd.read_csv(file,
+                                 delimiter='|',
+                                 header=None,
+                                 index_col=None,
+                                 engine='python',
+                                 usecols=[1,2],
+                                 nrows=100,
+                                 quoting=csv.QUOTE_NONNUMERIC,
+                                 quotechar='\'')
+
+        corpus = df_dataset[1] + '. ' + df_dataset[2]
+        stemming = True
+        rel_freq = True
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        #print(vocab)
+        for text in corpus:
+            print(text)
+            print()
+        print()
+        # ab hier ValueError bei nrows=10000...
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, 20, rel_freq, stemming)
+        print(dict)
 
 if __name__ == '__main__':
-    # load new data set
-    file = 'data\\interactive_labeling_dataset_without_header.csv'
-    df_dataset = pd.read_csv(file,
-                             delimiter='|',
-                             header=None,
-                             index_col=None,
-                             engine='python',
-                             usecols=[1,2],
-                             nrows=3000,
-                             quoting=csv.QUOTE_NONNUMERIC,
-                             quotechar='\'')
-
-    # find most common words in dataset
-    corpus = df_dataset[1] + '. ' + df_dataset[2]
-    stemming = False
-    rel_freq = False
-    vocab = BagOfWords.make_vocab(corpus, stemming)
-
-    # print(BagOfWords.make_matrix(corpus, vocab, False, stemming))
-    print(BagOfWords.make_dict_common_words(corpus, rel_freq, stemming, 200))
-    # print(BagOfWords.count_features(corpus))
+    for word in sorted(BagOfWords.set_stop_words(False)):
+        print(word)
+        print()
+        print(PorterStemmer().stem(word))
+        print()
+    # BagOfWords.test()
@@ -4,74 +4,77 @@ Cosine Similarity
 
 CosineSimilarity measures the similarity between to articles.
 It calculates c: the cosine of the angle between the articles
-vectors dict_1 and dict_2.
-c = (dict_1 * dict_2) / (|dict_1| * |dict_2|).
+vectors text_1 and text_2.
+c = (text_1 * text_2) / (|text_1| * |text_2|).
 c = 1, if articles are equal => identicalness is 100%
 0 > c > 1, else => identicalness is (c*100)%
 (The greater c, the more similar two articles are.)
 '''
+from BagOfWords import BagOfWords
 
-#TODO:uses dictionaries of each article
-#=>ToDo:has to be changed as we are now using vectors
+import csv
 
 import math
 
-from BagOfWords import BagOfWords
+import pandas as pd
 
 class CosineSimilarity:
 
-    def cos_sim(dict_1, dict_2):
+    def calc_similarity(text_1, text_2, rel_freq=True, stemming=True):
+        ''' calculates cosine similarity of two input articles
+        '''
+        print('# calculating cosine similarity...')
+        print()
 
-        # list of all different words
-        vocab = []
+        # extract words from articles
+        extracted_words_1 = BagOfWords.extract_words(text_1, stemming)
+        extracted_words_2 = BagOfWords.extract_words(text_2, stemming)
+        print(extracted_words_1)
+        print(extracted_words_2)
 
-        # insert words of 1st article into vocab
-        for key in dict_1.keys():
-            if key not in vocab:
-                vocab.append(key)
-
-        # insert words of 2nd article into vocab
-        for key in dict_2.keys():
-            if key not in vocab:
-                vocab.append(key)
-
-        # delete first entry ('sum_words')
-        vocab.pop(0)
+        # insert words into vocab
+        both_extracted = []
+        both_extracted.append(extracted_words_1)
+        both_extracted.append(extracted_words_2)
+        vocab = BagOfWords.make_vocab(both_extracted, stemming)
 
         # create vectors
-        vector_1 = CosineSimilarity.create_vector(dict_1, vocab)
-        vector_2 = CosineSimilarity.create_vector(dict_2, vocab)
+        matrix = BagOfWords.make_matrix(both_extracted, vocab,\
+                                        rel_freq, stemming)
 
         # start calculation
         # calculate numerator of formula
         sum_1 = 0
 
-        for i in range (0,len(vector_1)):
-            sum_1 += vector_1[i] * vector_2[i]
+        for i in range (0,len(matrix.iloc[0])):
+            sum_1 += matrix.iloc[0][i] * matrix.iloc[1][i]
 
         # calculate denominator of formula
         sum_2 = 0
 
-        for entry in vector_1:
+        for entry in matrix.iloc[0]:
             sum_2 += entry ** 2
 
         sum_3 = 0
-        for entry in vector_2:
+        for entry in matrix.iloc[1]:
             sum_3 += entry ** 2
 
         return sum_1 / (math.sqrt(sum_2) * math.sqrt(sum_3))
 
-    def create_vector(dict, vocab):
-        # word frequency vector
-        vector = []
-        for word in vocab:
-            # check if word occurs in article
-            if word in dict:
-                # insert word count
-                vector.append(dict[word])
-            else:
-                # insert zero
-                vector.append(0)
-        # delete first entry ('sum_words')
-        vector.pop(0)
-        return vector
+if __name__ == '__main__':
+    # read data set
+    file = 'data\\interactive_labeling_dataset_without_header.csv'
+    df = pd.read_csv(file,
+                     delimiter='|',
+                     header=None,
+                     index_col=None,
+                     engine='python',
+                     usecols=[1,2],
+                     nrows=100,
+                     quoting=csv.QUOTE_NONNUMERIC,
+                     quotechar='\'')
+
+    texts = df[1] + '. ' + df[2]
+
+    # compare first and second article in data set
+    print(CosineSimilarity.calc_similarity(texts.iloc[0], texts.iloc[1],\
+                                           rel_freq=True, stemming=True))
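The docstring's formula can be verified with a tiny numeric example, computed the same way calc_similarity does (the two vectors are invented for illustration):

    import math

    vec_1 = [1, 0, 2]
    vec_2 = [1, 1, 1]
    numerator = sum(a * b for a, b in zip(vec_1, vec_2))            # 3
    denominator = math.sqrt(sum(a ** 2 for a in vec_1)) * \
                  math.sqrt(sum(b ** 2 for b in vec_2))             # sqrt(5) * sqrt(3)
    print(numerator / denominator)                                  # about 0.7746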
@@ -7,6 +7,9 @@ array X of size [n_samples, n_features], holding the training samples,
 and array y of integer values, size [n_samples],
 holding the class labels for the training samples.
 '''
+# toDo: replace old dataset!!!
+# CountVectorizer funktioniert noch nicht
 
 from BagOfWords import BagOfWords
 
 import csv
@@ -16,21 +19,22 @@ import graphviz
 import numpy as np
 import pandas as pd
 from sklearn import tree
-#from sklearn.feature_extraction.text import CountVectorizer
+# from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import f1_score
 from sklearn.model_selection import StratifiedKFold
 
 class DecisionTree:
 
-    def make_tree(dataset):
+    def make_tree(dataset, sklearn_cv=False, stemming=False, percentile=100):
         print('# fitting model')
         print('# ...')
 
         X = dataset['Title'] + ' ' + dataset['Text']
         y = dataset['Label']
 
-        #count_vector = CountVectorizer()
+        if sklearn_cv:
+            cv = CountVectorizer()
 
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)
@@ -45,33 +49,48 @@ class DecisionTree:
         important_words = {}
 
         # for each fold
+        n = 0
         for train, test in skf.split(X,y):
 
-            # BOW
-            vocab = BagOfWords.make_vocab(X[train])
-            # fit the training data and then return the matrix
-            training_data = BagOfWords.make_matrix(X[train], vocab)
-            # transform testing data and return the matrix
-            testing_data = BagOfWords.make_matrix(X[test], vocab)
+            n += 1
+            vocab = []
+            print('# split no. ' + str(n))
 
-            # #fit the training data and then return the matrix
-            # training_data = count_vector.fit_transform(X[train], y[train]).toarray()
-            # #transform testing data and return the matrix
-            # testing_data = count_vector.transform(X[test]).toarray()
+            if sklearn_cv:
+                # use sklearn CountVectorizer
+                # fit the training data and then return the matrix
+                training_data = cv.fit_transform(X[train], y[train]).toarray()
+                # transform testing data and return the matrix
+                testing_data = cv.transform(X[test]).toarray()
+            else:
+                # use my own BagOfWords python implementation
+                rel_freq = True
+                extracted_words = BagOfWords.extract_all_words(X[train], stemming)
+                vocab = BagOfWords.make_vocab(extracted_words, stemming)
+                print(vocab)
 
-            # # apply select percentile
-            # selector = SelectPercentile(percentile=25)
-            # selector.fit(training_data, y[train])
+                # fit the training data and then return the matrix
+                training_data = BagOfWords.make_matrix(extracted_words,
+                                                       vocab, rel_freq, stemming)
+                # transform testing data and return the matrix
+                extracted_words = BagOfWords.extract_all_words(X[test], stemming)
+                testing_data = BagOfWords.make_matrix(extracted_words,
+                                                      vocab, rel_freq, stemming)
 
-            # training_data_r = selector.transform(training_data)
-            # testing_data_r = selector.transform(testing_data)
+            # apply select percentile
+            selector = SelectPercentile(percentile=percentile)
+            selector.fit(training_data, y[train])
+
+            # new reduced data sets
+            training_data_r = selector.transform(training_data)
+            testing_data_r = selector.transform(testing_data)
 
             # fit classifier
-            classifier.fit(training_data, y[train])
+            classifier.fit(training_data_r, y[train])
 
             #predict class
-            predictions_train = classifier.predict(training_data)
-            predictions_test = classifier.predict(testing_data)
+            predictions_train = classifier.predict(training_data_r)
+            predictions_test = classifier.predict(testing_data_r)
 
             #store metrics predicted on test/train set
             f1_scores.append(f1_score(y[test], predictions_test))
@@ -80,6 +99,7 @@ class DecisionTree:
             # search for important features
             feature_importances = np.array(classifier.feature_importances_)
             important_indices = feature_importances.argsort()[-50:][::-1]
+            print(important_indices)
 
             for i in important_indices:
                 if vocab[i] in important_words:
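The fold loop above collects the 50 features with the highest classifier.feature_importances_ and maps them back to vocabulary words. A hedged sketch of that lookup step (the vocabulary and importance values are made up, and only the top 2 are taken to keep the example small):

    import numpy as np

    vocab = ['acqui', 'deal', 'merger', 'profit']
    feature_importances = np.array([0.05, 0.40, 0.50, 0.05])
    important_indices = feature_importances.argsort()[-2:][::-1]   # top 2, highest first
    print([vocab[i] for i in important_indices])                   # ['merger', 'deal']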
@@ -6,6 +6,8 @@ FilterKeywords searches for merger specific keywords
 in an article and counts them.
 '''
 
+# toDo: replace dict by vector/matrix
+
 from collections import defaultdict
 import re
 
@@ -18,14 +20,6 @@ class FilterKeywords:
         output are the contained keywords and their count.
         '''
 
-        # # list of regular expressions that match merger specific keywords
-        # regex_list = [r'merge[rs]*d?', r'acquisitions?', r'acquires?',
-        #               r'business combinations?', r'combined compan(y|ies)',
-        #               r'(joint venture|JV)s?', r'take[ -]?overs?', r'tie-up',
-        #               r'deals?', r'transactions?', r'approv(e|ing|al|ed)s?',
-        #               r'(buy(s|ers?|ing)?|bought)', r'buy[ -]?outs?',
-        #               r'purchase', r'(sell(s|ers?|ing)?|sold)']
-
         keyword_list = ['merge', 'merges', 'merged', 'merger', 'mergers',
                         'acquisition', 'acquire', 'acquisitions', 'acquires',
                         'combine', 'combines', 'combination', 'combined',
@@ -44,22 +38,22 @@ class FilterKeywords:
         # remove duplicates
         keywords = set(keyword_list)
 
-        # counts keywords in article (default value: 0)
-        dict_keywords = defaultdict(int)
+        # # counts keywords in article (default value: 0)
+        # dict_keywords = defaultdict(int)
 
-        # search for matchings in dictionary of input article
-        for key in dict_input.keys():
-            # iterate over all regular expressions
-            for kword in keywords:
-                if re.match(kword, key):
-                    # if match, increase value of matching key
-                    if str(kword) in dict_keywords:
-                        dict_keywords[str(kword)] += dict_input[key]
-                    else:
-                        dict_keywords[str(kword)] = dict_input[key]
+        # # search for matchings in dictionary of input article
+        # for key in dict_input.keys():
+        #     # iterate over all regular expressions
+        #     for kword in keywords:
+        #         if re.match(kword, key):
+        #             # if match, increase value of matching key
+        #             if str(kword) in dict_keywords:
+        #                 dict_keywords[str(kword)] += dict_input[key]
+        #             else:
+        #                 dict_keywords[str(kword)] = dict_input[key]
 
-        return dict_keywords
+        # return dict_keywords
 
 if __name__ == '__main__':
-    dict_test={'example':2, 'combined':5, 'sells':3}
-    print(FilterKeywords.search_keywords(dict_test))
+    # dict_test={'example':2, 'combined':5, 'sells':3}
+    # print(FilterKeywords.search_keywords(dict_test))
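The commit comments out the dict-based body of search_keywords pending the planned vector/matrix rework noted in the new toDo. For orientation only, here is a generic, self-contained sketch of the prefix matching that re.match performs over a keyword set (the keywords, words, and counts below are illustrative, not the repository's data):

    import re
    from collections import defaultdict

    keywords = {'merger', 'acqui', 'deal'}
    words = ['merger', 'dealers', 'acquisition', 'profit']
    counts = defaultdict(int)
    for w in words:
        for kword in keywords:
            if re.match(kword, w):       # matches at the beginning of the word
                counts[kword] += 1
    print(dict(counts))                  # e.g. {'merger': 1, 'deal': 1, 'acqui': 1}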
@@ -25,7 +25,7 @@ from sklearn.naive_bayes import GaussianNB
 
 class NaiveBayes:
 
-    def make_naive_bayes(dataset):
+    def make_naive_bayes(dataset, sklearn_cv=True, percentile=100):
         '''fits naive bayes model with StratifiedKFold,
         uses my BOW
         '''
@@ -34,10 +34,11 @@ class NaiveBayes:
 
         # split data into text and label set
         # join title and text
-        X = dataset['Title'] + ' ' + dataset['Text']
+        X = dataset['Title'] + '. ' + dataset['Text']
         y = dataset['Label']
 
-        cv = CountVectorizer()
+        if sklearn_cv:
+            cv = CountVectorizer()
 
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True, random_state=5)
@@ -61,23 +62,32 @@ class NaiveBayes:
             n += 1
             print('# split no. ' + str(n))
 
-            # # eigenes BOW
-            # vocab = BagOfWords.make_vocab(X[train])
-            # # fit the training data and then return the matrix
-            # training_data = BagOfWords.make_matrix(X[train], vocab)
-            # # transform testing data and return the matrix
-            # testing_data = BagOfWords.make_matrix(X[test], vocab)
-
-            # using CountVectorizer:
-            # fit the training data and then return the matrix
-            training_data = cv.fit_transform(X[train], y[train]).toarray()
-            # transform testing data and return the matrix
-            testing_data = cv.transform(X[test]).toarray()
+            if sklearn_cv:
+                # use sklearn CountVectorizer
+                # fit the training data and then return the matrix
+                training_data = cv.fit_transform(X[train], y[train]).toarray()
+                # transform testing data and return the matrix
+                testing_data = cv.transform(X[test]).toarray()
+            else:
+                # use my own BagOfWords python implementation
+                stemming = True
+                rel_freq = True
+                extracted_words = BagOfWords.extract_all_words(X[train])
+                vocab = BagOfWords.make_vocab(extracted_words)
+
+                # fit the training data and then return the matrix
+                training_data = BagOfWords.make_matrix(extracted_words,
+                                                       vocab, rel_freq, stemming)
+                # transform testing data and return the matrix
+                extracted_words = BagOfWords.extract_all_words(X[test])
+                testing_data = BagOfWords.make_matrix(extracted_words,
+                                                      vocab, rel_freq, stemming)
 
             # apply select percentile
-            selector = SelectPercentile(percentile=100)
+            selector = SelectPercentile(percentile=percentile)
            selector.fit(training_data, y[train])
 
+            # new reduced data sets
             training_data_r = selector.transform(training_data)
             testing_data_r = selector.transform(testing_data)
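Both classifiers now route each fold's matrices through SelectPercentile before fitting, with the percentile exposed as a parameter. A compact standalone sketch of that reduction step (random data and percentile=50, chosen only for illustration):

    import numpy as np
    from sklearn.feature_selection import SelectPercentile

    rng = np.random.RandomState(5)
    training_data = rng.rand(20, 10)            # 20 samples, 10 features
    y_train = rng.randint(0, 2, size=20)

    selector = SelectPercentile(percentile=50)  # keep the best-scoring 50% of features
    selector.fit(training_data, y_train)
    training_data_r = selector.transform(training_data)
    print(training_data_r.shape)                # (20, 5)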
@@ -10,13 +10,14 @@ import csv
 
 import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_selection import SelectPercentile
 from sklearn.metrics import recall_score, precision_score
 from sklearn.model_selection import StratifiedKFold
 from sklearn.naive_bayes import GaussianNB
 
 class NaiveBayes_Interactive:
 
-    def make_naive_bayes(dataset):
+    def make_naive_bayes(dataset, sklearn_cv=True, percentile=100):
         '''fits naive bayes model
         '''
         print('# fitting model')
@@ -24,10 +25,11 @@ class NaiveBayes_Interactive:
 
         # split data into text and label set
         # join title and text
-        X = dataset['Title'] + ' ' + dataset['Text']
+        X = dataset['Title'] + '. ' + dataset['Text']
         y = dataset['Label']
 
-        cv = CountVectorizer()
+        if sklearn_cv:
+            cv = CountVectorizer()
 
         # stratified k-fold cross-validation as split method
         kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=5)
@@ -51,17 +53,40 @@ class NaiveBayes_Interactive:
             n += 1
             print('# split no. ' + str(n))
 
-            # using CountVectorizer:
-            # fit the training data and then return the matrix
-            training_data = cv.fit_transform(X[train], y[train]).toarray()
-            # transform testing data and return the matrix
-            testing_data = cv.transform(X[test]).toarray()
+            if sklearn_cv:
+                # use sklearn CountVectorizer
+                # fit the training data and then return the matrix
+                training_data = cv.fit_transform(X[train], y[train]).toarray()
+                # transform testing data and return the matrix
+                testing_data = cv.transform(X[test]).toarray()
+            else:
+                # use my own BagOfWords python implementation
+                stemming = True
+                rel_freq = True
+                extracted_words = BagOfWords.extract_all_words(X[train])
+                vocab = BagOfWords.make_vocab(extracted_words)
+
+                # fit the training data and then return the matrix
+                training_data = BagOfWords.make_matrix(extracted_words,
+                                                       vocab, rel_freq, stemming)
+                # transform testing data and return the matrix
+                extracted_words = BagOfWords.extract_all_words(X[test])
+                testing_data = BagOfWords.make_matrix(extracted_words,
+                                                      vocab, rel_freq, stemming)
+
+            # apply select percentile
+            selector = SelectPercentile(percentile=percentile)
+            selector.fit(training_data, y[train])
+
+            # new reduced data sets
+            training_data_r = selector.transform(training_data)
+            testing_data_r = selector.transform(testing_data)
 
             #fit classifier
-            classifier.fit(training_data, y[train])
+            classifier.fit(training_data_r, y[train])
             #predict class
-            predictions_train = classifier.predict(training_data)
-            predictions_test = classifier.predict(testing_data)
+            predictions_train = classifier.predict(training_data_r)
+            predictions_test = classifier.predict(testing_data_r)
 
             #print and store metrics
             rec = recall_score(y[test], predictions_test)
@@ -166,7 +191,9 @@ class NaiveBayes_Interactive:
                        quotechar='\'',
                        quoting=csv.QUOTE_NONE)
 
-    make_naive_bayes(data)
+    use_count_vectorizer = True
+    select_percentile = 100
+    make_naive_bayes(data, use_count_vectorizer, select_percentile)
 
     print('#')
     print('# ending naive bayes')
SVM.py (21 changed lines)

@@ -27,7 +27,7 @@ from sklearn.svm import SVC
 
 class SVM:
 
-    def make_svm(dataset):
+    def make_svm(dataset, sklearn_cv=True):
 
         print('# fitting model')
         print('# ...')
@@ -35,16 +35,18 @@ class SVM:
         # split data into text and label set
 
         # articles' text (title + text)
-        X = dataset['Title'] + ' ' + dataset['Text']
+        X = dataset['Title'] + '. ' + dataset['Text']
         # articles' labels
         y = dataset['Label']
+        matrix = pd.DataFrame()
 
-        # Bag of Words
-        print('# calculating bag of words')
-        print('# ...')
         # fit the training data and then return the matrix
-        #X = BagOfWords.fit_transform(X)
-        X = CountVectorizer().fit_transform(X).toarray()
+        if sklearn_cv:
+            # use sklearn CountVectorizer
+            matrix = CountVectorizer().fit_transform(X).toarray()
+        else:
+            # use own BOW implementation
+            matrix = BagOfWords.fit_transform(X)
 
         # use stratified k-fold cross-validation as split method
         skf = StratifiedKFold(n_splits = 10, shuffle=True)
@@ -64,7 +66,7 @@ class SVM:
         print('# fit classifier')
         print('# ...')
 
-        grid.fit(X,y)
+        grid.fit(matrix,y)
 
         # DataFrame of results
         df_results = grid.cv_results_
@@ -104,6 +106,7 @@ class SVM:
                        quotechar='\'',
                        quoting=csv.QUOTE_NONE)
 
-    make_svm(data)
+    use_count_vectorizer = True
+    make_svm(data, use_count_vectorizer)
 
     print('# ending svm')
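make_svm now builds the document-term matrix once (either with CountVectorizer or the project's BagOfWords) and hands it to the grid search via grid.fit(matrix, y). A minimal hedged sketch of that pattern with an SVC grid (the corpus, labels, and parameter grid are placeholders, not the repository's actual values):

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.model_selection import GridSearchCV, StratifiedKFold
    from sklearn.svm import SVC

    X = ['company a buys company b', 'quarterly profit rises',
         'merger approved', 'stock falls']
    y = [1, 0, 1, 0]

    matrix = CountVectorizer().fit_transform(X).toarray()
    skf = StratifiedKFold(n_splits=2, shuffle=True)
    grid = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=skf, scoring='f1')
    grid.fit(matrix, y)
    print(grid.best_params_)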
@@ -22,7 +22,7 @@ class VisualizerNews:
     def plot_wordcloud_dataset():
         '''plots word cloud image of most common words in dataset.
         '''
-        print('# preparing word cloud...')
+        print('# preparing word cloud of 200 most common words...')
         print()
         # load new data set
         file = 'data\\interactive_labeling_dataset_without_header.csv'
@@ -32,17 +32,18 @@ class VisualizerNews:
                                  index_col=None,
                                  engine='python',
                                  usecols=[1,2],
-                                 #nrows=100,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
 
-        corpus = df_dataset[1] + ' ' + df_dataset[2]
+        corpus = df_dataset[1] + '. ' + df_dataset[2]
+        stemming = False
+        rel_freq = False
 
         # find most common words in dataset
-        dict = BagOfWords.make_dict_common_words(corpus,
-                                                 rel_freq=True,
-                                                 stemming=False,
-                                                 n=200)
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, 200, rel_freq, stemming)
 
         wordcloud = WordCloud(background_color='white',
                               width=2400,
@@ -62,30 +63,25 @@ class VisualizerNews:
         x-axis: number of mentions of the company
         y-axis: frequency
         '''
-        print('# preparing histogram...')
+        print('# preparing histogram of company mentions...')
         print()
-        # old data set
-        filepath = 'data\\classification_labelled_corrected.csv'
-        df = pd.read_csv(filepath,
-                         sep='|',
+        # read data set
+        file = 'data\\interactive_labeling_dataset_without_header.csv'
+        df = pd.read_csv(file,
+                         delimiter='|',
+                         header=None,
+                         index_col=None,
                          engine='python',
-                         decimal='.',
-                         quotechar='\'',
-                         quoting=csv.QUOTE_NONE)
+                         usecols=[1,2],
+                         quoting=csv.QUOTE_NONNUMERIC,
+                         quotechar='\'')
 
-        # only articles with label==1
-        df_hits = df[df['Label'] == 1]
+        # # only articles with label==1
+        # df_hits = df[df['Label'] == 1]
+        # texts = df_hits['Title'] + '. ' + df_hits['Text']
+        texts = df[1] + '. ' + df[2]
 
-        texts = df_hits['Title'] + '. ' + df_hits['Text']
+        # dict: count articles with company names
 
-        # # zum prüfen lesen
-        # for text in texts[10:20]:
-        #     print(text)
-        #     print()
-        #     print(NER.find_companies(text))
-        #     print()
-
-        # count names in hit articles
         count_names = NER.count_companies(texts)
 
         # sort list in descending order
@@ -98,7 +94,7 @@ class VisualizerNews:
         plt.ylabel('Number of companies with this number of articles')
         num_bins = 50
         n, bins, patches = plt.hist(names, num_bins, facecolor='darkred', alpha=0.5)
-        # plt.grid(True)
+        plt.axis([0, 50, 0, 1000])
         plt.show()
 
     def plot_histogram_text_lengths():
@@ -106,20 +102,21 @@ class VisualizerNews:
         x-axis: number of characters in article (without headline)
         y-axis: frequency
         '''
-        print('# preparing histogram...')
+        print('# preparing histogram of text lengths...')
         print()
-        # new data set
+        # read data set
         filepath = 'data\\interactive_labeling_dataset.csv'
         df_dataset = pd.read_csv(filepath,
                                  delimiter='|',
                                  header=0,
                                  index_col=None,
                                  engine='python',
+                                 usecols=[2],
                                  #nrows=100,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
         # consider only Text, not Headline
-        texts = df_dataset['Text']
+        texts = df_dataset[2]
 
         # count characters in articles
         print('# counting characters in articles...')
@@ -150,7 +147,7 @@ class VisualizerNews:
 
     def plot_pie_chart_of_sites():
 
-        print('# preparing pie chart...')
+        print('# preparing pie chart of news article sites...')
        print()
 
         # load data set
@@ -164,13 +161,15 @@ class VisualizerNews:
                                  #nrows=100,
                                  quoting=csv.QUOTE_NONNUMERIC,
                                  quotechar='\'')
+        # find all different sites
         df_counts = df_dataset.groupby('Site').count()
+        # count occurences of each site
         df_counts = df_counts.sort_values(['Url'], ascending=False)
 
         fig, ax = plt.subplots(figsize=(6, 3), subplot_kw=dict(aspect="equal"))
 
         data = list(df_counts['Url'])
+        # legend labels
         labels = ['Reuters (94%)', 'The Guardian (3%)', 'The Economist (2%)',
                   'Bloomberg (<1%)', 'CNN (<1%)', 'Financial Times (<1%)']
 
@@ -188,14 +187,14 @@ class VisualizerNews:
         plt.show()
 
     def plot_hist_most_common_words(n_commons = 10):
-        print('# preparing histogram...')
+        print('# preparing histogram of most common words...')
         print()
         # load data set
         filepath = 'data\\interactive_labeling_dataset_without_header.csv'
         df_dataset = pd.read_csv(filepath,
                                  delimiter='|',
                                  header=None,
-                                 #usecols=[1,2],
+                                 usecols=[1,2],
                                  index_col=None,
                                  engine='python',
                                  #nrows=1000,
@@ -204,11 +203,14 @@ class VisualizerNews:
 
         corpus = df_dataset[1] + '. ' + df_dataset[2]
 
+        stemming = False
+        rel_freq = True
+
         # find most common words in dataset
-        dict = BagOfWords.make_dict_common_words(corpus,
-                                                 rel_freq=True,
-                                                 stemming=False,
-                                                 n=n_commons)
+        extracted_words = BagOfWords.extract_all_words(corpus, stemming)
+        vocab = BagOfWords.make_vocab(extracted_words, stemming)
+        matrix = BagOfWords.make_matrix(extracted_words, vocab, rel_freq, stemming)
+        dict = BagOfWords.make_dict_common_words(matrix, n_commons, rel_freq, stemming)
 
         plt.xlabel('Most common words in textual corpus')
         plt.ylabel('Relative frequency')
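plot_wordcloud_dataset feeds the dict of the 200 most common words into WordCloud. The rendering step it relies on, generate_from_frequencies, is sketched here with a toy frequency dict; the height value and the matplotlib display lines are illustrative assumptions and may differ from the repository's code:

    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    common_words = {'merger': 0.012, 'deal': 0.009, 'company': 0.008}  # toy frequencies
    wordcloud = WordCloud(background_color='white', width=2400, height=1200)
    wordcloud.generate_from_frequencies(common_words)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()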