added random selection

This commit is contained in:
Anne Lorenz 2018-09-26 10:30:17 +02:00
parent 759db3c0cf
commit 8d6af51409
1 changed file with 26 additions and 1 deletion

View File

@@ -7,6 +7,7 @@ CsvHandler writes articles' information to csv file and reads it.
import csv
import numpy as np
import pandas as pd
class CsvHandler:
@@ -25,4 +26,28 @@ class CsvHandler:
def write_csv(df, file_name):
    '''writes DataFrame to csv file and reports the number of saved rows.
    params: df DataFrame to save,
            file_name path of the csv file to write,
    the file uses '|' as separator and contains the index column;
    returns nothing
    '''
    df.to_csv(file_name, sep='|')
    # report once (the message used to be printed twice per call)
    print('# saved {} article(s) in {}'.format(len(df), file_name))
def select_randoms(df, n):
    '''selects n random samples from dataset.
    params: df DataFrame to select items from,
            n number of items to select randomly,
    returns new DataFrame with only selected items; it carries an extra
    column 'Random' holding the random sort key of every selected row.

    The sort keys come from a generator with a fixed seed, so the same
    input always yields the same selection. The DataFrame passed in is
    left unmodified.
    '''
    # private generator with fixed seed => reproducible sequence, identical
    # to np.random.seed(5) but without reseeding numpy's global generator
    rng = np.random.RandomState(5)
    # one standard-normal float per row (unbounded, not limited to [-1, 1])
    keys = pd.Series(rng.randn(len(df)), index=df.index)
    # add column 'Random' on a copy (not on the caller's DataFrame)
    # and sort the copy by the random numbers
    shuffled = df.assign(Random=keys).sort_values('Random')
    # return first n elements of randomly sorted dataset
    return shuffled.iloc[0:n]
if __name__ == '__main__':
    # load the labelled articles, draw 10 of them via select_randoms
    # and store the selection in its own csv file
    articles = CsvHandler.read_csv('classification_labelled_corrected.csv')
    CsvHandler.write_csv(CsvHandler.select_randoms(articles, 10),
                         'samples_10.csv')