added random selection
This commit is contained in:
parent
759db3c0cf
commit
8d6af51409
|
@ -7,6 +7,7 @@ CsvHandler writes articles' information to csv file and reads it.
|
||||||
|
|
||||||
import csv
|
import csv
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
class CsvHandler:
|
class CsvHandler:
|
||||||
|
@ -26,3 +27,27 @@ class CsvHandler:
|
||||||
def write_csv(df, file_name):
    '''writes the DataFrame to a csv file and reports the result.

    params: df DataFrame holding the articles to store,
            file_name path of the csv file to write,
    the file uses '|' as column separator; a one-line summary with the
    number of stored rows is printed afterwards
    '''
    # store first, so the summary is only printed after a successful write
    df.to_csv(file_name, sep='|')
    n_saved = len(df)
    print('# saved {} article(s) in {}'.format(n_saved, file_name))
|
||||||
|
|
||||||
|
def select_randoms(df, n):
    '''selects n random samples from dataset, reproducibly.

    Every row gets a pseudorandom sort key drawn from a standard normal
    distribution (fixed seed 5, so equally sized inputs always yield the
    same selection). The rows are sorted by that key and the first n rows
    are returned.

    params: df DataFrame to select items from (it is not modified),
            n number of items to select randomly; if n is larger than
              len(df), all rows are returned,
    returns new DataFrame with only selected items, ordered by the sort
            key, which is kept in the additional column 'Random'
    raises: ValueError if n is negative
    '''
    # a negative n would be read as a slice bound by iloc and silently
    # return all but the last |n| rows
    if n < 0:
        raise ValueError('n must be non-negative, got {}'.format(n))
    # dedicated generator: seeding with 5 gives the same sequence as
    # np.random.seed(5), but leaves NumPy's global random state untouched
    rng = np.random.RandomState(5)
    # assign() returns a copy, so the caller's DataFrame does not gain the
    # helper column 'Random'
    keyed = df.assign(Random=rng.randn(len(df)))
    # return first n elements of randomly sorted dataset
    return keyed.sort_values('Random').iloc[0:n]
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # script entry point: read the labelled articles, pick 10 of them via
    # select_randoms and store that subset in a separate csv file
    labelled = CsvHandler.read_csv('classification_labelled_corrected.csv')
    samples = CsvHandler.select_randoms(labelled, 10)
    CsvHandler.write_csv(samples, 'samples_10.csv')
|
Loading…
Reference in New Issue