'''
Csv Handler
===========

CsvHandler writes articles' information to a csv file and reads it back.
'''

import csv

import numpy as np
import pandas as pd


class CsvHandler:
    '''Namespace of helpers for the pipe-separated article csv files.

    All helpers are static: they can be called on the class
    (``CsvHandler.read_csv(...)``) or on an instance.
    '''

    @staticmethod
    def read_csv(csv_file):
        '''Reads a pipe-separated csv file into a DataFrame.

        params: csv_file path (or file-like object) of the csv file
        returns DataFrame holding only the columns at positions 1, 2 and 4
        of the file; the first line of the file is used as header
        '''
        df = pd.read_csv(csv_file,
                         sep='|',
                         header=0,
                         engine='python',
                         # use only 'Title', 'Text' and 'Label'
                         usecols=[1, 2, 4],
                         decimal='.',
                         quotechar='\'',
                         # quote characters are kept as ordinary text
                         quoting=csv.QUOTE_NONE)
        return df

    @staticmethod
    def write_csv(df, file_name):
        '''Writes a DataFrame (including its index) to a pipe-separated
        csv file and prints how many rows were saved.

        params: df DataFrame to save,
                file_name path (or writable buffer) of the target file
        '''
        df.to_csv(file_name, sep='|')
        print('# saved {} article(s) in {}'.format(len(df), file_name))

    @staticmethod
    def select_randoms(df, n):
        '''Selects n random samples from dataset.

        The selection is reproducible: the same input always yields the
        same rows. The DataFrame passed in is left untouched and numpy's
        global random state is not modified.

        params: df DataFrame to select items from,
                n number of items to select randomly
        returns new DataFrame with only the selected items and an extra
        column 'Random' holding the sort key of each row; if n exceeds
        the number of rows, all rows are returned
        '''
        # private generator with fixed seed => reproducible sequence,
        # identical to seeding the global generator with 5
        rng = np.random.RandomState(5)
        # one standard-normal pseudorandom float for every row
        keys = rng.randn(len(df))
        # assign() works on a copy, so the caller's DataFrame does not
        # get the helper column 'Random'
        shuffled = df.assign(Random=keys).sort_values('Random')
        # return first n elements of randomly sorted dataset
        return shuffled.iloc[0:n]


if __name__ == '__main__':
    df = CsvHandler.read_csv('classification_labelled_corrected.csv')
    df_new = CsvHandler.select_randoms(df, 10)
    CsvHandler.write_csv(df_new, 'samples_10.csv')