diff --git a/datasets/decompress.sh b/datasets/decompress.sh
new file mode 100755
index 0000000..691df92
--- /dev/null
+++ b/datasets/decompress.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Decompress every *.tar.xz archive in the current directory.
+
+# Loop over the glob itself rather than parsing ls output, so that file names
+# containing whitespace stay intact; nullglob makes the loop a no-op when
+# nothing matches.
+shopt -s nullglob
+
+for filename in *.tar.xz; do
+  echo "Decompressing ${filename}"
+  tar -xJf "${filename}"
+done
diff --git a/job_assessment.csv.tar.xz b/datasets/job_assessment.csv.tar.xz
similarity index 100%
rename from job_assessment.csv.tar.xz
rename to datasets/job_assessment.csv.tar.xz
diff --git a/job_similarities_5024292.csv.tar.xz b/datasets/job_similarities_5024292.csv.tar.xz
similarity index 100%
rename from job_similarities_5024292.csv.tar.xz
rename to datasets/job_similarities_5024292.csv.tar.xz
diff --git a/job_similarities_7488914.csv.tar.xz b/datasets/job_similarities_7488914.csv.tar.xz
similarity index 100%
rename from job_similarities_7488914.csv.tar.xz
rename to datasets/job_similarities_7488914.csv.tar.xz
diff --git a/scripts/r_visual_jobs#pickle_reader.py b/scripts/r_visual_jobs#pickle_reader.py
new file mode 100755
index 0000000..5f64fb7
--- /dev/null
+++ b/scripts/r_visual_jobs#pickle_reader.py
@@ -0,0 +1,15 @@
+# Required for job visualization
+# job_visualization_r.ipynb
+
+import pandas as pd
+
+
+def read_pickle_file(file):
+    """Load a job pickle and return its data object with the index reset.
+
+    The pickle is unpacked as a 4-tuple (start, stop, data, metadata); only
+    ``data`` is used.
+    """
+    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
+    _start, _stop, data, _metadata = pd.read_pickle(file)
+    return data.reset_index()
diff --git a/scripts/visualize.R b/scripts/visualize.R
new file mode 100755
index 0000000..05dd4aa
--- /dev/null
+++ b/scripts/visualize.R
@@ -0,0 +1,210 @@
+#!/usr/bin/env Rscript
+# Render per-job score plots (JScore, NScore and MScore views) from categorized
+# job data and save them as PNG files.
+
+library('ggplot2')
+library('ggthemes')
+library('tidyverse')
+library('repr')
+library('jcolors')
+library("reticulate")
+
+#setwd(source_dir)
+use_python("/mnt/lustre01/work/ku0598/k202107/software/install/python/3.8.0/bin/python3", required = TRUE)
+# NOTE(review): this change adds the reader as scripts/r_visual_jobs#pickle_reader.py,
+# but the path below points into scripts/jupyter/ -- confirm which is intended.
+source_python("/work/ku0598/k202107/git/mistral-job-evaluation/scripts/jupyter/r_visual_jobs#pickle_reader.py")
+
+# Locations and key shared by all jobs
+global <- list()
+global[['source_dir']] <- '/work/ku0598/k202107/git/mistral-job-evaluation/data/eval_20200117'
+global[['eval_dir']] <- '../evaluation'
+global[['fig_dir']] <- sprintf('%s/pictures/jobs', global[['eval_dir']])
+global[['key']] <- 22897682
+
+# Job selection; jobid is the crypted job id XOR-ed with the key
+config <- list()
+config[['crypted_jobid']] <- 4296426 # has 16 levels
+config[['jobid']] <- bitwXor(config[['crypted_jobid']], global[['key']])
+config[['cat_fn']] <- sprintf("%s/600/cats/%s.json", global[['source_dir']], config[['jobid']])
+config[['raw_fn']] <- sprintf('%s/600/jobdata/%s.pkl', global[['source_dir']], config[['jobid']])
+
+
+graph_config <- list()
+# Colorized entities
+# "name" : file systems
+# "host" : compute nodes
+# "metric" : I/O metrics
+graph_config[['cols']] <- c('metric', 'host', 'name')
+#graph_config[['cols']] <- c('host', 'name')
+
+# Enabled views
+#'default', 'jscore', 'nscore', 'mscore'
+graph_config[['views']] <- c('jscore', 'default', 'nscore', 'mscore')
+#graph_config[['views']] <- c('default')
+
+# Put an x-axis label at every nth segment boundary
+graph_config[['x_breakpoint_interval']] <- 5
+
+# Segment size in minutes
+graph_config[['seg_size']] <- 10
+
+
+#' Shorten metric names for display.
+#'
+#' Removes the "host.lustre." and "stats." fragments and rewrites ".bytes" and
+#' ".calls" to "_bytes" and "_calls". Patterns are matched literally, in order.
+#'
+#' @param data Data frame with a `metric` column.
+#' @return `data` with the rewritten `metric` column.
+rename_metrics <- function(data) {
+  substitutions <- list(
+    c("host.lustre.", ""),
+    c("stats.", ""),
+    c(".bytes", "_bytes"),
+    c(".calls", "_calls")
+  )
+  for (s in substitutions) {
+    data[['metric']] <- gsub(s[1], s[2], data[['metric']], fixed = TRUE)
+  }
+  data
+}
+
+
+#' Plot the binned scores of one job and save the figure as a PNG file.
+#'
+#' @param gconf List providing `fig_dir`, the root directory for figures.
+#' @param cconf List providing `jobid`; files are written to fig_dir/jobid/.
+#' @param data Data frame with `bin` and `score` columns plus the columns used
+#'   for colouring and faceting (`metric`, `host`, `name`).
+#' @param view One of 'jscore', 'default', 'nscore', 'mscore'; any other value
+#'   saves nothing.
+#' @param col Column mapped to the fill colour: 'host', 'metric' or 'name'.
+#' @param x_breakpoints Bin labels at which x-axis ticks are drawn.
+#' @return Invisibly, the path of the written file, or NULL if nothing was saved.
+visualize_categories <- function(gconf, cconf, data, view, col, x_breakpoints) {
+  out_dir <- sprintf('%s/%d', gconf[['fig_dir']], cconf[['jobid']])
+  # The directory is shared by all views of a job; do not warn if it exists.
+  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
+
+  # Legend title; fail early with a clear message on an unsupported column
+  gtitle <- switch(col,
+    host = 'Node',
+    metric = 'Metric',
+    name = 'File system',
+    stop(sprintf("unsupported col '%s'", col), call. = FALSE)
+  )
+
+  # The palette with black:
+  #cbp2 = c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#999999")
+  # General plot
+  p <- ggplot(data, aes_string(x = 'bin', y = 'score', fill = col)) +
+    #geom_bar(stat='summary', fun.y = "mean") +
+    geom_bar(stat = 'identity') +
+    scale_x_discrete(breaks = x_breakpoints) +
+    #scale_fill_manual(values = cbp2) +
+    guides(fill = guide_legend(title = gtitle, nrow = 15)) +
+    ylab('JScore') +
+    xlab('Runtime in minutes') +
+    theme_linedraw() +
+    theme(
+      legend.spacing.y = unit(0, 'cm'),
+      legend.text = element_text(size = 8, margin = margin(t = 1)),
+      strip.text.x = element_text(size = 8, color = "black"),
+      strip.text.y = element_text(size = 8, color = "black"),
+      legend.key = element_rect(size = 1),
+      legend.key.size = unit(0.5, 'lines'),
+      strip.background = element_rect(color = "black", fill = "#FFFFFF", linetype = "solid")
+    )
+
+  if (col == 'metric') {
+    p <- p + scale_fill_jcolors("pal12")
+  }
+  # Disabled: hide the legend when colouring by host with more than 13 hosts
+  #if (col == 'host' && nrow(unique(data['host'])) > 13) {
+  #  p <- p + theme(legend.position = 'none')
+  #}
+
+  # Per-view facets, labels and output size (width, height)
+  if (view == 'jscore') {
+    size <- c(10, 2.5)
+  } else if (view == 'default') {
+    p <- p +
+      facet_grid(metric ~ .) +
+      ylab('Score') +
+      theme(
+        legend.position = 'none',
+        strip.text.y = element_text(angle = 0)
+      )
+    size <- c(7, 7)
+  } else if (view == 'nscore') {
+    p <- p +
+      facet_grid(host ~ .) +
+      ylab('NScore')
+    if (col == 'name') {
+      p <- p + theme(legend.position = 'bottom')
+    }
+    size <- c(4, 4)
+  } else if (view == 'mscore') {
+    p <- p +
+      facet_grid(host ~ metric) +
+      ylab('MScore') +
+      theme(
+        legend.position = 'none',
+        axis.text.x = element_text(angle = 90, hjust = 1)
+      )
+    size <- c(8, 4)
+  } else {
+    # Unknown views are skipped silently
+    return(invisible(NULL))
+  }
+
+  fn <- sprintf('%s/%s_%s.png', out_dir, view, col)
+  # Pass the plot explicitly instead of relying on ggplot2's last-plot state
+  ggsave(fn, plot = p, width = size[1], height = size[2])
+  invisible(fn)
+}
+
+
+# Create segments of seg_size minutes
+# NOTE(review): cat_fn ends in .json but is parsed with read.csv -- confirm the format.
+cat_data <- rename_metrics(read.csv(config[['cat_fn']])) # categorized data
+cat_data[['rmin']] <- cat_data[['runtime']] / 60 # runtime in minutes
+seg_size <- graph_config[['seg_size']]
+duration <- ceiling(max(cat_data[['rmin']], na.rm = TRUE))
+# Round the last edge up to a whole segment (at least one segment) so that the
+# largest runtimes fall into a bin instead of becoming NA.
+last_edge <- max(seg_size, ceiling(duration / seg_size) * seg_size)
+bins <- seq(0, last_edge, by = seg_size)
+
+
+# include.lowest keeps samples with a runtime of exactly 0 in the first bin
+d2 <- cat_data %>%
+  mutate(bin = cut(rmin, breaks = bins, labels = bins[-1], include.lowest = TRUE))
+d3 <- d2 %>%
+  group_by(name, metric, host, bin) %>%
+  summarise(score = sum(cat)) %>%
+  ungroup()
+
+x_breakpoints <- bins[seq(1, length(bins), graph_config[['x_breakpoint_interval']])]
+
+for (col in graph_config[['cols']]) {
+  for (view in graph_config[['views']]) {
+    visualize_categories(global, config, d3, view, col, x_breakpoints)
+  }
+}
+
+
+## TODO
+#visualize_rawdata <- function(data) {
+#}
+
+#pickle_data <- rename_metrics(read_pickle_file(config[['raw_fn']])) # raw data
+#print(head(pickle_data))
+#offset = min(pickle_data$timestamp)
+#dat = pickle_data[complete.cases(pickle_data),]
+#dat$runtime = dat$timestamp - offset
+#dat['rmin'] = dat['runtime'] / 60 # runtime in minutes
+
+#visualize_rawdata(dat)
+