# mistral-io-datasets/scripts/visualize.R

#!/usr/bin/env Rscript

library('ggplot2')
library('ggthemes')
library('tidyverse')
library('repr')
library('jcolors')
library("reticulate")

# Command line: visualize.R <crypted_jobid>
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a clear message instead of letting an NA job id leak into
# the file names that are built below.
if (length(args) < 1) {
    stop("Usage: visualize.R <crypted_jobid>", call. = FALSE)
}
if (is.na(strtoi(args[1]))) {
    stop(sprintf("Invalid crypted job id '%s' (expected an integer)", args[1]), call. = FALSE)
}

#setwd(source_dir)
# NOTE(review): both paths are machine-specific absolute paths; the sourced
# Python file presumably provides the pickle reader that is only referenced
# from commented-out code in this script -- confirm it is still needed.
use_python("/mnt/lustre01/work/ku0598/k202107/software/install/python/3.8.0/bin/python3", required = TRUE)
source_python("/work/ku0598/k202107/git/mistral-job-evaluation/scripts/jupyter/r_visual_jobs#pickle_reader.py")

# Locations of the input data and of the generated figures
global <- list()
global[['source_dir']] <- '/work/ku0598/k202107/git/mistral-job-evaluation/data/eval_20200117'
global[['eval_dir']] <- '../evaluation'
global[['fig_dir']] <- sprintf('%s/figures/job_visualization', global[['eval_dir']])
global[['key']] <- 22897682 # XOR key used to turn the crypted job id into the job id

# Per-job settings derived from the command line argument
config <- list()
config[['crypted_jobid']] <- strtoi(args[1])
config[['jobid']] <- bitwXor(config[['crypted_jobid']], global[['key']])
config[['cat_fn']] <- sprintf("%s/600/cats/%s.json", global[['source_dir']], config[['jobid']])
config[['raw_fn']] <- sprintf('%s/600/jobdata/%s.pkl', global[['source_dir']], config[['jobid']])


# Plot configuration, built in one literal.
graph_config <- list(
    # Colorized entities: "name" : file systems; "host" : compute nodes, "metric" : I/O metrics
    'cols' = c('metric', 'host', 'name'),
    # Enabled views; available: 'default', 'jscore', 'nscore', 'mscore'
    'views' = c('jscore', 'default', 'nscore', 'mscore'),
    # Number of breakpoints on x-axis
    'n_x_breakpoints' = 5,
    # Segments size in minutes
    'seg_size' = 10,
    # Base plot size per view
    'plot_size' = list(
        'default' = list('height' = 1, 'width' = 10),
        'jscore' = list('height' = 3, 'width' = 10),
        'nscore' = list('height' = 1, 'width' = 14),
        'mscore' = list('height' = 1, 'width' = 1)
    ),
    # Dimension limits per view; only the host limit differs between views
    'max_dimensions' = lapply(
        c('default' = 13, 'jscore' = 13, 'nscore' = 50, 'mscore' = 50),
        function(host_limit) {
            list('seg' = 1000, 'host' = host_limit, 'name' = 2, 'metric' = 9)
        }
    ),
    # Legend limits per view; currently the same for every view
    'max_legend_size' = lapply(
        c('default' = 15, 'jscore' = 15, 'nscore' = 15, 'mscore' = 15),
        function(host_limit) {
            list('seg' = 1000, 'host' = host_limit, 'name' = 2, 'metric' = 9)
        }
    )
)

# Shorten the values of the 'metric' column: the "host.lustre." and "stats."
# parts are removed and the ".bytes" / ".calls" parts become "_bytes" /
# "_calls". All patterns are matched literally and applied in this order.
# Returns the data with the rewritten 'metric' column.
rename_metrics <- function(data) {
    rules <- list(
        list(from = "host.lustre.", to = ""),
        list(from = "stats.", to = ""),
        list(from = ".bytes", to = "_bytes"),
        list(from = ".calls", to = "_calls")
    )
    for (rule in rules) {
        data['metric'] <- lapply(data['metric'], function(values) {
            gsub(rule[['from']], rule[['to']], values, fixed = TRUE)
        })
    }
    data
}


# Build a stacked bar chart of the scores per runtime segment for one view
# and save it to a file.
#
# Arguments:
#   fn            Output file path handed to ggsave().
#   gconf         Global settings; not used by this function.
#   cconf         Job settings; `jobid` and `crypted_jobid` go into the title.
#   vconf         Graph settings; `plot_size` and `max_legend_size` are read.
#   data          Data frame with the columns 'seg', 'score' and `col`; the
#                 faceted views additionally use 'metric' and/or 'host'.
#   view          One of 'default', 'jscore', 'nscore', 'mscore'. For any
#                 other value nothing is saved.
#   col           Name of the column that is mapped to the fill colour.
#   x_breakpoints Breaks shown on the x-axis.
#   dims          List with the entries 'metric', 'host', 'name' and 'seg'
#                 (integer counts); used for the title, the legend limit and
#                 the size of the saved figure.
#
# Returns the value of ggsave() invisibly, or invisible NULL when the view is
# not known.
visualize_categories <- function(fn, gconf, cconf, vconf, data, view, col, x_breakpoints, dims) {
    # Set legend title. Columns without a dedicated title (e.g. 'seg') fall
    # back to the column name instead of leaving the title undefined.
    legend_titles <- c('host' = 'Node', 'metric' = 'Metric', 'name' = 'File system')
    if (col %in% names(legend_titles)) {
        gtitle <- legend_titles[[col]]
    } else {
        gtitle <- col
    }

    title <- sprintf('JOBID: %d / %d (M:H:F:S)=(%d:%d:%d:%d)', cconf$jobid, cconf$crypted_jobid, dims$metric, dims$host, dims$name, dims$seg)

    # General plot
    p <- (
        ggplot(data, aes_string(x='seg', y='score', fill=col)) +
        ggtitle(title) +
        geom_bar(stat='identity') +
        scale_x_discrete(breaks=x_breakpoints) +
        guides(fill = guide_legend(title=gtitle, nrow=15)) +
        xlab('Runtime in minutes') +
        theme_linedraw() +
        theme(
            legend.spacing.y = unit(0, 'cm'),
            legend.text = element_text(size = 8, margin = margin(t = 1)),
            strip.text.x = element_text(size = 8, color = "black"),
            strip.text.y = element_text(size = 8, color = "black"),
            legend.key = element_rect(size = 1),
            legend.key.size = unit(0.5, 'lines'),
            strip.background = element_rect(color="black", fill="#FFFFFF", linetype="solid")
        )
    )

    # Dimensions modifier: only the metric colouring uses a dedicated palette
    if (col == 'metric') {
        p <- p + scale_fill_jcolors("pal12")
    }

    # View modifiers: each view adds its facets and labels and determines the
    # size of the saved figure
    size <- vconf$plot_size[[view]]
    extra_space <- 2
    if (view == 'default') {
        p <- (
            p +
            facet_grid(metric ~ .) +
            ylab('Score') +
            theme(strip.text.y = element_text(angle=0))
        )
        width <- size[['width']]
        height <- size[['height']] * dims[['metric']]
    } else if (view == 'jscore') {
        p <- (
            p +
            ylab('JScore') +
            theme(strip.text.y = element_text(angle=0))
        )
        width <- size[['width']]
        height <- size[['height']]
    } else if (view == 'nscore') {
        p <- (
            p +
            facet_grid(host ~ .) +
            ylab('NScore') +
            theme(
                strip.text.y = element_text(angle=0),
                aspect.ratio = size[['height']] / size[['width']]
            )
        )
        width <- size[['width']]
        height <- size[['height']] * (dims[['host']] + extra_space)
    } else if (view == 'mscore') {
        p <- (
            p +
            facet_grid(host ~ metric) +
            ylab('MScore') +
            theme(
                axis.text.x = element_text(angle=90, hjust=1),
                aspect.ratio = 1
            )
        )
        width <- size[['width']] * dims[['metric']]
        height <- size[['height']] * (dims[['host']] + extra_space)
    } else {
        # Unknown view: nothing to save
        return(invisible(NULL))
    }

    # Disable legend if dimensions are too large
    if (dims[[col]] > vconf$max_legend_size[[view]][[col]]) {
        p <- p + theme(legend.position='none')
    }

    # Pass the plot explicitly instead of relying on ggplot2's last_plot()
    ggsave(fn, plot=p, width=width, height=height)
}


# Check if dimensions exceed limits
#
# Arguments:
#   view          One of 'default', 'jscore', 'nscore', 'mscore'.
#   dims          List with the entries 'seg', 'host' and 'metric'.
#   graph_config  List whose `max_dimensions[[view]]` entry holds the limits
#                 for 'seg', 'host' and 'metric'.
#
# The segment limit applies to every view. In addition 'default' checks the
# metric limit, 'nscore' the host limit and 'mscore' both; 'jscore' has no
# further limit.
#
# Returns TRUE if a limit that applies to the view is exceeded, FALSE
# otherwise. Stops with an error if the view is unknown.
exceeds_limits <- function(view, dims, graph_config) {
    known_views <- c('default', 'jscore', 'nscore', 'mscore')
    # Validate the view first: an unknown view has no configured limits, so
    # every comparison below would fail with an unrelated error message.
    if (!is.character(view) || length(view) != 1 || !(view %in% known_views)) {
        stop(sprintf("Unknown view: %s", paste(view, collapse = ", ")), call. = FALSE)
    }

    max_dims <- graph_config$max_dimensions[[view]]
    if (is.null(max_dims)) {
        stop(sprintf("No dimension limits configured for view: %s", view), call. = FALSE)
    }

    over <- function(key) {
        dims[[key]] > max_dims[[key]]
    }

    if (over('seg')) {
        return(TRUE)
    }
    switch(view,
        'default' = over('metric'),
        'jscore' = FALSE,
        'nscore' = over('host'),
        'mscore' = over('host') || over('metric')
    )
}


# Create runtime segments of graph_config[['seg_size']] minutes
# NOTE(review): the input path ends in ".json" but is parsed with read.csv --
# confirm the actual file format.
cat_data <- rename_metrics(read.csv(config[['cat_fn']])) # categorized data
if (nrow(cat_data) == 0) {
    stop(sprintf("No categorized data found in %s", config[['cat_fn']]), call. = FALSE)
}
cat_data[['rmin']] <- cat_data[['runtime']] / 60 # runtime in minutes
duration <- max(ceiling(cat_data[['rmin']]))

# Round the upper edge up to a whole number of segments (at least one).
# Otherwise rows in the last, partial segment fall outside the breaks and get
# an NA segment, and a job shorter than one segment leaves cut() with a
# single break, which is an error.
seg_size <- graph_config[['seg_size']]
n_segs <- max(1, ceiling(duration / seg_size))
bins <- seq(0, n_segs * seg_size, seg_size)

# Segments are labelled by their upper edge; include.lowest keeps rows with a
# runtime of exactly 0 in the first segment.
d2 <- cat_data %>%
    mutate(seg = cut(rmin, breaks = bins, labels = bins[-1], include.lowest = TRUE))
# Sum the 'cat' column per file system, metric, node and segment
d3 <- d2 %>%
    group_by(name, metric, host, seg) %>%
    summarise(score = sum(cat)) %>%
    ungroup()

# Number of distinct values per dimension
dimensions <- list()
dimensions[['metric']] <- length(unique(d3$metric))
dimensions[['name']] <- length(unique(d3$name))
dimensions[['host']] <- length(unique(d3$host))
dimensions[['seg']] <- length(unique(d3$seg))

# Pick a subset of the bin edges as x-axis breaks. The step may be fractional;
# R truncates fractional indices when subsetting.
x_breakpoints <- bins[seq(1, length(bins), dimensions[['seg']]/graph_config[['n_x_breakpoints']]+1)]
#x_breakpoints[length(x_breakpoints)+1] <- (dimensions[['seg']]-0)*10

out_dir <- sprintf('%s/%d_%d', global[['fig_dir']], config[['jobid']], config[['crypted_jobid']])
# The directory already exists when the script is re-run for the same job
dir.create(out_dir, recursive=TRUE, showWarnings=FALSE)


# Render every configured colour column / view combination. A combination
# whose dimensions exceed the limits gets a "<png>.skip" marker file instead
# of a plot; in both cases the stale counterpart of an earlier run is removed.
for (col in graph_config[['cols']]) {
    for (view in graph_config[['views']]) {
        fn <- paste0(out_dir, '/', view, '_', col, '.png')
        fn_skip <- paste0(fn, ".skip")

        too_large <- exceeds_limits(view, dimensions, graph_config)

        # Remove the output of the opposite outcome, if present
        stale <- if (too_large) fn else fn_skip
        if (file.exists(stale)) {
            file.remove(stale)
        }

        if (too_large) {
            writeLines("dimensions too large", fn_skip)
            print(sprintf('Skipping %s', fn))
            next
        }

        print(sprintf('Processing %s', fn))
        visualize_categories(fn, global, config, graph_config, d3, view, col, x_breakpoints, dimensions)
    }
}


## TODO
#visualize_rawdata <- function(data) {
#}

#pickle_data <- rename_metrics(read_pickle_file(config[['raw_fn']])) # raw data
#print(head(pickle_data))
#offset = min(pickle_data$timestamp)
#dat = pickle_data[complete.cases(pickle_data),]
#dat$runtime = dat$timestamp - offset
#dat['rmin'] = dat['runtime'] / 60 # runtime in minutes

#visualize_rawdata(dat)