mistral-io-datasets/scripts/plot.R

#!/usr/bin/env Rscript

library(ggplot2)
library(dplyr)
require(scales)
#library(hrbrthemes)

# Fallback input, used only when no file is given on the command line (e.g.
# when the script is source()d interactively). Another data set that was used
# with this script is "job_similarities_5024292.csv".
file <- "job_similarities_7488914.csv"

# Color scheme
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066")

# Parse job from command line: the first trailing argument is the CSV to read.
# Without this guard args[1] is NA and read.csv() fails with an unhelpful error.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) >= 1) {
  file <- args[1]
} else {
  message("No input file given, falling back to ", file)
}

data <- read.csv(file)
# Columns are: jobid alg_id alg_name similarity

data$alg_id <- as.factor(data$alg_id)
# read.csv() returns strings as character vectors since R 4.0.0, and
# levels() of a character vector is NULL. The rest of the script iterates over
# levels(data$alg_name), so make the column a factor explicitly.
data$alg_name <- as.factor(data$alg_name)
print(nrow(data))

# FILTER, TODO
data <- data %>% filter(similarity <= 1.0)

# Empirical cumulative distribution function (ECDF) of the similarity,
# one curve per algorithm. The plot object is handed to ggsave() explicitly so
# that saving does not depend on whichever plot was created last.
p_ecdf <- ggplot(data, aes(similarity, color = alg_name, group = alg_name)) +
  stat_ecdf(geom = "step") +
  xlab("SIM") +
  ylab("Fraction of jobs") +
  theme(legend.position = "bottom") +
  scale_color_brewer(palette = "Set2")
ggsave("ecdf.png", plot = p_ecdf)

# Same ECDF, restricted to jobs with a similarity of at least 0.5
e <- data %>% filter(similarity >= 0.5)
p_ecdf_high <- ggplot(e, aes(similarity, color = alg_name, group = alg_name)) +
  stat_ecdf(geom = "step") +
  xlab("SIM") +
  ylab("Fraction of jobs") +
  theme(legend.position = "bottom") +
  scale_color_brewer(palette = "Set2")
print(summary(e))
ggsave("ecdf-0.5.png", plot = p_ecdf_high)

# Histogram of the similarity, one facet per algorithm. Counts above 100 are
# squished to 100 so that a few large bins do not hide the rest.
# The bars are colored through the `fill` aesthetic, so the Set2 palette has to
# be applied to the fill scale (a color scale has no effect here because the
# bar outline is a constant "black").
p_hist <- ggplot(data, aes(similarity)) +
  geom_histogram(color = "black", binwidth = 0.025) +
  aes(fill = alg_name) +
  facet_grid(alg_name ~ ., switch = "y") +
  scale_y_continuous(limits = c(0, 100), oob = squish) +
  scale_fill_brewer(palette = "Set2") +
  ylab("Count (cropped at 100)")
ggsave("hist-sim.png", plot = p_hist)

# Read the per-job inputs: the job codings (the time series per job) and the
# job metadata. The user and group ids in the metadata are identifiers, not
# quantities, so both columns are stored as factors.
jobData <- read.csv("job-io-datasets/datasets/job_codings.csv")
metadata <- read.csv("job-io-datasets/datasets/job_metadata.csv")
metadata[c("user_id", "group_id")] <- lapply(
  metadata[c("user_id", "group_id")],
  as.factor
)

# Print summaries for a set of jobs and render one timeline plot per job.
#
# Arguments:
#   jobs    Vector of job ids. Summaries of the matching rows of the global
#           tables `jobData` and `metadata` are printed.
#   ranked  Data frame with the columns `jobid` and `similarity`; row i
#           describes the i-th job to plot. Defaults to `e[ordered, ]`, i.e.
#           the globals set by the calling loop.
#   label   First component of the output prefix. Defaults to the global
#           `level` (the algorithm currently processed by the calling loop).
#
# For every job, scripts/plot-single-job.py is run with the job id and the
# prefix "<label>-<similarity>-<rank>-". Returns NULL invisibly.
plotJobs <- function(jobs, ranked = e[ordered, ], label = level) {
  # plot details about the jobs of a given algorithm
  tbl <- jobData[jobData$jobid %in% jobs, ]
  print(summary(tbl))
  md <- metadata[metadata$jobid %in% jobs, ]
  print(summary(md))

  # print the job timeline
  # seq_along() yields an empty sequence for empty input, whereas
  # 1:length(jobs) would iterate over c(1, 0).
  for (row in seq_along(jobs)) {
    job <- ranked[["jobid"]][row]
    # The ranking may be padded with NA when fewer jobs than requested exist;
    # there is nothing to plot for such an entry.
    if (is.na(job)) {
      next
    }
    prefix <- sprintf("%s-%f-%.0f-", label, ranked[["similarity"]][row], row)
    # Quote both arguments: the label comes from the input data and may
    # contain spaces or shell metacharacters.
    system(sprintf(
      "scripts/plot-single-job.py %s %s",
      shQuote(as.character(job)),
      shQuote(prefix)
    ))
  }
  invisible(NULL)
}

# Store the job ids in a table, each column is one algorithm.
# as.factor() makes this work whether alg_name was read as a factor or as a
# character vector (levels() of a character vector is NULL).
algorithms <- levels(as.factor(data$alg_name))
dim <- length(algorithms)
count <- 100
# Pre-fill with NA rather than running numbers, so that an algorithm with
# fewer than `count` jobs leaves visibly missing entries instead of fake ids.
result <- matrix(NA, nrow = count, ncol = dim)
colnames(result) <- algorithms

# Extract the 100 most similar jobs into the table
for (level in algorithms) {
  e <- data %>% filter(alg_name == level)
  print(level)
  print(summary(e))
  # head() caps the ranking at the number of available jobs; indexing with
  # [1:count] would produce NA indices when there are fewer than `count`.
  ordered <- head(order(e$similarity, decreasing = TRUE), count)
  print(e[ordered, ])
  # Extract the data for the jobs
  jobs <- e[ordered, "jobid"]
  result[seq_along(jobs), level] <- jobs
  # plotJobs() reads `e`, `ordered` and `level` from this loop; skip it when
  # the algorithm has no jobs at all.
  if (length(jobs) > 0) {
    plotJobs(jobs)
  }
}

# Compute intersection in a new table: for every pair of algorithms, the
# number of job ids that appear in both of their columns of `result`.
# The algorithm names are taken from `result` itself so that the matrix below
# always matches the columns that are actually present.
algorithms <- colnames(result)
n_alg <- length(algorithms)

res.intersect <- matrix(
  0L,
  nrow = n_alg, ncol = n_alg,
  dimnames = list(algorithms, algorithms)
)

for (l1 in algorithms) {
  for (l2 in algorithms) {
    ids1 <- result[, l1]
    ids2 <- result[, l2]
    # Missing entries (NA) are not jobs; without dropping them, two columns
    # that both contain NA would count it as one shared job.
    res.intersect[l1, l2] <- length(
      intersect(ids1[!is.na(ids1)], ids2[!is.na(ids2)])
    )
  }
}

# Long format of the same numbers for plotting: one row per (first, second)
# pair, looked up in the matrix by row and column name.
tbl.intersect <- expand.grid(first = algorithms, second = algorithms)
tbl.intersect$intersect <- res.intersect[cbind(
  as.character(tbl.intersect$first),
  as.character(tbl.intersect$second)
)]

print(res.intersect)

# Plot heatmap about intersection
p_heatmap <- ggplot(tbl.intersect, aes(first, second, fill = intersect)) +
  geom_tile() +
  geom_text(aes(label = round(intersect, 1))) +
  scale_fill_gradientn(colours = rev(plotcolors))
ggsave("intersection-heatmap.png", plot = p_heatmap)

# Collect the metadata of all jobs in a new table: for every algorithm (one
# column of `result`), take the metadata rows of its selected jobs and tag them
# with the algorithm name in a leading `alg_name` column.
# The pieces are built per algorithm and combined once with bind_rows()
# instead of growing the table with rbind() inside a loop. rep() keeps the tag
# column at zero length when no metadata row matches, where cbind() with a
# single string would fail.
res.jobs <- bind_rows(lapply(colnames(result), function(alg) {
  rows <- metadata[metadata$jobid %in% result[, alg], ]
  cbind(alg_name = rep(alg, nrow(rows)), rows)
}))

# Number of nodes per job, log2-scaled axis with ticks at powers of two
p_nodes <- ggplot(res.jobs, aes(alg_name, total_nodes, fill = alg_name)) +
  geom_boxplot() +
  scale_y_continuous(
    trans = log2_trans(),
    breaks = trans_breaks("log2", function(x) 2^x),
    labels = trans_format("log2", math_format(2^.x))
  )
ggsave("jobs-nodes.png", plot = p_nodes)

# Runtime per job. The axis is log2-transformed while the ticks and labels are
# placed at powers of ten.
p_elapsed <- ggplot(res.jobs, aes(alg_name, elapsed, fill = alg_name)) +
  geom_boxplot() +
  scale_y_continuous(
    trans = log2_trans(),
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x))
  ) +
  ylab("Runtime in s") +
  xlab("Algorithm")
ggsave("jobs-elapsed.png", plot = p_elapsed)


# scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))

# stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line")
Overview plot 2020-08-17 17:14:58 +00:00			`#!/usr/bin/env Rscript`

			`library(ggplot2)`
			`library(dplyr)`
			`require(scales)`
New diagrams. 2020-08-18 10:54:57 +00:00			`#library(hrbrthemes)`
Overview plot 2020-08-17 17:14:58 +00:00
			`file = "job_similarities_5024292.csv"`
			`file = "job_similarities_7488914.csv"`

New diagrams. 2020-08-18 10:54:57 +00:00			`# Color scheme`
			`plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066")`

			`# Parse job from command line`
			`args = commandArgs(trailingOnly = TRUE)`
Overview plot 2020-08-17 17:14:58 +00:00			`file = args[1]`

			`data = read.csv(file)`
			`# Columns are: jobid alg_id alg_name similarity`

			`data$alg_id = as.factor(data$alg_id)`
			`print(nrow(data))`

			`# FILTER, TODO`
			`data = data %>% filter(similarity <= 1.0)`

			`# empirical cummulative density function (ECDF)`
			`ggplot(data, aes(similarity, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("SIM") + ylab("Fraction of jobs") + theme(legend.position="bottom") + scale_color_brewer(palette = "Set2")`
			`ggsave("ecdf.png")`

			`e = data %>% filter(similarity >= 0.5)`
			`ggplot(e, aes(similarity, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("SIM") + ylab("Fraction of jobs") + theme(legend.position="bottom") + scale_color_brewer(palette = "Set2")`
			`print(summary(e))`
			`ggsave("ecdf-0.5.png")`

			`# histogram for the jobs`
			`ggplot(data, aes(similarity), group=alg_name) + geom_histogram(color="black", binwidth=0.025) + aes(fill = alg_name) + facet_grid(alg_name ~ ., switch = 'y') + scale_y_continuous(limits=c(0, 100), oob=squish) + scale_color_brewer(palette = "Set2") + ylab("Count (cropped at 100)")`
			`ggsave("hist-sim.png")`

New diagrams. 2020-08-18 10:54:57 +00:00			`# load job information, i.e., the time series per job`
			`jobData = read.csv("job-io-datasets/datasets/job_codings.csv")`
			`metadata = read.csv("job-io-datasets/datasets/job_metadata.csv")`
			`metadata$user_id = as.factor(metadata$user_id)`
			`metadata$group_id = as.factor(metadata$group_id)`

			`plotJobs = function(jobs){`
			`# plot details about the jobs of a given algorithm`
			`tbl = jobData[jobData$jobid %in% jobs,]`
			`print(summary(tbl))`
			`#print(tbl)`
			`md = metadata[metadata$jobid %in% jobs,]`
			`print(summary(md))`
Plot the 100 jobs. 2020-08-18 14:26:29 +00:00
			`# print the job timeline`
			`r = e[ordered, ]`
			`for (row in 1:length(jobs)) {`
Tune size + output 2020-08-18 14:46:05 +00:00			`prefix = sprintf("%s-%f-%.0f-", level, r[row, "similarity"], row)`
Plot the 100 jobs. 2020-08-18 14:26:29 +00:00			`job = r[row, "jobid"]`
			`system(sprintf("scripts/plot-single-job.py %s %s", job, prefix))`
			`}`
New diagrams. 2020-08-18 10:54:57 +00:00			`}`

			`# Store the job ids in a table, each column is one algorithm`
			`dim = length(levels(data$alg_name))`
			`count = 100`
			`result = matrix(1:(dim*count), nrow=count, ncol=dim)`
			`colnames(result) = levels(data$alg_name)`

			`# Extract the 100 most similar jobs into the table`
			`for (level in levels(data$alg_name)){`
			`e = data %>% filter(alg_name == level)`
			`print(level)`
			`print(summary(e))`
			`ordered = order(e$similarity, decreasing=TRUE)[1:count]`
			`print(e[ordered,])`
			`# Extract the data for the jobs`
			`jobs = e[ordered,"jobid"]`
			`result[, level] = jobs`
			`plotJobs(jobs)`
			`}`

			`# Compute intersection in a new table`
			`res.intersect = matrix(1:(dim*dim), nrow=dim, ncol=dim)`
			`colnames(res.intersect) = levels(data$alg_name)`
			`rownames(res.intersect) = levels(data$alg_name)`

			`tbl.intersect = expand.grid(first=levels(data$alg_name), second=levels(data$alg_name))`
			`tbl.intersect$intersect = 0`
Overview plot 2020-08-17 17:14:58 +00:00
New diagrams. 2020-08-18 10:54:57 +00:00			`for (l1 in levels(data$alg_name)){`
			`for (l2 in levels(data$alg_name)){`
			`res = length(intersect(result[,l1], result[,l2]))`
			`res.intersect[l1,l2] = res`
			`tbl.intersect[tbl.intersect$first == l1 & tbl.intersect$second == l2, ]$intersect = res`
			`}`
			`}`

			`print(res.intersect)`

			`# Plot heatmap about intersection`
			`ggplot(tbl.intersect, aes(first, second, fill=intersect)) + geom_tile() + geom_text(aes(label = round(intersect, 1))) + scale_fill_gradientn(colours = rev(plotcolors))`
			`ggsave("intersection-heatmap.png")`

			`# Collect the metadata of all jobs in a new table`
			`res.jobs = tibble()`
			`for (alg_name in levels(data$alg_name)){`
			`res.jobs = rbind(res.jobs, cbind(alg_name, metadata[metadata$jobid %in% result[, alg_name],]))`
			`}`

			`ggplot(res.jobs, aes(alg_name, total_nodes, fill=alg_name)) + geom_boxplot() + scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))`
			`ggsave("jobs-nodes.png")`

			`ggplot(res.jobs, aes(alg_name, elapsed, fill=alg_name)) + geom_boxplot() + scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) + ylab("Runtime in s") + xlab("Algorithm")`
			`ggsave("jobs-elapsed.png")`




			`# scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))`
Overview plot 2020-08-17 17:14:58 +00:00
New diagrams. 2020-08-18 10:54:57 +00:00			`# stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line")`