Merge branch 'master' of http://git.hps.vi4io.org/eugen.betke/mistral-io-datasets

2020-08-18 12:57:20 +02:00 · 2020-08-18 12:57:20 +02:00 · 18a084f025
commit 18a084f025
parent ac9a8bff36 305441ebe1
2 changed files with 80 additions and 7 deletions
--- a/scripts/analyse-all.sh
+++ b/scripts/analyse-all.sh
@ -1,7 +1,7 @@
 #!/bin/bash
 for I in job_similarities_*.csv ; do
-  ./plot.R $I
-  mkdir $I.out
-  rm $I.out/*
-  mv *.png *.pdf $I.out
+  ./plot.R "$I" > description.txt   # keep plot.R's printed report with the plots
+  mkdir -p "$I.out"                 # -p: no error when the directory already exists
+  rm -f "$I.out"/*                  # -f: no error when there is nothing to remove
+  mv *.png *.pdf description.txt "$I.out"
 done
--- a/scripts/plot.R
+++ b/scripts/plot.R
@ -3,12 +3,16 @@
 library(ggplot2)
 library(dplyr)
 require(scales)
-
-args = commandArgs(trailingOnly = TRUE)
+#library(hrbrthemes)

 file = "job_similarities_5024292.csv"
 file = "job_similarities_7488914.csv"

+# Color scheme; used (reversed) as the fill gradient of the intersection heatmap.
+plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066")
+
+# Parse job from command line: the first argument is the similarity CSV to analyse.
+args = commandArgs(trailingOnly = TRUE)
+if (length(args) < 1) {
+  # Without this check file would be NA and read.csv() fails with an obscure message.
+  stop("usage: plot.R <job_similarities_*.csv>", call. = FALSE)
+}
 file = args[1]

 data = read.csv(file)
+# The analysis below iterates over levels(data$alg_name). Since R 4.0.0
+# read.csv() no longer turns strings into factors by default, in which case
+# levels() would be NULL and every per-algorithm loop would silently do nothing.
+# as.factor() leaves the column unchanged if it already is a factor.
+data$alg_name = as.factor(data$alg_name)
@ -33,8 +37,77 @@ ggsave("ecdf-0.5.png")
 ggplot(data, aes(similarity), group=alg_name) + geom_histogram(color="black", binwidth=0.025) + aes(fill = alg_name) + facet_grid(alg_name ~ ., switch = 'y') + scale_y_continuous(limits=c(0, 100), oob=squish)  +   scale_color_brewer(palette = "Set2") + ylab("Count (cropped at 100)")
 ggsave("hist-sim.png")

+# Load the per-job inputs: the job codings (the time series per job) and the
+# job metadata table.
+jobData = read.csv("job-io-datasets/datasets/job_codings.csv")
+# user_id and group_id are stored as factors rather than as the type read.csv()
+# picked for them.
+metadata = read.csv("job-io-datasets/datasets/job_metadata.csv") %>%
+  mutate(user_id = as.factor(user_id), group_id = as.factor(group_id))
+
+# Print summary statistics for a set of jobs (despite the name, nothing is
+# plotted here).
+#   jobs: vector of job ids to look up.
+# Reads the global tables jobData and metadata and prints summary() of the rows
+# whose jobid is contained in jobs, first for jobData, then for metadata.
+plotJobs = function(jobs){
+    selectedCodings = jobData$jobid %in% jobs
+    print(summary(jobData[selectedCodings,]))
+    selectedMetadata = metadata$jobid %in% jobs
+    print(summary(metadata[selectedMetadata,]))
+  }
+
+# Store the job ids in a table, each column is one algorithm and each row one
+# rank (row 1 = most similar job).
+dim = length(levels(data$alg_name))
+count = 100
+# Pre-fill with NA: slots stay NA when an algorithm has fewer than `count` jobs.
+result = matrix(NA, nrow=count, ncol=dim)
+colnames(result) = levels(data$alg_name)
+
+# Extract the `count` most similar jobs of every algorithm into the table
+for (level in levels(data$alg_name)){
+    e = data %>% filter(alg_name == level)
+    print(level)
+    print(summary(e))
+    # head() instead of [1:count]: indexing past the end would yield NA indices,
+    # i.e. all-NA rows in the printout and NA job ids passed on to plotJobs().
+    ordered = head(order(e$similarity, decreasing=TRUE), count)
+    print(e[ordered,])
+    # Extract the data for the jobs
+    jobs = e[ordered,"jobid"]
+    result[seq_along(jobs), level] = jobs
+    plotJobs(jobs)
+}
+
+# Compute intersection in a new table: res.intersect[a, b] is the number of job
+# ids that the columns of algorithms a and b in `result` have in common.
+# Every cell is overwritten in the loop below; 0L is only a placeholder.
+res.intersect = matrix(0L, nrow=dim, ncol=dim)
+colnames(res.intersect) = levels(data$alg_name)
+rownames(res.intersect) = levels(data$alg_name)
+
+# Long-format copy of the same counts (one row per algorithm pair) for ggplot.
+tbl.intersect = expand.grid(first=levels(data$alg_name), second=levels(data$alg_name))
+tbl.intersect$intersect = 0
+
+for (l1 in levels(data$alg_name)){
+  # Drop NA slots first: `result` contains NA when an algorithm has fewer jobs
+  # than rows, and intersect() would count NA as one more shared job.
+  jobs1 = result[, l1]
+  jobs1 = jobs1[!is.na(jobs1)]
+  for (l2 in levels(data$alg_name)){
+    jobs2 = result[, l2]
+    jobs2 = jobs2[!is.na(jobs2)]
+    res = length(intersect(jobs1, jobs2))
+    res.intersect[l1,l2] = res
+    tbl.intersect$intersect[tbl.intersect$first == l1 & tbl.intersect$second == l2] = res
+  }
+}
+
+print(res.intersect)
+
+# Heatmap of the pairwise intersections: one tile per algorithm pair, labelled
+# with the count and filled using the reversed plotcolors gradient.
+ggplot(tbl.intersect, aes(first, second, fill=intersect)) +
+  geom_tile() +
+  geom_text(aes(label = round(intersect, 1))) +
+  scale_fill_gradientn(colours = rev(plotcolors))
+ggsave("intersection-heatmap.png")
+
+# Collect the metadata of all selected jobs in a new table: the metadata rows
+# whose jobid appears in an algorithm's column of `result`, tagged with that
+# algorithm in column alg_name. The per-algorithm pieces are built first and
+# bound once instead of growing the table with rbind() inside a loop.
+res.jobs = bind_rows(lapply(levels(data$alg_name), function(alg){
+  md = metadata[metadata$jobid %in% result[, alg],]
+  # rep() keeps this valid when no metadata row matches; combining a single
+  # name with a 0-row table would be an error. Sharing the factor levels keeps
+  # the algorithm order identical to levels(data$alg_name).
+  tag = factor(rep(alg, nrow(md)), levels = levels(data$alg_name))
+  data.frame(alg_name = tag, md, check.names = FALSE)
+}))
+
+# Box plot of total_nodes per algorithm on a log2 y axis with 2^x tick labels.
+ggplot(res.jobs, aes(alg_name, total_nodes, fill=alg_name)) +
+  geom_boxplot() +
+  scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) +
+  ylab("Total nodes") + xlab("Algorithm")
+ggsave("jobs-nodes.png")
+
+# Box plot of elapsed per algorithm on a log10 y axis. The transformation is
+# log10 so that it agrees with the log10 breaks and 10^x tick labels.
+ggplot(res.jobs, aes(alg_name, elapsed, fill=alg_name)) +
+  geom_boxplot() +
+  scale_y_continuous(trans = log10_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
+  ylab("Runtime in s") + xlab("Algorithm")
+ggsave("jobs-elapsed.png")
+
+
+
+
 # scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))

 # stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line")
-
-exit(0)