mistral-io-datasets/scripts/plot.R

162 lines
7.0 KiB
R
Raw Permalink Normal View History

2020-08-17 17:14:58 +00:00
#!/usr/bin/env Rscript
# Plot similarity statistics for one job-similarity dataset.
# Usage: plot.R <job_similarities_<jobid>.csv>

# Parse the input CSV filename from the command line.
args <- commandArgs(trailingOnly = TRUE)
filename <- args[1]

library(ggplot2)
library(dplyr)
# library(), not require(): scales is a hard dependency and require() only
# returns FALSE on failure instead of stopping the script.
library(scales)
library(stringi)
library(stringr)
2020-08-17 17:14:58 +00:00
2020-08-26 14:09:14 +00:00
# Set to TRUE to render individual per-job timeline images
plotjobs <- FALSE

# Color scheme for the intersection heatmap
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000099")

# Fall back to a default dataset when no filename was supplied.
# commandArgs(trailingOnly = TRUE)[1] yields NA (not an unset variable) when
# the script is run without arguments, so a bare exists("filename") check is
# always TRUE here and never triggers; additionally test for NA.
if (!exists("filename") || is.na(filename)) {
  filename <- "./datasets/job_similarities_7488914.csv" # for manual execution
}

print(filename)
# The job ID is the numeric part of the dataset filename
jobID <- str_extract(filename, regex("[0-9]+"))
2020-08-17 17:14:58 +00:00
2020-10-01 16:10:27 +00:00
# Read the similarity table.
# Columns are: jobid alg_id alg_name similarity
data <- read.csv(filename)

# data$alg_id <- as.factor(data$alg_id) # EB: wrong column?
data$alg_name <- as.factor(data$alg_name) # EB: this is the column the script uses

cat("Job count:")
cat(nrow(data))
2020-08-20 11:23:32 +00:00
# Empirical cumulative distribution function (ECDF) of the per-job
# similarity, expressed in percent.
data$sim <- data$similarity * 100
ggplot(data, aes(sim, color = alg_name, group = alg_name)) +
  stat_ecdf(geom = "step") +
  xlab("Similarity in %") +
  ylab("Fraction of jobs") +
  theme(legend.position = c(0.9, 0.5), legend.title = element_blank()) +
  scale_color_brewer(palette = "Set2") # + scale_x_log10() +
ggsave("ecdf.png", width = 8, height = 2.5)

# Per-algorithm similarity histogram; the y axis is clipped at 100 and the
# raw bin count is printed vertically near the top of each bin.
ggplot(data, aes(sim), group = alg_name) +
  geom_histogram(color = "black", binwidth = 2.5) +
  aes(fill = alg_name) +
  facet_grid(alg_name ~ ., switch = "y") +
  xlab("Similarity in %") +
  scale_y_continuous(limits = c(0, 100), oob = squish) +
  scale_color_brewer(palette = "Set2") +
  ylab("Count (cropped at 100)") +
  theme(legend.position = "none") +
  stat_bin(binwidth = 2.5, geom = "text", adj = 1.0, angle = 90,
           colour = "black", size = 3,
           aes(label = ..count.., y = 0 * (..count..) + 95))
ggsave("hist-sim.png", width = 6, height = 5)

# ggplot(data, aes(similarity, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("SIM") + ylab("Fraction of jobs") + theme(legend.position=c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") + xlim(0.5, 1.0)
# ggsave("ecdf-0.5.png", width=8, height=3)
2020-08-17 17:14:58 +00:00
2020-10-01 16:10:27 +00:00
print("Similarity > 0.5")
2020-08-17 17:14:58 +00:00
e = data %>% filter(similarity >= 0.5)
print(summary(e))
2020-08-18 10:54:57 +00:00
# Load the job information, i.e., the time series codings per job
jobData <- read.csv("./datasets/job_codings_v3.csv") # EB: now in the repo; v3 has the correct hexadecimal codings
metadata <- read.csv("./datasets/job_metadata.csv")  # EB: also in the repo

metadata$user_id <- as.factor(metadata$user_id)
metadata$group_id <- as.factor(metadata$group_id)
2020-09-03 12:59:20 +00:00
# Render timelines and extract configuration data for the best-matching jobs
# of one algorithm.
#
# algorithm: the alg_name level currently being processed
# jobs:      job ids of the top-ranked jobs (currently unused; the rows are
#            re-derived from the globals below)
#
# NOTE(review): this function reads the globals `e`, `ordered` and `plotjobs`
# that the extraction loop below sets just before calling it — it only works
# when invoked from that loop. The original body also used the global `level`
# for the prefix and output filename while using the parameter `algorithm`
# for the ks check; `level` and `algorithm` are identical at the only call
# site, so using the parameter consistently preserves behavior.
plotJobs = function(algorithm, jobs){
  # Rows of the per-algorithm data, best matches first
  r = e[ordered, ]
  if (plotjobs) {
    # The KS algorithm has a dedicated plotting script
    if(algorithm == "ks"){
      script = "./scripts/plot-job-timelines-ks.py"
    }else{
      script = "./scripts/plot-job-timelines.py"
    }
    # One "<alg>-<similarity>-" output prefix per job image
    prefix = do.call("sprintf", list("%s-%.4f-", algorithm, r$similarity))
    call = sprintf("%s %s %s", script, paste(r$jobid, collapse=","), paste(prefix, collapse=","))
    print(call)
    system(call)
  }
  # Dump configuration data of the selected jobs for this algorithm
  system(sprintf("./scripts/extract-conf-data.sh %s > jobs-%s.txt", paste(r$jobid, collapse=" "), algorithm))
}
# Store the job ids in a table: one column per algorithm, one row per rank
dim <- length(levels(data$alg_name))
count <- 100
# Will contain the job ids of the `count` most similar jobs per algorithm
result <- matrix(1:(dim * count), nrow = count, ncol = dim)
colnames(result) <- levels(data$alg_name)
# Will collect the user ids behind the `count` best jobs
result.userid <- tibble()
2020-08-18 10:54:57 +00:00
# Extract the 100 most similar jobs per algorithm into the result table.
# Note: `e` and `ordered` are intentionally globals — plotJobs() reads them.
for (level in levels(data$alg_name)) {
  e <- data %>% filter(alg_name == level)
  print(level)
  print(summary(e))

  # Row indices of the `count` best matches, most similar first
  ordered <- order(e$similarity, decreasing = TRUE)[1:count]
  print(e[ordered, ])

  # Remember the job ids for this algorithm
  jobs <- e[ordered, "jobid"]
  result[, level] <- jobs

  # Extract details about the jobs of this algorithm
  tbl <- jobData[jobData$jobid %in% jobs, ]
  print(summary(tbl))
  md <- metadata[metadata$jobid %in% jobs, ]
  print(summary(md))

  # Count how many of the top jobs each user owns, ranked by frequency
  md$value <- 1
  userprofile <- md %>% group_by(user_id) %>% summarise(count = sum(value))
  userprofile <- userprofile[order(userprofile$count, decreasing = TRUE), ]
  userprofile$userrank <- 1:nrow(userprofile)
  result.userid <- rbind(result.userid, cbind(level, userprofile))

  plotJobs(level, jobs)
}
2020-08-20 19:39:42 +00:00
colnames(result.userid) = c("alg_name", "user_id", "count", "userrank")
print(result.userid)
# Create stacked user table
ggplot(result.userid, aes(fill=userrank, y=count, x=alg_name)) + geom_bar(position="stack", stat="identity") + theme(legend.position = "none") + scale_fill_gradientn(colours=rainbow(5)) + ylab("Stacked user count") + xlab("Algorithm") # + scale_fill_gradient(low="blue", high="red", space ="Lab" ) + scale_fill_continuous(type = "viridis")
ggsave("user-ids.png", width=6, height=4)
2020-08-18 10:54:57 +00:00
# Compute the pairwise overlap of the per-algorithm top-100 job sets
res.intersect <- matrix(1:(dim * dim), nrow = dim, ncol = dim)
colnames(res.intersect) <- levels(data$alg_name)
rownames(res.intersect) <- levels(data$alg_name)
tbl.intersect <- expand.grid(first = levels(data$alg_name), second = levels(data$alg_name))
tbl.intersect$intersect <- 0

for (l1 in levels(data$alg_name)) {
  for (l2 in levels(data$alg_name)) {
    res <- length(intersect(result[, l1], result[, l2]))
    res.intersect[l1, l2] <- res
    tbl.intersect[tbl.intersect$first == l1 & tbl.intersect$second == l2, ]$intersect <- res
  }
}
print(res.intersect)

# Heatmap of the intersection counts between algorithm pairs
ggplot(tbl.intersect, aes(first, second, fill = intersect)) +
  geom_tile() +
  geom_text(aes(label = round(intersect, 1))) +
  scale_fill_gradientn(colours = rev(plotcolors)) +
  xlab("") +
  ylab("") +
  theme(legend.position = "bottom", legend.title = element_blank())
ggsave("intersection-heatmap.png", width = 5, height = 5)
2020-08-18 10:54:57 +00:00
# Collect the metadata of all top jobs into one long table
res.jobs <- tibble()
for (alg_name in levels(data$alg_name)) {
  res.jobs <- rbind(res.jobs, cbind(alg_name, metadata[metadata$jobid %in% result[, alg_name], ]))
}

# Boxplot of node counts per algorithm; the dashed line marks the reference job
jobRef <- metadata[metadata$jobid == jobID, ]$total_nodes
ggplot(res.jobs, aes(alg_name, total_nodes, fill = alg_name)) +
  geom_boxplot() +
  scale_y_continuous(trans = log2_trans(),
                     breaks = trans_breaks("log2", function(x) 2^x),
                     labels = trans_format("log2", math_format(2^.x))) +
  theme(legend.position = "none") +
  xlab("Algorithm") +
  ylab("Job node count") +
  geom_hline(yintercept = jobRef, linetype = "dashed", color = "red", size = 0.5)
ggsave("jobs-nodes.png", width = 6, height = 4)

# Boxplot of elapsed runtime per algorithm; the dashed line marks the reference job
jobRef <- metadata[metadata$jobid == jobID, ]$elapsed
ggplot(res.jobs, aes(alg_name, elapsed, fill = alg_name)) +
  geom_boxplot() +
  ylab("Job runtime in s") +
  xlab("Algorithm") +
  theme(legend.position = "none") +
  ylim(0, max(res.jobs$elapsed)) +
  geom_hline(yintercept = jobRef, linetype = "dashed", color = "red", size = 0.5)
# scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))
ggsave("jobs-elapsed.png", width = 6, height = 4)

# scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))

# stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line")