Review notes (text before the first "diff --git" line is ignored by git apply / patch):
NOTE(review): this patch was recovered from a copy whose line breaks were collapsed.
Indentation and the position of blank context lines were reconstructed from the hunk
line counts -- confirm them against the target tree before applying.
The post-image blob ids on the "index" lines predate the review fixes below.

diff --git a/scripts/analyse-all.sh b/scripts/analyse-all.sh
index d3aeea9..67030b3 100755
--- a/scripts/analyse-all.sh
+++ b/scripts/analyse-all.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 for I in job_similarities_*.csv ; do
-  ./plot.R $I
-  mkdir $I.out
-  rm $I.out/*
-  mv *.png *.pdf $I.out
+  ./plot.R "$I" > description.txt
+  mkdir -p "$I.out"
+  rm -f "$I.out"/*
+  mv *.png *.pdf description.txt "$I.out"
 done
diff --git a/scripts/plot.R b/scripts/plot.R
index 4854ef6..6e264fa 100755
--- a/scripts/plot.R
+++ b/scripts/plot.R
@@ -3,12 +3,19 @@
 library(ggplot2)
 library(dplyr)
 require(scales)
-
-args = commandArgs(trailingOnly = TRUE)
+#library(hrbrthemes)
 
 file = "job_similarities_5024292.csv"
 file = "job_similarities_7488914.csv"
 
+# Color scheme
+plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066")
+
+# Parse job from command line; fail with a usage message instead of read.csv(NA)
+args = commandArgs(trailingOnly = TRUE)
+if (length(args) < 1) {
+  stop("usage: plot.R <job_similarities_*.csv>", call. = FALSE)
+}
 file = args[1]
 
 data = read.csv(file)
@@ -33,8 +40,80 @@ ggsave("ecdf-0.5.png")
 
 ggplot(data, aes(similarity), group=alg_name) + geom_histogram(color="black", binwidth=0.025) + aes(fill = alg_name) + facet_grid(alg_name ~ ., switch = 'y') + scale_y_continuous(limits=c(0, 100), oob=squish) + scale_color_brewer(palette = "Set2") + ylab("Count (cropped at 100)")
 ggsave("hist-sim.png")
+
+# Load job information, i.e., the time series per job
+jobData = read.csv("job-io-datasets/datasets/job_codings.csv")
+metadata = read.csv("job-io-datasets/datasets/job_metadata.csv")
+metadata$user_id = as.factor(metadata$user_id)
+metadata$group_id = as.factor(metadata$group_id)
+
+# Print summaries of the codings and of the metadata of the given job ids
+plotJobs = function(jobs){
+  tbl = jobData[jobData$jobid %in% jobs,]
+  print(summary(tbl))
+  #print(tbl)
+  md = metadata[metadata$jobid %in% jobs,]
+  print(summary(md))
+}
+
+# Algorithm names. levels(factor(...)) works for a factor column as well as for
+# a character column (read.csv default since R 4.0, where levels() gives NULL)
+algorithms = levels(factor(data$alg_name))
+dim = length(algorithms)
+count = 100
+
+# Job ids of the `count` most similar jobs, one list entry per algorithm.
+# A list instead of a matrix: an algorithm with fewer than `count` rows keeps
+# a shorter vector instead of being padded with NA job ids
+result = list()
+for (level in algorithms){
+  e = data %>% filter(alg_name == level)
+  print(level)
+  print(summary(e))
+  # head() never indexes past the available rows, unlike [1:count]
+  ordered = head(order(e$similarity, decreasing=TRUE), count)
+  print(e[ordered,])
+  # Extract the data for the jobs
+  jobs = e[ordered,"jobid"]
+  result[[level]] = jobs
+  plotJobs(jobs)
+}
+
+# Size of the pairwise intersection of the selected job sets, kept as a matrix
+# (printed below) and as a long table (input of the heatmap)
+res.intersect = matrix(0, nrow=dim, ncol=dim, dimnames=list(algorithms, algorithms))
+
+tbl.intersect = expand.grid(first=algorithms, second=algorithms)
+tbl.intersect$intersect = 0
+
+for (l1 in algorithms){
+  for (l2 in algorithms){
+    res = length(intersect(result[[l1]], result[[l2]]))
+    res.intersect[l1,l2] = res
+    tbl.intersect[tbl.intersect$first == l1 & tbl.intersect$second == l2, ]$intersect = res
+  }
+}
+
+print(res.intersect)
+
+# Plot heatmap about intersection
+ggplot(tbl.intersect, aes(first, second, fill=intersect)) + geom_tile() + geom_text(aes(label = round(intersect, 1))) + scale_fill_gradientn(colours = rev(plotcolors))
+ggsave("intersection-heatmap.png")
+
+# Collect the metadata of the selected jobs, one chunk per algorithm, and
+# combine them once instead of growing a table with rbind() inside a loop
+res.jobs = bind_rows(lapply(algorithms, function(alg){
+  md = metadata[metadata$jobid %in% result[[alg]],]
+  md$alg_name = rep(alg, nrow(md))
+  md
+}))
+
+ggplot(res.jobs, aes(alg_name, total_nodes, fill=alg_name)) + geom_boxplot() + scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
+ggsave("jobs-nodes.png")
+
+# log10 transform so that it matches the log10 breaks and labels
+ggplot(res.jobs, aes(alg_name, elapsed, fill=alg_name)) + geom_boxplot() + scale_y_continuous(trans = log10_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) + ylab("Runtime in s") + xlab("Algorithm")
+ggsave("jobs-elapsed.png")
 
 # scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
-# stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line")
-
-exit(0)
+# stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line")