118 lines
		
	
	
		
			4.6 KiB
		
	
	
	
		
			R
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			118 lines
		
	
	
		
			4.6 KiB
		
	
	
	
		
			R
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/env Rscript
 | |
| 
 | |
| library(ggplot2)
 | |
| library(dplyr)
 | |
| require(scales)
 | |
| #library(hrbrthemes)
 | |
| 
 | |
# Input file used when no path is given on the command line.
# (Another sample data set: "job_similarities_5024292.csv".)
file = "job_similarities_7488914.csv"

# Color scheme
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066")

# Parse job from command line; the first argument, if present, overrides the
# default above. Without this guard args[1] is NA and read.csv() fails.
args = commandArgs(trailingOnly = TRUE)
if (length(args) >= 1) {
    file = args[1]
}

data = read.csv(file)
# Columns are: jobid alg_id alg_name similarity

data$alg_id = as.factor(data$alg_id)
# read.csv() keeps strings as character since R 4.0.0, but the rest of the
# script iterates over levels(data$alg_name), which is NULL for a character
# column. Converting explicitly works on old and new R versions alike.
data$alg_name = as.factor(data$alg_name)
cat("Job count:")
cat(nrow(data))
 | |
| 
 | |
# Empirical cumulative distribution function (ECDF) of the similarity,
# drawn as one step curve per algorithm.
ecdfPlot = ggplot(data, aes(similarity, color=alg_name, group=alg_name)) +
    stat_ecdf(geom = "step") +
    xlab("SIM") +
    ylab("Fraction of jobs") +
    theme(legend.position=c(0.9, 0.4)) +
    scale_color_brewer(palette = "Set2")

# Printing the plot makes it the "last plot" that ggsave() writes out.
ecdfPlot
ggsave("ecdf.png", width=8, height=3)

# Same plot restricted to similarities between 0.5 and 1.0.
# NOTE(review): xlim() removes observations outside the limits before the
# ECDF is computed, so the fractions here are presumably relative to the jobs
# inside [0.5, 1.0] only -- confirm that this is intended.
ecdfPlot + xlim(0.5, 1.0)
ggsave("ecdf-0.5.png", width=8, height=3)

# Summary of all rows with a similarity of at least 0.5
e = filter(data, similarity >= 0.5)
print(summary(e))
 | |
| 
 | |
# Histogram of the similarity for the jobs, one facet row per algorithm.
# - `group` and `fill` are mapped inside aes(); an argument given to ggplot()
#   outside of aes() is not a mapping and is ignored.
# - The bars are colored through `fill`, so the brewer palette has to be
#   applied to the fill scale (a color scale has nothing to act on here).
# - The y axis is capped at 100; oob=squish clamps taller bars to the limit
#   instead of dropping them, and the text layer prints the real bin count.
ggplot(data, aes(similarity, fill = alg_name, group = alg_name)) +
    geom_histogram(color="black", binwidth=0.025) +
    facet_grid(alg_name ~ ., switch = 'y') +
    scale_y_continuous(limits=c(0, 100), oob=squish) +
    scale_fill_brewer(palette = "Set2") +
    ylab("Count (cropped at 100)") +
    theme(legend.position = "none") +
    # y = 0 * count + 20 places every label at the constant height 20 while
    # still yielding one value per bin.
    stat_bin(binwidth=0.025, geom="text", angle = 90, colour="black", size=3, aes(label=..count.., y=0*(..count..)+20))
ggsave("hist-sim.png")
 | |
| 
 | |
# Load the per-job information: the codings (i.e., the time series per job)
# and the job metadata.
jobData = read.csv("job-io-datasets/datasets/job_codings.csv")
metadata = read.csv("job-io-datasets/datasets/job_metadata.csv")

# User and group ids are identifiers, not quantities: treat them as factors.
idColumns = c("user_id", "group_id")
metadata[idColumns] = lapply(metadata[idColumns], as.factor)
 | |
| 
 | |
# Print details about the jobs of a given algorithm.
#
# jobs: vector of job ids. The rows of the global tables `jobData` and
#       `metadata` whose `jobid` is contained in `jobs` are summarized.
#
# Returns NULL invisibly; the function is called for its printed output.
plotJobs = function(jobs){
    # summary of the job codings of the selected jobs
    tbl = jobData[jobData$jobid %in% jobs,]
    print(summary(tbl))
    #print(tbl)

    # summary of the metadata of the selected jobs
    md = metadata[metadata$jobid %in% jobs,]
    print(summary(md))

    # Printing the job timelines is disabled. The code below depends on the
    # variables `e`, `ordered` and `level` of the calling loop; they have to be
    # passed in as arguments before it is re-enabled, otherwise the function
    # only works when called from inside that loop.
    #r = e[ordered, ]
    #prefix = do.call("sprintf", list("%s-%.0f-", level, r$similarity))
    #system(sprintf("scripts/plot-single-job.py %s %s", paste(r$jobid, collapse=","), paste(prefix, collapse=",")))
    invisible(NULL)
}
 | |
| 
 | |
# Store the job ids in a table, each column is one algorithm.
# The table starts out as NA so that a column of an algorithm with fewer than
# `count` jobs is visibly incomplete instead of holding placeholder numbers.
dim = length(levels(data$alg_name))
count = 100
result = matrix(NA, nrow=count, ncol=dim)
colnames(result) = levels(data$alg_name)

# Extract the 100 most similar jobs into the table
for (level in levels(data$alg_name)){
    e = data %>% filter(alg_name == level)
    print(level)
    print(summary(e))
    # head() returns at most `count` indices. Indexing with [1:count] would pad
    # the index vector with NA when there are fewer rows, which produces
    # all-NA rows and NA job ids further down.
    ordered = head(order(e$similarity, decreasing=TRUE), count)
    print(e[ordered,])
    # Extract the data for the jobs
    jobs = e[ordered,"jobid"]
    # fill only as many rows as there are jobs; the remainder stays NA
    result[seq_along(jobs), level] = jobs
    plotJobs(jobs)
}
 | |
| 
 | |
# Compute, for every pair of algorithms, how many job ids their columns of
# `result` have in common.
algNames = levels(data$alg_name)

# Number of job ids shared by the columns `first` and `second` of `result`.
# `result` may contain NA where an algorithm has fewer jobs than the table
# has rows; NA is not a job id and must not be counted as a common job.
countSharedJobs = function(first, second){
    common = intersect(result[, first], result[, second])
    sum(!is.na(common))
}

# Long format (one row per pair) for plotting
tbl.intersect = expand.grid(first=algNames, second=algNames)
tbl.intersect$intersect = vapply(seq_len(nrow(tbl.intersect)), function(i){
    countSharedJobs(as.character(tbl.intersect$first[i]), as.character(tbl.intersect$second[i]))
}, numeric(1))

# Matrix format for printing. expand.grid() varies `first` fastest, which
# matches the column-major fill of matrix(): rows are `first`, columns `second`.
res.intersect = matrix(tbl.intersect$intersect, nrow=length(algNames), ncol=length(algNames), dimnames=list(algNames, algNames))

print(res.intersect)

# Plot heatmap about intersection
ggplot(tbl.intersect, aes(first, second, fill=intersect)) +
    geom_tile() +
    geom_text(aes(label = round(intersect, 1))) +
    scale_fill_gradientn(colours = rev(plotcolors))
ggsave("intersection-heatmap.png", width=6, height=5)
 | |
| 
 | |
# Collect the metadata of all selected jobs in a new table; the column
# `alg_name` records for which algorithm a job was selected.
# The per-algorithm tables are built first and combined once, instead of
# growing one table with rbind() inside a loop.
res.jobs = bind_rows(lapply(levels(data$alg_name), function(name){
    md = metadata[metadata$jobid %in% result[, name],]
    # rep() keeps the label column at the length of `md`, so an algorithm
    # without any matching metadata contributes zero rows instead of failing.
    cbind(alg_name = rep(name, nrow(md)), md)
}))

# Number of nodes per algorithm, log2 axis labelled with powers of two
ggplot(res.jobs, aes(alg_name, total_nodes, fill=alg_name)) +
    geom_boxplot() +
    scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
ggsave("jobs-nodes.png")

# Runtime per algorithm. The transformation is log10 so that it agrees with
# the breaks and labels, which are powers of ten.
ggplot(res.jobs, aes(alg_name, elapsed, fill=alg_name)) +
    geom_boxplot() +
    scale_y_continuous(trans = log10_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
    ylab("Runtime in s") +
    xlab("Algorithm")
ggsave("jobs-elapsed.png")
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| # scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
 | |
| 
 | |
| # stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line")
 |