New diagrams.
This commit is contained in:
		
							parent
							
								
									a7fab7d233
								
							
						
					
					
						commit
						2c6d542a79
					
				| @ -1,7 +1,7 @@ | |||||||
| #!/bin/bash | #!/bin/bash | ||||||
| for I in job_similarities_*.csv ; do | for I in job_similarities_*.csv ; do | ||||||
|   ./plot.R $I |   ./plot.R $I > description.txt | ||||||
|   mkdir $I.out |   mkdir $I.out | ||||||
|   rm $I.out/* |   rm $I.out/* | ||||||
|   mv *.png *.pdf $I.out |   mv *.png *.pdf description.txt $I.out | ||||||
| done | done | ||||||
|  | |||||||
							
								
								
									
										81
									
								
								plot.R
									
									
									
									
									
								
							
							
						
						
									
										81
									
								
								plot.R
									
									
									
									
									
								
							| @ -3,12 +3,16 @@ | |||||||
| library(ggplot2) | library(ggplot2) | ||||||
| library(dplyr) | library(dplyr) | ||||||
| require(scales) | require(scales) | ||||||
| 
 | #library(hrbrthemes) | ||||||
| args = commandArgs(trailingOnly = TRUE) |  | ||||||
| 
 | 
 | ||||||
| file = "job_similarities_5024292.csv" | file = "job_similarities_5024292.csv" | ||||||
| file = "job_similarities_7488914.csv" | file = "job_similarities_7488914.csv" | ||||||
| 
 | 
 | ||||||
|  | # Color scheme | ||||||
|  | plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066") | ||||||
|  | 
 | ||||||
|  | # Read the input CSV file name from the command line | ||||||
|  | args = commandArgs(trailingOnly = TRUE) | ||||||
| file = args[1] | file = args[1] | ||||||
| 
 | 
 | ||||||
| data = read.csv(file) | data = read.csv(file) | ||||||
| @ -33,8 +37,77 @@ ggsave("ecdf-0.5.png") | |||||||
| ggplot(data, aes(similarity), group=alg_name) + geom_histogram(color="black", binwidth=0.025) + aes(fill = alg_name) + facet_grid(alg_name ~ ., switch = 'y') + scale_y_continuous(limits=c(0, 100), oob=squish)  +   scale_color_brewer(palette = "Set2") + ylab("Count (cropped at 100)") | ggplot(data, aes(similarity), group=alg_name) + geom_histogram(color="black", binwidth=0.025) + aes(fill = alg_name) + facet_grid(alg_name ~ ., switch = 'y') + scale_y_continuous(limits=c(0, 100), oob=squish)  +   scale_color_brewer(palette = "Set2") + ylab("Count (cropped at 100)") | ||||||
| ggsave("hist-sim.png") | ggsave("hist-sim.png") | ||||||
| 
 | 
 | ||||||
|  | # load job information, i.e., the time series per job | ||||||
|  | jobData = read.csv("job-io-datasets/datasets/job_codings.csv") | ||||||
|  | metadata = read.csv("job-io-datasets/datasets/job_metadata.csv") | ||||||
|  | metadata$user_id = as.factor(metadata$user_id) | ||||||
|  | metadata$group_id = as.factor(metadata$group_id) | ||||||
|  | 
 | ||||||
|  | plotJobs = function(jobs){ | ||||||
|  |     # print summaries of the codings and the metadata of the given jobs | ||||||
|  |     tbl = jobData[jobData$jobid %in% jobs,] | ||||||
|  |     print(summary(tbl)) | ||||||
|  |     #print(tbl) | ||||||
|  |     md = metadata[metadata$jobid %in% jobs,] | ||||||
|  |     print(summary(md)) | ||||||
|  |   } | ||||||
|  | 
 | ||||||
|  | # Store the job ids in a table, each column is one algorithm | ||||||
|  | dim = length(levels(data$alg_name)) | ||||||
|  | count = 100 | ||||||
|  | result = matrix(1:(dim*count), nrow=count, ncol=dim) | ||||||
|  | colnames(result) = levels(data$alg_name) | ||||||
|  | 
 | ||||||
|  | # Extract the 100 most similar jobs into the table | ||||||
|  | for (level in levels(data$alg_name)){ | ||||||
|  |     e = data %>% filter(alg_name == level) | ||||||
|  |     print(level) | ||||||
|  |     print(summary(e)) | ||||||
|  |     ordered = order(e$similarity, decreasing=TRUE)[1:count] | ||||||
|  |     print(e[ordered,]) | ||||||
|  |     # Extract the data for the jobs | ||||||
|  |     jobs = e[ordered,"jobid"] | ||||||
|  |     result[, level] = jobs | ||||||
|  |     plotJobs(jobs) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | # Compute intersection in a new table | ||||||
|  | res.intersect = matrix(1:(dim*dim), nrow=dim, ncol=dim) | ||||||
|  | colnames(res.intersect) = levels(data$alg_name) | ||||||
|  | rownames(res.intersect) = levels(data$alg_name) | ||||||
|  | 
 | ||||||
|  | tbl.intersect = expand.grid(first=levels(data$alg_name), second=levels(data$alg_name)) | ||||||
|  | tbl.intersect$intersect = 0 | ||||||
|  | 
 | ||||||
|  | for (l1 in levels(data$alg_name)){ | ||||||
|  |   for (l2 in levels(data$alg_name)){ | ||||||
|  |     res = length(intersect(result[,l1], result[,l2])) | ||||||
|  |     res.intersect[l1,l2] = res | ||||||
|  |     tbl.intersect[tbl.intersect$first == l1 & tbl.intersect$second == l2, ]$intersect = res | ||||||
|  |   } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | print(res.intersect) | ||||||
|  | 
 | ||||||
|  | # Plot a heatmap of the pairwise intersection counts | ||||||
|  | ggplot(tbl.intersect, aes(first, second, fill=intersect)) + geom_tile() + geom_text(aes(label = round(intersect, 1))) + scale_fill_gradientn(colours = rev(plotcolors)) | ||||||
|  | ggsave("intersection-heatmap.png") | ||||||
|  | 
 | ||||||
|  | # Collect the metadata of all jobs in a new table | ||||||
|  | res.jobs = tibble() | ||||||
|  | for (alg_name in levels(data$alg_name)){ | ||||||
|  |   res.jobs = rbind(res.jobs, cbind(alg_name, metadata[metadata$jobid %in% result[, alg_name],])) | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | ggplot(res.jobs, aes(alg_name, total_nodes, fill=alg_name)) + geom_boxplot() + scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) | ||||||
|  | ggsave("jobs-nodes.png") | ||||||
|  | 
 | ||||||
|  | ggplot(res.jobs, aes(alg_name, elapsed, fill=alg_name)) + geom_boxplot() + scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) + ylab("Runtime in s") + xlab("Algorithm") | ||||||
|  | ggsave("jobs-elapsed.png") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| # scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) | # scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) | ||||||
| 
 | 
 | ||||||
| # stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line") | # stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line") | ||||||
| 
 |  | ||||||
| exit(0) |  | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user