#!/usr/bin/env Rscript
# Usage: Rscript <this script> <job similarity CSV, e.g. job_similarities_5024292.csv>
library(ggplot2)
library(dplyr)
library(scales)

plotjobs = FALSE  # set to TRUE to render the individual job timelines via scripts/plot-single-job.py

# Color scheme
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066")

# Parse the similarity file from the command line
args = commandArgs(trailingOnly = TRUE)
file = "job_similarities_5024292.csv" # default for manual execution
if (length(args) >= 1) {
  file = args[1]
}

data = read.csv(file)
# Columns are: jobid alg_id alg_name similarity
data$alg_id = as.factor(data$alg_id)
data$alg_name = as.factor(data$alg_name)  # needed on R >= 4.0, where strings are no longer read as factors

cat("Job count:", nrow(data), "\n")

# Empirical cumulative distribution function (ECDF) of the similarity per algorithm
data$sim = data$similarity * 100
ggplot(data, aes(sim, color = alg_name, group = alg_name)) +
  stat_ecdf(geom = "step") +
  xlab("Similarity in %") +
  ylab("Fraction of jobs") +
  theme(legend.position = c(0.9, 0.4)) +
  scale_color_brewer(palette = "Set2") +
  scale_x_log10()
ggsave("ecdf.png", width = 8, height = 2.5)

# Histogram of the similarity, one facet per algorithm
ggplot(data, aes(sim, fill = alg_name)) +
  geom_histogram(color = "black", binwidth = 2.5) +
  facet_grid(alg_name ~ ., switch = 'y') +
  xlab("Similarity in %") +
  scale_y_continuous(limits = c(0, 100), oob = squish) +
  scale_color_brewer(palette = "Set2") +
  ylab("Count (cropped at 100)") +
  theme(legend.position = "none") +
  stat_bin(binwidth = 2.5, geom = "text", hjust = 1.0, angle = 90, colour = "black", size = 3,
           aes(label = ..count.., y = 0 * (..count..) + 95))  # print the bin count near the top of the cropped panel
ggsave("hist-sim.png", width = 6, height = 4.5)

# ggplot(data, aes(similarity, color = alg_name, group = alg_name)) + stat_ecdf(geom = "step") + xlab("SIM") + ylab("Fraction of jobs") + theme(legend.position = c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") + xlim(0.5, 1.0)
# ggsave("ecdf-0.5.png", width = 8, height = 3)

e = data %>% filter(similarity >= 0.5)
print(summary(e))

# Load the job information, i.e., the time series (codings) and metadata per job
jobData = read.csv("job-io-datasets/datasets/job_codings.csv")
metadata = read.csv("job-io-datasets/datasets/job_metadata.csv")
metadata$user_id = as.factor(metadata$user_id)
metadata$group_id = as.factor(metadata$group_id)

# Plot the timelines of the selected jobs via the external script; relies on the
# globals e, ordered and level that are set in the loop below.
plotJobs = function(jobs){
  r = e[ordered, ]
  if (plotjobs) {
    prefix = sprintf("%s-%.4f-", level, r$similarity)
    system(sprintf("scripts/plot-single-job.py %s %s", paste(r$jobid, collapse = ","), paste(prefix, collapse = ",")))
  }
}

# Store the job ids in a table, each column is one algorithm
dim = length(levels(data$alg_name))
count = 100
result = matrix(1:(dim * count), nrow = count, ncol = dim) # will contain the job ids for the count best jobs
colnames(result) = levels(data$alg_name)
result.userid = tibble() # will contain the user id for the count best jobs

# Extract the 100 most similar jobs per algorithm into the table
for (level in levels(data$alg_name)){
  e = data %>% filter(alg_name == level)
  print(level)
  print(summary(e))
  ordered = order(e$similarity, decreasing = TRUE)[1:count]
  print(e[ordered, ])

  # Extract the job ids of the top jobs
  jobs = e[ordered, "jobid"]
  result[, level] = jobs

  # Extract details about the jobs of the given algorithm
  tbl = jobData[jobData$jobid %in% jobs, ]
  print(summary(tbl))
  md = metadata[metadata$jobid %in% jobs, ]
  print(summary(md))

  # Count how many of the top jobs belong to each user and rank the users
  md$value = 1
  userprofile = md %>% group_by(user_id) %>% summarise(count = sum(value))
  userprofile = userprofile[order(userprofile$count, decreasing = TRUE), ]
  userprofile$userrank = 1:nrow(userprofile)
  result.userid = rbind(result.userid, cbind(level, userprofile))

  plotJobs(jobs)
}

colnames(result.userid) = c("alg_name", "user_id", "count", "userrank")
print(result.userid)
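# The loop above selects the top-100 jobs per algorithm with base-R order()/indexing so that
# the positional index `ordered` stays available for plotJobs(). A minimal sketch of an
# equivalent selection with dplyr (>= 1.0.0); `top100` is a name introduced only for this
# aside, and ties are broken arbitrarily, so individual picks may differ from the loop:
top100 = data %>%
  group_by(alg_name) %>%
  slice_max(similarity, n = count, with_ties = FALSE) %>%
  ungroup()
print(top100 %>% group_by(alg_name) %>% summarise(jobs = n()))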
# Stacked user counts per algorithm: each segment is one user, ordered by rank
ggplot(result.userid, aes(fill = userrank, y = count, x = alg_name)) +
  geom_bar(position = "stack", stat = "identity") +
  theme(legend.position = "none") +
  scale_fill_gradientn(colours = rainbow(5)) +
  ylab("Stacked user count") +
  xlab("Algorithm")
# Alternative fill scales: + scale_fill_gradient(low = "blue", high = "red", space = "Lab") + scale_fill_continuous(type = "viridis")
ggsave("user-ids.png", width = 6, height = 4)

# Compute the pairwise intersection of the top-100 job sets in a new table
res.intersect = matrix(1:(dim * dim), nrow = dim, ncol = dim)
colnames(res.intersect) = levels(data$alg_name)
rownames(res.intersect) = levels(data$alg_name)

tbl.intersect = expand.grid(first = levels(data$alg_name), second = levels(data$alg_name))
tbl.intersect$intersect = 0

for (l1 in levels(data$alg_name)){
  for (l2 in levels(data$alg_name)){
    res = length(intersect(result[, l1], result[, l2]))
    res.intersect[l1, l2] = res
    tbl.intersect[tbl.intersect$first == l1 & tbl.intersect$second == l2, ]$intersect = res
  }
}
print(res.intersect)

# Plot a heatmap of the intersection counts
ggplot(tbl.intersect, aes(first, second, fill = intersect)) +
  geom_tile() +
  geom_text(aes(label = round(intersect, 1))) +
  scale_fill_gradientn(colours = rev(plotcolors)) +
  xlab("") + ylab("")
ggsave("intersection-heatmap.png", width = 6, height = 5)

# Collect the metadata of all selected jobs in a new table
res.jobs = tibble()
for (alg_name in levels(data$alg_name)){
  res.jobs = rbind(res.jobs, cbind(alg_name, metadata[metadata$jobid %in% result[, alg_name], ]))
}

# Node counts of the selected jobs per algorithm
ggplot(res.jobs, aes(alg_name, total_nodes, fill = alg_name)) +
  geom_boxplot() +
  scale_y_continuous(trans = log2_trans(),
                     breaks = trans_breaks("log2", function(x) 2^x),
                     labels = trans_format("log2", math_format(2^.x))) +
  theme(legend.position = "none")
ggsave("jobs-nodes.png", width = 6, height = 4)

# Runtimes of the selected jobs per algorithm
ggplot(res.jobs, aes(alg_name, elapsed, fill = alg_name)) +
  geom_boxplot() +
  scale_y_continuous(trans = log10_trans(),
                     breaks = trans_breaks("log10", function(x) 10^x),
                     labels = trans_format("log10", math_format(10^.x))) +
  ylab("Runtime in s") +
  xlab("Algorithm") +
  theme(legend.position = "none")
ggsave("jobs-elapsed.png", width = 6, height = 4)

# scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
# stat_summary(aes(linetype = alg_id), fun.y = mean, geom = "line")