2020-08-17 17:14:58 +00:00
#!/usr/bin/env Rscript
library ( ggplot2 )
library ( dplyr )
require ( scales )
2020-08-18 10:54:57 +00:00
#library(hrbrthemes)
2020-08-17 17:14:58 +00:00
# Default input; used when no file is given on the command line.
# Alternative dataset: "job_similarities_5024292.csv"
file <- "job_similarities_7488914.csv"

# Color scheme
plotcolors <- c(
  "#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066"
)

# Parse job from command line; the first argument overrides the default file
args <- commandArgs(trailingOnly = TRUE)
if (length(args) >= 1) {
  file <- args[1]
}

data <- read.csv(file)
# Columns are: jobid alg_id alg_name similarity
data$alg_id <- as.factor(data$alg_id)
# Since R 4.0 read.csv() keeps strings as character; the code further down
# iterates over levels(data$alg_name), so make it a factor explicitly.
data$alg_name <- as.factor(data$alg_name)
print(nrow(data))

# FILTER, TODO
data <- data %>% filter(similarity <= 1.0)
# Empirical cumulative distribution function (ECDF) of the similarity,
# drawn as one step curve per algorithm.
plotEcdf <- function(d) {
  ggplot(d, aes(similarity, color = alg_name, group = alg_name)) +
    stat_ecdf(geom = "step") +
    xlab("SIM") +
    ylab("Fraction of jobs") +
    theme(legend.position = "bottom") +
    scale_color_brewer(palette = "Set2")
}

# Pass the plot explicitly instead of relying on ggplot2's last_plot()
ggsave("ecdf.png", plot = plotEcdf(data))

# Same plot restricted to rows with a similarity of at least 0.5
e <- data %>% filter(similarity >= 0.5)
print(summary(e))
ggsave("ecdf-0.5.png", plot = plotEcdf(e))
# Histogram of the similarity, one facet row per algorithm.
# The bars are filled by algorithm, so the Set2 palette has to be attached to
# the fill scale (a color scale has no effect: the outline is fixed to black).
# Counts above 100 are squished to the upper limit instead of being dropped.
histPlot <- ggplot(data, aes(similarity, fill = alg_name)) +
  geom_histogram(color = "black", binwidth = 0.025) +
  facet_grid(alg_name ~ ., switch = "y") +
  scale_y_continuous(limits = c(0, 100), oob = squish) +
  scale_fill_brewer(palette = "Set2") +
  ylab("Count (cropped at 100)")
ggsave("hist-sim.png", plot = histPlot)
2020-08-18 10:54:57 +00:00
# load job information, i.e., the time series per job
jobData <- read.csv("job-io-datasets/datasets/job_codings.csv")
metadata <- read.csv("job-io-datasets/datasets/job_metadata.csv")
# User and group ids are identifiers, not quantities: treat them as factors
metadata$user_id <- as.factor(metadata$user_id)
metadata$group_id <- as.factor(metadata$group_id)
# Print summaries of the codings and the metadata of the given jobs and run
# scripts/plot-single-job.py once per job.
#
# jobs:       vector of job ids; NA entries are skipped
# similarity: similarity of each job, in the same order as `jobs`; defaults to
#             the rows currently selected by the globals `e` and `ordered`
# label:      first component of the prefix handed to the plot script;
#             defaults to the global loop variable `level`
#
# The defaults are evaluated lazily, so a plain plotJobs(jobs) call still reads
# `e`, `ordered` and `level` from the calling script.
plotJobs <- function(jobs, similarity = e[ordered, "similarity"],
                     label = level) {
  # plot details about the jobs of a given algorithm
  print(summary(jobData[jobData$jobid %in% jobs, ]))
  print(summary(metadata[metadata$jobid %in% jobs, ]))

  # print the job timeline
  for (rank in seq_along(jobs)) {
    job <- jobs[rank]
    if (is.na(job)) {
      # Padding entry (fewer jobs than requested): nothing to plot
      next
    }
    prefix <- sprintf("%s-%f-%.0f-", label, similarity[rank], rank)
    # Quote both arguments: the prefix contains the algorithm name, which
    # comes straight from the input file
    system2(
      "scripts/plot-single-job.py",
      args = shQuote(c(as.character(job), prefix))
    )
  }
  invisible(NULL)
}
# Store the job ids in a table, each column is one algorithm.
# read.csv() only yields factors on R < 4.0; convert explicitly, otherwise
# levels() is NULL and none of the loops over the algorithms would run.
data$alg_name <- as.factor(data$alg_name)
algorithms <- levels(data$alg_name)
dim <- length(algorithms)
count <- 100
result <- matrix(
  NA,
  nrow = count, ncol = dim,
  dimnames = list(NULL, algorithms)
)

# Extract the 100 most similar jobs into the table
for (level in algorithms) {
  e <- data %>% filter(alg_name == level)
  print(level)
  print(summary(e))
  # Row indices of the `count` most similar jobs; padded with NA when the
  # algorithm has fewer rows, so every column keeps exactly `count` entries
  ordered <- order(e$similarity, decreasing = TRUE)[seq_len(count)]
  print(e[ordered, ])
  # Extract the data for the jobs
  jobs <- e[ordered, "jobid"]
  result[, level] <- jobs
  plotJobs(jobs)
}
# Compute intersection in a new table.
# The algorithms are taken from the columns of `result`, i.e. exactly the
# ones for which job ids were extracted.
algorithms <- as.character(colnames(result))

# Number of job ids two columns of `result` have in common; NA entries are
# padding and must not count as a shared job
sharedJobs <- function(first, second) {
  a <- result[, first]
  b <- result[, second]
  length(intersect(a[!is.na(a)], b[!is.na(b)]))
}

# Long format (one row per pair of algorithms) for plotting
tbl.intersect <- expand.grid(first = algorithms, second = algorithms)
tbl.intersect$intersect <- vapply(
  seq_len(nrow(tbl.intersect)),
  function(i) {
    sharedJobs(
      as.character(tbl.intersect$first[i]),
      as.character(tbl.intersect$second[i])
    )
  },
  integer(1)
)

# Wide format for printing; expand.grid() varies `first` fastest, which
# matches the column-major fill of matrix(), so rows are `first`
res.intersect <- matrix(
  tbl.intersect$intersect,
  nrow = length(algorithms), ncol = length(algorithms),
  dimnames = list(algorithms, algorithms)
)
print(res.intersect)

# Plot heatmap about intersection
heatmapPlot <- ggplot(tbl.intersect, aes(first, second, fill = intersect)) +
  geom_tile() +
  geom_text(aes(label = round(intersect, 1))) +
  scale_fill_gradientn(colours = rev(plotcolors))
ggsave("intersection-heatmap.png", plot = heatmapPlot)
# Collect the metadata of all jobs in a new table: the metadata rows of the
# jobs stored in `result`, tagged with the algorithm that selected them.
metadataOf <- function(name) {
  md <- metadata[metadata$jobid %in% result[, name], ]
  # rep() keeps this valid when no metadata row matches (zero rows)
  cbind(alg_name = rep(name, nrow(md)), md)
}
# Build all pieces first and bind once instead of growing the table in a loop
res.jobs <- bind_rows(lapply(as.character(colnames(result)), metadataOf))

# Number of nodes per job, log2 axis
nodesPlot <- ggplot(res.jobs, aes(alg_name, total_nodes, fill = alg_name)) +
  geom_boxplot() +
  scale_y_continuous(
    trans = log2_trans(),
    breaks = trans_breaks("log2", function(x) 2^x),
    labels = trans_format("log2", math_format(2^.x))
  )
ggsave("jobs-nodes.png", plot = nodesPlot)

# Runtime per job; breaks and labels are powers of ten, so use the matching
# log10 transformation (the axis layout is the same for any log base)
elapsedPlot <- ggplot(res.jobs, aes(alg_name, elapsed, fill = alg_name)) +
  geom_boxplot() +
  scale_y_continuous(
    trans = log10_trans(),
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x))
  ) +
  ylab("Runtime in s") +
  xlab("Algorithm")
ggsave("jobs-elapsed.png", plot = elapsedPlot)
# scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
2020-08-17 17:14:58 +00:00
2020-08-18 10:54:57 +00:00
# stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line")