#!/usr/bin/env Rscript
# Analyze per-job similarity scores produced by several algorithms and
# produce summary plots: ECDF, per-algorithm histograms, user profiles,
# pairwise intersection heatmap, and job-size/runtime boxplots.

library(ggplot2)
library(dplyr)
library(scales)  # library() errors on failure; require() only returns FALSE

# When TRUE, call the external plotting script for the selected jobs.
plotjobs <- FALSE
# Color scheme (red .. dark blue) used for the intersection heatmap.
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066")

# Parse the input CSV name from the command line; keep the default for
# manual (interactive) execution when no argument is supplied.
args <- commandArgs(trailingOnly = TRUE)
file <- "job_similarities_5024292.csv"  # default for manual execution
if (length(args) >= 1) {
  file <- args[1]
}

data <- read.csv(file)
# Columns are: jobid alg_id alg_name similarity
data$alg_id <- as.factor(data$alg_id)
# The rest of the script iterates over levels(data$alg_name); make sure
# alg_name is a factor (read.csv leaves strings as character on R >= 4.0).
data$alg_name <- as.factor(data$alg_name)

cat("Job count:", nrow(data), "\n")
# Empirical cumulative distribution function (ECDF) of the similarity,
# one curve per algorithm, on a log-scaled percent axis.
data$sim <- data$similarity * 100  # express similarity in percent
ggplot(data, aes(sim, color = alg_name, group = alg_name)) +
  stat_ecdf(geom = "step") +
  xlab("Similarity in %") +
  ylab("Fraction of jobs") +
  theme(legend.position = c(0.9, 0.4)) +
  scale_color_brewer(palette = "Set2") +
  scale_x_log10()
ggsave("ecdf.png", width = 8, height = 2.5)
# Histogram of the similarity, one facet per algorithm.  Counts are cropped
# at 100 (oob = squish) and the exact count is annotated as rotated text.
ggplot(data, aes(sim), group = alg_name) +
  geom_histogram(color = "black", binwidth = 2.5) +
  aes(fill = alg_name) +
  facet_grid(alg_name ~ ., switch = "y") +
  xlab("Similarity in %") +
  scale_y_continuous(limits = c(0, 100), oob = squish) +
  scale_color_brewer(palette = "Set2") +
  ylab("Count (cropped at 100)") +
  theme(legend.position = "none") +
  stat_bin(binwidth = 2.5, geom = "text", adj = 1.0, angle = 90,
           colour = "black", size = 3,
           aes(label = ..count.., y = 0 * (..count..) + 95))
ggsave("hist-sim.png", width = 6, height = 4.5)

#ggplot(data, aes(similarity, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("SIM") + ylab("Fraction of jobs") + theme(legend.position=c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") + xlim(0.5, 1.0)
#ggsave("ecdf-0.5.png", width=8, height=3)

# Summarize the jobs with at least 50 % similarity.
e <- data %>% filter(similarity >= 0.5)
print(summary(e))
# Load job information, i.e., the time series codings and metadata per job.
jobData <- read.csv("job-io-datasets/datasets/job_codings.csv")
metadata <- read.csv("job-io-datasets/datasets/job_metadata.csv")
metadata$user_id <- as.factor(metadata$user_id)
metadata$group_id <- as.factor(metadata$group_id)
# Print the timelines of the given jobs via an external plotting script.
# NOTE(review): besides its `jobs` argument this function reads the globals
# `e`, `ordered`, `level` (set by the caller's loop below) and the `plotjobs`
# flag; consider passing them explicitly.
plotJobs <- function(jobs) {
  # Rows of the current algorithm's top jobs, best first.
  r <- e[ordered, ]
  if (plotjobs) {
    # One output-file prefix per job: "<algorithm>-<similarity>-"
    prefix <- do.call("sprintf", list("%s-%.4f-", level, r$similarity))
    system(sprintf("scripts/plot-single-job.py %s %s",
                   paste(r$jobid, collapse = ","),
                   paste(prefix, collapse = ",")))
  }
}
# Store the job ids in a table, each column is one algorithm.
dim <- length(levels(data$alg_name))  # number of algorithms (shadows base::dim)
count <- 100                          # top-similarity jobs kept per algorithm
# Will contain the job ids for the `count` best jobs of each algorithm.
result <- matrix(1:(dim * count), nrow = count, ncol = dim)
colnames(result) <- levels(data$alg_name)
# Will collect (alg_name, user_id, count, userrank) rows, filled in the loop.
result.userid <- tibble()
# Extract the 100 most similar jobs per algorithm into the table.
for (level in levels(data$alg_name)) {
  e <- data %>% filter(alg_name == level)
  print(level)
  print(summary(e))
  # Indices of the `count` rows with the highest similarity, best first.
  ordered <- order(e$similarity, decreasing = TRUE)[1:count]
  print(e[ordered, ])
  # Extract the job ids for those rows.
  jobs <- e[ordered, "jobid"]
  result[, level] <- jobs

  # Extract details about the jobs of the given algorithm.
  tbl <- jobData[jobData$jobid %in% jobs, ]
  print(summary(tbl))
  md <- metadata[metadata$jobid %in% jobs, ]
  print(summary(md))

  # Per-user job counts, ranked by frequency (rank 1 = most jobs).
  md$value <- 1
  userprofile <- md %>%
    group_by(user_id) %>%
    summarise(count = sum(value))
  userprofile <- userprofile[order(userprofile$count, decreasing = TRUE), ]
  userprofile$userrank <- 1:nrow(userprofile)
  result.userid <- rbind(result.userid, cbind(level, userprofile))

  plotJobs(jobs)
}
colnames(result.userid) <- c("alg_name", "user_id", "count", "userrank")
print(result.userid)

# Stacked bar chart: per algorithm, job counts stacked by user rank.
ggplot(result.userid, aes(fill = userrank, y = count, x = alg_name)) +
  geom_bar(position = "stack", stat = "identity") +
  theme(legend.position = "none") +
  scale_fill_gradientn(colours = rainbow(5)) +
  ylab("Stacked user count") +
  xlab("Algorithm")
# Alternatives considered:
# + scale_fill_gradient(low="blue", high="red", space="Lab")
# + scale_fill_continuous(type = "viridis")
ggsave("user-ids.png", width = 6, height = 4)
# Compute the pairwise overlap of the per-algorithm top-job sets.
# res.intersect: square matrix indexed by algorithm name;
# tbl.intersect: the same data in long format for ggplot.
res.intersect <- matrix(1:(dim * dim), nrow = dim, ncol = dim)
colnames(res.intersect) <- levels(data$alg_name)
rownames(res.intersect) <- levels(data$alg_name)
tbl.intersect <- expand.grid(first = levels(data$alg_name),
                             second = levels(data$alg_name))
tbl.intersect$intersect <- 0
# Fill both the matrix and the long-format table with intersection sizes.
for (l1 in levels(data$alg_name)) {
  for (l2 in levels(data$alg_name)) {
    res <- length(intersect(result[, l1], result[, l2]))
    res.intersect[l1, l2] <- res
    tbl.intersect[tbl.intersect$first == l1 & tbl.intersect$second == l2, ]$intersect <- res
  }
}
print(res.intersect)
# Plot heatmap of the pairwise intersection counts.
ggplot(tbl.intersect, aes(first, second, fill = intersect)) +
  geom_tile() +
  geom_text(aes(label = round(intersect, 1))) +
  scale_fill_gradientn(colours = rev(plotcolors)) +
  xlab("") +
  ylab("")
ggsave("intersection-heatmap.png", width = 6, height = 5)
# Collect the metadata of all top jobs in a new table.
res.jobs <- tibble()
for (alg_name in levels(data$alg_name)) {
  res.jobs <- rbind(res.jobs,
                    cbind(alg_name, metadata[metadata$jobid %in% result[, alg_name], ]))
}

# Node counts per algorithm on a log2 axis.
ggplot(res.jobs, aes(alg_name, total_nodes, fill = alg_name)) +
  geom_boxplot() +
  scale_y_continuous(trans = log2_trans(),
                     breaks = trans_breaks("log2", function(x) 2^x),
                     labels = trans_format("log2", math_format(2^.x))) +
  theme(legend.position = "none")
ggsave("jobs-nodes.png", width = 6, height = 4)
# Runtime per algorithm.  NOTE(review): the axis is log2-transformed but the
# breaks/labels are decades (log10) — looks intentional, but confirm.
ggplot(res.jobs, aes(alg_name, elapsed, fill = alg_name)) +
  geom_boxplot() +
  scale_y_continuous(trans = log2_trans(),
                     breaks = trans_breaks("log10", function(x) 10^x),
                     labels = trans_format("log10", math_format(10^.x))) +
  ylab("Runtime in s") +
  xlab("Algorithm") +
  theme(legend.position = "none")
ggsave("jobs-elapsed.png", width = 6, height = 4)

# scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))

# stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line")