#!/usr/bin/env Rscript

# Parse the job to analyze from the command line.
# Usage: ./<script> <job-similarities-csv>
args <- commandArgs(trailingOnly = TRUE)
# Only assign when an argument was actually given; otherwise leave
# `filename` undefined so the interactive fallback below
# (the exists("filename") check) can supply a default.
if (length(args) >= 1) {
  filename <- args[1]
}
# Plotting and data-wrangling dependencies.
library(ggplot2)
library(dplyr)
library(scales)  # was require(): library() fails loudly if the package is missing
library(stringi)
library(stringr)
# Set to TRUE to plot individual job images (timeline plots per job).
plotjobs <- FALSE

# Color scheme used for the intersection heatmap (red -> blue).
# The original strings contained a stray leading space, which makes
# them invalid color specifications for ggplot.
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000099")
# Fallback for manual/interactive execution: when no filename was supplied
# on the command line, use a default dataset.
# (The original checked exists(" filename") -- a variable name with a
# leading space that can never exist -- so the fallback always fired.)
if (!exists("filename")) {
  filename <- "./datasets/job_similarities_5024292.csv"
  filename <- "./datasets/job_similarities_7488914.csv" # for manual execution; overrides the line above
}

print(filename)
# The job ID is the first run of digits in the file name.
jobID <- str_extract(filename, regex("[0-9]+"))
# Load the per-job similarity scores.
data <- read.csv(filename)
# Columns are: jobid alg_id alg_name similarity
#data$alg_id = as.factor(data$alg_id) # EB: wrong column?
data$alg_name <- as.factor(data$alg_name) # EB: this column is the one used in the script

cat("Job count:")
cat(nrow(data))
# Empirical cumulative distribution function (ECDF) of the similarity
# scores, one curve per algorithm.
data$sim <- data$similarity * 100
ggplot(data, aes(sim, color = alg_name, group = alg_name)) +
  stat_ecdf(geom = "step") +
  xlab("Similarity in %") +
  ylab("Fraction of jobs") +
  theme(legend.position = c(0.9, 0.5), legend.title = element_blank()) +
  scale_color_brewer(palette = "Set2") # + scale_x_log10() +
ggsave("ecdf.png", width = 8, height = 2.5)
# Histogram of similarity per algorithm, one facet per algorithm.
# Counts are cropped at 100 (squish) and the exact bin count is drawn as
# rotated text near the top of each facet.
ggplot(data, aes(sim), group = alg_name) +
  geom_histogram(color = "black", binwidth = 2.5) +
  aes(fill = alg_name) +
  facet_grid(alg_name ~ ., switch = 'y') +
  xlab("Similarity in %") +
  scale_y_continuous(limits = c(0, 100), oob = squish) +
  scale_color_brewer(palette = "Set2") +
  ylab("Count (cropped at 100)") +
  theme(legend.position = "none") +
  stat_bin(binwidth = 2.5, geom = "text", adj = 1.0, angle = 90, colour = "black", size = 3, aes(label = ..count.., y = 0 * (..count..) + 95))
ggsave("hist-sim.png", width = 6, height = 5)

#ggplot(data, aes(similarity, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("SIM") + ylab("Fraction of jobs") + theme(legend.position=c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") + xlim(0.5, 1.0)
#ggsave("ecdf-0.5.png", width=8, height=3)
# Summarize the jobs that are at least 50% similar to the reference job.
print("Similarity > 0.5")
e <- data %>% filter(similarity >= 0.5)
print(summary(e))
# Load job information, i.e., the time series per job, plus job metadata.
jobData <- read.csv("./datasets/job_codings_v3.csv") # EB: now in the repo; v3 has the correct hexadecimal codings
metadata <- read.csv("./datasets/job_metadata.csv")  # EB: also in the repo

metadata$user_id <- as.factor(metadata$user_id)
metadata$group_id <- as.factor(metadata$group_id)
# Plot the timelines of the most similar jobs of one algorithm and dump
# their configuration data to jobs-<algorithm>.txt.
#
# NOTE(review): besides its parameters this function reads the globals
# `e` (rows of the current algorithm), `ordered` (row order, most similar
# first) and `plotjobs`, all set up by the extraction loop below.  The
# `jobs` parameter is currently unused; job ids are re-derived from
# `e[ordered, ]` so that the matching similarity values are available.
plotJobs <- function(algorithm, jobs) {
  # The most similar jobs, best first.
  r <- e[ordered, ]

  if (plotjobs) {
    # The Kolmogorov-Smirnov variant has its own plotting script.
    if (algorithm == "ks") {
      script <- "./scripts/plot-job-timelines-ks.py"
    } else {
      script <- "./scripts/plot-job-timelines.py"
    }
    # One output prefix per job: "<algorithm>-<similarity>-".
    # sprintf() is vectorized over r$similarity, so do.call() is not needed.
    prefix <- sprintf("%s-%.4f-", algorithm, r$similarity)
    call <- sprintf("%s %s %s", script, paste(r$jobid, collapse = ","), paste(prefix, collapse = ","))
    print(call)
    system(call)
  }

  # Dump the configuration of the selected jobs.
  # NOTE(review): assumes the shell script takes a space-separated job list.
  system(sprintf("./scripts/extract-conf-data.sh %s > jobs-%s.txt", paste(r$jobid, collapse = " "), algorithm))
}
# Store the job ids in a table; each column is one algorithm.
# NOTE: `dim` shadows base::dim() for the rest of the script.
dim <- length(levels(data$alg_name))
count <- 100

# Will hold the job ids of the `count` most similar jobs per algorithm;
# the initial sequence values are placeholders that get overwritten below.
result <- matrix(seq_len(dim * count), nrow = count, ncol = dim)
colnames(result) <- levels(data$alg_name)

# Will collect (algorithm, user) frequencies for the best jobs.
result.userid <- tibble()
# Extract the `count` most similar jobs per algorithm into `result` and
# collect per-user statistics into `result.userid`.
for (level in levels(data$alg_name)) {
  e <- data %>% filter(alg_name == level)
  print(level)
  print(summary(e))
  # Indices of the most similar jobs, best first.
  # NOTE(review): assumes each algorithm has at least `count` jobs;
  # otherwise trailing entries are NA -- confirm with the datasets.
  ordered <- order(e$similarity, decreasing = TRUE)[1:count]
  print(e[ordered, ])
  # Extract the job ids for those jobs.
  # (The original read column " jobid" -- with a leading space -- which
  # does not exist in the data frame.)
  jobs <- e[ordered, "jobid"]
  result[, level] <- jobs

  # Extract details about the jobs of the given algorithm.
  tbl <- jobData[jobData$jobid %in% jobs, ]
  print(summary(tbl))
  md <- metadata[metadata$jobid %in% jobs, ]
  print(summary(md))
  # Count how many of the top jobs belong to each user and rank the users
  # by that count (rank 1 = most jobs contributed).
  md$value <- 1
  userprofile <- md %>% group_by(user_id) %>% summarise(count = sum(value))
  userprofile <- userprofile[order(userprofile$count, decreasing = TRUE), ]
  userprofile$userrank <- 1:nrow(userprofile)
  result.userid <- rbind(result.userid, cbind(level, userprofile))

  plotJobs(level, jobs)
}
colnames(result.userid) <- c("alg_name", "user_id", "count", "userrank")
print(result.userid)
# Stacked user table: one bar per algorithm, one segment per user; segment
# height is the number of that user's jobs among the top `count`.
ggplot(result.userid, aes(fill = userrank, y = count, x = alg_name)) +
  geom_bar(position = "stack", stat = "identity") +
  theme(legend.position = "none") +
  scale_fill_gradientn(colours = rainbow(5)) +
  ylab("Stacked user count") +
  xlab("Algorithm") # + scale_fill_gradient(low="blue", high="red", space ="Lab" ) + scale_fill_continuous(type = "viridis")
ggsave("user-ids.png", width = 6, height = 4)
# Compute the pairwise overlap of the top-job sets of all algorithms,
# both as a matrix (for printing) and as a long table (for plotting).
res.intersect <- matrix(seq_len(dim * dim), nrow = dim, ncol = dim)
colnames(res.intersect) <- levels(data$alg_name)
rownames(res.intersect) <- levels(data$alg_name)
tbl.intersect <- expand.grid(first = levels(data$alg_name), second = levels(data$alg_name))
tbl.intersect$intersect <- 0

for (l1 in levels(data$alg_name)) {
  for (l2 in levels(data$alg_name)) {
    # Number of job ids the two algorithms have in common.
    overlap <- length(intersect(result[, l1], result[, l2]))
    res.intersect[l1, l2] <- overlap
    tbl.intersect$intersect[tbl.intersect$first == l1 & tbl.intersect$second == l2] <- overlap
  }
}
print(res.intersect)
# Plot heatmap of the pairwise intersection counts.
ggplot(tbl.intersect, aes(first, second, fill = intersect)) +
  geom_tile() +
  geom_text(aes(label = round(intersect, 1))) +
  scale_fill_gradientn(colours = rev(plotcolors)) +
  xlab("") +
  ylab("") +
  theme(legend.position = "bottom", legend.title = element_blank())
ggsave("intersection-heatmap.png", width = 5, height = 5)
# Collect the metadata of all top jobs in a new table, tagged by algorithm.
res.jobs <- tibble()
for (alg_name in levels(data$alg_name)) {
  # NOTE: the loop variable's name becomes the column name via cbind().
  selected <- metadata[metadata$jobid %in% result[, alg_name], ]
  res.jobs <- rbind(res.jobs, cbind(alg_name, selected))
}
# Box plot of node counts per algorithm (log2 y axis); the dashed red
# line marks the node count of the reference job itself.
jobRef <- metadata[metadata$jobid == jobID, ]$total_nodes
ggplot(res.jobs, aes(alg_name, total_nodes, fill = alg_name)) +
  geom_boxplot() +
  scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) +
  theme(legend.position = "none") +
  xlab("Algorithm") +
  ylab("Job node count") +
  geom_hline(yintercept = jobRef, linetype = "dashed", color = "red", size = 0.5)
ggsave("jobs-nodes.png", width = 6, height = 4)
# Box plot of elapsed time per algorithm; the dashed red line marks the
# runtime of the reference job itself.
jobRef <- metadata[metadata$jobid == jobID, ]$elapsed
ggplot(res.jobs, aes(alg_name, elapsed, fill = alg_name)) +
  geom_boxplot() +
  ylab("Job runtime in s") +
  xlab("Algorithm") +
  theme(legend.position = "none") +
  ylim(0, max(res.jobs$elapsed)) +
  geom_hline(yintercept = jobRef, linetype = "dashed", color = "red", size = 0.5)
# scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))
ggsave("jobs-elapsed.png", width = 6, height = 4)

# scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))

# stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line")