41 lines
1.5 KiB
R
Executable File
41 lines
1.5 KiB
R
Executable File
#!/usr/bin/env Rscript

# Usage: Rscript <this script> <job_similarities.csv>
# The CSV given as first argument is read further below and plotted.

library(ggplot2)
library(dplyr)
# library() instead of require(): a missing package must abort the script
# right here rather than return FALSE and fail later at first use.
library(scales)

args <- commandArgs(trailingOnly = TRUE)

# The input file is the first command-line argument, for example
# "job_similarities_5024292.csv" or "job_similarities_7488914.csv".
# Fail early with a usage message instead of passing NA on to read.csv().
if (length(args) < 1L || !nzchar(args[1])) {
  stop("usage: Rscript <script> <job_similarities.csv>", call. = FALSE)
}
file <- args[1]

# Load the similarity table from the CSV file.
# Columns are: jobid alg_id alg_name similarity
data <- read.csv(file)

# Treat the algorithm id as a categorical value.
data$alg_id <- as.factor(data$alg_id)

# Report the number of rows read (before filtering).
print(nrow(data))

# FILTER, TODO
# Keep only rows with similarity <= 1.0; dplyr::filter() also drops rows
# for which the comparison evaluates to NA.
data <- filter(data, similarity <= 1.0)

# Empirical cumulative distribution function (ECDF) of the similarity,
# drawn as one step curve per algorithm name.

# Build the ECDF plot for data frame `df`; `df` must provide the columns
# `similarity` and `alg_name`. Returns the ggplot object without drawing it.
plot_ecdf <- function(df) {
  ggplot(df, aes(similarity, color = alg_name, group = alg_name)) +
    stat_ecdf(geom = "step") +
    xlab("SIM") +
    ylab("Fraction of jobs") +
    theme(legend.position = "bottom") +
    scale_color_brewer(palette = "Set2")
}

# The plot is handed to ggsave() explicitly so the saved image does not depend
# on ggplot2's implicit "last plot" state, and no plot is auto-printed to the
# default graphics device.
ggsave("ecdf.png", plot = plot_ecdf(data))

# Same plot restricted to rows with similarity >= 0.5.
e <- data %>% filter(similarity >= 0.5)
print(summary(e))
ggsave("ecdf-0.5.png", plot = plot_ecdf(e))

# Histogram of the similarity values, one facet row per algorithm name.
# The bars are filled by alg_name, so the Brewer palette has to be applied to
# the fill scale; a colour scale has no effect here because the bar outline
# colour is the constant "black".
# The y axis is limited to [0, 100]; oob = squish clamps taller bars to the
# limit instead of removing them.
hist_plot <- ggplot(data, aes(similarity, fill = alg_name)) +
  geom_histogram(color = "black", binwidth = 0.025) +
  facet_grid(alg_name ~ ., switch = "y") +
  scale_y_continuous(limits = c(0, 100), oob = squish) +
  scale_fill_brewer(palette = "Set2") +
  ylab("Count (cropped at 100)")

ggsave("hist-sim.png", plot = hist_plot)

# Disabled alternatives, kept for reference:
# scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x)))
# stat_summary(aes(linetype = alg_id), fun.y=mean, geom="line")

# R has no exit() function; quit() ends the session with the given status.
quit(save = "no", status = 0)