Merge branch 'master' of http://git.hps.vi4io.org/eugen.betke/mistral-io-datasets
This commit is contained in:
commit
98186bc0d8
|
@ -244,9 +244,37 @@ Potentially, analyze how the rankings of different similarities look like.
|
|||
|
||||
\subsection{Quantitative Analysis of Selected Jobs}
|
||||
|
||||
\begin{table}
|
||||
\caption{User and Group Information}
|
||||
\end{table}
|
||||
|
||||
The number of unique users and the number of unique groups are the same, meaning that users and groups likely correspond one-to-one; for Job-L, the user and group counts differ a bit, and for Job-M a bit more.
|
||||
There are up to about twice as many users as groups.
|
||||
|
||||
To understand how the Top\,100 jobs are distributed across users, the data is grouped by userid and counted.
|
||||
\Cref{fig:userids} shows the stacked user information, where the bottom segment of each stack is the user with the most jobs and the topmost segment is the user with the smallest number of jobs.
|
||||
For Job-S, we can see that about 70--80\% of jobs stem from one user; for the hex\_lev and hex\_native algorithms, the other jobs stem from a second user, while bin includes jobs from additional users (5 in total).
|
||||
For Job-M, jobs from more users are included (13); about 25\% of jobs stem from the same user. Here, hex\_lev and hex\_native include more users (30 and 33, respectively) than the other three algorithms.
|
||||
For Job-L, the two hex algorithms include a slightly more diverse user community (12 and 13 users) than the bin algorithms (9), but hex\_phases covers 35 users.
|
||||
|
||||
\begin{figure}
|
||||
\begin{subfigure}{0.31\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{job_similarities_4296426-out/user-ids}
|
||||
\caption{Job-S} \label{fig:users-job-S}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.31\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{job_similarities_5024292-out/user-ids}
|
||||
\caption{Job-M} \label{fig:users-job-M}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.31\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{job_similarities_7488914-out/user-ids}
|
||||
\caption{Job-L} \label{fig:users-job-L}
|
||||
\end{subfigure}
|
||||
|
||||
|
||||
\caption{User information for each job}
|
||||
\label{fig:userids}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}
|
||||
\begin{subfigure}{0.31\textwidth}
|
||||
|
|
|
@ -4,7 +4,7 @@ library(ggplot2)
|
|||
library(dplyr)
|
||||
require(scales)
|
||||
|
||||
plotjobs = TRUE
|
||||
plotjobs = FALSE
|
||||
|
||||
# Color scheme
|
||||
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066")
|
||||
|
@ -42,13 +42,6 @@ metadata$user_id = as.factor(metadata$user_id)
|
|||
metadata$group_id = as.factor(metadata$group_id)
|
||||
|
||||
plotJobs = function(jobs){
|
||||
# plot details about the jobs of a given algorithm
|
||||
tbl = jobData[jobData$jobid %in% jobs,]
|
||||
print(summary(tbl))
|
||||
#print(tbl)
|
||||
md = metadata[metadata$jobid %in% jobs,]
|
||||
print(summary(md))
|
||||
|
||||
# print the job timelines
|
||||
r = e[ordered, ]
|
||||
|
||||
|
@ -61,8 +54,9 @@ plotJobs = function(jobs){
|
|||
# Store the job ids in a table, each column is one algorithm
|
||||
dim = length(levels(data$alg_name))
|
||||
count = 100
|
||||
result = matrix(1:(dim*count), nrow=count, ncol=dim)
|
||||
result = matrix(1:(dim*count), nrow=count, ncol=dim) # will contain the job ids for the count best jobs
|
||||
colnames(result) = levels(data$alg_name)
|
||||
result.userid = tibble() # will contain the userid for the count best jobs
|
||||
|
||||
# Extract the 100 most similar jobs into the table
|
||||
for (level in levels(data$alg_name)){
|
||||
|
@ -74,9 +68,31 @@ for (level in levels(data$alg_name)){
|
|||
# Extract the data for the jobs
|
||||
jobs = e[ordered,"jobid"]
|
||||
result[, level] = jobs
|
||||
|
||||
# extract details about the jobs of a given algorithm
|
||||
tbl = jobData[jobData$jobid %in% jobs,]
|
||||
print(summary(tbl))
|
||||
md = metadata[metadata$jobid %in% jobs,]
|
||||
print(summary(md))
|
||||
md$value = 1
|
||||
userprofile = md %>% group_by(user_id) %>% summarise(count = sum(value))
|
||||
userprofile = userprofile[order(userprofile$count, decreasing=TRUE),]
|
||||
userprofile$userrank = 1:nrow(userprofile)
|
||||
result.userid = rbind(result.userid, cbind(level, userprofile))
|
||||
|
||||
plotJobs(jobs)
|
||||
}
|
||||
|
||||
colnames(result.userid) = c("alg_name", "user_id", "count", "userrank")
|
||||
|
||||
print(result.userid)
|
||||
|
||||
# Create stacked user plot (per-user job counts for each algorithm)
|
||||
# Stacked bar chart built from result.userid: one bar per alg_name, bar
# segments are the per-user `count` values, filled by `userrank` using a
# 5-colour rainbow gradient. The legend is hidden.
ggplot(result.userid, aes(fill=userrank, y=count, x=alg_name)) + geom_bar(position="stack", stat="identity") + theme(legend.position = "none") + scale_fill_gradientn(colours=rainbow(5)) + ylab("Stacked user count") + xlab("Algorithm") # + scale_fill_gradient(low="blue", high="red", space ="Lab" ) + scale_fill_continuous(type = "viridis")
|
||||
|
||||
ggsave("user-ids.png", width=6, height=4)
|
||||
|
||||
|
||||
# Compute intersection in a new table
|
||||
res.intersect = matrix(1:(dim*dim), nrow=dim, ncol=dim)
|
||||
colnames(res.intersect) = levels(data$alg_name)
|
||||
|
|
Loading…
Reference in New Issue