Merge branch 'master' of http://git.hps.vi4io.org/eugen.betke/mistral-io-datasets

2020-08-21 15:22:18 +02:00 · 2020-08-21 15:22:18 +02:00 · 98186bc0d8
commit 98186bc0d8
parent 5b0331ec32 aa86690f43
2 changed files with 56 additions and 12 deletions
--- a/paper/main.tex
+++ b/paper/main.tex
@ -244,9 +244,37 @@ Potentially, analyze how the rankings of different similarities look like.

 \subsection{Quantitative Analysis of Selected Jobs}

-\begin{table}
-\caption{User and Group Information}
-\end{table}
+
+In general, the number of unique users is identical to the number of unique groups, meaning that each user likely belongs to a distinct group; for Job-L, the user and group counts differ slightly, and for Job-M a bit more.
+At most, there are about twice as many users as groups.
+
+To understand how the Top\,100 jobs are distributed across users, the data is grouped by userid and counted.
+\Cref{fig:userids} shows the stacked user information, where the lowest segment of each stack represents the user with the most jobs and the topmost segment represents the user with the fewest jobs.
+For Job-S, we can see that about 70--80\% of the jobs stem from one user; for the hex\_lev and hex\_native algorithms, the remaining jobs stem from a second user, while the bin algorithms include jobs from additional users (5 in total).
+For Job-M, jobs from more users are included (13); about 25\% of the jobs stem from the same user. Here, hex\_lev and hex\_native include more users (30 and 33, respectively) than the other three algorithms.
+For Job-L, the two hex algorithms include a slightly more diverse user community (12 and 13 users) than the bin algorithms (9), but hex\_phases covers 35 users.
+
+\begin{figure}
+\begin{subfigure}{0.31\textwidth}
+\centering
+\includegraphics[width=\textwidth]{job_similarities_4296426-out/user-ids}
+\caption{Job-S} \label{fig:users-job-S}
+\end{subfigure}
+\begin{subfigure}{0.31\textwidth}
+\centering
+\includegraphics[width=\textwidth]{job_similarities_5024292-out/user-ids}
+\caption{Job-M} \label{fig:users-job-M}
+\end{subfigure}
+\begin{subfigure}{0.31\textwidth}
+\centering
+\includegraphics[width=\textwidth]{job_similarities_7488914-out/user-ids}
+\caption{Job-L} \label{fig:users-job-L}
+\end{subfigure}
+
+
+\caption{User information for each job}
+\label{fig:userids}
+\end{figure}

 \begin{figure}
 \begin{subfigure}{0.31\textwidth}
--- a/scripts/plot.R
+++ b/scripts/plot.R
@ -4,7 +4,7 @@ library(ggplot2)
 library(dplyr)
 require(scales)

-plotjobs = TRUE
+plotjobs = FALSE

 # Color scheme
 plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066")
@ -42,13 +42,6 @@ metadata$user_id = as.factor(metadata$user_id)
 metadata$group_id = as.factor(metadata$group_id)

 plotJobs = function(jobs){
-    # plot details about the jobs of a given algorithm
-    tbl = jobData[jobData$jobid %in% jobs,]
-    print(summary(tbl))
-    #print(tbl)
-    md = metadata[metadata$jobid %in% jobs,]
-    print(summary(md))
-
    # print the job timelines
    r = e[ordered, ]

@ -61,8 +54,9 @@ plotJobs = function(jobs){
 # Store the job ids in a table, each column is one algorithm
 dim = length(levels(data$alg_name))
 count = 100
-result = matrix(1:(dim*count), nrow=count, ncol=dim)
+result = matrix(1:(dim*count), nrow=count, ncol=dim) # will contain the job ids for the count best jobs
 colnames(result) = levels(data$alg_name)
+result.userid = tibble() # will contain the userid for the count best jobs

 # Extract the 100 most similar jobs into the table
 for (level in levels(data$alg_name)){
@ -74,9 +68,31 @@ for (level in levels(data$alg_name)){
    # Extract the data for the jobs
    jobs = e[ordered,"jobid"]
    result[, level] = jobs
+
+    # extract details about the jobs of a given algorithm
+    tbl = jobData[jobData$jobid %in% jobs,]
+    print(summary(tbl))
+    md = metadata[metadata$jobid %in% jobs,]
+    print(summary(md))
+    md$value = 1
+    userprofile = md %>% group_by(user_id) %>% summarise(count = sum(value))
+    userprofile = userprofile[order(userprofile$count, decreasing=TRUE),]
+    userprofile$userrank = 1:nrow(userprofile)
+    result.userid = rbind(result.userid, cbind(level, userprofile))
+
    plotJobs(jobs)
 }

+colnames(result.userid) = c("alg_name", "user_id", "count", "userrank")
+
+print(result.userid)
+
+# Create stacked bar chart of per-user job counts for each algorithm
+ggplot(result.userid, aes(fill=userrank, y=count, x=alg_name)) + geom_bar(position="stack", stat="identity") + theme(legend.position = "none") + scale_fill_gradientn(colours=rainbow(5)) + ylab("Stacked user count") + xlab("Algorithm") # + scale_fill_gradient(low="blue", high="red", space ="Lab" ) + scale_fill_continuous(type = "viridis")
+
+ggsave("user-ids.png", width=6, height=4)
+
+
 # Compute intersection in a new table
 res.intersect = matrix(1:(dim*dim), nrow=dim, ncol=dim)
 colnames(res.intersect) = levels(data$alg_name)