diff --git a/paper/main.tex b/paper/main.tex index 2a4a7f6..104f36d 100644 --- a/paper/main.tex +++ b/paper/main.tex @@ -244,9 +244,37 @@ Potentially, analyze how the rankings of different similarities look like. \subsection{Quantitative Analysis of Selected Jobs} -\begin{table} -\caption{User and Group Information} -\end{table} + +The number of user ids and group ids is the same, meaning that a user is likely from the same group and the number of groups is identical to the number of (unique) users; for Job-L, the user and group counts differ a bit, and for Job-M a bit more. +There are up to about twice as many users as groups. + +To understand how the Top\,100 jobs are distributed across users, the data is grouped by user id and counted. +\Cref{fig:userids} shows the stacked user information, where the lowest stack is the user with the most jobs and the topmost user in the stack has the smallest number of jobs. +For Job-S, we can see that about 70--80\% of jobs stem from one user; for the hex\_lev and hex\_native algorithms, the other jobs stem from a second user, while bin includes jobs from additional users (5 in total). +For Job-M, jobs from more users are included (13); about 25\% of jobs stem from the same user. Here, hex\_lev and hex\_native include more users (30 and 33, respectively) than the other three algorithms. +For Job-L, the two hex algorithms include a slightly more diverse user community (12 and 13 users) than the bin algorithms (9), but hex\_phases covers 35 users. 
+ +\begin{figure} +\begin{subfigure}{0.31\textwidth} +\centering +\includegraphics[width=\textwidth]{job_similarities_4296426-out/user-ids} +\caption{Job-S} \label{fig:users-job-S} +\end{subfigure} +\begin{subfigure}{0.31\textwidth} +\centering +\includegraphics[width=\textwidth]{job_similarities_5024292-out/user-ids} +\caption{Job-M} \label{fig:users-job-M} +\end{subfigure} +\begin{subfigure}{0.31\textwidth} +\centering +\includegraphics[width=\textwidth]{job_similarities_7488914-out/user-ids} +\caption{Job-L} \label{fig:users-job-L} +\end{subfigure} + + +\caption{User information for each job} +\label{fig:userids} +\end{figure} \begin{figure} \begin{subfigure}{0.31\textwidth} diff --git a/scripts/plot.R b/scripts/plot.R index 97451c2..923ae2a 100755 --- a/scripts/plot.R +++ b/scripts/plot.R @@ -4,7 +4,7 @@ library(ggplot2) library(dplyr) require(scales) -plotjobs = TRUE +plotjobs = FALSE # Color scheme plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066") @@ -42,13 +42,6 @@ metadata$user_id = as.factor(metadata$user_id) metadata$group_id = as.factor(metadata$group_id) plotJobs = function(jobs){ - # plot details about the jobs of a given algorithm - tbl = jobData[jobData$jobid %in% jobs,] - print(summary(tbl)) - #print(tbl) - md = metadata[metadata$jobid %in% jobs,] - print(summary(md)) - # print the job timelines r = e[ordered, ] @@ -61,8 +54,9 @@ plotJobs = function(jobs){ # Store the job ids in a table, each column is one algorithm dim = length(levels(data$alg_name)) count = 100 -result = matrix(1:(dim*count), nrow=count, ncol=dim) +result = matrix(1:(dim*count), nrow=count, ncol=dim) # will contain the job ids for the count best jobs colnames(result) = levels(data$alg_name) +result.userid = tibble() # will contain the userid for the count best jobs # Extract the 100 most similar jobs into the table for (level in levels(data$alg_name)){ @@ -74,9 +68,31 @@ for (level in levels(data$alg_name)){ # Extract the data for the jobs jobs = 
e[ordered,"jobid"] result[, level] = jobs + + # extract details about the jobs of a given algorithm + tbl = jobData[jobData$jobid %in% jobs,] + print(summary(tbl)) + md = metadata[metadata$jobid %in% jobs,] + print(summary(md)) + md$value = 1 + userprofile = md %>% group_by(user_id) %>% summarise(count = sum(value)) + userprofile = userprofile[order(userprofile$count, decreasing=TRUE),] + userprofile$userrank = 1:nrow(userprofile) + result.userid = rbind(result.userid, cbind(level, userprofile)) + plotJobs(jobs) } +colnames(result.userid) = c("alg_name", "user_id", "count", "userrank") + +print(result.userid) + +# Create stacked user table +ggplot(result.userid, aes(fill=userrank, y=count, x=alg_name)) + geom_bar(position="stack", stat="identity") + theme(legend.position = "none") + scale_fill_gradientn(colours=rainbow(5)) + ylab("Stacked user count") + xlab("Algorithm") # + scale_fill_gradient(low="blue", high="red", space ="Lab" ) + scale_fill_continuous(type = "viridis") + +ggsave("user-ids.png", width=6, height=4) + + # Compute intersection in a new table res.intersect = matrix(1:(dim*dim), nrow=dim, ncol=dim) colnames(res.intersect) = levels(data$alg_name)