User info added.

2020-08-20 20:39:42 +01:00 · 2020-08-20 20:39:42 +01:00 · aa86690f43
commit aa86690f43
parent a0e9ed9fb1
2 changed files with 56 additions and 12 deletions
--- a/paper/main.tex
+++ b/paper/main.tex
@ -244,9 +244,37 @@ Potentially, analyze how the rankings of different similarities look like.
 \subsection{Quantitative Analysis of Selected Jobs}
-\begin{table}
+
-\caption{User and Group Information}
+The user count and the group count are the same, meaning that a user is likely from the same group and the number of (unique) groups is identical to the number of (unique) users; for Job-L, the user and group counts differ a bit, and for Job-M a bit more.
-\end{table}
+There are up to about twice as many users as groups.
 To understand how the Top\,100 jobs are distributed across users, the data is grouped by userid and counted.
 \Cref{fig:userids} shows the stacked user information, where the lowest stack is the user with the most jobs and the topmost user in the stack has the smallest number of jobs.
 For Job-S, we can see that about 70--80\% of jobs stem from one user; for the hex\_lev and hex\_native algorithms, the other jobs stem from a second user, while bin includes jobs from additional users (5 in total).
 For Job-M, jobs from more users are included (13); about 25\% of jobs stem from the same user. Here, hex\_lev and hex\_native include more users (30 and 33, respectively) than the other three algorithms.
 For Job-L, the two hex algorithms include, with 12 and 13 users, a slightly more diverse user community than the bin algorithms (9), but hex\_phases covers 35 users.
 \begin{figure}
 \begin{subfigure}{0.31\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_4296426-out/user-ids}
 \caption{Job-S} \label{fig:users-job-S}
 \end{subfigure}
 \begin{subfigure}{0.31\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/user-ids}
 \caption{Job-M} \label{fig:users-job-M}
 \end{subfigure}
 \begin{subfigure}{0.31\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/user-ids}
 \caption{Job-L} \label{fig:users-job-L}
 \end{subfigure}
 \caption{User information for each job}
 \label{fig:userids}
 \end{figure}
 \begin{figure}
 \begin{subfigure}{0.31\textwidth}
--- a/scripts/plot.R
+++ b/scripts/plot.R
@ -4,7 +4,7 @@ library(ggplot2)
 library(dplyr)
 require(scales)
-plotjobs = TRUE
+plotjobs = FALSE
 # Color scheme
 plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066")
@ -42,13 +42,6 @@ metadata$user_id = as.factor(metadata$user_id)
 metadata$group_id = as.factor(metadata$group_id)
 plotJobs = function(jobs){
    # plot details about the jobs of a given algorithm
    tbl = jobData[jobData$jobid %in% jobs,]
    print(summary(tbl))
    #print(tbl)
    md = metadata[metadata$jobid %in% jobs,]
    print(summary(md))
    # print the job timelines
    r = e[ordered, ]
@ -61,8 +54,9 @@ plotJobs = function(jobs){
 # Store the job ids in a table, each column is one algorithm
 dim = length(levels(data$alg_name))
 count = 100
-result = matrix(1:(dim*count), nrow=count, ncol=dim)
+result = matrix(1:(dim*count), nrow=count, ncol=dim) # will contain the job ids for the count best jobs
 colnames(result) = levels(data$alg_name)
 result.userid = tibble() # will contain the userid for the count best jobs
 # Extract the 100 most similar jobs into the table
 for (level in levels(data$alg_name)){
@ -74,9 +68,31 @@ for (level in levels(data$alg_name)){
    # Extract the data for the jobs
    jobs = e[ordered,"jobid"]
    result[, level] = jobs
    # extract details about the jobs of a given algorithm
    tbl = jobData[jobData$jobid %in% jobs,]
    print(summary(tbl))
    md = metadata[metadata$jobid %in% jobs,]
    print(summary(md))
    md$value = 1
    userprofile = md %>% group_by(user_id) %>% summarise(count = sum(value))
    userprofile = userprofile[order(userprofile$count, decreasing=TRUE),]
    userprofile$userrank = 1:nrow(userprofile)
    result.userid = rbind(result.userid, cbind(level, userprofile))
    plotJobs(jobs)
 }
 # Name the columns of the accumulated per-algorithm user table and print it
 colnames(result.userid) = c("alg_name", "user_id", "count", "userrank")
 print(result.userid)
 # Create stacked bar chart of the user table: one bar per algorithm (x), with
 # the per-user job counts (y) stacked and filled by userrank; legend is hidden
 ggplot(result.userid, aes(fill=userrank, y=count, x=alg_name)) + geom_bar(position="stack", stat="identity") + theme(legend.position = "none") + scale_fill_gradientn(colours=rainbow(5)) + ylab("Stacked user count") + xlab("Algorithm") # + scale_fill_gradient(low="blue", high="red", space ="Lab" ) + scale_fill_continuous(type = "viridis")
 # No plot argument is passed, so ggsave() uses its default (the last plot)
 ggsave("user-ids.png", width=6, height=4)
 # Compute intersection in a new table
 res.intersect = matrix(1:(dim*dim), nrow=dim, ncol=dim)
 colnames(res.intersect) = levels(data$alg_name)