From 16750beb39f8aacf34ad58dd831960b65aba3e12 Mon Sep 17 00:00:00 2001
From: "Julian M. Kunkel" <juliankunkel@googlemail.com>
Date: Thu, 27 Aug 2020 16:11:08 +0100
Subject: [PATCH] Nai

---
 paper/main.tex               | 86 ++++++++++++++++++++----------------
 scripts/extract-conf-data.sh |  4 ++
 2 files changed, 52 insertions(+), 38 deletions(-)

diff --git a/paper/main.tex b/paper/main.tex
index f11ff93..9edd2a0 100644
--- a/paper/main.tex
+++ b/paper/main.tex
@@ -454,8 +454,8 @@ From this analysis, we conclude that one representative from binary quantization
 
 To verify the suitability of the similarity metrics, for each algorithm, we investigated the timelines of all Top\,100 jobs.
 We subjectively found that the approach works very well and identifies suitable similar jobs.
-To demonstrate this, we include a selection of job timelines -- typically Rank\,2, Rank\,15, and Rank\,100 --  and selected interesting job profiles.
-
+To demonstrate this, we include a selection of job timelines -- typically Rank\,2, Rank\,15, and Rank\,100 -- and selected interesting job profiles.
+These can be visually and subjectively compared to our reference jobs shown in \Cref{fig:refJobs}.
 
 \subsection{Job-S}
 
@@ -491,12 +491,12 @@ For Job-S, we found that all algorithms work similarly well and, therefore, omit
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_4296426-out/bin_aggzeros-0.6923--76timeseries4235560}
-\caption{Non-cmor job: Rank\,76, SIM=0.69}
+\caption{Non-cmor job: Rank\,76, SIM=69\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_4296426-out/bin_aggzeros-0.8077--4timeseries4483904}
-\caption{Non-control job: Rank\,4, SIM=0.81}
+\caption{Non-control job: Rank\,4, SIM=81\%}
 \end{subfigure}
 
 \caption{Job-S: jobs with different job names when using bin\_aggzeros}
@@ -508,17 +508,17 @@ For Job-S, we found that all algorithms work similarly well and, therefore, omit
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_lev-0.9615--1timeseries4296288}
-\caption{Rank 2, SIM=0.9615}
+\caption{Rank 2, SIM=96\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_lev-0.9012--15timeseries4296277}
-\caption{Rank 15, SIM=0.9017}
+\caption{Rank 15, SIM=90\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_lev-0.7901--99timeseries4297842}
-\caption{Rank\,100, SIM=0.790}
+\caption{Rank\,100, SIM=79\%}
 \end{subfigure}
 
 \caption{Job-S with Hex-Lev, selection of similar jobs}
@@ -571,6 +571,14 @@ For Job-S, we found that all algorithms work similarly well and, therefore, omit
 
 \subsection{Job-M}
 
+Inspecting the Top\,100 for this reference job highlights the differences between the algorithms.
+
+All algorithms identify a diverse range of job names for this reference job in the Top\,100.
+Firstly, the name of the reference job appears 30 times in the whole dataset, so this type of job is not necessarily executed frequently and, therefore, our Top\,100 is expected to contain other names.
+Some applications are more prominent in these sets, e.g., for bin\_aggzero, 32\,jobs contain WRF (a model) in the name.
+The number of unique names is 19, 38, 49, and 51 for bin\_aggzero, hex\_phases, hex\_native, and hex\_lev, respectively.
+
+The jobs that are similar according to the bin algorithms differ from our expectation.
 
 
 
@@ -578,17 +586,17 @@ For Job-S, we found that all algorithms work similarly well and, therefore, omit
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/bin_aggzeros-0.7755--1timeseries8010306}
-\caption{Rank 2, $SIM=$}
+\caption{Rank\,2, SIM=78\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/bin_aggzeros-0.7347--14timeseries4498983}
-\caption{$SIM=$}
+\caption{Rank\,15, SIM=73\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/bin_aggzeros-0.5102--99timeseries5120077}
-\caption{$SIM=$ }
+\caption{Rank\,100, SIM=51\% }
 \end{subfigure}
 
 \caption{Job-M with Bin-Aggzero, selection of similar jobs}
@@ -601,21 +609,21 @@ For Job-S, we found that all algorithms work similarly well and, therefore, omit
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_lev-0.9546--1timeseries7826634}
-\caption{Rank 2, $SIM=$}
+\caption{Rank\,2, SIM=95\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_lev-0.9365--2timeseries5240733}
-\caption{Rank 3, $SIM=$}
+\caption{Rank\,3, SIM=94\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_lev-0.7392--15timeseries7651420}
-\caption{$SIM=$}
+\caption{Rank\,15, SIM=74\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_lev-0.7007--99timeseries8201967}
-\caption{$SIM=$ }
+\caption{Rank\,100, SIM=70\%}
 \end{subfigure}
 
 \caption{Job-M with hex\_lev, selection of similar jobs}
@@ -628,21 +636,21 @@ For Job-S, we found that all algorithms work similarly well and, therefore, omit
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_native-0.9878--1timeseries5240733}
-\caption{Rank 2, $SIM=$}
+\caption{Rank 2, SIM=99\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_native-0.9651--2timeseries7826634}
-\caption{Rank 3, $SIM=$}
+\caption{Rank 3, SIM=97\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_native-0.9084--14timeseries8037817}
-\caption{$SIM=$}
+\caption{Rank 15, SIM=91\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_native-0.8838--99timeseries7571967}
-\caption{$SIM=$ }
+\caption{Rank 100, SIM=88\%}
 \end{subfigure}
 
 \caption{Job-M with hex\_native, selection of similar jobs}
@@ -654,21 +662,21 @@ For Job-S, we found that all algorithms work similarly well and, therefore, omit
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_phases-0.8831--1timeseries7826634}
-\caption{Rank 2, $SIM=$}
+\caption{Rank 2, SIM=88\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_phases-0.7963--2timeseries5240733}
-\caption{Rank 3, $SIM=$}
+\caption{Rank 3, SIM=80\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_phases-0.4583--14timeseries4244400}
-\caption{$SIM=$}
+\caption{Rank 15, SIM=46\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_phases-0.2397--99timeseries7644009}
-\caption{$SIM=$ }
+\caption{Rank 100, SIM=24\%}
 \end{subfigure}
 
 \caption{Job-M with hex\_phases, selection of similar jobs}
@@ -677,26 +685,28 @@ For Job-S, we found that all algorithms work similarly well and, therefore, omit
 
 \subsection{Job-L}
 
+For the bin algorithms, the inspection of job names (14 unique names) leads to two prominent applications: bash and xmessy with 45 and 48 instances, respectively.
+The hex algorithms identify a more diverse set of applications (18 unique names), with no xmessy job, and the hex\_phases algorithm has 85 unique names.
 
 \begin{figure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/bin_aggzeros-0.1671--1timeseries7869050}
-\caption{Rank 2, $SIM=$}
+\caption{Rank 2, SIM=17\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/bin_aggzeros-0.1671--2timeseries7990497}
-\caption{Rank 3, $SIM=$}
+\caption{Rank 3, SIM=17\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/bin_aggzeros-0.1521--14timeseries8363584}
-\caption{$SIM=$}
+\caption{Rank 15, SIM=15\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/bin_aggzeros-0.1097--97timeseries4262983}
-\caption{$SIM=$ }
+\caption{Rank 100, SIM=11\%}
 \end{subfigure}
 
 \caption{Job-L with bin\_aggzero, selection of similar jobs}
@@ -708,21 +718,21 @@ For Job-S, we found that all algorithms work similarly well and, therefore, omit
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_lev-0.9386--1timeseries7266845}
-\caption{Rank 2, $SIM=$}
+\caption{Rank 2, SIM=94\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_lev-0.9375--2timeseries7214657}
-\caption{Rank 3, $SIM=$}
+\caption{Rank 3, SIM=94\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_lev-0.7251--14timeseries4341304}
-\caption{$SIM=$}
+\caption{Rank 15, SIM=73\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_lev-0.1657--99timeseries8036223}
-\caption{$SIM=$ (30s)}
+\caption{Rank 100, SIM=17\%}
 \end{subfigure}
 
 \caption{Job-L with hex\_lev, selection of similar jobs}
@@ -734,21 +744,21 @@ For Job-S, we found that all algorithms work similarly well and, therefore, omit
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_native-0.9390--1timeseries7266845}
-\caption{Rank 2, $SIM=$}
+\caption{Rank 2, SIM=94\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_native-0.9333--2timeseries7214657}
-\caption{Rank 3, $SIM=$}
+\caption{Rank 3, SIM=93\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_native-0.8708--14timeseries4936553}
-\caption{$SIM=$}
+\caption{Rank 15, SIM=87\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_native-0.1695--99timeseries7942052}
-\caption{$SIM=$ }
+\caption{Rank 100, SIM=17\%}
 \end{subfigure}
 
 \caption{Job-L with hex\_native, selection of similar jobs}
@@ -759,21 +769,21 @@ For Job-S, we found that all algorithms work similarly well and, therefore, omit
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_phases-1.0000--14timeseries4577917}
-\caption{Rank 2, $SIM=$}
+\caption{Rank 2, SIM=100\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_phases-1.0000--1timeseries4405671}
-\caption{Rank 3, $SIM=$}
+\caption{Rank 3, SIM=100\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_phases-1.0000--2timeseries4621422}
-\caption{$SIM=$}
+\caption{Rank 15, SIM=100\%}
 \end{subfigure}
 \begin{subfigure}{0.3\textwidth}
 \centering
 \includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_phases-1.0000--99timeseries4232293}
-\caption{$SIM=$ }
+\caption{Rank 100, SIM=100\%}
 \end{subfigure}
 
 \caption{Job-L with hex\_phases, selection of similar jobs}
diff --git a/scripts/extract-conf-data.sh b/scripts/extract-conf-data.sh
index f21846d..298fb6f 100755
--- a/scripts/extract-conf-data.sh
+++ b/scripts/extract-conf-data.sh
@@ -10,5 +10,9 @@ fi
 for I in $@ ; do
   DATA=$(grep $I datasets/job_metadata.csv | cut -d "," -f 7-)
   echo -n $I,
+  if [[ "$DATA" == "" ]] ; then
+    echo "No data found"
+    continue
+  fi
   grep $DATA datasets/job_metadata_confidential.csv | cut -d "," -f 1-5
 done