eugen.betke 2020-09-02 13:46:52 +02:00
commit 6621d2e692
13 changed files with 294 additions and 132 deletions

View File

@ -56,8 +56,9 @@ import pandas as pd
if __name__ == '__main__':
FNS = [
'job_codings_v3_confidential.csv',
'job_metadata_confidential.csv',
'job_codings_v4_confidential.csv',
#'job_codings_v3_confidential.csv',
#'job_metadata_confidential.csv',
]
for in_fn in FNS:

View File

@ -4,7 +4,7 @@ filenames=( $(ls job_codings_v*.csv) )
filenames=( ${filenames[@]} "clustering_progress.csv" )
filenames=( ${filenames[@]} $(ls job_metadata*.csv) )
filenames=( ${filenames[@]} $( ls job_similarities_*.csv ) )
filenames=( ${filenames[@]} $( ls progress_individual_*.csv ) )
filenames=( ${filenames[@]} $( ls sim_computation_times_*.csv ) )
echo "${filenames[*]}"

View File

@ -1,8 +1,24 @@
#!/bin/bash
filenames=$( ls *.tar.xz )
comm="new" # decompress only new files
for filename in ${filenames[@]}; do
echo "Decompressing ${filename}"
tar -xJf "${filename}"
if [[ $# -eq 1 ]]; then
comm="$1"
fi
echo $comm
source_filenames=$( ls *.tar.xz )
for source_filename in ${source_filenames[@]}; do
target_filename=$(basename $source_filename)
target_filename="${target_filename%.*}" # remove .xz extension
target_filename=$(basename $target_filename)
target_filename="${target_filename%.*}" # remove .tar extension
if [ "$comm" == "all" ] || [ ! -f $target_filename ]; then
echo "Decompressing ${source_filename}"
tar -xJf "${source_filename}"
else
echo "Skipping decompression of ${source_filename}"
fi
done

View File

@ -2,7 +2,7 @@
\documentclass[]{llncs}
\usepackage{todonotes}
\newcommand{\eb}[1]{\todo[inline]{(EB): #1}}
\newcommand{\eb}[1]{\todo[inline, color=green]{EB: #1}}
\newcommand{\jk}[1]{\todo[inline]{JK: #1}}
\usepackage{silence}
@ -11,6 +11,13 @@
\WarningFilter{caption}{Unsupported}
\WarningFilter{caption}{Unknown document}
\usepackage{changes}
\definechangesauthor[name=Betke, color=blue]{eb}
\newcommand{\ebrep}[2]{\replaced[id=eb]{#1}{#2}}
\newcommand{\ebadd}[1]{\added[id=eb]{#1}}
\newcommand{\ebdel}[1]{\deleted[id=eb]{#1}}
\newcommand{\ebcom}[1]{\comment[id=eb]{#1}}
\let\spvec\vec
\let\vec\accentvec
\usepackage{amsmath}
@ -27,7 +34,6 @@
\usepackage[listings,skins,breakable,raster,most]{tcolorbox}
\usepackage{caption}
\lstset{
numberbychapter=false,
belowskip=-10pt,
@ -63,9 +69,10 @@
\usepackage{cleveref}
\crefname{codecount}{Code}{Codes}
\title{Using Machine Learning to Identify Similar Jobs Based on their IO Behavior}
\title{A Workflow for Identifying Jobs with Similar I/O Behavior by Analyzing the Timeseries}
\author{Julian Kunkel\inst{2} \and Eugen Betke\inst{1}}
\institute{
University of Reading--%
\email{j.m.kunkel@reading.ac.uk}%
@ -162,7 +169,7 @@ For example, we can see in \Cref{fig:job-S}, that several metrics increase in Se
\begin{subfigure}{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{job-timeseries4296426}
\caption{Job-S} \label{fig:job-S}
\caption{Job-S (runtime=15,551\,s, segments=25)} \label{fig:job-S}
\end{subfigure}
\centering
@ -170,7 +177,7 @@ For example, we can see in \Cref{fig:job-S}, that several metrics increase in Se
\begin{subfigure}{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{job-timeseries5024292}
\caption{Job-M} \label{fig:job-M}
\caption{Job-M (runtime=28,828\,s, segments=48)} \label{fig:job-M}
\end{subfigure}
\centering
@ -200,7 +207,7 @@ For example, we can see in \Cref{fig:job-S}, that several metrics increase in Se
To measure the performance for computing the similarity to the reference jobs, the algorithms are executed 10 times on a compute node at DKRZ.
A boxplot for the runtimes is shown in \Cref{fig:performance}.
The runtime is normalized for 100k seconds, i.e., for bin\_all it takes about 41\,s to process 100k jobs out of the 500k total jobs that this algorithm will process.
The runtime is normalized for 100k jobs, i.e., for bin\_all it takes about 41\,s to process 100k jobs out of the 500k total jobs that this algorithm will process.
Generally, the bin algorithms are fastest, while the hex algorithms often take 4-5x as long.
Hex\_phases is slow for Job-S and Job-M while it is fast for Job-L; the reason is that just one phase is extracted for Job-L.
The Levenshtein-based algorithms take longer for longer jobs -- proportional to the job length, as they apply a sliding window.
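As an illustration of the normalization, consider a minimal Python sketch; the (algorithm, jobs, elapsed) tuples are placeholder values, not the measured DKRZ results:

# Sketch: convert a measured elapsed time into the reported "seconds per 100k jobs".
# The values below are placeholders for illustration only.
measurements = [
    ("bin_all", 500_000, 205.0),
    ("hex_lev", 500_000, 980.0),
]
for alg, jobs, elapsed in measurements:
    per_100k = elapsed / jobs * 100_000
    print(f"{alg}: {per_100k:.1f} s per 100k jobs")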
@ -213,17 +220,17 @@ We believe this will then allow a near-online analysis of a job.
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{progress_4296426-out-boxplot}
\caption{Job-S (runtime=15,551\,s, segments=25)} \label{fig:perf-job-S}
\caption{Job-S (segments=25)} \label{fig:perf-job-S}
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{progress_5024292-out-boxplot}
\caption{Job-M (runtime=28,828\,s, segments=48)} \label{fig:perf-job-M}
\caption{Job-M (segments=48)} \label{fig:perf-job-M}
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{progress_7488914-out-boxplot}
\caption{Job-L} \label{fig:perf-job-L}
\caption{Job-L (segments=400)} \label{fig:perf-job-L}
\end{subfigure}
\caption{Runtime of the algorithms to compute the similarity to reference jobs}
@ -241,13 +248,14 @@ The different algorithms lead to different curves for our reference jobs, e.g.,
% This indicates that the algorithms
The support team in a data center may have time to investigate the most similar jobs.
Time for the analysis is typically bound, for instance, the team may analyze the 100 most similar ranked jobs (the Top\,100).
Time for the analysis is typically bounded; for instance, the team may analyze the 100 most similar ranked jobs. We refer to them as the Top\,100 jobs, and Rank\,i refers to the job that has the i-th highest similarity to the reference job -- sometimes these values can be rather close together, as we see in the following histogram.
In \Cref{fig:hist}, the histograms with the actual number of jobs for a given similarity are shown.
As we focus on a feasible number of jobs, the diagram should be read from right (100\% similarity) to left; for each bin, we show at most 100 jobs (the total number is still given).
It turns out that both BIN algorithms produce nearly identical histograms, and we omit one of them.
In the figures, we can see again a different behavior of the algorithms depending on the reference job.
Especially for Job-S, we can see clusters with jobs of higher similarity (e.g., for hex\_lev at SIM=75\%), while for Job-M, the growth in the relevant section is more steady.
For Job-L, we find barely any similar jobs, except when using the HEX\_phases algorithm.
This algorithm finds 393 jobs that have a similarity of 100\%; thus, they are indistinguishable to the algorithm.
Practically, the support team would start with Rank\,1 (most similar job, presumably, the reference job itself) and walk down until the jobs look different, or until a cluster is analyzed.
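A minimal sketch of how such a capped histogram could be derived from a per-algorithm similarity table; the column names, bin width, and values are assumptions, not the plotting code used for the paper:

import pandas as pd

# One row per job with its similarity to the reference job (placeholder values).
df = pd.DataFrame({"jobid": [1, 2, 3, 4, 5], "similarity": [1.0, 0.98, 0.76, 0.52, 0.51]})
bins = pd.cut(df["similarity"], bins=[i / 10 for i in range(11)])
counts = df.groupby(bins)["jobid"].count()   # total jobs per similarity bin
shown = counts.clip(upper=100)               # display at most 100 jobs per bin
print(pd.DataFrame({"total": counts, "shown": shown}))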
@ -305,16 +313,43 @@ Practically, the support team would start with Rank\,1 (most similar job, presum
\subsubsection{Inclusivity and Specificity}
When analyzing the overall population of jobs executed on a system, we expect that some workloads are executed several times (with different inputs but with the same configuration) or are executed with slightly different configurations (e.g., node counts, timesteps).
Thus, potentially our similarity analysis of the job population may just identify the re-execution of the same workload.
Typically, the support staff would identify the re-execution of jobs by inspecting job names which are user-defined generic strings\footnote{%
As they can contain confidential data, it is difficult to anonymize them without perturbing the meaning.
Therefore, they are not published in our data repository.
}
User count and group count are the same, meaning that a user is likely from the same group and the number of groups is identical to the number of unique users; for Job-L, user and group counts differ a bit, for Job-M a bit more.
There are up to about 2x more users than groups.
To understand if the analysis is inclusive and identifies different applications, we use two approaches with our Top\,100 jobs:
We explore the distribution of users (and groups), runtime, and node count across jobs.
The algorithms should include jobs from different users, with different node counts, and across a range of runtimes.
To confirm the hypotheses presented, we analyzed the job metadata, comparing job names, which validates the quantitative results discussed in the following.
\paragraph{User distribution.}
To understand how the Top\,100 are distributed across users, the data is grouped by userid and counted.
\Cref{fig:userids} shows the stacked user information, where the lowest stack is the user with the most jobs and the topmost user in the stack has the smallest number of jobs.
For Job-S, we can see that about 70-80\% of jobs stem from one user, for the hex\_lev and hex\_native algorithms, the other jobs stem from a second user while bin includes jobs from additional users (5 in total).
For Job-M, jobs from more users are included (13); about 25\% of jobs stem from the same user; here, hex\_lev and hex\_native include more users (30 and 33, respectively) than the other three algorithms.
For Job-L, the two hex algorithms include a somewhat more diverse user community (12 and 13 users) than the bin algorithms (9), but hex\_phases covers 35 users.
We didn't include the group analysis in the figure as user and group counts are proportional; at most, the number of users is 2x the number of groups.
Thus, a user is likely from the same group and the number of groups is similar to the number of unique users.
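The grouping step can be sketched as follows; the data frame and its columns are assumptions that mirror the published metadata layout:

import pandas as pd

# Metadata of the Top 100 jobs for one algorithm (placeholder user ids).
top100 = pd.DataFrame({"jobid": range(6),
                       "user_id": ["u1", "u1", "u1", "u2", "u2", "u3"]})
per_user = top100.groupby("user_id").size().sort_values(ascending=False)
print(per_user)       # jobs per user, largest first (the lowest stack in the plot)
print(len(per_user))  # number of distinct users included by the algorithm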
\paragraph{Node distribution.}
All algorithms reduce over the node dimension; therefore, we naturally expect broad inclusion across the node range -- as long as the average I/O behavior of the jobs is similar.
\Cref{fig:nodes-job} shows a boxplot for the node counts in the Top\,100 -- the red line marks the reference job.
For Job-M and Job-L, we can observe that the node counts of similar jobs indeed range between 1 and 128.
For Job-S, all 100 top-ranked jobs use one node.
As post-processing jobs typically use one node and make up a high proportion of all jobs, it appears natural that all Top\,100 jobs are from this class, which is confirmed by investigating the job metadata.
The boxplots have different shapes, which is an indication that the different algorithms identify different sets of jobs -- we will analyze this further later.
\paragraph{Runtime distribution.}
The \added{job} runtime of the Top\,100 jobs is shown using boxplots in \Cref{fig:runtime-job}.
While all algorithms can compute the similarity between jobs of different length, the bin algorithms and hex\_native penalize jobs of different length, preferring jobs of very similar length.
For Job-M and Job-L, hex\_phases is able to identify much shorter or longer jobs.
For Job-L, the job itself isn't included in the chosen Top\,100 (see \Cref{fig:hist-job-L}, 393 jobs have a similarity of 100\%), which is the reason why the job runtime isn't shown in the figure itself.
\begin{figure}
\begin{subfigure}{0.31\textwidth}
\centering
@ -333,7 +368,7 @@ For Job-L, the two hex algorithms include with (12 and 13) a bit more diverse us
\end{subfigure}
\caption{User information for all 100 top ranked jobs}
\caption{User information for all 100 top-ranked jobs}
\label{fig:userids}
\end{figure}
@ -354,7 +389,7 @@ For Job-L, the two hex algorithms include with (12 and 13) a bit more diverse us
\caption{Job-L (reference job runs on 20 nodes)} \label{fig:nodes-job-L}
\end{subfigure}
\centering
\caption{Distribution of node counts (for Job-S nodes=1 in all cases)}
\caption{Distribution of node counts (for Job-S nodes=1 in all cases)}
\label{fig:nodes-job}
\end{figure}
@ -362,32 +397,34 @@ For Job-L, the two hex algorithms include with (12 and 13) a bit more diverse us
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/jobs-elapsed}
\caption{Job-S ($job=10^{4.19}$)} \label{fig:runtime-job-S}
\caption{Job-S ($job=15,551s$)} \label{fig:runtime-job-S}
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/jobs-elapsed}
\caption{Job-M ($job=10^{4.46}$)} \label{fig:runtime-job-M}
\caption{Job-M ($job=28,828s$)} \label{fig:runtime-job-M}
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/jobs-elapsed}
\caption{Job-L ($job=10^{5.3}$)} \label{fig:runtime-job-L}
\caption{Job-L ($job=240ks$)} \label{fig:runtime-job-L}
\end{subfigure}
\centering
\caption{Distribution of runtime for all 100 top ranked jobs}
\caption{Distribution of runtime for all 100 top-ranked jobs}
\label{fig:runtime-job}
\end{figure}
To see how different the algorithms behave, the intersection of two algorithms is computed for the 100 jobs with the highest similarity and visualized in \Cref{fig:heatmap-job}.
As expected, we can observe that bin\_all and bin\_aggzeros are very similar for all three jobs.
\subsubsection{Algorithmic differences}
To verify that the different algorithms behave differently, the intersection for the Top\,100 is computed for all combinations of algorithms and visualized in \Cref{fig:heatmap-job}.
As expected, we can observe that bin\_all and bin\_aggzeros are very similar for all three jobs.
While there is some reordering, both algorithms lead to a comparable order.
The hex\_lev and hex\_native algorithms also exhibit some overlap, particularly for Job-S and Job-L.
For Job-M, however, they lead to a different ranking and Top\,100.
From the analysis, we conclude that one representative from binary quantization is sufficient while the other algorithms identify mostly disjoint behavioral aspects and, therefore, should be considered together.
From this analysis, we conclude that one representative from binary quantization is sufficient, as it generates very similar results, while the other algorithms identify mostly disjoint behavioral aspects and, therefore, should be analyzed individually.
\eb{Is this a general statement: ``one representative from binary quantization is sufficient''? If so, it is very vague. It could be coincidence.}
\jk{Rewrote this a bit. Certainly yes. They are just very similar.}
One consideration is to identify jobs that meet a rank threshold for all different algorithms.
\jk{TODO}
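A sketch of the pairwise overlap computation behind the heatmap; the algorithm names follow the paper, but the job-id sets are placeholders:

# Top 100 job ids per algorithm (placeholder ids for illustration).
top100 = {
    "bin_all": {1, 2, 3, 4},
    "bin_aggzeros": {1, 2, 3, 5},
    "hex_lev": {2, 6, 7, 8},
}
for a in sorted(top100):
    for b in sorted(top100):
        overlap = len(top100[a] & top100[b])  # intersection size shown in one heatmap cell
        print(f"{a:>12} vs {b:>12}: {overlap}")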
\begin{figure}
\begin{subfigure}{0.31\textwidth}
@ -407,91 +444,141 @@ One consideration is to identify jobs that meet a rank threshold for all differe
\end{subfigure}
\centering
\caption{Intersection of the 100 top ranked jobs for different algorithms}
\caption{Intersection of the 100 top-ranked jobs for different algorithms}
\label{fig:heatmap-job}
\end{figure}
%%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%%
\section{Assessing Timelines for Similar Jobs}
To verify the suitability of the similarity metrics, for each algorithm, we investigated the timelines of all Top\,100 jobs.
We subjectively found that the approach works very well and identifies suitable similar jobs.
To demonstrate this, we include a selection of job timelines -- typically Rank\,2, Rank\,15, and Rank\,100, and selected interesting job profiles.
These can be visually and subjectively compared to our reference jobs shown in \Cref{fig:refJobs}.
\subsection{Job-S}
This job represents post-processing (CMORization), which is a typical step.
It is executed for different simulations and variables across timesteps.
The job name of Job-S suggests that it is applied to the control variable.
In the metadata, we found 22,580 jobs with “cmor” in the name, of which 367 jobs mention “control”.
The bin algorithms identify one job whose name doesn't include “cmor”;
all other algorithms identify only “cmor” jobs, and 26-38 of these jobs are applied to “control” (see \Cref{tbl:control-jobs}).
A selection of job timelines is given in \Cref{fig:job-S-hex-lev}; all of these jobs are jobs on control variables.
The single non-cmor job and a high-ranked non-control cmor job are shown in \Cref{fig:job-S-bin-agg}.
While we cannot visually see much difference between these two jobs and the cmor jobs processing the control variables, the algorithms indicate that jobs processing the control variables must be more similar, as they appear much more frequently in the Top\,100 jobs than among all jobs labeled with “cmor”.
For Job-S, we found that all algorithms work similarly well and, therefore, omit further timelines.
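The name-based check can be sketched as below; the job names are invented placeholders, as the real names are confidential and not part of the data repository:

import pandas as pd

# Top 100 metadata for one algorithm with invented job names.
top100 = pd.DataFrame({"jobid": [1, 2, 3],
                       "job_name": ["cmor_control_a", "cmor_other_b", "postproc"]})
is_cmor = top100["job_name"].str.contains("cmor")
is_control = top100["job_name"].str.contains("control")
print("cmor jobs:", int(is_cmor.sum()))
print("control jobs:", int((is_cmor & is_control).sum()))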
\begin{table}
\centering
\begin{tabular}{r|r}
Algorithm & Jobs \\ \hline
bin\_aggzeros & 38 \\
bin\_all & 38 \\
hex\_lev & 33 \\
hex\_native & 26 \\
hex\_phases & 33
\end{tabular}
\caption{Job-S: number of jobs with “control” in their name in the Top-100}
\label{tbl:control-jobs}
\end{table}
\begin{figure}
\centering
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/bin_aggzeros-0.6923--76timeseries4235560}
\caption{Non-cmor job: Rank\,76, SIM=69\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/bin_aggzeros-0.8077--4timeseries4483904}
\caption{Non-control job: Rank\,4, SIM=81\%}
\end{subfigure}
\caption{Job-S: jobs with different job names when using bin\_aggzeros}
\label{fig:job-S-bin-agg}
\end{figure}
\begin{figure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_lev-0.9615--1timeseries4296288}
\caption{Rank 2, SIM=0.9615}
\caption{Rank 2, SIM=96\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_lev-0.9012--15timeseries4296277}
\caption{Rank 15, SIM=0.9017}
\caption{Rank 15, SIM=90\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_lev-0.7901--99timeseries4297842}
\caption{Rank\,100, SIM=0.790}
\caption{Rank\,100, SIM=79\%}
\end{subfigure}
\caption{Job-S with Hex-Lev, selection of similar jobs}
\label{fig:job-S-hex-lev}
\end{figure}
\begin{figure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_native-0.9808--1timeseries4296288}
\caption{Rank 2, SIM=98\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_native-0.9375--15timeseries4564296}
\caption{Rank 15, SIM=94\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_native-0.8915--99timeseries4296785}
\caption{Rank\,100, SIM=89\%}
\end{subfigure}
\caption{Job-S with Hex-Native, selection of similar jobs}
\label{fig:job-S-hex-native}
\end{figure}
% \begin{figure}
% \begin{subfigure}{0.3\textwidth}
% \centering
% \includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_native-0.9808--1timeseries4296288}
% \caption{Rank 2, SIM=}
% \end{subfigure}
% \begin{subfigure}{0.3\textwidth}
% \centering
% \includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_native-0.9375--15timeseries4564296}
% \caption{Rank 15, SIM=}
% \end{subfigure}
% \begin{subfigure}{0.3\textwidth}
% \centering
% \includegraphics[width=\textwidth]{job_similarities_4296426-out/hex_native-0.8915--99timeseries4296785}
% \caption{Rank\,100, SIM=}
% \end{subfigure}
% \caption{Job-S with Hex-Native, selection of similar jobs}
% \label{fig:job-S-hex-native}
% \end{figure}
%
% \ContinuedFloat
Hex\_phases is very similar to hex\_native.
Strange job to inspect: \verb|job_similarities_4296426-out/hex_phases-0.7429--93timeseries4237860|
Bin aggzeros works quite well here too. The jobs are a bit more diverse.
\begin{figure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/bin_aggzeros-0.8462--1timeseries4296280}
\caption{Rank 2, SIM=85\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/bin_aggzeros-0.7778--14timeseries4555405}
\caption{Rank 15, SIM=78\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/bin_aggzeros-0.6923--99timeseries4687419}
\caption{Rank\,100, SIM=69\%}
\end{subfigure}
\caption{Job-S with bin\_aggzero, selection of similar jobs}
\label{fig:job-S-bin-aggzeros}
\end{figure}
%
% \begin{figure}
% \begin{subfigure}{0.3\textwidth}
% \centering
% \includegraphics[width=\textwidth]{job_similarities_4296426-out/bin_aggzeros-0.8462--1timeseries4296280}
% \caption{Rank 2, SIM=}
% \end{subfigure}
% \begin{subfigure}{0.3\textwidth}
% \centering
% \includegraphics[width=\textwidth]{job_similarities_4296426-out/bin_aggzeros-0.7778--14timeseries4555405}
% \caption{Rank 15, SIM=}
% \end{subfigure}
% \begin{subfigure}{0.3\textwidth}
% \centering
% \includegraphics[width=\textwidth]{job_similarities_4296426-out/bin_aggzeros-0.6923--99timeseries4687419}
% \caption{Rank\,100, SIM=}
% \end{subfigure}
% \caption{Job-S with bin\_aggzero, selection of similar jobs}
% \label{fig:job-S-bin-aggzeros}
% \end{figure}
\subsection{Job-M}
Bin aggzero returns poor results here.
Inspecting the Top\,100 for this reference job highlights the differences between the algorithms.
All algorithms identify a diverse range of job names for this reference job in the Top\,100.
Firstly, the name of the reference job appears 30 times in the whole dataset, so this job type isn't necessarily executed frequently and, therefore, our Top\,100 is expected to contain other names.
Some applications are more prominent in these sets, e.g., for bin\_aggzero, 32\,jobs contain WRF (a model) in the name.
The number of unique names is 19, 38, 49, and 51 for bin\_aggzero, hex\_phases, hex\_native, and hex\_lev, respectively.
The jobs that are similar according to the bin algorithms differ from our expectation.
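Counting the unique names per algorithm can be sketched like this; the frame layout and the names are placeholders:

import pandas as pd

# Concatenated Top 100 metadata of several algorithms with invented job names.
top100 = pd.DataFrame({
    "alg_name": ["bin_aggzeros"] * 3 + ["hex_lev"] * 3,
    "job_name": ["wrf_run", "wrf_run", "post", "wrf_run", "analysis", "post"],
})
print(top100.groupby("alg_name")["job_name"].nunique())  # unique job names per algorithm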
@ -499,17 +586,17 @@ Bin aggzero liefert Mist zurück.
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/bin_aggzeros-0.7755--1timeseries8010306}
\caption{Rank 2, $SIM=$}
\caption{Rank\,2, SIM=78\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/bin_aggzeros-0.7347--14timeseries4498983}
\caption{$SIM=$}
\caption{Rank\,15, SIM=73\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/bin_aggzeros-0.5102--99timeseries5120077}
\caption{$SIM=$ }
\caption{Rank\,100, SIM=51\% }
\end{subfigure}
\caption{Job-M with Bin-Aggzero, selection of similar jobs}
@ -522,21 +609,21 @@ Bin aggzero liefert Mist zurück.
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_lev-0.9546--1timeseries7826634}
\caption{Rank 2, $SIM=$}
\caption{Rank\,2, SIM=95\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_lev-0.9365--2timeseries5240733}
\caption{Rank 3, $SIM=$}
\caption{Rank 3, SIM=94\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_lev-0.7392--15timeseries7651420}
\caption{$SIM=$}
\caption{Rank\,15, SIM=74\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_lev-0.7007--99timeseries8201967}
\caption{$SIM=$ }
\caption{Rank\,100, SIM=70\%}
\end{subfigure}
\caption{Job-M with hex\_lev, selection of similar jobs}
@ -549,21 +636,21 @@ Bin aggzero liefert Mist zurück.
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_native-0.9878--1timeseries5240733}
\caption{Rank 2, $SIM=$}
\caption{Rank 2, SIM=99\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_native-0.9651--2timeseries7826634}
\caption{Rank 3, $SIM=$}
\caption{Rank 3, SIM=97\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_native-0.9084--14timeseries8037817}
\caption{$SIM=$}
\caption{Rank 15, SIM=91\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_native-0.8838--99timeseries7571967}
\caption{$SIM=$ }
\caption{Rank 100, SIM=88\%}
\end{subfigure}
\caption{Job-M with hex\_native, selection of similar jobs}
@ -575,21 +662,21 @@ Bin aggzero liefert Mist zurück.
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_phases-0.8831--1timeseries7826634}
\caption{Rank 2, $SIM=$}
\caption{Rank 2, SIM=88\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_phases-0.7963--2timeseries5240733}
\caption{Rank 3, $SIM=$}
\caption{Rank 3, SIM=80\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_phases-0.4583--14timeseries4244400}
\caption{$SIM=$}
\caption{Rank 15, SIM=46\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/hex_phases-0.2397--99timeseries7644009}
\caption{$SIM=$ }
\caption{Rank 100, SIM=24\%}
\end{subfigure}
\caption{Job-M with hex\_phases, selection of similar jobs}
@ -598,26 +685,28 @@ Bin aggzero liefert Mist zurück.
\subsection{Job-L}
For the bin algorithms, the inspection of job names (14 unique names) leads to two prominent applications: bash and xmessy with 45 and 48 instances, respectively.
The hex algorithms identify a more diverse set of applications (18 unique names), with no xmessy job, and the hex\_phases algorithm has 85 unique names.
\begin{figure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/bin_aggzeros-0.1671--1timeseries7869050}
\caption{Rank 2, $SIM=$}
\caption{Rank 2, SIM=17\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/bin_aggzeros-0.1671--2timeseries7990497}
\caption{Rank 3, $SIM=$}
\caption{Rank 3, SIM=17\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\includegraphics[width=\textwidth]{job_similarities_7488914-out/bin_aggzeros-0.1521--14timeseries8363584}
\caption{$SIM=$}
\caption{Rank 15, SIM=15\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/bin_aggzeros-0.1097--97timeseries4262983}
\caption{$SIM=$ }
\caption{Rank 100, SIM=11\%}
\end{subfigure}
\caption{Job-L with bin\_aggzero, selection of similar jobs}
@ -629,21 +718,21 @@ Bin aggzero liefert Mist zurück.
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_lev-0.9386--1timeseries7266845}
\caption{Rank 2, $SIM=$}
\caption{Rank 2, SIM=94\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_lev-0.9375--2timeseries7214657}
\caption{Rank 3, $SIM=$}
\caption{Rank 3, SIM=94\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_lev-0.7251--14timeseries4341304}
\caption{$SIM=$}
\caption{Rank 15, SIM=73\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_lev-0.1657--99timeseries8036223}
\caption{$SIM=$ (30s)}
\caption{Rank 100, SIM=17\%}
\end{subfigure}
\caption{Job-L with hex\_lev, selection of similar jobs}
@ -655,21 +744,21 @@ Bin aggzero liefert Mist zurück.
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_native-0.9390--1timeseries7266845}
\caption{Rank 2, $SIM=$}
\caption{Rank 2, SIM=94\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_native-0.9333--2timeseries7214657}
\caption{Rank 3, $SIM=$}
\caption{Rank 3, SIM=93\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_native-0.8708--14timeseries4936553}
\caption{$SIM=$}
\caption{Rank 15, SIM=87\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_native-0.1695--99timeseries7942052}
\caption{$SIM=$ }
\caption{Rank 100, SIM=17\%}
\end{subfigure}
\caption{Job-L with hex\_native, selection of similar jobs}
@ -680,21 +769,21 @@ Bin aggzero liefert Mist zurück.
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_phases-1.0000--14timeseries4577917}
\caption{Rank 2, $SIM=$}
\caption{Rank 2, SIM=100\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_phases-1.0000--1timeseries4405671}
\caption{Rank 3, $SIM=$}
\caption{Rank 3, SIM=100\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_phases-1.0000--2timeseries4621422}
\caption{$SIM=$}
\caption{Rank 15, SIM=100\%}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/hex_phases-1.0000--99timeseries4232293}
\caption{$SIM=$ }
\caption{Rank 100, SIM=100\%}
\end{subfigure}
\caption{Job-L with hex\_phases, selection of similar jobs}
@ -707,5 +796,8 @@ Bin aggzero liefert Mist zurück.
\section{Conclusion}
\label{sec:summary}
One consideration could be to identify jobs that are found by all algorithms, i.e., jobs that meet a certain (rank) threshold for different algorithms.
That would increase the likelihood that these jobs are very similar and are what the user is looking for.
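A sketch of this idea, assuming each algorithm yields a ranked list of job ids; the lists are placeholders:

# Job ids ordered by decreasing similarity, one list per algorithm (placeholders).
rankings = {
    "bin_all": [10, 11, 12, 13],
    "hex_lev": [11, 10, 14, 12],
    "hex_phases": [12, 11, 10, 15],
}
threshold = 3  # keep jobs that rank within the Top 3 for every algorithm
candidates = set.intersection(*(set(r[:threshold]) for r in rankings.values()))
print(candidates)  # jobs all algorithms agree on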
%\printbibliography
\end{document}

View File

@ -12,11 +12,14 @@ function prepare(){
popd
for I in datasets/*.csv ; do
ln -s $I
if [ ! -e $(basename $I) ]; then
echo "Creating symlink $(basename $I)"
ln -s $I
fi
done
}
# prepare
prepare
for I in job_similarities_*.csv ; do
rm *.png *.pdf
@ -27,7 +30,7 @@ for I in job_similarities_*.csv ; do
rm $OUT/*
mv description.txt $OUT
fi
mv *.png *.pdf $OUT
mv *.png *.pdf jobs-*.txt $OUT
done
# analyze performance data

View File

@ -0,0 +1,18 @@
#!/bin/bash
if [[ ! -e datasets/job_metadata_confidential.csv ]] ; then
exit 0
fi
# This script extracts the actual usernames and job information
# As it is confidential information, we cannot include the files
for I in $@ ; do
DATA=$(grep $I datasets/job_metadata.csv | cut -d "," -f 7-)
echo -n $I,
if [[ "$DATA" == "" ]] ; then
echo "No data found"
continue
fi
grep $DATA datasets/job_metadata_confidential.csv | cut -d "," -f 1-5
done

View File

@ -7,8 +7,8 @@ require(scales)
data = read.csv("datasets/clustering_progress.csv")
e = data %>% filter(min_sim %in% c(0.1, 0.5, 0.99))
e$percent = paste("SIM =", as.factor(round(e$min_sim*100,0)), " %")
e = data %>% filter(sim_param %in% c(0.1, 0.5, 0.99))
e$percent = paste("SIM =", as.factor(round(e$sim_param*100,0)), " %")
# Development when adding more jobs
ggplot(e, aes(x=jobs_done, y=elapsed, color=alg_name)) + geom_point() + facet_grid(percent ~ .) + ylab("Cumulative runtime in s") + xlab("Jobs processed") + scale_y_log10() + theme(legend.position = "bottom")
@ -16,6 +16,6 @@ ggsave("fig/runtime-cummulative.png", width=6, height=4.5)
# Bar chart for the maximum
e = data %>% filter(jobs_done >= (jobs_total - 9998))
e$percent = as.factor(round(e$min_sim*100,0))
e$percent = as.factor(round(e$sim_param*100,0))
ggplot(e, aes(y=elapsed, x=percent, fill=alg_name)) + geom_bar(stat="identity") + facet_grid(. ~ alg_name, switch = 'y') + scale_y_log10() + theme(legend.position = "none") + ylab("Runtime in s") + xlab("Minimum similarity in %") + geom_text(aes(label = round(elapsed,0), angle = 90, y=0*(elapsed)+20))
ggsave("fig/runtime-overview.png", width=7, height=2)

View File

@ -55,8 +55,27 @@ def plot(prefix, header, row):
x = { h : d for (h, d) in zip(header, row)}
jobid = x["jobid"]
del x["jobid"]
del x["coding_abs"]
del x["coding_abs_aggzeros"]
del x["bcoding"]
# EB: Removing segment mean values
del x["mean_md_file_create"]
del x["mean_md_file_delete"]
del x["mean_md_mod"]
del x["mean_md_other"]
del x["mean_md_read"]
del x["mean_read_bytes"]
del x["mean_read_calls"]
del x["mean_write_bytes"]
del x["mean_write_calls"]
# EB: Renaming dict keys
x["md_file_create"] = x.pop("q16_md_file_create")
x["md_file_delete"] = x.pop("q16_md_file_delete")
x["md_mod"] = x.pop("q16_md_mod")
x["md_other"] = x.pop("q16_md_other")
x["md_read"] = x.pop("q16_md_read")
x["read_bytes"] = x.pop("q16_read_bytes")
x["read_calls"] = x.pop("q16_read_calls")
x["write_bytes"] = x.pop("q16_write_bytes")
x["write_calls"] = x.pop("q16_write_calls")
result = []
for k in x:
@ -123,7 +142,8 @@ def plot(prefix, header, row):
with open('job-io-datasets/datasets/job_codings.csv') as csv_file:
#with open('job-io-datasets/datasets/job_codings.csv') as csv_file: # EB: old codings
with open('./datasets/job_codings_v3.csv') as csv_file: # EB: v3 codings moved to this repo
csv_reader = csv.reader(csv_file, delimiter=',')
line_count = 0
for row in csv_reader:

View File

@ -3,21 +3,26 @@
library(ggplot2)
library(dplyr)
require(scales)
library(stringi)
library(stringr)
# Turn to TRUE to print individual job images
plotjobs = FALSE
# Color scheme
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066")
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000099")
# Parse job from command line
args = commandArgs(trailingOnly = TRUE)
file = "job_similarities_5024292.csv" # for manual execution
file = args[1]
jobID = str_extract(file, regex("[0-9]+"))
data = read.csv(file)
# Columns are: jobid alg_id alg_name similarity
data$alg_id = as.factor(data$alg_id)
#data$alg_id = as.factor(data$alg_id) # EB: wrong column?
data$alg_name = as.factor(data$alg_name) # EB: this is the column used in the script
cat("Job count:")
cat(nrow(data))
@ -37,8 +42,8 @@ e = data %>% filter(similarity >= 0.5)
print(summary(e))
# load job information, i.e., the time series per job
jobData = read.csv("job-io-datasets/datasets/job_codings.csv")
metadata = read.csv("job-io-datasets/datasets/job_metadata.csv")
jobData = read.csv("./datasets/job_codings_v3.csv") # EB: liegt jetzt Repo. v3 hat die korrekten hexadezimalen Codings
metadata = read.csv("./datasets/job_metadata.csv") # EB: is ebenfalls im Repo
metadata$user_id = as.factor(metadata$user_id)
metadata$group_id = as.factor(metadata$group_id)
@ -48,8 +53,10 @@ plotJobs = function(jobs){
if (plotjobs) {
prefix = do.call("sprintf", list("%s-%.4f-", level, r$similarity))
system(sprintf("scripts/plot-single-job.py %s %s", paste(r$jobid, collapse=","), paste(prefix, collapse=",")))
system(sprintf("./scripts/plot-single-job.py %s %s", paste(r$jobid, collapse=","), paste(prefix, collapse=",")))
}
system(sprintf("./scripts/extract-conf-data.sh %s > jobs-%s.txt", paste(r$jobid, collapse=" "), level))
}
# Store the job ids in a table, each column is one algorithm
@ -122,10 +129,15 @@ for (alg_name in levels(data$alg_name)){
res.jobs = rbind(res.jobs, cbind(alg_name, metadata[metadata$jobid %in% result[, alg_name],]))
}
ggplot(res.jobs, aes(alg_name, total_nodes, fill=alg_name)) + geom_boxplot() + scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + theme(legend.position = "none") + xlab("Algorithm")
# Plot histogram of nodes per algorithm
jobRef = metadata[metadata$jobid == jobID,]$total_nodes
ggplot(res.jobs, aes(alg_name, total_nodes, fill=alg_name)) + geom_boxplot() + scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log2", function(x) 2^x), labels = trans_format("log2", math_format(2^.x))) + theme(legend.position = "none") + xlab("Algorithm") + ylab("Job node count") + geom_hline(yintercept= jobRef, linetype="dashed", color = "red", size=0.5)
ggsave("jobs-nodes.png", width=6, height=4)
ggplot(res.jobs, aes(alg_name, elapsed, fill=alg_name)) + geom_boxplot() + scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) + ylab("Runtime in s") + xlab("Algorithm") + theme(legend.position = "none")
# Plot histogram of elapsed time per algorithm
jobRef = metadata[metadata$jobid == jobID,]$elapsed
ggplot(res.jobs, aes(alg_name, elapsed, fill=alg_name)) + geom_boxplot() + ylab("Job runtime in s") + xlab("Algorithm") + theme(legend.position = "none") + ylim(0, max(res.jobs$elapsed)) + geom_hline(yintercept= jobRef, linetype="dashed", color = "red", size=0.5)
# scale_y_continuous(trans = log2_trans(), breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x)))
ggsave("jobs-elapsed.png", width=6, height=4)

View File

@ -11,7 +11,7 @@
#jobids=( ${jobids[@]} 18672376 )
#jobids=( ${jobids[@]} 17944118 )
output_dir="../../datasets"
dataset_fn="../../datasets/job_codings_v4.csv"
jobids=( )
jobids=( ${jobids[@]} 7488914 )