Minor fixes

This commit is contained in:
Eugen Betke 2020-12-08 12:50:21 +01:00
parent 368b32d1db
commit 1cf57a036c
2 changed files with 105 additions and 99 deletions

View File

@ -12,14 +12,14 @@
% pages={569--580},
@inproceedings{morse2007efficient,
title={An efficient and accurate method for evaluating time series similarity},
title={{An efficient and accurate method for evaluating time series similarity}},
author={Morse, Michael D and Patel, Jignesh M},
booktitle={Proceedings of the 2007 ACM SIGMOD international conference on Management of data},
booktitle={{Proceedings of the 2007 ACM SIGMOD international conference on Management of data}},
year={2007}
}
@article{navarro2001guided,
title={A guided tour to approximate string matching},
title={{A guided tour to approximate string matching}},
author={Navarro, Gonzalo},
journal={ACM computing surveys (CSUR)},
volume={33},
@ -78,7 +78,7 @@
@inproceedings{bahmani2018chameleon,
title={{Chameleon: Online clustering of mpi program traces}},
author={Bahmani, Amir and Mueller, Frank},
booktitle={2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
booktitle={{2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}},
pages={1102--1112},
year={2018},
organization={IEEE}
@ -86,9 +86,9 @@
@article{rodrigo2018towards,
title={Towards understanding HPC users and systems: a NERSC case study},
title={{Towards understanding HPC users and systems: a NERSC case study}},
author={Rodrigo, Gonzalo P and {\"O}stberg, P-O and Elmroth, Erik and Antypas, Katie and Gerber, Richard and Ramakrishnan, Lavanya},
journal={Journal of Parallel and Distributed Computing},
journal={{Journal of Parallel and Distributed Computing}},
volume={111},
pages={206--221},
year={2018},
@ -128,9 +128,9 @@
}
@inproceedings{lu2013comprehensive,
title={Comprehensive job level resource usage measurement and analysis for XSEDE HPC systems},
title={{Comprehensive job level resource usage measurement and analysis for XSEDE HPC systems}},
author={Lu, Charng-Da and Browne, James and DeLeon, Robert L and Hammond, John and Barth, William and Furlani, Thomas R and Gallo, Steven M and Jones, Matthew D and Patra, Abani K},
booktitle={Proceedings of the Conference on Extreme Science and Engineering Discovery Environment: Gateway to Discovery},
booktitle={{Proceedings of the Conference on Extreme Science and Engineering Discovery Environment: Gateway to Discovery}},
pages={1--8},
year={2013}
}
@ -139,7 +139,7 @@
@inproceedings{evans2014comprehensive,
title={{Comprehensive resource use monitoring for HPC systems with TACC stats}},
author={Evans, Todd and Barth, William L and Browne, James C and DeLeon, Robert L and Furlani, Thomas R and Gallo, Steven M and Jones, Matthew D and Patra, Abani K},
booktitle={2014 First International Workshop on HPC User Support Tools},
booktitle={{2014 First International Workshop on HPC User Support Tools}},
pages={13--21},
year={2014},
organization={IEEE}
@ -148,7 +148,7 @@
@inproceedings{liu2020characterization,
title={{Characterization and identification of HPC applications at leadership computing facility}},
author={Liu, Zhengchun and Lewis, Ryan and Kettimuthu, Rajkumar and Harms, Kevin and Carns, Philip and Rao, Nageswara and Foster, Ian and Papka, Michael E},
booktitle={Proceedings of the 34th ACM International Conference on Supercomputing},
booktitle={{Proceedings of the 34th ACM International Conference on Supercomputing}},
pages={1--12},
year={2020}
}
@ -172,7 +172,7 @@
@incollection{white2018automatic,
title={{Automatic Characterization of HPC Job Parallel Filesystem I/O Patterns}},
author={White, Joseph P and Kofke, Alexander D and DeLeon, Robert L and Innus, Martins and Jones, Matthew D and Furlani, Thomas R},
booktitle={Proceedings of the Practice and Experience on Advanced Research Computing},
booktitle={{Proceedings of the Practice and Experience on Advanced Research Computing}},
pages={1--8},
year={2018}
}
@ -180,7 +180,7 @@
@incollection{chan2019resource,
title={{A Resource Utilization Analytics Platform Using Grafana and Telegraf for the Savio Supercluster}},
author={Chan, Nicolas},
booktitle={Proceedings of the Practice and Experience in Advanced Research Computing on Rise of the Machines (learning)},
booktitle={{Proceedings of the Practice and Experience in Advanced Research Computing on Rise of the Machines (learning)}},
pages={1--6},
year={2019}
}
@ -205,7 +205,7 @@
@article{Eugen20HPS,
title={{Classifying Temporal Characteristics of Job I/O}},
author={Betke, Eugen and Kunkel, Julian},
journal={Journal of High Performance Storage: Incubator},
journal={{Journal of High Performance Storage: Incubator}},
issue={7},
date={2020}
}

View File

@ -1,7 +1,8 @@
\let\accentvec\vec
\documentclass[]{llncs}
\usepackage{todonotes}
%\usepackage{todonotes}
\usepackage[disable]{todonotes}
\newcommand{\eb}[1]{\todo[inline, color=green]{EB: #1}}
\newcommand{\jk}[1]{\todo[inline]{JK: #1}}
@ -33,6 +34,7 @@
\usepackage{lstautogobble}
\usepackage[listings,skins,breakable,raster,most]{tcolorbox}
\usepackage{caption}
\usepackage{placeins}
\lstset{
numberbychapter=false,
@ -95,7 +97,8 @@ This allows staff to understand the usage of the exhibited behavior better and t
\medskip
In this paper, a methodology to rank the similarity of all jobs to a reference job based on their temporal I/O behavior is described.
%In this paper, a methodology to rank the similarity of all jobs to a reference job based on their temporal I/O behavior is described.
In this paper, we describe a methodology to process efficiently a large set of jobs and find a class with a high temporal I/O similarity to a reference job.
Practically, we apply several previously developed time series algorithms and also utilize the Kolmogorov-Smirnov-Test to compare the distribution of the metrics.
A study is conducted to explore the effectiveness of the approach by investigating related jobs for three reference jobs.
The data stems from DKRZ's supercomputer Mistral and includes more than 500,000 jobs that have been executed for more than 6 months of operation. Our analysis shows that the strategy and algorithms are effective to identify similar jobs and revealed interesting patterns in the data.
@ -115,9 +118,10 @@ In order to optimize a single job, its behavior and resource utilization must be
Rarely, users will liaise with staff and request a performance analysis and optimization explicitly.
Therefore, data centers deploy monitoring systems and staff must pro-actively identify candidates for optimization.
Monitoring tools such as TACC Stats \cite{evans2014comprehensive}, Grafana \cite{chan2019resource}, and XDMod \cite{simakov2018workload} provide various statistics and time-series data for job execution.
\eb{Grafana ist ein reines Visualisierungswerkzeug}
The support staff should focus on workloads for which optimization is beneficial, for instance, the analysis of a job that is executed once on 20 nodes may not be a good return on investment.
By ranking jobs based on their utilization, it isn't difficult to find a job that exhibits extensive usage of computing, network, and IO resources.
By ranking jobs based on their utilization, it isn't difficult to find a job that exhibits extensive usage of computing, network, and I/O resources.
However, would it be beneficial to investigate this workload in detail and potentially optimize it?
However, a pattern that is observed in many jobs bears potential as the blueprint for optimizing one job may be applied to other jobs as well.
This is particularly true when running one application with similar inputs but also different applications may lead to similar behavior.
@ -126,12 +130,12 @@ Therefore, it is useful for support staff (or a user) that investigates a resour
It is non-trivial to identify jobs with similar behavior from the pool of executed jobs.
Re-executing the same job will lead to slightly different behavior, a program may be executed with different inputs or using a different configuration (e.g., number of nodes).
Job names are defined by users; while a similar name may hint to be a similar workload finding other applications with the same IO behavior would not be possible.
Job names are defined by users; while a similar name may hint to be a similar workload finding other applications with the same I/O behavior would not be possible.
In the paper \cite{Eugen20HPS}, the authors developed several distance measures and algorithms for the clustering of jobs based on the time series of their IO behavior.
In the paper \cite{Eugen20HPS}, the authors developed several distance measures and algorithms for the clustering of jobs based on the time series and their I/O behavior.
These distance measures can be applied to jobs with different runtime and number of nodes utilized but differ in the way they define similarity.
They showed that the metrics can be used to cluster jobs, however, it remained unclear if the method can be used by data center staff to explore similar jobs effectively.
In this paper, we refine these algorithms slightly, also include another algorithm and apply them to rank jobs based on their similarity to a reference job.
In this paper, we refine these algorithms slightly, also include another algorithm and apply them to rank jobs based on their temporal I/O similarity to a reference job.
We start by introducing related work in \Cref{sec:relwork}.
In \Cref{sec:methodology}, we describe briefly the data reduction and the algorithms for similarity analysis.
@ -144,7 +148,7 @@ The paper is concluded in \Cref{sec:summary}.
\section{Related Work}
\label{sec:relwork}
Related work can be classified into distance measures, analysis of HPC application performance, inter-comparison of jobs in HPC, and IO-specific tools.
Related work can be classified into distance measures, analysis of HPC application performance, inter-comparison of jobs in HPC, and I/O-specific tools.
%% DISTANCE MEASURES
The ranking of similar jobs performed in this article is related to clustering strategies.
@ -161,7 +165,7 @@ However, the Swale scoring model \cite{morse2007efficient} produced the most dis
% Analysis of HPC application performance
The performance of applications can be analyzed using one of many tracing tools such as Vampir \cite{weber2017visual} that record the behavior of an application explicitly or implicitly by collecting information about the resource usage with a monitoring system.
Monitoring systems that record statistics about hardware usage are widely deployed in data centers to record system utilization by applications.
There are various tools for analyzing the IO behavior of an application \cite{TFAPIKBBCF19}.
There are various tools for analyzing the I/O behavior of an application \cite{TFAPIKBBCF19}.
% time series analysis for inter-comparison of processes or jobs in HPC
For Vampir, a popular tool for trace file analysis, in \cite{weber2017visual} the Comparison View is introduced that allows them to manually compare traces of application runs, e.g., to compare optimized with original code.
@ -173,13 +177,13 @@ In \cite{halawa2020unsupervised}, 11 performance metrics including CPU and netwo
In \cite{rodrigo2018towards}, a characterization of the NERSC workload is performed based on job scheduler information (profiles).
Profiles that include the MPI activities have shown effective to identify the code that is executed \cite{demasi2013identifying}.
Many approaches for clustering applications operate on profiles for compute, network, and IO \cite{emeras2015evalix,liu2020characterization,bang2020hpc}.
For example, Evalix \cite{emeras2015evalix} monitors system statistics (from proc) in 1-minute intervals but for the analysis, they are converted to a profile removing the time dimension, i.e., compute the average CPU, memory, and IO over the job runtime.
Many approaches for clustering applications operate on profiles for compute, network, and I/O \cite{emeras2015evalix,liu2020characterization,bang2020hpc}.
For example, Evalix \cite{emeras2015evalix} monitors system statistics (from proc) in 1-minute intervals but for the analysis, they are converted to a profile removing the time dimension, i.e., compute the average CPU, memory, and I/O over the job runtime.
% IO-specific tools
PAS2P \cite{mendez2012new} extracts the IO patterns from application traces and then allows users to manually compare them.
% I/O-specific tools
PAS2P \cite{mendez2012new} extracts the I/O patterns from application traces and then allows users to manually compare them.
In \cite{white2018automatic}, a heuristic classifier is developed that analyzes the I/O read/write throughput time series to extract the periodicity of the jobs -- similar to Fourier analysis.
The LASSi tool \cite{AOPIUOTUNS19} periodically monitors Lustre I/O statistics and computes a "risk" factor to identify IO patterns that stress the file system.
The LASSi tool \cite{AOPIUOTUNS19} periodically monitors Lustre I/O statistics and computes a "risk" factor to identify I/O patterns that stress the file system.
In contrast to existing work, our approach allows a user to identify similar activities based on the temporal I/O behavior recorded by a data center-wide deployed monitoring system.
@ -190,17 +194,18 @@ The purpose of the methodology is to allow users and support staff to explore al
Therefore, we first need to define how a job's data is represented, then describe the algorithms used to compute the similarity, and, the methodology to investigate jobs.
\subsection{Job Data}
On the Mistral supercomputer at DKRZ, the monitoring system \cite{betke20} gathers in 10s intervals on all nodes nine IO metrics for the two Lustre file systems together with general job metadata from the SLURM workload manager.
On the Mistral supercomputer at DKRZ, the monitoring system \cite{betke20} gathers in ten-second intervals on all nodes nine I/O metrics for the two Lustre file systems together with general job metadata from the SLURM workload manager.
The results are 4D data (time, nodes, metrics, file system) per job.
The distance measures should handle jobs of different lengths and node count.
In \cite{Eugen20HPS}, the authors discussed a variety of options from 1D job-profiles to data reductions to compare time series data and the general workflow and pre-processing in detail. We are using their data.
In a nutshell, for each job executed on Mistral, they partitioned it into 10-minute segments and compute the arithmetic mean of each metric, categorize the value into non-IO (0), HighIO (1), and CriticalIO (4) for values below 99-percentile, up to 99.9-percentile, and above, respectively.
In a nutshell, for each job executed on Mistral, they partitioned it into 10-minute segments and compute the arithmetic mean of each metric, categorize the value into NonIO (0), HighIO (1), and CriticalIO (4) for values below 99-percentile, up to 99.9-percentile, and above, respectively.
The fixed interval of 10 minutes ensures the portability of the approach to other HPC systems.
After the mean value across nodes is computed for a segment, the resulting numeric value is encoded either using binary (IO activity on the segment: yes/no) or hexadecimal representation (quantizing the numerical performance value into 0-15) which is then ready for similarity analysis.
\eb{Portability muss noch verdeutlicht werden}
After the mean value across nodes is computed for a segment, the resulting numeric value is encoded either using binary (I/O activity on the segment: yes/no) or hexadecimal representation (quantizing the numerical performance value into 0-15) which is then ready for similarity analysis.
By pre-filtering jobs with no I/O activity -- their sum across all dimensions and time series is equal to zero -- the dataset is reduced from 1 million jobs to about 580k jobs.
\subsection{Algorithms for Computing Similarity}
We reuse the algorithms developed in \cite{Eugen20HPS}: B-all, B-aggz(eros), Q-native, Q-lev, and Q-phases.
We reuse the B and Q algorithms developed in~\cite{Eugen20HPS}: B-all, B-aggz(eros), Q-native, Q-lev, and Q-phases.
They differ in the way data similarity is defined; either the time series is encoded in binary or hexadecimal quantization, the distance measure is the Euclidean distance or the Levenshtein-distance.
B-all determines similarity between binary codings by means of Levenshtein distance.
B-aggz is similar to B-all, but computes similarity on binary codings where subsequent segments of zero activities are replaced by just one zero.
@ -212,15 +217,15 @@ The Q-phases algorithm extracts I/O phases and computes the similarity between t
In this paper, we add a similarity definition based on Kolmogorov-Smirnov-Test that compares the probability distribution of the observed values which we describe in the following.
%In brief, KS concatenates individual node data and computes similarity be means of Kolmogorov-Smirnov-Test.
\paragraph{Kolmogorov-Smirnov (KS) algorithm}
\paragraph{Kolmogorov-Smirnov (KS) algorithm.}
% Summary
For the analysis, we perform two preparation steps.
Dimension reduction by computing means across the two file systems and by concatenating the time series data of the individual nodes (instead of averaging) them.
Dimension reduction by computing means across the two file systems and by concatenating the time series data of the individual nodes (instead of averaging them).
This reduces the four-dimensional dataset to two dimensions (time, metrics).
% Aggregation
The reduction of the file system dimension by the mean function ensures the time series values stay in the range between 0 and 4, independently how many file systems are present on an HPC system.
Unlike the previous similarity definitions, the concatenation of time series on the node dimension preserves the individual I/O information of all nodes while it still allows comparison of jobs with a different number of nodes.
No aggregation is performed on the metric dimension.
%No aggregation is performed on the metric dimension.
% Filtering
%Zero-jobs are jobs with no sign of significant I/O load are of little interest in the analysis.
@ -240,43 +245,44 @@ The similarity function calculates the mean inverse of reject probability $p_{\t
Our strategy for localizing similar jobs works as follows:
\begin{itemize}
\item A user\footnote{This can be support staff or a data center user that was executing the job.} provides a reference job ID and selects a similarity algorithm.
\item The system iterates over all jobs of the job pool computing the distance to the reference job using the specified algorithm.
\item It sorts the jobs based on the distance to the reference job.
\item It visualizes the cumulative job distance allowing the user to understand how job similarity is distributed.
\item The system iterates over all jobs of the job pool computing the similarity to the reference job using the specified algorithm.
\item It sorts the jobs based on the similarity to the reference job.
\item It visualizes the cumulative job similarity allowing the user to understand how job similarity is distributed.
\item The user starts the inspection by looking at the most similar jobs first.
\end{itemize}
The user can decide about the criterion when to stop inspecting jobs; based on the similarity, the number of investigated jobs, or the distribution of the job similarity.
For the latter, it is interesting to investigate clusters of similar jobs, e.g., if there are many jobs between 80-90\% similarity but few between 70-80\%.
For the inspection of the jobs, a user may explore the job metadata, searching for similarities, and explore the time series of a job's IO metrics.
For the inspection of the jobs, a user may explore the job metadata, searching for similarities, and explore the time series of a job's I/O metrics.
\section{Reference Jobs}
\section{Reference Jobs}%
\label{sec:refjobs}
For this study, we chose several reference jobs with different compute and IO characteristics:
For this study, we chose several reference jobs with different compute and I/O characteristics:
\begin{itemize}
\item Job-S: performs post-processing on a single node. This is a typical process in climate science where data products are reformatted and annotated with metadata to a standard representation (so-called CMORization). The post-processing is IO intensive.
\item Job-S: performs post-processing on a single node. This is a typical process in climate science where data products are reformatted and annotated with metadata to a standard representation (so-called CMORization). The post-processing is I/O intensive.
\item Job-M: a typical MPI parallel 8-hour compute job on 128 nodes which write time series data after some spin up. %CHE.ws12
\item Job-L: a 66-hour 20-node job.
The initialization data is read at the beginning.
Then only a single master node writes constantly a small volume of data; in fact, the generated data is too small to be categorized as IO relevant.
Then only a single master node writes constantly a small volume of data; in fact, the generated data is too small to be categorized as I/O relevant.
\end{itemize}
The segmented timelines of the jobs are visualized in \Cref{fig:refJobs} -- remember that the mean value is computed across all nodes.
This coding is also used for the Q class of algorithms, thus this representation is what the algorithms will analyze; B algorithms merge all timelines together as described in \cite{Eugen20HPS}.
The figures show the values of active metrics ($\neq 0$); if few are active then they are shown in one timeline, otherwise, they are rendered individually to provide a better overview.
This coding is also used for the Q algorithms, thus this representation is what the algorithms will analyze; B algorithms merge all timelines together as described in~\cite{Eugen20HPS}.
The figures show the values of active metrics ($\neq 0$); if few are active, then they are shown in one timeline, otherwise, they are rendered individually to provide a better overview.
For example, we can see in \Cref{fig:job-S}, that several metrics increase in Segment\,6.
In \Cref{fig:refJobsHist}, the histograms of the job metrics are shown in Q coding (16 steps).
The histogram contains the activities of each node at every timestep -- without being averaged across the nodes.
This data is used to compare jobs using Kolmogorov-Smirnov.
%In \Cref{fig:refJobsHist}, the histograms of the job metrics are shown in Q coding (16 steps).
\Cref{fig:refJobsHist} summarizes the hexadecimal codings of Job-S and Job-M as histograms.
They contain activities of each node at every timestep -- without being averaged across the nodes.
Essentially, this data is used to compare jobs using the Kolmogorov-Smirnov test.
The metrics at Job-L are not shown as they have only a handful of instances where the value is not 0, except for write\_bytes: the first process is writing out at a low rate.
In \Cref{fig:job-L}, the mean value is mostly rounded down to 0 except for the first segment as primarily Rank\,0 is doing IO.
In \Cref{fig:job-L}, the mean value is mostly rounded down to 0 except for the first segment as primarily Rank\,0 is doing I/O.
\begin{figure}
\begin{subfigure}{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{job-timeseries4296426}
\caption{Job-S (runtime=15,551\,s, segments=25)} \label{fig:job-S}
\caption{Job-S (runtime=15,551\,s, segments=25)}\label{fig:job-S}
\end{subfigure}
\centering
@ -284,26 +290,26 @@ In \Cref{fig:job-L}, the mean value is mostly rounded down to 0 except for the f
\begin{subfigure}{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{job-timeseries5024292}
\caption{Job-M (runtime=28,828\,s, segments=48)} \label{fig:job-M}
\caption{Job-M (runtime=28,828\,s, segments=48)}\label{fig:job-M}
\end{subfigure}
\centering
\caption{Reference jobs: segmented timelines of mean IO activity}
\caption{Reference jobs: segmented timelines of mean I/O activity}%
\label{fig:refJobs}
\end{figure}
\begin{figure}\ContinuedFloat
\begin{figure}\ContinuedFloat%
\begin{subfigure}{0.8\textwidth}
\centering
\includegraphics[width=\textwidth]{job-timeseries7488914-30}
\caption{Job-L (first 30 segments of 400; remaining segments are zero)}
\caption{Job-L (first 30 segments of 400; remaining segments are zero)}%
\label{fig:job-L}
\end{subfigure}
\centering
\caption{Reference jobs: segmented timelines of mean IO activity}
\caption{Reference jobs: segmented timelines of mean I/O activity}
\end{figure}
@ -311,18 +317,18 @@ In \Cref{fig:job-L}, the mean value is mostly rounded down to 0 except for the f
\begin{subfigure}{0.49\textwidth} % TODO war 0.8
\centering
\includegraphics[width=\textwidth]{job-ks-0hist4296426}
\caption{Job-S} \label{fig:job-S-hist}
\caption{Job-S}\label{fig:job-S-hist}
\end{subfigure}
\centering
\begin{subfigure}{0.49\textwidth}
\centering
\includegraphics[width=\textwidth]{job-ks-1hist5024292}
\caption{Job-M} \label{fig:job-M-hist}
\caption{Job-M}\label{fig:job-M-hist}
\end{subfigure}
\centering
\caption{Reference jobs: histogram of IO activities}
\caption{Reference jobs: histogram of I/O activities}%
\label{fig:refJobsHist}
\end{figure}
@ -334,12 +340,12 @@ In \Cref{fig:job-L}, the mean value is mostly rounded down to 0 except for the f
%\label{fig:job-L}
%\end{subfigure}
%\centering
%\caption{Reference jobs: histogram of IO activities}
%\caption{Reference jobs: histogram of I/O activities}
%\end{figure}
\section{Evaluation}
\section{Evaluation}%
\label{sec:evaluation}
In the following, we assume a reference job is given (we use Job-S, Job-M, and Job-L) and we aim to identify similar jobs.
@ -367,20 +373,20 @@ They could easily be parallelized which would then allow for an online analysis.
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{progress_4296426-out-boxplot}
\caption{Job-S (segments=25)} \label{fig:perf-job-S}
\caption{Job-S (segments=25)}\label{fig:perf-job-S}
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{progress_5024292-out-boxplot}
\caption{Job-M (segments=48)} \label{fig:perf-job-M}
\caption{Job-M (segments=48)}\label{fig:perf-job-M}
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{progress_7488914-out-boxplot}
\caption{Job-L (segments=400)} \label{fig:perf-job-L}
\caption{Job-L (segments=400)}\label{fig:perf-job-L}
\end{subfigure}
\caption{Runtime of the algorithms to compute the similarity to reference jobs}
\caption{Runtime of the algorithms to compute the similarity to reference jobs}%
\label{fig:performance}
\end{figure}
@ -439,22 +445,22 @@ Practically, the support team would start with Rank\,1 (most similar job, e.g.,
\begin{subfigure}{0.7\textwidth}
\centering
\includegraphics[width=\textwidth,trim={0 0 0 2.0cm},clip]{job_similarities_4296426-out/hist-sim}
\caption{Job-S} \label{fig:hist-job-S}
\caption{Job-S}\label{fig:hist-job-S}
\end{subfigure}
\begin{subfigure}{0.7\textwidth}
\centering
\includegraphics[width=\textwidth,trim={0 0 0 2.0cm},clip]{job_similarities_5024292-out/hist-sim}
\caption{Job-M} \label{fig:hist-job-M}
\caption{Job-M}\label{fig:hist-job-M}
\end{subfigure}
\begin{subfigure}{0.7\textwidth}
\centering
\includegraphics[width=\textwidth,trim={0 0 0 2.0cm},clip]{job_similarities_7488914-out/hist-sim}
\caption{Job-L} \label{fig:hist-job-L}
\caption{Job-L}\label{fig:hist-job-L}
\end{subfigure}
\centering
\caption{Histogram for the number of jobs (bin width: 2.5\%, numbers are the actual job counts). B-aggz is nearly identical to B-all.}
\caption{Histogram for the number of jobs (bin width: 2.5\%, numbers are the actual job counts). B-aggz is nearly identical to B-all.}%
\label{fig:hist}
\end{figure}
@ -497,24 +503,23 @@ While all algorithms can compute the similarity between jobs of different length
For Job-M and Job-L, Q-phases and KS are able to identify much shorter or longer jobs.
For Job-L, the job itself isn't included in the chosen Top\,100 (see \Cref{fig:hist-job-L}, 393 jobs have a similarity of 100\%) which is the reason why the job runtime isn't shown in the figure itself.
\begin{figure}
\begin{figure}[bt]
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/user-ids}
\caption{Job-S} \label{fig:users-job-S}
\caption{Job-S}\label{fig:users-job-S}
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/user-ids}
\caption{Job-M} \label{fig:users-job-M}
\caption{Job-M}\label{fig:users-job-M}
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/user-ids}
\caption{Job-L} \label{fig:users-job-L}
\caption{Job-L}\label{fig:users-job-L}
\end{subfigure}
\caption{User information for all 100 top-ranked jobs}
\label{fig:userids}
\end{figure}
@ -528,15 +533,15 @@ For Job-L, the job itself isn't included in the chosen Top\,100 (see \Cref{fig:h
\begin{subfigure}{0.48\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/jobs-nodes}
\caption{Job-M (ref. job runs on 128 nodes)} \label{fig:nodes-job-M}
\caption{Job-M (ref. job runs on 128 nodes)}\label{fig:nodes-job-M}
\end{subfigure}
\begin{subfigure}{0.48\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/jobs-nodes}
\caption{Job-L (reference job runs on 20 nodes)} \label{fig:nodes-job-L}
\caption{Job-L (reference job runs on 20 nodes)}\label{fig:nodes-job-L}
\end{subfigure}
\centering
\caption{Distribution of node counts for Top 100 (for Job-S always nodes=1)}
\caption{Distribution of node counts for Top 100 (for Job-S always nodes=1)}%
\label{fig:nodes-job}
\end{figure}
@ -544,30 +549,30 @@ For Job-L, the job itself isn't included in the chosen Top\,100 (see \Cref{fig:h
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/jobs-elapsed}
\caption{Job-S ($job=15,551s$)} \label{fig:runtime-job-S}
\caption{Job-S ($job=15,551s$)}\label{fig:runtime-job-S}
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/jobs-elapsed}
\caption{Job-M ($job=28,828s$)} \label{fig:runtime-job-M}
\caption{Job-M ($job=28,828s$)}\label{fig:runtime-job-M}
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/jobs-elapsed}
\caption{Job-L ($job=240ks$)} \label{fig:runtime-job-L}
\caption{Job-L ($job=240ks$)}\label{fig:runtime-job-L}
\end{subfigure}
\centering
\caption{Distribution of runtime for all 100 top-ranked jobs}
\caption{Distribution of runtime for all 100 top-ranked jobs}%
\label{fig:runtime-job}
\end{figure}
\subsubsection{Algorithmic differences}
To verify that the different algorithms behave differently, the intersection for the Top\,100 is computed for all combinations of algorithms and visualized in \Cref{fig:heatmap-job}.
Bin\_all and B-aggz overlap with at least 99 ranks for all three jobs.
B-all and B-aggz overlap with at least 99 ranks for all three jobs.
While there is some reordering, both algorithms lead to a comparable set.
All algorithms have a significant overlap for Job-S.
For Job-M, however, they lead to a different ranking, and Top\,100, particularly KS determines a different set.
Generally, Q-lev and Q\_native are generating more similar results than other algorithms.
Generally, Q-lev and Q-native are generating more similar results than other algorithms.
From this analysis, we conclude that one representative from B is sufficient as it generates very similar results while the other algorithms identify mostly disjoint behavioral aspects. % and, therefore, should be analyzed individually
@ -575,27 +580,27 @@ From this analysis, we conclude that one representative from B is sufficient as
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_4296426-out/intersection-heatmap}
\caption{Job-S} \label{fig:heatmap-job-S}
\caption{Job-S}\label{fig:heatmap-job-S}
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_5024292-out/intersection-heatmap}
\caption{Job-M} \label{fig:heatmap-job-M} %,trim={2.5cm 0 0 0},clip
\caption{Job-M}\label{fig:heatmap-job-M} %,trim={2.5cm 0 0 0},clip
\end{subfigure}
\begin{subfigure}{0.31\textwidth}
\centering
\includegraphics[width=\textwidth]{job_similarities_7488914-out/intersection-heatmap}
\caption{Job-L} \label{fig:heatmap-job-L}
\caption{Job-L}\label{fig:heatmap-job-L}
\end{subfigure}
\centering
\caption{Intersection of the 100 top-ranked jobs for different algorithms}
\caption{Intersection of the 100 top-ranked jobs for different algorithms}%
\label{fig:heatmap-job}
\end{figure}
%%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%% %%%%%%%%%%%
\section{Assessing Timelines for Similar Jobs}
\section{Assessing Timelines for Similar Jobs}%
\label{sec:timelines}
To verify the suitability of the similarity metrics, for each algorithm, we carefully investigated the timelines of each of the jobs in the Top\,100.
We subjectively found that the approach works very well and identifies suitable similar jobs.
@ -611,7 +616,7 @@ It is executed for different simulations and variables across timesteps.
The job name suggests that it is applied to the control variable.
In the metadata, we found 22,580 jobs with “cmor” in the name of which 367 jobs mention “control”.
The B and KS algorithms identify one job which name doesn't include “cmor”,
The B and KS algorithms identify one job whose name doesn't include “cmor”.
All other algorithms identify only “cmor” jobs and 26--38 of these jobs are applied to “control” (see \Cref{tbl:control-jobs}) -- only the KS algorithm doesn't identify any job with control.
A selection of job timelines on control variables is given in \Cref{fig:job-S-hex-lev}.
The single non-cmor job and a high-ranked non-control cmor job is shown in \Cref{fig:job-S-bin-agg}.
@ -634,7 +639,7 @@ For Job-S, we found that all algorithms work well and, therefore, omit further t
% Q-phases & 33 \\
% KS & 0
%\end{tabular}
\caption{Job-S: number of jobs with “control” in their name in the Top-100}
\caption{Job-S: number of jobs with “control” in their name in the Top-100}%
\label{tbl:control-jobs}
\end{table}
@ -652,7 +657,7 @@ For Job-S, we found that all algorithms work well and, therefore, omit further t
\caption{Non-control job: Rank\,4, SIM=81\%}
\end{subfigure}
\caption{Job-S: jobs with different job names when using B-aggz}
\caption{Job-S: jobs with different job names when using B-aggz}%
\label{fig:job-S-bin-agg}
\end{figure}
@ -674,7 +679,7 @@ For Job-S, we found that all algorithms work well and, therefore, omit further t
\caption{Rank\,100, SIM=79\%}
\end{subfigure}
\caption{Job-S with Q-Lev, selection of similar jobs}
\caption{Job-S with Q-Lev, selection of similar jobs}%
\label{fig:job-S-hex-lev}
\end{figure}
@ -752,7 +757,7 @@ Remember, for the KS algorithm, we concatenate the metrics of all nodes together
\caption{Concatenated time series}
\end{subfigure}
\caption{Job-M with KS, for Rank\,3, SIM=78\%}
\caption{Job-M with KS, for Rank\,3, SIM=78\%}%
\label{fig:job-M-ks}
\end{figure}
@ -776,7 +781,7 @@ Remember, for the KS algorithm, we concatenate the metrics of all nodes together
\caption{Rank\,100, SIM=51\% }
\end{subfigure}
\caption{Job-M with Bin-Aggzero, selection of similar jobs}
\caption{Job-M with Bin-Aggzero, selection of similar jobs}%
\label{fig:job-M-bin-aggzero}
\end{figure}
@ -806,7 +811,7 @@ Remember, for the KS algorithm, we concatenate the metrics of all nodes together
\caption{Rank\,100, SIM=70\%}
\end{subfigure}
\caption{Job-M with Q-lev, selection of similar jobs}
\caption{Job-M with Q-lev, selection of similar jobs}%
\label{fig:job-M-hex-lev}
\end{figure}
@ -837,7 +842,7 @@ Remember, for the KS algorithm, we concatenate the metrics of all nodes together
\caption{Rank 3, SIM=97\%}
\end{subfigure}
\caption{Job-M with Q-native, selection of similar jobs}
\caption{Job-M with Q-native, selection of similar jobs}%
\label{fig:job-M-hex-native}
\end{figure}
@ -874,7 +879,7 @@ In \Cref{fig:job-L-bin-aggzero}, it can be seen that the found jobs have little
The Q-lev and Q-native algorithms identify a more diverse set of applications (18 unique names and no xmessy job).
Q-native (\Cref{fig:job-L-hex-native}) finds long jobs with only little activity, similar to our reference job.
The Q-phases algorithm finds 85 unique names but as there is only one short IO phase in the reference job, it finds many (short) jobs with 100\% similarity as seen in \Cref{fig:job-L-hex-phases}.
The Q-phases algorithm finds 85 unique names but as there is only one short I/O phase in the reference job, it finds many (short) jobs with 100\% similarity as seen in \Cref{fig:job-L-hex-phases}.
The KS algorithm is even more inclusive, having 1,285 jobs with 100\% similarity; the 100 selected ones contain 71 jobs ending with t127, which is a typical model configuration.
As expected, the histograms mimic the profile of the reference job; thus, the algorithm does what it is expected to do.
@ -900,7 +905,7 @@ As expected, the histograms mimics the profile of the reference job, and thus, t
\caption{Rank 100, SIM=11\%}
\end{subfigure}
\caption{Job-L with B-aggzero, selection of similar jobs}
\caption{Job-L with B-aggzero, selection of similar jobs}%
\label{fig:job-L-bin-aggzero}
\end{figure}
@ -952,7 +957,7 @@ As expected, the histograms mimics the profile of the reference job, and thus, t
% \caption{Rank 100, SIM=17\%}
% \end{subfigure}
\caption{Job-L with Q-native, selection of similar jobs}
\caption{Job-L with Q-native, selection of similar jobs}%
\label{fig:job-L-hex-native}
\end{figure}
@ -977,14 +982,14 @@ As expected, the histograms mimics the profile of the reference job, and thus, t
\caption{Rank 100, SIM=100\%}
\end{subfigure}
\caption{Job-L with Q-phases, selection of similar jobs}
\caption{Job-L with Q-phases, selection of similar jobs}%
\label{fig:job-L-hex-phases}
\end{figure}
\section{Conclusion}
\section{Conclusion}%
\label{sec:summary}
We conducted a study to identify similar jobs based on timelines of nine I/O statistics.
@ -1004,5 +1009,6 @@ That would increase the likelihood that these jobs are very similar and what the
Our next step is to foster a discussion in the community to identify and define suitable similarity metrics for the different analysis purposes.
\printbibliography
\FloatBarrier
\printbibliography%
\end{document}