From a0a2ebabf7133a37cec9b6148acda2a38c2faab6 Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Fri, 23 Oct 2020 18:02:19 +0100 Subject: [PATCH] Added related work (to order) --- paper/bibliography.bib | 174 +++++++++++++++++++++++++++++++++++++++++ paper/main.tex | 52 +++++++++--- 2 files changed, 216 insertions(+), 10 deletions(-) diff --git a/paper/bibliography.bib b/paper/bibliography.bib index 8b13789..2610b26 100644 --- a/paper/bibliography.bib +++ b/paper/bibliography.bib @@ -1 +1,175 @@ +@article{khotanlou2018empirical, + title={{An Empirical Comparison of Distance Measures for Multivariate Time Series Clustering}}, + author={Khotanlou, Hassan and Salarpour, Amir}, + journal={International Journal of Engineering}, + volume={31}, + number={2}, + pages={250--262}, + year={2018}, + publisher={Materials and Energy Research Center} +} +@inproceedings{morse2007efficient, + title={An efficient and accurate method for evaluating time series similarity}, + author={Morse, Michael D and Patel, Jignesh M}, + booktitle={Proceedings of the 2007 ACM SIGMOD international conference on Management of data}, + pages={569--580}, + year={2007} +} + +@article{navarro2001guided, + title={A guided tour to approximate string matching}, + author={Navarro, Gonzalo}, + journal={ACM computing surveys (CSUR)}, + volume={33}, + number={1}, + pages={31--88}, + year={2001}, + publisher={ACM New York, NY, USA} +} + +@article{mendez2012new, + title={{A new approach for Analyzing I/O in parallel scientific applications}}, + author={M{\'e}ndez, Sandra and Panadero, Javier and Wong, Alvaro and Rexachs, Dolores and Luque, Emilio}, + journal={Computer Science \& Technology Series}, + pages={67}, + year={2012} +} + +@article{halawa2020unsupervised, + title={{Unsupervised KPIs-Based Clustering of Jobs in HPC Data Centers}}, + author={Halawa, Mohamed S and D{\'\i}az Redondo, Rebeca P and Fern{\'a}ndez Vilas, Ana}, + journal={Sensors}, + volume={20}, + number={15}, + pages={4111}, + year={2020}, + publisher={Multidisciplinary Digital Publishing Institute} +} + +@inproceedings{emeras2015evalix, + title={{Evalix: classification and prediction of job resource consumption on HPC platforms}}, + author={Emeras, Joseph and Varrette, S{\'e}bastien and Guzek, Mateusz and Bouvry, Pascal}, + booktitle={Job Scheduling Strategies for Parallel Processing}, + pages={102--122}, + year={2015}, + organization={Springer} +} + +@inproceedings{TFAPIKBBCF19, + author = {Julian Kunkel and Eugen Betke and Matt Bryson and Philip Carns and Rosemary Francis and Wolfgang Frings and Roland Laifer and Sandra Mendez}, + title = {{Tools for Analyzing Parallel I/O}}, + year = {2019}, + month = {01}, + booktitle = {{High Performance Computing: ISC High Performance 2018 International Workshops, Frankfurt/Main, Germany, June 28, 2018, Revised Selected Papers}}, + editor = {Rio Yokota and Michele Weiland and John Shalf and Sadaf Alam}, + publisher = {Springer}, + series = {Lecture Notes in Computer Science}, + number = {11203}, + pages = {49--70}, + conference = {HPC-IODC workshop, ISC HPC}, + organization = {ISC Team}, + location = {Frankfurt, Germany}, + isbn = {978-3-030-02465-9}, + issn = {1611-3349}, + doi = {https://doi.org/10.1007/978-3-030-02465-9_4}, + abstract = {Parallel application I/O performance often does not meet user expectations. Additionally, slight access pattern modifications may lead to significant changes in performance due to complex interactions between hardware and software. 
These issues call for sophisticated tools to capture, analyze, understand, and tune application I/O. In this paper, we highlight advances in monitoring tools to help address these issues. We also describe best practices, identify issues in measurement and analysis, and provide practical approaches to translate parallel I/O analysis into actionable outcomes for users, facility operators, and researchers.}, +} + + +@inproceedings{bahmani2018chameleon, + title={{Chameleon: Online clustering of MPI program traces}}, + author={Bahmani, Amir and Mueller, Frank}, + booktitle={2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, + pages={1102--1112}, + year={2018}, + organization={IEEE} +} + + +@article{rodrigo2018towards, + title={Towards understanding HPC users and systems: a NERSC case study}, + author={Rodrigo, Gonzalo P and {\"O}stberg, P-O and Elmroth, Erik and Antypas, Katie and Gerber, Richard and Ramakrishnan, Lavanya}, + journal={Journal of Parallel and Distributed Computing}, + volume={111}, + pages={206--221}, + year={2018}, + publisher={Elsevier} +} + + +@inproceedings{AOPIUOTUNS19, + author = {Andrew Turner and Dominic Sloan-Murphy and Karthee Sivalingam and Harvey Richardson and Julian Kunkel}, + title = {{Analysis of parallel I/O use on the UK national supercomputing service, ARCHER using Cray's LASSi and EPCC SAFE}}, + year = {2019}, + month = {10}, + editor = {}, + conference = {CUG}, + location = {Montreal, Canada}, + abstract = {In this paper, we describe how we have used a combination of the LASSi tool (developed by Cray) and the SAFE software (developed by EPCC) to collect and analyse Lustre I/O performance data for all jobs running on the UK national supercomputing service, ARCHER; and to provide reports on I/O usage for users in our standard reporting framework. We also present results from analysis of parallel I/O use on ARCHER and analysis on the potential impact of different applications on file system performance using metrics we have derived from the LASSi data. We show that the performance data from LASSi reveals how the same application can stress different components of the file system depending on how it is run, and how the LASSi risk metrics allow us to identify use cases that could potentially cause issues for global I/O performance and work with users to improve their I/O use. We use the IO-500 benchmark to help us understand how LASSi risk metrics correspond to observed performance on the ARCHER file systems. We also use LASSi data imported into SAFE to identify I/O use patterns associated with different research areas, understand how the research workflow gives rise to the observed patterns and project how this will affect I/O requirements in the future.
Finally, we provide an overview of likely future directions for the continuation of this work.}, + url = {https://cug.org/proceedings/cug2019_proceedings/includes/files/pap118s2-file1.pdf}, +} + +@incollection{weber2017visual, + title={{Visual Comparison of Trace Files in Vampir}}, + author={Weber, Matthias and Brendel, Ronny and Wagner, Michael and Dietrich, Robert and Tsch{\"u}ter, Ronny and Brunst, Holger}, + booktitle={Programming and Performance Visualization Tools}, + pages={105--121}, + year={2017}, + publisher={Springer} +} + +@inproceedings{demasi2013identifying, + title={{Identifying HPC codes via performance logs and machine learning}}, + author={DeMasi, Orianna and Samak, Taghrid and Bailey, David H}, + booktitle={Proceedings of the first workshop on Changing landscapes in HPC security}, + pages={23--30}, + year={2013} +} + +@inproceedings{lu2013comprehensive, + title={Comprehensive job level resource usage measurement and analysis for XSEDE HPC systems}, + author={Lu, Charng-Da and Browne, James and DeLeon, Robert L and Hammond, John and Barth, William and Furlani, Thomas R and Gallo, Steven M and Jones, Matthew D and Patra, Abani K}, + booktitle={Proceedings of the Conference on Extreme Science and Engineering Discovery Environment: Gateway to Discovery}, + pages={1--8}, + year={2013} +} + +@inproceedings{evans2014comprehensive, + title={{Comprehensive resource use monitoring for HPC systems with TACC stats}}, + author={Evans, Todd and Barth, William L and Browne, James C and DeLeon, Robert L and Furlani, Thomas R and Gallo, Steven M and Jones, Matthew D and Patra, Abani K}, + booktitle={2014 First International Workshop on HPC User Support Tools}, + pages={13--21}, + year={2014}, + organization={IEEE} +} + +@inproceedings{liu2020characterization, + title={{Characterization and identification of HPC applications at leadership computing facility}}, + author={Liu, Zhengchun and Lewis, Ryan and Kettimuthu, Rajkumar and Harms, Kevin and Carns, Philip and Rao, Nageswara and Foster, Ian and Papka, Michael E}, + booktitle={Proceedings of the 34th ACM International Conference on Supercomputing}, + pages={1--12}, + year={2020} +} + +@inproceedings{bang2020hpc, + title={{HPC Workload Characterization Using Feature Selection and Clustering}}, + author={Bang, Jiwoo and Kim, Chungyong and Wu, Kesheng and Sim, Alex and Byna, Suren and Kim, Sunggon and Eom, Hyeonsang}, + booktitle={Proceedings of the 3rd International Workshop on Systems and Network Telemetry and Analytics}, + pages={33--40}, + year={2020} +} + +@article{betke20, + title={The Importance of Temporal Behavior when Classifying Job IO Patterns Using Machine Learning Techniques}, + author={Betke, Eugen and Kunkel, Julian} +} + + +@incollection{white2018automatic, + title={{Automatic Characterization of HPC Job Parallel Filesystem I/O Patterns}}, + author={White, Joseph P and Kofke, Alexander D and DeLeon, Robert L and Innus, Martins and Jones, Matthew D and Furlani, Thomas R}, + booktitle={Proceedings of the Practice and Experience on Advanced Research Computing}, + pages={1--8}, + year={2018} +} diff --git a/paper/main.tex b/paper/main.tex index 69b4818..a741ba6 100644 --- a/paper/main.tex +++ b/paper/main.tex @@ -129,10 +129,10 @@ Job names are defined by users; while a similar name may hint to be a similar wo \eb{The information is still missing on why the support staff should search for similar jobs.
As I understand it, if a job causes problems, then similar jobs can also cause similar problems.} \eb{The benefit for the user is not entirely clear. Why should a user search for similar jobs?} -In our previous paper \cite{XXX}, we developed several distance metrics and algorithms for the clustering of jobs based on the time series of their IO behavior. -The distance metrics can be applied to jobs with different runtime and number of nodes utilized but differ in the way they define similarity. +In our previous paper \cite{XXX}, we developed several distance measures and algorithms for the clustering of jobs based on the time series of their IO behavior. +The distance measures can be applied to jobs with different runtimes and numbers of utilized nodes but differ in the way they define similarity. We showed that the measures can be used to cluster jobs; however, it remains unclear whether the method can be used by data center staff to explore jobs similar to a reference job effectively. -In this article, we refined these distance metrics slightly and apply them to rank jobs based on their similarity to a reference job. +In this article, we refine these distance measures slightly and apply them to rank jobs based on their similarity to a reference job. Therefore, we perform a study on three reference jobs with different characteristics. We also utilize the Kolmogorov-Smirnov test to illustrate the benefits and drawbacks of the different methods. @@ -148,12 +148,44 @@ Finally, we conclude our paper in \Cref{sec:summary}. \section{Related Work} \label{sec:relwork} -Clustering of jobs based on their names +Related work can be classified into three areas: distance measures, time series analysis of HPC applications, and IO monitoring tools. -Multivariate time series -Levenshtein distance also known as Edit Distance (ED). +The ranking of similar jobs performed in this article is related to clustering strategies. +The comparison of time series using various distance measures has been investigated extensively. +In \cite{khotanlou2018empirical}, an empirical comparison of distance measures for the clustering of multivariate time series is performed. +In total, 14 similarity measures are applied to 23 data sets. +It shows that no similarity measure produces statistically significantly better results than another. +However, the Swale scoring model \cite{morse2007efficient} produced the most disjoint clusters. +In this model, gaps imply a cost. +The Levenshtein distance is often referred to as Edit Distance (ED) \cite{navarro2001guided}. +% Lock-Step Measures and Elastic Measures + + +Monitoring systems that record statistics about hardware usage are widely used in HPC. +In \cite{halawa2020unsupervised}, 11 performance metrics, including CPU and network metrics, are utilized for agglomerative clustering, showing the general effectiveness of the approach. + +There are various tools for analyzing the IO behavior of an application \cite{TFAPIKBBCF19}. + +Applications can also be compared by extracting the IO patterns from application traces. +PAS2P \cite{mendez2012new} is one example of this approach. + +For Vampir, a popular tool for trace file analysis, the Comparison View is introduced in \cite{weber2017visual}, which allows users to manually compare traces of application runs, e.g., to compare optimized with original code. +Vampir generally supports the clustering of process timelines of a single job, allowing the user to focus on relevant code sections and processes when investigating a large number of processes.
+ +Chameleon \cite{bahmani2018chameleon} extends ScalaTrace for recording MPI traces but reduces the overhead by clustering processes and collecting information from one representative of each cluster. +For the clustering, a signature is created for each process that includes the call-graph. + +% Characterization of jobs + +In \cite{rodrigo2018towards}, a characterization of the NERSC workload is performed based on job scheduler information (profiles). +Profiles that include the MPI activities have proven effective for identifying the code that is executed \cite{demasi2013identifying}. +Approaches for clustering HPC applications typically operate on profiles for compute, network, and IO \cite{emeras2015evalix,liu2020characterization,bang2020hpc}. +For example, Evalix \cite{emeras2015evalix} monitors system statistics (from /proc) in one-minute intervals, but for the analysis they are converted to a profile that removes the time dimension, i.e., the average CPU, memory, and IO usage is computed over the job runtime. + +The LASSi tool \cite{AOPIUOTUNS19} periodically monitors Lustre I/O statistics and computes a ``risk'' factor to identify IO patterns that stress the file system. + +In \cite{white2018automatic}, a heuristic classifier is developed that analyzes the I/O read/write throughput time series to extract the periodicity of the jobs -- there is a considerable similarity to Fourier analysis. -Vampir clustering of timelines of a single job. \section{Methodology} \label{sec:methodology} The purpose of the methodology is to allow users and support staff to explore all executed jobs on a supercomputer in order of their similarity to the reference job. Therefore, we first define how a job's data is represented, then describe the algorithms used to compute the similarity, and, finally, present the methodology to investigate jobs. \subsection{Job Data} -On the Mistral supercomputer at DKRZ, the monitoring system gathers in 10s intervals on all nodes nine IO metrics for the two Lustre file systems together with general job metadata from the SLURM workload manager. +On the Mistral supercomputer at DKRZ, the monitoring system \cite{betke20} gathers nine IO metrics for the two Lustre file systems in 10-second intervals on all nodes, together with general job metadata from the SLURM workload manager. The results are 4D data (time, nodes, metrics, file system) per job. -The distance metrics should handle jobs of different lengths and node count. +The distance measures should handle jobs of different lengths and node counts. In \cite{TODOPaper}, we discussed in detail a variety of options, from 1D job profiles to data reductions, for comparing time series data, as well as the general workflow and pre-processing. In a nutshell, each job executed on Mistral is partitioned into 10-minute segments; for each segment, we compute the arithmetic mean of each metric and categorize the value into non-IO (0), HighIO (1), and CriticalIO (4) for values below the 99th percentile, up to the 99.9th percentile, and above, respectively. After the data is reduced across nodes, we quantize the timelines using either a binary or hexadecimal representation, which is then ready for similarity analysis. @@ -172,7 +204,7 @@ By pre-filtering jobs with no I/O activity -- their sum across all dimensions an \subsection{Algorithms for Computing Similarity} We reuse the algorithms developed in \cite{TODO}: BIN\_all, BIN\_aggzeros, HEX\_native, HEX\_lev, and HEX\_quant. -They differ in the way data similarity is defined; either the binary or hexadecimal coding is used, the distance metrics is mostly the Euclidean distance or the Levenshtein-distance.
+They differ in the way data similarity is defined; either the binary or the hexadecimal coding is used, and the distance measure is mostly the Euclidean distance or the Levenshtein distance. For jobs with different lengths, we apply a sliding-window approach that finds the location in the longer job where the similarity to the shorter job is highest. The HEX\_quant algorithm extracts I/O phases and computes the similarity between the most similar I/O phases of both jobs. In this paper, we add a new similarity definition based on the Kolmogorov-Smirnov test that compares the probability distributions of the observed values, which we describe in the following.
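To make the described pre-processing and the Kolmogorov-Smirnov-based similarity more concrete, the following is a minimal Python sketch (not part of the patch and not the authors' implementation): the function names, the choice of SEGMENT_LEN, the percentile thresholds passed as arguments, the use of scipy.stats.ks_2samp, and the definition of similarity as one minus the KS statistic are illustrative assumptions; only the 10-second sampling, 10-minute segments, the non-IO/HighIO/CriticalIO categories, and the 99/99.9 percentile boundaries come from the text above.

# Illustrative sketch only: segment categorization and a Kolmogorov-Smirnov-based
# similarity. All names and the "1 - KS statistic" definition are assumptions.
import numpy as np
from scipy.stats import ks_2samp

SEGMENT_LEN = 60  # 10-minute segments at 10-second monitoring intervals

def categorize_segments(timeline, p99, p999):
    """Mean per 10-minute segment of one metric (1D array, already reduced across
    nodes), mapped to non-IO (0), HighIO (1), CriticalIO (4) via dataset-wide
    percentile thresholds."""
    n = len(timeline) // SEGMENT_LEN * SEGMENT_LEN
    means = timeline[:n].reshape(-1, SEGMENT_LEN).mean(axis=1)
    cats = np.zeros(means.shape, dtype=int)
    cats[means > p99] = 1    # above the 99th percentile -> HighIO
    cats[means > p999] = 4   # above the 99.9th percentile -> CriticalIO
    return cats

def ks_similarity(values_a, values_b):
    """Similarity of two jobs derived from the two-sample KS statistic on their
    observed (pre-processed) values; higher means more similar."""
    result = ks_2samp(values_a, values_b)
    return 1.0 - result.statistic

In this sketch, two jobs whose observed values follow similar distributions yield a similarity close to 1, while strongly differing distributions yield a value close to 0.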