diff --git a/paper/bibliography.bib b/paper/bibliography.bib
index c2a3bfd..c53e787 100644
--- a/paper/bibliography.bib
+++ b/paper/bibliography.bib
@@ -9,11 +9,12 @@
  publisher={Materials and Energy Research Center}
 }
 
+% pages={569--580},
+
 @inproceedings{morse2007efficient,
  title={An efficient and accurate method for evaluating time series similarity},
  author={Morse, Michael D and Patel, Jignesh M},
  booktitle={Proceedings of the 2007 ACM SIGMOD international conference on Management of data},
- pages={569--580},
  year={2007}
 }
 
@@ -28,11 +29,11 @@
  publisher={ACM New York, NY, USA}
 }
 
+% pages={67},
 @article{mendez2012new,
  title={{A new approach for Analyzing I/O in parallel scientific applications}},
  author={M{\'e}ndez, Sandra and Panadero, Javier and Wong, Alvaro and Rexachs, Dolores and Luque, Emilio},
  journal={Computer Science \& Technology Series},
- pages={67},
  year={2012}
 }
 
@@ -56,23 +57,20 @@
  organization={Springer}
 }
 
+% doi = {https://doi.org/10.1007/978-3-030-02465-9_4},
 @inproceedings{TFAPIKBBCF19,
  author = {Julian Kunkel and Eugen Betke and Matt Bryson and Philip Carns and Rosemary Francis and Wolfgang Frings and Roland Laifer and Sandra Mendez},
  title = {{Tools for Analyzing Parallel I/O}},
  year = {2019},
  month = {01},
  booktitle = {{High Performance Computing: ISC High Performance 2018 International Workshops, Frankfurt/Main, Germany, June 28, 2018, Revised Selected Papers}},
- editor = {Rio Yokota and Michele Weiland and John Shalf and Sadaf Alam},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  number = {11203},
  pages = {49--70},
  conference = {HPC-IODC workshop, ISC HPC},
- organization = {ISC Team},
- location = {Frankfurt, Germany},
  isbn = {978-3-030-02465-9},
  issn = {1611-3349},
- doi = {https://doi.org/10.1007/978-3-030-02465-9_4},
  abstract = {Parallel application I/O performance often does not meet user expectations. Additionally, slight access pattern modifications may lead to significant changes in performance due to complex interactions between hardware and software. These issues call for sophisticated tools to capture, analyze, understand, and tune application I/O. In this paper, we highlight advances in monitoring tools to help address these issues. We also describe best practices, identify issues in measurement and analysis, and provide practical approaches to translate parallel I/O analysis into actionable outcomes for users, facility operators, and researchers.},
 }
 
@@ -98,6 +96,9 @@
 }
 
+% location = {Montreal, Canada},
+% url = {https://cug.org/proceedings/cug2019_proceedings/includes/files/pap118s2-file1.pdf},
+
 @inproceedings{AOPIUOTUNS19,
  author = {Andrew Turner and Dominic Sloan-Murphy and Karthee Sivalingam and Harvey Richardson and Julian Kunkel},
  title = {{Analysis of parallel I/O use on the UK national supercomputing service, ARCHER using Cray's LASSi and EPCC SAFE}},
@@ -105,11 +106,10 @@
  month = {10},
  editor = {},
  conference = {CUG},
- location = {Montreal, Canada},
- abstract = {In this paper, we describe how we have used a combination of the LASSi tool (developed by Cray) and the SAFE software (developed by EPCC) to collect and analyse Lustre I/O performance data for all jobs running on the UK national supercomputing service, ARCHER; and to provide reports on I/O usage for users in our standard reporting framework. We also present results from analysis of parallel I/O use on ARCHER and analysis on the potential impact of different applications on file system performance using metrics we have derived from the LASSi data.
-We show that the performance data from LASSi reveals how the same application can stress different components of the file system depending on how it is run, and how the LASSi risk metrics allow us to identify use cases that could potentially cause issues for global I/O performance and work with users to improve their I/O use. We use the IO-500 benchmark to help us understand how LASSi risk metrics correspond to observed performance on the ARCHER file systems. We also use LASSi data imported into SAFE to identify I/O use patterns associated with different research areas, understand how the research workflow gives rise to the observed patterns and project how this will affect I/O requirements in the future. Finally, we provide an overview of likely future directions for the continuation of this work.},
- url = {https://cug.org/proceedings/cug2019_proceedings/includes/files/pap118s2-file1.pdf},
+ abstract = {In this paper, we describe how we have used a combination of the LASSi tool (developed by Cray) and the SAFE software (developed by EPCC) to collect and analyse Lustre I/O performance data for all jobs running on the UK national supercomputing service, ARCHER; and to provide reports on I/O usage for users in our standard reporting framework. We also present results from analysis of parallel I/O use on ARCHER and analysis on the potential impact of different applications on file system performance using metrics we have derived from the LASSi data. We show that the performance data from LASSi reveals how the same application can stress different components of the file system depending on how it is run, and how the LASSi risk metrics allow us to identify use cases that could potentially cause issues for global I/O performance and work with users to improve their I/O use. We use the IO-500 benchmark to help us understand how LASSi risk metrics correspond to observed performance on the ARCHER file systems. We also use LASSi data imported into SAFE to identify I/O use patterns associated with different research areas, understand how the research workflow gives rise to the observed patterns and project how this will affect I/O requirements in the future. Finally, we provide an overview of likely future directions for the continuation of this work.}
 }
+
 @incollection{weber2017visual,
  title={{Visual Comparison of Trace Files in Vampir}},
  author={Weber, Matthias and Brendel, Ronny and Wagner, Michael and Dietrich, Robert and Tsch{\"u}ter, Ronny and Brunst, Holger},
@@ -185,30 +185,27 @@
  year={2019}
 }
-
+%doi = {https://doi.org/10.1007/978-3-030-59851-8_12},
 @inproceedings{betke20,
  author = {Eugen Betke and Julian Kunkel},
  title = {{The Importance of Temporal Behavior when Classifying Job IO Patterns Using Machine Learning Techniques}},
  year = {2020},
  month = {06},
  booktitle = {{High Performance Computing: ISC High Performance 2020 International Workshops, Revised Selected Papers}},
- editor = {Heike Jagode and Hartwig Anzt and Guido Juckeland and Hatem Ltaief},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  number = {12151},
  pages = {191-205},
  conference = {ISC HPC},
- location = {Frankfurt, Germany},
  isbn = {978-3-030-59851-8},
  issn = {1611-3349},
- doi = {https://doi.org/10.1007/978-3-030-59851-8_12},
  abstract = {Every day, supercomputers execute 1000s of jobs with different characteristics. Data centers monitor the behavior of jobs to support the users and improve the infrastructure, for instance, by optimizing jobs or by determining guidelines for the next procurement.
 The classification of jobs into groups that express similar run-time behavior aids this analysis as it reduces the number of representative jobs to look into. It is state of the practice to investigate job similarity by looking into job profiles that summarize the dynamics of job execution into one dimension of statistics and neglect the temporal behavior. In this work, we utilize machine learning techniques to cluster and classify parallel jobs based on the similarity in their temporal IO behavior to highlight the importance of temporal behavior when comparing jobs. Our contribution is the qualitative and quantitative evaluation of different IO characterizations and similarity measurements that work toward the development of a suitable clustering algorithm. We explore IO characteristics from monitoring data of one million parallel jobs and cluster them into groups of similar jobs. Therefore, the time series of various IO statistics is converted into features using different similarity metrics that customize the classification. We discuss conventional ML techniques that are applied to job profiles and contrast this with the analysis of time series data where we apply the Levenshtein distance as a distance metrics. While the employed Levenshtein algorithms aren’t yet optimal, the results suggest that temporal behavior is key to identify related pattern.},
 }
 
 @article{Eugen20HPS,
  title={{Classifying Temporal Characteristics of Job I/O}},
  author={Betke, Eugen and Kunkel, Julian},
- journal={Journal of High Performance Storage},
- issue={1},
+ journal={Journal of High Performance Storage: Incubator},
+ issue={7},
  date={2020}
 }
diff --git a/paper/main.tex b/paper/main.tex
index 0d48c6b..dc4fd0e 100644
--- a/paper/main.tex
+++ b/paper/main.tex
@@ -71,7 +71,8 @@
 \title{A Workflow for Identifying Jobs with Similar I/O Behavior Utilizing Time Series Analysis}
 
 %\author{Julian Kunkel\inst{2} \and Eugen Betke\inst{1}}
-
+\author{}
+\institute{}
 
 %\institute{
 %University of Reading--%
@@ -231,11 +232,10 @@
 No aggregation is performed on the metric dimension.
 
 % Similarity
 For the analysis we use the kolmogorov-smirnov-test 1.1.0 Rust library from the official Rust Package Registry ``cargo.io''.
-The similarity function \Cref{eq:ks_similarity} calculates the mean inverse of reject probability $p_{\text{reject}}$ computed with the ks-test across all metrics $m$.
+The similarity function $sim = \frac{\sum_{m \in M} \left(1 - p_{\text{reject}}(m)\right)}{|M|}$, where $M$ is the set of metrics, calculates the mean of $1 - p_{\text{reject}}(m)$, i.e., the complement of the reject probability computed with the ks-test, across all metrics $m$.
+
+%\begin{equation}\label{eq:ks_similarity}
-\begin{equation}\label{eq:ks_similarity}
- similarity = \frac{\sum_m 1 - p_{\text{reject}(m)}}{|M|}, \text{with } m \in M, \text{where } M \text{ is set of metrics}
-\end{equation}
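
Below is a minimal, self-contained Rust sketch of the similarity computation described in the patched paragraph: for every metric, a two-sample KS test compares the two jobs' value series, and the similarity is the mean of 1 - p_reject over all metrics. It deliberately does not call the kolmogorov-smirnov-test crate referenced in the text; the KS statistic and an asymptotic reject-probability approximation are implemented inline, and the function names and sample data are illustrative assumptions rather than the paper's actual code.

```rust
// Illustrative sketch (not the paper's implementation): per-metric two-sample
// KS statistic plus an asymptotic stand-in for the reject probability that the
// kolmogorov-smirnov-test crate would normally provide.

// Two-sample Kolmogorov-Smirnov statistic D = sup |F_a(x) - F_b(x)|.
fn ks_statistic(a: &mut [f64], b: &mut [f64]) -> f64 {
    a.sort_by(|x, y| x.partial_cmp(y).unwrap());
    b.sort_by(|x, y| x.partial_cmp(y).unwrap());
    let (n, m) = (a.len(), b.len());
    let (mut i, mut j, mut d) = (0usize, 0usize, 0.0f64);
    while i < n && j < m {
        let x = a[i].min(b[j]);
        while i < n && a[i] <= x { i += 1; }
        while j < m && b[j] <= x { j += 1; }
        d = d.max((i as f64 / n as f64 - j as f64 / m as f64).abs());
    }
    d
}

// Asymptotic reject probability 1 - Q_KS(lambda) for the two-sample test
// (series approximation; an assumption of this sketch).
fn reject_probability(d: f64, n: usize, m: usize) -> f64 {
    let en = ((n * m) as f64 / (n + m) as f64).sqrt();
    let lambda = (en + 0.12 + 0.11 / en) * d;
    if lambda < 1e-9 { return 0.0; } // identical samples: never reject
    let mut q = 0.0;
    for k in 1..=100 {
        let sign = if k % 2 == 1 { 1.0 } else { -1.0 };
        q += 2.0 * sign * (-2.0 * (k as f64 * lambda).powi(2)).exp();
    }
    1.0 - q.clamp(0.0, 1.0)
}

// Mean of 1 - p_reject over all metrics: the similarity of two jobs, where each
// job is given as one value series (segment statistics) per metric.
fn similarity(job_a: &[Vec<f64>], job_b: &[Vec<f64>]) -> f64 {
    assert_eq!(job_a.len(), job_b.len(), "jobs must expose the same metrics");
    let sum: f64 = job_a
        .iter()
        .zip(job_b)
        .map(|(ma, mb)| {
            let (mut a, mut b) = (ma.clone(), mb.clone());
            let d = ks_statistic(&mut a, &mut b);
            1.0 - reject_probability(d, a.len(), b.len())
        })
        .sum();
    sum / job_a.len() as f64
}

fn main() {
    // Two hypothetical jobs with two monitored metrics each.
    let job_a = vec![vec![0.1, 0.2, 0.2, 0.4, 0.9], vec![1.0, 1.0, 0.0, 0.5, 0.3]];
    let job_b = vec![vec![0.1, 0.3, 0.2, 0.5, 0.8], vec![0.9, 1.0, 0.1, 0.4, 0.2]];
    println!("similarity = {:.3}", similarity(&job_a, &job_b));
}
```

Merging the two sorted samples keeps the per-metric cost at O(n log n), which matters when the similarity has to be evaluated for every pair of jobs in a large monitoring data set.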