author = {Julian Kunkel and Eugen Betke and Matt Bryson and Philip Carns and Rosemary Francis and Wolfgang Frings and Roland Laifer and Sandra Mendez},
title = {{Tools for Analyzing Parallel I/O}},
year = {2019},
month = {01},
booktitle = {{High Performance Computing: ISC High Performance 2018 International Workshops, Frankfurt/Main, Germany, June 28, 2018, Revised Selected Papers}},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
number = {11203},
pages = {49--70},
conference = {HPC-IODC workshop, ISC HPC},
isbn = {978-3-030-02465-9},
issn = {1611-3349},
abstract = {Parallel application I/O performance often does not meet user expectations. Additionally, slight access pattern modifications may lead to significant changes in performance due to complex interactions between hardware and software. These issues call for sophisticated tools to capture, analyze, understand, and tune application I/O. In this paper, we highlight advances in monitoring tools to help address these issues. We also describe best practices, identify issues in measure- ment and analysis, and provide practical approaches to translate parallel I/O analysis into actionable outcomes for users, facility operators, and researchers.},
}
@inproceedings{bahmani2018chameleon,
title={{Chameleon: Online clustering of mpi program traces}},
abstract = {In this paper, we describe how we have used a combination of the LASSi tool (developed by Cray) and the SAFE software (developed by EPCC) to collect and analyse Lustre I/O performance data for all jobs running on the UK national supercomputing service, ARCHER; and to provide reports on I/O usage for users in our standard reporting framework. We also present results from analysis of parallel I/O use on ARCHER and analysis on the potential impact of different applications on file system performance using metrics we have derived from the LASSi data. We show that the performance data from LASSi reveals how the same application can stress different components of the file system depending on how it is run, and how the LASSi risk metrics allow us to identify use cases that could potentially cause issues for global I/O performance and work with users to improve their I/O use. We use the IO-500 benchmark to help us understand how LASSi risk metrics correspond to observed performance on the ARCHER file systems. We also use LASSi data imported into SAFE to identify I/O use patterns associated with different research areas, understand how the research workflow gives rise to the observed patterns and project how this will affect I/O requirements in the future. Finally, we provide an overview of likely future directions for the continuation of this work.}
author={Lu, Charng-Da and Browne, James and DeLeon, Robert L and Hammond, John and Barth, William and Furlani, Thomas R and Gallo, Steven M and Jones, Matthew D and Patra, Abani K},
title={{Comprehensive resource use monitoring for HPC systems with TACC stats}},
author={Evans, Todd and Barth, William L and Browne, James C and DeLeon, Robert L and Furlani, Thomas R and Gallo, Steven M and Jones, Matthew D and Patra, Abani K},
title={{Characterization and identification of HPC applications at leadership computing facility}},
author={Liu, Zhengchun and Lewis, Ryan and Kettimuthu, Rajkumar and Harms, Kevin and Carns, Philip and Rao, Nageswara and Foster, Ian and Papka, Michael E},
title={{A Workload Analysis of NSF's Innovative HPC Resources Using XDMoD}},
author={Simakov, Nikolay A and White, Joseph P and DeLeon, Robert L and Gallo, Steven M and Jones, Matthew D and Palmer, Jeffrey T and Plessinger, Benjamin and Furlani, Thomas R},
title = {{The Importance of Temporal Behavior when Classifying Job IO Patterns Using Machine Learning Techniques}},
year = {2020},
month = {06},
booktitle = {{High Performance Computing: ISC High Performance 2020 International Workshops, Revised Selected Papers}},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
number = {12151},
pages = {191-205},
conference = {ISC HPC},
isbn = {978-3-030-59851-8},
issn = {1611-3349},
abstract = {Every day, supercomputers execute 1000s of jobs with different characteristics. Data centers monitor the behavior of jobs to support the users and improve the infrastructure, for instance, by optimizing jobs or by determining guidelines for the next procurement. The classification of jobs into groups that express similar run-time behavior aids this analysis as it reduces the number of representative jobs to look into. It is state of the practice to investigate job similarity by looking into job profiles that summarize the dynamics of job execution into one dimension of statistics and neglect the temporal behavior. In this work, we utilize machine learning techniques to cluster and classify parallel jobs based on the similarity in their temporal IO behavior to highlight the importance of temporal behavior when comparing jobs. Our contribution is the qualitative and quantitative evaluation of different IO characterizations and similarity measurements that work toward the development of a suitable clustering algorithm. We explore IO characteristics from monitoring data of one million parallel jobs and cluster them into groups of similar jobs. Therefore, the time series of various IO statistics is converted into features using different similarity metrics that customize the classification. We discuss conventional ML techniques that are applied to job profiles and contrast this with the analysis of time series data where we apply the Levenshtein distance as a distance metrics. While the employed Levenshtein algorithms aren’t yet optimal, the results suggest that temporal behavior is key to identify related pattern.},