@article{khotanlou2018empirical,
  title     = {{An Empirical Comparison of Distance Measures for Multivariate Time Series Clustering}},
  author    = {Khotanlou, Hassan and Salarpour, Amir},
  journal   = {International Journal of Engineering},
  volume    = {31},
  number    = {2},
  pages     = {250--262},
  year      = {2018},
  publisher = {Materials and Energy Research Center}
}
@inproceedings{morse2007efficient,
  title     = {{An efficient and accurate method for evaluating time series similarity}},
  author    = {Morse, Michael D and Patel, Jignesh M},
  booktitle = {{Proceedings of the 2007 ACM SIGMOD international conference on Management of data}},
  pages     = {569--580},
  year      = {2007}
}
@article{navarro2001guided,
  title     = {{A guided tour to approximate string matching}},
  author    = {Navarro, Gonzalo},
  journal   = {ACM computing surveys (CSUR)},
  volume    = {33},
  number    = {1},
  pages     = {31--88},
  year      = {2001},
  publisher = {ACM New York, NY, USA}
}
@article{mendez2012new,
  title   = {{A new approach for Analyzing I/O in parallel scientific applications}},
  author  = {M{\'e}ndez, Sandra and Panadero, Javier and Wong, Alvaro and Rexachs, Dolores and Luque, Emilio},
  journal = {Computer Science \& Technology Series},
  pages   = {67},
  year    = {2012}
}
@article{halawa2020unsupervised,
  title     = {{Unsupervised KPIs-Based Clustering of Jobs in HPC Data Centers}},
  author    = {Halawa, Mohamed S and D{\'\i}az Redondo, Rebeca P and Fern{\'a}ndez Vilas, Ana},
  journal   = {Sensors},
  volume    = {20},
  number    = {15},
  pages     = {4111},
  year      = {2020},
  publisher = {Multidisciplinary Digital Publishing Institute}
}
@inproceedings{emeras2015evalix,
  title        = {{Evalix: classification and prediction of job resource consumption on HPC platforms}},
  author       = {Emeras, Joseph and Varrette, S{\'e}bastien and Guzek, Mateusz and Bouvry, Pascal},
  booktitle    = {Job Scheduling Strategies for Parallel Processing},
  pages        = {102--122},
  year         = {2015},
  organization = {Springer}
}
@inproceedings{TFAPIKBBCF19,
  author     = {Julian Kunkel and Eugen Betke and Matt Bryson and Philip Carns and Rosemary Francis and Wolfgang Frings and Roland Laifer and Sandra Mendez},
  title      = {{Tools for Analyzing Parallel I/O}},
  year       = {2019},
  month      = jan,
  booktitle  = {{High Performance Computing: ISC High Performance 2018 International Workshops, Frankfurt/Main, Germany, June 28, 2018, Revised Selected Papers}},
  publisher  = {Springer},
  series     = {Lecture Notes in Computer Science},
  number     = {11203},
  pages      = {49--70},
  conference = {HPC-IODC workshop, ISC HPC},
  isbn       = {978-3-030-02465-9},
  issn       = {1611-3349},
  doi        = {10.1007/978-3-030-02465-9_4},
  abstract   = {Parallel application I/O performance often does not meet user expectations. Additionally, slight access pattern modifications may lead to significant changes in performance due to complex interactions between hardware and software. These issues call for sophisticated tools to capture, analyze, understand, and tune application I/O. In this paper, we highlight advances in monitoring tools to help address these issues. We also describe best practices, identify issues in measurement and analysis, and provide practical approaches to translate parallel I/O analysis into actionable outcomes for users, facility operators, and researchers.}
}
@inproceedings{bahmani2018chameleon,
  title        = {{Chameleon: Online clustering of MPI program traces}},
  author       = {Bahmani, Amir and Mueller, Frank},
  booktitle    = {{2018 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}},
  pages        = {1102--1112},
  year         = {2018},
  organization = {IEEE}
}
@article{rodrigo2018towards,
  title     = {{Towards understanding HPC users and systems: a NERSC case study}},
  author    = {Rodrigo, Gonzalo P and {\"O}stberg, P-O and Elmroth, Erik and Antypas, Katie and Gerber, Richard and Ramakrishnan, Lavanya},
  journal   = {{Journal of Parallel and Distributed Computing}},
  volume    = {111},
  pages     = {206--221},
  year      = {2018},
  publisher = {Elsevier}
}
@inproceedings{AOPIUOTUNS19,
  author     = {Andrew Turner and Dominic Sloan-Murphy and Karthee Sivalingam and Harvey Richardson and Julian Kunkel},
  title      = {{Analysis of parallel I/O use on the UK national supercomputing service, ARCHER using Cray's LASSi and EPCC SAFE}},
  year       = {2019},
  month      = oct,
  conference = {CUG},
  location   = {Montreal, Canada},
  url        = {https://cug.org/proceedings/cug2019_proceedings/includes/files/pap118s2-file1.pdf},
  abstract   = {In this paper, we describe how we have used a combination of the LASSi tool (developed by Cray) and the SAFE software (developed by EPCC) to collect and analyse Lustre I/O performance data for all jobs running on the UK national supercomputing service, ARCHER; and to provide reports on I/O usage for users in our standard reporting framework. We also present results from analysis of parallel I/O use on ARCHER and analysis on the potential impact of different applications on file system performance using metrics we have derived from the LASSi data. We show that the performance data from LASSi reveals how the same application can stress different components of the file system depending on how it is run, and how the LASSi risk metrics allow us to identify use cases that could potentially cause issues for global I/O performance and work with users to improve their I/O use. We use the IO-500 benchmark to help us understand how LASSi risk metrics correspond to observed performance on the ARCHER file systems. We also use LASSi data imported into SAFE to identify I/O use patterns associated with different research areas, understand how the research workflow gives rise to the observed patterns and project how this will affect I/O requirements in the future. Finally, we provide an overview of likely future directions for the continuation of this work.}
}
@incollection{weber2017visual,
  title     = {{Visual Comparison of Trace Files in Vampir}},
  author    = {Weber, Matthias and Brendel, Ronny and Wagner, Michael and Dietrich, Robert and Tsch{\"u}ter, Ronny and Brunst, Holger},
  booktitle = {Programming and Performance Visualization Tools},
  pages     = {105--121},
  year      = {2017},
  publisher = {Springer}
}
@inproceedings{demasi2013identifying,
  title     = {{Identifying HPC codes via performance logs and machine learning}},
  author    = {DeMasi, Orianna and Samak, Taghrid and Bailey, David H},
  booktitle = {Proceedings of the first workshop on Changing landscapes in HPC security},
  pages     = {23--30},
  year      = {2013}
}
@inproceedings{lu2013comprehensive,
  title     = {{Comprehensive job level resource usage measurement and analysis for XSEDE HPC systems}},
  author    = {Lu, Charng-Da and Browne, James and DeLeon, Robert L and Hammond, John and Barth, William and Furlani, Thomas R and Gallo, Steven M and Jones, Matthew D and Patra, Abani K},
  booktitle = {{Proceedings of the Conference on Extreme Science and Engineering Discovery Environment: Gateway to Discovery}},
  pages     = {1--8},
  year      = {2013}
}
@inproceedings{evans2014comprehensive,
  title        = {{Comprehensive resource use monitoring for HPC systems with TACC stats}},
  author       = {Evans, Todd and Barth, William L and Browne, James C and DeLeon, Robert L and Furlani, Thomas R and Gallo, Steven M and Jones, Matthew D and Patra, Abani K},
  booktitle    = {{2014 First International Workshop on HPC User Support Tools}},
  pages        = {13--21},
  year         = {2014},
  organization = {IEEE}
}
@inproceedings{liu2020characterization,
  title     = {{Characterization and identification of HPC applications at leadership computing facility}},
  author    = {Liu, Zhengchun and Lewis, Ryan and Kettimuthu, Rajkumar and Harms, Kevin and Carns, Philip and Rao, Nageswara and Foster, Ian and Papka, Michael E},
  booktitle = {{Proceedings of the 34th ACM International Conference on Supercomputing}},
  pages     = {1--12},
  year      = {2020}
}
@inproceedings{bang2020hpc,
  title     = {{HPC Workload Characterization Using Feature Selection and Clustering}},
  author    = {Bang, Jiwoo and Kim, Chungyong and Wu, Kesheng and Sim, Alex and Byna, Suren and Kim, Sunggon and Eom, Hyeonsang},
  booktitle = {Proceedings of the 3rd International Workshop on Systems and Network Telemetry and Analytics},
  pages     = {33--40},
  year      = {2020}
}
@article{simakov2018workload,
  title         = {{A Workload Analysis of NSF's Innovative HPC Resources Using XDMoD}},
  author        = {Simakov, Nikolay A and White, Joseph P and DeLeon, Robert L and Gallo, Steven M and Jones, Matthew D and Palmer, Jeffrey T and Plessinger, Benjamin and Furlani, Thomas R},
  journal       = {arXiv preprint arXiv:1801.04306},
  eprint        = {1801.04306},
  archiveprefix = {arXiv},
  year          = {2018}
}
@inproceedings{white2018automatic,
  title     = {{Automatic Characterization of HPC Job Parallel Filesystem I/O Patterns}},
  author    = {White, Joseph P and Kofke, Alexander D and DeLeon, Robert L and Innus, Martins and Jones, Matthew D and Furlani, Thomas R},
  booktitle = {{Proceedings of the Practice and Experience on Advanced Research Computing}},
  pages     = {1--8},
  year      = {2018}
}
@inproceedings{chan2019resource,
  title     = {{A Resource Utilization Analytics Platform Using Grafana and Telegraf for the Savio Supercluster}},
  author    = {Chan, Nicolas},
  booktitle = {{Proceedings of the Practice and Experience in Advanced Research Computing on Rise of the Machines (learning)}},
  pages     = {1--6},
  year      = {2019}
}
@inproceedings{betke20,
  author     = {Eugen Betke and Julian Kunkel},
  title      = {{The Importance of Temporal Behavior when Classifying Job IO Patterns Using Machine Learning Techniques}},
  year       = {2020},
  month      = jun,
  booktitle  = {{High Performance Computing: ISC High Performance 2020 International Workshops, Revised Selected Papers}},
  publisher  = {Springer},
  series     = {Lecture Notes in Computer Science},
  number     = {12151},
  pages      = {191--205},
  conference = {ISC HPC},
  isbn       = {978-3-030-59851-8},
  issn       = {1611-3349},
  doi        = {10.1007/978-3-030-59851-8_12},
  abstract   = {Every day, supercomputers execute 1000s of jobs with different characteristics. Data centers monitor the behavior of jobs to support the users and improve the infrastructure, for instance, by optimizing jobs or by determining guidelines for the next procurement. The classification of jobs into groups that express similar run-time behavior aids this analysis as it reduces the number of representative jobs to look into. It is state of the practice to investigate job similarity by looking into job profiles that summarize the dynamics of job execution into one dimension of statistics and neglect the temporal behavior. In this work, we utilize machine learning techniques to cluster and classify parallel jobs based on the similarity in their temporal IO behavior to highlight the importance of temporal behavior when comparing jobs. Our contribution is the qualitative and quantitative evaluation of different IO characterizations and similarity measurements that work toward the development of a suitable clustering algorithm. We explore IO characteristics from monitoring data of one million parallel jobs and cluster them into groups of similar jobs. Therefore, the time series of various IO statistics is converted into features using different similarity metrics that customize the classification. We discuss conventional ML techniques that are applied to job profiles and contrast this with the analysis of time series data where we apply the Levenshtein distance as a distance metrics. While the employed Levenshtein algorithms aren’t yet optimal, the results suggest that temporal behavior is key to identify related pattern.}
}
@article{Eugen20HPS,
  author   = {Eugen Betke and Julian Kunkel},
  title    = {{Classifying Temporal Characteristics of Job I/O Using Machine Learning Techniques}},
  year     = {2021},
  month    = jan,
  journal  = {Journal of High Performance Computing},
  series   = {Issue 1},
  doi      = {10.5281/zenodo.4478960},
  abstract = {Every day, supercomputers execute 1000s of jobs with different characteristics. Data centers monitor the behavior of jobs to support the users and improve the infrastructure, for instance, by optimizing jobs or by determining guidelines for the next procurement. The classification of jobs into groups that express similar run-time behavior aids this analysis as it reduces the number of representative jobs to look into. This work utilizes machine learning techniques to cluster and classify parallel jobs based on the similarity in their temporal I/O behavior. Our contribution is the qualitative and quantitative evaluation of different I/O characterizations and similarity measurements and the development of a suitable clustering algorithm. <br><br> In the evaluation, we explore I/O characteristics from monitoring data of one million parallel jobs and cluster them into groups of similar jobs. Therefore, the time series of various I/O statistics is converted into features using different similarity metrics that customize the classification. <br><br> When using general-purpose clustering techniques, suboptimal results are obtained. Additionally, we extract phases of I/O activity from jobs. Finally, we simplify the grouping algorithm in favor of performance. We discuss the impact of these changes on the clustering quality.}
}