Reading CPU data into database (not complete)

Eugen Betke 2018-12-14 19:00:16 +01:00
parent 08be8ad2af
commit db43c985a3
3 changed files with 499 additions and 15 deletions

View File

@ -21,12 +21,16 @@ make_facet_label <- function(variable, value){
return(paste0(value, " KiB"))
to_fn_str <- function(name) {
tolower(chartr(' ', '_', name))
#connection = dbConnect(SQLite(), dbname='results.ddnime.db')
connection = dbConnect(SQLite(), dbname=file_db)
dbdata = dbGetQuery(connection,'select * from p where count!=1 and fs=="gpfs"' )
dbdata = dbGetQuery(connection,'select * from p' )
dbdata[,"blocksize"] = dbdata$t
@ -43,16 +47,25 @@ dbdata$lab_access <- dbdata$access
dbdata$lab_access[dbdata$lab_access == "write"] = "Write"
dbdata$lab_access[dbdata$lab_access == "read"] = "Read"
dbdata$lab_fs <- dbdata$fs
dbdata$lab_fs[dbdata$lab_fs == "gpfs"] = "IBM Spectrum Scale"
dbdata$lab_fs[dbdata$lab_fs == "fuse"] = "IME FUSE"
dbdata$lab_fs[dbdata$lab_fs == "ime"] = "IME"
dbdata$lab_count <- sprintf('Iteration %d', dbdata$count)
for (scale in c("linear", "logarithmic")) {
for (lab_fs in unique(dbdata$lab_fs)) {
#for (scale in c("linear", "logarithmic")) {
for (scale in c("linear")) {
p = ggplot(data=dbdata, aes(x=nn, y=bwMiB, colour=as.factor(blocksize/1024), group=blocksize), ymin=0) +
p = ggplot(data=dbdata[dbdata$lab_fs==lab_fs,], aes(x=nn, y=bwMiB, colour=as.factor(blocksize/1024), group=blocksize), ymin=0) +
#aes(x=nn, y=bwMiB) +
ggtitle("Independent random access to a shared file with IOR") +
facet_grid(fs + ppn ~ lab_access + count, labeller = labeller(nn = as_labeller(nn_lab), ppn = as_labeller(ppn_lab))) +
#facet_grid(fs + ppn ~ lab_access, labeller = labeller(nn = as_labeller(nn_lab), ppn = as_labeller(ppn_lab))) +
ggtitle(sprintf("%s independent random access to a shared file with IOR", lab_fs)) +
facet_grid(ppn ~ lab_access + lab_count, labeller = labeller(nn = as_labeller(nn_lab), ppn = as_labeller(ppn_lab))) +
#facet_grid(lab_fs + ppn ~ lab_access, labeller = labeller(nn = as_labeller(nn_lab), ppn = as_labeller(ppn_lab))) +
xlab("Nodes") +
ylab("Performance in MiB/s") +
theme(axis.text.x=element_text(angle=90, hjust=0.95, vjust=0.5)) +
@ -62,6 +75,7 @@ for (scale in c("linear", "logarithmic")) {
scale_color_manual(name="Blocksize in KiB: ", values=c('#999999','#E69F00', '#56B4E9', '#000000'), breaks=sort(unique(dbdata$blocksize)/1024)) +
#stat_summary(fun.y="median", geom="line", aes(group=factor(blocksize))) +
stat_summary(fun.y="mean", geom="line", aes(group=factor(blocksize))) +
#ylim(0, max_y) +
#geom_point(data=dbdata, aes(x=nn, y=PortRcvData), colour='red') +
@ -71,19 +85,19 @@ for (scale in c("linear", "logarithmic")) {
p = p + scale_y_log10()
filename_eps = sprintf("%s/performance_%s.eps", folder_out, scale)
filename_png = sprintf("%s/performance_%s.png", folder_out, scale)
filename_eps = sprintf("%s/%s_%s_performance.eps", folder_out, to_fn_str(lab_fs), scale)
filename_png = sprintf("%s/%s_%s_performance.png", folder_out, to_fn_str(lab_fs), scale)
ggsave(filename_png, width = 10, height = 8)
ggsave(filename_eps, width = 10, height = 8)
#system(sprintf("epstopdf %s", filename_eps))
system(sprintf("rm %s", filename_eps))
p = ggplot(data=dbdata, ymin=0) +
p = ggplot(data=dbdata[dbdata$lab_fs==lab_fs,], ymin=0) +
aes(x=nn, y=(PortXmitData + PortRcvData) * 4 / ppn, colour=as.factor(blocksize/1024), group=blocksize) +
#aes(x=nn, y=bwMiB) +
ggtitle('Infiniband throughput (PortRcvData and PortXmitData by "perfquery -x")') +
facet_grid(fs + ppn ~ lab_access + count, labeller = labeller(nn = as_labeller(nn_lab), ppn = as_labeller(ppn_lab))) +
#facet_grid(fs + ppn ~ lab_access, labeller = labeller(nn = as_labeller(nn_lab), ppn = as_labeller(ppn_lab))) +
ggtitle(sprintf('%s Infiniband throughput (PortRcvData and PortXmitData by "perfquery -x")', lab_fs)) +
facet_grid(ppn ~ lab_access + lab_count, labeller = labeller(nn = as_labeller(nn_lab), ppn = as_labeller(ppn_lab))) +
#facet_grid(lab_fs + ppn ~ lab_access, labeller = labeller(nn = as_labeller(nn_lab), ppn = as_labeller(ppn_lab))) +
xlab("Nodes") +
ylab("Performance in MiB/s") +
theme(axis.text.x=element_text(angle=90, hjust=0.95, vjust=0.5)) +
@ -92,6 +106,7 @@ for (scale in c("linear", "logarithmic")) {
scale_x_log10(breaks = c(unique(dbdata$nn))) +
scale_color_manual(name="Blocksize in KiB: ", values=c('#999999','#E69F00', '#56B4E9', '#000000'), breaks=sort(unique(dbdata$blocksize)/1024)) +
stat_summary(fun.y="mean", geom="line", aes(group=factor(blocksize))) +
#ylim(0, max_y) +
#geom_point(data=dbdata, aes(x=nn, y=PortXmitData), colour='blue')
@ -99,11 +114,11 @@ for (scale in c("linear", "logarithmic")) {
p = p + scale_y_log10()
filename_eps = sprintf("%s/ib_%s.eps", folder_out, scale)
filename_png = sprintf("%s/ib_%s.png", folder_out, scale)
filename_eps = sprintf("%s/%s_%s_ib.eps", folder_out, to_fn_str(lab_fs), scale)
filename_png = sprintf("%s/%s_%s_ib.png", folder_out, to_fn_str(lab_fs), scale)
ggsave(filename_png, width = 10, height = 8)
ggsave(filename_eps, width = 10, height = 8)
#system(sprintf("epstopdf %s", filename_eps))
system(sprintf("rm %s", filename_eps))

View File

@ -60,6 +60,9 @@ def parseSysCounters(fn:str):
table['PortXmitData'].append(stop['PortXmitData'] - start['PortXmitData'])
table['PortRcvData'].append(stop['PortRcvData'] - start['PortRcvData'])
res['duration'] = stats.hmean(table['duration'])
res['PortXmitData'] = np.sum(table['PortXmitData']) / res['duration'] / 1024 / 1024
res['PortRcvData'] = np.sum(table['PortRcvData']) / res['duration'] / 1024 / 1024
@ -76,6 +79,19 @@ def _parseSysCounters(fn:str):
if (m):
res[] = int(
m = re.match("cpu([0-9]+)\s+([0-9]+)\s+([0-9]+)\s+([0-9]+)\s+([0-9]+)\s+([0-9]+)\s+([0-9]+)\s+([0-9]+)\s+([0-9]+)\s+([0-9]+)\s+([0-9]+)", line)
res['num'] = int(
res['user'] = int(
res['nice'] = int(
res['system'] = int(
res['idle'] = int(
res['iowait'] = int(
res['irq'] = int(
res['softirq'] = int(
if (m):
res[] = int(
m = re.match("(PortXmitData):\.+([0-9]+)", line)
if (m):
res[] = int(

paper/paper-old.tex 100644
View File

@ -0,0 +1,453 @@
\WarningFilter{latex}{Text page}
\newcommand{\jk}[1]{\todo[inline]{JK: #1}}
\newcommand{\eb}[1]{\todo[inline]{EB: #1}}
%\newcommand{\ebr}[2]{ \hl{\replaced{ #1 }{ #2 }} }
\WarningFilter{llncs}{The remreset package is obsolete}
%\WarningFilter{latexfont}{Font shape}
%\WarningFilter{latexfont}{Some font}
\pagestyle{headings} % switches on printing of running heads
\title{Benefit of DDN's IME-FUSE for I/O intensive HPC applications.}
%\author{Double blind}
\author{Eugen Betke\inst{1} \and Julian Kunkel\inst{2}}
\institute{Deutsches Klimarechenzentrum, Hamburg, Germany,\\
University of Reading, Reading, United Kingdom,\\
\maketitle % typeset the title of the contribution
Many scientific applications are limited by I/O performance offered by parallel file systems on conventional storage systems.
Flash-based burst buffers provide significant better performance than HDD backed storage, but at the expense of capacity.
Burst buffers are considered as the next step towards achieving wire-speed of interconnect and providing more predictable low latency I/O, which are the holy grail of storage.
A critical evaluation of storage technology is mandatory as there is no long-term experience with performance behavior for particular applications scenarios.
The evaluation enables data centers choosing the right products and system architects the integration in HPC architectures.
This paper investigates the native performance of DDN-IME, a flash-based burst buffer solution.
Then, it takes a closer look at the IME-FUSE file systems, which uses IMEs as burst buffer and a Lustre file system as back-end.
Finally, by utilizing a NetCDF benchmark, it estimates the performance benefit for climate applications.
\keywords{Lustre, FUSE, evaluation, flash-based storage}
The dilemma of conventional high-performance storage systems based on HDDs is that they must maximize the throughput to reduce application run times and at the same time they shall minimize the provided bandwidth to reduce costs.
The first requirement is often prioritized to the detriment of the second one, which typically ends up in the oversizing and in a low average usage of the bandwidth procured.
The prioritization is motivated by the requirement to process large performance peaks particular due to checkpoint/restart workloads, that often occur in large-scale applications.
However, since these systems are optimized for sequential I/O, data-intense workloads that are not following this pattern are unable to saturate the network -- reducing the effective utilization.
% Hardware solutions
Traditional parallel file systems can be deployed on flash-based storage instead of HDDs, increasing performance for random workloads.
A nice work in this direction was done in \cite{heben2014lfsperf}.
Typically, data is accessed via POSIX interfaces but can be accessed using MPI-IO~\cite{Thakur:1999:IMP:301816.301826}.
MPI-IO is a widely accepted middleware layer for parallel I/O that relaxes the POSIX semantics and is designed for parallel I/O.
In an alternative storage architecture, a burst buffer~\cite{Liu12onthe,romanus2015challenges} is placed between compute nodes and the storage.
Acting as an intermediate storage tier, it's goal is to catch the I/O peaks from the compute nodes.
Therefore, it provides a low latency and high bandwidth to the compute nodes, but also utilizes the back-end storage by streaming data constantly at a lower bandwidth.
%It allows the large-scale applications to share files efficiently, facilitating the programming efforts and is a part of many libraries and used by a wide range of applications.
In-memory systems, like the Kove\textsuperscript{\textregistered} XPD\textsuperscript{\textregistered}~\cite{kove:xpd:l2}, provide byte-addressable storage with better latency, endurance and availability as flash chips.
Flash-based systems, like DDN IME \cite{ddnime2015}, are also byte-addressable, but have different characteristics than an in-memory storage, for example, flash offers a better costs per gigabyte ratio.
%The address space of burst-buffer can be used to deploy a parallel file system, but performance would be limited by the POSIX semantics.
%For large data this solution may be not suitable due space limitation.
%In contrast, the relaxed MPI-IO semantics enables a lock-free access.
Accessing a fast storage over a POSIX compliant file system or MPI-IO interface is an interesting option for many users, because neither changes in source code, nor software recompilation is required as long as it doesn't degrade the performance too much.
Closed source and pre-compiled applications could also benefit from that.
For that purpose, DDN developed a fuse module (IME-FUSE) which uses IME as a burst buffer and stores data on a parallel file system.
In this evaluation we used Lustre as back-end.
%\jk{Lustre is now what?}
Our \textbf{contributions} are: 1) we investigate peak performance of IME-native and IME-FUSE, and compare it to Lustre,
2) we estimate the performance behaviour for HPC applications, that access data using NetCDF library.
This paper is structured as follows:
\Cref{sec:relatedWork} discusses related work, then Section 3 describes the test environment.
Section 4 and 5 show the test setup and performance results.
Finally, the paper is summarized in Section 6.
\section{Related Work}
% Burst buffer hardware
Relevant state-of-the-art can be grouped into performance optimization, burst buffers to speedup I/O and in-memory storage solutions.
Optimization and tuning of file systems and I/O libraries is traditionally an important but daunting task as many configuration knobs can be considered in parallel file system servers, clients and the I/O middleware.
Without tuning, typical workloads stay behind the peak-performance by orders of magnitude.
With considerable tuning effort a well fitting problem can yield good results: \cite{HDF5Intro} reports 50\% peak performance with a single 291~TB file.
In \cite{howison2012tuning} MPI-IO and HDF5 were optimized and adapted to each other, improving write throughput by 1.4x to 33x.
Many existing workloads can take benefit of a burst buffer as a fast write-behind cache that transparently migrates data from the fast storage to traditional parallel file system.
Burst buffers typically rely on flash or NVRAM to support random I/O workloads.
For flash based SSDs, many vendors offer high-performance storage solutions, for example, DDN Infinite Memory Engine (IME)~\cite{DDNIME}, IBM FlashSystem~\cite{IBMFlash} and Cray's DataWarp accelerator~\cite{CrayDataWarp}.
Using comprehensive strategies to utilize flash chips concurrently, these solutions are powerful and robust to guarantee availability and durability of data for many years.
The integration of Cray DataWarp burst buffer into the NERSC HPC architecture \cite{pdswDataWarp} increased the I/O performance of Chumbo-Crunch simulator by 2.84x to 5.73x, compared to Lustre.
However, for the sake of efficient burst buffer usage, the serial simulator workflow had to be split into single stages (i.e., simulation, visualization, movie encoding), which then were executed in parallel.
The research group at JSC uses DDN IME burst buffer \cite{Schenck2016} and GPFS to identify requirements for the next HPC generation.
The main purpose is to accelerate the I/O performance of the NEST (``NEural Simulation Tool``).
The preliminary IOR experiments show, that I/O performance can be increased upto 20x.
BurstFS \cite{Wang:2016:EBF:3014904.3014997} uses local NVRAM of compute nodes, instead of dedicated remote machines.
An elaborated communication scheme interconnects the distributed NVRAM and provides a contiguous storage space.
This storage is allocated at beginning and exists for the lifetime of the job.
In the experiments, BurstFS outperforms OrangeFS and PLFS by several times.
%In \cite{sato2014user}, a user-level InfiniBand-based file system is designed as intermediate layer between compute nodes and parallel file system.
%With SSDs and FDR Infiniband, they achieve on one server a throughput of 2~GB/s and 3~GB/s for write and read, respectively.
The usage of DRAM for storing intermediate data is not new and RAM drives have been used in MSDOS and Linux (with tmpfs) for decades.
However, offered RAM storage was used as temporary local storage and not durable and usually not accessible from remote nodes.
Exporting tmpfs storage via parallel file systems has been used mainly for performance evaluation but without durability guarantees.
Wickberg and Carothers introduced the RAMDISK Storage Accelerator~\cite{wickberg2012ramdisk} for HPC applications that flushes data to a back-end.
It consists of a set of dedicated nodes that offer in-memory scratch space.
Jobs can use the storage to pre-fetch input data prior job execution or as write-behind cache to speedup I/O.
A prototype with a PVFS-based RAMDISK improved performance of 2048 processes compared to GPFS (100~MB/s vs. 36~MB/s for writes).
Burst-mem~\cite{wang2014burstmem} provides a burst buffer with write-behind capabilities by extending Memcached~\cite{jose2011memcached}.
Experiments show that the ingress performance grows up to 100~GB/s with 128 BurstMem servers.
%An extension of the work discusses resilience on server failures with minor performance reductions~\cite{wang2015development}.
In the field of big data, in-memory data management and processing has become popular with Spark~\cite{zaharia2012resilient}.
Now there are many software packages providing storage management and compute engines~\cite{zhang2015memory}.
%By using such tools, various application workloads have been accelerated significantly.
The Kove XPD~\cite{kove:xpd:l2} is a robust scale-out pooled memory solution that allows aggregating multiple Infiniband links and devices into one big virtual address space that can be dynamically partitioned.
Internally, the Kove provides persistence by periodically flushing memory with a SATA RAID.
Due to the performance differences, the process comes with a delay, but the solution is connected to a UPS to ensure that data becomes durable in case of a power outage.
While providing many interfaces, the XPD does not offer a shared storage that can be utilized from multiple nodes concurrently.
\section{Test environment}
%\subsection{DDN cluster}
DDN provided access to their test cluster in D\"usseldorf on which 10 nodes could be used for testing.
Each node is equipped with two Sandy Bridge processors (8 cores, E5-2650v2 @2.60GHz) and 64~GB RAM.
They are interconnected with a Mellanox Connect-X-4 card providing 100 Gb/sec (4x EDR).
As storage, a DDN ES14K (Exascale 3.1) with two metadata servers and Lustre 2.10.5 is provided; additionally, an IME system consisting of 4 servers is provided.
The flash native data cache of IME acts as a burst buffer and is drained to the Lustre system, the performance reported with IOR is 85~GB/s in write mode.
The DDN IME provides byte-addressable flash-based storage space with high performance characteristics.
It can be addressed directly (IME-native) in a fast and efficient way, but DDN also provides a number of convenient solutions, that require less integration effort.
(1) The applications can be re-linked to the MPI-IO implementation with IME support, which was developed by DDN.
(2) Then, DDN provides a fuse module (IME-FUSE) with IME support, which are convenient ways to access a shared storage.
Both file systems are POSIX compliant and can be used by the applications without any source code modification, recompilation, or re-linking.
%\jk{Was ist patched Lustre.}
In the conducted tests, IME is used via its FUSE mount and backed by the DDN Lustre.
We assume during the write experiment, data is kept inside the burst buffer and not written back, albeit we cannot ensure this.
\caption{DDN test cluster}
The DDN cluster is a experimental system with a lightweight software setup.
Especially, the exclusive access to the IME was not guaranteed, so that some results could be affected by other users.
Therefore, we don't draw conclusions from outliers, since we don't know the origin of them.
As our primary benchmark, IOR~\cite{loewe2012ior} is used varying access granularity, processes-per-node, nodes and access pattern (random and sequential).
The official version of IOR allows us to measure the real performance without considering open/close times (see \Cref{eq:ior_official}).
To synchronize the measurements and capture time for open, close and I/O separately, inter-phase barriers are turned on (IOR option -g).
The DDN version (IME-IOR) supports IME-native interface, but doesn't allow measuring real I/O performance.
Therefore, the performance values include open/close times (see \Cref{eq:ior_ddn}).
\text{perf}_\text{Lustre, IME-FUSE} = \frac{\text{filesize}}{t_{\text{io}}}
\text{perf}_\text{IME-native} = \frac{\text{filesize}}{t_{\text{total}}} = \frac{\text{filesize}}{t_{\text{open}} + t_{\text{io}} + t_{\text{close}}}
\eb{Fixing IOR to make it report open, close, connection and I/O times}
Since the IOR benchmarks does not support NetCDF, and HDF5 is only supported with limited configuration of the pattern,
additionally, the NetCDF-Bench has been used\footnote{\url{}}.
This benchmark uses the parallel NetCDF interface to read/write patterns on a 4D dataset into a NetCDF4/HDF5 file.
It decomposes a domain geometry of ($t$,$x$,$y$,$z$), e.g., ($100$,$16$,$64$,$4$) across the processes of an MPI parallel program.
The processes partition the geometry in x and y direction and one time step is accessed per iteration of each parallel process.
Various options to control the optimizations and data mappings from NetCDF are exported by the benchmark (chunking vs. fixed layout, unbound dimensions, chunk size, pre-filling).
Finally, to measure performance of individual operations to investigate variability, the sequential benchmark \texttt{io-modelling} is used\footnote{\url{}}.
It uses a high-precision timer and supports various access patterns on top of the POSIX interface.
\section{Experiment Configuration}
On the DDN cluster, we use NetCDF-Bench, IOR, and IME-IOR to measure the IME's throughput, and use \texttt{io-modelling} for testing variability.
Each test configuration is repeated 10 times.
All experiments are conducted with block sizes 16, 100, 1024, and 10240~KiB.
\hl{To find the performance limits of the test system we use the IOR benchmarks.
For that purpose, we conduct a series of experiments with various parameters, where we measure the performance for \{read, write\} $\times$ \{POSIX, MPIIO\} $\times$ \{Lustre, IME-FUSE, IME-native\} $\times$ \{shared file, independent files\}.
The stripe count on Lustre is twice as large as the number of nodes, but is limited by at most 8 stripes, since Lustre has only 8 OSTs.}
The purpose of NetCDF-Bench is to investigate the I/O behaviour of typical scientific application, that access large variable through NetCDF4.
In the experiment, we varied the following parameters: \{Lustre, IME-FUSE\} $\times$ \{read, write\} $\times$ \{chunked, contiguous\} $\times$ \{collective, independent\}.
With \texttt{io-modelling} benchmark we looked at the variability of individual I/O accesses \{Lustre, IME-FUSE\} $\times$ \{read, write\} $\times$ \{random, sequential\}.
\subsection{Open/close times}
The time of open/close reduces the reported performance of IME.
They are dropped whenever possible for two reason.
Firstly, in our experiments the test file size is variable ($\text{filesize} = 100 \cdot \text{blocksize} \cdot \text{NN} \cdot \text{PPN}$), it affects small experiments more than the larger ones.
Additionally, it should be noted, that for production runs, larger files and capacities are assumed, reducing this overhead.
Unless otherwise stated, the performance reported in this paper was measured without open/close times.
The goal of our evaluation is to systematically investigate the scaling behavior of the DDN IME's, IME-FUSE and Lustre.
In the following experiments we use 1-10 client nodes (NN) and 1-8 processes per node (PPN) to push hardware to the limits.
On each compute node only one CPU is used, that is connected directly to the Infiniband adapter, to avoid the UPI overhead.
To provide reliable results, each experiment was repeated 10 times.
%\jk{Wie oft wurden die Experimente wiederholt steht nirgendwo?}
%\subsubsection{Peak performance}
\Cref{tab:bestperf_nn10} shows the best and the average performance values that were observed with IME-IOR during the test runs on a single node and on 10 nodes for random and sequential I/O.
Based on average performance for random I/O with NN=1 and PPN=8, 10 client nodes can achieve a throughput of 61~GB/s and of 80~GB/s for write and read, respectively.
As \Cref{tab:bestperf_nn10} shows, the measured write performance is similar to expected values, which indicates that the compute nodes are the bottlenecks.
But the measured read performance is significantly lower than expected.
This indicates, that the bottleneck here are the IMEs.
The same considerations apply to sequential performance.
\caption{Random access performance depending on blocksize and PPN}
& & \multicolumn{2}{c|}{Best} & \multicolumn{2}{c|}{Mean} & & \\
NN & PPN & \multicolumn{2}{c|}{Performance} & \multicolumn{2}{c|}{Performance} & I/O type & File size \\
& & \multicolumn{2}{c|}{in [MiB/s]} & \multicolumn{2}{c|}{in [MiB/s]} & & in [MiB] \\
& & read & write & read & write & & \\
1 & 1 & 2,560 & 1,240 & 2,400 & 1,180 & rnd & 1000 \\
1 & 1 & 2,290 & 1,230 & 2,000 & 870 & seq & 1000 \\
1 & 8 & 8,500 & 6,390 & 8,100 & 6,120 & rnd & 8000 \\
1 & 8 & 8,700 & 6,380 & 7,100 & 4,530 & seq & 8000 \\
10 & 1 & 22,300 & 10,700 & 21,200 & 10,000 & rnd & 10000 \\
10 & 1 & 23,200 & 10,800 & 22,200 & 8,430 & seq & 10000 \\
10 & 8 & 67,500 & 60,200 & 65,300 & 58,400 & rnd & 80000 \\
10 & 8 & 67,500 & 62,900 & 61,700 & 54,300 & seq & 80000 \\
\caption{The best and mean performance measured with IME-IOR (blocksize: 10MiB) (NN: number of nodes; PPN: processes per node).}
\textbf{IME-native (\Cref{fig:read_write_ime,fig:overview_ime}):}
Characteristic for IME-native is that for each block size, there is a linear dependency between read and write accesses.
The performance behavior for each block size can be approximated by a linear function and that small block sizes tend to have better write behaviour.
The complete set of performance results for random I/O is shown in \Cref{fig:overview_ime}.
Firstly, it confirms the linear scalability.
Secondly, there is also no regression of the curves, probably because the experiment setup couldn't push the IMEs to the limits.
Further observations are:
1) writing small blocks is more efficient than reading small blocks; reading large blocks is more efficient that writing large blocks,
%(2) the best configuration is not able to achieve the wire speed of the interconnect,
%(2) for large block sizes, a high percentage of peak is achieved quickly,
2) performance increases with increasing access granularity.
3) with 1 or 4 PPN the available network bandwidth is not utilized.
With PPN=8, we are close to the available network bandwidth for 1 and 10 MiB accesses.
Hence, the I/O path involves relevant latencies.
%\jk{Im Bild braucht es als TEXT PPN, versteht man sonst nicht}
\caption{IME-native random I/O performance (lines go through max. values)}
\textbf{Lustre (\Cref{fig:overview_lustre}):}
Firstly, a single node can profit from caching, when reading data.
In this case observable performance can rise up to 37~GiB/s (not shown in the figure).
The caching effects disappear for $\text{NN}>1$, hence we ignore them in further discussion.
\caption{Random I/O performance (lines go through max. values)}
\eb{Rerun experiments with large files for IME.}
Secondly, the read performance don't exceed 17.4~GiB/s, and is achieved with NN=10, PPN=8, BS=100~KiB.
This is a contra-intuitive, because usually large block size show better performance.
The best write performance is 11.8~GiB/s, and is achieved with NN=4, PPN=6, BS=1000~KiB.
This measurement and the incrementally flattening curve indicate a poor scalability of Lustre.
Generally speaking, Lustre has a lot of internal overhead, especially to make it POSIX compliant, e.g. distributed lock management.
Thirdly, a particular striking point is the result for MPI-IO write performance.
It is significantly lower than for other configurations.
For this behaviour we have no explanation at the moment.
It is also a confusing result, because it is in contradiction to our later experiment with NetCDF-Bench (\Cref{fig:netcdf_perf}).
NetCDF4 uses MPI-IO as back-end, but achieves better results.
%Although, a comparison is possible to a limited extend only, the benchmarks use different access pattern.
\textbf{IME-FUSE (\Cref{fig:overview_fuse}):}
The file system shows a linear scalability, similar to the IME-native, but provides less I/O performance, especially for reading.
This is probably caused by the FUSE overhead, which includes moving I/O requests from user space to kernel space, and then from kernel space to IME-FUSE.
\subsection{Application Kernel Using HDF5}
\eb{Rerun NetCDF with larger files. NetCDF-Bench requires a stonewalling feature, like IOR. This would make benchmark runtime more predictable.}
\eb{Compilation of NetCDF with DDN-IME support.}
In this experiment, the HDF5 VOL development branch (date 2016-05-09), NetCDF~4.4.1 and NetCDF-bench is used.
%Several values for the 4D data geometry (($100$:$16$:$64$:$4$) $\approx$ 3.1MiB, ($100$:$16$:$64$:$25$) $\approx$ 19.5iB, ($100$:$16$:$64$:$256$) $\approx$ 200MiB, ($100$:$16$:$64$:$2560$) $\approx$ 2000MiB) of raw integer data have been explored.
Several values for the 4D data geometry of raw integer data have been explored.
For each block size we did 100 measurements.
The configuration parameters are summarized in \Cref{tab:netcdf_conf}.
Parameter (-d) & Data size & Block size \\
($t$:$x$:$y$:$z$) & [in GiB] & [in KiB] \\
($100$:$16$:$64$:$4$) & 0.5 & 16 \\
($100$:$16$:$64$:$25$) & 3.1 & 100 \\
($100$:$16$:$64$:$256$) & 7.8 & 1024 \\
($100$:$16$:$64$:$2560$) & 78.1 & 10240 \\
\caption{NetCDF-Bench configuration used in during the benchmark.}
In the experiments, we use 10 client nodes and 8 processes per node to access a shared file.
All experiments were conducted with fixed dimension sizes only, since the unlimited/variable dimensions are not supported in combination with independent I/O in NetCDF4.
\Cref{fig:netcdf_perf} shows the results.
Generally, as expected, independent chunked I/O was a good configuration.
\caption{NetCDF performance for Lustre (similar to IME-FUSE)}
\textbf{Lustre vs. IME-FUSE:}
Generally, the performance looks very similar for Lustre and IME-FUSE, that is why we only included the picture for Lustre. There are a few differences:
(1) Collective I/O without chunking causes large variability while reading 16~KiB blocks, (2) and better performance while writing 10~MiB blocks on Lustre.
(3) If chunking is enabled and independent I/O is used, then 10~MiB block sizes can be read with a low variability.
The best performance achieved for collective read is 23~GiB/s write 14~GiB/s, and for independent read 40~GiB/s and write 18~GiB/s.
\textbf{Chunking vs. no chunking:}
Read performance suffers a lot on both file systems, if chunking is enabled for small blocks.
The probability, that several NetCDF processes access the same chunk, increases for small block sizes.
In this case, the processes have to load the whole chunk on each node into memory, even if only a small part of it is required.
Such inefficient access patterns can lead to unnecessary data transfer over the network, i.e. when large parts of the data are pre-loaded, but aren't unused.
This doesn't apply to large block sizes.
Therefore, we can observe performance advantages.
\textbf{Independent I/O vs. collective I/O:}
If chunking is enabled, collective I/O degrades the performance.
If chunking is disabled, it improves I/O for small blocks and degrades I/O of large blocks.
%Unfortunately, the chosen block sizes don't allow to determine the threshold.
For large block sizes (10204~KiB) independent chunked read performance outperforms the write performance.
We suppose that cache is responsible for this performance speed-up.
\subsection{Performance variability with individual I/Os.}
This experiment is conducted measuring timing of 10,000 or 1,024 individual I/Os with a single process on IME test cluster on IME-FUSE and Lustre.
\Cref{fig:variability} shows the qualitative difference between the file systems.
The figure shows the density (like a smoothened histogram) of the individually timed I/Os.
\caption{Density of timing individual I/O operations}
We observe 1) the read operations on Lustre are faster than using IME-FUSE -- this is presumably due to client-side caching.
2) the random acceleration of IME improves write latencies/throughput for IME.
IME is a burst buffer solution, that is completely transparent to applications and to users.
These properties make it beneficial for random workloads.
Read performance depends whether data is located on the IME flash or on Lustre.
The data migration policy is usually hidden from the users, so that read behaviour is not known in advanced.
There is an API though to allow users to stage data explicitly.
For large access sizes and processes per node, IME was able to nearly saturate the network.
We did not achieve better performance with IME in all test scenarios, particularly, for the NetCDF benchmark.
The reason for the suboptimal performance gain of IME compared to Lustre may be due to:
1) the access pattern caused by NetCDF4 with HDF5 has a considerable overhead;
2) the Lustre storage from DDN is already well optimized;
3) the small and experimental laboratory setup that we used for testing.
We expect a significant performance gain once more clients access IME.
Further large-scale investigation is necessary.
Thanks to DDN for providing access to the IME test cluster and to Jean-Thomas Acquaviva for the support.