diff --git a/fig/job-timeseries4296426.pdf b/fig/job-timeseries4296426.pdf new file mode 100644 index 0000000..1fa9c71 Binary files /dev/null and b/fig/job-timeseries4296426.pdf differ diff --git a/fig/job-timeseries5024292.pdf b/fig/job-timeseries5024292.pdf new file mode 100644 index 0000000..1c1a0bf Binary files /dev/null and b/fig/job-timeseries5024292.pdf differ diff --git a/fig/job-timeseries7488914-30.pdf b/fig/job-timeseries7488914-30.pdf new file mode 100644 index 0000000..629d16d Binary files /dev/null and b/fig/job-timeseries7488914-30.pdf differ diff --git a/fig/job-timeseries7488914.pdf b/fig/job-timeseries7488914.pdf new file mode 100644 index 0000000..b370766 Binary files /dev/null and b/fig/job-timeseries7488914.pdf differ diff --git a/paper/main.tex b/paper/main.tex index 1bfd25f..c8a0838 100644 --- a/paper/main.tex +++ b/paper/main.tex @@ -44,7 +44,8 @@ \usepackage{graphicx} \graphicspath{ - {./pictures/} + {./pictures/}, + {../fig/} } \usepackage[backend=bibtex, style=numeric]{biblatex} @@ -127,30 +128,62 @@ Check time series algorithms: \begin{itemize} \item bin - \item hex\_native/hex\_lev - \item pm\_quant + \item hex\_native + \item hex\_lev + \item hex\_quant \end{itemize} \section{Evaluation} \label{sec:evaluation} -Two study examples (two reference jobs): +In the following, we assume a job is given and we aim to identify similar jobs. +We chose several reference jobs with different compute and IO characteristics visualized in \Cref{fig:refJobs}: \begin{itemize} - \item jobA: shorter length, e.g. 5-10, that has a little bit IO in at least two metadata metrics (more better). - \item jobB: a very IO intensive longer job, e.g., length $>$ 20, with IO read or write and maybe one other metrics. + \item Job-S: performs postprocessing on a single node. This is a typical process in climate science where data products are reformatted and annotated with metadata to a standard representation (so called CMORization). 
The post-processing is IO intensive. + \item Job-M: a typical MPI parallel 8-hour compute job on 128 nodes which writes time series data after some spin up. %CHE.ws12 + \item Job-L: a 66-hour 20-node job. + The initialization data is read at the beginning. + Then only a single master node constantly writes a small volume of data; in fact, the generated data is too small to be categorized as IO relevant. \end{itemize} -For each reference job: create CSV file which contains all jobs with: -\begin{itemize} - \item JOB ID, for each algorithm: the coding and the computed ranking $\rightarrow$ thus one long row. -\end{itemize} -Alternatively, could be one CSV for each algorithm that contains JOB ID, coding + rank +For each reference job and algorithm, we created a CSV file with the computed similarity for all other jobs. + + +Sollte man was zur Laufzeit der Algorithmen sagen? Denke Daten zu haben wäre sinnvoll. Create histograms + cumulative job distribution for all algorithms. Insert job profiles for closest 10 jobs. Potentially, analyze how the rankings of different similarities look like. 
+ +\begin{figure} +\begin{subfigure}{0.8\textwidth} +\includegraphics[width=\textwidth]{job-timeseries4296426} +\caption{Job-S} \label{fig:job-S} +\end{subfigure} + +\caption{Reference jobs: timeline of mean IO activity} +\label{fig:refJobs} +\end{figure} + + +\begin{figure}\ContinuedFloat + +\begin{subfigure}{0.8\textwidth} +\includegraphics[width=\textwidth]{job-timeseries5024292} +\caption{Job-M} \label{fig:job-M} +\end{subfigure} + +\begin{subfigure}{0.8\textwidth} +\includegraphics[width=\textwidth]{job-timeseries7488914-30.pdf} +\caption{Job-L (first 30 segments of 400; remaining segments are similar)} +\label{fig:job-L} +\end{subfigure} +\caption{Reference jobs: timeline of mean IO activity; non-shown timelines are 0} +\end{figure} + + \section{Summary and Conclusion} \label{sec:summary} diff --git a/scripts/create-paper-vis.sh b/scripts/create-paper-vis.sh new file mode 100755 index 0000000..db9e1c7 --- /dev/null +++ b/scripts/create-paper-vis.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +# This script calls all other scripts to re-create the figures for the paper + +mkdir fig +for job in 5024292 4296426 7488914 ; do +./scripts/plot-single-job.py $job "fig/job-" +done + +# Remove whitespace around jobs +# for file in fig/*.pdf ; do +# pdfcrop $file output.pdf +# mv output.pdf $file +# done diff --git a/scripts/plot-single-job.py b/scripts/plot-single-job.py index 426f59e..60ce6c0 100755 --- a/scripts/plot-single-job.py +++ b/scripts/plot-single-job.py @@ -5,12 +5,47 @@ import sys from pandas import DataFrame from pandas import Grouper from matplotlib import pyplot +import matplotlib.cm as cm -jobs = [sys.argv[1]] -prefix = sys.argv[2] +jobs = sys.argv[1].split(",") +prefix = sys.argv[2].split(",") print("Plotting the job: " + str(jobs)) +# Color map +colorMap = { "md_file_create": cm.tab10(0), +"md_file_delete": cm.tab10(1), +"md_mod": cm.tab10(2), +"md_other": cm.tab10(3), +"md_read": cm.tab10(4), +"read_bytes": cm.tab10(5), +"read_calls": cm.tab10(6), 
+"write_bytes": cm.tab10(7), +"write_calls": cm.tab10(8) +} + +markerMap = { "md_file_create": "^", +"md_file_delete": "v", +"md_other": ".", +"md_mod": "<", +"md_read": ">", +"read_bytes": "h", +"read_calls": "H", +"write_bytes": "D", +"write_calls": "d" +} + +linestyleMap = { "md_file_create": ":", +"md_file_delete": ":", +"md_mod": ":", +"md_other": ":", +"md_read": ":", +"read_bytes": "--", +"read_calls": "--", +"write_bytes": "-.", +"write_calls": "-." +} + # Plot the timeseries def plot(prefix, header, row): x = { h : d for (h, d) in zip(header, row)} @@ -36,27 +71,45 @@ def plot(prefix, header, row): groups = data.groupby(["metrics"]) metrics = DataFrame() labels = [] + colors = [] + style = [] for name, group in groups: metrics[name] = [x[2] for x in group.values] labels.append(name) + style.append(linestyleMap[name] + markerMap[name]) + colors.append(colorMap[name]) - ax = metrics.plot(subplots=True, legend=False, sharex=True, grid = True, sharey=True, colormap='jet', marker='.', markersize=10, figsize=(8, 2 + 2 * len(labels))) - for (i, l) in zip(range(0, len(labels)), labels): - ax[i].set_ylabel(l) + fsize = (8, 1 + 1.5 * len(labels)) + fsizeFixed = (8, 2) + + pyplot.close('all') + + if len(labels) < 4 : + ax = metrics.plot(legend=True, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsizeFixed, color=colors, style=style) + ax.set_ylabel("Value") + else: + ax = metrics.plot(subplots=True, legend=False, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsize, color=colors, style=style) + for (i, l) in zip(range(0, len(labels)), labels): + ax[i].set_ylabel(l) pyplot.xlabel("Segment number") - pyplot.savefig(prefix + "timeseries" + jobid + ".png") + pyplot.savefig(prefix + "timeseries" + jobid + ".pdf", bbox_inches='tight') # Plot first 30 segments if len(timeseries) <= 50: return - ax = metrics.plot(subplots=True, legend=False, sharex=True, grid = True, sharey=True, colormap='jet', marker='.', markersize=10, xlim=(0,30)) - for (i, 
l) in zip(range(0, len(labels)), labels): - ax[i].set_ylabel(l) + + if len(labels) < 4 : + ax = metrics.plot(legend=True, xlim=(0,30), sharex=True, grid = True, sharey=True, markersize=10, figsize=fsizeFixed, color=colors, style=style) + ax.set_ylabel("Value") + else: + ax = metrics.plot(subplots=True, xlim=(0,30), legend=False, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsize, color=colors, style=style) + for (i, l) in zip(range(0, len(labels)), labels): + ax[i].set_ylabel(l) pyplot.xlabel("Segment number") - pyplot.savefig(prefix + "timeseries" + jobid + "-30.png") + pyplot.savefig(prefix + "timeseries" + jobid + "-30.pdf", bbox_inches='tight') ### end plotting function @@ -65,6 +118,7 @@ def plot(prefix, header, row): with open('job-io-datasets/datasets/job_codings.csv') as csv_file: csv_reader = csv.reader(csv_file, delimiter=',') line_count = 0 + job = 0 for row in csv_reader: if line_count == 0: header = row @@ -74,4 +128,5 @@ with open('job-io-datasets/datasets/job_codings.csv') as csv_file: if not row[0].strip() in jobs: continue else: - plot(prefix, header, row) + plot(prefix[job], header, row) + job += 1 diff --git a/scripts/plot.R b/scripts/plot.R index fc79b76..086d029 100755 --- a/scripts/plot.R +++ b/scripts/plot.R @@ -19,10 +19,8 @@ data = read.csv(file) # Columns are: jobid alg_id alg_name similarity data$alg_id = as.factor(data$alg_id) -print(nrow(data)) - -# FILTER, TODO -data = data %>% filter(similarity <= 1.0) +cat("Job count:") +cat(nrow(data)) # empirical cummulative density function (ECDF) ggplot(data, aes(similarity, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("SIM") + ylab("Fraction of jobs") + theme(legend.position="bottom") + scale_color_brewer(palette = "Set2") @@ -34,7 +32,7 @@ print(summary(e)) ggsave("ecdf-0.5.png") # histogram for the jobs -ggplot(data, aes(similarity), group=alg_name) + geom_histogram(color="black", binwidth=0.025) + aes(fill = alg_name) + facet_grid(alg_name ~ ., switch = 
'y') + scale_y_continuous(limits=c(0, 100), oob=squish) + scale_color_brewer(palette = "Set2") + ylab("Count (cropped at 100)") +ggplot(data, aes(similarity), group=alg_name) + geom_histogram(color="black", binwidth=0.025) + aes(fill = alg_name) + facet_grid(alg_name ~ ., switch = 'y') + scale_y_continuous(limits=c(0, 100), oob=squish) + scale_color_brewer(palette = "Set2") + ylab("Count (cropped at 100)") + theme(legend.position = "none") ggsave("hist-sim.png") # load job information, i.e., the time series per job @@ -51,13 +49,10 @@ plotJobs = function(jobs){ md = metadata[metadata$jobid %in% jobs,] print(summary(md)) - # print the job timeline + # print the job timelines r = e[ordered, ] - for (row in 1:length(jobs)) { - prefix = sprintf("%s-%f-%.0f-", level, r[row, "similarity"], row) - job = r[row, "jobid"] - system(sprintf("scripts/plot-single-job.py %s %s", job, prefix)) - } + prefix = do.call("sprintf", list("%s-%.0f-", level, r$similarity)) + system(sprintf("scripts/plot-single-job.py %s %s", paste(r$jobid, collapse=","), paste(prefix, collapse=","))) } # Store the job ids in a table, each column is one algorithm