diff --git a/fig/job-timeseries4296426.pdf b/fig/job-timeseries4296426.pdf new file mode 100644 index 0000000..499ae3b Binary files /dev/null and b/fig/job-timeseries4296426.pdf differ diff --git a/fig/job-timeseries5024292.pdf b/fig/job-timeseries5024292.pdf new file mode 100644 index 0000000..7aea45f Binary files /dev/null and b/fig/job-timeseries5024292.pdf differ diff --git a/fig/job-timeseries7488914-30.pdf b/fig/job-timeseries7488914-30.pdf new file mode 100644 index 0000000..12c6b15 Binary files /dev/null and b/fig/job-timeseries7488914-30.pdf differ diff --git a/fig/job-timeseries7488914.pdf b/fig/job-timeseries7488914.pdf new file mode 100644 index 0000000..344cd11 Binary files /dev/null and b/fig/job-timeseries7488914.pdf differ diff --git a/paper/main.tex b/paper/main.tex index 1bfd25f..5b1da12 100644 --- a/paper/main.tex +++ b/paper/main.tex @@ -44,7 +44,8 @@ \usepackage{graphicx} \graphicspath{ - {./pictures/} + {./pictures/}, + {../fig/} } \usepackage[backend=bibtex, style=numeric]{biblatex} @@ -127,8 +128,9 @@ Check time series algorithms: \begin{itemize} \item bin - \item hex\_native/hex\_lev - \item pm\_quant + \item hex\_native + \item hex\_lev + \item hex\_quant \end{itemize} \section{Evaluation} @@ -136,8 +138,9 @@ Check time series algorithms: Two study examples (two reference jobs): \begin{itemize} - \item jobA: shorter length, e.g. 5-10, that has a little bit IO in at least two metadata metrics (more better). - \item jobB: a very IO intensive longer job, e.g., length $>$ 20, with IO read or write and maybe one other metrics. + \item job-short: shorter length, e.g. 5-10, that has a little bit IO in at least two metadata metrics (more better). + \item job-mixed: + \item job-long: a very IO intensive longer job, e.g., length $>$ 20, with IO read or write and maybe one other metrics. \end{itemize} For each reference job: create CSV file which contains all jobs with: @@ -151,6 +154,35 @@ Insert job profiles for closest 10 jobs. Potentially, analyze how the rankings of different similarities look like. +\Cref{fig:refJobs} + +\begin{figure} +\begin{subfigure}{0.8\textwidth} +\includegraphics[width=\textwidth]{job-timeseries4296426} +\caption{Job-S} \label{fig:job-S} +\end{subfigure} + +\caption{Reference jobs: timeline of mean IO activity} +\label{fig:refJobs} +\end{figure} + + +\begin{figure}\ContinuedFloat + +\begin{subfigure}{0.8\textwidth} +\includegraphics[width=\textwidth]{job-timeseries5024292} +\caption{Job-M} \label{fig:job-M} +\end{subfigure} + +\begin{subfigure}{0.8\textwidth} +\includegraphics[width=\textwidth]{job-timeseries7488914-30.pdf} +\caption{Job-L (first 30 segments of 400; remaining segments are similar)} +\label{fig:job-L} +\end{subfigure} +\caption{Reference jobs: timeline of mean IO activity; non-shown timelines are 0} +\end{figure} + + \section{Summary and Conclusion} \label{sec:summary} diff --git a/scripts/create-paper-vis.sh b/scripts/create-paper-vis.sh new file mode 100755 index 0000000..38db050 --- /dev/null +++ b/scripts/create-paper-vis.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# This script calls all other scripts to re-create the figures for the paper + +mkdir fig +for job in 5024292 4296426 7488914 ; do +./scripts/plot-single-job.py $job "fig/job-" +done + +for file in fig/*.pdf ; do + pdfcrop $file output.pdf + mv output.pdf $file +done diff --git a/scripts/plot-single-job.py b/scripts/plot-single-job.py index 426f59e..28874dc 100755 --- a/scripts/plot-single-job.py +++ b/scripts/plot-single-job.py @@ -5,12 +5,25 @@ import sys from pandas import DataFrame from pandas import Grouper from matplotlib import pyplot +import matplotlib.cm as cm jobs = [sys.argv[1]] prefix = sys.argv[2] print("Plotting the job: " + str(jobs)) +# Color map +colorMap = { "md_file_create": cm.tab10(0), +"md_file_delete": cm.tab10(1), +"md_mod": cm.tab10(2), +"md_other": cm.tab10(3), +"md_read": cm.tab10(4), +"read_bytes": cm.tab10(5), +"read_calls": cm.tab10(6), +"write_bytes": cm.tab10(7), +"write_calls": cm.tab10(8) +} + # Plot the timeseries def plot(prefix, header, row): x = { h : d for (h, d) in zip(header, row)} @@ -36,27 +49,31 @@ def plot(prefix, header, row): groups = data.groupby(["metrics"]) metrics = DataFrame() labels = [] + colors = [] for name, group in groups: metrics[name] = [x[2] for x in group.values] labels.append(name) + colors.append(colorMap[name]) - ax = metrics.plot(subplots=True, legend=False, sharex=True, grid = True, sharey=True, colormap='jet', marker='.', markersize=10, figsize=(8, 2 + 2 * len(labels))) + fsize = (8, 1 + 1.5 * len(labels)) + + ax = metrics.plot(subplots=True, legend=False, sharex=True, grid = True, sharey=True, marker='.', markersize=10, figsize=fsize, color=colors) for (i, l) in zip(range(0, len(labels)), labels): ax[i].set_ylabel(l) pyplot.xlabel("Segment number") - pyplot.savefig(prefix + "timeseries" + jobid + ".png") + pyplot.savefig(prefix + "timeseries" + jobid + ".pdf") # Plot first 30 segments if len(timeseries) <= 50: return - ax = metrics.plot(subplots=True, legend=False, sharex=True, grid = True, sharey=True, colormap='jet', marker='.', markersize=10, xlim=(0,30)) + ax = metrics.plot(subplots=True, legend=False, sharex=True, grid = True, sharey=True, marker='.', color=colors, markersize=10, xlim=(0,30), figsize=fsize) for (i, l) in zip(range(0, len(labels)), labels): ax[i].set_ylabel(l) pyplot.xlabel("Segment number") - pyplot.savefig(prefix + "timeseries" + jobid + "-30.png") + pyplot.savefig(prefix + "timeseries" + jobid + "-30.pdf") ### end plotting function diff --git a/scripts/plot.R b/scripts/plot.R index fc79b76..a47d515 100755 --- a/scripts/plot.R +++ b/scripts/plot.R @@ -34,7 +34,7 @@ print(summary(e)) ggsave("ecdf-0.5.png") # histogram for the jobs -ggplot(data, aes(similarity), group=alg_name) + geom_histogram(color="black", binwidth=0.025) + aes(fill = alg_name) + facet_grid(alg_name ~ ., switch = 'y') + scale_y_continuous(limits=c(0, 100), oob=squish) + scale_color_brewer(palette = "Set2") + ylab("Count (cropped at 100)") +ggplot(data, aes(similarity), group=alg_name) + geom_histogram(color="black", binwidth=0.025) + aes(fill = alg_name) + facet_grid(alg_name ~ ., switch = 'y') + scale_y_continuous(limits=c(0, 100), oob=squish) + scale_color_brewer(palette = "Set2") + ylab("Count (cropped at 100)") + theme(legend.position = "none") ggsave("hist-sim.png") # load job information, i.e., the time series per job