Optimization
@@ -136,25 +136,26 @@ Check time series algorithms:
\section{Evaluation}
\label{sec:evaluation}

Two study examples (two reference jobs):
In the following, we assume a job is given and we aim to identify similar jobs.
We chose several reference jobs with different compute and IO characteristics, visualized in \Cref{fig:refJobs}:
\begin{itemize}
\item job-short: shorter length, e.g., 5-10, with a bit of IO in at least two metadata metrics (the more the better).
\item job-mixed:
\item job-long: a very IO-intensive longer job, e.g., length $>$ 20, with IO read or write and possibly one other metric.
\item Job-S: performs post-processing on a single node. This is a typical process in climate science where data products are reformatted and annotated with metadata to a standard representation (so-called CMORization). The post-processing is IO intensive.
\item Job-M: a typical MPI-parallel 8-hour compute job on 128 nodes which writes time series data after some spin-up. %CHE.ws12
\item Job-L: a 66-hour 20-node job.
The initialization data is read at the beginning.
Then only a single master node constantly writes a small volume of data; in fact, the generated data is too small to be categorized as IO relevant.
\end{itemize}

For each reference job, create a CSV file which contains all jobs with:
\begin{itemize}
\item JOB ID and, for each algorithm, the coding and the computed ranking $\rightarrow$ thus one long row.
\end{itemize}
Alternatively, this could be one CSV file per algorithm that contains JOB ID, coding, and rank.
For each reference job and algorithm, we created a CSV file with the computed similarity for all other jobs.
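
As a rough illustration, such a per-reference-job CSV could be assembled along the following lines. Only the column layout (jobid, alg_id, alg_name, similarity) is taken from the R evaluation script further below; the algorithm names, job IDs, and similarity values here are placeholders, not results.

import csv

# Hypothetical similarity of every job to one reference job, per algorithm.
# Algorithm names and similarity values are placeholders for illustration only.
similarities = {
    "alg_A": {"4296426": 0.82, "5024292": 0.35, "7488914": 0.11},
    "alg_B": {"4296426": 0.91, "5024292": 0.40, "7488914": 0.09},
}

with open("job_similarities_reference.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["jobid", "alg_id", "alg_name", "similarity"])
    for alg_id, (alg_name, sims) in enumerate(similarities.items()):
        for jobid, sim in sims.items():
            writer.writerow([jobid, alg_id, alg_name, sim])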

Should we say something about the runtime of the algorithms? I think it would be useful to have data on that.

Create histograms and the cumulative job distribution for all algorithms.
Insert job profiles for the closest 10 jobs.
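
A minimal sketch, assuming the CSV layout above, of how the cumulative distribution and the 10 closest jobs per algorithm could be derived with pandas; the R script further below produces the ECDF plot used for the paper, so this is purely illustrative.

import numpy as np
import pandas as pd
from matplotlib import pyplot

# Assumed input: one CSV per reference job with columns jobid, alg_id, alg_name, similarity.
data = pd.read_csv("job_similarities_reference.csv")

for alg, group in data.groupby("alg_name"):
    # Empirical cumulative distribution of the similarity to the reference job.
    sims = np.sort(group["similarity"].to_numpy())
    fraction = np.arange(1, len(sims) + 1) / len(sims)
    pyplot.step(sims, fraction, where="post", label=alg)
    # The 10 most similar jobs, whose profiles can then be plotted with scripts/plot-single-job.py.
    print(alg, group.nlargest(10, "similarity")["jobid"].tolist())

pyplot.xlabel("SIM")
pyplot.ylabel("Fraction of jobs")
pyplot.legend()
pyplot.savefig("ecdf.pdf", bbox_inches='tight')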

Potentially, analyze what the rankings produced by the different similarity measures look like.
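
The draft does not fix a method for this comparison; one possible approach, shown here only as an assumption, is to correlate the rankings of two algorithms with Kendall's tau (column and algorithm names as in the sketches above).

import pandas as pd
from scipy.stats import kendalltau

data = pd.read_csv("job_similarities_reference.csv")
# Similarity of each job under two hypothetical algorithms, aligned by job ID.
a = data[data["alg_name"] == "alg_A"].set_index("jobid")["similarity"]
b = data[data["alg_name"] == "alg_B"].set_index("jobid")["similarity"]
common = a.index.intersection(b.index)
tau, p = kendalltau(a.loc[common], b.loc[common])
print("Kendall tau between the two rankings:", tau, "p-value:", p)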

\Cref{fig:refJobs}

\begin{figure}
\begin{subfigure}{0.8\textwidth}
@@ -7,7 +7,8 @@ for job in 5024292 4296426 7488914 ; do
  ./scripts/plot-single-job.py $job "fig/job-"
done

# Remove whitespace around jobs
for file in fig/*.pdf ; do
  pdfcrop "$file" output.pdf
  mv output.pdf "$file"
done
@@ -7,8 +7,8 @@ from pandas import Grouper
from matplotlib import pyplot
import matplotlib.cm as cm

# Job IDs and matching output prefixes are passed as comma-separated lists,
# e.g.: plot-single-job.py <jobid>[,<jobid>...] <prefix>[,<prefix>...]
jobs = sys.argv[1].split(",")
prefix = sys.argv[2].split(",")

print("Plotting the jobs: " + str(jobs))
@@ -80,7 +80,9 @@ def plot(prefix, header, row):
        colors.append(colorMap[name])

    # Figure size: grows with the number of metrics, or a fixed compact size.
    fsize = (8, 1 + 1.5 * len(labels))
    fsizeFixed = (8, 2)

    pyplot.close('all')

    if len(labels) < 4:
        ax = metrics.plot(legend=True, sharex=True, grid=True, sharey=True, markersize=10, figsize=fsizeFixed, color=colors, style=style)
@@ -91,7 +93,7 @@ def plot(prefix, header, row):
        ax[i].set_ylabel(l)

    pyplot.xlabel("Segment number")
    pyplot.savefig(prefix + "timeseries" + jobid + ".pdf", bbox_inches='tight')

    # Plot first 30 segments
    if len(timeseries) <= 50:
@@ -107,7 +109,7 @@ def plot(prefix, header, row):
        ax[i].set_ylabel(l)

    pyplot.xlabel("Segment number")
    pyplot.savefig(prefix + "timeseries" + jobid + "-30.pdf", bbox_inches='tight')

### end plotting function
@@ -116,6 +118,7 @@ def plot(prefix, header, row):
with open('job-io-datasets/datasets/job_codings.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    job = 0  # index into the list of output prefixes
    for row in csv_reader:
        if line_count == 0:
            header = row
@@ -125,4 +128,5 @@ with open('job-io-datasets/datasets/job_codings.csv') as csv_file:
        if row[0].strip() not in jobs:
            continue
        else:
            plot(prefix[job], header, row)
            job += 1
@@ -19,10 +19,8 @@ data = read.csv(file)
# Columns are: jobid alg_id alg_name similarity

data$alg_id = as.factor(data$alg_id)
print(nrow(data))

# FILTER, TODO
data = data %>% filter(similarity <= 1.0)
cat("Job count:")
cat(nrow(data))

# Empirical cumulative distribution function (ECDF)
ggplot(data, aes(similarity, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("SIM") + ylab("Fraction of jobs") + theme(legend.position="bottom") + scale_color_brewer(palette = "Set2")
@@ -51,13 +49,10 @@ plotJobs = function(jobs){
  md = metadata[metadata$jobid %in% jobs,]
  print(summary(md))

  # Print the job timelines; all jobs are handed to plot-single-job.py in a single call
  r = e[ordered, ]
  prefix = do.call("sprintf", list("%s-%.0f-", level, r$similarity))
  system(sprintf("scripts/plot-single-job.py %s %s", paste(r$jobid, collapse=","), paste(prefix, collapse=",")))
}

# Store the job ids in a table, each column is one algorithm