This commit is contained in:
Julian M. Kunkel 2020-09-03 13:59:20 +01:00
parent 65f8cdb98d
commit ea893d76f0
5 changed files with 170 additions and 8 deletions

View File

@ -23,14 +23,14 @@ prepare
for I in job_similarities_*.csv ; do
rm *.png *.pdf
./scripts/plot.R $I > description.txt
echo "processing $I"
./scripts/plot.R $I > description.txt 2>&1
OUT=${I%%.csv}-out
mkdir $OUT
if [[ $CLEAN != "0" ]] ; then
rm $OUT/*
mv description.txt $OUT
fi
mv *.png *.pdf jobs-*.txt $OUT
mv description.txt *.png *.pdf jobs-*.txt $OUT
done
# analyze performance data

154
scripts/plot-job-timelines-ks.py Executable file
View File

@ -0,0 +1,154 @@
#!/usr/bin/env python3
import csv
import sys
import pandas as pd
from pandas import DataFrame
from pandas import Grouper
import seaborn as sns
from matplotlib import pyplot
import matplotlib.cm as cm
# Command line: argv[1] is a comma-separated list of job ids, argv[2] a
# comma-separated list of output prefixes. The two lists are paired by
# position further down in the script.
jobs = sys.argv[1].split(",")
prefix = sys.argv[2].split(",")
fileformat = ".png"
print("Plotting the job: " + str(sys.argv[1]))
print("Plotting with prefix: " + str(sys.argv[2]))

# Line colour per metric: entries 0..8 of matplotlib's tab10 colormap,
# assigned in the order the metrics are listed here.
colorMap = {
    metric: cm.tab10(position)
    for position, metric in enumerate((
        "md_file_create",
        "md_file_delete",
        "md_mod",
        "md_other",
        "md_read",
        "read_bytes",
        "read_calls",
        "write_bytes",
        "write_calls",
    ))
}

# Marker symbol per metric.
markerMap = dict(
    md_file_create="^",
    md_file_delete="v",
    md_other=".",
    md_mod="<",
    md_read=">",
    read_bytes="h",
    read_calls="H",
    write_bytes="D",
    write_calls="d",
)

# Line style per metric: dotted for md_*, dashed for read_*, dash-dot for write_*.
linestyleMap = {
    **dict.fromkeys(("md_file_create", "md_file_delete", "md_mod", "md_other", "md_read"), ":"),
    **dict.fromkeys(("read_bytes", "read_calls"), "--"),
    **dict.fromkeys(("write_bytes", "write_calls"), "-."),
}
# Plot the timeseries
def plot(prefix, header, row):
    """Plot the timeseries and value histograms of one job and save them as images.

    Parameters:
        prefix: string prepended to every output file name.
        header: column names of the codings CSV; must contain "jobid".
        row: the CSV row of the job, aligned with ``header``. Every column
            other than "jobid" holds one metric encoded as ":"-separated
            numbers, one number per segment.

    Output files (``fileformat`` is the module-level extension):
        <prefix>timeseries<jobid><fileformat>     all segments
        <prefix>hist<jobid><fileformat>           histogram per metric
        <prefix>timeseries<jobid>-30<fileformat>  x axis limited to 0..30,
                                                  only for jobs with more
                                                  than 50 segments

    Metrics whose values sum to zero are left out of all plots. Returns
    None. Prints a message and returns early when no metric is left to plot
    or when a metric cannot be added to the plot table.
    """
    coding = {h: d for (h, d) in zip(header, row)}
    jobid = coding.pop("jobid")

    result = []
    segment_count = 0
    for metric, encoded in coding.items():
        values = [float(v) for v in encoded.split(":")]
        # Length of the most recently parsed series (recorded even when the
        # series is skipped below); it decides whether the zoomed plot is made.
        segment_count = len(values)
        if sum(values) == 0:
            continue
        result.extend([metric, segment, value] for segment, value in enumerate(values))

    if len(result) == 0:
        print("Empty job! Cannot plot!")
        return

    data = DataFrame(result, columns=["metrics", "segment", "value"])
    metrics = DataFrame()
    labels = []
    colors = []
    style = []
    # Group by the plain column name rather than a one-element list: with a
    # list, pandas >= 2.0 yields 1-tuples as group keys, which are not valid
    # keys for the style/colour maps.
    for name, group in data.groupby("metrics"):
        style.append(linestyleMap[name] + markerMap[name])
        colors.append(colorMap[name])
        # Shorter display labels for the two file metadata metrics.
        if name == "md_file_delete":
            name = "file_delete"
        if name == "md_file_create":
            name = "file_create"
        try:
            metrics[name] = pd.Series(group["value"].tolist())
        except Exception:
            # Best effort: report the offending job and skip its plots.
            print("Error processing %s with" % jobid)
            print(group.values)
            return
        labels.append(name)

    fsize = (8, 1 + 1.1 * len(labels))
    fsizeFixed = (8, 2)
    fsizeHist = (8, 4)

    def draw_timeseries(suffix, **extra):
        # Fewer than four metrics share one axis with a legend; otherwise
        # every metric gets its own subplot labelled with the metric name.
        if len(labels) < 4:
            ax = metrics.plot(legend=True, sharex=True, grid=True, sharey=True, markersize=10, figsize=fsizeFixed, color=colors, style=style, **extra)
            ax.set_ylabel("Value")
        else:
            axes = metrics.plot(subplots=True, legend=False, sharex=True, grid=True, sharey=True, markersize=10, figsize=fsize, color=colors, style=style, **extra)
            for ax, label in zip(axes, labels):
                ax.set_ylabel(label)
        pyplot.xlabel("Segment number")
        pyplot.savefig(prefix + "timeseries" + jobid + suffix + fileformat, bbox_inches='tight', dpi=150)

    pyplot.close('all')
    draw_timeseries("")

    # Create a facetted grid
    #g = sns.FacetGrid(tips, col="time", margin_titles=True)
    #bins = np.linspace(0, 60, 13)
    #g.map(plt.hist, "total_bill", color="steelblue", bins=bins)
    metrics.hist(sharex=True, grid=True, sharey=True, figsize=fsizeHist, bins=10)
    pyplot.savefig(prefix + "hist" + jobid + fileformat, bbox_inches='tight', dpi=150)

    # Additionally plot a zoomed view (x axis 0..30), but only for jobs with
    # more than 50 segments.
    if segment_count <= 50:
        return
    draw_timeseries("-30", xlim=(0, 30))
### end plotting function
#with open('job-io-datasets/datasets/job_codings.csv') as csv_file: # EB: old codings
# Walk through the job codings and plot every job requested on the command line.
with open('./datasets/job_codings_v4.csv') as csv_file: # EB: v3 codings moved to this repo
    csv_reader = csv.reader(csv_file, delimiter=',')
    header = None
    for row in csv_reader:
        # The first row of the file carries the column names.
        if header is None:
            header = row
            continue
        job = row[0].strip()
        if job in jobs:
            # The job's position in the argument list selects its prefix
            # and becomes part of the output file name.
            index = jobs.index(job)
            plot(prefix[index] + "-ks-" + str(index), header, row)

0
scripts/plot-single-ks-jobs.py Executable file → Normal file
View File

View File

@ -7,7 +7,7 @@ library(stringi)
library(stringr)
# Set to TRUE to print individual job images
plotjobs = FALSE
plotjobs = TRUE
# Color scheme
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000099")
@ -28,7 +28,7 @@ cat(nrow(data))
# empirical cumulative density function (ECDF)
data$sim = data$similarity*100
ggplot(data, aes(sim, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("Similarity in %") + ylab("Fraction of jobs") + theme(legend.position=c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") + scale_x_log10()
ggplot(data, aes(sim, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("Similarity in %") + ylab("Fraction of jobs") + theme(legend.position=c(0.05, 0.5), legend.title = element_blank()) + scale_color_brewer(palette = "Set2") + scale_x_log10()
ggsave("ecdf.png", width=8, height=2.5)
# histogram for the jobs
@ -47,13 +47,21 @@ metadata = read.csv("./datasets/job_metadata.csv") # EB: is ebenfalls im Repo
metadata$user_id = as.factor(metadata$user_id)
metadata$group_id = as.factor(metadata$group_id)
plotJobs = function(jobs){
plotJobs = function(algorithm, jobs){
# print the job timelines
r = e[ordered, ]
if (plotjobs) {
if(algorithm == "ks"){
script = "./scripts/plot-job-timelines-ks.py"
}else{
script = "./scripts/plot-job-timelines.py"
return(0) ### FIXME
}
prefix = do.call("sprintf", list("%s-%.4f-", level, r$similarity))
system(sprintf("./scripts/plot-single-job.py %s %s", paste(r$jobid, collapse=","), paste(prefix, collapse=",")))
call = sprintf("%s %s %s", script, paste(r$jobid, collapse=","), paste(prefix, collapse=","))
print(call)
system(call)
}
system(sprintf("./scripts/extract-conf-data.sh %s > jobs-%s.txt", paste(r$jobid, collapse=" "), level))
@ -88,7 +96,7 @@ for (level in levels(data$alg_name)){
userprofile$userrank = 1:nrow(userprofile)
result.userid = rbind(result.userid, cbind(level, userprofile))
plotJobs(jobs)
plotJobs(level, jobs)
}
colnames(result.userid) = c("alg_name", "user_id", "count", "userrank")