master
Julian M. Kunkel 2020-09-03 13:59:20 +01:00
parent 65f8cdb98d
commit ea893d76f0
5 changed files with 170 additions and 8 deletions

View File

@ -23,14 +23,14 @@ prepare
for I in job_similarities_*.csv ; do for I in job_similarities_*.csv ; do
rm *.png *.pdf rm *.png *.pdf
./scripts/plot.R $I > description.txt echo "processing $I"
./scripts/plot.R $I > description.txt 2>&1
OUT=${I%%.csv}-out OUT=${I%%.csv}-out
mkdir $OUT mkdir $OUT
if [[ $CLEAN != "0" ]] ; then if [[ $CLEAN != "0" ]] ; then
rm $OUT/* rm $OUT/*
mv description.txt $OUT
fi fi
mv *.png *.pdf jobs-*.txt $OUT mv description.txt *.png *.pdf jobs-*.txt $OUT
done done
# analyze performance data # analyze performance data

View File

@ -0,0 +1,154 @@
#!/usr/bin/env python3
import csv
import sys
import pandas as pd
from pandas import DataFrame
from pandas import Grouper
import seaborn as sns
from matplotlib import pyplot
import matplotlib.cm as cm
# CLI: argv[1] = comma-separated job IDs, argv[2] = matching output prefixes.
jobs = sys.argv[1].split(",")
prefix = sys.argv[2].split(",")
fileformat = ".png"

print("Plotting the job: " + str(sys.argv[1]))
print("Plotting with prefix: " + str(sys.argv[2]))

# Canonical metric order; it fixes the tab10 color assignment below.
_METRICS = ["md_file_create", "md_file_delete", "md_mod", "md_other",
            "md_read", "read_bytes", "read_calls", "write_bytes", "write_calls"]

# Color map: one tab10 color per metric, in _METRICS order.
colorMap = {metric: cm.tab10(i) for i, metric in enumerate(_METRICS)}

# Marker shape per metric.
markerMap = {
    "md_file_create": "^",
    "md_file_delete": "v",
    "md_other": ".",
    "md_mod": "<",
    "md_read": ">",
    "read_bytes": "h",
    "read_calls": "H",
    "write_bytes": "D",
    "write_calls": "d",
}

# Line style per metric family: dotted for metadata, dashed for reads,
# dash-dot for writes.
linestyleMap = {metric: ":" for metric in _METRICS if metric.startswith("md_")}
linestyleMap.update({
    "read_bytes": "--",
    "read_calls": "--",
    "write_bytes": "-.",
    "write_calls": "-.",
})
# Plot the timeseries
def plot(prefix, header, row):
x = { h : d for (h, d) in zip(header, row)}
jobid = x["jobid"]
del x["jobid"]
result = []
for k in x:
timeseries = x[k].split(":")
timeseries = [ float(x) for x in timeseries]
if sum(timeseries) == 0:
continue
timeseries = [ [k, x, s] for (s,x) in zip(timeseries, range(0, len(timeseries))) ]
result.extend(timeseries)
if len(result) == 0:
print("Empty job! Cannot plot!")
return
data = DataFrame(result, columns=["metrics", "segment", "value"])
groups = data.groupby(["metrics"])
metrics = DataFrame()
labels = []
colors = []
style = []
for name, group in groups:
style.append(linestyleMap[name] + markerMap[name])
colors.append(colorMap[name])
if name == "md_file_delete":
name = "file_delete"
if name == "md_file_create":
name = "file_create"
try:
metrics[name] = pd.Series([x[2] for x in group.values])
except:
print("Error processing %s with" % jobid)
print(group.values)
return
labels.append(name)
fsize = (8, 1 + 1.1 * len(labels))
fsizeFixed = (8, 2)
fsizeHist = (8, 4)
pyplot.close('all')
if len(labels) < 4 :
ax = metrics.plot(legend=True, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsizeFixed, color=colors, style=style)
ax.set_ylabel("Value")
else:
ax = metrics.plot(subplots=True, legend=False, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsize, color=colors, style=style)
for (i, l) in zip(range(0, len(labels)), labels):
ax[i].set_ylabel(l)
pyplot.xlabel("Segment number")
pyplot.savefig(prefix + "timeseries" + jobid + fileformat, bbox_inches='tight', dpi=150)
# Create a facetted grid
#g = sns.FacetGrid(tips, col="time", margin_titles=True)
#bins = np.linspace(0, 60, 13)
#g.map(plt.hist, "total_bill", color="steelblue", bins=bins)
ax = metrics.hist(sharex=True, grid = True, sharey=True, figsize=fsizeHist, bins=10)
pyplot.savefig(prefix + "hist" + jobid + fileformat, bbox_inches='tight', dpi=150)
# Plot first 30 segments
if len(timeseries) <= 50:
return
if len(labels) < 4 :
ax = metrics.plot(legend=True, xlim=(0,30), sharex=True, grid = True, sharey=True, markersize=10, figsize=fsizeFixed, color=colors, style=style)
ax.set_ylabel("Value")
else:
ax = metrics.plot(subplots=True, xlim=(0,30), legend=False, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsize, color=colors, style=style)
for (i, l) in zip(range(0, len(labels)), labels):
ax[i].set_ylabel(l)
pyplot.xlabel("Segment number")
pyplot.savefig(prefix + "timeseries" + jobid + "-30" + fileformat, bbox_inches='tight', dpi=150)
### end plotting function
# Drive the plotting: read the codings CSV and plot every requested job.
#with open('job-io-datasets/datasets/job_codings.csv') as csv_file: # EB: old codings
with open('./datasets/job_codings_v4.csv') as csv_file: # EB: v3 codings moved to this repo
    csv_reader = csv.reader(csv_file, delimiter=',')
    # First row holds the column names; None guards the empty-file case
    # (the original line_count machinery simply never looped then).
    header = next(csv_reader, None)
    if header is not None:
        for row in csv_reader:
            job = row[0].strip()  # first column is the job ID
            if job not in jobs:
                continue
            # Pair each job with its prefix by position in the argv lists.
            index = jobs.index(job)
            plot(prefix[index] + "-ks-" + str(index), header, row)

0
scripts/plot-single-ks-jobs.py 100755 → 100644
View File

View File

@ -7,7 +7,7 @@ library(stringi)
library(stringr) library(stringr)
# Turn to TRUE to print individual job images # Turn to TRUE to print individual job images
plotjobs = FALSE plotjobs = TRUE
# Color scheme # Color scheme
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000099") plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000099")
@ -28,7 +28,7 @@ cat(nrow(data))
# empirical cumulative density function (ECDF) # empirical cumulative density function (ECDF)
data$sim = data$similarity*100 data$sim = data$similarity*100
ggplot(data, aes(sim, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("Similarity in %") + ylab("Fraction of jobs") + theme(legend.position=c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") + scale_x_log10() ggplot(data, aes(sim, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("Similarity in %") + ylab("Fraction of jobs") + theme(legend.position=c(0.05, 0.5), legend.title = element_blank()) + scale_color_brewer(palette = "Set2") + scale_x_log10()
ggsave("ecdf.png", width=8, height=2.5) ggsave("ecdf.png", width=8, height=2.5)
# histogram for the jobs # histogram for the jobs
@ -47,13 +47,21 @@ metadata = read.csv("./datasets/job_metadata.csv") # EB: is ebenfalls im Repo
metadata$user_id = as.factor(metadata$user_id) metadata$user_id = as.factor(metadata$user_id)
metadata$group_id = as.factor(metadata$group_id) metadata$group_id = as.factor(metadata$group_id)
plotJobs = function(jobs){ plotJobs = function(algorithm, jobs){
# print the job timelines # print the job timelines
r = e[ordered, ] r = e[ordered, ]
if (plotjobs) { if (plotjobs) {
if(algorithm == "ks"){
script = "./scripts/plot-job-timelines-ks.py"
}else{
script = "./scripts/plot-job-timelines.py"
return(0) ### FIXME
}
prefix = do.call("sprintf", list("%s-%.4f-", level, r$similarity)) prefix = do.call("sprintf", list("%s-%.4f-", level, r$similarity))
system(sprintf("./scripts/plot-single-job.py %s %s", paste(r$jobid, collapse=","), paste(prefix, collapse=","))) call = sprintf("%s %s %s", script, paste(r$jobid, collapse=","), paste(prefix, collapse=","))
print(call)
system(call)
} }
system(sprintf("./scripts/extract-conf-data.sh %s > jobs-%s.txt", paste(r$jobid, collapse=" "), level)) system(sprintf("./scripts/extract-conf-data.sh %s > jobs-%s.txt", paste(r$jobid, collapse=" "), level))
@ -88,7 +96,7 @@ for (level in levels(data$alg_name)){
userprofile$userrank = 1:nrow(userprofile) userprofile$userrank = 1:nrow(userprofile)
result.userid = rbind(result.userid, cbind(level, userprofile)) result.userid = rbind(result.userid, cbind(level, userprofile))
plotJobs(jobs) plotJobs(level, jobs)
} }
colnames(result.userid) = c("alg_name", "user_id", "count", "userrank") colnames(result.userid) = c("alg_name", "user_id", "count", "userrank")