Browse Source

Renamed

master
Julian M. Kunkel 3 years ago
parent
commit
ea893d76f0
  1. 6
      scripts/analyse-all.sh
  2. 154
      scripts/plot-job-timelines-ks.py
  3. 0
      scripts/plot-job-timelines.py
  4. 0
      scripts/plot-single-ks-jobs.py
  5. 18
      scripts/plot.R

6
scripts/analyse-all.sh

@ -23,14 +23,14 @@ prepare
for I in job_similarities_*.csv ; do
rm *.png *.pdf
./scripts/plot.R $I > description.txt
echo "processing $I"
./scripts/plot.R $I > description.txt 2>&1
OUT=${I%%.csv}-out
mkdir $OUT
if [[ $CLEAN != "0" ]] ; then
rm $OUT/*
mv description.txt $OUT
fi
mv *.png *.pdf jobs-*.txt $OUT
mv description.txt *.png *.pdf jobs-*.txt $OUT
done
# analyze performance data

154
scripts/plot-job-timelines-ks.py

@ -0,0 +1,154 @@
#!/usr/bin/env python3
import csv
import sys
import pandas as pd
from pandas import DataFrame
from pandas import Grouper
import seaborn as sns
from matplotlib import pyplot
import matplotlib.cm as cm
# Command-line interface: argv[1] is a comma-separated list of job ids,
# argv[2] a comma-separated list of output prefixes. The two lists are
# matched by position when the codings file is processed further below.
jobs = sys.argv[1].split(",")
prefix = sys.argv[2].split(",")
# File extension appended to every image file name written by plot().
fileformat = ".png"
print("Plotting the job: " + str(sys.argv[1]))
print("Plotting with prefix: " + str(sys.argv[2]))
# Color map: one colour from matplotlib's "tab10" palette per metric name.
colorMap = { "md_file_create": cm.tab10(0),
"md_file_delete": cm.tab10(1),
"md_mod": cm.tab10(2),
"md_other": cm.tab10(3),
"md_read": cm.tab10(4),
"read_bytes": cm.tab10(5),
"read_calls": cm.tab10(6),
"write_bytes": cm.tab10(7),
"write_calls": cm.tab10(8)
}
# Marker map: matplotlib marker character per metric name. plot() appends
# the marker to the line style to build the per-column format string.
markerMap = { "md_file_create": "^",
"md_file_delete": "v",
"md_other": ".",
"md_mod": "<",
"md_read": ">",
"read_bytes": "h",
"read_calls": "H",
"write_bytes": "D",
"write_calls": "d"
}
# Line style map: the md_* metrics share the dotted style, the read_*
# metrics the dashed style and the write_* metrics the dash-dot style.
linestyleMap = { "md_file_create": ":",
"md_file_delete": ":",
"md_mod": ":",
"md_other": ":",
"md_read": ":",
"read_bytes": "--",
"read_calls": "--",
"write_bytes": "-.",
"write_calls": "-."
}
# Plot the timeseries
def plot(prefix, header, row):
    """Plot the per-segment metrics of one job and save the images.

    Parameters:
        prefix: string prepended to every output file name.
        header: column names of the codings CSV; must contain "jobid".
        row:    the CSV row of one job. Every column other than "jobid"
                holds a ":"-separated list of numbers, one per segment.

    Writes <prefix>timeseries<jobid>.png and <prefix>hist<jobid>.png and,
    when the job has more than 50 segments, a zoomed
    <prefix>timeseries<jobid>-30.png limited to the first 30 segments.
    Metrics whose values sum to zero are skipped; if nothing is left the
    function prints a message and returns without plotting.
    """
    record = dict(zip(header, row))
    jobid = record.pop("jobid")

    result = []
    # Segment count of the most recently parsed column; it decides below
    # whether the zoomed plot is worth producing.
    num_segments = 0
    for metric, encoded in record.items():
        values = [float(v) for v in encoded.split(":")]
        num_segments = len(values)
        if sum(values) == 0:
            continue
        result.extend([metric, segment, value] for segment, value in enumerate(values))

    if len(result) == 0:
        print("Empty job! Cannot plot!")
        return

    data = DataFrame(result, columns=["metrics", "segment", "value"])
    # Group by a plain string key: grouping by the list ["metrics"] makes
    # pandas >= 2.0 yield 1-tuples as group names, which would break the
    # lookups in the style maps below.
    groups = data.groupby("metrics")
    metrics = DataFrame()
    labels = []
    colors = []
    style = []
    for name, group in groups:
        style.append(linestyleMap[name] + markerMap[name])
        colors.append(colorMap[name])
        # Shorter axis labels for the two file metadata metrics.
        if name == "md_file_delete":
            name = "file_delete"
        if name == "md_file_create":
            name = "file_create"
        try:
            metrics[name] = pd.Series(group["value"].tolist())
        except Exception as error:
            print("Error processing %s with" % jobid)
            print(group.values)
            print(error)
            return
        labels.append(name)

    fsize = (8, 1 + 1.1 * len(labels))
    fsizeFixed = (8, 2)
    fsizeHist = (8, 4)

    def draw_timeseries(xlim=None):
        # Few metrics share one axis; four or more get one subplot each.
        options = dict(sharex=True, grid=True, sharey=True, markersize=10, color=colors, style=style)
        if xlim is not None:
            options["xlim"] = xlim
        if len(labels) < 4:
            ax = metrics.plot(legend=True, figsize=fsizeFixed, **options)
            ax.set_ylabel("Value")
        else:
            axes = metrics.plot(subplots=True, legend=False, figsize=fsize, **options)
            for axis, label in zip(axes, labels):
                axis.set_ylabel(label)
        pyplot.xlabel("Segment number")

    pyplot.close('all')
    draw_timeseries()
    pyplot.savefig(prefix + "timeseries" + jobid + fileformat, bbox_inches='tight', dpi=150)

    # Histogram of the values of every plotted metric
    metrics.hist(sharex=True, grid=True, sharey=True, figsize=fsizeHist, bins=10)
    pyplot.savefig(prefix + "hist" + jobid + fileformat, bbox_inches='tight', dpi=150)

    # Plot first 30 segments (only for long jobs)
    if num_segments <= 50:
        return
    draw_timeseries(xlim=(0, 30))
    pyplot.savefig(prefix + "timeseries" + jobid + "-30" + fileformat, bbox_inches='tight', dpi=150)
### end plotting function
# Driver: scan the job codings file and plot every job requested on the
# command line, using the output prefix at the same position as the job id.
#with open('job-io-datasets/datasets/job_codings.csv') as csv_file: # EB: old codings
with open('./datasets/job_codings_v4.csv') as csv_file: # EB: v3 codings moved to this repo
    for row_number, row in enumerate(csv.reader(csv_file, delimiter=',')):
        # The first line carries the column names.
        if row_number == 0:
            header = row
            continue
        job = row[0].strip()
        if job in jobs:
            index = jobs.index(job)
            plot(prefix[index] + "-ks-" + str(index), header, row)

0
scripts/plot-single-job.py → scripts/plot-job-timelines.py

0
scripts/plot-single-ks-jobs.py

18
scripts/plot.R

@ -7,7 +7,7 @@ library(stringi)
library(stringr)
# Set to TRUE to print individual job images
plotjobs = FALSE
plotjobs = TRUE
# Color scheme
plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000099")
@ -28,7 +28,7 @@ cat(nrow(data))
# empirical cumulative density function (ECDF)
data$sim = data$similarity*100
ggplot(data, aes(sim, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("Similarity in %") + ylab("Fraction of jobs") + theme(legend.position=c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") + scale_x_log10()
ggplot(data, aes(sim, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("Similarity in %") + ylab("Fraction of jobs") + theme(legend.position=c(0.05, 0.5), legend.title = element_blank()) + scale_color_brewer(palette = "Set2") + scale_x_log10()
ggsave("ecdf.png", width=8, height=2.5)
# histogram for the jobs
@ -47,13 +47,21 @@ metadata = read.csv("./datasets/job_metadata.csv") # EB: is ebenfalls im Repo
metadata$user_id = as.factor(metadata$user_id)
metadata$group_id = as.factor(metadata$group_id)
plotJobs = function(jobs){
plotJobs = function(algorithm, jobs){
# print the job timelines
r = e[ordered, ]
if (plotjobs) {
if(algorithm == "ks"){
script = "./scripts/plot-job-timelines-ks.py"
}else{
script = "./scripts/plot-job-timelines.py"
return(0) ### FIXME
}
prefix = do.call("sprintf", list("%s-%.4f-", level, r$similarity))
system(sprintf("./scripts/plot-single-job.py %s %s", paste(r$jobid, collapse=","), paste(prefix, collapse=",")))
call = sprintf("%s %s %s", script, paste(r$jobid, collapse=","), paste(prefix, collapse=","))
print(call)
system(call)
}
system(sprintf("./scripts/extract-conf-data.sh %s > jobs-%s.txt", paste(r$jobid, collapse=" "), level))
@ -88,7 +96,7 @@ for (level in levels(data$alg_name)){
userprofile$userrank = 1:nrow(userprofile)
result.userid = rbind(result.userid, cbind(level, userprofile))
plotJobs(jobs)
plotJobs(level, jobs)
}
colnames(result.userid) = c("alg_name", "user_id", "count", "userrank")

Loading…
Cancel
Save