Renamed

2020-09-03 13:59:20 +01:00 · 2020-09-03 13:59:20 +01:00 · ea893d76f0
commit ea893d76f0
parent 65f8cdb98d
5 changed files with 170 additions and 8 deletions
--- a/scripts/analyse-all.sh
+++ b/scripts/analyse-all.sh
@ -23,14 +23,14 @@ prepare
 for I in job_similarities_*.csv ; do
  rm *.png *.pdf
-  ./scripts/plot.R $I > description.txt
+  echo "processing $I"
  ./scripts/plot.R $I > description.txt 2>&1
  OUT=${I%%.csv}-out
  mkdir $OUT
  if [[ $CLEAN != "0" ]] ; then
    rm $OUT/*
    mv description.txt $OUT
  fi
-  mv *.png *.pdf jobs-*.txt $OUT
+  mv description.txt *.png *.pdf jobs-*.txt $OUT
 done
 # analyze performance data
--- a/scripts/plot-job-timelines-ks.py
+++ b/scripts/plot-job-timelines-ks.py
@ -0,0 +1,154 @@
 #!/usr/bin/env python3
 import csv
 import sys
 import pandas as pd
 from pandas import DataFrame
 from pandas import Grouper
 import seaborn as sns
 from matplotlib import pyplot
 import matplotlib.cm as cm
 # Command line: argv[1] = comma-separated job ids, argv[2] = comma-separated
 # output prefixes (the driver below pairs them by position).
 jobs = sys.argv[1].split(",")
 prefix = sys.argv[2].split(",")
 # Extension appended to every image file name written by plot().
 fileformat = ".png"
 print("Plotting the job: %s" % (sys.argv[1],))
 print("Plotting with prefix: %s" % (sys.argv[2],))
 # Color map: each metric gets its own slot (0..8) of matplotlib's tab10 colormap.
 colorMap = {
   metric: cm.tab10(slot)
   for slot, metric in enumerate((
     "md_file_create",
     "md_file_delete",
     "md_mod",
     "md_other",
     "md_read",
     "read_bytes",
     "read_calls",
     "write_bytes",
     "write_calls",
   ))
 }
 # Marker map: matplotlib marker character used for each metric's data points.
 markerMap = dict(
   md_file_create="^",
   md_file_delete="v",
   md_other=".",
   md_mod="<",
   md_read=">",
   read_bytes="h",
   read_calls="H",
   write_bytes="D",
   write_calls="d",
 )
 # Line-style map: metadata metrics are dotted, read metrics dashed,
 # write metrics dash-dotted.
 linestyleMap = {
   **dict.fromkeys(("md_file_create", "md_file_delete", "md_mod", "md_other", "md_read"), ":"),
   **dict.fromkeys(("read_bytes", "read_calls"), "--"),
   **dict.fromkeys(("write_bytes", "write_calls"), "-."),
 }
 # Plot the timeseries
 def _draw_timeseries(metrics, labels, colors, style, figsize_single, figsize_stacked, xlim=None):
   """Draw the metric time series into a new figure (private helper of plot()).
   With fewer than four metrics everything shares one axes with a legend;
   otherwise each metric gets its own subplot labelled with the metric name.
   xlim, when given, restricts the x-axis range.
   """
   options = dict(sharex=True, grid=True, sharey=True, markersize=10, color=colors, style=style)
   if xlim is not None:
     options["xlim"] = xlim
   if len(labels) < 4:
     ax = metrics.plot(legend=True, figsize=figsize_single, **options)
     ax.set_ylabel("Value")
   else:
     axes = metrics.plot(subplots=True, legend=False, figsize=figsize_stacked, **options)
     for axis, label in zip(axes, labels):
       axis.set_ylabel(label)
   pyplot.xlabel("Segment number")
 def plot(prefix, header, row):
   """Plot the per-metric time series of one job and save them as images.
   Parameters:
     prefix -- string prepended to every output file name.
     header -- column names; must contain "jobid", all other columns are metrics.
     row    -- values matching header; each metric value is a ":"-separated
               list of numbers, one per segment.
   Writes prefix + "timeseries" + jobid, prefix + "hist" + jobid and, for jobs
   with more than 50 segments, prefix + "timeseries" + jobid + "-30" (first 30
   segments), each with the module-level fileformat extension.
   Metrics whose values sum to zero are skipped; if none remain, a message is
   printed and nothing is written. Returns None.
   Raises KeyError if "jobid" is missing or a metric has no entry in the
   style maps (linestyleMap, markerMap, colorMap).
   """
   fields = dict(zip(header, row))
   jobid = fields.pop("jobid")
   result = []
   # Longest series seen, including all-zero metrics; used for the "long job" check below.
   num_segments = 0
   for metric, encoded in fields.items():
     values = [float(v) for v in encoded.split(":")]
     num_segments = max(num_segments, len(values))
     if sum(values) == 0:
       continue
     result.extend([metric, segment, value] for segment, value in enumerate(values))
   if len(result) == 0:
     print("Empty job! Cannot plot!")
     return
   data = DataFrame(result, columns=["metrics", "segment", "value"])
   # Group by the scalar key: grouping by the list ["metrics"] yields 1-tuple
   # group names on pandas >= 2.0, which breaks the style-map lookups below.
   groups = data.groupby("metrics")
   metrics = DataFrame()
   labels = []
   colors = []
   style = []
   for name, group in groups:
     style.append(linestyleMap[name] + markerMap[name])
     colors.append(colorMap[name])
     # Shorter display names for the two file metadata metrics.
     if name == "md_file_delete":
       name = "file_delete"
     if name == "md_file_create":
       name = "file_create"
     try:
       metrics[name] = pd.Series([entry[2] for entry in group.values])
     except Exception:
       # Best effort: report the offending job and skip it (Ctrl-C still propagates).
       print("Error processing %s with" % jobid)
       print(group.values)
       return
     labels.append(name)
   fsize = (8, 1 + 1.1 * len(labels))
   fsizeFixed = (8, 2)
   fsizeHist = (8, 4)
   # Drop figures left over from a previous call before creating new ones.
   pyplot.close('all')
   _draw_timeseries(metrics, labels, colors, style, fsizeFixed, fsize)
   pyplot.savefig(prefix + "timeseries" + jobid + fileformat, bbox_inches='tight', dpi=150)
   # Histogram of the values of every metric
   metrics.hist(sharex=True, grid=True, sharey=True, figsize=fsizeHist, bins=10)
   pyplot.savefig(prefix + "hist" + jobid + fileformat, bbox_inches='tight', dpi=150)
   # Long jobs (more than 50 segments) additionally get a zoom on the first 30 segments
   if num_segments <= 50:
     return
   _draw_timeseries(metrics, labels, colors, style, fsizeFixed, fsize, xlim=(0, 30))
   pyplot.savefig(prefix + "timeseries" + jobid + "-30" + fileformat, bbox_inches='tight', dpi=150)
 ### end plotting function
 # Driver: scan the job codings file and plot every job requested on the command line.
 # (Older revisions read 'job-io-datasets/datasets/job_codings.csv'; the codings now live in this repo.)
 with open('./datasets/job_codings_v4.csv') as csv_file:
     csv_reader = csv.reader(csv_file, delimiter=',')
     # The first record holds the column names; all later records are jobs.
     header = next(csv_reader, None)
     for row in csv_reader:
       job = row[0].strip()
       if job in jobs:
         # The position in the requested job list picks the prefix and numbers the output.
         index = jobs.index(job)
         plot(prefix[index] + "-ks-" + str(index), header, row)
--- a/scripts/plot-job-timelines.py
+++ b/scripts/plot-job-timelines.py
--- a/scripts/plot-single-ks-jobs.py
+++ b/scripts/plot-single-ks-jobs.py
--- a/scripts/plot.R
+++ b/scripts/plot.R
@ -7,7 +7,7 @@ library(stringi)
 library(stringr)
 # Turn to TRUE to print individual job images
-plotjobs = FALSE
+plotjobs = TRUE
 # Color scheme
 plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000099")
@ -28,7 +28,7 @@ cat(nrow(data))
 # empirical cumulative density function (ECDF)
 data$sim = data$similarity*100
-ggplot(data, aes(sim, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("Similarity in %") + ylab("Fraction of jobs") + theme(legend.position=c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") + scale_x_log10()
+ggplot(data, aes(sim, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("Similarity in %") + ylab("Fraction of jobs") + theme(legend.position=c(0.05, 0.5),  legend.title = element_blank()) + scale_color_brewer(palette = "Set2") + scale_x_log10()
 ggsave("ecdf.png", width=8, height=2.5)
 # histogram for the jobs
@ -47,13 +47,21 @@ metadata = read.csv("./datasets/job_metadata.csv") # EB: is ebenfalls im Repo
 metadata$user_id = as.factor(metadata$user_id)
 metadata$group_id = as.factor(metadata$group_id)
-plotJobs = function(jobs){
+plotJobs = function(algorithm, jobs){
    # print the job timelines
    r = e[ordered, ]
    if (plotjobs) {
      if(algorithm == "ks"){
        script = "./scripts/plot-job-timelines-ks.py"
      }else{
        script = "./scripts/plot-job-timelines.py"
        return(0) ### FIXME
      }
      prefix = do.call("sprintf", list("%s-%.4f-", level, r$similarity))
-      system(sprintf("./scripts/plot-single-job.py %s %s", paste(r$jobid, collapse=","), paste(prefix, collapse=",")))
+      call = sprintf("%s %s %s", script, paste(r$jobid, collapse=","), paste(prefix, collapse=","))
      print(call)
      system(call)
    }
    system(sprintf("./scripts/extract-conf-data.sh %s > jobs-%s.txt", paste(r$jobid, collapse=" "), level))
@ -88,7 +96,7 @@ for (level in levels(data$alg_name)){
    userprofile$userrank = 1:nrow(userprofile)
    result.userid = rbind(result.userid, cbind(level, userprofile))
-    plotJobs(jobs)
+    plotJobs(level, jobs)
 }
 colnames(result.userid) = c("alg_name", "user_id", "count", "userrank")