mistral-io-datasets/scripts/plot-job-timelines.py

#!/usr/bin/env python3

import csv
import sys
from pandas import DataFrame
from pandas import Grouper
from matplotlib import pyplot
import matplotlib.cm as cm

# Command line: <jobids> <prefixes>, both comma-separated. The lists are
# index-aligned: the i-th prefix is used for the i-th job id.
if len(sys.argv) < 3:
    # Fail with a readable message instead of a bare IndexError traceback.
    sys.exit("Usage: " + sys.argv[0] + " <jobid>[,<jobid>...] <prefix>[,<prefix>...]")

jobs = sys.argv[1].split(",")
prefix = sys.argv[2].split(",")

# Extension (including the dot) appended to every output file name.
fileformat = ".pdf"

print("Plotting the job: " + str(sys.argv[1]))
print("Plotting with prefix: " + str(sys.argv[2]))


# Color map: every metric is pinned to one tab10 entry (indices 0..8, in the
# order listed) so that a metric keeps the same color in every figure.
colorMap = {
    metric: cm.tab10(position)
    for position, metric in enumerate((
        "md_file_create",
        "md_file_delete",
        "md_mod",
        "md_other",
        "md_read",
        "read_bytes",
        "read_calls",
        "write_bytes",
        "write_calls",
    ))
}

# Marker map: one distinct matplotlib marker symbol per metric.
markerMap = dict(
    md_file_create="^",
    md_file_delete="v",
    md_other=".",
    md_mod="<",
    md_read=">",
    read_bytes="h",
    read_calls="H",
    write_bytes="D",
    write_calls="d",
)

# Line style map: the md_* metrics are dotted, the read metrics dashed and
# the write metrics dash-dotted.
linestyleMap = {
    **dict.fromkeys(
        ("md_file_create", "md_file_delete", "md_mod", "md_other", "md_read"),
        ":",
    ),
    **dict.fromkeys(("read_bytes", "read_calls"), "--"),
    **dict.fromkeys(("write_bytes", "write_calls"), "-."),
}

# Plot the timeseries
def _render_timeseries(metrics, labels, colors, style, path, xlim=None):
  """Plot the columns of *metrics* and save the figure to *path*.

  Fewer than four metrics are overlaid in a single fixed-size axes with a
  legend; otherwise every metric gets its own subplot, labelled with the
  metric name. *xlim* is forwarded to ``DataFrame.plot``; ``None`` keeps the
  automatic x limits.
  """
  # Drop figures left over from earlier calls so they do not accumulate.
  pyplot.close('all')

  if len(labels) < 4:
    ax = metrics.plot(legend=True, xlim=xlim, sharex=True, grid=True, sharey=True, markersize=10, figsize=(8, 2), color=colors, style=style)
    ax.set_ylabel("Value")
  else:
    # The figure height grows with the number of stacked subplots.
    fsize = (8, 1 + 1.1 * len(labels))
    axes = metrics.plot(subplots=True, xlim=xlim, legend=False, sharex=True, grid=True, sharey=True, markersize=10, figsize=fsize, color=colors, style=style)
    for axis, label in zip(axes, labels):
      axis.set_ylabel(label)

  pyplot.xlabel("Segment number")
  pyplot.savefig(path, bbox_inches='tight', dpi=150)


def plot(prefix, header, row):
  """Plot the per-segment metric time series of one job.

  Parameters:
    prefix: string prepended to the output file names.
    header: column names of the CSV file; must contain "jobid" and one
            "q16_<metric>" column for every metric in ``colorMap``.
    row:    the CSV record of the job, aligned with *header*. Each q16_*
            field is a ":"-separated list of numbers, one per segment.

  Writes ``<prefix>timeseries<jobid><fileformat>``. When the job has more
  than 50 segments, a second figure restricted to x in [0, 30] is written
  with a "-30" suffix. Metrics whose values sum to zero are left out; if
  nothing remains, a message is printed and no file is written.

  Raises KeyError if a required column is missing and ValueError if a field
  cannot be parsed as numbers.
  """
  record = dict(zip(header, row))
  jobid = record["jobid"]

  # Pick the q16_* columns explicitly; any other column in the record is
  # ignored. Metrics are visited in alphabetical order, which fixes the
  # column (and subplot) order of the figure.
  series = {}
  for metric in sorted(colorMap):
    values = [float(v) for v in record["q16_" + metric].split(":")]
    # A metric that sums to zero over all segments is not plotted.
    if sum(values) == 0:
      continue
    series[metric] = values

  if not series:
    print("Empty job! Cannot plot!")
    return

  # Shorter display names for the two file metrics.
  display_names = {"md_file_delete": "file_delete", "md_file_create": "file_create"}

  labels = []
  colors = []
  style = []
  columns = {}
  for metric, values in series.items():
    style.append(linestyleMap[metric] + markerMap[metric])
    colors.append(colorMap[metric])
    label = display_names.get(metric, metric)
    columns[label] = values
    labels.append(label)

  # One column per plotted metric, one row per segment.
  metrics = DataFrame(columns)

  base = prefix + "timeseries" + jobid
  _render_timeseries(metrics, labels, colors, style, base + fileformat)

  # Long jobs additionally get a zoomed view of the first 30 segments.
  # The row count of the plotted frame is the number of segments.
  if len(metrics) <= 50:
    return

  _render_timeseries(metrics, labels, colors, style, base + "-30" + fileformat, xlim=(0, 30))

### end plotting function


#with open('job-io-datasets/datasets/job_codings.csv') as csv_file: # EB: old codings
# Scan the job codings CSV and plot every record whose job id (first column)
# was requested on the command line.
# newline='' lets the csv module do its own line-ending handling, as its
# documentation requires for file objects.
with open('./datasets/job_codings_v3.csv', newline='') as csv_file: # EB: v3 codings moved to this repo
    csv_reader = csv.reader(csv_file, delimiter=',')
    # The first record holds the column names (None if the file is empty).
    header = next(csv_reader, None)
    for row in csv_reader:
        # csv.reader yields an empty list for a blank line; skip it.
        if not row:
            continue
        job = row[0].strip()
        if job not in jobs:
            continue
        # The position of the job on the command line selects its prefix and
        # is also appended to it, keeping output names unique per job.
        index = jobs.index(job)
        plot(prefix[index] + "-" + str(index), header, row)
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`#!/usr/bin/env python3`

			`import csv`
			`import sys`
			`from pandas import DataFrame`
			`from pandas import Grouper`
			`from matplotlib import pyplot`
Fix Color map for job vis. 2020-08-19 18:01:48 +00:00			`import matplotlib.cm as cm`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Optimization 2020-08-20 10:48:27 +00:00			`jobs = sys.argv[1].split(",")`
			`prefix = sys.argv[2].split(",")`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Update 2020-10-01 16:10:27 +00:00			`fileformat = ".pdf"`
Better plotting. 2020-08-20 11:11:35 +00:00
Bugfix plotting of jobs. More details in paper. 2020-08-20 15:16:46 +00:00			`print("Plotting the job: " + str(sys.argv[1]))`
			`print("Plotting with prefix: " + str(sys.argv[2]))`

Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Fix Color map for job vis. 2020-08-19 18:01:48 +00:00			`# Color map`
			`colorMap = { "md_file_create": cm.tab10(0),`
			`"md_file_delete": cm.tab10(1),`
			`"md_mod": cm.tab10(2),`
			`"md_other": cm.tab10(3),`
			`"md_read": cm.tab10(4),`
			`"read_bytes": cm.tab10(5),`
			`"read_calls": cm.tab10(6),`
			`"write_bytes": cm.tab10(7),`
			`"write_calls": cm.tab10(8)`
			`}`

Plot diagrams more recognizable. 2020-08-19 18:23:10 +00:00			`markerMap = { "md_file_create": "^",`
			`"md_file_delete": "v",`
			`"md_other": ".",`
			`"md_mod": "<",`
			`"md_read": ">",`
			`"read_bytes": "h",`
			`"read_calls": "H",`
			`"write_bytes": "D",`
			`"write_calls": "d"`
			`}`

			`linestyleMap = { "md_file_create": ":",`
			`"md_file_delete": ":",`
			`"md_mod": ":",`
			`"md_other": ":",`
			`"md_read": ":",`
			`"read_bytes": "--",`
			`"read_calls": "--",`
			`"write_bytes": "-.",`
			`"write_calls": "-."`
			`}`

Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`# Plot the timeseries`
Plot the 100 jobs. 2020-08-18 14:26:29 +00:00			`def plot(prefix, header, row):`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`x = { h : d for (h, d) in zip(header, row)}`
			`jobid = x["jobid"]`
			`del x["jobid"]`
Adaption to new dataset 2020-08-26 12:26:55 +00:00			`del x["bcoding"]`
			`# EB: Removing segment mean values`
			`del x["mean_md_file_create"]`
			`del x["mean_md_file_delete"]`
			`del x["mean_md_mod"]`
			`del x["mean_md_other"]`
			`del x["mean_md_read"]`
			`del x["mean_read_bytes"]`
			`del x["mean_read_calls"]`
			`del x["mean_write_bytes"]`
			`del x["mean_write_calls"]`
			`# EB: Renaming dict keys`
			`x["md_file_create"] = x.pop("q16_md_file_create")`
			`x["md_file_delete"] = x.pop("q16_md_file_delete")`
			`x["md_mod"] = x.pop("q16_md_mod")`
			`x["md_other"] = x.pop("q16_md_other")`
			`x["md_read"] = x.pop("q16_md_read")`
			`x["read_bytes"] = x.pop("q16_read_bytes")`
			`x["read_calls"] = x.pop("q16_read_calls")`
			`x["write_bytes"] = x.pop("q16_write_bytes")`
			`x["write_calls"] = x.pop("q16_write_calls")`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
			`result = []`
			`for k in x:`
			`timeseries = x[k].split(":")`
			`timeseries = [ float(x) for x in timeseries]`
			`if sum(timeseries) == 0:`
			`continue`
Fix and new 2020-09-03 11:14:40 +00:00			`timeseries = [ [k, x, s] for (s,x) in zip(timeseries, range(0, len(timeseries))) ]`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`result.extend(timeseries)`

Tune size + output 2020-08-18 14:46:05 +00:00			`if len(result) == 0:`
			`print("Empty job! Cannot plot!")`
			`return`

Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`data = DataFrame(result, columns=["metrics", "segment", "value"])`
			`groups = data.groupby(["metrics"])`
			`metrics = DataFrame()`
			`labels = []`
Fix Color map for job vis. 2020-08-19 18:01:48 +00:00			`colors = []`
Plot diagrams more recognizable. 2020-08-19 18:23:10 +00:00			`style = []`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`for name, group in groups:`
Plot diagrams more recognizable. 2020-08-19 18:23:10 +00:00			`style.append(linestyleMap[name] + markerMap[name])`
Fix Color map for job vis. 2020-08-19 18:01:48 +00:00			`colors.append(colorMap[name])`
Nai 2020-08-21 18:12:33 +00:00			`if name == "md_file_delete":`
			`name = "file_delete"`
			`if name == "md_file_create":`
			`name = "file_create"`
			`metrics[name] = [x[2] for x in group.values]`
			`labels.append(name)`
Fix Color map for job vis. 2020-08-19 18:01:48 +00:00
Nai 2020-08-21 18:12:33 +00:00			`fsize = (8, 1 + 1.1 * len(labels))`
Optimization 2020-08-20 10:48:27 +00:00			`fsizeFixed = (8, 2)`
Better plotting. 2020-08-20 11:11:35 +00:00
Optimization 2020-08-20 10:48:27 +00:00			`pyplot.close('all')`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Plot diagrams more recognizable. 2020-08-19 18:23:10 +00:00			`if len(labels) < 4 :`
			`ax = metrics.plot(legend=True, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsizeFixed, color=colors, style=style)`
			`ax.set_ylabel("Value")`
			`else:`
			`ax = metrics.plot(subplots=True, legend=False, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsize, color=colors, style=style)`
			`for (i, l) in zip(range(0, len(labels)), labels):`
			`ax[i].set_ylabel(l)`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
			`pyplot.xlabel("Segment number")`
Nai 2020-08-21 18:12:33 +00:00			`pyplot.savefig(prefix + "timeseries" + jobid + fileformat, bbox_inches='tight', dpi=150)`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Tune size + output 2020-08-18 14:46:05 +00:00			`# Plot first 30 segments`
			`if len(timeseries) <= 50:`
			`return`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Plot diagrams more recognizable. 2020-08-19 18:23:10 +00:00
			`if len(labels) < 4 :`
			`ax = metrics.plot(legend=True, xlim=(0,30), sharex=True, grid = True, sharey=True, markersize=10, figsize=fsizeFixed, color=colors, style=style)`
			`ax.set_ylabel("Value")`
			`else:`
			`ax = metrics.plot(subplots=True, xlim=(0,30), legend=False, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsize, color=colors, style=style)`
			`for (i, l) in zip(range(0, len(labels)), labels):`
			`ax[i].set_ylabel(l)`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
			`pyplot.xlabel("Segment number")`
Nai 2020-08-21 18:12:33 +00:00			`pyplot.savefig(prefix + "timeseries" + jobid + "-30" + fileformat, bbox_inches='tight', dpi=150)`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Plot the 100 jobs. 2020-08-18 14:26:29 +00:00			`### end plotting function`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00


Adaption to new dataset 2020-08-26 12:26:55 +00:00			`#with open('job-io-datasets/datasets/job_codings.csv') as csv_file: # EB: old codings`
			`with open('./datasets/job_codings_v3.csv') as csv_file: # EB: v3 codings moved to this repo`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`csv_reader = csv.reader(csv_file, delimiter=',')`
			`line_count = 0`
			`for row in csv_reader:`
			`if line_count == 0:`
			`header = row`
			`line_count += 1`
			`continue`
Bugfix plotting of jobs. More details in paper. 2020-08-20 15:16:46 +00:00			`job = row[0].strip()`
			`if not job in jobs:`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`continue`
			`else:`
Bugfix plotting of jobs. More details in paper. 2020-08-20 15:16:46 +00:00			`index = jobs.index(job)`
			`plot(prefix[index] + "-" + str(index), header, row)`