mistral-io-datasets/scripts/plot-job-timelines-ks.py

#!/usr/bin/env python3

import csv
import sys
import pandas as pd
from pandas import DataFrame
from pandas import Grouper
import seaborn as sns
from matplotlib import pyplot
import matplotlib.cm as cm

# Command line: JOBID[,JOBID...] PREFIX[,PREFIX...]
# Both arguments are comma-separated lists; the prefix at position i is used
# for the job id at position i when the matching CSV row is plotted below.
if len(sys.argv) < 3:
    sys.exit("Usage: %s JOBID[,JOBID...] PREFIX[,PREFIX...]" % sys.argv[0])

jobs = sys.argv[1].split(",")
prefix = sys.argv[2].split(",")

if len(prefix) < len(jobs):
    # Not fatal here: the lookup only fails if one of the jobs without a
    # prefix is actually found in the CSV. Warn early so the later
    # IndexError is not a surprise.
    print("Warning: %d job id(s) given but only %d prefix(es)"
          % (len(jobs), len(prefix)), file=sys.stderr)

# Extension appended to every output file name passed to pyplot.savefig.
fileformat = ".pdf"

print("Plotting the job: " + str(sys.argv[1]))
print("Plotting with prefix: " + str(sys.argv[2]))


# Color map: metric name -> color. Each metric takes the cm.tab10 entry whose
# index equals the metric's position in the tuple below (0 through 8).
colorMap = {
    metric: cm.tab10(slot)
    for slot, metric in enumerate((
        "md_file_create",
        "md_file_delete",
        "md_mod",
        "md_other",
        "md_read",
        "read_bytes",
        "read_calls",
        "write_bytes",
        "write_calls",
    ))
}

# Marker map: metric name -> matplotlib marker character. The marker is
# appended to the metric's line style to form the plot style string.
markerMap = dict(
    md_file_create="^",
    md_file_delete="v",
    md_other=".",
    md_mod="<",
    md_read=">",
    read_bytes="h",
    read_calls="H",
    write_bytes="D",
    write_calls="d",
)

# Line style map: metric name -> matplotlib line style. Metrics are grouped
# by style: all md_* metrics are dotted, read_* dashed, write_* dash-dotted.
linestyleMap = {
    **dict.fromkeys(
        ("md_file_create", "md_file_delete", "md_mod", "md_other", "md_read"),
        ":",
    ),
    **dict.fromkeys(("read_bytes", "read_calls"), "--"),
    **dict.fromkeys(("write_bytes", "write_calls"), "-."),
}

def _draw_timeseries(metrics, labels, colors, style, xlim=None):
    """Draw the time-series figure for one job (the caller saves it).

    With fewer than four series everything is drawn on one axes with a
    legend and figsize (8, 2). Otherwise every series gets its own subplot,
    labelled on the y axis, and the figure height grows with the number of
    series.

    Args:
        metrics: DataFrame with one column per plotted series.
        labels: column labels, in the same order as the columns.
        colors: one color per column.
        style: one matplotlib style string (line style + marker) per column.
        xlim: optional (low, high) x-axis limits; omitted when None.
    """
    options = dict(sharex=True, grid=True, sharey=True, markersize=10,
                   color=colors, style=style)
    if xlim is not None:
        options["xlim"] = xlim

    if len(labels) < 4:
        axis = metrics.plot(legend=True, figsize=(8, 2), **options)
        axis.set_ylabel("Value")
    else:
        axes = metrics.plot(subplots=True, legend=False,
                            figsize=(8, 1 + 1.1 * len(labels)), **options)
        for axis, label in zip(axes, labels):
            axis.set_ylabel(label)

    pyplot.xlabel("Segment number")


# Plot the timeseries
def plot(prefix, header, row):
    """Plot the time series and value histogram of one job.

    Args:
        prefix: string prepended to every output file name.
        header: column names of the CSV; must contain "jobid".
        row: the cells of one CSV row, parallel to ``header``. Every cell
            other than the job id is a ":"-separated list of numbers, one
            per segment.

    Columns whose values sum to zero are skipped. If nothing is left, a
    message is printed and no file is written. Otherwise the function saves

    * ``prefix + "timeseries" + jobid + fileformat``
    * ``prefix + "hist" + jobid + fileformat``
    * ``prefix + "timeseries" + jobid + "-30" + fileformat`` (x axis limited
      to 0..30), only when the job has more than 50 segments.

    Raises:
        KeyError: if ``header`` has no "jobid" column or a plotted column
            name is missing from the style/color maps.
        ValueError: if a cell contains a value that is not a number.
    """
    columns = dict(zip(header, row))
    jobid = columns.pop("jobid")

    # Parse every metric column; keep only those with a non-zero sum.
    series = {}
    for metric, encoded in columns.items():
        values = [float(item) for item in encoded.split(":")]
        if sum(values) == 0:
            continue
        series[metric] = values

    if not series:
        print("Empty job! Cannot plot!")
        return

    # Shorter display names for two of the metrics.
    label_overrides = {
        "md_file_delete": "file_delete",
        "md_file_create": "file_create",
    }

    metrics = DataFrame()
    labels = []
    colors = []
    style = []
    # Iterate the plain metric names in sorted order. This is the order the
    # former groupby produced, without depending on the group-key type:
    # pandas >= 2.0 hands back 1-tuples for a length-1 list of keys, which
    # broke the map lookups below.
    for metric in sorted(series):
        style.append(linestyleMap[metric] + markerMap[metric])
        colors.append(colorMap[metric])
        label = label_overrides.get(metric, metric)
        try:
            metrics[label] = pd.Series(series[metric])
        except Exception:
            print("Error processing %s with" % jobid)
            print(series[metric])
            return
        labels.append(label)

    # Drop figures left over from a previous job.
    pyplot.close('all')

    # Full time series.
    _draw_timeseries(metrics, labels, colors, style)
    pyplot.savefig(prefix + "timeseries" + jobid + fileformat, bbox_inches='tight', dpi=150)

    # Histogram of the values of every series: 15 bins over the range 0..15.
    metrics.hist(grid=True, sharey=True, figsize=(8, 6.5), bins=15, range=(0, 15))
    pyplot.xlim(0, 15)
    pyplot.savefig(prefix + "hist" + jobid + fileformat, bbox_inches='tight', dpi=150)

    # Plot first 30 segments -- only for jobs with more than 50 segments.
    if len(metrics.index) <= 50:
        return

    _draw_timeseries(metrics, labels, colors, style, xlim=(0, 30))
    pyplot.savefig(prefix + "timeseries" + jobid + "-30" + fileformat, bbox_inches='tight', dpi=150)

### end plotting function


# Scan the job codings and plot every row whose job id (first column) was
# requested on the command line.
# EB: the codings used to be read from
# job-io-datasets/datasets/job_codings.csv; the v4 file now lives in this repo.
with open('./datasets/job_codings_v4.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # The first line holds the column names; None if the file is empty.
    header = next(csv_reader, None)
    for row in csv_reader:
        if not row:
            # csv.reader yields an empty list for a blank line.
            continue
        job = row[0].strip()
        if job not in jobs:
            continue
        # Position of the job on the command line selects its prefix and is
        # also embedded in the output file name.
        index = jobs.index(job)
        plot(prefix[index] + "-ks-" + str(index), header, row)
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`#!/usr/bin/env python3`

			`import csv`
			`import sys`
Renamed 2020-09-03 12:59:20 +00:00			`import pandas as pd`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`from pandas import DataFrame`
			`from pandas import Grouper`
Renamed 2020-09-03 12:59:20 +00:00			`import seaborn as sns`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`from matplotlib import pyplot`
Fix Color map for job vis. 2020-08-19 18:01:48 +00:00			`import matplotlib.cm as cm`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Optimization 2020-08-20 10:48:27 +00:00			`jobs = sys.argv[1].split(",")`
			`prefix = sys.argv[2].split(",")`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Update 2020-10-01 16:10:27 +00:00			`fileformat = ".pdf"`
Better plotting. 2020-08-20 11:11:35 +00:00
Bugfix plotting of jobs. More details in paper. 2020-08-20 15:16:46 +00:00			`print("Plotting the job: " + str(sys.argv[1]))`
			`print("Plotting with prefix: " + str(sys.argv[2]))`

Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Fix Color map for job vis. 2020-08-19 18:01:48 +00:00			`# Color map`
			`colorMap = { "md_file_create": cm.tab10(0),`
			`"md_file_delete": cm.tab10(1),`
			`"md_mod": cm.tab10(2),`
			`"md_other": cm.tab10(3),`
			`"md_read": cm.tab10(4),`
			`"read_bytes": cm.tab10(5),`
			`"read_calls": cm.tab10(6),`
			`"write_bytes": cm.tab10(7),`
			`"write_calls": cm.tab10(8)`
			`}`

Plot diagrams more recognizable. 2020-08-19 18:23:10 +00:00			`markerMap = { "md_file_create": "^",`
			`"md_file_delete": "v",`
			`"md_other": ".",`
			`"md_mod": "<",`
			`"md_read": ">",`
			`"read_bytes": "h",`
			`"read_calls": "H",`
			`"write_bytes": "D",`
			`"write_calls": "d"`
			`}`

			`linestyleMap = { "md_file_create": ":",`
			`"md_file_delete": ":",`
			`"md_mod": ":",`
			`"md_other": ":",`
			`"md_read": ":",`
			`"read_bytes": "--",`
			`"read_calls": "--",`
			`"write_bytes": "-.",`
			`"write_calls": "-."`
			`}`

Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`# Plot the timeseries`
Plot the 100 jobs. 2020-08-18 14:26:29 +00:00			`def plot(prefix, header, row):`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`x = { h : d for (h, d) in zip(header, row)}`
			`jobid = x["jobid"]`
			`del x["jobid"]`
			`result = []`
			`for k in x:`
			`timeseries = x[k].split(":")`
			`timeseries = [ float(x) for x in timeseries]`
			`if sum(timeseries) == 0:`
			`continue`
Fix and new 2020-09-03 11:14:40 +00:00			`timeseries = [ [k, x, s] for (s,x) in zip(timeseries, range(0, len(timeseries))) ]`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`result.extend(timeseries)`

Tune size + output 2020-08-18 14:46:05 +00:00			`if len(result) == 0:`
			`print("Empty job! Cannot plot!")`
			`return`

Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`data = DataFrame(result, columns=["metrics", "segment", "value"])`
			`groups = data.groupby(["metrics"])`
			`metrics = DataFrame()`
			`labels = []`
Fix Color map for job vis. 2020-08-19 18:01:48 +00:00			`colors = []`
Plot diagrams more recognizable. 2020-08-19 18:23:10 +00:00			`style = []`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`for name, group in groups:`
Plot diagrams more recognizable. 2020-08-19 18:23:10 +00:00			`style.append(linestyleMap[name] + markerMap[name])`
Fix Color map for job vis. 2020-08-19 18:01:48 +00:00			`colors.append(colorMap[name])`
Nai 2020-08-21 18:12:33 +00:00			`if name == "md_file_delete":`
			`name = "file_delete"`
			`if name == "md_file_create":`
			`name = "file_create"`
Renamed 2020-09-03 12:59:20 +00:00			`try:`
			`metrics[name] = pd.Series([x[2] for x in group.values])`
			`except:`
			`print("Error processing %s with" % jobid)`
			`print(group.values)`
			`return`

Nai 2020-08-21 18:12:33 +00:00			`labels.append(name)`
Fix Color map for job vis. 2020-08-19 18:01:48 +00:00
Nai 2020-08-21 18:12:33 +00:00			`fsize = (8, 1 + 1.1 * len(labels))`
Optimization 2020-08-20 10:48:27 +00:00			`fsizeFixed = (8, 2)`
Script gefixt 2020-09-05 16:48:44 +00:00			`fsizeHist = (8, 6.5)`
Better plotting. 2020-08-20 11:11:35 +00:00
Optimization 2020-08-20 10:48:27 +00:00			`pyplot.close('all')`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Plot diagrams more recognizable. 2020-08-19 18:23:10 +00:00			`if len(labels) < 4 :`
			`ax = metrics.plot(legend=True, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsizeFixed, color=colors, style=style)`
			`ax.set_ylabel("Value")`
			`else:`
			`ax = metrics.plot(subplots=True, legend=False, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsize, color=colors, style=style)`
			`for (i, l) in zip(range(0, len(labels)), labels):`
			`ax[i].set_ylabel(l)`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
			`pyplot.xlabel("Segment number")`
Nai 2020-08-21 18:12:33 +00:00			`pyplot.savefig(prefix + "timeseries" + jobid + fileformat, bbox_inches='tight', dpi=150)`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Renamed 2020-09-03 12:59:20 +00:00			`# Create a facetted grid`
			`#g = sns.FacetGrid(tips, col="time", margin_titles=True)`
			`#bins = np.linspace(0, 60, 13)`
			`#g.map(plt.hist, "total_bill", color="steelblue", bins=bins)`
Script gefixt 2020-09-05 16:48:44 +00:00			`ax = metrics.hist(grid = True, sharey=True, figsize=fsizeHist, bins=15, range=(0, 15))`
			`pyplot.xlim(0, 15)`
Renamed 2020-09-03 12:59:20 +00:00			`pyplot.savefig(prefix + "hist" + jobid + fileformat, bbox_inches='tight', dpi=150)`


Tune size + output 2020-08-18 14:46:05 +00:00			`# Plot first 30 segments`
			`if len(timeseries) <= 50:`
			`return`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Plot diagrams more recognizable. 2020-08-19 18:23:10 +00:00			`if len(labels) < 4 :`
			`ax = metrics.plot(legend=True, xlim=(0,30), sharex=True, grid = True, sharey=True, markersize=10, figsize=fsizeFixed, color=colors, style=style)`
			`ax.set_ylabel("Value")`
			`else:`
			`ax = metrics.plot(subplots=True, xlim=(0,30), legend=False, sharex=True, grid = True, sharey=True, markersize=10, figsize=fsize, color=colors, style=style)`
			`for (i, l) in zip(range(0, len(labels)), labels):`
			`ax[i].set_ylabel(l)`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
			`pyplot.xlabel("Segment number")`
Nai 2020-08-21 18:12:33 +00:00			`pyplot.savefig(prefix + "timeseries" + jobid + "-30" + fileformat, bbox_inches='tight', dpi=150)`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00
Plot the 100 jobs. 2020-08-18 14:26:29 +00:00			`### end plotting function`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00


Adaption to new dataset 2020-08-26 12:26:55 +00:00			`#with open('job-io-datasets/datasets/job_codings.csv') as csv_file: # EB: old codings`
Renamed 2020-09-03 12:59:20 +00:00			`with open('./datasets/job_codings_v4.csv') as csv_file: # EB: v3 codings moved to this repo`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`csv_reader = csv.reader(csv_file, delimiter=',')`
			`line_count = 0`
			`for row in csv_reader:`
			`if line_count == 0:`
			`header = row`
			`line_count += 1`
			`continue`
Bugfix plotting of jobs. More details in paper. 2020-08-20 15:16:46 +00:00			`job = row[0].strip()`
			`if not job in jobs:`
Plot one or multiple jobs. 2020-08-18 13:55:37 +00:00			`continue`
			`else:`
Bugfix plotting of jobs. More details in paper. 2020-08-20 15:16:46 +00:00			`index = jobs.index(job)`
Renamed 2020-09-03 12:59:20 +00:00			`plot(prefix[index] + "-ks-" + str(index), header, row)`