mistral-io-datasets/scripts/plot-job-timelines-ks.py

155 lines
4.3 KiB
Python
Raw Permalink Normal View History

2020-08-18 13:55:37 +00:00
#!/usr/bin/env python3
import csv
import sys
2020-09-03 12:59:20 +00:00
import pandas as pd
2020-08-18 13:55:37 +00:00
from pandas import DataFrame
from pandas import Grouper
2020-09-03 12:59:20 +00:00
import seaborn as sns
2020-08-18 13:55:37 +00:00
from matplotlib import pyplot
2020-08-19 18:01:48 +00:00
import matplotlib.cm as cm
2020-08-18 13:55:37 +00:00
2020-08-20 10:48:27 +00:00
# Command line: <jobid>[,<jobid>...] <prefix>[,<prefix>...]
# Fail with a readable usage message instead of a bare IndexError when an
# argument is missing.
if len(sys.argv) < 3:
    sys.exit("Usage: " + sys.argv[0] + " <jobid>[,<jobid>...] <prefix>[,<prefix>...]")

# Comma-separated job ids to plot, and the output-name prefixes; the driver
# loop at the end of the file looks both lists up by the same index.
jobs = sys.argv[1].split(",")
prefix = sys.argv[2].split(",")

# Extension appended to every figure file name written by plot().
fileformat = ".pdf"

print("Plotting the job: " + str(sys.argv[1]))
print("Plotting with prefix: " + str(sys.argv[2]))
2020-08-18 13:55:37 +00:00
2020-08-19 18:01:48 +00:00
# Color map: each metric gets its own tab10 colormap entry (entries 0..8),
# assigned in the order the metric names are listed here.
colorMap = {
    metric: cm.tab10(slot)
    for slot, metric in enumerate((
        "md_file_create",
        "md_file_delete",
        "md_mod",
        "md_other",
        "md_read",
        "read_bytes",
        "read_calls",
        "write_bytes",
        "write_calls",
    ))
}
2020-08-19 18:23:10 +00:00
# Marker symbol per metric; plot() appends it to the metric's line style to
# form the style string passed to DataFrame.plot.
markerMap = dict(
    md_file_create="^",
    md_file_delete="v",
    md_other=".",
    md_mod="<",
    md_read=">",
    read_bytes="h",
    read_calls="H",
    write_bytes="D",
    write_calls="d",
)
# Line style per metric, grouped by family: ":" for the md_* metrics,
# "--" for the read_* metrics and "-." for the write_* metrics.
linestyleMap = {
    **dict.fromkeys(
        ("md_file_create", "md_file_delete", "md_mod", "md_other", "md_read"),
        ":",
    ),
    **dict.fromkeys(("read_bytes", "read_calls"), "--"),
    **dict.fromkeys(("write_bytes", "write_calls"), "-."),
}
2020-08-18 13:55:37 +00:00
# Plot the timeseries
2020-08-18 14:26:29 +00:00
def plot(prefix, header, row):
    """Plot the timelines and value histograms of one job.

    Args:
        prefix: String prepended to every output file name.
        header: Column names of the CSV; must contain "jobid". Every other
            column name must be a key of colorMap, markerMap and
            linestyleMap.
        row: One CSV record aligned with ``header``. Every cell except the
            job id is a ":"-separated list of numbers, one per segment.

    Writes ``<prefix>timeseries<jobid><fileformat>`` and
    ``<prefix>hist<jobid><fileformat>``. When the job has more than 50
    segments it also writes ``<prefix>timeseries<jobid>-30<fileformat>``,
    the same timeline with the x axis limited to 0..30.

    Returns None. Prints a message and returns without writing anything if
    every metric is all-zero or if a metric column cannot be assembled.
    """
    cells = dict(zip(header, row))
    jobid = cells.pop("jobid")

    records = []
    segment_count = 0
    for metric, encoded in cells.items():
        values = [float(v) for v in encoded.split(":")]
        # Length of the most recently parsed column; this decides further
        # down whether the zoomed plot is produced.
        segment_count = len(values)
        if sum(values) == 0:
            continue  # metrics without any activity are not plotted
        records.extend(
            [metric, segment, value] for segment, value in enumerate(values)
        )

    if not records:
        print("Empty job! Cannot plot!")
        return

    data = DataFrame(records, columns=["metrics", "segment", "value"])
    display_names = {
        "md_file_delete": "file_delete",
        "md_file_create": "file_create",
    }

    metrics = DataFrame()
    labels = []
    colors = []
    styles = []
    # Group by the column label, not by a one-element list: with a list
    # grouper pandas >= 2 yields 1-tuples as keys, which are not keys of the
    # style dictionaries.
    for name, group in data.groupby("metrics"):
        styles.append(linestyleMap[name] + markerMap[name])
        colors.append(colorMap[name])
        label = display_names.get(name, name)
        try:
            metrics[label] = pd.Series(group["value"].tolist())
        except Exception:
            # Best effort: report the offending job and skip it.
            print("Error processing %s with" % jobid)
            print(group.values)
            return
        labels.append(label)

    pyplot.close('all')

    # Full timeline.
    _draw_timeline(metrics, labels, colors, styles)
    pyplot.savefig(prefix + "timeseries" + jobid + fileformat, bbox_inches='tight', dpi=150)

    # Histogram of the values of every metric, 15 bins over the range 0..15.
    metrics.hist(grid=True, sharey=True, figsize=(8, 6.5), bins=15, range=(0, 15))
    pyplot.xlim(0, 15)
    pyplot.savefig(prefix + "hist" + jobid + fileformat, bbox_inches='tight', dpi=150)

    # Zoomed timeline (x axis 0..30), only for jobs with more than 50 segments.
    if segment_count <= 50:
        return

    _draw_timeline(metrics, labels, colors, styles, xlim=(0, 30))
    pyplot.savefig(prefix + "timeseries" + jobid + "-30" + fileformat, bbox_inches='tight', dpi=150)


def _draw_timeline(metrics, labels, colors, styles, **plot_kwargs):
    """Draw the metric timelines on a new figure; the caller saves it."""
    shared = dict(sharex=True, grid=True, sharey=True, markersize=10,
                  color=colors, style=styles, **plot_kwargs)
    if len(labels) < 4:
        # Few metrics: overlay them in one fixed-size axes with a legend.
        ax = metrics.plot(legend=True, figsize=(8, 2), **shared)
        ax.set_ylabel("Value")
    else:
        # Many metrics: one subplot per metric, named on its y axis; the
        # figure height grows with the number of metrics.
        axes = metrics.plot(subplots=True, legend=False,
                            figsize=(8, 1 + 1.1 * len(labels)), **shared)
        for axis, label in zip(axes, labels):
            axis.set_ylabel(label)
    pyplot.xlabel("Segment number")
2020-08-18 13:55:37 +00:00
2020-08-18 14:26:29 +00:00
### end plotting function
2020-08-18 13:55:37 +00:00
2020-08-26 12:26:55 +00:00
#with open('job-io-datasets/datasets/job_codings.csv') as csv_file: # EB: old codings
2020-09-03 12:59:20 +00:00
# EB: v3 codings moved to this repo
# Scan the codings CSV and plot every record whose first column is one of the
# requested job ids. newline='' is what the csv module asks for when it is
# handed a file object.
with open('./datasets/job_codings_v4.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    header = next(csv_reader, None)  # first record holds the column names
    for row in csv_reader:
        if not row:
            continue  # blank line: csv.reader yields an empty list
        job = row[0].strip()
        if job not in jobs:
            continue
        # The position of the job on the command line selects its prefix
        # and is appended to the output name.
        index = jobs.index(job)
        plot(prefix[index] + "-ks-" + str(index), header, row)