From 7dc7328e34ababc39c3083e3bcf235b60e3f6014 Mon Sep 17 00:00:00 2001 From: "Julian M. Kunkel" Date: Fri, 21 Aug 2020 19:12:33 +0100 Subject: [PATCH] Nai --- fig/job-timeseries4296426.pdf | Bin 16479 -> 16457 bytes paper/main.tex | 142 +++++++++++++++++++++------------- scripts/analyse-all.sh | 9 ++- scripts/plot-single-job.py | 16 ++-- scripts/plot.R | 19 ++--- 5 files changed, 115 insertions(+), 71 deletions(-) diff --git a/fig/job-timeseries4296426.pdf b/fig/job-timeseries4296426.pdf index 1fa9c7105acf71dca5edf41c58d95241fc9a0c49..a43a026385a81859f765cf96f67fbec9138219f1 100644 GIT binary patch delta 4892 zcmZu!2UJtb+O@(3X@VLNFn}T|LK4yigQ6nc0wOg^?^2|N;?Wx+LIjjZ6$AyPNCyQZ zkxLC#yb%IYB1EM377&mh%e(je706%qg?E zSoAK^Vj#Z>t|ud2DcoW?XVmUmIC{{Bl3ETHejCt{IeB53Q!)weRr|W`rqUZEv;I!y z=<pghO}gK1A+>3uL&G$S)8 z;~z#V>zfLtLs#3o9>qfM(+6*lIE7~ndk&coTMS)wOA~W78!ytW#U-?m5XnDmijPXS`SJ zhIg)yD8YH#n;#wePrfwooxj8(PT7VpUS_0%Oi!j*fQ!P@?+r;7?t&SArr{!j zYed-PVAkiLwZZCMGTtBxgb38e! z051gG&l_~xeS|THTe|U@a_*bzUfj+KwVJk9{O_L$@6CRl%yMAo7w_iSIU&1n!le8U z8R;5#U#h+~$+(DtboH`+}pR`Z3( z1YpR2HX#ArM49&Q+|pBF>%`T`Uz_8Yo5s|HxMR(CxmT^Fi$L?yaKZ*SAE z*5eFz(pF!M;?@cNnK5_kulw}euyE6xKDHxuR{m4pRjoZ^XqEJR9XE?8OzRSnK@9GTp_e+6<{)gBl81THL*k4m?~t4jlMG1GVllb= z^_4-P#5GagbvYrP2Z?sp12-ybDv6;WF&`MGu0i6(knM3QUZ3@vm@oU60z~bE$y8Zw zT;Ut}nGZUibJTu9bz=x9dQ2=f&(!}iapWfA#~(8%s%!#F{kMYhlhA!ss<

SYU=40#KrN$v&j(ur$10UlE2T~)-H1QnI3Et#YI1am{pn0`g&#(>8^|cHN#)NdlX1`QLjH5bL))~f4`Z)L8klYp;>0X z+Yen+;!^Zw-SqzYDSp<0(SHO_p|U^j@tYIKms$MIcZaE-d^SSA)qWwgr1g%-+c>EU z_(zsweld#JSV)xSf&0X{T z(6`WjcGbPg;P~nU4Qgq=z{^Nf5=+J1avRm^?7Uu4*Rlfcea9w_7YeY5tUG}FTD6UT`o?1QrjP3Um=kUC#=mfi$ppPx6^NBDINb} z_CU(Wd-PE5VtDFj5&!iKKRJ_Bla=bRDxp55c00o}gJQgfhyczGuhE6u_a&!Z3V!`` zO_=n3N1bp^Fy83hI3w_6vfs)$?=$j<$G~3rL2b@3CCq41z}YUa`?1MoIjP4*MQ+!( zy7^cG0)n2pk2)rd8^w=h1wm)&7WQY)Af|Nh;muFSI=H73cHBCFtLQYQ6){^6U;;mNYj~`Wp1hMaO@Ew z&j*rU<1k7F^=BeNiIQTxSd9ozUCCJ90cn6<^AD!~QQ7mMHCy^8WL?G|`1p$+-Fkt@ zE*$!?38Z7cI3})YD0=9O-YR=0i@Y{&A(pP-lh@QnV{M`a1-qFS@v zdK!%ys==ntmZ`!LCflja<3%_Aa{Lqa_rl2wCi}GR^L0E}lAQDcS{Vk{f)N{gL4sTr zT(1%0`JQR36-cc7Vj2;2j&*WXvB3A@7XIV^p8F0NomvAqe4U~JoE<688UK9;c-En> zMMEXG%d>FX)g3-qg;8JDY8*GT*SlhpUYu21q8AT}d|ZEX@zjaF4eR;oJU~{hMu_6; z;<%huvD`K*K5Od_$IBC?oM!rJt^+H{k-il;hQuAa@fY90ZKcEaR4N2#iLcYA5)Le{ ztjW7ssgL2uJ=J<_aV#Vc-W*DRSxsBj4A<6JS{%Yt;|McWgSF&PA_z|Ude84?5D$VJ z5yCf1+Ig^R$r}M+_lw8I5JVyPW~v`-j$PZ_e?sH-EL)IdZ9z_#5>`-Q<}CG)ACe3_ z_J~#3oM@GJie6<2Dpr-ZB#mbInH!!rVkt5jej5*7%4N!>-fmIRwNEv`p_^_K$Tkb% zyOvgRPr}XIpS>-F_v)05*;=89Nt0}1EAZ*i!zn%I`T+r32< zLO1;Vr=@#6)3?(4m~cn1fOBz6+oiw+b=P*)?W&S1Vwziu2Lm$eT=M_p_rK28YrMVo z6mxB*rS>89&luY)@n)~0l7mY4{cbfv5or%(^R)BcUvV*eVqJU^FkdzwmI=pGtHr;n zmp){nKU0AUY1*}lt*;nQ8q1*oYf_U$8p$=C>Wg9Uf)FN<8^!plT7N#`f-)&M+^+zi zl=he5ve(4}#*jHHT1};Ye6_zi^=HsbUJ&ISvlaS?nX%fi`2}SNvuUHpBk@!yXv`L$ z9ZM*{1B31d4ygrQ^>3OnZf6w^)lyPDTC+GYUmA?DFE!Nobl8QhJW>JJIlVVV3hP>n z(}c6#Aw0aSj4R%^RFx7AiBBBiwdGYRFzW6c{>y1KXIIzz*zZ*YrI|M+GIh89(%!~9~dxuL4`luNALOquAJu$m!cH>Gz2d+w#YbJDIz6loig4K+KJ1-Qr)y=rw zr>!g%gjM3A(oh>dGu}&&Y~Pdgd^jWL_m!r2aOIg_s6X!6MQYG2@k5{I9)cb~;w3wp zwEY=nlRY}+YH&o%gn6a~CkOeD>vyzwWZ}oMDLR$-Wj_b(;hY8B`Dj4#l+?EpVYNW2 z*$S02I*|G;Y(9K2U_E?zB>WD2BG)lji}rrfivn`ES(#RL|IDr3se_`V^Whi=C65p0 z?3A+l%eO+St<)tEvA>{!jItU_QF);RoqwT?+U9MM`wb=L)n8}{Cgu(nmX;}H0|qrS zhY#4wDw)*Hg$xN+_?{)F-Mp3bwo3}s9wFyVK0JF+j_D*JkvyULJ?0+ydZ;0^W&5Z3 zPiBmdeCmJ zl~$xN6n7G?8e^stMf~7)+dL3jYj3VD`a#$1%8GmE1 
z7-3H1anHtY?<2XW1xnq;17KU&Y%FPg+tv%3`V<+o*glvvRNBqTi64$ZqWNua+WcrM zBPo?2c?v>8q{v`5>Nqb9hDJaU7|KJHOgm+`^v3Tt6e@jFn*V1528W{l2Zlv*@nBd4 z7p4s3<|)Hblw?(X0M6Y7N20i%C!g#4dqG%hWsBG^!LKQ=L0t6g(;(vm`w{q5!@<35opxrkAWg#h|PjS zkq9I=hDLIYMM04OHwZ^>&IF1=Vm3?8}6#Ns;(w%r4n#DBD5SP{ zBs(z%BaEd%zftG<|Ig3-zU!LrdanDq-+901zV7#V-Vfo-qv6b@M$Eh@BuoX3Mq|)$ zI0gexy1*O&9;y_HT;D$KnWlVxf>!ow?6g+6@;qmY9Or=x;%&$tr1EirWr}W}A>>nm zYi}tj?S(dsMgHRT&h7uoGUQ4*bxhAM z57ZjIfUupexNnpBG2x`eI{A-+2J6YMmtrM8j(t5yc>Vsg1uD-yE;)U$RIfLU+8M;D35@s@RVJO{6zHf z&VJ}!TmSGxNls4vC;#*_wYVs}F;5NClTX*g}Au(fcJbL=ox2mMdj+>LmeWh$f zhW-k^x_-fnvdDDV!b^UR!0TQ{sAf&DC`){{j?HRZTjbTU%=O?34%p{jN~oh-BRQmS zHu55lYD0++za+kqs(kf(8IauxD!F^OB&Ual+T)L`>{qN`j;&V?P(N_s&$`cx3KHr` zfy!vv*nnjnxTxoOOLx_r^Dpi;QG8B)EqeMrcV|?`Ic@W>tbu}Gx}uKt%@FH)1wM6% zo}$qy{}!UmVZJ20z|_lQj(v`Z!-OS(92sS!deRU}k4pbaa0pDL))R|Bh?Ih4V5;9= z!#NNs*MG6@8g_B>mD5>%V~)AauWVL*5{o9*Jl1@8>bdq^^$N3=9evXTpXu_-SkisL z+;Q{$&G)$XF|C;BmEu0+-1sok>$&;F^@oCK0T%+3){nhLlHDim3)>Hx&Axt6eA`&I z0hcCfK3)MH9y#)+zPcPUP6(UtiKLys>U?qWr7One=#QzQ^7HFlx1uT@J)(Cqu}J;OwxJZuc1UWzGKTzT7kR-`a5RCH%w|Y%{`D zwQuyC8!XKlo>ZksHLb7P{iM#_7g!$u+0X1_bD_0>g?5f{Re3h7;Sk?xiPJt9YfyOg%z zar-W)&jW`*JoPtAG7p%m><2G2IdA`<*o6&zDL?oArn(@eS)I%B%2>IHHe0$>y;t1e zW9v^pOSrJdsuJ_&<^%0q^qjsZa7}AO7k|WlIW}-mOSg1`y%OLczQ4#CNoo#U?z0gOjq4za8+$iY5GrXruh#>ZZoHqNAGqqi9`E3$Smwf8uIs z=~=NSF~*9Sf@iB6>#h84YCpE-tooM~MT{)K;5;ga+tLBhOUHJ|2rlO`qL^F9c{lFk zQRzEsuu=Xd<*l}SX`fT464ooPqYd#LcGs{s|Fr0ouL|)0;qvxPPmZA*cT`d~PBm^V z`r5(E4UP}Ed6N#H+p)@5a#(qfKJ7i#QkvzM%zRgyT~uMRDmlTynr`^|B8aGhq&3~| zV>u8}S3)UBKNgWTs>+5~cH=H??(*L(;fQPOlWRiUD4wQx3gnR88rEyUVtTBL-7&eg zQIODdoF;rekSg2Whu*kH-2!vR&hWaQQ-WCdhItoO#9^7PZn=_tB-bACROECa|(*dMqR^mI{W2&_Bytfe!sL2>n=f{^tO`?8$lwW z6V3#)8DXEggTm?yWDD2Zv$qJn9hkZSDK_KQ*$G@(Xq}#jA=L4-tSv5@K~@NP6Aj{< zif#AQVj)BQF&h>@Q#k11Sq^&i3-5s{CGp-d#PY*PS}i$5Vcadbsk}~W zg00=!Nv{Q)+^L|G_Y_x+yqx+rW@@r@kdS%%?0jy2O>kf}A|sRgv5z(f1T|rip%7mP zKbt(%kZai0*yk$gZC4Z_MKDAYs)l)eqoLTyqxd|qLO~fc1+kIJ_h{U 
z^j#TZm1H_Sv6Yh4-ViC@P6Bc+zN#hkpU=&tyr?%?nCLf>m99i5x8{F%0iEH00tg^~vlwm~X^DjNhg$gd?diVH{)wKqnZE5o)(p9*WjRejH<3C>%#`?C$2Ty~Ek#Ow zIGf&z2GbULPvjp%-rb$D!}KZpUg+19rMGknlf23ADZA@?evRK#cJ~IRwU+a1%2ti8 z30-6A`G%8>hz%3#bt7o9M0kwZzpQDj)`>&5PO%HjBbOZdUAiT$%v3dyZNt2*#4*)Z z)!_bWrL-GOsi~?H4lQrZ?vG4|z;gK(WBo?MXyZh2vcnwBsE=owNqMwPy#Pyd>4rjYF(fjmPK_wEKzs)}^N3QI!kSO1_z@jdn!IgRM&MG&NfSK>| zguXRVgRkNx^X3T`O)_zJ^z}Zr{<`?oMo6uCXFtQjrL3MLj#sj22E{vFDZh{0G78P; zgOl4Qj4_`TQ>IF#j0`&;E#arQc0_M(D(32ST&^;qBG+Zy3%m3-{LqCFrqOwh+*f}DT`6a)?WYdx6SS>r4CV^1Z2k-nj8(q+vcy-e zqWQzKH~~nlf`J0qto|gD=hV4i_{K`JwC1pvJ&CZ&B(v4@H!gpHvUvM3AA8NE09M~z zBfmG*`PE#r5wTeYH<;}oF#Fm>QDatF-WEYVyAZ-vNK9{wh;OxnndxHBzI4#S`!d=0 z$vPy_BkKQ?4jw(Ln$`9XiPv0|3hwuH{M_p<8dKPUz0lKr=+U+MlIu;ow;9Tb*>^p- zuLZ>6>mrMs*mNNA_bnzg-Xhv>V#{3r%r~@*Q`jG}N?|b~A1W%XGm6r-vOYU&UX9dv zjw)O6D17cHsTZ_-{=~Gt1omh$Vl(AnV#*?9Q`)RT%c zh77k&szWlVY)8j*M#&>IBhBzXZ`SusI98`ekz|V}*bO~`HA3r1980ASHY-B32ZST% zeP5$ogg>Y|jXS;>mC2jwjFtvM$bR3q^Q7g@agKhQ*v3IMf;rv`3id7np(P}aYh?mY zKQul!jy{*$fO6?BfI`9IRN0%ZBU@Kdlz5aTC?2-c|Gm1mA7}4(ubpr4oCN+VCVhJG zh~NR@N{f2{S;P$n)}t4)bA8oSgK}SnmfP*z=$Tl)l^Bu{c{irKWpV9jrg<%XlvlpE z7hgN1%%)JVY$W1Ql$rT%pehtBAYU-P6j)UWk@6RDZe>~ASppGT#0e2MU~Y_O_r>Ox zRZsh<)!3i@-MRCM4vS|)5x%w?lXt!3khTk^R2r(;C7$EJ5nPG%pF4q_Lp z`yLAmouCuy)moQs(+RYWW5k&Z?WPiSIw1fjR=W^;0~4hU3;ilGMLK|MdsgUBkQqQ9 zCDx#3#-PWPp@2_BcPXxzCOlhnN3l@XGw0TW6QL@z$7h2~Y=q8PnMfR>|CY*o*E7S$ zlB65ETzM`^?r_IPI`bzD^GmXzFy&kP0ZJSI6beT}QAxq-1xj#)46ysi_{ebm3ZQBL znh}H2>5eCb6p;*Um>P-^L!guHoiqj!j43q~nt_ghLm4m>z{o?R;Cp#+s2amsI1GdN z#RLCU6&wM;7#5>|-^73OFffKqpePtaAyCBsz<_@?{^$Qdx}Z=P3JvVp7=SaX0Kie` zy&nT0pvb*~0|+>Z5ksLE)*=BE$Ox)o_ErKQk$~FX4@6>M3>ZpHZLcx_3V~uM4uyjL zgMt5NDgcFMkPARV5qpgWpw$>vKqDC|Mx$YS{Q+PA#D8PE`eWcohDtC9#9l7|7!+c! 
ztWW?z?Ntj3sKFQv2f!Im5r6~`$M(nyrI*aG2?7Qnk~TH<_9HM{M~<8^!EyZu+(?Y> diff --git a/paper/main.tex b/paper/main.tex index e8aac59..a5d0feb 100644 --- a/paper/main.tex +++ b/paper/main.tex @@ -137,17 +137,71 @@ Check time series algorithms: \section{Evaluation} \label{sec:evaluation} +For each reference job and algorithm, we created a CSV file with the computed similarity to all other jobs. +Next, we analyzed the performance of the algorithms. +Then, we examined the quantitative behavior and the correlation between the chosen similarity and the number of found jobs, and, finally, the quality of the 100 most similar jobs. + +\subsection{Reference Jobs} + In the following, we assume a job is given and we aim to identify similar jobs. -We chose several reference jobs with different compute and IO characteristics visualized in \Cref{fig:refJobs}: +We chose several reference jobs with different compute and IO characteristics: \begin{itemize} - \item Job-S: performs postprocessing on a single node. This is a typical process in climate science where data products are reformatted and annotated with metadata to a standard representation (so called CMORization). The post-processing is IO intensive. + \item Job-S: performs post-processing on a single node. This is a typical process in climate science where data products are reformatted and annotated with metadata to a standard representation (so-called CMORization). The post-processing is IO intensive. \item Job-M: a typical MPI parallel 8-hour compute job on 128 nodes which writes time series data after some spin up. %CHE.ws12 \item Job-L: a 66-hour 20-node job. The initialization data is read at the beginning. Then only a single master node writes constantly a small volume of data; in fact, the generated data is too small to be categorized as IO relevant. \end{itemize} -For each reference job and algorithm, we created a CSV files with the computed similarity for all other jobs. 
+The segmented timelines of the jobs are visualized in \Cref{fig:refJobs}. +This coding is also used for the HEX class of algorithms (BIN algorithms merge all timelines together as described in \jk{TODO}). +The figures show the values of active metrics ($\neq 0$) only; if only a few are active, they are shown in one timeline; otherwise, they are rendered individually to provide a better overview. +For example, we can see in \Cref{fig:job-S} that several metrics increase in Segment\,6. + +\begin{figure} +\begin{subfigure}{0.8\textwidth} +\centering +\includegraphics[width=\textwidth]{job-timeseries4296426} +\caption{Job-S} \label{fig:job-S} +\end{subfigure} +\centering + + +\begin{subfigure}{0.8\textwidth} +\centering +\includegraphics[width=\textwidth]{job-timeseries5024292} +\caption{Job-M} \label{fig:job-M} +\end{subfigure} +\centering + + +\caption{Reference jobs: segmented timelines of mean IO activity} +\label{fig:refJobs} +\end{figure} + + +\begin{figure}\ContinuedFloat + +\begin{subfigure}{0.8\textwidth} +\centering +\includegraphics[width=\textwidth]{job-timeseries7488914-30} +\caption{Job-L (first 30 segments of 400; remaining segments are similar)} +\label{fig:job-L} +\end{subfigure} +\centering +\caption{Reference jobs: segmented timelines of mean IO activity} +\end{figure} + + + +\subsection{Performance} + +\jk{Describe System at DKRZ from old paper} + +The runtime for computing the similarity of relevant IO jobs (580,000 and 440,000 for BIN and HEX algorithms, respectively) is shown in \Cref{fig:performance}. + +\jk{TO FIX, This is for clustering algorithm, not for computing SIM, which is what we do here.} + \begin{figure} \centering @@ -168,93 +222,73 @@ For each reference job and algorithm, we created a CSV files with the computed s \end{figure} -Create histograms + cumulative job distribution for all algorithms. -Insert job profiles for closest 10 jobs. - -Potentially, analyze how the rankings of different similarities look like. 
- - -\begin{figure} -\begin{subfigure}{0.8\textwidth} -\centering -\includegraphics[width=\textwidth]{job-timeseries4296426} -\caption{Job-S} \label{fig:job-S} -\end{subfigure} -\centering - -\caption{Reference jobs: timeline of mean IO activity} -\label{fig:refJobs} -\end{figure} - - -\begin{figure}\ContinuedFloat - -\begin{subfigure}{0.8\textwidth} -\centering -\includegraphics[width=\textwidth]{job-timeseries5024292} -\caption{Job-M} \label{fig:job-M} -\end{subfigure} -\centering - -\begin{subfigure}{0.8\textwidth} -\centering -\includegraphics[width=\textwidth]{job-timeseries7488914-30.pdf} -\caption{Job-L (first 30 segments of 400; remaining segments are similar)} -\label{fig:job-L} -\end{subfigure} -\centering -\caption{Reference jobs: timeline of mean IO activity; non-shown timelines are 0} -\end{figure} +\subsection{Quantitative Analysis} +In the quantitative analysis, we explore for the different algorithms how the similarity of the jobs in our pool to our three reference jobs (Job-S, Job-M, and Job-L) is distributed. +The cumulative distribution of similarity to the reference jobs is shown in \Cref{fig:ecdf}. +For example, in \Cref{fig:ecdf-job-S}, we see that about 70\% of the jobs have a similarity of less than 10\% to Job-S for HEX\_native. +BIN\_aggzeros shows some steep increases, e.g., more than 75\% of jobs have the same low similarity below 2\%. +The different algorithms lead to different curves for our reference jobs, e.g., for Job-S, HEX\_phases bundles more jobs with low similarity compared to the other algorithms; for Job-L, it is the slowest. +% This indicates that the algorithms +The support team in a data center may have time to investigate the most similar jobs. +Time for the analysis is typically bounded; for instance, the team may analyze the 100 most similar jobs. +In \Cref{fig:hist}, the histograms with the actual number of jobs for a given similarity are shown. 
+As we focus on a feasible number of jobs, the diagram should be read from right (100\% similarity) to left; for each bin, we show at most 100 jobs (the total number is still given). +It turns out that both BIN algorithms produce nearly identical histograms; therefore, we omit one of them. +In the figures, we can again see a different behavior of the algorithms depending on the reference job. +Especially for Job-S, we can see clusters with jobs of higher similarity, while for Job-M, the growth in the relevant section is more steady. +For Job-L, we barely find any similar jobs, except when using the HEX\_phases algorithm. \begin{figure} \begin{subfigure}{0.8\textwidth} \centering -\includegraphics[width=\textwidth]{job_similarities_4296426-out/ecdf.png} +\includegraphics[width=\textwidth]{job_similarities_4296426-out/ecdf} \caption{Job-S} \label{fig:ecdf-job-S} \end{subfigure} \centering \begin{subfigure}{0.8\textwidth} \centering -\includegraphics[width=\textwidth]{job_similarities_5024292-out/ecdf.png} +\includegraphics[width=\textwidth]{job_similarities_5024292-out/ecdf} \caption{Job-M} \label{fig:ecdf-job-M} \end{subfigure} \centering \begin{subfigure}{0.8\textwidth} \centering -\includegraphics[width=\textwidth]{job_similarities_7488914-out/ecdf.png} +\includegraphics[width=\textwidth]{job_similarities_7488914-out/ecdf} \caption{Job-L} \label{fig:ecdf-job-L} \end{subfigure} \centering -\caption{Empirical cumulative density function} +\caption{Quantitative job similarity -- empirical cumulative distribution function} \label{fig:ecdf} \end{figure} \begin{figure} - -\begin{subfigure}{0.5\textwidth} \centering -\includegraphics[width=\textwidth]{job_similarities_4296426-out/hist-sim} + +\begin{subfigure}{0.75\textwidth} +\centering +\includegraphics[width=\textwidth,trim={0 0 0 2.2cm},clip]{job_similarities_4296426-out/hist-sim} \caption{Job-S} \label{fig:hist-job-S} \end{subfigure} -\begin{subfigure}{0.5\textwidth} + +\begin{subfigure}{0.75\textwidth} \centering 
-\includegraphics[width=\textwidth]{job_similarities_5024292-out/hist-sim} +\includegraphics[width=\textwidth,trim={0 0 0 2.2cm},clip]{job_similarities_5024292-out/hist-sim} \caption{Job-M} \label{fig:hist-job-M} \end{subfigure} -\begin{subfigure}{0.5\textwidth} +\begin{subfigure}{0.75\textwidth} \centering -\includegraphics[width=\textwidth]{job_similarities_7488914-out/hist-sim} +\includegraphics[width=\textwidth,trim={0 0 0 2.2cm},clip]{job_similarities_7488914-out/hist-sim} \caption{Job-L} \label{fig:hist-job-L} \end{subfigure} \centering -\caption{Histogram for the number of jobs (bin width: 2.5\%, numbers are the actual job counts)} +\caption{Histogram for the number of jobs (bin width: 2.5\%, numbers are the actual job counts). BIN\_aggzeros is nearly identical to BIN\_all.} \label{fig:hist} \end{figure} @@ -415,7 +449,7 @@ One consideration is to identify jobs that meet a rank threshold for all differe % \ContinuedFloat Hex phases very similar to hex native. -Komischer JOB zu inspizieren: \verb|job_similarities_4296426-out/hex_phases-0.7429--93timeseries4237860.png| +Komischer JOB zu inspizieren: \verb|job_similarities_4296426-out/hex_phases-0.7429--93timeseries4237860| Bin aggzeros works quite well here too. The jobs are a bit more diverse. @@ -602,7 +636,7 @@ Bin aggzero liefert Mist zurück. 
\end{subfigure} \caption{Job-L with hex\_lev, selection of similar jobs} -\label{fig:job-L-hex-phases} +\label{fig:job-L-hex-lev} \end{figure} diff --git a/scripts/analyse-all.sh b/scripts/analyse-all.sh index 1ff9a7e..d7b968c 100755 --- a/scripts/analyse-all.sh +++ b/scripts/analyse-all.sh @@ -4,6 +4,8 @@ echo "This script performs the complete analysis steps" +CLEAN=0 # Set to 0 to make some update + function prepare(){ pushd datasets ./decompress.sh @@ -21,6 +23,9 @@ for I in job_similarities_*.csv ; do ./scripts/plot.R $I > description.txt OUT=${I%%.csv}-out mkdir $OUT - rm $OUT/* - mv *.png *.pdf description.txt $OUT + if [[ $CLEAN != "0" ]] ; then + rm $OUT/* + mv description.txt $OUT + fi + mv *.png *.pdf $OUT done diff --git a/scripts/plot-single-job.py b/scripts/plot-single-job.py index 8849d7c..e9f6392 100755 --- a/scripts/plot-single-job.py +++ b/scripts/plot-single-job.py @@ -10,7 +10,7 @@ import matplotlib.cm as cm jobs = sys.argv[1].split(",") prefix = sys.argv[2].split(",") -fileformat = ".png" +fileformat = ".pdf" print("Plotting the job: " + str(sys.argv[1])) print("Plotting with prefix: " + str(sys.argv[2])) @@ -78,12 +78,16 @@ def plot(prefix, header, row): colors = [] style = [] for name, group in groups: - metrics[name] = [x[2] for x in group.values] - labels.append(name) style.append(linestyleMap[name] + markerMap[name]) colors.append(colorMap[name]) + if name == "md_file_delete": + name = "file_delete" + if name == "md_file_create": + name = "file_create" + metrics[name] = [x[2] for x in group.values] + labels.append(name) - fsize = (8, 1 + 1.5 * len(labels)) + fsize = (8, 1 + 1.1 * len(labels)) fsizeFixed = (8, 2) pyplot.close('all') @@ -97,7 +101,7 @@ def plot(prefix, header, row): ax[i].set_ylabel(l) pyplot.xlabel("Segment number") - pyplot.savefig(prefix + "timeseries" + jobid + fileformat, bbox_inches='tight') + pyplot.savefig(prefix + "timeseries" + jobid + fileformat, bbox_inches='tight', dpi=150) # Plot first 30 segments if 
len(timeseries) <= 50: @@ -113,7 +117,7 @@ def plot(prefix, header, row): ax[i].set_ylabel(l) pyplot.xlabel("Segment number") - pyplot.savefig(prefix + "timeseries" + jobid + "-30" + fileformat, bbox_inches='tight') + pyplot.savefig(prefix + "timeseries" + jobid + "-30" + fileformat, bbox_inches='tight', dpi=150) ### end plotting function diff --git a/scripts/plot.R b/scripts/plot.R index 642c61b..c8ff172 100755 --- a/scripts/plot.R +++ b/scripts/plot.R @@ -4,7 +4,7 @@ library(ggplot2) library(dplyr) require(scales) -plotjobs = TRUE +plotjobs = FALSE # Color scheme plotcolors <- c("#CC0000", "#FFA500", "#FFFF00", "#008000", "#9999ff", "#000066") @@ -22,19 +22,20 @@ cat("Job count:") cat(nrow(data)) # empirical cumulative density function (ECDF) -ggplot(data, aes(similarity, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("SIM") + ylab("Fraction of jobs") + theme(legend.position=c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") -ggsave("ecdf.png", width=8, height=3) +data$sim = data$similarity*100 +ggplot(data, aes(sim, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("Similarity in %") + ylab("Fraction of jobs") + theme(legend.position=c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") + scale_x_log10() +ggsave("ecdf.png", width=8, height=2.5) -ggplot(data, aes(similarity, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("SIM") + ylab("Fraction of jobs") + theme(legend.position=c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") + xlim(0.5, 1.0) -ggsave("ecdf-0.5.png", width=8, height=3) +# histogram for the jobs +ggplot(data, aes(sim), group=alg_name) + geom_histogram(color="black", binwidth=2.5) + aes(fill = alg_name) + facet_grid(alg_name ~ ., switch = 'y') + xlab("Similarity in %") + scale_y_continuous(limits=c(0, 100), oob=squish) + scale_color_brewer(palette = "Set2") + ylab("Count (cropped at 100)") + theme(legend.position = "none") + stat_bin(binwidth=2.5, geom="text", adj=1.0, angle = 90, 
colour="black", size=3, aes(label=..count.., y=0*(..count..)+95)) +ggsave("hist-sim.png", width=6, height=4.5) + +#ggplot(data, aes(similarity, color=alg_name, group=alg_name)) + stat_ecdf(geom = "step") + xlab("SIM") + ylab("Fraction of jobs") + theme(legend.position=c(0.9, 0.4)) + scale_color_brewer(palette = "Set2") + xlim(0.5, 1.0) +#ggsave("ecdf-0.5.png", width=8, height=3) e = data %>% filter(similarity >= 0.5) print(summary(e)) -# histogram for the jobs -ggplot(data, aes(similarity), group=alg_name) + geom_histogram(color="black", binwidth=0.025) + aes(fill = alg_name) + facet_grid(alg_name ~ ., switch = 'y') + scale_y_continuous(limits=c(0, 100), oob=squish) + scale_color_brewer(palette = "Set2") + ylab("Count (cropped at 100)") + theme(legend.position = "none") + stat_bin(binwidth=0.025, geom="text", angle = 90, colour="black", size=3, aes(label=..count.., y=0*(..count..)+20)) -ggsave("hist-sim.png") - # load job information, i.e., the time series per job jobData = read.csv("job-io-datasets/datasets/job_codings.csv") metadata = read.csv("job-io-datasets/datasets/job_metadata.csv")