From 36434cb9199991f8a4cb5a680479e8c2a0451ed2 Mon Sep 17 00:00:00 2001 From: Eugen Betke Date: Tue, 13 Nov 2018 15:48:24 +0100 Subject: [PATCH] Works on DDN cluster --- benchmark/config.sh | 21 +++++++++++------ benchmark/drop_caches.sh | 8 +++++-- benchmark/ior_wrapper.sh | 7 +++--- benchmark/prepare.sh | 41 +++++++++++++++++++++------------- benchmark/run_v2.sh | 21 +++++++++-------- benchmark/tool_sanity_check.sh | 16 +++---------- 6 files changed, 63 insertions(+), 51 deletions(-) diff --git a/benchmark/config.sh b/benchmark/config.sh index 5f6f002e6..5be3bc64e 100755 --- a/benchmark/config.sh +++ b/benchmark/config.sh @@ -12,8 +12,9 @@ if [[ "isc17" == ${hostname:0:5} ]]; then module load betke/hdf5/1.8.20-ddn module load betke/ior/git-ddn module list - export TD="/esfs/jtacquaviva" - export WD="/esfs/jtacquaviva/git/ime-evaluation" + export TD="/esfs/jtacquaviva/testfiles" + export WD="/esfs/jtacquaviva/git/ddn-ime-evaluation/benchmark" + export NODES=( isc17-c04 isc17-c01 isc17-c02 isc17-c03 isc17-c05 isc17-c06 isc17-c07 isc17-c08 isc17-c09 isc17-c12 isc17-c13 isc17-c14 isc17-c15 isc17-c18 isc17-c22 ) elif [[ "m" == ${hostname:0:1} ]]; then echo "Loading Mistral configuration" . /sw/rhel6-x64/tcl/modules-3.2.10/Modules/3.2.10/init/sh @@ -28,21 +29,27 @@ elif [[ "m" == ${hostname:0:1} ]]; then module list export TD="/mnt/lustre01/work/ku0598/k202107/git/ddn-ime-evaluation/benchmark/wd" export WD="/mnt/lustre01/work/ku0598/k202107/git/ddn-ime-evaluation/benchmark" + export NODES=() else echo "Cluster $hostname is not supported. Quitting." exit 1 fi - + + +# Cache = 32108MB +# DATASIZE = 76800MB +DATASIZE=$((4800 * 1024 * 1024 * 16)) #TYPE_ARR=( "read" "write" ) -TYPE_ARR=( "write" ) #API_ARR=( "MPIIO" "POSIX") -API_ARR=( "POSIX") #NN_ARR=( 1 2 4 8 16) -NN_ARR=( 1 ) #PPN_ARR=( 8 4 1 ) -PPN_ARR=( 8 ) #T_ARR=( $((10*1024*1024)) $((1*1024*1024)) $((100*1024)) $((16*1024)) ) + +TYPE_ARR=( "write" ) +API_ARR=( "POSIX") +NN_ARR=( 1 ) +PPN_ARR=( 8 ) T_ARR=( $((10*1024*1024)) ) export IOR="$(which ior)" diff --git a/benchmark/drop_caches.sh b/benchmark/drop_caches.sh index 3dd958be1..4ca42ffdb 100755 --- a/benchmark/drop_caches.sh +++ b/benchmark/drop_caches.sh @@ -1,17 +1,21 @@ #!/bin/bash oscs=( $( find /proc/fs/lustre/osc -mindepth 1 -maxdepth 1 -type d ) ) +echo $oscs while [ ! 0 -eq ${#oscs[@]} ]; do + set -x sync - #echo 3 > /proc/sys/vm/drop_caches + echo 3 > /proc/sys/vm/drop_caches + set +x for i in ${!oscs[@]}; do used_mb=$( grep -h used_mb ${oscs[$i]}/osc_cached_mb | cut -d" " -f 2 ) if [ 0 == $used_mb ]; then + echo "remove from list ${oscs[$i]}, because cache is $used_mb MB" unset oscs[$i] else - echo "skip ${oscs[$i]} $used_mb" + echo "skip ${oscs[$i]}, cache is still $used_mb MB" fi done sleep 1 diff --git a/benchmark/ior_wrapper.sh b/benchmark/ior_wrapper.sh index b3a8ca825..91220697e 100755 --- a/benchmark/ior_wrapper.sh +++ b/benchmark/ior_wrapper.sh @@ -3,21 +3,20 @@ . ./config.sh IOR_PARAMS=$1 -BENCHFILE=$2 +NETOUTDIR=$2 HOST="$hostname" PID=$$ -NETOUTDIR="${BENCHFILE}_network" NETOUTFILE="$NETOUTDIR/HOST:$HOST#PID:$PID.txt" -[[ ! -d $NETOUTDIR ]] && mkdir $NETOUTDIR || rm $NETOUTDIR/HOST*PID*.txt - function capture_network_state { label=$1 echo "LABEL $label" echo "TIMESTAMP $(date +%s)" perfquery -x cat /proc/net/dev + free -m + cat /proc/fs/lustre/llite/esfs-*/max_cached_mb } echo "" > $NETOUTFILE diff --git a/benchmark/prepare.sh b/benchmark/prepare.sh index f669c20f2..f6c7d91f6 100755 --- a/benchmark/prepare.sh +++ b/benchmark/prepare.sh @@ -1,22 +1,31 @@ #!/bin/bash -export MODULEPATH=/esfs/jtacquaviva/software/modules:$MODULEPATH +. ./config.sh -module purge -module load betke/hdf5/1.8.20-ddn -module load betke/ior/git-ddn -module list - -NN=16 NODES='isc17-c02,isc17-c03,isc17-c04,isc17-c05,isc17-c06,isc17-c07,isc17-c08,isc17-c09,isc17-c11,isc17-c12,isc17-c13,isc17-c14,isc17-c15,isc17-c18,isc17-c22,isc17-c01' -LUSTRE_TESTFILE="/esfs/jtacquaviva/sharedread${NN}/file" -TESTDIR="$(dirname $LUSTRE_TESTFILE)" -mkdir $TESTDIR -lfs setstripe -c $(($NN * 2)) $TESTDIR -ITERATIONS=1 -IOR="$(which ior) -i $ITERATIONS -s 1 -t $((16 * 1024 * 1024)) -b $((4800 * 1024 * 1024 * 32 / 8)) -o $LUSTRE_TESTFILE -a MPIIO -e -g -k -w" -ENVVAR="-genv MV2_NUM_HCAS 1 -genv MV2_CPU_BINDING_LEVEL core -genv MV2_CPU_BINDING_POLICY scatter" -MPIEXEC="/opt/ddn/mvapich/bin/mpiexec -ppn 8 -np $((8*$NN)) $ENVVAR -hosts isc17-c04,isc17-c05" -$MPIEXEC $IOR +#for NN in ${NN_ARR[@]}; do + +for NN in 1; do + LUSTRE_TESTFILE="$TD/sharedread${NN}/file" + TESTDIR="$(dirname $LUSTRE_TESTFILE)" + mkdir $TESTDIR + lfs setstripe -c $(($NN * 2)) $TESTDIR + + MPIEXEC_PARAMS="-ppn 8 -np $((8*$NN)) -hosts isc17-c04,isc17-c05 " + MPIEXEC_PARAMS+="-genv MV2_NUM_HCAS 1 -genv MV2_CPU_BINDING_LEVEL core -genv MV2_CPU_BINDING_POLICY scatter" + + IOR_PARAMS="-i 1 -s 1 -t $((16 * 1024 * 1024)) -b $(($DATASIZE / 8)) -o $LUSTRE_TESTFILE -a MPIIO -e -g -k " + IOR_PARAMS+="-D 60 -O stoneWallingWearOut=1 " + + ./drop_caches.sh + set +x + $MPIEXEC $MPIEXEC_PARAMS $IOR $IOR_PARAMS -w + set -x + +# ./drop_caches.sh +# set +x +# $MPIEXEC $MPIEXEC_PARAMS $IOR $IOR_PARAMS -r +# set -x +done diff --git a/benchmark/run_v2.sh b/benchmark/run_v2.sh index 00ad46c94..7e40e4020 100755 --- a/benchmark/run_v2.sh +++ b/benchmark/run_v2.sh @@ -13,12 +13,9 @@ trap force_exit SIGINT # Provides a list of good hosts (that contains QDR connection) function hosts() { num="$1" - #HOST_LIST=( isc17-c04 isc17-c05 isc17-c06 isc17-c07 isc17-c08 isc17-c09 isc17-c11 isc17-c12 isc17-c13 isc17-c14 isc17-c15 isc17-c18 isc17-c22 isc17-c01 isc17-c02 isc17-c03 ) - HOST_LIST=( "m11388" ) - - hlist=${HOST_LIST[0]} + hlist=${NODES[0]} for POS in $(seq 1 $(($num - 1))) ; do - hlist="$hlist,${HOST_LIST[$POS]}" + hlist="$hlist,${NODES[$POS]}" done echo $hlist } @@ -49,16 +46,18 @@ for API in ${API_ARR[@]}; do lfs getstripe $TESTDIR | tee -a $BENCHFILE elif [[ "write" == $TYPE ]]; then +set -x IOR_TYPE_OPTS="-w" LUSTRE_TESTFILE="$TD/sharedwrite/file" TESTDIR="$(dirname $LUSTRE_TESTFILE)" [ -d $TESTDIR ] && rm -r $TESTDIR || mkdir -p $TESTDIR lfs setstripe -c $((2 * $NN)) $TESTDIR lfs getstripe $TESTDIR | tee -a $BENCHFILE +set +x fi - IOR_PARAMS="-i 3 " + IOR_PARAMS="-i 1 " IOR_PARAMS+="-s 1 -t $T -b $((4800 * 1024 * 1024 * 32 / $PPN)) " IOR_PARAMS+="-D 60 -O stoneWallingWearOut=1 " IOR_PARAMS+="-a $API " @@ -70,8 +69,8 @@ for API in ${API_ARR[@]}; do MPIEXEC_PARAMS+="-genv MV2_CPU_BINDING_LEVEL core " MPIEXEC_PARAMS+="-genv MV2_CPU_BINDING_POLICY scatter " elif [[ "m" == ${hostname:0:1} ]]; then - #MPIEXEC_PARAMS=" -ppn $PPN -n $(($NN * $PPN)) --host $(hosts $NN)" - MPIEXEC_PARAMS=" -ppn $PPN -n $(($NN * $PPN)) -wdir $WD" + #MPIEXEC_PARAMS=" -ppn $PPN -n $(($NN * $PPN)) -wdir $WD --host $(hosts $NN) " + MPIEXEC_PARAMS=" -ppn $PPN -n $(($NN * $PPN)) -wdir $WD " else echo "Cluster $hostname is not supported. Quitting." exit 1 @@ -79,7 +78,11 @@ for API in ${API_ARR[@]}; do ( set -x - $MPIEXEC $MPIEXEC_PARAMS ./ior_wrapper.sh "$IOR_PARAMS" "$BENCHFILE" | tee -a $BENCHFILE + NETOUTDIR="${BENCHFILE}_network" +echo $NETjOUTDIR + [[ ! -d $NETOUTDIR ]] && mkdir $NETOUTDIR || rm $NETOUTDIR/HOST*PID*.txt + + $MPIEXEC $MPIEXEC_PARAMS ./ior_wrapper.sh "$IOR_PARAMS" "$NETOUTDIR" | tee -a $BENCHFILE #$MPIEXEC $MPIEXEC_PARAMS $IOR $IOR_PARAMS | tee -a $BENCHFILE set +x ) 2> >(tee -a $BENCHFILE) diff --git a/benchmark/tool_sanity_check.sh b/benchmark/tool_sanity_check.sh index 4401c6614..32693fb66 100755 --- a/benchmark/tool_sanity_check.sh +++ b/benchmark/tool_sanity_check.sh @@ -2,27 +2,17 @@ . ./config.sh -#NN_ARR=( 4 2 1 8 10 12 14 16) -#PPN_ARR=( 8 6 4 2 1 ) -#T_ARR=( $((10*1024*1024)) $((1*1024*1024)) $((100*1024)) $((16*1024)) ) - - res="OK" -for COUNT in $(seq 1); do for NN in ${NN_ARR[@]}; do for T in ${T_ARR[@]}; do for PPN in ${PPN_ARR[@]}; do - - #datasize=$((130 * 1024 * 1024 * 1020 / $PPN)) - datasize=$((4800 * 1024 * 1024 * 32 / $PPN)) - remain=$(( $datasize - ($datasize / $T * $T) )) + datablock=$(( $DATASIZE / $PPN )) + remain=$(( $datablock - ($datablock / $T * $T) )) if [ 0 -ne $remain ]; then - echo "Bad IOR paramters: NN=$NN, PPN=$PPN, T=$T, DS=$datasize" + echo "FIX IOR parameters: NN=$NN, PPN=$PPN, T=$T, DS=$datablock" res="FAILED" fi - -done done done done