Works on DDN cluster
This commit is contained in:
parent
ad44d21503
commit
36434cb919
|
@ -12,8 +12,9 @@ if [[ "isc17" == ${hostname:0:5} ]]; then
|
|||
module load betke/hdf5/1.8.20-ddn
|
||||
module load betke/ior/git-ddn
|
||||
module list
|
||||
export TD="/esfs/jtacquaviva"
|
||||
export WD="/esfs/jtacquaviva/git/ime-evaluation"
|
||||
export TD="/esfs/jtacquaviva/testfiles"
|
||||
export WD="/esfs/jtacquaviva/git/ddn-ime-evaluation/benchmark"
|
||||
export NODES=( isc17-c04 isc17-c01 isc17-c02 isc17-c03 isc17-c05 isc17-c06 isc17-c07 isc17-c08 isc17-c09 isc17-c12 isc17-c13 isc17-c14 isc17-c15 isc17-c18 isc17-c22 )
|
||||
elif [[ "m" == ${hostname:0:1} ]]; then
|
||||
echo "Loading Mistral configuration"
|
||||
. /sw/rhel6-x64/tcl/modules-3.2.10/Modules/3.2.10/init/sh
|
||||
|
@ -28,21 +29,27 @@ elif [[ "m" == ${hostname:0:1} ]]; then
|
|||
module list
|
||||
export TD="/mnt/lustre01/work/ku0598/k202107/git/ddn-ime-evaluation/benchmark/wd"
|
||||
export WD="/mnt/lustre01/work/ku0598/k202107/git/ddn-ime-evaluation/benchmark"
|
||||
export NODES=()
|
||||
else
|
||||
echo "Cluster $hostname is not supported. Quitting."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
|
||||
# Benchmark sizing and parameter sweep.
#
# Cache = 32108MB
# DATASIZE = 76800MB
# Total data volume in bytes: 4800 MiB * 16 = 76800 MiB.
DATASIZE=$((4800 * 1024 * 1024 * 16))

# Sweep arrays. The commented-out line above each array is the wider sweep;
# the active line is the reduced set currently in use. Each array is assigned
# exactly once so the value in effect is unambiguous.
#TYPE_ARR=( "read" "write" )
TYPE_ARR=( "write" )
#API_ARR=( "MPIIO" "POSIX")
API_ARR=( "POSIX" )
#NN_ARR=( 1 2 4 8 16)
NN_ARR=( 1 )
#PPN_ARR=( 8 4 1 )
PPN_ARR=( 8 )
#T_ARR=( $((10*1024*1024)) $((1*1024*1024)) $((100*1024)) $((16*1024)) )
T_ARR=( $((10*1024*1024)) )

# Path of the ior binary found on PATH; empty when ior is not available.
# Kept as a single `export` so a missing binary does not change the exit
# status seen by scripts that source this file.
export IOR="$(command -v ior)"
|
||||
|
|
|
@ -1,17 +1,21 @@
|
|||
#!/bin/bash
|
||||
|
||||
oscs=( $( find /proc/fs/lustre/osc -mindepth 1 -maxdepth 1 -type d ) )
|
||||
echo $oscs
|
||||
|
||||
while [ ! 0 -eq ${#oscs[@]} ]; do
|
||||
set -x
|
||||
sync
|
||||
#echo 3 > /proc/sys/vm/drop_caches
|
||||
echo 3 > /proc/sys/vm/drop_caches
|
||||
set +x
|
||||
|
||||
# Re-check every OSC directory still on the watch list and drop the ones
# whose cache reports 0 MB in use.
for i in "${!oscs[@]}"; do
  # Second space-separated field of the used_mb line in osc_cached_mb.
  used_mb=$( grep -h used_mb "${oscs[$i]}/osc_cached_mb" | cut -d" " -f 2 )
  # String comparison inside [[ ]]: an empty or multi-word value (file missing
  # or unexpected format) falls through to "skip" instead of raising a
  # test-syntax error.
  if [[ "$used_mb" == "0" ]]; then
    echo "remove from list ${oscs[$i]}, because cache is $used_mb MB"
    unset "oscs[$i]"
  else
    echo "skip ${oscs[$i]}, cache is still $used_mb MB"
  fi
done
|
||||
sleep 1
|
||||
|
|
|
@ -3,21 +3,20 @@
|
|||
. ./config.sh
|
||||
|
||||
IOR_PARAMS=$1
|
||||
BENCHFILE=$2
|
||||
NETOUTDIR=$2
|
||||
HOST="$hostname"
|
||||
PID=$$
|
||||
|
||||
NETOUTDIR="${BENCHFILE}_network"
|
||||
NETOUTFILE="$NETOUTDIR/HOST:$HOST#PID:$PID.txt"
|
||||
|
||||
[[ ! -d $NETOUTDIR ]] && mkdir $NETOUTDIR || rm $NETOUTDIR/HOST*PID*.txt
|
||||
|
||||
#######################################
# Print a labelled snapshot of network and memory counters.
# Globals:   label (written; left global because code outside this view may
#            read it - TODO confirm and make it local)
# Arguments: $1 - label printed on the first line as "LABEL <label>"
# Outputs:   stdout: the label line, "TIMESTAMP <seconds since epoch>", then
#            the output of `perfquery -x`, /proc/net/dev, `free -m` and every
#            /proc/fs/lustre/llite/esfs-*/max_cached_mb file
#######################################
function capture_network_state {
  label=$1
  echo "LABEL $label"
  echo "TIMESTAMP $(date +%s)"
  perfquery -x
  cat /proc/net/dev
  free -m
  # Glob is intentionally unquoted so it matches every esfs-* mount.
  cat /proc/fs/lustre/llite/esfs-*/max_cached_mb
}
|
||||
|
||||
echo "" > $NETOUTFILE
|
||||
|
|
|
@ -1,22 +1,31 @@
|
|||
#!/bin/bash
|
||||
|
||||
export MODULEPATH=/esfs/jtacquaviva/software/modules:$MODULEPATH
|
||||
. ./config.sh
|
||||
|
||||
module purge
|
||||
module load betke/hdf5/1.8.20-ddn
|
||||
module load betke/ior/git-ddn
|
||||
module list
|
||||
|
||||
NN=16
|
||||
NODES='isc17-c02,isc17-c03,isc17-c04,isc17-c05,isc17-c06,isc17-c07,isc17-c08,isc17-c09,isc17-c11,isc17-c12,isc17-c13,isc17-c14,isc17-c15,isc17-c18,isc17-c22,isc17-c01'
|
||||
LUSTRE_TESTFILE="/esfs/jtacquaviva/sharedread${NN}/file"
|
||||
TESTDIR="$(dirname $LUSTRE_TESTFILE)"
|
||||
mkdir $TESTDIR
|
||||
lfs setstripe -c $(($NN * 2)) $TESTDIR
|
||||
ITERATIONS=1
|
||||
IOR="$(which ior) -i $ITERATIONS -s 1 -t $((16 * 1024 * 1024)) -b $((4800 * 1024 * 1024 * 32 / 8)) -o $LUSTRE_TESTFILE -a MPIIO -e -g -k -w"
|
||||
ENVVAR="-genv MV2_NUM_HCAS 1 -genv MV2_CPU_BINDING_LEVEL core -genv MV2_CPU_BINDING_POLICY scatter"
|
||||
MPIEXEC="/opt/ddn/mvapich/bin/mpiexec -ppn 8 -np $((8*$NN)) $ENVVAR -hosts isc17-c04,isc17-c05"
|
||||
|
||||
$MPIEXEC $IOR
|
||||
|
||||
#for NN in ${NN_ARR[@]}; do
|
||||
|
||||
# Shared-file MPIIO write run, once per node count NN.
# NOTE(review): MPIEXEC is not assigned in the visible part of this script;
# presumably config.sh provides it - confirm.
for NN in 1; do
  LUSTRE_TESTFILE="$TD/sharedread${NN}/file"
  TESTDIR="$(dirname "$LUSTRE_TESTFILE")"
  # -p: the directory is still there on a re-run, and a plain mkdir would fail.
  mkdir -p "$TESTDIR"
  lfs setstripe -c $((NN * 2)) "$TESTDIR"

  MPIEXEC_PARAMS="-ppn 8 -np $((8 * NN)) -hosts isc17-c04,isc17-c05 "
  MPIEXEC_PARAMS+="-genv MV2_NUM_HCAS 1 -genv MV2_CPU_BINDING_LEVEL core -genv MV2_CPU_BINDING_POLICY scatter"

  IOR_PARAMS="-i 1 -s 1 -t $((16 * 1024 * 1024)) -b $((DATASIZE / 8)) -o $LUSTRE_TESTFILE -a MPIIO -e -g -k "
  IOR_PARAMS+="-D 60 -O stoneWallingWearOut=1 "

  ./drop_caches.sh
  set +x
  # Unquoted on purpose: each variable holds several words that must be split
  # into separate arguments.
  $MPIEXEC $MPIEXEC_PARAMS $IOR $IOR_PARAMS -w
  set -x

  # ./drop_caches.sh
  # set +x
  # $MPIEXEC $MPIEXEC_PARAMS $IOR $IOR_PARAMS -r
  # set -x
done
|
||||
|
|
|
@ -13,12 +13,9 @@ trap force_exit SIGINT
|
|||
#######################################
# Provides a list of good hosts (that contains QDR connection).
# Globals:   NODES (read) - array of host names
# Arguments: $1 - number of hosts wanted
# Outputs:   stdout: the first $1 entries of NODES joined with commas
#            (always at least NODES[0]; entries past the end of NODES expand
#            to empty strings)
#######################################
function hosts() {
  local num="$1"
  local hlist="${NODES[0]}"
  local pos
  # Only NODES is consulted, so no entry is appended twice.
  for (( pos = 1; pos < num; pos++ )); do
    hlist="${hlist},${NODES[$pos]}"
  done
  echo "$hlist"
}
|
||||
|
@ -49,16 +46,18 @@ for API in ${API_ARR[@]}; do
|
|||
lfs getstripe $TESTDIR | tee -a $BENCHFILE
|
||||
|
||||
elif [[ "write" == $TYPE ]]; then
|
||||
set -x
|
||||
IOR_TYPE_OPTS="-w"
|
||||
LUSTRE_TESTFILE="$TD/sharedwrite/file"
|
||||
TESTDIR="$(dirname $LUSTRE_TESTFILE)"
|
||||
[ -d $TESTDIR ] && rm -r $TESTDIR || mkdir -p $TESTDIR
|
||||
lfs setstripe -c $((2 * $NN)) $TESTDIR
|
||||
lfs getstripe $TESTDIR | tee -a $BENCHFILE
|
||||
set +x
|
||||
fi
|
||||
|
||||
|
||||
IOR_PARAMS="-i 3 "
|
||||
IOR_PARAMS="-i 1 "
|
||||
IOR_PARAMS+="-s 1 -t $T -b $((4800 * 1024 * 1024 * 32 / $PPN)) "
|
||||
IOR_PARAMS+="-D 60 -O stoneWallingWearOut=1 "
|
||||
IOR_PARAMS+="-a $API "
|
||||
|
@ -70,8 +69,8 @@ for API in ${API_ARR[@]}; do
|
|||
MPIEXEC_PARAMS+="-genv MV2_CPU_BINDING_LEVEL core "
|
||||
MPIEXEC_PARAMS+="-genv MV2_CPU_BINDING_POLICY scatter "
|
||||
elif [[ "m" == ${hostname:0:1} ]]; then
|
||||
#MPIEXEC_PARAMS=" -ppn $PPN -n $(($NN * $PPN)) --host $(hosts $NN)"
|
||||
MPIEXEC_PARAMS=" -ppn $PPN -n $(($NN * $PPN)) -wdir $WD"
|
||||
#MPIEXEC_PARAMS=" -ppn $PPN -n $(($NN * $PPN)) -wdir $WD --host $(hosts $NN) "
|
||||
MPIEXEC_PARAMS=" -ppn $PPN -n $(($NN * $PPN)) -wdir $WD "
|
||||
else
|
||||
echo "Cluster $hostname is not supported. Quitting."
|
||||
exit 1
|
||||
|
@ -79,7 +78,11 @@ for API in ${API_ARR[@]}; do
|
|||
|
||||
# Run one benchmark configuration through ior_wrapper.sh.
# The subshell confines `set -x` and NETOUTDIR to this invocation; stderr
# (which includes the trace) is appended to $BENCHFILE.
(
  set -x
  NETOUTDIR="${BENCHFILE}_network"
  echo "$NETOUTDIR"
  # Create the capture directory, or clear captures left in an existing one.
  # An explicit if/else keeps a failed mkdir from falling through to rm.
  if [[ ! -d "$NETOUTDIR" ]]; then
    mkdir "$NETOUTDIR"
  else
    rm -f -- "$NETOUTDIR"/HOST*PID*.txt
  fi

  # Single invocation; the wrapper receives the capture directory as $2.
  # MPIEXEC and MPIEXEC_PARAMS are unquoted on purpose so they split into
  # separate arguments.
  $MPIEXEC $MPIEXEC_PARAMS ./ior_wrapper.sh "$IOR_PARAMS" "$NETOUTDIR" | tee -a "$BENCHFILE"
  #$MPIEXEC $MPIEXEC_PARAMS $IOR $IOR_PARAMS | tee -a $BENCHFILE
  set +x
) 2> >(tee -a "$BENCHFILE")
|
||||
|
|
|
@ -2,27 +2,17 @@
|
|||
|
||||
. ./config.sh
|
||||
|
||||
#NN_ARR=( 4 2 1 8 10 12 14 16)
|
||||
#PPN_ARR=( 8 6 4 2 1 )
|
||||
#T_ARR=( $((10*1024*1024)) $((1*1024*1024)) $((100*1024)) $((16*1024)) )
|
||||
|
||||
|
||||
# Validate the sweep: for every NN / T / PPN combination the per-process block
# (DATASIZE / PPN) must be a whole multiple of the transfer size T.
# res stays "OK" when all combinations pass and becomes "FAILED" otherwise.
# NOTE(review): the benchmark loop shown elsewhere in this commit builds -b
# from a literal 4800 * 1024 * 1024 * 32 / PPN rather than DATASIZE; confirm
# the two are meant to agree.
res="OK"

for COUNT in $(seq 1); do
  for NN in "${NN_ARR[@]}"; do
    for T in "${T_ARR[@]}"; do
      for PPN in "${PPN_ARR[@]}"; do
        datablock=$(( DATASIZE / PPN ))
        # Remainder of datablock / T; identical to
        # datablock - (datablock / T * T) under integer division.
        remain=$(( datablock % T ))
        if (( remain != 0 )); then
          echo "FIX IOR parameters: NN=$NN, PPN=$PPN, T=$T, DS=$datablock"
          res="FAILED"
        fi
      done
    done
  done
done
|
||||
|
|
Loading…
Reference in New Issue