-
Chris Hines authored
Former-commit-id: fcb46ee7
Chris Hines authoredFormer-commit-id: fcb46ee7
check_gpu_ecc.nhc 478 B
#!/bin/bash
#######################################
# Health check: fail when nvidia-smi reports a non-zero ECC error total.
#
# Runs nvidia-smi in XML mode, looks at the <total> lines that follow each
# <ecc_errors> element and treats any value other than 0 as a failure.
# When /usr/bin/nvidia-smi is not installed the check passes silently.
#
# Globals:   MASSIVEDOC (written) - human-readable description of this check
# Arguments: none
# Calls:     die (defined outside this file) with status 1 on failure
# Returns:   0 when no ECC errors are reported or nvidia-smi is absent,
#            1 when at least one non-zero ECC total is found
#######################################
function check_gpu_ecc() {
    #echo ">>> Checking GPU ECC >>>>>>>>>>>>>>>>>>>>>>>>>"
    MASSIVEDOC="Runs nvidia-smi to see if the GPUs have any errors. Errors may indicate faulty hardware"

    # Keep the result local and start empty: a leftover/inherited global of
    # the same name must never turn into a false ECC failure, and this check
    # must not clobber variables belonging to the caller.
    local nvidia_smi="/usr/bin/nvidia-smi"
    local ecc_errors=""

    if [[ -f "${nvidia_smi}" ]]; then
        # Invoke the exact binary that was tested above rather than relying
        # on PATH lookup. The 33 lines of context after <ecc_errors> are
        # expected to cover the counters nested below it; only <total>
        # entries that are not exactly 0 survive the last filter.
        ecc_errors=$("${nvidia_smi}" -a --xml-format \
            | grep -A 33 "<ecc_errors>" \
            | grep "<total>" \
            | grep -v "<total>0</total>")
    fi

    if [[ -n "${ecc_errors}" ]]; then
        die 1 " $FUNCNAME ERROR GPU ECC errors"
        # die may return control to the caller, so report failure explicitly.
        return 1
    fi

    return 0
}