#!/bin/bash
#######################################
# Health check: run nvidia-smi and fail if the GPUs report errors.
#
# The check fails when either
#   * nvidia-smi cannot be run or exits non-zero, or
#   * any line of its standard output contains the string "ERR".
#
# Globals:
#   MASSIVEDOC      (written) human-readable description of this check.
#                   NOTE(review): presumably read by external tooling;
#                   nothing in this file consumes it - confirm.
#   NVIDIA_SMI_CMD  (read, optional) command used instead of
#                   /usr/bin/nvidia-smi.
# Arguments:
#   None
# Outputs:
#   Lines of nvidia-smi output that contain "ERR" are echoed to stdout.
# Returns:
#   0 if nvidia-smi ran successfully and reported no "ERR",
#   1 otherwise (after calling die, if die returns).
# Notes:
#   die is not defined in this file; it is expected to be provided by the
#   environment that sources this check.
#######################################
function check_gpu_generic_err() {
    #echo ">>> Checking for generic GPU errors >>>>>>>>>>>>>>>>>>>>>>>>>"
    MASSIVEDOC="Runs nvidia-smi to see if the GPUs have any errors. Errors may indicate faulty hardware"

    local smi_cmd="${NVIDIA_SMI_CMD:-/usr/bin/nvidia-smi}"
    local smi_out=""
    local smi_rc=0

    # Capture the output and the exit status separately. Piping straight into
    # grep would hide a failing nvidia-smi: no output means no "ERR" match,
    # which would wrongly be reported as a healthy node.
    smi_out=$("$smi_cmd") || smi_rc=$?

    if [[ $smi_rc -ne 0 ]]; then
        die 1 " ${FUNCNAME[0]} ERROR nvidia-smi failed (exit code $smi_rc)"
        return 1
    fi

    # grep prints the offending lines, so they end up in the check's output.
    if grep ERR <<<"$smi_out"; then
        die 1 " ${FUNCNAME[0]} ERROR GPU errors"
        return 1
    fi

    return 0
}