diff --git a/CICD/files/.gitignore b/CICD/files/.gitignore index 4751cd688844ce20b649bf637a1db0e58fa0b416..de782f9443327185343652401b63a61d64ec76ea 100644 --- a/CICD/files/.gitignore +++ b/CICD/files/.gitignore @@ -1,3 +1,4 @@ +nhc.conf ssh_known_hosts slurm.conf slurmdbd.conf diff --git a/CICD/files/nhc.conf b/CICD/files/nhc.conf deleted file mode 100644 index a2f7e3649bdeebe90f46967e9b03020e58804861..0000000000000000000000000000000000000000 --- a/CICD/files/nhc.conf +++ /dev/null @@ -1,119 +0,0 @@ -####################################################################### -### -### Filesystem checks -### - - * || check_fs_used / 90% -# * || check_fs_iused / 100% -# * || check_fs_iused /glusterVolume 100% - -#not that useful at this stage as Nagios should be monitoring servers. -# just check the file servers are happy -#* || check_fs_used '/usr/local' 95% -#* || check_fs_used '/home' 95% -#* || check_fs_used '/projects' 95% -#* || check_fs_used '/scratch' 95% - - -#* || check_fs_used '/' 100% - -# -# New syntax: check_fs_mount [ -0 ] [ -r ] [ -s src ] [ -t fstype ] [ -o options ] [ -O mount_options ] [ -e cmd ] [ -E cmd ] -f fs -# - -# m3a0[16-20] nodes are currently disabled from mounting /usr/local. The check is disabled on all m3a because I didn't want to be more specific -- Chris 20180212 -m3a* || check_fs_mount_rw -f '/usr/local' -m3c* || check_fs_mount_rw -f '/usr/local' -m3d* || check_fs_mount_rw -f '/usr/local' -m3e* || check_fs_mount_rw -f '/usr/local' -m3f* || check_fs_mount_rw -f '/usr/local' -m3g* || check_fs_mount_rw -f '/usr/local' -m3h* || check_fs_mount_rw -f '/usr/local' -m3i* || check_fs_mount_rw -f '/usr/local' -m3m* || check_fs_mount_rw -f '/usr/local' -m3p* || check_fs_mount_rw -f '/usr/local' -dgx* || check_fs_mount_rw -f '/usr/local' -* || check_fs_mount_rw -f '/home' -* || check_fs_mount_rw -f '/projects' -* || check_fs_mount_rw -f '/scratch' -* || check_lustre_health - -#check numa config -m3a* || check_numa -m3c* || check_numa -m3d* || check_numa -m3e* || check_numa -m3g* || check_numa -m3h* || check_numa -m3i* || check_numa -m3m* || check_numa -m3p* || check_numa -* || check_SSSD -* || check_user_lookup - -####################################################################### -### -### Hardware checks -### -# Don't check_hw_eth eth0 because most of our compute nodes have eth1 not eth0, but I won't guarantee this -# This has to do with renaming eth0 to mlx0 (i.e. the mellanox device) but is senstive to device initialisation order I suspect -# Chris Hines 20160907 -# * || check_hw_cpuinfo 1 1 1 -# * || check_hw_physmem 4048416kB 4048416kB 3% - * || check_hw_swap 0kB 0kB 3% - * || check_hw_eth lo -!dgx* || check_hw_eth mlx0 -dgx* || check_hw_eth bond0.113 -dgx* || check_hw_eth bond0.114 - * || check_ibv_devinfo - - -### -### ECC not available on m3f K1 -m3c* || check_gpu_ecc -m3e* || check_gpu_ecc - -# m3c* have 4 gpus -# m3e* have 8 gpus -# m3f* have 3 gpus -# m3g* have 3 gpus -# m3h* have 2 gpus -# m3p* have 6 gpus -# dgx* have 8 gpus -m3c* || check_num_of_gpu 4 -m3e* || check_num_of_gpu 8 -m3f* || check_num_of_gpu 1 -m3g* || check_num_of_gpu 3 -m3h* || check_num_of_gpu 2 -m3p* || check_num_of_gpu 6 -dgx* || check_num_of_gpu 8 -#add more here - -m3c* || check_nvidia_device_existance -m3e* || check_nvidia_device_existance -m3f* || check_nvidia_device_existance -m3g* || check_nvidia_device_existance -m3h* || check_nvidia_device_existance -m3p* || check_nvidia_device_existance -dgx* || check_nvidia_device_existance -# Kerri Wait 20170830 Add new check for xorg.conf file to ensure vglrun works -m3c* || check_xorg_conf_file_existance -m3f* || check_xorg_conf_file_existance -m3g* || check_xorg_conf_file_existance -m3h* || check_xorg_conf_file_existance -m3p* || check_xorg_conf_file_existance - -####################################################################### -### -### Process checks -### -* || check_ps_service -S -u root sshd -* || check_ps_service -S ntpd - -#Check for UVM for m3c and m3h -m3c* || check_nvidia_uvm -m3g* || check_nvidia_uvm -m3h* || check_nvidia_uvm -m3p* || check_nvidia_uvm -#dgx* || check_nvidia_uvm - diff --git a/CICD/plays/computenodes.yml b/CICD/plays/computenodes.yml index 02274be770a8eba66a57b774eb10ea8cfbcdbb5e..ba998293aae04ecb1261543e450269a479e8e2ce 100644 --- a/CICD/plays/computenodes.yml +++ b/CICD/plays/computenodes.yml @@ -72,6 +72,7 @@ roles: - { role: slurm-common, tags: [ slurm, slurmbuild ] } - { role: slurm_config, tags: [ slurm_config, slurm ] } + - { role: calculateNhcConfig, tags: [ nhc, slurm ] } - { role: nhc, tags: [ nhc, slurm ] } - { role: slurm-start, start_slurmd: True, tags: [ slurm, slurm-start ] } - { role: vncserver, tags: [ other ] }