Skip to content
Snippets Groups Projects
Commit 420d334e authored by Andreas Hamacher's avatar Andreas Hamacher
Browse files

trying to calculate nhc.conf

Former-commit-id: 6c4d2f97
parent 1e705cc9
No related branches found
No related tags found
No related merge requests found
nhc.conf
ssh_known_hosts ssh_known_hosts
slurm.conf slurm.conf
slurmdbd.conf slurmdbd.conf
......
#######################################################################
###
### Filesystem checks
###
* || check_fs_used / 90%
# * || check_fs_iused / 100%
# * || check_fs_iused /glusterVolume 100%
# Not that useful at this stage, as Nagios should be monitoring the servers.
# Just check that the file servers are happy.
#* || check_fs_used '/usr/local' 95%
#* || check_fs_used '/home' 95%
#* || check_fs_used '/projects' 95%
#* || check_fs_used '/scratch' 95%
#* || check_fs_used '/' 100%
#
# New syntax: check_fs_mount [ -0 ] [ -r ] [ -s src ] [ -t fstype ] [ -o options ] [ -O mount_options ] [ -e cmd ] [ -E cmd ] -f fs
#
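# m3a0[16-20] nodes are currently disabled from mounting /usr/local. The check is disabled on all m3a because I didn't want to be more specific -- Chris 20180212
# NOTE(review): the m3a* check_fs_mount_rw line for '/usr/local' directly below is active, which contradicts the comment above -- confirm which is intended.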
m3a* || check_fs_mount_rw -f '/usr/local'
m3c* || check_fs_mount_rw -f '/usr/local'
m3d* || check_fs_mount_rw -f '/usr/local'
m3e* || check_fs_mount_rw -f '/usr/local'
m3f* || check_fs_mount_rw -f '/usr/local'
m3g* || check_fs_mount_rw -f '/usr/local'
m3h* || check_fs_mount_rw -f '/usr/local'
m3i* || check_fs_mount_rw -f '/usr/local'
m3m* || check_fs_mount_rw -f '/usr/local'
m3p* || check_fs_mount_rw -f '/usr/local'
dgx* || check_fs_mount_rw -f '/usr/local'
* || check_fs_mount_rw -f '/home'
* || check_fs_mount_rw -f '/projects'
* || check_fs_mount_rw -f '/scratch'
* || check_lustre_health
#check numa config
m3a* || check_numa
m3c* || check_numa
m3d* || check_numa
m3e* || check_numa
m3g* || check_numa
m3h* || check_numa
m3i* || check_numa
m3m* || check_numa
m3p* || check_numa
* || check_SSSD
* || check_user_lookup
#######################################################################
###
### Hardware checks
###
# Don't check_hw_eth eth0 because most of our compute nodes have eth1 not eth0, but I won't guarantee this
# This has to do with renaming eth0 to mlx0 (i.e. the Mellanox device), but I suspect it is sensitive to device initialisation order
# Chris Hines 20160907
# * || check_hw_cpuinfo 1 1 1
# * || check_hw_physmem 4048416kB 4048416kB 3%
* || check_hw_swap 0kB 0kB 3%
* || check_hw_eth lo
!dgx* || check_hw_eth mlx0
dgx* || check_hw_eth bond0.113
dgx* || check_hw_eth bond0.114
* || check_ibv_devinfo
###
### ECC not available on m3f K1
m3c* || check_gpu_ecc
m3e* || check_gpu_ecc
# m3c* have 4 gpus
# m3e* have 8 gpus
# m3f* have 3 gpus (NOTE(review): the m3f* check_num_of_gpu line below expects 1 -- confirm which count is correct)
# m3g* have 3 gpus
# m3h* have 2 gpus
# m3p* have 6 gpus
# dgx* have 8 gpus
m3c* || check_num_of_gpu 4
m3e* || check_num_of_gpu 8
m3f* || check_num_of_gpu 1
m3g* || check_num_of_gpu 3
m3h* || check_num_of_gpu 2
m3p* || check_num_of_gpu 6
dgx* || check_num_of_gpu 8
#add more here
m3c* || check_nvidia_device_existance
m3e* || check_nvidia_device_existance
m3f* || check_nvidia_device_existance
m3g* || check_nvidia_device_existance
m3h* || check_nvidia_device_existance
m3p* || check_nvidia_device_existance
dgx* || check_nvidia_device_existance
# Kerri Wait 20170830 Add new check for xorg.conf file to ensure vglrun works
m3c* || check_xorg_conf_file_existance
m3f* || check_xorg_conf_file_existance
m3g* || check_xorg_conf_file_existance
m3h* || check_xorg_conf_file_existance
m3p* || check_xorg_conf_file_existance
#######################################################################
###
### Process checks
###
* || check_ps_service -S -u root sshd
* || check_ps_service -S ntpd
# Check for UVM on m3c, m3g, m3h and m3p (the dgx check is currently commented out)
m3c* || check_nvidia_uvm
m3g* || check_nvidia_uvm
m3h* || check_nvidia_uvm
m3p* || check_nvidia_uvm
#dgx* || check_nvidia_uvm
...@@ -72,6 +72,7 @@ ...@@ -72,6 +72,7 @@
roles: roles:
- { role: slurm-common, tags: [ slurm, slurmbuild ] } - { role: slurm-common, tags: [ slurm, slurmbuild ] }
- { role: slurm_config, tags: [ slurm_config, slurm ] } - { role: slurm_config, tags: [ slurm_config, slurm ] }
- { role: calculateNhcConfig, tags: [ nhc, slurm ] }
- { role: nhc, tags: [ nhc, slurm ] } - { role: nhc, tags: [ nhc, slurm ] }
- { role: slurm-start, start_slurmd: True, tags: [ slurm, slurm-start ] } - { role: slurm-start, start_slurmd: True, tags: [ slurm, slurm-start ] }
- { role: vncserver, tags: [ other ] } - { role: vncserver, tags: [ other ] }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment