diff --git a/roles/nagios_monitored/files/scripts/check_blocked_beamline_jobs b/roles/nagios_monitored/files/scripts/check_blocked_beamline_jobs
new file mode 100755
index 0000000000000000000000000000000000000000..43a63de5ae66a9fe9b9d9012cf350c9d0782107b
--- /dev/null
+++ b/roles/nagios_monitored/files/scripts/check_blocked_beamline_jobs
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+# Nagios check for the Slurm "beamline" reservation.
+# Exits OK while at least one node of the reservation is counted as free,
+# WARNING when none is, CRITICAL when none is and jobs are PENDING in the
+# reservation, and UNKNOWN when the reservation cannot be read.
+import sys, os, pwd
+import getopt
+import commands
+import subprocess
+import datetime
+
+STATE_OK=0
+STATE_WARNING=1
+STATE_CRITICAL=2
+STATE_UNKNOWN=3
+STATE=STATE_OK
+
+# get info about reservation
+reservationname="beamline"
+reservation_cmd=["scontrol","show","--oneliner","reservation=" + reservationname]
+p = subprocess.Popen(reservation_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+line = p.stdout.readline()
+# Split each KEY=VALUE token on the first "=" only, since a value may itself
+# contain "=" (e.g. TRES=cpu=48); tokens without "=" are skipped.
+reservation_dict = dict( a.split('=', 1) for a in line.split() if '=' in a )
+retval = p.wait()
+
+# stderr is merged into stdout above, so a failed lookup yields a line
+# without the keys used below; report it instead of dying on a KeyError
+if 'Nodes' not in reservation_dict or 'NodeCnt' not in reservation_dict:
+    print "Unknown: cannot read reservation %s: %s" % (reservationname, line.strip())
+    sys.exit(STATE_UNKNOWN)
+
+# count free nodes: one " State=RESERVED" match per node counted as free
+process=subprocess.Popen(["/usr/local/slurm/latest/bin/scontrol","show","node=" + reservation_dict['Nodes']], stdout=subprocess.PIPE)
+free_nodes=process.communicate()[0].count(" State=RESERVED")
+
+if int(free_nodes) < 1:
+    # not looking good, no nodes left in beamline reservation - let's check for pending jobs to decide on the warning level
+    STATE=STATE_WARNING
+    process=subprocess.Popen(["/usr/local/slurm/latest/bin/squeue","--noheader","--format=%T %R","--long","--reservation=" + reservationname], stdout=subprocess.PIPE)
+    pending_list=process.communicate()[0]
+    if "PENDING" in pending_list:
+        print "Critical: we have no free nodes for beamline reservation and jobs trying to run!"
+        STATE=STATE_CRITICAL
+    # Let's provide some more info for the reader's benefit
+    process=subprocess.Popen(["/usr/local/slurm/latest/bin/squeue","--format=%.18i %.9P %.8j %.16u %.10a %.8T %.10M %.9l %.6D %R","--reservation=" + reservationname], stdout=subprocess.PIPE)
+    squeue_list=process.communicate()[0]
+    print "Beamline reservation has %s of %s nodes free" % (int(free_nodes),int(reservation_dict['NodeCnt']))
+    print "squeue --long --reservation=beamline"
+    print squeue_list
+    sys.exit(STATE)
+
+# If we made it here we are happy
+print "Beamline reservation has %s of %s nodes free" % (int(free_nodes),int(reservation_dict['NodeCnt']))
+sys.exit(STATE)
+
diff --git a/roles/nagios_monitored/files/scripts/check_blocked_compute_jobs b/roles/nagios_monitored/files/scripts/check_blocked_compute_jobs
new file mode 100755
index 0000000000000000000000000000000000000000..d1c91793c1000c449892efc9d7acb1173f736159
--- /dev/null
+++ b/roles/nagios_monitored/files/scripts/check_blocked_compute_jobs
@@ -0,0 +1,54 @@
+#!/usr/bin/python
+# Nagios check for Slurm jobs that have been PENDING for too long.
+# Exits WARNING once any pending job has waited WARNING_THRESHOLD_SEC and
+# CRITICAL once any has waited CRITICAL_THRESHOLD_SEC; in both cases the
+# pending queue is printed after the status line.
+import sys, os, pwd
+import getopt
+import commands
+import subprocess
+import datetime
+
+STATE_OK=0
+state=STATE_OK
+STATE_WARNING=1
+STATE_CRITICAL=2
+WARNING_THRESHOLD_SEC=7*24*60*60
+# WARNING_THRESHOLD_SEC=6*60*60
+CRITICAL_THRESHOLD_SEC=WARNING_THRESHOLD_SEC*2
+
+check_pending_com_job=subprocess.Popen("squeue --noheader --states=PENDING --Format=SUBMITTIME", shell=True, stdout=subprocess.PIPE)
+pending_com_job_id_list=check_pending_com_job.communicate()[0]
+
+# naive local time, to match the naive timestamps parsed below
+now_time = datetime.datetime.now()
+
+for submittime in pending_com_job_id_list.splitlines():
+    # the column may be padded; drop the padding and skip empty lines
+    submittime = submittime.strip()
+    if not submittime:
+        continue
+    submit_time = datetime.datetime.strptime(submittime, '%Y-%m-%dT%H:%M:%S')
+    waited = now_time - submit_time
+    # timedelta.seconds is only the remainder within a day (0..86399), so the
+    # days must be added or the multi-day thresholds can never be reached
+    time_elapsed = waited.days*24*60*60 + waited.seconds
+
+    if time_elapsed>=CRITICAL_THRESHOLD_SEC:
+        message="Critical: Slurm Job Pending over " + str(CRITICAL_THRESHOLD_SEC/60/60/24) + " days"
+        state=STATE_CRITICAL
+    elif time_elapsed>=WARNING_THRESHOLD_SEC and state != STATE_CRITICAL:
+        message="Warning: Slurm Job Pending over " + str(WARNING_THRESHOLD_SEC/60/60/24) + " days"
+        state=STATE_WARNING
+
+if state == STATE_OK:
+    print "OK: No Slurm Jobs BLOCKED over %s days" % int(WARNING_THRESHOLD_SEC/60/60/24)
+else:
+    print message
+    check_pending_com_job=subprocess.Popen("/usr/local/slurm/latest/bin/squeue --format='%.18i %.9P %.8u %.8T %S %V %R' --states=PENDING", shell=True, stdout=subprocess.PIPE)
+    squeue_result=check_pending_com_job.communicate()[0]
+    print squeue_result
+
+# exit with the level actually reached, so a warning stays a warning
+sys.exit(state)
+
diff --git a/roles/nagios_monitored/files/scripts/check_blocked_vis_jobs b/roles/nagios_monitored/files/scripts/check_blocked_vis_jobs
new file mode 100755
index 0000000000000000000000000000000000000000..8614e13418de4f043de121a961ae59b17bbf3370
--- /dev/null
+++ b/roles/nagios_monitored/files/scripts/check_blocked_vis_jobs
@@ -0,0 +1,53 @@
+#!/usr/bin/python
+# Nagios check for Slurm jobs PENDING too long in the vis partitions.
+# The longest wait among all pending jobs decides the result: WARNING from
+# WARNING_THRESHOLD_SEC, CRITICAL from CRITICAL_THRESHOLD_SEC.
+import sys, os, pwd
+import getopt
+import commands
+import subprocess
+import datetime
+
+STATE_OK=0
+STATE_WARNING=1
+STATE_CRITICAL=2
+WARNING_THRESHOLD_SEC=5*60
+CRITICAL_THRESHOLD_SEC=15*60
+VIS_PARTITIONS="m2-vis-c6,m1-vis-c6"
+SQUEUE="/usr/local/slurm/latest/bin/squeue"
+
+# one line per pending job, holding only its submit time
+check_pending_vis_job=subprocess.Popen([SQUEUE, "--noheader", "-p", VIS_PARTITIONS, "--states=PENDING", "--Format=submittime"], stdout=subprocess.PIPE)
+pending_vis_submit_times=check_pending_vis_job.communicate()[0]
+
+# naive local time, to match the naive timestamps parsed below
+now_time = datetime.datetime.now()
+
+longest_wait=0
+for submittime in pending_vis_submit_times.splitlines():
+    # the column may be padded; drop the padding and skip empty lines
+    submittime = submittime.strip()
+    if not submittime:
+        continue
+    waited = now_time - datetime.datetime.strptime(submittime, '%Y-%m-%dT%H:%M:%S')
+    # timedelta.seconds restarts at 0 every 24 hours; add the days for the full wait
+    longest_wait = max(longest_wait, waited.days*24*60*60 + waited.seconds)
+
+# Judge the longest wait only after all jobs were seen, so that a job at
+# warning level cannot end the check before a critical one is looked at.
+if longest_wait>=WARNING_THRESHOLD_SEC:
+    if longest_wait>=CRITICAL_THRESHOLD_SEC:
+        print "CRITICAL: Slurm Vis Job Pending over 15 mins"
+        state=STATE_CRITICAL
+    else:
+        print "WARNING: Slurm Vis Job Pending over 5 mins"
+        state=STATE_WARNING
+    pending_listp = subprocess.Popen([SQUEUE, "-p", VIS_PARTITIONS, "--states=PENDING"], stdout=subprocess.PIPE)
+    pending_list = pending_listp.communicate()[0]
+    print "\n\n/usr/local/slurm/latest/bin/squeue -p m2-vis-c6,m1-vis-c6 --states=PENDING\n"
+    print pending_list
+    sys.exit(state)
+
+print "NO Slurm Vis Jobs BLOCKED over 15 mins"
+sys.exit(STATE_OK)
+
diff --git a/roles/nagios_monitored/files/scripts/check_load b/roles/nagios_monitored/files/scripts/check_load
index 8e6966c4e309874444dbe4cca2f5e783e107fc88..4768d6b00cf21d53446e56fbe2801ef315c71d57 100755
Binary files a/roles/nagios_monitored/files/scripts/check_load and b/roles/nagios_monitored/files/scripts/check_load differ
diff --git a/roles/nagios_monitored/files/scripts/check_localfs.sh b/roles/nagios_monitored/files/scripts/check_localfs.sh
new file mode 100755
index 0000000000000000000000000000000000000000..20043ee4530895c62c35b96d23d5b7ef31b4e4ee
--- /dev/null
+++ b/roles/nagios_monitored/files/scripts/check_localfs.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Nagios check: can a temporary file be created in /tmp?
+
+STATE_OK=0
+STATE_WARNING=1
+STATE_CRITICAL=2
+STATE_UNKNOWN=3
+STATE_DEPENDENT=4
+
+
+main ()
+{
+  # mktemp's exit status is tested directly; tmpfile is only used on success
+  if ! tmpfile=$( mktemp -p /tmp )
+  then
+    echo "CRITICAL: cannot create a temporary file in /tmp"
+    return $STATE_CRITICAL
+  fi
+  # quoted so the path can never be word-split or globbed
+  rm -- "$tmpfile"
+
+  echo "OK: temporary file created in /tmp"
+  return $STATE_OK
+}
+
+main
+exit $?
diff --git a/roles/nagios_monitored/files/scripts/check_munge b/roles/nagios_monitored/files/scripts/check_munge
index bf7c01cf1c5be003604a09f77e51d99d7bc7344c..5e94412c7cdbb369f81c7ee6643aa74d2a436059 100755
--- a/roles/nagios_monitored/files/scripts/check_munge
+++ b/roles/nagios_monitored/files/scripts/check_munge
@@ -7,7 +7,7 @@ import subprocess
 STATE_OK=0
 STATE_WARNING=1
 
-check_munge=subprocess.Popen("/usr/sbin/service munge status", shell=True, stdout=subprocess.PIPE)
+check_munge=subprocess.Popen("service munge status", shell=True, stdout=subprocess.PIPE)
 munge_status=check_munge.communicate()[0]
 
 if "run" in munge_status:
diff --git a/roles/nagios_monitored/files/scripts/check_ntp b/roles/nagios_monitored/files/scripts/check_ntp
new file mode 100755
index 0000000000000000000000000000000000000000..6124b60650cde49593aef13653e583789aa2e7ae
--- /dev/null
+++ b/roles/nagios_monitored/files/scripts/check_ntp
@@ -0,0 +1,20 @@
+#!/usr/bin/python
+# Nagios check: OK when the output of "service ntpd status" contains "run",
+# WARNING otherwise.
+import sys, os, pwd
+import getopt
+import commands
+import subprocess
+
+STATE_OK=0
+STATE_WARNING=1
+
+check_ntpd=subprocess.Popen("service ntpd status", shell=True, stdout=subprocess.PIPE)
+ntpd_status=check_ntpd.communicate()[0]
+
+if "run" in ntpd_status:
+    print "NTPD service is Running"
+    sys.exit(STATE_OK)
+else:
+    print "NTPD service is NOT Running !!"
+    sys.exit(STATE_WARNING)
diff --git a/roles/nagios_server/templates/commands_nagios2.cfg.j2 b/roles/nagios_server/templates/commands_nagios2.cfg.j2
index 6bc6f668a16050f28d3a1e164cbae407691120d6..385fa645e3d13ed4d32a6a47cc094017dc04c124 100644
--- a/roles/nagios_server/templates/commands_nagios2.cfg.j2
+++ b/roles/nagios_server/templates/commands_nagios2.cfg.j2
@@ -1,8 +1,8 @@
 {% for service in nagios_services %}
-{% if service.command %}
+{% if service.script %}
 define command {
         command_name    {{ service.name }}
-        command_line    /usr/lib/nagios/plugins/check_by_ssh -H $HOSTADDRESS$ -C "{{ service.command }}" -E
+        command_line    /usr/lib/nagios/plugins/check_by_ssh -H $HOSTADDRESS$ -o StrictHostKeyChecking=no -C "{{ nagios_home }}/scripts/{{ service.script }}" -E
 }
 {% endif %}
 {% endfor %}