Skip to content
Snippets Groups Projects
Commit dedb1e33 authored by Jupiter Hu's avatar Jupiter Hu
Browse files

add massive nagios scripts

parent 26a4c768
No related branches found
No related tags found
1 merge request: !8 Nagiosbranch1
#!/usr/bin/python
import sys, os, pwd
import getopt
import commands
import subprocess
import datetime
# Nagios plugin exit codes.
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3

# Name of the Slurm reservation being monitored.
reservationname = "beamline"
reservation_cmd = ["scontrol", "show", "--oneliner",
                   "reservation=" + reservationname]
SCONTROL = "/usr/local/slurm/latest/bin/scontrol"
SQUEUE = "/usr/local/slurm/latest/bin/squeue"


def run_command(argv, merge_stderr=False):
    """Run *argv* without a shell and return its standard output as text.

    When *merge_stderr* is true, standard error is folded into the returned
    text.  Raises OSError if the executable cannot be started.
    """
    stderr = subprocess.STDOUT if merge_stderr else None
    process = subprocess.Popen(argv, shell=False, stdout=subprocess.PIPE,
                               stderr=stderr, universal_newlines=True)
    return process.communicate()[0]


def parse_reservation(line):
    """Turn one `scontrol show --oneliner reservation=...` line into a dict.

    Each whitespace-separated token is split on its FIRST '=' only, so values
    that themselves contain '=' (e.g. ``TRES=cpu=96``) are kept intact.
    Tokens without '=' (such as an error message) are ignored.
    """
    fields = {}
    for token in line.split():
        key, separator, value = token.partition("=")
        if separator:
            fields[key] = value
    return fields


def count_reserved_nodes(node_info):
    """Count the lines of `scontrol show node` output with State=RESERVED."""
    return sum(1 for row in node_info.splitlines()
               if " State=RESERVED" in row)


def check_reservation():
    """Print the reservation status and return the Nagios exit code."""
    # get info about reservation
    reservation_output = run_command(reservation_cmd, merge_stderr=True)
    rows = reservation_output.splitlines()
    reservation_dict = parse_reservation(rows[0] if rows else "")
    if "Nodes" not in reservation_dict or "NodeCnt" not in reservation_dict:
        print("Unknown: could not read reservation %s: %s"
              % (reservationname, reservation_output.strip()))
        return STATE_UNKNOWN

    # count free nodes
    # The node list is passed as a single argument, so bracket expressions
    # such as "node[001-004]" are never subject to shell globbing.
    node_info = run_command(
        [SCONTROL, "show", "node=" + reservation_dict["Nodes"]])
    free_nodes = count_reserved_nodes(node_info)
    summary = "Beamline reservation has %s of %s nodes free" % (
        free_nodes, int(reservation_dict["NodeCnt"]))

    if free_nodes >= 1:
        # If we made it here we are happy
        print(summary)
        return STATE_OK

    # not looking good, no nodes left in the reservation - let's check for
    # pending jobs to decide on the warning level
    state = STATE_WARNING
    pending_list = run_command(
        [SQUEUE, "--noheader", "--format=%T %R", "--long",
         "--reservation=" + reservationname])
    if "PENDING" in pending_list:
        print("Critical: we have no free nodes for beamline reservation "
              "and jobs trying to run!")
        state = STATE_CRITICAL

    # Let's provide some more info for the reader's benefit
    squeue_list = run_command(
        [SQUEUE,
         "--format=%.18i %.9P %.8j %.16u %.10a %.8T %.10M %.9l %.6D %R",
         "--reservation=" + reservationname])
    print(summary)
    print("squeue --long --reservation=%s" % reservationname)
    print(squeue_list)
    return state


def main():
    """Run the check, mapping a missing Slurm binary to STATE_UNKNOWN."""
    try:
        return check_reservation()
    except OSError as err:
        print("Unknown: unable to run Slurm command: %s" % err)
        return STATE_UNKNOWN


if __name__ == "__main__":
    sys.exit(main())
#!/usr/bin/python
import sys, os, pwd
import getopt
import commands
import subprocess
import datetime
# Nagios plugin exit codes.
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3

WARNING_THRESHOLD_SEC = 7 * 24 * 60 * 60
# WARNING_THRESHOLD_SEC=6*60*60
CRITICAL_THRESHOLD_SEC = WARNING_THRESHOLD_SEC * 2
SECONDS_PER_DAY = 24 * 60 * 60
SUBMIT_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S'


def run_command(argv):
    """Run *argv* without a shell and return its standard output as text."""
    process = subprocess.Popen(argv, shell=False, stdout=subprocess.PIPE,
                               universal_newlines=True)
    return process.communicate()[0]


def parse_submit_times(squeue_output):
    """Parse `squeue --Format=SUBMITTIME` output into a list of datetimes.

    squeue pads each field, so every line is stripped before parsing.  Blank
    lines and values that are not timestamps (e.g. "N/A") are skipped instead
    of aborting the whole check.
    """
    submit_times = []
    for row in squeue_output.splitlines():
        text = row.strip()
        if not text:
            continue
        try:
            submit_times.append(
                datetime.datetime.strptime(text, SUBMIT_TIME_FORMAT))
        except ValueError:
            continue
    return submit_times


def longest_wait_seconds(submit_times, now_time):
    """Return the age in seconds of the oldest submission (0 if none).

    Uses timedelta.total_seconds(); the `.seconds` attribute only holds the
    sub-day remainder and would hide waits of one day or more.
    """
    if not submit_times:
        return 0
    return max((now_time - submitted).total_seconds()
               for submitted in submit_times)


def classify(time_elapsed):
    """Map a wait time in seconds to a (nagios_state, message) pair."""
    if time_elapsed >= CRITICAL_THRESHOLD_SEC:
        return STATE_CRITICAL, ("Critical: Slurm Job Pending over "
                                + str(CRITICAL_THRESHOLD_SEC // SECONDS_PER_DAY)
                                + " days")
    if time_elapsed >= WARNING_THRESHOLD_SEC:
        return STATE_WARNING, ("Warning: Slurm Job Pending over "
                               + str(WARNING_THRESHOLD_SEC // SECONDS_PER_DAY)
                               + " days")
    return STATE_OK, ("OK: No Slurm Jobs BLOCKED over %s days"
                      % (WARNING_THRESHOLD_SEC // SECONDS_PER_DAY))


def main():
    """Print the status line (plus the pending queue when not OK).

    Returns the Nagios exit code matching the printed status, so a warning
    is reported as a warning rather than as critical.
    """
    try:
        pending_output = run_command(
            ["squeue", "--noheader", "--states=PENDING",
             "--Format=SUBMITTIME"])
        now_time = datetime.datetime.now()
        state, message = classify(
            longest_wait_seconds(parse_submit_times(pending_output),
                                 now_time))
        print(message)
        if state != STATE_OK:
            print(run_command(
                ["/usr/local/slurm/latest/bin/squeue",
                 "--format=%.18i %.9P %.8u %.8T %S %V %R",
                 "--states=PENDING"]))
        return state
    except OSError as err:
        print("Unknown: unable to run squeue: %s" % err)
        return STATE_UNKNOWN


if __name__ == "__main__":
    sys.exit(main())
#!/usr/bin/python
import sys, os, pwd
import getopt
import commands
import subprocess
import datetime
# Nagios plugin exit codes.
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3

# Partitions whose pending jobs are monitored.
VIS_PARTITIONS = "m2-vis-c6,m1-vis-c6"
WARNING_THRESHOLD_SEC = 300
CRITICAL_THRESHOLD_SEC = 900
SUBMIT_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S'
REPORT_CMD = ["/usr/local/slurm/latest/bin/squeue", "-p", VIS_PARTITIONS,
              "--states=PENDING"]


def run_command(argv):
    """Run *argv* without a shell and return its standard output as text."""
    process = subprocess.Popen(argv, shell=False, stdout=subprocess.PIPE,
                               universal_newlines=True)
    return process.communicate()[0]


def parse_submit_times(squeue_output):
    """Parse `squeue --Format=submittime` output into a list of datetimes.

    Lines are stripped of squeue's field padding; blank lines and values that
    are not timestamps are skipped.
    """
    submit_times = []
    for row in squeue_output.splitlines():
        text = row.strip()
        if not text:
            continue
        try:
            submit_times.append(
                datetime.datetime.strptime(text, SUBMIT_TIME_FORMAT))
        except ValueError:
            continue
    return submit_times


def longest_wait_seconds(submit_times, now_time):
    """Return the age in seconds of the oldest submission (0 if none).

    total_seconds() is used because timedelta.seconds wraps every 24 hours,
    which would make a job pending for a day look freshly submitted.
    """
    if not submit_times:
        return 0
    return max((now_time - submitted).total_seconds()
               for submitted in submit_times)


def classify(time_elapsed):
    """Map the longest wait in seconds to a (nagios_state, message) pair."""
    if time_elapsed >= CRITICAL_THRESHOLD_SEC:
        return STATE_CRITICAL, "CRITICAL: Slurm Vis Job Pending over 15 mins"
    if time_elapsed >= WARNING_THRESHOLD_SEC:
        return STATE_WARNING, "WARNING: Slurm Vis Job Pending over 5 mins"
    return STATE_OK, "NO Slurm Vis Jobs BLOCKED over 15 mins"


def main():
    """Print the status (and the pending queue when not OK); return the state.

    The decision is based on the longest-waiting job, so the result does not
    depend on the order in which squeue lists the jobs.
    """
    try:
        # One query for all pending vis jobs instead of one lookup per job.
        pending_output = run_command(
            ["squeue", "-p", VIS_PARTITIONS, "--states=PENDING",
             "--noheader", "--Format=submittime"])
        now_time = datetime.datetime.now()
        state, message = classify(
            longest_wait_seconds(parse_submit_times(pending_output),
                                 now_time))
        print(message)
        if state != STATE_OK:
            pending_list = run_command(REPORT_CMD)
            print("\n\n%s\n" % " ".join(REPORT_CMD))
            print(pending_list)
        return state
    except OSError as err:
        print("UNKNOWN: unable to run squeue: %s" % err)
        return STATE_UNKNOWN


if __name__ == "__main__":
    sys.exit(main())
No preview for this file type
#!/bin/bash
# Nagios plugin exit codes.
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

# Check that a temporary file can be created under /tmp.
# Returns STATE_CRITICAL when mktemp fails, STATE_OK otherwise; the probe
# file is removed again on success.
main ()
{
    # Declared separately so that $? below reflects mktemp, not `local`.
    local tmpfile
    if ! tmpfile=$( mktemp -p /tmp )
    then
        return $STATE_CRITICAL
    fi
    rm -- "$tmpfile"
    return $STATE_OK
}

main
exit $?
......@@ -7,7 +7,7 @@ import subprocess
STATE_OK=0
STATE_WARNING=1
check_munge=subprocess.Popen("/usr/sbin/service munge status", shell=True, stdout=subprocess.PIPE)
check_munge=subprocess.Popen("service munge status", shell=True, stdout=subprocess.PIPE)
munge_status=check_munge.communicate()[0]
if "run" in munge_status:
......
#!/usr/bin/python
import sys, os, pwd
import getopt
import commands
import subprocess
# Nagios plugin exit codes.
STATE_OK = 0
STATE_WARNING = 1


def is_running(status_text):
    """Return True when `service ... status` output reports a running service.

    The output must mention "run" (e.g. "is running", "active (running)")
    and must not say "not running", which would otherwise also match.
    """
    return "run" in status_text and "not running" not in status_text


def main():
    """Print the ntpd status line and return the matching Nagios exit code."""
    # universal_newlines=True yields text rather than bytes on Python 3.
    check_ntpd = subprocess.Popen("service ntpd status", shell=True,
                                  stdout=subprocess.PIPE,
                                  universal_newlines=True)
    ntpd_status = check_ntpd.communicate()[0]
    if is_running(ntpd_status):
        print("NTPD service is Running")
        return STATE_OK
    print("NTPD service is NOT Running !!")
    return STATE_WARNING


if __name__ == "__main__":
    sys.exit(main())
{# Emits one Nagios `define command` entry per item of `nagios_services`; each command_line invokes the check on $HOSTADDRESS$ through check_by_ssh. #}
{# NOTE(review): this span has two `if` openers (service.command, service.script) but a single `endif`, and two command_line entries. It looks like the removed and added lines of a diff were merged together - confirm which pair is current before rendering this template. #}
{% for service in nagios_services %}
{% if service.command %}
{% if service.script %}
define command {
command_name {{ service.name }}
command_line /usr/lib/nagios/plugins/check_by_ssh -H $HOSTADDRESS$ -C "{{ service.command }}" -E
command_line /usr/lib/nagios/plugins/check_by_ssh -H $HOSTADDRESS$ -o StrictHostKeyChecking=no -C "{{ nagios_home }}/scripts/{{ service.script }}" -E
}
{% endif %}
{% endfor %}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment