Skip to content
Snippets Groups Projects
Commit ec95cef9 authored by Chris Hines
Browse files

Merge branch 'master' of gitlab.erc.monash.edu.au:hpc-team/ansible_cluster_in_a_box

parents 066dd544 7a282b74
No related branches found
No related tags found
1 merge request: !16 "Chris changes"
Showing
with 687 additions and 4 deletions
--- ---
- name: "Install extra packages" - name: "Install extra packages"
yum: "name={{ item }} state=present" yum: "name={{ item }} state=present"
with_items: with_items: "{{ pkgs }}"
pkgs
sudo: true sudo: true
when: ansible_os_family == 'RedHat' when: ansible_os_family == 'RedHat'
...@@ -190,6 +190,7 @@ pkgs: ...@@ -190,6 +190,7 @@ pkgs:
- qt-x11 - qt-x11
- rhino - rhino
- rsync - rsync
- samba-client
- scipy - scipy
- spice-vdagent - spice-vdagent
- suitesparse - suitesparse
......
# one 0.25 GB cache
set_cachesize 0 268435456 1
# Transaction Log settings
set_lg_regionmax 262144
set_lg_bsize 2097152
set_flags DB_LOG_AUTOREMOVE
...@@ -102,6 +102,10 @@ ...@@ -102,6 +102,10 @@
sudo: true sudo: true
register: tlsConfigured register: tlsConfigured
# Copy the DB_CONFIG file into /var/lib/ldap, owned by ldap:ldap.
- name: copy db config
  copy:
    src: files/DB_CONFIG
    dest: /var/lib/ldap/DB_CONFIG
    owner: ldap
    group: ldap
    # Kept as a quoted string so it is read as an octal permission string,
    # exactly as the key=value form passed it.
    mode: "644"
  sudo: true
- name: start ldap - name: start ldap
service: name=slapd state=restarted service: name=slapd state=restarted
sudo: true sudo: true
......
---
# Variables for the LDAP server setup.
# Package names: the OpenLDAP server, the OpenLDAP clients, and OpenSSL.
system_packages:
- openldap-servers
- openldap-clients
- openssl
# NOTE(review): looks like the DN component of the slapd bdb database entry;
# confirm against the tasks/templates that consume it.
dbname: olcDatabase={2}bdb
# NOTE(review): presumably the account and group that own the LDAP files;
# confirm against the tasks that read these variables.
ldapuser: ldap
ldapgroup: ldap
---
# Render each Nagios object template into /etc/nagios3/conf.d.
- name: configure monitoring
  template:
    src: "{{ item }}_nagios2.cfg.j2"
    dest: "/etc/nagios3/conf.d/{{ item }}_nagios2.cfg"
  with_items:
    - hostgroups
    - hosts
    - commands
    - services
    - contactgroup
    - contacts
  sudo: true

# Render the CGI configuration used by the Nagios web interface.
- name: change cgi config
  template:
    src: cgi.cfg.j2
    dest: /etc/nagios3/cgi.cfg
  sudo: true

# Restart unconditionally so the files written above are loaded.
- name: nagios restart
  service:
    name: nagios3
    state: restarted
  sudo: true
#################################################################
#
# CGI.CFG - Sample CGI Configuration File for Nagios
#
#################################################################
# MAIN CONFIGURATION FILE
# This tells the CGIs where to find your main configuration file.
# The CGIs will read the main and host config files for any other
# data they might need.
main_config_file=/etc/nagios3/nagios.cfg
# PHYSICAL HTML PATH
# This is the path where the HTML files for Nagios reside. This
# value is used to locate the logo images needed by the statusmap
# and statuswrl CGIs.
physical_html_path=/usr/share/nagios3/htdocs
# URL HTML PATH
# This is the path portion of the URL that corresponds to the
# physical location of the Nagios HTML files (as defined above).
# This value is used by the CGIs to locate the online documentation
# and graphics. If you access the Nagios pages with an URL like
# http://www.myhost.com/nagios, this value should be '/nagios'
# (without the quotes).
url_html_path=/nagios3
# CONTEXT-SENSITIVE HELP
# This option determines whether or not a context-sensitive
# help icon will be displayed for most of the CGIs.
# Values: 0 = disables context-sensitive help
# 1 = enables context-sensitive help
show_context_help=1
# PENDING STATES OPTION
# This option determines what states should be displayed in the web
# interface for hosts/services that have not yet been checked.
# Values: 0 = leave hosts/services that have not been checked yet in their original state
# 1 = mark hosts/services that have not been checked yet as PENDING
use_pending_states=1
# NAGIOS PROCESS CHECK COMMAND
# This is the full path and filename of the program used to check
# the status of the Nagios process. It is used only by the CGIs
# and is completely optional. However, if you don't use it, you'll
# see warning messages in the CGIs about the Nagios process
# not running and you won't be able to execute any commands from
# the web interface. The program should follow the same rules
# as plugins; the return codes are the same as for the plugins,
# it should have timeout protection, it should output something
# to STDIO, etc.
#
# Note: The command line for the check_nagios plugin below may
# have to be tweaked a bit, as different versions of the plugin
# use different command line arguments/syntaxes.
nagios_check_command=/usr/lib/nagios/plugins/check_nagios /var/cache/nagios3/status.dat 5 '/usr/sbin/nagios3'
# AUTHENTICATION USAGE
# This option controls whether or not the CGIs will use any
# authentication when displaying host and service information, as
# well as committing commands to Nagios for processing.
#
# Read the HTML documentation to learn how the authorization works!
#
# NOTE: It is a really *bad* idea to disable authorization, unless
# you plan on removing the command CGI (cmd.cgi)! Failure to do
# so will leave you wide open to kiddies messing with Nagios and
# possibly hitting you with a denial of service attack by filling up
# your drive by continuously writing to your command file!
#
# Setting this value to 0 will cause the CGIs to *not* use
# authentication (bad idea), while any other value will make them
# use the authentication functions (the default).
use_authentication=1
# x509 CERT AUTHENTICATION
# When enabled, this option allows you to use x509 cert (SSL)
# authentication in the CGIs. This is an advanced option and should
# not be enabled unless you know what you're doing.
use_ssl_authentication=0
# DEFAULT USER
# Setting this variable will define a default user name that can
# access pages without authentication. This allows people within a
# secure domain (i.e., behind a firewall) to see the current status
# without authenticating. You may want to use this to avoid basic
# authentication if you are not using a secure server since basic
# authentication transmits passwords in the clear.
#
# Important: Do not define a default username unless you are
# running a secure web server and are sure that everyone who has
# access to the CGIs has been authenticated in some manner! If you
# define this variable, anyone who has not authenticated to the web
# server will inherit all rights you assign to this user!
#default_user_name=guest
# SYSTEM/PROCESS INFORMATION ACCESS
# This option is a comma-delimited list of all usernames that
# have access to viewing the Nagios process information as
# provided by the Extended Information CGI (extinfo.cgi). By
# default, *no one* has access to this unless you choose to
# not use authorization. You may use an asterisk (*) to
# authorize any user who has authenticated to the web server.
authorized_for_system_information=nagiosadmin,nagios
# CONFIGURATION INFORMATION ACCESS
# This option is a comma-delimited list of all usernames that
# can view ALL configuration information (hosts, commands, etc).
# By default, users can only view configuration information
# for the hosts and services they are contacts for. You may use
# an asterisk (*) to authorize any user who has authenticated
# to the web server.
authorized_for_configuration_information=nagiosadmin,nagios
# SYSTEM/PROCESS COMMAND ACCESS
# This option is a comma-delimited list of all usernames that
# can issue shutdown and restart commands to Nagios via the
# command CGI (cmd.cgi). Users in this list can also change
# the program mode to active or standby. By default, *no one*
# has access to this unless you choose to not use authorization.
# You may use an asterisk (*) to authorize any user who has
# authenticated to the web server.
authorized_for_system_commands=nagiosadmin,nagios
# GLOBAL HOST/SERVICE VIEW ACCESS
# These two options are comma-delimited lists of all usernames that
# can view information for all hosts and services that are being
# monitored. By default, users can only view information
# for hosts or services that they are contacts for (unless you
# choose to not use authorization). You may use an asterisk (*)
# to authorize any user who has authenticated to the web server.
authorized_for_all_services=nagiosadmin,nagios
authorized_for_all_hosts=nagiosadmin,nagios
# GLOBAL HOST/SERVICE COMMAND ACCESS
# These two options are comma-delimited lists of all usernames that
# can issue host or service related commands via the command
# CGI (cmd.cgi) for all hosts and services that are being monitored.
# By default, users can only issue commands for hosts or services
# that they are contacts for (unless you choose to not use
# authorization). You may use an asterisk (*) to authorize any
# user who has authenticated to the web server.
authorized_for_all_service_commands=nagiosadmin,nagios
authorized_for_all_host_commands=nagiosadmin,nagios
# READ-ONLY USERS
# A comma-delimited list of usernames that have read-only rights in
# the CGIs. This will block any service or host commands normally shown
# on the extinfo CGI pages. It will also block comments from being shown
# to read-only users.
#authorized_for_read_only=user1,user2
# STATUSMAP BACKGROUND IMAGE
# This option allows you to specify an image to be used as a
# background in the statusmap CGI. It is assumed that the image
# resides in the HTML images path (i.e. /usr/local/nagios/share/images).
# This path is automatically determined by appending "/images"
# to the path specified by the 'physical_html_path' directive.
# Note: The image file may be in GIF, PNG, JPEG, or GD2 format.
# However, I recommend that you convert your image to GD2 format
# (uncompressed), as this will cause less CPU load when the CGI
# generates the image.
#statusmap_background_image=smbackground.gd2
# STATUSMAP TRANSPARENCY INDEX COLOR
# These options set the r,g,b values of the background color used by the statusmap CGI,
# so normal browsers that can't show real png transparency set the desired color as
# a background color instead (to make it look pretty).
# Defaults to white: (R,G,B) = (255,255,255).
#color_transparency_index_r=255
#color_transparency_index_g=255
#color_transparency_index_b=255
# DEFAULT STATUSMAP LAYOUT METHOD
# This option allows you to specify the default layout method
# the statusmap CGI should use for drawing hosts. If you do
# not use this option, the default is to use user-defined
# coordinates. Valid options are as follows:
# 0 = User-defined coordinates
# 1 = Depth layers
# 2 = Collapsed tree
# 3 = Balanced tree
# 4 = Circular
# 5 = Circular (Marked Up)
default_statusmap_layout=5
# DEFAULT STATUSWRL LAYOUT METHOD
# This option allows you to specify the default layout method
# the statuswrl (VRML) CGI should use for drawing hosts. If you
# do not use this option, the default is to use user-defined
# coordinates. Valid options are as follows:
# 0 = User-defined coordinates
# 2 = Collapsed tree
# 3 = Balanced tree
# 4 = Circular
default_statuswrl_layout=4
# STATUSWRL INCLUDE
# This option allows you to include your own objects in the
# generated VRML world. It is assumed that the file
# resides in the HTML path (i.e. /usr/local/nagios/share).
#statuswrl_include=myworld.wrl
# PING SYNTAX
# This option determines what syntax should be used when
# attempting to ping a host from the WAP interface (using
# the statuswml CGI). You must include the full path to
# the ping binary, along with all required options. The
# $HOSTADDRESS$ macro is substituted with the address of
# the host before the command is executed.
# Please note that the syntax for the ping binary is
# notorious for being different on virtually every *NIX
# OS and distribution, so you may have to tweak this to
# work on your system.
ping_syntax=/bin/ping -n -U -c 5 $HOSTADDRESS$
# REFRESH RATE
# This option allows you to specify the refresh rate in seconds
# of various CGIs (status, statusmap, extinfo, and outages).
refresh_rate=90
# DEFAULT PAGE LIMIT
# This option allows you to specify the default number of results
# displayed on the status.cgi. This number can be adjusted from
# within the UI after the initial page load. Setting this to 0
# will show all results.
result_limit=100
# ESCAPE HTML TAGS
# This option determines whether HTML tags in host and service
# status output is escaped in the web interface. If enabled,
# your plugin output will not be able to contain clickable links.
escape_html_tags=1
# SOUND OPTIONS
# These options allow you to specify an optional audio file
# that should be played in your browser window when there are
# problems on the network. The audio files are used only in
# the status CGI. Only the sound for the most critical problem
# will be played. Order of importance (higher to lower) is as
# follows: unreachable hosts, down hosts, critical services,
# warning services, and unknown services. If there are no
# visible problems, the sound file optionally specified by
# 'normal_sound' variable will be played.
#
#
# <varname>=<sound_file>
#
# Note: All audio files must be placed in the /media subdirectory
# under the HTML path (i.e. /usr/local/nagios/share/media/).
#host_unreachable_sound=hostdown.wav
#host_down_sound=hostdown.wav
#service_critical_sound=critical.wav
#service_warning_sound=warning.wav
#service_unknown_sound=warning.wav
#normal_sound=noproblem.wav
# URL TARGET FRAMES
# These options determine the target frames in which notes and
# action URLs will open.
action_url_target=_blank
notes_url_target=_blank
# LOCK AUTHOR NAMES OPTION
# This option determines whether users can change the author name
# when submitting comments, scheduling downtime. If disabled, the
# author names will be locked into their contact name, as defined in Nagios.
# Values: 0 = allow editing author names
# 1 = lock author names (disallow editing)
lock_author_names=1
# SPLUNK INTEGRATION OPTIONS
# These options allow you to enable integration with Splunk
# in the web interface. If enabled, you'll be presented with
# "Splunk It" links in various places in the CGIs (log file,
# alert history, host/service detail, etc). Useful if you're
# trying to research why a particular problem occurred.
# For more information on Splunk, visit http://www.splunk.com/
# This option determines whether the Splunk integration is enabled
# Values: 0 = disable Splunk integration
# 1 = enable Splunk integration
#enable_splunk_integration=1
# This option should be the URL used to access your instance of Splunk
#splunk_url=http://127.0.0.1:8000/
{# Emit one check_by_ssh command per entry of nagios_services that names a
   script. The command runs {{ '{{ nagios_home }}' }}/scripts/<script> on the
   checked host. default('') lets entries that have no "script" key be
   skipped instead of aborting the render with an undefined-variable error. #}
{% for service in nagios_services %}
{% if service.script | default('') %}
define command {
    command_name {{ service.name }}
    command_line /usr/lib/nagios/plugins/check_by_ssh -H $HOSTADDRESS$ -o StrictHostKeyChecking=no -C "{{ nagios_home }}/scripts/{{ service.script }}" -E
}
{% endif %}
{% endfor %}
# Contact group "admins"; its member list names the single contact "admins".
define contactgroup {
contactgroup_name admins ; Group name used in configuration
alias Administrators ; Alias for group displayed on webpage
members admins
; contactgroup_members ; Other contact groups to be notified
}
# Contact "admins": host and service notifications are enabled, use the
# "24x7" notification period, and are sent with the notify-*-by-email
# commands to the address given in "email".
define contact{
contact_name admins
alias Admins
service_notification_period 24x7
host_notification_period 24x7
service_notification_options w,u,c,r
host_notification_options d,r
service_notifications_enabled 1
host_notifications_enabled 1
service_notification_commands notify-service-by-email
host_notification_commands notify-host-by-email
email hpc-alerts-warning-l@monash.edu
}
# Some generic hostgroup definitions
{# One hostgroup per entry of "groups"; each member is listed once.
   The filter chain is applied to groups[group] directly, so no scratch list
   has to be built through an append() call hidden inside an "if". #}
{% for group in groups %}
define hostgroup {
    hostgroup_name {{ group }}
    members {{ groups[group] | unique | join(',') }}
}
{% endfor %}
{% for service in nagios_services %} {% for service in nagios_services %}
define service { define service {
name {{ service.name }}
service_description {{ service.description }} service_description {{ service.description }}
hostgroup_name {{ service.groups|join(',') }} hostgroup_name {{ service.groups|join(',') }}
check_command {{ service.command }} check_command {{ service.name }}
use generic-service use generic-service
notification_interval 0 notification_interval 0
} }
......
#!/usr/bin/python
import sys, os, pwd
import getopt
import commands
import subprocess
import datetime
# Nagios plugin: report how many nodes of the "beamline" Slurm reservation are
# still free (in state RESERVED), and escalate when none are left.
#   OK       - at least one node is free
#   WARNING  - no node is free, nothing is pending in the reservation
#   CRITICAL - no node is free and jobs are pending in the reservation
#   UNKNOWN  - the reservation could not be read from scontrol
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3

SCONTROL = "/usr/local/slurm/latest/bin/scontrol"
SQUEUE = "/usr/local/slurm/latest/bin/squeue"
reservationname = "beamline"


def _run(args, merge_stderr=False):
    """Run a command without a shell and return its standard output as text.

    args is the argument list; with merge_stderr the command's stderr is
    folded into the returned text. communicate() drains the pipe fully, so
    the child can never block on a full pipe.
    """
    stderr = subprocess.STDOUT if merge_stderr else None
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=stderr,
                            universal_newlines=True)
    return proc.communicate()[0]


def _parse_key_values(line):
    """Turn a 'Key=Value Key=Value ...' line into a dict.

    Only the first '=' of each token separates key from value, so a value
    that itself contains '=' no longer breaks the parse; tokens without any
    '=' are ignored.
    """
    fields = {}
    for token in line.split():
        if '=' in token:
            key, value = token.split('=', 1)
            fields[key] = value
    return fields


STATE = STATE_OK

# get info about reservation (first output line of the --oneliner listing)
reservation_output = _run(["scontrol", "show", "--oneliner",
                           "reservation=" + reservationname],
                          merge_stderr=True)
reservation_lines = reservation_output.splitlines()
reservation_dict = _parse_key_values(reservation_lines[0] if reservation_lines else "")

if 'Nodes' not in reservation_dict or 'NodeCnt' not in reservation_dict:
    # scontrol failed or printed something unexpected: say so instead of
    # dying with a traceback.
    print("Unknown: could not read reservation %s from scontrol" % reservationname)
    sys.exit(STATE_UNKNOWN)

# count free nodes: one " State=RESERVED" line per node that is reserved
# but not running anything
node_output = _run([SCONTROL, "show", "node=" + reservation_dict['Nodes']])
free_nodes = sum(1 for node_line in node_output.splitlines()
                 if " State=RESERVED" in node_line)

if free_nodes < 1:
    # not looking good, no nodes left in beamline reservation - let's check
    # for pending jobs to decide on the warning level
    STATE = STATE_WARNING
    pending_list = _run([SQUEUE, "--noheader", "--format=%T %R", "--long",
                         "--reservation=beamline"])
    if "PENDING" in pending_list:
        print("Critical: we have no free nodes for beamline reservation and jobs trying to run!")
        STATE = STATE_CRITICAL
    # Let's provide some more info for the reader's benefit
    squeue_list = _run([SQUEUE,
                        "--format=%.18i %.9P %.8j %.16u %.10a %.8T %.10M %.9l %.6D %R",
                        "--reservation=beamline"])
    print("Beamline reservation has %s of %s nodes free" % (free_nodes, int(reservation_dict['NodeCnt'])))
    print("squeue --long --reservation=beamline")
    print(squeue_list)
    sys.exit(STATE)

# If we made it here we are happy
print("Beamline reservation has %s of %s nodes free" % (free_nodes, int(reservation_dict['NodeCnt'])))
sys.exit(STATE)
#!/usr/bin/python
import sys, os, pwd
import getopt
import commands
import subprocess
import datetime
# Nagios plugin: alert when any Slurm job has been PENDING for too long.
#   OK       - no pending job is older than WARNING_THRESHOLD_SEC
#   WARNING  - some pending job is older than WARNING_THRESHOLD_SEC
#   CRITICAL - some pending job is older than CRITICAL_THRESHOLD_SEC
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2

SECONDS_PER_DAY = 24 * 60 * 60
WARNING_THRESHOLD_SEC = 7 * SECONDS_PER_DAY
# WARNING_THRESHOLD_SEC=6*60*60
CRITICAL_THRESHOLD_SEC = WARNING_THRESHOLD_SEC * 2
SUBMIT_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S'

state = STATE_OK
message = ""

# One submit time per pending job, one per line.
check_pending_com_job = subprocess.Popen(
    "squeue --noheader --states=PENDING --Format=SUBMITTIME",
    shell=True, stdout=subprocess.PIPE, universal_newlines=True)
pending_submit_times = check_pending_com_job.communicate()[0]

# Local wall-clock time at whole-second resolution, like the submit times.
now_time = datetime.datetime.now().replace(microsecond=0)

for submittime in pending_submit_times.splitlines():
    # squeue pads the column; strip it and skip empty lines so strptime only
    # ever sees a bare timestamp.
    submittime = submittime.strip()
    if not submittime:
        continue
    submit_time = datetime.datetime.strptime(submittime, SUBMIT_TIME_FORMAT)
    waited = now_time - submit_time
    # timedelta.seconds alone is only the sub-day remainder (0..86399) and can
    # never reach a multi-day threshold; the days must be counted as well.
    time_elapsed = waited.days * SECONDS_PER_DAY + waited.seconds
    if time_elapsed >= CRITICAL_THRESHOLD_SEC:
        message = "Critical: Slurm Job Pending over " + str(CRITICAL_THRESHOLD_SEC // SECONDS_PER_DAY) + " days"
        state = STATE_CRITICAL
    if time_elapsed >= WARNING_THRESHOLD_SEC and state != STATE_CRITICAL:
        message = "Warning: Slurm Job Pending over " + str(WARNING_THRESHOLD_SEC // SECONDS_PER_DAY) + " days"
        state = STATE_WARNING

if state == STATE_OK:
    print("OK: No Slurm Jobs BLOCKED over %s days" % int(WARNING_THRESHOLD_SEC // SECONDS_PER_DAY))
else:
    print(message)
    # Append the full pending queue for whoever reads the alert.
    check_pending_com_job = subprocess.Popen(
        "/usr/local/slurm/latest/bin/squeue --format='%.18i %.9P %.8u %.8T %S %V %R' --states=PENDING",
        shell=True, stdout=subprocess.PIPE, universal_newlines=True)
    squeue_result = check_pending_com_job.communicate()[0]
    print(squeue_result)

# Exit with the state actually found: a WARNING must not be reported as
# CRITICAL.
sys.exit(state)
#!/usr/bin/python
import sys, os, pwd
import getopt
import commands
import subprocess
import datetime
# Nagios plugin: alert when a job in the vis partitions has been PENDING for
# too long.
#   OK       - no pending vis job has waited 5 minutes
#   WARNING  - the longest-waiting pending vis job has waited >= 5 minutes
#   CRITICAL - the longest-waiting pending vis job has waited >= 15 minutes
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2

WARNING_THRESHOLD_SEC = 300
CRITICAL_THRESHOLD_SEC = 900
SECONDS_PER_DAY = 24 * 60 * 60
SUBMIT_TIME_FORMAT = '%Y-%m-%dT%H:%M:%S'
PENDING_LIST_CMD = "/usr/local/slurm/latest/bin/squeue -p m2-vis-c6,m1-vis-c6 --states=PENDING"

# Ask squeue directly for the submit time of every pending vis job. This
# replaces the per-job "grep <jobid>" lookup, which also matched any other
# job whose ID merely contained the same digits.
check_pending_vis_job = subprocess.Popen(
    "squeue -p m2-vis-c6,m1-vis-c6 --noheader --states=PENDING --Format=submittime",
    shell=True, stdout=subprocess.PIPE, universal_newlines=True)
pending_submit_times = check_pending_vis_job.communicate()[0]

# Local wall-clock time at whole-second resolution, like the submit times.
now_time = datetime.datetime.now().replace(microsecond=0)

# Find the longest wait first, so the result does not depend on the order in
# which squeue lists the jobs.
longest_wait = 0
for submittime in pending_submit_times.splitlines():
    submittime = submittime.strip()
    if not submittime:
        continue
    submit_time = datetime.datetime.strptime(submittime, SUBMIT_TIME_FORMAT)
    waited = now_time - submit_time
    # timedelta.seconds alone wraps every 24 hours; count the days as well.
    time_elapsed = waited.days * SECONDS_PER_DAY + waited.seconds
    longest_wait = max(longest_wait, time_elapsed)

if longest_wait >= CRITICAL_THRESHOLD_SEC:
    state = STATE_CRITICAL
    message = "CRITICAL: Slurm Vis Job Pending over 15 mins"
elif longest_wait >= WARNING_THRESHOLD_SEC:
    state = STATE_WARNING
    message = "WARNING: Slurm Vis Job Pending over 5 mins"
else:
    print("NO Slurm Vis Jobs BLOCKED over 15 mins")
    sys.exit(STATE_OK)

print(message)
# Append the pending vis queue for whoever reads the alert.
pending_listp = subprocess.Popen(PENDING_LIST_CMD, shell=True,
                                 stdout=subprocess.PIPE, universal_newlines=True)
pending_list = pending_listp.communicate()[0]
print("\n\n" + PENDING_LIST_CMD + "\n")
print(pending_list)
sys.exit(state)
#!/bin/bash
# Nagios plugin: warn when / or /tmp is more than 90% full.
# Exit codes follow the Nagios plugin convention defined below.
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4
HOST_NAME=$(hostname -f)

# check_disk PATH
# Exits the script with STATE_WARNING when the filesystem holding PATH is
# above the threshold, and with STATE_UNKNOWN when its usage cannot be read.
# Returns normally otherwise.
function check_disk() {
    local space="$1"
    local threshold=90
    local used
    # -P forces the one-line-per-filesystem format, so long device names do
    # not wrap. Taking the second line (rather than grepping for a leading
    # "/") also covers filesystems whose source is not a path, e.g. tmpfs.
    used=$(df -P "${space}" | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')
    if [[ ! "${used}" =~ ^[0-9]+$ ]]; then
        echo "UNKNOWN: could not determine filesystem usage of ${space} on ${HOST_NAME}"
        exit "${STATE_UNKNOWN}"
    fi
    if (( used > threshold )); then
        echo "WARNING: ${HOST_NAME} is running out of filesystem space"
        exit "${STATE_WARNING}"
    fi
}

check_disk "/"
check_disk "/tmp"
echo "Check ${HOST_NAME} disk space Ok"
exit "$STATE_OK"
No preview for this file type
#!/bin/bash
# Nagios plugin: verify that a temporary file can be created in /tmp.
# Prints nothing; the result is carried by the exit code alone.
STATE_OK=0
STATE_WARNING=1
STATE_CRITICAL=2
STATE_UNKNOWN=3
STATE_DEPENDENT=4

# Returns STATE_CRITICAL when mktemp fails, STATE_OK otherwise. The probe
# file is removed again; the outcome of that removal is not checked.
main ()
{
    if ! tmpfile=$( mktemp -p /tmp )
    then
        return $STATE_CRITICAL
    fi
    rm "$tmpfile"
    return $STATE_OK
}

main
exit $?
...@@ -7,7 +7,7 @@ import subprocess ...@@ -7,7 +7,7 @@ import subprocess
STATE_OK=0 STATE_OK=0
STATE_WARNING=1 STATE_WARNING=1
check_munge=subprocess.Popen("/usr/sbin/service munge status", shell=True, stdout=subprocess.PIPE) check_munge=subprocess.Popen("service munge status", shell=True, stdout=subprocess.PIPE)
munge_status=check_munge.communicate()[0] munge_status=check_munge.communicate()[0]
if "run" in munge_status: if "run" in munge_status:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment