diff --git a/dynamicInventory b/dynamicInventory deleted file mode 100755 index 5ada7f57b7544089b04deef1de67a7b92b1fd1a9..0000000000000000000000000000000000000000 --- a/dynamicInventory +++ /dev/null @@ -1,141 +0,0 @@ -#!/usr/bin/env python -import sys, os, string, subprocess, socket, re -import copy, shlex,uuid, random, multiprocessing, time, shutil, json -#import novaclient.v1_1.client as nvclient -#import novaclient.exceptions as nvexceptions -from keystoneclient.auth.identity import v2 as v2_auth -#from heatclient import client as heat_client -#from novaclient import client as nova_client -#from cinderclient import client as cinder_client -import heatclient -import novaclient -import cinderclient -import heatclient.client -import novaclient.client -import cinderclient.client -import keystoneclient.client -from keystoneclient.auth.identity import v2 -from keystoneclient import session -from novaclient import client - -from keystoneclient import session as kssession -#NOVA_STANDALONE=True -NOVA_STANDALONE=False - - -class OpenStackConnection: - - def __init__(self, username, passwd): - self.username=username - self.passwd=passwd - self.tenantName= os.environ['OS_TENANT_NAME'] - self.tenantID= os.environ['OS_TENANT_ID'] - self.authUrl="https://keystone.rc.nectar.org.au:5000/v2.0" - - def get_stack_name(self,stack): - stacks=[] - for s in self.hc.stacks.list(): - stacks.append(s.stack_name) - if stack in stacks: - return stack - elif len(stacks)==1: - return stacks[0] - elif len(stacks)==0: - raise Exception("You do not have any heat stacks in your OpenStack Project") - else: - raise Exception("You have multiple heat stacks in your OpenStack Project and I'm not sure which one to use.\n You can select a stack by symlinking to a stack, for example if you have a stack called mycluster do ln -s %s mycluster\n"%stack) - - def auth(self): - - - kwargs = { - 'username': self.username, - 'password': self.passwd, - 'tenant_id': self.tenantID, - 'auth_url':self.authUrl, - } 
- - auth = v2.Password(**kwargs) - sess = session.Session(auth=auth) - kwargs = { - 'session':sess, - - } - api_version='2' - self.nc = novaclient.client.Client(api_version, session=sess) - - api_version=1 - endpoint="https://heat.rc.nectar.org.au:8004/v1/%s"%self.tenantID - self.hc = heatclient.client.Client(api_version, endpoint, session=sess) - - api_version=1 - self.cc = cinderclient.client.Client(api_version, session=sess) - - - def recurse_resources(self,stack,resource): - result=[] - if 'OS::Nova::Server' in resource.resource_type: - result.append(resource.physical_resource_id) - if 'OS::Heat::ResourceGroup' in resource.resource_type: - for r in self.hc.resources.list(resource.physical_resource_id): - result.extend(self.recurse_resources(stack,r)) - - return result - - def gatherInfo(self,stack_name): - - ## Fetch the Nova Object - instance_ids=[] - for i in self.hc.stacks.list(): - if i.stack_name == stack_name: - for r in self.hc.resources.list(i.stack_name): - instance_ids.extend(self.recurse_resources(stack=i,resource=r)) - - nc=self.nc - cc=self.cc - inventory = {} - inventory['_meta'] = { 'hostvars': {} } - for server in nc.servers.list(): - if server.id in instance_ids: - if server.metadata and 'ansible_host_groups' in server.metadata: - hostname=server.name - groups = json.loads(server.metadata['ansible_host_groups']) - for group in groups: - if group in inventory: - inventory[group].append(hostname) - else: - inventory[group] = [ hostname ] - elif server.metadata and 'ansible_host_group' in server.metadata: - #hostname = socket.gethostbyaddr(server.networks.values()[0][0])[0] - hostname = server.name - # Set Ansible Host Group - if server.metadata['ansible_host_group'] in inventory: - inventory[server.metadata['ansible_host_group']].append(hostname) - else: - inventory[server.metadata['ansible_host_group']] = [hostname] - #print dir(server) - # Set the other host variables - inventory['_meta']['hostvars'][hostname] = {} - 
inventory['_meta']['hostvars'][hostname]['ansible_ssh_host'] = server.networks.values()[0][0] - inventory['_meta']['hostvars'][hostname]['ansible_remote_tmp'] = '/tmp/ansible' - for key in server.metadata.keys(): - if 'ansible_ssh' in key: - inventory['_meta']['hostvars'][hostname][key] = server.metadata[key] - inventory['_meta']['hostvars'][hostname]['ansible_ssh_user'] = 'ec2-user' - for vol in server.to_dict()['os-extended-volumes:volumes_attached']: - for cv in cc.volumes.findall(): - if cv.id == vol['id']: - devname = '/dev/disk/by-id/virtio-'+cv.id[0:20] - if not 'ansible_host_volumes' in inventory['_meta']['hostvars'][hostname]: - inventory['_meta']['hostvars'][hostname]['ansible_host_volumes']={} - inventory['_meta']['hostvars'][hostname]['ansible_host_volumes'][cv.display_name]={'uuid':vol['id'],'dev':devname} - print json.dumps(inventory) - -if __name__ == "__main__": - stack_name=os.path.basename(sys.argv[0]) - username = os.environ['OS_USERNAME'] - passwd = os.environ['OS_PASSWORD'] - openstack = OpenStackConnection(username, passwd) - openstack.auth() - stack_name=openstack.get_stack_name(stack_name) - openstack.gatherInfo(stack_name) diff --git a/provisionCluster.yaml b/provisionCluster.yaml deleted file mode 100644 index 910d5ea4192c1e29d1f490c4505fb49d4e771d99..0000000000000000000000000000000000000000 --- a/provisionCluster.yaml +++ /dev/null @@ -1,41 +0,0 @@ ---- -description: " A simple template to boot a 3 node cluster" -heat_template_version: 2013-05-23 -parameters: - image_id: - type: string - label: Image ID - description: Image to be used for compute instance - default: a5e74703-f343-415a-aa23-bd0f0aacfc9e - key_name: - type: string - label: Key Name - description: Name of key-pair to be used for compute instance - default: shahaan - availability_z: - type: string - label: Availability Zone - description: Availability Zone to be used for launching compute instance - default: monash-01 -resources: - computeNodes: - type: 
"OS::Heat::ResourceGroup" - properties: - count: 2 - resource_def: - type: "OS::Nova::Server" - properties: - availability_zone: { get_param: availability_z } - flavor: m1.small - image: { get_param: image_id } - key_name: { get_param: key_name } - metadata: - ansible_host_group: computeNodes - ansible_ssh_user: ec2-user - ansible_ssh_private_key_file: /home/sgeadmin/.ssh/shahaan.pem - headNodes: - type: "OS::Heat::ResourceGroup" - properties: - count: 1 - resource_def: - type: headNode.yaml diff --git a/roles/SSHKnownHosts/tasks/main.yml b/roles/SSHKnownHosts/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..028cd43a39101452e88d878f84add57bdb16617b --- /dev/null +++ b/roles/SSHKnownHosts/tasks/main.yml @@ -0,0 +1,4 @@ +- name: install known hosts file + copy: src=files/ssh_known_hosts dest=/etc/ssh/ssh_known_hosts owner=root mode=644 + become: true + become_user: root diff --git a/roles/apt-get-upgrade/tasks/main.yml b/roles/apt-get-update/tasks/main.yml similarity index 51% rename from roles/apt-get-upgrade/tasks/main.yml rename to roles/apt-get-update/tasks/main.yml index 49e0217a1a170d17bee5ef958f7886903e0c1256..460364c28b119fa664cfa571f761dadb001891ae 100644 --- a/roles/apt-get-upgrade/tasks/main.yml +++ b/roles/apt-get-update/tasks/main.yml @@ -3,8 +3,3 @@ apt: update_cache=True sudo: true when: ansible_os_family=="Debian" - -- name: apt-get upgrade - apt: upgrade=safe - sudo: true - when: ansible_os_family=="Debian" diff --git a/roles/calculateEtcHosts/tasks/main.yml b/roles/calculateEtcHosts/tasks/main.yml index 6922e415e60c223856f1ddb9c837ae3ccd30f683..ff71a956bb33175ab3ebe2227ed0b13a55165746 100644 --- a/roles/calculateEtcHosts/tasks/main.yml +++ b/roles/calculateEtcHosts/tasks/main.yml @@ -1,3 +1,4 @@ +--- - name: get_groups_json template: dest=/tmp/groups src=groups.j2 @@ -11,6 +12,5 @@ - name: write hosts file template: dest=/tmp/etcHosts src=etcHosts.j2 - - name: fetch hosts file fetch: src=/tmp/etcHosts 
dest=files/etcHosts flat=yes diff --git a/roles/calculateEtcHosts/templates/groups.j2 b/roles/calculateEtcHosts/templates/groups.j2 index dffc1333690bb0a0495cdd0c5c4b2f9f8c1bd509..5304b73e541d9a3d62faa0cb39f3550482ab848f 100644 --- a/roles/calculateEtcHosts/templates/groups.j2 +++ b/roles/calculateEtcHosts/templates/groups.j2 @@ -1,4 +1,11 @@ { "groups": {{ groups | to_nice_json }}, - "hostvars": {{ hostvars | to_nice_json }} + "hostvars": { +{% for host in groups['all'] %} +"{{ host }}" : {{ hostvars[host]|to_nice_json }} +{% if not loop.last %} + , +{% endif %} +{% endfor %} +} } diff --git a/roles/calculateKnownHosts/tasks/main.yml b/roles/calculateKnownHosts/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..5714eb665161bde40336e00b0cf9360ba4721a15 --- /dev/null +++ b/roles/calculateKnownHosts/tasks/main.yml @@ -0,0 +1,13 @@ +- name: "Templating /etc/ssh/known_hosts" + template: src=known_hosts.j2 dest=/tmp/ssh_known_hosts owner=root group=root mode=644 + sudo: true + register: sshknownhost + +- name: fetch known_hosts file + fetch: src=/tmp/ssh_known_hosts dest=files/ssh_known_hosts flat=yes + +- name: delete ssh_known_hosts + file: path=/tmp/ssh_known_hosts state=absent + sudo: true + + diff --git a/roles/setupKnownHosts/templates/known_hosts.j2 b/roles/calculateKnownHosts/templates/known_hosts.j2 similarity index 62% rename from roles/setupKnownHosts/templates/known_hosts.j2 rename to roles/calculateKnownHosts/templates/known_hosts.j2 index b083b5ee6e037007b984b133264405c77a9340ff..28d5afcca141a3dd6ae0b75dc981f212380c6956 100644 --- a/roles/setupKnownHosts/templates/known_hosts.j2 +++ b/roles/calculateKnownHosts/templates/known_hosts.j2 @@ -2,14 +2,13 @@ {% for node in groups['all'] %} {% for interface in hostvars[node]['ansible_interfaces'] %} {% if interface != "lo" %} -{% if hostvars[node]['ansible_ssh_host_key_rsa_public'] %} -{% set host = {'name': node, 'ip': hostvars[node]['ansible_'+interface]['ipv4']['address'], 
'keytype':'ssh-rsa', 'key': hostvars[node]['ansible_ssh_host_key_rsa_public']} %} +{% if 'ansible_host_key_rsa_public' in hostvars[node] and hostvars[node]['ansible_host_key_rsa_public'] %} +{% set host = {'name': node, 'ip': hostvars[node]['ansible_'+interface]['ipv4']['address'], 'keytype':'ssh-rsa', 'key': hostvars[node]['ansible_host_key_rsa_public']} %} {% if nodelist.append(host) %} {% endif %} {% endif %} -{% if hostvars[node]['ansible_ssh_host_key_ecdsa_public'] %} -#{% set host = {'name': node, 'ip': hostvars[node]['ansible_'+interface]['ipv4']['address'], 'keytype':'ssh-ecdsa', 'key': hostvars[node]['ansible_ssh_host_key_ecdsa_public']} %} -{% set host = {'name': node, 'ip': hostvars[node]['ansible_'+interface]['ipv4']['address'], 'keytype':'ecdsa-sha2-nistp256', 'key': hostvars[node]['ansible_ssh_host_key_ecdsa_public']} %} +{% if 'ansible_host_key_ecdsa_public' in hostvars[node] and hostvars[node]['ansible_host_key_ecdsa_public'] %} +{% set host = {'name': node, 'ip': hostvars[node]['ansible_'+interface]['ipv4']['address'], 'keytype':'ecdsa-sha2-nistp256', 'key': hostvars[node]['ansible_host_key_ecdsa_public']} %} {% if nodelist.append(host) %} {% endif %} {% endif %} diff --git a/roles/calculateSlurmConf/templates/slurm.conf.j2 b/roles/calculateSlurmConf/templates/slurm.conf.j2 index e5c567c03efa18fad6e2f75ad60dbc2cf5aa0b19..50e869b57f0bdbb4f4870d961cfbfc032d44d824 100644 --- a/roles/calculateSlurmConf/templates/slurm.conf.j2 +++ b/roles/calculateSlurmConf/templates/slurm.conf.j2 @@ -139,6 +139,10 @@ AccountingStorageEnforce=limits,safe # #GRES #GresTypes=gpu +# + +HealthCheckInterval=300 +HealthCheckProgram={{ nhc_dir }}/sbin/nhc #array jobs. 
max number {% if slurm_max_array_size is defined %} diff --git a/roles/gluster_server/files/glusterfs-epel.repo b/roles/config_repos/files/glusterfs-epel.repo similarity index 61% rename from roles/gluster_server/files/glusterfs-epel.repo rename to roles/config_repos/files/glusterfs-epel.repo index 6cac832a40534526c9e689321cb8a8b1f271c5f7..015ad0bb9c3b8539d6e87fe999c69605b52910e5 100644 --- a/roles/gluster_server/files/glusterfs-epel.repo +++ b/roles/config_repos/files/glusterfs-epel.repo @@ -2,21 +2,21 @@ [glusterfs-epel] name=GlusterFS is a clustered file-system capable of scaling to several petabytes. -baseurl=http://download.gluster.org/pub/gluster/glusterfs/3.7/LATEST/EPEL.repo/epel-$releasever/$basearch/ +baseurl=http://download.gluster.org/pub/gluster/glusterfs/LATEST/EPEL.repo/epel-$releasever/$basearch/ enabled=1 skip_if_unavailable=1 gpgcheck=0 [glusterfs-noarch-epel] name=GlusterFS is a clustered file-system capable of scaling to several petabytes. -baseurl=http://download.gluster.org/pub/gluster/glusterfs/3.7/LATEST/EPEL.repo/epel-$releasever/noarch +baseurl=http://download.gluster.org/pub/gluster/glusterfs/LATEST/EPEL.repo/epel-$releasever/noarch enabled=1 skip_if_unavailable=1 gpgcheck=0 [glusterfs-source-epel] name=GlusterFS is a clustered file-system capable of scaling to several petabytes. - Source -baseurl=http://download.gluster.org/pub/gluster/glusterfs/3.7/LATEST/EPEL.repo/epel-$releasever/SRPMS +baseurl=http://download.gluster.org/pub/gluster/glusterfs/LATEST/EPEL.repo/epel-$releasever/SRPMS enabled=0 skip_if_unavailable=1 gpgcheck=0 diff --git a/roles/config_repos/tasks/main.yml b/roles/config_repos/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..6d2efb6b166f5be9d2e4d3efaeee94ecef7e3058 --- /dev/null +++ b/roles/config_repos/tasks/main.yml @@ -0,0 +1,43 @@ +--- +# this repository was broken on some CentOS images. Remove it. 
+- name: Removing the RDO repository + file: path=/etc/yum.repos.d/rdo-release.repo state=absent + sudo: true + +- name: add gluster repo + copy: src=glusterfs-epel.repo dest=/etc/yum.repos.d/glusterfs-epel.repo + sudo: true + when: ansible_os_family == 'RedHat' + +- name: Install epel-release + yum: name=epel-release-7-5.noarch state=present + sudo: true + when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" + + +#- name: Enable epel +# command: yum-config-manager --enable epel +# sudo: true +# when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" + +# Use mate DE on systems that have moved to gnome3, since there is no gpu acceleration by default on NeCTAR openstack +# Trusty (Ubuntu 14.04 LTS) needs repos added. Wheezy (Debian Stable) gets mate from backports, Utopic (Ubuntu 14.10) Jessie (Debian testing) and Sid (Debian unstable) get it by default +- name: add repos apt + shell: "add-apt-repository -y ppa:ubuntu-mate-dev/ppa" + sudo: true + when: ansible_distribution_release == 'trusty' + +- name: add repos apt + shell: "add-apt-repository -y ppa:ubuntu-mate-dev/trusty-mate" + sudo: true + when: ansible_distribution_release == 'trusty' + +- name: add repos apt + shell: "add-apt-repository -y ppa:gluster/glusterfs-3.7" + sudo: true + when: ansible_distribution == 'Ubuntu' + +- name: apt-get update + apt: update_cache=True + sudo: true + when: ansible_os_family=="Debian" diff --git a/roles/cvlExtraFiles/tasks/main.yml b/roles/cvlExtraFiles/tasks/main.yml index 4dff080397f5cdf7d9897f912b77cc2e1da90832..6ecb7068275297b5f09ec3157814095677793a15 100644 --- a/roles/cvlExtraFiles/tasks/main.yml +++ b/roles/cvlExtraFiles/tasks/main.yml @@ -1,7 +1,6 @@ --- - name: extra file symbolic links file: src={{ item.src }} path={{ item.dest }} state={{ item.type }} force=yes - with_items: - extraFiles + with_items: "{{ extraFiles }}" sudo: true when: extraFiles is defined diff --git 
a/roles/easy-rsa-certificate/tasks/buildCert.yml b/roles/easy-rsa-certificate/tasks/buildCert.yml index 32f5a06f62d3461e0e9f63a2c6c1fa0a55c09c76..df849b969aee561f41ddb16a4481e1ace89b890c 100644 --- a/roles/easy-rsa-certificate/tasks/buildCert.yml +++ b/roles/easy-rsa-certificate/tasks/buildCert.yml @@ -31,7 +31,7 @@ when: cert.stat.exists == false or cert.stat.size == 0 - name: "Delete Zero Sized Ceritificates" - remote_user: "{{ hostvars[x509_ca_server]['ansible_ssh_user'] }}" + remote_user: "{{ hostvars[x509_ca_server]['ansible_user'] }}" delegate_to: "{{ x509_ca_server }}" shell: rm -rf /etc/easy-rsa/2.0/keys/{{ x509_common_name }}.* when: cert is defined and cert.stat.size == 0 @@ -61,28 +61,28 @@ when: needcert - name: "Copy CSR to CA" - remote_user: "{{ hostvars[x509_ca_server]['ansible_ssh_user'] }}" + remote_user: "{{ hostvars[x509_ca_server]['ansible_user'] }}" delegate_to: "{{ x509_ca_server }}" copy: "src=/tmp/{{ inventory_hostname }}/{{ inventory_hostname }}.csr dest=/etc/easy-rsa/2.0/keys/{{ x509_common_name }}.csr force=yes" when: needcert sudo: true - name: "Sign Certificate" - remote_user: "{{ hostvars[x509_ca_server]['ansible_ssh_user'] }}" + remote_user: "{{ hostvars[x509_ca_server]['ansible_user'] }}" delegate_to: "{{ x509_ca_server }}" shell: "cd /etc/easy-rsa/2.0; . 
./vars; export EASY_RSA=\"${EASY_RSA:-.}\" ;\"$EASY_RSA\"/pkitool --sign {{ x509_sign_args }} {{ x509_common_name }}" when: needcert sudo: true - name: "Copy the Certificate to ansible host" - remote_user: "{{ hostvars[x509_ca_server]['ansible_ssh_user'] }}" + remote_user: "{{ hostvars[x509_ca_server]['ansible_user'] }}" delegate_to: "{{ x509_ca_server }}" fetch: "src=/etc/easy-rsa/2.0/keys/{{ x509_common_name }}.crt dest=/tmp/{{ inventory_hostname }}/{{ x509_common_name }}.crt fail_on_missing=yes validate_md5=yes flat=yes" sudo: true when: needcert - name: "Copy the CA Certificate to the ansible host" - remote_user: "{{ hostvars[x509_ca_server]['ansible_ssh_user'] }}" + remote_user: "{{ hostvars[x509_ca_server]['ansible_user'] }}" delegate_to: "{{ x509_ca_server }}" fetch: "src=/etc/easy-rsa/2.0/keys/ca.crt dest=/tmp/{{ inventory_hostname }}/ca.crt fail_on_missing=yes validate_md5=yes flat=yes" sudo: true diff --git a/roles/enable_lmod/tasks/main.yml b/roles/enable_lmod/tasks/main.yml index 5c1ff887b0969578ee0a0aaa52d4603e1472b9c7..4676f706030c27b4b86e35a342e26fb3ae9ad74b 100644 --- a/roles/enable_lmod/tasks/main.yml +++ b/roles/enable_lmod/tasks/main.yml @@ -1,16 +1,6 @@ --- - include_vars: "{{ ansible_os_family }}.yml" -- name: Install epel-release - yum: name=epel-release-7-5.noarch state=present - sudo: true - when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" - -- name: Enable epel - command: yum-config-manager --enable epel - sudo: true - when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" - - name: install lua yum: name={{ item }} state=installed with_items: diff --git a/roles/enable_sudo_group/tasks/main.yml b/roles/enable_sudo_group/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..4456e42744982dab5efa66b3e9539029e11378fb --- /dev/null +++ b/roles/enable_sudo_group/tasks/main.yml @@ -0,0 +1,4 @@ +--- +- name: template sudoers file + template: 
src=10-admin_group.j2 dest=/etc/sudoers.d/10-admin_group + become: true diff --git a/roles/enable_sudo_group/templates/10-admin_group.j2 b/roles/enable_sudo_group/templates/10-admin_group.j2 new file mode 100644 index 0000000000000000000000000000000000000000..624588d72e02a282b26e84dc44cd671f1410d6cb --- /dev/null +++ b/roles/enable_sudo_group/templates/10-admin_group.j2 @@ -0,0 +1 @@ +%{{ sudo_group }} ALL=(ALL) ALL diff --git a/roles/etcHosts/tasks/main.yml b/roles/etcHosts/tasks/main.yml index 74c899c395ce101631a2338890dedcb90cdd05da..52ca69082d2e13d8e4c954432efb9f1fb7367ce9 100644 --- a/roles/etcHosts/tasks/main.yml +++ b/roles/etcHosts/tasks/main.yml @@ -13,10 +13,20 @@ - name: set /etc/sysconfig/network on CentOS 6 lineinfile: dest=/etc/sysconfig/network line='HOSTNAME={{ ansible_hostname }}' regexp='^HOSTNAME' sudo: true - when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "6" + when: ansible_distribution == "CentOS" - name: set /etc/sysctl.conf on Debian 8 lineinfile: dest=/etc/sysctl.conf line='kernel.domainname = {{ domain }}' regexp='^#kernel.domainname' sudo: true when: ansible_distribution == "Debian" and ansible_distribution_major_version == "8" +- name: set preserve hostname on CentOS + lineinfile: dest=/etc/cloud/cloud.cfg line='preserve_hostname=True' + sudo: true + when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" + +- name: set /etc/hostname + template: src=hostname dest=/etc/hostname + become: true + become_user: root + when: ansible_distribution == "CentOS" diff --git a/roles/etcHosts/templates/hostname b/roles/etcHosts/templates/hostname new file mode 100644 index 0000000000000000000000000000000000000000..c3e4c0866425f36098edb843bfe08ea171b36fe8 --- /dev/null +++ b/roles/etcHosts/templates/hostname @@ -0,0 +1 @@ +{{ ansible_hostname }} diff --git a/roles/extra_rpms/tasks/main.yml b/roles/extra_rpms/tasks/main.yml index 
2a85fba6ddd111326b878ba24e599bc1dc12f5d2..dde65974accbf019afbac0d1655e129ce9e84277 100644 --- a/roles/extra_rpms/tasks/main.yml +++ b/roles/extra_rpms/tasks/main.yml @@ -1,8 +1,20 @@ --- - name: "Install extra packages" yum: "name={{ item }} state=present" - with_items: - pkgs + with_items: "{{ pkgs }}" sudo: true + ignore_errors: true when: ansible_os_family == 'RedHat' +- name: "Check fusermount user access permission" + shell: fusermount --version + ignore_errors: true + register: fusermount_user_access_error + when: ansible_os_family == 'RedHat' + +- name: "Fix fusermount user access permission" + file: path=/bin/fusermount mode="o=rx" + sudo: true + when: ansible_os_family == 'RedHat' and fusermount_user_access_error | failed + + diff --git a/roles/extra_rpms/vars/main.yml b/roles/extra_rpms/vars/main.yml index df10f44104f4a08b545af265cbefa0b3ce8338ec..b70d611e95a35d73562b2d50a066f34d98c685d8 100644 --- a/roles/extra_rpms/vars/main.yml +++ b/roles/extra_rpms/vars/main.yml @@ -190,6 +190,7 @@ pkgs: - qt-x11 - rhino - rsync + - samba-client - scipy - spice-vdagent - suitesparse diff --git a/roles/gluster_client/files/glusterfs-epel.repo b/roles/gluster_client/files/glusterfs-epel.repo deleted file mode 100644 index 843b4baef3cf4d81aca369e49c44b92c1599c3cf..0000000000000000000000000000000000000000 --- a/roles/gluster_client/files/glusterfs-epel.repo +++ /dev/null @@ -1,22 +0,0 @@ -# Place this file in your /etc/yum.repos.d/ directory - -[glusterfs-epel] -name=GlusterFS is a clustered file-system capable of scaling to several petabytes. -baseurl=http://download.gluster.org/pub/gluster/glusterfs/3.6/LATEST/EPEL.repo/epel-$releasever/$basearch/ -enabled=1 -skip_if_unavailable=1 -gpgcheck=0 - -[glusterfs-noarch-epel] -name=GlusterFS is a clustered file-system capable of scaling to several petabytes. 
-baseurl=http://download.gluster.org/pub/gluster/glusterfs/3.6/LATEST/EPEL.repo/epel-$releasever/noarch -enabled=1 -skip_if_unavailable=1 -gpgcheck=0 - -[glusterfs-source-epel] -name=GlusterFS is a clustered file-system capable of scaling to several petabytes. - Source -baseurl=http://download.gluster.org/pub/gluster/glusterfs/3.6/LATEST/EPEL.repo/epel-$releasever/SRPMS -enabled=0 -skip_if_unavailable=1 -gpgcheck=0 diff --git a/roles/gluster_client/tasks/main.yml b/roles/gluster_client/tasks/main.yml index 68d24002695056b5b32337510a22bb5b48f187f6..7c769da5dbf8558cf13ac35d6b6c0db5e2098642 100644 --- a/roles/gluster_client/tasks/main.yml +++ b/roles/gluster_client/tasks/main.yml @@ -1,8 +1,4 @@ --- -- name: add repo - copy: src=glusterfs-epel.repo dest=/etc/yum.repos.d/glusterfs-epel.repo - sudo: true - when: ansible_os_family == 'RedHat' - name: install gluster yum: name={{ item }} state='latest' diff --git a/roles/gluster_server/tasks/main.yml b/roles/gluster_server/tasks/main.yml index 82dcabaa4088bec82eb7168983735fdb9cddae5a..5b5248e3c6e5c8eaaf261431e024b59608468fee 100644 --- a/roles/gluster_server/tasks/main.yml +++ b/roles/gluster_server/tasks/main.yml @@ -1,10 +1,5 @@ --- -- name: add repo - copy: src=glusterfs-epel.repo dest=/etc/yum.repos.d/glusterfs-epel.repo - sudo: true - when: ansible_os_family == 'RedHat' - - name: install gluster yum: name={{ item }} state='latest' when: ansible_os_family == 'RedHat' @@ -29,25 +24,6 @@ sudo: true when: ansible_os_family == 'Debian' -- name: make server list - set_fact: - server_list: "{{ gluster_servers|join(',') }}" - - -- name: echo server list - debug: var=server_list - - name: make brick dir file: state=directory path="{{ brickmnt }}/brick" sudo: true - -- name: create volume - gluster_volume: - name: "{{ volname }}" - brick: "{{ brickmnt }}/brick" - cluster: "{{ server_list }}" - replicas: "{{ replicas }}" - state: present - sudo: true - run_once: true - diff --git a/roles/gluster_volcreate/tasks/main.yml 
b/roles/gluster_volcreate/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..d14e0387c6cf1b28d2a2b5e15cd1f35bfa76ec73 --- /dev/null +++ b/roles/gluster_volcreate/tasks/main.yml @@ -0,0 +1,32 @@ +--- + +- name: make server list + set_fact: + server_list: "{{ gluster_servers|join(',') }}" + +- name: probe peers + shell: "gluster peer probe {{ (gluster_servers|difference([inventory_hostname]))|join(',') }}" + run_once: true + become: true + become_user: root + +- name: peer status + shell: "gluster peer status" + become: true + become_user: root + register: peer_status + +- name: debug peer status + debug: var=peer_status + +- name: create volume + gluster_volume: + name: "{{ volname }}" + brick: "{{ brickmnt }}/brick" + cluster: "{{ server_list }}" + replicas: "{{ replicas }}" + host: "{{ inventory_hostname }}" + state: present + sudo: true + run_once: true + diff --git a/roles/karaage3.1.17/tasks/karaage.yml b/roles/karaage3.1.17/tasks/karaage.yml index c93567691305ebdc4efcc535e2ecd478e0286546..aae0d0b95f190ee3d70b53f186386a32ce09b828 100644 --- a/roles/karaage3.1.17/tasks/karaage.yml +++ b/roles/karaage3.1.17/tasks/karaage.yml @@ -51,22 +51,15 @@ - name: "Getting Karaage from Github" - git: repo="https://github.com/monash-merc/karaage.git" dest="/root/karaage3.1.7" {% if karaage_source_version is defined %}version="{{ karaage_source_version }}" {% endif %} force=yes + git: repo="https://github.com/monash-merc/karaage.git" dest="/root/karaage3.1.7" force=yes sudo: true -- - name: "Installing Karaage Dependencies" - pip: name={{ item }} - sudo: true - with_items: - - six - - slimit - - ply - - cython - - django-celery - - pyasn1 - - ldap3 -# - mod_wsgi +- name: Copy dependence file + copy: src=files/requirements.txt dest=/tmp/requirements.txt mode=644 + +- name: "Installing Karaage Dependencies" + pip: requirements=/tmp/requirements.txt + sudo: true - name: "Restrict Django version to 1.7.8" sudo: true diff --git 
a/roles/karaage3.1.17/tasks/prerequisitesDebian.yml b/roles/karaage3.1.17/tasks/prerequisitesDebian.yml index c0517a924ae7db1a5f9039d9d33a584b59fa6a62..2b21c1c82aa3cd80563fe33ec5cebd6cf3ad68bb 100644 --- a/roles/karaage3.1.17/tasks/prerequisitesDebian.yml +++ b/roles/karaage3.1.17/tasks/prerequisitesDebian.yml @@ -5,18 +5,6 @@ apt: name={{ item }} update_cache=yes with_items: - debian-keyring -- - apt_key: "url=http://code.vpac.org/debian/vpac-debian-key.gpg state=present" - name: "Installing the VPAC Debian Archive signing key" - sudo: true -- - apt_repository: "repo='deb http://code.vpac.org/debian jessie main' state=present" - name: "Adding VPAC repository in the source list" - sudo: true -- - apt_repository: "repo='deb-src http://code.vpac.org/debian jessie main' state=present" - name: "Adding VPAC source repository" - sudo: true - apt: update_cache=yes name: "Upgrading apt..." diff --git a/roles/ldapclient/tasks/configLdapClient.yml b/roles/ldapclient/tasks/configLdapClient.yml index 3e22db75422d4647405933e4379b34548da6aa24..5b667f2e4f348690bf4993b29539557f3d8907c4 100644 --- a/roles/ldapclient/tasks/configLdapClient.yml +++ b/roles/ldapclient/tasks/configLdapClient.yml @@ -4,41 +4,55 @@ with_items: - pam_ldap.conf - nsswitch.conf - sudo: true + become: true + become_user: root - name: "make basedir" - file: path="{{ ldapCaCertFile | dirname }}" state=directory owner=root - sudo: true + file: path="{{ ldapCaCertFile | dirname }}" state=directory owner=root follow=yes + become: true + become_user: root ignore_errors: true - name: "Copy the CA cert" copy: src={{ ldapCaCertSrc }} dest={{ ldapCaCertFile }} owner=root mode=644 - sudo: true + become: true + become_user: root when: ldapCaCertSrc is defined - name: "Template CA cert" template: src=ldapCaCert.j2 dest={{ ldapCaCertFile }} owner=root mode=644 - sudo: true + become: true + become_user: root when: ldapCaCertContents is defined - name: "Copy system auth" template: src=system-auth.j2 
dest=/etc/pam.d/system-auth - sudo: true + become: true + become_user: root - name: "Copy password auth" template: src=password-auth.j2 dest=/etc/pam.d/password-auth - sudo: true + become: true + become_user: root - name: "Add LDAP server IP address to /etc/hosts" lineinfile: dest=/etc/hosts line="{{ ldapServerHostIpLine }}" state=present insertafter=EOF - sudo: true + become: true + become_user: root when: ldapServerHostIpLine is defined - name: "Copy sssd.conf to ldap client" template: src=sssd.j2 dest=/etc/sssd/sssd.conf owner=root group=root mode=600 - sudo: true + become: true + become_user: root notify: restart sssd +- name: "Make the cache a tmpfs" + mount: name=/var/lib/sss/db/ src=tmpfs fstype=tmpfs opts='size=40m' state=mounted + become: true + become_user: root + - name: "start sssd" service: name=sssd state=started enabled=yes - sudo: true + become: true + become_user: root diff --git a/roles/ldapclient/templates/sssd.j2 b/roles/ldapclient/templates/sssd.j2 index 6b3d4728c6e86bc79e04f101655a545b20adae59..17de2c97c1a66d05cc994902d51ab1f08476c723 100644 --- a/roles/ldapclient/templates/sssd.j2 +++ b/roles/ldapclient/templates/sssd.j2 @@ -12,10 +12,10 @@ filter_groups = slurm, munge [domain/{{ ldapDomain }}] ldap_referrals = false cache_credentials = false -entry_cache_timeout=60480 -memcache_timeout=60480 +entry_cache_timeout=5400 +memcache_timeout=300 entry_cache_nowait_percentage=50 -enumerate = true +enumerate = false id_provider = ldap auth_provider = ldap diff --git a/roles/ldapserver/files/DB_CONFIG b/roles/ldapserver/files/DB_CONFIG new file mode 100644 index 0000000000000000000000000000000000000000..8fb2cb25da676c2ecb7a5e31c3f4e495ff1fba64 --- /dev/null +++ b/roles/ldapserver/files/DB_CONFIG @@ -0,0 +1,8 @@ +# one 0.25 GB cache +set_cachesize 0 268435456 1 + +# Transaction Log settings +set_lg_regionmax 262144 +set_lg_bsize 2097152 + +set_flags DB_LOG_AUTOREMOVE diff --git a/roles/ldapserver/tasks/main.yml b/roles/ldapserver/tasks/main.yml index 
2909be7b7685a8f01d454c3784c18ef66f6d3863..c007a12a85efccafa53b2520c6928f5f73e94953 100644 --- a/roles/ldapserver/tasks/main.yml +++ b/roles/ldapserver/tasks/main.yml @@ -102,6 +102,10 @@ sudo: true register: tlsConfigured +- name: copy db config + copy: src=files/DB_CONFIG dest=/var/lib/ldap/DB_CONFIG owner=ldap group=ldap mode=644 + sudo: true + - name: start ldap service: name=slapd state=restarted sudo: true diff --git a/roles/ldapserver/templates/default_ppolicy_ldif.j2 b/roles/ldapserver/templates/default_ppolicy_ldif.j2 index cc638a27e219461a3b033eee4701d53ca594bff3..7400bc39342ea7d6e65017207be6c37ef8c5b6ba 100644 --- a/roles/ldapserver/templates/default_ppolicy_ldif.j2 +++ b/roles/ldapserver/templates/default_ppolicy_ldif.j2 @@ -11,7 +11,7 @@ pwdGraceAuthNLimit: 0 pwdInHistory: 10 pwdLockout: TRUE pwdLockoutDuration: 3600 -pwdMaxAge: 7776000 +pwdMaxAge: 0 pwdMaxFailure: 5 pwdMinAge: 3600 pwdMinLength: 12 diff --git a/roles/ldapserver/vars/CentOS_6.7_x86_64.yml b/roles/ldapserver/vars/CentOS_6.7_x86_64.yml new file mode 100644 index 0000000000000000000000000000000000000000..ae41ae86c9d53c509d1464ef8d21b1b18b1f1267 --- /dev/null +++ b/roles/ldapserver/vars/CentOS_6.7_x86_64.yml @@ -0,0 +1,8 @@ +--- + system_packages: + - openldap-servers + - openldap-clients + - openssl + dbname: olcDatabase={2}bdb + ldapuser: ldap + ldapgroup: ldap diff --git a/roles/link_usr_local/tasks/main.yml b/roles/link_directories/tasks/main.yml similarity index 100% rename from roles/link_usr_local/tasks/main.yml rename to roles/link_directories/tasks/main.yml diff --git a/roles/lmod/tasks/main.yml b/roles/lmod/tasks/main.yml index 6c84ac239a44e85745de9f1bc10f5777e085bfb3..393a4f0058132827daa29be45ad5fe6b9df5f53f 100644 --- a/roles/lmod/tasks/main.yml +++ b/roles/lmod/tasks/main.yml @@ -1,28 +1,6 @@ --- - include_vars: "{{ ansible_os_family }}.yml" -- name: add epel on CentOS 7 - shell: rpm -iUvh http://dl.fedoraproject.org/pub/epel/7/x86_64/e/epel-release-7-5.noarch.rpm - sudo: true 
- when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" - ignore_errors: true - -#- name: add epel on CentOS 7 -# shell: yum -y update -# sudo: true -# when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" - -- name: Install epel-release - yum: name=epel-release-7-5.noarch state=present - sudo: true - when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" - -- name: Enable epel - command: yum-config-manager --enable epel - sudo: true - when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" - - - name: install lua yum: name={{ item }} state=installed with_items: diff --git a/roles/lustre-client/tasks/main.yml b/roles/lustre-client/tasks/main.yml index d02a6ba83d2d2238bc39b8bf09ecda2f00db08c6..e2691920b913a10ef3ae11cf123150d6a633aa69 100644 --- a/roles/lustre-client/tasks/main.yml +++ b/roles/lustre-client/tasks/main.yml @@ -29,19 +29,21 @@ when: ansible_os_family == "RedHat" # instructions to build these debs: +# instructions based on this https://linuxsysadm.wordpress.com/2014/10/10/lustre-2-6-on-debian-wheezy-clients/ # Instantiate an Ubuntu 14.04 instance # git clone git://git.hpdd.intel.com/fs/lustre-release.git # cd lustre-release # optionally git checkout 0754bc8f2623bea184111af216f7567608db35b6 <- I know this commit works on Ubuntu, but I had a lot of trouble with other branches # sh autogen.sh -# ./configure --enable-dist --disable-doc --disable-server --disable-dependency-tracking --with-o2ib=/var/lib/dkms/mlnx-ofed-kernel/3.1/build/ +# ./configure --enable-dist --disable-doc --disable-server --disable-dependency-tracking --with-o2ib=/var/lib/dkms/mlnx-ofed-kernel/3.1/build/ <- if you didn't checkout the commit above you may also need to --disable-manpages or similar +# make dist # mkdir BUILD # cd BUILD # ln -s ../lustre-2.7.62.tar.gz lustre-2.7.62.orig.tar.gz # tar zxvf ../lustre-2.7.62.tar.gz # cd lustre-2.7.62 # ./configure 
--disable-doc --disable-server --disable-dependency-tracking --with-o2ib=/var/lib/dkms/mlnx-ofed-kernel/3.1/build/ -# vi debian/changelog (the version number on the first line is incorrect) +# vi debian/changelog (the version number on the first line is incorrect) instead of 2.7.50-1 it should be 2.7.62 .... this may not be true depending on what commit you checked out # make debs # #- linux-patch-lustre_2.7.62-1_all.deb diff --git a/roles/mellanox_drivers/tasks/main.yml b/roles/mellanox_drivers/tasks/main.yml index f265031ff67c3d61a68e9f39698a7c2fd31afb0a..5a512635ceab81143cdf31ae1c28b80f977c8f13 100644 --- a/roles/mellanox_drivers/tasks/main.yml +++ b/roles/mellanox_drivers/tasks/main.yml @@ -33,11 +33,11 @@ # when: ansible_os_family == "RedHat" and drivers_installed|failed # #- name: waiting for server to come back -# local_action: wait_for host={{ ansible_ssh_host }} state=started port=22 delay=10 search_regex=OpenSSH +# local_action: wait_for host={{ ansible_host }} state=started port=22 delay=10 search_regex=OpenSSH # sudo: false # #- name: waiting for server to come back number 2 -# local_action: wait_for host={{ ansible_ssh_host }} state=started port=22 delay=10 search_regex=OpenSSH +# local_action: wait_for host={{ ansible_host }} state=started port=22 delay=10 search_regex=OpenSSH # sudo: false @@ -131,12 +131,12 @@ when: ansible_os_family=="Debian" and drivers_installed|failed - name: waiting for server to come back - local_action: wait_for host={{ ansible_ssh_host }} state=started port=22 delay=10 search_regex=OpenSSH + local_action: wait_for host={{ ansible_host }} state=started port=22 delay=10 search_regex=OpenSSH sudo: false when: drivers_installed|failed - name: waiting for server to come back 2 - local_action: wait_for host={{ ansible_ssh_host }} state=started port=22 delay=10 search_regex=OpenSSH + local_action: wait_for host={{ ansible_host }} state=started port=22 delay=10 search_regex=OpenSSH when: drivers_installed|failed - name: bring up 
interface diff --git a/roles/move_homedir/tasks/main.yml b/roles/move_homedir/tasks/main.yml index 9ed97c5d5d6dcd1098e8a734af3d8d926aa67260..e0f4863464e6aff0ea2b3c6a6898d1e0b3da4336 100644 --- a/roles/move_homedir/tasks/main.yml +++ b/roles/move_homedir/tasks/main.yml @@ -2,8 +2,8 @@ file: path=/local_home owner=root group=root state=directory sudo: true -- name: copy the {{ ansible_ssh_user }} home - shell: cp -ar /home/{{ ansible_ssh_user }} /local_home +- name: copy the {{ ansible_user }} home + shell: cp -ar /home/{{ ansible_user }} /local_home ignore_errors: true sudo: true register: home_copied @@ -13,8 +13,8 @@ lineinfile: args: dest: /etc/passwd - regexp: '{{ ansible_ssh_user }}:x:(.*):(.*):(.*):/home/{{ ansible_ssh_user }}:(.*)' - line: '{{ ansible_ssh_user }}:x:\1:\2:\3:/local_home/{{ ansible_ssh_user }}:\4' + regexp: '{{ ansible_user }}:x:(.*):(.*):(.*):/home/{{ ansible_user }}:(.*)' + line: '{{ ansible_user }}:x:\1:\2:\3:/local_home/{{ ansible_user }}:\4' backrefs: yes sudo: true register: edit diff --git a/roles/mysql/tasks/mysql_client.yml b/roles/mysql/tasks/mysql_client.yml index 05a1022d073868f6f2cd19bc21ae905871a6e64e..bffd83e36077055f0a1224999eded1ae17444fdc 100644 --- a/roles/mysql/tasks/mysql_client.yml +++ b/roles/mysql/tasks/mysql_client.yml @@ -1,12 +1,12 @@ --- - name: "Installing MySQL Debian" apt: name="{{ item }}" update_cache=yes cache_valid_time=3600 state=present - with_items: client_packages + with_items: "{{ client_packages }}" sudo: true when: ansible_os_family == "Debian" - name: Installing MySQL RedHat yum: name="{{ item }}" state=present - with_items: client_packages + with_items: "{{ client_packages }}" sudo: true when: ansible_os_family == "RedHat" diff --git a/roles/mysql/tasks/mysql_server.yml b/roles/mysql/tasks/mysql_server.yml index 1d2d054f80fcd5aaad722d01186ac9be03c4d358..68806bdadbed24eb4f3cb66f949556860cfdcecf 100644 --- a/roles/mysql/tasks/mysql_server.yml +++ b/roles/mysql/tasks/mysql_server.yml @@ -1,13 +1,13 @@ 
--- - name: "Installing MySQL Debian" apt: name="{{ item }}" update_cache=yes cache_valid_time=3600 state=present - with_items: server_packages + with_items: "{{ server_packages }}" sudo: true when: ansible_os_family == "Debian" - name: Installing MySQL RedHat yum: name={{ item }} - with_items: server_packages + with_items: "{{ server_packages }}" sudo: true when: ansible_os_family == "RedHat" diff --git a/roles/nagios_config/tasks/main.yml b/roles/nagios_config/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..c99e088a755d49015f9edf8580321ff41b8988d7 --- /dev/null +++ b/roles/nagios_config/tasks/main.yml @@ -0,0 +1,27 @@ +--- +- name: configure monitoring + template: src={{ item }}_nagios2.cfg.j2 dest=/etc/nagios3/conf.d/{{ item }}_nagios2.cfg + with_items: + - 'hostgroups' + - 'hosts' + - 'commands' + - 'services' + - 'contactgroup' + - 'contacts' + sudo: true + +- name: remove unwanted configure files + file: path={{ item }}_nagios2.cfg.j2 state=absent + with_items: + - 'localhost' + - 'extinfo' + sudo: true + +- name: change cgi config + template: src=cgi.cfg.j2 dest=/etc/nagios3/cgi.cfg + sudo: true + +- name: nagios restart + service: name=nagios3 state=restarted + sudo: true + diff --git a/roles/nagios_config/templates/cgi.cfg.j2 b/roles/nagios_config/templates/cgi.cfg.j2 new file mode 100644 index 0000000000000000000000000000000000000000..a3f3a07cd440ba32de38771fe0350102443b6618 --- /dev/null +++ b/roles/nagios_config/templates/cgi.cfg.j2 @@ -0,0 +1,377 @@ +################################################################# +# +# CGI.CFG - Sample CGI Configuration File for Nagios +# +################################################################# + + +# MAIN CONFIGURATION FILE +# This tells the CGIs where to find your main configuration file. +# The CGIs will read the main and host config files for any other +# data they might need. 
+ +main_config_file=/etc/nagios3/nagios.cfg + + + +# PHYSICAL HTML PATH +# This is the path where the HTML files for Nagios reside. This +# value is used to locate the logo images needed by the statusmap +# and statuswrl CGIs. + +physical_html_path=/usr/share/nagios3/htdocs + + + +# URL HTML PATH +# This is the path portion of the URL that corresponds to the +# physical location of the Nagios HTML files (as defined above). +# This value is used by the CGIs to locate the online documentation +# and graphics. If you access the Nagios pages with an URL like +# http://www.myhost.com/nagios, this value should be '/nagios' +# (without the quotes). + +url_html_path=/nagios3 + + + +# CONTEXT-SENSITIVE HELP +# This option determines whether or not a context-sensitive +# help icon will be displayed for most of the CGIs. +# Values: 0 = disables context-sensitive help +# 1 = enables context-sensitive help + +show_context_help=1 + + + +# PENDING STATES OPTION +# This option determines what states should be displayed in the web +# interface for hosts/services that have not yet been checked. +# Values: 0 = leave hosts/services that have not been checked yet in their original state +# 1 = mark hosts/services that have not been checked yet as PENDING + +use_pending_states=1 + +# NAGIOS PROCESS CHECK COMMAND +# This is the full path and filename of the program used to check +# the status of the Nagios process. It is used only by the CGIs +# and is completely optional. However, if you don't use it, you'll +# see warning messages in the CGIs about the Nagios process +# not running and you won't be able to execute any commands from +# the web interface. The program should follow the same rules +# as plugins; the return codes are the same as for the plugins, +# it should have timeout protection, it should output something +# to STDIO, etc. 
+# +# Note: The command line for the check_nagios plugin below may +# have to be tweaked a bit, as different versions of the plugin +# use different command line arguments/syntaxes. + +nagios_check_command=/usr/lib/nagios/plugins/check_nagios /var/cache/nagios3/status.dat 5 '/usr/sbin/nagios3' + + +# AUTHENTICATION USAGE +# This option controls whether or not the CGIs will use any +# authentication when displaying host and service information, as +# well as committing commands to Nagios for processing. +# +# Read the HTML documentation to learn how the authorization works! +# +# NOTE: It is a really *bad* idea to disable authorization, unless +# you plan on removing the command CGI (cmd.cgi)! Failure to do +# so will leave you wide open to kiddies messing with Nagios and +# possibly hitting you with a denial of service attack by filling up +# your drive by continuously writing to your command file! +# +# Setting this value to 0 will cause the CGIs to *not* use +# authentication (bad idea), while any other value will make them +# use the authentication functions (the default). + +use_authentication=1 + + + + +# x509 CERT AUTHENTICATION +# When enabled, this option allows you to use x509 cert (SSL) +# authentication in the CGIs. This is an advanced option and should +# not be enabled unless you know what you're doing. + +use_ssl_authentication=0 + + + + +# DEFAULT USER +# Setting this variable will define a default user name that can +# access pages without authentication. This allows people within a +# secure domain (i.e., behind a firewall) to see the current status +# without authenticating. You may want to use this to avoid basic +# authentication if you are not using a secure server since basic +# authentication transmits passwords in the clear. +# +# Important: Do not define a default username unless you are +# running a secure web server and are sure that everyone who has +# access to the CGIs has been authenticated in some manner! 
If you +# define this variable, anyone who has not authenticated to the web +# server will inherit all rights you assign to this user! + +#default_user_name=guest + + + +# SYSTEM/PROCESS INFORMATION ACCESS +# This option is a comma-delimited list of all usernames that +# have access to viewing the Nagios process information as +# provided by the Extended Information CGI (extinfo.cgi). By +# default, *no one* has access to this unless you choose to +# not use authorization. You may use an asterisk (*) to +# authorize any user who has authenticated to the web server. + +authorized_for_system_information=nagiosadmin,nagios + + + +# CONFIGURATION INFORMATION ACCESS +# This option is a comma-delimited list of all usernames that +# can view ALL configuration information (hosts, commands, etc). +# By default, users can only view configuration information +# for the hosts and services they are contacts for. You may use +# an asterisk (*) to authorize any user who has authenticated +# to the web server. + +authorized_for_configuration_information=nagiosadmin,nagios + + + +# SYSTEM/PROCESS COMMAND ACCESS +# This option is a comma-delimited list of all usernames that +# can issue shutdown and restart commands to Nagios via the +# command CGI (cmd.cgi). Users in this list can also change +# the program mode to active or standby. By default, *no one* +# has access to this unless you choose to not use authorization. +# You may use an asterisk (*) to authorize any user who has +# authenticated to the web server. + +authorized_for_system_commands=nagiosadmin,nagios + + + +# GLOBAL HOST/SERVICE VIEW ACCESS +# These two options are comma-delimited lists of all usernames that +# can view information for all hosts and services that are being +# monitored. By default, users can only view information +# for hosts or services that they are contacts for (unless you +# choose to not use authorization). 
You may use an asterisk (*) +# to authorize any user who has authenticated to the web server. + + +authorized_for_all_services=nagiosadmin,nagios +authorized_for_all_hosts=nagiosadmin,nagios + + + +# GLOBAL HOST/SERVICE COMMAND ACCESS +# These two options are comma-delimited lists of all usernames that +# can issue host or service related commands via the command +# CGI (cmd.cgi) for all hosts and services that are being monitored. +# By default, users can only issue commands for hosts or services +# that they are contacts for (unless you choose to not use +# authorization). You may use an asterisk (*) to authorize any +# user who has authenticated to the web server. + +authorized_for_all_service_commands=nagiosadmin,nagios +authorized_for_all_host_commands=nagiosadmin,nagios + + + +# READ-ONLY USERS +# A comma-delimited list of usernames that have read-only rights in +# the CGIs. This will block any service or host commands normally shown +# on the extinfo CGI pages. It will also block comments from being shown +# to read-only users. + +#authorized_for_read_only=user1,user2 + + + + +# STATUSMAP BACKGROUND IMAGE +# This option allows you to specify an image to be used as a +# background in the statusmap CGI. It is assumed that the image +# resides in the HTML images path (i.e. /usr/local/nagios/share/images). +# This path is automatically determined by appending "/images" +# to the path specified by the 'physical_html_path' directive. +# Note: The image file may be in GIF, PNG, JPEG, or GD2 format. +# However, I recommend that you convert your image to GD2 format +# (uncompressed), as this will cause less CPU load when the CGI +# generates the image. 
+ +#statusmap_background_image=smbackground.gd2 + + + + +# STATUSMAP TRANSPARENCY INDEX COLOR +# These options set the r,g,b values of the background color used by the statusmap CGI, +# so normal browsers that can't show real png transparency set the desired color as +# a background color instead (to make it look pretty). +# Defaults to white: (R,G,B) = (255,255,255). + +#color_transparency_index_r=255 +#color_transparency_index_g=255 +#color_transparency_index_b=255 + + + + +# DEFAULT STATUSMAP LAYOUT METHOD +# This option allows you to specify the default layout method +# the statusmap CGI should use for drawing hosts. If you do +# not use this option, the default is to use user-defined +# coordinates. Valid options are as follows: +# 0 = User-defined coordinates +# 1 = Depth layers +# 2 = Collapsed tree +# 3 = Balanced tree +# 4 = Circular +# 5 = Circular (Marked Up) + +default_statusmap_layout=5 + + + +# DEFAULT STATUSWRL LAYOUT METHOD +# This option allows you to specify the default layout method +# the statuswrl (VRML) CGI should use for drawing hosts. If you +# do not use this option, the default is to use user-defined +# coordinates. Valid options are as follows: +# 0 = User-defined coordinates +# 2 = Collapsed tree +# 3 = Balanced tree +# 4 = Circular + +default_statuswrl_layout=4 + + + +# STATUSWRL INCLUDE +# This option allows you to include your own objects in the +# generated VRML world. It is assumed that the file +# resides in the HTML path (i.e. /usr/local/nagios/share). + +#statuswrl_include=myworld.wrl + + + +# PING SYNTAX +# This option determines what syntax should be used when +# attempting to ping a host from the WAP interface (using +# the statuswml CGI). You must include the full path to +# the ping binary, along with all required options. The +# $HOSTADDRESS$ macro is substituted with the address of +# the host before the command is executed. 
+# Please note that the syntax for the ping binary is +# notorious for being different on virtually every *NIX +# OS and distribution, so you may have to tweak this to +# work on your system. + +ping_syntax=/bin/ping -n -U -c 5 $HOSTADDRESS$ + + + +# REFRESH RATE +# This option allows you to specify the refresh rate in seconds +# of various CGIs (status, statusmap, extinfo, and outages). + +refresh_rate=90 + +# DEFAULT PAGE LIMIT +# This option allows you to specify the default number of results +# displayed on the status.cgi. This number can be adjusted from +# within the UI after the initial page load. Setting this to 0 +# will show all results. + +result_limit=100 + + +# ESCAPE HTML TAGS +# This option determines whether HTML tags in host and service +# status output are escaped in the web interface. If enabled, +# your plugin output will not be able to contain clickable links. + +escape_html_tags=1 + + + + +# SOUND OPTIONS +# These options allow you to specify an optional audio file +# that should be played in your browser window when there are +# problems on the network. The audio files are used only in +# the status CGI. Only the sound for the most critical problem +# will be played. Order of importance (higher to lower) is as +# follows: unreachable hosts, down hosts, critical services, +# warning services, and unknown services. If there are no +# visible problems, the sound file optionally specified by +# the 'normal_sound' variable will be played. +# +# +# <varname>=<sound_file> +# +# Note: All audio files must be placed in the /media subdirectory +# under the HTML path (i.e. /usr/local/nagios/share/media/). + +#host_unreachable_sound=hostdown.wav +#host_down_sound=hostdown.wav +#service_critical_sound=critical.wav +#service_warning_sound=warning.wav +#service_unknown_sound=warning.wav +#normal_sound=noproblem.wav + + + +# URL TARGET FRAMES +# These options determine the target frames in which notes and +# action URLs will open. 
+ +action_url_target=_blank +notes_url_target=_blank + + + + +# LOCK AUTHOR NAMES OPTION +# This option determines whether users can change the author name +# when submitting comments, scheduling downtime. If disabled, the +# author names will be locked into their contact name, as defined in Nagios. +# Values: 0 = allow editing author names +# 1 = lock author names (disallow editing) + +lock_author_names=1 + + + + +# SPLUNK INTEGRATION OPTIONS +# These options allow you to enable integration with Splunk +# in the web interface. If enabled, you'll be presented with +# "Splunk It" links in various places in the CGIs (log file, +# alert history, host/service detail, etc). Useful if you're +# trying to research why a particular problem occurred. +# For more information on Splunk, visit http://www.splunk.com/ + +# This option determines whether the Splunk integration is enabled +# Values: 0 = disable Splunk integration +# 1 = enable Splunk integration + +#enable_splunk_integration=1 + + +# This option should be the URL used to access your instance of Splunk + +#splunk_url=http://127.0.0.1:8000/ + + + diff --git a/roles/nagios_config/templates/commands_nagios2.cfg.j2 b/roles/nagios_config/templates/commands_nagios2.cfg.j2 new file mode 100644 index 0000000000000000000000000000000000000000..385fa645e3d13ed4d32a6a47cc094017dc04c124 --- /dev/null +++ b/roles/nagios_config/templates/commands_nagios2.cfg.j2 @@ -0,0 +1,8 @@ +{% for service in nagios_services %} +{% if service.script %} +define command { + command_name {{ service.name }} + command_line /usr/lib/nagios/plugins/check_by_ssh -H $HOSTADDRESS$ -o StrictHostKeyChecking=no -C "{{ nagios_home }}/scripts/{{ service.script }}" -E +} +{% endif %} +{% endfor %} diff --git a/roles/nagios_config/templates/contactgroup_nagios2.cfg.j2 b/roles/nagios_config/templates/contactgroup_nagios2.cfg.j2 new file mode 100644 index 0000000000000000000000000000000000000000..8e671a0956e5df1ddd6a694a5350ef711ede14bb --- /dev/null +++ 
b/roles/nagios_config/templates/contactgroup_nagios2.cfg.j2 @@ -0,0 +1,9 @@ +define contactgroup { + contactgroup_name admins ; Group name used in configuration + alias Administrators ; Alias for group displayed on webpage + members admins +; contactgroup_members ; Other contact groups to be notified + } + + + diff --git a/roles/nagios_config/templates/contacts_nagios2.cfg.j2 b/roles/nagios_config/templates/contacts_nagios2.cfg.j2 new file mode 100644 index 0000000000000000000000000000000000000000..1fd6d8509fd0d65546b199ea14f686fefcd2d1ed --- /dev/null +++ b/roles/nagios_config/templates/contacts_nagios2.cfg.j2 @@ -0,0 +1,14 @@ +define contact{ + contact_name admins + alias Admins + service_notification_period 24x7 + host_notification_period 24x7 + service_notification_options w,u,c,r + host_notification_options d,r + service_notifications_enabled 1 + host_notifications_enabled 1 + service_notification_commands notify-service-by-email + host_notification_commands notify-host-by-email + email hpc-alerts-warning-l@monash.edu + } + diff --git a/roles/nagios_config/templates/hostgroups_nagios2.cfg.j2 b/roles/nagios_config/templates/hostgroups_nagios2.cfg.j2 new file mode 100644 index 0000000000000000000000000000000000000000..6164bc63307f5de0ea0b51f56de9db2cf8fb3822 --- /dev/null +++ b/roles/nagios_config/templates/hostgroups_nagios2.cfg.j2 @@ -0,0 +1,15 @@ +# Some generic hostgroup definitions + +{% for group in groups %} +{% if group != "ungrouped" %} +{% set nodelist = [] %} +{% for node in groups[group] %} +{% if nodelist.append(node) %} +{% endif %} +{% endfor %} +define hostgroup { + hostgroup_name {{ group }} + members {{ nodelist|unique|join(',') }} +} +{% endif %} +{% endfor %} diff --git a/roles/nagios_server/templates/hosts_nagios2.cfg.j2 b/roles/nagios_config/templates/hosts_nagios2.cfg.j2 similarity index 100% rename from roles/nagios_server/templates/hosts_nagios2.cfg.j2 rename to roles/nagios_config/templates/hosts_nagios2.cfg.j2 diff --git 
a/roles/nagios_server/templates/services_nagios2.cfg.j2 b/roles/nagios_config/templates/services_nagios2.cfg.j2 similarity index 77% rename from roles/nagios_server/templates/services_nagios2.cfg.j2 rename to roles/nagios_config/templates/services_nagios2.cfg.j2 index d5cbbca3b22bbd77003de659af549c130b87c71a..2eec8a5eb5b02b7bfec039b0721c6e646237ca2a 100644 --- a/roles/nagios_server/templates/services_nagios2.cfg.j2 +++ b/roles/nagios_config/templates/services_nagios2.cfg.j2 @@ -1,8 +1,9 @@ {% for service in nagios_services %} define service { + name {{ service.name }} service_description {{ service.description }} hostgroup_name {{ service.groups|join(',') }} - check_command {{ service.command }} + check_command {{ service.name }} use generic-service notification_interval 0 } diff --git a/roles/nagios_monitored/files/scripts/check_apache2 b/roles/nagios_monitored/files/scripts/check_apache2 new file mode 100755 index 0000000000000000000000000000000000000000..c9a50f406e7f390939ee14d7b01d41534dff776a --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_apache2 @@ -0,0 +1,32 @@ +#!/usr/bin/python +import sys, os, pwd +import getopt +import commands +import subprocess +import platform + +STATE_OK=0 +STATE_WARNING=1 + +dist = platform.dist() +if dist[0] == "debian": + command = "/usr/sbin/service apache2 status" +elif dist[0] == "centos": + command = "/sbin/service httpd status" +else: + command = "service apache2 status" + +check_apache=subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) +apache_status=check_apache.communicate()[0] + +if "run" in apache_status: + print "Apache is Running" + sys.exit(STATE_OK) +else: + print "Apache is NOT Running !!" 
+ sys.exit(STATE_WARNING) + +sys.exit(STATE_OK) + + + diff --git a/roles/nagios_monitored/files/scripts/check_blocked_beamline_jobs b/roles/nagios_monitored/files/scripts/check_blocked_beamline_jobs new file mode 100755 index 0000000000000000000000000000000000000000..43a63de5ae66a9fe9b9d9012cf350c9d0782107b --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_blocked_beamline_jobs @@ -0,0 +1,44 @@ +#!/usr/bin/python +import sys, os, pwd +import getopt +import commands +import subprocess +import datetime + +STATE_OK=0 +STATE_WARNING=1 +STATE_CRITICAL=2 +STATE=STATE_OK + +# get info about reservation +reservationname="beamline" +reservation_cmd=["scontrol","show","--oneliner","reservation=" + reservationname] +p = subprocess.Popen(reservation_cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) +line = p.stdout.readline() +reservation_dict = dict( (n,v) for n,v in (a.split('=') for a in line.split() ) ) +retval = p.wait() + +# count free nodes +process=subprocess.Popen("/usr/local/slurm/latest/bin/scontrol show node=%s | grep -c \" State=RESERVED\"" % reservation_dict['Nodes'], shell=True, stdout=subprocess.PIPE) +free_nodes=process.communicate()[0] + +if int(free_nodes) < 1: + # not looking good, no nodes left in beamline reservation - let's check for pending jobs to decide on the warning level + STATE=STATE_WARNING + process=subprocess.Popen("/usr/local/slurm/latest/bin/squeue --noheader --format='%T %R' --long --reservation=beamline | grep PENDING", shell=True, stdout=subprocess.PIPE) + pending_list=process.communicate()[0] + if "PENDING" in pending_list: + print "Critical: we have no free nodes for beamline reservation and jobs trying to run!" 
+ STATE=STATE_CRITICAL + # Lets provide some more info for the readers benefit + process=subprocess.Popen("/usr/local/slurm/latest/bin/squeue --format=\"%.18i %.9P %.8j %.16u %.10a %.8T %.10M %.9l %.6D %R\" --reservation=beamline", shell=True, stdout=subprocess.PIPE) + squeue_list=process.communicate()[0] + print "Beamline reservation has %s of %s nodes free" % (int(free_nodes),int(reservation_dict['NodeCnt'])) + print "squeue --long --reservation=beamline" + print squeue_list + sys.exit(STATE) + +# If we made it hear we are happy +print "Beamline reservation has %s of %s nodes free" % (int(free_nodes),int(reservation_dict['NodeCnt'])) +sys.exit(STATE) + diff --git a/roles/nagios_monitored/files/scripts/check_blocked_compute_jobs b/roles/nagios_monitored/files/scripts/check_blocked_compute_jobs new file mode 100755 index 0000000000000000000000000000000000000000..d1c91793c1000c449892efc9d7acb1173f736159 --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_blocked_compute_jobs @@ -0,0 +1,71 @@ +#!/usr/bin/python +import sys, os, pwd +import getopt +import commands +import subprocess +import datetime + +STATE_OK=0 +state=STATE_OK +STATE_WARNING=1 +STATE_CRITICAL=2 +WARNING_THRESHOLD_SEC=7*24*60*60 +# WARNING_THRESHOLD_SEC=6*60*60 +CRITICAL_THRESHOLD_SEC=WARNING_THRESHOLD_SEC*2 + +check_pending_com_job=subprocess.Popen("squeue --noheader --states=PENDING --Format=SUBMITTIME", shell=True, stdout=subprocess.PIPE) +pending_com_job_id_list=check_pending_com_job.communicate()[0] + +check_recent_time=subprocess.Popen("date +'%Y-%m-%dT%H:%M:%S'", shell=True, stdout=subprocess.PIPE) +b = check_recent_time.communicate()[0].strip() +# print "date date" +# print "%s" % b +now_time = datetime.datetime.strptime(b, '%Y-%m-%dT%H:%M:%S') + +for submittime in pending_com_job_id_list.splitlines(): + # print job_ID + # check_submit_time=subprocess.Popen("/usr/local/slurm/latest/bin/squeue --job %s --noheader --Format=submittime" % job_ID , shell=True, stdout=subprocess.PIPE) + 
# a = check_submit_time.communicate()[0].strip() + # print a + # print "-----------------------------------------------------------" + # print "slurm submit time" + # print "-%s-" % submittime + # check_recent_time=subprocess.Popen("date +'%Y-%m-%dT%H:%M:%S'", shell=True, stdout=subprocess.PIPE) + # b = check_recent_time.communicate()[0].strip() + # print "date date" + # print "%s" % b + + submit_time = datetime.datetime.strptime(submittime, '%Y-%m-%dT%H:%M:%S ') + # now_time = datetime.datetime.strptime(b, '%Y-%m-%dT%H:%M:%S') + + # print "python submit_time" + # print submit_time + # print "python now_time" + # print now_time + time_elapsed=(now_time - submit_time).seconds + # print "time_elapsed" + # print "%s" % time_elapsed + + if time_elapsed>=CRITICAL_THRESHOLD_SEC: + message="Critical: Slurm Job Pending over " + str(CRITICAL_THRESHOLD_SEC/60/60/24) + " days" + state=STATE_CRITICAL + + if time_elapsed>=WARNING_THRESHOLD_SEC and state != STATE_CRITICAL: + message="Warning: Slurm Job Pending over " + str(WARNING_THRESHOLD_SEC/60/60/24) + " days" + state=STATE_WARNING + + +if state == STATE_OK: + print "OK: No Slurm Jobs BLOCKED over %s days" % int(WARNING_THRESHOLD_SEC/60/60/24) +else: + print message + check_pending_com_job=subprocess.Popen("/usr/local/slurm/latest/bin/squeue --format='%.18i %.9P %.8u %.8T %S %V %R' --states=PENDING", shell=True, stdout=subprocess.PIPE) + squeue_result=check_pending_com_job.communicate()[0] + print squeue_result + sys.exit(STATE_CRITICAL) + +sys.exit(state) + + + + diff --git a/roles/nagios_monitored/files/scripts/check_blocked_vis_jobs b/roles/nagios_monitored/files/scripts/check_blocked_vis_jobs new file mode 100755 index 0000000000000000000000000000000000000000..8614e13418de4f043de121a961ae59b17bbf3370 --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_blocked_vis_jobs @@ -0,0 +1,50 @@ +#!/usr/bin/python +import sys, os, pwd +import getopt +import commands +import subprocess +import datetime + +STATE_OK=0 
+STATE_WARNING=1 +STATE_CRITICAL=2 + +check_pending_vis_job=subprocess.Popen("squeue -p m2-vis-c6,m1-vis-c6 --format='%.18i %.9P %.8j %.8u %.2t %.19S %.6D %20Y %R' --states=PENDING | grep -i vis | awk {'print $1'}", shell=True, stdout=subprocess.PIPE) + +pending_vis_job_id_list=check_pending_vis_job.communicate()[0] +for job_ID in pending_vis_job_id_list.splitlines(): + check_submit_time=subprocess.Popen("/usr/local/slurm/latest/bin/squeue --Format=jobID,submittime | grep %s | awk {print'$2'}" % job_ID, shell=True, stdout=subprocess.PIPE) + a = check_submit_time.communicate()[0].strip() +# print "%s" % a + + check_recent_time=subprocess.Popen("date +'%Y-%m-%dT%H:%M:%S'", shell=True, stdout=subprocess.PIPE) + b = check_recent_time.communicate()[0].strip() +# print "%s" % b + + submit_time = datetime.datetime.strptime(a, '%Y-%m-%dT%H:%M:%S') + now_time = datetime.datetime.strptime(b, '%Y-%m-%dT%H:%M:%S') + + time_elapsed=(now_time - submit_time).seconds + if time_elapsed>=900: + print "CRTICAL: Slurm Vis Job Pending over 15 mins" + pending_listp = subprocess.Popen("/usr/local/slurm/latest/bin/squeue -p m2-vis-c6,m1-vis-c6 --states=PENDING", shell=True, stdout=subprocess.PIPE) + pending_list = pending_listp.communicate()[0] + print "\n\n/usr/local/slurm/latest/bin/squeue -p m2-vis-c6,m1-vis-c6 --states=PENDING\n" + print pending_list + sys.exit(STATE_CRITICAL) + +# print "%s" % time_elapsed + if time_elapsed>=300: + print "WARNING: Slurm Vis Job Pending over 5 mins" + pending_listp = subprocess.Popen("/usr/local/slurm/latest/bin/squeue -p m2-vis-c6,m1-vis-c6 --states=PENDING", shell=True, stdout=subprocess.PIPE) + pending_list = pending_listp.communicate()[0] + print "\n\n/usr/local/slurm/latest/bin/squeue -p m2-vis-c6,m1-vis-c6 --states=PENDING\n" + print pending_list + sys.exit(STATE_WARNING) + +print "NO Slurm Vis Jobs BLOCKED over 15 mins" +sys.exit(STATE_OK) + + + + diff --git a/roles/nagios_monitored/files/scripts/check_disk 
b/roles/nagios_monitored/files/scripts/check_disk new file mode 100755 index 0000000000000000000000000000000000000000..0c573231bfc07670bca55e9cb0d4844e0b50d6e6 --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_disk @@ -0,0 +1,35 @@ +#!/bin/bash + +STATE_OK=0 +STATE_WARNING=1 +STATE_CRITICAL=2 +STATE_UNKNOWN=3 +STATE_DEPENDENT=4 + +HOST_NAME=$(hostname -f) + +function check_disk() { + local space="$1" + + local used=$(df -h ${space} | grep "^/" | awk '{print $5}' | sed -e 's/%//') + local critical_threshhold=$((90)) + local warning_threshhold=$((80)) + + if (( ${used} > ${critical_threshhold} )); then + echo "CRITICAL: ${HOST_NAME} is running out of filesystem space" + exit "${STATE_CRITICAL}" + fi + if (( ${used} > ${warning_threshhold} )); then + echo "WARNING: ${HOST_NAME} is running out of filesystem space" + exit "${STATE_WARNING}" + fi +} + +folder="/" + +check_disk "/" +check_disk "/tmp" + +echo "Check ${HOST_NAME} disk space Ok" +exit "$STATE_OK" + diff --git a/roles/nagios_monitored/files/scripts/check_ldap b/roles/nagios_monitored/files/scripts/check_ldap new file mode 100755 index 0000000000000000000000000000000000000000..7ac7f4fcd47161bbb524bbef4043da32dadba387 --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_ldap @@ -0,0 +1,32 @@ +#!/usr/bin/python +import sys, os, pwd +import getopt +import commands +import subprocess +import platform + +STATE_OK=0 +STATE_WARNING=1 + +dist = platform.dist() +if dist[0] == "debian": + command = "/usr/sbin/service slapd status" +elif dist[0] == "centos": + command = "/sbin/service slapd status" +else: + command = "service slapd status" + +check_ldap=subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) +ldap_status=check_ldap.communicate()[0] + +if "run" in ldap_status: + print "Ldap is Running" + sys.exit(STATE_OK) +else: + print "Ldap is NOT Running !!" 
+ sys.exit(STATE_WARNING) + +sys.exit(STATE_OK) + + + diff --git a/roles/nagios_monitored/files/scripts/check_ldap_client b/roles/nagios_monitored/files/scripts/check_ldap_client new file mode 100755 index 0000000000000000000000000000000000000000..e0a73c506d3d4b9aa92a6715634a0985b8c78517 --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_ldap_client @@ -0,0 +1,24 @@ +#!/bin/bash + +STATE_OK=0 +STATE_WARNING=1 +STATE_CRITICAL=2 +STATE_UNKNOWN=3 +STATE_DEPENDENT=4 + + +function main () { + id chines + local r=$? + if [ ${r} -ne 0 ]; then + echo "Broken LDAP authentication" + return $STATE_CRITICAL + fi + + echo "LDAP authentication OK" + return $STATE_OK + +} + +main +exit $? diff --git a/roles/nagios_monitored/files/scripts/check_load b/roles/nagios_monitored/files/scripts/check_load deleted file mode 100755 index 8e6966c4e309874444dbe4cca2f5e783e107fc88..0000000000000000000000000000000000000000 Binary files a/roles/nagios_monitored/files/scripts/check_load and /dev/null differ diff --git a/roles/nagios_monitored/files/scripts/check_localfs.sh b/roles/nagios_monitored/files/scripts/check_localfs.sh new file mode 100755 index 0000000000000000000000000000000000000000..20043ee4530895c62c35b96d23d5b7ef31b4e4ee --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_localfs.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +STATE_OK=0 +STATE_WARNING=1 +STATE_CRITICAL=2 +STATE_UNKNOWN=3 +STATE_DEPENDENT=4 + + +main () +{ + tmpfile=$( mktemp -p /tmp ) + r=$? + if [ $r -ne 0 ] + then + return $STATE_CRITICAL + else + rm $tmpfile + fi + + return $STATE_OK + +} + +main +exit $? 
diff --git a/roles/nagios_monitored/files/scripts/check_munge b/roles/nagios_monitored/files/scripts/check_munge index bf7c01cf1c5be003604a09f77e51d99d7bc7344c..5e94412c7cdbb369f81c7ee6643aa74d2a436059 100755 --- a/roles/nagios_monitored/files/scripts/check_munge +++ b/roles/nagios_monitored/files/scripts/check_munge @@ -7,7 +7,7 @@ import subprocess STATE_OK=0 STATE_WARNING=1 -check_munge=subprocess.Popen("/usr/sbin/service munge status", shell=True, stdout=subprocess.PIPE) +check_munge=subprocess.Popen("service munge status", shell=True, stdout=subprocess.PIPE) munge_status=check_munge.communicate()[0] if "run" in munge_status: diff --git a/roles/nagios_monitored/files/scripts/check_mysql b/roles/nagios_monitored/files/scripts/check_mysql new file mode 100755 index 0000000000000000000000000000000000000000..f2b64eb6757380fad60b8bf53e1bab5abcb34752 --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_mysql @@ -0,0 +1,32 @@ +#!/usr/bin/python +import sys, os, pwd +import getopt +import commands +import subprocess +import platform + +STATE_OK=0 +STATE_WARNING=1 + +dist = platform.dist() +if dist[0] == "debian": + command = "/usr/sbin/service mysql status" +elif dist[0] == "centos": + command = "/sbin/service mysqld status" +else: + command = "service mysql status" + +check_mysql=subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) +mysql_status=check_mysql.communicate()[0] + +if "run" in mysql_status: + print "Mysql is Running" + sys.exit(STATE_OK) +else: + print "Mysql is NOT Running !!" 
+ sys.exit(STATE_WARNING) + +sys.exit(STATE_OK) + + + diff --git a/roles/nagios_monitored/files/scripts/check_ntp b/roles/nagios_monitored/files/scripts/check_ntp new file mode 100755 index 0000000000000000000000000000000000000000..6124b60650cde49593aef13653e583789aa2e7ae --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_ntp @@ -0,0 +1,21 @@ +#!/usr/bin/python +import sys, os, pwd +import getopt +import commands +import subprocess + +STATE_OK=0 +STATE_WARNING=1 + +check_munge=subprocess.Popen("service ntpd status", shell=True, stdout=subprocess.PIPE) +munge_status=check_munge.communicate()[0] + +if "run" in munge_status: + print "NTPD service is Running" + sys.exit(STATE_OK) +else: + print "NTPD service is NOT Running !!" + sys.exit(STATE_WARNING) + +sys.exit(STATE_OK) + diff --git a/roles/nagios_monitored/files/scripts/check_slurm b/roles/nagios_monitored/files/scripts/check_slurm new file mode 100755 index 0000000000000000000000000000000000000000..0ec5a59f961614ad23f6ec0a880b36875049ff76 --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_slurm @@ -0,0 +1,23 @@ +#!/usr/bin/python +import sys, os, pwd +import getopt +import commands +import subprocess + +STATE_OK=0 +STATE_WARNING=1 + +check_slurm=subprocess.Popen("service slurm status", shell=True, stdout=subprocess.PIPE) +slurm_status=check_slurm.communicate()[0] + +if "run" in slurm_status: + print "Slurm is Running" + sys.exit(STATE_OK) +else: + print "Slurm is NOT Running !!" 
+ sys.exit(STATE_WARNING) + +sys.exit(STATE_OK) + + + diff --git a/roles/nagios_monitored/files/scripts/check_slurmdbd b/roles/nagios_monitored/files/scripts/check_slurmdbd new file mode 100755 index 0000000000000000000000000000000000000000..fbb08b96c6b7eae8d789c14ad638171addea8b4a --- /dev/null +++ b/roles/nagios_monitored/files/scripts/check_slurmdbd @@ -0,0 +1,21 @@ +#!/usr/bin/python +import sys, os, pwd +import getopt +import commands +import subprocess + +STATE_OK=0 +STATE_WARNING=1 + +check_slurmdbd=subprocess.Popen("service slurmdbd status", shell=True, stdout=subprocess.PIPE) +slurmdbd_status=check_slurmdbd.communicate()[0] + +if "run" in slurmdbd_status: + print "Slurmdbd is Running" + sys.exit(STATE_OK) +else: + print "Slurmdbd is NOT Running !!" + sys.exit(STATE_WARNING) + +sys.exit(STATE_OK) + diff --git a/roles/nagios_monitored/tasks/main.yml b/roles/nagios_monitored/tasks/main.yml index 8a1e5ce9b0f0273344b7e8b3331d8ac4cb2b4a1b..5ff9185dc4c0ae7eae41056e439bc1da9d573d95 100644 --- a/roles/nagios_monitored/tasks/main.yml +++ b/roles/nagios_monitored/tasks/main.yml @@ -1,19 +1,22 @@ --- - name: create nagios user - user: name=nagios system=yes createhome=yes home=/var/lib/nagios shell=/bin/bash + user: name=nagios system=yes createhome=yes home={{ nagios_home }} shell=/bin/bash + sudo: true + +- name: create ssh directory + file: path={{ nagios_home }}/.ssh state=directory owner=nagios mode=700 sudo: true - name: authorize_key - authorized_key: user=nagios key="{{ monitor_pubkey }}" + authorized_key: user=nagios key="{{ lookup('file', 'files/nagios_public_key') }}" path="{{ nagios_home }}/.ssh/authorized_keys" sudo: true - name: make scripts directory - file: path=/var/lib/nagios/scripts state=directory owner=nagios mode=755 + file: path={{ nagios_home }}/scripts state=directory owner=nagios mode=755 sudo: true - name: install monitor scripts - copy: dest=/var/lib/nagios/scripts/{{ item }} src=scripts/{{ item }} mode=755 - with_items: - - check_load - - 
check_munge + copy: dest={{ nagios_home }}/scripts/{{ item }} src=scripts/{{ item }} mode=755 + with_items: "{% set script_list = [] %}{% for s in nagios_services %}{%for g in hostvars[ansible_hostname].group_names %}{% if g in s.groups and s.script %}{% if script_list.append(s.script) %}{% endif %}{% endif %}{% endfor %}{% endfor %}{{ script_list }}" sudo: true + diff --git a/roles/nagios_server/handlers/main.yml b/roles/nagios_server/handlers/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..fb97f1971ff637b1d7e27713e2386c53603d6a8c --- /dev/null +++ b/roles/nagios_server/handlers/main.yml @@ -0,0 +1,8 @@ +--- +- name: restart apache2 + service: name=apache2 state=restarted + sudo: true + +- name: restart postfix + service: name=postfix state=restarted + sudo: true diff --git a/roles/nagios_server/tasks/main.yml b/roles/nagios_server/tasks/main.yml index 9e56db5b42cb982990d45423a72ec534102377fd..203b4f8b10efaa8324e188c745396101b9c8db76 100644 --- a/roles/nagios_server/tasks/main.yml +++ b/roles/nagios_server/tasks/main.yml @@ -1,31 +1,29 @@ --- +- name: create directory + file: dest=/var/lib/nagios/.ssh state=directory + sudo: true + +- name: create nagios user + user: name=nagios system=yes createhome=yes home={{ nagios_home }} shell=/bin/bash + sudo: true + - name: copy priv key - template: src={{ monitor_privkey_file }} dest=/var/lib/nagios/.ssh/id_rsa mode=600 owner=nagios + template: src={{ monitor_privkey_file }} dest={{ nagios_home }}/.ssh/id_rsa mode=600 owner={{ nagios_username }} sudo: true - name: install packages - apt: name={{ item }} state=installed + apt: name={{ item }} state=present with_items: - nagios3 - python-passlib - - python3-passlib sudo: true + when: ansible_os_family == "Debian" - name: configure nagios authentication htpasswd: path=/etc/nagios3/htpasswd.users name={{ nagios_username }} password={{ nagios_password }} sudo: true -- name: configure monitoring - template: src={{ item }}_nagios2.cfg.j2 
dest=/etc/nagios3/conf.d/{{ item }}_nagios2.cfg - with_items: - - 'hostgroups' - - 'hosts' - - 'commands' - - 'services' - - 'extinfo' - sudo: true - - name: force restart - service: name=nagios3 state=restarted + service: name=nagios3 state=started sudo: true diff --git a/roles/nagios_server/templates/cgi.cfg.j2 b/roles/nagios_server/templates/cgi.cfg.j2 new file mode 100644 index 0000000000000000000000000000000000000000..a3f3a07cd440ba32de38771fe0350102443b6618 --- /dev/null +++ b/roles/nagios_server/templates/cgi.cfg.j2 @@ -0,0 +1,377 @@ +################################################################# +# +# CGI.CFG - Sample CGI Configuration File for Nagios +# +################################################################# + + +# MAIN CONFIGURATION FILE +# This tells the CGIs where to find your main configuration file. +# The CGIs will read the main and host config files for any other +# data they might need. + +main_config_file=/etc/nagios3/nagios.cfg + + + +# PHYSICAL HTML PATH +# This is the path where the HTML files for Nagios reside. This +# value is used to locate the logo images needed by the statusmap +# and statuswrl CGIs. + +physical_html_path=/usr/share/nagios3/htdocs + + + +# URL HTML PATH +# This is the path portion of the URL that corresponds to the +# physical location of the Nagios HTML files (as defined above). +# This value is used by the CGIs to locate the online documentation +# and graphics. If you access the Nagios pages with an URL like +# http://www.myhost.com/nagios, this value should be '/nagios' +# (without the quotes). + +url_html_path=/nagios3 + + + +# CONTEXT-SENSITIVE HELP +# This option determines whether or not a context-sensitive +# help icon will be displayed for most of the CGIs. 
+# Values: 0 = disables context-sensitive help +# 1 = enables context-sensitive help + +show_context_help=1 + + + +# PENDING STATES OPTION +# This option determines what states should be displayed in the web +# interface for hosts/services that have not yet been checked. +# Values: 0 = leave hosts/services that have not been check yet in their original state +# 1 = mark hosts/services that have not been checked yet as PENDING + +use_pending_states=1 + +# NAGIOS PROCESS CHECK COMMAND +# This is the full path and filename of the program used to check +# the status of the Nagios process. It is used only by the CGIs +# and is completely optional. However, if you don't use it, you'll +# see warning messages in the CGIs about the Nagios process +# not running and you won't be able to execute any commands from +# the web interface. The program should follow the same rules +# as plugins; the return codes are the same as for the plugins, +# it should have timeout protection, it should output something +# to STDIO, etc. +# +# Note: The command line for the check_nagios plugin below may +# have to be tweaked a bit, as different versions of the plugin +# use different command line arguments/syntaxes. + +nagios_check_command=/usr/lib/nagios/plugins/check_nagios /var/cache/nagios3/status.dat 5 '/usr/sbin/nagios3' + + +# AUTHENTICATION USAGE +# This option controls whether or not the CGIs will use any +# authentication when displaying host and service information, as +# well as committing commands to Nagios for processing. +# +# Read the HTML documentation to learn how the authorization works! +# +# NOTE: It is a really *bad* idea to disable authorization, unless +# you plan on removing the command CGI (cmd.cgi)! Failure to do +# so will leave you wide open to kiddies messing with Nagios and +# possibly hitting you with a denial of service attack by filling up +# your drive by continuously writing to your command file! 
+# +# Setting this value to 0 will cause the CGIs to *not* use +# authentication (bad idea), while any other value will make them +# use the authentication functions (the default). + +use_authentication=1 + + + + +# x509 CERT AUTHENTICATION +# When enabled, this option allows you to use x509 cert (SSL) +# authentication in the CGIs. This is an advanced option and should +# not be enabled unless you know what you're doing. + +use_ssl_authentication=0 + + + + +# DEFAULT USER +# Setting this variable will define a default user name that can +# access pages without authentication. This allows people within a +# secure domain (i.e., behind a firewall) to see the current status +# without authenticating. You may want to use this to avoid basic +# authentication if you are not using a secure server since basic +# authentication transmits passwords in the clear. +# +# Important: Do not define a default username unless you are +# running a secure web server and are sure that everyone who has +# access to the CGIs has been authenticated in some manner! If you +# define this variable, anyone who has not authenticated to the web +# server will inherit all rights you assign to this user! + +#default_user_name=guest + + + +# SYSTEM/PROCESS INFORMATION ACCESS +# This option is a comma-delimited list of all usernames that +# have access to viewing the Nagios process information as +# provided by the Extended Information CGI (extinfo.cgi). By +# default, *no one* has access to this unless you choose to +# not use authorization. You may use an asterisk (*) to +# authorize any user who has authenticated to the web server. + +authorized_for_system_information=nagiosadmin,nagios + + + +# CONFIGURATION INFORMATION ACCESS +# This option is a comma-delimited list of all usernames that +# can view ALL configuration information (hosts, commands, etc). +# By default, users can only view configuration information +# for the hosts and services they are contacts for. 
You may use +# an asterisk (*) to authorize any user who has authenticated +# to the web server. + +authorized_for_configuration_information=nagiosadmin,nagios + + + +# SYSTEM/PROCESS COMMAND ACCESS +# This option is a comma-delimited list of all usernames that +# can issue shutdown and restart commands to Nagios via the +# command CGI (cmd.cgi). Users in this list can also change +# the program mode to active or standby. By default, *no one* +# has access to this unless you choose to not use authorization. +# You may use an asterisk (*) to authorize any user who has +# authenticated to the web server. + +authorized_for_system_commands=nagiosadmin,nagios + + + +# GLOBAL HOST/SERVICE VIEW ACCESS +# These two options are comma-delimited lists of all usernames that +# can view information for all hosts and services that are being +# monitored. By default, users can only view information +# for hosts or services that they are contacts for (unless you +# you choose to not use authorization). You may use an asterisk (*) +# to authorize any user who has authenticated to the web server. + + +authorized_for_all_services=nagiosadmin,nagios +authorized_for_all_hosts=nagiosadmin,nagios + + + +# GLOBAL HOST/SERVICE COMMAND ACCESS +# These two options are comma-delimited lists of all usernames that +# can issue host or service related commands via the command +# CGI (cmd.cgi) for all hosts and services that are being monitored. +# By default, users can only issue commands for hosts or services +# that they are contacts for (unless you you choose to not use +# authorization). You may use an asterisk (*) to authorize any +# user who has authenticated to the web server. + +authorized_for_all_service_commands=nagiosadmin,nagios +authorized_for_all_host_commands=nagiosadmin,nagios + + + +# READ-ONLY USERS +# A comma-delimited list of usernames that have read-only rights in +# the CGIs. This will block any service or host commands normally shown +# on the extinfo CGI pages. 
It will also block comments from being shown +# to read-only users. + +#authorized_for_read_only=user1,user2 + + + + +# STATUSMAP BACKGROUND IMAGE +# This option allows you to specify an image to be used as a +# background in the statusmap CGI. It is assumed that the image +# resides in the HTML images path (i.e. /usr/local/nagios/share/images). +# This path is automatically determined by appending "/images" +# to the path specified by the 'physical_html_path' directive. +# Note: The image file may be in GIF, PNG, JPEG, or GD2 format. +# However, I recommend that you convert your image to GD2 format +# (uncompressed), as this will cause less CPU load when the CGI +# generates the image. + +#statusmap_background_image=smbackground.gd2 + + + + +# STATUSMAP TRANSPARENCY INDEX COLOR +# These options set the r,g,b values of the background color used the statusmap CGI, +# so normal browsers that can't show real png transparency set the desired color as +# a background color instead (to make it look pretty). +# Defaults to white: (R,G,B) = (255,255,255). + +#color_transparency_index_r=255 +#color_transparency_index_g=255 +#color_transparency_index_b=255 + + + + +# DEFAULT STATUSMAP LAYOUT METHOD +# This option allows you to specify the default layout method +# the statusmap CGI should use for drawing hosts. If you do +# not use this option, the default is to use user-defined +# coordinates. Valid options are as follows: +# 0 = User-defined coordinates +# 1 = Depth layers +# 2 = Collapsed tree +# 3 = Balanced tree +# 4 = Circular +# 5 = Circular (Marked Up) + +default_statusmap_layout=5 + + + +# DEFAULT STATUSWRL LAYOUT METHOD +# This option allows you to specify the default layout method +# the statuswrl (VRML) CGI should use for drawing hosts. If you +# do not use this option, the default is to use user-defined +# coordinates. 
Valid options are as follows: +# 0 = User-defined coordinates +# 2 = Collapsed tree +# 3 = Balanced tree +# 4 = Circular + +default_statuswrl_layout=4 + + + +# STATUSWRL INCLUDE +# This option allows you to include your own objects in the +# generated VRML world. It is assumed that the file +# resides in the HTML path (i.e. /usr/local/nagios/share). + +#statuswrl_include=myworld.wrl + + + +# PING SYNTAX +# This option determines what syntax should be used when +# attempting to ping a host from the WAP interface (using +# the statuswml CGI. You must include the full path to +# the ping binary, along with all required options. The +# $HOSTADDRESS$ macro is substituted with the address of +# the host before the command is executed. +# Please note that the syntax for the ping binary is +# notorious for being different on virtually ever *NIX +# OS and distribution, so you may have to tweak this to +# work on your system. + +ping_syntax=/bin/ping -n -U -c 5 $HOSTADDRESS$ + + + +# REFRESH RATE +# This option allows you to specify the refresh rate in seconds +# of various CGIs (status, statusmap, extinfo, and outages). + +refresh_rate=90 + +# DEFAULT PAGE LIMIT +# This option allows you to specify the default number of results +# displayed on the status.cgi. This number can be adjusted from +# within the UI after the initial page load. Setting this to 0 +# will show all results. + +result_limit=100 + + +# ESCAPE HTML TAGS +# This option determines whether HTML tags in host and service +# status output is escaped in the web interface. If enabled, +# your plugin output will not be able to contain clickable links. + +escape_html_tags=1 + + + + +# SOUND OPTIONS +# These options allow you to specify an optional audio file +# that should be played in your browser window when there are +# problems on the network. The audio files are used only in +# the status CGI. Only the sound for the most critical problem +# will be played. 
Order of importance (higher to lower) is as +# follows: unreachable hosts, down hosts, critical services, +# warning services, and unknown services. If there are no +# visible problems, the sound file optionally specified by +# 'normal_sound' variable will be played. +# +# +# <varname>=<sound_file> +# +# Note: All audio files must be placed in the /media subdirectory +# under the HTML path (i.e. /usr/local/nagios/share/media/). + +#host_unreachable_sound=hostdown.wav +#host_down_sound=hostdown.wav +#service_critical_sound=critical.wav +#service_warning_sound=warning.wav +#service_unknown_sound=warning.wav +#normal_sound=noproblem.wav + + + +# URL TARGET FRAMES +# These options determine the target frames in which notes and +# action URLs will open. + +action_url_target=_blank +notes_url_target=_blank + + + + +# LOCK AUTHOR NAMES OPTION +# This option determines whether users can change the author name +# when submitting comments, scheduling downtime. If disabled, the +# author names will be locked into their contact name, as defined in Nagios. +# Values: 0 = allow editing author names +# 1 = lock author names (disallow editing) + +lock_author_names=1 + + + + +# SPLUNK INTEGRATION OPTIONS +# These options allow you to enable integration with Splunk +# in the web interface. If enabled, you'll be presented with +# "Splunk It" links in various places in the CGIs (log file, +# alert history, host/service detail, etc). Useful if you're +# trying to research why a particular problem occurred. 
+# For more information on Splunk, visit http://www.splunk.com/ + +# This option determines whether the Splunk integration is enabled +# Values: 0 = disable Splunk integration +# 1 = enable Splunk integration + +#enable_splunk_integration=1 + + +# This option should be the URL used to access your instance of Splunk + +#splunk_url=http://127.0.0.1:8000/ + + + diff --git a/roles/nagios_server/templates/commands_nagios2.cfg.j2 b/roles/nagios_server/templates/commands_nagios2.cfg.j2 deleted file mode 100644 index 8f4f49cf168a8c9cf3aa55a036c6ff03c7dc79a2..0000000000000000000000000000000000000000 --- a/roles/nagios_server/templates/commands_nagios2.cfg.j2 +++ /dev/null @@ -1,9 +0,0 @@ -define command { - command_name check_mount - command_line /usr/lib/nagios/plugins/check_by_ssh -H $HOSTADDRESS$ -C "/var/lib/nagios/scripts/check_mount.pl -m $ARG1$" -} -define command { - command_name check_munge - command_line /usr/lib/nagios/plugins/check_by_ssh -H $HOSTADDRESS$ -C "/var/lib/nagios/scripts/check_munge" -} - diff --git a/roles/nagios_server/templates/extinfo_nagios2.cfg.j2 b/roles/nagios_server/templates/extinfo_nagios2.cfg.j2 deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/roles/nagios_server/templates/hostgroups_nagios2.cfg.j2 b/roles/nagios_server/templates/hostgroups_nagios2.cfg.j2 deleted file mode 100644 index 54b7862e846f0b6cc8aa247cb4c73d0b1c79c601..0000000000000000000000000000000000000000 --- a/roles/nagios_server/templates/hostgroups_nagios2.cfg.j2 +++ /dev/null @@ -1,44 +0,0 @@ -# Some generic hostgroup definitions - -## A simple wildcard hostgroup -#define hostgroup { -# hostgroup_name all -# alias All Servers -# members * -# } - -## A list of your Debian GNU/Linux servers -#define hostgroup { -# hostgroup_name debian-servers -# alias Debian GNU/Linux Servers -# members localhost -# } - -## A list of your web servers -#define hostgroup { -# hostgroup_name http-servers -# alias 
HTTP servers -# members localhost -# } - -## A list of your ssh-accessible servers -#define hostgroup { -# hostgroup_name ssh-servers -# alias SSH servers -# members * -# } - - -{% for group in groups %} -#{ % if group != "all" % } -{% set nodelist = [] %} -{% for node in groups[group] %} -{% if nodelist.append(node) %} -{% endif %} -{% endfor %} -define hostgroup { - hostgroup_name {{ group }} - members {{ nodelist|unique|join(',') }} -} -#{ % endif % } -{% endfor %} diff --git a/roles/nagios_server/templates/main_cf.j2 b/roles/nagios_server/templates/main_cf.j2 new file mode 100644 index 0000000000000000000000000000000000000000..2823b289dc68bb169f0f6a2556a314876762bf61 --- /dev/null +++ b/roles/nagios_server/templates/main_cf.j2 @@ -0,0 +1,39 @@ +# See /usr/share/postfix/main.cf.dist for a commented, more complete version + + +# Debian specific: Specifying a file name will cause the first +# line of that file to be used as the name. The Debian default +# is /etc/mailname. +#myorigin = /etc/mailname + +smtpd_banner = $myhostname ESMTP $mail_name +biff = no + +# appending .domain is the MUA's job. +append_dot_mydomain = no + +# Uncomment the next line to generate "delayed mail" warnings +#delay_warning_time = 4h + +readme_directory = no + +# TLS parameters +smtpd_tls_cert_file=/etc/ssl/certs/ssl-cert-snakeoil.pem +smtpd_tls_key_file=/etc/ssl/private/ssl-cert-snakeoil.key +smtpd_use_tls=yes +smtpd_tls_session_cache_database = btree:${data_directory}/smtpd_scache +smtp_tls_session_cache_database = btree:${data_directory}/smtp_scache + +# See /usr/share/doc/postfix/TLS_README.gz in the postfix-doc package for +# information on enabling SSL in the smtp client. 
+ +myhostname = {{ ansible_fqdn }} +alias_maps = hash:/etc/aliases +alias_database = hash:/etc/aliases +myorigin = {{ ansible_fqdn }} +mydestination = {{ ansible_fqdn }}, localhost.{{ ansible_domain }}, localhost +relayhost = {{ smtp_smarthost }} +mynetworks = 127.0.0.0/8 [::ffff:127.0.0.0]/104 [::1]/128 +mailbox_size_limit = 0 +recipient_delimiter = + +inet_interfaces = loopback-only diff --git a/roles/nfs-client/tasks/mountFileSystem.yml b/roles/nfs-client/tasks/mountFileSystem.yml index e74665b73613410f268aae6e5d3dd6542e49c735..1e5c7e99c52baa0e02405e22c1510bfc05f08eac 100644 --- a/roles/nfs-client/tasks/mountFileSystem.yml +++ b/roles/nfs-client/tasks/mountFileSystem.yml @@ -3,28 +3,34 @@ service: name=fail2ban state=stopped sudo: true +- name: "Check NFS mount, it may not be necessary, just in case if the following role does actively re-mount" + shell: mountpoint -q {{ nfsMounts[0].name }} + register: mount_state + ignore_errors: true + when: nfsMounts is defined + - name: "Mounting NFS mounts" mount: name={{ item.name }} src="{{ item.ipv4 }}:{{ item.src }} " fstype={{ item.fstype }} opts={{ item.opts }} state=mounted - with_items: nfsMounts + with_items: "{{ nfsMounts }}" notify: "restart rpcbind" notify: "restart idmap" sudo: true ignore_errors: true register: firstMount - when: nfsMounts is defined + when: nfsMounts is defined and mount_state | failed - name: "Wait for nfs to stabailse" command: sleep 60 delegate_to: 127.0.0.1 when: firstMount | failed -- name: "Mounting NFS mounts" +- name: "Mounting NFS mounts after failure" mount: name={{ item.name }} src=" {{ item.ipv4 }}:{{ item.src }} " fstype={{ item.fstype }} opts={{ item.opts }} state=mounted - with_items: nfsMounts + with_items: "{{ nfsMounts }}" notify: "restart idmap" notify: "restart rpcbind" sudo: true - when: nfsMounts is defined and firstMount | failed + when: nfsMounts is defined and firstMount is defined and firstMount | failed - name: "restart fail2ban" service: name=fail2ban state=started 
diff --git a/roles/nfs-server/tasks/startServer.yml b/roles/nfs-server/tasks/startServer.yml index 60d84ab3ca51ff23e5a93e1e648c0e04dda17b38..9e836f317895c4dccdbb7ac12b4aa7b875e5c033 100644 --- a/roles/nfs-server/tasks/startServer.yml +++ b/roles/nfs-server/tasks/startServer.yml @@ -2,7 +2,7 @@ - name: "Create exports if necessary" file: dest={{ item.src }} state=directory mode=755 owner=root group=root sudo: true - with_items: exportList + with_items: "{{ exportList }}" - name: "Starting rpcbind" service: "name=rpcbind state=started enabled=true" diff --git a/roles/postfix/handlers/main.yml b/roles/postfix/handlers/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..be706e39ee3209bb6bc551cafbf34bf6ad3eff5d --- /dev/null +++ b/roles/postfix/handlers/main.yml @@ -0,0 +1,4 @@ +--- +- name: restart postfix + service: name=postfix state=restarted + sudo: true diff --git a/roles/postfix/tasks/main.yml b/roles/postfix/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..5c2345c380956921a6ac87e18c0e2a59dea486c1 --- /dev/null +++ b/roles/postfix/tasks/main.yml @@ -0,0 +1,15 @@ +--- +- name: install postfix + apt: name=postfix state=present + sudo: true + when: ansible_os_family == "Debian" + +- name: configure postfix + template: src=main_cf.j2 dest=/etc/postfix/main.cf + sudo: true + notify: restart postfix + +- name: start postfix + service: name=postfix state=started + sudo: true + diff --git a/roles/postfix/templates/main_cf.j2 b/roles/postfix/templates/main_cf.j2 new file mode 100644 index 0000000000000000000000000000000000000000..2823b289dc68bb169f0f6a2556a314876762bf61 --- /dev/null +++ b/roles/postfix/templates/main_cf.j2 @@ -0,0 +1,39 @@ +# See /usr/share/postfix/main.cf.dist for a commented, more complete version + + +# Debian specific: Specifying a file name will cause the first +# line of that file to be used as the name. The Debian default +# is /etc/mailname. 
+#myorigin = /etc/mailname + +smtpd_banner = $myhostname ESMTP $mail_name +biff = no + +# appending .domain is the MUA's job. +append_dot_mydomain = no + +# Uncomment the next line to generate "delayed mail" warnings +#delay_warning_time = 4h + +readme_directory = no + +# TLS parameters +smtpd_tls_cert_file=/etc/ssl/certs/ssl-cert-snakeoil.pem +smtpd_tls_key_file=/etc/ssl/private/ssl-cert-snakeoil.key +smtpd_use_tls=yes +smtpd_tls_session_cache_database = btree:${data_directory}/smtpd_scache +smtp_tls_session_cache_database = btree:${data_directory}/smtp_scache + +# See /usr/share/doc/postfix/TLS_README.gz in the postfix-doc package for +# information on enabling SSL in the smtp client. + +myhostname = {{ ansible_fqdn }} +alias_maps = hash:/etc/aliases +alias_database = hash:/etc/aliases +myorigin = {{ ansible_fqdn }} +mydestination = {{ ansible_fqdn }}, localhost.{{ ansible_domain }}, localhost +relayhost = {{ smtp_smarthost }} +mynetworks = 127.0.0.0/8 [::ffff:127.0.0.0]/104 [::1]/128 +mailbox_size_limit = 0 +recipient_delimiter = + +inet_interfaces = loopback-only diff --git a/roles/provision_homedir/vars/main.yml b/roles/provision_homedir/vars/main.yml index f1b8ef6cdfbc9d1d7a245a3e607b5ec31dae9151..ded62ea83ce3a9792a72c0393eeeb8b5e34e94b2 100644 --- a/roles/provision_homedir/vars/main.yml +++ b/roles/provision_homedir/vars/main.yml @@ -1,5 +1,5 @@ --- use_active_directory: False provision_homedir: /usr/local/sbin/provision_homedir.py -homeDirEntry: "{% if use_active_directory %}unixHomeDirectory{% else %}homeDirectory {% endif %}" -search_filter: "{% if use_active_directory %}(unixHomeDirectory=*){% else %} (objectClass=posixAccount) {% endif %}" +homeDirEntry: "{% if use_active_directory %}unixHomeDirectory{% else %}homeDirectory{% endif %}" +search_filter: "{% if use_active_directory %}(unixHomeDirectory=*){% else %}(objectClass=posixAccount){% endif %}" diff --git a/roles/setupKnownHosts/tasks/main.yml b/roles/setupKnownHosts/tasks/main.yml deleted file 
mode 100644 index ad1ebffe3ffe2c98ef37a7aecfa739af6f06bde1..0000000000000000000000000000000000000000 --- a/roles/setupKnownHosts/tasks/main.yml +++ /dev/null @@ -1,18 +0,0 @@ -- name: "Templating /etc/ssh/known_hosts" - template: src=known_hosts.j2 dest=/etc/ssh/ssh_known_hosts owner=root group=root mode=644 - sudo: true - register: sshknownhost - -- name: encrypt the hosts file - shell: ssh-keygen -H -f /etc/ssh/ssh_known_hosts - sudo: true - when: sshknownhost.changed - -- name: set read permissions - file: path=/etc/ssh/ssh_known_hosts owner=root group=root mode=644 state=file - sudo: true - -- name: delete ssh_known_hosts.old - file: path=/etc/ssh/ssh_known_hosts.old state=absent - sudo: true - diff --git a/roles/slurm-common/tasks/installMungeFromSource.yml b/roles/slurm-common/tasks/installMungeFromSource.yml index 06c72f5197dd8352b6c32d7d6838147b132b0b51..8d27a45364f2f221584971e2bf36aca78003a368 100644 --- a/roles/slurm-common/tasks/installMungeFromSource.yml +++ b/roles/slurm-common/tasks/installMungeFromSource.yml @@ -8,7 +8,7 @@ shell: tar jxf munge-{{ munge_version }}.tar.bz2 args: chdir: /tmp - creates: /tmp/munge-{{ munge_version }} + creates: /tmp/munge-{{ munge_version }}/configure - name: build munge shell: ./configure --prefix={{ munge_dir }} && make diff --git a/roles/slurm-common/tasks/installNhc.yml b/roles/slurm-common/tasks/installNhc.yml index 53a59c417716423517a826c842356ef923556850..17e279474e7548f3ba895252de109fedc8378a42 100644 --- a/roles/slurm-common/tasks/installNhc.yml +++ b/roles/slurm-common/tasks/installNhc.yml @@ -1,15 +1,24 @@ -- name: Copy nhc source to /tmp - unarchive: copy=yes src="files/warewulf-nhc-1.4.2.patched.tar.gz" dest=/tmp - sudo: true +--- +- name: install automake + yum: name=automake state=present + become: true + become_user: root + when: ansible_os_family=='RedHat' + +- name: install automake + apt: name=automake state=present + become: true + become_user: root + when: ansible_os_family=='Debian' + +- name: 
unarchive nhc + unarchive: args: - creates: /tmp/warewulf-nhc-{{ nhc_version }}/configure + src: "https://github.com/mej/nhc/archive/{{ nhc_version }}.tar.gz" + copy: no + dest: /tmp + creates: /tmp/nhc-{{ nhc_version }}/autogen.sh -#- name: Download nhc source -# shell: wget https://cvl.massive.org.au/warewulf-nhc-{{ nhc_version }}.tar.gz -# shell: wget http://warewulf.lbl.gov/downloads/releases/warewulf-nhc/warewulf-nhc-{{ nhc_version }}.tar.gz -# args: -# chdir: /tmp -# creates: /tmp/warewulf-nhc-{{ nhc_version }}.tar.gz #- name: untar nhc # shell: tar zxf /tmp/warewulf-nhc-{{ nhc_version }}.tar.gz @@ -17,16 +26,16 @@ # chdir: /tmp # - name: build nhc - shell: ./configure --prefix={{ nhc_dir }} && make - sudo: true + shell: ./autogen.sh && ./configure --prefix={{ nhc_dir }} && make args: - chdir: /tmp/warewulf-nhc-{{ nhc_version }} + chdir: /tmp/nhc-{{ nhc_version }} + creates: /tmp/nhc-{{ nhc_version }}/configure - name: install nhc shell: make install sudo: true args: - chdir: /tmp/warewulf-nhc-{{ nhc_version }} + chdir: /tmp/nhc-{{ nhc_version }} - name: ensure sysconfig dir exists file: dest=/etc/sysconfig state=directory owner=root group=root mode=755 @@ -50,8 +59,9 @@ register: generate_nhc_config_file - name: generate config file - shell: "{{ nhc_dir }}/sbin/nhc-genconf -d -c {{ nhc_dir }}/etc/nhc/{{ nhc_config_file }} CONFDIR={{ nhc_dir }}/etc/nhc" + shell: "{{ nhc_dir }}/sbin/nhc-genconf -c {{ nhc_dir }}/etc/nhc/{{ nhc_config_file }} CONFDIR={{ nhc_dir }}/etc/nhc" sudo: true + ignore_errors: true when: generate_nhc_config_file - name: config file extension @@ -61,7 +71,3 @@ sudo: true when: nhc_user_conf is defined and generate_nhc_config_file -- name: start cron job - cron: name=nhc_monitor job={{ nhc_dir }}/sbin/nhc_cron user=root minute=*/5 state=present - sudo: true - diff --git a/roles/slurm-common/tasks/main.yml b/roles/slurm-common/tasks/main.yml index fc9eedfb4a294c364b33cc14935b762c71ba93fc..5cc6f6d7e1430d9eeffd6e9538cdbe12b4b8a375 100644 --- 
a/roles/slurm-common/tasks/main.yml +++ b/roles/slurm-common/tasks/main.yml @@ -42,12 +42,12 @@ file: path={{slurmsharedstatedir }} state=directory owner=slurm group=slurm mode=750 sudo: true run_once: true - when: usesharedstatedir + when: usesharedstatedir is defined and usesharedstatedir - name: symlink shared state dir file: path={{ slurmstatedir }} src={{ slurmsharedstatedir }} state=link sudo: true - when: usesharedstatedir + when: usesharedstatedir is defined and usesharedstatedir - name: create state directory file: path={{ slurmstatedir }} state=directory owner=slurm group=slurm mode=750 diff --git a/roles/upgrade/tasks/main.yml b/roles/upgrade/tasks/main.yml new file mode 100644 index 0000000000000000000000000000000000000000..58a50ac0a19561e44b952d800033d9c1c9dacdac --- /dev/null +++ b/roles/upgrade/tasks/main.yml @@ -0,0 +1,11 @@ +--- +- name: apt-get upgrade + apt: upgrade=safe + sudo: true + when: ansible_os_family=="Debian" + +- name: yum upgrade + yum: name=* state=latest + become: true + become_user: root + when: ansible_os_family=="RedHat" diff --git a/roles/vncserver/tasks/main.yml b/roles/vncserver/tasks/main.yml index 13c13dc639c66a9c031ecdfce3b5d84482de367e..e79ed591803e7f285e737167d9e3c12656ebfbaf 100644 --- a/roles/vncserver/tasks/main.yml +++ b/roles/vncserver/tasks/main.yml @@ -5,24 +5,6 @@ - include_vars: "CentOS_7_x86_64.yml" when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" -# Use mate DE on systems that have moved to gnome3, since there is no gpu acceleration by default on NeCTAR openstack -# Trusty (Ubuntu 14.04 LTS) needs repos added. 
Wheezy (Debian Stable) gets mate from backports, Utopic (Ubuntu 14.10) Jessie (Debian testing) and Sid (Debian unstable) get it by default -- name: add repos apt - shell: "add-apt-repository -y ppa:ubuntu-mate-dev/ppa" - sudo: true - when: ansible_distribution_release == 'trusty' - -- name: add repos apt - shell: "add-apt-repository -y ppa:ubuntu-mate-dev/trusty-mate" - sudo: true - when: ansible_distribution_release == 'trusty' - -- name: add epel on CentOS 7 - shell: rpm -iUvh http://dl.fedoraproject.org/pub/epel/7/x86_64/e/epel-release-7-5.noarch.rpm - sudo: true - when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7" - ignore_errors: true - - name: install system packages apt apt: name={{ item }} state=present update_cache=true force=yes sudo: true