diff --git a/roles/slurm-trigger/README.rst b/roles/slurm-trigger/README.rst
new file mode 100644
index 0000000000000000000000000000000000000000..04f41095ec0ae2413ac9182e93ec2c70279d2f12
--- /dev/null
+++ b/roles/slurm-trigger/README.rst
@@ -0,0 +1,25 @@
+This role sets up trigger events on your Slurm cluster.
+What you want the triggers to do is up to you, so you will probably modify the templated shell files.
+You may want to copy the role to a local role directory before editing them.
+
+Triggers used in this role as it stands:
+- primary_slurmctld_failure
+- primary_slurmctld_resumed_operation
+- node down
+
+USAGE:
+- hosts: 'ManagementNodes'
+  tasks:
+  - include_vars: vars/slurm.yml
+
+- hosts: 'ManagementNodes'
+  roles:
+  - slurm-trigger
+
+
+The role uses several variables that need to be defined:
+{{ slurm_dir }} The directory of the Slurm install. Shell scripts are copied to its sbin subdirectory.
+{{ admin_email }} Email address (defined in slurm.yml, or defined some other way) to send alerts to.
+
+Each trigger has two files: one that responds to the trigger, and one that sets (re-arms) the trigger. The role runs the latter to start the process.
+
diff --git a/roles/slurm-trigger/tasks/main.yml b/roles/slurm-trigger/tasks/main.yml
new file mode 100644
index 0000000000000000000000000000000000000000..ceb47c7f8ece4fddc9227ddce0315842478e5989
--- /dev/null
+++ b/roles/slurm-trigger/tasks/main.yml
@@ -0,0 +1,74 @@
+---
+############################
+# Installs the Slurm trigger scripts into {{ slurm_dir }}/sbin and arms each
+# trigger. Every trigger has a handler script (the program strigger is told to
+# run) and a set_*_trigger.sh script that registers it with "strigger --set".
+# Scripts are installed as root; the triggers are registered as user slurm.
+# NOTE(review): the "Execute" tasks run on every play, so each run calls
+# "strigger --set" again -- confirm this does not accumulate duplicate triggers.
+
+- name: template primary_slurmctld_failure
+  template:
+    dest: "{{ slurm_dir }}/sbin/primary_slurmctld_failure.sh"
+    src: primary_slurmctld_failure.sh.j2
+    mode: "0755"
+  become: true
+  become_user: root
+
+- name: template set primary_slurmctld_failure trigger
+  template:
+    dest: "{{ slurm_dir }}/sbin/set_primary_slurmctld_failure_trigger.sh"
+    src: set_primary_slurmctld_failure_trigger.sh.j2
+    mode: "0755"
+  become: true
+  become_user: root
+
+- name: Execute set_primary_slurmctld_failure_trigger.sh
+  command: "{{ slurm_dir }}/sbin/set_primary_slurmctld_failure_trigger.sh"
+  become: true
+  become_user: slurm
+  run_once: true
+
+- name: template primary_slurmctld_resumed_operation
+  template:
+    dest: "{{ slurm_dir }}/sbin/primary_slurmctld_resumed_operation.sh"
+    src: primary_slurmctld_resumed_operation.sh.j2
+    mode: "0755"
+  become: true
+  become_user: root
+
+- name: template set primary_slurmctld_resumed trigger
+  template:
+    dest: "{{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh"
+    src: set_primary_slurmctld_resumed_operation_trigger.sh.j2
+    mode: "0755"
+  become: true
+  become_user: root
+
+- name: Execute set_primary_slurmctld_resumed_operation_trigger.sh
+  command: "{{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh"
+  become: true
+  become_user: slurm
+  run_once: true
+
+- name: template node_down
+  template:
+    dest: "{{ slurm_dir }}/sbin/node_down.sh"
+    src: node_down.sh.j2
+    mode: "0755"
+  become: true
+  become_user: root
+
+- name: template node_down trigger command
+  template:
+    dest: "{{ slurm_dir }}/sbin/set_node_trigger.sh"
+    src: set_node_trigger.sh.j2
+    mode: "0755"
+  become: true
+  become_user: root
+
+- name: Execute set_node_trigger.sh
+  command: "{{ slurm_dir }}/sbin/set_node_trigger.sh"
+  become: true
+  become_user: slurm
+  run_once: true
diff --git a/roles/slurm-trigger/templates/node_down.sh.j2 b/roles/slurm-trigger/templates/node_down.sh.j2
new file mode 100644
index 0000000000000000000000000000000000000000..679e18c7daa2b2f2e9dff282daf9d0a1f2967802
--- /dev/null
+++ b/roles/slurm-trigger/templates/node_down.sh.j2
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Handler that set_node_trigger.sh registers for the strigger --down event.
+# Notify the administrator of the node failure by e-mail; "$*" carries the
+# arguments strigger passes in (presumably the affected node names -- confirm).
+echo "On $(hostname):$(date):$(whoami): slurm-trigger event for NODE_FAILURE: $*" | mail -s "NODE FAILURE $*" {{ admin_email }}
+# Re-arm the trigger for the next node-down event and log the attempt to $FILE.
+TRIGGER_CMD="{{ slurm_dir }}/sbin/set_node_trigger.sh"
+
+FILE=/tmp/node_down.txt
+#COMMAND="su slurm -c $TRIGGER_CMD"
+echo "node_down.sh: $(date): $(whoami): $TRIGGER_CMD" >> "$FILE"
+"$TRIGGER_CMD" >> "$FILE" 2>&1
diff --git a/roles/slurm-trigger/templates/primary_slurmctld_failure.sh.j2 b/roles/slurm-trigger/templates/primary_slurmctld_failure.sh.j2
new file mode 100644
index 0000000000000000000000000000000000000000..61747379a436a63f89f8583d05ec5d8aa87cd9cb
--- /dev/null
+++ b/roles/slurm-trigger/templates/primary_slurmctld_failure.sh.j2
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Handler that set_primary_slurmctld_failure_trigger.sh registers with strigger.
+# Notify the administrator of the primary slurmctld failure by e-mail.
+echo "On $(hostname):$(date):$(whoami): slurm-trigger event for Primary_SLURMCTLD_FAILURE" | mail -s Primary_SLURMCTLD_FAILURE {{ admin_email }}
+# Re-arm the trigger for the next primary slurmctld failure event and log the attempt to $FILE.
+TRIGGER_CMD="{{ slurm_dir }}/sbin/set_primary_slurmctld_failure_trigger.sh"
+
+FILE=/tmp/primary_down.txt
+#COMMAND="su slurm -c $TRIGGER_CMD"
+echo "primary_slurmctld_failure.sh:$(date):$(whoami): $TRIGGER_CMD" >> "$FILE"
+"$TRIGGER_CMD" >> "$FILE" 2>&1
diff --git a/roles/slurm-trigger/templates/primary_slurmctld_resumed_operation.sh.j2 b/roles/slurm-trigger/templates/primary_slurmctld_resumed_operation.sh.j2
new file mode 100644
index 0000000000000000000000000000000000000000..b8e6788bccef5b36f65575c12b507ef9c624be09
--- /dev/null
+++ b/roles/slurm-trigger/templates/primary_slurmctld_resumed_operation.sh.j2
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Handler that set_primary_slurmctld_resumed_operation_trigger.sh registers with strigger.
+# Notify the administrator by e-mail that the primary slurmctld has resumed operation.
+echo "On $(hostname):$(date):$(whoami): slurm-trigger event for Primary_SLURMCTLD_RESUMED" | mail -s Primary_SLURMCTLD_RESUMED {{ admin_email }}
+# Re-arm the trigger for the next resumed-operation event and log the attempt to $FILE.
+
+FILE=/tmp/primary_up.txt
+#COMMAND="su slurm -c {{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh"
+COMMAND="{{ slurm_dir }}/sbin/set_primary_slurmctld_resumed_operation_trigger.sh"
+echo "primary_slurmctld_resumed_operation.sh:$(date):$(whoami): $COMMAND" >> "$FILE"
+"$COMMAND" >> "$FILE" 2>&1
diff --git a/roles/slurm-trigger/templates/set_node_trigger.sh.j2 b/roles/slurm-trigger/templates/set_node_trigger.sh.j2
new file mode 100644
index 0000000000000000000000000000000000000000..87e2938bfec5c1184a7f9e949d2fc16d0cfe571c
--- /dev/null
+++ b/roles/slurm-trigger/templates/set_node_trigger.sh.j2
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Register node_down.sh as the program strigger runs for the --down event.
+TRIGGER_CMD="{{ slurm_dir }}/bin/strigger --set --down --program={{ slurm_dir }}/sbin/node_down.sh"
+echo "set_node_trigger.sh: $(date): $TRIGGER_CMD"
+# Left unquoted on purpose: the string must word-split into the command and its arguments.
+$TRIGGER_CMD
diff --git a/roles/slurm-trigger/templates/set_primary_slurmctld_failure_trigger.sh.j2 b/roles/slurm-trigger/templates/set_primary_slurmctld_failure_trigger.sh.j2
new file mode 100644
index 0000000000000000000000000000000000000000..30b0e9ea3200e0305c5c052d3a0c282d66fc4b2f
--- /dev/null
+++ b/roles/slurm-trigger/templates/set_primary_slurmctld_failure_trigger.sh.j2
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Register primary_slurmctld_failure.sh as the program strigger runs for the --primary_slurmctld_failure event.
+TRIGGER_CMD="{{ slurm_dir }}/bin/strigger --set --primary_slurmctld_failure --program={{ slurm_dir }}/sbin/primary_slurmctld_failure.sh"
+echo "set_primary_slurmctld_failure_trigger.sh: $(date): $TRIGGER_CMD"
+# Left unquoted on purpose: the string must word-split into the command and its arguments.
+$TRIGGER_CMD
diff --git a/roles/slurm-trigger/templates/set_primary_slurmctld_resumed_operation_trigger.sh.j2 b/roles/slurm-trigger/templates/set_primary_slurmctld_resumed_operation_trigger.sh.j2
new file mode 100644
index 0000000000000000000000000000000000000000..ea3ad3f7cead736a0a64cc1f9b11add14d1ad610
--- /dev/null
+++ b/roles/slurm-trigger/templates/set_primary_slurmctld_resumed_operation_trigger.sh.j2
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Register primary_slurmctld_resumed_operation.sh as the program strigger runs for the --primary_slurmctld_resumed_operation event.
+TRIGGER_CMD="{{ slurm_dir }}/bin/strigger --set --primary_slurmctld_resumed_operation --program={{ slurm_dir }}/sbin/primary_slurmctld_resumed_operation.sh"
+echo "set_primary_slurmctld_resumed_operation_trigger.sh: $(date): $TRIGGER_CMD"
+# Left unquoted on purpose: the string must word-split into the command and its arguments.
+$TRIGGER_CMD