Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project:
  • hpc-team/HPCasCode
  • chines/ansible_cluster_in_a_box

Showing 571 additions and 0 deletions
../tests/ManagementNodes/check.yml
\ No newline at end of file
---
# Just calculates the /etc/hosts entries
- hosts: 'all'
tasks:
- include_vars: vars/passwords.yml
- include_vars: vars/names.yml
- include_vars: vars/ldapConfig.yml
- include_vars: vars/filesystems.yml
- include_vars: vars/slurm.yml
- include_vars: vars/vars.yml
- hosts: 'all'
tasks:
- { name: setup, setup: }
- hosts: 'ManagementNodes'
roles:
- { role: calculateEtcHosts }
#- hosts: 'NFSNodes'
# roles:
# - { role: calculateExports }
# Basic setup to make the nodes functional,
# i.e. upgrade operating systems, etc.
#
- hosts: 'ManagementNodes'
gather_facts: True
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/vars_centos79.yml
- vars/versions.yml
tasks:
- { name: unmount /mnt when vdb is absent, mount: { path: "/mnt", src: "/dev/vdb", state: absent},
when: 'hostvars[inventory_hostname]["ansible_devices"]["vdb"] is not defined', become: true }
- { name: keep mnt present, file: { path: "/mnt", owner: root, group: root, mode: "u=rwx,g=rx,o=rx", state: directory},
when: 'hostvars[inventory_hostname]["ansible_devices"]["vdb"] is not defined', become: true }
- { name: set use shared state, set_fact: usesharedstatedir=True }
tags: [ always ]
- hosts: 'ManagementNodes'
gather_facts: False
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
roles:
# - { role: ldapclient, tags: [ authentication ] }
# - { role: ssh-password-login }
# - { role: enable_sudo_group }
- { role: slurmdb-config, tags: [ slurm, slurmdb-config ] }
- { role: slurm-common, tags: [ slurm, slurm-common ] }
- { role: slurm_config, tags: [ slurm, slurm-config ] }
- { role: slurm-start, start_slurmdbd: True, start_slurmctld: True, slurmd_enabled: False, start_slurmd: False, use_glusterfs: False, EMAIL_DEST: "nobody@nowhere.com", tags: [ slurm-start ] }
# - { role: provision_slurm, use_active_directory: False, lockpath: "/mnt/home", tags: [ slurm ] }
# - { role: provision_homedir, use_active_directory: False, mntpt: "/mnt/home", tags: [ provisioning ] }
---
- hosts: LDAPServer
vars_files:
- vars/passwords.yml
- vars/ldapConfig.yml
gather_facts: true
tasks:
- include_vars: vars/passwords.yml
roles:
- { role: ldapservertest, ssl: false, tags: [ ldapserver ], become: true }
#- hosts: ComputeNodes, LoginNodes
# vars_files:
# - vars/passwords.yml
# - vars/ldapConfig.yml
# gather_facts: true
# tasks:
# - include_vars: vars/passwords.yml
# roles:
# - { role: ldapclient, ssl: false
# , tags: [ ldapclient ] }
\ No newline at end of file
# Plays to initialise the NFS and SQL nodes
#
#
- hosts: 'all'
tasks:
- { name: setup, setup: }
tags: [ always ]
# We need this play to gather facts and populate the required variables.
- hosts: 'ManagementNodes'
gather_facts: True
tasks:
- include_vars: vars/passwords.yml
- include_vars: vars/slurm.yml
- include_vars: vars/vars.yml
- include_vars: vars/versions.yml
- include_vars: vars/vars_centos79.yml
- { name: set hostgroup, set_fact: hostgroup='ManagementNodes' }
- { name: set use shared state, set_fact: usesharedstatedir=True }
tags: [ always ]
- hosts: 'SQLNodes,NFSNodes'
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
pre_tasks:
- { name: set hostgroup, set_fact: hostgroup='SQLNodes', tags: [ always ] }
- { name: set use shared state, set_fact: usesharedstatedir=True, tags: [ always ] }
- hosts: 'SQLNodes'
vars_files:
- vars/passwords.yml
- vars/slurm.yml
- vars/vars.yml
- vars/versions.yml
- vars/vars_centos79.yml
strategy: free
gather_facts: True
roles:
# - { role: upgrade, tags: [ upgrade ] }
- { role: mysql, mysql_type: mysql_server, mysql_user_name: slurmdb, mysql_user_db_name: slurm_acct_db, mysql_user_hosts_group: "{{ groups['ManagementNodes'] }}", mysql_user_password: "{{ slurmdb_passwd }}", tags: [ database ] }
- { role: slurm-mysql-config, tags: [database,slurmdb] }
tags: [ sql ]
- hosts: 'LogNodes'
roles:
- { role: etcHosts, tags: [ networking, etcHosts ] }
- { role: rsyslog_server }
../../roles
\ No newline at end of file
../files
\ No newline at end of file
options lnet networks=tcp0(eth0)
\ No newline at end of file
#!/bin/sh
if [ ! -c /dev/lnet ] ; then
exec /sbin/modprobe lnet >/dev/null 2>&1
fi
#!/bin/sh
/sbin/lsmod | /bin/grep lustre 1>/dev/null 2>&1
if [ $? -ne 0 ] ; then
/sbin/modprobe lustre >/dev/null 2>&1
fi
[lustre-server]
name=lustre-server
baseurl=https://downloads.whamcloud.com/public/lustre/latest-release/el7/server
# exclude=*debuginfo*
gpgcheck=0
[lustre-client]
name=lustre-client
baseurl=https://downloads.whamcloud.com/public/lustre/latest-release/el7/client
# exclude=*debuginfo*
gpgcheck=0
[e2fsprogs-wc]
name=e2fsprogs-wc
baseurl=https://downloads.whamcloud.com/public/e2fsprogs/latest/el7
# exclude=*debuginfo*
gpgcheck=0
\ No newline at end of file
../roles
\ No newline at end of file
---
- hosts: 'LustreONodes,LustreMNodes'
tasks:
- name: install epel release repo
package:
name: epel-release
state: present
become: true
- name: Enable ZFS on Linux repo
command: yum localinstall --nogpgcheck -y http://download.zfsonlinux.org/epel/zfs-release.el7_6.noarch.rpm
become: true
- name: add lustre repos
copy:
src: lustre.repo
dest: /etc/yum.repos.d/lustre.repo
become: true
- name: Upgrade e2fsprogs
package:
name: e2fsprogs
state: latest
become: true
- name: install the lustre-tests package
package:
name: lustre-tests
state: present
become: true
- name: Create the lnet module options file /etc/modprobe.d/lnet.conf
copy:
src: lnet.conf
dest: /etc/modprobe.d/lnet.conf
become: true
- hosts: 'LustreONodes,LustreMNodes'
tasks:
- name: On the MGS and OSS only, create the file /etc/sysconfig/modules/lnet.modules
copy:
src: lustre.repo
dest: /etc/sysconfig/modules/lnet.modules
become: true
register: lnetmodules
- name: Reboot a slow machine that might have lots of updates to apply
reboot:
reboot_timeout: 30 # still tweaking this !
become: true
when: lnetmodules.changed
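# A minimal alternative sketch (illustrative, not part of this change set): the
# copy-then-reboot-on-change flow above expressed with a handler instead of
# register/when; "some.modules" stands in for whichever modules file is copied.
#- hosts: 'LustreONodes,LustreMNodes'
#  tasks:
#    - name: install a modules file
#      copy:
#        src: some.modules
#        dest: /etc/sysconfig/modules/some.modules
#      become: true
#      notify: reboot for module changes
#  handlers:
#    - name: reboot for module changes
#      reboot:
#        reboot_timeout: 600
#      become: true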
- hosts: 'LustreMNodes' #TODO
tasks:
- name: Initialise a disk or partition to use for lustre
#command: mkfs.lustre --fsname=whatevs --mgs --mdt --index=0 /dev/disk/by-id/virtio-f6705d5a-62d8-4d93-b # change ID, done before?
command: mkfs.lustre --fsname=whatevs --mgs --mdt --index=0 {{ hostvars[inventory_hostname]['ansible_host_volumes']['mdtvol']['dev'] }} # change ID, done before?
become: true
ignore_errors: yes # lazy shortcut: keeps a rerun from failing when the filesystem already exists
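# A sketched idempotent alternative (illustrative; assumes blkid is available on
# the node): probe the device first and only format it when no filesystem
# signature is found, instead of relying on ignore_errors for reruns.
#- name: check whether the MDT device already has a filesystem
#  command: blkid {{ hostvars[inventory_hostname]['ansible_host_volumes']['mdtvol']['dev'] }}
#  register: mdt_blkid
#  changed_when: false
#  failed_when: false
#  become: true
#- name: format the MDT only when the device is blank
#  command: mkfs.lustre --fsname=whatevs --mgs --mdt --index=0 {{ hostvars[inventory_hostname]['ansible_host_volumes']['mdtvol']['dev'] }}
#  when: mdt_blkid.rc != 0
#  become: true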
- name: Create a directory if it does not exist
file:
path: /mnt/mdt
state: directory
mode: '0755'
become: true
ignore_errors: yes # this step can fail if the filesystem is already mounted here
- name: Create a mount point and mount the lustre FS
#command: mount -t lustre {{ hostvars[inventory_hostname]['ansible_host_volumes']['mdtvol']['dev'] }} /mnt/mdt
become: true
mount:
path: /mnt/mdt
src: "{{ hostvars[inventory_hostname]['ansible_host_volumes']['mdtvol']['dev'] }}"
fstype: lustre
state: mounted
- hosts: 'LustreONodes'
tasks:
- name: Create a lustre OST
#command: mkfs.lustre --ost --fsname=whatevs --mgsnode=118.138.233.250@tcp0 --index=0 /dev/disk/by-id/virtio-3c2223cd-e3d3-4264-9
command: mkfs.lustre --ost --fsname=whatevs --mgsnode={{ hostvars[groups['LustreMNodes'][0]]['ansible_host'] }}@tcp0 --index=0 {{ hostvars[inventory_hostname]['ansible_host_volumes']['ostvol']['dev'] }} # /dev/disk/by-id/virtio-3c2223cd-e3d3-4264-9
become: true
ignore_errors: yes # if filesystem already exists do nothing
register: ost_mkfs
- name: Create a directory if it does not exist
file:
path: /ostoss_mount
state: directory
mode: '0755'
become: true
when: ost_mkfs.changed
ignore_errors: yes
- name: Create a mount point and mount the lustre FS
#command: mount -t lustre {{ hostvars[inventory_hostname]['ansible_host_volumes']['ostvol']['dev'] }} /ostoss_mount
become: true
mount:
path: /ostoss_mount
src: "{{ hostvars[inventory_hostname]['ansible_host_volumes']['ostvol']['dev'] }}"
fstype: lustre
state: mounted
- hosts: 'LoginNodes' # this does not work until config_repos has run first
tasks:
- include_vars: vars/vars_centos78.yml
- hosts: 'LoginNodes'
roles:
- { role: config_repos, tags: [ repos ] }
- { role: upgrade, tags: [ upgrade ] } # upgrade from centos7.6 base image to centos7.8
- { role: mellanox_drivers, start_roce_service: false }
- hosts: 'LoginNodes' # this does not work until config_repos has run first
tasks:
- name: install rpms
package:
name:
- kmod-lustre-client
- lustre-client
become: true
- name: Add the lustre module
modprobe:
name: lustre
state: present
become: true
- name: Create the file /etc/sysconfig/modules/lustre.modules to load the lustre module on boot
copy:
src: lustre.modules
dest: /etc/sysconfig/modules/lustre.modules
become: true
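# A sketched variant (illustrative; assumes the boot-time scripts in
# /etc/sysconfig/modules/ need to be executable to run on CentOS 7): the same
# copy with an explicit mode.
#- name: Create /etc/sysconfig/modules/lustre.modules with an executable mode
#  copy:
#    src: lustre.modules
#    dest: /etc/sysconfig/modules/lustre.modules
#    mode: '0755'
#  become: true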
- name: Create a directory if it does not exist
file:
path: /mnt/lustre
state: directory
mode: '0755'
become: true
- name: Create a mount point and mount the lustre FS
command: mount -t lustre {{ hostvars[groups['LustreMNodes'][0]]['ansible_host'] }}@tcp0:/whatevs /mnt/lustre
become: true
- name: Create a directory for the users group (GID 100)
command: mkdir -p /mnt/lustre/users
become: true
- name: set a quota
command: lfs setquota -g 100 -b 1G -B 1.1G /mnt/lustre/users/
become: true
- name: poll the quota
command: lfs quota -gh 100 /mnt/lustre/users/
register: quota
become: true
failed_when: "'is using default block quota setting' in quota.stdout"
\ No newline at end of file
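A sketch (illustrative, not part of this change set) of how the client mount above could be made idempotent and persistent with the mount module, mirroring the pattern the playbook already uses for the MDT and OST mounts:
#- name: mount the lustre FS on the login nodes via fstab
#  mount:
#    path: /mnt/lustre
#    src: "{{ hostvars[groups['LustreMNodes'][0]]['ansible_host'] }}@tcp0:/whatevs"
#    fstype: lustre
#    state: mounted
#  become: true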
../../vars
\ No newline at end of file
../vars
\ No newline at end of file
CgroupAutomount=yes
ConstrainDevices=yes
ConstrainCores=yes
ConstrainRAMSpace=yes
ConstrainKmemSpace=no
domain: {{ domain }}
volumes:
{% for name in exports %}
- host: {{ clustername }}-sql0
dev: {{ disks[name] }}
mnt: /mnt/{{ name }}
{% endfor %}
nfsexports:
- host: {{ clustername }}-sql0
exportList:
{% for name in exports %} - src: /mnt/{{ name }}
{% endfor %}
{% for name in ['/home','/nfs/opt'] %} - src: {{ name }}
{% endfor %}
nfsmounts:
- group: LoginNodes
nfsMounts:
{% for name in exports %}
- name: /mnt/{{ name }}
ipv4: {{ clustername }}-sql0
src: /mnt/{{ name }}
fstype: nfs4
opts: "defaults,rw"
{% endfor %}
{% for name in ['/home','/nfs/opt'] %}
- name: {{ name }}
ipv4: {{ clustername }}-sql0
src: {{ name }}
fstype: nfs4
opts: "defaults,rw"
{% endfor %}
- group: ComputeNodes
nfsMounts:
{% for name in exports %}
- name: /mnt/{{ name }}
ipv4: {{ clustername }}-sql0
src: /mnt/{{ name }}
fstype: nfs4
opts: "defaults,rw" {% endfor %}
- name: /home
ipv4: {{ clustername }}-sql0
src: /home
fstype: nfs4
opts: "defaults,rw"
- group: ManagementNodes
nfsMounts:
{% for name in exports %}
- name: /mnt/{{ name }}
ipv4: {{ clustername }}-sql0
src: /mnt/{{ name }}
fstype: nfs4
opts: "defaults,rw" {% endfor %}
AutoBasePath=true
BasePath=/mnt/privatedir
#######################################################################
###
### Filesystem checks
###
# * || check_fs_mount_rw -t "fuse.glusterfs" -s "mgmt0:/gv" -f "/glusterVolume"
* || check_fs_used / 90%
# * || check_fs_used /glusterVolume 90%
* || check_fs_iused / 100%
# * || check_fs_iused /glusterVolume 100%
#######################################################################
###
### Hardware checks
###
# * || check_hw_cpuinfo 1 1 1
# * || check_hw_physmem 4048416kB 4048416kB 3%
# * || check_hw_swap 0kB 0kB 3%
# * || check_hw_eth eth0
# * || check_hw_eth lo
#######################################################################
###
### Process checks
###
# * || check_ps_service -S -u root sshd
@@ -9,11 +9,13 @@
# See the slurm.conf man page for more information.
#
ClusterName={{ clustername }}
ControlMachine={{ slurmctrl }}
ControlMachine={{ controller }}
BackupController={{ backup }}
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmctldParameters=enable_configless
SlurmUser=slurm
SlurmdUser=root
SlurmctldPort=6817
@@ -21,17 +23,19 @@ SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation={{ slurmdatadir }}
SlurmdSpoolDir={{ slurmdatadir }}
StateSaveLocation=/opt/slurm/var/state
SlurmdSpoolDir=/opt/slurm/var/spool
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid
MpiDefault=pmi2
SlurmctldPidFile=/opt/slurm/var/run/slurmctld.pid
SlurmdPidFile=/opt/slurm/var/run/slurmd.pid
#ProctrackType=proctrack/linuxproc
ProctrackType=proctrack/cgroup
#PluginDir=
CacheGroups=0
#FirstJobId=
ReturnToService=0
ReturnToService=1
RebootProgram=/sbin/reboot
#ResumeTimeout=300
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
@@ -43,35 +47,44 @@ ReturnToService=0
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
TaskPlugin=task/affinity,task/cgroup
#TaskPlugin=task/affinity
#TaskPlugin=task/affinity,task/cgroup
#JobSubmitPlugins=lua
OverTimeLimit=1
CompleteWait=10
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
SlurmctldTimeout=3000
#SlurmdTimeout=300
#InactiveLimit=0
#MinJobAge=300
KillWait=10
#Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
SchedulerType="sched/backfill"
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/linear
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
SelectType="select/cons_tres"
SelectTypeParameters=CR_Core_Memory
JobContainerType=job_container/tmpfs
PriorityType=priority/multifactor
#PriorityFlags=Ticket_Based
#PriorityCalcPeriod=5
#PriorityDecayHalfLife=0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
PriorityWeightFairshare=10000
PriorityWeightAge=10000
PriorityWeightPartition=10000
PriorityWeightJobSize=10000
PriorityMaxAge=14-0
#
# LOGGING
{% if slurmctlddebug %}
@@ -98,22 +111,35 @@ SlurmSchedLogFile={{ slurmschedlog.log }}
JobCompType=jobcomp/none
#JobCompLoc=
#
{% if slurmjob is defined %}
Prolog={{ slurmjob.prolog }}
Epilog={{ slurmjob.epilog }}
{% else %}
Prolog=/opt/slurm/etc/slurm.prolog
Epilog=/opt/slurm/etc/slurm.epilog
{% endif %}
PrologFlags=contain
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/cgroup
JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={{ slurmctrl }}
#AccountingStorageEnforce=limits,safe
AccountingStorageHost={{ controller }}
{% if slurmdbdbackup is defined %}
AccountingStorageBackupHost={{ backup }}
{% endif %}
AccountingStorageEnforce=limits,safe
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
#GRES
GresTypes=gpu
#
# Fair share
{% if slurmfairshare.def %}
PriorityWeightFairshare={{ slurmfairshare.val }}
{% endif %}
HealthCheckInterval=300
HealthCheckProgram={{ nhc_dir }}/sbin/nhc
DisableRootJobs=YES
MpiParams=ports=12000-12999
@@ -126,9 +152,14 @@ MpiParams=ports=12000-12999
{% endfor %}
{% endfor %}
{% for node in nodelist|unique %}
NodeName={{ node }} Procs={{ hostvars[node]['ansible_processor_cores'] }} State=UNKNOWN
NodeName={{ node }} Procs=1 RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 Gres=gpu:0 Weight=1 State=UNKNOWN
{% endfor %}
{% for queue in slurmqueues %}
PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ groups[queue.group]|join(',') }}
{% set nodenames = [] %}
{% for node in groups[queue.group] %}
{% if nodenames.append(node) %}
{% endif %}
{% endfor %}
PartitionName={{ queue.name }} {% if queue.default %}Default=yes{% endif %} Nodes={{ nodenames|join(',') }} {% if queue.DefaultTime is defined %} DefaultTime={{ queue.DefaultTime }} {% endif %} {% if queue.DefMemPerCPU is defined %} DefMemPerCPU={{ queue.DefMemPerCPU }} {% endif %} {% if queue.MaxTime is defined %} MaxTime={{ queue.MaxTime }} {% endif %} State=UP
{% endfor %}
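For illustration only (assumed values: nodelist containing node01 and node02, and a single queue named batch whose group resolves to the same two nodes, with default=true and MaxTime=7-00:00:00), the node and partition loops above render roughly as:

NodeName=node01 Procs=1 RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 Gres=gpu:0 Weight=1 State=UNKNOWN
NodeName=node02 Procs=1 RealMemory=1024 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 Gres=gpu:0 Weight=1 State=UNKNOWN
PartitionName=batch Default=yes Nodes=node01,node02 MaxTime=7-00:00:00 State=UP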