Skip to content
Snippets Groups Projects
Commit 3b82f500 authored by Chris Hines's avatar Chris Hines
Browse files
parents db9fcff7 60d4566a
No related branches found
No related tags found
No related merge requests found
Showing
with 122 additions and 63580 deletions
......@@ -3,6 +3,6 @@
include: installOpenVPN.yml
- name: "Start OpenVPN"
service: name=openvpn state=started
service: name=openvpn state=started enabled=yes
sudo: true
......@@ -3,5 +3,5 @@
include: installOpenVPN.yml
- name: "Start OpenVPN"
service: name=openvpn state=started
service: name=openvpn state=started enabled=yes
sudo: true
......@@ -29,6 +29,6 @@
-
name: "Starting Apache2"
service: name=apache2 state=started
service: name=apache2 state=started enabled=yes
sudo: true
......@@ -30,7 +30,8 @@ SwitchType=switch/none
MpiDefault=pmi2
SlurmctldPidFile={{ slurmpiddir }}/slurmctld.pid
SlurmdPidFile={{ slurmpiddir }}/slurmd.pid
ProctrackType=proctrack/linuxproc
#ProctrackType=proctrack/linuxproc
ProctrackType=proctrack/cgroup
#PluginDir=
CacheGroups=0
#FirstJobId=
......@@ -78,16 +79,16 @@ SelectType={{ slurmselecttype }}
SelectTypeParameters=CR_Core_Memory
{% endif %}
FastSchedule={{ slurmfastschedule }}
#PriorityType=priority/multifactor
PriorityType=priority/multifactor
#PriorityFlags=Ticket_Based
#PriorityCalcPeriod=5
#PriorityDecayHalfLife=0
#PriorityUsageResetPeriod=14-0
##PriorityWeightFairshare=10000
#PriorityWeightAge=10000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=10000
#PriorityMaxAge=14-0
PriorityWeightFairshare=10000
PriorityWeightAge=10000
PriorityWeightPartition=10000
PriorityWeightJobSize=10000
PriorityMaxAge=14-0
#
# LOGGING
{% if slurmctlddebug %}
......@@ -117,24 +118,27 @@ JobCompType=jobcomp/none
{% if slurmjob is defined %}
Prolog={{ slurmjob.prolog }}
Epilog={{ slurmjob.epilog }}
{% else %}
Prolog={{ slurm_dir }}/bin/slurm.prolog
Epilog={{ slurm_dir }}/bin/slurm.epilog
{% endif %}
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
#
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost={{ slurmdbd }}
{% if slurmdbdbackup is defined %}
AccountingStorageBackupHost={{ slurmdbdbackup }}
{% endif %}
#AccountingStorageEnforce=limits,safe
AccountingStorageEnforce=limits,safe
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
#GRES
GresTypes=gpu
#GresTypes=gpu
# Fair share
{% if slurmfairshare.def %}
......@@ -155,6 +159,10 @@ MpiParams=ports=12000-12999
NodeName={{ hostvars[node]['ansible_hostname'] }} Procs={{ hostvars[node]['ansible_processor_vcpus'] }} RealMemory={{ hostvars[node].ansible_memory_mb.real.total }} Sockets={{ hostvars[node]['ansible_processor_vcpus'] }} CoresPerSocket=1 ThreadsPerCore={{ hostvars[node].ansible_processor_threads_per_core }} {% if hostvars[node].ansible_hostname.find('vis') != -1 %}Gres=gpu:1{% endif %} {% if hostvars[node]['ansible_processor_vcpus'] == 1 %}Weight=1{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 1 and hostvars[node]['ansible_processor_vcpus'] <= 16 %}Weight=3{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 16 and hostvars[node]['ansible_processor_vcpus'] <= 20 %}Weight=5{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 20 and hostvars[node]['ansible_processor_vcpus'] <= 40 %}Weight=7{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 40 and hostvars[node]['ansible_processor_vcpus'] <= 64 %}Weight=8{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 64 and hostvars[node]['ansible_processor_vcpus'] <= 128 %}Weight=9{% endif %}{% if hostvars[node]['ansible_processor_vcpus'] > 128 %}Weight=10{% endif %} Feature=stage1 State=UNKNOWN
{% endfor %}
#monarch specific to stop stupid warning messages
NodeName={{ hostvars[groups['LoginNodes'][0]]['ansible_hostname'] }} State=DOWN
NodeName={{ slurmctrl }} State=DOWN
{% for queue in slurmqueues %}
{% set nodenames = [] %}
{% for node in groups[queue.group] %}
......
---
- name: grab cacert
shell: cat {{ ldapCertDest }}
# shell: cat /etc/openldap/certs/cacert.pem
shell: cat {{ ldapCARootDest }}
register: ldapCaCertContents
- name: dump vars
......
---
- include_vars: "{{ ansible_os_family }}.yml"
- name: Install epel-release
yum: name=epel-release-7-5.noarch state=present
sudo: true
when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7"
- name: Enable epel
command: yum-config-manager --enable epel
sudo: true
when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7"
- name: install lua
yum: name={{ item }} state=installed
with_items:
......
......@@ -12,7 +12,7 @@
sudo: true
- name: install gluster
apt: name=glusterfs state='latest'
apt: name=glusterfs-client state='latest'
when: ansible_os_family == 'Debian'
sudo: true
......
......@@ -15,13 +15,19 @@
sudo: true
- name: install gluster
apt: name=glusterfs state='latest'
apt: name=glusterfs-server state='latest'
when: ansible_os_family == 'Debian'
sudo: true
- name: start daemon
service: name=glusterd enabled=yes state=started
sudo: true
when: ansible_os_family == 'RedHat'
- name: start daemon
service: name=glusterfs-server enabled=yes state=started
sudo: true
when: ansible_os_family == 'Debian'
- name: make server list
set_fact:
......
......@@ -116,7 +116,7 @@
sudo: true
- name: install shibboleth cache file
template: src="{{ shibboleth_deploy }}_metadata.aaf.xml.j2" dest=/tmp/metadata.aaf.xml
template: src="files/{{ shibboleth_deploy }}_metadata.aaf.xml.j2" dest=/tmp/metadata.aaf.xml
-
name: "enabling Karaage configuration"
......@@ -154,11 +154,11 @@
sudo: true
when: karaage_db_init.stdout.find("0") == 0
#-
# name: "Create IDP institutes (disable it as cache is not available)"
# shell: kg-idps /tmp/metadata.aaf.xml
# sudo: true
# when: karaage_db_init.stdout.find("0") == 0
-
name: "Create IDP institutes (disable it as cache is not available)"
shell: kg-idps /tmp/metadata.aaf.xml
sudo: true
when: karaage_db_init.stdout.find("0") == 0
-
name: "Create projects"
......
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<body><h3>HPC identity system (this landing page is under construction)</h3>
<br>
<p>Monash HPC identity system is a new HPC access control system. Access to the HPC ID system is done through the Australian Access Federation (AAF). This allows you to login using your Institutional username and password.
<br>
<br>
If this is the first time you are using the system, it will give you the option to select your existing HPC username when creating a new user account. You'll need to join a project before you can access the HPC system.
<br>
<br>
If your organisation is not a member of the AAF, or if you need help, please email HPC support: help@massive.org.au.</p>
<br>
<p>Click the following link <a href="https://{{ ansible_fqdn }}/aafbootstrap">to continue.</a></p>
</body>
</html>
<html><body><h3>HPC identity management</h3>
<p>To log in via AAF authentication, connect to <a href=https://{{ hpchostname }}.erc.monash.edu.au/aafbootstrap>aafbootstrap</a></p>
<p>To log in without AAF authentication, connect to <a href=https://{{ hpchostname }}.erc.monash.edu.au/users>users</a></p>
</body></html>
......@@ -26,8 +26,6 @@ class HpcIdInit():
if self.path and os.path.exists(self.path):
with open(self.path) as data:
config_data = json.load(data)
self.project = config_data["project"]
self.mc = config_data["machine_category"]
self.user = config_data["superuser"]
else:
log("Invalid input data")
......@@ -44,115 +42,6 @@ class HpcIdInit():
now = time.strftime("%c")
self.logfile.write(now + ": " + message + "\n")
def getGroup(self, name):
    """Look up a Group by name and log the outcome.

    Returns the Group, or None when it does not exist or the lookup fails.
    """
    group = None
    try:
        group = Group.objects.get(name = name)
        if group:
            self.log("Find group %s" %(name))
    except Group.DoesNotExist:
        self.log("Group %s not found" %(name))
    except Exception:
        # Anything else (e.g. a database error) is not a "not found"; record
        # the traceback instead of reporting a misleading message.
        self.log("Get group %s exception: %s" %(name, traceback.format_exc()))
    # Returning after the try (rather than inside `finally`) lets
    # KeyboardInterrupt/SystemExit propagate instead of being swallowed.
    return group
def getProject(self, name):
    """Look up a Project by name and log what was found.

    Returns the Project, or None when it does not exist or the lookup fails.
    """
    self.log("Get Project 1 %s" %(name))
    project = None
    try:
        project = Project.objects.get(name = name)
        if project:
            self.log("Find project %s" %(project.name))
            group = project.group
            if group:
                self.log("Group name = %s" %(group.name))
        else:
            # Use the requested name: `project` is falsy here, so reading
            # project.name could itself fail.
            self.log("Project %s not found" %(name))
    except Project.DoesNotExist:
        self.log("project %s does not exists" %(name))
    except Exception:
        # log() takes a single message; the traceback must be formatted into
        # it, otherwise the call raises TypeError and nothing is recorded.
        self.log("Exception: %s" %(traceback.format_exc()))
    # Returning after the try (rather than inside `finally`) lets
    # KeyboardInterrupt/SystemExit propagate instead of being swallowed.
    return project
def createProject(self, pid, name, institute_name, superuser):
    """Create an active, approved Project under the named institute.

    The project uses the institute's group, is approved by `superuser`, and
    `superuser` is added as a project leader. Returns the Project, or None
    when the institute is missing or creation fails.
    """
    project = None
    try:
        institute = self.getInstitute(institute_name)
        if institute:
            self.log("Find insititute %s" %(institute.name))
            project = Project.objects.create(pid = pid, name = name, institute = institute, group = institute.group, is_active = True, is_approved = True, approved_by = superuser)
            if project:
                project.leaders.add(superuser)
                self.log("Create project OK")
            else:
                self.log("Create project failed")
        else:
            self.log("Insititute %s does not exist" %(institute_name))
    except Exception:
        # log() takes a single message; the traceback must be formatted into
        # it, otherwise the call raises TypeError and nothing is recorded.
        self.log("Exception: %s" %(traceback.format_exc()))
    # Returning after the try (rather than inside `finally`) lets
    # KeyboardInterrupt/SystemExit propagate instead of being swallowed.
    return project
# Look up an Institute by name and log the outcome.
# Returns the Institute, or None when the lookup raises.
def getInstitute(self, name):
institute = None
try:
institute = Institute.objects.get(name = name)
if institute:
self.log("Institute %s exist" %(institute.name))
group = institute.group
if group:
self.log("Group name = %s" %(group.name))
# NOTE(review): indentation is lost in this listing, so it is unclear whether
# this else pairs with `if institute` or with `if group` - confirm.
else:
self.log("Institute %s not found" %(name))
except Institute.DoesNotExist:
self.log("Institute %s not found" %(name))
# `return` inside `finally` discards any in-flight exception, so errors other
# than Institute.DoesNotExist are dropped without being logged.
finally:
return institute
def getDefaultDatastore(self):
    """Return the first key of settings.MACHINE_CATEGORY_DATASTORES whose
    value is truthy, or None when no entry qualifies."""
    datastores = settings.MACHINE_CATEGORY_DATASTORES
    configured = (label for label, config in datastores.items() if config)
    return next(configured, None)
def getMachineCategory(self, name):
    """Look up a MachineCategory by name and log the outcome.

    Returns the MachineCategory, or None when it does not exist or the
    lookup fails.
    """
    mc = None
    self.log("Running getMachineGategory %s" %(name))
    try:
        mc = MachineCategory.objects.get(name = name)
        if mc:
            self.log("Find machine category %s" %(mc.name))
        else:
            # Was `slef.log(...)`, which raised NameError.
            self.log("Not found machine category %s" %(name))
    except MachineCategory.DoesNotExist:
        self.log("Machine category %s dose not exist" %(name))
    except Exception:
        self.log("Except to create machine category %s" %(traceback.format_exc()))
    # Returning after the try (rather than inside `finally`) lets
    # KeyboardInterrupt/SystemExit propagate instead of being swallowed.
    return mc
def getOrCreateMachineCategory(self, name):
    """Return the MachineCategory called `name`, creating it if necessary.

    A new category is created with the datastore returned by
    getDefaultDatastore(). Returns the MachineCategory, or None when lookup
    and creation both fail.
    """
    mc = None
    try:
        self.log("getOrCreateMachineCategory %s" %(name))
        mc = self.getMachineCategory(name)
        if not mc:
            datastore = self.getDefaultDatastore()
            self.log("datastore = '%s'" %(datastore))
            # get_or_create returns an (object, created) tuple; unpack it so
            # `mc` is the model instance rather than the tuple.
            mc, _created = MachineCategory.objects.get_or_create(name = name, datastore = datastore)
            self.log("after create machine catetory '%s'" %(name))
            # NOTE(review): indentation is lost in the original listing; this
            # check is assumed to belong to the creation path - confirm.
            if mc:
                self.log("Create MachineCategory %s OK" %(mc.name))
            else:
                self.log("Create MachineCategory failed")
    except Exception:
        self.log("Except to create machine category %s" %(traceback.format_exc()))
    # Returning after the try (rather than inside `finally`) lets
    # KeyboardInterrupt/SystemExit propagate instead of being swallowed.
    return mc
def getUser(self, username):
person = None
try:
......@@ -171,33 +60,13 @@ class HpcIdInit():
if person:
person.set_password(self.password)
person.save()
result = self.addInstituteDelegate(person, institute)
if result:
log("Add super user %s to institute %s delegate" %(person.username, institute.name))
else:
log("Faired to add super user %s to institute %s delegate" %(person.username, institute.name))
person.full_clean()
except:
log("Create super user exception: %s" %(traceback.format_exc()))
finally:
return person
def addInstituteDelegate(self, su, institute):
    """Ensure `su` is a delegate of `institute`.

    Adds `su` to institute.delegates unless a delegate with the same
    username is already present. Returns True on success (including when
    nothing had to be added) and False when an error occurred; the error's
    traceback is logged.
    """
    try:
        # exists() answers "is there a match?" without loading the rows.
        already_delegate = institute.delegates.filter(username = su.username).exists()
        if not already_delegate:
            self.log("Create institution delegate %s"%(su.username))
            institute.delegates.add(su)
            self.log("Create institution delegate %s OK"%(su.username))
    except Exception:
        self.log("Create institution delegate exception: %s" %(traceback.format_exc()))
        return False
    return True
def setup(self):
self.log("Debug = %s" %(self.debug))
su = self.getUser(self.user["username"])
if su:
self.log("Find super user %s" %(su.username))
......@@ -207,25 +76,6 @@ class HpcIdInit():
self.log("Create super user %s OK" %(su.username))
else:
self.log("Create super user %s failed" %(self.user["username"]))
if self.mc:
mc = self.getOrCreateMachineCategory(self.mc)
if mc:
self.log("Get machine category = '%s'" %(self.mc))
else:
self.log("Failed to get machine category = '%s'" %(self.mc))
if su:
for p in self.project:
project = self.getProject(p["project_name"])
if project:
self.log("Find project %s" %(project.name))
else:
self.log("Create project name = %s, pid = %s, institute name = %s" %(p["project_name"], p["pid"], p["institute_name"]))
project = self.createProject(p["pid"], p["project_name"], p["institute_name"], su)
if project:
self.log("Create project %s OK." %(project.name))
else:
self.log("Create project %s failed." %(p["project_name"]))
break
def main(argv):
config_path = None
......@@ -236,7 +86,6 @@ def main(argv):
if len(sys.argv) > 3:
debug = argv[2]
init = HpcIdInit(config_path, password, debug)
init.log("Password = %s, debug = %s" %(password, debug))
init.setup()
else:
print "Usage: kg_init <config file> <superuser password> <option: debug True | False>"
......
{"project": [{"project_name": "MCC2", "pid": "pMcc2", "institute_name": "Monash University"}, {"project_name": "CVL", "pid": "pCvl", "institute_name": "Monash University"}], "machine_category": "hpc_cluster", "superuser": {"username": "admin", "email": "jupiter.hu@monash.edu", "institute_name": "Monash University", "short_name": "admin", "full_name": "admin"}}
{"superuser": {"username": "admin", "email": "jupiter.hu@monash.edu", "institute_name": "Monash University", "short_name": "admin", "full_name": "admin"}}
This diff is collapsed.
......@@ -267,7 +267,7 @@ LOGGING = {
# Users are advised to contact this address if having problems.
# This is also used as the from address in outgoing emails.
ACCOUNTS_EMAIL = '{{ karaageAdminEmail }}'
ACCOUNTS_EMAIL = '{{ karaageAccountEmail }}'
# This organisation name, used in outgoing emails.
ACCOUNTS_ORG_NAME = '{{ karaageAcountName }}'
......
This diff is collapsed.
......@@ -40,7 +40,5 @@
notify: restart sssd
- name: "start sssd"
service: name=sssd state=started
service: name=sssd state=started enabled=yes
sudo: true
......@@ -77,6 +77,10 @@
copy: src="files/{{ ldapCAChain }}" dest="{{ ldapCAChainDest }}"
sudo: true
- name: copy ca root cert
copy: src="files/{{ ldap_TLSCARoot }}" dest="{{ ldapCARootDest }}"
sudo: true
when: ldap_TLSCARoot is defined
- name: copy key
copy: src="files/{{ ldapKey }}" dest="{{ ldapKeyDest }}" mode=600 owner={{ ldapuser }} group={{ ldapgroup }}
......
......@@ -2,3 +2,4 @@
ldapCertDest: "{{ ldapDir }}/ssl/certs/ldapcert.pem"
ldapKeyDest: "{{ ldapDir }}/ssl/private/ldapkey.pem"
ldapCAChainDest: "{{ ldapDir }}/ssl/certs/cacert.pem"
ldapCARootDest: "{{ ldapDir }}/ssl/certs/ca_cert.pem"
......@@ -12,6 +12,17 @@
# sudo: true
# when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7"
- name: Install epel-release
yum: name=epel-release-7-5.noarch state=present
sudo: true
when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7"
- name: Enable epel
command: yum-config-manager --enable epel
sudo: true
when: ansible_distribution == "CentOS" and ansible_distribution_major_version == "7"
- name: install lua
yum: name={{ item }} state=installed
with_items:
......
---
# Install the Lustre client packages and mount the Lustre filesystems.
# Expects `lustre_pkgs` (package file names; presumably supplied by the
# per-distribution vars file loaded below - confirm) and `mntlist` (items with
# `mntpt`, `servers` and `src`) to be defined.
- include_vars: "{{ ansible_distribution }}_{{ ansible_distribution_major_version }}.yaml"

# Stage every package file in /tmp on the target host.
- name: copy rpms/debs
  copy: dest=/tmp/ src=lustre-install/{{ item }}
  with_items: "{{ lustre_pkgs }}"

#- name: install rpms
#  yum: name="/tmp/{{ item }}"
#  sudo: true
#  with_items: "{{ lustre_pkgs }}"

# The kernel modules package is installed before the client package.
- name: install rpms
  yum: name=/tmp/lustre-client-modules-2.7.0-3.10.0_229.14.1.el7.x86_64.x86_64.rpm
  sudo: true
  when: ansible_os_family == "RedHat"

- name: install rpms
  yum: name=/tmp/lustre-client-2.7.0-3.10.0_229.14.1.el7.x86_64.x86_64.rpm
  sudo: true
  when: ansible_os_family == "RedHat"

# instructions to build these debs:
# Instantiate an Ubuntu 14.04 instance
# git clone git://git.hpdd.intel.com/fs/lustre-release.git
# cd lustre-release
# optionally git checkout 0754bc8f2623bea184111af216f7567608db35b6 <- I know this commit works on Ubuntu, but I had a lot of trouble with other branches
# sh autogen.sh
# ./configure --enable-dist --disable-doc --disable-server --disable-dependency-tracking --with-o2ib=/var/lib/dkms/mlnx-ofed-kernel/3.1/build/
# mkdir BUILD
# cd BUILD
# ln -s ../lustre-2.7.62.tar.gz lustre-2.7.62.orig.tar.gz
# tar zxvf ../lustre-2.7.62.tar.gz
# cd lustre-2.7.62
# ./configure --disable-doc --disable-server --disable-dependency-tracking --with-o2ib=/var/lib/dkms/mlnx-ofed-kernel/3.1/build/
# vi debian/changelog (the version number on the first line is incorrect)
# make debs
#
# Local .deb files must be passed through `deb=`; `name=` is looked up as a
# package name in the apt cache and cannot install a file path.
- name: install debs
  apt: deb="/tmp/{{ item }}"
  sudo: true
  with_items: "{{ lustre_pkgs }}"
  when: ansible_distribution == "Ubuntu" and ansible_distribution_major_version == "14"

# The source is one quoted value; two adjacent quoted strings joined by "/"
# would leave literal quote characters inside the mount source.
- name: "Mount lustre filesystems"
  mount: name="{{ item.mntpt }}" src="{{ item.servers }}/{{ item.src }}" state="mounted" fstype="lustre" opts="_netdev,flock"
  sudo: true
  with_items: "{{ mntlist }}"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment