Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • hpc-team/HPCasCode
  • chines/ansible_cluster_in_a_box
2 results
Show changes
Showing
with 345 additions and 23 deletions
[
[
"GenericDesktops"
{% for partition in slurmqueues %}
"{{ partition.name }}"{% if not loop.last %},{% endif %}
{% endfor %}
],
{
"GenericDesktops": {
{% for partition in slurmqueues %}
"{{ partition.name }}": {
"__class__": "siteConfig",
"__module__": "siteConfig",
"agent": {
......@@ -121,7 +124,7 @@
"__class__": "cmdRegEx",
"__module__": "siteConfig",
"async": false,
"cmd": "{{ slurm_dir }}/bin/squeue -u {username} -o \\\"%i %L\\\" | tail -n -1",
"cmd": "\"{{ slurm_dir }}/bin/squeue -u {username} -o \\\"%i %L\\\" | tail -n -1\"",
"failFatal": true,
"formatFatal": false,
"host": "login",
......@@ -268,7 +271,7 @@
"__class__": "cmdRegEx",
"__module__": "siteConfig",
"async": false,
"cmd": "\"mkdir ~/.vnc ; rm -f ~/.vnc/clearpass ; touch ~/.vnc/clearpass ; chmod 600 ~/.vnc/clearpass ; passwd=\"'$'\"( dd if=/dev/urandom bs=1 count=8 2>/dev/null | md5sum | cut -b 1-8 ) ; echo \"'$'\"passwd > ~/.vnc/clearpass ; cat ~/.vnc/clearpass | vncpasswd -f > ~/.vnc/passwd ; chmod 600 ~/.vnc/passwd ; echo -e '#!/bin/bash\\nvncserver ; sleep 36000000 ' | {{slurm_dir}}/bin/sbatch -p batch -N {nodes} -n {ppn} --time={hours}:00:00 -J desktop_{username} -o .vnc/slurm-%j.out \"",
"cmd": "\"mkdir ~/.vnc ; rm -f ~/.vnc/clearpass ; touch ~/.vnc/clearpass ; chmod 600 ~/.vnc/clearpass ; passwd=\"'$'\"( dd if=/dev/urandom bs=1 count=8 2>/dev/null | md5sum | cut -b 1-8 ) ; echo \"'$'\"passwd > ~/.vnc/clearpass ; cat ~/.vnc/clearpass | vncpasswd -f > ~/.vnc/passwd ; chmod 600 ~/.vnc/passwd ; echo -e '#!/bin/bash\\nexport PATH=\"'$'\"PATH:/bin ; vncserver ; sleep 36000000 ' | {{slurm_dir}}/bin/sbatch -p {{ partition.name }} -N {nodes} -n {ppn} --time={hours}:00:00 -J desktop_{username} -o .vnc/slurm-%j.out \"",
"failFatal": true,
"formatFatal": false,
"host": "login",
......@@ -345,7 +348,7 @@
"host": "exec",
"loop": false,
"regex": [
"^.*?New 'X' desktop is \\S+(?P<vncDisplay>:[0-9]+)\\s*$"
"^.*?New .* desktop is \\S+(?P<vncDisplay>:[0-9]+)\\s*$"
],
"requireMatch": true
},
......@@ -447,6 +450,7 @@
],
"requireMatch": true
}
}
}{% if not loop.last %},{% endif %}
{% endfor %}
}
]
......@@ -121,7 +121,7 @@
"__class__": "cmdRegEx",
"__module__": "siteConfig",
"async": false,
"cmd": "squeue -u {username} -o \\\"%i %L\\\" | tail -n -1",
"cmd": "\"squeue -u {username} -o \\\"%i %L\\\" | tail -n -1\"",
"failFatal": true,
"formatFatal": false,
"host": "login",
......@@ -449,4 +449,4 @@
}
}
}
]
\ No newline at end of file
]
......@@ -2,12 +2,12 @@
- include_vars: "{{ ansible_distribution }}_{{ ansible_distribution_major_version }}_{{ ansible_architecture }}.yml"
- name: get turbovnc
shell: wget http://sourceforge.net/projects/turbovnc/files/1.2.3/turbovnc_1.2.3_amd64.deb
shell: wget http://sourceforge.net/projects/turbovnc/files/1.2.3/turbovnc_1.2.3_amd64.deb
when: ansible_os_family == "Debian"
- name: install turbovnc
apt: deb=turbovnc_1.2.3_amd64.deb
sudo: true
become: true
when: ansible_os_family == "Debian"
- name: get turbovnc
......@@ -16,19 +16,19 @@
- name: install turbovnc
yum: src=turbovnc-1.2.3.x86_64.rpm
sudo: true
become: true
when: ansible_os_family == "RedHat"
- name: copy launcher
copy: src=/tmp/{{ dest_pkg_name }} dest=/tmp/{{ dest_pkg_name }}
- name: install launcher
apt: deb=/tmp/{{ dest_pkg_name }}
sudo: true
apt: deb=/tmp/{{ dest_pkg_name }}
become: true
when: ansible_os_family == "Debian"
- name: install launcher
yum: src=/tmp/{{ dest_pkg_name }}
sudo: true
become: true
when: ansible_os_family == "RedHat"
......@@ -68,6 +68,6 @@
- python-psutil
pkg_name: ./rpmbuild/RPMS/x86_64/strudel-{{ strudel_ver }}-1.x86_64.rpm
dest_pkg_name: strudel_{{ ansible_distribution }}_{{ ansible_distribution_major_version }}_{{ strudel_ver }}_x86_64.rpm
......@@ -65,6 +65,6 @@
- python-psutil
pkg_name: ./rpmbuild/RPMS/x86_64/strudel-{{ strudel_ver }}-1.x86_64.rpm
dest_pkg_name: strudel_{{ ansible_distribution }}_{{ ansible_distribution_major_version }}_{{ strudel_ver }}_x86_64.rpm
......@@ -66,6 +66,6 @@
- python-psutil
pkg_name: ./rpmbuild/RPMS/x86_64/strudel-0.6.0-1.x86_64.rpm
dest_pkg_name: strudel_{{ ansible_distribution }}_{{ ansible_distribution_version }}_{{ hostvars[ansible_hostname]['ansible_date_time']['date'] }}_x86_64.rpm
......@@ -67,6 +67,6 @@
- python-psutil
pkg_name: ./rpmbuild/RPMS/x86_64/strudel-{{ strudel_ver }}-1.x86_64.rpm
dest_pkg_name: strudel_{{ ansible_distribution }}_{{ ansible_distribution_major_version }}_{{ strudel_ver }}_x86_64.rpm
---
- name: "Reload exports"
- name: "Reload exports"
command: exportfs -ra
delegate_to: "{{ nfs_server }}"
sudo: true
become: true
---
- name: "Templating /etc/exports"
copy: src=files/etcExports dest=/etc/exports owner=root group=root mode=644
sudo: true
become: true
register: exports
- name: "Start the Server"
service: "name=nfs state=restarted"
sudo: true
become: true
when: ansible_os_family == "RedHat" and exports.changed
- name: "Start the Server"
service: "name=nfs-kernel-server state=restarted"
sudo: true
become: true
when: ansible_os_family == "Debian" and exports.changed
- name : "Pause ... clients sometimes have errors"
......
user.max_user_namespaces=10000
- name: copying networking config for sysctl
copy:
mode: '640'
src: max_user_name_spaces.conf
dest: '/etc/sysctl.d'
become: true
- name: update max_user_name_spaces variable for live system
sysctl:
name: user.max_user_namespaces
value: "10000"
become: true
net.ipv4.tcp_max_syn_backlog=30000
net.ipv4.conf.all.accept_redirects=0
net.ipv4.udp_rmem_min=8192
net.ipv4.tcp_congestion_control=htcp
net.core.default_qdisc=fq_codel
net.ipv4.tcp_rmem=4096 87380 33554432
net.ipv4.tcp_tw_recycle=1
net.ipv4.tcp_tw_reuse=1
net.core.optmem_max=4194304
net.ipv4.tcp_slow_start_after_idle=0
net.core.wmem_max=33554432
net.ipv4.conf.all.send_redirects=0
net.core.netdev_budget=600
net.ipv4.tcp_fack=1
net.netfilter.nf_conntrack_max=1024000
net.ipv4.tcp_fastopen=1
net.ipv4.conf.all.log_martians=0
net.core.netdev_max_backlog=50000
net.ipv4.tcp_ecn=1
net.ipv4.tcp_timestamps=1
net.ipv4.tcp_mtu_probing=1
net.ipv4.tcp_wmem=4096 65536 33554432
net.nf_conntrack_max=1024000
net.core.somaxconn=1024
net.ipv4.tcp_fin_timeout=10
net.ipv4.tcp_sack=1
kernel.pid_max=4194303
net.core.rmem_max=33554432
net.ipv4.udp_wmem_min=8192
net.ipv4.tcp_dsack=1
- name: copying networking config for sysctl
copy:
mode: '640'
src: 90-network.conf
dest: '/etc/sysctl.d'
become: true
become_user: root
register: sysctl_network_conf
#The sysctl module manages entries in sysctl.conf and setting "reload :yes" performs a /sbin/sysctl -p if the sysctl file is updated. In addition this module requires a name for a specific sysctl variable in order to work.
#In this case we need to update the network configuration and making no change to the sysctl.conf file hence the module is not applicable. I am replacing it with a shell module instead.
- name: Reloading sysctl
shell: sysctl -p
become: true
become_user: root
when: sysctl_network_conf.changed
#!/bin/bash
# Report whether the NVidia UVM device node exists.
# Run as the ExecStop/check half of the create-dev-uvm systemd service.
echo "Checking nvidia devices"
# Use -e (exists), not -f: /dev/nvidia-uvm is a character device, not a
# regular file, so the original "-f" test could never succeed.
if [ -e /dev/nvidia-uvm ]; then
echo "Device created"
else
echo "Device not created"
fi
# Systemd unit that creates the NVidia UVM device node on visualisation
# nodes at boot (after the Lustre client is up, since GPU jobs need both).
[Unit]
Description=Check Nvidia UVM devices
After=lustre-client.service
Wants=lustre-client.service
[Service]
# oneshot + RemainAfterExit: run the create script once and stay "active"
# so dependents can order after it.
Type=oneshot
ExecStart=/usr/local/sbin/create-dev-uvm.sh
RemainAfterExit=true
# On stop, just report the device state (see check-dev-uvm.sh).
ExecStop=/usr/local/sbin/check-dev-uvm.sh
StandardOutput=journal
[Install]
WantedBy=multi-user.target
#!/bin/bash
# This script is intended to be run via systemd during startup of visnodes to create NVidia UVM Devices
# Running deviceQuery loads the NVidia kernel modules as a side effect;
# the stub/CUDA library paths let it run before any user environment exists.
LDFLAGS="-L/usr/local/cuda/8.0.61/lib64/stubs -L/usr/lib64 -L/usr/local/cuda/8.0.61/lib64 -L/usr/local/cuda/8.0.61/lib" /usr/local/cuda/8.0.61/samples/1_Utilities/deviceQuery/deviceQuery
# Ask the driver to create the UVM device node (-u) with minor number 0.
/bin/nvidia-modprobe -u -c 0
# If nvidia-modprobe did not create the node, make it by hand.
if [ ! -e /dev/nvidia-uvm ]; then
# Find out the major device number used by the nvidia-uvm driver
D=`grep nvidia-uvm /proc/devices | awk '{print $1}'`
mknod -m 666 /dev/nvidia-uvm c $D 0
else
exit 0
fi
---
- name: install nvidia-modprobe on ubuntu
package:
name: nvidia-modprobe
state: present
become: true
when: ansible_os_family == 'Debian'
- name: Copy Files
become: true
become_user: root
copy:
src: "{{ item.src }}"
dest: "{{ item.dest }}"
owner: root
mode: 0755
with_items:
- { src: 'create-dev-uvm.sh', dest: '/usr/local/sbin/' }
- { src: 'check-dev-uvm.sh', dest: '/usr/local/sbin/' }
- { src: 'create-dev-uvm.service', dest: '/etc/systemd/system/' }
- name: Enable Service
become: true
become_user: root
systemd: enabled=yes state=started name=create-dev-uvm
#!/bin/python -E
"""List the available InfiniBand hw_counters for every IB device as JSON.

Output shape: {"<device_name>": ["<counter file>", ...], ...}
"""
import json
import os


def render_data(data):
    """Serialize *data* to a JSON string."""
    return json.dumps(data)


def collect_hw_counters(prefix='/sys/class/infiniband',
                        suffix='ports/1/hw_counters'):
    """Return a mapping of IB device name -> list of hw_counter files.

    Only port 1 is inspected, matching the original script's behaviour.
    Raises OSError if *prefix* does not exist (no IB hardware).
    """
    hw_counters = {}
    for device_name in os.listdir(prefix):
        path = os.path.join(prefix, device_name, suffix)
        hw_counters[device_name] = os.listdir(path)
    return hw_counters


if __name__ == '__main__':
    # Guarded so importing this module does not touch /sys.
    print(render_data(collect_hw_counters()))
\ No newline at end of file
# Pulls statistics from nvidia GPUs attached to the host
[[inputs.nvidia_smi]]
## Optional: path to nvidia-smi binary, defaults to $PATH via exec.LookPath
# bin_path = "/usr/bin/nvidia-smi"
## Optional: timeout for GPU polling
# timeout = "5s"
\ No newline at end of file
#!/usr/bin/python
"""Emit NFS client mount statistics from /proc/self/mountstats as
InfluxDB line protocol (one 'mountstats' record per mount, plus
per-operation 'nfsops' records)."""

# Field names for the "xprt:" (RPC transport) line, by transport type.
xprtudpcounters = "proto,port,bind_count,rpcsends,rpcreceives,badxids,inflightsends,backlogutil".split(',')
xprttcpcounters = "proto,port,bind_count,connect_count,connect_time,idle_time,rpcsends,rpcreceives,badxids,inflightsends,backlogutil".split(',')
# BUGFIX: a stray quote previously corrupted the entry "total_rdma_req'".
xprtrdmacounters = "proto,port,bind_count,connect_count,idle_time,rpcsends,rpcreceives,badxids,backlogutil,read_chunks,write_chunks,reply_chunks,total_rdma_req,total_rdma_rep,pullup,fixup,hardway,failed_marshal,bad_reply".split(',')
# Field names for each per-operation line (e.g. "GETATTR: ...").
NfsOpCounters = "operations,transmissions,major_timeouts,bytes_sent,bytes_recv,queue_time,response_time,request_time".split(',')
# NFS operations whose per-op counters we report.
OPS = "RELEASE_LOCKOWNER,LOCK,LOCKT,LOCKU,GETATTR,WRITE,OPEN,CLOSE,ACCESS,RENAME,SYMLINK,CREATE".split(',')
# Field names for the "events:" line (VFS-level event counters).
NfsEventCounters = [
    'inoderevalidates',
    'dentryrevalidates',
    'datainvalidates',
    'attrinvalidates',
    'vfsopen',
    'vfslookup',
    'vfspermission',
    'vfsupdatepage',
    'vfsreadpage',
    'vfsreadpages',
    'vfswritepage',
    'vfswritepages',
    'vfsreaddir',
    'vfssetattr',
    'vfsflush',
    'vfsfsync',
    'vfslock',
    'vfsrelease',
    'congestionwait',
    'setattrtrunc',
    'extendwrite',
    'sillyrenames',
    'shortreads',
    'shortwrites',
    'delay'
]
# Field names for the "bytes:" line (I/O byte counters).
NfsByteCounters = [
    'normalreadbytes',
    'normalwritebytes',
    'directreadbytes',
    'directwritebytes',
    'serverreadbytes',
    'serverwritebytes',
    'readpages',
    'writepages'
]
class DeviceData:
    """DeviceData objects provide methods for parsing and displaying
    data for a single mount grabbed from /proc/self/mountstats.
    """

    def __init__(self):
        # Parsed counters keyed by section: 'bytes', 'events', or an
        # NFS op name from OPS.
        self.__nfs = dict()
        # Identity of the mount: export, mountpoint, fstype.
        self.__nfs_device = dict()

    def fstype(self):
        """Return the mount's filesystem type (e.g. 'nfs', 'nfs4')."""
        return self.__nfs_device['fstype']

    def tags(self):
        """Render the mount identity as InfluxDB tag pairs."""
        return ",".join(["{}={}".format(key, value)
                         for key, value in self.__nfs_device.items()])

    def values(self):
        """Render byte and event counters as InfluxDB field pairs.

        Returns None when this mount has no NFS counter sections (e.g.
        a lustre mount that was not filtered out before parsing).
        """
        try:
            values = ",".join(["{}={}".format(key, value)
                               for key, value in self.__nfs['bytes']])
            values += ","
            values += ",".join(["{}={}".format(key, value)
                                for key, value in self.__nfs['events']])
        except KeyError:
            # key error occurs if we haven't filtered the lustre mount
            # points from the NFS mount points yet
            return None
        return values

    def opvalues(self, op):
        """Render the per-operation counters for *op* as field pairs."""
        return ",".join(["{}={}".format(key, value)
                         for key, value in self.__nfs[op]])

    def __parse_device_line(self, words):
        # "device <export> mounted on <mountpoint> with fstype <type> ..."
        if words[0] == 'device':
            self.__nfs_device['export'] = words[1]
            self.__nfs_device['mountpoint'] = words[4]
            self.__nfs_device['fstype'] = words[7]

    def __parse_bytes_line(self, words):
        if words[0] == 'bytes:':
            # list() so the pairs survive repeated iteration — zip() is
            # a one-shot iterator on Python 3.
            self.__nfs['bytes'] = list(zip(NfsByteCounters,
                                           [int(x) for x in words[1:]]))

    def __parse_events_line(self, words):
        if words[0] == 'events:':
            self.__nfs['events'] = list(zip(NfsEventCounters,
                                            [int(x) for x in words[1:]]))

    def __parse_ops_line(self, words):
        # Per-op lines look like "GETATTR: <8 counters>".
        if words[0][:-1] in OPS:
            self.__nfs[words[0][:-1]] = list(zip(NfsOpCounters,
                                                 [int(x) for x in words[1:]]))

    def __parse_xprt_line(self, words):
        # Transport stats; field names depend on the transport protocol.
        if words[0] == 'xprt:':
            if words[1] == 'udp':
                self._rpc = list(zip(xprtudpcounters, words[1:11]))
            if words[1] == 'tcp':
                self._rpc = list(zip(xprttcpcounters, words[1:11]))
            if words[1] == 'rdma':
                self._rpc = list(zip(xprtrdmacounters, words[1:11]))

    def parse_stats(self, lines):
        """Turn a list of lines from a mount stat file into a
        dictionary full of stats, keyed by name.
        """
        for line in lines:
            words = line.split()
            if len(words) == 0:
                continue
            self.__parse_device_line(words)
            self.__parse_bytes_line(words)
            self.__parse_events_line(words)
            self.__parse_ops_line(words)
            self.__parse_xprt_line(words)
def parse_stats_file(filename):
    """Pop the contents of a mountstats file into a dictionary,
    keyed by mount point. Each value is the list of (stripped) lines
    in the mountstats file belonging to the mount point in the key.
    """
    ms_dict = dict()
    key = ''
    # Guard against a stanza line arriving before any device header.
    new = []
    # BUGFIX: the original called "f.close" without parentheses, so the
    # file was never closed; "with" guarantees closure.
    with open(filename) as f:
        for line in f:
            words = line.split()
            if len(words) == 0:
                continue
            if line.startswith("no device mounted"):
                continue
            if words[0] == 'device':
                key = words[4]
                new = [line.strip()]
            elif 'nfs' in words or 'nfs4' in words:
                key = words[3]
                new = [line.strip()]
            else:
                new += [line.strip()]
            # Re-point the key at the (shared, growing) stanza list.
            ms_dict[key] = new
    return ms_dict
def iostats(mountstats):
    """Build one parsed DeviceData per mount point.

    *mountstats* maps mount point -> list of raw stanza lines (as
    produced by parse_stats_file); the result maps the same keys to
    DeviceData objects.
    """
    parsed = {}
    for mountpoint, stanza in mountstats.items():
        device_data = DeviceData()
        device_data.parse_stats(stanza)
        parsed[mountpoint] = device_data
    return parsed
def print_influx_line_proto(device, stats):
    """Print InfluxDB line-protocol records for one mount.

    Emits one 'mountstats' record plus one 'nfsops' record per NFS
    operation. Non-NFS mounts and mounts with incomplete stats are
    silently skipped.
    """
    try:
        if not 'nfs' in stats.fstype():
            return
        print("mountstats,{} {}".format(stats.tags(), stats.values()))
        for op in OPS:
            print("nfsops,{},op={} {}".format(stats.tags(), op,
                                              stats.opvalues(op)))
    except Exception:
        # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt
        # still propagate; incomplete stanzas (KeyError etc.) are skipped.
        return
# Entry point: parse the kernel's per-mount statistics and emit one
# line-protocol record (plus per-op records) for every NFS mount.
mountstats = parse_stats_file("/proc/self/mountstats")
stats = iostats(mountstats)
for device, device_stats in stats.items():
    print_influx_line_proto(device, device_stats)