Commit 692d69df authored by Chris Hines's avatar Chris Hines
Browse files

initial commit for status page integration

parents
*~
__pycache__
.vscode
config.yml
def main():
    """Entry point: read the YAML config, run each component's tests and
    push the aggregated status to statuspage.io."""
    import logging
    logging.basicConfig()
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    # urllib3/paramiko are very chatty at DEBUG; silence them.
    logging.getLogger('urllib3').setLevel(logging.CRITICAL)
    logging.getLogger('paramiko').setLevel(logging.CRITICAL)
    import argparse
    import yaml
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='./config.yml')
    args = parser.parse_args()
    from . import statuspagewrapper
    from . import influxdbwrapper
    from .influxdbwrapper import Client as influxWrapperClient
    from . import component_tests
    from .status import Status
    with open(args.config) as f:
        config = yaml.safe_load(f.read())
    influxClient = influxdbwrapper.Client.configure(config['influx'])
    spClient = statuspagewrapper.Client.configure(config['statuspageio'])
    # NOTE(review): spcomponents is never populated, so the lookups below
    # raise KeyError for every configured component.  Presumably this dict
    # should be filled from spClient's component list -- confirm and fix.
    spcomponents = {}
    for name,c in config['components'].items():
        msgs = []
        # Status is an IntEnum ordered by severity, so max() keeps the worst
        # result seen across this component's tests.
        overall = Status.operational
        print(name)
        print(spcomponents[name])
        # Components marked under maintenance on the status page are skipped.
        if spcomponents[name]['status'] == 'under_maintenance':
            continue
        for t in c['tests']:
            # t['class'] selects the test implementation; the remaining keys
            # of t are forwarded as that test's keyword configuration.
            testclass = component_tests.getTestClass(classname=t['class'],influxClient=influxClient,**t)
            result = testclass.test()
            print(result)
            overall = max(overall,result['status'])
            if result['status'] != Status.operational:
                msgs.append(result['msg'])
        update_component(spClient,spcomponents[name],overall,"\n".join(msgs))
        #update_incident(spclient,spcomponents[name],overall,"\n".join(msgs),open_incidents)
def update_component(spclient, component, status, msg):
    """Push *status* for *component* to statuspage.io.

    :param spclient: configured statuspageio client
    :param component: statuspage component record (must expose ``.id``)
    :param status: a :class:`Status` member
    :param msg: aggregated message text (currently unused by the API call)
    :raises KeyError: if *status* is not a known Status member
    """
    from .status import Status
    # Map every Status member to its statuspage.io status string.  The
    # original if-chain omitted Status.degraded_performance, which left
    # statusstr unbound and raised UnboundLocalError for that status.
    statusmap = {
        Status.operational: 'operational',
        Status.degraded_performance: 'degraded_performance',
        Status.partial_outage: 'partial_outage',
        Status.major_outage: 'major_outage',
        Status.under_maintenance: 'under_maintenance',
    }
    statusstr = statusmap[status]
    spclient.components.update(component.id, status=statusstr)
# Allow the module to be executed directly as a script.
if __name__ == '__main__':
    main()
from ..influxrate import InfluxDBRate
from ..status import Status
def getTestClass(classname, influxClient, **kwargs):
    """Factory: build the component-test object named by *classname*.

    :param classname: one of 'NFSFS', 'LustreFS', 'SSHServer', 'HTTPService'
    :param influxClient: influx client, forwarded only to the filesystem tests
    :param kwargs: forwarded to the selected test's constructor
    :raises ValueError: for an unknown *classname*.  The original silently
        returned None here, which made the caller fail later with a
        confusing AttributeError on ``.test()``.
    """
    if classname == 'NFSFS':
        return NFSFS(influxClient, **kwargs)
    if classname == 'LustreFS':
        return LustreFS(influxClient, **kwargs)
    if classname == 'SSHServer':
        return SSHServer(**kwargs)
    if classname == 'HTTPService':
        return HTTPService(**kwargs)
    raise ValueError("unknown test class: {}".format(classname))
class HTTPService():
    """Component test: check that an HTTP(S) URL responds at all."""

    def __init__(self, **kwargs):
        # kwargs must contain 'url', the endpoint to probe.
        self.url = kwargs['url']

    def test(self):
        """Return an operational status dict if a GET of the URL succeeds,
        major_outage otherwise.

        Fixes over the original: the bare ``except:`` also swallowed
        KeyboardInterrupt/SystemExit, and the request had no timeout, so a
        hung server stalled the whole status run indefinitely.
        """
        import requests
        try:
            requests.get(self.url, timeout=30)
            return {'status': Status.operational, 'msg': 'OK'}
        except requests.exceptions.RequestException:
            return {'status': Status.major_outage, 'msg': 'failed'}
class NFSFS():
    """Component test: infer NFS filesystem health from read/write page
    activity recorded in influx (mountstats measurement)."""

    def __init__(self, influxClient, database, **kwargs):
        """
        :param influxClient: influxdb client used for the rate queries
        :param database: influx database holding the mountstats measurement
        :param kwargs: must contain 'name' (display name) and 'export'
            (the NFS export path used as an influx tag value)
        """
        self.componentid = "Not Implemented"
        # (the original's dead "influxClient = influxClient" self-assignment
        # was removed -- the client is only needed to build the rate queries)
        self.database = database
        self.fsname = kwargs['name']
        self.export = kwargs['export']
        # One rate query per vfs counter; any page traffic at all means the
        # filesystem is being used and is therefore up.
        self.nfs = [InfluxDBRate(client=influxClient,
                                 database=self.database,
                                 tags={'export': self.export},
                                 field=field,
                                 measurement='mountstats')
                    for field in ('vfsreadpage', 'vfswritepage')]

    def test(self):
        """Return operational if any read/write rate is positive, otherwise
        major_outage."""
        if sum(map(lambda x: x.ratequery(), self.nfs)) > 0:
            msgstring = """The {fsname} filesystem is OK""".format(fsname=self.fsname)
            return {'status': Status.operational, 'msg': msgstring}
        msgstring = """The {fsname} filesystem is down""".format(fsname=self.fsname)
        return {'status': Status.major_outage, 'msg': msgstring}
class SSHServer():
    """Component test: check that an SSH server accepts connections."""

    def __init__(self, **kwargs):
        # kwargs must contain 'hostname' and 'user' for the probe connection.
        self.hostname = kwargs['hostname']
        self.user = kwargs['user']

    def test(self):
        """Attempt an SSH connection and map the outcome to a status dict.

        An authentication failure still counts as operational -- sshd
        answered, only the probe credentials were rejected.  Any other
        failure (refused, timeout, DNS) is a major outage.

        Fix over the original: the SSHClient was never closed on success,
        leaking the connection; the ``finally`` below always closes it.
        """
        import paramiko
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            ssh.connect(self.hostname, username=self.user)
            return {'status': Status.operational, 'msg': "{} is OK".format(self.hostname)}
        except paramiko.AuthenticationException:
            # NOTE(review): reporting operational here looks deliberate (the
            # daemon is up) -- confirm this is the intended semantics.
            msgstring = "{} SSH Authentication Failed".format(self.hostname)
            return {'status': Status.operational, 'msg': msgstring}
        except Exception:
            return {'status': Status.major_outage, 'msg': "{} is unavailable".format(self.hostname)}
        finally:
            ssh.close()
class LustreFS():
    """Component test: infer Lustre filesystem health from the metadata
    operation rates of its two MDTs recorded in influx (lustre2
    measurement)."""

    def __init__(self, influxClient, **kwargs):
        """
        :param influxClient: influxdb client used for the rate queries
        :param kwargs: must contain 'database', 'name', 'mdt0' and 'mdt1'
            (the influx tag values naming the two metadata targets)
        """
        self.database = kwargs['database']
        self.componentid = "Not Implemented"
        # (the original's dead "influxClient = influxClient" self-assignment
        # was removed; six copy-pasted InfluxDBRate constructions are now
        # built by the _rates helper)
        self.fsname = kwargs['name']
        # mdt0 healthy -> fully operational; only mdt1 healthy -> degraded.
        self.mdt0 = self._rates(influxClient, kwargs['mdt0'])
        self.mdt1 = self._rates(influxClient, kwargs['mdt1'])

    def _rates(self, influxClient, mdt):
        """Build one InfluxDBRate per tracked metadata operation for *mdt*."""
        return [InfluxDBRate(client=influxClient,
                             database=self.database,
                             tags={'name': mdt},
                             field=field,
                             measurement='lustre2')
                for field in ('open', 'close', 'statfs')]

    def test(self):
        """Return operational / partial_outage / major_outage depending on
        which MDTs show metadata activity."""
        if sum(map(lambda x: x.ratequery(), self.mdt0)) > 0:
            msgstring = """The {fsname} filesystem is OK""".format(fsname=self.fsname)
            return {'status': Status.operational, 'msg': msgstring}
        if sum(map(lambda x: x.ratequery(), self.mdt1)) > 0:
            msgstring = """The {fsname} is partially degraded""".format(fsname=self.fsname)
            return {'status': Status.partial_outage, 'msg': msgstring}
        msgstring = """The {fsname} filesystem is down""".format(fsname=self.fsname)
        return {'status': Status.major_outage, 'msg': msgstring}
import influxdb
import datetime
class Client():
    """Thin factory wrapper around :class:`influxdb.InfluxDBClient`."""

    @staticmethod
    def configure(config):
        """Build an SSL-verified InfluxDBClient.

        :param config: mapping with keys 'host', 'port', 'user', 'password'
            and 'dbname'
        :returns: a connected :class:`influxdb.InfluxDBClient`

        (The original's unused local imports of logging, yaml and os were
        removed.)
        """
        return influxdb.InfluxDBClient(config['host'],
                                       config['port'],
                                       config['user'],
                                       config['password'],
                                       config['dbname'],
                                       ssl=True, verify_ssl=True)
import datetime
class InfluxDBRate():
"""
Take values from every host in influx.
Sum the values.
Compare the sums at two different points in time
calculate the rate
The field, measurment and tags are configurable. Assumes that every entry has a host tag
for the groupby operation
"""
def __init__(self, client, database=None, tags=None,field=None,measurement=None):
self.database = database
self.client = client
self.tags = tags
self.field = field
self.measurement = measurement
if self.tags is None:
self.tags = {'cluster': 'm3', 'export': '130.194.249.82:/data1/m3-home'}
if self.field is None:
self.field = 'vfsreadpage'
if self.measurement is None:
self.measurement = 'mountstats'
def showseries(self, client):
query_string = """show series"""
client.switch_database(self.database)
q = client.query(
query_string
)
print(q)
return list(q.get_points())
def gettags(self):
time = datetime.datetime.now()
import pytz
if time.tzinfo == None:
timetz = pytz.timezone('Australia/Melbourne').localize(time)
else:
timetz = time
time2tz = timetz - datetime.timedelta(seconds=30)
tags = self.tagstring()
if tags != "":
tags = tags + " and "
query_string = """select "close", "name" from "{measurement}" where {tags} time < '{time}' and time > '{time2}' group by "host" """
self.client.switch_database(self.database)
q = self.client.query(
query_string.format(time=timetz.isoformat(),
time2 = time2tz.isoformat(),
field = self.field,
measurement = self.measurement,
tags = tags)
)
for p in q.get_points():
print(p)
def ratequery(self, t1=None, t2=datetime.datetime.now()):
"""
Compare the value of a specific measurment across the entire cluster at to different points in time
"""
if t1 is None:
t1 = t2 - datetime.timedelta(minutes=30)
a1 = self.sumlastquery(t1)
a2 = self.sumlastquery(t2)
return (a2-a1)/(t2-t1).total_seconds()
def tagstring(self):
"""
take the dictionary of tags in self.tags
format as a snippet of an influxQL string (i.e. insert and, use appropriate quotes)
"""
return " and ".join([ """ "{key}" = '{value}' """.
format(key=key,value=value)
for key,value in self.tags.items() ])
def sumlastquery(self,time):
"""
Get the last value before time of the measurment on every host. Sum it and return the agregate
If timezone isn't specified, assume Melbourne.
"""
query_string = """select last({field}) from "{measurement}" where {tags} time < '{time}' and time > '{time2}' group by "host" """
tags = self.tagstring()
if tags != "":
tags = tags + " and "
import pytz
if time.tzinfo == None:
timetz = pytz.timezone('Australia/Melbourne').localize(time)
else:
timetz = time
time2tz = timetz - datetime.timedelta(minutes=5);
self.client.switch_database(self.database)
q = self.client.query(
query_string.format(time=timetz.isoformat(),
time2 = time2tz.isoformat(),
field = self.field,
measurement = self.measurement,
tags = tags)
)
return sum([x.get('last',0) for x in q.get_points()])
import enum
class Status(enum.IntEnum):
operational = enum.auto()
degraded_performance = enum.auto()
partial_outage = enum.auto()
major_outage = enum.auto()
under_maintenance = enum.auto()
\ No newline at end of file
import statuspageio
class Client():
    """Thin factory wrapper around :class:`statuspageio.Client`."""

    @staticmethod
    def configure(config):
        """Build a statuspage.io client.

        :param config: mapping with keys 'api_key', 'page_id' and
            'organization_id'
        :returns: a configured :class:`statuspageio.Client`

        (The original's unused local ``import logging`` and the redundant
        local re-import of statuspageio -- already imported at module
        level -- were removed.)
        """
        return statuspageio.Client(api_key=config['api_key'],
                                   page_id=config['page_id'],
                                   organization_id=config['organization_id'])
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment