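"""Summarise ansible --check runs across the cluster's node classes.

Reads the json output of ansible check runs for the compute, login, mgmt and
dgx node classes, classifies each task result per host (unchanged, changed,
skipped, failed), optionally renders a bokeh heat map per node class, and
logs a short summary to stdout and Slack.
"""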
import pandas as pd
import zipfile
import json
import os
import sys
import yaml
def task_result(task):
    # 0 is skipped (because of a conditional)
    # 1 is not changed (everything should be 1)
    # 2 is changed (you should run the ansible playbook to fix the cluster)
    # 3 is skipped due to check mode (you should not write a role like this)
    # 4 is failed. You've got something to fix.
    if "skipped" in task and task["skipped"] is True:
        if "skipped_reason" in task and task["skipped_reason"] == "Conditional result was False":
            return 0
    if "msg" in task and task["msg"] == "skipped, running in check mode":
        return 3
    if "failed" in task and task["failed"]:
        return 4
    if "changed" in task:
        if task["changed"]:
            return 2
        else:
            return 1
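# Illustrative examples of the mapping above (minimal result dicts; real ansible
# output carries more keys): {"changed": True} maps to 2,
# {"skipped": True, "skipped_reason": "Conditional result was False"} maps to 0,
# and {"failed": True} maps to 4.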
def change_value(change):
    if change is False:
        return 1
    if change is True:
        return 2
    return 0
def change_str(change):
    if change == 0:
        return "N/A"
    if change == 1:
        return "False"
    if change == 2:
        return "True"
    if change == 3:
        return "Skipped in check mode"
    if change == 4:
        return "Failed"
def get_changes(data):
    # Walk the ansible json output: plays -> tasks -> per-host results
    for play in data['plays']:
        for task in play['tasks']:
            for host, hosttask in task['hosts'].items():
                result = task_result(hosttask)
                yield {'task': task['task']['name'], 'host': host,
                       'change': result, 'changestr': change_str(result),
                       'taskid': task['task']['id']}
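# For reference, get_changes expects the shape of ansible's json output; a
# minimal sketch (field names as used above, real output has more keys):
#
#   {"plays": [{"tasks": [{"task": {"name": "...", "id": "..."},
#                          "hosts": {"node01": {"changed": false}}}]}]}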
def load_data(artifactfile="artifacts.zip", nodeclass="compute_ansible_check.log"):
    #with zipfile.ZipFile(artifactfile,'r') as zipf:
    #    data = json.loads(zipf.read(nodeclass))
    with open(nodeclass) as f:
        data = json.loads(f.read())
    # Create the dataframe from a list of dictionaries
    df = pd.DataFrame(list(get_changes(data)))
    # Extract a mapping from the taskid to the task name
    taskmap = df[['task', 'taskid']].copy().drop_duplicates().set_index('taskid')
    # Reindex the values so that each change event can be referenced by a unique
    # combination of host and taskid
    midx = pd.MultiIndex.from_frame(df[['host', 'taskid']])
    df = df.set_index(midx)
    # Assume that every host executes every task; fill in tasks a host did not execute
    df = df.fillna(0)
    #df = df.unstack('taskid').fillna(0).stack()
    # Rebuild the host, task, changestr and taskid columns from the (possibly larger) index
    hosts = df.index.get_level_values(0)
    df['host'] = hosts
    tasks = list(map(lambda x: taskmap.loc[x]['task'], df.index.get_level_values(1)))
    df['task'] = tasks
    changestr = list(map(lambda x: change_str(x), df['change']))
    df['changestr'] = changestr
    df['taskid'] = df.index.get_level_values(1)
    return df
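# The returned dataframe is indexed by (host, taskid) and keeps host, task,
# change, changestr and taskid as plain columns so bokeh_plot can reference
# them directly. A minimal usage sketch, mirroring the reporting below:
#   df = load_data(nodeclass="compute_ansible_check.log")
#   changed_hosts = df[df.change == 2].host.unique()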
def bokeh_plot(df, title):
    # Render a heat map: one coloured rectangle per (host, task) pair.
    # The dataframe must provide task, host, change, changestr and taskid columns
    # (change sets the colour, changestr is shown in the tooltip).
    from bokeh.models import LinearColorMapper
    from bokeh.plotting import figure, save

    # abbreviated colormap from a bokeh example, one colour per change code 0-4
    colors = ['#084594', '#2171b5', '#4292c6', "#dfccce", "#550b1d"]
    mapper = LinearColorMapper(palette=colors, low=0, high=4)
    TOOLS = "hover,save,pan,box_zoom,reset,wheel_zoom"
    dataxrange = list(df.index.get_level_values(1).unique())
    datayrange = list(df.index.get_level_values(0).unique())
    p = figure(title=title,
               x_range=dataxrange, y_range=datayrange,
               x_axis_location="above",
               sizing_mode='stretch_width',
               tools=TOOLS, toolbar_location='below',
               tooltips=[('host', '@host'), ('task', '@task'),
                         ('changed', '@changestr'), ('taskid', '@taskid')])
    p.grid.grid_line_color = None
    p.axis.axis_line_color = None
    p.xaxis.major_tick_line_color = None    # turn off x-axis major ticks
    p.xaxis.minor_tick_line_color = None    # turn off x-axis minor ticks
    p.xaxis.major_label_text_color = None   # hide x-axis tick labels but keep their space
    p.xaxis.major_label_text_font_size = '0pt'
    p.yaxis.major_tick_line_color = None    # turn off y-axis major ticks
    p.yaxis.minor_tick_line_color = None    # turn off y-axis minor ticks
    p.yaxis.major_label_text_color = None   # hide y-axis tick labels but keep their space
    p.yaxis.major_label_text_font_size = '0pt'
    p.rect(x="taskid", y="host", width=1, height=1,
           source=df,
           fill_color={'field': 'change', 'transform': mapper},
           line_color=None)
    save(p)
    return p

import logging
from slack_logger import SlackHandler, SlackFormatter

# Log to stdout and to a Slack channel; the webhook URL comes from the environment
slack_hook = os.environ['SLACK_HOOK']
logger = logging.getLogger()
logger.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter('%(asctime)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
sh = SlackHandler(username='m3-ansible-check', icon_emoji=':robot_face:', url=slack_hook)
sh.setLevel(logging.DEBUG)
logger.addHandler(sh)

import bokeh.io
from datetime import datetime

strBokehfile = "output.html"
#strBokehfile = datetime.today().strftime('%Y%m%d')+'.html'
bokeh.io.output_file(strBokehfile)
# Compute nodes: load the results first so both output modes below can use them
df = load_data(nodeclass="compute_ansible_check.log")
if len(sys.argv) > 1 and 'outputChangedNodeList' in sys.argv:
    print(yaml.dump(list(df[df.change == 2].host.unique())))
    sys.exit(0)
if len(sys.argv) > 1 and 'bokehplot' in sys.argv:
    cmpplot = bokeh_plot(df, "Compute Nodes")
    cmd = "mv " + strBokehfile + " comp_" + strBokehfile
    os.system(cmd)
    #cmd='swift upload ansiblechecker comp_'+strBokehfile
    #os.system(cmd)

nodes = len(df.host.unique())
changed = len(df[df.change == 2].host.unique())
failed = len(df[df.change == 4].host.unique())
logger.info("{} Compute nodes, {} had at least one change, {} had at least one failed task".format(nodes, changed, failed))
df = load_data(nodeclass="login_ansible_check.log")
if (len(sys.argv)>1 and 'bokehplot' in sys.argv):
cmpplot = bokeh_plot(df, "login Nodes")
cmd="mv "+strBokehfile+" login_"+strBokehfile
os.system(cmd)
#cmd='swift upload ansiblechecker login_'+strBokehfile
#os.system(cmd)

Andreas Hamacher
committed
nodes = len(df.host.unique())
changed = len(df[df.change == 2].host.unique())
failed = len(df[df.change == 4].host.unique())
logger.info("{} Login nodes, {} had at least one change {} had at least one failed task".format(nodes,changed,failed))
df = load_data(nodeclass="mgmt_ansible_check.log")
if (len(sys.argv)>1 and 'bokehplot' in sys.argv):
cmpplot = bokeh_plot(df, "mgmt_ Nodes")
cmd="mv "+strBokehfile+" mgmt_"+strBokehfile
os.system(cmd)
#cmd='swift upload ansiblechecker mgmt_'+strBokehfile
#os.system(cmd)

Andreas Hamacher
committed
nodes = len(df.host.unique())
changed = len(df[df.change == 2].host.unique())
failed = len(df[df.change == 4].host.unique())
logger.info("{} Management nodes, {} had at least one change {} had at least one failed task".format(nodes,changed,failed))
df = load_data(nodeclass="dgx_ansible_check.log")
if (len(sys.argv)>1 and 'bokehplot' in sys.argv):
cmpplot = bokeh_plot(df, "dgx_ Nodes")
cmd="mv "+strBokehfile+" dgx_"+strBokehfile
os.system(cmd)
#cmd='swift upload ansiblechecker dgx_'+strBokehfile
#os.system(cmd)
nodes = len(df.host.unique())
changed = len(df[df.change == 2].host.unique())
failed = len(df[df.change == 4].host.unique())
logger.info("{} DGX nodes, {} had at least one change {} had at least one failed task".format(nodes,changed,failed))
logger.info("this is defined in .gitlab-ci.yml in ansible_check and the trigger is configured in https://gitlab.erc.monash.edu.au/hpc-team/clusterbuild/pipeline_schedules ")
#logger.info("https://swift.rc.nectar.org.au/v1/AUTH_e86c925319094fb2b8cc1bf2373c69dc/ansiblechecker/"+strBokehfile)
str="https://gitlab.erc.monash.edu.au/hpc-team/clusterbuild/-/jobs/"+os.environ['CI_JOB_ID']+"/artifacts/browse"
logger.info(str)