def action_sanitycheck(proxmox, args):
    """Run the requested sanity checks on the cluster, print the report, exit.

    The check names in ``args.check`` are passed to ``SanityCheck.run``; the
    collected results are printed with ``SanityCheck.display`` and the process
    then terminates with the code ``run`` returned.
    """
    # More checks to implement
    # VM is started but 'startonboot' not set
    # VM is running in cpu = host
    # VM is running in cpu = qemu64
    runner = SanityCheck(proxmox)
    status = runner.run(checks=args.check)
    runner.display()
    sys.exit(status)
def is_healthy(self):
    """Return True when the 'cluster' entry of the status report is quorate."""
    return bool([item for item in self.status if item.get('type') == 'cluster'][0]['quorate'])


def get_vm(self, vm_id):
    """Return the VM object with id ``vm_id``, or ``None`` when it is unknown.

    ``vm_id`` may be an int or a numeric string (a non-numeric string raises
    ``ValueError`` from ``int``). The cluster resource list tells which node
    hosts the VM; the VM object itself is then taken from that node's ``vms``.

    ``None`` is returned when the id is not in the resource list, when the
    hosting node is not in ``self.nodes``, or when the node does not list the
    VM. The last case used to raise ``IndexError``; it now behaves like the
    other not-found cases.
    """
    if isinstance(vm_id, str):
        vm_id = int(vm_id)

    node_name = next(
        (vm['node'] for vm in self.get_resources_vms() if vm['vmid'] == vm_id),
        None,
    )
    if node_name is None:
        return None

    for node in self.nodes:
        if node.node == node_name:
            # The resource list and the node's VM list can disagree, so do not
            # assume the VM is present on the node.
            return next((vm for vm in node.vms if vm.vmid == vm_id), None)
    return None


def get_resources_vms(self):
    """Return the cluster resources whose type is "qemu"."""
    return [resource for resource in self.resources if resource["type"] == "qemu"]


def get_resources_nodes(self):
    """Return the cluster resources whose type is "node"."""
    return [resource for resource in self.resources if resource["type"] == "node"]


def get_resources_storages(self):
    """Return the cluster resources whose type is "storage"."""
    return [resource for resource in self.resources if resource["type"] == "storage"]


def get_storage(self, storage_name):
    """Return the first storage named ``storage_name``, or ``None`` if absent."""
    return next((s for s in self.storages if s.storage == storage_name), None)
class CheckType(Enum):
    # Family a check belongs to; the value is the label printed in the
    # report's section headers.
    HA = 'HIGH_AVAILABILITY'
    Node = "NODE"


class CheckCode(Enum):
    # Severity attached to a message, listed from most to least severe.
    CRIT = 'CRITICAL'
    WARN = 'WARNING'
    INFO = 'INFO'
    OK = 'OK'


# Every table below is keyed by the *value* of a CheckCode member.
ICONS_UTF8 = {
    CheckCode.CRIT.value: '❌',
    CheckCode.WARN.value: '⚠️',
    CheckCode.INFO.value: 'ℹ️',
    CheckCode.OK.value: '✅',
}

# Plain tags are the member name in brackets: [CRIT], [WARN], [INFO], [OK].
ICONS_ASCII = {code.value: f'[{code.name}]' for code in CheckCode}

_ANSI_BY_CODE = {
    CheckCode.CRIT: fonts.RED,
    CheckCode.WARN: fonts.YELLOW,
    CheckCode.INFO: fonts.BLUE,
    CheckCode.OK: fonts.GREEN,
}

# Same tags as ICONS_ASCII, wrapped in the colour of their severity.
ICONS_COLORED_ASCII = {
    code.value: f'{_ANSI_BY_CODE[code]}{ICONS_ASCII[code.value]}{fonts.END}'
    for code in CheckCode
}


def set_icons():
    """Return the icon table to use: UTF-8 first, then coloured, then plain ASCII."""
    if teminal_support_utf_8():
        return ICONS_UTF8
    return ICONS_COLORED_ASCII if terminal_support_colors() else ICONS_ASCII


# Chosen once, when the module is imported.
ICONS = set_icons()
class CheckMessage:
    """One report line: a severity code and the text that goes with it."""

    def __init__(self, code: CheckCode, message):
        self.code = code
        self.message = message

    def display(self, padding_max_size):
        """Print the text, dot-filled up to ``padding_max_size``, then its icon."""
        dots = '.' * (padding_max_size - len(self.message))
        print(f"{self.message}{dots}{ICONS[self.code.value]}")

    def __len__(self):
        # The length of a message is the length of its text; the report uses
        # it to align the icons.
        return len(self.message)


class Check(ABC):
    """Base class of a sanity check.

    Subclasses implement ``run`` and record what they find with
    ``add_messages``; ``status`` and ``display`` are derived from those
    messages.
    """

    type = ""
    name = ""

    def __init__(self, proxmox, messages = None):
        self.proxmox = proxmox
        self.messages = [] if messages is None else messages

    @abstractmethod
    def run(self):
        pass

    @property
    def status(self):
        """Return the most important code found among the messages.

        Precedence is CRIT, then WARN, then INFO; OK is returned when none of
        them is present (including when there are no messages).
        """
        codes = [msg.code for msg in self.messages]
        for wanted in (CheckCode.CRIT, CheckCode.WARN, CheckCode.INFO):
            if wanted in codes:
                return wanted
        return CheckCode.OK

    def add_messages(self, messages):
        """Record one ``CheckMessage`` or a list of them; other values are ignored."""
        if isinstance(messages, CheckMessage):
            self.messages.append(messages)
            return
        if isinstance(messages, list):
            self.messages += messages

    def set_code(self, code: CheckCode):
        self.code = code

    def display(self, padding_max_size):
        """Print the check name (bold when colours are supported) and its messages."""
        title = self.name
        if terminal_support_colors():
            title = f"{fonts.BOLD}{self.name}{fonts.END}"
        print(f"{title}\n")

        for msg in self.messages:
            msg.display(padding_max_size)
        print()
class SanityCheck():
    """Runs a selection of registered checks and prints their report."""

    def __init__(self, proxmox: PVECluster):
        self._proxmox = proxmox
        self._checks = []

    def run(self, checks):
        """Run the checks named in ``checks`` (all registered ones when empty).

        Returns 1 without running anything if a name is not registered,
        otherwise the value of ``get_exit_code`` after all checks have run.
        """
        selected = checks if checks else DEFAULT_CHECK_IDS

        # Validate every name before running anything.
        for name in selected:
            if name in DEFAULT_CHECK_IDS:
                continue
            print(
                f"Sanity check '{name}' doesn't exists.\n"
                f"Here available values are:\n{', '.join(DEFAULT_CHECK_IDS)}"
            )
            return 1

        for name in selected:
            instance = DEFAULT_CHECKS[name](self._proxmox)
            instance.run()
            self._checks.append(instance)

        return self.get_exit_code()

    def get_exit_code(self):
        """Return 1 if any executed check is critical, else 0."""
        critical = any(CheckCode.CRIT == check.status for check in self._checks)
        return 1 if critical else 0

    def _get_longest_message(self):
        # Width of the longest message plus one, so at least one dot always
        # separates a message from its icon.
        lengths = (len(msg) for check in self._checks for msg in check.messages)
        return max(lengths, default=0) + 1

    def display(self):
        """Print every executed check, with a header each time the check type changes."""
        width = self._get_longest_message()
        shown_type = None
        for check in self._checks:
            if shown_type != check.type:
                shown_type = check.type
                label = check.type.value
                dashes = int((width - (len(label) + 2))/2) * '-'
                print(f"{dashes} {label} {dashes}\n")
            check.display(width)
class HaGroups(Check):
    """Reports every HA group that lists fewer than two nodes."""

    id = "ha_groups"
    type = CheckType.HA
    name = "Check HA groups"

    def run(self):
        for group in self.proxmox.ha['groups']:
            # 'nodes' is a comma-separated string of group members.
            member_count = len(group['nodes'].split(","))
            if member_count >= 2:
                continue
            self.add_messages(CheckMessage(
                CheckCode.CRIT,
                f"Group {group['group']} contain only {member_count} node",
            ))

        if self.messages:
            return
        # Nothing was reported: emit a single OK line.
        self.add_messages(CheckMessage(CheckCode.OK, "HA Group checked"))
class HaVms(Check):
    """Checks the VMs managed by HA: disk placement and CPU type.

    A disk on storage that is not shared is reported as critical, and the CPU
    type ``host`` as a warning. An HA resource whose VM cannot be found in the
    cluster is reported as a warning instead of crashing the run.
    """

    id = "ha_vms"
    type = CheckType.HA
    name = "Check VMs in a HA group"

    # A config value that points at a volume, e.g. "local:vm-100-disk-0";
    # group 1 is the storage name. Config keys alone do not tell whether an
    # entry is a disk, hence the match on the value.
    _VOLUME_RE = re.compile(r"^(.*):(vm|base)-[0-9]+-(disk|cloudinit).*")

    def run(self):
        """Resolve the HA-managed VMs and run the disk and CPU checks on them."""
        ha_vms = []
        for resource in self.proxmox.ha['resources']:
            if resource['type'] != 'vm':
                continue
            vm_id = resource['sid'].split(':')[1]  # sid looks like "vm:100"
            vm = self.proxmox.get_vm(vm_id)
            if vm is None:
                # A stale HA entry must not abort the whole report.
                msg = f"HA resource '{resource['sid']}' references a VM that was not found"
                self.add_messages(CheckMessage(CheckCode.WARN, msg))
                continue
            ha_vms.append(vm)

        self.add_messages(self._check_disk_ha_consistency(ha_vms))
        self.add_messages(self._check_cpu_ha_consistency(ha_vms))

        if not self.messages:
            msg = "HA VMS checked"
            self.add_messages(CheckMessage(CheckCode.OK, msg))

    def _check_disk_ha_consistency(self, ha_vms):
        """Return one CRIT message per VM that has volumes on non-shared storage."""
        messages = []
        for vm in ha_vms:
            local_disks = []
            for key, value in vm.config.items():
                if not isinstance(value, str):
                    continue
                match = self._VOLUME_RE.search(value)
                if match is None:
                    continue
                storage = self.proxmox.get_storage(match.group(1))
                # storage.shared is used as a StorageShared member name;
                # storages unknown to the cluster are skipped.
                if storage is not None and StorageShared[storage.shared] != StorageShared.shared:
                    local_disks.append(key)
            if local_disks:
                msg = f"Node '{vm.node}' has VM '{vm.name}' with disk(s) '{', '.join(local_disks)}' not on shared storage"
                messages.append(CheckMessage(CheckCode.CRIT, msg))

        return messages

    def _check_cpu_ha_consistency(self, ha_vms):
        """Return one message per VM: WARN for CPU type ``host``, OK otherwise."""
        messages = []
        for vm in ha_vms:
            if vm.config.get('cpu', '') == 'host':
                msg = f"Node '{vm.node}' has VM '{vm.name}' with cpu type host"
                messages.append(CheckMessage(CheckCode.WARN, msg))
            else:
                msg = f"Node '{vm.node}' has VM '{vm.name}' with cpu type {vm.config.get('cpu', 'Default')}"
                messages.append(CheckMessage(CheckCode.OK, msg))
        return messages
class Nodes(Check):
    """Reports CPU and memory overcommit for every node of the cluster."""

    id = "nodes"
    type = CheckType.Node
    name = "Check Node capacity"

    def run(self):
        # CPU results are reported before memory results.
        self._check_cpu_overcommit()
        self._check_mem_overcommit()

    def _check_mem_overcommit(self):
        """Add a CRIT message per node in memory overcommit, an OK message otherwise."""
        for node in self.proxmox.nodes:
            overcommitted = self._mem_is_overcommited(
                node.maxmem,
                self.proxmox.config['node']['memoryminimum'],
                node.allocatedmem,
            )
            if overcommitted:
                code = CheckCode.CRIT
                text = f"Node '{node.node}' is in mem overcommit status: {node.allocatedmem} allocated but {node.maxmem} available"
            else:
                code = CheckCode.OK
                text = f"Node '{node.node}' isn't in mem overcommit"
            self.add_messages(CheckMessage(code, text))

    def _check_cpu_overcommit(self):
        """Add a WARN message per node in CPU overcommit, an OK message otherwise."""
        for node in self.proxmox.nodes:
            overcommitted = self._cpu_is_overcommited(
                node.maxcpu,
                self.proxmox.config['node']['cpufactor'],
                node.allocatedcpu,
            )
            if overcommitted:
                code = CheckCode.WARN
                text = f"Node {node.node} is in cpu overcommit status: {node.allocatedcpu} allocated but {node.maxcpu} available"
            else:
                code = CheckCode.OK
                text = f"Node '{node.node}' isn't in cpu overcommit"
            self.add_messages(CheckMessage(code, text))

    def _cpu_is_overcommited(self, maxcpu, cpufactor, allocated_cpu):
        # Overcommitted once the allocation reaches maxcpu scaled by the factor.
        return allocated_cpu >= (maxcpu * cpufactor)

    def _mem_is_overcommited(self, max_mem, min_mem, allocated_mem):
        # Overcommitted once allocation plus the reserved minimum reaches max_mem.
        return max_mem <= (allocated_mem + min_mem)
class fonts:
    # ANSI escape sequences; END resets whatever a previous sequence enabled.
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'


def terminal_support_colors():
    """Return True when standard output is a terminal that supports colours.

    The answer comes from the terminfo ``colors`` capability, read with
    ``curses.setupterm`` / ``curses.tigetnum``. Unlike ``curses.initscr`` this
    does not take over the screen, and it needs no ``endwin`` afterwards:
    calling ``endwin`` after a failed ``initscr`` raises ``curses.error``,
    which made the previous implementation crash instead of returning False
    whenever no usable terminal was attached.

    False is returned when standard output is not a terminal, so escape
    sequences do not end up in pipes or files.
    """
    try:
        if not sys.stdout.isatty():
            return False
        curses.setupterm()
        # tigetnum yields -1 / -2 when the capability is absent or not numeric.
        return curses.tigetnum("colors") > 0
    except Exception:
        # Best-effort capability probe: whatever goes wrong (no TERM, no
        # terminfo entry, a stdout replacement without fileno/isatty, ...)
        # simply means "no colours" and must never break the CLI.
        return False


def teminal_support_utf_8():
    """Return True when standard output is encoded as UTF-8.

    The encoding name is normalised through the codec registry, so aliases
    such as ``utf8`` or ``UTF_8`` are recognised. A missing or unknown
    encoding yields False instead of raising.
    """
    import codecs  # local import: only this probe needs it

    encoding = getattr(sys.stdout, "encoding", None)
    if not encoding:
        return False
    try:
        return codecs.lookup(encoding).name == 'utf-8'
    except (LookupError, TypeError):
        return False

# Pretty output a table from a table of dicts
# We assume all dicts have the same keys and are sorted by key