Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add sanitycheck VM has HA disks #18

Open
wants to merge 14 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/pvecontrol/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def _parser():

# sanitycheck parser
parser_sanitycheck = subparsers.add_parser('sanitycheck', help='Run Sanity checks on the cluster')
parser_sanitycheck.add_argument('--check', action='append', required=False, help="Check to run", default=[])
parser_sanitycheck.set_defaults(func=actions.cluster.action_sanitycheck)

# _test parser, hidden from help
Expand Down
13 changes: 7 additions & 6 deletions src/pvecontrol/actions/cluster.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import sys

from humanize import naturalsize

from pvecontrol.node import NodeStatus
from pvecontrol.sanitycheck import SanityCheck


def action_clusterstatus(proxmox, args):
Expand Down Expand Up @@ -48,13 +51,11 @@ def action_clusterstatus(proxmox, args):

def action_sanitycheck(proxmox, args):
    """Check status of proxmox Cluster

    Prints an overcommit line for every node whose allocated CPU or memory
    crosses the limits read from ``proxmox.config['node']``, then runs the
    sanity checks selected by ``args.check``, displays their report and
    terminates the process with the exit code returned by ``SanityCheck.run``.
    """

    # NOTE(review): this listing comes from a diff view; the per-node loop
    # below may be the removed side of the change -- confirm against the branch.
    for node in proxmox.nodes:
        # CPU is flagged when the allocation reaches maxcpu scaled by 'cpufactor'.
        if (node.maxcpu * proxmox.config['node']['cpufactor']) <= node.allocatedcpu:
            print("Node %s is in cpu overcommit status: %s allocated but %s available"%(node.node, node.allocatedcpu, node.maxcpu))
        # Memory is flagged when allocation plus 'memoryminimum' reaches maxmem.
        if (node.allocatedmem + proxmox.config['node']['memoryminimum']) >= node.maxmem:
            print("Node %s is in mem overcommit status: %s allocated but %s available"%(node.node, node.allocatedmem, node.maxmem))
    # More checks to implement
    # VM is started but 'startonboot' not set
    # VM is running in cpu = host
    # VM is running in cpu = qemu64
    sc = SanityCheck(proxmox)
    # An empty args.check list is passed through unchanged to SanityCheck.run.
    exitcode = sc.run(checks=args.check)
    sc.display()
    # Always raises SystemExit: this function never returns to its caller.
    sys.exit(exitcode)
42 changes: 37 additions & 5 deletions src/pvecontrol/cluster.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import re
import logging

from proxmoxer import ProxmoxAPI

from pvecontrol.node import PVENode
from pvecontrol.storage import PVEStorage
from pvecontrol.storage import PVEStorage, StorageShared
from pvecontrol.task import PVETask


Expand All @@ -21,14 +22,14 @@ def _initstatus(self):
self.status = self._api.cluster.status.get()
self.resources = self._api.cluster.resources.get()

self.nodes = []
for node in self._api.nodes.get():
self.nodes.append(PVENode(self._api, node["node"], node["status"], node))

self.storages = []
for storage in self.get_resources_storages():
self.storages.append(PVEStorage(storage.pop("node"), storage.pop("id"), storage.pop("shared"), **storage))

self.nodes = []
for node in self._api.nodes.get():
self.nodes.append(PVENode(self._api, node["node"], node["status"], node))

self.tasks = []
for task in self._api.cluster.tasks.get():
logging.debug("Get task informations: %s"%(str(task)))
Expand Down Expand Up @@ -71,12 +72,36 @@ def find_task(self, upid):
def is_healthy(self):
    """Tell whether the cluster is quorate.

    Reads the 'quorate' flag of the first entry of type 'cluster' found in
    ``self.status`` and returns it as a bool. Raises IndexError when
    ``self.status`` holds no such entry.
    """
    cluster_entries = list(
        filter(lambda entry: entry.get('type') == 'cluster', self.status)
    )
    quorate = cluster_entries[0]['quorate']
    return bool(quorate)

def get_vm(self, vm_id):
    """Return the VM object whose id is ``vm_id``, or None when not found.

    The hosting node is looked up in the cluster resources first, then the
    VM is searched in that node's ``vms`` list.

    Args:
        vm_id: VM identifier, as an int or as a numeric string.

    Returns:
        The matching entry of ``node.vms``, or None when the id is absent
        from the cluster resources, the node is unknown, or the node's VM
        list does not contain it.

    Raises:
        ValueError: if ``vm_id`` is a string that is not a valid integer.
    """
    if isinstance(vm_id, str):
        vm_id = int(vm_id)

    node_name = next(
        (
            resource['node']
            for resource in self.get_resources_vms()
            if resource['vmid'] == vm_id
        ),
        None,
    )
    if node_name is None:
        return None

    for node in self.nodes:
        if node.node == node_name:
            # The resource list and the node's VM list can disagree;
            # report "not found" instead of raising IndexError.
            return next((vm for vm in node.vms if vm.vmid == vm_id), None)

    return None

def get_resources_vms(self):
    """Return the entries of ``self.resources`` whose type is "qemu"."""
    vms = []
    for entry in self.resources:
        if entry["type"] == "qemu":
            vms.append(entry)
    return vms

def get_resources_nodes(self):
    """Return the entries of ``self.resources`` whose type is "node"."""
    nodes = []
    for entry in self.resources:
        if entry["type"] == "node":
            nodes.append(entry)
    return nodes

def get_resources_storages(self):
    """Return the entries of ``self.resources`` whose type is "storage"."""
    storages = []
    for entry in self.resources:
        if entry["type"] == "storage":
            storages.append(entry)
    return storages

def get_storage(self, storage_name):
    """Return the first storage named ``storage_name``, or None if absent.

    Storages are taken from ``self.storages`` and compared on their
    ``storage`` attribute.
    """
    for candidate in self.storages:
        if candidate.storage == storage_name:
            return candidate
    return None

def cpu_metrics(self):
nodes = self.get_resources_nodes()
total_cpu = sum([node['maxcpu'] for node in nodes])
Expand Down Expand Up @@ -123,3 +148,10 @@ def metrics(self):
"memory": self.memory_metrics(),
"disk": self.disk_metrics()
}

def ha(self):
    """Fetch the HA state of the cluster through the API.

    Returns a dict with the keys 'groups', 'manager_status' and 'resources',
    each holding the result of the corresponding ``cluster.ha`` API call.
    The three calls are issued on every invocation, in that order.
    """
    ha_api = self._api.cluster.ha
    groups = ha_api.groups.get()
    manager_status = ha_api.status.manager_status.get()
    resources = ha_api.resources.get()
    return {
        'groups': groups,
        'manager_status': manager_status,
        'resources': resources,
    }
2 changes: 1 addition & 1 deletion src/pvecontrol/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,4 @@ def _init_allocatedcpu(self):
# return False

def templates(self):
    """Return the VMs of ``self.vms`` whose ``template`` attribute is truthy."""
    # The listing carried this return twice; the second copy was unreachable.
    return [vm for vm in self.vms if vm.template]
1 change: 1 addition & 0 deletions src/pvecontrol/sanitycheck/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .sanitychecks import SanityCheck
98 changes: 98 additions & 0 deletions src/pvecontrol/sanitycheck/checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from abc import ABC, abstractmethod
from enum import Enum

from pvecontrol.utils import fonts, teminal_support_bold, teminal_support_utf_8

class CheckType(Enum):
    """Category a sanity check belongs to.

    The value is the label printed as the section header of the report.
    """

    # Spelling fixed: the previous value read 'HIGHT_AVAILABILITY'.
    HA = 'HIGH_AVAILABILITY'
    Node = "NODE"

class CheckCode(Enum):
    """Severity attached to a check message, listed from worst to best."""

    CRIT = "CRITICAL"
    WARN = "WARNING"
    INFO = "INFO"
    OK = "OK"

# Status icons keyed by CheckCode value, for terminals that can render UTF-8.
ICONS_UTF8 = {
    code.value: icon
    for code, icon in (
        (CheckCode.CRIT, '❌'),
        (CheckCode.WARN, '⚠️'),
        (CheckCode.INFO, 'ℹ️'),
        (CheckCode.OK, '✅'),
    )
}

# Plain-text fallback keyed by CheckCode value: the member name in brackets
# ('[CRIT]', '[WARN]', '[INFO]', '[OK]').
ICONS_ASCII = {code.value: f"[{code.name}]" for code in CheckCode}

class CheckMessage:
    """One line of a check report: a text and the severity code it carries."""

    def __init__(self, code: CheckCode, message):
        self.message = message
        self.code = code

    def __len__(self):
        """Length of the message text, used to align the report."""
        return len(self.message)

    def display(self, padding_max_size):
        """Print the message, dot-padded up to ``padding_max_size``, then its icon.

        The icon table is picked according to ``teminal_support_utf_8()``.
        """
        dots = '.' * (padding_max_size - len(self.message))
        icons = ICONS_UTF8 if teminal_support_utf_8() else ICONS_ASCII
        print(f"{self.message}{dots}" + icons[self.code.value])

class Check(ABC):
    """Base class of a sanity check.

    A concrete check implements ``run`` and records its findings as
    ``CheckMessage`` objects in ``self.messages``.
    """

    type = ""
    name = ""

    def __init__(self, proxmox, messages = None):
        self.proxmox = proxmox
        # A fresh list per instance unless the caller supplies one.
        self.messages = [] if messages is None else messages

    @abstractmethod
    def run(self):
        """Execute the check; to be provided by subclasses."""

    @property
    def status(self):
        """Most severe code among the messages (CRIT > WARN > INFO), else OK."""
        codes = [msg.code for msg in self.messages]
        for severity in (CheckCode.CRIT, CheckCode.WARN, CheckCode.INFO):
            if severity in codes:
                return severity
        return CheckCode.OK

    def add_messages(self, messages):
        """Record a single CheckMessage or a list of them; ignore anything else."""
        if isinstance(messages, CheckMessage):
            self.messages.append(messages)
            return
        if isinstance(messages, list):
            self.messages += messages

    def set_code(self, code: CheckCode):
        """Store ``code`` on the instance."""
        self.code = code

    def display(self, padding_max_size):
        """Print the check name (bold when supported), its messages, then a blank line."""
        if teminal_support_bold():
            title = f"{fonts.BOLD}{self.name}{fonts.END}"
        else:
            title = f"{self.name}"
        print(f"{title}\n")

        for entry in self.messages:
            entry.display(padding_max_size)
        print()
54 changes: 54 additions & 0 deletions src/pvecontrol/sanitycheck/sanitychecks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from pvecontrol.cluster import PVECluster
from pvecontrol.sanitycheck.checks import CheckCode
from pvecontrol.sanitycheck.tests import DEFAULT_CHECKS, DEFAULT_CHECK_IDS


class SanityCheck():
    """Run a selection of registered checks against a cluster and report them."""

    def __init__(self, proxmox: PVECluster):
        self._checks = []
        self._proxmox = proxmox

    def run(self, checks):
        """Run the checks named in ``checks`` (all registered ones when empty).

        Returns 1 without running anything when a name is unknown, otherwise
        the value of ``get_exit_code()`` once every check has run.
        """
        selected = checks if checks else DEFAULT_CHECK_IDS

        # Validate every requested name before running anything.
        for name in selected:
            if name in DEFAULT_CHECK_IDS:
                continue
            print(
                f"Sanity check '{name}' doesn't exists.\n"
                f"Here available values are:\n{', '.join(DEFAULT_CHECK_IDS)}"
            )
            return 1

        for check_id in selected:
            instance = DEFAULT_CHECKS[check_id](self._proxmox)
            instance.run()
            self._checks.append(instance)

        return self.get_exit_code()

    def get_exit_code(self):
        """Return 1 when at least one executed check is critical, else 0."""
        if any(CheckCode.CRIT == check.status for check in self._checks):
            return 1
        return 0

    def _get_longest_message(self):
        """Length of the longest message over all executed checks, plus one."""
        lengths = (len(msg) for check in self._checks for msg in check.messages)
        return max(lengths, default=0) + 1

    def display(self):
        """Print every executed check, with a dashed header whenever the type changes."""
        width = self._get_longest_message()
        current_type = None
        for check in self._checks:
            if current_type != check.type:
                current_type = check.type
                label = check.type.value
                dashes = int((width - (len(label) + 2))/2) * '-'
                print(f"{dashes} {label} {dashes}\n")
            check.display(width)
11 changes: 11 additions & 0 deletions src/pvecontrol/sanitycheck/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from .nodes import Nodes
from .ha_groups import HaGroups
from .ha_vms import HaVms

# Registry of the available checks, keyed by each check class's ``id``.
DEFAULT_CHECKS = {check.id: check for check in (Nodes, HaGroups, HaVms)}

# View over the registered check identifiers.
DEFAULT_CHECK_IDS = DEFAULT_CHECKS.keys()
19 changes: 19 additions & 0 deletions src/pvecontrol/sanitycheck/tests/ha_groups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from pvecontrol.sanitycheck.checks import Check, CheckType, CheckMessage, CheckCode


class HaGroups(Check):
    """Report as critical every HA group that lists fewer than two nodes."""

    id = "ha_groups"
    type = CheckType.HA
    name = "Check HA groups"

    def run(self):
        """Inspect each HA group; add a single OK message when nothing was recorded."""
        for group in self.proxmox.ha()['groups']:
            # 'nodes' is a comma-separated string of group members.
            members = group['nodes'].split(",")
            num_nodes = len(members)
            if num_nodes >= 2:
                continue
            self.add_messages(
                CheckMessage(
                    CheckCode.CRIT,
                    f"Group {group['group']} contain only {num_nodes} node",
                )
            )

        if self.messages:
            return
        self.add_messages(CheckMessage(CheckCode.OK, "HA Group checked"))
63 changes: 63 additions & 0 deletions src/pvecontrol/sanitycheck/tests/ha_vms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import re

from pvecontrol.storage import StorageShared
from pvecontrol.sanitycheck.checks import Check, CheckType, CheckMessage, CheckCode


class HaVms(Check):
    """Check the VMs registered as HA resources.

    Two properties are verified for each VM: its disks sit on shared
    storage, and its CPU type is not ``host``.
    """

    id = "ha_vms"
    type = CheckType.HA
    name = "Check VMs in a HA group"

    # Matches a config value referencing a VM volume; group 1 is the storage
    # name (e.g. "local-lvm:vm-100-disk-0,size=32G" -> "local-lvm").
    _DISK_VOLUME_RE = re.compile(r"^(.*):(vm|base)-[0-9]+-(disk|cloudinit).*")

    def run(self):
        """Collect the HA-managed VMs and record the result of both checks."""
        ha_vms = []
        for resource in self.proxmox.ha()['resources']:
            if resource['type'] != 'vm':
                continue
            vm_id = resource['sid'].split(':')[1]  # sid looks like "vm:100"
            vm = self.proxmox.get_vm(vm_id)
            # get_vm may not find the VM; skip it rather than crash later
            # on attribute access.
            if vm is not None:
                ha_vms.append(vm)

        self.add_messages(self._check_disk_ha_consistency(ha_vms))
        self.add_messages(self._check_cpu_ha_consistency(ha_vms))

        if not self.messages:
            msg = "HA VMS checked"
            self.add_messages(CheckMessage(CheckCode.OK, msg))

    def _check_disk_ha_consistency(self, ha_vms):
        """Return one CRIT message per VM that has a disk on non-shared storage."""
        messages = []
        for vm in ha_vms:
            # Disks are not identifiable from the config keys, so every
            # string value is tested against the volume pattern instead.
            local_disks = []
            for key, value in vm.config.items():
                if not isinstance(value, str):
                    continue
                match = self._DISK_VOLUME_RE.search(value)
                if match is None:
                    continue
                storage = self.proxmox.get_storage(match.group(1))
                if (
                    storage is not None and
                    StorageShared[storage.shared] != StorageShared.shared
                ):
                    local_disks.append(key)

            if local_disks:
                msg = f"Node '{vm.node}' has VM '{vm.name}' with disk(s) '{', '.join(local_disks)}' not on shared storage"
                messages.append(CheckMessage(CheckCode.CRIT, msg))

        return messages

    def _check_cpu_ha_consistency(self, ha_vms):
        """Return one message per VM: WARN when its CPU type is host, else OK."""
        messages = []
        for vm in ha_vms:
            # NOTE(review): assumes a config without a 'cpu' entry means the
            # VM uses the hypervisor default CPU model -- confirm.
            cpu = vm.config.get('cpu', 'default')
            # The option may carry a "cputype=" prefix and extra
            # comma-separated settings; compare only the model itself.
            cpu_type = cpu.split(',')[0]
            if cpu_type.startswith('cputype='):
                cpu_type = cpu_type[len('cputype='):]

            if cpu_type == 'host':
                msg = f"Node '{vm.node}' has VM '{vm.name}' with cpu type host"
                messages.append(CheckMessage(CheckCode.WARN, msg))
            else:
                msg = f"Node '{vm.node}' has VM '{vm.name}' with cpu type {cpu}"
                messages.append(CheckMessage(CheckCode.OK, msg))
        return messages
Loading
Loading