Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NRPE: add check: check_sriov_numvfs.py #670

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions charmhelpers/contrib/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +0,0 @@
# Copyright 2014-2015 Canonical Limited.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 changes: 13 additions & 1 deletion charmhelpers/contrib/charmsupport/nrpe.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2012-2021 Canonical Limited.
# Copyright 2012-2022 Canonical Limited.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -520,3 +520,15 @@ def remove_deprecated_check(nrpe, deprecated_services):
for dep_svc in deprecated_services:
SPFZ marked this conversation as resolved.
Show resolved Hide resolved
log('Deprecated service: {}'.format(dep_svc))
nrpe.remove_check(shortname=dep_svc)


def add_sriov_numvfs_check(nrpe, sriov_numvfs):
"""
Add a check for the number of VFs configured for SR-IOV interfaces
:param NRPE nrpe: NRPE object to add check to
:param str sriov_numvfs: space separated list of <interface>:<numvfs>
"""
nrpe.add_check(
shortname='sriov_numvfs',
description='Check SRIOV numvfs',
check_cmd='check_sriov_numvfs.py {}'.format(sriov_numvfs))
Empty file.
128 changes: 128 additions & 0 deletions charmhelpers/contrib/network/files/check_sriov_numvfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#!/usr/bin/env python3
SPFZ marked this conversation as resolved.
Show resolved Hide resolved
# Copyright 2014-2022 Canonical Limited.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Check to verify the number of virtual functions (VFs) active on SR-IOV network
interfaces. Based on /sys/class/net/<interface>/device/sriov_numvfs and
/sys/class/net/<interface>/device/sriov_totalvfs.

usage: check_sriov_numvfs.py [-h] sriov_numvfs [sriov_numvfs ...]

Check SR-IOV number of virtual functions configuration

positional arguments:
sriov_numvfs format: <interface>:<numvfs>

For each interfaces:numvfs pair given it verifies that:
1. VFs are enabled
2. VFs are configured are the same as specified in the check parameter
3. VFs configured do not exceed the maximum available on the interface

Non-existent interfaces are not checked.

Example: ./check_sriov_numvfs.py ens3f0:32 ens3f1:32 ens6f0:32 ens6f1:32
"""

import argparse
import os.path
import traceback
import sys


DEVICE_TEMPLATE = "/sys/class/net/{0}"
SRIOV_NUMVFS_TEMPLATE = "/sys/class/net/{0}/device/sriov_numvfs"
SRIOV_TOTALVFS_TEMPLATE = "/sys/class/net/{0}/device/sriov_totalvfs"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm - why are we checking this one? This is the total number of VFs that the PF supports? Customers could have valid reasons for configuring less VFs than the number the PF supports, e.g. the less VFs you have, the more channels/queues you have per VF.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the number of VFs that the PF supports.
We are checking this to make sure we are not trying to configure more VFs than the PF supports. This could happen if we have different NICs in a group of servers with different capabilities.
Currently, if the number of VFs set on the PF is higher than totalvfs, it just sets it to totalvfs (silent fail).

If numvfs > totalvfs then numvfs != sriov_numvfs will also fail, however the help text will report a more accurate reason and therefore hopefully help to resolve it faster.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you elaborate a bit more on why this is a critical alert when the requested VFs cannot be provided by the card? It seems conceivable to me that 2 cards with different capabilities across different units have the same interface name but one can serve 128 VFs and another 64. Setting the value to 128 maxes out both available VFs but it is fewer than 128*2.



def get_interface_setting(file):
"""Return the value content of a setting file as int"""
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstrings should also document the function arguments and return expectations (including exceptions raised).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added parameter and return docstrings to all functions now, please verify.

with open(file) as f:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you want to guard against potential errors here (e.g. permission denied or the file not existing)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

L66 checks if the files do exist. But I added an exception catching wrapper around the check now to be safe.

value = f.read()
return int(value)


def check_interface_numvfs(iface, numvfs):
"""Check SR-IOV numvfs config"""
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same as docstring comment as above

sriov_numvfs_path = SRIOV_NUMVFS_TEMPLATE.format(iface)
sriov_totalvfs_path = SRIOV_TOTALVFS_TEMPLATE.format(iface)
msg = []

# Ignore non-existing interfaces
if not os.path.exists(DEVICE_TEMPLATE.format(iface)):
return []

# Ensure that SR-IOV/VT-d and IOMMU are enabled
if (not os.path.exists(sriov_totalvfs_path) or
not os.path.exists(sriov_numvfs_path)):
return ["{}: VFs are disabled or not-available".format(iface)]

# Verify number of virtual functions
sriov_numvfs = get_interface_setting(sriov_numvfs_path)
sriov_totalvfs = get_interface_setting(sriov_totalvfs_path)
if numvfs != sriov_numvfs:
msg.append(
"{}: Number of VFs on interface ({}) does not match expected ({})"
.format(iface, sriov_numvfs, numvfs)
)
if numvfs > sriov_totalvfs:
msg.append(
"{}: Maximum number of VFs available on interface ({}) is lower than the expected ({})"
.format(iface, sriov_totalvfs, numvfs)
)
return msg


def parse_sriov_numvfs(device_numvfs):
"""Parse parameters and check format"""
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see above

msg = "Parameter format must be '<interface>:<numvfs>', e.g. ens3f0:32, given: {}".format(device_numvfs)
assert device_numvfs != '', msg
parts = device_numvfs.split(':')
assert len(parts) == 2, msg
assert len(parts[0]) > 0, msg
assert int(parts[1]) > 0, msg
iface = str(device_numvfs.split(':')[0])
numvfs = int(device_numvfs.split(':')[1])
return (iface, numvfs)


def parse_args():
"""Parse command-line options."""
parser = argparse.ArgumentParser(description="Check SR-IOV number of virtual functions configuration")
parser.add_argument("sriov_numvfs", nargs='+', help="format: <interface>:<numvfs>")
args = parser.parse_args()
return args


def main():
"""Parse args and check the sriov numvfs."""
args = parse_args()
error_msg = []

try:
for device_numvfs in args.sriov_numvfs:
iface, numvfs = parse_sriov_numvfs(device_numvfs)
error_msg += check_interface_numvfs(iface, numvfs)
if error_msg:
print("CRITICAL: {} problems detected\n".format(len(error_msg)) + "\n".join(error_msg))
sys.exit(2)
except: # noqa: E722
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not opposed to a catch-all scope for unexpected exceptions. However, we should consider adding a previous specific list of exceptions that we think could be raised (and maybe raise a critical error when they happen). Alerting is triggered in Nagios when unknown/critical/recover events happen, so this change would only help in sharing a more meaningful message (a sentence vs a python traceback).

e.g. PermissionError, FileNotFoundError, AssertionError.

Side note: instead of using "assert ...", an exception class inherited from Exception (e.g. ArgsFormatError) could be raised when parsing CLI arguments (and caught here).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for the suggestions, I changed it to match your suggestion.

print("{} raised unknown exception '{}'".format(__file__, sys.exc_info()[0]))
traceback.print_exc(file=sys.stdout)
sys.exit(3)

print("OK: sriov_numvfs set to " + ", ".join(args.sriov_numvfs))
SPFZ marked this conversation as resolved.
Show resolved Hide resolved


if __name__ == "__main__":
main()
9 changes: 9 additions & 0 deletions tests/contrib/charmsupport/test_nrpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,3 +494,12 @@ def test_copy_nrpe_checks_nrpe_files_dir(self):
self.patched['copy2'].assert_called_once_with(
'filea',
'/usr/local/lib/nagios/plugins/filea')

@patch.object(nrpe.NRPE, 'add_check')
def test_add_sriov_numvfs_check(self, mock_nrpe_add_check):
foo = nrpe.NRPE()
nrpe.add_sriov_numvfs_check(foo, 'ens3f0:32 ens3f1:32')
mock_nrpe_add_check.assert_called_once_with(
shortname='sriov_numvfs',
description='Check SRIOV numvfs',
check_cmd='check_sriov_numvfs.py ens3f0:32 ens3f1:32')
Empty file.
82 changes: 82 additions & 0 deletions tests/contrib/network/files/test_check_sriov_numvfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import mock
import os
import tempfile
import unittest

from charmhelpers.contrib.network.files import check_sriov_numvfs

__author__ = 'Stephan Pampel <[email protected]>'


class TestCheckSriovNumfs(unittest.TestCase):

interface_folder = os.path.join(tempfile.gettempdir(), 'ens3f0')
sriov_numvfs_file = os.path.join(interface_folder, 'sriov_numvfs')
sriov_totalvfs_file = os.path.join(interface_folder, 'sriov_totalvfs')

def __init__(self, *args, **kwargs):
super(TestCheckSriovNumfs, self).__init__(*args, **kwargs)
if not os.path.exists(self.interface_folder):
os.mkdir(self.interface_folder)
with open(self.sriov_numvfs_file, "w") as f:
f.write("32")
with open(self.sriov_totalvfs_file, "w") as f:
f.write("64")

def __del__(self):
if os.path.exists(self.interface_folder):
for file in os.listdir(self.interface_folder):
os.remove(os.path.join(self.interface_folder, file))
os.removedirs(self.interface_folder)

def test_parameter_parsing(self):
"""Ensure that the format of the sriov_numvfs parameter is parsed correctly"""
iface, numvfs = check_sriov_numvfs.parse_sriov_numvfs('ens3f0:32')
self.assertEqual(iface, 'ens3f0')
self.assertEqual(numvfs, 32)

for param in ['', ':', ':test', ':32', 'ens3f2', 'a:1:b']:
self.assertRaises(AssertionError, check_sriov_numvfs.parse_sriov_numvfs, param)

for param in ['ens3f2:', 'ens3f2:test']:
self.assertRaises(ValueError, check_sriov_numvfs.parse_sriov_numvfs, param)

def test_check_interface_numvfs_no_interface(self):
"""Check should ignore the interface if it does not exists"""
self.assertListEqual(
check_sriov_numvfs.check_interface_numvfs('no-interface', 0), []
)

@mock.patch('charmhelpers.contrib.network.files.check_sriov_numvfs.DEVICE_TEMPLATE', interface_folder)
def test_check_interface_numvfs_vfs_disabled(self):
"""Check should report if virtual functions are disabled """
self.assertListEqual(
check_sriov_numvfs.check_interface_numvfs('ens3f0', 0),
['ens3f0: VFs are disabled or not-available']
)

@mock.patch('charmhelpers.contrib.network.files.check_sriov_numvfs.DEVICE_TEMPLATE', interface_folder)
@mock.patch('charmhelpers.contrib.network.files.check_sriov_numvfs.SRIOV_NUMVFS_TEMPLATE', sriov_numvfs_file)
@mock.patch('charmhelpers.contrib.network.files.check_sriov_numvfs.SRIOV_TOTALVFS_TEMPLATE', sriov_totalvfs_file)
def test_check_interface_numvfs_vfs_enabled(self):
"""Check if numvfs is evaluated correctly"""

# check numvfs correct should pass
self.assertListEqual(
check_sriov_numvfs.check_interface_numvfs('ens3f0', 32),
[]
)

# check numvfs != expected should faild
self.assertListEqual(
check_sriov_numvfs.check_interface_numvfs('ens3f0', 16),
['ens3f0: Number of VFs on interface (32) does not match expected (16)']
)

# check numvfs > sriov_totalvfs should fail
with open(self.sriov_numvfs_file, "w") as f:
f.write("128")
self.assertListEqual(
check_sriov_numvfs.check_interface_numvfs('ens3f0', 128),
['ens3f0: Maximum number of VFs available on interface (64) is lower than the expected (128)']
)