From eac9ad2647cf3445d116dbacbf6dfc652f71677a Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:11:53 +0100 Subject: [PATCH 1/4] Convert smartmon script to python --- etc/kayobe/ansible/scripts/smartmon.py | 156 +++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 etc/kayobe/ansible/scripts/smartmon.py diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py new file mode 100644 index 000000000..2a50c9187 --- /dev/null +++ b/etc/kayobe/ansible/scripts/smartmon.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 + +import subprocess +import json +from datetime import datetime + +SMARTCTL_PATH = "/usr/sbin/smartctl" + +def run_command(command, parse_json=False): + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + if parse_json: + return json.loads(result.stdout) + else: + return result.stdout.strip() + +def parse_smartctl_attributes(disk, disk_type, serial, json_data): + labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial}"' + metrics = [] + smartmon_attrs = set([ + "airflow_temperature_cel", "command_timeout", "current_pending_sector", "end_to_end_error", "erase_fail_count", + "g_sense_error_rate", "hardware_ecc_recovered", "host_reads_32mib", "host_reads_mib", "host_writes_32mib", + "host_writes_mib", "load_cycle_count", "media_wearout_indicator", "nand_writes_1gib", "offline_uncorrectable", + "power_cycle_count", "power_on_hours", "program_fail_cnt_total", "program_fail_count", "raw_read_error_rate", + "reallocated_event_count", "reallocated_sector_ct", "reported_uncorrect", "runtime_bad_block", "sata_downshift_count", + "seek_error_rate", "spin_retry_count", "spin_up_time", "start_stop_count", "temperature_case", "temperature_celsius", + "temperature_internal", "total_lbas_read", "total_lbas_written", "udma_crc_error_count", "unsafe_shutdown_count", + "unused_rsvd_blk_cnt_tot", 
"wear_leveling_count", "workld_host_reads_perc", "workld_media_wear_indic", "workload_minutes", + "critical_warning", "temperature", "available_spare", "available_spare_threshold", "percentage_used", + "data_units_read", "data_units_written", "host_reads", "host_writes", "controller_busy_time", + "power_cycles", "unsafe_shutdowns", "media_errors", "num_err_log_entries", + "warning_temp_time", "critical_comp_time" + ]) + if 'nvme_smart_health_information_log' in json_data: + smart_log = json_data['nvme_smart_health_information_log'] + for attr_name, value in smart_log.items(): + attr_name = attr_name.replace(' ', '_').lower() + if attr_name in smartmon_attrs: + metrics.append(f"{attr_name}{{{labels}}} {value}") + elif 'scsi_grown_defect_list' in json_data: + scsi_attrs = json_data.get('scsi_grown_defect_list', {}) + for attr_name, value in scsi_attrs.items(): + attr_name = attr_name.replace(' ', '_').lower() + if attr_name in smartmon_attrs: + metrics.append(f"{attr_name}{{{labels}}} {value}") + elif 'ata_smart_attributes' in json_data and 'table' in json_data['ata_smart_attributes']: + for attr in json_data['ata_smart_attributes']['table']: + attr_name = attr['name'].replace('-', '_').lower() + if attr_name in smartmon_attrs: + attr_id = attr.get('id', '') + value = attr.get('value', '') + worst = attr.get('worst', '') + threshold = attr.get('thresh', '') + raw_value = attr.get('raw', {}).get('value', '') + metrics.append(f"{attr_name}_value{{{labels},smart_id=\"{attr_id}\"}} {value}") + metrics.append(f"{attr_name}_worst{{{labels},smart_id=\"{attr_id}\"}} {worst}") + metrics.append(f"{attr_name}_threshold{{{labels},smart_id=\"{attr_id}\"}} {threshold}") + metrics.append(f"{attr_name}_raw_value{{{labels},smart_id=\"{attr_id}\"}} {raw_value}") + return metrics + +def parse_smartctl_info(disk, disk_type, json_data): + info = json_data.get('device', {}) + smart_status = json_data.get('smart_status', {}) + labels = { + 'disk': disk, + 'type': disk_type, + 'vendor': 
info.get('vendor', ''), + 'product': info.get('product', ''), + 'revision': info.get('revision', ''), + 'lun_id': info.get('lun_id', ''), + 'model_family': json_data.get('model_family', ''), + 'device_model': json_data.get('model_name', ''), + 'serial_number': json_data.get('serial_number', '').lower(), + 'firmware_version': json_data.get('firmware_version', '') + } + label_str = ','.join(f'{k}="{v}"' for k, v in labels.items()) + metrics = [ + f'device_info{{{label_str}}} 1', + f'device_smart_available{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("available", False) else 0}', + ] + if smart_status.get("available", False): + metrics.append(f'device_smart_enabled{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("enabled", False) else 0}') + if 'passed' in smart_status: + metrics.append(f'device_smart_healthy{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("passed", False) else 0}') + return metrics + +def format_output(metrics): + output = [] + last_metric = "" + for metric in sorted(metrics): + metric_name = metric.split('{')[0] + if metric_name != last_metric: + output.append(f"# HELP smartmon_{metric_name} SMART metric {metric_name}") + output.append(f"# TYPE smartmon_{metric_name} gauge") + last_metric = metric_name + output.append(f"smartmon_{metric}") + return '\n'.join(output) + +def main(): + try: + version_output = run_command([SMARTCTL_PATH, '-j'], parse_json=True) + smartctl_version_list = version_output.get('smartctl', {}).get('version', []) + if smartctl_version_list: + smartctl_version_str = '.'.join(map(str, smartctl_version_list)) + else: + smartctl_version_str = "unknown" + except json.JSONDecodeError: + smartctl_version_str = "unknown" + metrics = [f'smartctl_version{{version="{smartctl_version_str}"}} 1'] + + try: + device_list_output = run_command([SMARTCTL_PATH, '--scan-open', '-j'], 
parse_json=True) + devices = [] + for device in device_list_output.get('devices', []): + disk = device.get('name', '') + disk_type = device.get('type', 'auto') + if disk: + devices.append((disk, disk_type)) + except json.JSONDecodeError: + devices = [] + + for disk, disk_type in devices: + serial_number = '' + active = 1 + metrics.append(f'smartctl_run{{disk="{disk}",type="{disk_type}"}} {int(datetime.utcnow().timestamp())}') + + try: + standby_output = run_command([SMARTCTL_PATH, '-n', 'standby', '-d', disk_type, '-j', disk], parse_json=True) + power_mode = standby_output.get('power_mode', '') + if power_mode == 'standby': + active = 0 + except json.JSONDecodeError: + active = 0 # Assume device is inactive if we can't parse the output + + metrics.append(f'device_active{{disk="{disk}",type="{disk_type}"}} {active}') + + if active == 0: + continue + + try: + info_output = run_command([SMARTCTL_PATH, '-i', '-H', '-d', disk_type, '-j', disk], parse_json=True) + except json.JSONDecodeError: + continue + metrics.extend(parse_smartctl_info(disk, disk_type, info_output)) + serial_number = info_output.get('serial_number', '').lower() + + try: + attributes_output = run_command([SMARTCTL_PATH, '-A', '-d', disk_type, '-j', disk], parse_json=True) + except json.JSONDecodeError: + continue + metrics.extend(parse_smartctl_attributes(disk, disk_type, serial_number, attributes_output)) + + formatted_output = format_output(metrics) + print(formatted_output) + +if __name__ == "__main__": + main() From 915c0ab9fa71d20292b1061b060dcc56af4cfc4b Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:12:32 +0100 Subject: [PATCH 2/4] Create tests for smartmon --- etc/kayobe/ansible/scripts/test_smartmon.py | 265 ++++++++++++++++++++ 1 file changed, 265 insertions(+) create mode 100644 etc/kayobe/ansible/scripts/test_smartmon.py diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py 
new file mode 100644 index 000000000..a771a7ee6 --- /dev/null +++ b/etc/kayobe/ansible/scripts/test_smartmon.py @@ -0,0 +1,265 @@ +import unittest +from unittest.mock import patch +from smartmon import ( + parse_smartctl_info, + parse_smartctl_attributes, + main, +) + +class TestSmartMon(unittest.TestCase): + @patch('smartmon.run_command') + def test_parse_smartctl_info(self, mock_run_command): + devices_info = [ + { + 'disk': '/dev/nvme0', + 'disk_type': 'nvme', + 'json_output': { + 'device': { + 'name': '/dev/nvme0', + 'info_name': '/dev/nvme0', + 'type': 'nvme', + 'protocol': 'NVMe', + }, + 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', + 'serial_number': 'Y2Q0A0BGTCF8', + 'firmware_version': '2.2.0', + 'smart_status': { + 'passed': True, + 'available': True, + 'enabled': True + }, + } + }, + { + 'disk': '/dev/nvme1', + 'disk_type': 'nvme', + 'json_output': { + 'device': { + 'name': '/dev/nvme1', + 'info_name': '/dev/nvme1', + 'type': 'nvme', + 'protocol': 'NVMe', + }, + 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', + 'serial_number': 'Y2Q0A09PTCF8', + 'firmware_version': '2.2.0', + 'smart_status': { + 'passed': True, + 'available': True, + 'enabled': True + }, + } + }, + ] + + for device_info in devices_info: + disk = device_info['disk'] + disk_type = device_info['disk_type'] + json_output = device_info['json_output'] + serial_number = json_output.get('serial_number', '').lower() + + expected_metrics = [ + f'device_info{{disk="{disk}",type="{disk_type}",vendor="",product="",revision="",lun_id="",model_family="",device_model="{json_output.get("model_name", "")}",serial_number="{serial_number}",firmware_version="{json_output.get("firmware_version", "")}"}} 1', + f'device_smart_available{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', + f'device_smart_enabled{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', + f'device_smart_healthy{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', + ] + + metrics = 
parse_smartctl_info(disk, disk_type, json_output) + for expected_metric in expected_metrics: + self.assertIn(expected_metric, metrics) + + @patch('smartmon.run_command') + def test_parse_smartctl_attributes(self, mock_run_command): + devices_attributes = [ + { + 'disk': '/dev/nvme0', + 'disk_type': 'nvme', + 'serial': 'y2q0a0bgtcf8', + 'json_output': { + 'nvme_smart_health_information_log': { + 'critical_warning': 0, + 'temperature': 36, + 'available_spare': 100, + 'available_spare_threshold': 10, + 'percentage_used': 0, + 'data_units_read': 117446405, + 'data_units_written': 84630284, + 'host_reads': 634894145, + 'host_writes': 4502620984, + 'controller_busy_time': 92090, + 'power_cycles': 746, + 'power_on_hours': 12494, + 'unsafe_shutdowns': 35, + 'media_errors': 0, + 'num_err_log_entries': 827, + 'warning_temp_time': 0, + 'critical_comp_time': 0 + } + } + }, + { + 'disk': '/dev/nvme1', + 'disk_type': 'nvme', + 'serial': 'y2q0a09ptcf8', + 'json_output': { + 'nvme_smart_health_information_log': { + 'critical_warning': 0, + 'temperature': 35, + 'available_spare': 99, + 'available_spare_threshold': 10, + 'percentage_used': 1, + 'data_units_read': 50000000, + 'data_units_written': 40000000, + 'host_reads': 300000000, + 'host_writes': 2000000000, + 'controller_busy_time': 80000, + 'power_cycles': 700, + 'power_on_hours': 12000, + 'unsafe_shutdowns': 30, + 'media_errors': 0, + 'num_err_log_entries': 800, + 'warning_temp_time': 0, + 'critical_comp_time': 0 + } + } + }, + ] + + for device_attr in devices_attributes: + disk = device_attr['disk'] + disk_type = device_attr['disk_type'] + serial = device_attr['serial'] + json_output = device_attr['json_output'] + + metrics = parse_smartctl_attributes(disk, disk_type, serial, json_output) + + expected_metrics = [ + f'temperature{{disk="{disk}",type="{disk_type}",serial_number="{serial}"}} {json_output["nvme_smart_health_information_log"]["temperature"]}', + 
f'available_spare{{disk="{disk}",type="{disk_type}",serial_number="{serial}"}} {json_output["nvme_smart_health_information_log"]["available_spare"]}', + ] + + for expected_metric in expected_metrics: + self.assertIn(expected_metric, metrics) + + @patch('smartmon.run_command') + def test_main(self, mock_run_command): + def side_effect(command, parse_json=False): + if '--scan-open' in command: + return { + 'devices': [ + {'name': '/dev/nvme0', 'info_name': '/dev/nvme0', 'type': 'nvme'}, + {'name': '/dev/nvme1', 'info_name': '/dev/nvme1', 'type': 'nvme'}, + ] + } if parse_json else '' + elif '-n' in command: + return {'power_mode': 'active'} if parse_json else '' + elif '-i' in command: + if '/dev/nvme0' in command: + return { + 'device': { + 'name': '/dev/nvme0', + 'info_name': '/dev/nvme0', + 'type': 'nvme', + 'protocol': 'NVMe', + }, + 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', + 'serial_number': 'Y2Q0A0BGTCF8', + 'firmware_version': '2.2.0', + 'smart_status': { + 'passed': True, + 'available': True, + 'enabled': True + }, + } if parse_json else '' + elif '/dev/nvme1' in command: + return { + 'device': { + 'name': '/dev/nvme1', + 'info_name': '/dev/nvme1', + 'type': 'nvme', + 'protocol': 'NVMe', + }, + 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', + 'serial_number': 'Y2Q0A09PTCF8', + 'firmware_version': '2.2.0', + 'smart_status': { + 'passed': True, + 'available': True, + 'enabled': True + }, + } if parse_json else '' + elif '-A' in command: + if '/dev/nvme0' in command: + return { + 'nvme_smart_health_information_log': { + 'critical_warning': 0, + 'temperature': 36, + 'available_spare': 100, + 'available_spare_threshold': 10, + 'percentage_used': 0, + 'data_units_read': 117446405, + 'data_units_written': 84630284, + 'host_reads': 634894145, + 'host_writes': 4502620984, + 'controller_busy_time': 92090, + 'power_cycles': 746, + 'power_on_hours': 12494, + 'unsafe_shutdowns': 35, + 'media_errors': 0, + 'num_err_log_entries': 827, + 'warning_temp_time': 0, + 
'critical_comp_time': 0 + } + } if parse_json else '' + elif '/dev/nvme1' in command: + return { + 'nvme_smart_health_information_log': { + 'critical_warning': 0, + 'temperature': 35, + 'available_spare': 99, + 'available_spare_threshold': 10, + 'percentage_used': 1, + 'data_units_read': 50000000, + 'data_units_written': 40000000, + 'host_reads': 300000000, + 'host_writes': 2000000000, + 'controller_busy_time': 80000, + 'power_cycles': 700, + 'power_on_hours': 12000, + 'unsafe_shutdowns': 30, + 'media_errors': 0, + 'num_err_log_entries': 800, + 'warning_temp_time': 0, + 'critical_comp_time': 0 + } + } if parse_json else '' + elif '-j' in command and len(command) == 2: + return { + 'smartctl': { + 'version': [7, 2], + 'svn_revision': '5155', + 'platform_info': 'x86_64-linux-5.15.0-122-generic', + 'build_info': '(local build)', + } + } if parse_json else '' + else: + return {} if parse_json else '' + + mock_run_command.side_effect = side_effect + + with patch('builtins.print') as mock_print: + main() + output_lines = [] + for call in mock_print.call_args_list: + output_lines.extend(call[0][0].split('\n')) + expected_metrics = [ + 'smartmon_device_info{disk="/dev/nvme0",type="nvme",vendor="",product="",revision="",lun_id="",model_family="",device_model="Dell Ent NVMe CM6 RI 7.68TB",serial_number="y2q0a0bgtcf8",firmware_version="2.2.0"} 1', + 'smartmon_device_info{disk="/dev/nvme1",type="nvme",vendor="",product="",revision="",lun_id="",model_family="",device_model="Dell Ent NVMe CM6 RI 7.68TB",serial_number="y2q0a09ptcf8",firmware_version="2.2.0"} 1', + ] + for expected_metric in expected_metrics: + self.assertIn(expected_metric, output_lines) + + +if __name__ == '__main__': + unittest.main() From b5b72bac81628e3b97bfb3fd8d0ddd1cdf25a787 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:08:49 +0000 Subject: [PATCH 3/4] Use pySMART --- etc/kayobe/ansible/scripts/smartmon.py | 279 +++++++++++++++---------- 
1 file changed, 168 insertions(+), 111 deletions(-) diff --git a/etc/kayobe/ansible/scripts/smartmon.py b/etc/kayobe/ansible/scripts/smartmon.py index 2a50c9187..bd4bb36bc 100644 --- a/etc/kayobe/ansible/scripts/smartmon.py +++ b/etc/kayobe/ansible/scripts/smartmon.py @@ -2,155 +2,212 @@ import subprocess import json +import re from datetime import datetime +from pySMART import DeviceList + SMARTCTL_PATH = "/usr/sbin/smartctl" +SMARTMON_ATTRS = { + "airflow_temperature_cel", + "command_timeout", + "current_pending_sector", + "end_to_end_error", + "erase_fail_count", + "g_sense_error_rate", + "hardware_ecc_recovered", + "host_reads_32mib", + "host_reads_mib", + "host_writes_32mib", + "host_writes_mib", + "load_cycle_count", + "media_wearout_indicator", + "nand_writes_1gib", + "offline_uncorrectable", + "power_cycle_count", + "power_on_hours", + "program_fail_cnt_total", + "program_fail_count", + "raw_read_error_rate", + "reallocated_event_count", + "reallocated_sector_ct", + "reported_uncorrect", + "runtime_bad_block", + "sata_downshift_count", + "seek_error_rate", + "spin_retry_count", + "spin_up_time", + "start_stop_count", + "temperature_case", + "temperature_celsius", + "temperature_internal", + "total_lbas_read", + "total_lbas_written", + "udma_crc_error_count", + "unsafe_shutdown_count", + "unused_rsvd_blk_cnt_tot", + "wear_leveling_count", + "workld_host_reads_perc", + "workld_media_wear_indic", + "workload_minutes", + "critical_warning", + "temperature", + "available_spare", + "available_spare_threshold", + "percentage_used", + "data_units_read", + "data_units_written", + "host_reads", + "host_writes", + "controller_busy_time", + "power_cycles", + "unsafe_shutdowns", + "media_errors", + "num_err_log_entries", + "warning_temp_time", + "critical_comp_time", +} + def run_command(command, parse_json=False): + """ + Helper to run a subprocess command and optionally parse JSON output. 
+ """ result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) if parse_json: return json.loads(result.stdout) - else: - return result.stdout.strip() - -def parse_smartctl_attributes(disk, disk_type, serial, json_data): - labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial}"' - metrics = [] - smartmon_attrs = set([ - "airflow_temperature_cel", "command_timeout", "current_pending_sector", "end_to_end_error", "erase_fail_count", - "g_sense_error_rate", "hardware_ecc_recovered", "host_reads_32mib", "host_reads_mib", "host_writes_32mib", - "host_writes_mib", "load_cycle_count", "media_wearout_indicator", "nand_writes_1gib", "offline_uncorrectable", - "power_cycle_count", "power_on_hours", "program_fail_cnt_total", "program_fail_count", "raw_read_error_rate", - "reallocated_event_count", "reallocated_sector_ct", "reported_uncorrect", "runtime_bad_block", "sata_downshift_count", - "seek_error_rate", "spin_retry_count", "spin_up_time", "start_stop_count", "temperature_case", "temperature_celsius", - "temperature_internal", "total_lbas_read", "total_lbas_written", "udma_crc_error_count", "unsafe_shutdown_count", - "unused_rsvd_blk_cnt_tot", "wear_leveling_count", "workld_host_reads_perc", "workld_media_wear_indic", "workload_minutes", - "critical_warning", "temperature", "available_spare", "available_spare_threshold", "percentage_used", - "data_units_read", "data_units_written", "host_reads", "host_writes", "controller_busy_time", - "power_cycles", "unsafe_shutdowns", "media_errors", "num_err_log_entries", - "warning_temp_time", "critical_comp_time" - ]) - if 'nvme_smart_health_information_log' in json_data: - smart_log = json_data['nvme_smart_health_information_log'] - for attr_name, value in smart_log.items(): - attr_name = attr_name.replace(' ', '_').lower() - if attr_name in smartmon_attrs: - metrics.append(f"{attr_name}{{{labels}}} {value}") - elif 'scsi_grown_defect_list' in json_data: - scsi_attrs = 
json_data.get('scsi_grown_defect_list', {}) - for attr_name, value in scsi_attrs.items(): - attr_name = attr_name.replace(' ', '_').lower() - if attr_name in smartmon_attrs: - metrics.append(f"{attr_name}{{{labels}}} {value}") - elif 'ata_smart_attributes' in json_data and 'table' in json_data['ata_smart_attributes']: - for attr in json_data['ata_smart_attributes']['table']: - attr_name = attr['name'].replace('-', '_').lower() - if attr_name in smartmon_attrs: - attr_id = attr.get('id', '') - value = attr.get('value', '') - worst = attr.get('worst', '') - threshold = attr.get('thresh', '') - raw_value = attr.get('raw', {}).get('value', '') - metrics.append(f"{attr_name}_value{{{labels},smart_id=\"{attr_id}\"}} {value}") - metrics.append(f"{attr_name}_worst{{{labels},smart_id=\"{attr_id}\"}} {worst}") - metrics.append(f"{attr_name}_threshold{{{labels},smart_id=\"{attr_id}\"}} {threshold}") - metrics.append(f"{attr_name}_raw_value{{{labels},smart_id=\"{attr_id}\"}} {raw_value}") - return metrics - -def parse_smartctl_info(disk, disk_type, json_data): - info = json_data.get('device', {}) - smart_status = json_data.get('smart_status', {}) + return result.stdout.strip() + +def parse_device_info(device): + """ + Produce Prometheus lines describing the device's identity and SMART status: + - device_info + - device_smart_available + - device_smart_enabled + - device_smart_healthy + """ + serial_number = (device.serial or "").lower() labels = { - 'disk': disk, - 'type': disk_type, - 'vendor': info.get('vendor', ''), - 'product': info.get('product', ''), - 'revision': info.get('revision', ''), - 'lun_id': info.get('lun_id', ''), - 'model_family': json_data.get('model_family', ''), - 'device_model': json_data.get('model_name', ''), - 'serial_number': json_data.get('serial_number', '').lower(), - 'firmware_version': json_data.get('firmware_version', '') + "disk": device.name, + "type": device.interface or "", + "vendor": device.vendor or "", + "model_family": device.family or 
"", + "device_model": device.model or "", + "serial_number": serial_number, + "firmware_version": device.firmware or "", } - label_str = ','.join(f'{k}="{v}"' for k, v in labels.items()) + label_str = ",".join(f'{k}="{v}"' for k, v in labels.items()) + metrics = [ f'device_info{{{label_str}}} 1', - f'device_smart_available{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("available", False) else 0}', + f'device_smart_available{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {1 if device.smart_capable else 0}', ] - if smart_status.get("available", False): - metrics.append(f'device_smart_enabled{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("enabled", False) else 0}') - if 'passed' in smart_status: - metrics.append(f'device_smart_healthy{{disk="{disk}",type="{disk_type}",serial_number="{labels["serial_number"]}"}} {1 if smart_status.get("passed", False) else 0}') + + if device.smart_capable: + metrics.append( + f'device_smart_enabled{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {1 if device.smart_enabled else 0}' + ) + if device.assessment: + is_healthy = 1 if device.assessment.upper() == "PASS" else 0 + metrics.append( + f'device_smart_healthy{{disk="{device.name}",type="{device.interface}",serial_number="{serial_number}"}} {is_healthy}' + ) + + return metrics + +def parse_if_attributes(device): + """ + For any device type (ATA, NVMe, SCSI, etc.), we read device.if_attributes. + We'll iterate over its public fields, convert them to snake_case, + and if it's in SMARTMON_ATTRS and numeric, we produce metrics. 
+ """ + metrics = [] + + if not device.if_attributes: + return metrics + + disk = device.name + disk_type = device.interface or "" + serial_number = (device.serial or "").lower() + labels = f'disk="{disk}",type="{disk_type}",serial_number="{serial_number}"' + + # Inspect all public attributes on device.if_attributes + for attr_name in dir(device.if_attributes): + if attr_name.startswith("_"): + continue # skip private / special methods + val = getattr(device.if_attributes, attr_name, None) + if callable(val): + continue # skip methods + + # Convert CamelCase or PascalCase -> snake_case, e.g. dataUnitsRead -> data_units_read + snake_name = re.sub(r'(? Date: Fri, 17 Jan 2025 16:13:05 +0000 Subject: [PATCH 4/4] Add tests for pysmart --- etc/kayobe/ansible/scripts/drives/nvme.json | 24 + etc/kayobe/ansible/scripts/test_smartmon.py | 513 ++++++++++---------- 2 files changed, 279 insertions(+), 258 deletions(-) create mode 100644 etc/kayobe/ansible/scripts/drives/nvme.json diff --git a/etc/kayobe/ansible/scripts/drives/nvme.json b/etc/kayobe/ansible/scripts/drives/nvme.json new file mode 100644 index 000000000..bbff19ec0 --- /dev/null +++ b/etc/kayobe/ansible/scripts/drives/nvme.json @@ -0,0 +1,24 @@ +{ + "device_info": { + "name": "/dev/nvme0", + "interface": "nvme", + "vendor": "AcmeCorp", + "family": "Acme NVMe Family", + "model": "Acme NVMe 1TB", + "serial": "ABCD1234", + "firmware": "3.0.1", + "smart_capable": true, + "smart_enabled": true, + "assessment": "PASS" + }, + "if_attributes": { + "criticalWarning": 0, + "temperature": 36, + "availableSpare": 100, + "availableSpareThreshold": 10, + "percentageUsed": 0, + "dataUnitsRead": 117446405, + "dataUnitsWritten": 84630284, + "notInSmartmonAttrs": 999 + } +} diff --git a/etc/kayobe/ansible/scripts/test_smartmon.py b/etc/kayobe/ansible/scripts/test_smartmon.py index a771a7ee6..a22df8ee1 100644 --- a/etc/kayobe/ansible/scripts/test_smartmon.py +++ b/etc/kayobe/ansible/scripts/test_smartmon.py @@ -1,265 +1,262 @@ 
+import glob +import json +import os +import re import unittest -from unittest.mock import patch + +from unittest.mock import patch, MagicMock + from smartmon import ( - parse_smartctl_info, - parse_smartctl_attributes, + parse_device_info, + parse_if_attributes, main, + SMARTMON_ATTRS ) +def load_json_fixture(filename): + """ + Load a JSON file from the 'drives' subfolder. + """ + path = os.path.join(os.path.dirname(__file__), "drives", filename) + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + + class TestSmartMon(unittest.TestCase): - @patch('smartmon.run_command') - def test_parse_smartctl_info(self, mock_run_command): - devices_info = [ - { - 'disk': '/dev/nvme0', - 'disk_type': 'nvme', - 'json_output': { - 'device': { - 'name': '/dev/nvme0', - 'info_name': '/dev/nvme0', - 'type': 'nvme', - 'protocol': 'NVMe', - }, - 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', - 'serial_number': 'Y2Q0A0BGTCF8', - 'firmware_version': '2.2.0', - 'smart_status': { - 'passed': True, - 'available': True, - 'enabled': True - }, - } - }, - { - 'disk': '/dev/nvme1', - 'disk_type': 'nvme', - 'json_output': { - 'device': { - 'name': '/dev/nvme1', - 'info_name': '/dev/nvme1', - 'type': 'nvme', - 'protocol': 'NVMe', - }, - 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', - 'serial_number': 'Y2Q0A09PTCF8', - 'firmware_version': '2.2.0', - 'smart_status': { - 'passed': True, - 'available': True, - 'enabled': True - }, - } - }, - ] - - for device_info in devices_info: - disk = device_info['disk'] - disk_type = device_info['disk_type'] - json_output = device_info['json_output'] - serial_number = json_output.get('serial_number', '').lower() - - expected_metrics = [ - f'device_info{{disk="{disk}",type="{disk_type}",vendor="",product="",revision="",lun_id="",model_family="",device_model="{json_output.get("model_name", "")}",serial_number="{serial_number}",firmware_version="{json_output.get("firmware_version", "")}"}} 1', - 
f'device_smart_available{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', - f'device_smart_enabled{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', - f'device_smart_healthy{{disk="{disk}",type="{disk_type}",serial_number="{serial_number}"}} 1', - ] - - metrics = parse_smartctl_info(disk, disk_type, json_output) - for expected_metric in expected_metrics: - self.assertIn(expected_metric, metrics) - - @patch('smartmon.run_command') - def test_parse_smartctl_attributes(self, mock_run_command): - devices_attributes = [ - { - 'disk': '/dev/nvme0', - 'disk_type': 'nvme', - 'serial': 'y2q0a0bgtcf8', - 'json_output': { - 'nvme_smart_health_information_log': { - 'critical_warning': 0, - 'temperature': 36, - 'available_spare': 100, - 'available_spare_threshold': 10, - 'percentage_used': 0, - 'data_units_read': 117446405, - 'data_units_written': 84630284, - 'host_reads': 634894145, - 'host_writes': 4502620984, - 'controller_busy_time': 92090, - 'power_cycles': 746, - 'power_on_hours': 12494, - 'unsafe_shutdowns': 35, - 'media_errors': 0, - 'num_err_log_entries': 827, - 'warning_temp_time': 0, - 'critical_comp_time': 0 - } - } - }, - { - 'disk': '/dev/nvme1', - 'disk_type': 'nvme', - 'serial': 'y2q0a09ptcf8', - 'json_output': { - 'nvme_smart_health_information_log': { - 'critical_warning': 0, - 'temperature': 35, - 'available_spare': 99, - 'available_spare_threshold': 10, - 'percentage_used': 1, - 'data_units_read': 50000000, - 'data_units_written': 40000000, - 'host_reads': 300000000, - 'host_writes': 2000000000, - 'controller_busy_time': 80000, - 'power_cycles': 700, - 'power_on_hours': 12000, - 'unsafe_shutdowns': 30, - 'media_errors': 0, - 'num_err_log_entries': 800, - 'warning_temp_time': 0, - 'critical_comp_time': 0 - } - } - }, - ] - - for device_attr in devices_attributes: - disk = device_attr['disk'] - disk_type = device_attr['disk_type'] - serial = device_attr['serial'] - json_output = device_attr['json_output'] - - 
metrics = parse_smartctl_attributes(disk, disk_type, serial, json_output) - - expected_metrics = [ - f'temperature{{disk="{disk}",type="{disk_type}",serial_number="{serial}"}} {json_output["nvme_smart_health_information_log"]["temperature"]}', - f'available_spare{{disk="{disk}",type="{disk_type}",serial_number="{serial}"}} {json_output["nvme_smart_health_information_log"]["available_spare"]}', - ] - - for expected_metric in expected_metrics: - self.assertIn(expected_metric, metrics) - - @patch('smartmon.run_command') - def test_main(self, mock_run_command): - def side_effect(command, parse_json=False): - if '--scan-open' in command: - return { - 'devices': [ - {'name': '/dev/nvme0', 'info_name': '/dev/nvme0', 'type': 'nvme'}, - {'name': '/dev/nvme1', 'info_name': '/dev/nvme1', 'type': 'nvme'}, - ] - } if parse_json else '' - elif '-n' in command: - return {'power_mode': 'active'} if parse_json else '' - elif '-i' in command: - if '/dev/nvme0' in command: - return { - 'device': { - 'name': '/dev/nvme0', - 'info_name': '/dev/nvme0', - 'type': 'nvme', - 'protocol': 'NVMe', - }, - 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', - 'serial_number': 'Y2Q0A0BGTCF8', - 'firmware_version': '2.2.0', - 'smart_status': { - 'passed': True, - 'available': True, - 'enabled': True - }, - } if parse_json else '' - elif '/dev/nvme1' in command: - return { - 'device': { - 'name': '/dev/nvme1', - 'info_name': '/dev/nvme1', - 'type': 'nvme', - 'protocol': 'NVMe', - }, - 'model_name': 'Dell Ent NVMe CM6 RI 7.68TB', - 'serial_number': 'Y2Q0A09PTCF8', - 'firmware_version': '2.2.0', - 'smart_status': { - 'passed': True, - 'available': True, - 'enabled': True - }, - } if parse_json else '' - elif '-A' in command: - if '/dev/nvme0' in command: - return { - 'nvme_smart_health_information_log': { - 'critical_warning': 0, - 'temperature': 36, - 'available_spare': 100, - 'available_spare_threshold': 10, - 'percentage_used': 0, - 'data_units_read': 117446405, - 'data_units_written': 84630284, - 
'host_reads': 634894145, - 'host_writes': 4502620984, - 'controller_busy_time': 92090, - 'power_cycles': 746, - 'power_on_hours': 12494, - 'unsafe_shutdowns': 35, - 'media_errors': 0, - 'num_err_log_entries': 827, - 'warning_temp_time': 0, - 'critical_comp_time': 0 - } - } if parse_json else '' - elif '/dev/nvme1' in command: - return { - 'nvme_smart_health_information_log': { - 'critical_warning': 0, - 'temperature': 35, - 'available_spare': 99, - 'available_spare_threshold': 10, - 'percentage_used': 1, - 'data_units_read': 50000000, - 'data_units_written': 40000000, - 'host_reads': 300000000, - 'host_writes': 2000000000, - 'controller_busy_time': 80000, - 'power_cycles': 700, - 'power_on_hours': 12000, - 'unsafe_shutdowns': 30, - 'media_errors': 0, - 'num_err_log_entries': 800, - 'warning_temp_time': 0, - 'critical_comp_time': 0 - } - } if parse_json else '' - elif '-j' in command and len(command) == 2: - return { - 'smartctl': { - 'version': [7, 2], - 'svn_revision': '5155', - 'platform_info': 'x86_64-linux-5.15.0-122-generic', - 'build_info': '(local build)', - } - } if parse_json else '' - else: - return {} if parse_json else '' - - mock_run_command.side_effect = side_effect - - with patch('builtins.print') as mock_print: - main() - output_lines = [] - for call in mock_print.call_args_list: - output_lines.extend(call[0][0].split('\n')) - expected_metrics = [ - 'smartmon_device_info{disk="/dev/nvme0",type="nvme",vendor="",product="",revision="",lun_id="",model_family="",device_model="Dell Ent NVMe CM6 RI 7.68TB",serial_number="y2q0a0bgtcf8",firmware_version="2.2.0"} 1', - 'smartmon_device_info{disk="/dev/nvme1",type="nvme",vendor="",product="",revision="",lun_id="",model_family="",device_model="Dell Ent NVMe CM6 RI 7.68TB",serial_number="y2q0a09ptcf8",firmware_version="2.2.0"} 1', - ] - for expected_metric in expected_metrics: - self.assertIn(expected_metric, output_lines) - - -if __name__ == '__main__': + @classmethod + def setUpClass(cls): + # Collect all 
*.json files from ./drives/ + data_folder = os.path.join(os.path.dirname(__file__), "drives") + cls.fixture_files = glob.glob(os.path.join(data_folder, "*.json")) + + def create_mock_device_from_json(self, device_info, if_attributes=None): + """ + Given a 'device_info' dict and optional 'if_attributes', build + a MagicMock that mimics a pySMART Device object. + """ + device = MagicMock() + device.name = device_info.get("name", "") + device.interface = device_info.get("interface", "") + device.vendor = device_info.get("vendor", "") + device.family = device_info.get("family", "") + device.model = device_info.get("model", "") + device.serial = device_info.get("serial", "") + device.firmware = device_info.get("firmware", "") + device.smart_capable = device_info.get("smart_capable", False) + device.smart_enabled = device_info.get("smart_enabled", False) + device.assessment = device_info.get("assessment", "") + + if if_attributes: + class IfAttributesMock: + pass + + if_mock = IfAttributesMock() + for key, val in if_attributes.items(): + setattr(if_mock, key, val) + device.if_attributes = if_mock + else: + device.if_attributes = None + + return device + + def test_parse_device_info(self): + """ + Test parse_device_info() for every JSON fixture in ./drives/. + We do subTest() so each fixture is tested individually. + """ + for fixture_path in self.fixture_files: + fixture_name = os.path.basename(fixture_path) + with self.subTest(msg=f"Testing device_info with {fixture_name}"): + data = load_json_fixture(fixture_name) + device_info = data["device_info"] + + device = self.create_mock_device_from_json(device_info) + metrics = parse_device_info(device) + + dev_name = device_info["name"] + dev_iface = device_info["interface"] + dev_serial = device_info["serial"].lower() + + # The device_info line should exist for every device + # e.g. 
device_info{disk="/dev/...",type="...",serial_number="..."} 1 + device_info_found = any( + line.startswith("device_info{") and + f'disk="{dev_name}"' in line and + f'type="{dev_iface}"' in line and + f'serial_number="{dev_serial}"' in line + for line in metrics + ) + self.assertTrue( + device_info_found, + f"Expected a device_info metric line for {dev_name} but didn't find it." + ) + + # If smart_capable is true, we expect device_smart_available = 1 + if device_info.get("smart_capable"): + smart_available_found = any( + line.startswith("device_smart_available{") and + f'disk="{dev_name}"' in line and + f'serial_number="{dev_serial}"' in line and + line.endswith(" 1") + for line in metrics + ) + self.assertTrue( + smart_available_found, + f"Expected device_smart_available=1 for {dev_name}, not found." + ) + + # If smart_enabled is true, we expect device_smart_enabled = 1 + if device_info.get("smart_enabled"): + smart_enabled_found = any( + line.startswith("device_smart_enabled{") and + f'disk="{dev_name}"' in line and + line.endswith(" 1") + for line in metrics + ) + self.assertTrue( + smart_enabled_found, + f"Expected device_smart_enabled=1 for {dev_name}, not found." + ) + + # device_smart_healthy if assessment in [PASS, WARN, FAIL] + # PASS => 1, otherwise => 0 + assessment = device_info.get("assessment", "").upper() + if assessment in ["PASS", "WARN", "FAIL"]: + expected_val = 1 if assessment == "PASS" else 0 + smart_healthy_found = any( + line.startswith("device_smart_healthy{") and + f'disk="{dev_name}"' in line and + line.endswith(f" {expected_val}") + for line in metrics + ) + self.assertTrue( + smart_healthy_found, + f"Expected device_smart_healthy={expected_val} for {dev_name}, not found." + ) + + def test_parse_if_attributes(self): + """ + Test parse_if_attributes() for every JSON fixture in ./drives/. + We do subTest() so each fixture is tested individually. 
+ """ + for fixture_path in self.fixture_files: + fixture_name = os.path.basename(fixture_path) + with self.subTest(msg=f"Testing if_attributes with {fixture_name}"): + data = load_json_fixture(fixture_name) + device_info = data["device_info"] + if_attrs = data.get("if_attributes", {}) + + device = self.create_mock_device_from_json(device_info, if_attrs) + metrics = parse_if_attributes(device) + + dev_name = device_info["name"] + dev_iface = device_info["interface"] + dev_serial = device_info["serial"].lower() + + # For each numeric attribute in JSON, if it's in SMARTMON_ATTRS, + # we expect a line in the script's output. + for attr_key, attr_val in if_attrs.items(): + # Convert from e.g. "criticalWarning" -> "critical_warning" + snake_key = re.sub(r'(?<!^)(?=[A-Z])', '_', attr_key).lower() + + if isinstance(attr_val, (int, float)) and snake_key in SMARTMON_ATTRS: + # We expect e.g. critical_warning{disk="/dev/..."} <value> + expected_line = ( + f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}" + ) + self.assertIn( + expected_line, + metrics, + f"Expected metric '{expected_line}' for attribute '{attr_key}' not found." + ) + else: + # If it's not in SMARTMON_ATTRS or not numeric, + # we do NOT expect a line with that name+value + unexpected_line = ( + f"{snake_key}{{disk=\"{dev_name}\",type=\"{dev_iface}\",serial_number=\"{dev_serial}\"}} {attr_val}" + ) + self.assertNotIn( + unexpected_line, + metrics, + f"Unexpected metric '{unexpected_line}' found for {attr_key}." + ) + + # Also ensure that non-numeric or disallowed attributes do not appear + # For instance "notInSmartmonAttrs" should never appear. + for line in metrics: + self.assertNotIn( + "not_in_smartmon_attrs", + line, + f"'notInSmartmonAttrs' attribute unexpectedly found in metric line: {line}" + ) + + @patch("smartmon.run_command") + @patch("smartmon.DeviceList") + def test_main(self, mock_devicelist_class, mock_run_cmd): + """ + End-to-end test of main() for every JSON fixture in ./drives/. + This ensures we can handle multiple disks (multiple fixture files).
+ """ + for fixture_path in self.fixture_files: + fixture_name = os.path.basename(fixture_path) + with self.subTest(msg=f"Testing main() with {fixture_name}"): + data = load_json_fixture(fixture_name) + device_info = data["device_info"] + if_attrs = data.get("if_attributes", {}) + + # Patch run_command to return a version & "active" power_mode + def run_command_side_effect(cmd, parse_json=False): + if "--version" in cmd: + return "smartctl 7.3 5422 [x86_64-linux-5.15.0]\n..." + if "-n" in cmd and "standby" in cmd and parse_json: + return {"power_mode": "active"} + return "" + + mock_run_cmd.side_effect = run_command_side_effect + + # Mock a single device from the fixture + device_mock = self.create_mock_device_from_json(device_info, if_attrs) + + # Make DeviceList() return our single mock device + mock_dev_list = MagicMock() + mock_dev_list.devices = [device_mock] + mock_devicelist_class.return_value = mock_dev_list + + with patch("builtins.print") as mock_print: + main() + + printed_lines = [] + for call_args in mock_print.call_args_list: + printed_lines.extend(call_args[0][0].split("\n")) + dev_name = device_info["name"] + dev_iface = device_info["interface"] + dev_serial = device_info["serial"].lower() + + # We expect a line for the run timestamp, e.g.: + # smartmon_smartctl_run{disk="/dev/...",type="..."} 1671234567 + run_line_found = any( + line.startswith("smartmon_smartctl_run{") and + f'disk="{dev_name}"' in line and + f'type="{dev_iface}"' in line + for line in printed_lines + ) + self.assertTrue( + run_line_found, + f"Expected 'smartmon_smartctl_run' metric line for {dev_name} not found." 
+ ) + + # Because we mocked "power_mode": "active", we expect device_active=1 + active_line_found = any( + line.startswith("smartmon_device_active{") and + f'disk="{dev_name}"' in line and + f'serial_number="{dev_serial}"' in line and + line.endswith(" 1") + for line in printed_lines + ) + self.assertTrue( + active_line_found, + f"Expected 'device_active{{...}} 1' line for {dev_name} not found." + ) + +if __name__ == "__main__": unittest.main()