From 4c394742963114f1b774205b02c2f51461a365bf Mon Sep 17 00:00:00 2001
From: Ponnuvel Palaniyappan
Date: Fri, 4 Oct 2024 13:33:11 +0100
Subject: [PATCH] [ceph-osd] improve osdfull reason and suggest workaround
 (#982)

There's been evidence that bdev_async_discard was the root cause of this
issue, so anyone encountering this problem should disable it.

Signed-off-by: Ponnuvel Palaniyappan
---
 .../storage/ceph/ceph-mon/osd_unusual_raw.yaml | 16 +++++++++++-----
 .../storage/ceph/ceph-mon/osd_unusual_raw.yaml | 16 +++++++++++-----
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/hotsos/defs/scenarios/storage/ceph/ceph-mon/osd_unusual_raw.yaml b/hotsos/defs/scenarios/storage/ceph/ceph-mon/osd_unusual_raw.yaml
index a6c00949d..7f02cae9f 100644
--- a/hotsos/defs/scenarios/storage/ceph/ceph-mon/osd_unusual_raw.yaml
+++ b/hotsos/defs/scenarios/storage/ceph/ceph-mon/osd_unusual_raw.yaml
@@ -9,11 +9,17 @@ conclusions:
     raises:
       type: CephOSDWarning
       message: >-
-        Found OSD(s) {bad_osds} with larger raw usage size than the combined
-        data+meta+omap usage. While a certain discrepancy is to be expected due to
-        Ceph's using space not accounted by data+meta+omap columns, these are more
-        than {limit}% and potentially indicate a bug in Ceph. If these OSDs appear
-        full or misbehave, please restart them and possibly file a bug in Ceph tracker.
+        Found OSD(s) {bad_osds} with larger raw usage size than data+meta+omap
+        combined. While a discrepancy is to be expected due to Ceph using space
+        not accounted by data+meta+omap columns, usage is greater than {limit}%
+        and likely indicates high discard ops sent to disk which is often
+        the case for workloads with frequent rewrites.
+
+        If these OSDs appear full or misbehave please restart them.
+
+        If the problem persists (i.e. OSD restarts do not help) you should disable
+        bdev_async_discard for OSDs. For charmed Ceph, this option is controlled
+        via the bdev-enable-discard flag which should be set to 'disable'.
       format-dict:
         bad_osds: '@checks.osds_have_unusual_raw_usage.requires.value_actual:comma_join'
         limit: hotsos.core.plugins.storage.ceph.CephCluster.OSD_DISCREPANCY_ALLOWED
diff --git a/hotsos/defs/tests/scenarios/storage/ceph/ceph-mon/osd_unusual_raw.yaml b/hotsos/defs/tests/scenarios/storage/ceph/ceph-mon/osd_unusual_raw.yaml
index f62ef4319..0a09b7e7c 100644
--- a/hotsos/defs/tests/scenarios/storage/ceph/ceph-mon/osd_unusual_raw.yaml
+++ b/hotsos/defs/tests/scenarios/storage/ceph/ceph-mon/osd_unusual_raw.yaml
@@ -115,8 +115,14 @@ data-root:
     - sos_commands/systemd/systemctl_list-unit-files
 raised-issues:
   CephOSDWarning: >-
-    Found OSD(s) osd.2 with larger raw usage size than the combined
-    data+meta+omap usage. While a certain discrepancy is to be expected due to
-    Ceph's using space not accounted by data+meta+omap columns, these are more
-    than 5% and potentially indicate a bug in Ceph. If these OSDs appear
-    full or misbehave, please restart them and possibly file a bug in Ceph tracker.
+    Found OSD(s) osd.2 with larger raw usage size than data+meta+omap
+    combined. While a discrepancy is to be expected due to Ceph using space
+    not accounted by data+meta+omap columns, usage is greater than 5%
+    and likely indicates high discard ops sent to disk which is often
+    the case for workloads with frequent rewrites.
+
+    If these OSDs appear full or misbehave please restart them.
+
+    If the problem persists (i.e. OSD restarts do not help) you should disable
+    bdev_async_discard for OSDs. For charmed Ceph, this option is controlled
+    via the bdev-enable-discard flag which should be set to 'disable'.
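
For reference, on a charmed Ceph deployment the workaround described in the new
message could be applied roughly as sketched below. The application name
"ceph-osd" is an assumption and should be adjusted to whatever name the OSD
application has in the model; the bdev-enable-discard option is the one
referenced in the message above.

    # assumed application name "ceph-osd"; adjust to match the deployment
    juju config ceph-osd bdev-enable-discard=disable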