From 53e5c722ef95eb3daec12d6268ac2308909fccec Mon Sep 17 00:00:00 2001 From: Zheng Li Date: Mon, 21 Jul 2014 17:42:16 +0100 Subject: [PATCH 1/5] Update the default priority setting in the comment section ... to reflect the changes of PR-1455. Signed-off-by: Zheng Li --- scripts/perfmon | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/perfmon b/scripts/perfmon index d8ee612c93b..a9300aa8435 100644 --- a/scripts/perfmon +++ b/scripts/perfmon @@ -616,7 +616,7 @@ class VMMonitor(ObjectMonitor): - Multiple nodes allowed - full list of child nodes is * name: what to call the variable (no default) - * alarm_priority: the priority of the messages generated (default '5') + * alarm_priority: the priority of the messages generated (default '3') * alarm_trigger_level: level of value that triggers an alarm (no default) * alarm_trigger_sense: 'high' if alarm_trigger_level is a max, otherwise 'low'. (default 'high') * alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60') @@ -665,7 +665,7 @@ class HOSTMonitor(ObjectMonitor): - Multiple nodes allowed - full list of child nodes is * name: what to call the variable (no default) - * alarm_priority: the priority of the messages generated (default '5') + * alarm_priority: the priority of the messages generated (default '3') * alarm_trigger_level: level of value that triggers an alarm (no default) * alarm_trigger_sense: 'high' if alarm_trigger_level is a max, otherwise 'low'. 
(default 'high') * alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60') From 43d6c0fea655cdec73d969efb1f7b6b89a3b49f7 Mon Sep 17 00:00:00 2001 From: Zheng Li Date: Mon, 21 Jul 2014 18:14:25 +0100 Subject: [PATCH 2/5] CP-9091: Add Dom0 mem_usage alert for perfmon Signed-off-by: Zheng Li --- scripts/perfmon | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/scripts/perfmon b/scripts/perfmon index a9300aa8435..32e5f028d2f 100644 --- a/scripts/perfmon +++ b/scripts/perfmon @@ -345,7 +345,7 @@ class RRDUpdates: # Consolidation functions: -supported_consolidation_functions = [ 'sum', 'average', 'max', 'get_percent_fs_usage' ] +supported_consolidation_functions = [ 'sum', 'average', 'max', 'get_percent_fs_usage', 'get_percent_mem_usage' ] def average(mylist): return sum(mylist)/float(len(mylist)) @@ -359,6 +359,27 @@ def get_percent_fs_usage(ignored): # strip of % character and convert to float return float(percentage[0:-1])/100.0 +def get_percent_mem_usage(ignored): + "Get the percent usage of Dom0 memory/swap. Input list is ignored and should be empty" + try: + memfd = open('/proc/meminfo', 'r') + memlist = memfd.readlines() + memfd.close() + memdict = [ m.split(':', 1) for m in memlist ] + memdict = dict([(k.strip(), float(re.search('\d+', v.strip()).group(0))) for (k,v) in memdict]) + # We consider the sum of res memory and swap in use as the hard demand + # of mem usage, it is bad if this number is beyond the physical mem, as + # in such case swapping is obligatory rather than voluntary, hence + # degrading the performance. 
We define the percentage metrics as + # (res_mem + swap_in_use) / phy_mem, which could potentially go beyond + # 100% (but is considered bad when it does) + mem_in_use = memdict['MemTotal'] - memdict['MemFree'] - memdict['Buffers'] - memdict['Cached'] + swap_in_use = memdict['SwapTotal'] - memdict['SwapFree'] + return float(mem_in_use + swap_in_use) / memdict['MemTotal'] + except Exception, e: + log_err("Error %s in get_percent_mem_usage, return 0.0 instead" % e) + return 0.0 + class VariableConfig: """Object storing the configuration of a Variable @@ -622,7 +643,7 @@ class VMMonitor(ObjectMonitor): * alarm_trigger_period: num seconds of 'bad' values before an alarm is sent (default '60') * alarm_auto_inhibit_period: num seconds this alarm disabled after an alarm is sent (default '3600') * consolidation_fn: how to combine variables from rrd_updates into one value - (default is 'average' for 'cpu_usage', 'get_percent_fs_usage' for 'fs_usage', & 'sum' for everything else) + (default is 'average' for 'cpu_usage', 'get_percent_fs_usage' for 'fs_usage', 'get_percent_mem_usage' for 'mem_usage', & 'sum' for everything else) * rrd_regex matches the names of variables from (xe vm-data-sources-list uuid=$vmuuid) used to compute value (only has defaults for "cpu_usage", "network_usage", and "disk_usage") """ @@ -636,19 +657,22 @@ class VMMonitor(ObjectMonitor): if config_tag == 'consolidation_fn': if variable_name == "cpu_usage": return 'average' elif variable_name == "fs_usage": return 'get_percent_fs_usage' + elif variable_name == "mem_usage": return 'get_percent_mem_usage' else: return 'sum' elif config_tag == 'rrd_regex': if variable_name == "cpu_usage": return "cpu[0-9]+" elif variable_name == "network_usage": return "vif_[0-9]+_[rt]x" elif variable_name == "disk_usage": return "vbd_(xvd|hd)[a-z]+_(read|write)" elif variable_name == "fs_usage": return "_$_DUMMY__" # match nothing + elif variable_name == "mem_usage": return "_$_DUMMY__" # match nothing else: raise 
XmlConfigException, "variable %s: no default rrd_regex - please specify one" % variable_name elif config_tag == 'alarm_trigger_period': return '60' # 1 minute elif config_tag == 'alarm_auto_inhibit_period': if variable_name == "fs_usage": return '604800' # 1 week else: return '3600' # 1 hour elif config_tag == 'alarm_trigger_level': - if variable_name == "fs_usage": return '0.9' # trigger when 90% full + if variable_name == "fs_usage": return '0.9' # trigger when 90% full + elif variable_name == "mem_usage": return '0.95' # tigger when mem demanded is close to phy_mem else: raise XmlConfigException, "variable %s: no default alarm_trigger_level - please specify one" % variable_name elif config_tag == 'alarm_trigger_sense': return 'high' # trigger if *above* elif config_tag == 'alarm_priority': return '3' # Service degradation level defined in PR-1455 From 2773824257a352c3b0c655a3ac3b9207b2941766 Mon Sep 17 00:00:00 2001 From: Zheng Li Date: Mon, 21 Jul 2014 18:21:37 +0100 Subject: [PATCH 3/5] CP-9093: decrease the inhibit period of fs_usage alert It seems that the inhibit period of the fs_usage alert is set too long (one week). The consequence is that the alerts are sent out much less frequently than we want, so they fail to convey the intended urgency. It also mutes the alarm for a whole week even if a different occurrence shows up later during this period (after the first occurrence has already been addressed). 
Signed-off-by: Zheng Li --- scripts/perfmon | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/perfmon b/scripts/perfmon index 32e5f028d2f..0ea9aa719d3 100644 --- a/scripts/perfmon +++ b/scripts/perfmon @@ -667,9 +667,7 @@ class VMMonitor(ObjectMonitor): elif variable_name == "mem_usage": return "_$_DUMMY__" # match nothing else: raise XmlConfigException, "variable %s: no default rrd_regex - please specify one" % variable_name elif config_tag == 'alarm_trigger_period': return '60' # 1 minute - elif config_tag == 'alarm_auto_inhibit_period': - if variable_name == "fs_usage": return '604800' # 1 week - else: return '3600' # 1 hour + elif config_tag == 'alarm_auto_inhibit_period': return '3600' # 1 hour elif config_tag == 'alarm_trigger_level': if variable_name == "fs_usage": return '0.9' # trigger when 90% full elif variable_name == "mem_usage": return '0.95' # tigger when mem demanded is close to phy_mem From d8c640abe3be3c2b96c257dcab740b0037da7c92 Mon Sep 17 00:00:00 2001 From: Zheng Li Date: Mon, 21 Jul 2014 18:46:42 +0100 Subject: [PATCH 4/5] CP-9091: Add mem_usage logic in mail-alarm Signed-off-by: Zheng Li --- scripts/mail-alarm | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/scripts/mail-alarm b/scripts/mail-alarm index 3e8d42b7bba..addeff4972c 100755 --- a/scripts/mail-alarm +++ b/scripts/mail-alarm @@ -221,6 +221,30 @@ class Dom0FSUsageAlarmETG(EmailTextGenerator): self.value * 100.0, self.alarm_trigger_level * 100.0) +class Dom0MemUsageAlarmETG(EmailTextGenerator): + def __init__(self, cls, obj_uuid, value, alarm_trigger_level): + if not alarm_trigger_level: alarm_trigger_level = 0.95 + if cls != 'VM': + raise Exception, "programmer error - this alarm should only be available for control domain VM" + self.params = get_VM_params(obj_uuid) + self.cls = cls + self.value = value + self.alarm_trigger_level = alarm_trigger_level + + def generate_subject(self): + pool_name = get_pool_name() + return '[%s] 
XenServer Alarm: Dom0 memory demand is high on "%s"' % (pool_name, self.params['name_label']) + + def generate_body(self): + return \ + 'The memory demand on "%s" is about %.1f%% of the physical memory of the domain. ' \ + 'Occasional performance degradation can be expected when memory swapping is forced to happen.\n' \ + 'This alarm is set to be triggered when the ratio of the memory demand to the physical memory is beyond %.1f%%.\n' \ + '\n' % \ + (self.params['name_label'], + self.value * 100.0, + self.alarm_trigger_level * 100.0) + class WlbConsultationFailure(EmailTextGenerator): def __init__(self, cls, obj_uuid): self.cls = cls @@ -336,6 +360,8 @@ class XapiMessage: etg = DiskUsageAlarmETG(self.cls, self.obj_uuid, value, alarm_trigger_period, alarm_trigger_level) elif name == 'fs_usage': etg = Dom0FSUsageAlarmETG(self.cls, self.obj_uuid, value, alarm_trigger_level) + elif name == 'mem_usage': + etg = Dom0MemUsageAlarmETG(self.cls, self.obj_uuid, value, alarm_trigger_level) else: etg = None elif self.name == 'HA_HOST_FAILED': From b10dbb8ecd1c78661421a87f28eae26552eac739 Mon Sep 17 00:00:00 2001 From: Zheng Li Date: Tue, 22 Jul 2014 01:54:50 +0100 Subject: [PATCH 5/5] CP-9093: fix a debug message error Signed-off-by: Zheng Li --- scripts/perfmon | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/perfmon b/scripts/perfmon index 0ea9aa719d3..e8f28e22348 100644 --- a/scripts/perfmon +++ b/scripts/perfmon @@ -745,7 +745,7 @@ class HOSTMonitor(ObjectMonitor): # possible to set up an alarm on each host that uses an SR by setting # appropriate configuration in the SR's other-config. if self.uuid not in sruuids_by_hostuuid: - print_debug("%s not in sruuids_by_hostuuid") + print_debug("%s not in sruuids_by_hostuuid" % self.uuid) self.secondary_variables.clear() self.secondary_xmlconfigs.clear() return