From 3cf61d35d7ad2dd15d653601c0717d32cfb2d685 Mon Sep 17 00:00:00 2001
From: Ryan Barry
Date: Sat, 21 Jan 2023 22:11:22 +0000
Subject: [PATCH 1/5] Break the Rules monolith

---
 .../prometheus_k8s/v0/prometheus_scrape.py    | 547 +++++++++++-------
 requirements.txt                              |   1 +
 tests/unit/test_charm.py                      |   1 +
 tests/unit/test_endpoint_provider.py          |   4 +-
 tests/unit/test_functions.py                  |  16 +-
 tests/unit/test_transform.py                  |   4 +-
 6 files changed, 368 insertions(+), 205 deletions(-)

diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
index cc582b95..ac7bbf7c 100644
--- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
+++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
@@ -280,34 +280,34 @@ def _on_scrape_targets_changed(self, event):
       summary: High request latency for {{ $labels.instance }}.
 ```
 
-The `MetricsEndpointProvider` will read all available alert rules and
-also inject "filtering labels" into the alert expressions. The
-filtering labels ensure that alert rules are localised to the metrics
+The `MetricsEndpointProvider` will read all available rules and
+also inject "filtering labels" into the expressions. The
+filtering labels ensure that rules are localised to the metrics
 provider charm's Juju topology (application, model and its UUID). Such
-a topology filter is essential to ensure that alert rules submitted by
-one provider charm generates alerts only for that same charm. When
-alert rules are embedded in a charm, and the charm is deployed as a
-Juju application, the alert rules from that application have their
+a topology filter is essential to ensure that rules submitted by
+one provider charm generate alerts and records only for that same charm. When
+rules are embedded in a charm, and the charm is deployed as a
+Juju application, the rules from that application have their
 expressions automatically updated to filter for metrics coming from
-the units of that application alone. This remove risk of spurious
+the units of that application alone. This removes the risk of spurious
 evaluation, e.g., when you have multiple deployments of the same charm
 monitored by the same Prometheus.
 
-Not all alerts one may want to specify can be embedded in a
-charm. Some alert rules will be specific to a user's use case. This is
-the case, for example, of alert rules that are based on business
+Not all rules one may want to specify can be embedded in a
+charm. Some rules will be specific to a user's use case. This is
+the case, for example, of rules that are based on business
 constraints, like expecting a certain amount of requests to a specific
 API every five minutes. Such alert rules can be specified via the
 [COS Config Charm](https://charmhub.io/cos-configuration-k8s), which
 allows importing alert rules and other settings like dashboards from a
 Git repository.
 
-Gathering alert rules and generating rule files within the Prometheus
-charm is easily done using the `alerts()` method of
-`MetricsEndpointConsumer`. Alerts generated by Prometheus will
+Gathering rules and generating rule files within the Prometheus
+charm is easily done using the `alerts()` and `recording_rules()` methods
+of `MetricsEndpointConsumer`. Rules generated by Prometheus will
 automatically include Juju topology labels in the alerts. These labels
-indicate the source of the alert. The following labels are
-automatically included with each alert
+indicate the source of the record or alert. The following labels are
+automatically included with each rule
 
 - `juju_model`
 - `juju_model_uuid`
@@ -340,6 +340,7 @@ def _on_scrape_targets_changed(self, event):
 import socket
 import subprocess
 import tempfile
+from abc import ABC, abstractmethod
 from collections import defaultdict
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
@@ -359,6 +360,7 @@ def _on_scrape_targets_changed(self, event):
     StoredState,
 )
 from ops.model import Relation
+from typing_extensions import Literal
 
 # The unique Charmhub library identifier, never change it
 LIBID = "bc84295fef5f4049878f07b131968ee2"
@@ -400,6 +402,9 @@ def _on_scrape_targets_changed(self, event):
 RELATION_INTERFACE_NAME = "prometheus_scrape"
 
 DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules"
+DEFAULT_RECORDING_RULES_RELATIVE_PATH = "./src/prometheus_recording_rules"
+
+_RuleType = Literal["alert", "record"]
 
 
 class PrometheusConfig:
@@ -662,8 +667,8 @@ def __init__(
         super().__init__(self.message)
 
 
-class InvalidAlertRuleEvent(EventBase):
-    """Event emitted when alert rule files are not parsable.
+class InvalidRuleEvent(EventBase):
+    """Event emitted when rule files are not parsable.
 
     Enables us to set a clear status on the provider.
     """
@@ -674,22 +679,23 @@ def __init__(self, handle, errors: str = "", valid: bool = False):
         self.valid = valid
 
     def snapshot(self) -> Dict:
-        """Save alert rule information."""
+        """Save rule information."""
         return {
             "valid": self.valid,
             "errors": self.errors,
         }
 
     def restore(self, snapshot):
-        """Restore alert rule information."""
+        """Restore rule information."""
         self.valid = snapshot["valid"]
         self.errors = snapshot["errors"]
 
 
 class MetricsEndpointProviderEvents(ObjectEvents):
-    """Events raised by :class:`InvalidAlertRuleEvent`s."""
+    """Events raised by :class:`InvalidRuleEvent`s."""
 
-    alert_rule_status_changed = EventSource(InvalidAlertRuleEvent)
+    alert_rule_status_changed = EventSource(InvalidRuleEvent)
+    recording_rule_status_changed = EventSource(InvalidRuleEvent)
 
 
 def _type_convert_stored(obj):
@@ -761,68 +767,67 @@ def _validate_relation_by_interface_and_direction(
         raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role))
 
 
-class InvalidAlertRulePathError(Exception):
-    """Raised if the alert rules folder cannot be found or is otherwise invalid."""
+class InvalidRulePathError(Exception):
+    """Raised if the rules folder cannot be found or is otherwise invalid."""
 
     def __init__(
         self,
-        alert_rules_absolute_path: Path,
+        rules_absolute_path: Path,
         message: str,
     ):
-        self.alert_rules_absolute_path = alert_rules_absolute_path
+        self.rules_absolute_path = rules_absolute_path
         self.message = message
 
         super().__init__(self.message)
 
 
-def _is_official_alert_rule_format(rules_dict: dict) -> bool:
-    """Are alert rules in the upstream format as supported by Prometheus.
+def _is_official_rule_format(rules_dict: dict) -> bool:
+    """Are rules in the upstream format as supported by Prometheus.
 
-    Alert rules in dictionary format are in "official" form if they
+    Rules in dictionary format are in "official" form if they
     contain a "groups" key, since this implies they contain a list of
-    alert rule groups.
+    rule groups.
 
     Args:
-        rules_dict: a set of alert rules in Python dictionary format
+        rules_dict: a set of rules in Python dictionary format
 
     Returns:
-        True if alert rules are in official Prometheus file format.
+        True if rules are in official Prometheus file format.
""" return "groups" in rules_dict -def _is_single_alert_rule_format(rules_dict: dict) -> bool: +def _is_single_rule_format(rules_dict: dict, rule_type: _RuleType) -> bool: """Are alert rules in single rule format. - The Prometheus charm library supports reading of alert rules in a - custom format that consists of a single alert rule per file. This - does not conform to the official Prometheus alert rule file format - which requires that each alert rules file consists of a list of - alert rule groups and each group consists of a list of alert - rules. + The Prometheus charm library supports reading of rules in a + custom format that consists of a single rule per file. This + does not conform to the official Prometheus rule file format + which requires that each rules file consists of a list of + rule groups and each group consists of a list of rules. - Alert rules in dictionary form are considered to be in single rule + Rules in dictionary form are considered to be in single rule format if in the least it contains two keys corresponding to the - alert rule name and alert expression. + rule name and expression. Returns: - True if alert rule is in single rule file format. + True if rule is in single rule file format. """ # one alert rule per file - return set(rules_dict) >= {"alert", "expr"} + return set(rules_dict) >= {rule_type, "expr"} -class AlertRules: - """Utility class for amalgamating prometheus alert rule files and injecting juju topology. +class Rules(ABC): + """Utility class for amalgamating prometheus alert files and injecting juju topology. - An `AlertRules` object supports aggregating alert rules from files and directories in both - official and single rule file formats using the `add_path()` method. All the alert rules + A `Rules` object supports aggregating rules from files and directories in both + official and single rule file formats using the `add_path()` method. All the rules read are annotated with Juju topology labels and amalgamated into a single data structure in the form of a Python dictionary using the `as_dict()` method. Such a dictionary can be easily dumped into JSON format and exchanged over relation data. The dictionary can also - be dumped into YAML format and written directly into an alert rules file that is read by - Prometheus. Note that multiple `AlertRules` objects must not be written into the same file, - since Prometheus allows only a single list of alert rule groups per alert rules file. + be dumped into YAML format and written directly into an rules file that is read by + Prometheus. Note that multiple `Rules` objects must not be written into the same file, + since Prometheus allows only a single list of rule groups per rules file. The official Prometheus format is a YAML file conforming to the Prometheus documentation (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). @@ -831,13 +836,14 @@ class AlertRules: """ # This class uses the following terminology for the various parts of a rule file: - # - alert rules file: the entire groups[] yaml, including the "groups:" key. - # - alert groups (plural): the list of groups[] (a list, i.e. no "groups:" key) - it is a list + # - rules file: the entire groups[] yaml, including the "groups:" key. + # - groups (plural): the list of groups[] (a list, i.e. no "groups:" key) - it is a list # of dictionaries that have the "name" and "rules" keys. - # - alert group (singular): a single dictionary that has the "name" and "rules" keys. 
- # - alert rules (plural): all the alerts in a given alert group - a list of dictionaries with - # the "alert" and "expr" keys. - # - alert rule (singular): a single dictionary that has the "alert" and "expr" keys. + # - group (singular): a single dictionary that has the "name" and "rules" keys. + # - rules (plural): all the rules in a given group - a list of dictionaries with + # the `self.rule_type` (either "alert" or "record") and "expr" keys. + # - rule (singular): a single dictionary that has the `self.rule_type` (either "alert" or + # "record") and "expr" keys. def __init__(self, topology: Optional[JujuTopology] = None): """Build and alert rule object. @@ -847,7 +853,13 @@ def __init__(self, topology: Optional[JujuTopology] = None): """ self.topology = topology self.tool = CosTool(None) - self.alert_groups = [] # type: List[dict] + self.groups = [] # type: List[dict] + + @property + @abstractmethod + def rule_type(self) -> _RuleType: + """Return the rule type being used for interpolation in messages.""" + pass def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: """Read a rules file from path, injecting juju topology. @@ -866,7 +878,7 @@ def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: rule_file = yaml.safe_load(rf) except Exception as e: - logger.error("Failed to read alert rules from %s: %s", file_path.name, e) + logger.error("Failed to read rules from %s: %s", file_path.name, e) return [] if not rule_file: @@ -875,40 +887,40 @@ def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: if not isinstance(rule_file, dict): logger.error("Invalid rules file (must be a dict): %s", file_path.name) return [] - if _is_official_alert_rule_format(rule_file): - alert_groups = rule_file["groups"] - elif _is_single_alert_rule_format(rule_file): - # convert to list of alert groups + if _is_official_rule_format(rule_file): + groups = rule_file["groups"] + elif _is_single_rule_format(rule_file, self.rule_type): + # convert to list of groups # group name is made up from the file name - alert_groups = [{"name": file_path.stem, "rules": [rule_file]}] + groups = [{"name": file_path.stem, "rules": [rule_file]}] else: # invalid/unsupported logger.error("Invalid rules file: %s", file_path.name) return [] # update rules with additional metadata - for alert_group in alert_groups: + for group in groups: # update group name with topology and sub-path - alert_group["name"] = self._group_name( + group["name"] = self._group_name( str(root_path), str(file_path), - alert_group["name"], + group["name"], ) # add "juju_" topology labels - for alert_rule in alert_group["rules"]: - if "labels" not in alert_rule: - alert_rule["labels"] = {} + for rule in group["rules"]: + if "labels" not in rule: + rule["labels"] = {} if self.topology: - alert_rule["labels"].update(self.topology.label_matcher_dict) - # insert juju topology filters into a prometheus alert rule - alert_rule["expr"] = self.tool.inject_label_matchers( - re.sub(r"%%juju_topology%%,?", "", alert_rule["expr"]), + rule["labels"].update(self.topology.label_matcher_dict) + # insert juju topology filters into a prometheus rule + rule["expr"] = self.tool.inject_label_matchers( + re.sub(r"%%juju_topology%%,?", "", rule["expr"]), self.topology.label_matcher_dict, ) - return alert_groups + return groups def _group_name(self, root_path: str, file_path: str, group_name: str) -> str: """Generate group name from path and topology. 
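For reviewers, a sketch of the single-rule expansion that `_from_file` in the hunk above implements; the file name `cpu.rule`, the metric, and the topology label values are illustrative assumptions, not part of this patch:

```
# Hypothetical input file: src/prometheus_recording_rules/cpu.rule
#
#     record: instance:cpu:rate5m
#     expr: rate(cpu_seconds_total{%%juju_topology%%}[5m])
#
# The parsed dict has the rule-type key ("record") plus "expr", so
# _is_single_rule_format() accepts it; _from_file() then wraps it into a
# group named after the file stem and injects the topology labels:
wrapped_group = {
    "name": "cpu",  # later prefixed with topology and sub-path by _group_name()
    "rules": [
        {
            "record": "instance:cpu:rate5m",
            "expr": 'rate(cpu_seconds_total{juju_model="cos",juju_application="app"}[5m])',
            "labels": {"juju_model": "cos", "juju_application": "app"},
        }
    ],
}
```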
@@ -935,9 +947,8 @@ def _group_name(self, root_path: str, file_path: str, group_name: str) -> str:
         # filter to remove empty strings
         return "_".join(filter(None, group_name_parts))
 
-    @classmethod
     def _multi_suffix_glob(
-        cls, dir_path: Path, suffixes: List[str], recursive: bool = True
+        self, dir_path: Path, suffixes: List[str], recursive: bool = True
     ) -> list:
         """Helper function for getting all files in a directory that have a matching suffix.
 
@@ -960,27 +971,27 @@ def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]:
         By default, only the top directory is scanned; for nested scanning, pass `recursive=True`.
 
         Args:
-            dir_path: directory containing *.rule files (alert rules without groups).
+            dir_path: directory containing *.rule files (rules without groups).
             recursive: flag indicating whether to scan for rule files recursively.
 
         Returns:
-            a list of dictionaries representing prometheus alert rule groups, each dictionary
-            representing an alert group (structure determined by `yaml.safe_load`).
+            a list of dictionaries representing prometheus rule groups, each dictionary
+            representing a group (structure determined by `yaml.safe_load`).
         """
-        alert_groups = []  # type: List[dict]
+        groups = []  # type: List[dict]
 
-        # Gather all alerts into a list of groups
+        # Gather all rules into a list of groups
         for file_path in self._multi_suffix_glob(
             dir_path, [".rule", ".rules", ".yml", ".yaml"], recursive
         ):
-            alert_groups_from_file = self._from_file(dir_path, file_path)
-            if alert_groups_from_file:
-                logger.debug("Reading alert rule from %s", file_path)
-                alert_groups.extend(alert_groups_from_file)
+            groups_from_file = self._from_file(dir_path, file_path)
+            if groups_from_file:
+                logger.debug("Reading %s rule from %s", self.rule_type, file_path)
+                groups.extend(groups_from_file)
 
-        return alert_groups
+        return groups
 
-    def add_path(self, path: str, *, recursive: bool = False) -> None:
+    def add_path(self, path: Union[str, Path], *, recursive: bool = False) -> None:
         """Add rules from a dir path.
 
         All rules from files are aggregated into a data structure representing a single rule file.
@@ -993,23 +1004,57 @@ def add_path(self, path: str, *, recursive: bool = False) -> None:
         Returns:
             True if path was added else False.
         """
-        path = Path(path)  # type: Path
+        path = Path(path) if isinstance(path, str) else path  # type: Path
         if path.is_dir():
-            self.alert_groups.extend(self._from_dir(path, recursive))
+            self.groups.extend(self._from_dir(path, recursive))
         elif path.is_file():
-            self.alert_groups.extend(self._from_file(path.parent, path))
+            self.groups.extend(self._from_file(path.parent, path))
         else:
-            logger.debug("Alert rules path does not exist: %s", path)
+            logger.debug("%s rules path does not exist: %s", self.rule_type.capitalize(), path)
 
     def as_dict(self) -> dict:
-        """Return standard alert rules file in dict representation.
+        """Return standard rules file in dict representation.
 
         Returns:
-            a dictionary containing a single list of alert rule groups.
-            The list of alert rule groups is provided as value of the
+            a dictionary containing a single list of rule groups.
+            The list of rule groups is provided as value of the
             "groups" dictionary key.
         """
-        return {"groups": self.alert_groups} if self.alert_groups else {}
+        return {"groups": self.groups} if self.groups else {}
+
+
+class AlertRules(Rules):
+    """Utility class for amalgamating prometheus alert rule files and injecting juju topology.
+ + The official Prometheus format is a YAML file conforming to the Prometheus documentation + (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). + The custom single rule format is a subsection of the official YAML, having a single alert + rule, effectively "one alert per file". + """ + + _rule_type = "alert" # type: _RuleType + + @property + def rule_type(self) -> _RuleType: + """Return the rule type being used for interpolation in messages.""" + return self._rule_type + + +class RecordingRules(Rules): + """Utility class for amalgamating prometheus recording files and injecting juju topology. + + The official Prometheus format is a YAML file conforming to the Prometheus documentation + (https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/). + The custom single rule format is a subsection of the official YAML, having a single recording + rule, effectively "one record per file". + """ + + _rule_type = "record" # type: _RuleType + + @property + def rule_type(self) -> _RuleType: + """Return the rule type being used for interpolation in messages.""" + return self._rule_type class TargetsChangedEvent(EventBase): @@ -1121,10 +1166,84 @@ def jobs(self) -> list: if static_scrape_jobs: scrape_jobs.extend(static_scrape_jobs) - scrape_jobs = _dedupe_job_names(scrape_jobs) + scrape_jobs = self._dedupe_job_names(scrape_jobs) return scrape_jobs + def _dedupe_job_names(self, jobs: List[dict]): + """Deduplicate a list of dicts by appending a hash to the value of the 'job_name' key. + + Additionally, fully de-duplicate any identical jobs. + + Args: + jobs: A list of prometheus scrape jobs + """ + jobs_copy = copy.deepcopy(jobs) + + # Convert to a dict with job names as keys + # I think this line is O(n^2) but it should be okay given the list sizes + jobs_dict = { + job["job_name"]: list(filter(lambda x: x["job_name"] == job["job_name"], jobs_copy)) + for job in jobs_copy + } + + # If multiple jobs have the same name, convert the name to "name_" + for key in jobs_dict: + if len(jobs_dict[key]) > 1: + for job in jobs_dict[key]: + job_json = json.dumps(job) + hashed = hashlib.sha256(job_json.encode()).hexdigest() + job["job_name"] = "{}_{}".format(job["job_name"], hashed) + new_jobs = [] + for key in jobs_dict: + new_jobs.extend([i for i in jobs_dict[key]]) + + # Deduplicate jobs which are equal + # Again this in O(n^2) but it should be okay + deduped_jobs = [] + seen = [] + for job in new_jobs: + job_json = json.dumps(job) + hashed = hashlib.sha256(job_json.encode()).hexdigest() + if hashed in seen: + continue + seen.append(hashed) + deduped_jobs.append(job) + + return deduped_jobs + + def recording_rules(self) -> dict: + """Fetch recording rules for all relations. + + A Prometheus recording rules file consists of a list of "groups". Each + group consists of a list of recording rules (`rules`) that are sequentially + executed. This method returns all the recording rules provided by each + related metrics provider charm. These rules may be used to generate a + separate recording rules file for each relation since the returned list + of recording groups are indexed by that relations Juju topology identifier. + The Juju topology identifier string includes substrings that identify + recording rule related metadata such as the Juju model, model UUID and the + application name from where the recording rule originates. 
Since this + topology identifier is globally unique, it may be used for instance as + the name for the file into which the list of recording rule groups are + written. For each relation, the structure of data returned is a dictionary + representation of a standard prometheus rules file: + + {"groups": [{"name": ...}, ...]} + + per official prometheus documentation + https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/ + + The value of the `groups` key is such that it may be used to generate + a Prometheus recording rules file directly using `yaml.dump` but the + `groups` key itself must be included as this is required by Prometheus. + + Returns: + A dictionary mapping the Juju topology identifier of the source charm to + its list of recording rule groups. + """ + return self._get_rules_by_type("recording") + def alerts(self) -> dict: """Fetch alerts for all relations. @@ -1151,34 +1270,35 @@ def alerts(self) -> dict: a Prometheus alert rules file directly using `yaml.dump` but the `groups` key itself must be included as this is required by Prometheus. - For example the list of alert rule groups returned by this method may - be written into files consumed by Prometheus as follows - - ``` - for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts().items(): - filename = "juju_" + topology_identifier + ".rules" - path = os.path.join(PROMETHEUS_RULES_DIR, filename) - rules = yaml.safe_dump(alert_rule_groups) - container.push(path, rules, make_dirs=True) - ``` - Returns: A dictionary mapping the Juju topology identifier of the source charm to its list of alert rule groups. """ - alerts = {} # type: Dict[str, dict] # mapping b/w juju identifiers and alert rule files + return self._get_rules_by_type("alert") + + def _get_rules_by_type(self, rule_type: str) -> dict: + """Unified logic for fetching rules from different databags. + + The logic for alert|recording rules is shared here, since other than the + databag, it is completely identical. + + Returns: + A dictionary representing the rules. 
+ """ + rule_mapping = {"alert": "alert_rules", "recording": "recording_rules"} + all_rules = {} for relation in self._charm.model.relations[self._relation_name]: if not relation.units or not relation.app: continue - alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}")) - if not alert_rules: + rules = json.loads(relation.data[relation.app].get(rule_mapping[rule_type], "{}")) + if not rules: continue try: scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) identifier = JujuTopology.from_dict(scrape_metadata).identifier - alerts[identifier] = self._tool.apply_label_matchers(alert_rules) + rules[identifier] = self._tool.apply_label_matchers(rules) except KeyError as e: logger.debug( @@ -1186,39 +1306,41 @@ def alerts(self) -> dict: relation.id, e, ) - identifier = self._get_identifier_by_alert_rules(alert_rules) + identifier = self._get_identifier_by_rule_content(rules, rule_type) if not identifier: logger.error( - "Alert rules were found but no usable group or identifier was present" + "%s rules were found but no usable group or identifier was present", + rule_type.capitalize(), ) continue - alerts[identifier] = alert_rules + all_rules[identifier] = rules - _, errmsg = self._tool.validate_alert_rules(alert_rules) + _, errmsg = self._tool.validate_rules(rules) if errmsg: - if alerts[identifier]: - del alerts[identifier] + if rules[identifier]: + del rules[identifier] relation.data[self._charm.app]["event"] = json.dumps({"errors": errmsg}) continue - return alerts + return all_rules - def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]: - """Determine an appropriate dict key for alert rules. + def _get_identifier_by_rule_content(self, rules: dict, rule_type: str) -> Union[str, None]: + """Determine an appropriate dict key for rules. - The key is used as the filename when writing alerts to disk, so the structure + The key is used as the filename when writing rules to disk, so the structure and uniqueness is important. Args: - rules: a dict of alert rules + rules: a dict of rules + rule_type: a string representing the rule type for interpolation in messages """ if "groups" not in rules: - logger.debug("No alert groups were found in relation data") + logger.debug("No %s groups were found in relation data", rule_type) return None - # Construct an ID based on what's in the alert rules if they have labels + # Construct an ID based on what's in the rules if they have labels for group in rules["groups"]: try: labels = group["rules"][0]["labels"] @@ -1229,12 +1351,16 @@ def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]: ) return identifier except KeyError: - logger.debug("Alert rules were found but no usable labels were present") + logger.debug( + "%s rules were found but no usable labels were present", rule_type.capitalize() + ) continue logger.warning( - "No labeled alert rules were found, and no 'scrape_metadata' " - "was available. Using the alert group name as filename." + "No labeled %s rules were found, and no 'scrape_metadata' " + "was available. Using the %s group name as filename.", + rule_type, + rule_type, ) try: for group in rules["groups"]: @@ -1324,49 +1450,6 @@ def _target_parts(self, target) -> list: return parts -def _dedupe_job_names(jobs: List[dict]): - """Deduplicate a list of dicts by appending a hash to the value of the 'job_name' key. - - Additionally, fully de-duplicate any identical jobs. 
- - Args: - jobs: A list of prometheus scrape jobs - """ - jobs_copy = copy.deepcopy(jobs) - - # Convert to a dict with job names as keys - # I think this line is O(n^2) but it should be okay given the list sizes - jobs_dict = { - job["job_name"]: list(filter(lambda x: x["job_name"] == job["job_name"], jobs_copy)) - for job in jobs_copy - } - - # If multiple jobs have the same name, convert the name to "name_" - for key in jobs_dict: - if len(jobs_dict[key]) > 1: - for job in jobs_dict[key]: - job_json = json.dumps(job) - hashed = hashlib.sha256(job_json.encode()).hexdigest() - job["job_name"] = "{}_{}".format(job["job_name"], hashed) - new_jobs = [] - for key in jobs_dict: - new_jobs.extend([i for i in jobs_dict[key]]) - - # Deduplicate jobs which are equal - # Again this in O(n^2) but it should be okay - deduped_jobs = [] - seen = [] - for job in new_jobs: - job_json = json.dumps(job) - hashed = hashlib.sha256(job_json.encode()).hexdigest() - if hashed in seen: - continue - seen.append(hashed) - deduped_jobs.append(job) - - return deduped_jobs - - def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str: """Resolve the provided path items against the directory of the main file. @@ -1388,14 +1471,14 @@ def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> st # https://github.com/canonical/operator/issues/643 charm_dir = Path(os.getcwd()) - alerts_dir_path = charm_dir.absolute().joinpath(*path_elements) + rules_dir_path = charm_dir.absolute().joinpath(*path_elements) - if not alerts_dir_path.exists(): - raise InvalidAlertRulePathError(alerts_dir_path, "directory does not exist") - if not alerts_dir_path.is_dir(): - raise InvalidAlertRulePathError(alerts_dir_path, "is not a directory") + if not rules_dir_path.exists(): + raise InvalidRulePathError(rules_dir_path, "directory does not exist") + if not rules_dir_path.is_dir(): + raise InvalidRulePathError(rules_dir_path, "is not a directory") - return str(alerts_dir_path) + return str(rules_dir_path) class MetricsEndpointProvider(Object): @@ -1409,6 +1492,7 @@ def __init__( relation_name: str = DEFAULT_RELATION_NAME, jobs=None, alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, + recording_rules_path: str = DEFAULT_RECORDING_RULES_RELATIVE_PATH, refresh_event: Optional[Union[BoundEvent, List[BoundEvent]]] = None, external_url: str = "", lookaside_jobs_callable: Optional[Callable] = None, @@ -1486,13 +1570,22 @@ def __init__( expr: up{juju_model=, juju_model_uuid=, juju_application=} < 1 for: 0m - An attempt will be made to validate alert rules prior to loading them into Prometheus. + In addition, recording rules can be specified. By default, this library will look + into the `/prometheus_recording_rules`, which in a standard charm + layouts resolves to `src/prometheus_recording_rules`. Each recording rule goes into a + separate `*.rule` file. If the syntax of a rule is invalid, + the `MetricsEndpointProvider` logs an error and does not load the particular + rule. + + + An attempt will be made to validate rules prior to loading them into Prometheus. If they are invalid, an event will be emitted from this object which charms can respond to in order to set a meaningful status for administrators. 
- This can be observed via `consumer.on.alert_rule_status_changed` which contains: + This can be observed via `consumer.on.recording_rule_status_changed`, or + `consumer.on.alert_rule_status_changed`, which contains: - The error(s) encountered when validating as `errors` - - A `valid` attribute, which can be used to reset the state of charms if alert rules + - A `valid` attribute, which can be used to reset the state of charms if rules are updated via another mechanism (e.g. `cos-config`) and refreshed. Args: @@ -1514,6 +1607,10 @@ def __init__( files. Defaults to "./prometheus_alert_rules", resolved relative to the directory hosting the charm entry file. The alert rules are automatically updated on charm upgrade. + recording_rules_path: an optional path for the location of recording rules + files. Defaults to "./prometheus_recording_rules", + resolved relative to the directory hosting the charm entry file. + The recording rules are automatically updated on charm upgrade. refresh_event: an optional bound event or list of bound events which will be observed to re-set scrape job data (IP address and others) external_url: an optional argument that represents an external url that @@ -1540,10 +1637,19 @@ def __init__( try: alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path) - except InvalidAlertRulePathError as e: + except InvalidRulePathError as e: logger.debug( "Invalid Prometheus alert rules folder at %s: %s", - e.alert_rules_absolute_path, + e.rules_absolute_path, + e.message, + ) + + try: + recording_rules_path = _resolve_dir_against_charm_path(charm, recording_rules_path) + except InvalidRulePathError as e: + logger.debug( + "Invalid Prometheus recording rules folder at %s: %s", + e.rules_absolute_path, e.message, ) @@ -1552,6 +1658,7 @@ def __init__( self._charm = charm self._alert_rules_path = alert_rules_path + self._recording_rules_path = recording_rules_path self._relation_name = relation_name # sanitize job configurations to the supported subset of parameters jobs = [] if jobs is None else jobs @@ -1606,7 +1713,7 @@ def __init__( self.set_scrape_job_spec() def _on_relation_changed(self, event): - """Check for alert rule messages in the relation data before moving on.""" + """Check for rule messages in the relation data before moving on.""" if self._charm.unit.is_leader(): ev = json.loads(event.relation.data[event.app].get("event", "{}")) @@ -1642,6 +1749,10 @@ def set_scrape_job_spec(self, _=None): alert_rules.add_path(self._alert_rules_path, recursive=True) alert_rules_as_dict = alert_rules.as_dict() + recording_rules = RecordingRules(topology=self.topology) + recording_rules.add_path(self._recording_rules_path, recursive=True) + recording_rules_as_dict = recording_rules.as_dict() + for relation in self._charm.model.relations[self._relation_name]: relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata) relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs) @@ -1653,6 +1764,11 @@ def set_scrape_job_spec(self, _=None): # that is written to the filesystem. relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) + if recording_rules_as_dict: + relation.data[self._charm.app]["recording_rules"] = json.dumps( + recording_rules_as_dict + ) + def _set_unit_ip(self, _=None): """Set unit host address. @@ -1728,19 +1844,18 @@ def _scrape_metadata(self) -> dict: class PrometheusRulesProvider(Object): """Forward rules to Prometheus. - This object may be used to forward rules to Prometheus. 
At present it only supports - forwarding alert rules. This is unlike :class:`MetricsEndpointProvider`, which - is used for forwarding both scrape targets and associated alert rules. This object - is typically used when there is a desire to forward rules that apply globally (across - all deployed charms and units) rather than to a single charm. All rule files are - forwarded using the same 'prometheus_scrape' interface that is also used by + This object may be used to forward rules to Prometheus. This is unlike + :class:`MetricsEndpointProvider`, which is used for forwarding both scrape targets and + associated rules. This object is typically used when there is a desire to forward rules + that apply globally (across all deployed charms and units) rather than to a single charm. + All rule files are forwarded using the same 'prometheus_scrape' interface that is also used by `MetricsEndpointProvider`. Args: charm: A charm instance that `provides` a relation with the `prometheus_scrape` interface. relation_name: Name of the relation in `metadata.yaml` that has the `prometheus_scrape` interface. - dir_path: Root directory for the collection of rule files. + alert_dir_path: Root directory for the collection of rule files. recursive: Whether to scan for rule files recursively. """ @@ -1748,23 +1863,40 @@ def __init__( self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME, - dir_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, + alert_dir_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, + recording_dir_path: str = DEFAULT_RECORDING_RULES_RELATIVE_PATH, recursive=True, + **kwargs, ): super().__init__(charm, relation_name) self._charm = charm self._relation_name = relation_name self._recursive = recursive + # FIXME: get rid of this once cos-config stops using it, or stops using + # explicitly named arguments. It's the only consumer. + if "dir_path" in kwargs: + alert_dir_path = kwargs.get("dir_path", DEFAULT_ALERT_RULES_RELATIVE_PATH) + try: - dir_path = _resolve_dir_against_charm_path(charm, dir_path) - except InvalidAlertRulePathError as e: + alert_dir_path = _resolve_dir_against_charm_path(charm, alert_dir_path) + except InvalidRulePathError as e: logger.debug( "Invalid Prometheus alert rules folder at %s: %s", - e.alert_rules_absolute_path, + e.rules_absolute_path, + e.message, + ) + self.alert_dir_path = alert_dir_path + + try: + recording_dir_path = _resolve_dir_against_charm_path(charm, recording_dir_path) + except InvalidRulePathError as e: + logger.debug( + "Invalid Prometheus recording rules folder at %s: %s", + e.rules_absolute_path, e.message, ) - self.dir_path = dir_path + self.recording_dir_path = recording_dir_path events = self._charm.on[self._relation_name] event_sources = [ @@ -1779,6 +1911,12 @@ def __init__( def _reinitialize_alert_rules(self): """Reloads alert rules and updates all relations.""" + # FIXME: cos-config uses this, and should call `reinitialize_rules` instead. We don't + # need to keep everything private. 
self.reinitialize_rules()
+
+    def reinitialize_rules(self):
+        """Reloads rules and updates all relations."""
         self._update_relation_data(None)
 
     def _update_relation_data(self, _):
@@ -1787,14 +1925,25 @@ def _update_relation_data(self, _):
             return
 
         alert_rules = AlertRules()
-        alert_rules.add_path(self.dir_path, recursive=self._recursive)
+        alert_rules.add_path(self.alert_dir_path, recursive=self._recursive)
         alert_rules_as_dict = alert_rules.as_dict()
 
+        recording_rules = RecordingRules()
+        recording_rules.add_path(self.recording_dir_path, recursive=self._recursive)
+        recording_rules_as_dict = recording_rules.as_dict()
+
         logger.info("Updating relation data with rule files from disk")
         for relation in self._charm.model.relations[self._relation_name]:
+            # Update relation data with the string representation of the rule file.
+            # The consumer side of the relation uses this information to name the rules
+            # file that is written to the filesystem. Keys are sorted to prevent
+            # unnecessary relation_changed events.
             relation.data[self._charm.app]["alert_rules"] = json.dumps(
-                alert_rules_as_dict,
-                sort_keys=True,  # sort, to prevent unnecessary relation_changed events
+                alert_rules_as_dict, sort_keys=True
+            )
+
+            relation.data[self._charm.app]["recording_rules"] = json.dumps(
+                recording_rules_as_dict, sort_keys=True
             )
 
 
@@ -2362,8 +2511,8 @@ def apply_label_matchers(self, rules) -> dict:
                 rule["expr"] = self.inject_label_matchers(rule["expr"], topology)
         return rules
 
-    def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]:
-        """Will validate correctness of alert rules, returning a boolean and any errors."""
+    def validate_rules(self, rules: dict) -> Tuple[bool, str]:
+        """Will validate correctness of rules, returning a boolean and any errors."""
         if not self.path:
             logger.debug("`cos-tool` unavailable. 
Not validating alert correctness.") return True, "" diff --git a/requirements.txt b/requirements.txt index 0181c2c9..5e771164 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,6 @@ jsonschema ops pyaml requests +typing_extensions lightkube >= 0.11 lightkube-models >= 1.22.0.4 diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py index aaa6e55c..71bb225f 100644 --- a/tests/unit/test_charm.py +++ b/tests/unit/test_charm.py @@ -425,6 +425,7 @@ def setUp(self, *unused): self.rel_id = self.harness.add_relation(RELATION_NAME, "remote-app") self.harness.add_relation_unit(self.rel_id, "remote-app/0") + self.harness.set_leader(True) @patch("charm.KubernetesServicePatch", lambda x, y: None) @k8s_resource_multipatch diff --git a/tests/unit/test_endpoint_provider.py b/tests/unit/test_endpoint_provider.py index 7aeba4cc..4dbea9c2 100644 --- a/tests/unit/test_endpoint_provider.py +++ b/tests/unit/test_endpoint_provider.py @@ -439,7 +439,7 @@ def test_a_bad_alert_rules_logs_an_error(self): messages = sorted(logger.output) self.assertEqual(len(messages), 1) - self.assertIn("Failed to read alert rules from bad_yaml.rule", messages[0]) + self.assertIn("Failed to read rules from bad_yaml.rule", messages[0]) def sorted_matchers(matchers) -> str: @@ -841,6 +841,6 @@ def test_alert_rules(self): baked_in_alert_rules_as_they_appear_in_reldata = json.loads(data["alert_rules"]) tool = self.harness.charm.tool - valid, errs = tool.validate_alert_rules(baked_in_alert_rules_as_they_appear_in_reldata) + valid, errs = tool.validate_rules(baked_in_alert_rules_as_they_appear_in_reldata) self.assertEqual(valid, True) self.assertEqual(errs, "") diff --git a/tests/unit/test_functions.py b/tests/unit/test_functions.py index fb69af9f..84595844 100644 --- a/tests/unit/test_functions.py +++ b/tests/unit/test_functions.py @@ -3,12 +3,22 @@ import copy import unittest +from unittest import mock import deepdiff -from charms.prometheus_k8s.v0.prometheus_scrape import _dedupe_job_names + +# FIXME: We should **NOT** move methods to the top level to make writing tests +# more convenient. Mock it with `create_autospec`. 
See `setUp()` +from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer class TestFunctions(unittest.TestCase): + def setUp(self): + self.consumer = mock.create_autospec(MetricsEndpointConsumer) + self.consumer._dedupe_job_names = lambda x: MetricsEndpointConsumer._dedupe_job_names( + self.consumer, x + ) + def test_dedupe_job_names(self): jobs = [ { @@ -60,6 +70,8 @@ def test_dedupe_job_names(self): "static_configs": [{"targets": ["localhost:9090"]}], }, ] - self.assertTrue(len(deepdiff.DeepDiff(_dedupe_job_names(jobs), expected)) == 0) + self.assertTrue( + len(deepdiff.DeepDiff(self.consumer._dedupe_job_names(jobs), expected)) == 0 + ) # Make sure the function does not modify its argument self.assertEqual(jobs, jobs_original) diff --git a/tests/unit/test_transform.py b/tests/unit/test_transform.py index 87a12157..108bd2f8 100644 --- a/tests/unit/test_transform.py +++ b/tests/unit/test_transform.py @@ -155,7 +155,7 @@ def setUp(self): @unittest.mock.patch("platform.machine", lambda: "x86_64") def test_returns_errors_on_bad_rule_file(self): tool = self.harness.charm.tool - valid, errs = tool.validate_alert_rules( + valid, errs = tool.validate_rules( { "groups": [ { @@ -171,7 +171,7 @@ def test_returns_errors_on_bad_rule_file(self): @unittest.mock.patch("platform.machine", lambda: "x86_64") def test_successfully_validates_good_alert_rules(self): tool = self.harness.charm.tool - valid, errs = tool.validate_alert_rules( + valid, errs = tool.validate_rules( { "groups": [ { From 499ae22b6083694d069e0f268a7042750a0a4740 Mon Sep 17 00:00:00 2001 From: Ryan Barry Date: Sat, 21 Jan 2023 22:53:37 +0000 Subject: [PATCH 2/5] Get `prometheus_remote_write`, too. And move the `static-lib` check to 3.8 now that ops is --- .../v0/prometheus_remote_write.py | 364 ++++++++++++------ .../prometheus_k8s/v0/prometheus_scrape.py | 15 +- tox.ini | 2 +- 3 files changed, 251 insertions(+), 130 deletions(-) diff --git a/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py b/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py index 07a379f3..691d7775 100644 --- a/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py +++ b/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py @@ -21,6 +21,7 @@ import socket import subprocess import tempfile +from abc import ABC, abstractmethod from pathlib import Path from typing import Dict, List, Optional, Tuple, Union @@ -36,6 +37,7 @@ ) from ops.framework import EventBase, EventSource, Object, ObjectEvents from ops.model import Relation +from typing_extensions import Literal # The unique Charmhub library identifier, never change it LIBID = "f783823fa75f4b7880eb70f2077ec259" @@ -45,7 +47,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 10 +LIBPATCH = 11 logger = logging.getLogger(__name__) @@ -56,6 +58,9 @@ RELATION_INTERFACE_NAME = "prometheus_remote_write" DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" +DEFAULT_RECORDING_RULES_RELATIVE_PATH = "./src/prometheus_recording_rules" + +_RuleType = Literal["alert", "record"] class RelationNotFoundError(Exception): @@ -108,8 +113,8 @@ def __init__( super().__init__(self.message) -class InvalidAlertRuleEvent(EventBase): - """Event emitted when alert rule files are not parsable. +class InvalidRuleEvent(EventBase): + """Event emitted when rule files are not parsable. Enables us to set a clear status on the provider. 
""" @@ -120,66 +125,66 @@ def __init__(self, handle, errors: str = "", valid: bool = False): self.valid = valid def snapshot(self) -> Dict: - """Save alert rule information.""" + """Save rule information.""" return { "valid": self.valid, "errors": self.errors, } def restore(self, snapshot): - """Restore alert rule information.""" + """Restore rule information.""" self.valid = snapshot["valid"] self.errors = snapshot["errors"] -def _is_official_alert_rule_format(rules_dict: dict) -> bool: - """Are alert rules in the upstream format as supported by Prometheus. +def _is_official_rule_format(rules_dict: dict) -> bool: + """Are rules in the upstream format as supported by Prometheus. - Alert rules in dictionary format are in "official" form if they + Rules in dictionary format are in "official" form if they contain a "groups" key, since this implies they contain a list of - alert rule groups. + rule groups. Args: - rules_dict: a set of alert rules in Python dictionary format + rules_dict: a set of rules in Python dictionary format Returns: - True if alert rules are in official Prometheus file format. + True if rules are in official Prometheus file format. """ return "groups" in rules_dict -def _is_single_alert_rule_format(rules_dict: dict) -> bool: - """Are alert rules in single rule format. +def _is_single_rule_format(rules_dict: dict, rule_type: _RuleType) -> bool: + """Are rules in single rule format. - The Prometheus charm library supports reading of alert rules in a - custom format that consists of a single alert rule per file. This - does not conform to the official Prometheus alert rule file format - which requires that each alert rules file consists of a list of - alert rule groups and each group consists of a list of alert - rules. + The Prometheus charm library supports reading of rules in a + custom format that consists of a single rule per file. This + does not conform to the official Prometheus rule file format + which requires that each rules file consists of a list of + rule groups and each group consists of a list of rules. - Alert rules in dictionary form are considered to be in single rule + Rules in dictionary form are considered to be in single rule format if in the least it contains two keys corresponding to the - alert rule name and alert expression. + rule name and expression. Returns: - True if alert rule is in single rule file format. + True if rule is in single rule file format. """ - # one alert rule per file - return set(rules_dict) >= {"alert", "expr"} + # one rule per file + return set(rules_dict) >= {rule_type, "expr"} -class AlertRules: - """Utility class for amalgamating prometheus alert rule files and injecting juju topology. +class Rules(ABC): + """Utility class for amalgamating prometheus alert files and injecting juju topology. - An `AlertRules` object supports aggregating alert rules from files and directories in both - official and single rule file formats using the `add_path()` method. All the alert rules + A `Rules` object supports aggregating rules from files and directories in both + official and single rule file formats using the `add_path()` method. All the rules read are annotated with Juju topology labels and amalgamated into a single data structure in the form of a Python dictionary using the `as_dict()` method. Such a dictionary can be easily dumped into JSON format and exchanged over relation data. The dictionary can also - be dumped into YAML format and written directly into an alert rules file that is read by - Prometheus. 
Note that multiple `AlertRules` objects must not be written into the same file, - since Prometheus allows only a single list of alert rule groups per alert rules file. + be dumped into YAML format and written directly into an rules file that is read by + Prometheus. Note that multiple `Rules` objects must not be written into the same file, + since Prometheus allows only a single list of rule groups per rules file. + The official Prometheus format is a YAML file conforming to the Prometheus documentation (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). The custom single rule format is a subsection of the official YAML, having a single alert @@ -187,23 +192,30 @@ class AlertRules: """ # This class uses the following terminology for the various parts of a rule file: - # - alert rules file: the entire groups[] yaml, including the "groups:" key. - # - alert groups (plural): the list of groups[] (a list, i.e. no "groups:" key) - it is a list + # - rules file: the entire groups[] yaml, including the "groups:" key. + # - groups (plural): the list of groups[] (a list, i.e. no "groups:" key) - it is a list # of dictionaries that have the "name" and "rules" keys. - # - alert group (singular): a single dictionary that has the "name" and "rules" keys. - # - alert rules (plural): all the alerts in a given alert group - a list of dictionaries with - # the "alert" and "expr" keys. - # - alert rule (singular): a single dictionary that has the "alert" and "expr" keys. + # - group (singular): a single dictionary that has the "name" and "rules" keys. + # - rules (plural): all the rules in a given group - a list of dictionaries with + # the `self.rule_type` (either "alert" or "record") and "expr" keys. + # - rule (singular): a single dictionary that has the `self.rule_type` (either "alert" or + # "record") and "expr" keys. def __init__(self, topology: Optional[JujuTopology] = None): - """Build and alert rule object. + """Build a rule object. Args: - topology: an optional `JujuTopology` instance that is used to annotate all alert rules. + topology: an optional `JujuTopology` instance that is used to annotate all rules. """ self.topology = topology self.tool = CosTool(None) - self.alert_groups = [] # type: List[dict] + self.groups = [] # type: List[dict] + + @property + @abstractmethod + def rule_type(self) -> _RuleType: + """Return the rule type being used for interpolation in messages.""" + pass def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: """Read a rules file from path, injecting juju topology. 
@@ -222,47 +234,47 @@ def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: rule_file = yaml.safe_load(rf) except Exception as e: - logger.error("Failed to read alert rules from %s: %s", file_path.name, e) + logger.error("Failed to read rules from %s: %s", file_path.name, e) return [] - if _is_official_alert_rule_format(rule_file): - alert_groups = rule_file["groups"] - elif _is_single_alert_rule_format(rule_file): - # convert to list of alert groups + if _is_official_rule_format(rule_file): + groups = rule_file["groups"] + elif _is_single_rule_format(rule_file, self.rule_type): + # convert to list of groups # group name is made up from the file name - alert_groups = [{"name": file_path.stem, "rules": [rule_file]}] + groups = [{"name": file_path.stem, "rules": [rule_file]}] else: # invalid/unsupported logger.error("Invalid rules file: %s", file_path.name) return [] # update rules with additional metadata - for alert_group in alert_groups: - if not self._is_already_modified(alert_group["name"]): + for group in groups: + if not self._is_already_modified(group["name"]): # update group name with topology and sub-path - alert_group["name"] = self._group_name( + group["name"] = self._group_name( str(root_path), str(file_path), - alert_group["name"], + group["name"], ) # add "juju_" topology labels - for alert_rule in alert_group["rules"]: - if "labels" not in alert_rule: - alert_rule["labels"] = {} + for rule in group["rules"]: + if "labels" not in rule: + rule["labels"] = {} if self.topology: # only insert labels that do not already exist for label, val in self.topology.label_matcher_dict.items(): - if label not in alert_rule["labels"]: - alert_rule["labels"][label] = val - # insert juju topology filters into a prometheus alert rule - alert_rule["expr"] = self.tool.inject_label_matchers( - re.sub(r"%%juju_topology%%,?", "", alert_rule["expr"]), + if label not in rule["labels"]: + rule["labels"][label] = val + # insert juju topology filters into a prometheus rule + rule["expr"] = self.tool.inject_label_matchers( + re.sub(r"%%juju_topology%%,?", "", rule["expr"]), self.topology.label_matcher_dict, ) - return alert_groups + return groups def _group_name(self, root_path: str, file_path: str, group_name: str) -> str: """Generate group name from path and topology. @@ -285,13 +297,13 @@ def _group_name(self, root_path: str, file_path: str, group_name: str) -> str: # - name, from juju topology # - suffix, from the relative path of the rule file; group_name_parts = [self.topology.identifier] if self.topology else [] - group_name_parts.extend([rel_path, group_name, "alerts"]) + group_name_parts.extend([rel_path, group_name, f"{self.rule_type}s"]) # filter to remove empty strings return "_".join(filter(None, group_name_parts)) def _is_already_modified(self, name: str) -> bool: """Detect whether a group name has already been modified with juju topology.""" - modified_matcher = re.compile(r"^.*?_[\da-f]{8}_.*?alerts$") + modified_matcher = re.compile(r"^.*?_[\da-f]{8}_.*?(records|alerts)$") if modified_matcher.match(name) is not None: return True return False @@ -321,27 +333,27 @@ def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]: By default, only the top directory is scanned; for nested scanning, pass `recursive=True`. Args: - dir_path: directory containing *.rule files (alert rules without groups). + dir_path: directory containing *.rule files (rules without groups). recursive: flag indicating whether to scan for rule files recursively. 
Returns: - a list of dictionaries representing prometheus alert rule groups, each dictionary - representing an alert group (structure determined by `yaml.safe_load`). + a list of dictionaries representing prometheus rule groups, each dictionary + representing an group (structure determined by `yaml.safe_load`). """ - alert_groups = [] # type: List[dict] + groups = [] # type: List[dict] - # Gather all alerts into a list of groups + # Gather all into a list of groups for file_path in self._multi_suffix_glob( dir_path, [".rule", ".rules", ".yml", ".yaml"], recursive ): - alert_groups_from_file = self._from_file(dir_path, file_path) - if alert_groups_from_file: - logger.debug("Reading alert rule from %s", file_path) - alert_groups.extend(alert_groups_from_file) + groups_from_file = self._from_file(dir_path, file_path) + if groups_from_file: + logger.debug("Reading %s rule from %s", self.rule_type, file_path) + groups.extend(groups_from_file) - return alert_groups + return groups - def add_path(self, path: str, *, recursive: bool = False) -> None: + def add_path(self, path: Union[Path, str], *, recursive: bool = False) -> None: """Add rules from a dir path. All rules from files are aggregated into a data structure representing a single rule file. @@ -354,23 +366,57 @@ def add_path(self, path: str, *, recursive: bool = False) -> None: Returns: True if path was added else False. """ - path = Path(path) # type: Path + path = Path(path) if isinstance(path, str) else path # type: Path if path.is_dir(): - self.alert_groups.extend(self._from_dir(path, recursive)) + self.groups.extend(self._from_dir(path, recursive)) elif path.is_file(): - self.alert_groups.extend(self._from_file(path.parent, path)) + self.groups.extend(self._from_file(path.parent, path)) else: - logger.debug("Alert rules path does not exist: %s", path) + logger.debug("%s rules path does not exist: %s", self.rule_type.capitalize(), path) def as_dict(self) -> dict: - """Return standard alert rules file in dict representation. + """Return standard rules file in dict representation. Returns: a dictionary containing a single list of alert rule groups. The list of alert rule groups is provided as value of the "groups" dictionary key. """ - return {"groups": self.alert_groups} if self.alert_groups else {} + return {"groups": self.groups} if self.groups else {} + + +class AlertRules(Rules): + """Utility class for amalgamating prometheus alert files and injecting juju topology. + + The official Prometheus format is a YAML file conforming to the Prometheus documentation + (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). + The custom single rule format is a subsection of the official YAML, having a single alert + rule, effectively "one alert per file". + """ + + _rule_type = "alert" # type: _RuleType + + @property + def rule_type(self) -> _RuleType: + """Return the rule type being used for interpolation in messages.""" + return self._rule_type + + +class RecordingRules(Rules): + """Utility class for amalgamating prometheus recording files and injecting juju topology. + + The official Prometheus format is a YAML file conforming to the Prometheus documentation + (https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/). + The custom single rule format is a subsection of the official YAML, having a single recording + rule, effectively "one record per file". 
+ """ + + _rule_type = "record" # type: _RuleType + + @property + def rule_type(self) -> _RuleType: + """Return the rule type being used for interpolation in messages.""" + return self._rule_type def _validate_relation_by_interface_and_direction( @@ -445,15 +491,15 @@ def restore(self, snapshot): self.relation_id = snapshot["relation_id"] -class InvalidAlertRulePathError(Exception): - """Raised if the alert rules folder cannot be found or is otherwise invalid.""" +class InvalidRulePathError(Exception): + """Raised if the rules folder cannot be found or is otherwise invalid.""" def __init__( self, - alert_rules_absolute_path: str, + rules_absolute_path: str, message: str, ): - self.alert_rules_absolute_path = alert_rules_absolute_path + self.rules_absolute_path = rules_absolute_path self.message = message super().__init__(self.message) @@ -477,21 +523,22 @@ def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> st # https://github.com/canonical/operator/issues/643 charm_dir = Path(os.getcwd()) - alerts_dir_path = charm_dir.absolute().joinpath(*path_elements) + dir_path = charm_dir.absolute().joinpath(*path_elements) - if not alerts_dir_path.exists(): - raise InvalidAlertRulePathError(str(alerts_dir_path), "directory does not exist") - if not alerts_dir_path.is_dir(): - raise InvalidAlertRulePathError(str(alerts_dir_path), "is not a directory") + if not dir_path.exists(): + raise InvalidRulePathError(str(dir_path), "directory does not exist") + if not dir_path.is_dir(): + raise InvalidRulePathError(str(dir_path), "is not a directory") - return str(alerts_dir_path) + return str(dir_path) class PrometheusRemoteWriteConsumerEvents(ObjectEvents): """Event descriptor for events raised by `PrometheusRemoteWriteConsumer`.""" endpoints_changed = EventSource(PrometheusRemoteWriteEndpointsChangedEvent) - alert_rule_status_changed = EventSource(InvalidAlertRuleEvent) + alert_rule_status_changed = EventSource(InvalidRuleEvent) + recording_rule_status_changed = EventSource(InvalidRuleEvent) class PrometheusRemoteWriteConsumer(Object): @@ -583,6 +630,12 @@ def __init__(self, *args): description: > The unit {{ $labels.juju_model }} {{ $labels.juju_unit }} is unavailable + In addition, recording rules can be specified. By default, this library will look + into the `/prometheus_recording_rules`, which in a standard charm + layouts resolves to `src/prometheus_recording_rules`. Each recording rule goes into a + separate `*.rule` file. If the syntax of a rule is invalid, the `MetricsEndpointProvider` + logs an error and does not load the particular rule. + The `%%juju_topology%%` token will be replaced with label filters ensuring that the only timeseries evaluated are those scraped from this charm, and no other. Failing to ensure that the `%%juju_topology%%` token is applied to each and every @@ -598,6 +651,7 @@ def __init__( charm: CharmBase, relation_name: str = DEFAULT_CONSUMER_NAME, alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, + recording_rules_path: str = DEFAULT_RECORDING_RULES_RELATIVE_PATH, ): """API to manage a required relation with the `prometheus_remote_write` interface. 
@@ -622,10 +676,19 @@ def __init__(

         try:
             alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path)
-        except InvalidAlertRulePathError as e:
+        except InvalidRulePathError as e:
             logger.debug(
                 "Invalid Prometheus alert rules folder at %s: %s",
-                e.alert_rules_absolute_path,
+                e.rules_absolute_path,
+                e.message,
+            )
+
+        try:
+            recording_rules_path = _resolve_dir_against_charm_path(charm, recording_rules_path)
+        except InvalidRulePathError as e:
+            logger.debug(
+                "Invalid Prometheus recording rules folder at %s: %s",
+                e.rules_absolute_path,
                 e.message,
             )

@@ -633,6 +696,7 @@ def __init__(
         self._charm = charm
         self._relation_name = relation_name
         self._alert_rules_path = alert_rules_path
+        self._recording_rules_path = recording_rules_path

         self.topology = JujuTopology.from_charm(charm)

@@ -642,12 +706,12 @@ def __init__(
         self.framework.observe(on_relation.relation_changed, self._handle_endpoints_changed)
         self.framework.observe(on_relation.relation_departed, self._handle_endpoints_changed)
         self.framework.observe(on_relation.relation_broken, self._on_relation_broken)
-        self.framework.observe(on_relation.relation_joined, self._push_alerts_on_relation_joined)
+        self.framework.observe(on_relation.relation_joined, self._push_rules_on_relation_joined)
         self.framework.observe(
-            self._charm.on.leader_elected, self._push_alerts_to_all_relation_databags
+            self._charm.on.leader_elected, self._push_rules_to_all_relation_databags
         )
         self.framework.observe(
-            self._charm.on.upgrade_charm, self._push_alerts_to_all_relation_databags
+            self._charm.on.upgrade_charm, self._push_rules_to_all_relation_databags
         )

     def _on_relation_broken(self, event: RelationBrokenEvent) -> None:
@@ -668,28 +732,39 @@ def _handle_endpoints_changed(self, event: RelationEvent) -> None:

         self.on.endpoints_changed.emit(relation_id=event.relation.id)

-    def _push_alerts_on_relation_joined(self, event: RelationEvent) -> None:
-        self._push_alerts_to_relation_databag(event.relation)
+    def _push_rules_on_relation_joined(self, event: RelationEvent) -> None:
+        self._push_rules_to_relation_databag(event.relation)

-    def _push_alerts_to_all_relation_databags(self, _: Optional[HookEvent]) -> None:
+    def _push_rules_to_all_relation_databags(self, _: Optional[HookEvent]) -> None:
         for relation in self.model.relations[self._relation_name]:
-            self._push_alerts_to_relation_databag(relation)
+            self._push_rules_to_relation_databag(relation)

-    def _push_alerts_to_relation_databag(self, relation: Relation) -> None:
+    def _push_rules_to_relation_databag(self, relation: Relation) -> None:
         if not self._charm.unit.is_leader():
             return

         alert_rules = AlertRules(self.topology)
         alert_rules.add_path(self._alert_rules_path, recursive=False)

         alert_rules_as_dict = alert_rules.as_dict()
+
+        recording_rules = RecordingRules(self.topology)
+        recording_rules.add_path(self._recording_rules_path, recursive=False)
+        recording_rules_as_dict = recording_rules.as_dict()
+
         if alert_rules_as_dict:
             relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict)

+        if recording_rules_as_dict:
+            relation.data[self._charm.app]["recording_rules"] = json.dumps(recording_rules_as_dict)
+
     def reload_alerts(self) -> None:
         """Reload alert rules from disk and push to relation data."""
-        self._push_alerts_to_all_relation_databags(None)
+        # FIXME: find who's using this and deprecate it
+        self.reload_rules()
+
+    def reload_rules(self) -> None:
+        """Reload rules from disk and push them to relation data."""
+        self._push_rules_to_all_relation_databags(None)

     @property
     def endpoints(self) -> List[Dict[str, str]]:
@@ -861,6 +936,38 @@ def _set_endpoint_on_relation(self, relation: Relation) -> None:
             }
         )

+    def recording_rules(self) -> dict:
+        """Fetch recording rules from all relations.
+
+        A Prometheus recording rules file consists of a list of "groups". Each
+        group consists of a list of recording `rules` that are sequentially
+        evaluated. This method returns all the recording rules provided by each
+        related metrics provider charm. These rules may be used to generate a
+        separate recording rules file for each relation, since the returned
+        recording rule groups are indexed by relation ID. Also, for each relation ID
+        associated scrape metadata such as Juju model, UUID and application
+        name are provided so that a unique name may be generated for the rules
+        file. For each relation the structure of data returned is a dictionary
+        with four keys
+
+        - groups
+        - model
+        - model_uuid
+        - application
+
+        The value of the `groups` key is such that it may be used to generate
+        a Prometheus recording rules file directly using `yaml.dump` but the
+        `groups` key itself must be included as this is required by Prometheus,
+        for example as in `yaml.safe_dump({"groups": rules["groups"]})`.
+
+        The `PrometheusRemoteWriteProvider` accepts a list of rules and these
+        rules are all placed into one group.
+
+        Returns:
+            a dictionary mapping the name of a recording rule group to the group.
+        """
+        return self._get_rules_by_type("recording")
+
     def alerts(self) -> dict:
         """Fetch alert rules from all relations.

@@ -891,28 +998,40 @@ def alerts(self) -> dict:
         Returns:
             a dictionary mapping the name of an alert rule group to the group.
         """
-        alerts = {}  # type: Dict[str, dict]  # mapping b/w juju identifiers and alert rule files
+        return self._get_rules_by_type("alert")
+
+    def _get_rules_by_type(self, rule_type: str) -> dict:
+        """Unified logic for fetching rules from different databags.
+
+        The logic for alert and recording rules is shared here since, apart from
+        the databag key, it is identical.
+
+        Returns:
+            A dictionary representing the rules.
+        """
+        rule_mapping = {"alert": "alert_rules", "recording": "recording_rules"}
+        all_rules = {}
         for relation in self._charm.model.relations[self._relation_name]:
             if not relation.units or not relation.app:
                 continue

-            alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}"))
+            rules = json.loads(relation.data[relation.app].get(rule_mapping[rule_type], "{}"))

-            if not alert_rules:
+            if not rules:
                 continue

-            if "groups" not in alert_rules:
-                logger.debug("No alert groups were found in relation data")
+            if "groups" not in rules:
+                logger.debug("No %s groups were found in relation data", rule_type)
                 continue
             # Construct an ID based on what's in the alert rules
             error_messages = []
             tool = CosTool(self._charm)
-            for group in alert_rules["groups"]:
+            for group in rules["groups"]:
                 # Copy off rules, so we don't modify an object we're iterating over
                 rules = group["rules"]
-                for idx, alert_rule in enumerate(rules):
-                    labels = alert_rule.get("labels")
+                for idx, rule in enumerate(rules):
+                    labels = rule.get("labels")

                     if labels:
                         topology = JujuTopology(
@@ -924,12 +1043,12 @@ def alerts(self) -> dict:
                         )

                         # Inject topology and put it back in the list
-                        alert_rule["expr"] = tool.inject_label_matchers(
-                            re.sub(r"%%juju_topology%%,?", "", alert_rule["expr"]),
+                        rule["expr"] = tool.inject_label_matchers(
+                            re.sub(r"%%juju_topology%%,?", "", rule["expr"]),
                             topology.label_matcher_dict,
                         )

-                group["rules"][idx] = alert_rule
+                group["rules"][idx] = rule
                 try:
                     labels = group["rules"][0]["labels"]
                     identifier = JujuTopology(
@@ -940,17 +1059,20 @@ def alerts(self) -> dict:
                         charm_name=labels.get("juju_charm", ""),
                     ).identifier

-                    _, errmsg = self.tool.validate_alert_rules({"groups": [group]})
+                    _, errmsg = self.tool.validate_rules({"groups": [group]})
                     if errmsg:
                         error_messages.append(errmsg)
                         continue

-                    if identifier not in alerts:
-                        alerts[identifier] = {"groups": [group]}
+                    if identifier not in all_rules:
+                        all_rules[identifier] = {"groups": [group]}
                     else:
-                        alerts[identifier]["groups"].append(group)
+                        all_rules[identifier]["groups"].append(group)
                 except KeyError:
-                    logger.error("Alert rules were found but no usable labels were present")
+                    logger.error(
+                        "%s rules were found but no usable labels were present",
+                        rule_type.capitalize(),
+                    )

             if error_messages:
                 relation.data[self._charm.app]["event"] = json.dumps(
@@ -958,7 +1080,7 @@ def alerts(self) -> dict:
                 )
                 continue

-        return alerts
+        return all_rules


 # Copy/pasted from prometheus_scrape.py
@@ -1006,10 +1128,10 @@ def apply_label_matchers(self, rules) -> dict:
                 rule["expr"] = self.inject_label_matchers(rule["expr"], topology)
         return rules

-    def validate_alert_rules(self, rules: dict) -> Tuple[bool, str]:
-        """Will validate correctness of alert rules, returning a boolean and any errors."""
+    def validate_rules(self, rules: dict) -> Tuple[bool, str]:
+        """Will validate correctness of rules, returning a boolean and any errors."""
         if not self.path:
-            logger.debug("`cos-tool` unavailable. Not validating alert correctness.")
+            logger.debug("`cos-tool` unavailable. Not validating correctness.")
             return True, ""

         with tempfile.TemporaryDirectory() as tmpdir:

diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
index ac7bbf7c..f44d6fa4 100644
--- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
+++ b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py
@@ -370,7 +370,7 @@ def _on_scrape_targets_changed(self, event):

 # Increment this PATCH version before using `charmcraft publish-lib` or reset
 # to 0 if you are raising the major API version
-LIBPATCH = 28
+LIBPATCH = 29

 logger = logging.getLogger(__name__)

@@ -695,7 +695,7 @@ class MetricsEndpointProviderEvents(ObjectEvents):
     """Events raised by :class:`InvalidRuleEvent`s."""

     alert_rule_status_changed = EventSource(InvalidRuleEvent)
-    recordingt_rule_status_changed = EventSource(InvalidRuleEvent)
+    recording_rule_status_changed = EventSource(InvalidRuleEvent)


 def _type_convert_stored(obj):
@@ -813,7 +813,7 @@ def _is_single_rule_format(rules_dict: dict, rule_type: _RuleType) -> bool:
     Returns:
         True if rule is in single rule file format.
     """
-    # one alert rule per file
+    # one rule per file
     return set(rules_dict) >= {rule_type, "expr"}


@@ -846,10 +846,10 @@ class Rules(ABC):
     # "record") and "expr" keys.

     def __init__(self, topology: Optional[JujuTopology] = None):
-        """Build and alert rule object.
+        """Build a rule object.

         Args:
-            topology: an optional `JujuTopology` instance that is used to annotate all alert rules.
+            topology: an optional `JujuTopology` instance that is used to annotate all rules.
         """
         self.topology = topology
         self.tool = CosTool(None)
@@ -943,7 +943,7 @@ def _group_name(self, root_path: str, file_path: str, group_name: str) -> str:
         # - name, from juju topology
         # - suffix, from the relative path of the rule file;
         group_name_parts = [self.topology.identifier] if self.topology else []
-        group_name_parts.extend([rel_path, group_name, "alerts"])
+        group_name_parts.extend([rel_path, group_name, f"{self.rule_type}s"])
         # filter to remove empty strings
         return "_".join(filter(None, group_name_parts))

@@ -1577,7 +1577,6 @@ def __init__(
             the `MetricsEndpointProvider` logs an error and does not load the particular
             rule.

-
         An attempt will be made to validate rules prior to loading them into Prometheus.
         If they are invalid, an event will be emitted from this object which charms can
         respond to in order to set a meaningful status for administrators.
@@ -2514,7 +2513,7 @@ def apply_label_matchers(self, rules) -> dict:

     def validate_rules(self, rules: dict) -> Tuple[bool, str]:
         """Will validate correctness of rules, returning a boolean and any errors."""
         if not self.path:
-            logger.debug("`cos-tool` unavailable. Not validating alert correctness.")
+            logger.debug("`cos-tool` unavailable. Not validating correctness.")
             return True, ""

         with tempfile.TemporaryDirectory() as tmpdir:

diff --git a/tox.ini b/tox.ini
index 27401ced..e97dc8af 100644
--- a/tox.ini
+++ b/tox.ini
@@ -73,7 +73,7 @@ deps =
     integration: pytest-operator==1.0.0b1
 commands =
     charm: mypy {[vars]src_path} {posargs}
-    lib: mypy --python-version 3.5 {[vars]lib_path} {posargs}
+    lib: mypy --python-version 3.8 {[vars]lib_path} {posargs}
     unit: mypy {[vars]tst_path}/unit {posargs}
     integration: mypy {[vars]tst_path}/integration {posargs}

From fc282adaeeb8429253b211d9c431879ecaaba765 Mon Sep 17 00:00:00 2001
From: Ryan Barry
Date: Sat, 21 Jan 2023 22:55:03 +0000
Subject: [PATCH 3/5] Remove comment

---
 tests/unit/test_functions.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/unit/test_functions.py b/tests/unit/test_functions.py
index 84595844..cdfee060 100644
--- a/tests/unit/test_functions.py
+++ b/tests/unit/test_functions.py
@@ -6,9 +6,6 @@
 from unittest import mock

 import deepdiff
-
-# FIXME: We should **NOT** move methods to the top level to make writing tests
-# more convenient. Mock it with `create_autospec`. See `setUp()`
 from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer

From a1565608c7e00ce4f4bbd9aa23f94631c1e9b220 Mon Sep 17 00:00:00 2001
From: Ryan Barry
Date: Wed, 25 Jan 2023 16:40:50 -0500
Subject: [PATCH 4/5] Update lib/charms/prometheus_k8s/v0/prometheus_remote_write.py

Co-authored-by: Leon <82407168+sed-i@users.noreply.github.com>
---
 lib/charms/prometheus_k8s/v0/prometheus_remote_write.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py b/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py
index 691d7775..68776794 100644
--- a/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py
+++ b/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py
@@ -391,7 +391,7 @@ class AlertRules(Rules):
     The official Prometheus format is a YAML file conforming to the Prometheus documentation
     (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
     The custom single rule format is a subsection of the official YAML, having a single alert
-    rule, effectively "one alert per file".
+    rule, effectively "one alerting rule per file".
     """

     _rule_type = "alert"  # type: _RuleType

From 75d0cb42e84382c911649f03cd30218004ed8850 Mon Sep 17 00:00:00 2001
From: Ryan Barry
Date: Wed, 25 Jan 2023 16:41:02 -0500
Subject: [PATCH 5/5] Update lib/charms/prometheus_k8s/v0/prometheus_remote_write.py

Co-authored-by: Leon <82407168+sed-i@users.noreply.github.com>
---
 lib/charms/prometheus_k8s/v0/prometheus_remote_write.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py b/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py
index 68776794..5e51f923 100644
--- a/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py
+++ b/lib/charms/prometheus_k8s/v0/prometheus_remote_write.py
@@ -408,7 +408,7 @@ class RecordingRules(Rules):
     The official Prometheus format is a YAML file conforming to the Prometheus documentation
     (https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/).
     The custom single rule format is a subsection of the official YAML, having a single recording
-    rule, effectively "one record per file".
+    rule, effectively "one recording rule per file".
     """

     _rule_type = "record"  # type: _RuleType
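
---

Usage sketches (not part of the patch series above):

The single-rule file format is easy to get wrong, so here is a minimal sketch of what `_is_single_rule_format(rules_dict, rule_type)` accepts for each rule type; the rule names and expressions are invented for illustration:

```python
# Illustrative only: minimal dicts satisfying
# _is_single_rule_format(rules_dict, rule_type), i.e.
# set(rules_dict) >= {rule_type, "expr"}.
single_alert_rule = {
    "alert": "HighRequestLatency",  # key must equal rule_type "alert"
    "expr": "job:request_latency_seconds:mean5m{%%juju_topology%%} > 0.5",
}

single_recording_rule = {
    "record": "job:request_latency_seconds:mean5m",  # key must equal rule_type "record"
    "expr": "avg_over_time(request_latency_seconds{%%juju_topology%%}[5m])",
}
```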
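On the provider side, a charm would exercise the new `RecordingRules` helper roughly as follows. This is a hedged sketch: the import path assumes the `prometheus_remote_write` library file touched by patches 4/5 and 5/5, the directory literal mirrors `DEFAULT_RECORDING_RULES_RELATIVE_PATH`, and the no-topology construction is only for demonstration outside a charm:

```python
# Hypothetical usage sketch, not part of the patch. Inside a charm you would
# pass JujuTopology.from_charm(self) instead of omitting the topology.
import json

import yaml
from charms.prometheus_k8s.v0.prometheus_remote_write import RecordingRules

recording_rules = RecordingRules()  # topology is Optional per Rules.__init__
recording_rules.add_path("./src/prometheus_recording_rules", recursive=False)

# as_dict() returns {} when no rule files were found, so guard before
# serializing, mirroring _push_rules_to_relation_databag above.
rules_as_dict = recording_rules.as_dict()
if rules_as_dict:
    databag_value = json.dumps(rules_as_dict)  # what lands in "recording_rules"
    print(yaml.safe_dump(rules_as_dict))       # what a rules file would contain
```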
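On the Prometheus side, the `_get_rules_by_type` plumbing means `recording_rules()` can be consumed exactly like `alerts()`. A hedged sketch follows; the `write_rule_files` helper, destination directory, and `juju_<identifier>` file naming are invented for illustration and are not part of the library:

```python
# Hypothetical consumer-side sketch, not part of the patch. `consumer` is a
# MetricsEndpointConsumer (or the remote-write equivalent shown above).
import yaml


def write_rule_files(consumer, dest_dir: str) -> None:
    """Write one rules file per related provider, keyed by Juju topology identifier."""
    # Each value already carries the mandatory top-level "groups" key, so it
    # can be dumped to a Prometheus rules file as-is.
    for identifier, groups in consumer.recording_rules().items():
        with open(f"{dest_dir}/juju_{identifier}.record.rules", "w") as f:
            f.write(yaml.safe_dump(groups))
    for identifier, groups in consumer.alerts().items():
        with open(f"{dest_dir}/juju_{identifier}.alert.rules", "w") as f:
            f.write(yaml.safe_dump(groups))
```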