diff --git a/DC-SBP-SLES4SAP-sap-infra-monitoring b/DC-SBP-SLES4SAP-sap-infra-monitoring index b5d9a1632..df59b6263 100644 --- a/DC-SBP-SLES4SAP-sap-infra-monitoring +++ b/DC-SBP-SLES4SAP-sap-infra-monitoring @@ -4,7 +4,7 @@ ADOC_TYPE="article" ADOC_POST="yes" -ADOC_ATTRIBUTES="--attribute docdate=2022-02-15" +ADOC_ATTRIBUTES="--attribute docdate=2023-09-29" # stylesheets STYLEROOT=/usr/share/xml/docbook/stylesheet/sbp diff --git a/adoc/SLES4SAP-sap-infra-monitoring-alertmanager.adoc b/adoc/SLES4SAP-sap-infra-monitoring-alertmanager.adoc new file mode 100644 index 000000000..295cffc7a --- /dev/null +++ b/adoc/SLES4SAP-sap-infra-monitoring-alertmanager.adoc @@ -0,0 +1,79 @@ +// Alertmanager adoc file +// Please use the following line to implement each tagged content to the main document: +// include::SLES4SAP-sap-infra-monitoring-alertmanager.adoc[tag=alert-XXXXX] + +// Alertmanager general +# tag::alert-general[] +===== Alertmanager + +The https://prometheus.io/docs/alerting/latest/alertmanager/[Alertmanager] handles alerts sent by client applications such as the Prometheus or Loki server. +It takes care of deduplicating, grouping, and routing them to the correct receiver integration such as email or PagerDuty. It also takes care of +silencing and inhibition of alerts. +# end::alert-general[] + + +// Alertmanager Implementing +# tag::alert-impl[] +=== Alertmanager +The Alertmanager package can be found in the PackageHub repository. +The repository needs to be activated via the SUSEConnect command first, unless you have activated it in the previous steps already. + + +[source] +---- +SUSEConnect --product PackageHub/15.3/x86_64 +---- + +Alertmanager can then be installed via the `zypper` command: +[subs="attributes,specialchars,verbatim,quotes"] +---- +zypper in golang-github-prometheus-alertmanager +---- + + +Notifications can be sent to different receivers. A receiver can simply be an email address, a chat system, a webhook and more.
+(for a complete list, please take a look at the https://prometheus.io/docs/alerting/latest/configuration/#receiver[Alertmanager documentation]) + + + +The example configuration below uses email for notification (receiver). + + + +Edit the Alertmanager configuration file `/etc/alertmanager/config.yml` like below: + + +[subs="attributes,specialchars,verbatim,quotes"] +---- +global: + resolve_timeout: 5m + smtp_smarthost: '' + smtp_from: '' + smtp_auth_username: '' + smtp_auth_password: '' + smtp_require_tls: true + +route: + group_by: ['...'] + group_wait: 10s + group_interval: 5m + repeat_interval: 4h + receiver: 'email' + +receivers: + - name: 'email' + email_configs: + - send_resolved: true + to: '' + from: '<mail-address>' + headers: + From: + Subject: '{{ template "email.default.subject" . }}' + html: '{{ template "email.default.html" . }}' +---- + + +Start and enable the Alertmanager service: +[subs="attributes,specialchars,verbatim,quotes"] +---- +systemctl enable --now prometheus-alertmanager.service +---- + +# end::alert-impl[] \ No newline at end of file diff --git a/adoc/SLES4SAP-sap-infra-monitoring-collectd.adoc b/adoc/SLES4SAP-sap-infra-monitoring-collectd.adoc new file mode 100644 index 000000000..fd10637d2 --- /dev/null +++ b/adoc/SLES4SAP-sap-infra-monitoring-collectd.adoc @@ -0,0 +1,106 @@ +// Collectd adoc file +// Please use the following line to implement each tagged content to the main document: +// include::SLES4SAP-sap-infra-monitoring-collectd.adoc[tag=collectd-XXXXX] + +// Collectd general +# tag::collectd-general[] + +===== `collectd` - System information collection daemon +https://collectd.org/[`collectd`] is a small daemon which collects system information periodically and provides mechanisms to store and monitor the values in a variety of ways. + +# end::collectd-general[] + + +// Collectd implementing +# tag::collectd-impl[] + +=== `collectd` + +The `collectd` packages can be installed from the SUSE repositories as well.
For the example at hand, we have used a newer version from the openSUSE repository. + +Create a file `/etc/zypp/repos.d/server_monitoring.repo` and add the following content to it: +[subs="attributes,specialchars,verbatim,quotes"] +.Content for /etc/zypp/repos.d/server_monitoring.repo +---- +[server_monitoring] +name=Server Monitoring Software (SLE_15_SP3) +type=rpm-md +baseurl=https://download.opensuse.org/repositories/server:/monitoring/SLE_15_SP3/ +gpgcheck=1 +gpgkey=https://download.opensuse.org/repositories/server:/monitoring/SLE_15_SP3/repodata/repomd.xml.key +enabled=1 +---- + +Afterwards, refresh the repository metadata and install `collectd` and its plugins. + +[subs="attributes,specialchars,verbatim,quotes"] +---- +# zypper ref +# zypper in collectd collectd-plugins-all +---- + +Now the `collectd` configuration must be adapted to collect the information you want to get and export it in the format you need. +For example, when looking for network latency, use the ping plugin and expose the data in a Prometheus format. + +[subs="attributes,specialchars,verbatim,quotes"] +.Configuration of collectd in /etc/collectd.conf (excerpts) +---- +... +LoadPlugin ping +... +<Plugin ping> + Host "10.162.63.254" + Interval 1.0 + Timeout 0.9 + TTL 255 +# SourceAddress "1.2.3.4" +# AddressFamily "any" + Device "eth0" + MaxMissed -1 +</Plugin> +... +LoadPlugin write_prometheus +... +<Plugin write_prometheus> + Port "9103" +</Plugin> +... +---- + +Uncomment the `LoadPlugin` line and check the `<Plugin>` section in the file. + +Modify the `systemd` unit so that `collectd` works as expected. First, create a copy from the system-provided service file. +[subs="attributes,specialchars,verbatim,quotes"] +---- +# cp /usr/lib/systemd/system/collectd.service /etc/systemd/system/collectd.service +---- + +Second, adapt this local copy. +Add the required `CapabilityBoundingSet` parameters in our local copy `/etc/systemd/system/collectd.service`. +[subs="attributes,specialchars,verbatim,quotes"] +---- +...
+# Here's a (incomplete) list of the plugins known capability requirements: +# ping CAP_NET_RAW +CapabilityBoundingSet=CAP_NET_RAW +... +---- + +Activate the changes and start the `collectd` function. +[subs="attributes,specialchars,verbatim,quotes"] +---- +# systemctl daemon-reload +# systemctl enable --now collectd +---- + +All `collectd` metrics are accessible at port 9103. + +With a quick test, you can see if the metrics can be scraped. +[subs="attributes,specialchars,verbatim,quotes"] +---- +# curl localhost:9103/metrics +---- +// The offical project on github: https://github.com/collectd/collectd/ + + +# end::collectd-impl[] \ No newline at end of file diff --git a/adoc/SLES4SAP-sap-infra-monitoring-grafana.adoc b/adoc/SLES4SAP-sap-infra-monitoring-grafana.adoc new file mode 100644 index 000000000..e4a99fbc1 --- /dev/null +++ b/adoc/SLES4SAP-sap-infra-monitoring-grafana.adoc @@ -0,0 +1,84 @@ +// Grafana adoc file +// Please use the following line to implement each tagged content to the main document: +// include::SLES4SAP-sap-infra-monitoring-grafana.adoc[tag=grafana-XXXXX] + +// Grafana general +# tag::grafana-general[] + +===== Grafana + +https://grafana.com/oss/grafana/[Grafana] is an open source visualization and analytics platform. +Grafana's plug-in architecture allows interaction with a variety of data sources without creating data copies. +Its graphical browser-based user interface visualizes the data through highly customizable views, providing an interactive diagnostic workspace. + +Grafana can display metrics data from Prometheus and log data from Loki side-by-side, correlating events from log files with metrics. +This can provide helpful insights when trying to identify the cause for an issue. +Also, Grafana can trigger alerts based on metrics or log entries, and thus help identify potential issues early. 
+ +# end::grafana-general[] + + +// Grafana implementing +# tag::grafana-impl[] + +=== Grafana + +The Grafana RPM packages can be found in the PackageHub repository. +The repository has to be activated via the `SUSEConnect` command first, unless you have activated it in the previous steps already. +---- +# SUSEConnect --product PackageHub/15.3/x86_64 +---- + +Grafana can then be installed via `zypper` command: +---- +# zypper in grafana +---- + + +Start and enable the Grafana server service: +---- +# systemctl enable --now grafana-server.service +---- + + +Now connect from a browser to your Grafana instance and log in: + +image::sap-infra-monitoring-grafana-login.png[Grafana Login page,scaledwidth=80%,title="Grafana welcome page"] + +==== Grafana data sources +After the login, the data source must be added. On the right hand there is a wheel where a new data source can be added. + +image::sap-infra-monitoring-grafana-datasource-add.png[Grafana add a new data source,scaledwidth=80%,title="Adding a new Grafana data source"] + +Add a data source for the Prometheus service. + +.Prometheus example +image::sap-infra-monitoring-grafana-data-prometheus.png[Prometheus data source,scaledwidth=80%,title="Grafana data source for Prometheus DB"] + +Also add a data source for Loki. + +.Loki example +image::sap-infra-monitoring-grafana-data-loki.png[Loki data source,scaledwidth=80%,title="Grafana data source for LOKI DB"] + +Now Grafana can access both the metrics stored in Prometheus and the log data collected by Loki, to visualize them. + +==== Grafana dashboards + +Dashboards are how Grafana presents information to the user. +Prepared dashboards can be downloaded from https://grafana.com/dashboards, or imported using the Grafana ID. + +.Grafana dashboard import +image::sap-infra-monitoring-grafana-dashboards.png[Dashboard overview,scaledwidth=80%,title="Grafana dashboard import option"] + +The dashboards can also be created from scratch. 
Information from all data sources can be merged into one dashboard. + +image::sap-infra-monitoring-grafana-dashboard-new.png[Dashboard create a new dashboard,scaledwidth=80%,title="Build your own dashboard"] + +==== Putting it all together +The picture below shows a dashboard displaying detailed information about the SAP HANA cluster, orchestrated by *pacemaker*. + +.Dashboard example for SAP HANA +image::sap-infra-monitoring-grafana-hana-cluster.png[SUSE HANA cluster dashboard example,scaledwidth=80%,title="SUSE cluster exporter dashboard"] + + +# end::grafana-impl[] \ No newline at end of file diff --git a/adoc/SLES4SAP-sap-infra-monitoring-ipmi.adoc b/adoc/SLES4SAP-sap-infra-monitoring-ipmi.adoc new file mode 100644 index 000000000..b24001731 --- /dev/null +++ b/adoc/SLES4SAP-sap-infra-monitoring-ipmi.adoc @@ -0,0 +1,105 @@ +// IPMI adoc file +// Please use the following line to implement each tagged content to the main document: +// include::SLES4SAP-sap-infra-monitoring-ipmi.adoc[tag=ipmi-XXXXX] + +// IPMI general +# tag::ipmi-general[] + +===== Prometheus IPMI Exporter +The https://github.com/prometheus-community/ipmi_exporter[Prometheus IPMI Exporter] supports both + +* the regular /metrics endpoint for Prometheus, exposing metrics from the host that the exporter is running on, +* and an /ipmi endpoint that supports IPMI over RMCP. + +One exporter instance running on one host can be used to monitor a large number of IPMI interfaces by passing the target parameter to a scrape. + +# end::ipmi-general[] + + +// IPMI implementing +# tag::ipmi-impl[] + + +=== Prometheus IPMI Exporter + +The IPMI exporter can be used to scrape information like temperature, power supply information and fan information. + +Create a directory, download and extract the IPMI exporter. 
+[subs="attributes,specialchars,verbatim,quotes"] +---- +# mkdir ipmi_exporter +# cd ipmi_exporter +# curl -OL https://github.com/prometheus-community/ipmi_exporter/releases/download/v1.4.0/ipmi_exporter-1.4.0.linux-amd64.tar.gz +# tar xzvf ipmi_exporter-1.4.0.linux-amd64.tar.gz +---- + +NOTE: We have been using the version 1.4.0 of the IPMI exporter. For a different release, the URL used in the `curl` command above needs to be adapted. + Current releases can be found at the https://github.com/prometheus-community/ipmi_exporter[IPMI exporter GitHub repository]. + + +Some additional packages are required and need to be installed. +[subs="attributes,specialchars,verbatim,quotes"] +---- +# zypper in freeipmi libipmimonitoring6 monitoring-plugins-ipmi-sensor1 +---- + +To start the IPMI exporter on the observed host, first start a new `screen` session, and then start the exporter.footnote:[Starting the IPMI exporter should really be done by creating a systemd unit.] +// TODO: replace use of screen by a systemd unit for the IPMI exporter +[subs="attributes,specialchars,verbatim,quotes"] +.Starting IPMI +---- +screen -S ipmi +# cd ipmi_exporter-1.4.0.linux-amd64 +# ./ipmi_exporter +---- +The IPMI exporter binary `ipmi_exporter` has been started in a screen session which can be detached (type `Ctrl+a d`). +This lets the exporter continue running in the background. + +==== IPMI Exporter Systemd Service File + +A more convenient and secure way to start the IPMI exporter is using a systemd service. 
+To do so, a service unit file has to be created under /etc/systemd/system/: + +[subs="attributes,specialchars,verbatim,quotes"] +.Copy the IPMI exporter binary +---- +# cp ipmi_exporter-1.4.0.linux-amd64/ipmi_exporter /usr/local/bin/ +---- + +[source] +---- +# cat /etc/systemd/system/ipmi-exporter.service +[Unit] +Description=IPMI exporter +Documentation= +[Service] +Type=simple +Restart=no +ExecStart=/usr/local/bin/ipmi_exporter +[Install] +WantedBy=multi-user.target +---- + +The `systemd` daemon needs to be informed about the new unit: + +.Reload the systemd daemon +[source] +---- +# systemctl daemon-reload +---- + +And finally the service needs to be enabled and started: + +.Start the IPMI exporter +[source] +---- +# systemctl enable --now ipmi-exporter.service +---- + + +The metrics of the ipmi_exporter are accessible at port 9290. + +//accessing the remote configured ipmi metrics: http://ls3331:9290/ipmi?target=ls3316r&module=remote + + +# end::ipmi-impl[] diff --git a/adoc/SLES4SAP-sap-infra-monitoring-loki.adoc b/adoc/SLES4SAP-sap-infra-monitoring-loki.adoc new file mode 100644 index 000000000..66538a86c --- /dev/null +++ b/adoc/SLES4SAP-sap-infra-monitoring-loki.adoc @@ -0,0 +1,123 @@ +// Loki adoc file +// Please use the following line to implement each tagged content to the main document: +// include::SLES4SAP-sap-infra-monitoring-loki.adoc[tag=loki-XXXXX] + +// Loki general +# tag::loki-general[] + +===== Loki + +https://grafana.com/oss/loki/[Loki] is a log aggregation system, inspired by Prometheus and designed to be cost effective and easy to operate. +Unlike other logging systems, Loki is built around the idea of only indexing a set of metadata (labels) for logs and leaving the original log message unindexed. +Log data itself is then compressed and stored in chunks in object stores, for example locally on the file system. +A small index and highly compressed chunks simplify the operation and significantly lower the cost of Loki.
+ +# end::loki-general[] + + + +// Loki implementing +# tag::loki-impl[] + +=== Loki +The Loki RPM packages can be found in the PackageHub repository. +The repository needs to be activated via the SUSEConnect command first, unless you have activated it in the previous steps already. +---- +# SUSEConnect --product PackageHub/15.3/x86_64 +---- + +Loki can then be installed via the `zypper` command: +---- +# zypper in loki +---- + +Edit the Loki configuration file `/etc/loki/loki.yaml` and change the following lines: +[source] +---- +chunk_store_config: + max_look_back_period: 240h + +table_manager: + retention_deletes_enabled: true + retention_period: 240h +---- + +Start and enable the Loki service: +---- +# systemctl enable --now loki.service +---- + +# end::loki-impl[] + + + +// Loki practical use cases +# tag::loki-alert[] + +==== Loki alerts +Loki supports Prometheus-compatible alerting rules. They follow the same syntax, except they use LogQL for their expressions. +To activate alerting, the Loki configuration needs a component called `ruler`: + +.loki.yaml +[source] +----- +# Loki defaults to running in multi-tenant mode. +# Multi-tenant mode is set in the configuration with: +# auth_enabled: false +# When configured with "auth_enabled: false", Loki uses a single tenant. +# The single tenant ID will be the string fake. +auth_enabled: false +[...] + +ruler: + wal: + dir: /loki/ruler-wal + storage: + type: local + local: + directory: /etc/loki/rules + rule_path: /tmp/loki-rules-scratch + alertmanager_url: http://alertmanager:9093 + enable_alertmanager_v2: true +----- + +Depending on the given directory path in our example above, the rule file has to be stored under: + + /etc/loki/rules/fake/rules.yml + +NOTE: We are using `auth_enabled: false` and therefore the default tenant ID is `fake`, which needs to be added + to the path where the rules are stored. + +The example rule below will trigger a mail (via the Alertmanager configuration) if a failed password for an SSH login is detected.
+The log line looks like the following: + + 2023-07-19T10:41:38.076428+02:00 nuc5 sshd[16723]: Failed password for invalid user charly from 192.168.1.201 port 58831 ssh2 + +.rules.yml +[source] +---- +groups: + - name: accessLog + rules: + - alert: Failed_user_found + expr: 'sum( + count_over_time( + {filename="/var/log/messages" } + |= "Failed password for" + | pattern `T