From c2bd44801aece18bef859b7b12643d1c41a8dcfd Mon Sep 17 00:00:00 2001 From: David Pequegnot Date: Wed, 3 Jul 2024 18:55:45 +0000 Subject: [PATCH 1/5] refactor: remove all aloy configuration to let configure it in the workshop --- docker/alloy/config.alloy | 157 +------------------------------------- 1 file changed, 2 insertions(+), 155 deletions(-) diff --git a/docker/alloy/config.alloy b/docker/alloy/config.alloy index 2e04ea5..c68771d 100644 --- a/docker/alloy/config.alloy +++ b/docker/alloy/config.alloy @@ -1,157 +1,4 @@ logging { - level = "debug" + level = "info" format = "logfmt" -} - -//////////////////// -// LOGS -//////////////////// - -// CLASSIC LOGS FILES -local.file_match "logs" { - path_targets = [{"__path__" = "/logs/*.log", "exporter" = "LOGFILE"}] -} - -loki.source.file "logfiles" { - targets = local.file_match.logs.targets - forward_to = [loki.write.endpoint.receiver] -} - -// JSON LOG FILES -local.file_match "jsonlogs" { - path_targets = [{"__path__" = "/logs/*.json", "exporter" = "JSONFILE"}] -} - -loki.source.file "jsonlogfiles" { - targets = local.file_match.jsonlogs.targets - forward_to = [loki.process.jsonlogs.receiver] -} - -loki.process "jsonlogs" { - forward_to = [loki.write.endpoint.receiver] - - // stage.luhn { } - - stage.json { - expressions = { - // timestamp = "timestamp", - application = "context.properties.applicationName", - instance = "context.properties.instance", - trace_id = "mdc.trace_id", - } - } - - stage.labels { - values = { - application = "application", - instance = "instance", - trace_id = "trace_id", - } - } - - /*stage.timestamp { - source = "timestamp" - format = "RFC3339" - fallback_formats = ["UnixMs",] - }*/ - - /*stage.luhn { - min_length = 13 - replacement = "**MASKED**" - } - */ -} - -// EXPORTER (LOKI) -loki.write "endpoint" { - endpoint { - url = "http://loki:3100/loki/api/v1/push" - } -} - -///////////////////////// -// AGENT OTEL & TRACES -///////////////////////// - -// RECEIVER (OTLP) -otelcol.receiver.otlp "default" { - grpc { - endpoint = "0.0.0.0:4317" - } - - http { - endpoint = "0.0.0.0:4318" - } - - output { - metrics = [otelcol.processor.batch.default.input] - logs = [otelcol.processor.batch.default.input] - traces = [otelcol.processor.tail_sampling.actuator.input] - } -} - -//SAMPLING -/* -otelcol.processor.tail_sampling "actuator" { - policy { - name = "filter_http_url" - type = "string_attribute" - string_attribute { - key = "http.url" - values = ["/actuator/health", "/actuator/prometheus"] - enabled_regex_matching = true - invert_match = true - } - } - - policy { - name = "filter_url_path" - type = "string_attribute" - string_attribute { - key = "url.path" - values = ["/actuator/health", "/actuator/prometheus"] - enabled_regex_matching = true - invert_match = true - } - } - - output { - traces = [otelcol.processor.batch.default.input] - } -} -*/ - -otelcol.processor.batch "default" { - output { - metrics = [otelcol.exporter.prometheus.default.input] - logs = [otelcol.exporter.loki.default.input] - traces = [otelcol.exporter.otlp.tempo.input] - } -} - -// TRACES EXPORTER (OTLP) -otelcol.exporter.otlp "tempo" { - client { - endpoint = "tempo:4317" - - tls { - insecure = true - } - } -} - -// METRICS EXPORTER (PROMETHEUS) -otelcol.exporter.prometheus "default" { - forward_to = [prometheus.remote_write.default.receiver] -} - -prometheus.remote_write "default" { - endpoint { - url = "http://prometheus:9090/api/v1/write" - } -} - -// LOGS EXPORTER (LOKI) -otelcol.exporter.loki "default" { - forward_to = 
[loki.write.endpoint.receiver] -} +} \ No newline at end of file From dd2427bcaa59ba030637651574f62ec4f4428612 Mon Sep 17 00:00:00 2001 From: David Pequegnot Date: Wed, 3 Jul 2024 19:03:59 +0000 Subject: [PATCH 2/5] feat: update loki configuration for version 3.1.0 --- docker/loki/local-config.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/docker/loki/local-config.yaml b/docker/loki/local-config.yaml index d9202bd..9246fa3 100644 --- a/docker/loki/local-config.yaml +++ b/docker/loki/local-config.yaml @@ -1,12 +1,10 @@ auth_enabled: false -limits_config: - max_global_streams_per_user: 10000 - server: http_listen_port: 3100 common: + instance_addr: 127.0.0.1 path_prefix: /loki storage: filesystem: @@ -20,9 +18,9 @@ common: schema_config: configs: - from: 2020-10-24 - store: boltdb-shipper + store: tsdb object_store: filesystem - schema: v11 + schema: v13 index: prefix: index_ period: 24h From 3689044cc880e92de1e63e8b16923d9968816a4f Mon Sep 17 00:00:00 2001 From: David Pequegnot Date: Wed, 3 Jul 2024 20:12:29 +0000 Subject: [PATCH 3/5] feat: logs review + addition --- docs/workshop.md | 478 ++++++++++++++++++++++++++++------------------- 1 file changed, 287 insertions(+), 191 deletions(-) diff --git a/docs/workshop.md b/docs/workshop.md index e5c3034..1a5b2ae 100644 --- a/docs/workshop.md +++ b/docs/workshop.md @@ -66,7 +66,7 @@ As mentioned earlier, our observability stack is composed of : * [Loki](https://grafana.com/oss/loki/) for storing the logs * [Tempo](https://grafana.com/oss/tempo/) for storing the traces * [Grafana](https://grafana.com/) for the dashboards -* [GRAFANA Alloy - OTEL collector](https://grafana.com/docs/alloy/latest/) which gathers all the data to send it then to +* [Grafana Alloy - OTEL collector](https://grafana.com/docs/alloy/latest/) which gathers all the data to send it then to In addition, the microservices are started with an agent to broadcast the traces to the collector. @@ -93,7 +93,7 @@ You **MUST** have set up these tools first: * Any IDE ([IntelliJ IDEA](https://www.jetbrains.com/idea), [VSCode](https://code.visualstudio.com/), [Netbeans](https://netbeans.apache.org/),...) you want * [cURL](https://curl.se/), [jq](https://stedolan.github.io/jq/), [HTTPie](https://httpie.io/) or any tool to call your REST APIs -Here are commands to validate your environment: +๐Ÿ› ๏ธ Here are commands to validate your environment: **Java** @@ -107,7 +107,7 @@ OpenJDK 64-Bit Server VM Temurin-21.0.3+9 (build 21.0.3+9-LTS, mixed mode, shari **Gradle** -If you use the wrapper, you won't have troubles. Otherwise...: +๐Ÿ› ๏ธ If you use the wrapper, you won't have troubles. Otherwise...: ```jshelllanguage $ gradle -version @@ -160,9 +160,9 @@ The "infrastructure stack" is composed of the following components: * One [Configuration server](https://docs.spring.io/spring-cloud-config/) is also used to centralise the configuration of our microservices. * The following microservices: API Gateway, Merchant BO, Fraud Detect, Smart Bank Gateway -If you run your application on GitPod, the following step are automatically started during the provisioning of your GitPod environment. +โ„น๏ธ If you run your application on GitPod, the following step are automatically started during the provisioning of your GitPod environment. 
-Otherwise, to run it on your desktop, execute the following commands +๐Ÿ› ๏ธ Otherwise, to run it on your desktop, execute the following commands ``` bash $ bash scripts/download-agent.sh @@ -175,7 +175,8 @@ $ ./gradlew tasks ``` bash $ docker compose up -d --build --remove-orphans ``` -To check if all the services are up, you can run this command: + +โœ… To check if all the services are up, you can run this command: ``` bash $ docker compose ps -a @@ -209,18 +210,26 @@ smartbank-gateway smartbank-gateway:latest "ja #### Validation -Open the [Eureka](https://cloud.spring.io/spring-cloud-netflix/) website started during the infrastructure setup. +โœ… Open the [Eureka](https://cloud.spring.io/spring-cloud-netflix/) website started during the infrastructure setup. The following instances should be registered with Eureka: + +* API-GATEWAY +* EASYPAY-SERVICE +* FRAUDETECT-SERVICE +* MERCHANT-BACKOFFICE +* SMARTBANK-GATEWAY -* If you run this workshop on your desktop, you can go to this URL: http://localhost:8761. -* If you run it on GitPod, you can go to the corresponding URL (e.g., https://8761-worldline-observability-w98vrd59k5h.ws-eu114.gitpod.io) instead. +> aside positive +> +> If you run this workshop on your desktop, you can go to this URL: [http://localhost:8761]. +> If you run it on GitPod, you can go to the corresponding URL (e.g., https://8761-worldline-observability-w98vrd59k5h.ws-eu114.gitpod.io) instead by going into the `PORTS` view. -You can now reach our platform to initiate a payment: +๐Ÿ› ๏ธ You can now access our platform to initiate a payment: ```bash $ http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=25000 ``` -You should get the following content: +โœ… You should get the following content: ```bash HTTP/1.1 201 Created @@ -255,9 +264,9 @@ One of our customers raised an issue: Normally the first thing to do is checking the logs. Before that, we will reproduce these issues. -You can check the API as following: +๐Ÿ› ๏ธ You can check the API as following: -For the ``AMOUNT_EXCEEDED`` error: +* For the ``AMOUNT_EXCEEDED`` error: ```bash $ http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=51000 @@ -284,7 +293,7 @@ transfer-encoding: chunked } ``` -And for the ``INVALID_CARD_NUMBER`` error: +* And for the ``INVALID_CARD_NUMBER`` error: ```bash $ http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780007 expiryDate=789456123 amount:=51000 @@ -312,10 +321,14 @@ transfer-encoding: chunked ``` -Go then to the log folder (``easypay-service/logs/``). -You should get these log entries in JSON format. -Open one of these files and check out the logs. -Look into the ``easypay-service-log.json`` to pinpoint these issues. +๐Ÿ› ๏ธ Itโ€™s time to have a look to our `easypay-service` logs! +Service is started in a Docker container. To get its output you can use the following command: + +```bash +$ docker compose logs -f easypay-service +``` + +๐Ÿ‘€ Look into the console `logs` to pinpoint these issues. 
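Before we improve these logs, a quick reminder of how they are produced: our Spring Boot services log through the [SLF4J](https://www.slf4j.org/) facade. The sketch below is only an illustration of that pattern; the class and messages are made up, not taken from the workshop code:

```java
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Illustrative class only: it shows the SLF4J pattern used across our services.
class PaymentChecks {

    // One static logger per class, named after the class itself
    private static final Logger LOG = LoggerFactory.getLogger(PaymentChecks.class);

    void check(String cardNumber) {
        // {} placeholders are formatted only if the WARN level is enabled
        LOG.warn("Card number {} did not pass validation", cardNumber);
    }
}
```

Keep this pattern in mind: every log statement we add in the next steps follows it.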
> aside positive > @@ -352,7 +365,8 @@ For instance, [SLF4J](https://www.slf4j.org/) offers the [following log levels b #### ``AMOUNT_EXCEEDED`` issue -Go the ``easypay-service/src/main/java/com/worldline/easypay/payment/control/bank/BankAuthorService.java`` class and modify the following code block +๐Ÿ“ Go the ``easypay-service/src/main/java/com/worldline/easypay/payment/control/bank/BankAuthorService.java`` class and modify the following code block + ```java @Retry(name = "BankAuthorService", fallbackMethod = "acceptByDelegation") public boolean authorize(PaymentProcessingContext context) { @@ -370,10 +384,20 @@ public boolean authorize(PaymentProcessingContext context) { } ``` -Modify the exception trace to provide contextual information such as the authorId, the status of the call and the result. +Add a log before the `return context.authorized;` instruction to provide contextual information such as the authorId and the result of the call (authorized or not). + +For the lazy, you can write something like that: +```java +[...] +context.bankCalled = true; +context.authorId = Optional.of(response.authorId()); +context.authorized = response.authorized(); +log.info("Bank answered payment authorization with authorId={}, authorized={}", response.authorId(), response.authorized()); +return context.authorized; +``` #### ``INVALID_CARD_NUMBER`` issue -Go to the ``easypay-service/src/main/java/com/worldline/easypay/payment/control/CardValidator.java`` class and modify the following block code in the process method in the same way: +๐Ÿ“ Go to the ``easypay-service/src/main/java/com/worldline/easypay/payment/control/PaymentService.java`` class and modify the following block code in the `process` method in the same way: ```java private void process(PaymentProcessingContext context) { @@ -391,50 +415,55 @@ private void process(PaymentProcessingContext context) { For this error, you can log the error with the following content: * The attributes of the ``PaymentProcessingContext `` -* The error message - +* An error message -You can also add more logs: +๐Ÿ“ You can also add more logs, especially in the ``easypay-service/src/main/java/com/worldline/easypay/payment/control/CardValidator.java`` class. +You may simply uncomment all the commented logs lines (``// LOG.โ€ฆ``), such as: -In the ``CarValidator.checkLunKey()`` method, you can add a warn message when the key is not valid. -For instance: +* In the ``CarValidator.checkLunKey()`` method, add a warning message when the key is not valid. For instance: ```java -log.warn("checkLunKey KO: {}",cardNumber); +LOG.warn("Check card number Luhn key does not pass: {}", cardNumber); ``` -In the ``CarValidator.checkExpiryDate()`` method, you can add a warn message when a ``DateTimeParseException`` is thrown. -For instance: +* In the ``CarValidator.checkExpiryDate()`` method, add a warning message when a ``DateTimeParseException`` is thrown. For instance: ```java -log.warn("checkExpiryDate KO: bad format {}",expiryDate); +LOG.warn("Check card expiry date does not pass: bad format: {}", expiryDate); ``` -You can go further and add as many log you think it would help in production. +* And do the same for other logsโ€ฆ + +You can go further and add as many logs as you think would help in production ๐Ÿ˜Ž + +#### Additional logs + +We can also log payment requests and responses to provide even more context, which could be helpful for following requests in this workshop. 
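Such request/response logging typically looks like the sketch below. It is an illustration only: the exact messages, accessors, and helper methods (`doProcess` is hypothetical) differ from the actual `PaymentResource` code:

```java
@PostMapping
public ResponseEntity<PaymentResponse> processPayment(PaymentRequest paymentRequest) {
    // Log the incoming request with its business context (assumed record accessors)
    LOG.info("Payment request received: posId={}, amount={}",
            paymentRequest.posId(), paymentRequest.amount());

    ResponseEntity<PaymentResponse> httpResponse = doProcess(paymentRequest); // hypothetical helper

    // Log the outcome so a request and its response can be matched in the logs
    LOG.info("Payment request processed: status={}", httpResponse.getStatusCode());
    return httpResponse;
}
```

Logging both ends of the call is what will let us follow a single payment through interleaved output later on.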
+ +๐Ÿ“ Modify the `easypay-service/src/main/java/com/worldline/easypay/payment/boundary/PaymentResource.java` class by uncommenting all `// LOG.โ€ฆ` lines (keep MDC lines for later ๐Ÿ˜‰). ### Check your code -You can restart your easy pay service running the following commands: +๐Ÿ› ๏ธ You can redeploy the `easypay-service` with the following commands: ```bash -$ docker compose down easypay-service $ docker compose build easypay-service $ docker compose up -d easypay-service ``` -Now you can run the same commands ran earlier and check again the logs. +๐Ÿ› ๏ธ Now you can run the same commands ran earlier and check again the logs (`http POSTโ€ฆ`). ### A technical issue -Another issue was raised for the POS (Point of Sell) ``POS-02``. +Another issue was raised for the POS (Point of Sell) ``POS-02`` (but we didnโ€™t know yet!). -When you reach the API using this command: +๐Ÿ› ๏ธ When you reach the API using this command: ```bash http POST :8080/api/easypay/payments posId=POS-02 cardNumber=5555567898780008 expiryDate=789456123 amount:=25000 ``` -With the following output: +๐Ÿ‘€ You should get the following output: ````bash @@ -449,7 +478,7 @@ HTTP/1.1 500 } ```` -You then get the following log message: +๐Ÿ‘€ You then get the following log message: ```bash 2024-06-05T15:45:35.215+02:00 ERROR 135386 --- [easypay-service] [o-auto-1-exec-7] o.a.c.c.C.[.[.[/].[dispatcherServlet] : Servlet.service() for servlet [dispatcherServlet] in context with path [] threw exception [Request processing failed: java.lang.NullPointerException: Cannot invoke "java.lang.Boolean.booleanValue()" because "java.util.List.get(int).active" is null] with root cause @@ -466,9 +495,7 @@ java.lang.NullPointerException: Cannot invoke "java.lang.Boolean.booleanValue()" [...] ``` -To find the root cause, add first a _smart_ log entry in the ``easypay-service/src/main/java/com/worldline/easypay/payment/control/PosValidator.java`` class. - -In the ``isActive()`` method, catch the exception and trace the error: +๐Ÿ“ To find the root cause, add first a _smart_ log entry in the ``easypay-service/src/main/java/com/worldline/easypay/payment/control/PosValidator.java`` class. In the ``isActive()`` method, catch the exception and trace the error: ```java public boolean isActive(String posId) { @@ -494,49 +521,61 @@ public boolean isActive(String posId) { } } ``` - -You can also prevent this issue by simply fixing the SQL import file. -In the file ``easypay-service/src/main/resources/db/postgresql/data.sql``, modify the implied line for ``POS-02`` from: +This is most likely an issue with some data in the databaseโ€ฆ Thanks to logs, we may quickly get the idea that the problem comes from the point of sale with ID `POS-02`. + +It is easy to identify the issue if we donโ€™t have traffic against our application, but what if we have more realistic traffic? -```sql -INSERT INTO pos_ref(id, pos_id, location, active) VALUES (2, 'POS-02', 'Blois France', NULL) ON CONFLICT DO NOTHING; +๐Ÿ› ๏ธ Generate some load with `k6` (a Grafana tool for load testing): + +```bash +$ k6 run -u 5 -d 30s k6/01-payment-only.js ``` -to +๐Ÿ‘€ Check again logs: -```sql -INSERT INTO pos_ref(id, pos_id, location, active) VALUES (2, 'POS-02', 'Blois France', true) ON CONFLICT DO NOTHING; +```bash +$ docker compose logs -f easypay-service ``` +Logs are now interleaved due to several requests being executed concurrently. It becomes harder to identify which point of sale is causing this error... 
+ +๐Ÿ‘€ If you look into the SQL import file (`easypay-service/src/main/resources/db/postgresql/data.sql`), you'll notice a `NULL` value instead of a boolean for the `active` column. + +> aside negative +> +> Letโ€™s keep the issue as it is for now. + ### Using Mapped Diagnostic Context (MDC) to get more insights > aside positive > > Mapped Diagnostic Context (MDC) will help us add more context on every log output. For more information, refer to this web page: [https://logback.qos.ch/manual/mdc.html](https://logback.qos.ch/manual/mdc.html). -Go to the ``PaymentResource`` class and modify the method ``processPayment()`` to instantiate the [MDC](https://logback.qos.ch/manual/mdc.html): - +๐Ÿ“ Go to the ``PaymentResource`` class and modify the method ``processPayment()`` to instantiate the [MDC](https://logback.qos.ch/manual/mdc.html): ```java public ResponseEntity processPayment(PaymentRequest paymentRequest) + MDC.put("CardNumber",paymentRequest.cardNumber()); + MDC.put("POS",paymentRequest.posId()); -MDC.put("CardNumber",paymentRequest.cardNumber()); -MDC.put("POS",paymentRequest.posId()); - -[...] -MDC.clear(); + [...] + MDC.clear(); return httpResponse; ``` -Go to the MDC spring profile configuration file (``easypay-service/src/main/resources/application-mdc.properties``) and check the configuration of both the ``CardNumber`` & ``POS``fields. +๐Ÿ‘€ Go to the MDC spring profile configuration file (``easypay-service/src/main/resources/application-mdc.properties``) and check the configuration of both the ``CardNumber`` & ``POS``fields. ```properties [...] %clr(CardNumber=){faint}%clr(%X{CardNumber:-null}) %clr(POS=){faint}%clr(%X{POS:-null}) [...] ``` -Activate the ``mdc`` profile in the ``compose.yml`` file: +> aside positive +> +> `%X` allows to print the content of a MDC map. + +๐Ÿ“ Activate the ``mdc`` profile in the ``compose.yml`` file: ```yaml easypay-service: @@ -545,48 +584,24 @@ Activate the ``mdc`` profile in the ``compose.yml`` file: SPRING_PROFILES_ACTIVE: default,docker,mdc ``` -> aside positive -> To apply these modifications, we must restart the ``easypay-service``. -> It will be done later. -> -### Adding more content in our logs - -To have more logs, we will run several HTTP requests using [K6](https://k6.io/): - -Run the following command: +๐Ÿ› ๏ธ Rebuild and redeploy the `easypay-service`: ```bash -$ k6 run -u 5 -d 5s k6/01-payment-only.js +$ docker compose build easypay-service +$ docker compose up -d easypay-service ``` -Check then the logs to pinpoint some exceptions. - -### Personal Identifiable Information (PII) obfuscation -For compliance and preventing personal data loss, we will obfuscate the card number in the logs: - -In the Alloy configuration file (``docker/alloy/config.alloy``), uncomment the [luhn stage](https://grafana.com/docs/alloy/latest/reference/components/loki.process/#stageluhn-block). - -``` -/*stage.luhn { -min_length = 13 -replacement = "**MASKED**" -} -*/ -``` +### Adding more content in our logs -Rebuild/Restart then the whole platform: +๐Ÿ› ๏ธ To have more logs, we will run several HTTP requests using [K6](https://k6.io/). Run the following command: ```bash -$ docker compose down -$ docker compose up -d --build --remove-orphans +$ k6 run -u 5 -d 5s k6/01-payment-only.js ``` +๐Ÿ‘€ Check then the logs to pinpoint some exceptions. +### Logs Correlation -> aside positive -> -> During this workshop, we will only obfuscate the card numbers in Loki. 
It will therefore be stored as is in the log files but obfuscated in Loki, and thus in the data exposed through Grafana.

### Logs Correlation

> aside positive
>
> You are probably wondering how to debug smartly in production when you have plenty of logs coming from several users and many different transactions.
>
> One approach is to use the Mapped Diagnostic Context (MDC) together with correlation IDs.

### Let's dive into our logs on Grafana!

Logs are stored in the `logs` folder. You should find a `.log.json` file for each service in our application.
We have configured our Spring Boot applications to output logs:

* In the console (the one we can see),
* But also in a file, in JSON format.

> aside positive
>
> Structured logging may be less readable for humans, but it is perfect for log concentrators as it is easy to parse!
> You can look at a `logback-spring.xml` file to see the configuration we used (in the `src/main/resources` directory of one of the modules).

๐Ÿ‘€ You can take a look at log files such as `easypay-service.log.json` to see their structure: a lot of information is available!

In this section we will use [Promtail to broadcast them to Loki](https://grafana.com/grafana/dashboards/14055-loki-stack-monitoring-promtail-loki/) through [Grafana Alloy (OTEL Collector)](https://grafana.com/docs/alloy/latest/).
[Loki](https://grafana.com/oss/loki/), our storage backend, is listening on host `loki` and port `3100`.

#### Configure Grafana Alloy collector

๐Ÿ“ Add the following logging configuration to the ``docker/alloy/config.alloy`` file:

```hcl
////////////////////
// LOGS
////////////////////

// JSON LOG FILES (1)
local.file_match "jsonlogs" {
  path_targets = [{"__path__" = "/logs/*.json", "exporter" = "JSONFILE"}]
}

// (2)
loki.source.file "jsonlogfiles" {
  targets    = local.file_match.jsonlogs.targets
  forward_to = [loki.process.jsonlogs.receiver]
}

// (3)
loki.process "jsonlogs" {
  forward_to = [loki.write.endpoint.receiver]

  // (4)
  stage.json {
    expressions = {
      timestamp   = "timestamp",
      application = "context.properties.applicationName",
      instance    = "context.properties.instance",
      level       = "level",
    }
  }

  // (5)
  stage.labels {
    values = {
      application = "application",
      instance    = "instance",
      level       = "level",
    }
  }

  // (6)
  stage.timestamp {
    source           = "timestamp"
    format           = "RFC3339"
    fallback_formats = ["UnixMs",]
  }
}

// (7)
// EXPORTER (LOKI)
loki.write "endpoint" {
  endpoint {
    url = "http://loki:3100/loki/api/v1/push"
  }
}
```

Thanks to this configuration, the JSON files are automatically grabbed and broadcast to Loki. Here is a short summary of the numbered steps above:

1. 
List the input files we are interested in (all `.json` files in the `logs` directory)
2. Read log entries from the files defined in (1) and forward them to Alloyโ€™s Loki components
3. Process log entries to parse them, extract data, transform them, and filter log entries
4. Extract contextual information from each JSON log entry, such as the application name
5. Follow up the previous step by applying labels which will be available in Loki
6. Extract the timestamp from the log entry itself instead of using the Promtail scrape time
7. Define the output to Loki

๐Ÿ› ๏ธ Restart the Grafana Alloy collector:

```bash
$ docker compose restart collector
```

๐Ÿ‘€ Open a browser page to the Alloy UI (`port 12345`):

* Check out first the page ``http://localhost:12345/graph``,
* Select the ``loki.source.file`` component named `jsonlogfiles`,
* Check all the targets.

#### Explore logs with Grafana

> aside positive
>
> For this workshop, we have already configured the Loki datasource in Grafana.
> You can take a look at its configuration in Grafana (port `3000`) by navigating to the `Connections` > `Data sources` section.
> We only set up the Loki server URL.

๐Ÿ› ๏ธ Go to Grafana (port `3000`):

* Open an `Explore` dashboard,
* Select the Loki datasource.

Grafana offers a form to help you build your queries:

* Filtering labels,
* Finding text in logs,
* Parsing logs to extract and filter values,
* ...

You can also use a dedicated query language to write your queries directly: it is called [LogQL](https://grafana.com/docs/loki/latest/query/).

๐Ÿ› ๏ธ Letโ€™s get logs from the `easypay-service`:

* In the `Label filter`, select the application as ``easypay-service``,
* Click on ``Run Query``,
* Check out the logs at the bottom of the view and unfold some of them.

Now we want to display logs in a table format with the timestamp, the log level and the message.
In order to do that, we should extract the message from the JSON, as this is not done automatically.

๐Ÿ› ๏ธ Modify the query as follows:

* Keep the same `Label filters` for the application `easypay-service`,
* Click on `+ Operations` and select `Formats` > `Json`: this will let us parse the log as JSON,
* Click on `+ Expression` to extract the JSON `message` field,
* Type `message="message"` to extract the value into a field with the same name,
* Click on `Run query` and unfold a log: you should see a new field named `message` in the `Fields` group.

๐Ÿ› ๏ธ Time to format:

* Click on `Table` just below the `Logs volume` graph, to the right,
* Select the `Time`, `level` and `message` fields.

๐Ÿ› ๏ธ Do not hesitate to hit the easypay payment endpoint with curl/httpie or k6 to generate some logs (whichever you prefer):

```bash
http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000
# OR
k6 run -u 1 -d 2m k6/01-payment-only.js
```

๐Ÿ‘€ You can also view logs for the other services (e.g., ``api-gateway``).

๐Ÿ‘€ You can also search logs based on the correlation ID.

Maybe another issue? Do you see the card numbers? ๐Ÿ˜จ

### Personally Identifiable Information (PII) obfuscation

For compliance and to prevent personal data loss, we will obfuscate the card number in the logs.
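Rather than relying on a brittle regular expression, the `luhn` processing stage we are about to add scans each log line for digit sequences that satisfy the Luhn checksum, the standard validity check for payment card numbers and the same one our card validator performs. For reference, a minimal sketch of that checksum:

```java
// Minimal Luhn checksum sketch: a digit string is a plausible card number
// when its weighted digit sum is a multiple of 10.
static boolean passesLuhn(String digits) {
    int sum = 0;
    boolean doubleDigit = false; // double every second digit, starting from the right
    for (int i = digits.length() - 1; i >= 0; i--) {
        int d = digits.charAt(i) - '0';
        if (doubleDigit) {
            d *= 2;
            if (d > 9) {
                d -= 9; // same as summing the two digits of the product
            }
        }
        sum += d;
        doubleDigit = !doubleDigit;
    }
    return sum % 10 == 0;
}
```

Any digit run in a log line that is long enough and passes this check is treated as a card number and masked.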
-In the label filter, select the application as ``easypay-service`` and click on ``Run Query``. +๐Ÿ› ๏ธ In the Alloy configuration file (``docker/alloy/config.alloy``), add a [luhn stage](https://grafana.com/docs/alloy/latest/reference/components/loki.process/#stageluhn-block) at the end of the `loki.process "jsonlogs"` block definition: -Add then a JSON parser operation , click on ``Run query`` again and check out the logs. +```hcl +loki.process "jsonlogs" { + //... + stage.luhn { + min_length = 13 + replacement = "**MASKED**" + } +} +``` -Additionally, you can add these expressions in the JSON parser operation box: +๐Ÿ› ๏ธ Restart the collector: -* Expression: ``message="message"`` -* Level: ``level="level"`` +```bash +$ docker compose restart collector +``` -Check out the logs again, view it now as a table. +๐Ÿ› ๏ธ Generate some logs with curl/httpie or k6. -You can also view traces for the other services (e.g., ``api-gateway``) +โœ… Check the card numbers are now obfuscated with the ``**MASKED**`` content. -Finally, you can search logs based on the correlation ID +> aside positive +> +> During this workshop, we will only obfuscate the card numbers in Loki. It will therefore be stored as is in the log files but obfuscated in Loki and by this way in the data exposed on Grafana. ## Metrics @@ -704,25 +802,25 @@ Letโ€™s take control of our applicationโ€™s metrics! ### Metrics exposed by the application -Check out the ``easypay-service`` metrics definitions exposed by Spring Boot Actuator first: +๐Ÿ› ๏ธ Check out the ``easypay-service`` metrics definitions exposed by Spring Boot Actuator first: ```bash -http :8080/actuator/metrics +$ http :8080/actuator/metrics ``` -Explore the output. +๐Ÿ‘€ Explore the output. -Now get the Prometheus metrics using this command: +๐Ÿ› ๏ธ Now get the Prometheus metrics using this command: ```bash -http :8080/actuator/prometheus +$ http :8080/actuator/prometheus ``` This is an endpoint exposed by Actuator to let the Prometheus server get your application metrics. ### How are metrics scraped? -Check out the Prometheus (``docker/prometheus/prometheus.yml``) configuration file. +๐Ÿ‘€ Check out the Prometheus (``docker/prometheus/prometheus.yml``) configuration file. All the scraper's definitions are configured here. > aside positive @@ -762,7 +860,7 @@ Hopefully, Prometheus is also able to query our ``discovery-server`` (Eureka ser 1. We plugged Prometheus to our Eureka `discovery-server` to explore all the metrics of the underlying systems 2. Configuration allows additional operations, such as relabelling the final metric before storing it into Prometehus -You can have an overview of all the scraped applications on the Prometheus dashboard: +๐Ÿ‘€ You can have an overview of all the scraped applications on the Prometheus dashboard: * Go to ``http://localhost:9090`` if you started the stack locally, or use the link provided by GitPod in the `PORTS` view for port `9090`, * Click on ``Status`` > ``Targets``, @@ -772,7 +870,7 @@ You can have an overview of all the scraped applications on the Prometheus dashb ### Add scrape configuration for our easypay service -Modify the ``docker/prometheus/prometheus.yml`` file to add a new configuration to scrape the easypay service. +๐Ÿ“ Modify the ``docker/prometheus/prometheus.yml`` file to add a new configuration to scrape the easypay service. 
You can use the ``prometheus-config-server`` configuration as a model: * Job name: ``prometheus-easypay-service`` @@ -792,13 +890,13 @@ That should give you the following yaml configuration: - easypay-service:8080 ``` -Restart the Prometheus server to take into account this new configuration: +๐Ÿ› ๏ธ Restart the Prometheus server to take into account this new configuration: ```bash -docker compose restart prometheus +$ docker compose restart prometheus ``` -Now explore again the targets (``Status`` > ``Targets``) on the Prometheus dashboard (``port 9090``). +๐Ÿ‘€ Now explore again the targets (``Status`` > ``Targets``) on the Prometheus dashboard (``port 9090``). > aside positive > @@ -813,22 +911,21 @@ Now explore again the targets (``Status`` > ``Targets``) on the Prometheus dashb > You can have a look at its configuration in Grafana (``port 3000``) in the ``Connections`` > ``Data sources`` section. > It is pretty straightforward as we have only setup the Prometheus server URL. -* Go to Grafana and start again an ``Explore`` dashboard. +๐Ÿ› ๏ธ Go to Grafana and start again an ``Explore`` dashboard. -* Select the ``Prometheus`` datasource instead of the ``Loki`` one. +๐Ÿ› ๏ธ Select the ``Prometheus`` datasource instead of the ``Loki`` one. In this section you will hand on the metrics query builder of Grafana. The ``Metric`` field lists all the metrics available in the Prometheus server: take time to explore them. -* For example, you can select the metric named ``jvm_memory_used_bytes``, and click on the ``Run query`` button to plot the memory usage of all your services by memory area, +๐Ÿ› ๏ธ For example, you can select the metric named ``jvm_memory_used_bytes``, and click on the ``Run query`` button to plot the memory usage of all your services by memory area, -* If you want to plot the total memory usage of your services: +๐Ÿ› ๏ธ If you want to plot the total memory usage of your services: * Click on ``Operations`` and select ``Aggregations`` > ``Sum``, and ``Run query``: you obtain the whole memory consumption of all your JVMs, * To split the memory usage per service, you can click on the ``By label`` button and select the label named ``application`` (do not forget to click on ``Run query`` afterthat). -* You can also filter metrics to be displayed using ``Label filters``: try to create a filter to display only the metric related to the application named easypay-service. -* Check the card numbers are now obfuscated with the ``**MASKED**`` content. +๐Ÿ› ๏ธ You can also filter metrics to be displayed using ``Label filters``: try to create a filter to display only the metric related to the application named easypay-service. > aside positive > @@ -845,7 +942,7 @@ We will choose the second solution right now and import the following dashboards * [JVM Micrometer](https://grafana.com/grafana/dashboards/12271-jvm-micrometer/), which ID is `12271`, * [Spring Boot JDBC & HikariCP](https://grafana.com/grafana/dashboards/20729-spring-boot-jdbc-hikaricp/), which ID is `20729`. -To import these dashboards: +๐Ÿ› ๏ธ To import these dashboards: * Go to Grafana (``port 3000``), and select the ``Dashboards`` section on the left, * Then click on ``New`` (top right), and click on ``Import``, * In the ``Find and importโ€ฆ`` field, just paste the ID of the dashboard and click on ``Load``, @@ -856,40 +953,40 @@ To import these dashboards: > > Imported dashboards are available directly from the ``Dashboards`` section of Grafana. -Explore the ``JVM Micrometer`` dashboard: it works almost out of box. 
+๐Ÿ‘€ Explore the ``JVM Micrometer`` dashboard: it works almost out of box. It contains a lot of useful information about JVMs running our services. The ``application`` filter (top of the dashboard) let you select the service you want to explore metrics. ### Incident! -Now let's simulate some traffic using [Grafana K6](https://k6.io/). - -Run the following command: +๐Ÿ› ๏ธ Now let's simulate some traffic using [Grafana K6](https://k6.io/). Run the following command: ```bash $ k6 run -u 5 -d 2m k6/02-payment-smartbank.js ``` -Go back to the Grafana dashboard, click on ``Dashboards`` and select ``JVM Micrometer``. +๐Ÿ‘€ Go back to the Grafana dashboard, click on ``Dashboards`` and select ``JVM Micrometer``: + +* Explore the dashboard for the ``easypay-service``, especially the Garbage collector and CPU statistics. -Explore the dashboard for the ``easypay-service``, especially the Garbage collector and CPU statistics. +* Look around the other ``Spring Boot JDBC & HikariCP`` dashboard then and see what happens on the database connection pool for ``easypay-service``. -Look around the other ``Spring Boot JDBC & HikariCP`` dashboard then and see what happens on the database connection pool for ``easypay-service``. +We were talking about an incident, isnโ€™t it? -We were talking about an incident, isnโ€™t it? Let's go back to the Explore view of Grafana, select Loki as a data source and see what happens! +๐Ÿ‘€ Let's go back to the Explore view of Grafana, select Loki as a data source and see what happens! -Create a query with the following parameters to get error logs of the ``smartbank-gateway`` service: +๐Ÿ› ๏ธ Create a query with the following parameters to get error logs of the ``smartbank-gateway`` service: * Label filters: ``application`` = ``smartbank-gateway`` * line contains/Json: ``expression``= ``level="level"`` * label filter expression: ``label`` = ``level ; ``operator`` = ``=~`` ; ``value`` = ``WARN|ERROR`` -Click on ``Run query`` and check out the logs. +๐Ÿ› ๏ธ Click on ``Run query`` and check out the logs. Normally you would get a ``java.lang.OutOfMemoryError`` due to a saturated Java heap space. -To get additional insights, you can go back to the JVM dashboard and select the ``smartbank-gateway`` application. +๐Ÿ‘€ To get additional insights, you can go back to the JVM dashboard and select the ``smartbank-gateway`` application. Normally you will see the used JVM Heap reaching the maximum allowed. @@ -935,7 +1032,7 @@ We need to declare two timers in our code: * ``processTimer`` to record the ``rivieradev.payment.process`` metric: it represents the payment processing time and record the time spent in the `process` method, * ``storeTimer`` to record the ``rivieradev.payment.store`` metric: it represents the time required to store a payment in database by recording the time spent in the `store` method. -Letโ€™s modify the ``com.worldline.easypay.payment.control.PaymentService`` class to declare them: +๐Ÿ“ Letโ€™s modify the ``com.worldline.easypay.payment.control.PaymentService`` class to declare them: ```java // ... @@ -977,7 +1074,7 @@ timer.record(() -> { }); ``` -Letโ€™s modify our `process` and `store` methods to record our latency with the new metrics. +๐Ÿ“ Letโ€™s modify our `process` and `store` methods to record our latency with the new metrics. We can simply wrap our original code in a Java `Runnable` functional interface: ```java @@ -1004,7 +1101,7 @@ We can simply wrap our original code in a Java `Runnable` functional interface: #### 3. 
Add counter -Letโ€™s do the same for the counter: +๐Ÿ“ Letโ€™s do the same for the counter: ```java // ... @@ -1028,7 +1125,7 @@ public class PaymentService { 1. Declares the counter, 2. Initializes the counter. -The method ``accept`` of the ``PaymentService`` class is invoked for each payment request, it is a good candidate to increment our counter: +๐Ÿ“ The method ``accept`` of the ``PaymentService`` class is invoked for each payment request, it is a good candidate to increment our counter: ```java @Transactional(Transactional.TxType.REQUIRED) @@ -1043,30 +1140,30 @@ The method ``accept`` of the ``PaymentService`` class is invoked for each paymen #### 4. Redeploy easypay -Rebuild the easypay-service: +๐Ÿ› ๏ธ Rebuild the easypay-service: ```bash -docker compose build easypay-service +$ docker compose build easypay-service ``` -Redeploy easypay: +๐Ÿ› ๏ธ Redeploy easypay: ```bash -docker compose up -d easypay-service +$ docker compose up -d easypay-service ``` -Once easypay is started (you can check logs with the ``docker compose logs -f easypay-service`` command and wait for an output like ``Started EasypayServiceApplication in 32.271 seconds``): +๐Ÿ› ๏ธ Once easypay is started (you can check logs with the ``docker compose logs -f easypay-service`` command and wait for an output like ``Started EasypayServiceApplication in 32.271 seconds``): * Execute some queries: ```bash -http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000 +$ http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000 ``` * Go into the container and query the ``actuator/prometheus`` endpoint to look at our new metrics: ```bash -docker compose exec -it easypay-service sh +$ docker compose exec -it easypay-service sh / $ curl http://localhost:8080/actuator/prometheus | grep riviera ``` @@ -1105,7 +1202,7 @@ Finally, our ``Counter`` is the last metric suffixed with ``_total``. As we are talking about latencies, you may be also interested in histograms to get the distribution of the events per buckets or percentiles values (the famous 0.99, 0.999โ€ฆ). Fortunately, ``Timers`` allow to compute [Histograms and Percentiles](https://docs.micrometer.io/micrometer/reference/concepts/histogram-quantiles.html)! -Modify the two timers as follows: +๐Ÿ“ Modify the two timers as follows: ```java // ... @@ -1134,7 +1231,7 @@ public class PaymentService { 1. Configures the ``Timer`` to publish a histogram allowing to compute aggregable percentiles server-side, 2. Exposes percentiles value computed from the application: **these value are not aggregable!** -Repeat the previous step `3. Redeploy easypay`. +๐Ÿ› ๏ธ Repeat the step 4 (redeploy `easypay-service` and get Prometheus metrics in-container). You should get way more metrics, especially a new one type suffixed with `_bucket`: @@ -1152,13 +1249,13 @@ rivieradev_payment_process_seconds_sum{application="easypay-service",instance="e rivieradev_payment_process_seconds_max{application="easypay-service",instance="easypay-service:a44149cd-937a-4e96-abc2-0770343e49bc",namespace="local"} 0.0 ``` -Each ` bucket ` contains the number of event which lasts less than the value defined in the ``le`` tag. +โ„น๏ธ Each ` bucket ` contains the number of event which lasts less than the value defined in the ``le`` tag. #### 6. Visualization -Go back to Grafana (`port 3000`), and go into the ``Dashboards`` section. 
+๐Ÿ› ๏ธ Go back to Grafana (`port 3000`), and go into the ``Dashboards`` section. -We will import the dashboard defined in the ``docker/grafana/dashboards/easypay-monitoring.json`` file: +๐Ÿ› ๏ธ We will import the dashboard defined in the ``docker/grafana/dashboards/easypay-monitoring.json`` file: * Click on ``New`` (top right), and select ``Import``, * In the ``Import via dashboard JSON model`` field, paste the content of the ``easypay-monitoring.json`` file and click on ``Load``, * Select Prometheus as a data source. @@ -1171,10 +1268,10 @@ It provides some dashboards we have created from the new metrics you exposed in * ``Payment Duration distribution``: represents the various percentiles of our application computed from the ``rivieradev_payment_process`` timer and its histogram, * ``Requests process performance`` and ``Requests store performance``: are a visualization of the buckets of the two timers we created previously. -You can generate some load to view your dashboards evolving live: +๐Ÿ› ๏ธ You can generate some load to view your dashboards evolving live: ```bash -k6 -u 2 -d 2m k6/01-payment-only.js +$ k6 -u 2 -d 2m k6/01-payment-only.js ``` > aside positive @@ -1224,7 +1321,7 @@ If you're using *GitPod*, the Java Agent should already be available in the `ins ๐Ÿ› ๏ธ If you are participating in this workshop on your workstation, or if the file is missing, you can run the following script to download it: ```bash -bash -x scripts/download-agent.sh +$ bash -x scripts/download-agent.sh ``` #### Enable Java Agent @@ -1242,12 +1339,12 @@ COPY instrumentation/grafana-opentelemetry-java.jar /app/grafana-opentelemetry-j ENTRYPOINT ["java", "-javaagent:/app/grafana-opentelemetry-java.jar", "-cp","app:app/lib/*","com.worldline.easypay.EasypayServiceApplication"] # (2) ``` -The ENTRYPOINT instruction specifies the default command that will be executed when the container starts. +The `ENTRYPOINT` instruction specifies the default command that will be executed when the container starts. ๐Ÿ› ๏ธ You can now build the updated easypay-service container image: ```bash -docker compose build easypay-service +$ docker compose build easypay-service ``` #### Configure Grafana Alloy @@ -1317,7 +1414,7 @@ services: ๐Ÿ› ๏ธ To apply the new settings, restart Grafana Alloy with the following command: ```bash -docker compose restart collector +$ docker compose restart collector ``` โœ… After restarting, verify that Grafana Alloy is up and running with the updated configuration by accessing the Alloy dashboard on port ``12345``. @@ -1325,13 +1422,13 @@ docker compose restart collector ๐Ÿ› ๏ธ Redeploy the updated ``easypay-service``: ```bash -docker compose up -d easypay-service +$ docker compose up -d easypay-service ``` โœ… To ensure easypay-service has started up correctly, check its logs with: ```bash -docker compose logs -f easypay-service +$ docker compose logs -f easypay-service ``` #### Explore Traces with Grafana @@ -1345,7 +1442,7 @@ docker compose logs -f easypay-service ๐Ÿ› ๏ธ Generate some load on the application to produce traces: ```bash -k6 run -u 1 -d 5m k6/01-payment-only.js +$ k6 run -u 1 -d 5m k6/01-payment-only.js ``` ๐Ÿ› ๏ธ Letโ€™s explore your first traces in Grafana: @@ -1396,7 +1493,6 @@ To avoid storing unnecessary data in Tempo, we can sample the data in two ways: In this workshop, we will implement Tail Sampling. 
Modify the Alloy configuration file (``docker/alloy/config.alloy``) as follows: -In the alloy configuration file (``docker/alloy/config.alloy``), uncomment this configuration just after the ``SAMPLING`` comment: ``` // ... // RECEIVER (OTLP) @@ -1542,8 +1638,8 @@ This will provide the whole PaymentProcessingContext into the trace. ๐Ÿ› ๏ธ As we did before: ```bash -docker compose build easypay-service -docker compose up -d easypay-service +$ docker compose build easypay-service +$ docker compose up -d easypay-service ``` #### 4. Test it! @@ -1551,7 +1647,7 @@ docker compose up -d easypay-service ๐Ÿ› ๏ธ Generate some payments: ```bash -http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000 +$ http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000 ``` ๐Ÿ‘€ Go back to Grafana and try to find your new traces using what you've learned previously. Observe the spans you added. @@ -1649,8 +1745,8 @@ This will provide the whole PaymentProcessingContext into the trace. ๐Ÿ› ๏ธ As we did before: ```bash -docker compose build easypay-service -docker compose up -d easypay-service +$ docker compose build easypay-service +$ docker compose up -d easypay-service ``` #### 4. Test it! @@ -1658,7 +1754,7 @@ docker compose up -d easypay-service ๐Ÿ› ๏ธ Generate some payments: ```bash -http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000 +$ http POST :8080/api/easypay/payments posId=POS-01 cardNumber=5555567898780008 expiryDate=789456123 amount:=40000 ``` ๐Ÿ‘€ Go back to Grafana and try to find your new traces using what you've learned previously. Observe the spans you added. @@ -1746,20 +1842,20 @@ Exemplars are annotations used in metrics that link specific occurrences, like l ๐Ÿ› ๏ธ Generate some load towards the `easypay-service`: ```bash -k6 run -u 1 -d 2m k6/01-payment-only.js +$ k6 run -u 1 -d 2m k6/01-payment-only.js ``` ๐Ÿ‘€ Now, let's see how exemplars are exported by our service: * Access the `easypay-service` container:: ```bash -docker compose exec -it easypay-service sh +$ docker compose exec -it easypay-service sh ``` * Query actuator for Prometheus metrics, but in the OpenMetrics format: ```bash -curl http://localhost:8080/actuator/metrics -H 'Accept: application/openmetrics-text' | grep 'trace_id' +$ curl http://localhost:8080/actuator/metrics -H 'Accept: application/openmetrics-text' | grep 'trace_id' ``` You should obtain metrics with the following format: From 057fe1e8a7aa24d96d18d2237d14b4b4cb5c9a7f Mon Sep 17 00:00:00 2001 From: David Pequegnot Date: Wed, 3 Jul 2024 20:15:17 +0000 Subject: [PATCH 4/5] fix: add same limit about global streams per user as previously --- docker/loki/local-config.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/loki/local-config.yaml b/docker/loki/local-config.yaml index 9246fa3..41d76c2 100644 --- a/docker/loki/local-config.yaml +++ b/docker/loki/local-config.yaml @@ -39,4 +39,7 @@ ruler: # # If you would like to disable reporting, uncomment the following lines: analytics: - reporting_enabled: false \ No newline at end of file + reporting_enabled: false + +limits_config: + max_global_streams_per_user: 10000 \ No newline at end of file From 65be58a2e0fc74a1890fcb666b557f4c6f33f12b Mon Sep 17 00:00:00 2001 From: David Pequegnot Date: Wed, 3 Jul 2024 20:18:48 +0000 Subject: [PATCH 5/5] feat: add iconography --- docs/workshop.md | 10 ++++++++++ 1 file 
changed, 10 insertions(+)

diff --git a/docs/workshop.md b/docs/workshop.md
index 1a5b2ae..19b1133 100644
--- a/docs/workshop.md
+++ b/docs/workshop.md
@@ -70,6 +70,16 @@ As mentioned earlier, our observability stack is composed of :
 
 In addition, the microservices are started with an agent to broadcast the traces to the collector.
 
+### Icons
+
+You will find the following icons during the workshop:
+
+๐Ÿ› ๏ธ An action to perform,
+๐Ÿ“ A file to modify,
+๐Ÿ‘€ Something to observe,
+โœ… Something to validate,
+โ„น๏ธ Some information.