diff --git a/src/PostgREST/Metrics.hs b/src/PostgREST/Metrics.hs index 7c1f7d1bc23..fc77f8efaaf 100644 --- a/src/PostgREST/Metrics.hs +++ b/src/PostgREST/Metrics.hs @@ -8,47 +8,49 @@ module PostgREST.Metrics import qualified Data.ByteString.Lazy as LBS import qualified Hasql.Pool.Observation as SQL -import qualified Prometheus as Prom +import Prometheus import PostgREST.Observation import Protolude data MetricsState = - MetricsState Prom.Counter Prom.Gauge Prom.Gauge Prom.Gauge Prom.Counter Prom.Gauge + MetricsState Counter Gauge Gauge Gauge (Vector Label1 Counter) Gauge init :: Int -> IO MetricsState init configDbPoolSize = do - poolTimeouts <- Prom.register $ Prom.counter (Prom.Info "pgrst_db_pool_timeouts_total" "The total number of pool connection timeouts") - poolAvailable <- Prom.register $ Prom.gauge (Prom.Info "pgrst_db_pool_available" "Available connections in the pool") - poolWaiting <- Prom.register $ Prom.gauge (Prom.Info "pgrst_db_pool_waiting" "Requests waiting to acquire a pool connection") - poolMaxSize <- Prom.register $ Prom.gauge (Prom.Info "pgrst_db_pool_max" "Max pool connections") - schemaCacheLoads <- Prom.register $ Prom.counter (Prom.Info "pgrst_schema_cache_loads_total" "The total number of times the schema cache was loaded") - schemaCacheQueryTime <- Prom.register $ Prom.gauge (Prom.Info "pgrst_schema_cache_query_time_seconds" "The query time in seconds of the last schema cache load") - Prom.setGauge poolMaxSize (fromIntegral configDbPoolSize) + poolTimeouts <- register $ counter (Info "pgrst_db_pool_timeouts_total" "The total number of pool connection timeouts") + poolAvailable <- register $ gauge (Info "pgrst_db_pool_available" "Available connections in the pool") + poolWaiting <- register $ gauge (Info "pgrst_db_pool_waiting" "Requests waiting to acquire a pool connection") + poolMaxSize <- register $ gauge (Info "pgrst_db_pool_max" "Max pool connections") + schemaCacheLoads <- register $ vector "status" $ counter (Info "pgrst_schema_cache_loads_total" "The total number of times the schema cache was loaded") + schemaCacheQueryTime <- register $ gauge (Info "pgrst_schema_cache_query_time_seconds" "The query time in seconds of the last schema cache load") + setGauge poolMaxSize (fromIntegral configDbPoolSize) pure $ MetricsState poolTimeouts poolAvailable poolWaiting poolMaxSize schemaCacheLoads schemaCacheQueryTime observationMetrics :: MetricsState -> ObservationHandler observationMetrics (MetricsState poolTimeouts poolAvailable poolWaiting _ schemaCacheLoads schemaCacheQueryTime) obs = case obs of (PoolAcqTimeoutObs _) -> do - Prom.incCounter poolTimeouts + incCounter poolTimeouts (HasqlPoolObs (SQL.ConnectionObservation _ status)) -> case status of SQL.ReadyForUseConnectionStatus -> do - Prom.incGauge poolAvailable + incGauge poolAvailable SQL.InUseConnectionStatus -> do - Prom.decGauge poolAvailable + decGauge poolAvailable SQL.TerminatedConnectionStatus _ -> do - Prom.decGauge poolAvailable + decGauge poolAvailable SQL.ConnectingConnectionStatus -> pure () PoolRequest -> - Prom.incGauge poolWaiting + incGauge poolWaiting PoolRequestFullfilled -> - Prom.decGauge poolWaiting + decGauge poolWaiting SchemaCacheLoadedObs resTime -> do - Prom.incCounter schemaCacheLoads - Prom.setGauge schemaCacheQueryTime resTime + withLabel schemaCacheLoads "SUCCESS" incCounter + setGauge schemaCacheQueryTime resTime + SchemaCacheNormalErrorObs _ -> do + withLabel schemaCacheLoads "FAIL" incCounter _ -> pure () metricsToText :: IO LBS.ByteString -metricsToText = Prom.exportMetricsAsText +metricsToText = exportMetricsAsText diff --git a/test/io/test_io.py b/test/io/test_io.py index e86e97632c3..641f5a756d8 100644 --- a/test/io/test_io.py +++ b/test/io/test_io.py @@ -732,6 +732,44 @@ def test_admin_ready_includes_schema_cache_state(defaultenv, metapostgrest): reset_statement_timeout(metapostgrest, role) +def test_metrics_include_schema_cache_fails(defaultenv, metapostgrest): + "Should get shema cache fails from the metrics endpoint" + + role = "timeout_authenticator" + + env = { + **defaultenv, + "PGUSER": role, + "PGRST_INTERNAL_SCHEMA_CACHE_SLEEP": "50", + } + + with run(env=env) as postgrest: + # The schema cache query takes at least 20ms, due to PGRST_INTERNAL_SCHEMA_CACHE_SLEEP above. + # Make it impossible to load the schema cache, by setting statement timeout to 100ms. + set_statement_timeout(metapostgrest, role, 20) + + # force a reconnection so the new role setting is picked up + postgrest.process.send_signal(signal.SIGUSR1) + + # wait for some schema cache retries + time.sleep(1) + + response = postgrest.admin.get("/ready", timeout=1) + assert response.status_code == 503 + + response = postgrest.admin.get("/metrics", timeout=1) + assert response.status_code == 200 + + metrics = float( + re.search( + r'pgrst_schema_cache_loads_total{status="FAIL"} (\d+)', response.text + ).group(1) + ) + assert metrics > 3.0 + + reset_statement_timeout(metapostgrest, role) + + def test_admin_not_found(defaultenv): "Should get a not found from a undefined endpoint on the admin server" @@ -1448,7 +1486,7 @@ def test_admin_metrics(defaultenv): response = postgrest.admin.get("/metrics") assert response.status_code == 200 assert "pgrst_schema_cache_query_time_seconds" in response.text - assert "pgrst_schema_cache_loads_total" in response.text + assert 'pgrst_schema_cache_loads_total{status="SUCCESS"}' in response.text assert "pgrst_db_pool_max" in response.text assert "pgrst_db_pool_waiting" in response.text assert "pgrst_db_pool_available" in response.text