Skip to content

Commit

Permalink
feat: add metric label for scache load
Browse files Browse the repository at this point in the history
localhost:3001/metrics now includes:

pgrst_schema_cache_loads_total{status="FAIL"} 352.0
pgrst_schema_cache_loads_total{status="SUCCESS"} 3.0

This allows testing the failure case on:

#3424 (comment)
  • Loading branch information
steve-chavez committed May 14, 2024
1 parent f6b2aa5 commit 9f8e3d8
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 19 deletions.
38 changes: 20 additions & 18 deletions src/PostgREST/Metrics.hs
Original file line number Diff line number Diff line change
Expand Up @@ -8,47 +8,49 @@ module PostgREST.Metrics
import qualified Data.ByteString.Lazy as LBS
import qualified Hasql.Pool.Observation as SQL

import qualified Prometheus as Prom
import Prometheus

import PostgREST.Observation

import Protolude

data MetricsState =
MetricsState Prom.Counter Prom.Gauge Prom.Gauge Prom.Gauge Prom.Counter Prom.Gauge
MetricsState Counter Gauge Gauge Gauge (Vector Label1 Counter) Gauge

init :: Int -> IO MetricsState
init configDbPoolSize = do
poolTimeouts <- Prom.register $ Prom.counter (Prom.Info "pgrst_db_pool_timeouts_total" "The total number of pool connection timeouts")
poolAvailable <- Prom.register $ Prom.gauge (Prom.Info "pgrst_db_pool_available" "Available connections in the pool")
poolWaiting <- Prom.register $ Prom.gauge (Prom.Info "pgrst_db_pool_waiting" "Requests waiting to acquire a pool connection")
poolMaxSize <- Prom.register $ Prom.gauge (Prom.Info "pgrst_db_pool_max" "Max pool connections")
schemaCacheLoads <- Prom.register $ Prom.counter (Prom.Info "pgrst_schema_cache_loads_total" "The total number of times the schema cache was loaded")
schemaCacheQueryTime <- Prom.register $ Prom.gauge (Prom.Info "pgrst_schema_cache_query_time_seconds" "The query time in seconds of the last schema cache load")
Prom.setGauge poolMaxSize (fromIntegral configDbPoolSize)
poolTimeouts <- register $ counter (Info "pgrst_db_pool_timeouts_total" "The total number of pool connection timeouts")
poolAvailable <- register $ gauge (Info "pgrst_db_pool_available" "Available connections in the pool")
poolWaiting <- register $ gauge (Info "pgrst_db_pool_waiting" "Requests waiting to acquire a pool connection")
poolMaxSize <- register $ gauge (Info "pgrst_db_pool_max" "Max pool connections")
schemaCacheLoads <- register $ vector "status" $ counter (Info "pgrst_schema_cache_loads_total" "The total number of times the schema cache was loaded")
schemaCacheQueryTime <- register $ gauge (Info "pgrst_schema_cache_query_time_seconds" "The query time in seconds of the last schema cache load")
setGauge poolMaxSize (fromIntegral configDbPoolSize)
pure $ MetricsState poolTimeouts poolAvailable poolWaiting poolMaxSize schemaCacheLoads schemaCacheQueryTime

observationMetrics :: MetricsState -> ObservationHandler
observationMetrics (MetricsState poolTimeouts poolAvailable poolWaiting _ schemaCacheLoads schemaCacheQueryTime) obs = case obs of
(PoolAcqTimeoutObs _) -> do
Prom.incCounter poolTimeouts
incCounter poolTimeouts
(HasqlPoolObs (SQL.ConnectionObservation _ status)) -> case status of
SQL.ReadyForUseConnectionStatus -> do
Prom.incGauge poolAvailable
incGauge poolAvailable
SQL.InUseConnectionStatus -> do
Prom.decGauge poolAvailable
decGauge poolAvailable
SQL.TerminatedConnectionStatus _ -> do
Prom.decGauge poolAvailable
decGauge poolAvailable
SQL.ConnectingConnectionStatus -> pure ()
PoolRequest ->
Prom.incGauge poolWaiting
incGauge poolWaiting
PoolRequestFullfilled ->
Prom.decGauge poolWaiting
decGauge poolWaiting
SchemaCacheLoadedObs resTime -> do
Prom.incCounter schemaCacheLoads
Prom.setGauge schemaCacheQueryTime resTime
withLabel schemaCacheLoads "SUCCESS" incCounter
setGauge schemaCacheQueryTime resTime
SchemaCacheNormalErrorObs _ -> do
withLabel schemaCacheLoads "FAIL" incCounter
_ ->
pure ()

metricsToText :: IO LBS.ByteString
metricsToText = Prom.exportMetricsAsText
metricsToText = exportMetricsAsText
40 changes: 39 additions & 1 deletion test/io/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,44 @@ def test_admin_ready_includes_schema_cache_state(defaultenv, metapostgrest):
reset_statement_timeout(metapostgrest, role)


def test_metrics_include_schema_cache_fails(defaultenv, metapostgrest):
"Should get shema cache fails from the metrics endpoint"

role = "timeout_authenticator"

env = {
**defaultenv,
"PGUSER": role,
"PGRST_INTERNAL_SCHEMA_CACHE_SLEEP": "50",
}

with run(env=env) as postgrest:
# The schema cache query takes at least 20ms, due to PGRST_INTERNAL_SCHEMA_CACHE_SLEEP above.
# Make it impossible to load the schema cache, by setting statement timeout to 100ms.
set_statement_timeout(metapostgrest, role, 20)

# force a reconnection so the new role setting is picked up
postgrest.process.send_signal(signal.SIGUSR1)

# wait for some schema cache retries
time.sleep(1)

response = postgrest.admin.get("/ready", timeout=1)
assert response.status_code == 503

response = postgrest.admin.get("/metrics", timeout=1)
assert response.status_code == 200

metrics = float(
re.search(
r'pgrst_schema_cache_loads_total{status="FAIL"} (\d+)', response.text
).group(1)
)
assert metrics > 3.0

reset_statement_timeout(metapostgrest, role)


def test_admin_not_found(defaultenv):
"Should get a not found from a undefined endpoint on the admin server"

Expand Down Expand Up @@ -1448,7 +1486,7 @@ def test_admin_metrics(defaultenv):
response = postgrest.admin.get("/metrics")
assert response.status_code == 200
assert "pgrst_schema_cache_query_time_seconds" in response.text
assert "pgrst_schema_cache_loads_total" in response.text
assert 'pgrst_schema_cache_loads_total{status="SUCCESS"}' in response.text
assert "pgrst_db_pool_max" in response.text
assert "pgrst_db_pool_waiting" in response.text
assert "pgrst_db_pool_available" in response.text
Expand Down

0 comments on commit 9f8e3d8

Please sign in to comment.