diff --git a/.github/gh_matrix_builder.py b/.github/gh_matrix_builder.py index 6d9a8606a40..1e24ff9af77 100755 --- a/.github/gh_matrix_builder.py +++ b/.github/gh_matrix_builder.py @@ -53,6 +53,13 @@ "memoize", } +# Tests that we do not run as part of a Flake tests +flaky_exclude_tests = { + # Not executed as a flake test since it easily exhausts available + # background worker slots. + "bgw_launcher", +} + # helper functions to generate matrix entries # the release and apache config inherit from the @@ -309,11 +316,13 @@ def macos_config(overrides): sys.exit(1) if tests: + flake_tests = [t for t in tests if t not in flaky_exclude_tests] + installcheck_args = f'TESTS="{" ".join(flake_tests * 20)}"' m["include"].append( build_debug_config( { "coverage": False, - "installcheck_args": f'TESTS="{" ".join(list(tests) * 20)}"', + "installcheck_args": installcheck_args, "name": "Flaky Check Debug", "pg": PG16_LATEST, "pginstallcheck": False, @@ -324,7 +333,7 @@ def macos_config(overrides): build_debug_config( { "coverage": False, - "installcheck_args": f'TESTS="{" ".join(list(tests) * 20)}"', + "installcheck_args": installcheck_args, "name": "Flaky Check Debug", "pg": PG17_LATEST, "pginstallcheck": False, diff --git a/.unreleased/pr_7527 b/.unreleased/pr_7527 new file mode 100644 index 00000000000..534b8bec629 --- /dev/null +++ b/.unreleased/pr_7527 @@ -0,0 +1 @@ +Fixes: #7527 Restart scheduler on error diff --git a/src/bgw/scheduler.c b/src/bgw/scheduler.c index d44fc59d433..fefe4cf2602 100644 --- a/src/bgw/scheduler.c +++ b/src/bgw/scheduler.c @@ -795,9 +795,11 @@ ts_bgw_scheduler_process(int32 run_for_interval_ms, * exit. */ if (ts_guc_restoring || IsBinaryUpgrade) { - elog(LOG, - "scheduler for database %u exiting since the database is restoring or upgrading", - MyDatabaseId); + ereport(LOG, + errmsg("scheduler for database %u exiting with exit status %d", + MyDatabaseId, + ts_debug_bgw_scheduler_exit_status), + errdetail("the database is restoring or upgrading")); terminate_all_jobs_and_release_workers(); goto scheduler_exit; } @@ -866,7 +868,10 @@ ts_bgw_scheduler_process(int32 run_for_interval_ms, MemoryContextReset(scratch_mctx); } - elog(DEBUG1, "database scheduler for database %u exiting", MyDatabaseId); + elog(DEBUG1, + "scheduler for database %u exiting with exit status %d", + MyDatabaseId, + ts_debug_bgw_scheduler_exit_status); #ifdef TS_DEBUG if (ts_shutdown_bgw) @@ -879,6 +884,7 @@ ts_bgw_scheduler_process(int32 run_for_interval_ms, wait_for_all_jobs_to_shutdown(); check_for_stopped_and_timed_out_jobs(); scheduled_jobs = NIL; + proc_exit(ts_debug_bgw_scheduler_exit_status); } static void diff --git a/src/compat/compat.h b/src/compat/compat.h index 298f74fb716..58654de9f9e 100644 --- a/src/compat/compat.h +++ b/src/compat/compat.h @@ -655,6 +655,12 @@ RelationGetSmgr(Relation rel) GenerationContextCreate(parent, name, blockSize) #endif +#if PG16_GE +#define pgstat_get_local_beentry_by_index_compat(idx) pgstat_get_local_beentry_by_index(idx) +#else +#define pgstat_get_local_beentry_by_index_compat(idx) pgstat_fetch_stat_local_beentry(idx) +#endif + /* * PG16 adds a new parameter to DefineIndex, total_parts, that takes * in the total number of direct and indirect partitions of the relation. diff --git a/src/guc.c b/src/guc.c index 861760c17e2..daae08425a3 100644 --- a/src/guc.c +++ b/src/guc.c @@ -184,6 +184,20 @@ bool ts_guc_debug_require_batch_sorted_merge = false; bool ts_guc_debug_allow_cagg_with_deprecated_funcs = false; #ifdef TS_DEBUG +/* + * Exit code for the scheduler. + * + * Normally it exits with a zero which means that it will not restart. If an + * error is raised, it exits with error code 1, which will trigger a + * restart. + * + * This variable exists to be able to trigger a restart for a normal exit, + * which is useful when debugging. + * + * See backend/postmaster/bgworker.c + */ +int ts_debug_bgw_scheduler_exit_status = 0; + bool ts_shutdown_bgw = false; char *ts_current_timestamp_mock = NULL; #endif @@ -1067,6 +1081,19 @@ _guc_init(void) /* assign_hook= */ NULL, /* show_hook= */ NULL); + DefineCustomIntVariable(/* name= */ MAKE_EXTOPTION("debug_bgw_scheduler_exit_status"), + /* short_desc= */ "exit status to use when shutting down the scheduler", + /* long_desc= */ "this is for debugging purposes", + /* valueAddr= */ &ts_debug_bgw_scheduler_exit_status, + /* bootValue= */ 0, + /* minValue= */ 0, + /* maxValue= */ 255, + /* context= */ PGC_SIGHUP, + /* flags= */ 0, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + DefineCustomStringVariable(/* name= */ MAKE_EXTOPTION("current_timestamp_mock"), /* short_desc= */ "set the current timestamp", /* long_desc= */ "this is for debugging purposes", diff --git a/src/guc.h b/src/guc.h index 34ebc0ef2d6..bbf30389001 100644 --- a/src/guc.h +++ b/src/guc.h @@ -93,6 +93,12 @@ extern TSDLLEXPORT bool ts_guc_enable_columnarscan; extern TSDLLEXPORT int ts_guc_bgw_log_level; #ifdef TS_DEBUG +/* + * Exit code to use when scheduler exits. + * + * Used for debugging. + */ +extern TSDLLEXPORT int ts_debug_bgw_scheduler_exit_status; extern bool ts_shutdown_bgw; extern char *ts_current_timestamp_mock; #else diff --git a/src/loader/bgw_launcher.c b/src/loader/bgw_launcher.c index af2a2cc3e13..02daa180b64 100644 --- a/src/loader/bgw_launcher.c +++ b/src/loader/bgw_launcher.c @@ -84,6 +84,8 @@ typedef enum SchedulerState static volatile sig_atomic_t got_SIGHUP = false; +int ts_guc_bgw_scheduler_restart_time_sec = BGW_DEFAULT_RESTART_INTERVAL; + static void launcher_sighup(SIGNAL_ARGS) { @@ -238,13 +240,27 @@ terminate_background_worker(BackgroundWorkerHandle *handle) } extern void -ts_bgw_cluster_launcher_register(void) +ts_bgw_cluster_launcher_init(void) { BackgroundWorker worker; + DefineCustomIntVariable(/* name= */ MAKE_EXTOPTION("bgw_scheduler_restart_time"), + /* short_desc= */ "Restart time for scheduler in seconds", + /* long_desc= */ + "The number of seconds until the scheduler restart on failure.", + /* valueAddr= */ &ts_guc_bgw_scheduler_restart_time_sec, + /* bootValue= */ BGW_DEFAULT_RESTART_INTERVAL, + /* minValue= */ 1, + /* maxValue= */ 3600, + /* context= */ PGC_SIGHUP, + /* flags= */ GUC_UNIT_S, + /* check_hook= */ NULL, + /* assign_hook= */ NULL, + /* show_hook= */ NULL); + memset(&worker, 0, sizeof(worker)); /* set up worker settings for our main worker */ - snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Launcher"); + snprintf(worker.bgw_name, BGW_MAXLEN, TS_BGW_TYPE_LAUNCHER); worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; worker.bgw_restart_time = BGW_LAUNCHER_RESTART_TIME_S; @@ -274,9 +290,10 @@ register_entrypoint_for_db(Oid db_id, VirtualTransactionId vxid, BackgroundWorke BackgroundWorker worker; memset(&worker, 0, sizeof(worker)); - snprintf(worker.bgw_name, BGW_MAXLEN, "TimescaleDB Background Worker Scheduler"); + snprintf(worker.bgw_type, BGW_MAXLEN, TS_BGW_TYPE_SCHEDULER); + snprintf(worker.bgw_name, BGW_MAXLEN, "%s for database %d", TS_BGW_TYPE_SCHEDULER, db_id); worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; - worker.bgw_restart_time = BGW_NEVER_RESTART; + worker.bgw_restart_time = ts_guc_bgw_scheduler_restart_time_sec, worker.bgw_start_time = BgWorkerStart_RecoveryFinished; snprintf(worker.bgw_library_name, BGW_MAXLEN, EXTENSION_NAME); snprintf(worker.bgw_function_name, BGW_MAXLEN, BGW_ENTRYPOINT_FUNCNAME); @@ -332,15 +349,89 @@ db_hash_entry_create_if_not_exists(HTAB *db_htab, Oid db_oid) return db_he; } +/* + * Result from signalling a backend. + * + * Error codes are non-zero, and success is zero. + */ +enum SignalBackendResult +{ + SIGNAL_BACKEND_SUCCESS = 0, + SIGNAL_BACKEND_ERROR, + SIGNAL_BACKEND_NOPERMISSION, + SIGNAL_BACKEND_NOSUPERUSER, +}; + +/* + * Terminate a background worker. + * + * This is copied from pg_signal_backend() in + * src/backend/storage/ipc/signalfuncs.c but tweaked to not require a database + * connection since the launcher does not have one. + */ +static enum SignalBackendResult +ts_signal_backend(int pid, int sig) +{ + PGPROC *proc = BackendPidGetProc(pid); + + if (unlikely(proc == NULL)) + { + ereport(WARNING, (errmsg("PID %d is not a PostgreSQL backend process", pid))); + return SIGNAL_BACKEND_ERROR; + } + + if (unlikely(kill(pid, sig))) + { + /* Again, just a warning to allow loops */ + ereport(WARNING, (errmsg("could not send signal to process %d: %m", pid))); + return SIGNAL_BACKEND_ERROR; + } + + return SIGNAL_BACKEND_SUCCESS; +} + +/* + * Terminate backends by backend type. + * + * We iterate through all backends and mark those that match the given backend + * type as terminated. + * + * Note that there is potentially a delay between marking backends as + * terminated and their actual termination, so the backends have to be able to + * run even if there are multiple instances accessing the same data. + * + * Parts of this code is taken from pg_stat_get_activity() in + * src/backend/utils/adt/pgstatfuncs.c. + */ +static void +terminate_backends_by_backend_type(const char *backend_type) +{ + Assert(backend_type); + + const int num_backends = pgstat_fetch_stat_numbackends(); + for (int curr_backend = 1; curr_backend <= num_backends; ++curr_backend) + { + const LocalPgBackendStatus *local_beentry = + pgstat_get_local_beentry_by_index_compat(curr_backend); + const PgBackendStatus *beentry = &local_beentry->backendStatus; + const char *bgw_type = GetBackgroundWorkerTypeByPid(beentry->st_procpid); + if (bgw_type && strcmp(backend_type, bgw_type) == 0) + { + int error = ts_signal_backend(beentry->st_procpid, SIGTERM); + if (error) + elog(LOG, "failed to terminate backend with pid %d", beentry->st_procpid); + } + } +} + /* * Model this on autovacuum.c -> get_database_list. * - * Note that we are not doing - * all the things around memory context that they do, because the hashtable - * we're using to store db entries is automatically created in its own memory - * context (a child of TopMemoryContext) This can get called at two different - * times 1) when the cluster launcher starts and is looking for dbs and 2) if - * it restarts due to a postmaster signal. + * Note that we are not doing all the things around memory context that they + * do, because the hashtable we're using to store db entries is automatically + * created in its own memory context (a child of TopMemoryContext) This can + * get called at two different times 1) when the cluster launcher starts and + * is looking for dbs and 2) if it restarts due to a postmaster signal. */ static void populate_database_htab(HTAB *db_htab) @@ -757,6 +848,16 @@ ts_bgw_cluster_launcher_main(PG_FUNCTION_ARGS) db_htab = init_database_htab(); *htab_storage = db_htab; + /* + * If the launcher was restarted and discovers old schedulers, these has + * to be terminated to avoid exhausting the worker slots. + * + * We cannot easily pick up the old schedulers since we do not have access + * to the slots array in PostgreSQL, so instead we scan for something that + * looks like schedulers for databases, and kill them. New ones will then + * be spawned below. + */ + terminate_backends_by_backend_type(TS_BGW_TYPE_SCHEDULER); populate_database_htab(db_htab); while (true) diff --git a/src/loader/bgw_launcher.h b/src/loader/bgw_launcher.h index f90a65cb3a9..82c2ec1893b 100644 --- a/src/loader/bgw_launcher.h +++ b/src/loader/bgw_launcher.h @@ -8,7 +8,12 @@ #include #include -extern void ts_bgw_cluster_launcher_register(void); +#define TS_BGW_TYPE_LAUNCHER "TimescaleDB Background Worker Launcher" +#define TS_BGW_TYPE_SCHEDULER "TimescaleDB Background Worker Scheduler" + +extern int ts_guc_bgw_scheduler_restart_time_sec; + +extern void ts_bgw_cluster_launcher_init(void); /*called by postmaster at launcher bgw startup*/ TSDLLEXPORT extern Datum ts_bgw_cluster_launcher_main(PG_FUNCTION_ARGS); diff --git a/src/loader/loader.c b/src/loader/loader.c index 6537e49d8c5..fdff65236fa 100644 --- a/src/loader/loader.c +++ b/src/loader/loader.c @@ -591,7 +591,7 @@ _PG_init(void) timescaledb_shmem_request_hook(); #endif - ts_bgw_cluster_launcher_register(); + ts_bgw_cluster_launcher_init(); ts_bgw_counter_setup_gucs(); ts_bgw_interface_register_api_version(); diff --git a/test/expected/bgw_launcher.out b/test/expected/bgw_launcher.out index 84c2b30bf5a..ff8a61559b6 100644 --- a/test/expected/bgw_launcher.out +++ b/test/expected/bgw_launcher.out @@ -21,11 +21,12 @@ CREATE DATABASE :TEST_DBNAME_2; -- Further Note: PG 9.6 changed what appeared in pg_stat_activity, so the launcher doesn't actually show up. -- we can still test its interactions with its children, but can't test some of the things specific to the launcher. -- So we've added some bits about the version number as needed. -CREATE VIEW worker_counts as SELECT count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Launcher') as launcher, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler -FROM pg_stat_activity; +CREATE VIEW worker_counts as +SELECT count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Launcher') as launcher, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler + FROM pg_stat_activity; CREATE FUNCTION wait_worker_counts(launcher_ct INTEGER, scheduler1_ct INTEGER, scheduler2_ct INTEGER, template1_ct INTEGER) RETURNS BOOLEAN LANGUAGE PLPGSQL AS $BODY$ DECLARE @@ -103,7 +104,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler in test db 2 and make sure our backend_start changed SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- We'll do this in a txn so that we can see that the worker locks on our txn before continuing BEGIN; @@ -122,7 +123,7 @@ SELECT wait_worker_counts(1,0,1,0); SELECT (backend_start > :'orig_backend_start'::timestamptz) backend_start_changed, (wait_event = 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; backend_start_changed | wait_event_changed -----------------------+-------------------- @@ -138,7 +139,7 @@ SELECT wait_worker_counts(1,0,1,0); SELECT (wait_event IS DISTINCT FROM 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; wait_event_changed -------------------- @@ -187,7 +188,7 @@ SELECT wait_worker_counts(1,0,1,0); -- make sure start is idempotent SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- Since we're doing idempotency tests, we're also going to exercise our queue and start 20 times SELECT _timescaledb_functions.start_background_workers() as start_background_workers, * FROM generate_series(1,20); @@ -227,7 +228,7 @@ FOR i in 1..5 LOOP SELECT (backend_start = $1::timestamptz) backend_start_unchanged FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(r) THEN PERFORM pg_sleep(0.1); @@ -274,7 +275,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler and make sure our backend_start changed SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset BEGIN; DROP EXTENSION timescaledb; @@ -294,7 +295,7 @@ FOR i in 1..10 LOOP SELECT (backend_start > $1::timestamptz) backend_start_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(NOT r) THEN PERFORM pg_sleep(0.1); @@ -315,9 +316,9 @@ SELECT wait_greater(:'orig_backend_start',:'TEST_DBNAME_2'); -- Make sure canceling the launcher backend causes a restart of schedulers SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; pg_cancel_backend ------------------- t @@ -445,11 +446,12 @@ ALTER ROLE :ROLE_DEFAULT_PERM_USER WITH NOSUPERUSER; -- Further Note: PG 9.6 changed what appeared in pg_stat_activity, so the launcher doesn't actually show up. -- we can still test its interactions with its children, but can't test some of the things specific to the launcher. -- So we've added some bits about the version number as needed. -CREATE VIEW worker_counts as SELECT count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Launcher') as launcher, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler -FROM pg_stat_activity; +CREATE VIEW worker_counts as +SELECT count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Launcher') as launcher, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler + FROM pg_stat_activity; CREATE FUNCTION wait_worker_counts(launcher_ct INTEGER, scheduler1_ct INTEGER, scheduler2_ct INTEGER, template1_ct INTEGER) RETURNS BOOLEAN LANGUAGE PLPGSQL AS $BODY$ DECLARE @@ -602,7 +604,7 @@ SELECT _timescaledb_functions.stop_background_workers(); t (1 row) -SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; pg_terminate_backend ---------------------- t diff --git a/test/pgtest/CMakeLists.txt b/test/pgtest/CMakeLists.txt index 2d65a493eb2..8d7dc3779c1 100644 --- a/test/pgtest/CMakeLists.txt +++ b/test/pgtest/CMakeLists.txt @@ -29,6 +29,9 @@ set(PG_IGNORE_TESTS sanity_check type_sanity create_am + # Ignoring because it spawns different number of workers in different + # versions. + select_parallel psql) # Modify the test schedule to ignore some tests diff --git a/test/sql/bgw_launcher.sql b/test/sql/bgw_launcher.sql index db3d240c6fb..19a01789667 100644 --- a/test/sql/bgw_launcher.sql +++ b/test/sql/bgw_launcher.sql @@ -33,7 +33,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler in test db 2 and make sure our backend_start changed SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- We'll do this in a txn so that we can see that the worker locks on our txn before continuing BEGIN; @@ -43,14 +43,14 @@ SELECT wait_worker_counts(1,0,1,0); SELECT (backend_start > :'orig_backend_start'::timestamptz) backend_start_changed, (wait_event = 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; COMMIT; SELECT wait_worker_counts(1,0,1,0); SELECT (wait_event IS DISTINCT FROM 'virtualxid') wait_event_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2'; -- Test stop @@ -68,7 +68,7 @@ SELECT wait_worker_counts(1,0,1,0); -- make sure start is idempotent SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -- Since we're doing idempotency tests, we're also going to exercise our queue and start 20 times @@ -85,7 +85,7 @@ FOR i in 1..5 LOOP SELECT (backend_start = $1::timestamptz) backend_start_unchanged FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(r) THEN PERFORM pg_sleep(0.1); @@ -109,7 +109,7 @@ SELECT wait_worker_counts(1,0,1,0); -- Now let's restart the scheduler and make sure our backend_start changed SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset BEGIN; @@ -126,7 +126,7 @@ FOR i in 1..10 LOOP SELECT (backend_start > $1::timestamptz) backend_start_changed FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = $2 into r; if(NOT r) THEN PERFORM pg_sleep(0.1); @@ -143,10 +143,10 @@ SELECT wait_greater(:'orig_backend_start',:'TEST_DBNAME_2'); -- Make sure canceling the launcher backend causes a restart of schedulers SELECT backend_start as orig_backend_start FROM pg_stat_activity -WHERE application_name = 'TimescaleDB Background Worker Scheduler' +WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2' \gset -SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_cancel_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; SELECT wait_worker_counts(1,0,1,0); @@ -259,7 +259,7 @@ SELECT wait_for_bgw_scheduler(:'TEST_DBNAME'); -- Connect to TEST_DBNAME (_timescaledb_functions.stop_background_workers() is not available in TEST_DBNAME_2) \c :TEST_DBNAME :ROLE_SUPERUSER SELECT _timescaledb_functions.stop_background_workers(); -SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE application_name = 'TimescaleDB Background Worker Launcher'; +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE backend_type = 'TimescaleDB Background Worker Launcher'; \c :TEST_DBNAME_2 :ROLE_SUPERUSER -- make sure nobody is using it diff --git a/test/sql/include/bgw_launcher_utils.sql b/test/sql/include/bgw_launcher_utils.sql index de06a853814..3e52778fc02 100644 --- a/test/sql/include/bgw_launcher_utils.sql +++ b/test/sql/include/bgw_launcher_utils.sql @@ -8,11 +8,12 @@ -- we can still test its interactions with its children, but can't test some of the things specific to the launcher. -- So we've added some bits about the version number as needed. -CREATE VIEW worker_counts as SELECT count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Launcher') as launcher, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, -count(*) filter (WHERE application_name = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler -FROM pg_stat_activity; +CREATE VIEW worker_counts as +SELECT count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Launcher') as launcher, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME') as single_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = :'TEST_DBNAME_2') as single_2_scheduler, + count(*) filter (WHERE backend_type = 'TimescaleDB Background Worker Scheduler' AND datname = 'template1') as template1_scheduler + FROM pg_stat_activity; CREATE FUNCTION wait_worker_counts(launcher_ct INTEGER, scheduler1_ct INTEGER, scheduler2_ct INTEGER, template1_ct INTEGER) RETURNS BOOLEAN LANGUAGE PLPGSQL AS $BODY$ diff --git a/tsl/test/expected/bgw_scheduler_control.out b/tsl/test/expected/bgw_scheduler_control.out index 8ed411971f4..bb55cb2771b 100644 --- a/tsl/test/expected/bgw_scheduler_control.out +++ b/tsl/test/expected/bgw_scheduler_control.out @@ -99,7 +99,7 @@ SELECT * FROM cleaned_bgw_log; 4 | DB Scheduler | [TESTING] Wait until (RANDOM), started at (RANDOM) 0 | test_job_1b | Execute job 1 1 | test_job_1b | job 1000 (test_job_1b) exiting with success: execution time (RANDOM) ms - 5 | DB Scheduler | database scheduler for database (RANDOM) exiting + 5 | DB Scheduler | scheduler for database (RANDOM) exiting with exit status 0 (8 rows) ALTER DATABASE :TEST_DBNAME RESET timescaledb.bgw_log_level; diff --git a/tsl/test/expected/bgw_scheduler_restart.out b/tsl/test/expected/bgw_scheduler_restart.out new file mode 100644 index 00000000000..d5ad970e636 --- /dev/null +++ b/tsl/test/expected/bgw_scheduler_restart.out @@ -0,0 +1,166 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. +\c :TEST_DBNAME :ROLE_SUPERUSER +CREATE VIEW tsdb_bgw AS + SELECT datname, pid, backend_type, application_name + FROM pg_stat_activity + WHERE backend_type LIKE '%TimescaleDB%' + ORDER BY datname, backend_type, application_name; +-- Show the default scheduler restart time +SHOW timescaledb.bgw_scheduler_restart_time; + timescaledb.bgw_scheduler_restart_time +---------------------------------------- + 1min +(1 row) + +ALTER SYSTEM SET timescaledb.bgw_scheduler_restart_time TO '10s'; +ALTER SYSTEM SET timescaledb.debug_bgw_scheduler_exit_status TO 1; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SHOW timescaledb.bgw_scheduler_restart_time; + timescaledb.bgw_scheduler_restart_time +---------------------------------------- + 1min +(1 row) + +-- Launcher is running, so we need to restart it for the scheduler +-- restart time to take effect. +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +---------+---------------------------------------- + | TimescaleDB Background Worker Launcher +(1 row) + +SELECT pg_terminate_backend(pid) FROM tsdb_bgw + WHERE datname = :'TEST_DBNAME' AND backend_type LIKE '%Launcher%'; + pg_terminate_backend +---------------------- +(0 rows) + +-- It will restart automatically, but we wait one second to give it a +-- chance to start. +SELECT pg_sleep(1); + pg_sleep +---------- + +(1 row) + +-- Verify that launcher is running. If it is not, the rest of the test +-- will fail. +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +---------+---------------------------------------- + | TimescaleDB Background Worker Launcher +(1 row) + +-- Now we can start the background workers. +SELECT _timescaledb_functions.start_background_workers(); + start_background_workers +-------------------------- + t +(1 row) + +-- They should start immediately, but let's wait a short while for +-- scheduler to start. +SELECT pg_sleep(1); + pg_sleep +---------- + +(1 row) + +-- Check that the schedulers are running. If they are not, the rest of +-- the test is meaningless. +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +--------------------------+----------------------------------------- + db_bgw_scheduler_restart | TimescaleDB Background Worker Scheduler + | TimescaleDB Background Worker Launcher +(2 rows) + +-- Kill the schedulers and check that they restart. +SELECT pg_terminate_backend(pid) FROM tsdb_bgw + WHERE datname = :'TEST_DBNAME' AND backend_type LIKE '%Scheduler%'; + pg_terminate_backend +---------------------- + t +(1 row) + +-- Wait for scheduler to exit, they should exit immediately. +SELECT pg_sleep(1); + pg_sleep +---------- + +(1 row) + +-- Check that the schedulers really exited. +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +---------+---------------------------------------- + | TimescaleDB Background Worker Launcher +(1 row) + +SELECT pg_sleep(10); -- Wait for scheduler to restart. + pg_sleep +---------- + +(1 row) + +-- Make sure that the launcher and schedulers are running. +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +--------------------------+----------------------------------------- + db_bgw_scheduler_restart | TimescaleDB Background Worker Scheduler + | TimescaleDB Background Worker Launcher +(2 rows) + +-- Now, we had a previous bug where killing the launcher at this point +-- would leave the schedulers running (because the launcher did not +-- have a handle for them) and when launcher is restarting, it would +-- start more schedulers, leaving two schedulers per database. +-- Kill the launcher. Since there are new restarted schedulers, the +-- handle could not be used to terminate them, and they would be left +-- running. +SELECT pg_terminate_backend(pid) FROM tsdb_bgw + WHERE datname = :'TEST_DBNAME' AND backend_type LIKE '%Launcher%'; + pg_terminate_backend +---------------------- +(0 rows) + +-- Launcher will restart immediately, but we wait one second to give +-- it a chance to start. +SELECT pg_sleep(1); + pg_sleep +---------- + +(1 row) + +-- Check that the launcher is running and that there are exactly one +-- scheduler per database. Here the old schedulers are killed, so it +-- will be schedulers with a different PID than the ones before the +-- launcher was killed, but we are not showing this here. +SELECT datname, application_name FROM tsdb_bgw; + datname | application_name +--------------------------+----------------------------------------- + db_bgw_scheduler_restart | TimescaleDB Background Worker Scheduler + | TimescaleDB Background Worker Launcher +(2 rows) + +ALTER SYSTEM RESET timescaledb.bgw_scheduler_restart_time; +ALTER SYSTEM RESET timescaledb.debug_bgw_scheduler_exit_status; +SELECT pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +SELECT _timescaledb_functions.stop_background_workers(); + stop_background_workers +------------------------- + t +(1 row) + diff --git a/tsl/test/sql/CMakeLists.txt b/tsl/test/sql/CMakeLists.txt index 2b3a007d7a2..cc00f329d4a 100644 --- a/tsl/test/sql/CMakeLists.txt +++ b/tsl/test/sql/CMakeLists.txt @@ -73,6 +73,7 @@ if(CMAKE_BUILD_TYPE MATCHES Debug) bgw_job_stat_history_errors.sql bgw_job_stat_history_errors_permissions.sql bgw_db_scheduler_fixed.sql + bgw_scheduler_restart.sql bgw_reorder_drop_chunks.sql scheduler_fixed.sql compress_bgw_reorder_drop_chunks.sql @@ -185,6 +186,7 @@ set(SOLO_TESTS # log level. bgw_custom bgw_scheduler_control + bgw_scheduler_restart bgw_db_scheduler bgw_job_stat_history_errors_permissions bgw_job_stat_history_errors diff --git a/tsl/test/sql/bgw_scheduler_restart.sql b/tsl/test/sql/bgw_scheduler_restart.sql new file mode 100644 index 00000000000..df45aa40881 --- /dev/null +++ b/tsl/test/sql/bgw_scheduler_restart.sql @@ -0,0 +1,85 @@ +-- This file and its contents are licensed under the Timescale License. +-- Please see the included NOTICE for copyright information and +-- LICENSE-TIMESCALE for a copy of the license. + +\c :TEST_DBNAME :ROLE_SUPERUSER + +CREATE VIEW tsdb_bgw AS + SELECT datname, pid, backend_type, application_name + FROM pg_stat_activity + WHERE backend_type LIKE '%TimescaleDB%' + ORDER BY datname, backend_type, application_name; + +-- Show the default scheduler restart time +SHOW timescaledb.bgw_scheduler_restart_time; +ALTER SYSTEM SET timescaledb.bgw_scheduler_restart_time TO '10s'; +ALTER SYSTEM SET timescaledb.debug_bgw_scheduler_exit_status TO 1; +SELECT pg_reload_conf(); +SHOW timescaledb.bgw_scheduler_restart_time; + +-- Launcher is running, so we need to restart it for the scheduler +-- restart time to take effect. +SELECT datname, application_name FROM tsdb_bgw; +SELECT pg_terminate_backend(pid) FROM tsdb_bgw + WHERE datname = :'TEST_DBNAME' AND backend_type LIKE '%Launcher%'; + +-- It will restart automatically, but we wait one second to give it a +-- chance to start. +SELECT pg_sleep(1); + +-- Verify that launcher is running. If it is not, the rest of the test +-- will fail. +SELECT datname, application_name FROM tsdb_bgw; + +-- Now we can start the background workers. +SELECT _timescaledb_functions.start_background_workers(); + +-- They should start immediately, but let's wait a short while for +-- scheduler to start. +SELECT pg_sleep(1); + +-- Check that the schedulers are running. If they are not, the rest of +-- the test is meaningless. +SELECT datname, application_name FROM tsdb_bgw; + +-- Kill the schedulers and check that they restart. +SELECT pg_terminate_backend(pid) FROM tsdb_bgw + WHERE datname = :'TEST_DBNAME' AND backend_type LIKE '%Scheduler%'; + +-- Wait for scheduler to exit, they should exit immediately. +SELECT pg_sleep(1); + +-- Check that the schedulers really exited. +SELECT datname, application_name FROM tsdb_bgw; + +SELECT pg_sleep(10); -- Wait for scheduler to restart. + +-- Make sure that the launcher and schedulers are running. +SELECT datname, application_name FROM tsdb_bgw; + +-- Now, we had a previous bug where killing the launcher at this point +-- would leave the schedulers running (because the launcher did not +-- have a handle for them) and when launcher is restarting, it would +-- start more schedulers, leaving two schedulers per database. + +-- Kill the launcher. Since there are new restarted schedulers, the +-- handle could not be used to terminate them, and they would be left +-- running. +SELECT pg_terminate_backend(pid) FROM tsdb_bgw + WHERE datname = :'TEST_DBNAME' AND backend_type LIKE '%Launcher%'; + +-- Launcher will restart immediately, but we wait one second to give +-- it a chance to start. +SELECT pg_sleep(1); + +-- Check that the launcher is running and that there are exactly one +-- scheduler per database. Here the old schedulers are killed, so it +-- will be schedulers with a different PID than the ones before the +-- launcher was killed, but we are not showing this here. +SELECT datname, application_name FROM tsdb_bgw; + +ALTER SYSTEM RESET timescaledb.bgw_scheduler_restart_time; +ALTER SYSTEM RESET timescaledb.debug_bgw_scheduler_exit_status; +SELECT pg_reload_conf(); + +SELECT _timescaledb_functions.stop_background_workers();