Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: migrate backfill tests to deterministic integration tests #13219

Merged
merged 23 commits into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions ci/scripts/deterministic-it-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ set -euo pipefail

source ci/scripts/common.sh

export LOGDIR=.risingwave/log
mkdir -p $LOGDIR

echo "--- Download artifacts"
buildkite-agent artifact download simulation-it-test.tar.zst .

Expand All @@ -13,10 +16,12 @@ tar -xvf simulation-it-test.tar.zst
mkdir target/sim
mv target/ci-sim target/sim

TEST_PATTERN="$@"

echo "--- Run integration tests in deterministic simulation mode"
seq $TEST_NUM | parallel MADSIM_TEST_SEED={} NEXTEST_PROFILE=ci-sim \
cargo nextest run \
'cargo nextest run \
--no-fail-fast \
--cargo-metadata target/nextest/cargo-metadata.json \
--binaries-metadata target/nextest/binaries-metadata.json \
"$@"
"$TEST_PATTERN" 2>$LOGDIR/deterministic-it-test-{}.log && rm $LOGDIR/deterministic-it-test-{}.log'
283 changes: 0 additions & 283 deletions ci/scripts/run-backfill-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -117,214 +117,6 @@ test_snapshot_and_upstream_read() {
cargo make wait-processes-exit
}

# Test background ddl recovery.
#
# Creates a table and a background-backfilling mv, restarts the cluster, and
# verifies the job survives recovery with progress that never regresses.
# After backfill completes, the mv must persist across further restarts.
test_background_ddl_recovery() {
  echo "--- e2e, $CLUSTER_PROFILE, test_background_ddl_recovery"
  cargo make ci-start $CLUSTER_PROFILE

  # Test before recovery
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/create_table.slt"
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/create_bg_mv.slt"
  sleep 1
  # Allow up to 3 integer digits: with {1,2} a "100.00" progress reading was
  # truncated to "00.00", corrupting the comparison below.
  OLD_PROGRESS=$(run_sql "SHOW JOBS;" | grep -E -o "[0-9]{1,3}\.[0-9]{1,2}")

  # Restart
  restart_cluster

  # Test after recovery
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_one_job.slt"

  # Recover the mview progress
  sleep 5

  NEW_PROGRESS=$(run_sql "SHOW JOBS;" | grep -E -o "[0-9]{1,3}\.[0-9]{1,2}")

  # Compare integer parts only; progress must be monotonically non-decreasing.
  if [[ ${OLD_PROGRESS%.*} -le ${NEW_PROGRESS%.*} ]]; then
    echo "OK: $OLD_PROGRESS smaller or equal to $NEW_PROGRESS"
  else
    echo "FAILED: $OLD_PROGRESS larger than $NEW_PROGRESS"
    exit 1
  fi

  sleep 60

  # Test after backfill finished
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_backfilled_mv.slt"

  # After cluster restart(s), backfilled mv should still be present.
  restart_cluster
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_backfilled_mv.slt"
  restart_cluster
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_backfilled_mv.slt"

  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/drop_mv.slt"
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/drop_table.slt"

  kill_cluster
}

# Test that a background ddl job can be cancelled — both freshly created and
# after surviving a cluster restart — and that a new job can be created after
# each cancellation.
test_background_ddl_cancel() {
  echo "--- e2e, $CLUSTER_PROFILE, test background ddl"
  cargo make ci-start $CLUSTER_PROFILE

  # Shorthand for running an slt file against the local frontend.
  local slt="sqllogictest -d dev -h localhost -p 4566"

  # Create the upstream table and a background mv; the job should be visible.
  $slt "$COMMON_DIR/create_table.slt"
  $slt "$COMMON_DIR/create_bg_mv.slt"
  sleep 1
  $slt "$COMMON_DIR/validate_one_job.slt"

  cancel_stream_jobs
  $slt "$COMMON_DIR/validate_no_jobs.slt"

  $slt "$COMMON_DIR/create_bg_mv.slt"

  # Restart the cluster, then allow a moment for the job to recover.
  restart_cluster
  sleep 3

  $slt "$COMMON_DIR/validate_one_job.slt"

  # The recovered job must also be cancellable.
  cancel_stream_jobs
  $slt "$COMMON_DIR/validate_no_jobs.slt"

  $slt "$COMMON_DIR/create_bg_mv.slt"
  sleep 1
  $slt "$COMMON_DIR/validate_one_job.slt"
  cancel_stream_jobs
  $slt "$COMMON_DIR/validate_no_jobs.slt"

  # After cancel should be able to create MV
  $slt "$COMMON_DIR/create_bg_mv.slt"
  sleep 1
  $slt "$COMMON_DIR/validate_one_job.slt"
  cancel_stream_jobs
  $slt "$COMMON_DIR/validate_no_jobs.slt"

  $slt "$COMMON_DIR/drop_table.slt"

  kill_cluster
}

# Test that an in-flight foreground ddl job can be cancelled, and that a new
# foreground mv can be created afterwards.
test_foreground_ddl_cancel() {
  echo "--- e2e, $CLUSTER_PROFILE, test_foreground_ddl_cancel"
  cargo make ci-start $CLUSTER_PROFILE

  # Shorthand for running an slt file against the local frontend.
  local slt="sqllogictest -d dev -h localhost -p 4566"

  # Kick off a foreground mv creation as a background shell job so we can
  # cancel it mid-flight.
  $slt "$COMMON_DIR/create_table.slt"
  run_sql "CREATE MATERIALIZED VIEW m1 as select * FROM t;" &
  sleep 1
  $slt "$COMMON_DIR/validate_one_job.slt"

  cancel_stream_jobs
  $slt "$COMMON_DIR/validate_no_jobs.slt"

  # A fresh foreground mv should still succeed after the cancel.
  $slt "$COMMON_DIR/create_fg_mv.slt"
  $slt "$COMMON_DIR/drop_mv.slt"
  $slt "$COMMON_DIR/drop_table.slt"

  kill_cluster
}

# Test foreground ddl should not recover: a foreground job that was in flight
# when the cluster restarted must be gone after recovery.
test_foreground_ddl_no_recover() {
  echo "--- e2e, $CLUSTER_PROFILE, test_foreground_ddl_no_recover"
  cargo make ci-start $CLUSTER_PROFILE

  # Shorthand for running an slt file against the local frontend.
  local slt="sqllogictest -d dev -h localhost -p 4566"

  # Start a foreground mv creation and confirm the job is running.
  $slt "$COMMON_DIR/create_table.slt"
  run_sql "CREATE MATERIALIZED VIEW m1 as select * FROM t;" &
  sleep 3
  $slt "$COMMON_DIR/validate_one_job.slt"

  # Restart, then leave some time for recovery.
  restart_cluster
  sleep 5

  # After recovery the foreground job must NOT have been restored.
  $slt "$COMMON_DIR/validate_no_jobs.slt"
  sleep 30

  $slt "$COMMON_DIR/drop_table.slt"

  kill_cluster
}

# Test that an in-flight CREATE INDEX can be cancelled, is not restored after
# a cluster restart, and can be re-created cleanly afterwards.
test_foreground_index_cancel() {
  echo "--- e2e, $CLUSTER_PROFILE, test_foreground_index_cancel"
  cargo make ci-start $CLUSTER_PROFILE

  # Shorthand for running an slt file against the local frontend.
  local slt="sqllogictest -d dev -h localhost -p 4566"

  $slt "$COMMON_DIR/create_table.slt"

  # Cancel an index build mid-flight.
  run_sql "CREATE INDEX i ON t (v1);" &
  sleep 3
  $slt "$COMMON_DIR/validate_one_job.slt"
  cancel_stream_jobs
  $slt "$COMMON_DIR/validate_no_jobs.slt"

  # Start another index build and restart the cluster underneath it.
  run_sql "CREATE INDEX i ON t (v1);" &
  sleep 3
  $slt "$COMMON_DIR/validate_one_job.slt"

  restart_cluster

  # Leave sometime for recovery
  sleep 5

  # The foreground index job must be gone; re-creating it should work.
  $slt "$COMMON_DIR/validate_no_jobs.slt"
  $slt "$COMMON_DIR/create_index.slt"

  $slt "$COMMON_DIR/drop_index.slt"
  $slt "$COMMON_DIR/drop_table.slt"

  kill_cluster
}

# Test that an in-flight CREATE SINK can be cancelled, is not restored after a
# cluster restart, and can be re-created cleanly afterwards.
test_foreground_sink_cancel() {
  # Fix: log line previously said "test_foreground_sink_ddl_cancel", which did
  # not match the function name (every sibling test echoes its own name).
  echo "--- e2e, $CLUSTER_PROFILE, test_foreground_sink_cancel"
  cargo make ci-start $CLUSTER_PROFILE

  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/create_table.slt"

  # Test cancel
  run_sql "CREATE SINK i FROM t WITH (connector='blackhole');" &
  sleep 3
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_one_job.slt"
  cancel_stream_jobs
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_no_jobs.slt"

  # Test sink over recovery
  run_sql "CREATE SINK i FROM t WITH (connector='blackhole');" &
  sleep 3
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_one_job.slt"


  # Restart
  restart_cluster

  # Leave sometime for recovery
  sleep 5

  # Test after recovery: the foreground sink job must be gone, and a fresh
  # sink must be creatable.
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_no_jobs.slt"
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/create_sink.slt"

  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/drop_sink.slt"
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/drop_table.slt"

  kill_cluster
}

# Lots of upstream tombstone, backfill should still proceed.
test_backfill_tombstone() {
echo "--- e2e, test_backfill_tombstone"
Expand Down Expand Up @@ -358,85 +150,10 @@ test_backfill_tombstone() {
wait
}

# Test backfill progress across a single compute-node restart, then across a
# bootstrap recovery. Progress must never regress, and once backfill finishes
# the mv must survive further full-cluster restarts.
test_backfill_restart_cn_recovery() {
  # Fix: log line previously said "test_background_restart_cn_recovery",
  # which did not match the function name.
  echo "--- e2e, $CLUSTER_PROFILE, test_backfill_restart_cn_recovery"
  cargo make ci-start $CLUSTER_PROFILE

  # Test before recovery
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/create_table.slt"
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/create_bg_mv.slt"
  sleep 1
  # Allow up to 3 integer digits: with {1,2} a "100.00" progress reading was
  # truncated to "00.00", corrupting the comparisons below.
  OLD_PROGRESS=$(run_sql "SHOW JOBS;" | grep -E -o "[0-9]{1,3}\.[0-9]{1,2}")

  # Restart 1 CN
  restart_cn

  # Give some time to recover.
  sleep 3

  # Test after recovery
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_one_job.slt"

  # Recover the mview progress
  sleep 5

  NEW_PROGRESS=$(run_sql "SHOW JOBS;" | grep -E -o "[0-9]{1,3}\.[0-9]{1,2}")

  # Compare integer parts only; progress must be monotonically non-decreasing.
  if [[ ${OLD_PROGRESS%.*} -le ${NEW_PROGRESS%.*} ]]; then
    echo "OK: $OLD_PROGRESS smaller or equal to $NEW_PROGRESS"
  else
    echo "FAILED: $OLD_PROGRESS larger than $NEW_PROGRESS"
    exit 1
  fi

  # Trigger a bootstrap recovery
  pkill compute-node
  kill_cluster
  rename_logs_with_prefix "before-restart"
  sleep 10
  cargo make dev $CLUSTER_PROFILE

  # Recover mview progress
  sleep 5

  OLD_PROGRESS=$NEW_PROGRESS
  NEW_PROGRESS=$(run_sql "SHOW JOBS;" | grep -E -o "[0-9]{1,3}\.[0-9]{1,2}")

  if [[ ${OLD_PROGRESS%.*} -le ${NEW_PROGRESS%.*} ]]; then
    echo "OK: $OLD_PROGRESS smaller or equal to $NEW_PROGRESS"
  else
    echo "FAILED: $OLD_PROGRESS larger than $NEW_PROGRESS"
    exit 1
  fi

  sleep 60

  # Test after backfill finished
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_backfilled_mv.slt"

  # After cluster restart(s), backfilled mv should still be present.
  restart_cluster
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_backfilled_mv.slt"
  restart_cluster
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/validate_backfilled_mv.slt"

  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/drop_mv.slt"
  sqllogictest -d dev -h localhost -p 4566 "$COMMON_DIR/drop_table.slt"

  kill_cluster
}

# Run every e2e backfill test in sequence; abort on the first failure.
main() {
  set -euo pipefail
  local tests=(
    test_snapshot_and_upstream_read
    test_backfill_tombstone
    test_background_ddl_recovery
    test_background_ddl_cancel
    test_foreground_ddl_no_recover
    test_foreground_ddl_cancel
    test_foreground_index_cancel
    test_foreground_sink_cancel
    test_backfill_restart_cn_recovery
  )
  local t
  for t in "${tests[@]}"; do
    "$t"
  done
}

main
3 changes: 3 additions & 0 deletions ci/workflows/main-cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ steps:
run: rw-build-env
config: ci/docker-compose.yml
mount-buildkite-agent: true
- ./ci/plugins/upload-failure-logs
timeout_in_minutes: 70
retry: *auto-retry

Expand All @@ -205,6 +206,7 @@ steps:
run: rw-build-env
config: ci/docker-compose.yml
mount-buildkite-agent: true
- ./ci/plugins/upload-failure-logs
timeout_in_minutes: 70
retry: *auto-retry

Expand All @@ -217,6 +219,7 @@ steps:
run: rw-build-env
config: ci/docker-compose.yml
mount-buildkite-agent: true
- ./ci/plugins/upload-failure-logs
timeout_in_minutes: 70
retry: *auto-retry

Expand Down
1 change: 1 addition & 0 deletions ci/workflows/pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ steps:
run: rw-build-env
config: ci/docker-compose.yml
mount-buildkite-agent: true
- ./ci/plugins/upload-failure-logs
timeout_in_minutes: 20
retry: *auto-retry

Expand Down
7 changes: 5 additions & 2 deletions src/meta/src/rpc/ddl_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1128,7 +1128,8 @@ impl DdlController {
}

pub async fn wait(&self) -> MetaResult<()> {
for _ in 0..30 * 60 {
let timeout_secs = 30 * 60;
for _ in 0..timeout_secs {
if self
.catalog_manager
.list_creating_background_mvs()
Expand All @@ -1139,7 +1140,9 @@ impl DdlController {
}
sleep(Duration::from_secs(1)).await;
}
Err(MetaError::cancelled("timeout".into()))
Err(MetaError::cancelled(format!(
"timeout after {timeout_secs}s"
)))
}

async fn comment_on(&self, comment: Comment) -> MetaResult<NotificationVersion> {
Expand Down
Loading
Loading