diff --git a/ci/scripts/find-regression.py b/ci/scripts/find-regression.py new file mode 100755 index 000000000000..e5106c9d892e --- /dev/null +++ b/ci/scripts/find-regression.py @@ -0,0 +1,262 @@ +#!/usr/bin/env python3 + +import subprocess +import unittest +import os +import sys + +''' +@kwannoel +This script is used to find the commit that introduced a regression in the codebase. +It uses binary search to find the regressed commit. +It works as follows: +1. Use the start (inclusive) and end (exclusive) bounds, find the middle commit. + e.g. given commit 0->1(start)->2->3->4(bad), start will be 1, end will be 4. Then the middle commit is (1+4)//2 = 2 + given commit 0->1(start)->2->3(bad)->4, start will be 1, end will be 3. Then the middle commit is (1+3)//2 = 2 + given commit 0->1(start)->2(bad), start will be 1, end will be 2. Then the middle commit is (1+2)//2 = 1. + given commit 0->1(start,bad), start will be 1, end will be 1. We just return the bad commit (1) immediately. +2. Run the pipeline on the middle commit. +3. If the pipeline fails, the regression is in the first half of the commits. Recurse (start, mid) +4. If the pipeline passes, the regression is in the second half of the commits. Recurse (mid+1, end) +5. If start>=end, return start as the regressed commit. + +We won't run the entire pipeline, only steps specified by the CI_STEPS environment variable. + +For step (2), we need to check its outcome and only run the next step, if the outcome is successful. 
def format_step(env):
    """Build the shell snippet that uploads the next bisect iteration.

    The snippet pipes a Buildkite pipeline into
    ``buildkite-agent pipeline upload``. The pipeline has two steps separated
    by a ``wait``:

    1. ``run-<commit>``: a soft-failing trigger of ``main-cron`` on the bisect
       commit, with ``CI_STEPS`` forwarded through the build environment.
    2. ``check``: re-invokes this script with the ``check`` sub-command and
       the current bounds.

    Args:
        env: mapping with the keys ``GOOD_COMMIT``, ``BAD_COMMIT``,
            ``BISECT_BRANCH`` and ``CI_STEPS``.

    Returns:
        The shell snippet as a string. Nothing is executed here.
    """
    good = env["GOOD_COMMIT"]
    bad = env["BAD_COMMIT"]
    branch = env["BISECT_BRANCH"]
    ci_steps = env["CI_STEPS"]
    commit = get_bisect_commit(good, bad)
    # NOTE(review): the nesting of the YAML below has to stay in sync with the
    # literal expected by Test.test_format_step.
    return f'''
cat <<- YAML | buildkite-agent pipeline upload
steps:
  - label: "run-{commit}"
    key: "run-{commit}"
    trigger: "main-cron"
    soft_fail: true
    build:
      branch: {branch}
      commit: {commit}
      env:
        CI_STEPS: {ci_steps}
  - wait
  - label: 'check'
    command: |
        GOOD_COMMIT={good} BAD_COMMIT={bad} BISECT_BRANCH={branch} CI_STEPS='{ci_steps}' ci/scripts/find-regression.py check
YAML'''


def report_step(commit):
    """Upload a one-step pipeline whose label announces the regressed commit.

    Args:
        commit: hash of the commit identified as the regression.

    Exits the process with status 1 when the upload command fails.
    """
    step = f'''
cat <<- YAML | buildkite-agent pipeline upload
steps:
  - label: "Regressed Commit: {commit}"
    command: "echo 'Regressed Commit: {commit}'"
YAML'''
    print(f"--- reporting regression commit: {commit}")
    # The output must be captured: otherwise `result.stderr` and
    # `result.stdout` are None and the diagnostics below say nothing.
    result = subprocess.run(step, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"stderr: {result.stderr}")
        print(f"stdout: {result.stdout}")
        sys.exit(1)
def _run_git(args):
    """Run ``git`` with the argument list *args* and return its stdout.

    The command is executed without a shell, so revisions and branch names
    are passed verbatim. On a non-zero exit status (or when ``git`` cannot be
    started) the diagnostics are printed and the process exits with status 1.
    """
    try:
        result = subprocess.run(["git", *args], capture_output=True, text=True)
    except OSError as err:
        print(f"stderr: {err}")
        sys.exit(1)
    if result.returncode != 0:
        print(f"stderr: {result.stderr}")
        print(f"stdout: {result.stdout}")
        sys.exit(1)
    return result.stdout


def run_pipeline(env):
    """Trigger a Buildkite job for the bisect commit derived from *env*.

    The shell snippet produced by ``format_step`` is executed; the process
    exits with status 1 if that command fails.
    """
    step = format_step(env)
    print(f"--- running upload pipeline for step\n{step}")
    # Capture the output so the failure report below has something to show.
    result = subprocess.run(step, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"stderr: {result.stderr}")
        print(f"stdout: {result.stdout}")
        sys.exit(1)


def get_number_of_commits(start, end):
    """Return the commit count reported by ``git rev-list --count start..end``."""
    return int(_run_git(["rev-list", "--count", f"{start}..{end}"]))


def get_bisect_commit(start, end):
    """Return the commit to test next for the bounds *start* and *end*.

    The commits of ``start..end`` are listed oldest first. With ``n`` commits
    in that range the result is the ``n // 2``-th of them (1-based); when
    ``n // 2`` is zero, *start* itself is returned.
    """
    # Listing the range once gives both the count and the commit, and a
    # failing `git` is no longer hidden behind a `head | tail` pipeline.
    commits = _run_git(["rev-list", "--reverse", f"{start}..{end}"]).split()
    commit_offset = len(commits) // 2
    if commit_offset == 0:
        return start
    return commits[commit_offset - 1]


def get_commit_after(branch, commit):
    """Return the first commit after *commit* on ``origin/<branch>``.

    Uses ``git log --reverse --ancestry-path commit..origin/branch`` and takes
    the first hash. Returns an empty string when the log is empty.
    """
    hashes = _run_git(
        [
            "log",
            "--reverse",
            "--ancestry-path",
            f"{commit}..origin/{branch}",
            "--format=%H",
        ]
    ).split()
    return hashes[0] if hashes else ""


def get_env():
    """Read the bisect parameters from the process environment and echo them.

    Returns:
        A dict with the keys ``GOOD_COMMIT``, ``BAD_COMMIT``,
        ``BISECT_BRANCH`` and ``CI_STEPS``.

    Raises:
        KeyError: if one of the four variables is not set.
    """
    names = ("GOOD_COMMIT", "BAD_COMMIT", "BISECT_BRANCH", "CI_STEPS")
    env = {name: os.environ[name] for name in names}

    print(f'''
GOOD_COMMIT={env["GOOD_COMMIT"]}
BAD_COMMIT={env["BAD_COMMIT"]}
BISECT_BRANCH={env["BISECT_BRANCH"]}
CI_STEPS={env["CI_STEPS"]}
    ''')

    return env


def fetch_branch_commits(branch):
    """Run ``git fetch -q origin <branch>``; exit with status 1 on failure."""
    _run_git(["fetch", "-q", "origin", branch])
def main():
    """Dispatch the ``start`` and ``check`` sub-commands given in ``sys.argv[1]``.

    ``start`` reads the bounds from the environment, fetches the bisect
    branch and triggers the first iteration. When both bounds already name
    the same commit, that commit is reported directly.

    ``check`` reads the outcome of the ``run-<commit>`` step of the current
    iteration and narrows the bounds:

    * ``soft_failed``: the tested commit becomes ``BAD_COMMIT``.
    * ``passed``: the commit after the tested one becomes ``GOOD_COMMIT``.
    * any other outcome: exit with status 1.

    When the bounds meet, the regressed commit is reported; otherwise the
    next iteration is triggered. A missing or unknown sub-command exits with
    status 1.
    """
    if len(sys.argv) < 2:
        print("invalid cmd: expected 'start' or 'check'")
        sys.exit(1)
    action = sys.argv[1]

    if action == "start":
        print("--- start bisecting")
        env = get_env()
        fetch_branch_commits(env["BISECT_BRANCH"])
        # Nothing to bisect when the bounds coincide: testing that commit and
        # seeing it pass would move GOOD_COMMIT past BAD_COMMIT.
        if env["GOOD_COMMIT"] == env["BAD_COMMIT"]:
            report_step(env["GOOD_COMMIT"])
            return
        run_pipeline(env)
        return

    if action != "check":
        print(f"invalid cmd: {action}")
        sys.exit(1)

    print("--- check pipeline outcome")
    env = get_env()
    fetch_branch_commits(env["BISECT_BRANCH"])
    # Recompute the commit that the previous iteration tested; its trigger
    # step was keyed `run-<commit>`.
    commit = get_bisect_commit(env["GOOD_COMMIT"], env["BAD_COMMIT"])
    step = f"run-{commit}"
    result = subprocess.run(
        ["buildkite-agent", "step", "get", "outcome", "--step", step],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(f"stderr: {result.stderr}")
        print(f"stdout: {result.stdout}")
        sys.exit(1)

    outcome = result.stdout.strip()
    if outcome == "soft_failed":
        print(f"commit failed: {commit}")
        env["BAD_COMMIT"] = commit
    elif outcome == "passed":
        print(f"commit passed: {commit}")
        env["GOOD_COMMIT"] = get_commit_after(env["BISECT_BRANCH"], commit)
    else:
        print(f"invalid outcome: {outcome}")
        sys.exit(1)

    if env["GOOD_COMMIT"] == env["BAD_COMMIT"]:
        report_step(env["GOOD_COMMIT"])
        return

    print(f"run next iteration, start: {env['GOOD_COMMIT']}, end: {env['BAD_COMMIT']}")
    run_pipeline(env)
"72f70960226680e841a8fbdd09c79d74609f27a2") + commit3 = get_commit_after("kwannoel/find-regress", "5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0") + self.assertEqual(commit3, "9ca415a9998a5e04e021c899fb66d93a17931d4f") + + def test_get_number_of_commits(self): + fetch_branch_commits("kwannoel/find-regress") + n = get_number_of_commits("72f70960226680e841a8fbdd09c79d74609f27a2", + "9ca415a9998a5e04e021c899fb66d93a17931d4f") + self.assertEqual(n, 2) + n2 = get_number_of_commits("617d23ddcac88ced87b96a2454c9217da0fe7915", + "9ca415a9998a5e04e021c899fb66d93a17931d4f") + self.assertEqual(n2, 3) + n3 = get_number_of_commits("72f70960226680e841a8fbdd09c79d74609f27a2", + "5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0") + self.assertEqual(n3, 1) + + def test_get_bisect_commit(self): + fetch_branch_commits("kwannoel/find-regress") + commit = get_bisect_commit("72f70960226680e841a8fbdd09c79d74609f27a2", + "9ca415a9998a5e04e021c899fb66d93a17931d4f") + self.assertEqual(commit, "5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0") + commit2 = get_bisect_commit("617d23ddcac88ced87b96a2454c9217da0fe7915", + "9ca415a9998a5e04e021c899fb66d93a17931d4f") + self.assertEqual(commit2, "72f70960226680e841a8fbdd09c79d74609f27a2") + commit3 = get_bisect_commit("72f70960226680e841a8fbdd09c79d74609f27a2", + "5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0") + self.assertEqual(commit3, "72f70960226680e841a8fbdd09c79d74609f27a2") + + def test_format_step(self): + fetch_branch_commits("kwannoel/find-regress") + self.maxDiff = None + env = { + "GOOD_COMMIT": "72f70960226680e841a8fbdd09c79d74609f27a2", + "BAD_COMMIT": "9ca415a9998a5e04e021c899fb66d93a17931d4f", + "BISECT_BRANCH": "kwannoel/find-regress", + "CI_STEPS": "test" + } + step = format_step(env) + self.assertEqual( + step, + ''' +cat <<- YAML | buildkite-agent pipeline upload +steps: + - label: "run-5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0" + key: "run-5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0" + trigger: "main-cron" + soft_fail: true + build: + branch: 
kwannoel/find-regress + commit: 5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0 + env: + CI_STEPS: test + - wait + - label: 'check' + command: | + GOOD_COMMIT=72f70960226680e841a8fbdd09c79d74609f27a2 BAD_COMMIT=9ca415a9998a5e04e021c899fb66d93a17931d4f BISECT_BRANCH=kwannoel/find-regress CI_STEPS='test' ci/scripts/find-regression.py check +YAML''' + ) + + +if __name__ == "__main__": + # You can run tests by just doing ./ci/scripts/find-regression.py + if len(sys.argv) == 1: + unittest.main() + else: + main() diff --git a/ci/workflows/main-cron-bisect.yml b/ci/workflows/main-cron-bisect.yml new file mode 100644 index 000000000000..ab8cebd234d7 --- /dev/null +++ b/ci/workflows/main-cron-bisect.yml @@ -0,0 +1,12 @@ +auto-retry: &auto-retry + automatic: + # Agent terminated because the AWS EC2 spot instance killed by AWS. + - signal_reason: agent_stop + limit: 3 + +steps: + - label: "find regressed step" + key: "find-regressed-step" + command: "GOOD_COMMIT=$GOOD_COMMIT BAD_COMMIT=$BAD_COMMIT BISECT_BRANCH=$BISECT_BRANCH CI_STEPS=$CI_STEPS ci/scripts/find-regression.py start" + if: build.env("CI_STEPS") != null + retry: *auto-retry diff --git a/ci/workflows/main-cron.yml b/ci/workflows/main-cron.yml index 0060335d8504..3c71be0f0984 100644 --- a/ci/workflows/main-cron.yml +++ b/ci/workflows/main-cron.yml @@ -8,6 +8,8 @@ steps: - label: "build" command: "ci/scripts/build.sh -p ci-release" key: "build" + if: | + build.env("CI_STEPS") !~ /(^|,)disable-build(,|$$)/ plugins: - docker-compose#v5.1.0: run: rw-build-env @@ -19,6 +21,8 @@ steps: - label: "build other components" command: "ci/scripts/build-other.sh" key: "build-other" + if: | + build.env("CI_STEPS") !~ /(^|,)disable-build(,|$$)/ plugins: - seek-oss/aws-sm#v2.3.1: env: @@ -35,6 +39,8 @@ steps: - label: "build simulation test" command: "ci/scripts/build-simulation.sh" key: "build-simulation" + if: | + build.env("CI_STEPS") !~ /(^|,)disable-build(,|$$)/ plugins: - docker-compose#v5.1.0: run: rw-build-env @@ -46,6 +52,8 @@ 
steps: - label: "docslt" command: "ci/scripts/docslt.sh" key: "docslt" + if: | + build.env("CI_STEPS") !~ /(^|,)disable-build(,|$$)/ plugins: - docker-compose#v5.1.0: run: rw-build-env @@ -649,8 +657,7 @@ steps: - label: "upload micro-benchmark" key: "upload-micro-benchmarks" if: | - build.branch == "main" - || !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null + !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null || build.pull_request.labels includes "ci/run-micro-benchmarks" || build.env("CI_STEPS") =~ /(^|,)micro-benchmarks?(,|$$)/ command: @@ -993,7 +1000,7 @@ steps: key: "e2e-mongodb-sink-tests" command: "ci/scripts/e2e-mongodb-sink-test.sh -p ci-release" if: | - !(build.pull_request.labels includes "ci/main-cron/skip-ci") && build.env("CI_STEPS") == null + !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null || build.pull_request.labels includes "ci/run-e2e-mongodb-sink-tests" || build.env("CI_STEPS") =~ /(^|,)e2e-mongodb-sink-tests?(,|$$)/ depends_on: @@ -1119,13 +1126,13 @@ steps: # Notification test. - key: "test-notify" - if: build.pull_request.labels includes "ci/main-cron/test-notify" + if: build.pull_request.labels includes "ci/main-cron/test-notify" || build.env("CI_STEPS") =~ /(^|,)test_notify(,|$$)/ command: | bash -c 'echo test && exit -1' # Notification test. - key: "test-notify-2" - if: build.pull_request.labels includes "ci/main-cron/test-notify" + if: build.pull_request.labels includes "ci/main-cron/test-notify" || build.env("CI_STEPS") =~ /(^|,)test_notify(,|$$)/ command: | bash -c 'echo test && exit -1' @@ -1138,4 +1145,4 @@ steps: # This should be the LAST part of the main-cron file. 
- label: "trigger failed test notification" if: build.pull_request.labels includes "ci/main-cron/test-notify" || build.branch == "main" - command: "ci/scripts/notify.py" + command: "ci/scripts/notify.py" \ No newline at end of file diff --git a/docs/dev/src/ci.md b/docs/dev/src/ci.md index 840173766055..0f12d893acae 100644 --- a/docs/dev/src/ci.md +++ b/docs/dev/src/ci.md @@ -19,3 +19,23 @@ To run `e2e-test` and `e2e-source-test` for `main-cron` in your pull request: 1. Add `ci/run-e2e-test`. 2. Add `ci/run-e2e-source-tests`. 3. Add `ci/main-cron/run-selected` to skip all other steps which were not selected with `ci/run-xxx`. + +## Main Cron Bisect Guide + +1. Create a new build via buildkite: https://buildkite.com/risingwavelabs/main-cron-bisect/builds/#new +2. Add the following environment variables: + - `GOOD_COMMIT`: The good commit hash. + - `BAD_COMMIT`: The bad commit hash. + - `BISECT_BRANCH`: The branch name where the bisect will be performed. + - `CI_STEPS`: The `CI_STEPS` to run during the bisect. Separate multiple steps with a comma. + - You can check the labels for this in `main-cron.yml`, under the conditions for each step. + +Example you can try on [buildkite](https://buildkite.com/risingwavelabs/main-cron-bisect/builds/#new): +- Branch: `kwannoel/find-regress` +- Environment variables: + ``` + GOOD_COMMIT=29791ddf16fdf2c2e83ad3a58215f434e610f89a + BAD_COMMIT=7f36bf17c1d19a1e6b2cdb90491d3c08ae8b0004 + BISECT_BRANCH=kwannoel/test-bisect + CI_STEPS="test-bisect,disable-build" + ``` \ No newline at end of file