feat(ci): introduce main-cron bisect (#17596)

risingwavelabs · Jul 17, 2024 · 42d5153 · 42d5153
1 parent 05330f6
commit 42d5153
Show file tree

Hide file tree

Showing 4 changed files with 307 additions and 6 deletions.
diff --git a/ci/scripts/find-regression.py b/ci/scripts/find-regression.py
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+
+import subprocess
+import unittest
+import os
+import sys
+
+'''
+@kwannoel
+This script is used to find the commit that introduced a regression in the codebase.
+It uses binary search to find the regressed commit.
+It works as follows:
+1. Use the start (inclusive) and end (exclusive) bounds, find the middle commit.
+   e.g. given commit 0->1(start)->2->3->4(bad), start will be 1, end will be 4. Then the middle commit is (1+4)//2 = 2
+        given commit 0->1(start)->2->3(bad)->4, start will be 1, end will be 3. Then the middle commit is (1+3)//2 = 2
+        given commit 0->1(start)->2(bad), start will be 1, end will be 2. Then the middle commit is (1+2)//2 = 1.
+        given commit 0->1(start,bad), start will be 1, end will be 1. We just return the bad commit (1) immediately.
+2. Run the pipeline on the middle commit.
+3. If the pipeline fails, the regression is in the first half of the commits. Recurse (start, mid)
+4. If the pipeline passes, the regression is in the second half of the commits. Recurse (mid+1, end)
+5. If start>=end, return start as the regressed commit.
+
+We won't run the entire pipeline, only steps specified by the CI_STEPS environment variable.
+
+For step (2), we need to check its outcome and only run the next step, if the outcome is successful.
+'''
+
+
+def format_step(env):
+    commit = get_bisect_commit(env["GOOD_COMMIT"], env["BAD_COMMIT"])
+    step = f'''
+cat <<- YAML | buildkite-agent pipeline upload
+steps:
+  - label: "run-{commit}"
+    key: "run-{commit}"
+    trigger: "main-cron"
+    soft_fail: true
+    build:
+      branch: {env["BISECT_BRANCH"]}
+      commit: {commit}
+      env:
+        CI_STEPS: {env['CI_STEPS']}
+  - wait
+  - label: 'check'
+    command: |
+        GOOD_COMMIT={env['GOOD_COMMIT']} BAD_COMMIT={env['BAD_COMMIT']} BISECT_BRANCH={env['BISECT_BRANCH']} CI_STEPS=\'{env['CI_STEPS']}\' ci/scripts/find-regression.py check
+YAML'''
+    return step
+
+
+def report_step(commit):
+    step = f'''
+cat <<- YAML | buildkite-agent pipeline upload
+steps:
+  - label: "Regressed Commit: {commit}"
+    command: "echo 'Regressed Commit: {commit}'"
+YAML'''
+    print(f"--- reporting regression commit: {commit}")
+    result = subprocess.run(step, shell=True)
+    if result.returncode != 0:
+        print(f"stderr: {result.stderr}")
+        print(f"stdout: {result.stdout}")
+        sys.exit(1)
+
+
+# Triggers a buildkite job to run the pipeline on the given commit, with the specified tests.
+def run_pipeline(env):
+    step = format_step(env)
+    print(f"--- running upload pipeline for step\n{step}")
+    result = subprocess.run(step, shell=True)
+    if result.returncode != 0:
+        print(f"stderr: {result.stderr}")
+        print(f"stdout: {result.stdout}")
+        sys.exit(1)
+
+
+# Number of commits for [start, end)
+def get_number_of_commits(start, end):
+    cmd = f"git rev-list --count {start}..{end}"
+    result = subprocess.run([cmd], shell=True, capture_output=True, text=True)
+    if result.returncode != 0:
+        print(f"stderr: {result.stderr}")
+        print(f"stdout: {result.stdout}")
+        sys.exit(1)
+    return int(result.stdout)
+
+
+def get_bisect_commit(start, end):
+    number_of_commits = get_number_of_commits(start, end)
+    commit_offset = number_of_commits // 2
+    if commit_offset == 0:
+        return start
+
+    cmd = f"git rev-list --reverse {start}..{end} | head -n {commit_offset} | tail -n 1"
+    result = subprocess.run([cmd], shell=True, capture_output=True, text=True)
+    if result.returncode != 0:
+        print(f"stderr: {result.stderr}")
+        print(f"stdout: {result.stdout}")
+        sys.exit(1)
+    return result.stdout.strip()
+
+
+def get_commit_after(branch, commit):
+    cmd = f"git log --reverse --ancestry-path {commit}..origin/{branch} --format=%H | head -n 1"
+    result = subprocess.run([cmd], shell=True, capture_output=True, text=True)
+
+    if result.returncode != 0:
+        print(f"stderr: {result.stderr}")
+        print(f"stdout: {result.stdout}")
+        sys.exit(1)
+
+    return result.stdout.strip()
+
+
+def get_env():
+    env = {
+        "GOOD_COMMIT": os.environ['GOOD_COMMIT'],
+        "BAD_COMMIT": os.environ['BAD_COMMIT'],
+        "BISECT_BRANCH": os.environ['BISECT_BRANCH'],
+        "CI_STEPS": os.environ['CI_STEPS'],
+    }
+
+    print(f'''
+GOOD_COMMIT={env["GOOD_COMMIT"]}
+BAD_COMMIT={env["BAD_COMMIT"]}
+BISECT_BRANCH={env["BISECT_BRANCH"]}
+CI_STEPS={env["CI_STEPS"]}
+        ''')
+
+    return env
+
+
+def fetch_branch_commits(branch):
+    cmd = f"git fetch -q origin {branch}"
+    result = subprocess.run([cmd], shell=True)
+    if result.returncode != 0:
+        print(f"stderr: {result.stderr}")
+        print(f"stdout: {result.stdout}")
+        sys.exit(1)
+
+
+def main():
+    cmd = sys.argv[1]
+
+    if cmd == "start":
+        print("--- start bisecting")
+        env = get_env()
+        fetch_branch_commits(env["BISECT_BRANCH"])
+        run_pipeline(env)
+    elif cmd == "check":
+        print("--- check pipeline outcome")
+        env = get_env()
+        fetch_branch_commits(env["BISECT_BRANCH"])
+        commit = get_bisect_commit(env["GOOD_COMMIT"], env["BAD_COMMIT"])
+        step = f"run-{commit}"
+        cmd = f"buildkite-agent step get outcome --step {step}"
+        outcome = subprocess.run(cmd, shell=True, capture_output=True, text=True)
+
+        if outcome.returncode != 0:
+            print(f"stderr: {outcome.stderr}")
+            print(f"stdout: {outcome.stdout}")
+            sys.exit(1)
+
+        outcome = outcome.stdout.strip()
+        if outcome == "soft_failed":
+            print(f"commit failed: {commit}")
+            env["BAD_COMMIT"] = commit
+        elif outcome == "passed":
+            print(f"commit passed: {commit}")
+            env["GOOD_COMMIT"] = get_commit_after(env["BISECT_BRANCH"], commit)
+        else:
+            print(f"invalid outcome: {outcome}")
+            sys.exit(1)
+
+        if env["GOOD_COMMIT"] == env["BAD_COMMIT"]:
+            report_step(env["GOOD_COMMIT"])
+            return
+        else:
+            print(f"run next iteration, start: {env['GOOD_COMMIT']}, end: {env['BAD_COMMIT']}")
+            run_pipeline(env)
+    else:
+        print(f"invalid cmd: {cmd}")
+        sys.exit(1)
+
+
+# For the tests, we use RisingWave's sequence of commits, from earliest to latest:
+# 617d23ddcac88ced87b96a2454c9217da0fe7915
+# 72f70960226680e841a8fbdd09c79d74609f27a2
+# 5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0
+# 9ca415a9998a5e04e021c899fb66d93a17931d4f
+class Test(unittest.TestCase):
+    def test_get_commit_after(self):
+        fetch_branch_commits("kwannoel/find-regress")
+        commit = get_commit_after("kwannoel/find-regress", "72f70960226680e841a8fbdd09c79d74609f27a2")
+        self.assertEqual(commit, "5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0")
+        commit2 = get_commit_after("kwannoel/find-regress", "617d23ddcac88ced87b96a2454c9217da0fe7915")
+        self.assertEqual(commit2, "72f70960226680e841a8fbdd09c79d74609f27a2")
+        commit3 = get_commit_after("kwannoel/find-regress", "5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0")
+        self.assertEqual(commit3, "9ca415a9998a5e04e021c899fb66d93a17931d4f")
+
+    def test_get_number_of_commits(self):
+        fetch_branch_commits("kwannoel/find-regress")
+        n = get_number_of_commits("72f70960226680e841a8fbdd09c79d74609f27a2",
+                                  "9ca415a9998a5e04e021c899fb66d93a17931d4f")
+        self.assertEqual(n, 2)
+        n2 = get_number_of_commits("617d23ddcac88ced87b96a2454c9217da0fe7915",
+                                   "9ca415a9998a5e04e021c899fb66d93a17931d4f")
+        self.assertEqual(n2, 3)
+        n3 = get_number_of_commits("72f70960226680e841a8fbdd09c79d74609f27a2",
+                                   "5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0")
+        self.assertEqual(n3, 1)
+
+    def test_get_bisect_commit(self):
+        fetch_branch_commits("kwannoel/find-regress")
+        commit = get_bisect_commit("72f70960226680e841a8fbdd09c79d74609f27a2",
+                                   "9ca415a9998a5e04e021c899fb66d93a17931d4f")
+        self.assertEqual(commit, "5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0")
+        commit2 = get_bisect_commit("617d23ddcac88ced87b96a2454c9217da0fe7915",
+                                    "9ca415a9998a5e04e021c899fb66d93a17931d4f")
+        self.assertEqual(commit2, "72f70960226680e841a8fbdd09c79d74609f27a2")
+        commit3 = get_bisect_commit("72f70960226680e841a8fbdd09c79d74609f27a2",
+                                    "5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0")
+        self.assertEqual(commit3, "72f70960226680e841a8fbdd09c79d74609f27a2")
+
+    def test_format_step(self):
+        fetch_branch_commits("kwannoel/find-regress")
+        self.maxDiff = None
+        env = {
+            "GOOD_COMMIT": "72f70960226680e841a8fbdd09c79d74609f27a2",
+            "BAD_COMMIT": "9ca415a9998a5e04e021c899fb66d93a17931d4f",
+            "BISECT_BRANCH": "kwannoel/find-regress",
+            "CI_STEPS": "test"
+        }
+        step = format_step(env)
+        self.assertEqual(
+            step,
+            '''
+cat <<- YAML | buildkite-agent pipeline upload
+steps:
+  - label: "run-5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0"
+    key: "run-5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0"
+    trigger: "main-cron"
+    soft_fail: true
+    build:
+      branch: kwannoel/find-regress
+      commit: 5c7b556ea60d136c5bccf1b1f7e313d2f9c79ef0
+      env:
+        CI_STEPS: test
+  - wait
+  - label: 'check'
+    command: |
+        GOOD_COMMIT=72f70960226680e841a8fbdd09c79d74609f27a2 BAD_COMMIT=9ca415a9998a5e04e021c899fb66d93a17931d4f BISECT_BRANCH=kwannoel/find-regress CI_STEPS='test' ci/scripts/find-regression.py check
+YAML'''
+        )
+
+
+if __name__ == "__main__":
+    # You can run tests by just doing ./ci/scripts/find-regression.py
+    if len(sys.argv) == 1:
+        unittest.main()
+    else:
+        main()
diff --git a/ci/workflows/main-cron-bisect.yml b/ci/workflows/main-cron-bisect.yml
@@ -0,0 +1,12 @@
+auto-retry: &auto-retry
+  automatic:
+    # Agent terminated because the AWS EC2 spot instance killed by AWS.
+    - signal_reason: agent_stop
+      limit: 3
+
+steps:
+  - label: "find regressed step"
+    key: "find-regressed-step"
+    command: "GOOD_COMMIT=$GOOD_COMMIT BAD_COMMIT=$BAD_COMMIT BISECT_BRANCH=$BISECT_BRANCH CI_STEPS=$CI_STEPS ci/scripts/find-regression.py start"
+    if: build.env("CI_STEPS") != null
+    retry: *auto-retry
diff --git a/ci/workflows/main-cron.yml b/ci/workflows/main-cron.yml
@@ -8,6 +8,8 @@ steps:
   - label: "build"
     command: "ci/scripts/build.sh -p ci-release"
     key: "build"
+    if: |
+      build.env("CI_STEPS") !~ /(^|,)disable-build(,|$$)/
     plugins:
       - docker-compose#v5.1.0:
           run: rw-build-env
@@ -19,6 +21,8 @@ steps:
   - label: "build other components"
     command: "ci/scripts/build-other.sh"
     key: "build-other"
+    if: |
+      build.env("CI_STEPS") !~ /(^|,)disable-build(,|$$)/
     plugins:
       - seek-oss/aws-sm#v2.3.1:
           env:
@@ -35,6 +39,8 @@ steps:
   - label: "build simulation test"
     command: "ci/scripts/build-simulation.sh"
     key: "build-simulation"
+    if: |
+      build.env("CI_STEPS") !~ /(^|,)disable-build(,|$$)/
     plugins:
       - docker-compose#v5.1.0:
           run: rw-build-env
@@ -46,6 +52,8 @@ steps:
   - label: "docslt"
     command: "ci/scripts/docslt.sh"
     key: "docslt"
+    if: |
+      build.env("CI_STEPS") !~ /(^|,)disable-build(,|$$)/
     plugins:
       - docker-compose#v5.1.0:
           run: rw-build-env
@@ -649,8 +657,7 @@ steps:
   - label: "upload micro-benchmark"
     key: "upload-micro-benchmarks"
     if: |
-      build.branch == "main"
-      || !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null
+      !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null
       || build.pull_request.labels includes "ci/run-micro-benchmarks"
       || build.env("CI_STEPS") =~ /(^|,)micro-benchmarks?(,|$$)/
     command:
@@ -993,7 +1000,7 @@ steps:
     key: "e2e-mongodb-sink-tests"
     command: "ci/scripts/e2e-mongodb-sink-test.sh -p ci-release"
     if: |
-      !(build.pull_request.labels includes "ci/main-cron/skip-ci") && build.env("CI_STEPS") == null
+      !(build.pull_request.labels includes "ci/main-cron/run-selected") && build.env("CI_STEPS") == null
       || build.pull_request.labels includes "ci/run-e2e-mongodb-sink-tests"
       || build.env("CI_STEPS") =~ /(^|,)e2e-mongodb-sink-tests?(,|$$)/
     depends_on:
@@ -1119,13 +1126,13 @@ steps:
 
   # Notification test.
   - key: "test-notify"
-    if: build.pull_request.labels includes "ci/main-cron/test-notify"
+    if: build.pull_request.labels includes "ci/main-cron/test-notify" || build.env("CI_STEPS") =~ /(^|,)test_notify(,|$$)/
     command: |
       bash -c 'echo test && exit -1'
 
   # Notification test.
   - key: "test-notify-2"
-    if: build.pull_request.labels includes "ci/main-cron/test-notify"
+    if: build.pull_request.labels includes "ci/main-cron/test-notify" || build.env("CI_STEPS") =~ /(^|,)test_notify(,|$$)/
     command: |
       bash -c 'echo test && exit -1'
 
@@ -1138,4 +1145,4 @@ steps:
   # This should be the LAST part of the main-cron file.
   - label: "trigger failed test notification"
     if: build.pull_request.labels includes "ci/main-cron/test-notify" || build.branch == "main"
-    command: "ci/scripts/notify.py"
+    command: "ci/scripts/notify.py"
diff --git a/docs/dev/src/ci.md b/docs/dev/src/ci.md
@@ -19,3 +19,23 @@ To run `e2e-test` and `e2e-source-test` for `main-cron` in your pull request:
 1. Add `ci/run-e2e-test`.
 2. Add `ci/run-e2e-source-tests`.
 3. Add `ci/main-cron/run-selected` to skip all other steps which were not selected with `ci/run-xxx`.
+
+## Main Cron Bisect Guide
+
+1. Create a new build via buildkite: https://buildkite.com/risingwavelabs/main-cron-bisect/builds/#new
+2. Add the following environment variables:
+   - `GOOD_COMMIT`: The good commit hash.
+   - `BAD_COMMIT`: The bad commit hash.
+   - `BISECT_BRANCH`: The branch name where the bisect will be performed.
+   - `CI_STEPS`: The `CI_STEPS` to run during the bisect. Separate multiple steps with a comma.
+     - You can check the labels for this in `main-cron.yml`, under the conditions for each step.
+
+Example you can try on [buildkite](https://buildkite.com/risingwavelabs/main-cron-bisect/builds/#new):
+- Branch: `kwannoel/find-regress`
+- Environment variables:
+  ```
+  START_COMMIT=29791ddf16fdf2c2e83ad3a58215f434e610f89a
+  END_COMMIT=7f36bf17c1d19a1e6b2cdb90491d3c08ae8b0004
+  BISECT_BRANCH=kwannoel/test-bisect
+  CI_STEPS="test-bisect,disable-build"
+  ```