diff --git a/.github/workflows/provision-replay-verify-archive-disks.yaml b/.github/workflows/provision-replay-verify-archive-disks.yaml new file mode 100644 index 0000000000000..a8fd80ef2df5b --- /dev/null +++ b/.github/workflows/provision-replay-verify-archive-disks.yaml @@ -0,0 +1,66 @@ +name: "provision-replay-verify-archive-disks" +on: + # Allow triggering manually + workflow_dispatch: + inputs: + NETWORK: + required: true + type: choice + description: The network to provision storage for.If not specified, it will provision snapshot for both testnet and mainnet. + options: [testnet, mainnet, all] + default: all + pull_request: + paths: + - ".github/workflows/provision-replay-verify-archive-disks.yaml" + - ".github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml" + schedule: + - cron: "0 22 * * 1,3,5" # This runs every Mon,Wed,Fri + +permissions: + contents: read + id-token: write #required for GCP Workload Identity federation which we use to login into Google Artifact Registry + issues: read + pull-requests: read + +# cancel redundant builds +concurrency: + # cancel redundant builds on PRs (only on PR, not on branches) + group: ${{ github.workflow }}-${{ (github.event_name == 'pull_request' && github.ref) || github.sha }} + cancel-in-progress: true + +jobs: + determine-test-metadata: + runs-on: ubuntu-latest + steps: + # checkout the repo first, so check-aptos-core can use it and cancel the workflow if necessary + - uses: actions/checkout@v4 + - uses: ./.github/actions/check-aptos-core + with: + cancel-workflow: ${{ github.event_name == 'schedule' }} # Cancel the workflow if it is scheduled on a fork + + - name: Debug + run: | + echo "Event name: ${{ github.event_name }}" + echo "Network: ${{ inputs.NETWORK }}" + + provision-testnet: + if: | + github.event_name == 'schedule' || + github.event_name == 'push' || + github.event_name == 'workflow_dispatch' && (inputs.NETWORK == 'testnet' || inputs.NETWORK == 'all') + needs: 
determine-test-metadata + uses: ./.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml + secrets: inherit + with: + NETWORK: testnet + + provision-mainnet: + if: | + github.event_name == 'schedule' || + github.event_name == 'push' || + github.event_name == 'workflow_dispatch' && (inputs.NETWORK == 'mainnet' || inputs.NETWORK == 'all') + needs: determine-test-metadata + uses: ./.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml + secrets: inherit + with: + NETWORK: mainnet diff --git a/.github/workflows/replay-verify-on-archive.yaml b/.github/workflows/replay-verify-on-archive.yaml new file mode 100644 index 0000000000000..0a5ba5ea90688 --- /dev/null +++ b/.github/workflows/replay-verify-on-archive.yaml @@ -0,0 +1,85 @@ +# This defines a workflow to replay transactions on the given chain with the latest aptos node software. +# In order to trigger it go to the Actions Tab of the Repo, click "replay-verify" and then "Run Workflow". +# +# On PR, a single test case will run. On workflow_dispatch, you may specify the CHAIN_NAME to verify. + +name: "replay-verify-on-archive" +on: + # Allow triggering manually + workflow_dispatch: + inputs: + NETWORK: + required: true + type: choice + options: [testnet, mainnet, all] + default: all + description: The chain name to test. If not specified, it will test both testnet and mainnet. + IMAGE_TAG: + required: false + type: string + description: The image tag of the feature branch to test, if not specified, it will use the latest commit on current branch. + START_VERSION: + required: false + type: string + description: Optional version to start replaying. If not specified, replay-verify will determines start version itself. + END_VERSION: + required: false + type: string + description: Optional version to end replaying. If not specified, replay-verify will determines end version itself. 
+ pull_request: + paths: + - ".github/workflows/replay-verify-on-archive.yaml" + - ".github/workflows/workflow-run-replay-verify-on-archive.yaml" + schedule: + - cron: "0 22 * * 0,2,4" # The main branch cadence. This runs every Sun,Tues,Thurs + +permissions: + contents: read + id-token: write #required for GCP Workload Identity federation which we use to login into Google Artifact Registry + issues: read + pull-requests: read + +# cancel redundant builds +concurrency: + # cancel redundant builds on PRs (only on PR, not on branches) + group: ${{ github.workflow }}-${{ (github.event_name == 'pull_request' && github.ref) || github.sha }} + cancel-in-progress: true + +jobs: + determine-test-metadata: + runs-on: ubuntu-latest-32-core + steps: + # checkout the repo first, so check-aptos-core can use it and cancel the workflow if necessary + - uses: actions/checkout@v4 + - uses: ./.github/actions/check-aptos-core + with: + cancel-workflow: ${{ github.event_name == 'schedule' }} # Cancel the workflow if it is scheduled on a fork + + replay-testnet: + if: | + github.event_name == 'schedule' || + github.event_name == 'push' || + github.event_name == 'workflow_dispatch' && (inputs.NETWORK == 'testnet' || inputs.NETWORK == 'all') + needs: determine-test-metadata + uses: ./.github/workflows/workflow-run-replay-verify-on-archive.yaml + secrets: inherit + with: + NETWORK: "testnet" + IMAGE_TAG: ${{ inputs.IMAGE_TAG }} + START_VERSION: ${{ inputs.START_VERSION }} + END_VERSION: ${{ inputs.END_VERSION }} + + replay-mainnet: + if: | + github.event_name == 'schedule' || + github.event_name == 'push' || + github.event_name == 'pull_request' || + github.event_name == 'workflow_dispatch' && (inputs.NETWORK == 'mainnet' || inputs.NETWORK == 'all' ) + needs: determine-test-metadata + uses: ./.github/workflows/workflow-run-replay-verify-on-archive.yaml + secrets: inherit + with: + NETWORK: "mainnet" + IMAGE_TAG: ${{ inputs.IMAGE_TAG }} + START_VERSION: ${{ inputs.START_VERSION }} + 
END_VERSION: ${{ inputs.END_VERSION }} \ No newline at end of file diff --git a/.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml b/.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml new file mode 100644 index 0000000000000..e0560371a98b6 --- /dev/null +++ b/.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml @@ -0,0 +1,61 @@ +name: "*run archive storage provision workflow" + +on: + # This allows the workflow to be triggered from another workflow + workflow_call: + inputs: + NETWORK: + required: true + type: string + description: The network to provision storage for. + workflow_dispatch: + inputs: + NETWORK: + description: The network to provision storage for. + type: string + required: true +jobs: + provision: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.BRANCH || github.ref }} + + # Authenticate to Google Cloud the project is aptos-ci + - name: Authenticate to Google Cloud + id: auth + uses: "google-github-actions/auth@v2" + with: + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} + export_environment_variables: false + create_credentials_file: true + + # This is required since we need to switch from aptos-ci to aptos-devinfra-0 + - name: Setup Credentials + run: | + echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV + echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV + echo "GOOGLE_GHA_CREDS_PATH=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV + echo "CLOUDSDK_AUTH_ACCESS_TOKEN=${{ steps.auth.outputs.access_token }}" >> $GITHUB_ENV + + - name: Set up Cloud SDK + uses: "google-github-actions/setup-gcloud@v2" + with: + install_components: "kubectl, gke-gcloud-auth-plugin" + + - name: 
"Setup GCloud Project" + shell: bash + run: gcloud config set project aptos-devinfra-0 + + - uses: ./.github/actions/python-setup + with: + pyproject_directory: testsuite/replay-verify + + - name: "Provision Storage" + env: + GOOGLE_CLOUD_PROJECT: aptos-devinfra-0 + run: cd testsuite/replay-verify && poetry run python archive_disk_utils.py --network ${{ inputs.NETWORK }} + \ No newline at end of file diff --git a/.github/workflows/workflow-run-replay-verify-on-archive.yaml b/.github/workflows/workflow-run-replay-verify-on-archive.yaml new file mode 100644 index 0000000000000..ec2aab5147c51 --- /dev/null +++ b/.github/workflows/workflow-run-replay-verify-on-archive.yaml @@ -0,0 +1,119 @@ +name: "*run replay-verify on archive reusable workflow" + +on: + # This allows the workflow to be triggered from another workflow + workflow_call: + inputs: + NETWORK: + required: true + type: string + description: The network to run replay verify on. + IMAGE_TAG: + required: false + type: string + description: The image tag of the feature branch to test, if not specified, it will use the latest commit on current branch. + START_VERSION: + required: false + type: string + description: Optional version to start replaying. If not specified, replay-verify will determines start version itself. + END_VERSION: + required: false + type: string + description: Optional version to end replaying. If not specified, replay-verify will determines end version itself. + + workflow_dispatch: + inputs: + NETWORK: + required: true + type: string + description: The network to run replay verify on. + IMAGE_TAG: + required: false + type: string + description: The image tag of the feature branch to test, if not specified, it will use the latest commit on current branch. + START_VERSION: + required: false + type: string + description: The history start to use for the backup. If not specified, it will use the default history start. 
+ END_VERSION: + required: false + type: string + description: The end version to use for the backup. If not specified, it will use the latest version. +jobs: + run-replay-verify: + runs-on: ubuntu-latest-32-core + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.BRANCH || github.ref }} + + - uses: aptos-labs/aptos-core/.github/actions/docker-setup@main + id: docker-setup + with: + GCP_WORKLOAD_IDENTITY_PROVIDER: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + GCP_SERVICE_ACCOUNT_EMAIL: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} + EXPORT_GCP_PROJECT_VARIABLES: "false" + GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }} + + # Authenticate to Google Cloud the project is aptos-ci with credentials files generated + - name: Authenticate to Google Cloud + id: auth + uses: "google-github-actions/auth@v2" + with: + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }} + export_environment_variables: false + create_credentials_file: true + + # This is required since we need to switch from aptos-ci to aptos-devinfra-0 + - name: Setup credentials + run: | + echo "GOOGLE_APPLICATION_CREDENTIALS=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV + echo "CLOUDSDK_AUTH_CREDENTIAL_FILE_OVERRIDE=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV + echo "GOOGLE_GHA_CREDS_PATH=${{ steps.auth.outputs.credentials_file_path }}" >> $GITHUB_ENV + echo "CLOUDSDK_AUTH_ACCESS_TOKEN=${{ steps.auth.outputs.access_token }}" >> $GITHUB_ENV + + - name: Set up Cloud SDK + uses: "google-github-actions/setup-gcloud@v2" + with: + install_components: "kubectl, gke-gcloud-auth-plugin" + + - name: "Setup GCloud project" + shell: bash + run: gcloud config set project aptos-devinfra-0 + + - uses: ./.github/actions/python-setup + with: + pyproject_directory: testsuite/replay-verify + + - name: Schedule replay verify + env: + 
GOOGLE_CLOUD_PROJECT: aptos-devinfra-0 + run: | + cd testsuite/replay-verify + CMD="poetry run python main.py --network ${{ inputs.NETWORK }}" + if [ -n "${{ inputs.START_VERSION }}" ]; then + CMD="$CMD --start ${{ inputs.START_VERSION }}" + fi + if [ -n "${{ inputs.END_VERSION }}" ]; then + CMD="$CMD --end ${{ inputs.END_VERSION }}" + fi + + if [ -n "${{ inputs.IMAGE_TAG }}" ]; then + CMD="$CMD --image_tag ${{ inputs.IMAGE_TAG }}" + fi + + eval $CMD + # This is in case user manually cancel the step above, we still want to cleanup the resources + - name: Post-run cleanup + env: + GOOGLE_CLOUD_PROJECT: aptos-devinfra-0 + if: ${{ always() }} + run: | + cd testsuite/replay-verify + poetry run python main.py --network ${{ inputs.NETWORK }} --cleanup + + + + diff --git a/storage/db-tool/src/replay_on_archive.rs b/storage/db-tool/src/replay_on_archive.rs index 74971bb45eaf3..6c7c8ae8a6dda 100644 --- a/storage/db-tool/src/replay_on_archive.rs +++ b/storage/db-tool/src/replay_on_archive.rs @@ -210,7 +210,7 @@ impl Verifier { return Ok(total_failed_txns); } } - + let (input_txn, expected_txn_info, expected_event, expected_writeset) = item?; let is_epoch_ending = expected_event.iter().any(ContractEvent::is_new_epoch_event); cur_txns.push(input_txn); diff --git a/testsuite/replay-verify/__init__.py b/testsuite/replay-verify/__init__.py new file mode 100644 index 0000000000000..5e6dd8068ce10 --- /dev/null +++ b/testsuite/replay-verify/__init__.py @@ -0,0 +1,8 @@ +import os +import sys + + +path = os.path.dirname(__file__) + +if path not in sys.path: + sys.path.append(path) diff --git a/testsuite/replay-verify/archive-pvc-template.yaml b/testsuite/replay-verify/archive-pvc-template.yaml new file mode 100644 index 0000000000000..9f3162e56a31b --- /dev/null +++ b/testsuite/replay-verify/archive-pvc-template.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + annotations: + volume.kubernetes.io/storage-provisioner: pd.csi.storage.gke.io + name: 
testnet-archive-claim + labels: + run: some-label +spec: + accessModes: + - ReadOnlyMany + resources: + requests: + storage: 10Ti + storageClassName: ssd-data-xfs + volumeMode: Filesystem + dataSourceRef: + name: testnet-archive + kind: VolumeSnapshot + apiGroup: snapshot.storage.k8s.io \ No newline at end of file diff --git a/testsuite/replay-verify/archive-snapshot-template.yaml b/testsuite/replay-verify/archive-snapshot-template.yaml new file mode 100644 index 0000000000000..7627799081098 --- /dev/null +++ b/testsuite/replay-verify/archive-snapshot-template.yaml @@ -0,0 +1,22 @@ +apiVersion: snapshot.storage.k8s.io/v1 +kind: VolumeSnapshotContent +metadata: + name: testnet-archive +spec: + deletionPolicy: Retain + driver: pd.csi.storage.gke.io + source: + snapshotHandle: projects/aptos-devinfra-0/global/snapshots/testnet-archive + volumeSnapshotRef: + kind: VolumeSnapshot + name: testnet-archive + namespace: default +--- +apiVersion: snapshot.storage.k8s.io/v1 +kind: VolumeSnapshot +metadata: + name: testnet-archive +spec: + volumeSnapshotClassName: pd-data + source: + volumeSnapshotContentName: testnet-archive diff --git a/testsuite/replay-verify/archive_disk_utils.py b/testsuite/replay-verify/archive_disk_utils.py index 59f9d0f0ee233..bb1e67a1d6060 100644 --- a/testsuite/replay-verify/archive_disk_utils.py +++ b/testsuite/replay-verify/archive_disk_utils.py @@ -1,3 +1,4 @@ +import argparse from google.cloud import compute_v1 from kubernetes import client, config import time @@ -5,9 +6,11 @@ import concurrent.futures import time import yaml +from kubernetes.client.rest import ApiException + # Constants -DISK_COPIES = 4 +DISK_COPIES = 1 # Logging configuration logging.basicConfig(level=logging.INFO) @@ -15,6 +18,9 @@ import subprocess +TESTNET_SNAPSHOT_NAME = "testnet-archive" +MAINNET_SNAPSHOT_NAME = "mainnet-archive" + def get_kubectl_credentials(project_id, region, cluster_name): try: @@ -33,7 +39,7 @@ def get_kubectl_credentials(project_id, region, 
cluster_name): subprocess.check_call(command) logger.info(f"Successfully fetched credentials for cluster: {cluster_name}") except subprocess.CalledProcessError as e: - logger.info("Error fetching kubectl credentials:", e) + logger.error(f"Error fetching kubectl credentials: {e}") def get_snapshot_source_pv_and_zone(project_id, region, cluster_id, namespace): @@ -79,7 +85,7 @@ def get_snapshot_source_pv_and_zone(project_id, region, cluster_id, namespace): return pv_name, zone -def create_snapshot_with_gcloud( +def create_snapshot_from_backup_pods( snapshot_name, source_project, source_cluster, @@ -90,6 +96,22 @@ def create_snapshot_with_gcloud( (volume_name, zone) = get_snapshot_source_pv_and_zone( source_project, source_region, source_cluster, source_namespace ) + create_snapshot_with_gcloud( + snapshot_name, + source_project, + volume_name, + zone, + target_project, + ) + + +def create_snapshot_with_gcloud( + snapshot_name, + source_project, + source_volume, + source_zone, + target_project, +): # delete the snapshot if it already exists snapshot_client = compute_v1.SnapshotsClient() try: @@ -108,7 +130,7 @@ def create_snapshot_with_gcloud( ) # Construct the gcloud command to create the snapshot in the target project - source_disk_link = f"https://www.googleapis.com/compute/v1/projects/aptos-platform-compute-0/zones/{zone}/disks/{volume_name}" + source_disk_link = f"https://www.googleapis.com/compute/v1/projects/{source_project}/zones/{source_zone}/disks/{source_volume}" command = [ "gcloud", "compute", @@ -138,7 +160,15 @@ def create_snapshot_with_gcloud( # require getting a hold of the kubectrl of the cluster # eg: gcloud container clusters get-credentials replay-on-archive --region us-central1 --project replay-verify def create_disk_pv_pvc_from_snapshot( - project, zone, cluster_name, snapshot_name, disk_name, pv_name, pvc_name, namespace + project, + zone, + cluster_name, + og_snapshot_name, + snapshot_name, + disk_name, + pv_name, + pvc_name, + namespace, ): 
disk_client = compute_v1.DisksClient() snapshot_client = compute_v1.SnapshotsClient() @@ -157,7 +187,8 @@ def create_disk_pv_pvc_from_snapshot( logger.info(f"Disk {e} {disk_name} does not exist. Creating a new one.") # Create a new disk from the snapshot - snapshot = snapshot_client.get(project=project, snapshot=snapshot_name) + logger.info(f"Creating disk {disk_name} from snapshot {og_snapshot_name}.") + snapshot = snapshot_client.get(project=project, snapshot=og_snapshot_name) disk_body = compute_v1.Disk( name=disk_name, source_snapshot=snapshot.self_link, @@ -166,19 +197,20 @@ def create_disk_pv_pvc_from_snapshot( operation = disk_client.insert(project=project, zone=zone, disk_resource=disk_body) wait_for_operation(project, zone, operation.name, compute_v1.ZoneOperationsClient()) - logger.info(f"Disk {disk_name} created from snapshot {snapshot_name}.") + logger.info(f"Disk {disk_name} created from snapshot {og_snapshot_name}.") region_name = zone.rsplit("-", 1)[0] get_kubectl_credentials(project, region_name, cluster_name) - create_persistent_volume(disk_name, pv_name, pvc_name, namespace, True) + # create_persistent_volume(disk_name, pv_name, pvc_name, namespace, True) # this is only for xfs replaying logs to repair the disk repair_pv = f"{pv_name}-repair" repair_pvc = f"{pvc_name}-repair" + repair_job_name = f"xfs-repair-{pvc_name}" create_persistent_volume(disk_name, repair_pv, repair_pvc, namespace, False) # start a pod to mount the disk and run simple task with open("xfs-disk-repair.yaml", "r") as f: pod_manifest = yaml.safe_load(f) - pod_manifest["metadata"]["name"] = f"xfs-repair-{pvc_name}" + pod_manifest["metadata"]["name"] = repair_job_name pod_manifest["spec"]["template"]["spec"]["volumes"][0]["persistentVolumeClaim"][ "claimName" ] = repair_pvc @@ -190,6 +222,23 @@ def create_disk_pv_pvc_from_snapshot( except Exception as e: logger.error(f"Error creating disk repairing job: {e}") + # wait till the pod clean up so that disk attachement is not changed 
during snapshot creation + while not is_job_pod_cleanedup(namespace, repair_job_name): + logger.info(f"Waiting for job {repair_job_name} to finish.") + time.sleep(10) + logger.info(f"creating final snapshot") + create_snapshot_with_gcloud(snapshot_name, project, disk_name, zone, project) + + +def is_job_pod_cleanedup(namespace, job_name): + config.load_kube_config() + v1 = client.BatchV1Api() + try: + job = v1.read_namespaced_job(job_name, namespace) + return False + except Exception as e: + return True + def wait_for_operation(project, zone, operation_name, zone_operations_client): while True: @@ -271,7 +320,116 @@ def create_persistent_volume(disk_name, pv_name, pvc_name, namespace, read_only) v1.create_namespaced_persistent_volume_claim(namespace=namespace, body=pvc) -def create_disk_pv_pvc(project, zone, cluster_name, snapshot_name, prefix, namespace): +def create_one_pvc_from_snapshot(pvc_name, snapshot_name, namespace, label): + config.load_kube_config() + api_instance = client.CoreV1Api() + storage_size = "10Ti" if TESTNET_SNAPSHOT_NAME in snapshot_name else "8Ti" + # Define the PVC manifest + pvc_manifest = { + "apiVersion": "v1", + "kind": "PersistentVolumeClaim", + "metadata": { + "name": f"{pvc_name}", + "annotations": { + "volume.kubernetes.io/storage-provisioner": "pd.csi.storage.gke.io" + }, + "labels": {"run": f"{label}"}, + }, + "spec": { + "accessModes": ["ReadOnlyMany"], + "resources": {"requests": {"storage": storage_size}}, + "storageClassName": "ssd-data-xfs", + "volumeMode": "Filesystem", + "dataSource": { + "name": f"{snapshot_name}", + "kind": "VolumeSnapshot", + "apiGroup": "snapshot.storage.k8s.io", + }, + }, + } + + api_instance.create_namespaced_persistent_volume_claim( + namespace=namespace, body=pvc_manifest + ) + return pvc_name + + +def create_pvcs_from_snapshot(run_id, snapshot_name, namespace, pvc_num, label): + config.load_kube_config() + api_instance = client.CustomObjectsApi() + volume_snapshot_content = { + "apiVersion": 
"snapshot.storage.k8s.io/v1", + "kind": "VolumeSnapshotContent", + "metadata": {"name": f"{snapshot_name}"}, + "spec": { + "deletionPolicy": "Retain", + "driver": "pd.csi.storage.gke.io", + "source": { + "snapshotHandle": f"projects/aptos-devinfra-0/global/snapshots/{snapshot_name}" + }, + "volumeSnapshotRef": { + "kind": "VolumeSnapshot", + "name": f"{snapshot_name}", + "namespace": f"{namespace}", + }, + }, + } + + # Define the VolumeSnapshot manifest + volume_snapshot = { + "apiVersion": "snapshot.storage.k8s.io/v1", + "kind": "VolumeSnapshot", + "metadata": {"name": f"{snapshot_name}"}, + "spec": { + "volumeSnapshotClassName": "pd-data", + "source": {"volumeSnapshotContentName": f"{snapshot_name}"}, + }, + } + + # Create VolumeSnapshotContent + try: + api_instance.create_cluster_custom_object( + group="snapshot.storage.k8s.io", + version="v1", + plural="volumesnapshotcontents", + body=volume_snapshot_content, + ) + + # Create VolumeSnapshot + api_instance.create_namespaced_custom_object( + group="snapshot.storage.k8s.io", + version="v1", + namespace=namespace, + plural="volumesnapshots", + body=volume_snapshot, + ) + except ApiException as e: + if e.status != 409: + logger.error(f"Error creating new volumesnapshots: {e}") + + # Execute tasks in parallel + tasks = [ + (f"{run_id}-{snapshot_name}-{pvc_id}", snapshot_name, namespace, label) + for pvc_id in range(pvc_num) + ] + res = [] + with concurrent.futures.ThreadPoolExecutor() as executor: + futures = [ + executor.submit(create_one_pvc_from_snapshot, *task) for task in tasks + ] + for future in concurrent.futures.as_completed(futures): + try: + result = future.result() + logger.info(f"Task result: {result}") + res.append(result) + except Exception as e: + logger.error(f"Task generated an exception: {e}") + return res + + +def create_disk_pv_pvc( + project, zone, cluster_name, og_snapshot_name, snapshot_name, prefix, namespace +): tasks = [] for copy in range(DISK_COPIES): @@ -283,6 +441,7 @@ def 
create_disk_pv_pvc(project, zone, cluster_name, snapshot_name, prefix, names project, zone, cluster_name, + og_snapshot_name, snapshot_name, disk_name, pv_name, @@ -306,19 +465,41 @@ def create_disk_pv_pvc(project, zone, cluster_name, snapshot_name, prefix, names # start a self deleteing job to mount the xfs disks for repairing +def parse_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=__doc__, + ) + parser.add_argument("--network", required=True, choices=["testnet", "mainnet"]) + args = parser.parse_args() + return args + + if __name__ == "__main__": + # check input arg network + args = parse_args() + network = args.network source_project_id = "aptos-platform-compute-0" region = "us-central1" - source_cluster_id = "general-usce1-0" - source_namespace = "testnet-pfn-usce1-backup" project_id = "aptos-devinfra-0" - snapshot_name = "testnet-archive" - new_pv_prefix = "testnet-archive" target_namespace = "default" zone = "us-central1-a" cluster_name = "devinfra-usce1-0" - create_snapshot_with_gcloud( - snapshot_name, + + if network == "testnet": + source_cluster_id = "general-usce1-0" + source_namespace = "testnet-pfn-usce1-backup" + snapshot_name = TESTNET_SNAPSHOT_NAME + new_pv_prefix = TESTNET_SNAPSHOT_NAME + else: + source_cluster_id = "mainnet-usce1-0" + source_namespace = "mainnet-pfn-usce1-backup" + snapshot_name = MAINNET_SNAPSHOT_NAME + new_pv_prefix = MAINNET_SNAPSHOT_NAME + # create OG snapshot + og_snapshot_name = f"{snapshot_name}-og" + create_snapshot_from_backup_pods( + og_snapshot_name, source_project_id, source_cluster_id, region, @@ -326,5 +507,11 @@ def create_disk_pv_pvc(project, zone, cluster_name, snapshot_name, prefix, names project_id, ) create_disk_pv_pvc( - project_id, zone, cluster_name, snapshot_name, new_pv_prefix, target_namespace + project_id, + zone, + cluster_name, + og_snapshot_name, + snapshot_name, + new_pv_prefix, + target_namespace, ) diff --git 
a/testsuite/replay-verify/main.py b/testsuite/replay-verify/main.py index 83830d8ae159f..cf6546af98502 100644 --- a/testsuite/replay-verify/main.py +++ b/testsuite/replay-verify/main.py @@ -1,36 +1,51 @@ import yaml -from kubernetes import client, config +from kubernetes import client, config as KubernetesConfig from kubernetes.client.rest import ApiException +from google.cloud import storage import time import logging import os from enum import Enum import urllib.parse -from datetime import datetime +import json +import argparse +import sys +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) + +from testsuite import forge +from archive_disk_utils import ( + TESTNET_SNAPSHOT_NAME, + MAINNET_SNAPSHOT_NAME, + create_pvcs_from_snapshot, + get_kubectl_credentials, +) SHARDING_ENABLED = False MAX_RETRIES = 5 RETRY_DELAY = 20 # seconds QUERY_DELAY = 5 # seconds -CONCURRENT_REPLAY = 20 -REPLAY_CONCURRENCY_LEVEL = 1 -DISK_COPIES = 4 +REPLAY_CONCURRENCY_LEVEL = 1 class Network(Enum): - TESETNET = 1 + TESTNET = 1 MAINNET = 2 def __str__(self): return self.name.lower() + @classmethod + def from_string(cls, name: str): + try: + return cls[name.upper()] + except KeyError: + raise ValueError(f"{name} is not a valid Network name") + logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -PUSH_METRICS_ENDPOINT = "PUSH_METRICS_ENDPOINT" - def construct_humio_url(labels_run, pod_name, start_time, end_time): query = f'#k8s.cluster = "devinfra-usce1-0" | k8s.labels.run = "{labels_run}" | "k8s.pod_name" = "{pod_name}"' @@ -73,7 +88,10 @@ def __init__( start_version, end_version, label, - network=Network.TESETNET, + image, + pvcs, + replay_config, + network=Network.TESTNET, namespace="default", ): self.id = id @@ -87,9 +105,15 @@ def __init__( self.network = network self.label = label self.start_time = time.time() + self.image = image + self.pvcs = pvcs + self.config = replay_config def update_status(self): - if self.status is not 
None and self.status.status.phase in ["Succeeded", "Failed"]: + if self.status is not None and self.status.status.phase in [ + "Succeeded", + "Failed", + ]: return self.status = self.get_pod_status() @@ -134,17 +158,11 @@ def has_txn_mismatch(self): return False def get_target_db_dir(self): - if self.network == Network.TESETNET: - return "/mnt/testnet_archive/db" - else: - return "/mnt/mainnet_archive/db" + return "/mnt/archive/db" def get_claim_name(self): - idx = self.id % DISK_COPIES - if self.network == Network.TESETNET: - return f"testnet-archive-claim-{idx}" - else: - return f"mainnet-archive-claim-{idx}" + idx = self.id % len(self.pvcs) + return self.pvcs[idx] def start(self): # Load the worker YAML from the file @@ -154,10 +172,11 @@ def start(self): # Create the Kubernetes API client to start a pod pod_manifest["metadata"]["name"] = self.name # Unique name for each pod pod_manifest["metadata"]["labels"]["run"] = self.label - pod_manifest["spec"]["containers"][0]["name"] = self.name + pod_manifest["spec"]["containers"][0]["image"] = self.image pod_manifest["spec"]["volumes"][0]["persistentVolumeClaim"][ "claimName" ] = self.get_claim_name() + pod_manifest["spec"]["containers"][0]["name"] = self.get_claim_name() pod_manifest["spec"]["containers"][0]["command"] = [ "aptos-debugger", "aptos-db", @@ -169,11 +188,13 @@ def start(self): "--target-db-dir", self.get_target_db_dir(), "--concurrent-replay", - f"{CONCURRENT_REPLAY}", + f"{self.config.concurrent_replayer}", "--replay-concurrency-level", f"{REPLAY_CONCURRENCY_LEVEL}", - "--timeout-secs", "900", - "--block-cache-size", "10737418240", + "--timeout-secs", + f"{self.config.timeout_secs}", + "--block-cache-size", + "10737418240", ] if SHARDING_ENABLED: @@ -220,6 +241,20 @@ def get_pod_status(self): def get_humio_log_link(self): return construct_humio_url(self.label, self.name, self.start_time, time.time()) +class ReplayConfig: + def __init__(self, network): + if network == Network.TESTNET: + 
self.concurrent_replayer = 20 + self.pvc_number = 5 + self.min_range_size = 10_000 + self.range_size = 5_000_000 + self.timeout_secs = 900 + else: + self.concurrent_replayer = 18 + self.pvc_number = 8 + self.min_range_size = 10_000 + self.range_size = 2_000_000 + self.timeout_secs = 400 class TaskStats: def __init__(self, name): @@ -249,10 +284,12 @@ def __init__( ranges_to_skip, worker_cnt, range_size, - network=Network.TESETNET, + image, + replay_config, + network=Network.TESTNET, namespace="default", ): - config.load_kube_config() + KubernetesConfig.load_kube_config() self.client = client.CoreV1Api() self.id = id self.namespace = namespace @@ -268,6 +305,21 @@ def __init__( self.txn_mismatch_logs = [] # record self.task_stats = {} + self.image = image + self.pvcs = [] + self.config = replay_config + + def __str__(self): + return f"""ReplayScheduler: + id: {self.id} + start_version: {self.start_version} + end_version: {self.end_version} + range_size: {self.range_size} + worker_cnt: {worker_cnt} + image: {image} + numberof_pvc: {self.config.pvc_number} + timeout_secs: {self.config.timeout_secs} + namespace: {self.namespace}""" def get_label(self): return f"{self.id}-{self.network}" @@ -295,13 +347,27 @@ def create_tasks(self): ) if current < range_end: - self.tasks.append((current, range_end)) + # avoid having too many small tasks, simply skip the task + if range_end - current >= self.config.min_range_size: + self.tasks.append((current, range_end)) current = range_end logger.info(self.tasks) + def create_pvc_from_snapshot(self): + snapshot_name = ( + TESTNET_SNAPSHOT_NAME + if self.network == Network.TESTNET + else MAINNET_SNAPSHOT_NAME + ) + pvcs = create_pvcs_from_snapshot( + self.id, snapshot_name, self.namespace, self.config.pvc_number, self.get_label() + ) + assert len(pvcs) == self.config.pvc_number, "failed to create all pvcs" + self.pvcs = pvcs + def schedule(self, from_scratch=False): if from_scratch: - self.kill_all_pods(self.get_label()) + 
self.kill_all_pods() self.create_tasks() while len(self.tasks) > 0: @@ -323,6 +389,9 @@ def schedule(self, from_scratch=False): task[0], task[1], self.get_label(), + self.image, + self.pvcs, + self.config, self.network, self.namespace, ) @@ -359,11 +428,21 @@ def process_completed_pod(self, worker_pod, worker_idx): self.task_stats[worker_pod.name].set_end_time() self.current_workers[worker_idx] = None - def kill_all_pods(self, label): + def cleanup(self): + self.kill_all_pods() + self.delete_all_pvcs() + + def kill_all_pods(self): # Delete all pods in the namespace response = self.client.delete_collection_namespaced_pod( namespace=self.namespace, - label_selector=f"run={label}", + label_selector=f"run={self.get_label()}", + ) + + def delete_all_pvcs(self): + response = self.client.delete_collection_namespaced_persistent_volume_claim( + namespace=self.namespace, + label_selector=f"run={self.get_label()}", ) def collect_all_failed_logs(self): @@ -371,7 +450,7 @@ def collect_all_failed_logs(self): all_completed = False while not all_completed: all_completed = True - for (idx, worker) in enumerate(self.current_workers): + for idx, worker in enumerate(self.current_workers): if worker is not None: logger.info(f"Checking worker {idx} {worker.name}") if not worker.is_completed(): @@ -379,32 +458,109 @@ def collect_all_failed_logs(self): else: self.process_completed_pod(worker, idx) time.sleep(QUERY_DELAY) + return (self.failed_workpod_logs, self.txn_mismatch_logs) - + def print_stats(self): for key, value in self.task_stats.items(): logger.info(f"{key}: {value}") - + # read skip ranges from gcp bucket -def main(): - scheduler = ReplayScheduler( - "test", - 862_000_000, - 6_212_936_741, - [], - worker_cnt=24, - range_size=5_000_000, - namespace="default", + +def read_skip_ranges(network): + storage_client = storage.Client() + bucket = storage_client.bucket("replay_verify_skip_ranges") + source_blob_name = f"{network}_skip_ranges.json" + # Get the blob (file) from the bucket 
+ blob = bucket.blob(source_blob_name) + + data = json.loads(blob.download_as_text()) + skip_ranges = [ + (int(range["start_version"]), int(range["end_version"])) + for range in data["skip_ranges"] + ] + return (data["start"], data["end"], skip_ranges) + + +def parse_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=__doc__, ) - label = scheduler.get_label() - try: - scheduler.schedule(from_scratch=True) - print(scheduler.collect_all_failed_logs()) - finally: - scheduler.print_stats() - scheduler.kill_all_pods(label) + parser.add_argument("--network", required=True, choices=["testnet", "mainnet"]) + parser.add_argument("--start", required=False, type=int) + parser.add_argument("--end", required=False, type=int) + parser.add_argument("--worker_cnt", required=False, type=int) + parser.add_argument("--range_size", required=False, type=int) + parser.add_argument("--namespace", required=False, type=str, default="default") + parser.add_argument("--image_tag", required=False, type=str) + parser.add_argument("--cleanup", required=False, action="store_true", default=False) + args = parser.parse_args() + return args + + +def get_image(image_tag=None): + shell = forge.LocalShell() + git = forge.Git(shell) + image_name = "tools" + default_latest_image = forge.find_recent_images( + shell, + git, + 1, + image_name=image_name, + )[0] if image_tag is None else image_tag + full_image = f"{forge.GAR_REPO_NAME}/{image_name}:{default_latest_image}" + return full_image + + +def print_logs(failed_workpod_logs, txn_mismatch_logs): + if len(failed_workpod_logs) > 0: + logger.info("Failed workpods found") + for log in failed_workpod_logs: + logger.info(log) + if len(txn_mismatch_logs) == 0: + logger.info("No txn mismatch found") + else: + logger.info("Txn mismatch found") + for log in txn_mismatch_logs: + logger.info(log) if __name__ == "__main__": - main() + args = parse_args() + get_kubectl_credentials("aptos-devinfra-0", 
"us-central1", "devinfra-usce1-0") + (start, end, skip_ranges) = read_skip_ranges(args.network) + image = get_image(args.image_tag) if args.image_tag is not None else get_image() + run_id = image[-5:] + network = Network.from_string(args.network) + config = ReplayConfig(network) + worker_cnt = args.worker_cnt if args.worker_cnt else config.pvc_number * 7 + range_size = args.range_size if args.range_size else config.range_size + scheduler = ReplayScheduler( + run_id, + start if args.start is None else args.start, + end if args.end is None else args.end, + skip_ranges, + worker_cnt=worker_cnt, + range_size=range_size, + image=image, + replay_config=config, + network= network, + namespace=args.namespace, + ) + logger.info(f"scheduler: {scheduler}") + cleanup = args.cleanup + if cleanup: + scheduler.cleanup() + exit(0) + else: + scheduler.create_pvc_from_snapshot() + try: + scheduler.schedule(from_scratch=True) + (failed_logs, txn_mismatch_logs) = scheduler.collect_all_failed_logs() + scheduler.print_stats() + print_logs(failed_logs, txn_mismatch_logs) + + finally: + scheduler.cleanup() diff --git a/testsuite/replay-verify/poetry.lock b/testsuite/replay-verify/poetry.lock index f0eca350aa08f..a383c32e1a31e 100644 --- a/testsuite/replay-verify/poetry.lock +++ b/testsuite/replay-verify/poetry.lock @@ -1,10 +1,9 @@ -# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. [[package]] name = "black" version = "24.10.0" description = "The uncompromising code formatter." 
-category = "dev" optional = false python-versions = ">=3.9" files = [ @@ -51,7 +50,6 @@ uvloop = ["uvloop (>=0.15.2)"] name = "cachetools" version = "5.5.0" description = "Extensible memoizing collections and decorators" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -63,7 +61,6 @@ files = [ name = "certifi" version = "2024.8.30" description = "Python package for providing Mozilla's CA Bundle." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -75,7 +72,6 @@ files = [ name = "charset-normalizer" version = "3.4.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -190,7 +186,6 @@ files = [ name = "click" version = "8.1.7" description = "Composable command line interface toolkit" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -205,7 +200,6 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -217,7 +211,6 @@ files = [ name = "durationpy" version = "0.9" description = "Module for converting between datetime.timedelta and Go's Duration strings." 
-category = "main" optional = false python-versions = "*" files = [ @@ -227,14 +220,13 @@ files = [ [[package]] name = "google-api-core" -version = "2.22.0" +version = "2.23.0" description = "Google API client core library" -category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "google_api_core-2.22.0-py3-none-any.whl", hash = "sha256:a6652b6bd51303902494998626653671703c420f6f4c88cfd3f50ed723e9d021"}, - {file = "google_api_core-2.22.0.tar.gz", hash = "sha256:26f8d76b96477db42b55fd02a33aae4a42ec8b86b98b94969b7333a2c828bf35"}, + {file = "google_api_core-2.23.0-py3-none-any.whl", hash = "sha256:c20100d4c4c41070cf365f1d8ddf5365915291b5eb11b83829fbd1c999b5122f"}, + {file = "google_api_core-2.23.0.tar.gz", hash = "sha256:2ceb087315e6af43f256704b871d99326b1f12a9d6ce99beaedec99ba26a0ace"}, ] [package.dependencies] @@ -263,14 +255,13 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] [[package]] name = "google-auth" -version = "2.35.0" +version = "2.36.0" description = "Google Authentication Library" -category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "google_auth-2.35.0-py2.py3-none-any.whl", hash = "sha256:25df55f327ef021de8be50bad0dfd4a916ad0de96da86cd05661c9297723ad3f"}, - {file = "google_auth-2.35.0.tar.gz", hash = "sha256:f4c64ed4e01e8e8b646ef34c018f8bf3338df0c8e37d8b3bba40e7f574a3278a"}, + {file = "google_auth-2.36.0-py2.py3-none-any.whl", hash = "sha256:51a15d47028b66fd36e5c64a82d2d57480075bccc7da37cde257fc94177a61fb"}, + {file = "google_auth-2.36.0.tar.gz", hash = "sha256:545e9618f2df0bcbb7dcbc45a546485b1212624716975a1ea5ae8149ce769ab1"}, ] [package.dependencies] @@ -289,7 +280,6 @@ requests = ["requests (>=2.20.0,<3.0.0.dev0)"] name = "google-cloud-compute" version = "1.20.1" description = "Google Cloud Compute API client library" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -298,7 +288,7 @@ files = [ ] [package.dependencies] -google-api-core = {version = ">=1.34.1,<2.0.0 || 
>=2.11.0,<3.0.0dev", extras = ["grpc"]} +google-api-core = {version = ">=1.34.1,<2.0.dev0 || >=2.11.dev0,<3.0.0dev", extras = ["grpc"]} google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0dev" proto-plus = [ {version = ">=1.22.3,<2.0.0dev", markers = "python_version < \"3.13\""}, @@ -310,7 +300,6 @@ protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4 name = "google-cloud-container" version = "2.53.0" description = "Google Cloud Container API client library" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -319,7 +308,7 @@ files = [ ] [package.dependencies] -google-api-core = {version = ">=1.34.1,<2.0.0 || >=2.11.0,<3.0.0dev", extras = ["grpc"]} +google-api-core = {version = ">=1.34.1,<2.0.dev0 || >=2.11.dev0,<3.0.0dev", extras = ["grpc"]} google-auth = ">=2.14.1,<2.24.0 || >2.24.0,<2.25.0 || >2.25.0,<3.0.0dev" proto-plus = [ {version = ">=1.22.3,<2.0.0dev", markers = "python_version < \"3.13\""}, @@ -327,16 +316,113 @@ proto-plus = [ ] protobuf = ">=3.20.2,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<6.0.0dev" +[[package]] +name = "google-cloud-core" +version = "2.4.1" +description = "Google Cloud API client core library" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google-cloud-core-2.4.1.tar.gz", hash = "sha256:9b7749272a812bde58fff28868d0c5e2f585b82f37e09a1f6ed2d4d10f134073"}, + {file = "google_cloud_core-2.4.1-py2.py3-none-any.whl", hash = "sha256:a9e6a4422b9ac5c29f79a0ede9485473338e2ce78d91f2370c01e730eab22e61"}, +] + +[package.dependencies] +google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" +google-auth = ">=1.25.0,<3.0dev" + +[package.extras] +grpc = ["grpcio (>=1.38.0,<2.0dev)", "grpcio-status (>=1.38.0,<2.0.dev0)"] + +[[package]] +name = "google-cloud-storage" +version = "2.18.2" +description = "Google Cloud Storage API client library" +optional = false +python-versions = ">=3.7" +files 
= [ + {file = "google_cloud_storage-2.18.2-py2.py3-none-any.whl", hash = "sha256:97a4d45c368b7d401ed48c4fdfe86e1e1cb96401c9e199e419d289e2c0370166"}, + {file = "google_cloud_storage-2.18.2.tar.gz", hash = "sha256:aaf7acd70cdad9f274d29332673fcab98708d0e1f4dceb5a5356aaef06af4d99"}, +] + +[package.dependencies] +google-api-core = ">=2.15.0,<3.0.0dev" +google-auth = ">=2.26.1,<3.0dev" +google-cloud-core = ">=2.3.0,<3.0dev" +google-crc32c = ">=1.0,<2.0dev" +google-resumable-media = ">=2.7.2" +requests = ">=2.18.0,<3.0.0dev" + +[package.extras] +protobuf = ["protobuf (<6.0.0dev)"] +tracing = ["opentelemetry-api (>=1.1.0)"] + +[[package]] +name = "google-crc32c" +version = "1.6.0" +description = "A python wrapper of the C library 'Google CRC32C'" +optional = false +python-versions = ">=3.9" +files = [ + {file = "google_crc32c-1.6.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:5bcc90b34df28a4b38653c36bb5ada35671ad105c99cfe915fb5bed7ad6924aa"}, + {file = "google_crc32c-1.6.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d9e9913f7bd69e093b81da4535ce27af842e7bf371cde42d1ae9e9bd382dc0e9"}, + {file = "google_crc32c-1.6.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a184243544811e4a50d345838a883733461e67578959ac59964e43cca2c791e7"}, + {file = "google_crc32c-1.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:236c87a46cdf06384f614e9092b82c05f81bd34b80248021f729396a78e55d7e"}, + {file = "google_crc32c-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebab974b1687509e5c973b5c4b8b146683e101e102e17a86bd196ecaa4d099fc"}, + {file = "google_crc32c-1.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:50cf2a96da226dcbff8671233ecf37bf6e95de98b2a2ebadbfdf455e6d05df42"}, + {file = "google_crc32c-1.6.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:f7a1fc29803712f80879b0806cb83ab24ce62fc8daf0569f2204a0cfd7f68ed4"}, + {file = "google_crc32c-1.6.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = 
"sha256:40b05ab32a5067525670880eb5d169529089a26fe35dce8891127aeddc1950e8"}, + {file = "google_crc32c-1.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e4b426c3702f3cd23b933436487eb34e01e00327fac20c9aebb68ccf34117d"}, + {file = "google_crc32c-1.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51c4f54dd8c6dfeb58d1df5e4f7f97df8abf17a36626a217f169893d1d7f3e9f"}, + {file = "google_crc32c-1.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:bb8b3c75bd157010459b15222c3fd30577042a7060e29d42dabce449c087f2b3"}, + {file = "google_crc32c-1.6.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:ed767bf4ba90104c1216b68111613f0d5926fb3780660ea1198fc469af410e9d"}, + {file = "google_crc32c-1.6.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:62f6d4a29fea082ac4a3c9be5e415218255cf11684ac6ef5488eea0c9132689b"}, + {file = "google_crc32c-1.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c87d98c7c4a69066fd31701c4e10d178a648c2cac3452e62c6b24dc51f9fcc00"}, + {file = "google_crc32c-1.6.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd5e7d2445d1a958c266bfa5d04c39932dc54093fa391736dbfdb0f1929c1fb3"}, + {file = "google_crc32c-1.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:7aec8e88a3583515f9e0957fe4f5f6d8d4997e36d0f61624e70469771584c760"}, + {file = "google_crc32c-1.6.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:e2806553238cd076f0a55bddab37a532b53580e699ed8e5606d0de1f856b5205"}, + {file = "google_crc32c-1.6.0-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:bb0966e1c50d0ef5bc743312cc730b533491d60585a9a08f897274e57c3f70e0"}, + {file = "google_crc32c-1.6.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:386122eeaaa76951a8196310432c5b0ef3b53590ef4c317ec7588ec554fec5d2"}, + {file = "google_crc32c-1.6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:d2952396dc604544ea7476b33fe87faedc24d666fb0c2d5ac971a2b9576ab871"}, + {file = "google_crc32c-1.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:35834855408429cecf495cac67ccbab802de269e948e27478b1e47dfb6465e57"}, + {file = "google_crc32c-1.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:d8797406499f28b5ef791f339594b0b5fdedf54e203b5066675c406ba69d705c"}, + {file = "google_crc32c-1.6.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48abd62ca76a2cbe034542ed1b6aee851b6f28aaca4e6551b5599b6f3ef175cc"}, + {file = "google_crc32c-1.6.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18e311c64008f1f1379158158bb3f0c8d72635b9eb4f9545f8cf990c5668e59d"}, + {file = "google_crc32c-1.6.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05e2d8c9a2f853ff116db9706b4a27350587f341eda835f46db3c0a8c8ce2f24"}, + {file = "google_crc32c-1.6.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:91ca8145b060679ec9176e6de4f89b07363d6805bd4760631ef254905503598d"}, + {file = "google_crc32c-1.6.0.tar.gz", hash = "sha256:6eceb6ad197656a1ff49ebfbbfa870678c75be4344feb35ac1edf694309413dc"}, +] + +[package.extras] +testing = ["pytest"] + +[[package]] +name = "google-resumable-media" +version = "2.7.2" +description = "Utilities for Google Media Downloads and Resumable Uploads" +optional = false +python-versions = ">=3.7" +files = [ + {file = "google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = "sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa"}, + {file = "google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0"}, +] + +[package.dependencies] +google-crc32c = ">=1.0,<2.0dev" + +[package.extras] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0dev)", "google-auth (>=1.22.0,<2.0dev)"] +requests = ["requests (>=2.18.0,<3.0.0dev)"] + [[package]] name = 
"googleapis-common-protos" -version = "1.65.0" +version = "1.66.0" description = "Common protobufs used in Google APIs" -category = "main" optional = false python-versions = ">=3.7" files = [ - {file = "googleapis_common_protos-1.65.0-py2.py3-none-any.whl", hash = "sha256:2972e6c496f435b92590fd54045060867f3fe9be2c82ab148fc8885035479a63"}, - {file = "googleapis_common_protos-1.65.0.tar.gz", hash = "sha256:334a29d07cddc3aa01dee4988f9afd9b2916ee2ff49d6b757155dc0d197852c0"}, + {file = "googleapis_common_protos-1.66.0-py2.py3-none-any.whl", hash = "sha256:d7abcd75fabb2e0ec9f74466401f6c119a0b498e27370e9be4c94cb7e382b8ed"}, + {file = "googleapis_common_protos-1.66.0.tar.gz", hash = "sha256:c3e7b33d15fdca5374cc0a7346dd92ffa847425cc4ea941d970f13680052ec8c"}, ] [package.dependencies] @@ -349,7 +435,6 @@ grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] name = "grpcio" version = "1.67.1" description = "HTTP/2-based RPC framework" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -417,7 +502,6 @@ protobuf = ["grpcio-tools (>=1.67.1)"] name = "grpcio-status" version = "1.67.1" description = "Status proto mapping for gRPC" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -434,7 +518,6 @@ protobuf = ">=5.26.1,<6.0dev" name = "idna" version = "3.10" description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -449,7 +532,6 @@ all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2 name = "kubernetes" version = "31.0.0" description = "Kubernetes python client" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -468,7 +550,7 @@ requests = "*" requests-oauthlib = "*" six = ">=1.9.0" urllib3 = ">=1.24.2" -websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.0 || >=0.43.0" +websocket-client = ">=0.32.0,<0.40.0 || >0.40.0,<0.41.dev0 || >=0.43.dev0" [package.extras] adal = ["adal (>=1.0.2)"] @@ -477,7 +559,6 @@ 
adal = ["adal (>=1.0.2)"] name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -489,7 +570,6 @@ files = [ name = "oauthlib" version = "3.2.2" description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -504,21 +584,19 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] [[package]] name = "packaging" -version = "24.1" +version = "24.2" description = "Core utilities for Python packages" -category = "dev" optional = false python-versions = ">=3.8" files = [ - {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, - {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, + {file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, ] [[package]] name = "pathspec" version = "0.12.1" description = "Utility library for gitignore style pattern matching of file paths." -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -530,7 +608,6 @@ files = [ name = "platformdirs" version = "4.3.6" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -547,7 +624,6 @@ type = ["mypy (>=1.11.2)"] name = "proto-plus" version = "1.25.0" description = "Beautiful, Pythonic protocol buffers." 
-category = "main" optional = false python-versions = ">=3.7" files = [ @@ -565,7 +641,6 @@ testing = ["google-api-core (>=1.31.5)"] name = "protobuf" version = "5.28.3" description = "" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -582,11 +657,40 @@ files = [ {file = "protobuf-5.28.3.tar.gz", hash = "sha256:64badbc49180a5e401f373f9ce7ab1d18b63f7dd4a9cdc43c92b9f0b481cef7b"}, ] +[[package]] +name = "psutil" +version = "6.1.0" +description = "Cross-platform lib for process and system monitoring in Python." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ + {file = "psutil-6.1.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ff34df86226c0227c52f38b919213157588a678d049688eded74c76c8ba4a5d0"}, + {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c0e0c00aa18ca2d3b2b991643b799a15fc8f0563d2ebb6040f64ce8dc027b942"}, + {file = "psutil-6.1.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:000d1d1ebd634b4efb383f4034437384e44a6d455260aaee2eca1e9c1b55f047"}, + {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:5cd2bcdc75b452ba2e10f0e8ecc0b57b827dd5d7aaffbc6821b2a9a242823a76"}, + {file = "psutil-6.1.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:045f00a43c737f960d273a83973b2511430d61f283a44c96bf13a6e829ba8fdc"}, + {file = "psutil-6.1.0-cp27-none-win32.whl", hash = "sha256:9118f27452b70bb1d9ab3198c1f626c2499384935aaf55388211ad982611407e"}, + {file = "psutil-6.1.0-cp27-none-win_amd64.whl", hash = "sha256:a8506f6119cff7015678e2bce904a4da21025cc70ad283a53b099e7620061d85"}, + {file = "psutil-6.1.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6e2dcd475ce8b80522e51d923d10c7871e45f20918e027ab682f94f1c6351688"}, + {file = "psutil-6.1.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:0895b8414afafc526712c498bd9de2b063deaac4021a3b3c34566283464aff8e"}, + {file = 
"psutil-6.1.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9dcbfce5d89f1d1f2546a2090f4fcf87c7f669d1d90aacb7d7582addece9fb38"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:498c6979f9c6637ebc3a73b3f87f9eb1ec24e1ce53a7c5173b8508981614a90b"}, + {file = "psutil-6.1.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d905186d647b16755a800e7263d43df08b790d709d575105d419f8b6ef65423a"}, + {file = "psutil-6.1.0-cp36-cp36m-win32.whl", hash = "sha256:6d3fbbc8d23fcdcb500d2c9f94e07b1342df8ed71b948a2649b5cb060a7c94ca"}, + {file = "psutil-6.1.0-cp36-cp36m-win_amd64.whl", hash = "sha256:1209036fbd0421afde505a4879dee3b2fd7b1e14fee81c0069807adcbbcca747"}, + {file = "psutil-6.1.0-cp37-abi3-win32.whl", hash = "sha256:1ad45a1f5d0b608253b11508f80940985d1d0c8f6111b5cb637533a0e6ddc13e"}, + {file = "psutil-6.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:a8fb3752b491d246034fa4d279ff076501588ce8cbcdbb62c32fd7a377d996be"}, + {file = "psutil-6.1.0.tar.gz", hash = "sha256:353815f59a7f64cdaca1c0307ee13558a0512f6db064e92fe833784f08539c7a"}, +] + +[package.extras] +dev = ["black", "check-manifest", "coverage", "packaging", "pylint", "pyperf", "pypinfo", "pytest-cov", "requests", "rstcheck", "ruff", "sphinx", "sphinx_rtd_theme", "toml-sort", "twine", "virtualenv", "wheel"] +test = ["pytest", "pytest-xdist", "setuptools"] + [[package]] name = "pyasn1" version = "0.6.1" description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -598,7 +702,6 @@ files = [ name = "pyasn1-modules" version = "0.4.1" description = "A collection of ASN.1-based protocols modules" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -613,7 +716,6 @@ pyasn1 = ">=0.4.6,<0.7.0" name = "python-dateutil" version = 
"2.9.0.post0" description = "Extensions to the standard Python datetime module" -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" files = [ @@ -628,7 +730,6 @@ six = ">=1.5" name = "pyyaml" version = "6.0.2" description = "YAML parser and emitter for Python" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -691,7 +792,6 @@ files = [ name = "requests" version = "2.32.3" description = "Python HTTP for Humans." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -713,7 +813,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requests-oauthlib" version = "2.0.0" description = "OAuthlib authentication support for Requests." -category = "main" optional = false python-versions = ">=3.4" files = [ @@ -732,7 +831,6 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] name = "rsa" version = "4.9" description = "Pure-Python RSA implementation" -category = "main" optional = false python-versions = ">=3.6,<4" files = [ @@ -747,7 +845,6 @@ pyasn1 = ">=0.1.3" name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -757,21 +854,19 @@ files = [ [[package]] name = "tomli" -version = "2.0.2" +version = "2.1.0" description = "A lil' TOML parser" -category = "dev" optional = false python-versions = ">=3.8" files = [ - {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, - {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, + {file = "tomli-2.1.0-py3-none-any.whl", hash = "sha256:a5c57c3d1c56f5ccdf89f6523458f60ef716e210fc47c4cfb188c5ba473e0391"}, + {file = "tomli-2.1.0.tar.gz", hash = "sha256:3f646cae2aec94e17d04973e4249548320197cfabdf130015d023de4b74d8ab8"}, ] [[package]] name = "typing-extensions" version = "4.12.2" description = "Backported 
and Experimental Type Hints for Python 3.8+" -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -783,7 +878,6 @@ files = [ name = "urllib3" version = "2.2.3" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -801,7 +895,6 @@ zstd = ["zstandard (>=0.18.0)"] name = "websocket-client" version = "1.8.0" description = "WebSocket client for Python with low level API options" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -817,4 +910,4 @@ test = ["websockets"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "1a7ce86d4baeb3e1178bebdab8f413637b049dcc3a553f565cd95ddf218df6a1" +content-hash = "1122681d83360921d2b63f007b65c3436823b10b3d579d0fec4cf8b5e07fed59" diff --git a/testsuite/replay-verify/pyproject.toml b/testsuite/replay-verify/pyproject.toml index 565365ca6b021..63b0bac43b699 100644 --- a/testsuite/replay-verify/pyproject.toml +++ b/testsuite/replay-verify/pyproject.toml @@ -1,25 +1,22 @@ [tool.poetry] name = "replay-verify" version = "0.1.0" -description = "" -authors = ["Bo Wu "] +description = "replay verify on archive" +authors = ["Aptos Labs "] readme = "README.md" -packages = [{include = "replay_verify"}] [tool.poetry.dependencies] python = "^3.10" kubernetes = "^31.0.0" google-cloud-compute = "^1.20.1" google-cloud-container = "^2.53.0" +google-cloud-storage = "^2.18.2" +psutil = "^6.1.0" [tool.poetry.group.dev.dependencies] black = "^24.10.0" [build-system] -requires = ["poetry-core"] +requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" - - -[tool.poetry.scripts] -replay-verify = "main:main" \ No newline at end of file diff --git a/testsuite/replay-verify/replay-verify-worker-template.yaml b/testsuite/replay-verify/replay-verify-worker-template.yaml index 1cb3011d0fbff..524cf9a7fd9fe 100644 --- 
a/testsuite/replay-verify/replay-verify-worker-template.yaml +++ b/testsuite/replay-verify/replay-verify-worker-template.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: Pod metadata: - name: worker-pod + name: worker-pod-9 labels: run: some-label spec: @@ -9,12 +9,12 @@ spec: restartPolicy: Never # Pod restarts only if it fails containers: - name: replay-verify-worker - image: us-docker.pkg.dev/aptos-registry/docker/tools:performance_65a65fc719cd40dc989a5832819095aa8f1e384d + image: us-docker.pkg.dev/aptos-registry/docker/tools:nightly volumeMounts: - - mountPath: /mnt/testnet_archive - name: testnet-archive + - mountPath: /mnt/archive + name: archive readOnly: true # Mount the volume as read-only - command: ["/bin/sh", "-c", "ls -al /mnt/testnet_archive && sleep 3600"] + command: ["/bin/sh", "-c", "ls -al /mnt/archive && sleep 3600"] env: #- name: PUSH_METRICS_ENDPOINT # value: "http://localhost:9091" @@ -22,12 +22,12 @@ spec: value: "info" resources: requests: - memory: "110Gi" + memory: "90Gi" cpu: "30" limits: - memory: "110Gi" + memory: "90Gi" cpu: "30" volumes: - - name: testnet-archive + - name: archive persistentVolumeClaim: - claimName: testnet-archive-claim-0 + claimName: testnet-archive-9 diff --git a/testsuite/replay-verify/testnet-archive.yaml b/testsuite/replay-verify/testnet-archive.yaml deleted file mode 100644 index cbaa18a7ae977..0000000000000 --- a/testsuite/replay-verify/testnet-archive.yaml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -kind: PersistentVolume -metadata: - name: testnet-archive -spec: - capacity: - storage: 10000Gi - accessModes: - - ReadOnlyMany - gcePersistentDisk: - pdName: testnet-archive-0 - fsType: xfs - persistentVolumeReclaimPolicy: Retain - storageClassName: standard ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: testnet-archive-claim -spec: - accessModes: - - ReadOnlyMany - resources: - requests: - storage: 10000Gi - storageClassName: standard - volumeName: testnet-archive diff --git 
a/testsuite/replay-verify/xfs-disk-repair.yaml b/testsuite/replay-verify/xfs-disk-repair.yaml index bcb384800ad2c..e07427cc56bdc 100644 --- a/testsuite/replay-verify/xfs-disk-repair.yaml +++ b/testsuite/replay-verify/xfs-disk-repair.yaml @@ -10,8 +10,8 @@ spec: spec: containers: - name: self-deleting-container - image: busybox - command: ["sh", "-c", "ls /mnt/* && sleep 30"] + image: gcr.io/google.com/cloudsdktool/google-cloud-cli:latest + command: ["sh", "-c", "ls /mnt/* && sleep 10"] volumeMounts: - name: my-volume mountPath: /mnt