Commit
save progress on disk creation
areshand committed Nov 14, 2024
1 parent 995f85d commit deb49cb
Showing 6 changed files with 404 additions and 68 deletions.
39 changes: 39 additions & 0 deletions .github/workflows/provision-replay-verify-archive-disks.yaml
@@ -0,0 +1,39 @@
# This defines a workflow to provision the archive storage disks that replay-verify runs against, on the given network.
# To trigger it manually, go to the Actions tab of the repo, click "provision-replay-verify-archive-disks" and then "Run workflow".
#
# On PR, provisioning runs for both networks. On workflow_dispatch, you may specify the NETWORK to provision storage for.
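#
# Example manual trigger from the CLI (a sketch assuming an authenticated `gh`):
#   gh workflow run provision-replay-verify-archive-disks.yaml -f NETWORK=testnet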

name: "provision-replay-verify-archive-disks"
on:
# Allow triggering manually
workflow_dispatch:
inputs:
NETWORK:
required: true
type: string
description: The network to provision storage for.
pull_request:
paths:
- '.github/workflows/provision-replay-verify-archive-disks.yaml'
- '.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml'
schedule:
- cron: "0 22 * * 0,2,4" # The main branch cadence: runs at 22:00 UTC every Sunday, Tuesday, and Thursday.

permissions:
contents: read
id-token: write # required for GCP Workload Identity Federation, which we use to authenticate to Google Cloud
issues: read
pull-requests: read

jobs:
provision-testnet:
uses: ./.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml
secrets: inherit
with:
NETWORK: testnet

provision-mainnet:
uses: ./.github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml
secrets: inherit
with:
NETWORK: mainnet
64 changes: 64 additions & 0 deletions .github/workflows/workflow-run-replay-verify-archive-storage-provision.yaml
@@ -0,0 +1,64 @@
name: "*run replay-verify reusable workflow"

on:
# This allows the workflow to be triggered from another workflow
workflow_call:
inputs:
NETWORK:
required: true
type: string
description: The network to provision storage for.
# This allows the workflow to be triggered manually from the GitHub UI or CLI
workflow_dispatch:
inputs:
NETWORK:
description: The network to provision storage for.
type: string
required: true
jobs:
provision:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: main
- name: Authenticate to Google Cloud
uses: "google-github-actions/auth@v2"
with:
workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
service_account: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}

- name: Set up Cloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: "Setup GCloud project"
shell: bash
run: gcloud config set project aptos-devinfra-0

- name: Setup python
uses: actions/setup-python@v4
with:
python-version: 3.10.12

# Install Poetry.
- name: Install and configure Poetry
uses: snok/install-poetry@v1
with:
version: 1.5.1
virtualenvs-create: true
virtualenvs-in-project: false

- name: Install poetry project
run: poetry install --no-root
shell: bash
working-directory: ./testsuite/replay-verify

- name: "Provision storage"
run: cd testsuite/replay-verify && poetry run python archive_disk_utils.py --network ${{ inputs.NETWORK }}
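# For local debugging, the same entry point can presumably be run directly
# (a sketch assuming Poetry is installed and gcloud auth is already set up):
#   cd testsuite/replay-verify
#   poetry install --no-root
#   poetry run python archive_disk_utils.py --network testnet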



253 changes: 253 additions & 0 deletions .github/workflows/workflow-run-replay-verify-on-archive.yaml
@@ -0,0 +1,253 @@
name: "*run replay-verify reusable workflow"

on:
# This allows the workflow to be triggered from another workflow
workflow_call:
inputs:
GIT_SHA:
required: true
type: string
description: The git SHA1 to test.
# replay-verify config
START_VERSION:
required: false
type: string
description: The history start to use for the backup. If not specified, it will use the default history start.
END_VERSION:
required: false
type: string
description: The end version to use for the backup. If not specified, it will use the latest version.
RANGES_TO_SKIP:
required: false
type: string
description: The optional list of transaction ranges to skip.
RUNS_ON:
description: "The runner to use for the job."
type: string
required: true
default: "medium-perf-local-ssd"
# The inputs below are referenced by later steps in this file; they are
# declared here so the reusable workflow resolves them (their shape is an
# assumption for this work-in-progress commit).
BUCKET:
type: string
SUB_DIR:
type: string
BACKUP_CONFIG_TEMPLATE_PATH:
type: string
MAX_VERSIONS_PER_RANGE:
type: string
TIMEOUT_MINUTES:
type: string
# This allows the workflow to be triggered manually from the GitHub UI or CLI
# NOTE: because the "number" type is not supported, numeric inputs are passed as strings (the replay job below falls back to a 180-minute timeout)
workflow_dispatch:
inputs:
GIT_SHA:
required: true
type: string
description: The git SHA1 to test.
# replay-verify config
START_VERSION:
required: false
type: string
description: The history start to use for the backup. If not specified, it will use the default history start.
END_VERSION:
required: false
type: string
description: The end version to use for the backup. If not specified, it will use the latest version.
RANGES_TO_SKIP:
required: false
type: string
description: The optional list of transaction ranges to skip.
RUNS_ON:
description: "The runner to use for the job."
type: string
required: true
default: "high-perf-docker-with-local-ssd"
jobs:
prepare:
runs-on: ${{ inputs.RUNS_ON }}
outputs:
job_ids: ${{ steps.gen-jobs.outputs.job_ids }}
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.GIT_SHA }}

- name: Load cached aptos-debugger binary
id: cache-aptos-debugger-binary
uses: actions/cache@v4
with:
# copy the binary to the root of the repo and cache it there, because rust-setup calls a cache-rust action
# which cleans up the target directory in its post action
path: |
aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}

- name: Prepare for build if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
with:
GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}

- name: Build and strip aptos-debugger binary if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
shell: bash
run: |
cargo build --release -p aptos-debugger
strip -s target/release/aptos-debugger
cp target/release/aptos-debugger .
- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: Get timestamp to use in cache key
id: get-timestamp
run: echo "ts=$(date +%s)" >> $GITHUB_OUTPUT

- name: Load cached backup storage metadata cache dir (and save back afterwards)
uses: actions/cache@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
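# NOTE: the key embeds a fresh timestamp so every run saves a new cache entry,
# while restore-keys prefix-matches the most recently saved one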

- name: Generate job ranges
id: gen-jobs
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
run: |
./aptos-debugger aptos-db gen-replay-verify-jobs \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
--start-version ${{ inputs.START_VERSION }} \
--ranges-to-skip "${{ inputs.RANGES_TO_SKIP }}" \
--max-versions-per-range ${{ inputs.MAX_VERSIONS_PER_RANGE }} \
--max-ranges-per-job 16 \
--output-json-file jobs.json
jq -c 'length as $N | [range(0; $N)]' jobs.json > job_ids.json
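# e.g. if jobs.json contains 3 job groups, job_ids.json is [0,1,2]; each id
# becomes one matrix partition of the replay-verify job below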
cat job_ids.json
jq . jobs.json
echo "job_ids=$(cat job_ids.json)" >> $GITHUB_OUTPUT
- name: Cache backup storage config and job definition
uses: actions/cache/save@v4
with:
path: |
${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
jobs.json
key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}

replay-verify:
needs: prepare
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 180 }}
runs-on: ${{ inputs.RUNS_ON }}
strategy:
fail-fast: false
matrix:
job_id: ${{ fromJson(needs.prepare.outputs.job_ids) }}
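# one matrix partition per job id emitted by the prepare job; fromJson turns
# the serialized "[0,1,...]" output back into a list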
steps:
- name: Load cached aptos-debugger binary
uses: actions/cache/restore@v4
with:
path: |
aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
fail-on-cache-miss: true

- name: Load cached backup storage metadata cache dir
uses: actions/cache/restore@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
fail-on-cache-miss: true

- name: Load cached backup storage config and job definitions
uses: actions/cache/restore@v4
with:
path: |
${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
jobs.json
key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}
fail-on-cache-miss: true

- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: Run replay-verify in parallel
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
shell: bash
run: |
set -o nounset -o errexit -o pipefail
replay() {
idx=$1
id=$2
begin=$3
end=$4
desc=$5
echo ---------
echo "Job start. $id: $desc"
echo ---------
MC=metadata_cache_$idx
cp -r metadata_cache $MC
DB=db_$idx
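# each parallel partition replays against its own copy of the metadata cache
# and its own target DB dir, so the partitions don't contend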
for try in {0..6}
do
if [ $try -gt 0 ]; then
SLEEP=$((10 * $try))
echo "sleeping for $SLEEP seconds before retry #$try" >&2
sleep $SLEEP
fi
res=0
./aptos-debugger aptos-db replay-verify \
--metadata-cache-dir $MC \
--command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
--start-version $begin \
--end-version $end \
--lazy-quit \
--enable-storage-sharding \
--target-db-dir $DB \
--concurrent-downloads 8 \
--replay-concurrency-level 4 \
|| res=$?
if [[ $res == 0 || $res == 2 ]]
then
return $res
fi
done
return 1
}
pids=()
idx=0
while read id begin end desc; do
replay $idx $id $begin $end "$desc" 2>&1 | sed "s/^/[partition $idx]: /" &
pids[$idx]=$!
idx=$((idx+1))
done < <(jq -r '.[${{ matrix.job_id }}][]' jobs.json)
res=0
for idx in $(seq 0 $((idx-1)))
do
range_res=0
wait ${pids[$idx]} || range_res=$?
echo partition $idx returned $range_res
if [[ $range_res != 0 ]]
then
res=$range_res
fi
done
echo All partitions done, returning $res
exit $res
