Merge branch 'main' into ckartik/better-logs

kant777 authored Jul 5, 2024
2 parents e9f0f8b + a6094d7 commit bcd2ae7
Showing 13 changed files with 636 additions and 74 deletions.
59 changes: 54 additions & 5 deletions .github/workflows/infrastructure.yml
@@ -29,9 +29,9 @@ on:
type: choice
options:
- lax1
- nyc1
- nyc2
- chi1
- mia2
- mia3
default: 'lax'

permissions:
@@ -131,16 +131,23 @@ jobs:
- name: Configure Control Machine
run: |
mkdir -p ~/.ssh
chmod 700 ~/.ssh
cat <<-EOH >> ~/.ssh/config
Host *
ControlMaster auto
ControlPath /tmp/ssh_mux_%h_%p_%r
ControlPersist 60m
EOH
ANSIBLE_USER="${USER}"
ANSIBLE_CONNECTION="ansible_connection=local"
if [ "${IS_MANUAL_DEPLOYMENT}" == "true" ]; then
ANSIBLE_USER="ubuntu"
ANSIBLE_CONNECTION=""
export ANSIBLE_HOST_KEY_CHECKING=false
mkdir -p ~/.ssh && \
chmod 700 ~/.ssh && \
echo "${{ secrets.INFRASTRUCTURE_DEPLOYMENT_KEY }}" > ~/.ssh/id_ed25519 && \
echo "${{ secrets.INFRASTRUCTURE_DEPLOYMENT_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
fi
@@ -250,6 +257,48 @@ jobs:
)
curl -X POST -H 'Content-type: application/json' --data "${PAYLOAD}" "${{ secrets.SLACK_CI_CHANNEL_WEBHOOK_URL }}"
- name: Collect Cluster Logs
if: ${{ env.IS_MANUAL_DEPLOYMENT == 'false' && failure() }}
run: |
NOMAD_SERVER="http://${TARGET_MACHINE_IP}:4646"
journalctl -u nomad > nomad.log
curl -s ${NOMAD_SERVER}/v1/jobs > nomad_jobs.json
ALLOC_IDS=$(curl -s ${NOMAD_SERVER}/v1/allocations | jq -r '.[].ID')
for ALLOC_ID in ${ALLOC_IDS}; do
JOB=$(curl -s ${NOMAD_SERVER}/v1/allocation/${ALLOC_ID} | jq -r '.JobID')
TASKS=$(curl -s ${NOMAD_SERVER}/v1/allocation/${ALLOC_ID} | jq -r '.TaskStates | keys[]')
for TASK in ${TASKS}; do
STDOUT=$(curl -s "${NOMAD_SERVER}/v1/client/fs/logs/${ALLOC_ID}?task=${TASK}&type=stdout")
if [ "$(jq -e .Data <<< "${STDOUT}" 2> /dev/null)" != "null" ]; then
echo ${STDOUT} | jq -r '.Data' | base64 -d > "${ALLOC_ID}_${JOB}_${TASK}_stdout.log"
else
echo "Failed to fetch stdout log for ${ALLOC_ID}_${JOB}_${TASK}:"
echo ${STDOUT}
fi
STDERR=$(curl -s "${NOMAD_SERVER}/v1/client/fs/logs/${ALLOC_ID}?task=${TASK}&type=stderr")
if [ "$(jq -e .Data <<< "${STDERR}" 2> /dev/null)" != "null" ]; then
echo ${STDERR} | jq -r '.Data' | base64 -d > "${ALLOC_ID}_${JOB}_${TASK}_stderr.log"
else
echo "Failed to fetch stderr log for ${ALLOC_ID}_${JOB}_${TASK}:"
echo ${STDERR}
fi
done
done
- name: Upload Cluster Logs
if: ${{ env.IS_MANUAL_DEPLOYMENT == 'false' && failure() }}
uses: actions/upload-artifact@v4
with:
name: cluster-logs
path: |
nomad_jobs.json
nomad.log
*_stdout.log
*_stderr.log
- name: Initialize Debug Shell
if: ${{ env.IS_MANUAL_DEPLOYMENT == 'false' && failure() }}
run: |
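The new Collect Cluster Logs step above walks the Nomad HTTP API (/v1/allocations and /v1/client/fs/logs) with curl and jq, decoding the base64 Data field of each log frame. For reference, a rough Go sketch of the same walk follows; the server address is a placeholder, only the first log frame per task is decoded, and error handling is minimal, so treat it as an illustration of the API calls rather than a drop-in tool.

```go
// Rough sketch (not part of this commit) of the Nomad API walk that the
// Collect Cluster Logs step performs with curl and jq. The server address
// is a placeholder for http://${TARGET_MACHINE_IP}:4646.
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

const nomadServer = "http://127.0.0.1:4646" // placeholder

// getJSON fetches a Nomad API path and decodes the first JSON value in the body.
func getJSON(path string, out any) error {
	resp, err := http.Get(nomadServer + path)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	return json.NewDecoder(resp.Body).Decode(out)
}

func main() {
	// List allocations; each stub carries its ID, job ID, and task names.
	var allocs []struct {
		ID         string
		JobID      string
		TaskStates map[string]json.RawMessage
	}
	if err := getJSON("/v1/allocations", &allocs); err != nil {
		panic(err)
	}
	for _, alloc := range allocs {
		for task := range alloc.TaskStates {
			// The log endpoint returns frames whose Data field is base64-encoded;
			// decoding into []byte recovers the raw log text, as base64 -d does above.
			var frame struct{ Data []byte }
			path := fmt.Sprintf("/v1/client/fs/logs/%s?task=%s&type=stdout", alloc.ID, task)
			if err := getJSON(path, &frame); err != nil {
				fmt.Printf("failed to fetch stdout log for %s_%s_%s: %v\n", alloc.ID, alloc.JobID, task, err)
				continue
			}
			fmt.Printf("%s_%s_%s_stdout.log: %d bytes\n", alloc.ID, alloc.JobID, task, len(frame.Data))
		}
	}
}
```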
4 changes: 3 additions & 1 deletion infrastructure/nomad/ansible.cfg.example
@@ -8,7 +8,6 @@ timeout = 10
connection = ssh
pipelining = True
host_key_checking = False
ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o ForwardAgent=yes
fact_caching = jsonfile
fact_caching_connection = /tmp/ansible_facts_cache
fact_caching_timeout = 86400
@@ -17,3 +16,6 @@ module_defaults:
gather_subset:
- '!all'
- 'min'

[ssh_connection]
ssh_args = -o ControlMaster=auto -o ControlPersist=60m -o ControlPath=/tmp/ssh_mux_%h_%p_%r -o ForwardAgent=yes
47 changes: 26 additions & 21 deletions infrastructure/nomad/playbooks/deploy.yml
@@ -221,6 +221,7 @@
fi
set +x
goreleaser release --snapshot --config=./{{ artifact.path }}/.goreleaser.tmp.yml
if [ $? -ne 0 ]; then exit 1; fi
rm ./{{ artifact.path }}/.goreleaser.tmp.yml
fi
flock -v -u 192
@@ -231,20 +232,22 @@
echo "${ARTIFACT}" >> "${INVENTORY}"
flock -v -u 192
tar -czvf "${ARTIFACT}" ./{{ artifact.path }}
if [ $? -ne 0 ]; then exit 1; fi
fi
{% endif %}

{% if artifact.keystore is defined %}
case "{{ environments[env].secrets }}" in
"generate")
PASSPHRASE="{{ lookup('password', '/dev/null', length=1024, chars=['ascii_letters', 'digits', '+-_,.:@']) }}"
PASSPHRASE="{{ lookup('password', '/dev/null', length=1024, chars=['ascii_letters', 'digits']) }}"

RESULT=$(
{{ keystore_generator.stdout }} generate \
--keystore-dir="${DESTINATION_DIR}" \
--passphrase="${PASSPHRASE}" \
--log-fmt="json"
)
if [ $? -ne 0 ]; then exit 1; fi
ARTIFACT_PATH="$(echo ${RESULT} | jq -e -r '.path // empty' 2>/dev/null)"
flock -v -x 192
echo "${ARTIFACT_PATH}" >> "${INVENTORY}"
@@ -261,6 +264,7 @@
"{{ artifact.keystore.name }}_password": $keystore_password
}' \
> "${SECRETS}"
if [ $? -ne 0 ]; then exit 1; fi
flock -v -u 191

ADDRESS="$(cat ${ARTIFACT_PATH} | jq -r '.address')"
@@ -309,6 +313,7 @@
"{{ artifact.keystore.name }}_password": $keystore_password
}' \
> "${SECRETS}"
if [ $? -ne 0 ]; then exit 1; fi
flock -v -u 191

ADDRESS=$(echo "${KEYSTORE}" | jq -r '.address')
@@ -331,6 +336,7 @@
{% endif %}
{% if artifact.boot_key | default(false) and environments[env].secrets == 'generate' %}
bootnode -genkey "${DESTINATION_DIR}/boot.key"
if [ $? -ne 0 ]; then exit 1; fi

flock -v -x 191
cat "${SECRETS}" | jq \
@@ -342,6 +348,7 @@
($item_name + "_boot_key_address"): $boot_key_address
}' \
> "${SECRETS}"
if [ $? -ne 0 ]; then exit 1; fi
flock -v -u 191

{% endif %}
@@ -517,6 +524,21 @@
label: "{{ item.path | basename }}"
when: build_artifacts and upload_artifacts.matched > 0 and env == 'devenv'

- name: Read Generated Secrets
slurp:
src: "{{ dist_dir }}/secrets.json"
register: secrets_json
when: build_artifacts and environments[env].secrets == 'generate'
delegate_to: localhost
run_once: true

- name: Debug Generated Secrets
ansible.builtin.debug:
msg: "{{ secrets_json.content | b64decode | from_json }}"
when: build_artifacts and env == 'devenv' and environments[env].secrets == 'generate'
delegate_to: localhost
run_once: true

- name: Push Generated Secrets to Vault
ansible.builtin.uri:
url: "{{ vault_address }}/v1/{{ vault_kv_engine_path }}/data/{{ vault_secret_path }}"
@@ -525,7 +547,7 @@
headers:
X-Vault-Token: "{{ vault_init.json.root_token }}"
Content-Type: "application/json"
body: "{{ {'data': (lookup('file', dist_dir + '/secrets.json') | from_json)} | to_json }}"
body: "{{ {'data': (secrets_json.content | b64decode | from_json)} | to_json }}"
status_code: [200, 204]
validate_certs: no
when: build_artifacts and environments[env].secrets == 'generate'
@@ -571,27 +593,14 @@

- name: Deploy Jobs
ansible.builtin.shell: |
RESULT="$(nomad run {{ ansible_env.HOME }}/{{ env }}/{{ job.name }}.nomad 2>&1)"
if [ $? -ne 0 ]; then
echo "Failed to deploy {{ job.name }}:"
echo "${RESULT}"
echo "{{ job.name }} stdout logs:"
nomad alloc logs -stdout -job "{{ job.name }}" | tail -n 100
echo "{{ job.name }} error logs:"
nomad alloc logs -stderr -job "{{ job.name }}" | tail -n 100
exit 1
fi
nomad run {{ ansible_env.HOME }}/{{ env }}/{{ job.name }}.nomad
TIMEOUT={% if profile == 'ci' %}600{% else %}300{% endif %}
START_TIME=$(date +%s)
RESULT=$(nomad job status -json "{{ job.name }}")
if [ $? -ne 0 ]; then
echo "Failed to get job status for {{ job.name }}:"
echo "${RESULT}"
echo "{{ job.name }} stdout logs:"
nomad alloc logs -stdout -job "{{ job.name }}" | tail -n 100
echo "{{ job.name }} error logs:"
nomad alloc logs -stderr -job "{{ job.name }}" | tail -n 100
exit 1
fi
JOB_TYPE=$(echo "${RESULT}" | jq -r '.[0].Allocations[0].JobType')
@@ -623,7 +632,7 @@
CURRENT_TIME="$(date +%s)"
ELAPSED_TIME="$(( CURRENT_TIME - START_TIME ))"
if [ ${ELAPSED_TIME} -ge ${TIMEOUT} ]; then
echo "Deploy timed out for {{ job.name }}, current status: ${STATUS}."
echo "Deploy timed out for {{ job.name }}, current status: ${STATUS}"
exit 1
fi
@@ -632,10 +641,6 @@
if [ $? -ne 0 ]; then
echo "Failed to get job status for {{ job.name }}:"
echo "${RESULT}"
echo "{{ job.name }} stdout logs:"
nomad alloc logs -stdout -job "{{ job.name }}" | tail -n 100
echo "{{ job.name }} error logs:"
nomad alloc logs -stderr -job "{{ job.name }}" | tail -n 100
exit 1
fi
done
6 changes: 6 additions & 0 deletions infrastructure/nomad/playbooks/init.yml
@@ -35,6 +35,7 @@
- openssl
- httping
- iptables
- bridge-utils
- apt-transport-https
- python3
- python3-pip
@@ -66,6 +67,11 @@
state: present
ignore_errors: yes

- name: Ensure Kernel Bridge Module is Loaded
shell: |
modprobe bridge
if ! grep -q '^bridge$' /etc/modules; then echo "bridge" >> /etc/modules; fi
- name: Include Common Variables
include_vars:
file: variables/common.yml
2 changes: 1 addition & 1 deletion infrastructure/nomad/playbooks/variables/profiles.yml
@@ -1,5 +1,5 @@
datacenter: "dc1"
l1_rpc_url: "https://eth-holesky.g.alchemy.com/v2/WqNEQeeexFLQwECjxCPpdep0uvCgn8Yj"
l1_rpc_url: "https://eth-holesky.g.alchemy.com/v2/H8JN1wImnEPrxkFRVOT7cJ_gzu9x3VmB"

artifacts:
bidder_emulator: &bidder_emulator_artifact
1 change: 1 addition & 0 deletions oracle/pkg/l1Listener/l1Listener.go
@@ -27,6 +27,7 @@ type WinnerRegister interface {
type EthClient interface {
BlockNumber(ctx context.Context) (uint64, error)
HeaderByNumber(ctx context.Context, number *big.Int) (*types.Header, error)
BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error)
}

type L1Listener struct {
4 changes: 4 additions & 0 deletions oracle/pkg/l1Listener/l1Listener_test.go
@@ -205,6 +205,10 @@ func (t *testEthClient) HeaderByNumber(_ context.Context, number *big.Int) (*typ
return hdr, nil
}

func (t *testEthClient) BlockByNumber(_ context.Context, number *big.Int) (*types.Block, error) {
return nil, nil
}

func publishLog(
eventManager events.EventManager,
blockNum *big.Int,
62 changes: 60 additions & 2 deletions oracle/pkg/node/node.go
@@ -106,7 +106,7 @@ func NewNode(opts *Options) (*Node, error) {
monitor := txmonitor.New(
owner,
settlementClient,
txmonitor.NewEVMHelper(settlementClient.Client()),
txmonitor.NewEVMHelperWithLogger(settlementClient.Client(), nd.logger),
st,
nd.logger.With("component", "tx_monitor"),
1024,
@@ -164,6 +164,8 @@
listenerL1Client = &laggerdL1Client{EthClient: listenerL1Client, amount: opts.LaggerdMode}
}

listenerL1Client = &infiniteRetryL1Client{EthClient: listenerL1Client, logger: nd.logger}

blockTracker, err := blocktracker.NewBlocktrackerTransactor(
opts.BlockTrackerContractAddr,
settlementRPC,
@@ -232,10 +234,11 @@

updtr, err := updater.NewUpdater(
nd.logger.With("component", "updater"),
l1Client,
listenerL1Client,
st,
evtMgr,
oracleTransactorSession,
txmonitor.NewEVMHelperWithLogger(l1Client.Client(), nd.logger),
)
if err != nil {
nd.logger.Error("failed to instantiate updater", "error", err)
@@ -403,6 +406,61 @@ func (w *winnerOverrideL1Client) HeaderByNumber(ctx context.Context, number *big
return hdr, nil
}

type infiniteRetryL1Client struct {
l1Listener.EthClient
logger *slog.Logger
}

func (i *infiniteRetryL1Client) BlockNumber(ctx context.Context) (uint64, error) {
var blkNum uint64
var err error
for retries := 50; retries > 0; retries-- {
blkNum, err = i.EthClient.BlockNumber(ctx)
if err == nil {
break
}
i.logger.Error("failed to get block number, retrying...", "error", err)
time.Sleep(2 * time.Second)
}
if err != nil {
return 0, err
}
return blkNum, nil
}

func (i *infiniteRetryL1Client) HeaderByNumber(ctx context.Context, number *big.Int) (*types.Header, error) {
var hdr *types.Header
var err error
for retries := 50; retries > 0; retries-- {
hdr, err = i.EthClient.HeaderByNumber(ctx, number)
if err == nil {
break
}
i.logger.Error("failed to get header by number, retrying...", "error", err)
time.Sleep(2 * time.Second)
}
if err != nil {
return nil, err
}
return hdr, nil
}

func (i *infiniteRetryL1Client) BlockByNumber(ctx context.Context, number *big.Int) (*types.Block, error) {
var blk *types.Block
var err error
for retries := 50; retries > 0; retries-- {
blk, err = i.EthClient.BlockByNumber(ctx, number)
if err == nil {
break
}
i.logger.Error("failed to get block by number, retrying...", "error", err)
time.Sleep(2 * time.Second)
}
if err != nil {
return nil, err
}
return blk, nil
}
func setBuilderMapping(
ctx context.Context,
bt *blocktracker.BlocktrackerTransactorSession,
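The three infiniteRetryL1Client methods added above repeat the same 50-attempt, 2-second-delay loop. A possible follow-up, not part of this commit, is to fold that loop into one generic helper; the sketch below assumes a Go toolchain with generics and log/slog, and the name retryCall is hypothetical.

```go
// Sketch of a shared retry helper for the wrappers above; retryCall is a
// hypothetical name, and the retry count and delay mirror the values used
// in this commit (50 attempts, 2 seconds apart).
package l1retry

import (
	"fmt"
	"log/slog"
	"time"
)

// retryCall invokes call up to 50 times, logging and sleeping between
// failures, and returns the last error if every attempt fails.
func retryCall[T any](logger *slog.Logger, what string, call func() (T, error)) (T, error) {
	var result T
	var err error
	for retries := 50; retries > 0; retries-- {
		result, err = call()
		if err == nil {
			return result, nil
		}
		logger.Error("call failed, retrying...", "call", what, "error", err)
		time.Sleep(2 * time.Second)
	}
	return result, fmt.Errorf("%s: %w", what, err)
}
```

With such a helper, BlockNumber would reduce to a single call along the lines of retryCall(i.logger, "block number", func() (uint64, error) { return i.EthClient.BlockNumber(ctx) }), and HeaderByNumber and BlockByNumber would follow the same pattern.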