From 759118f9e56dd9af9856d7390049f84647256ad7 Mon Sep 17 00:00:00 2001 From: Ron Date: Mon, 14 Oct 2024 23:10:11 +0800 Subject: [PATCH] Improve alarm (#1310) * Improve alarm * Alarm config * Fix alarm * Loosen alarm to allow missing data * Separate westend services * Breach alarm for production only * Remove unused * Change SCAN_INTERVAL to 1 hour for production * Change Latency Threshold --- web/packages/api/src/status.ts | 10 +- web/packages/operations/.env.example | 9 +- web/packages/operations/.env.production | 7 -- web/packages/operations/.env.testnet | 7 -- web/packages/operations/ecosystem.config.js | 10 +- web/packages/operations/src/alarm.ts | 101 +++++++++++++++++--- 6 files changed, 109 insertions(+), 35 deletions(-) delete mode 100644 web/packages/operations/.env.production delete mode 100644 web/packages/operations/.env.testnet diff --git a/web/packages/api/src/status.ts b/web/packages/api/src/status.ts index fecda19f4c..5dcf26ac5e 100644 --- a/web/packages/api/src/status.ts +++ b/web/packages/api/src/status.ts @@ -65,15 +65,17 @@ export enum AlarmReason { AccountBalanceInsufficient = "AccountBalanceInsufficient", ToEthereumNoTransfer = "ToEthereumNoTransfer", ToPolkadotNoTransfer = "ToPolkadotNoTransfer", + ToEthereumChannelAttacked = "ToEthereumChannelAttacked", + ToPolkadotChannelAttacked = "ToPolkadotChannelAttacked" } export type Sovereign = { name: string; account: string; balance: bigint; type: SourceType } export const BlockLatencyThreshold = { - // Syncing beefy finality update every 4 hours(2400 blocks) so we set 3000 blocks at most. - ToEthereum: 3000, - // Syncing beacon finality update every 6.4 minutes(32 blocks) so we set 128 blocks (4 epochs) at most. 
- ToPolkadot: 128, + // Syncing beefy finality update every 4 hours(1200 ethereum blocks), leave some buffer here + ToEthereum: 1350, + // Syncing beacon finality update every 6.4 minutes(64 substrate blocks), leave some buffer here + ToPolkadot: 80, } export const InsufficientBalanceThreshold = { diff --git a/web/packages/operations/.env.example b/web/packages/operations/.env.example index a679a1de38..674e0a5d3b 100644 --- a/web/packages/operations/.env.example +++ b/web/packages/operations/.env.example @@ -6,6 +6,7 @@ AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_REGION=eu-central-1 BRIDGE_STALE_SNS_TOPIC=arn:aws:sns:eu-central-1:232374692033:PD +BRIDGE_ATTACKED_SNS_TOPIC=arn:aws:sns:eu-central-1:232374692033:PD ACCOUNT_BALANCE_SNS_TOPIC=arn:aws:sns:eu-central-1:232374692033:PD-WALLET # INFURA Key config @@ -15,11 +16,15 @@ REACT_APP_INFURA_KEY= GRAPHQL_API_URL=https://data.snowbridge.network/graphql # Scan interval(in minutes) -SCAN_INTERVAL=30 +SCAN_INTERVAL=60 # Keys ETHEREUM_KEY= SUBSTRATE_KEY= -PENPAL_TRANSFER=false +# Cron expression to run token transfer tests CRON_EXPRESSION=0 0 * * * + +# Dashboard Url +LATENCY_DASHBOARD_URL= +BALANCE_DASHBOARD_URL= diff --git a/web/packages/operations/.env.production b/web/packages/operations/.env.production deleted file mode 100644 index 4a38bcd74d..0000000000 --- a/web/packages/operations/.env.production +++ /dev/null @@ -1,7 +0,0 @@ -NODE_ENV=polkadot_mainnet -REACT_APP_INFURA_KEY= -AWS_ACCESS_KEY_ID= -AWS_SECRET_ACCESS_KEY= -AWS_REGION=eu-central-1 -BRIDGE_STALE_SNS_TOPIC=arn:aws:sns:eu-central-1:232374692033:PD -ACCOUNT_BALANCE_SNS_TOPIC=arn:aws:sns:eu-central-1:232374692033:PD-WALLET diff --git a/web/packages/operations/.env.testnet b/web/packages/operations/.env.testnet deleted file mode 100644 index ff37d51178..0000000000 --- a/web/packages/operations/.env.testnet +++ /dev/null @@ -1,7 +0,0 @@ -NODE_ENV=rococo_sepolia -REACT_APP_INFURA_KEY= -AWS_ACCESS_KEY_ID= -AWS_SECRET_ACCESS_KEY= -AWS_REGION=eu-central-1 
-BRIDGE_STALE_SNS_TOPIC=arn:aws:sns:eu-central-1:232374692033:Testnet -ACCOUNT_BALANCE_SNS_TOPIC=arn:aws:sns:eu-central-1:232374692033:Testnet diff --git a/web/packages/operations/ecosystem.config.js b/web/packages/operations/ecosystem.config.js index a7807f8673..c015a09e31 100644 --- a/web/packages/operations/ecosystem.config.js +++ b/web/packages/operations/ecosystem.config.js @@ -7,13 +7,19 @@ module.exports = { args: "cron" }, { - name: "transferToPolkadot", + name: "westend-monitor", + node_args: "--require=dotenv/config", + script: "./dist/src/main.js", + args: "cron" + }, + { + name: "westend-transferToPolkadot", node_args: "--require=dotenv/config", script: "./dist/src/transfer_to_polkadot.js", args: "cron" }, { - name: "transferToEthereum", + name: "westend-transferToEthereum", node_args: "--require=dotenv/config", script: "./dist/src/transfer_to_ethereum.js", args: "cron" diff --git a/web/packages/operations/src/alarm.ts b/web/packages/operations/src/alarm.ts index 6f3d68b564..839c3e93ac 100644 --- a/web/packages/operations/src/alarm.ts +++ b/web/packages/operations/src/alarm.ts @@ -7,12 +7,13 @@ import { const CLOUD_WATCH_NAME_SPACE = "SnowbridgeMetrics" const BRIDGE_STALE_SNS_TOPIC = process.env["BRIDGE_STALE_SNS_TOPIC"] || "" +const BRIDGE_ATTACKED_SNS_TOPIC = process.env["BRIDGE_ATTACKED_SNS_TOPIC"] || "" const ACCOUNT_BALANCE_SNS_TOPIC = process.env["ACCOUNT_BALANCE_SNS_TOPIC"] || "" const LatencyDashboard = - "https://eu-central-1.console.aws.amazon.com/cloudwatch/home?region=eu-central-1#dashboards/dashboard/Latency" + process.env["LATENCY_DASHBOARD_URL"] || "https://eu-central-1.console.aws.amazon.com/cloudwatch/home?region=eu-central-1#dashboards/dashboard/Latency" const BalanceDashboard = - "https://eu-central-1.console.aws.amazon.com/cloudwatch/home?region=eu-central-1#dashboards/dashboard/Balance" + process.env["BALANCE_DASHBOARD_URL"] || 
"https://eu-central-1.console.aws.amazon.com/cloudwatch/home?region=eu-central-1#dashboards/dashboard/Balance" export const sendMetrics = async (metrics: status.AllMetrics) => { const { AlarmReason, InsufficientBalanceThreshold } = status @@ -112,9 +113,14 @@ export const sendMetrics = async (metrics: status.AllMetrics) => { metricData.push({ MetricName: AlarmReason.ToEthereumChannelStale.toString(), Value: Number( - channel.toEthereum.outbound < channel.toEthereum.inbound || (channel.toEthereum.outbound > channel.toEthereum.inbound && - channel.toEthereum.inbound <= channel.toEthereum.previousInbound) + channel.toEthereum.inbound == channel.toEthereum.previousInbound) + ), + }) + metricData.push({ + MetricName: AlarmReason.ToEthereumChannelAttacked.toString(), + Value: Number( + channel.toEthereum.outbound < channel.toEthereum.inbound ), }) metricData.push({ @@ -167,9 +173,20 @@ export const sendMetrics = async (metrics: status.AllMetrics) => { metricData.push({ MetricName: AlarmReason.ToPolkadotChannelStale.toString(), Value: Number( - channel.toPolkadot.outbound < channel.toPolkadot.inbound || (channel.toPolkadot.outbound > channel.toPolkadot.inbound && - channel.toPolkadot.inbound <= channel.toPolkadot.previousInbound) + channel.toPolkadot.inbound == channel.toPolkadot.previousInbound) + ), + }) + metricData.push({ + MetricName: AlarmReason.ToPolkadotChannelAttacked.toString(), + Value: Number( + channel.toPolkadot.outbound < channel.toPolkadot.inbound + ), + }) + metricData.push({ + MetricName: AlarmReason.ToPolkadotNoTransfer.toString(), + Value: Number( + channel.toPolkadot.inbound == channel.toPolkadot.previousInbound ), }) } @@ -234,10 +251,12 @@ export const initializeAlarms = async () => { let client = new CloudWatchClient({}) let cloudWatchAlarms = [] - let alarmCommandSharedInput = { + let alarmCommandSharedInput: any = { Namespace: CLOUD_WATCH_NAME_SPACE + "-" + name, - Threshold: 0, - TreatMissingData: "breaching", + Threshold: 0 + } + if(name == 
"polkadot_mainnet") { + alarmCommandSharedInput.TreatMissingData = "breaching"; } // Alarm for stale bridge @@ -263,7 +282,7 @@ export const initializeAlarms = async () => { ComparisonOperator: "GreaterThanThreshold", AlarmActions: [BRIDGE_STALE_SNS_TOPIC], EvaluationPeriods: 3, - Period: 1800, + Period: 3600, ...alarmCommandSharedInput, }) ) @@ -289,10 +308,66 @@ export const initializeAlarms = async () => { ComparisonOperator: "GreaterThanThreshold", AlarmActions: [BRIDGE_STALE_SNS_TOPIC], EvaluationPeriods: 3, - Period: 1800, + Period: 3600, + ...alarmCommandSharedInput, + }) + ) + cloudWatchAlarms.push( + new PutMetricAlarmCommand({ + AlarmName: AlarmReason.ToEthereumChannelAttacked.toString() + "-" + name, + MetricName: AlarmReason.ToEthereumChannelAttacked.toString(), + AlarmDescription: LatencyDashboard, + Statistic: "Average", + ComparisonOperator: "GreaterThanThreshold", + AlarmActions: [BRIDGE_ATTACKED_SNS_TOPIC], + EvaluationPeriods: 3, + Period: 3600, ...alarmCommandSharedInput, }) ) + cloudWatchAlarms.push( + new PutMetricAlarmCommand({ + AlarmName: AlarmReason.ToPolkadotChannelAttacked.toString() + "-" + name, + MetricName: AlarmReason.ToPolkadotChannelAttacked.toString(), + AlarmDescription: LatencyDashboard, + Statistic: "Average", + ComparisonOperator: "GreaterThanThreshold", + AlarmActions: [BRIDGE_ATTACKED_SNS_TOPIC], + EvaluationPeriods: 3, + Period: 3600, + ...alarmCommandSharedInput, + }) + ) + // For westend alarm when there is no transfer(i.e. 
nonce not increased) for more than 1 day + if(name == "westend_sepolia") { + cloudWatchAlarms.push( + new PutMetricAlarmCommand({ + AlarmName: AlarmReason.ToEthereumNoTransfer.toString() + "-" + name, + MetricName: AlarmReason.ToEthereumNoTransfer.toString(), + AlarmDescription: LatencyDashboard, + Statistic: "Average", + ComparisonOperator: "GreaterThanThreshold", + AlarmActions: [BRIDGE_STALE_SNS_TOPIC], + EvaluationPeriods: 3, + Period: 21600, + ...alarmCommandSharedInput, + }) + ) + cloudWatchAlarms.push( + new PutMetricAlarmCommand({ + AlarmName: AlarmReason.ToPolkadotNoTransfer.toString() + "-" + name, + MetricName: AlarmReason.ToPolkadotNoTransfer.toString(), + AlarmDescription: LatencyDashboard, + Statistic: "Average", + ComparisonOperator: "GreaterThanThreshold", + AlarmActions: [BRIDGE_STALE_SNS_TOPIC], + EvaluationPeriods: 3, + Period: 21600, + ...alarmCommandSharedInput, + }) + ) + } + for (let alarm of cloudWatchAlarms) { await client.send(alarm) } @@ -305,8 +380,8 @@ export const initializeAlarms = async () => { Statistic: "Average", ComparisonOperator: "GreaterThanThreshold", AlarmActions: [ACCOUNT_BALANCE_SNS_TOPIC], - EvaluationPeriods: 1, - Period: 1800, + EvaluationPeriods: 2, + Period: 3600, ...alarmCommandSharedInput, }) await client.send(accountBalanceAlarm)