From 8a6f97bfd7d981c8e4169085243bd36e083e9845 Mon Sep 17 00:00:00 2001 From: Daryl Collins Date: Tue, 12 Mar 2024 16:54:08 +0000 Subject: [PATCH] feature(core): LIT-2560 - Refactor connect() logic - Ensure network sync is a mostly atomic operation - don't write node properties to client until handshakes complete - Ensure multiple calls to the same core instance's `connect()` method are coalesced into one handshake operation - Await operations inside of `connect()`, so that failures bubble up to the caller and they can react appropriately - Refactor `listenForNewEpoch()` to track staking contract listener in a way that allows for disconnection later and ensure one-and-only-one listener is ever attached - Add `disconnect()` method that clears all intervals to allow consumers to shut down gracefully - Make `networkSyncInterval` explicitly private (use `disconnect()`!) - Fix a variety of incorrect types - Replace 500ms polling setInterval that compared length of `serverKeys` and `config.bootstrapUrls` with Promise.race() for timeout condition - JSDocs and clearer context in error messages --- packages/core/src/lib/lit-core.ts | 636 ++++++++++++++++++------------ 1 file changed, 380 insertions(+), 256 deletions(-) diff --git a/packages/core/src/lib/lit-core.ts b/packages/core/src/lib/lit-core.ts index 02f9d1713d..10b1d05867 100644 --- a/packages/core/src/lib/lit-core.ts +++ b/packages/core/src/lib/lit-core.ts @@ -45,7 +45,6 @@ import type { FormattedMultipleAccs, HandshakeWithNode, JsonHandshakeResponse, - KV, LitNodeClientConfig, MultipleAccessControlConditions, NodeClientErrorV0, @@ -61,6 +60,17 @@ import type { SupportedJsonRequests, } from '@lit-protocol/types'; +type Listener = (...args: any[]) => void; + +interface CoreNodeConfig { + subnetPubKey: string; + networkPubKey: string; + networkPubKeySet: string; + hdRootPubkeys: string[]; + latestBlockhash: string; + lastBlockHashRetrieved: number; +} + export class LitCore { config: LitNodeClientConfig = { alertWhenUnauthorized: false, @@ -76,8 +86,8 @@ export class LitCore { interval: 100, }, }; - connectedNodes: Set = new Set(); - serverKeys: KV = {}; + connectedNodes = new Set(); + serverKeys: Record = {}; ready: boolean = false; subnetPubKey: string | null = null; networkPubKey: string | null = null; @@ -85,7 +95,10 @@ export class LitCore { hdRootPubkeys: string[] | null = null; latestBlockhash: string | null = null; lastBlockHashRetrieved: number | null = null; - networkSyncInterval: ReturnType | null = null; + private _networkSyncInterval: ReturnType | null = null; + private _stakingContract: ethers.Contract | null = null; + private _stakingContractListener: null | Listener = null; + private _connectingPromise: null | Promise = null; constructor(config: LitNodeClientConfig | CustomNetwork) { // Initialize default config based on litNetwork @@ -93,24 +106,25 @@ export class LitCore { case LitNetwork.Cayenne: this.config = { ...this.config, - litNetwork: LitNetwork.Cayenne, + ...config, }; break; case LitNetwork.Manzano: this.config = { ...this.config, - litNetwork: LitNetwork.Manzano, checkNodeAttestation: true, + ...config, }; break; case LitNetwork.Habanero: this.config = { ...this.config, - litNetwork: LitNetwork.Habanero, checkNodeAttestation: true, + ...config, }; break; default: + // Probably `custom` or `localhost` this.config = { ...this.config, ...config, @@ -246,6 +260,54 @@ export class LitCore { } }; + async handleStakingContractStateChange(state: StakingStates) { + log(`New state detected: "${state}"`); + if (state === StakingStates.NextValidatorSetLocked) { + try { + log( + 'State found to be new validator set locked, checking validator set' + ); + const oldNodeUrls: string[] = [...this.config.bootstrapUrls].sort(); + await this.setNewConfig(); + const currentNodeUrls: string[] = this.config.bootstrapUrls.sort(); + const delta: string[] = currentNodeUrls.filter((item) => + oldNodeUrls.includes(item) + ); + // if the sets differ we reconnect. + if (delta.length > 1) { + // check if the node sets are non-matching and re-connect if they do not. + /* + TODO: This covers *most* cases where a node may come in or out of the active + set which we will need to re attest to the execution environments. + However, the sdk currently does not know if there is an active network operation pending. + Such that the state when the request was sent will now mutate when the response is sent back. + The sdk should be able to understand its current execution environment and wait on an active + network request to the previous epoch's node set before changing over. + */ + log( + 'Active validator sets changed, new validators ', + delta, + 'starting node connection' + ); + } + + await this.connect(); + } catch (err: unknown) { + // FIXME: We should emit an error event so that consumers know that we are de-synced and can connect() again + // But for now, our every-30-second network sync will fix things in at most 30s from now. + // this.ready = false; Should we assume core is invalid if we encountered errors refreshing from an epoch change? + const { message = '' } = err as + | Error + | NodeClientErrorV0 + | NodeClientErrorV1; + logError( + 'Error while attempting to reconnect to nodes after epoch transition:', + message + ); + } + } + } + /** * Sets up a listener to detect state changes (new epochs) in the staking contract. * When a new epoch is detected, it triggers the `setNewConfig` function to update @@ -256,13 +318,18 @@ export class LitCore { * @returns {Promise} A promise that resolves when the listener is successfully set up. */ listenForNewEpoch = async (): Promise => { + if (this._stakingContractListener) { + // Already listening, do nothing + return; + } + if ( this.config.litNetwork === LitNetwork.Manzano || this.config.litNetwork === LitNetwork.Habanero || this.config.litNetwork === LitNetwork.Custom ) { const stakingContract = await LitContracts.getStakingContract( - this.config.litNetwork as any, + this.config.litNetwork, this.config.contractContext ); log( @@ -270,49 +337,34 @@ export class LitCore { stakingContract.address ); - stakingContract.on('StateChanged', async (state: StakingStates) => { - log(`New state detected: "${state}"`); - if (state === StakingStates.NextValidatorSetLocked) { - log( - 'State found to be new validator set locked, checking validator set' - ); - const oldNodeUrls: string[] = [...this.config.bootstrapUrls].sort(); - await this.setNewConfig(); - const currentNodeUrls: string[] = this.config.bootstrapUrls.sort(); - const delta: string[] = currentNodeUrls.filter((item) => - oldNodeUrls.includes(item) - ); - // if the sets differ we reconnect. - if (delta.length > 1) { - // check if the node sets are non-matching and re-connect if they do not. - /* - TODO: This covers *most* cases where a node may come in or out of the active - set which we will need to re attest to the execution environments. - However, the sdk currently does not know if there is an active network operation pending. - Such that the state when the request was sent will now mutate when the response is sent back. - The sdk should be able to understand its current execution environment and wait on an active - network request to the previous epoch's node set before changing over. - */ - log( - 'Active validator sets changed, new validators ', - delta, - 'starting node connection' - ); - this.connectedNodes = - await this._runHandshakeWithBootstrapUrls().catch( - (err: NodeClientErrorV0 | NodeClientErrorV1) => { - logError( - 'Error while attempting to reconnect to nodes after epoch transition: ', - err.message - ); - } - ); - } - } - }); + this._stakingContract = stakingContract; + // Stash a function instance, because its identity must be consistent for '.off()' usage to work later + this._stakingContractListener = (state: StakingStates) => { + // Intentionally not return or await; Listeners are _not async_ + this.handleStakingContractStateChange(state); + }; + this._stakingContract.on('StateChanged', this._stakingContractListener); } }; + async disconnect() { + this._stopListeningForNewEpoch(); + this._stopNetworkPolling(); + } + + _stopNetworkPolling() { + if (this._networkSyncInterval) { + clearInterval(this._networkSyncInterval); + this._networkSyncInterval = null; + } + } + _stopListeningForNewEpoch() { + if (this._stakingContract && this._stakingContractListener) { + this._stakingContract.off('StateChanged', this._stakingContractListener); + this._stakingContractListener = null; + } + } + /** * * Set bootstrapUrls to match the network litNetwork unless it's set to custom @@ -342,30 +394,166 @@ export class LitCore { }; /** - * * Connect to the LIT nodes * * @returns { Promise } A promise that resolves when the nodes are connected. * */ - connect = async (): Promise => { - // -- handshake with each node + async connect(): Promise { + // Ensure that multiple closely timed calls to `connect()` don't result in concurrent connect() operations being run + if (this._connectingPromise) { + return this._connectingPromise; + } + + this._connectingPromise = this._connect(); + + await this._connectingPromise.finally(() => { + this._connectingPromise = null; + }); + } + + private async _connect() { + // Ensure an ill-timed epoch change event doesn't trigger concurrent config changes while we're already doing that + this._stopListeningForNewEpoch(); + + // Ensure we don't fire an existing network sync poll handler while we're in the midst of connecting anyway + this._stopNetworkPolling(); + await this.setNewConfig(); + + // -- handshake with each node. Note that if we've previously initialized successfully, but this call fails, + // core will remain useable but with the existing set of `connectedNodes` and `serverKeys`. + const { connectedNodes, serverKeys, coreNodeConfig } = + await this._runHandshakeWithBootstrapUrls(); + Object.assign(this, { ...coreNodeConfig, connectedNodes, serverKeys }); + + this.scheduleNetworkSync(); await this.listenForNewEpoch(); - await this._runHandshakeWithBootstrapUrls(); - }; - /** + // FIXME: don't create global singleton; multiple instances of `core` should not all write to global + // @ts-ignore + + globalThis.litNodeClient = this; + this.ready = true; + + log(`🔥 lit is ready. "litNodeClient" variable is ready to use globally.`); + log('current network config', { + networkPubkey: this.networkPubKey, + networkPubKeySet: this.networkPubKeySet, + hdRootPubkeys: this.hdRootPubkeys, + subnetPubkey: this.subnetPubKey, + latestBlockhash: this.latestBlockhash, + }); + + // browser only + if (isBrowser()) { + document.dispatchEvent(new Event('lit-ready')); + } + } + + private async handshakeAndVerifyNodeAttestation({ + url, + requestId, + }: { + url: string; + requestId: string; + }): Promise { + const challenge = this.getRandomHexString(64); + + const handshakeResult = await this.handshakeWithNode( + { url, challenge }, + requestId + ); + + const keys: JsonHandshakeResponse = { + serverPubKey: handshakeResult.serverPublicKey, + subnetPubKey: handshakeResult.subnetPublicKey, + networkPubKey: handshakeResult.networkPublicKey, + networkPubKeySet: handshakeResult.networkPublicKeySet, + hdRootPubkeys: handshakeResult.hdRootPubkeys, + latestBlockhash: handshakeResult.latestBlockhash, + }; + + // Nodes that have just bootstrapped will not have negotiated their keys, yet + // They will return ERR for those values until they reach consensus + + // Note that if node attestation checks are disabled or checkSevSnpAttestation() succeeds, we will still track the + // node, even though its keys may be "ERR". + // Should we really track servers with ERR as keys? + if ( + keys.serverPubKey === 'ERR' || + keys.subnetPubKey === 'ERR' || + keys.networkPubKey === 'ERR' || + keys.networkPubKeySet === 'ERR' + ) { + logErrorWithRequestId( + requestId, + 'Error connecting to node. Detected "ERR" in keys', + url, + keys + ); + } + + log(`Handshake with ${url} returned keys: `, keys); + if (!keys.latestBlockhash) { + logErrorWithRequestId( + requestId, + `Error getting latest blockhash from the node ${url}.` + ); + } + + if ( + this.config.checkNodeAttestation || + this.config.litNetwork === LitNetwork.Manzano || + this.config.litNetwork === LitNetwork.Habanero + ) { + const attestation = handshakeResult.attestation; + + if (!attestation) { + throwError({ + message: `Missing attestation in handshake response from ${url}`, + errorKind: LIT_ERROR.INVALID_NODE_ATTESTATION.kind, + errorCode: LIT_ERROR.INVALID_NODE_ATTESTATION.name, + }); + } + + // actually verify the attestation by checking the signature against AMD certs + log('Checking attestation against amd certs...'); + + try { + // ensure we won't try to use a node with an invalid attestation response + await checkSevSnpAttestation(attestation, challenge, url); + log(`Lit Node Attestation verified for ${url}`); + } catch (e: any) { + throwError({ + message: `Lit Node Attestation failed verification for ${url} - ${e.message}`, + errorKind: LIT_ERROR.INVALID_NODE_ATTESTATION.kind, + errorCode: LIT_ERROR.INVALID_NODE_ATTESTATION.name, + }); + } + } + + return keys; + } + + /** Handshakes with all nodes that are in `bootstrapUrls` + * @private * - * @returns {Promise} + * @returns {Promise<{connectedNodes: Set, serverKeys: {}}>} Returns a set of the urls of nodes that we + * successfully connected to, an object containing their returned keys, and our 'core' config (most common values for + * critical values) */ - - _runHandshakeWithBootstrapUrls = async (): Promise => { + private async _runHandshakeWithBootstrapUrls(): Promise<{ + connectedNodes: Set; + serverKeys: Record; + coreNodeConfig: CoreNodeConfig; + }> { // -- handshake with each node - const requestId = this.getRequestId(); + const requestId: string = this.getRequestId(); - // reset connectedNodes for the new handshake operation - this.connectedNodes = new Set(); + // track connectedNodes for the new handshake operation + const connectedNodes = new Set(); + const serverKeys: Record = {}; if (this.config.bootstrapUrls.length <= 0) { throwError({ @@ -375,210 +563,146 @@ export class LitCore { }); } - for (const url of this.config.bootstrapUrls) { - const challenge = this.getRandomHexString(64); - this.handshakeWithNode({ url, challenge }, requestId) - .then((resp: any) => { - this.connectedNodes.add(url); - - const keys: JsonHandshakeResponse = { - serverPubKey: resp.serverPublicKey, - subnetPubKey: resp.subnetPublicKey, - networkPubKey: resp.networkPublicKey, - networkPubKeySet: resp.networkPublicKeySet, - hdRootPubkeys: resp.hdRootPubkeys, - latestBlockhash: resp.latestBlockhash, - }; - - // -- validate returned keys - if ( - keys.serverPubKey === 'ERR' || - keys.subnetPubKey === 'ERR' || - keys.networkPubKey === 'ERR' || - keys.networkPubKeySet === 'ERR' - ) { - logErrorWithRequestId( - requestId, - 'Error connecting to node. Detected "ERR" in keys', - url, - keys - ); - } - log('returned keys: ', keys); - if (!keys.latestBlockhash) { - logErrorWithRequestId( - requestId, - 'Error getting latest blockhash from the node.' - ); - } - - if ( - this.config.checkNodeAttestation || - this.config.litNetwork === LitNetwork.Manzano || - this.config.litNetwork === LitNetwork.Habanero - ) { - // check attestation - if (!resp.attestation) { - logErrorWithRequestId( - requestId, - `Missing attestation in handshake response from ${url}` - ); - throwError({ - message: `Missing attestation in handshake response from ${url}`, - errorKind: LIT_ERROR.INVALID_NODE_ATTESTATION.kind, - errorCode: LIT_ERROR.INVALID_NODE_ATTESTATION.name, - }); - } else { - // actually verify the attestation by checking the signature against AMD certs - log('Checking attestation against amd certs...'); - const attestation = resp.attestation; - - try { - checkSevSnpAttestation(attestation, challenge, url).then(() => { - log(`Lit Node Attestation verified for ${url}`); - - // only set server keys if attestation is valid - // so that we don't use this node if it's not valid - this.serverKeys[url] = keys; - }); - } catch (e) { - logErrorWithRequestId( - requestId, - `Lit Node Attestation failed verification for ${url}` - ); - throwError({ - message: `Lit Node Attestation failed verification for ${url}`, - errorKind: LIT_ERROR.INVALID_NODE_ATTESTATION.kind, - errorCode: LIT_ERROR.INVALID_NODE_ATTESTATION.name, - }); - } - } - } else { - // don't check attestation, just set server keys - this.serverKeys[url] = keys; + let timeoutHandle: ReturnType; + await Promise.race([ + new Promise((_resolve, reject) => { + timeoutHandle = setTimeout(() => { + const msg = `Error: Could not connect to enough nodes after timeout of ${ + this.config.connectTimeout + }ms. Could only connect to ${Object.keys(serverKeys).length} of ${ + this.config.minNodeCount + } required nodes, from ${ + this.config.bootstrapUrls.length + } possible nodes. Please check your network connection and try again. Note that you can control this timeout with the connectTimeout config option which takes milliseconds.`; + + try { + // TODO: Kludge, replace with standard error construction + throwError({ + message: msg, + errorKind: LIT_ERROR.INIT_ERROR.kind, + errorCode: LIT_ERROR.INIT_ERROR.name, + }); + } catch (e) { + reject(e); } + }, this.config.connectTimeout); + }), + Promise.all( + this.config.bootstrapUrls.map(async (url) => { + serverKeys[url] = await this.handshakeAndVerifyNodeAttestation({ + url, + requestId, + }); + connectedNodes.add(url); }) - .catch((e: any) => { - log('Error connecting to node ', url, e); - }); - } + ).finally(() => { + clearTimeout(timeoutHandle); + }), + ]); - // -- get promise - return new Promise((resolve: any, reject: any) => { - const startTime = Date.now(); - const interval = setInterval(() => { - if ( - Object.keys(this.serverKeys).length == - this.config.bootstrapUrls.length - ) { - clearInterval(interval); - - // pick the most common public keys for the subnet and network from the bunch, in case some evil node returned a bad key - this.subnetPubKey = mostCommonString( - Object.values(this.serverKeys).map( - (keysFromSingleNode: any) => keysFromSingleNode.subnetPubKey - ) - ); - this.networkPubKey = mostCommonString( - Object.values(this.serverKeys).map( - (keysFromSingleNode: any) => keysFromSingleNode.networkPubKey - ) - ); - this.networkPubKeySet = mostCommonString( - Object.values(this.serverKeys).map( - (keysFromSingleNode: any) => keysFromSingleNode.networkPubKeySet - ) - ); - this.hdRootPubkeys = mostCommonString( - Object.values(this.serverKeys).map( - (keysFromSingleNode: any) => keysFromSingleNode.hdRootPubkeys - ) - ); - this.latestBlockhash = mostCommonString( - Object.values(this.serverKeys).map( - (keysFromSingleNode: any) => keysFromSingleNode.latestBlockhash - ) - ); + const coreNodeConfig = this._getCoreNodeConfigFromHandshakeResults({ + serverKeys, + requestId, + }); - if (!this.latestBlockhash) { - logErrorWithRequestId( - requestId, - 'Error getting latest blockhash from the nodes.' - ); + return { connectedNodes, serverKeys, coreNodeConfig }; + } - throwError({ - message: 'Error getting latest blockhash from the nodes.', - errorKind: LIT_ERROR.INVALID_ETH_BLOCKHASH.kind, - errorCode: LIT_ERROR.INVALID_ETH_BLOCKHASH.name, - }); - } + private _getCoreNodeConfigFromHandshakeResults({ + serverKeys, + requestId, + }: { + serverKeys: Record; + requestId: string; + }): CoreNodeConfig { + const latestBlockhash = mostCommonString( + Object.values(serverKeys).map( + (keysFromSingleNode: any) => keysFromSingleNode.latestBlockhash + ) + ); - this.lastBlockHashRetrieved = Date.now(); - this.ready = true; + if (!latestBlockhash) { + logErrorWithRequestId( + requestId, + 'Error getting latest blockhash from the nodes.' + ); - log( - `🔥 lit is ready. "litNodeClient" variable is ready to use globally.` - ); - log('current network config', { - networkPubkey: this.networkPubKey, - networkPubKeySet: this.networkPubKeySet, - hdRootPubkeys: this.hdRootPubkeys, - subnetPubkey: this.subnetPubKey, - latestBlockhash: this.latestBlockhash, - }); + throwError({ + message: 'Error getting latest blockhash from the nodes.', + errorKind: LIT_ERROR.INVALID_ETH_BLOCKHASH.kind, + errorCode: LIT_ERROR.INVALID_ETH_BLOCKHASH.name, + }); + } - // FIXME: don't create global singleton; multiple instances of `core` should not all write to global - // @ts-ignore - globalThis.litNodeClient = this; + // pick the most common public keys for the subnet and network from the bunch, in case some evil node returned a bad key + return { + subnetPubKey: mostCommonString( + Object.values(serverKeys).map( + (keysFromSingleNode: any) => keysFromSingleNode.subnetPubKey + ) + ), + networkPubKey: mostCommonString( + Object.values(serverKeys).map( + (keysFromSingleNode: any) => keysFromSingleNode.networkPubKey + ) + ), + networkPubKeySet: mostCommonString( + Object.values(serverKeys).map( + (keysFromSingleNode: any) => keysFromSingleNode.networkPubKeySet + ) + ), + hdRootPubkeys: mostCommonString( + Object.values(serverKeys).map( + (keysFromSingleNode: any) => keysFromSingleNode.hdRootPubkeys + ) + ), + latestBlockhash, + lastBlockHashRetrieved: Date.now(), + } as CoreNodeConfig; + } - // browser only - if (isBrowser()) { - document.dispatchEvent(new Event('lit-ready')); - } - // if the interval is defined we clear it - if (this.networkSyncInterval) { - clearInterval(this.networkSyncInterval); - } + /** Currently, we perform a full sync every 30s, including handshaking with every node + * However, we also have a state change listener that watches for staking contract state change events, which + * _should_ be the only time that we need to perform handshakes with every node. + * + * However, the current block hash does need to be updated regularly, and we currently update it only when we + * handshake with every node. + * + * We can remove this network sync code entirely if we refactor our code to fetch latest blockhash on-demand. + * @private + */ + private scheduleNetworkSync() { + if (this._networkSyncInterval) { + clearInterval(this._networkSyncInterval); + } - this.networkSyncInterval = setInterval(async () => { - if (Date.now() - this.lastBlockHashRetrieved! >= 30_000) { - log( - 'Syncing state for new network context current config: ', - this.config, - 'current blockhash: ', - this.lastBlockHashRetrieved - ); - await this._runHandshakeWithBootstrapUrls().catch((err) => { - throw err; - }); - log( - 'Done syncing state new config: ', - this.config, - 'new blockhash: ', - this.lastBlockHashRetrieved - ); - } - }, 30_000); - - resolve(); - } else { - const now = Date.now(); - if (now - startTime > this.config.connectTimeout) { - clearInterval(interval); - const msg = `Error: Could not connect to enough nodes after timeout of ${ - this.config.connectTimeout - }ms. Could only connect to ${ - Object.keys(this.serverKeys).length - } of ${ - this.config.bootstrapUrls.length - } required nodes. Please check your network connection and try again. Note that you can control this timeout with the connectTimeout config option which takes milliseconds.`; - logErrorWithRequestId(requestId, msg); - reject(msg); - } + this._networkSyncInterval = setInterval(async () => { + if (Date.now() - this.lastBlockHashRetrieved! >= 30_000) { + log( + 'Syncing state for new network context current config: ', + this.config, + 'current blockhash: ', + this.lastBlockHashRetrieved + ); + try { + await this.connect(); + log( + 'Done syncing state new config: ', + this.config, + 'new blockhash: ', + this.lastBlockHashRetrieved + ); + } catch (err: unknown) { + // Don't let error from this setInterval handler bubble up to runtime; it'd be an unhandledRejectionError + const { message = '' } = err as Error | NodeClientErrorV1; + logError( + 'Error while attempting to refresh nodes to fetch new latestBlockhash:', + message + ); } - }, 500); - }); - }; + } + }, 30_000); + } /** * @@ -694,7 +818,7 @@ export class LitCore { * @returns { Array> } * */ - getNodePromises = (callback: Function): Array> => { + getNodePromises = (callback: Function): Promise[] => { const nodePromises = []; for (const url of this.connectedNodes) { @@ -828,12 +952,12 @@ export class LitCore { * @returns { Promise | RejectedNodePromises> } */ handleNodePromises = async ( - nodePromises: Array>, + nodePromises: Promise[], requestId: string, minNodeCount: number ): Promise | RejectedNodePromises> => { async function waitForNSuccessesWithErrors( - promises: Array>, + promises: Promise[], n: number ): Promise<{ successes: T[]; errors: any[] }> { let responses = 0;