From d51003934491bb16545b830e0777985cb5dfccf4 Mon Sep 17 00:00:00 2001 From: Jonathan Harvey-Buschel Date: Wed, 15 Nov 2023 12:48:02 -0500 Subject: [PATCH 01/10] tapgarden: stop and delete caretaker on err In this commit, we ensure that the planter clears the pending batch if batch finalization fails. This allows users to create a new batch and resubmit the assets from the failed batch, and ensures that caretakers are destroyed after failure. --- tapgarden/caretaker.go | 4 +++- tapgarden/planter.go | 23 ++++++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/tapgarden/caretaker.go b/tapgarden/caretaker.go index fb1c1c65a..684f2af20 100644 --- a/tapgarden/caretaker.go +++ b/tapgarden/caretaker.go @@ -166,6 +166,8 @@ func (b *BatchCaretaker) Start() error { func (b *BatchCaretaker) Stop() error { var stopErr error b.stopOnce.Do(func() { + log.Infof("BatchCaretaker(%x): Stopping", b.batchKey[:]) + close(b.Quit) b.Wg.Wait() }) @@ -313,7 +315,7 @@ func (b *BatchCaretaker) assetCultivator() { currentBatchState, BatchStateBroadcast, ) if err != nil { - log.Errorf("unable to advance state machine: %v", err) + log.Errorf("Unable to advance state machine: %v", err) b.cfg.BroadcastErrChan <- err return } diff --git a/tapgarden/planter.go b/tapgarden/planter.go index 436e00bf2..586f555e3 100644 --- a/tapgarden/planter.go +++ b/tapgarden/planter.go @@ -344,7 +344,8 @@ func (c *ChainPlanter) stopCaretakers() { if err := caretaker.Stop(); err != nil { // TODO(roasbeef): continue and stop the rest // of them? - log.Warnf("Unable to stop ChainCaretaker(%x)", batchKey[:]) + log.Warnf("Unable to stop ChainCaretaker(%x)", + batchKey[:]) return } } @@ -547,14 +548,14 @@ func (c *ChainPlanter) gardener() { case batchKey := <-c.completionSignals: caretaker, ok := c.caretakers[batchKey] if !ok { - log.Warnf("unknown caretaker: %x", batchKey[:]) + log.Warnf("Unknown caretaker: %x", batchKey[:]) continue } log.Infof("ChainCaretaker(%x) has finished", batchKey[:]) if err := caretaker.Stop(); err != nil { - log.Warnf("unable to stop care taker: %v", err) + log.Warnf("Unable to stop caretaker: %v", err) } delete(c.caretakers, batchKey) @@ -597,8 +598,8 @@ func (c *ChainPlanter) gardener() { } batchKey := c.pendingBatch.BatchKey.PubKey - log.Infof("Finalizing batch %x", - batchKey.SerializeCompressed()) + batchKeySerial := asset.ToSerialized(batchKey) + log.Infof("Finalizing batch %x", batchKeySerial) feeRate, err := typedParam[*chainfee.SatPerKWeight](req) @@ -624,6 +625,18 @@ func (c *ChainPlanter) gardener() { case err := <-caretaker.cfg.BroadcastErrChan: req.Error(err) + // Unrecoverable error, stop caretaker + // directly. The pending batch will not + // be saved. + stopErr := caretaker.Stop() + if stopErr != nil { + log.Warnf("Unable to stop "+ + "caretaker "+ + "gracefully: %v", err) + } + + delete(c.caretakers, batchKeySerial) + c.pendingBatch = nil continue case <-c.Quit: From 24b69860dfb9827659f8cf9909987609957c4abb Mon Sep 17 00:00:00 2001 From: Jonathan Harvey-Buschel Date: Wed, 15 Nov 2023 16:41:06 -0500 Subject: [PATCH 02/10] make: enable per-package race checking --- Makefile | 4 ++-- make/testing_flags.mk | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 5b2edda77..82c0057e9 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ COMMIT := $(shell git describe --tags --dirty) GOBUILD := GOEXPERIMENT=loopvar GO111MODULE=on go build -v GOINSTALL := GOEXPERIMENT=loopvar GO111MODULE=on go install -v -GOTEST := GOEXPERIMENT=loopvar GO111MODULE=on go test +GOTEST := GOEXPERIMENT=loopvar GO111MODULE=on go test GOMOD := GO111MODULE=on go mod GOLIST := go list -deps $(PKG)/... | grep '$(PKG)' @@ -168,7 +168,7 @@ unit-cover: $(GOACC_BIN) unit-race: @$(call print, "Running unit race tests.") - env CGO_ENABLED=1 GORACE="history_size=7 halt_on_errors=1" $(GOLIST) | $(XARGS) env $(GOTEST) -race -test.timeout=20m + env CGO_ENABLED=1 GORACE="history_size=7 halt_on_errors=1" $(UNIT_RACE) itest: build-itest itest-only diff --git a/make/testing_flags.mk b/make/testing_flags.mk index 929fd1e6b..c75283f5a 100644 --- a/make/testing_flags.mk +++ b/make/testing_flags.mk @@ -5,6 +5,7 @@ LOG_TAGS = TEST_FLAGS = ITEST_FLAGS = -logoutput COVER_PKG = $$(go list -deps -tags="$(DEV_TAGS)" ./... | grep '$(PKG)' | grep -v lnrpc) +RACE_PKG = go list -deps -tags="$(DEV_TAGS)" ./... | grep '$(PKG)' COVER_HTML = go tool cover -html=coverage.txt -o coverage.html POSTGRES_START_DELAY = 5 @@ -19,6 +20,7 @@ ifneq ($(pkg),) UNITPKG := $(PKG)/$(pkg) UNIT_TARGETED = yes COVER_PKG = $(PKG)/$(pkg) +RACE_PKG = $(PKG)/$(pkg) endif # If a specific unit test case is being target, construct test.run filter. @@ -78,7 +80,7 @@ TEST_FLAGS += -test.timeout=$(timeout) else ifneq ($(optional),) TEST_FLAGS += -test.timeout=240m else -TEST_FLAGS += -test.timeout=60m +TEST_FLAGS += -test.timeout=20m endif GOLIST := go list -tags="$(DEV_TAGS)" -deps $(PKG)/... | grep '$(PKG)'| grep -v '/vendor/' From a529896c8466458cd273301b216e35fa0f5aed8a Mon Sep 17 00:00:00 2001 From: Jonathan Harvey-Buschel Date: Mon, 20 Nov 2023 22:05:26 -0500 Subject: [PATCH 03/10] tapgarden: remove extra error sent during cancel In this commit, we remove an extra error broadcast during caretaker cancellation that could prevent graceful shutdown. If the caretaker state machine has not reached BatchStateBroadcast, it sends potential errors to the planter on a channel with capacity of one. If cancellation is requested before reaching BatchStateBroadcast and fails internally, sending that error to the planter prevents an error from being sent by the main caretaker goroutine. We also unify cancel request handling. --- tapgarden/caretaker.go | 81 +++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 25 deletions(-) diff --git a/tapgarden/caretaker.go b/tapgarden/caretaker.go index 684f2af20..1b06def4b 100644 --- a/tapgarden/caretaker.go +++ b/tapgarden/caretaker.go @@ -175,18 +175,24 @@ func (b *BatchCaretaker) Stop() error { return stopErr } -// Cancel signals for a batch caretaker to stop advancing a batch if possible. -// A batch can only be cancelled if it has not reached BatchStateBroadcast yet. -func (b *BatchCaretaker) Cancel() CancelResp { +// Cancel signals for a batch caretaker to stop advancing a batch. A batch can +// only be cancelled if it has not reached BatchStateBroadcast yet. If +// cancellation succeeds, we forward the batch state after cancellation. If the +// batch could not be cancelled, the planter will handle caretaker shutdown and +// batch state. +func (b *BatchCaretaker) Cancel() error { ctx, cancel := b.WithCtxQuit() defer cancel() - batchKey := b.cfg.Batch.BatchKey.PubKey.SerializeCompressed() + batchKey := b.batchKey[:] batchState := b.cfg.Batch.State() + var cancelResp CancelResp + // This function can only be called before the caretaker state stepping // function, so the batch state read is the next state that has not yet // been executed. Seedlings are converted to asset sprouts in the Frozen // state, and broadcast in the Broadast state. + log.Debugf("BatchCaretaker(%x): Trying to cancel", batchKey) switch batchState { // In the pending state, the batch seedlings have not sprouted yet. case BatchStatePending, BatchStateFrozen: @@ -200,9 +206,7 @@ func (b *BatchCaretaker) Cancel() CancelResp { "cancel failed: %w", batchKey, batchState, err) } - b.cfg.BroadcastErrChan <- fmt.Errorf("caretaker canceled") - - return CancelResp{&finalBatchState, err} + cancelResp = CancelResp{&finalBatchState, err} case BatchStateCommitted: finalBatchState := BatchStateSproutCancelled @@ -215,15 +219,29 @@ func (b *BatchCaretaker) Cancel() CancelResp { "cancel failed: %w", batchKey, batchState, err) } - b.cfg.BroadcastErrChan <- fmt.Errorf("caretaker canceled") - - return CancelResp{&finalBatchState, err} + cancelResp = CancelResp{&finalBatchState, err} default: err := fmt.Errorf("BatchCaretaker(%x), batch not cancellable", b.cfg.Batch.BatchKey.PubKey.SerializeCompressed()) - return CancelResp{nil, err} + cancelResp = CancelResp{nil, err} } + + b.cfg.CancelRespChan <- cancelResp + + // If the batch was cancellable, the finalState of the cancel response + // will be non-nil. If the cancellation failed, that error will be + // handled by the planter. At this point, the caretaker should always + // shut down gracefully. + if cancelResp.finalState != nil { + log.Infof("BatchCaretaker(%x), attempted batch cancellation, "+ + "shutting down", b.batchKey[:]) + + return nil + } + + return fmt.Errorf("BatchCaretaker(%x) cancellation failed", + b.batchKey[:]) } // advanceStateUntil attempts to advance the internal state machine until the @@ -243,22 +261,20 @@ func (b *BatchCaretaker) advanceStateUntil(currentState, return 0, fmt.Errorf("BatchCaretaker(%x), shutting "+ "down", b.batchKey[:]) + // If the batch was cancellable, the finalState of the cancel + // response will be non-nil. If the cancellation failed, that + // error will be handled by the planter. At this point, the + // caretaker should always shut down gracefully. case <-b.cfg.CancelReqChan: - cancelResp := b.Cancel() - b.cfg.CancelRespChan <- cancelResp - - // TODO(jhb): Use concrete error types for caretaker - // shutdown cases - // If the batch was cancellable, the finalState of the - // cancel response will be non-nil. If the cancellation - // failed, that error will be handled by the planter. - // At this point, the caretaker should always shut down - // gracefully. - if cancelResp.finalState != nil { + cancelErr := b.Cancel() + if cancelErr == nil { return 0, fmt.Errorf("BatchCaretaker(%x), "+ "attempted batch cancellation, "+ "shutting down", b.batchKey[:]) } + + log.Info(cancelErr) + default: } @@ -362,7 +378,12 @@ func (b *BatchCaretaker) assetCultivator() { return case <-b.cfg.CancelReqChan: - b.cfg.CancelRespChan <- b.Cancel() + cancelErr := b.Cancel() + if cancelErr == nil { + return + } + + log.Error(cancelErr) case <-b.Quit: return @@ -866,7 +887,12 @@ func (b *BatchCaretaker) stateStep(currentState BatchState) (BatchState, error) "done") case <-b.cfg.CancelReqChan: - b.cfg.CancelRespChan <- b.Cancel() + cancelErr := b.Cancel() + if cancelErr == nil { + return + } + + log.Info(cancelErr) case <-b.Quit: log.Debugf("Skipping TX confirmation, exiting") @@ -887,7 +913,12 @@ func (b *BatchCaretaker) stateStep(currentState BatchState) (BatchState, error) "done") case <-b.cfg.CancelReqChan: - b.cfg.CancelRespChan <- b.Cancel() + cancelErr := b.Cancel() + if cancelErr == nil { + return + } + + log.Info(cancelErr) case <-b.Quit: log.Debugf("Skipping TX confirmation, exiting") From 75ee8c8f0e1dede0b67cc37ad368d28a1be9f0c8 Mon Sep 17 00:00:00 2001 From: Jonathan Harvey-Buschel Date: Mon, 20 Nov 2023 22:51:31 -0500 Subject: [PATCH 04/10] tapgarden: wait for TX conf after cancel failure In this commit, we update the TX confirmation logic to continue after a failed batch cancellation. If the caretaker state machine has already reached BatchStateBroadcast, batch cancellation should fail, but we could still handle TX confirmation and complete asset minting. This fixes the flaky deadlock in the minter unit tests. --- tapgarden/caretaker.go | 90 +++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 36 deletions(-) diff --git a/tapgarden/caretaker.go b/tapgarden/caretaker.go index 1b06def4b..c41a0ea57 100644 --- a/tapgarden/caretaker.go +++ b/tapgarden/caretaker.go @@ -763,7 +763,7 @@ func (b *BatchCaretaker) stateStep(currentState BatchState) (BatchState, error) b.cfg.Batch.GenesisPacket.ChainFees = chainFees log.Infof("BatchCaretaker(%x): GenesisPacket absolute fee: "+ - "%d sats", chainFees) + "%d sats", b.batchKey[:], chainFees) log.Infof("BatchCaretaker(%x): GenesisPacket finalized", b.batchKey[:]) log.Tracef("GenesisPacket: %v", spew.Sdump(signedPkt)) @@ -871,32 +871,43 @@ func (b *BatchCaretaker) stateStep(currentState BatchState) (BatchState, error) defer confCancel() defer b.Wg.Done() - var confEvent *chainntnfs.TxConfirmation - select { - case confEvent = <-confNtfn.Confirmed: - log.Debugf("Got chain confirmation: %v", - confEvent.Tx.TxHash()) - - case err := <-errChan: - b.cfg.ErrChan <- fmt.Errorf("error getting "+ - "confirmation: %w", err) - return + var ( + confEvent *chainntnfs.TxConfirmation + confRecv bool + ) - case <-confCtx.Done(): - log.Debugf("Skipping TX confirmation, context " + - "done") + for !confRecv { + select { + case confEvent = <-confNtfn.Confirmed: + log.Debugf("Got chain confirmation: %v", + confEvent.Tx.TxHash()) + confRecv = true - case <-b.cfg.CancelReqChan: - cancelErr := b.Cancel() - if cancelErr == nil { + case err := <-errChan: + b.cfg.ErrChan <- fmt.Errorf("error "+ + "getting confirmation: %w", err) return - } - log.Info(cancelErr) + case <-confCtx.Done(): + log.Debugf("Skipping TX confirmation, " + + "context done") + confRecv = true - case <-b.Quit: - log.Debugf("Skipping TX confirmation, exiting") - return + case <-b.cfg.CancelReqChan: + cancelErr := b.Cancel() + if cancelErr == nil { + return + } + + // Cancellation failed, continue to wait + // for transaction confirmation. + log.Info(cancelErr) + + case <-b.Quit: + log.Debugf("Skipping TX confirmation, " + + "exiting") + return + } } if confEvent == nil { @@ -905,24 +916,31 @@ func (b *BatchCaretaker) stateStep(currentState BatchState) (BatchState, error) return } - select { - case b.confEvent <- confEvent: - - case <-confCtx.Done(): - log.Debugf("Skipping TX confirmation, context " + - "done") + for { + select { + case b.confEvent <- confEvent: + return - case <-b.cfg.CancelReqChan: - cancelErr := b.Cancel() - if cancelErr == nil { + case <-confCtx.Done(): + log.Debugf("Skipping TX confirmation, " + + "context done") return - } - log.Info(cancelErr) + case <-b.cfg.CancelReqChan: + cancelErr := b.Cancel() + if cancelErr == nil { + return + } - case <-b.Quit: - log.Debugf("Skipping TX confirmation, exiting") - return + // Cancellation failed, continue to try + // and send the confirmation event. + log.Info(cancelErr) + + case <-b.Quit: + log.Debugf("Skipping TX confirmation, " + + "exiting") + return + } } }() From 6d0d93584a30c4fd31e9b3cd0bc5ac105ef077b7 Mon Sep 17 00:00:00 2001 From: Jonathan Harvey-Buschel Date: Thu, 30 Nov 2023 20:17:15 -0500 Subject: [PATCH 05/10] tapgarden: remove duplicate batch freeze In this commit, we update the caretaker start logic to remove an unnecessary batch write. Before we start the caretaker, we write the batch with the Frozen state, but we don't update the in-memory pending batch to move from the Pending to Frozen state. This causes the caretaker to write the batch again on start. We can address this by updating the in-memory batch after a successful batch freeze. --- tapgarden/planter.go | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/tapgarden/planter.go b/tapgarden/planter.go index 586f555e3..995559775 100644 --- a/tapgarden/planter.go +++ b/tapgarden/planter.go @@ -636,8 +636,6 @@ func (c *ChainPlanter) gardener() { } delete(c.caretakers, batchKeySerial) - c.pendingBatch = nil - continue case <-c.Quit: return @@ -677,10 +675,6 @@ func (c *ChainPlanter) gardener() { func (c *ChainPlanter) finalizeBatch( feeRate *chainfee.SatPerKWeight) (*BatchCaretaker, error) { - // Prep the new care taker that'll be launched assuming the call below - // to freeze the batch succeeds. - caretaker := c.newCaretakerForBatch(c.pendingBatch, feeRate) - // At this point, we have a non-empty batch, so we'll first finalize it // on disk. This means no further seedlings can be added to this batch. ctx, cancel := c.WithCtxQuit() @@ -691,9 +685,11 @@ func (c *ChainPlanter) finalizeBatch( err) } - // Now that the batch has been frozen, we'll launch a new caretaker - // state machine for the batch that'll drive all the seedlings do - // adulthood. + // Now that the batch has been frozen on disk, we can update the batch + // state to frozen before launching a new caretaker state machine for + // the batch that'll drive all the seedlings do adulthood. + c.pendingBatch.UpdateState(BatchStateFrozen) + caretaker := c.newCaretakerForBatch(c.pendingBatch, feeRate) if err := caretaker.Start(); err != nil { return nil, fmt.Errorf("unable to start new caretaker: %w", err) } From b15abc92e6cb539d9e5546d1819f3b03c0993c1c Mon Sep 17 00:00:00 2001 From: Jonathan Harvey-Buschel Date: Thu, 30 Nov 2023 20:25:06 -0500 Subject: [PATCH 06/10] tapgarden: stop caretaker on confirmation err In this commit, we update the TX confirmation handling logic to stop the caretaker if confirmation registration fails. At that point, the caretaker cannot successfully receive a confirmation for the broadcast batch, so it should shut down to allow caretaker restart for the same batch. Note that the planter will not actually delete the stopped caretaker, as the error is not sent on BroadcastErrChan. --- tapgarden/caretaker.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tapgarden/caretaker.go b/tapgarden/caretaker.go index c41a0ea57..74cd59bee 100644 --- a/tapgarden/caretaker.go +++ b/tapgarden/caretaker.go @@ -879,13 +879,14 @@ func (b *BatchCaretaker) stateStep(currentState BatchState) (BatchState, error) for !confRecv { select { case confEvent = <-confNtfn.Confirmed: - log.Debugf("Got chain confirmation: %v", - confEvent.Tx.TxHash()) confRecv = true case err := <-errChan: - b.cfg.ErrChan <- fmt.Errorf("error "+ - "getting confirmation: %w", err) + confErr := fmt.Errorf("error getting "+ + "confirmation: %w", err) + log.Info(confErr) + b.cfg.ErrChan <- confErr + return case <-confCtx.Done(): @@ -911,11 +912,19 @@ func (b *BatchCaretaker) stateStep(currentState BatchState) (BatchState, error) } if confEvent == nil { - b.cfg.ErrChan <- fmt.Errorf("got empty " + + confErr := fmt.Errorf("got empty " + "confirmation event in batch") + log.Info(confErr) + b.cfg.ErrChan <- confErr + return } + if confEvent.Tx != nil { + log.Debugf("Got chain confirmation: %v", + confEvent.Tx.TxHash()) + } + for { select { case b.confEvent <- confEvent: From 47568ee95e1bb949657ec09532600f76f6ff7b02 Mon Sep 17 00:00:00 2001 From: Jonathan Harvey-Buschel Date: Thu, 30 Nov 2023 20:43:06 -0500 Subject: [PATCH 07/10] tapgarden: enable fee estimate and TX conf failure In this commit, we update the mock ChainBridge to allow for certain calls to fail, including fee estimation, confirmation registration, and non-empty confirmation responses. --- tapgarden/mock.go | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/tapgarden/mock.go b/tapgarden/mock.go index 160ef3b0e..ea2e0aede 100644 --- a/tapgarden/mock.go +++ b/tapgarden/mock.go @@ -242,6 +242,11 @@ type MockChainBridge struct { ReqCount int ConfReqs map[int]*chainntnfs.ConfirmationEvent + + failFeeEstimates bool + emptyConf bool + errConf bool + confErr chan error } func NewMockChainBridge() *MockChainBridge { @@ -255,11 +260,27 @@ func NewMockChainBridge() *MockChainBridge { } } +func (m *MockChainBridge) FailFeeEstimates(enable bool) { + m.failFeeEstimates = enable +} + +func (m *MockChainBridge) FailConf(enable bool) { + m.errConf = enable +} +func (m *MockChainBridge) EmptyConf(enable bool) { + m.emptyConf = enable +} + func (m *MockChainBridge) SendConfNtfn(reqNo int, blockHash *chainhash.Hash, blockHeight, blockIndex int, block *wire.MsgBlock, tx *wire.MsgTx) { req := m.ConfReqs[reqNo] + if m.emptyConf { + req.Confirmed <- nil + return + } + req.Confirmed <- &chainntnfs.TxConfirmation{ BlockHash: blockHash, BlockHeight: uint32(blockHeight), @@ -287,7 +308,7 @@ func (m *MockChainBridge) RegisterConfirmationsNtfn(ctx context.Context, Confirmed: make(chan *chainntnfs.TxConfirmation), Cancel: func() {}, } - errChan := make(chan error) + m.confErr = make(chan error, 1) m.ConfReqs[m.ReqCount] = req @@ -296,7 +317,11 @@ func (m *MockChainBridge) RegisterConfirmationsNtfn(ctx context.Context, case <-ctx.Done(): } - return req, errChan, nil + if m.errConf { + m.confErr <- fmt.Errorf("confirmation error") + } + + return req, m.confErr, nil } func (m *MockChainBridge) RegisterBlockEpochNtfn( @@ -361,6 +386,10 @@ func (m *MockChainBridge) EstimateFee(ctx context.Context, return 0, fmt.Errorf("shutting down") } + if m.failFeeEstimates { + return 0, fmt.Errorf("failed to estimate fee") + } + return 253, nil } From d8d50878267a35011c83322ef2d89ee3909a47bb Mon Sep 17 00:00:00 2001 From: Jonathan Harvey-Buschel Date: Fri, 8 Dec 2023 14:10:42 -0500 Subject: [PATCH 08/10] tapgarden: drop batch state from CancelResp In this commit, we redefine the batch state reported in cancelResp to be a bool instead of an actual batch state. The provided batch state was not being used by any callers of Cancel(), including the planter. --- tapgarden/caretaker.go | 24 ++++++++++++------------ tapgarden/planter.go | 6 +++--- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tapgarden/caretaker.go b/tapgarden/caretaker.go index 74cd59bee..042c73812 100644 --- a/tapgarden/caretaker.go +++ b/tapgarden/caretaker.go @@ -196,50 +196,50 @@ func (b *BatchCaretaker) Cancel() error { switch batchState { // In the pending state, the batch seedlings have not sprouted yet. case BatchStatePending, BatchStateFrozen: - finalBatchState := BatchStateSeedlingCancelled err := b.cfg.Log.UpdateBatchState( ctx, b.cfg.Batch.BatchKey.PubKey, - finalBatchState, + BatchStateSeedlingCancelled, ) if err != nil { err = fmt.Errorf("BatchCaretaker(%x), batch state(%v), "+ "cancel failed: %w", batchKey, batchState, err) } - cancelResp = CancelResp{&finalBatchState, err} + cancelResp = CancelResp{true, err} case BatchStateCommitted: - finalBatchState := BatchStateSproutCancelled err := b.cfg.Log.UpdateBatchState( ctx, b.cfg.Batch.BatchKey.PubKey, - finalBatchState, + BatchStateSproutCancelled, ) if err != nil { err = fmt.Errorf("BatchCaretaker(%x), batch state(%v), "+ "cancel failed: %w", batchKey, batchState, err) } - cancelResp = CancelResp{&finalBatchState, err} + cancelResp = CancelResp{true, err} default: err := fmt.Errorf("BatchCaretaker(%x), batch not cancellable", b.cfg.Batch.BatchKey.PubKey.SerializeCompressed()) - cancelResp = CancelResp{nil, err} + cancelResp = CancelResp{false, err} } b.cfg.CancelRespChan <- cancelResp - // If the batch was cancellable, the finalState of the cancel response - // will be non-nil. If the cancellation failed, that error will be - // handled by the planter. At this point, the caretaker should always - // shut down gracefully. - if cancelResp.finalState != nil { + // If the batch was cancellable, the final write of the cancelled batch + // may still have failed. That error will be handled by the planter. At + // this point, the caretaker should shut down gracefully if cancellation + // was attempted. + if cancelResp.cancelAttempted { log.Infof("BatchCaretaker(%x), attempted batch cancellation, "+ "shutting down", b.batchKey[:]) return nil } + // If the cancellation failed, that error will be handled by the + // planter. return fmt.Errorf("BatchCaretaker(%x) cancellation failed", b.batchKey[:]) } diff --git a/tapgarden/planter.go b/tapgarden/planter.go index 995559775..7eed9d5d1 100644 --- a/tapgarden/planter.go +++ b/tapgarden/planter.go @@ -91,8 +91,8 @@ type BatchKey = asset.SerializedKey // CancelResp is the response from a caretaker attempting to cancel a batch. type CancelResp struct { - finalState *BatchState - err error + cancelAttempted bool + err error } type stateRequest interface { @@ -448,7 +448,7 @@ func (c *ChainPlanter) cancelMintingBatch(ctx context.Context, // cancellation was possible and attempted. This means // that the caretaker is shut down and the planter // must delete it. - if cancelResp.finalState != nil { + if cancelResp.cancelAttempted { delete(c.caretakers, batchKeySerialized) } From e507890eb76fe270f08809a4d003e4fc6da46f97 Mon Sep 17 00:00:00 2001 From: Jonathan Harvey-Buschel Date: Fri, 1 Dec 2023 13:34:36 -0500 Subject: [PATCH 09/10] fn: add Last helper function --- fn/func.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fn/func.go b/fn/func.go index 0514fd230..c721901b8 100644 --- a/fn/func.go +++ b/fn/func.go @@ -172,3 +172,20 @@ func First[T any](xs []*T, pred func(*T) bool) (*T, error) { return nil, fmt.Errorf("no item found") } + +// Last returns the last item in the slice that matches the predicate, or an +// error if none matches. +func Last[T any](xs []*T, pred func(*T) bool) (*T, error) { + var matches []*T + for i := range xs { + if pred(xs[i]) { + matches = append(matches, xs[i]) + } + } + + if len(matches) == 0 { + return nil, fmt.Errorf("no item found") + } + + return matches[len(matches)-1], nil +} From 05b82daf6434fecd2a8cfc6a1c9d0575c99480b2 Mon Sep 17 00:00:00 2001 From: Jonathan Harvey-Buschel Date: Fri, 1 Dec 2023 13:47:57 -0500 Subject: [PATCH 10/10] tapgarden: test err handling in FinalizeBatch In this commit, we add a new test for the minter to ensure that batch finalization errors are handled gracefully, including before and after TX broadcast. --- tapgarden/planter_test.go | 307 ++++++++++++++++++++++++++++++++++---- 1 file changed, 276 insertions(+), 31 deletions(-) diff --git a/tapgarden/planter_test.go b/tapgarden/planter_test.go index 0a814c8ad..a3a933b6f 100644 --- a/tapgarden/planter_test.go +++ b/tapgarden/planter_test.go @@ -7,6 +7,7 @@ import ( "encoding/hex" "fmt" "math/rand" + "sync" "testing" "time" @@ -229,7 +230,96 @@ func (t *mintingTestHarness) assertNoPendingBatch() { require.Nil(t, batch) } -// tickMintingBatch first the ticker that forces the planter to create a new +type FinalizeBatchResp struct { + Batch *tapgarden.MintingBatch + Err error +} + +// finalizeBatch uses the public FinalizeBatch planter call to start a caretaker +// for an existing batch. The caller must wait for the planter call to complete. +func (t *mintingTestHarness) finalizeBatch(wg *sync.WaitGroup, + respChan chan *FinalizeBatchResp) { + + t.Helper() + + wg.Add(1) + go func() { + defer wg.Done() + + frozenBatch, finalizeErr := t.planter.FinalizeBatch(nil) + resp := &FinalizeBatchResp{ + Batch: frozenBatch, + Err: finalizeErr, + } + + respChan <- resp + }() +} + +func (t *mintingTestHarness) assertFinalizeBatch(wg *sync.WaitGroup, + respChan chan *FinalizeBatchResp, errString string) { + + t.Helper() + + wg.Wait() + finalizeResp := <-respChan + + switch { + case errString == "": + require.NoError(t, finalizeResp.Err) + + default: + require.ErrorContains(t, finalizeResp.Err, errString) + } +} + +// progressCaretaker uses the mock interfaces to progress a caretaker from start +// to TX confirmation. +func (t *mintingTestHarness) progressCaretaker( + seedlings []*tapgarden.Seedling) func() { + + // Assert that the caretaker has requested a genesis TX to be funded. + _ = t.assertGenesisTxFunded() + + // For each seedling created above, we expect a new set of keys to be + // created for the asset script key and an additional key if emission + // was enabled. + for i := 0; i < len(seedlings); i++ { + t.assertKeyDerived() + + if seedlings[i].EnableEmission { + t.assertKeyDerived() + } + } + + // We should now transition to the next state where we'll attempt to + // sign this PSBT packet generated above. + t.assertGenesisPsbtFinalized() + + // With the PSBT packet finalized for the caretaker, we should now + // receive a request to publish a transaction followed by a + // confirmation request. + tx := t.assertTxPublished() + + // With the transaction published, we should now receive a confirmation + // request. To ensure the file proof is constructed properly, we'll + // also make a "fake" block that includes our transaction. + merkleTree := blockchain.BuildMerkleTreeStore( + []*btcutil.Tx{btcutil.NewTx(tx)}, false, + ) + merkleRoot := merkleTree[len(merkleTree)-1] + blockHeader := wire.NewBlockHeader( + 0, chaincfg.MainNetParams.GenesisHash, merkleRoot, 0, 0, + ) + block := &wire.MsgBlock{ + Header: *blockHeader, + Transactions: []*wire.MsgTx{tx}, + } + + return t.assertConfReqSent(tx, block) +} + +// tickMintingBatch fires the ticker that forces the planter to create a new // batch. func (t *mintingTestHarness) tickMintingBatch( noBatch bool) *tapgarden.MintingBatch { @@ -275,7 +365,9 @@ func (t *mintingTestHarness) assertNewBatchFrozen( var newBatches []*tapgarden.MintingBatch err := wait.NoError(func() error { - currentBatches, err := t.planter.ListBatches(nil) + currentBatches, err := t.store.FetchAllBatches( + context.Background(), + ) if err != nil { return err } @@ -455,6 +547,18 @@ func (t *mintingTestHarness) assertBatchState(batchKey *btcec.PublicKey, require.Equal(t, batchState, batch.State()) } +func (t *mintingTestHarness) assertLastBatchState(numBatches int, + batchState tapgarden.BatchState) { + + t.Helper() + + batches, err := t.planter.ListBatches(nil) + require.NoError(t, err) + + require.Len(t, batches, numBatches) + require.Equal(t, batchState, batches[len(batches)-1].State()) +} + // assertSeedlingsMatchSprouts asserts that the seedlings were properly matched // into actual assets. func (t *mintingTestHarness) assertSeedlingsMatchSprouts( @@ -540,7 +644,7 @@ func (t *mintingTestHarness) assertGenesisPsbtFinalized() { isNotCancelledBatch := func(batch *tapgarden.MintingBatch) bool { return !isCancelledBatch(batch) } - pendingBatch, err := fn.First(pendingBatches, isNotCancelledBatch) + pendingBatch, err := fn.Last(pendingBatches, isNotCancelledBatch) require.NoError(t, err) // The minting key of the batch should match the public key @@ -591,6 +695,39 @@ func (t *mintingTestHarness) assertNoError() { } } +// queueInitialBatch queues a set of random seedlings for the planter. +func (t *mintingTestHarness) queueInitialBatch( + numSeedlings int) []*tapgarden.Seedling { + + // Next make new random seedlings, and queue each of them up within + // the main state machine for batched minting. + seedlings := t.newRandSeedlings(numSeedlings) + t.queueSeedlingsInBatch(seedlings...) + + // At this point, there should be a single pending batch with 5 + // seedlings. The batch stored in the log should also match up exactly. + t.assertPendingBatchExists(numSeedlings) + + // Before we tick the batch, we record all existing batches, so we can + // make sure a new one was created. + existingBatches, err := t.planter.ListBatches(nil) + require.NoError(t, err) + + // We only want to know if a new batch gets to the frozen state. So the + // list of existing batches should only contain the already frozen. + existingBatches = fn.Filter( + existingBatches, func(batch *tapgarden.MintingBatch) bool { + return batch.State() == tapgarden.BatchStatePending + }, + ) + require.Len(t, existingBatches, 1) + batchKey := existingBatches[0].BatchKey.PubKey + + t.assertSeedlingsExist(seedlings, batchKey) + + return seedlings +} + // testBasicAssetCreation tests that we're able to properly progress the state // machine through the various stages of asset minting and creation. // @@ -601,16 +738,9 @@ func testBasicAssetCreation(t *mintingTestHarness) { // harness. t.refreshChainPlanter() - // Next make 5 new random seedlings, and queue each of them up within - // the main state machine for batched minting. + // Create an initial batch of 5 seedlings. const numSeedlings = 5 - seedlings := t.newRandSeedlings(numSeedlings) - t.queueSeedlingsInBatch(seedlings...) - - // At this point, there should be a single pending batch with 5 - // seedlings. The batch stored in the log should also match up exactly. - t.assertPendingBatchExists(numSeedlings) - t.assertSeedlingsExist(seedlings, nil) + seedlings := t.queueInitialBatch(numSeedlings) // Now we'll force a batch tick which should kick off a new caretaker // that starts to progress the batch all the way to broadcast. @@ -709,21 +839,16 @@ func testBasicAssetCreation(t *mintingTestHarness) { t.assertNumCaretakersActive(0) } +// testMintingTicker tests that we can start batch finalization with the planter +// ticker, and finalize a batch after cancelling a batch. func testMintingTicker(t *mintingTestHarness) { // First, create a new chain planter instance using the supplied test // harness. t.refreshChainPlanter() - // Next make 5 new random seedlings, and queue each of them up within - // the main state machine for batched minting. + // Create an initial batch of 5 seedlings. const numSeedlings = 5 - seedlings := t.newRandSeedlings(numSeedlings) - t.queueSeedlingsInBatch(seedlings...) - - // At this point, there should be a single pending batch with 5 - // seedlings. The batch stored in the log should also match up exactly. - t.assertPendingBatchExists(numSeedlings) - t.assertSeedlingsExist(seedlings, nil) + _ = t.queueInitialBatch(numSeedlings) // If we cancel the current batch, the pending batch should be cleared, // but the seedlings should still exist on disk. Requesting batch @@ -734,7 +859,7 @@ func testMintingTicker(t *mintingTestHarness) { // Next, make another 5 seedlings and continue with minting. // One seedling is a duplicate of a seedling from the cancelled batch, // to ensure that we can store multiple versions of the same seedling. - seedlings = t.newRandSeedlings(numSeedlings) + seedlings := t.newRandSeedlings(numSeedlings) t.queueSeedlingsInBatch(seedlings...) // Next, finalize the pending batch to continue with minting. @@ -810,18 +935,11 @@ func testMintingCancelFinalize(t *mintingTestHarness) { // harness. t.refreshChainPlanter() - // Next make 5 new random seedlings, and queue each of them up within - // the main state machine for batched minting. + // Create an initial batch of 5 seedlings. const numSeedlings = 5 - seedlings := t.newRandSeedlings(numSeedlings) - t.queueSeedlingsInBatch(seedlings...) + seedlings := t.queueInitialBatch(numSeedlings) firstSeedling := seedlings[0] - // At this point, there should be a single pending batch with 5 - // seedlings. The batch stored in the log should also match up exactly. - t.assertPendingBatchExists(numSeedlings) - t.assertSeedlingsExist(seedlings, nil) - // If we cancel the current batch, the pending batch should be cleared, // but the seedlings should still exist on disk. firstBatchKey := t.cancelMintingBatch(false) @@ -952,6 +1070,11 @@ func testMintingCancelFinalize(t *mintingTestHarness) { // the batch being finalized, and the caretaker being cleaned up. sendConfNtfn() + // Trying to cancel the batch after the confirmation has been sent + // should also fail with an error from the caretaker. + cancelResp = t.cancelMintingBatch(false) + require.True(t, batchKeyEquality(thirdBatchKey, cancelResp)) + // This time no error should be sent anywhere as we should've handled // all notifications. t.assertNoError() @@ -960,6 +1083,123 @@ func testMintingCancelFinalize(t *mintingTestHarness) { t.assertNumCaretakersActive(0) } +// testFinalizeBatch tests that the planter can recover from caretaker errors +// successfully when finalizing a batch, and that the planter state is properly +// reset after successful batch finalization. +func testFinalizeBatch(t *mintingTestHarness) { + // First, create a new chain planter instance using the supplied test + // harness. + t.refreshChainPlanter() + + // Create an initial batch of 5 seedlings. + const numSeedlings = 5 + _ = t.queueInitialBatch(numSeedlings) + + // Force fee estimation to fail so we crash the caretaker before the + // batch can be frozen. + t.chain.FailFeeEstimates(true) + + var ( + wg sync.WaitGroup + respChan = make(chan *FinalizeBatchResp, 1) + caretakerCount = 0 + batchCount = 0 + ) + + // Finalize the pending batch to start a caretaker. + t.finalizeBatch(&wg, respChan) + batchCount++ + + _, err := fn.RecvOrTimeout( + t.chain.FeeEstimateSignal, defaultTimeout, + ) + require.NoError(t, err) + + // If the caretaker failed, there should be no active caretakers nor + // pending batch. The caretaker error should be propogated to the caller + // of finalize. + t.assertNoPendingBatch() + t.assertNumCaretakersActive(caretakerCount) + t.assertLastBatchState(batchCount, tapgarden.BatchStateFrozen) + t.assertFinalizeBatch(&wg, respChan, "unable to estimate fee") + + // Queue another batch, reset fee estimation behavior, and set TX + // confirmation registration to fail. + seedlings := t.queueInitialBatch(numSeedlings) + t.chain.FailFeeEstimates(false) + t.chain.FailConf(true) + + // Finalize the pending batch to start a caretaker, and progress the + // caretaker to TX confirmation. The finalize call should report no + // error, but the caretaker should propogate the confirmation error to + // the shared error channel. + t.finalizeBatch(&wg, respChan) + batchCount++ + + _ = t.progressCaretaker(seedlings) + caretakerCount++ + + t.assertFinalizeBatch(&wg, respChan, "") + caretakerErr := <-t.errChan + require.ErrorContains(t, caretakerErr, "error getting confirmation") + + // The stopped caretaker will still exist but there should be no pending + // batch. We will have two batches on disk, including the broadcasted + // batch. + t.assertNoPendingBatch() + t.assertNumCaretakersActive(caretakerCount) + t.assertLastBatchState(batchCount, tapgarden.BatchStateBroadcast) + + // Queue another batch, set TX confirmation to succeed, and set the + // confirmation event to be empty. + seedlings = t.queueInitialBatch(numSeedlings) + t.chain.FailConf(false) + t.chain.EmptyConf(true) + + // Start a new caretaker that should reach TX broadcast. + t.finalizeBatch(&wg, respChan) + batchCount++ + + sendConfNtfn := t.progressCaretaker(seedlings) + caretakerCount++ + + // Trigger the confirmation event, which should cause the caretaker to + // fail. + sendConfNtfn() + + t.assertFinalizeBatch(&wg, respChan, "") + caretakerErr = <-t.errChan + require.ErrorContains(t, caretakerErr, "got empty confirmation") + + // The stopped caretaker will still exist but there should be no pending + // batch. We will now have three batches on disk. + t.assertNoPendingBatch() + t.assertNumCaretakersActive(caretakerCount) + t.assertLastBatchState(batchCount, tapgarden.BatchStateBroadcast) + + // If we try to finalize without a pending batch, the finalize call + // should return an error. + t.finalizeBatch(&wg, respChan) + t.assertFinalizeBatch(&wg, respChan, "no pending batch") + t.assertNumCaretakersActive(caretakerCount) + + // Queue another batch and drive the caretaker to a successful minting. + seedlings = t.queueInitialBatch(numSeedlings) + t.chain.EmptyConf(false) + + t.finalizeBatch(&wg, respChan) + batchCount++ + + sendConfNtfn = t.progressCaretaker(seedlings) + sendConfNtfn() + + t.assertFinalizeBatch(&wg, respChan, "") + t.assertNoError() + t.assertNoPendingBatch() + t.assertNumCaretakersActive(caretakerCount) + t.assertLastBatchState(batchCount, tapgarden.BatchStateFinalized) +} + // mintingStoreTestCase is used to programmatically run a series of test cases // that are parametrized based on a fresh minting store. type mintingStoreTestCase struct { @@ -985,6 +1225,11 @@ var testCases = []mintingStoreTestCase{ interval: minterInterval, testFunc: testMintingCancelFinalize, }, + { + name: "finalize_batch", + interval: minterInterval, + testFunc: testFinalizeBatch, + }, } // TestBatchedAssetIssuance runs a test of tests to ensure that the set of