-
Notifications
You must be signed in to change notification settings - Fork 721
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve NewEpochState
logging
#5854
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
{-# LANGUAGE DataKinds #-} | ||
{-# LANGUAGE FlexibleContexts #-} | ||
{-# LANGUAGE MonoLocalBinds #-} | ||
{-# LANGUAGE OverloadedStrings #-} | ||
{-# LANGUAGE ScopedTypeVariables #-} | ||
|
@@ -15,11 +16,13 @@ module Testnet.Property.Util | |
|
||
import Cardano.Api | ||
|
||
import Control.Exception.Safe (MonadCatch) | ||
import Control.Concurrent.Async | ||
import qualified Control.Exception as E | ||
import Control.Monad | ||
import Control.Monad.Trans.Resource | ||
import qualified Data.Aeson as Aeson | ||
import GHC.Stack | ||
import Network.Mux.Trace | ||
import qualified System.Environment as IO | ||
import System.Info (os) | ||
import qualified System.IO.Unsafe as IO | ||
|
@@ -60,18 +63,44 @@ integrationWorkspace workspaceName f = withFrozenCallStack $ | |
isLinux :: Bool | ||
isLinux = os == "linux" | ||
|
||
|
||
-- | Runs an action in background, and registers cleanup to `MonadResource m` | ||
-- The argument forces IO monad to prevent leaking of `MonadResource` to the child thread | ||
-- Concurrency is tricky in the 'ResourceT' monad. See the "Concurrency" section of | ||
-- https://www.fpcomplete.com/blog/understanding-resourcet/. | ||
runInBackground :: MonadTest m | ||
=> MonadResource m | ||
=> MonadCatch m | ||
=> IO a | ||
=> IO () | ||
-> IO a | ||
-> m () | ||
runInBackground act = void . H.evalM $ allocate (H.async act) cleanUp | ||
where | ||
cleanUp :: H.Async a -> IO () | ||
cleanUp a = H.cancel a >> void (H.link a) | ||
runInBackground runOnException act = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function body is too convoluted now.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
2/3: We can and should use
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I responded in slack. This currently works, lets get it merged and you can investigate using |
||
void . H.evalIO | ||
$ runResourceT | ||
-- We don't 'wait' because this "background process" may not terminate. | ||
-- If we 'wait' and it doesn't terminate, 'ResourceT' will not kill it | ||
-- and the test will hang indefinitely. | ||
-- Not waiting isn't a problem because this "background process" | ||
-- is meant to run indefinitely and will be cleaned up by | ||
-- 'ResourceT' when the test ends or fails. | ||
-- We use 'asyncWithUnmask' because our logging thread is terminated via an exception. | ||
-- In order to avoid competing for a file handle we must catch the exception which signals | ||
-- the logging file is no longer being written to and we can now run the desired additional IO action we | ||
-- want (runOnException). Attempting to share the 'FileHandle' and use concurrency primitives was not fruitful | ||
-- and the section "Other ways to abuse ResourceT" in https://www.fpcomplete.com/blog/understanding-resourcet/ | ||
-- confirms this is problematic in 'ResourceT'. | ||
$ resourceForkWith (\_ -> do r <- H.asyncWithUnmask (\restore -> restore act `E.onException` runOnException) | ||
linkOnly ignoreException r | ||
) $ return () | ||
where | ||
ignoreException :: E.SomeException -> Bool | ||
ignoreException e = | ||
case E.fromException e of | ||
Just (MuxError errType _) -> | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this should be wrapped in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you show me what you want here? We need to ignore the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd like |
||
case errType of | ||
MuxBearerClosed -> False | ||
-- This is expected as the background thread is killed. | ||
-- However we do want to be made aware about other | ||
-- exceptions. | ||
_ -> True | ||
_ -> False | ||
|
||
decodeEraUTxO :: (IsShelleyBasedEra era, MonadTest m) => ShelleyBasedEra era -> Aeson.Value -> m (UTxO era) | ||
decodeEraUTxO _ = H.jsonErrorFail . Aeson.fromJSON | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,13 @@ | ||
{-# LANGUAGE DuplicateRecordFields #-} | ||
{-# LANGUAGE ExistentialQuantification #-} | ||
{-# LANGUAGE FlexibleContexts #-} | ||
{-# LANGUAGE FlexibleInstances #-} | ||
{-# LANGUAGE LambdaCase #-} | ||
{-# LANGUAGE OverloadedStrings #-} | ||
{-# LANGUAGE ScopedTypeVariables #-} | ||
|
||
{-# OPTIONS_GHC -Wno-orphans #-} | ||
|
||
module Testnet.Runtime | ||
( startNode | ||
, startLedgerNewEpochStateLogging | ||
|
@@ -13,14 +16,25 @@ module Testnet.Runtime | |
import Cardano.Api | ||
import qualified Cardano.Api as Api | ||
|
||
import qualified Cardano.Ledger.Api as L | ||
import qualified Cardano.Ledger.Shelley.LedgerState as L | ||
|
||
import Prelude | ||
|
||
import Control.Exception.Safe | ||
import Control.Monad | ||
import Control.Monad.State.Strict (StateT) | ||
import Control.Monad.State.Strict | ||
import Control.Monad.Trans.Resource | ||
import Data.Aeson | ||
import Data.Aeson.Encode.Pretty (encodePretty) | ||
import Data.Algorithm.Diff | ||
import Data.Algorithm.DiffOutput | ||
import qualified Data.ByteString.Lazy.Char8 as BSC | ||
import Data.Function | ||
import qualified Data.List as List | ||
import Data.Text (Text, unpack) | ||
import qualified Data.Text as Text | ||
import qualified Data.Text.IO as Text | ||
import GHC.Stack | ||
import qualified GHC.Stack as GHC | ||
import Network.Socket (PortNumber) | ||
|
@@ -33,8 +47,9 @@ import qualified System.Process as IO | |
import Testnet.Filepath | ||
import qualified Testnet.Ping as Ping | ||
import Testnet.Process.Run | ||
import Testnet.Property.Util (runInBackground) | ||
import Testnet.Types hiding (testnetMagic) | ||
import Testnet.Property.Util | ||
import Testnet.Types (NodeRuntime (NodeRuntime), TestnetRuntime (configurationFile), | ||
poolSprockets) | ||
|
||
import Hedgehog (MonadTest) | ||
import qualified Hedgehog as H | ||
|
@@ -169,7 +184,12 @@ createSubdirectoryIfMissingNew parent subdirectory = GHC.withFrozenCallStack $ d | |
pure subdirectory | ||
|
||
-- | Start ledger's new epoch state logging for the first node in the background. | ||
-- Logs will be placed in <tmp workspace directory>/logs/ledger-new-epoch-state.log | ||
-- Pretty JSON logs will be placed in: | ||
-- 1. <tmp workspace directory>/logs/ledger-new-epoch-state.log | ||
-- 2. <tmp workspace directory>/logs/ledger-new-epoch-state-diffs.log | ||
-- NB: The diffs represent the the changes in the 'NewEpochState' between each | ||
-- block or turn of the epoch. We have excluded the 'stashedAVVMAddresses' | ||
-- field of 'NewEpochState' in the JSON rendering. | ||
-- The logging thread will be cancelled when `MonadResource` releases all resources. | ||
-- Idempotent. | ||
startLedgerNewEpochStateLogging | ||
|
@@ -197,27 +217,112 @@ startLedgerNewEpochStateLogging testnetRuntime tmpWorkspace = withFrozenCallStac | |
False -> do | ||
H.evalIO $ appendFile logFile "" | ||
socketPath <- H.noteM $ H.sprocketSystemName <$> H.headM (poolSprockets testnetRuntime) | ||
_ <- runInBackground . runExceptT $ | ||
foldEpochState | ||
(configurationFile testnetRuntime) | ||
(Api.File socketPath) | ||
Api.QuickValidation | ||
(EpochNo maxBound) | ||
() | ||
(\epochState _ _ -> handler logFile epochState) | ||
|
||
runInBackground | ||
(do logFileContents <- IO.readFile logFile | ||
let epochStateValues = epochStateBeforeAfterValues logFileContents | ||
epochStateDiffs' = epochStateDiffs epochStateValues | ||
Text.writeFile (logDir </> "ledger-epoch-state-diffs.log") epochStateDiffs' | ||
) | ||
(do void $ runExceptT $ | ||
foldEpochState | ||
(configurationFile testnetRuntime) | ||
(Api.File socketPath) | ||
Api.QuickValidation | ||
(EpochNo maxBound) | ||
() | ||
(\epochState _ _ -> | ||
liftIO $ evalStateT (handler logFile epochState) 0 | ||
) | ||
) | ||
|
||
|
||
H.note_ $ "Started logging epoch states to: " <> logFile | ||
|
||
where | ||
handler :: FilePath -> AnyNewEpochState -> StateT () IO LedgerStateCondition | ||
handler outputFp anyNewEpochState = handleException . liftIO $ do | ||
appendFile outputFp $ "#### BLOCK ####" <> "\n" | ||
appendFile outputFp $ show anyNewEpochState <> "\n" | ||
pure ConditionNotMet | ||
handler :: FilePath -> AnyNewEpochState -> StateT Int IO LedgerStateCondition | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I need to remove it |
||
handler outputFpHandle (AnyNewEpochState sbe nes) = do | ||
handleException . liftIO $ do | ||
appendFile outputFpHandle $ "#### BLOCK ####" <> "\n" | ||
appendFile outputFpHandle $ BSC.unpack (shelleyBasedEraConstraints sbe $ encodePretty nes) <> "\n" | ||
pure ConditionNotMet | ||
where | ||
-- | Handle all sync exceptions and log them into the log file. We don't want to fail the test just | ||
-- because logging has failed. | ||
handleException = handle $ \(e :: SomeException) -> do | ||
liftIO $ appendFile outputFp $ "Ledger new epoch logging failed - caught exception:\n" | ||
liftIO $ appendFile outputFpHandle $ "Ledger new epoch logging failed - caught exception:\n" | ||
<> displayException e <> "\n" | ||
pure ConditionMet | ||
-- TODO: Not sure why this isn't terminating. Read up on resourcet and how it works. | ||
-- See concurrency section: https://www.fpcomplete.com/blog/understanding-resourcet/ | ||
-- Probably need to use resourceForkWith but best to not use concurrency at all! | ||
|
||
|
||
-- | Produce tuples that represent the change of the 'NewEpochState' after | ||
-- a transition. | ||
epochStateBeforeAfterValues | ||
:: String | ||
-> [(Text, Text)] | ||
epochStateBeforeAfterValues logFileContents = | ||
let allEpochStates = filter (/= "") . Text.splitOn "#### BLOCK ####" $ Text.pack logFileContents | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggestion: move the block tag to a variable and allow reusing it. Both usage places are 40 lines apart, so it'd be easy to get overlooked by someone making changes here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd rather do this in a follow up PR. I want to get the concurrency stuff right first. |
||
in getAllTransitions allEpochStates | ||
|
||
getAllTransitions :: [Text] -> [(Text, Text)] | ||
getAllTransitions [] = [] | ||
getAllTransitions trans = | ||
let (singleTransition, rest) = splitAt 2 trans | ||
tupleSingleTransition = toTuple singleTransition | ||
in case singleTransition of | ||
[a] -> [(a,"")] | ||
_ -> tupleSingleTransition ++ getAllTransitions (snd (head tupleSingleTransition) : rest) | ||
where | ||
toTuple [a,b] = [(a,b)] | ||
toTuple [] = [] | ||
toTuple _ = error "toTuple: a single transition was not generated" | ||
Jimbo4350 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
epochStateDiffs | ||
:: [(Text,Text)] | ||
-> Text | ||
epochStateDiffs [] = "No epoch state values to compare" | ||
epochStateDiffs states = | ||
-- We first get the block number changes | ||
labelAllEpochStateTransitions | ||
[ Text.pack $ epochStateTransitionDiff (Text.unpack i) (Text.unpack n) | ||
| (i, n) <- states | ||
] & Text.intercalate "\n" | ||
where | ||
labelSingleEpochStateTransition :: Int -> Text | ||
labelSingleEpochStateTransition transitionNumber = | ||
"Epoch state transition: " <> Text.pack (show transitionNumber) | ||
|
||
labelAllEpochStateTransitions [] = [] | ||
labelAllEpochStateTransitions trans = | ||
go trans (1 :: Int) [] | ||
where | ||
go [] _ acc = acc | ||
go (x:xs) 1 _ = go xs 2 [labelSingleEpochStateTransition 1, x] | ||
go (x:xs) n acc = go xs (n + 1) (acc ++ [labelSingleEpochStateTransition n, x]) | ||
|
||
epochStateTransitionDiff | ||
:: String -- ^ Initial epoch state | ||
-> String -- ^ Following epoch state | ||
-> String | ||
epochStateTransitionDiff initialState next = | ||
let removeBlockNumberChangeInitial = lines initialState | ||
removeBlockNumberChangeNext = lines next | ||
diffResult = getGroupedDiff removeBlockNumberChangeInitial removeBlockNumberChangeNext | ||
in if null diffResult | ||
then "No changes in epoch state" | ||
else ppDiff diffResult | ||
|
||
instance (L.EraTxOut ledgerera, L.EraGov ledgerera) => ToJSON (L.NewEpochState ledgerera) where | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we later upstream this to ledger? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would prefer that 👍 |
||
toJSON (L.NewEpochState nesEL nesBprev nesBCur nesEs nesRu nesPd _stashedAvvm)= | ||
object | ||
[ "currentEpoch" .= nesEL | ||
, "priorBlocks" .= nesBprev | ||
, "currentEpochBlocks" .= nesBCur | ||
, "currentEpochState" .= nesEs | ||
, "rewardUpdate" .= nesRu | ||
, "currentStakeDistribution" .= nesPd | ||
] | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Replying to: #5854 (comment)
I think
allocate (H.async act)
is correct here. Resources are cleaned in LIFO order, so this should ensure that the thread gets killed before resources it uses are freed.