From 7d46115b690c140461abbba1ae1b3233cb8ec161 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Fri, 14 Jun 2024 10:43:15 +0100 Subject: [PATCH] perf: Group together packages by repo when verifying tarballs verifyFetchedTarball has the effect of deserialising the index tarball (see call to Sec.withIndex). verifyFetchedTarball is called individually for each package in the build plan (see ProjectPlanning.hs). Not once per repo. The hackage tarball is now 880mb so it takes a non significant amount of time to deserialise this (much better after haskell/tar#95). This code path is important as it can add 1s with these 38 calls on to the initial load of a project and scales linearly with the size of your build tree. Reproducer: Simple project with "lens" dependency deserialises the index tarball 38 times. Solution: Refactor verifyFetchedTarball to run once per repo rather than once per package. In future it would be much better to refactor this function so that the items are not immediately grouped and ungrouped but I didn't want to take that on immediately. Fixes #10110 --- .hlint.yaml | 1 + .../src/Distribution/Client/FetchUtils.hs | 97 ++++++++++++------- .../Distribution/Client/ProjectPlanning.hs | 38 ++++---- .../src/Distribution/Client/Utils.hs | 1 + 4 files changed, 85 insertions(+), 52 deletions(-) diff --git a/.hlint.yaml b/.hlint.yaml index 6266b76964f..0170ee22ebd 100644 --- a/.hlint.yaml +++ b/.hlint.yaml @@ -92,6 +92,7 @@ - ignore: {name: "Use unwords"} # 8 hints - ignore: {name: "Use void"} # 22 hints - ignore: {name: "Use when"} # 1 hint +- ignore: {name: "Use uncurry"} # 1 hint - arguments: - --ignore-glob=Cabal-syntax/src/Distribution/Fields/Lexer.hs diff --git a/cabal-install/src/Distribution/Client/FetchUtils.hs b/cabal-install/src/Distribution/Client/FetchUtils.hs index c804040cab7..62da386573d 100644 --- a/cabal-install/src/Distribution/Client/FetchUtils.hs +++ b/cabal-install/src/Distribution/Client/FetchUtils.hs @@ -25,7 +25,7 @@ module Distribution.Client.FetchUtils -- ** specifically for repo packages , checkRepoTarballFetched , fetchRepoTarball - , verifyFetchedTarball + , verifyFetchedTarballs -- ** fetching packages asynchronously , asyncFetchPackages @@ -98,6 +98,7 @@ import System.IO , openTempFile ) +import Control.Monad (forM) import Distribution.Client.Errors import qualified Hackage.Security.Client as Sec import qualified Hackage.Security.Util.Checked as Sec @@ -152,40 +153,66 @@ checkRepoTarballFetched repo pkgid = do then return (Just file) else return Nothing -verifyFetchedTarball :: Verbosity -> RepoContext -> Repo -> PackageId -> IO Bool -verifyFetchedTarball verbosity repoCtxt repo pkgid = - let file = packageFile repo pkgid - handleError :: IO Bool -> IO Bool - handleError act = do - res <- Safe.try act - case res of - Left e -> warn verbosity ("Error verifying fetched tarball " ++ file ++ ", will redownload: " ++ show (e :: SomeException)) >> pure False - Right b -> pure b - in handleError $ do - exists <- doesFileExist file - if not exists - then return True -- if the file does not exist, it vacuously passes validation, since it will be downloaded as necessary with what we will then check is a valid hash. - else case repo of - -- a secure repo has hashes we can compare against to confirm this is the correct file. - RepoSecure{} -> - repoContextWithSecureRepo repoCtxt repo $ \repoSecure -> - Sec.withIndex repoSecure $ \callbacks -> - let warnAndFail s = warn verbosity ("Fetched tarball " ++ file ++ " does not match server, will redownload: " ++ s) >> return False - in -- the do block in parens is due to dealing with the checked exceptions mechanism. - ( do - fileInfo <- Sec.indexLookupFileInfo callbacks pkgid - sz <- Sec.FileLength . fromInteger <$> getFileSize file - if sz /= Sec.fileInfoLength (Sec.trusted fileInfo) - then warnAndFail "file length mismatch" - else do - res <- Sec.compareTrustedFileInfo (Sec.trusted fileInfo) <$> Sec.computeFileInfo (Sec.Path file :: Sec.Path Sec.Absolute) - if res - then pure True - else warnAndFail "file hash mismatch" - ) - `Sec.catchChecked` (\(e :: Sec.InvalidPackageException) -> warnAndFail (show e)) - `Sec.catchChecked` (\(e :: Sec.VerificationError) -> warnAndFail (show e)) - _ -> pure True +verifyFetchedTarballs + :: Verbosity + -> RepoContext + -> Repo + -> [PackageId] + -> IO + ( [ Either + (Repo, PackageId) -- Verified + (Repo, PackageId) -- unverified) + ] + ) +verifyFetchedTarballs verbosity repoCtxt repo pkgids = + -- Establish the context once per repo (see #10110), this codepath is important + -- to be fast as it can happen when no other building happens. + let establishContext k = + case repo of + RepoSecure{} -> + repoContextWithSecureRepo repoCtxt repo $ \repoSecure -> + Sec.withIndex repoSecure $ \callbacks -> k (Just callbacks) + _ -> k Nothing + in do + establishContext $ \mCallbacks -> + forM pkgids $ \pkgid -> do + let file = packageFile repo pkgid + res <- verifyFetchedTarball verbosity file mCallbacks pkgid + return $ if res then Left (repo, pkgid) else Right (repo, pkgid) + +verifyFetchedTarball :: Verbosity -> FilePath -> Maybe Sec.IndexCallbacks -> PackageId -> IO Bool +verifyFetchedTarball verbosity file mCallbacks pkgid = + let + handleError :: IO Bool -> IO Bool + handleError act = do + res <- Safe.try act + case res of + Left e -> warn verbosity ("Error verifying fetched tarball " ++ file ++ ", will redownload: " ++ show (e :: SomeException)) >> pure False + Right b -> pure b + in + handleError $ do + exists <- doesFileExist file + if not exists + then return True -- if the file does not exist, it vacuously passes validation, since it will be downloaded as necessary with what we will then check is a valid hash. + else case mCallbacks of + -- a secure repo has hashes we can compare against to confirm this is the correct file. + Just callbacks -> + let warnAndFail s = warn verbosity ("Fetched tarball " ++ file ++ " does not match server, will redownload: " ++ s) >> return False + in -- the do block in parens is due to dealing with the checked exceptions mechanism. + ( do + fileInfo <- Sec.indexLookupFileInfo callbacks pkgid + sz <- Sec.FileLength . fromInteger <$> getFileSize file + if sz /= Sec.fileInfoLength (Sec.trusted fileInfo) + then warnAndFail "file length mismatch" + else do + res <- Sec.compareTrustedFileInfo (Sec.trusted fileInfo) <$> Sec.computeFileInfo (Sec.Path file :: Sec.Path Sec.Absolute) + if res + then pure True + else warnAndFail "file hash mismatch" + ) + `Sec.catchChecked` (\(e :: Sec.InvalidPackageException) -> warnAndFail (show e)) + `Sec.catchChecked` (\(e :: Sec.VerificationError) -> warnAndFail (show e)) + _ -> pure True -- | Fetch a package if we don't have it already. fetchPackage diff --git a/cabal-install/src/Distribution/Client/ProjectPlanning.hs b/cabal-install/src/Distribution/Client/ProjectPlanning.hs index 6b77531b8a5..efc4ebbd1e4 100644 --- a/cabal-install/src/Distribution/Client/ProjectPlanning.hs +++ b/cabal-install/src/Distribution/Client/ProjectPlanning.hs @@ -127,7 +127,7 @@ import Distribution.Client.SetupWrapper import Distribution.Client.Store import Distribution.Client.Targets (userToPackageConstraint) import Distribution.Client.Types -import Distribution.Client.Utils (incVersion) +import Distribution.Client.Utils (concatMapM, incVersion) import qualified Distribution.Client.BuildReports.Storage as BuildReports import qualified Distribution.Client.IndexUtils as IndexUtils @@ -206,7 +206,7 @@ import qualified Distribution.Solver.Types.ComponentDeps as CD import qualified Distribution.Compat.Graph as Graph import Control.Exception (assert) -import Control.Monad (forM, sequence) +import Control.Monad (sequence) import Control.Monad.IO.Class (liftIO) import Control.Monad.State as State (State, execState, runState, state) import Data.Foldable (fold) @@ -1069,25 +1069,29 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do -- Tarballs from repositories, either where the repository provides -- hashes as part of the repo metadata, or where we will have to -- download and hash the tarball. - repoTarballPkgsWithMetadataUnvalidated :: [(PackageId, Repo)] - repoTarballPkgsWithoutMetadata :: [(PackageId, Repo)] + repoTarballPkgsWithMetadataUnvalidated :: [(Repo, [PackageId])] + repoTarballPkgsWithoutMetadata :: [(Repo, PackageId)] ( repoTarballPkgsWithMetadataUnvalidated , repoTarballPkgsWithoutMetadata ) = partitionEithers [ case repo of - RepoSecure{} -> Left (pkgid, repo) - _ -> Right (pkgid, repo) + RepoSecure{} -> Left (repo, [pkgid]) + _ -> Right (repo, pkgid) | (pkgid, RepoTarballPackage repo _ _) <- allPkgLocations ] + -- Group up the unvalidated packages by repo so we only read the remote + -- index once per repo (see #10110). The packages are ungrouped here and then regrouped + -- below, it would be better in future to refactor this whole code path so that we don't + -- repeatedly group and ungroup. + repoTarballPkgsWithMetadataUnvalidatedMap = Map.fromListWith (++) repoTarballPkgsWithMetadataUnvalidated + (repoTarballPkgsWithMetadata, repoTarballPkgsToDownloadWithMeta) <- fmap partitionEithers $ liftIO $ - withRepoCtx $ \repoctx -> forM repoTarballPkgsWithMetadataUnvalidated $ - \x@(pkg, repo) -> - verifyFetchedTarball verbosity repoctx repo pkg >>= \b -> case b of - True -> return $ Left x - False -> return $ Right x + withRepoCtx $ \repoctx -> flip concatMapM (Map.toList repoTarballPkgsWithMetadataUnvalidatedMap) $ + \(repo, pkgids) -> + verifyFetchedTarballs verbosity repoctx repo pkgids -- For tarballs from repos that do not have hashes available we now have -- to check if the packages were downloaded already. @@ -1101,9 +1105,9 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do [ do mtarball <- checkRepoTarballFetched repo pkgid case mtarball of - Nothing -> return (Left (pkgid, repo)) + Nothing -> return (Left (repo, pkgid)) Just tarball -> return (Right (pkgid, tarball)) - | (pkgid, repo) <- repoTarballPkgsWithoutMetadata + | (repo, pkgid) <- repoTarballPkgsWithoutMetadata ] let repoTarballPkgsToDownload = repoTarballPkgsToDownloadWithMeta ++ repoTarballPkgsToDownloadWithNoMeta @@ -1139,9 +1143,9 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do | pkgid <- pkgids ] | (repo, pkgids) <- - map (\grp@((_, repo) :| _) -> (repo, map fst (NE.toList grp))) - . NE.groupBy ((==) `on` (remoteRepoName . repoRemote . snd)) - . sortBy (compare `on` (remoteRepoName . repoRemote . snd)) + map (\grp@((repo, _) :| _) -> (repo, map snd (NE.toList grp))) + . NE.groupBy ((==) `on` (remoteRepoName . repoRemote . fst)) + . sortBy (compare `on` (remoteRepoName . repoRemote . fst)) $ repoTarballPkgsWithMetadata ] @@ -1153,7 +1157,7 @@ getPackageSourceHashes verbosity withRepoCtx solverPlan = do [ do tarball <- fetchRepoTarball verbosity repoctx repo pkgid return (pkgid, tarball) - | (pkgid, repo) <- repoTarballPkgsToDownload + | (repo, pkgid) <- repoTarballPkgsToDownload ] return diff --git a/cabal-install/src/Distribution/Client/Utils.hs b/cabal-install/src/Distribution/Client/Utils.hs index 87378da7f10..69d46f8a473 100644 --- a/cabal-install/src/Distribution/Client/Utils.hs +++ b/cabal-install/src/Distribution/Client/Utils.hs @@ -38,6 +38,7 @@ module Distribution.Client.Utils , listFilesInside , safeRead , hasElem + , concatMapM , occursOnlyOrBefore , giveRTSWarning ) where