From cbcc890193b86fcd2b417cc83987b261e4d26beb Mon Sep 17 00:00:00 2001
From: Adam Joseph <adam@westernsemico.com>
Date: Fri, 19 Jan 2024 20:01:58 -0800
Subject: [PATCH 01/34] gcc: link $lib/lib -> $lib/$targetConfig correctly and
 consistently

When native-compiling, gcc will install libraries into:

  /nix/store/...-$targetConfig-gcc-$version-lib/lib

When cross-compiling, gcc will install libraries into:

  /nix/store/...-$targetConfig-gcc-$version-lib/$targetConfig

When cross-compiling, we intended to create a link from $lib/lib to
$lib/$targetConfig, so that downstream users can always safely
assume that "${lib.getLib stdenv.cc.cc}/lib" is where the gcc
libraries are, regardless of whether `stdenv.cc.cc` is a cross
compiler or a native compiler.

Unfortunately, there were two problems with how we were trying to
create these links:

1. The link would be created only when `enableLibGccOutput==true`

2. The link was being created from the incorrect source
   `$lib/lib/lib` instead of `$lib/lib`.

Both of these mistakes are my fault.  This commit corrects them by
creating the link using `ln -Ts` (which is more predictable) and by
creating the link from `gcc/common/builder.nix` rather than from
`gcc/common/libgcc.nix`.
---
 pkgs/development/compilers/gcc/common/builder.nix | 8 ++++++++
 pkgs/development/compilers/gcc/common/libgcc.nix  | 4 ----
 pkgs/development/compilers/gcc/default.nix        | 4 +++-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/pkgs/development/compilers/gcc/common/builder.nix b/pkgs/development/compilers/gcc/common/builder.nix
index 98525b5e237ef..25c5646338653 100644
--- a/pkgs/development/compilers/gcc/common/builder.nix
+++ b/pkgs/development/compilers/gcc/common/builder.nix
@@ -1,6 +1,7 @@
 { lib
 , stdenv
 , enableMultilib
+, targetConfig
 }:
 
 let
@@ -196,6 +197,13 @@ originalAttrs: (stdenv.mkDerivation (finalAttrs: originalAttrs // {
     mkdir -p "$out/''${targetConfig}/lib"
     mkdir -p "''${!outputLib}/''${targetConfig}/lib"
   '' +
+  # if cross-compiling, link from $lib/lib to $lib/${targetConfig}/lib.
+  # since native-compiles have $lib/lib as a directory (not a
+  # symlink), this ensures that in every case we can assume that
+  # $lib/lib contains the .so files
+  lib.optionalString (with stdenv; targetPlatform.config != hostPlatform.config) ''
+    ln -Ts "''${!outputLib}/''${targetConfig}/lib" $lib/lib
+  '' +
   # Make `lib64` symlinks to `lib`.
   lib.optionalString (!enableMultilib && stdenv.hostPlatform.is64bit && !stdenv.hostPlatform.isMips64n32) ''
     ln -s lib "$out/''${targetConfig}/lib64"
diff --git a/pkgs/development/compilers/gcc/common/libgcc.nix b/pkgs/development/compilers/gcc/common/libgcc.nix
index c8342ae90054a..a7de840adc8d0 100644
--- a/pkgs/development/compilers/gcc/common/libgcc.nix
+++ b/pkgs/development/compilers/gcc/common/libgcc.nix
@@ -83,10 +83,6 @@ in
     lib.optionalString (!langC) ''
       rm -f $out/lib/libgcc_s.so*
     ''
-    + lib.optionalString (hostPlatform != targetPlatform) ''
-      mkdir -p $lib/lib/
-      ln -s ${targetPlatformSlash}lib $lib/lib
-    ''
 
     # TODO(amjoseph): remove the `libgcc_s.so` symlinks below and replace them
     # with a `-L${gccForLibs.libgcc}/lib` in cc-wrapper's
diff --git a/pkgs/development/compilers/gcc/default.nix b/pkgs/development/compilers/gcc/default.nix
index cc3546bed22cf..0144ab4cfff9b 100644
--- a/pkgs/development/compilers/gcc/default.nix
+++ b/pkgs/development/compilers/gcc/default.nix
@@ -103,6 +103,7 @@ let inherit version;
     disableBootstrap = atLeast11 && !stdenv.hostPlatform.isDarwin && (atLeast12 -> !profiledCompiler);
 
     inherit (stdenv) buildPlatform hostPlatform targetPlatform;
+    targetConfig = if targetPlatform != hostPlatform then targetPlatform.config else null;
 
     patches = callFile ./patches {};
 
@@ -124,6 +125,7 @@ let inherit version;
         buildPlatform
         hostPlatform
         targetPlatform
+        targetConfig
         patches
         crossMingw
         stageNameAddon
@@ -329,7 +331,7 @@ lib.pipe ((callFile ./common/builder.nix {}) ({
     ++ optional (is7 && targetPlatform.isAarch64) "--enable-fix-cortex-a53-843419"
     ++ optional (is7 && targetPlatform.isNetBSD) "--disable-libcilkrts";
 
-  targetConfig = if targetPlatform != hostPlatform then targetPlatform.config else null;
+  inherit targetConfig;
 
   buildFlags =
     # we do not yet have Nix-driven profiling

From cbde122958b83da0c89f322ef497bcfdf80a17c0 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 10 Jan 2024 02:32:25 +0000
Subject: [PATCH 02/34] cudaPackages: cross-compilation attempt 2

work around a bug in cross-compilation / meta.getExe

cudaPackages.autoAddCudaCompatRunpathHook: correct meta.platforms reference

cudaPackages.backendStdenv: use coreutils from buildPackages when cross-compiling
---
 .../cuda-modules/backend-stdenv.nix           |  4 +-
 .../cuda-modules/cuda/overrides.nix           |  9 ++-
 .../cuda-modules/cutensor/extension.nix       |  4 +-
 pkgs/development/cuda-modules/flags.nix       | 15 +++--
 .../generic-builders/manifest.nix             | 11 +++-
 .../generic-builders/multiplex.nix            |  4 +-
 .../cuda-modules/setup-hooks/extension.nix    | 17 ++---
 .../cuda-modules/tensorrt/fixup.nix           |  6 +-
 pkgs/top-level/cuda-packages.nix              | 64 +++++++++----------
 9 files changed, 72 insertions(+), 62 deletions(-)

diff --git a/pkgs/development/cuda-modules/backend-stdenv.nix b/pkgs/development/cuda-modules/backend-stdenv.nix
index bcca7118b163b..32386ffbdd4c9 100644
--- a/pkgs/development/cuda-modules/backend-stdenv.nix
+++ b/pkgs/development/cuda-modules/backend-stdenv.nix
@@ -3,10 +3,8 @@
   nvccCompatibilities,
   cudaVersion,
   pkgs,
-  overrideCC,
   stdenv,
-  wrapCCWith,
-  stdenvAdapters,
+  stdenvAdapters
 }:
 
 let
diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index f43d649afbbf3..31d03dd0fc73f 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -1,4 +1,4 @@
-{cudaVersion, lib, addDriverRunpath}:
+{cudaVersion, lib}:
 let
   inherit (lib) attrsets lists strings;
   # cudaVersionOlder : Version -> Boolean
@@ -58,7 +58,7 @@ attrsets.filterAttrs (attr: _: (builtins.hasAttr attr prev)) {
           while IFS= read -r -d $'\0' path ; do
             sed -i \
               -e "s|^libdir\s*=.*/lib\$|libdir=''${!outputLib}/lib/stubs|" \
-              -e "s|^Libs\s*:\(.*\)\$|Libs: \1 -Wl,-rpath,${addDriverRunpath.driverLink}/lib|" \
+              -e "s|^Libs\s*:\(.*\)\$|Libs: \1 -Wl,-rpath,${final.pkgs.addDriverRunpath.driverLink}/lib|" \
               "$path"
           done < <(find -iname 'cuda-*.pc' -print0)
         ''
@@ -92,6 +92,11 @@ attrsets.filterAttrs (attr: _: (builtins.hasAttr attr prev)) {
         "Trying to use cuda_compat on aarch64-linux targeting non-Jetson devices" =
           !final.flags.isJetsonBuild;
       };
+      meta = prevAttrs.meta // {
+        # For cross-compilation, we need the hostPlatform to be included in order to fetch and build the package. This
+        # doesn't change the fact that it won't work on non-Jetson devices, so we only add it when building for Jetson.
+        platforms = prevAttrs.meta.platforms ++ lib.optionals final.flags.isJetsonBuild [ "x86_64-linux" ];
+      };
     }
   );
 
diff --git a/pkgs/development/cuda-modules/cutensor/extension.nix b/pkgs/development/cuda-modules/cutensor/extension.nix
index 534941887c6e4..38b0b03248aad 100644
--- a/pkgs/development/cuda-modules/cutensor/extension.nix
+++ b/pkgs/development/cuda-modules/cutensor/extension.nix
@@ -15,7 +15,7 @@
 {
   cudaVersion,
   flags,
-  hostPlatform,
+  targetPlatform,
   lib,
   mkVersionedPackageName,
 }:
@@ -93,7 +93,7 @@ let
   # LibPath are not constant across the same release -- one platform may support fewer
   # CUDA versions than another.
   # redistArch :: String
-  redistArch = flags.getRedistArch hostPlatform.system;
+  redistArch = flags.getRedistArch targetPlatform.system;
   # platformIsSupported :: Manifests -> Boolean
   platformIsSupported =
     {feature, ...}:
diff --git a/pkgs/development/cuda-modules/flags.nix b/pkgs/development/cuda-modules/flags.nix
index d5e01be01fd51..50a69d6fd1d1d 100644
--- a/pkgs/development/cuda-modules/flags.nix
+++ b/pkgs/development/cuda-modules/flags.nix
@@ -7,7 +7,9 @@
   cudaForwardCompat ? (config.cudaForwardCompat or true),
   lib,
   cudaVersion,
+  buildPlatform,
   hostPlatform,
+  targetPlatform,
   # gpus :: List Gpu
   gpus,
 }:
@@ -216,16 +218,19 @@ let
             lists.filter (cap: !(builtins.elem cap requestedJetsonDevices))
               cudaCapabilities;
           jetsonBuildSufficientCondition = requestedJetsonDevices != [];
-          jetsonBuildNecessaryCondition = requestedNonJetsonDevices == [] && hostPlatform.isAarch64;
+          jetsonBuildNecessaryCondition = requestedNonJetsonDevices == [] && targetPlatform.isAarch64;
         in
         trivial.throwIf (jetsonBuildSufficientCondition && !jetsonBuildNecessaryCondition)
           ''
-            Jetson devices cannot be targeted with non-Jetson devices. Additionally, they require hostPlatform to be aarch64.
-            You requested ${builtins.toJSON cudaCapabilities} for host platform ${hostPlatform.system}.
+            Jetson devices cannot be targeted with non-Jetson devices. Additionally, they require targetPlatform to be aarch64.
+            You requested ${builtins.toJSON cudaCapabilities} for:
+            - Build platform ${buildPlatform.system}
+            - Host platform ${hostPlatform.system}
+            - Target platform ${targetPlatform.system}
             Requested Jetson devices: ${builtins.toJSON requestedJetsonDevices}.
             Requested non-Jetson devices: ${builtins.toJSON requestedNonJetsonDevices}.
             Exactly one of the following must be true:
-            - All CUDA capabilities belong to Jetson devices and hostPlatform is aarch64.
+            - All CUDA capabilities belong to Jetson devices and targetPlatform is aarch64.
             - No CUDA capabilities belong to Jetson devices.
             See ${./gpus.nix} for a list of architectures supported by this version of Nixpkgs.
           ''
@@ -346,7 +351,7 @@ assert let
 in
 asserts.assertMsg
   # We can't do this test unless we're targeting aarch64
-  (hostPlatform.isAarch64 -> (expected == actualWrapped))
+  (targetPlatform.isAarch64 -> (expected == actualWrapped))
   ''
     Jetson devices can only be built with other Jetson devices.
     Both 6.2 and 7.2 are Jetson devices.
diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index 4f40b7f01dc28..049c8936426d2 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -10,7 +10,7 @@
   markForCudatoolkitRootHook,
   flags,
   stdenv,
-  hostPlatform,
+  targetPlatform,
   # Builder-specific arguments
   # Short package name (e.g., "cuda_cccl")
   # pname : String
@@ -46,7 +46,7 @@ let
   # redistArch :: String
   # The redistArch is the name of the architecture for which the redistributable is built.
   # It is `"unsupported"` if the redistributable is not supported on the target platform.
-  redistArch = flags.getRedistArch hostPlatform.system;
+  redistArch = flags.getRedistArch targetPlatform.system;
 
   sourceMatchesHost = flags.getNixSystem redistArch == stdenv.hostPlatform.system;
 in
@@ -195,6 +195,8 @@ backendStdenv.mkDerivation (
       # Check e.g. with `patchelf --print-rpath path/to/my/binary
       autoAddDriverRunpath
       markForCudatoolkitRootHook
+      # To create fat outputs from each component and find a version of `lndir` built for the host platform.
+      lndir
     ]
     # autoAddCudaCompatRunpath depends on cuda_compat and would cause
     # infinite recursion if applied to `cuda_compat` itself (beside the fact
@@ -296,11 +298,14 @@ backendStdenv.mkDerivation (
 
     # For each output, create a symlink to it in the out output.
     # NOTE: We must recreate the out output here, because the setup hook will have deleted it if it was empty.
+    # TODO: Previously we used `meta.getExe lndir` to get the path to lndir, but that doesn't work under
+    # cross-compilation -- whatever machinery Nixpkgs uses to get a version built for hostPlatform (so it can run
+    # during the build) doesn't extend to `meta.getExe`.
     postPatchelf = ''
       mkdir -p "$out"
       for output in $(getAllOutputNames); do
         if [[ "$output" != "out" ]]; then
-          ${meta.getExe lndir} "''${!output}" "$out"
+          lndir "''${!output}" "$out"
         fi
       done
     '';
diff --git a/pkgs/development/cuda-modules/generic-builders/multiplex.nix b/pkgs/development/cuda-modules/generic-builders/multiplex.nix
index f2a9c6840ecd0..deeb2da6e0042 100644
--- a/pkgs/development/cuda-modules/generic-builders/multiplex.nix
+++ b/pkgs/development/cuda-modules/generic-builders/multiplex.nix
@@ -3,7 +3,7 @@
   lib,
   cudaVersion,
   flags,
-  hostPlatform,
+  targetPlatform,
   # Expected to be passed by the caller
   mkVersionedPackageName,
   # pname :: String
@@ -74,7 +74,7 @@ let
   # Get all of the packages for our given platform.
   # redistArch :: String
   # Value is `"unsupported"` if the platform is not supported.
-  redistArch = flags.getRedistArch hostPlatform.system;
+  redistArch = flags.getRedistArch targetPlatform.system;
 
   preferable =
     p1: p2: (isSupported p2 -> isSupported p1) && (strings.versionAtLeast p1.version p2.version);
diff --git a/pkgs/development/cuda-modules/setup-hooks/extension.nix b/pkgs/development/cuda-modules/setup-hooks/extension.nix
index ece70da52b027..b9afd4f1998cd 100644
--- a/pkgs/development/cuda-modules/setup-hooks/extension.nix
+++ b/pkgs/development/cuda-modules/setup-hooks/extension.nix
@@ -71,22 +71,19 @@ final: _: {
   autoAddCudaCompatRunpath =
     final.callPackage
       (
-        {makeSetupHook, autoFixElfFiles, cuda_compat ? null }:
+        {makeSetupHook, autoFixElfFiles, lib, flags, cuda_compat ? null }:
         makeSetupHook
           {
             name = "auto-add-cuda-compat-runpath-hook";
             propagatedBuildInputs = [autoFixElfFiles];
 
-            substitutions = {
-              # Hotfix Ofborg evaluation
-              libcudaPath = if final.flags.isJetsonBuild then "${cuda_compat}/compat" else null;
-            };
-
-            meta.broken = !final.flags.isJetsonBuild;
+            substitutions.libcudaPath = lib.optionalString flags.isJetsonBuild "${cuda_compat}/compat";
 
-            # Pre-cuda_compat CUDA release:
-            meta.badPlatforms = final.lib.optionals (cuda_compat == null) final.lib.platforms.all;
-            meta.platforms = cuda_compat.meta.platforms or [ ];
+            meta = {
+              broken = !flags.isJetsonBuild;
+              badPlatforms = lib.optionals (cuda_compat == null) lib.platforms.all;
+              platforms = cuda_compat.meta.platforms or [ ];
+            };
           }
           ./auto-add-cuda-compat-runpath.sh
       )
diff --git a/pkgs/development/cuda-modules/tensorrt/fixup.nix b/pkgs/development/cuda-modules/tensorrt/fixup.nix
index 51ca3d652bd1a..c6cbd137a0e4c 100644
--- a/pkgs/development/cuda-modules/tensorrt/fixup.nix
+++ b/pkgs/development/cuda-modules/tensorrt/fixup.nix
@@ -1,7 +1,7 @@
 {
   cudaVersion,
   final,
-  hostPlatform,
+  targetPlatform,
   lib,
   mkVersionedPackageName,
   package,
@@ -18,7 +18,7 @@ let
     versions
     ;
   # targetArch :: String
-  targetArch = attrsets.attrByPath [ hostPlatform.system ] "unsupported" {
+  targetArch = attrsets.attrByPath [ targetPlatform.system ] "unsupported" {
     x86_64-linux = "x86_64-linux-gnu";
     aarch64-linux = "aarch64-linux-gnu";
   };
@@ -106,7 +106,7 @@ finalAttrs: prevAttrs: {
   meta = prevAttrs.meta // {
     badPlatforms =
       prevAttrs.meta.badPlatforms or [ ]
-      ++ lib.optionals (targetArch == "unsupported") [ hostPlatform.system ];
+      ++ lib.optionals (targetArch == "unsupported") [ targetPlatform.system ];
     homepage = "https://developer.nvidia.com/tensorrt";
     maintainers = prevAttrs.meta.maintainers ++ [maintainers.aidalgol];
   };
diff --git a/pkgs/top-level/cuda-packages.nix b/pkgs/top-level/cuda-packages.nix
index 4b8ad4646485e..eb0efcb10865f 100644
--- a/pkgs/top-level/cuda-packages.nix
+++ b/pkgs/top-level/cuda-packages.nix
@@ -21,17 +21,16 @@
 #
 # I've (@connorbaker) attempted to do that, though I'm unsure of how this will interact with overrides.
 {
-  callPackage,
+  config,
   cudaVersion,
+  generateSplicesForMkScope,
   lib,
-  newScope,
+  makeScopeWithSplicing',
   pkgs,
   __attrsFailEvaluation ? true,
 }:
 let
   inherit (lib)
-    attrsets
-    customisation
     fixedPoints
     strings
     versions
@@ -39,13 +38,13 @@ let
   # Backbone
   gpus = builtins.import ../development/cuda-modules/gpus.nix;
   nvccCompatibilities = builtins.import ../development/cuda-modules/nvcc-compatibilities.nix;
-  flags = callPackage ../development/cuda-modules/flags.nix {inherit cudaVersion gpus;};
   passthruFunction =
     final:
     (
       {
         inherit cudaVersion lib pkgs;
-        inherit gpus nvccCompatibilities flags;
+        inherit gpus nvccCompatibilities;
+        flags = final.callPackage ../development/cuda-modules/flags.nix {};
         cudaMajorVersion = versions.major cudaVersion;
         cudaMajorMinorVersion = versions.majorMinor cudaVersion;
         cudaOlder = strings.versionOlder cudaVersion;
@@ -58,7 +57,7 @@ let
         cudaPackages = final;
 
         # TODO(@connorbaker): `cudaFlags` is an alias for `flags` which should be removed in the future.
-        cudaFlags = flags;
+        cudaFlags = final.flags;
 
         # Exposed as cudaPackages.backendStdenv.
         # This is what nvcc uses as a backend,
@@ -86,32 +85,33 @@ let
     ];
 
   composedExtension = fixedPoints.composeManyExtensions [
-    (import ../development/cuda-modules/setup-hooks/extension.nix)
-    (callPackage ../development/cuda-modules/cuda/extension.nix {inherit cudaVersion;})
-    (callPackage ../development/cuda-modules/cuda/overrides.nix {inherit cudaVersion;})
-    (callPackage ../development/cuda-modules/generic-builders/multiplex.nix {
-      inherit cudaVersion flags mkVersionedPackageName;
-      pname = "cudnn";
-      releasesModule = ../development/cuda-modules/cudnn/releases.nix;
-      shimsFn = ../development/cuda-modules/cudnn/shims.nix;
-      fixupFn = ../development/cuda-modules/cudnn/fixup.nix;
-    })
-    (callPackage ../development/cuda-modules/cutensor/extension.nix {
-      inherit cudaVersion flags mkVersionedPackageName;
-    })
-    (callPackage ../development/cuda-modules/generic-builders/multiplex.nix {
-      inherit cudaVersion flags mkVersionedPackageName;
-      pname = "tensorrt";
-      releasesModule = ../development/cuda-modules/tensorrt/releases.nix;
-      shimsFn = ../development/cuda-modules/tensorrt/shims.nix;
-      fixupFn = ../development/cuda-modules/tensorrt/fixup.nix;
-    })
-    (callPackage ../development/cuda-modules/cuda-samples/extension.nix {inherit cudaVersion;})
-    (callPackage ../development/cuda-modules/cuda-library-samples/extension.nix {})
+    (builtins.import ../development/cuda-modules/setup-hooks/extension.nix)
+    (builtins.import ../development/cuda-modules/cuda/extension.nix {inherit cudaVersion lib;})
+    (builtins.import ../development/cuda-modules/cuda/overrides.nix {inherit cudaVersion lib;})
+    # (callPackage ../development/cuda-modules/generic-builders/multiplex.nix {
+    #   inherit cudaVersion flags mkVersionedPackageName;
+    #   pname = "cudnn";
+    #   releasesModule = ../development/cuda-modules/cudnn/releases.nix;
+    #   shimsFn = ../development/cuda-modules/cudnn/shims.nix;
+    #   fixupFn = ../development/cuda-modules/cudnn/fixup.nix;
+    # })
+    # (callPackage ../development/cuda-modules/cutensor/extension.nix {
+    #   inherit cudaVersion flags mkVersionedPackageName;
+    # })
+    # (callPackage ../development/cuda-modules/generic-builders/multiplex.nix {
+    #   inherit cudaVersion flags mkVersionedPackageName;
+    #   pname = "tensorrt";
+    #   releasesModule = ../development/cuda-modules/tensorrt/releases.nix;
+    #   shimsFn = ../development/cuda-modules/tensorrt/shims.nix;
+    #   fixupFn = ../development/cuda-modules/tensorrt/fixup.nix;
+    # })
+    # (callPackage ../development/cuda-modules/cuda-samples/extension.nix {inherit cudaVersion;})
+    # (callPackage ../development/cuda-modules/cuda-library-samples/extension.nix {})
   ];
 
-  cudaPackages = customisation.makeScope newScope (
-    fixedPoints.extends composedExtension passthruFunction
-  );
+  cudaPackages = makeScopeWithSplicing' {
+    otherSplices = generateSplicesForMkScope "cudaPackages";
+    f = fixedPoints.extends composedExtension passthruFunction;
+  };
 in
 cudaPackages // { inherit __attrsFailEvaluation; }

From c86cead2478428d1d73535bf801120576b87df3f Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Tue, 23 Jan 2024 03:24:40 +0000
Subject: [PATCH 03/34] cuda-modules/setup-hooks: introduce helper function and
 add comments about callPackage

---
 .../cuda-modules/setup-hooks/extension.nix    | 136 +++++++++---------
 1 file changed, 69 insertions(+), 67 deletions(-)

diff --git a/pkgs/development/cuda-modules/setup-hooks/extension.nix b/pkgs/development/cuda-modules/setup-hooks/extension.nix
index b9afd4f1998cd..32483c9e200eb 100644
--- a/pkgs/development/cuda-modules/setup-hooks/extension.nix
+++ b/pkgs/development/cuda-modules/setup-hooks/extension.nix
@@ -1,67 +1,67 @@
-final: _: {
+let
+  createSetupHooks =
+    setupHooksAttrs: final: prev:
+    let
+      # It is imperative that we use `final.callPackage` to create these setup hooks, as it allows us access to the spliced
+      # package sets.
+      inherit (final) callPackage;
+
+      # NOTE(@connorbaker): We MUST use `lib` from `prev` because the attribute names CAN NOT depend on `final`.
+      inherit (prev.lib.attrsets) mapAttrs;
+
+      aliases = {
+        # Deprecated: an alias kept for compatibility. Consider removing after 24.11
+        autoAddOpenGLRunpathHook = final.autoAddDriverRunpath;
+      };
+    in
+    mapAttrs (_: value: callPackage value { }) setupHooksAttrs // aliases;
+in
+createSetupHooks {
   # Helper hook used in both autoAddCudaCompatRunpath and
   # autoAddDriverRunpath that applies a generic patching action to all elf
   # files with a dynamic linking section.
   autoFixElfFiles =
-    final.callPackage
-      (
-        {makeSetupHook}:
-         makeSetupHook
-          {
-            name = "auto-fix-elf-files";
-          }
-          ./auto-fix-elf-files.sh
-      )
-      {};
+    { makeSetupHook }: makeSetupHook { name = "auto-fix-elf-files"; } ./auto-fix-elf-files.sh;
 
   # Internal hook, used by cudatoolkit and cuda redist packages
   # to accommodate automatic CUDAToolkit_ROOT construction
   markForCudatoolkitRootHook =
-    final.callPackage
-      (
-        {makeSetupHook}:
-        makeSetupHook {name = "mark-for-cudatoolkit-root-hook";} ./mark-for-cudatoolkit-root-hook.sh
-      )
-      {};
+    { makeSetupHook }:
+    makeSetupHook { name = "mark-for-cudatoolkit-root-hook"; } ./mark-for-cudatoolkit-root-hook.sh;
 
   # Currently propagated by cuda_nvcc or cudatoolkit, rather than used directly
   setupCudaHook =
-    (final.callPackage
-      (
-        {makeSetupHook, backendStdenv}:
-        makeSetupHook
-          {
-            name = "setup-cuda-hook";
-
-            substitutions.setupCudaHook = placeholder "out";
+    { backendStdenv, makeSetupHook }:
+    makeSetupHook
+      {
+        name = "setup-cuda-hook";
 
-            # Point NVCC at a compatible compiler
-            substitutions.ccRoot = "${backendStdenv.cc}";
-
-            # Required in addition to ccRoot as otherwise bin/gcc is looked up
-            # when building CMakeCUDACompilerId.cu
-            substitutions.ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
-          }
-          ./setup-cuda-hook.sh
-      )
-      {}
-    );
+        substitutions = {
+          # Required in addition to ccRoot as otherwise bin/gcc is looked up
+          # when building CMakeCUDACompilerId.cu
+          ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
+          # Point NVCC at a compatible compiler
+          ccRoot = "${backendStdenv.cc}";
+          setupCudaHook = placeholder "out";
+        };
+      }
+      ./setup-cuda-hook.sh;
 
   autoAddDriverRunpath =
-    final.callPackage
-      (
-        {addDriverRunpath, autoFixElfFiles, makeSetupHook}:
-        makeSetupHook
-          {
-            name = "auto-add-opengl-runpath-hook";
-            propagatedBuildInputs = [addDriverRunpath autoFixElfFiles];
-          }
-          ./auto-add-driver-runpath-hook.sh
-      )
-      {};
-
-  # Deprecated: an alias kept for compatibility. Consider removing after 24.11
-  autoAddOpenGLRunpathHook = final.autoAddDriverRunpath;
+    {
+      addDriverRunpath,
+      autoFixElfFiles,
+      makeSetupHook,
+    }:
+    makeSetupHook
+      {
+        name = "auto-add-opengl-runpath-hook";
+        propagatedBuildInputs = [
+          addDriverRunpath
+          autoFixElfFiles
+        ];
+      }
+      ./auto-add-driver-runpath-hook.sh;
 
   # autoAddCudaCompatRunpath hook must be added AFTER `setupCudaHook`. Both
   # hooks prepend a path with `libcuda.so` to the `DT_RUNPATH` section of
@@ -69,23 +69,25 @@ final: _: {
   # it doesn't have any effect) and thus appear first. Meaning this hook must be
   # executed last.
   autoAddCudaCompatRunpath =
-    final.callPackage
-      (
-        {makeSetupHook, autoFixElfFiles, lib, flags, cuda_compat ? null }:
-        makeSetupHook
-          {
-            name = "auto-add-cuda-compat-runpath-hook";
-            propagatedBuildInputs = [autoFixElfFiles];
+    {
+      autoFixElfFiles,
+      cuda_compat ? null,
+      flags,
+      lib,
+      makeSetupHook,
+    }:
+    makeSetupHook
+      {
+        name = "auto-add-cuda-compat-runpath-hook";
+        propagatedBuildInputs = [ autoFixElfFiles ];
 
-            substitutions.libcudaPath = lib.optionalString flags.isJetsonBuild "${cuda_compat}/compat";
+        substitutions.libcudaPath = lib.optionalString flags.isJetsonBuild "${cuda_compat}/compat";
 
-            meta = {
-              broken = !flags.isJetsonBuild;
-              badPlatforms = lib.optionals (cuda_compat == null) lib.platforms.all;
-              platforms = cuda_compat.meta.platforms or [ ];
-            };
-          }
-          ./auto-add-cuda-compat-runpath.sh
-      )
-      {};
+        meta = {
+          broken = !flags.isJetsonBuild;
+          badPlatforms = lib.optionals (cuda_compat == null) lib.platforms.all;
+          platforms = cuda_compat.meta.platforms or [ ];
+        };
+      }
+      ./auto-add-cuda-compat-runpath.sh;
 }

From 798c380ff1346c75adc751f6a3e09e1c066d6213 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Tue, 23 Jan 2024 03:28:33 +0000
Subject: [PATCH 04/34] cuda-modules/generic-builders/manifest: use hostTarget
 autoAddCudaCompatRunpathHook

---
 .../development/cuda-modules/generic-builders/manifest.nix | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index 049c8936426d2..0a3c65f28fc30 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -204,7 +204,12 @@ backendStdenv.mkDerivation (
     ++ lib.optionals (pname != "cuda_compat" && flags.isJetsonBuild) [
       # autoAddCudaCompatRunpath must appear AFTER autoAddDriverRunpath.
       # See its documentation in ./setup-hooks/extension.nix.
-      autoAddCudaCompatRunpath
+      # NOTE(@connorbaker): Because autoAddCudaCompatRunpath is in nativeBuildInputs, it tries to use toolchains
+      # from buildPlatform, but that's not what we want. We want to use our host/target toolchains!
+      # To overcome this, we access the `__spliced` attribute and choose the `hostTarget` attribute.
+      # In the case the `__spliced` attribute doesn't exist, we just use the hook directly (because we're not
+      # cross-compiling).
+      autoAddCudaCompatRunpath.__spliced.hostTarget or autoAddCudaCompatRunpath
     ];
 
     buildInputs =

From 96cc89966d01d05e22cd36a3cde9eb79ef17794a Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Tue, 23 Jan 2024 03:33:38 +0000
Subject: [PATCH 05/34] cuda-modules/cuda/overrides: introduce helper function
 and add comments about callPackage

---
 .../cuda-modules/cuda/overrides.nix           | 613 +++++++++++-------
 pkgs/top-level/cuda-packages.nix              |   2 +-
 2 files changed, 364 insertions(+), 251 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index 31d03dd0fc73f..59d4639587ca5 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -1,284 +1,397 @@
-{cudaVersion, lib}:
+# NOTE(@connorbaker): None of the functions in this attribute set should need to access _final or _prev.
+# As such, they are prefixed with an underscore -- everything should be doable with the spliced package sets
+# provided to each function in the attribute set by `final.callPackage`.
 let
-  inherit (lib) attrsets lists strings;
-  # cudaVersionOlder : Version -> Boolean
-  cudaVersionOlder = strings.versionOlder cudaVersion;
-  # cudaVersionAtLeast : Version -> Boolean
-  cudaVersionAtLeast = strings.versionAtLeast cudaVersion;
+  filterAndCreateOverrides =
+    createOverrideAttrs: final: prev:
+    let
+      # It is imperative that we use `final.callPackage` to create these overrides, as it allows us access to the spliced
+      # package sets.
+      inherit (final) callPackage;
 
-  addBuildInputs =
-    drv: buildInputs:
-    drv.overrideAttrs (prevAttrs: {buildInputs = prevAttrs.buildInputs ++ buildInputs;});
+      # NOTE(@connorbaker): We MUST use `lib` from `prev` because the attribute names CAN NOT depend on `final`.
+      inherit (prev.lib.attrsets) filterAttrs mapAttrs;
+      inherit (prev.lib.trivial) pipe;
+    in
+    pipe createOverrideAttrs [
+      # NOTE: Filter out attributes that are not present in the previous version of
+      # the package set. This is necessary to prevent the appearance of attributes
+      # like `cuda_nvcc` in `cudaPackages_10_0`, which predates redistributables.
+      (filterAttrs (name: _: prev ? ${name}))
+      # NOTE: It is imperative that we use `final.callPackage` to perform overrides,
+      # as it allows us access to the spliced package sets.
+      # Pass the previous version of the package to the override function.
+      (mapAttrs (name: value: callPackage value { ${name} = prev.${name}; }))
+    ];
 in
-# NOTE: Filter out attributes that are not present in the previous version of
-# the package set. This is necessary to prevent the appearance of attributes
-# like `cuda_nvcc` in `cudaPackages_10_0, which predates redistributables.
-final: prev:
-attrsets.filterAttrs (attr: _: (builtins.hasAttr attr prev)) {
-  libcufile = prev.libcufile.overrideAttrs (
-    prevAttrs: {
-      buildInputs = prevAttrs.buildInputs ++ [
-        final.libcublas.lib
-        final.pkgs.numactl
-        final.pkgs.rdma-core
-      ];
-      # Before 11.7 libcufile depends on itself for some reason.
-      autoPatchelfIgnoreMissingDeps =
-        prevAttrs.autoPatchelfIgnoreMissingDeps
-        ++ lists.optionals (cudaVersionOlder "11.7") [ "libcufile.so.0" ];
-    }
-  );
+filterAndCreateOverrides {
+  libcufile =
+    {
+      cudaOlder,
+      lib,
+      libcublas,
+      libcufile,
+      numactl,
+      rdma-core,
+    }:
+    libcufile.overrideAttrs (
+      prevAttrs: {
+        buildInputs = prevAttrs.buildInputs ++ [
+          libcublas.lib
+          numactl
+          rdma-core
+        ];
+        # Before 11.7 libcufile depends on itself for some reason.
+        autoPatchelfIgnoreMissingDeps =
+          prevAttrs.autoPatchelfIgnoreMissingDeps
+          ++ lib.lists.optionals (cudaOlder "11.7") [ "libcufile.so.0" ];
+      }
+    );
 
-  libcusolver = addBuildInputs prev.libcusolver (
-    # Always depends on this
-    [final.libcublas.lib]
-    # Dependency from 12.0 and on
-    ++ lists.optionals (cudaVersionAtLeast "12.0") [final.libnvjitlink.lib]
-    # Dependency from 12.1 and on
-    ++ lists.optionals (cudaVersionAtLeast "12.1") [final.libcusparse.lib]
-  );
+  libcusolver =
+    {
+      cudaAtLeast,
+      lib,
+      libcublas,
+      libcusolver,
+      libcusparse ? null,
+      libnvjitlink ? null,
+    }:
+    libcusolver.overrideAttrs (
+      prevAttrs: {
+        buildInputs =
+          prevAttrs.buildInputs
+          # Always depends on this
+          ++ [ libcublas.lib ]
+          # Dependency from 12.0 and on
+          ++ lib.lists.optionals (cudaAtLeast "12.0") [ libnvjitlink.lib ]
+          # Dependency from 12.1 and on
+          ++ lib.lists.optionals (cudaAtLeast "12.1") [ libcusparse.lib ];
+      }
+    );
 
-  libcusparse = addBuildInputs prev.libcusparse (
-    lists.optionals (cudaVersionAtLeast "12.0") [final.libnvjitlink.lib]
-  );
+  libcusparse =
+    {
+      cudaAtLeast,
+      lib,
+      libcusparse,
+      libnvjitlink ? null,
+    }:
+    libcusparse.overrideAttrs (
+      prevAttrs: {
+        buildInputs =
+          prevAttrs.buildInputs
+          # Dependency from 12.0 and on
+          ++ lib.lists.optionals (cudaAtLeast "12.0") [ libnvjitlink.lib ];
+      }
+    );
 
-  cuda_cudart = prev.cuda_cudart.overrideAttrs (
-    prevAttrs: {
-      # Remove once cuda-find-redist-features has a special case for libcuda
-      outputs =
-        prevAttrs.outputs
-        ++ lists.optionals (!(builtins.elem "stubs" prevAttrs.outputs)) [ "stubs" ];
+  cuda_cudart =
+    {
+      buildPackages,
+      cuda_cudart,
+      lib,
+    }:
+    cuda_cudart.overrideAttrs (
+      prevAttrs: {
+        # Remove once cuda-find-redist-features has a special case for libcuda
+        outputs =
+          prevAttrs.outputs
+          ++ lib.lists.optionals (!(builtins.elem "stubs" prevAttrs.outputs)) [ "stubs" ];
 
-      allowFHSReferences = false;
+        allowFHSReferences = false;
 
-      # The libcuda stub's pkg-config doesn't follow the general pattern:
-      postPatch =
-        prevAttrs.postPatch or ""
-        + ''
-          while IFS= read -r -d $'\0' path ; do
-            sed -i \
-              -e "s|^libdir\s*=.*/lib\$|libdir=''${!outputLib}/lib/stubs|" \
-              -e "s|^Libs\s*:\(.*\)\$|Libs: \1 -Wl,-rpath,${final.pkgs.addDriverRunpath.driverLink}/lib|" \
-              "$path"
-          done < <(find -iname 'cuda-*.pc' -print0)
-        ''
-        + ''
-          # Namelink may not be enough, add a soname.
-          # Cf. https://gitlab.kitware.com/cmake/cmake/-/issues/25536
-          if [[ -f lib/stubs/libcuda.so && ! -f lib/stubs/libcuda.so.1 ]] ; then
-            ln -s libcuda.so lib/stubs/libcuda.so.1
-          fi
-        '';
+        # The libcuda stub's pkg-config doesn't follow the general pattern:
+        postPatch =
+          prevAttrs.postPatch or ""
+          + ''
+            while IFS= read -r -d $'\0' path ; do
+              sed -i \
+                -e "s|^libdir\s*=.*/lib\$|libdir=''${!outputLib}/lib/stubs|" \
+                -e "s|^Libs\s*:\(.*\)\$|Libs: \1 -Wl,-rpath,${buildPackages.addDriverRunpath.driverLink}/lib|" \
+                "$path"
+            done < <(find -iname 'cuda-*.pc' -print0)
+          ''
+          + ''
+            # Namelink may not be enough, add a soname.
+            # Cf. https://gitlab.kitware.com/cmake/cmake/-/issues/25536
+            if [[ -f lib/stubs/libcuda.so && ! -f lib/stubs/libcuda.so.1 ]] ; then
+              ln -s libcuda.so lib/stubs/libcuda.so.1
+            fi
+          '';
 
-      postFixup =
-        prevAttrs.postFixup or ""
-        + ''
-          moveToOutput lib/stubs "$stubs"
-          ln -s "$stubs"/lib/stubs/* "$stubs"/lib/
-          ln -s "$stubs"/lib/stubs "''${!outputLib}/lib/stubs"
-        '';
-    }
-  );
+        postFixup =
+          prevAttrs.postFixup or ""
+          + ''
+            moveToOutput lib/stubs "$stubs"
+            ln -s "$stubs"/lib/stubs/* "$stubs"/lib/
+            ln -s "$stubs"/lib/stubs "''${!outputLib}/lib/stubs"
+          '';
+      }
+    );
 
-  cuda_compat = prev.cuda_compat.overrideAttrs (
-    prevAttrs: {
-      autoPatchelfIgnoreMissingDeps = prevAttrs.autoPatchelfIgnoreMissingDeps ++ [
-        "libnvrm_gpu.so"
-        "libnvrm_mem.so"
-        "libnvdla_runtime.so"
-      ];
-      # `cuda_compat` only works on aarch64-linux, and only when building for Jetson devices.
-      badPlatformsConditions = prevAttrs.badPlatformsConditions // {
-        "Trying to use cuda_compat on aarch64-linux targeting non-Jetson devices" =
-          !final.flags.isJetsonBuild;
-      };
-      meta = prevAttrs.meta // {
-        # For cross-compilation, we need the hostPlatform to be included in order to fetch and build the package. This
-        # doesn't change the fact that it won't work on non-Jetson devices, so we only add it when building for Jetson.
-        platforms = prevAttrs.meta.platforms ++ lib.optionals final.flags.isJetsonBuild [ "x86_64-linux" ];
-      };
-    }
-  );
+  cuda_compat =
+    {
+      cuda_compat,
+      flags,
+      lib,
+    }:
+    cuda_compat.overrideAttrs (
+      prevAttrs: {
+        autoPatchelfIgnoreMissingDeps = prevAttrs.autoPatchelfIgnoreMissingDeps ++ [
+          "libnvrm_gpu.so"
+          "libnvrm_mem.so"
+          "libnvdla_runtime.so"
+        ];
+        # `cuda_compat` only works on aarch64-linux, and only when building for Jetson devices.
+        badPlatformsConditions = prevAttrs.badPlatformsConditions // {
+          "Trying to use cuda_compat on aarch64-linux targeting non-Jetson devices" = !flags.isJetsonBuild;
+        };
+        meta = prevAttrs.meta // {
+          # For cross-compilation, we need the hostPlatform to be included in order to fetch and build the package. This
+          # doesn't change the fact that it won't work on non-Jetson devices, so we only add it when building for Jetson.
+          platforms = prevAttrs.meta.platforms ++ lib.lists.optionals flags.isJetsonBuild [ "x86_64-linux" ];
+        };
+      }
+    );
 
-  cuda_gdb = addBuildInputs prev.cuda_gdb (
-    # x86_64 only needs gmp from 12.0 and on
-    lists.optionals (cudaVersionAtLeast "12.0") [final.pkgs.gmp]
-  );
+  cuda_gdb =
+    {
+      cuda_gdb,
+      cudaAtLeast,
+      gmp,
+      lib,
+    }:
+    cuda_gdb.overrideAttrs (
+      prevAttrs: {
+        buildInputs =
+          prevAttrs.buildInputs
+          # x86_64 only needs gmp from 12.0 and on
+          ++ lib.lists.optionals (cudaAtLeast "12.0") [ gmp ];
+      }
+    );
 
-  cuda_nvcc = prev.cuda_nvcc.overrideAttrs (
-    oldAttrs:
-    let
-      # This replicates the logic in stdenvAdapters.useLibsFrom, except we use
-      # gcc from pkgsHostTarget and not from buildPackages.
-      ccForLibs-wrapper = final.pkgs.stdenv.cc;
-      gccMajorVersion = final.nvccCompatibilities.${cudaVersion}.gccMaxMajorVersion;
-      cc = final.pkgs.wrapCCWith {
-        cc = final.pkgs."gcc${gccMajorVersion}".cc;
-        useCcForLibs = true;
-        gccForLibs = ccForLibs-wrapper.cc;
-      };
-    in
+  cuda_nvcc =
     {
+      backendStdenv,
+      buildPackages,
+      cuda_cudart,
+      cuda_nvcc,
+      cudaAtLeast,
+      cudaOlder,
+      lib,
+      setupCudaHook,
+    }:
+    cuda_nvcc.overrideAttrs (
+      prevAttrs: {
+        # Remove once cuda-find-redist-features has a special case for libcuda
+        outputs =
+          prevAttrs.outputs
+          ++ lib.lists.optionals (!(builtins.elem "lib" prevAttrs.outputs)) [ "lib" ];
 
-      outputs = oldAttrs.outputs ++ lists.optionals (!(builtins.elem "lib" oldAttrs.outputs)) [ "lib" ];
+        # Patch the nvcc.profile.
+        # Syntax:
+        # - `=` for assignment,
+        # - `?=` for conditional assignment,
+        # - `+=` to "prepend",
+        # - `=+` to "append".
 
-      # Patch the nvcc.profile.
-      # Syntax:
-      # - `=` for assignment,
-      # - `?=` for conditional assignment,
-      # - `+=` to "prepend",
-      # - `=+` to "append".
+        # Cf. https://web.archive.org/web/20230308044351/https://arcb.csc.ncsu.edu/~mueller/cluster/nvidia/2.0/nvcc_2.0.pdf
 
-      # Cf. https://web.archive.org/web/20230308044351/https://arcb.csc.ncsu.edu/~mueller/cluster/nvidia/2.0/nvcc_2.0.pdf
+        # We set all variables with the lowest priority (=+), but we do force
+        # nvcc to use the fixed backend toolchain. Cf. comments in
+        # backend-stdenv.nix
 
-      # We set all variables with the lowest priority (=+), but we do force
-      # nvcc to use the fixed backend toolchain. Cf. comments in
-      # backend-stdenv.nix
+        nativeBuildInputs = prevAttrs.nativeBuildInputs ++ [ backendStdenv.cc ];
 
-      postPatch =
-        (oldAttrs.postPatch or "")
-        + ''
-          substituteInPlace bin/nvcc.profile \
-            --replace \
-              '$(TOP)/lib' \
-              "''${!outputLib}/lib" \
-            --replace \
-              '$(TOP)/$(_NVVM_BRANCH_)' \
-              "''${!outputBin}/nvvm" \
-            --replace \
-              '$(TOP)/$(_TARGET_DIR_)/include' \
-              "''${!outputDev}/include"
+        postPatch =
+          (prevAttrs.postPatch or "")
+          + ''
+            echo "Running the cuda_nvcc postPatch"
+            substituteInPlace bin/nvcc.profile \
+              --replace \
+                '$(TOP)/lib' \
+                "''${!outputLib}/lib" \
+              --replace \
+                '$(TOP)/$(_NVVM_BRANCH_)' \
+                "''${!outputBin}/nvvm" \
+              --replace \
+                '$(TOP)/$(_TARGET_DIR_)/include' \
+                "''${!outputDev}/include"
 
-          cat << EOF >> bin/nvcc.profile
+            cat << EOF >> bin/nvcc.profile
 
-          # Fix a compatible backend compiler
-          PATH += ${lib.getBin cc}/bin:
+            # Fix a compatible backend compiler
+            PATH += ${lib.getBin backendStdenv.cc}/bin:
 
-          # Expose the split-out nvvm
-          LIBRARIES =+ -L''${!outputBin}/nvvm/lib
-          INCLUDES =+ -I''${!outputBin}/nvvm/include
+            # Expose the split-out nvvm
+            LIBRARIES =+ -L''${!outputBin}/nvvm/lib
+            INCLUDES =+ -I''${!outputBin}/nvvm/include
 
-          # Expose cudart and the libcuda stubs
-          LIBRARIES =+ -L$static/lib" "-L${final.cuda_cudart.lib}/lib -L${final.cuda_cudart.lib}/lib/stubs
-          INCLUDES =+ -I${final.cuda_cudart.dev}/include
-          EOF
-        '';
+            # Expose cudart and the libcuda stubs
+            LIBRARIES =+ -L$static/lib" "-L${cuda_cudart.lib}/lib -L${cuda_cudart.lib}/lib/stubs
+            INCLUDES =+ -I${cuda_cudart.dev}/include
+            EOF
+          '';
 
-      propagatedBuildInputs = [ final.setupCudaHook ];
+        propagatedNativeBuildInputs = [ setupCudaHook ];
 
-      postInstall =
-        (oldAttrs.postInstall or "")
-        + ''
-          moveToOutput "nvvm" "''${!outputBin}"
-        '';
+        postInstall =
+          (prevAttrs.postInstall or "")
+          + ''
+            moveToOutput "nvvm" "''${!outputBin}"
+          '';
 
-      # The nvcc and cicc binaries contain hard-coded references to /usr
-      allowFHSReferences = true;
+        # The nvcc and cicc binaries contain hard-coded references to /usr
+        allowFHSReferences = true;
 
-      meta = (oldAttrs.meta or { }) // {
-        mainProgram = "nvcc";
-      };
-    }
-  );
+        meta = (prevAttrs.meta or { }) // {
+          mainProgram = "nvcc";
+        };
+      }
+    );
 
-  cuda_nvprof = prev.cuda_nvprof.overrideAttrs (
-    prevAttrs: {buildInputs = prevAttrs.buildInputs ++ [final.cuda_cupti.lib];}
-  );
+  cuda_nvprof =
+    { cuda_cupti, cuda_nvprof }:
+    cuda_nvprof.overrideAttrs (
+      prevAttrs: { buildInputs = prevAttrs.buildInputs ++ [ cuda_cupti.lib ]; }
+    );
 
-  cuda_demo_suite = addBuildInputs prev.cuda_demo_suite [
-    final.pkgs.freeglut
-    final.pkgs.libGLU
-    final.pkgs.libglvnd
-    final.pkgs.mesa
-    final.libcufft.lib
-    final.libcurand.lib
-  ];
+  cuda_demo_suite =
+    {
+      cuda_demo_suite,
+      freeglut,
+      lib,
+      libcufft,
+      libcurand,
+      libGLU,
+      libglvnd,
+      mesa,
+    }:
+    cuda_demo_suite.overrideAttrs (
+      prevAttrs: {
+        buildInputs = prevAttrs.buildInputs ++ [
+          freeglut
+          libcufft.lib
+          libcurand.lib
+          libGLU
+          libglvnd
+          mesa
+        ];
+      }
+    );
 
-  nsight_compute = prev.nsight_compute.overrideAttrs (
-    prevAttrs: {
-      nativeBuildInputs =
-        prevAttrs.nativeBuildInputs
-        ++ (
-          if (strings.versionOlder prev.nsight_compute.version "2022.2.0") then
-            [final.pkgs.qt5.wrapQtAppsHook]
-          else
-            [final.pkgs.qt6.wrapQtAppsHook]
-        );
-      buildInputs =
-        prevAttrs.buildInputs
-        ++ (
-          if (strings.versionOlder prev.nsight_compute.version "2022.2.0") then
-            [final.pkgs.qt5.qtwebview]
-          else
-            [final.pkgs.qt6.qtwebview]
-        );
-    }
-  );
+  nsight_compute =
+    {
+      lib,
+      nsight_compute,
+      qt5 ? null,
+      qt6 ? null,
+    }:
+    nsight_compute.overrideAttrs (
+      prevAttrs: {
+        nativeBuildInputs =
+          prevAttrs.nativeBuildInputs
+          ++ (
+            if (lib.strings.versionOlder prevAttrs.version "2022.2.0") then
+              [ qt5.wrapQtAppsHook ]
+            else
+              [ qt6.wrapQtAppsHook ]
+          );
+        buildInputs =
+          prevAttrs.buildInputs
+          ++ (
+            if (lib.strings.versionOlder prevAttrs.version "2022.2.0") then
+              [ qt5.qtwebview ]
+            else
+              [ qt6.qtwebview ]
+          );
+      }
+    );
 
-  nsight_systems = prev.nsight_systems.overrideAttrs (
-    prevAttrs:
-    let
-      qt = if lib.versionOlder prevAttrs.version "2022.4.2.1" then final.pkgs.qt5 else final.pkgs.qt6;
-      qtwayland =
-        if lib.versions.major qt.qtbase.version == "5" then
-          lib.getBin qt.qtwayland
-        else
-          lib.getLib qt.qtwayland;
-      qtWaylandPlugins = "${qtwayland}/${qt.qtbase.qtPluginPrefix}";
-    in
+  nsight_systems =
     {
-      # An ad hoc replacement for
-      # https://github.com/ConnorBaker/cuda-redist-find-features/issues/11
-      env.rmPatterns = toString [
-        "nsight-systems/*/*/libQt*"
-        "nsight-systems/*/*/libstdc*"
-        "nsight-systems/*/*/libboost*"
-        "nsight-systems/*/*/lib{ssl,ssh,crypto}*"
-        "nsight-systems/*/*/lib{arrow,jpeg}*"
-        "nsight-systems/*/*/Mesa"
-        "nsight-systems/*/*/python/bin/python"
-        "nsight-systems/*/*/libexec"
-        "nsight-systems/*/*/Plugins"
-      ];
-      postPatch =
-        prevAttrs.postPatch or ""
-        + ''
-          for path in $rmPatterns ; do
-            rm -r "$path"
-          done
-        '';
-      nativeBuildInputs = prevAttrs.nativeBuildInputs ++ [ qt.wrapQtAppsHook ];
-      buildInputs = prevAttrs.buildInputs ++ [
-        final.cuda_cudart.stubs
-        final.pkgs.alsa-lib
-        final.pkgs.boost178
-        final.pkgs.e2fsprogs
-        final.pkgs.gst_all_1.gst-plugins-base
-        final.pkgs.gst_all_1.gstreamer
-        final.pkgs.nss
-        final.pkgs.numactl
-        final.pkgs.pulseaudio
-        final.pkgs.rdma-core
-        final.pkgs.ucx
-        final.pkgs.wayland
-        final.pkgs.xorg.libXcursor
-        final.pkgs.xorg.libXdamage
-        final.pkgs.xorg.libXrandr
-        final.pkgs.xorg.libXtst
-        qt.qtbase
-        (qt.qtdeclarative or qt.full)
-        (qt.qtsvg or qt.full)
-        qtWaylandPlugins
-      ];
+      alsa-lib,
+      boost178,
+      cuda_cudart,
+      cudaOlder,
+      e2fsprogs,
+      gst_all_1,
+      lib,
+      nsight_systems,
+      nss,
+      numactl,
+      pulseaudio,
+      qt5 ? null,
+      qt6 ? null,
+      rdma-core,
+      ucx,
+      wayland,
+      xorg,
+    }:
+    nsight_systems.overrideAttrs (
+      prevAttrs:
+      let
+        qt = if lib.strings.versionOlder prevAttrs.version "2022.4.2.1" then qt5 else qt6;
+        qtwayland =
+          if lib.versions.major qt.qtbase.version == "5" then
+            lib.getBin qt.qtwayland
+          else
+            lib.getLib qt.qtwayland;
+        qtWaylandPlugins = "${qtwayland}/${qt.qtbase.qtPluginPrefix}";
+      in
+      {
+        # An ad hoc replacement for
+        # https://github.com/ConnorBaker/cuda-redist-find-features/issues/11
+        env.rmPatterns = toString [
+          "nsight-systems/*/*/libQt*"
+          "nsight-systems/*/*/libstdc*"
+          "nsight-systems/*/*/libboost*"
+          "nsight-systems/*/*/lib{ssl,ssh,crypto}*"
+          "nsight-systems/*/*/lib{arrow,jpeg}*"
+          "nsight-systems/*/*/Mesa"
+          "nsight-systems/*/*/python/bin/python"
+          "nsight-systems/*/*/libexec"
+          "nsight-systems/*/*/Plugins"
+        ];
+        postPatch =
+          prevAttrs.postPatch or ""
+          + ''
+            for path in $rmPatterns ; do
+              rm -r "$path"
+            done
+          '';
+        nativeBuildInputs = prevAttrs.nativeBuildInputs ++ [ qt.wrapQtAppsHook ];
+        buildInputs = prevAttrs.buildInputs ++ [
+          (qt.qtdeclarative or qt.full)
+          (qt.qtsvg or qt.full)
+          cuda_cudart.stubs
+          gst_all_1.gst-plugins-base
+          gst_all_1.gstreamer
+          nss
+          numactl
+          pulseaudio
+          qt.qtbase
+          qtWaylandPlugins
+          rdma-core
+          ucx
+          wayland
+          xorg.libXcursor
+          xorg.libXdamage
+          xorg.libXrandr
+          xorg.libXtst
+        ];
 
-      # Older releases require boost 1.70 deprecated in Nixpkgs
-      meta.broken = prevAttrs.meta.broken or false || lib.versionOlder final.cudaVersion "11.8";
-    }
-  );
+        # Older releases require boost 1.70 deprecated in Nixpkgs
+        meta.broken = prevAttrs.meta.broken or false || cudaOlder "11.8";
+      }
+    );
 
-  nvidia_driver = prev.nvidia_driver.overrideAttrs {
-    # No need to support this package as we have drivers already
-    # in linuxPackages.
-    meta.broken = true;
-  };
+  nvidia_driver =
+    { nvidia_driver }:
+    nvidia_driver.overrideAttrs {
+      # No need to support this package as we have drivers already
+      # in linuxPackages.
+      meta.broken = true;
+    };
 }
diff --git a/pkgs/top-level/cuda-packages.nix b/pkgs/top-level/cuda-packages.nix
index eb0efcb10865f..8191d6035cbb0 100644
--- a/pkgs/top-level/cuda-packages.nix
+++ b/pkgs/top-level/cuda-packages.nix
@@ -87,7 +87,7 @@ let
   composedExtension = fixedPoints.composeManyExtensions [
     (builtins.import ../development/cuda-modules/setup-hooks/extension.nix)
     (builtins.import ../development/cuda-modules/cuda/extension.nix {inherit cudaVersion lib;})
-    (builtins.import ../development/cuda-modules/cuda/overrides.nix {inherit cudaVersion lib;})
+    (builtins.import ../development/cuda-modules/cuda/overrides.nix)
     # (callPackage ../development/cuda-modules/generic-builders/multiplex.nix {
     #   inherit cudaVersion flags mkVersionedPackageName;
     #   pname = "cudnn";

From 40aab07fb59ec88c396eb4959b1bc92fec1b3d69 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Tue, 23 Jan 2024 03:36:03 +0000
Subject: [PATCH 06/34] cuda-modules/cuda/overrides: add TODOs for @connorbaker

---
 pkgs/development/cuda-modules/cuda/overrides.nix | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index 59d4639587ca5..f8c96a538be12 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -185,6 +185,8 @@ filterAndCreateOverrides {
     cuda_nvcc.overrideAttrs (
       prevAttrs: {
         # Remove once cuda-find-redist-features has a special case for libcuda
+        # TODO(@connorbaker): The order of build outputs matters as we traverse them when creating split outputs.
+        # The `lib` output cannot come after `static` as it moves all the static libraries back to the `lib` output.
         outputs =
           prevAttrs.outputs
           ++ lib.lists.optionals (!(builtins.elem "lib" prevAttrs.outputs)) [ "lib" ];
@@ -204,6 +206,7 @@ filterAndCreateOverrides {
 
         nativeBuildInputs = prevAttrs.nativeBuildInputs ++ [ backendStdenv.cc ];
 
+        # TODO(@connorbaker): We should specify the spliced version of backendStdenv and cuda_cudart to use here.
         postPatch =
           (prevAttrs.postPatch or "")
           + ''

From 2057e243b20e3b34097b2fba0face2cb5e175e36 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Tue, 30 Jan 2024 17:38:09 +0000
Subject: [PATCH 07/34] cuda-modules: use hostPlatform when downloading
 binaries

---
 pkgs/development/cuda-modules/cutensor/extension.nix      | 4 ++--
 .../cuda-modules/generic-builders/manifest.nix            | 8 +++-----
 .../cuda-modules/generic-builders/multiplex.nix           | 4 ++--
 pkgs/development/cuda-modules/tensorrt/fixup.nix          | 6 +++---
 4 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/pkgs/development/cuda-modules/cutensor/extension.nix b/pkgs/development/cuda-modules/cutensor/extension.nix
index 38b0b03248aad..29959fc013f99 100644
--- a/pkgs/development/cuda-modules/cutensor/extension.nix
+++ b/pkgs/development/cuda-modules/cutensor/extension.nix
@@ -13,9 +13,9 @@
 # - Instead of providing different releases for each version of CUDA, CuTensor has multiple subdirectories in `lib`
 #   -- one for each version of CUDA.
 {
+  backendStdenv,
   cudaVersion,
   flags,
-  targetPlatform,
   lib,
   mkVersionedPackageName,
 }:
@@ -93,7 +93,7 @@ let
   # LibPath are not constant across the same release -- one platform may support fewer
   # CUDA versions than another.
   # redistArch :: String
-  redistArch = flags.getRedistArch targetPlatform.system;
+  redistArch = flags.getRedistArch backendStdenv.hostPlatform.system;
   # platformIsSupported :: Manifests -> Boolean
   platformIsSupported =
     {feature, ...}:
diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index 0a3c65f28fc30..fc1b899d1146c 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -10,7 +10,6 @@
   markForCudatoolkitRootHook,
   flags,
   stdenv,
-  targetPlatform,
   # Builder-specific arguments
   # Short package name (e.g., "cuda_cccl")
   # pname : String
@@ -32,7 +31,6 @@ let
   inherit (lib)
     attrsets
     lists
-    meta
     strings
     trivial
     licenses
@@ -45,10 +43,10 @@ let
   supportedRedistArchs = builtins.attrNames featureRelease;
   # redistArch :: String
   # The redistArch is the name of the architecture for which the redistributable is built.
-  # It is `"unsupported"` if the redistributable is not supported on the target platform.
-  redistArch = flags.getRedistArch targetPlatform.system;
+  # It is `"unsupported"` if the redistributable is not supported on the hostPlatform.
+  redistArch = flags.getRedistArch backendStdenv.hostPlatform.system;
 
-  sourceMatchesHost = flags.getNixSystem redistArch == stdenv.hostPlatform.system;
+  sourceMatchesHost = flags.getNixSystem redistArch == backendStdenv.hostPlatform.system;
 in
 backendStdenv.mkDerivation (
   finalAttrs: {
diff --git a/pkgs/development/cuda-modules/generic-builders/multiplex.nix b/pkgs/development/cuda-modules/generic-builders/multiplex.nix
index deeb2da6e0042..1cb6d8462b7dc 100644
--- a/pkgs/development/cuda-modules/generic-builders/multiplex.nix
+++ b/pkgs/development/cuda-modules/generic-builders/multiplex.nix
@@ -1,9 +1,9 @@
 {
   # callPackage-provided arguments
+  backendStdenv,
   lib,
   cudaVersion,
   flags,
-  targetPlatform,
   # Expected to be passed by the caller
   mkVersionedPackageName,
   # pname :: String
@@ -74,7 +74,7 @@ let
   # Get all of the packages for our given platform.
   # redistArch :: String
   # Value is `"unsupported"` if the platform is not supported.
-  redistArch = flags.getRedistArch targetPlatform.system;
+  redistArch = flags.getRedistArch backendStdenv.hostPlatform.system;
 
   preferable =
     p1: p2: (isSupported p2 -> isSupported p1) && (strings.versionAtLeast p1.version p2.version);
diff --git a/pkgs/development/cuda-modules/tensorrt/fixup.nix b/pkgs/development/cuda-modules/tensorrt/fixup.nix
index c6cbd137a0e4c..27851d3e1e56f 100644
--- a/pkgs/development/cuda-modules/tensorrt/fixup.nix
+++ b/pkgs/development/cuda-modules/tensorrt/fixup.nix
@@ -1,7 +1,7 @@
 {
+  backendStdenv,
   cudaVersion,
   final,
-  targetPlatform,
   lib,
   mkVersionedPackageName,
   package,
@@ -18,7 +18,7 @@ let
     versions
     ;
   # targetArch :: String
-  targetArch = attrsets.attrByPath [ targetPlatform.system ] "unsupported" {
+  targetArch = attrsets.attrByPath [ backendStdenv.hostPlatform.system ] "unsupported" {
     x86_64-linux = "x86_64-linux-gnu";
     aarch64-linux = "aarch64-linux-gnu";
   };
@@ -106,7 +106,7 @@ finalAttrs: prevAttrs: {
   meta = prevAttrs.meta // {
     badPlatforms =
       prevAttrs.meta.badPlatforms or [ ]
-      ++ lib.optionals (targetArch == "unsupported") [ targetPlatform.system ];
+      ++ lib.optionals (targetArch == "unsupported") [ backendStdenv.hostPlatform.system ];
     homepage = "https://developer.nvidia.com/tensorrt";
     maintainers = prevAttrs.meta.maintainers ++ [maintainers.aidalgol];
   };

From cd632d812adc51981c2662ce8b168d70ca342ac7 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 20 Mar 2024 17:18:17 +0000
Subject: [PATCH 08/34] cuda-modules/setup-hooks: switch to directory structure

---
 .../auto-add-cuda-compat-runpath-hook.sh}     |   0
 .../default.nix                               |  26 +++++
 .../auto-add-driver-runpath-hook.sh           |   0
 .../auto-add-driver-runpath-hook/default.nix  |  14 +++
 .../auto-fix-elf-files-hook.sh}               |   2 +-
 .../auto-fix-elf-files-hook/default.nix       |   4 +
 .../cuda-modules/setup-hooks/extension.nix    | 102 ++----------------
 .../default.nix                               |   4 +
 .../mark-for-cudatoolkit-root-hook.sh         |   0
 .../setup-hooks/setup-cuda-hook/default.nix   |  16 +++
 .../{ => setup-cuda-hook}/setup-cuda-hook.sh  |   0
 11 files changed, 75 insertions(+), 93 deletions(-)
 rename pkgs/development/cuda-modules/setup-hooks/{auto-add-cuda-compat-runpath.sh => auto-add-cuda-compat-runpath-hook/auto-add-cuda-compat-runpath-hook.sh} (100%)
 create mode 100644 pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix
 rename pkgs/development/cuda-modules/setup-hooks/{ => auto-add-driver-runpath-hook}/auto-add-driver-runpath-hook.sh (100%)
 create mode 100644 pkgs/development/cuda-modules/setup-hooks/auto-add-driver-runpath-hook/default.nix
 rename pkgs/development/cuda-modules/setup-hooks/{auto-fix-elf-files.sh => auto-fix-elf-files-hook/auto-fix-elf-files-hook.sh} (97%)
 create mode 100644 pkgs/development/cuda-modules/setup-hooks/auto-fix-elf-files-hook/default.nix
 create mode 100644 pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/default.nix
 rename pkgs/development/cuda-modules/setup-hooks/{ => mark-for-cudatoolkit-root-hook}/mark-for-cudatoolkit-root-hook.sh (100%)
 create mode 100644 pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/default.nix
 rename pkgs/development/cuda-modules/setup-hooks/{ => setup-cuda-hook}/setup-cuda-hook.sh (100%)

diff --git a/pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath.sh b/pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath-hook/auto-add-cuda-compat-runpath-hook.sh
similarity index 100%
rename from pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath.sh
rename to pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath-hook/auto-add-cuda-compat-runpath-hook.sh
diff --git a/pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix b/pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix
new file mode 100644
index 0000000000000..f253331fb24b0
--- /dev/null
+++ b/pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix
@@ -0,0 +1,26 @@
+# autoAddCudaCompatRunpath hook must be added AFTER `setupCudaHook`. Both
+# hooks prepend a path with `libcuda.so` to the `DT_RUNPATH` section of
+# patched elf files, but `cuda_compat` path must take precedence (otherwise,
+# it doesn't have any effect) and thus appear first. Meaning this hook must be
+# executed last.
+{
+  autoFixElfFiles,
+  cuda_compat ? null,
+  flags,
+  lib,
+  makeSetupHook,
+}:
+makeSetupHook
+  {
+    name = "auto-add-cuda-compat-runpath-hook";
+    propagatedBuildInputs = [ autoFixElfFiles ];
+
+    substitutions.libcudaPath = lib.optionalString flags.isJetsonBuild "${cuda_compat}/compat";
+
+    meta = {
+      broken = !flags.isJetsonBuild;
+      badPlatforms = lib.optionals (cuda_compat == null) lib.platforms.all;
+      platforms = cuda_compat.meta.platforms or [ ];
+    };
+  }
+  ./auto-add-cuda-compat-runpath-hook.sh
diff --git a/pkgs/development/cuda-modules/setup-hooks/auto-add-driver-runpath-hook.sh b/pkgs/development/cuda-modules/setup-hooks/auto-add-driver-runpath-hook/auto-add-driver-runpath-hook.sh
similarity index 100%
rename from pkgs/development/cuda-modules/setup-hooks/auto-add-driver-runpath-hook.sh
rename to pkgs/development/cuda-modules/setup-hooks/auto-add-driver-runpath-hook/auto-add-driver-runpath-hook.sh
diff --git a/pkgs/development/cuda-modules/setup-hooks/auto-add-driver-runpath-hook/default.nix b/pkgs/development/cuda-modules/setup-hooks/auto-add-driver-runpath-hook/default.nix
new file mode 100644
index 0000000000000..97d020b2129d5
--- /dev/null
+++ b/pkgs/development/cuda-modules/setup-hooks/auto-add-driver-runpath-hook/default.nix
@@ -0,0 +1,17 @@
+# Setup hook wrapping ./auto-add-driver-runpath-hook.sh. It propagates
+# `addDriverRunpath` and `autoFixElfFiles` to every package that uses this
+# hook. The derivation name matches the hook's directory and script name.
+{
+  addDriverRunpath,
+  autoFixElfFiles,
+  makeSetupHook,
+}:
+makeSetupHook
+  {
+    name = "auto-add-driver-runpath-hook";
+    propagatedBuildInputs = [
+      addDriverRunpath
+      autoFixElfFiles
+    ];
+  }
+  ./auto-add-driver-runpath-hook.sh
diff --git a/pkgs/development/cuda-modules/setup-hooks/auto-fix-elf-files.sh b/pkgs/development/cuda-modules/setup-hooks/auto-fix-elf-files-hook/auto-fix-elf-files-hook.sh
similarity index 97%
rename from pkgs/development/cuda-modules/setup-hooks/auto-fix-elf-files.sh
rename to pkgs/development/cuda-modules/setup-hooks/auto-fix-elf-files-hook/auto-fix-elf-files-hook.sh
index 1d57dfb17a66d..084c14016fc0b 100644
--- a/pkgs/development/cuda-modules/setup-hooks/auto-fix-elf-files.sh
+++ b/pkgs/development/cuda-modules/setup-hooks/auto-fix-elf-files-hook/auto-fix-elf-files-hook.sh
@@ -2,7 +2,7 @@
 # List all dynamically linked ELF files in the outputs and apply a generic fix
 # action provided as a parameter (currently used to add the CUDA or the
 # cuda_compat driver to the runpath of binaries)
-echo "Sourcing cuda/fix-elf-files.sh"
+echo "Sourcing auto-fix-elf-files-hook"
 
 # Returns the exit code of patchelf --print-rpath.
 # A return code of 0 (success) means the ELF file has a dynamic section, while
diff --git a/pkgs/development/cuda-modules/setup-hooks/auto-fix-elf-files-hook/default.nix b/pkgs/development/cuda-modules/setup-hooks/auto-fix-elf-files-hook/default.nix
new file mode 100644
index 0000000000000..4550dc80edaef
--- /dev/null
+++ b/pkgs/development/cuda-modules/setup-hooks/auto-fix-elf-files-hook/default.nix
@@ -0,0 +1,4 @@
+# Helper hook used in both autoAddCudaCompatRunpath and
+# autoAddDriverRunpath that applies a generic patching action to all elf
+# files with a dynamic linking section.
+{ makeSetupHook }: makeSetupHook { name = "auto-fix-elf-files-hook"; } ./auto-fix-elf-files-hook.sh
diff --git a/pkgs/development/cuda-modules/setup-hooks/extension.nix b/pkgs/development/cuda-modules/setup-hooks/extension.nix
index 32483c9e200eb..57dc92900e77c 100644
--- a/pkgs/development/cuda-modules/setup-hooks/extension.nix
+++ b/pkgs/development/cuda-modules/setup-hooks/extension.nix
@@ -1,93 +1,11 @@
-let
-  createSetupHooks =
-    setupHooksAttrs: final: prev:
-    let
-      # It is imperative that we use `final.callPackage` to create these setup hooks, as it allows us access to the spliced
-      # package sets.
-      inherit (final) callPackage;
-
-      # NOTE(@connorbaker): We MUST use `lib` from `prev` because the attribute names CAN NOT depend on `final`.
-      inherit (prev.lib.attrsets) mapAttrs;
-
-      aliases = {
-        # Deprecated: an alias kept for compatibility. Consider removing after 24.11
-        autoAddOpenGLRunpathHook = final.autoAddDriverRunpath;
-      };
-    in
-    mapAttrs (_: value: callPackage value { }) setupHooksAttrs // aliases;
-in
-createSetupHooks {
-  # Helper hook used in both autoAddCudaCompatRunpath and
-  # autoAddDriverRunpath that applies a generic patching action to all elf
-  # files with a dynamic linking section.
-  autoFixElfFiles =
-    { makeSetupHook }: makeSetupHook { name = "auto-fix-elf-files"; } ./auto-fix-elf-files.sh;
-
-  # Internal hook, used by cudatoolkit and cuda redist packages
-  # to accommodate automatic CUDAToolkit_ROOT construction
-  markForCudatoolkitRootHook =
-    { makeSetupHook }:
-    makeSetupHook { name = "mark-for-cudatoolkit-root-hook"; } ./mark-for-cudatoolkit-root-hook.sh;
-
-  # Currently propagated by cuda_nvcc or cudatoolkit, rather than used directly
-  setupCudaHook =
-    { backendStdenv, makeSetupHook }:
-    makeSetupHook
-      {
-        name = "setup-cuda-hook";
-
-        substitutions = {
-          # Required in addition to ccRoot as otherwise bin/gcc is looked up
-          # when building CMakeCUDACompilerId.cu
-          ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
-          # Point NVCC at a compatible compiler
-          ccRoot = "${backendStdenv.cc}";
-          setupCudaHook = placeholder "out";
-        };
-      }
-      ./setup-cuda-hook.sh;
-
-  autoAddDriverRunpath =
-    {
-      addDriverRunpath,
-      autoFixElfFiles,
-      makeSetupHook,
-    }:
-    makeSetupHook
-      {
-        name = "auto-add-opengl-runpath-hook";
-        propagatedBuildInputs = [
-          addDriverRunpath
-          autoFixElfFiles
-        ];
-      }
-      ./auto-add-driver-runpath-hook.sh;
-
-  # autoAddCudaCompatRunpath hook must be added AFTER `setupCudaHook`. Both
-  # hooks prepend a path with `libcuda.so` to the `DT_RUNPATH` section of
-  # patched elf files, but `cuda_compat` path must take precedence (otherwise,
-  # it doesn't have any effect) and thus appear first. Meaning this hook must be
-  # executed last.
-  autoAddCudaCompatRunpath =
-    {
-      autoFixElfFiles,
-      cuda_compat ? null,
-      flags,
-      lib,
-      makeSetupHook,
-    }:
-    makeSetupHook
-      {
-        name = "auto-add-cuda-compat-runpath-hook";
-        propagatedBuildInputs = [ autoFixElfFiles ];
-
-        substitutions.libcudaPath = lib.optionalString flags.isJetsonBuild "${cuda_compat}/compat";
-
-        meta = {
-          broken = !flags.isJetsonBuild;
-          badPlatforms = lib.optionals (cuda_compat == null) lib.platforms.all;
-          platforms = cuda_compat.meta.platforms or [ ];
-        };
-      }
-      ./auto-add-cuda-compat-runpath.sh;
+final: _: {
+  autoAddCudaCompatRunpath = final.callPackage ./auto-add-cuda-compat-runpath-hook { };
+  autoAddDriverRunpath = final.callPackage ./auto-add-driver-runpath-hook { };
+  autoFixElfFiles = final.callPackage ./auto-fix-elf-files-hook { };
+  markForCudatoolkitRootHook = final.callPackage ./mark-for-cudatoolkit-root-hook { };
+  setupCudaHook = final.callPackage ./setup-cuda-hook { };
+
+  # Aliases
+  # Deprecated: an alias kept for compatibility. Consider removing after 24.11
+  autoAddOpenGLRunpathHook = final.autoAddDriverRunpath;
 }
diff --git a/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/default.nix b/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/default.nix
new file mode 100644
index 0000000000000..86ff28d6c41a1
--- /dev/null
+++ b/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/default.nix
@@ -0,0 +1,4 @@
+# Internal hook, used by cudatoolkit and cuda redist packages
+# to accommodate automatic CUDAToolkit_ROOT construction
+{ makeSetupHook }:
+makeSetupHook { name = "mark-for-cudatoolkit-root-hook"; } ./mark-for-cudatoolkit-root-hook.sh
diff --git a/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook.sh b/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh
similarity index 100%
rename from pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook.sh
rename to pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh
diff --git a/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/default.nix b/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/default.nix
new file mode 100644
index 0000000000000..6c5f299d4418c
--- /dev/null
+++ b/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/default.nix
@@ -0,0 +1,16 @@
+# Currently propagated by cuda_nvcc or cudatoolkit, rather than used directly
+{ backendStdenv, makeSetupHook }:
+makeSetupHook
+  {
+    name = "setup-cuda-hook";
+
+    substitutions = {
+      # Required in addition to ccRoot as otherwise bin/gcc is looked up
+      # when building CMakeCUDACompilerId.cu
+      ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
+      # Point NVCC at a compatible compiler
+      ccRoot = "${backendStdenv.cc}";
+      setupCudaHook = placeholder "out";
+    };
+  }
+  ./setup-cuda-hook.sh
diff --git a/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook.sh b/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh
similarity index 100%
rename from pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook.sh
rename to pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh

From 1aa56f017116b209784f770d65102e1f3fb7191a Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 20 Mar 2024 17:29:27 +0000
Subject: [PATCH 09/34] cuda-modules: always get *Platform from stdenv

---
 .../cuda-modules/cuda-library-samples/extension.nix    |  6 ++++--
 .../cuda-modules/cuda-samples/extension.nix            |  4 ++--
 pkgs/development/cuda-modules/cuda-samples/generic.nix |  3 ++-
 pkgs/development/cuda-modules/flags.nix                | 10 +++++++---
 .../cuda-modules/generic-builders/manifest.nix         |  6 ++++--
 pkgs/development/cuda-modules/tensorrt/fixup.nix       |  7 +++++--
 6 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda-library-samples/extension.nix b/pkgs/development/cuda-modules/cuda-library-samples/extension.nix
index 4cb34af732095..9092a653bd5e9 100644
--- a/pkgs/development/cuda-modules/cuda-library-samples/extension.nix
+++ b/pkgs/development/cuda-modules/cuda-library-samples/extension.nix
@@ -1,8 +1,10 @@
-{hostPlatform, lib}:
+{backendStdenv, lib}:
 let
+  inherit (backendStdenv.hostPlatform) isx86_64 isLinux;
+
   # Samples are built around the CUDA Toolkit, which is not available for
   # aarch64. Check for both CUDA version and platform.
-  platformIsSupported = hostPlatform.isx86_64 && hostPlatform.isLinux;
+  platformIsSupported = isx86_64 && isLinux;
 
   # Build our extension
   extension =
diff --git a/pkgs/development/cuda-modules/cuda-samples/extension.nix b/pkgs/development/cuda-modules/cuda-samples/extension.nix
index d41da90cd5d0e..90a124f80fa73 100644
--- a/pkgs/development/cuda-modules/cuda-samples/extension.nix
+++ b/pkgs/development/cuda-modules/cuda-samples/extension.nix
@@ -1,6 +1,6 @@
 {
+  backendStdenv,
   cudaVersion,
-  hostPlatform,
   lib,
 }:
 let
@@ -26,7 +26,7 @@ let
   # Samples are built around the CUDA Toolkit, which is not available for
   # aarch64. Check for both CUDA version and platform.
   cudaVersionIsSupported = cudaVersionToHash ? ${cudaVersion};
-  platformIsSupported = hostPlatform.isx86_64;
+  platformIsSupported = backendStdenv.hostPlatform.isx86_64;
   isSupported = cudaVersionIsSupported && platformIsSupported;
 
   # Build our extension
diff --git a/pkgs/development/cuda-modules/cuda-samples/generic.nix b/pkgs/development/cuda-modules/cuda-samples/generic.nix
index 3d1dac015e16c..e2a33cd7839c9 100644
--- a/pkgs/development/cuda-modules/cuda-samples/generic.nix
+++ b/pkgs/development/cuda-modules/cuda-samples/generic.nix
@@ -14,6 +14,7 @@
 }:
 let
   inherit (lib) lists strings;
+  inherit (backendStdenv.hostPlatform.parsed) cpu kernel;
 in
 backendStdenv.mkDerivation (
   finalAttrs: {
@@ -64,7 +65,7 @@ backendStdenv.mkDerivation (
     installPhase = ''
       runHook preInstall
 
-      install -Dm755 -t $out/bin bin/${backendStdenv.hostPlatform.parsed.cpu.name}/${backendStdenv.hostPlatform.parsed.kernel.name}/release/*
+      install -Dm755 -t $out/bin bin/${cpu.name}/${kernel.name}/release/*
 
       runHook postInstall
     '';
diff --git a/pkgs/development/cuda-modules/flags.nix b/pkgs/development/cuda-modules/flags.nix
index 50a69d6fd1d1d..89ddfe53aea96 100644
--- a/pkgs/development/cuda-modules/flags.nix
+++ b/pkgs/development/cuda-modules/flags.nix
@@ -2,14 +2,12 @@
 # Gpu :: AttrSet
 #   - See the documentation in ./gpus.nix.
 {
+  backendStdenv,
   config,
   cudaCapabilities ? (config.cudaCapabilities or []),
   cudaForwardCompat ? (config.cudaForwardCompat or true),
   lib,
   cudaVersion,
-  buildPlatform,
-  hostPlatform,
-  targetPlatform,
   # gpus :: List Gpu
   gpus,
 }:
@@ -22,6 +20,12 @@ let
     trivial
     ;
 
+  inherit (backendStdenv)
+    buildPlatform
+    hostPlatform
+    targetPlatform
+    ;
+
   # Flags are determined based on your CUDA toolkit by default.  You may benefit
   # from improved performance, reduced file size, or greater hardware support by
   # passing a configuration based on your specific GPU environment.
diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index fc1b899d1146c..cb49f98d77597 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -38,15 +38,17 @@ let
     sourceTypes
     ;
 
+  inherit (backendStdenv) hostPlatform;
+
   # Get the redist architectures for which package provides distributables.
   # These are used by meta.platforms.
   supportedRedistArchs = builtins.attrNames featureRelease;
   # redistArch :: String
   # The redistArch is the name of the architecture for which the redistributable is built.
   # It is `"unsupported"` if the redistributable is not supported on the hostPlatform.
-  redistArch = flags.getRedistArch backendStdenv.hostPlatform.system;
+  redistArch = flags.getRedistArch hostPlatform.system;
 
-  sourceMatchesHost = flags.getNixSystem redistArch == backendStdenv.hostPlatform.system;
+  sourceMatchesHost = flags.getNixSystem redistArch == hostPlatform.system;
 in
 backendStdenv.mkDerivation (
   finalAttrs: {
diff --git a/pkgs/development/cuda-modules/tensorrt/fixup.nix b/pkgs/development/cuda-modules/tensorrt/fixup.nix
index 27851d3e1e56f..f632f1b138d96 100644
--- a/pkgs/development/cuda-modules/tensorrt/fixup.nix
+++ b/pkgs/development/cuda-modules/tensorrt/fixup.nix
@@ -17,8 +17,11 @@ let
     strings
     versions
     ;
+
+  inherit (backendStdenv) hostPlatform;
+
   # targetArch :: String
-  targetArch = attrsets.attrByPath [ backendStdenv.hostPlatform.system ] "unsupported" {
+  targetArch = attrsets.attrByPath [ hostPlatform.system ] "unsupported" {
     x86_64-linux = "x86_64-linux-gnu";
     aarch64-linux = "aarch64-linux-gnu";
   };
@@ -106,7 +109,7 @@ finalAttrs: prevAttrs: {
   meta = prevAttrs.meta // {
     badPlatforms =
       prevAttrs.meta.badPlatforms or [ ]
-      ++ lib.optionals (targetArch == "unsupported") [ backendStdenv.hostPlatform.system ];
+      ++ lib.optionals (targetArch == "unsupported") [ hostPlatform.system ];
     homepage = "https://developer.nvidia.com/tensorrt";
     maintainers = prevAttrs.meta.maintainers ++ [maintainers.aidalgol];
   };

From 0fec676f01a2d159b60f410dcb5052273bf04aec Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 20 Mar 2024 17:46:48 +0000
Subject: [PATCH 10/34] cuda-modules/cuda/overrides: simplify callPackage then
 overrideAttrs pattern

---
 .../cuda-modules/cuda/overrides.nix           | 520 +++++++++---------
 1 file changed, 246 insertions(+), 274 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index f8c96a538be12..a8a2c173d3c17 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -1,268 +1,248 @@
-# NOTE(@connorbaker): None of the functions in this attribute set should need to access _final or _prev.
-# As such, they are prefixed with an underscore -- everything should be doable with the spliced package sets
-# provided to each function in the attribute set by `final.callPackage`.
 let
   filterAndCreateOverrides =
     createOverrideAttrs: final: prev:
     let
-      # It is imperative that we use `final.callPackage` to create these overrides, as it allows us access to the spliced
-      # package sets.
+      # It is imperative that we use `final.callPackage` to create these overrides
+      # as it allows us access to the spliced package sets.
       inherit (final) callPackage;
 
-      # NOTE(@connorbaker): We MUST use `lib` from `prev` because the attribute names CAN NOT depend on `final`.
+      # NOTE(@connorbaker): We MUST use `lib` from `prev` because the attribute
+      # names CAN NOT depend on `final`.
       inherit (prev.lib.attrsets) filterAttrs mapAttrs;
       inherit (prev.lib.trivial) pipe;
-    in
-    pipe createOverrideAttrs [
+
       # NOTE: Filter out attributes that are not present in the previous version of
       # the package set. This is necessary to prevent the appearance of attributes
       # like `cuda_nvcc` in `cudaPackages_10_0, which predates redistributables.
-      (filterAttrs (name: _: prev ? ${name}))
+      filterOutNewAttrs = filterAttrs (name: _: prev ? ${name});
+
       # NOTE: It is imperative that we use `final.callPackage` to perform overrides,
       # as it allows us access to the spliced package sets.
-      # Pass the previous version of the package to the override function.
-      (mapAttrs (name: value: callPackage value { ${name} = prev.${name}; }))
+      # Apply callPackage to each attribute value, yielding a value to be passed
+      # to overrideAttrs.
+      callPackageThenOverrideAttrs = mapAttrs (
+        name: value: prev.${name}.overrideAttrs (callPackage value { })
+      );
+    in
+    pipe createOverrideAttrs [
+      filterOutNewAttrs
+      callPackageThenOverrideAttrs
     ];
 in
+# Each attribute name is the name of an existing package in the previous version
+# of the package set.
+# The value is a function (to be provided to callPackage), which yields a value
+# to be provided to overrideAttrs. This allows us to override the attributes of
+# a package without losing access to the fixed point of the package set --
+# especially useful given that some packages may depend on each other!
 filterAndCreateOverrides {
   libcufile =
     {
       cudaOlder,
       lib,
       libcublas,
-      libcufile,
       numactl,
       rdma-core,
     }:
-    libcufile.overrideAttrs (
-      prevAttrs: {
-        buildInputs = prevAttrs.buildInputs ++ [
-          libcublas.lib
-          numactl
-          rdma-core
-        ];
-        # Before 11.7 libcufile depends on itself for some reason.
-        autoPatchelfIgnoreMissingDeps =
-          prevAttrs.autoPatchelfIgnoreMissingDeps
-          ++ lib.lists.optionals (cudaOlder "11.7") [ "libcufile.so.0" ];
-      }
-    );
+    prevAttrs: {
+      buildInputs = prevAttrs.buildInputs ++ [
+        libcublas.lib
+        numactl
+        rdma-core
+      ];
+      # Before 11.7 libcufile depends on itself for some reason.
+      autoPatchelfIgnoreMissingDeps =
+        prevAttrs.autoPatchelfIgnoreMissingDeps
+        ++ lib.lists.optionals (cudaOlder "11.7") [ "libcufile.so.0" ];
+    };
 
   libcusolver =
     {
       cudaAtLeast,
       lib,
       libcublas,
-      libcusolver,
       libcusparse ? null,
       libnvjitlink ? null,
     }:
-    libcusolver.overrideAttrs (
-      prevAttrs: {
-        buildInputs =
-          prevAttrs.buildInputs
-          # Always depends on this
-          ++ [ libcublas.lib ]
-          # Dependency from 12.0 and on
-          ++ lib.lists.optionals (cudaAtLeast "12.0") [ libnvjitlink.lib ]
-          # Dependency from 12.1 and on
-          ++ lib.lists.optionals (cudaAtLeast "12.1") [ libcusparse.lib ];
-      }
-    );
+    prevAttrs: {
+      buildInputs =
+        prevAttrs.buildInputs
+        # Always depends on this
+        ++ [ libcublas.lib ]
+        # Dependency from 12.0 and on
+        ++ lib.lists.optionals (cudaAtLeast "12.0") [ libnvjitlink.lib ]
+        # Dependency from 12.1 and on
+        ++ lib.lists.optionals (cudaAtLeast "12.1") [ libcusparse.lib ];
+    };
 
   libcusparse =
     {
       cudaAtLeast,
       lib,
-      libcusparse,
       libnvjitlink ? null,
     }:
-    libcusparse.overrideAttrs (
-      prevAttrs: {
-        buildInputs =
-          prevAttrs.buildInputs
-          # Dependency from 12.0 and on
-          ++ lib.lists.optionals (cudaAtLeast "12.0") [ libnvjitlink.lib ];
-      }
-    );
+    prevAttrs: {
+      buildInputs =
+        prevAttrs.buildInputs
+        # Dependency from 12.0 and on
+        ++ lib.lists.optionals (cudaAtLeast "12.0") [ libnvjitlink.lib ];
+    };
 
   cuda_cudart =
-    {
-      buildPackages,
-      cuda_cudart,
-      lib,
-    }:
-    cuda_cudart.overrideAttrs (
-      prevAttrs: {
-        # Remove once cuda-find-redist-features has a special case for libcuda
-        outputs =
-          prevAttrs.outputs
-          ++ lib.lists.optionals (!(builtins.elem "stubs" prevAttrs.outputs)) [ "stubs" ];
+    { buildPackages, lib }:
+    prevAttrs: {
+      # Remove once cuda-find-redist-features has a special case for libcuda
+      outputs =
+        prevAttrs.outputs
+        ++ lib.lists.optionals (!(builtins.elem "stubs" prevAttrs.outputs)) [ "stubs" ];
 
-        allowFHSReferences = false;
+      allowFHSReferences = false;
 
-        # The libcuda stub's pkg-config doesn't follow the general pattern:
-        postPatch =
-          prevAttrs.postPatch or ""
-          + ''
-            while IFS= read -r -d $'\0' path ; do
-              sed -i \
-                -e "s|^libdir\s*=.*/lib\$|libdir=''${!outputLib}/lib/stubs|" \
-                -e "s|^Libs\s*:\(.*\)\$|Libs: \1 -Wl,-rpath,${buildPackages.addDriverRunpath.driverLink}/lib|" \
-                "$path"
-            done < <(find -iname 'cuda-*.pc' -print0)
-          ''
-          + ''
-            # Namelink may not be enough, add a soname.
-            # Cf. https://gitlab.kitware.com/cmake/cmake/-/issues/25536
-            if [[ -f lib/stubs/libcuda.so && ! -f lib/stubs/libcuda.so.1 ]] ; then
-              ln -s libcuda.so lib/stubs/libcuda.so.1
-            fi
-          '';
+      # The libcuda stub's pkg-config doesn't follow the general pattern:
+      postPatch =
+        prevAttrs.postPatch or ""
+        + ''
+          while IFS= read -r -d $'\0' path ; do
+            sed -i \
+              -e "s|^libdir\s*=.*/lib\$|libdir=''${!outputLib}/lib/stubs|" \
+              -e "s|^Libs\s*:\(.*\)\$|Libs: \1 -Wl,-rpath,${buildPackages.addDriverRunpath.driverLink}/lib|" \
+              "$path"
+          done < <(find -iname 'cuda-*.pc' -print0)
+        ''
+        + ''
+          # Namelink may not be enough, add a soname.
+          # Cf. https://gitlab.kitware.com/cmake/cmake/-/issues/25536
+          if [[ -f lib/stubs/libcuda.so && ! -f lib/stubs/libcuda.so.1 ]] ; then
+            ln -s libcuda.so lib/stubs/libcuda.so.1
+          fi
+        '';
 
-        postFixup =
-          prevAttrs.postFixup or ""
-          + ''
-            moveToOutput lib/stubs "$stubs"
-            ln -s "$stubs"/lib/stubs/* "$stubs"/lib/
-            ln -s "$stubs"/lib/stubs "''${!outputLib}/lib/stubs"
-          '';
-      }
-    );
+      postFixup =
+        prevAttrs.postFixup or ""
+        + ''
+          moveToOutput lib/stubs "$stubs"
+          ln -s "$stubs"/lib/stubs/* "$stubs"/lib/
+          ln -s "$stubs"/lib/stubs "''${!outputLib}/lib/stubs"
+        '';
+    };
 
   cuda_compat =
-    {
-      cuda_compat,
-      flags,
-      lib,
-    }:
-    cuda_compat.overrideAttrs (
-      prevAttrs: {
-        autoPatchelfIgnoreMissingDeps = prevAttrs.autoPatchelfIgnoreMissingDeps ++ [
-          "libnvrm_gpu.so"
-          "libnvrm_mem.so"
-          "libnvdla_runtime.so"
-        ];
-        # `cuda_compat` only works on aarch64-linux, and only when building for Jetson devices.
-        badPlatformsConditions = prevAttrs.badPlatformsConditions // {
-          "Trying to use cuda_compat on aarch64-linux targeting non-Jetson devices" = !flags.isJetsonBuild;
-        };
-        meta = prevAttrs.meta // {
-          # For cross-compilation, we need the hostPlatform to be included in order to fetch and build the package. This
-          # doesn't change the fact that it won't work on non-Jetson devices, so we only add it when building for Jetson.
-          platforms = prevAttrs.meta.platforms ++ lib.lists.optionals flags.isJetsonBuild [ "x86_64-linux" ];
-        };
-      }
-    );
+    { flags, lib }:
+    prevAttrs: {
+      autoPatchelfIgnoreMissingDeps = prevAttrs.autoPatchelfIgnoreMissingDeps ++ [
+        "libnvrm_gpu.so"
+        "libnvrm_mem.so"
+        "libnvdla_runtime.so"
+      ];
+      # `cuda_compat` only works on aarch64-linux, and only when building for Jetson devices.
+      badPlatformsConditions = prevAttrs.badPlatformsConditions // {
+        "Trying to use cuda_compat on aarch64-linux targeting non-Jetson devices" = !flags.isJetsonBuild;
+      };
+      meta = prevAttrs.meta // {
+        # For cross-compilation, we need the hostPlatform to be included in order to fetch and build the package. This
+        # doesn't change the fact that it won't work on non-Jetson devices, so we only add it when building for Jetson.
+        platforms = prevAttrs.meta.platforms ++ lib.lists.optionals flags.isJetsonBuild [ "x86_64-linux" ];
+      };
+    };
 
   cuda_gdb =
     {
-      cuda_gdb,
       cudaAtLeast,
       gmp,
       lib,
     }:
-    cuda_gdb.overrideAttrs (
-      prevAttrs: {
-        buildInputs =
-          prevAttrs.buildInputs
-          # x86_64 only needs gmp from 12.0 and on
-          ++ lib.lists.optionals (cudaAtLeast "11.0") [ gmp ];
-      }
-    );
+    prevAttrs: {
+      buildInputs =
+        prevAttrs.buildInputs
+        # x86_64 only needs gmp from 12.0 and on; NOTE(review): the guard below is broader (11.0 and on) -- confirm this is intentional
+        ++ lib.lists.optionals (cudaAtLeast "11.0") [ gmp ];
+    };
 
   cuda_nvcc =
     {
       backendStdenv,
       buildPackages,
       cuda_cudart,
-      cuda_nvcc,
       cudaAtLeast,
       cudaOlder,
       lib,
       setupCudaHook,
     }:
-    cuda_nvcc.overrideAttrs (
-      prevAttrs: {
-        # Remove once cuda-find-redist-features has a special case for libcuda
-        # TODO(@connorbaker): The order of build outputs matters as we traverse them when creating split outputs.
-        # The `lib` output cannot come after `static` as it moves all the static libraries back to the `lib` output.
-        outputs =
-          prevAttrs.outputs
-          ++ lib.lists.optionals (!(builtins.elem "lib" prevAttrs.outputs)) [ "lib" ];
+    prevAttrs: {
+      # Remove once cuda-find-redist-features has a special case for libcuda
+      # TODO(@connorbaker): The order of build outputs matters as we traverse them when creating split outputs.
+      # The `lib` output cannot come after `static` as it moves all the static libraries back to the `lib` output.
+      outputs =
+        prevAttrs.outputs
+        ++ lib.lists.optionals (!(builtins.elem "lib" prevAttrs.outputs)) [ "lib" ];
 
-        # Patch the nvcc.profile.
-        # Syntax:
-        # - `=` for assignment,
-        # - `?=` for conditional assignment,
-        # - `+=` to "prepend",
-        # - `=+` to "append".
+      # Patch the nvcc.profile.
+      # Syntax:
+      # - `=` for assignment,
+      # - `?=` for conditional assignment,
+      # - `+=` to "prepend",
+      # - `=+` to "append".
 
-        # Cf. https://web.archive.org/web/20230308044351/https://arcb.csc.ncsu.edu/~mueller/cluster/nvidia/2.0/nvcc_2.0.pdf
+      # Cf. https://web.archive.org/web/20230308044351/https://arcb.csc.ncsu.edu/~mueller/cluster/nvidia/2.0/nvcc_2.0.pdf
 
-        # We set all variables with the lowest priority (=+), but we do force
-        # nvcc to use the fixed backend toolchain. Cf. comments in
-        # backend-stdenv.nix
+      # We set all variables with the lowest priority (=+), but we do force
+      # nvcc to use the fixed backend toolchain. Cf. comments in
+      # backend-stdenv.nix
 
-        nativeBuildInputs = prevAttrs.nativeBuildInputs ++ [ backendStdenv.cc ];
+      nativeBuildInputs = prevAttrs.nativeBuildInputs ++ [ backendStdenv.cc ];
 
-        # TODO(@connorbaker): We should specify the spliced version of backendStdenv and cuda_cudart to use here.
-        postPatch =
-          (prevAttrs.postPatch or "")
-          + ''
-            echo "Running the cuda_nvcc postPatch"
-            substituteInPlace bin/nvcc.profile \
-              --replace \
-                '$(TOP)/lib' \
-                "''${!outputLib}/lib" \
-              --replace \
-                '$(TOP)/$(_NVVM_BRANCH_)' \
-                "''${!outputBin}/nvvm" \
-              --replace \
-                '$(TOP)/$(_TARGET_DIR_)/include' \
-                "''${!outputDev}/include"
+      # TODO(@connorbaker): We should specify the spliced version of backendStdenv and cuda_cudart to use here.
+      postPatch =
+        (prevAttrs.postPatch or "")
+        + ''
+          echo "Running the cuda_nvcc postPatch"
+          substituteInPlace bin/nvcc.profile \
+            --replace \
+              '$(TOP)/lib' \
+              "''${!outputLib}/lib" \
+            --replace \
+              '$(TOP)/$(_NVVM_BRANCH_)' \
+              "''${!outputBin}/nvvm" \
+            --replace \
+              '$(TOP)/$(_TARGET_DIR_)/include' \
+              "''${!outputDev}/include"
 
-            cat << EOF >> bin/nvcc.profile
+          cat << EOF >> bin/nvcc.profile
 
-            # Fix a compatible backend compiler
-            PATH += ${lib.getBin backendStdenv.cc}/bin:
+          # Fix a compatible backend compiler
+          PATH += ${lib.getBin backendStdenv.cc}/bin:
 
-            # Expose the split-out nvvm
-            LIBRARIES =+ -L''${!outputBin}/nvvm/lib
-            INCLUDES =+ -I''${!outputBin}/nvvm/include
+          # Expose the split-out nvvm
+          LIBRARIES =+ -L''${!outputBin}/nvvm/lib
+          INCLUDES =+ -I''${!outputBin}/nvvm/include
 
-            # Expose cudart and the libcuda stubs
-            LIBRARIES =+ -L$static/lib" "-L${cuda_cudart.lib}/lib -L${cuda_cudart.lib}/lib/stubs
-            INCLUDES =+ -I${cuda_cudart.dev}/include
-            EOF
-          '';
+          # Expose cudart and the libcuda stubs
+          LIBRARIES =+ -L$static/lib" "-L${cuda_cudart.lib}/lib -L${cuda_cudart.lib}/lib/stubs
+          INCLUDES =+ -I${cuda_cudart.dev}/include
+          EOF
+        '';
 
-        propagatedNativeBuildInputs = [ setupCudaHook ];
+      propagatedNativeBuildInputs = [ setupCudaHook ];
 
-        postInstall =
-          (prevAttrs.postInstall or "")
-          + ''
-            moveToOutput "nvvm" "''${!outputBin}"
-          '';
+      postInstall =
+        (prevAttrs.postInstall or "")
+        + ''
+          moveToOutput "nvvm" "''${!outputBin}"
+        '';
 
-        # The nvcc and cicc binaries contain hard-coded references to /usr
-        allowFHSReferences = true;
+      # The nvcc and cicc binaries contain hard-coded references to /usr
+      allowFHSReferences = true;
 
-        meta = (prevAttrs.meta or { }) // {
-          mainProgram = "nvcc";
-        };
-      }
-    );
+      meta = (prevAttrs.meta or { }) // {
+        mainProgram = "nvcc";
+      };
+    };
 
   cuda_nvprof =
-    { cuda_cupti, cuda_nvprof }:
-    cuda_nvprof.overrideAttrs (
-      prevAttrs: { buildInputs = prevAttrs.buildInputs ++ [ cuda_cupti.lib ]; }
-    );
+    { cuda_cupti }: prevAttrs: { buildInputs = prevAttrs.buildInputs ++ [ cuda_cupti.lib ]; };
 
   cuda_demo_suite =
     {
-      cuda_demo_suite,
       freeglut,
       lib,
       libcufft,
@@ -271,46 +251,41 @@ filterAndCreateOverrides {
       libglvnd,
       mesa,
     }:
-    cuda_demo_suite.overrideAttrs (
-      prevAttrs: {
-        buildInputs = prevAttrs.buildInputs ++ [
-          freeglut
-          libcufft.lib
-          libcurand.lib
-          libGLU
-          libglvnd
-          mesa
-        ];
-      }
-    );
+    prevAttrs: {
+      buildInputs = prevAttrs.buildInputs ++ [
+        freeglut
+        libcufft.lib
+        libcurand.lib
+        libGLU
+        libglvnd
+        mesa
+      ];
+    };
 
   nsight_compute =
     {
       lib,
-      nsight_compute,
       qt5 ? null,
       qt6 ? null,
     }:
-    nsight_compute.overrideAttrs (
-      prevAttrs: {
-        nativeBuildInputs =
-          prevAttrs.nativeBuildInputs
-          ++ (
-            if (lib.strings.versionOlder prevAttrs.version "2022.2.0") then
-              [ qt5.wrapQtAppsHook ]
-            else
-              [ qt6.wrapQtAppsHook ]
-          );
-        buildInputs =
-          prevAttrs.buildInputs
-          ++ (
-            if (lib.strings.versionOlder prevAttrs.version "2022.2.0") then
-              [ qt5.qtwebview ]
-            else
-              [ qt6.qtwebview ]
-          );
-      }
-    );
+    prevAttrs: {
+      nativeBuildInputs =
+        prevAttrs.nativeBuildInputs
+        ++ (
+          if (lib.strings.versionOlder prevAttrs.version "2022.2.0") then
+            [ qt5.wrapQtAppsHook ]
+          else
+            [ qt6.wrapQtAppsHook ]
+        );
+      buildInputs =
+        prevAttrs.buildInputs
+        ++ (
+          if (lib.strings.versionOlder prevAttrs.version "2022.2.0") then
+            [ qt5.qtwebview ]
+          else
+            [ qt6.qtwebview ]
+        );
+    };
 
   nsight_systems =
     {
@@ -321,7 +296,6 @@ filterAndCreateOverrides {
       e2fsprogs,
       gst_all_1,
       lib,
-      nsight_systems,
       nss,
       numactl,
       pulseaudio,
@@ -332,67 +306,65 @@ filterAndCreateOverrides {
       wayland,
       xorg,
     }:
-    nsight_systems.overrideAttrs (
-      prevAttrs:
-      let
-        qt = if lib.strings.versionOlder prevAttrs.version "2022.4.2.1" then qt5 else qt6;
-        qtwayland =
-          if lib.versions.major qt.qtbase.version == "5" then
-            lib.getBin qt.qtwayland
-          else
-            lib.getLib qt.qtwayland;
-        qtWaylandPlugins = "${qtwayland}/${qt.qtbase.qtPluginPrefix}";
-      in
-      {
-        # An ad hoc replacement for
-        # https://github.com/ConnorBaker/cuda-redist-find-features/issues/11
-        env.rmPatterns = toString [
-          "nsight-systems/*/*/libQt*"
-          "nsight-systems/*/*/libstdc*"
-          "nsight-systems/*/*/libboost*"
-          "nsight-systems/*/*/lib{ssl,ssh,crypto}*"
-          "nsight-systems/*/*/lib{arrow,jpeg}*"
-          "nsight-systems/*/*/Mesa"
-          "nsight-systems/*/*/python/bin/python"
-          "nsight-systems/*/*/libexec"
-          "nsight-systems/*/*/Plugins"
-        ];
-        postPatch =
-          prevAttrs.postPatch or ""
-          + ''
-            for path in $rmPatterns ; do
-              rm -r "$path"
-            done
-          '';
-        nativeBuildInputs = prevAttrs.nativeBuildInputs ++ [ qt.wrapQtAppsHook ];
-        buildInputs = prevAttrs.buildInputs ++ [
-          (qt.qtdeclarative or qt.full)
-          (qt.qtsvg or qt.full)
-          cuda_cudart.stubs
-          gst_all_1.gst-plugins-base
-          gst_all_1.gstreamer
-          nss
-          numactl
-          pulseaudio
-          qt.qtbase
-          qtWaylandPlugins
-          rdma-core
-          ucx
-          wayland
-          xorg.libXcursor
-          xorg.libXdamage
-          xorg.libXrandr
-          xorg.libXtst
-        ];
+    prevAttrs:
+    let
+      qt = if lib.strings.versionOlder prevAttrs.version "2022.4.2.1" then qt5 else qt6;
+      qtwayland =
+        if lib.versions.major qt.qtbase.version == "5" then
+          lib.getBin qt.qtwayland
+        else
+          lib.getLib qt.qtwayland;
+      qtWaylandPlugins = "${qtwayland}/${qt.qtbase.qtPluginPrefix}";
+    in
+    {
+      # An ad hoc replacement for
+      # https://github.com/ConnorBaker/cuda-redist-find-features/issues/11
+      env.rmPatterns = toString [
+        "nsight-systems/*/*/libQt*"
+        "nsight-systems/*/*/libstdc*"
+        "nsight-systems/*/*/libboost*"
+        "nsight-systems/*/*/lib{ssl,ssh,crypto}*"
+        "nsight-systems/*/*/lib{arrow,jpeg}*"
+        "nsight-systems/*/*/Mesa"
+        "nsight-systems/*/*/python/bin/python"
+        "nsight-systems/*/*/libexec"
+        "nsight-systems/*/*/Plugins"
+      ];
+      postPatch =
+        prevAttrs.postPatch or ""
+        + ''
+          for path in $rmPatterns ; do
+            rm -r "$path"
+          done
+        '';
+      nativeBuildInputs = prevAttrs.nativeBuildInputs ++ [ qt.wrapQtAppsHook ];
+      buildInputs = prevAttrs.buildInputs ++ [
+        (qt.qtdeclarative or qt.full)
+        (qt.qtsvg or qt.full)
+        cuda_cudart.stubs
+        gst_all_1.gst-plugins-base
+        gst_all_1.gstreamer
+        nss
+        numactl
+        pulseaudio
+        qt.qtbase
+        qtWaylandPlugins
+        rdma-core
+        ucx
+        wayland
+        xorg.libXcursor
+        xorg.libXdamage
+        xorg.libXrandr
+        xorg.libXtst
+      ];
 
-        # Older releases require boost 1.70 deprecated in Nixpkgs
-        meta.broken = prevAttrs.meta.broken or false || cudaOlder "11.8";
-      }
-    );
+      # Older releases require boost 1.70 deprecated in Nixpkgs
+      meta.broken = prevAttrs.meta.broken or false || cudaOlder "11.8";
+    };
 
   nvidia_driver =
-    { nvidia_driver }:
-    nvidia_driver.overrideAttrs {
+    { }:
+    {
       # No need to support this package as we have drivers already
       # in linuxPackages.
       meta.broken = true;

From ddcfff03cf907789a00aa0c4f95eac916fce9426 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 20 Mar 2024 18:16:01 +0000
Subject: [PATCH 11/34] cudaPackages.cuda_nvcc: lib must precede static in
 outputs

---
 .../cuda-modules/cuda/overrides.nix           | 26 ++++++++++++++++---
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index a8a2c173d3c17..2d40ccd2de9b3 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -170,11 +170,29 @@ filterAndCreateOverrides {
     }:
     prevAttrs: {
       # Remove once cuda-find-redist-features has a special case for libcuda
-      # TODO(@connorbaker): The order of build outputs matters as we traverse them when creating split outputs.
-      # The `lib` output cannot come after `static` as it moves all the static libraries back to the `lib` output.
       outputs =
-        prevAttrs.outputs
-        ++ lib.lists.optionals (!(builtins.elem "lib" prevAttrs.outputs)) [ "lib" ];
+        # NOTE: The order of build outputs matters as we traverse them when creating
+        # split outputs. The `lib` output cannot come after `static` as it moves all
+        # the static libraries back to the `lib` output.
+        let
+          libOutputIsPresent = builtins.elem "lib" prevAttrs.outputs;
+          staticOutputPos = lib.lists.findFirstIndex (x: x == "static") null prevAttrs.outputs;
+          outputsBeforeStatic = lib.lists.take staticOutputPos prevAttrs.outputs;
+          outputsFromStaticAndLater = lib.lists.drop staticOutputPos prevAttrs.outputs;
+          newOutputs =
+            if libOutputIsPresent then
+              # If the lib output is present, we want to keep it in the same position
+              prevAttrs.outputs
+            else if staticOutputPos == null then
+              # If the static output is not present, location of the lib output
+              # doesn't matter and we can append it
+              prevAttrs.outputs ++ [ "lib" ]
+            else
+              # The lib output is missing and the static output is present.
+              # We need to insert the lib output before the static output.
+              outputsBeforeStatic ++ [ "lib" ] ++ outputsFromStaticAndLater;
+        in
+        newOutputs;
 
       # Patch the nvcc.profile.
       # Syntax:

From 112d38bb8c21660f1aa7501319dec0789701021f Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 20 Mar 2024 18:25:51 +0000
Subject: [PATCH 12/34] cuda-modules: add check for duplicate/misordered
 outputs

---
 .../cuda-modules/generic-builders/manifest.nix      | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index cb49f98d77597..d47b979b7029a 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -133,7 +133,18 @@ backendStdenv.mkDerivation (
     # brokenConditions :: AttrSet Bool
     # Sets `meta.broken = true` if any of the conditions are true.
     # Example: Broken on a specific version of CUDA or when a dependency has a specific version.
-    brokenConditions = { };
+    brokenConditions = {
+      # Unclear how this is handled by Nix internals.
+      "Duplicate entries in outputs" = finalAttrs.outputs != lists.unique finalAttrs.outputs;
+      # Typically this results in the static output being empty, as all libraries are moved
+      # back to the lib output.
+      "lib output follows static output" =
+        let
+          libIndex = lists.findFirstIndex (x: x == "lib") null finalAttrs.outputs;
+          staticIndex = lists.findFirstIndex (x: x == "static") null finalAttrs.outputs;
+        in
+        libIndex != null && staticIndex != null && libIndex > staticIndex;
+    };
 
     # badPlatformsConditions :: AttrSet Bool
     # Sets `meta.badPlatforms = meta.platforms` if any of the conditions are true.

From 49676c7cf88588a29e781432a8f2a136aa41a7db Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 20 Mar 2024 18:31:20 +0000
Subject: [PATCH 13/34] cuda-modules: update note on use of lndir from path

---
 pkgs/development/cuda-modules/generic-builders/manifest.nix | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index d47b979b7029a..d0d023f565c60 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -314,9 +314,9 @@ backendStdenv.mkDerivation (
 
     # For each output, create a symlink to it in the out output.
     # NOTE: We must recreate the out output here, because the setup hook will have deleted it if it was empty.
-    # TODO: Previously we used `meta.getExe lndir` to get the path to lndir, but that doesn't work under
-    # cross-compilation -- whatever machinery Nixpkgs uses to get a version built for hostPlatform (so it can run
-    # during the build) doesn't extend to `meta.getExe`.
+    # NOTE: Rely on nativeBuildInputs adding lndir to the path because meta.getExe has no concept of spliced
+    # attributes and will select the hostPlatform variant instead of the buildPlatform variant.
+    # TODO(@connorbaker): This should be removed when https://github.com/NixOS/nixpkgs/issues/271792 is resolved.
     postPatchelf = ''
       mkdir -p "$out"
       for output in $(getAllOutputNames); do

From 868fa525d4eb16a253ac073a1dbddf6ef87ff0f5 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 20 Mar 2024 18:40:33 +0000
Subject: [PATCH 14/34] cudaPackages.saxpy: Jetson should be supported from
 CUDA 11.4 onwards

---
 pkgs/development/cuda-modules/saxpy/default.nix | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/pkgs/development/cuda-modules/saxpy/default.nix b/pkgs/development/cuda-modules/saxpy/default.nix
index bc299dea006f4..be31ced11be77 100644
--- a/pkgs/development/cuda-modules/saxpy/default.nix
+++ b/pkgs/development/cuda-modules/saxpy/default.nix
@@ -10,11 +10,11 @@ let
     cuda_cccl
     cuda_cudart
     cuda_nvcc
+    cudaAtLeast
+    cudaOlder
     cudatoolkit
-    cudaVersion
     flags
     libcublas
-    setupCudaHook
     ;
   inherit (lib) getDev getLib getOutput;
 in
@@ -31,18 +31,18 @@ backendStdenv.mkDerivation {
       cmake
       autoAddDriverRunpath
     ]
-    ++ lib.optionals (lib.versionOlder cudaVersion "11.4") [cudatoolkit]
-    ++ lib.optionals (lib.versionAtLeast cudaVersion "11.4") [cuda_nvcc];
+    ++ lib.optionals (cudaOlder "11.4") [cudatoolkit]
+    ++ lib.optionals (cudaAtLeast "11.4") [cuda_nvcc];
 
   buildInputs =
-    lib.optionals (lib.versionOlder cudaVersion "11.4") [cudatoolkit]
-    ++ lib.optionals (lib.versionAtLeast cudaVersion "11.4") [
+    lib.optionals (cudaOlder "11.4") [cudatoolkit]
+    ++ lib.optionals (cudaAtLeast "11.4") [
       (getDev libcublas)
       (getLib libcublas)
       (getOutput "static" libcublas)
       cuda_cudart
     ]
-    ++ lib.optionals (lib.versionAtLeast cudaVersion "12.0") [cuda_cccl];
+    ++ lib.optionals (cudaAtLeast "12.0") [cuda_cccl];
 
   cmakeFlags = [
     (lib.cmakeBool "CMAKE_VERBOSE_MAKEFILE" true)
@@ -56,6 +56,6 @@ backendStdenv.mkDerivation {
     license = lib.licenses.mit;
     maintainers = lib.teams.cuda.members;
     platforms = lib.platforms.unix;
-    badPlatforms = lib.optionals flags.isJetsonBuild platforms;
+    badPlatforms = lib.optionals (flags.isJetsonBuild && cudaOlder "11.4") platforms;
   };
 }

From 2b351b2d1c64be6442cf44da597b9f062fd97b65 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 20 Mar 2024 19:09:33 +0000
Subject: [PATCH 15/34] cuda-modules/cuda/overrides: remove unused callPackage
 arguments

---
 pkgs/development/cuda-modules/cuda/overrides.nix | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index 2d40ccd2de9b3..963003944f332 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -161,10 +161,7 @@ filterAndCreateOverrides {
   cuda_nvcc =
     {
       backendStdenv,
-      buildPackages,
       cuda_cudart,
-      cudaAtLeast,
-      cudaOlder,
       lib,
       setupCudaHook,
     }:
@@ -262,7 +259,6 @@ filterAndCreateOverrides {
   cuda_demo_suite =
     {
       freeglut,
-      lib,
       libcufft,
       libcurand,
       libGLU,
@@ -307,11 +303,8 @@ filterAndCreateOverrides {
 
   nsight_systems =
     {
-      alsa-lib,
-      boost178,
       cuda_cudart,
       cudaOlder,
-      e2fsprogs,
       gst_all_1,
       lib,
       nss,

From 0304c9b726d1dc1e12fa8c14f681a1f521d242fd Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Mon, 25 Mar 2024 16:58:20 +0000
Subject: [PATCH 16/34] cuda-modules/flags: use cudaAtLeast when possible

---
 pkgs/development/cuda-modules/flags.nix | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pkgs/development/cuda-modules/flags.nix b/pkgs/development/cuda-modules/flags.nix
index 89ddfe53aea96..5f417287574d8 100644
--- a/pkgs/development/cuda-modules/flags.nix
+++ b/pkgs/development/cuda-modules/flags.nix
@@ -7,6 +7,7 @@
   cudaCapabilities ? (config.cudaCapabilities or []),
   cudaForwardCompat ? (config.cudaForwardCompat or true),
   lib,
+  cudaAtLeast,
   cudaVersion,
   # gpus :: List Gpu
   gpus,
@@ -48,7 +49,7 @@ let
     gpu:
     let
       inherit (gpu) minCudaVersion maxCudaVersion;
-      lowerBoundSatisfied = strings.versionAtLeast cudaVersion minCudaVersion;
+      lowerBoundSatisfied = cudaAtLeast minCudaVersion;
       upperBoundSatisfied =
         (maxCudaVersion == null) || !(strings.versionOlder maxCudaVersion cudaVersion);
     in
@@ -286,7 +287,7 @@ assert let
   };
   actualWrapped = (builtins.tryEval (builtins.deepSeq actual actual)).value;
 in
-asserts.assertMsg ((strings.versionAtLeast cudaVersion "11.2") -> (expected == actualWrapped)) ''
+asserts.assertMsg ((cudaAtLeast "11.2") -> (expected == actualWrapped)) ''
   This test should only fail when using a version of CUDA older than 11.2, the first to support
   8.6.
   Expected: ${builtins.toJSON expected}

From 5b65222a12d4cc530da82b78e9aed82e447fd2e8 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Mon, 25 Mar 2024 16:59:49 +0000
Subject: [PATCH 17/34] cuda-modules/flags: ignore platforms in throwIf in
 isJetsonBuild

Since, even under cross-compilation, we evaluate this flag on multiple platforms, it makes more sense to move the platform check out of the throw condition
and into the boolean return value. The alternative is to restrict all uses of this value to locations which guard evaluation so it does not occur when the
host platform is still x86_64.
---
 pkgs/development/cuda-modules/flags.nix | 65 +++----------------------
 1 file changed, 6 insertions(+), 59 deletions(-)

diff --git a/pkgs/development/cuda-modules/flags.nix b/pkgs/development/cuda-modules/flags.nix
index 5f417287574d8..495ba647feb1b 100644
--- a/pkgs/development/cuda-modules/flags.nix
+++ b/pkgs/development/cuda-modules/flags.nix
@@ -223,24 +223,24 @@ let
             lists.filter (cap: !(builtins.elem cap requestedJetsonDevices))
               cudaCapabilities;
           jetsonBuildSufficientCondition = requestedJetsonDevices != [];
-          jetsonBuildNecessaryCondition = requestedNonJetsonDevices == [] && targetPlatform.isAarch64;
+          jetsonBuildNecessaryCondition = requestedNonJetsonDevices == [];
         in
         trivial.throwIf (jetsonBuildSufficientCondition && !jetsonBuildNecessaryCondition)
           ''
-            Jetson devices cannot be targeted with non-Jetson devices. Additionally, they require targetPlatform to be aarch64.
+            Jetson devices cannot be targeted with non-Jetson devices. Additionally, host platform
+            and target platform must be aarch64.
             You requested ${builtins.toJSON cudaCapabilities} for:
             - Build platform ${buildPlatform.system}
             - Host platform ${hostPlatform.system}
             - Target platform ${targetPlatform.system}
             Requested Jetson devices: ${builtins.toJSON requestedJetsonDevices}.
             Requested non-Jetson devices: ${builtins.toJSON requestedNonJetsonDevices}.
-            Exactly one of the following must be true:
-            - All CUDA capabilities belong to Jetson devices and targetPlatform is aarch64.
-            - No CUDA capabilities belong to Jetson devices.
             See ${./gpus.nix} for a list of architectures supported by this version of Nixpkgs.
           ''
           jetsonBuildSufficientCondition
-        && jetsonBuildNecessaryCondition;
+        && jetsonBuildNecessaryCondition
+        && hostPlatform.isAarch64
+        && targetPlatform.isAarch64;
     };
 in
 # When changing names or formats: pause, validate, and update the assert
@@ -310,59 +310,6 @@ asserts.assertMsg (expected == actualWrapped) ''
   Expected: ${builtins.toJSON expected}
   Actual: ${builtins.toJSON actualWrapped}
 '';
-# Check Jetson-only
-assert let
-  expected = {
-    cudaCapabilities = [
-      "6.2"
-      "7.2"
-    ];
-    enableForwardCompat = true;
-
-    archNames = [
-      "Pascal"
-      "Volta"
-    ];
-    realArches = [
-      "sm_62"
-      "sm_72"
-    ];
-    virtualArches = [
-      "compute_62"
-      "compute_72"
-    ];
-    arches = [
-      "sm_62"
-      "sm_72"
-      "compute_72"
-    ];
-
-    gencode = [
-      "-gencode=arch=compute_62,code=sm_62"
-      "-gencode=arch=compute_72,code=sm_72"
-      "-gencode=arch=compute_72,code=compute_72"
-    ];
-    gencodeString = "-gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_72,code=compute_72";
-
-    isJetsonBuild = true;
-  };
-  actual = formatCapabilities {
-    cudaCapabilities = [
-      "6.2"
-      "7.2"
-    ];
-  };
-  actualWrapped = (builtins.tryEval (builtins.deepSeq actual actual)).value;
-in
-asserts.assertMsg
-  # We can't do this test unless we're targeting aarch64
-  (targetPlatform.isAarch64 -> (expected == actualWrapped))
-  ''
-    Jetson devices can only be built with other Jetson devices.
-    Both 6.2 and 7.2 are Jetson devices.
-    Expected: ${builtins.toJSON expected}
-    Actual: ${builtins.toJSON actualWrapped}
-  '';
 {
   # formatCapabilities :: { cudaCapabilities: List Capability, enableForwardCompat: Boolean } ->  { ... }
   inherit formatCapabilities;

From 969ee2bf21242e1eb4d7912081440bb52e83a3de Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Mon, 25 Mar 2024 17:04:03 +0000
Subject: [PATCH 18/34] cuda-modules: fix deprecated uses of substituteInPlace
 replace flag

---
 .../cuda-modules/cuda-library-samples/generic.nix           | 2 +-
 pkgs/development/cuda-modules/cuda/overrides.nix            | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda-library-samples/generic.nix b/pkgs/development/cuda-modules/cuda-library-samples/generic.nix
index d4182536654e1..3c080c8a9c382 100644
--- a/pkgs/development/cuda-modules/cuda-library-samples/generic.nix
+++ b/pkgs/development/cuda-modules/cuda-library-samples/generic.nix
@@ -76,7 +76,7 @@ in
       # CUTENSOR_ROOT is double escaped
       postPatch = ''
         substituteInPlace CMakeLists.txt \
-          --replace "\''${CUTENSOR_ROOT}/include" "${cutensor.dev}/include"
+          --replace-fail "\''${CUTENSOR_ROOT}/include" "${cutensor.dev}/include"
       '';
 
       CUTENSOR_ROOT = cutensor;
diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index 963003944f332..5c041b9d8a077 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -212,13 +212,13 @@ filterAndCreateOverrides {
         + ''
           echo "Running the cuda_nvcc postPatch"
           substituteInPlace bin/nvcc.profile \
-            --replace \
+            --replace-fail \
               '$(TOP)/lib' \
               "''${!outputLib}/lib" \
-            --replace \
+            --replace-fail \
               '$(TOP)/$(_NVVM_BRANCH_)' \
               "''${!outputBin}/nvvm" \
-            --replace \
+            --replace-fail \
               '$(TOP)/$(_TARGET_DIR_)/include' \
               "''${!outputDev}/include"
 

From 3b629a2ad3200a2f1934f4fc0f554de8f8f69bc5 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Mon, 25 Mar 2024 17:05:27 +0000
Subject: [PATCH 19/34] cuda-modules/cuda/overrides: backendStdenv.cc is
 already part of nativeBuildInputs

---
 pkgs/development/cuda-modules/cuda/overrides.nix | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index 5c041b9d8a077..917271f80e66a 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -204,8 +204,6 @@ filterAndCreateOverrides {
       # nvcc to use the fixed backend toolchain. Cf. comments in
       # backend-stdenv.nix
 
-      nativeBuildInputs = prevAttrs.nativeBuildInputs ++ [ backendStdenv.cc ];
-
       # TODO(@connorbaker): We should specify the spliced version of backendStdenv and cuda_cudart to use here.
       postPatch =
         (prevAttrs.postPatch or "")

From e55a9c21a4cebcdc395b9667a9b3cc0caadc3e25 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Mon, 25 Mar 2024 17:14:42 +0000
Subject: [PATCH 20/34] cuda-modules/cuda/overrides: specify spliced packages
 for cuda_nvcc postPatch phase

---
 .../cuda-modules/cuda/overrides.nix           | 23 +++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index 917271f80e66a..460874b0e4978 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -204,8 +204,17 @@ filterAndCreateOverrides {
       # nvcc to use the fixed backend toolchain. Cf. comments in
       # backend-stdenv.nix
 
-      # TODO(@connorbaker): We should specify the spliced version of backendStdenv and cuda_cudart to use here.
       postPatch =
+        let
+          # CC must come from the host environment, not the target environment because it is
+          # used at build time.
+          ccBin = lib.getBin (backendStdenv.__spliced.buildHost.cc or backendStdenv.cc);
+          # CUDA runtime libraries must come from the host/target environment because they
+          # are used at runtime, not build time (outside of linking).
+          cudartStatic = (cuda_cudart.__spliced.hostTarget or cuda_cudart).static;
+          cudartLib = lib.getLib (cuda_cudart.__spliced.hostTarget or cuda_cudart);
+          cudartDev = lib.getDev (cuda_cudart.__spliced.hostTarget or cuda_cudart);
+        in
         (prevAttrs.postPatch or "")
         + ''
           echo "Running the cuda_nvcc postPatch"
@@ -223,19 +232,19 @@ filterAndCreateOverrides {
           cat << EOF >> bin/nvcc.profile
 
           # Fix a compatible backend compiler
-          PATH += ${lib.getBin backendStdenv.cc}/bin:
+          PATH += "${ccBin}/bin":
 
           # Expose the split-out nvvm
-          LIBRARIES =+ -L''${!outputBin}/nvvm/lib
-          INCLUDES =+ -I''${!outputBin}/nvvm/include
+          LIBRARIES =+ -L"''${!outputBin}/nvvm/lib"
+          INCLUDES =+ -I"''${!outputBin}/nvvm/include"
 
           # Expose cudart and the libcuda stubs
-          LIBRARIES =+ -L$static/lib" "-L${cuda_cudart.lib}/lib -L${cuda_cudart.lib}/lib/stubs
-          INCLUDES =+ -I${cuda_cudart.dev}/include
+          LIBRARIES =+ -L"$static/lib" -L"${cudartStatic}/lib" -L"${cudartLib}/lib" -L"${cudartLib}/lib/stubs"
+          INCLUDES =+ -I"${cudartDev}/include"
           EOF
         '';
 
-      propagatedNativeBuildInputs = [ setupCudaHook ];
+      propagatedNativeBuildInputs = (prevAttrs.propagatedNativeBuildInputs or [ ]) ++ [ setupCudaHook ];
 
       postInstall =
         (prevAttrs.postInstall or "")

From f85d321f170a2fb7b7391cce8d3d4b39a86f5b1f Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Mon, 25 Mar 2024 17:32:55 +0000
Subject: [PATCH 21/34] cuda-modules/generic-builders/manifest: wip
 cross-compilation

---
 .../generic-builders/manifest.nix             | 114 +++++++++++-------
 1 file changed, 70 insertions(+), 44 deletions(-)

diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index d0d023f565c60..591acfcc4c051 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -38,17 +38,19 @@ let
     sourceTypes
     ;
 
-  inherit (backendStdenv) hostPlatform;
+  inherit (backendStdenv) buildPlatform hostPlatform targetPlatform;
 
   # Get the redist architectures for which package provides distributables.
   # These are used by meta.platforms.
   supportedRedistArchs = builtins.attrNames featureRelease;
-  # redistArch :: String
-  # The redistArch is the name of the architecture for which the redistributable is built.
+
+  # hostPlatformRedistArch :: String
+  # The hostPlatformRedistArch is the name of the architecture for which the redistributable is built.
   # It is `"unsupported"` if the redistributable is not supported on the hostPlatform.
-  redistArch = flags.getRedistArch hostPlatform.system;
+  hostPlatformRedistArch = flags.getRedistArch hostPlatform.system;
 
-  sourceMatchesHost = flags.getNixSystem redistArch == hostPlatform.system;
+  # sourceMatchesHost :: Bool
+  sourceMatchesHost = flags.getNixSystem hostPlatformRedistArch == hostPlatform.system;
 in
 backendStdenv.mkDerivation (
   finalAttrs: {
@@ -76,7 +78,7 @@ backendStdenv.mkDerivation (
           output:
           attrsets.attrByPath
             [
-              redistArch
+              hostPlatformRedistArch
               "outputs"
               output
             ]
@@ -96,12 +98,12 @@ backendStdenv.mkDerivation (
         # NOTE: In the case the redistributable isn't supported on the target platform,
         # we will have `outputs = [ "out" ] ++ possibleOutputs`. This is of note because platforms which
         # aren't supported would otherwise have evaluation errors when trying to access outputs other than `out`.
-        # The alternative would be to have `outputs = [ "out" ]` when`redistArch = "unsupported"`, but that would
+        # The alternative would be to have `outputs = [ "out" ]` when`hostPlatformRedistArch = "unsupported"`, but that would
         # require adding guards throughout the entirety of the CUDA package set to ensure `cudaSupport` is true --
         # recall that OfBorg will evaluate packages marked as broken and that `cudaPackages` will be evaluated with
         # `cudaSupport = false`!
         additionalOutputs =
-          if redistArch == "unsupported"
+          if hostPlatformRedistArch == "unsupported"
           then possibleOutputs
           else builtins.filter hasOutput possibleOutputs;
         # The out output is special -- it's the default output and we always include it.
@@ -154,18 +156,35 @@ backendStdenv.mkDerivation (
     };
 
     # src :: Optional Derivation
-    src = trivial.pipe redistArch [
-      # If redistArch doesn't exist in redistribRelease, return null.
-      (redistArch: redistribRelease.${redistArch} or null)
-      # If the release is non-null, fetch the source; otherwise, return null.
-      (trivial.mapNullable (
-        { relative_path, sha256, ... }:
-        fetchurl {
-          url = "https://developer.download.nvidia.com/compute/${redistName}/redist/${relative_path}";
-          inherit sha256;
-        }
-      ))
-    ];
+    src =
+      # TODO(@connorbaker): Remove debugging lib.warn and inline this.
+      let
+        src =
+          trivial.mapNullable
+            (
+              { relative_path, sha256, ... }:
+              fetchurl {
+                url = "https://developer.download.nvidia.com/compute/${redistName}/redist/${relative_path}";
+                inherit sha256;
+              }
+            )
+            (redistribRelease.${hostPlatformRedistArch} or null);
+      in
+      lib.warn
+      ''
+        Info:
+        - redistName: ${redistName}
+        - hostPlatformRedistArch: ${hostPlatformRedistArch}
+        - pname: ${finalAttrs.pname}
+        - version: ${finalAttrs.version}
+        - outputs: ${builtins.toJSON finalAttrs.outputs}
+        - brokenConditions: ${builtins.toJSON finalAttrs.brokenConditions}
+        - badPlatformsConditions: ${builtins.toJSON finalAttrs.badPlatformsConditions}
+        - buildPlatform: ${buildPlatform.system}
+        - hostPlatform: ${hostPlatform.system}
+        - targetPlatform: ${targetPlatform.system}
+      ''
+      src;
 
     # Handle the pkg-config files:
     # 1. No FHS
@@ -198,30 +217,37 @@ backendStdenv.mkDerivation (
     # We do need some other phases, like configurePhase, so the multiple-output setup hook works.
     dontBuild = true;
 
-    nativeBuildInputs = [
-      autoPatchelfHook
-      # This hook will make sure libcuda can be found
-      # in typically /lib/opengl-driver by adding that
-      # directory to the rpath of all ELF binaries.
-      # Check e.g. with `patchelf --print-rpath path/to/my/binary
-      autoAddDriverRunpath
-      markForCudatoolkitRootHook
-      # To create fat outputs from each component and find a version of `lndir` built for the host platform.
-      lndir
-    ]
-    # autoAddCudaCompatRunpath depends on cuda_compat and would cause
-    # infinite recursion if applied to `cuda_compat` itself (beside the fact
-    # that it doesn't make sense in the first place)
-    ++ lib.optionals (pname != "cuda_compat" && flags.isJetsonBuild) [
-      # autoAddCudaCompatRunpath must appear AFTER autoAddDriverRunpath.
-      # See its documentation in ./setup-hooks/extension.nix.
-      # NOTE(@connorbaker): Because autoAddCudaCompatRunpath is in nativeBuildInputs, it tries to use toolchains
-      # from buildPlatform, but that's not what we want. We want to use our host/target toolchains!
-      # To overcome this, we access the `__spliced` attribute and choose the `hostTarget` attribute.
-      # In the case the `__spliced` attribute doesn't exist, we just use the hook directly (because we're not
-      # cross-compiling).
-      autoAddCudaCompatRunpath.__spliced.hostTarget or autoAddCudaCompatRunpath
-    ];
+    nativeBuildInputs =
+      [
+        # To create fat outputs from each component and find a version of `lndir` built for the host platform.
+        lndir
+      ]
+      ++ [
+        # Patchelf is used to fix the rpath of the binaries.
+        autoPatchelfHook
+        # (autoPatchelfHook.__spliced.buildHost or autoPatchelfHook)
+
+        # This hook will make sure libcuda can be found in typically
+        # /lib/opengl-driver by adding that directory to the rpath of all ELF
+        # binaries. Check e.g. with `patchelf --print-rpath path/to/my/binary
+        autoAddDriverRunpath
+        # (autoAddDriverRunpath.__spliced.buildHost or autoAddDriverRunpath)
+
+        # Mark the CUDA toolkit root directory for the CUDA compatibility libraries
+        markForCudatoolkitRootHook
+        # (markForCudatoolkitRootHook.__spliced.buildHost or markForCudatoolkitRootHook)
+      ]
+      # autoAddCudaCompatRunpath depends on cuda_compat and would cause
+      # infinite recursion if applied to `cuda_compat` itself (beside the fact
+      # that it doesn't make sense in the first place)
+      ++ lib.optionals (pname != "cuda_compat" && flags.isJetsonBuild) [
+        # autoAddCudaCompatRunpath must appear AFTER autoAddDriverRunpath.
+        # See its documentation in ./setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix.
+        # NOTE(@connorbaker): Because autoAddCudaCompatRunpath is in nativeBuildInputs, it tries to use cuda_compat
+        # from buildPackages, but we need to use the one from targetPackages.
+        # We can either use autoAddCudaCompatRunpath.__spliced.hostTarget or move it to buildInputs.
+        (autoAddCudaCompatRunpath.__spliced.hostTarget or autoAddCudaCompatRunpath)
+      ];
 
     buildInputs =
       [

From 2b6a5a9107ba628715734cb2dda5a889090a986d Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Mon, 25 Mar 2024 17:35:04 +0000
Subject: [PATCH 22/34] cuda-modules/saxpy: remove CMAKE_VERBOSE_MAKEFILE

---
 pkgs/development/cuda-modules/saxpy/default.nix | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pkgs/development/cuda-modules/saxpy/default.nix b/pkgs/development/cuda-modules/saxpy/default.nix
index be31ced11be77..e8701f0fd2df5 100644
--- a/pkgs/development/cuda-modules/saxpy/default.nix
+++ b/pkgs/development/cuda-modules/saxpy/default.nix
@@ -45,7 +45,6 @@ backendStdenv.mkDerivation {
     ++ lib.optionals (cudaAtLeast "12.0") [cuda_cccl];
 
   cmakeFlags = [
-    (lib.cmakeBool "CMAKE_VERBOSE_MAKEFILE" true)
     (lib.cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
       with flags; lib.concatStringsSep ";" (lib.lists.map dropDot cudaCapabilities)
     ))

From 77ea14bb7e1ef4bb2a001b3387a314d65498981a Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Mon, 25 Mar 2024 17:39:47 +0000
Subject: [PATCH 23/34] cuda-modules/setup-hooks/setup-cuda-hook: factor out cc
 access

---
 .../cuda-modules/setup-hooks/setup-cuda-hook/default.nix  | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/default.nix b/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/default.nix
index 6c5f299d4418c..f36e9339de5ce 100644
--- a/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/default.nix
+++ b/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/default.nix
@@ -1,15 +1,17 @@
 # Currently propagated by cuda_nvcc or cudatoolkit, rather than used directly
 { backendStdenv, makeSetupHook }:
+let
+  inherit (backendStdenv) cc;
+in
 makeSetupHook
   {
     name = "setup-cuda-hook";
-
     substitutions = {
       # Required in addition to ccRoot as otherwise bin/gcc is looked up
       # when building CMakeCUDACompilerId.cu
-      ccFullPath = "${backendStdenv.cc}/bin/${backendStdenv.cc.targetPrefix}c++";
+      ccFullPath = "${cc}/bin/${cc.targetPrefix}c++";
       # Point NVCC at a compatible compiler
-      ccRoot = "${backendStdenv.cc}";
+      ccRoot = "${cc}";
       setupCudaHook = placeholder "out";
     };
   }

From 37d2448db92b108b31d0e081cfc4e2d157d89ec3 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Mon, 25 Mar 2024 17:40:28 +0000
Subject: [PATCH 24/34] 
 cuda-modules/setup-hooks/auto-add-cuda-compat-runpath-hook: collapse body
 like other setup hooks

---
 .../setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix   | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix b/pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix
index f253331fb24b0..8209f02953c3e 100644
--- a/pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix
+++ b/pkgs/development/cuda-modules/setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix
@@ -14,9 +14,7 @@ makeSetupHook
   {
     name = "auto-add-cuda-compat-runpath-hook";
     propagatedBuildInputs = [ autoFixElfFiles ];
-
     substitutions.libcudaPath = lib.optionalString flags.isJetsonBuild "${cuda_compat}/compat";
-
     meta = {
       broken = !flags.isJetsonBuild;
       badPlatforms = lib.optionals (cuda_compat == null) lib.platforms.all;

From de0bb6a996d08beec718e8c54249d4172a9cc71c Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Mon, 25 Mar 2024 18:21:35 +0000
Subject: [PATCH 25/34] cuda-modules/setup-hooks: wip rewrite and set
 NIX_DEBUG=1

---
 .../mark-for-cudatoolkit-root-hook.sh         | 43 +++++++--
 .../setup-cuda-hook/setup-cuda-hook.sh        | 95 +++++++++++--------
 2 files changed, 92 insertions(+), 46 deletions(-)

diff --git a/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh b/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh
index ba04c2e0806af..92785ccf01ac1 100644
--- a/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh
+++ b/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh
@@ -1,14 +1,43 @@
 # shellcheck shell=bash
 
-# Should we mimick cc-wrapper's "hygiene"?
-[[ -z ${strictDeps-} ]] || (( "$hostOffset" < 0 )) || return 0
+guard=Sourcing
+reason=
 
-echo "Sourcing mark-for-cudatoolkit-root-hook" >&2
+export NIX_DEBUG=1
+
+# Only run the hook from nativeBuildInputs.
+# See the table under https://nixos.org/manual/nixpkgs/unstable/#dependency-propagation for information
+# about the different target combinations and their offsets.
+if (( "${hostOffset:?}" != -1 && "${targetOffset:?}" != 0 )); then
+    guard=Skipping
+    reason=" because the hook is not in nativeBuildInputs"
+fi
+
+if (( "${NIX_DEBUG:-0}" >= 1 )); then
+    echo "$guard hostOffset=$hostOffset targetOffset=$targetOffset mark-for-cudatoolkit-root-hook$reason" >&2
+else
+    echo "$guard mark-for-cudatoolkit-root-hook$reason" >&2
+fi
+
+[[ "$guard" = Sourcing ]] || return 0
 
 markForCUDAToolkit_ROOT() {
-    mkdir -p "${prefix}/nix-support"
-    [[ -f "${prefix}/nix-support/include-in-cudatoolkit-root" ]] && return
-    echo "$pname-$output" > "${prefix}/nix-support/include-in-cudatoolkit-root"
-}
+    local fnName=mark-for-cudatoolkit-root-hook::markForCUDAToolkit_ROOT
+    echo "$fnName: Running" >&2
+
+    mkdir -p "${prefix:?}/nix-support"
+    local markerPath="$prefix/nix-support/include-in-cudatoolkit-root"
+    if [[ -f "$markerPath" ]]; then
+        (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: $markerPath exists, skipping" >&2
+        return 0 # explicit status: a bare return would leak the failed debug test when NIX_DEBUG is off
+    fi
 
+    # Always create the file, even if it's empty, since setup-cuda-hook relies on its existence.
+    # However, only populate it if strictDeps is not set.
+    touch "$markerPath"
+    if [[ -z ${strictDeps-} ]]; then
+        (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: populating $markerPath" >&2
+        echo "${pname:?}-${output:?}" > "$markerPath"
+    fi
+}
 fixupOutputHooks+=(markForCUDAToolkit_ROOT)
diff --git a/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh b/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh
index a4a444fcd2417..99e6e100cf332 100644
--- a/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh
+++ b/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh
@@ -1,15 +1,26 @@
 # shellcheck shell=bash
 
-# Only run the hook from nativeBuildInputs
-(( "$hostOffset" == -1 && "$targetOffset" == 0)) || return 0
-
 guard=Sourcing
 reason=
 
-[[ -n ${cudaSetupHookOnce-} ]] && guard=Skipping && reason=" because the hook has been propagated more than once"
+export NIX_DEBUG=1
+
+# Only run the hook from buildInputs: outside executables like cuda_nvcc, most
+# CUDA dependencies are needed at runtime, not build-time.
+# See the table under https://nixos.org/manual/nixpkgs/unstable/#dependency-propagation for information
+# about the different target combinations and their offsets.
+if (( "${hostOffset:?}" != -1 && "${targetOffset:?}" != 0 )); then
+    guard=Skipping
+    reason=" because the hook is not in nativeBuildInputs"
+fi
 
-if (( "${NIX_DEBUG:-0}" >= 1 )) ; then
-    echo "$guard hostOffset=$hostOffset targetOffset=$targetOffset setupCudaHook$reason" >&2
+if [[ -n ${cudaSetupHookOnce-} ]]; then
+    guard=Skipping
+    reason=" because the hook has been propagated more than once"
+fi
+
+if (( "${NIX_DEBUG:-0}" >= 1 )); then
+    echo "$guard hostOffset=$hostOffset targetOffset=$targetOffset setup-cuda-hook$reason" >&2
 else
     echo "$guard setup-cuda-hook$reason" >&2
 fi
@@ -20,13 +31,22 @@ declare -g cudaSetupHookOnce=1
 declare -Ag cudaHostPathsSeen=()
 declare -Ag cudaOutputToPath=()
 
-extendcudaHostPathsSeen() {
-    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "extendcudaHostPathsSeen $1" >&2
+extendCudaHostPathsSeen() {
+    local fnName=setup-cuda-hook::extendCudaHostPathsSeen
+    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: $1" >&2
 
     local markerPath="$1/nix-support/include-in-cudatoolkit-root"
-    [[ ! -f "${markerPath}" ]] && return
-    [[ -v cudaHostPathsSeen[$1] ]] && return
+    if [[ ! -f "$markerPath" ]]; then
+        (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: skipping since $markerPath does not exist" >&2
+        return 0 # explicit status: a bare return would leak the failed debug test when NIX_DEBUG is off
+    fi
 
+    if [[ -v cudaHostPathsSeen[$1] ]]; then
+        (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: skipping since $1 has already been seen" >&2
+        return 0 # explicit status: a bare return would leak the failed debug test when NIX_DEBUG is off
+    fi
+
+    # Add the path to the list of CUDA host paths.
     cudaHostPathsSeen["$1"]=1
 
     # E.g. cuda_cudart-lib
@@ -36,31 +56,32 @@ extendcudaHostPathsSeen() {
     [[ -z "$cudaOutputName" ]] && return
 
     local oldPath="${cudaOutputToPath[$cudaOutputName]-}"
-    [[ -n "$oldPath" ]] && echo "extendcudaHostPathsSeen: warning: overwriting $cudaOutputName from $oldPath to $1" >&2
+    [[ -n "$oldPath" ]] && echo "$fnName: warning: overwriting $cudaOutputName from $oldPath to $1" >&2
     cudaOutputToPath["$cudaOutputName"]="$1"
 }
-addEnvHooks "$targetOffset" extendcudaHostPathsSeen
+addEnvHooks "$targetOffset" extendCudaHostPathsSeen
 
 setupCUDAToolkit_ROOT() {
-    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "setupCUDAToolkit_ROOT: cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
+    local fnName=setup-cuda-hook::setupCUDAToolkit_ROOT
+    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
 
-    for path in "${!cudaHostPathsSeen[@]}" ; do
+    for path in "${!cudaHostPathsSeen[@]}"; do
         addToSearchPathWithCustomDelimiter ";" CUDAToolkit_ROOT "$path"
-        if [[ -d "$path/include" ]] ; then
-            addToSearchPathWithCustomDelimiter ";" CUDAToolkit_INCLUDE_DIR "$path/include"
-        fi
+        [[ -d "$path/include" ]] && addToSearchPathWithCustomDelimiter ";" CUDAToolkit_INCLUDE_DIR "$path/include"
     done
 
-    export cmakeFlags+=" -DCUDAToolkit_INCLUDE_DIR=$CUDAToolkit_INCLUDE_DIR -DCUDAToolkit_ROOT=$CUDAToolkit_ROOT"
+    export cmakeFlagsArray+=(
+        -DCUDAToolkit_INCLUDE_DIR="${CUDAToolkit_INCLUDE_DIR:-}"
+        -DCUDAToolkit_ROOT="${CUDAToolkit_ROOT:-}"
+    )
 }
 preConfigureHooks+=(setupCUDAToolkit_ROOT)
 
 setupCUDAToolkitCompilers() {
-    echo Executing setupCUDAToolkitCompilers >&2
+    local fnName=setup-cuda-hook::setupCUDAToolkitCompilers
+    echo "$fnName: Running" >&2
 
-    if [[ -n "${dontSetupCUDAToolkitCompilers-}" ]] ; then
-        return
-    fi
+    [[ -n "${dontSetupCUDAToolkitCompilers-}" ]] && return
 
     # Point NVCC at a compatible compiler
 
@@ -69,8 +90,10 @@ setupCUDAToolkitCompilers() {
     # https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
     # https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html
 
-    export cmakeFlags+=" -DCUDA_HOST_COMPILER=@ccFullPath@"
-    export cmakeFlags+=" -DCMAKE_CUDA_HOST_COMPILER=@ccFullPath@"
+    export cmakeFlagsArray+=(
+        -DCUDA_HOST_COMPILER="@ccFullPath@"
+        -DCMAKE_CUDA_HOST_COMPILER="@ccFullPath@"
+    )
 
     # For non-CMake projects:
     # We prepend --compiler-bindir to nvcc flags.
@@ -78,26 +101,23 @@ setupCUDAToolkitCompilers() {
     # uses the last --compiler-bindir it gets on the command line.
     # FIXME: this results in "incompatible redefinition" warnings.
     # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
-    if [ -z "${CUDAHOSTCXX-}" ]; then
-      export CUDAHOSTCXX="@ccFullPath@";
-    fi
+    [[ -z "${CUDAHOSTCXX-}" ]] && export CUDAHOSTCXX="@ccFullPath@"
 
     export NVCC_PREPEND_FLAGS+=" --compiler-bindir=@ccRoot@/bin"
 
     # NOTE: We set -Xfatbin=-compress-all, which reduces the size of the compiled
-    #   binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
-    #   the default set of CUDA capabilities we build can regularly cause this to occur (for
-    #   example, with Magma).
+    # binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
+    # the default set of CUDA capabilities we build can regularly cause this to occur (for
+    # example, with Magma).
     #
     # @SomeoneSerge: original comment was made by @ConnorBaker in .../cudatoolkit/common.nix
-    if [[ -z "${dontCompressFatbin-}" ]]; then
-        export NVCC_PREPEND_FLAGS+=" -Xfatbin=-compress-all"
-    fi
+    [[ -n "${dontCompressFatbin-}" ]] || export NVCC_PREPEND_FLAGS+=" -Xfatbin=-compress-all"
 }
 preConfigureHooks+=(setupCUDAToolkitCompilers)
 
 propagateCudaLibraries() {
-    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "propagateCudaLibraries: cudaPropagateToOutput=$cudaPropagateToOutput cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
+    local fnName=setup-cuda-hook::propagateCudaLibraries
+    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: cudaPropagateToOutput=${cudaPropagateToOutput-} cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
 
     [[ -z "${cudaPropagateToOutput-}" ]] && return
 
@@ -106,11 +126,8 @@ propagateCudaLibraries() {
     echo "@setupCudaHook@" >> "${!cudaPropagateToOutput}/nix-support/propagated-native-build-inputs"
 
     local propagatedBuildInputs=( "${!cudaHostPathsSeen[@]}" )
-    for output in $(getAllOutputNames) ; do
-        if [[ ! "$output" = "$cudaPropagateToOutput" ]] ; then
-            propagatedBuildInputs+=( "${!output}" )
-        fi
-        break
+    for output in $(getAllOutputNames); do
+        [[ ! "$output" = "$cudaPropagateToOutput" ]] && propagatedBuildInputs+=( "${!output}" ) && break
     done
 
     # One'd expect this should be propagated-host-host-deps, but that doesn't seem to work

From 3297d6fed39fbef2fb63a2861c3900fd28d608b0 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Tue, 26 Mar 2024 13:56:54 +0000
Subject: [PATCH 26/34] cudaPackages.cuda_nvcc: never has a lib output

---
 .../cuda-modules/cuda/overrides.nix           | 28 -------------------
 1 file changed, 28 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index 460874b0e4978..d1f65fdd2a804 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -166,31 +166,6 @@ filterAndCreateOverrides {
       setupCudaHook,
     }:
     prevAttrs: {
-      # Remove once cuda-find-redist-features has a special case for libcuda
-      outputs =
-        # NOTE: The order of build outputs matters as we traverse them when creating
-        # split outputs. The `lib` output cannot come after `static` as it moves all
-        # the static libraries back to the `lib` output.
-        let
-          libOutputIsPresent = builtins.elem "lib" prevAttrs.outputs;
-          staticOutputPos = lib.lists.findFirstIndex (x: x == "static") null prevAttrs.outputs;
-          outputsBeforeStatic = lib.lists.take staticOutputPos prevAttrs.outputs;
-          outputsFromStaticAndLater = lib.lists.drop staticOutputPos prevAttrs.outputs;
-          newOutputs =
-            if libOutputIsPresent then
-              # If the lib output is present, we want to keep it in the same position
-              prevAttrs.outputs
-            else if staticOutputPos == null then
-              # If the static output is not present, location of the lib output
-              # doesn't matter and we can append it
-              prevAttrs.outputs ++ [ "lib" ]
-            else
-              # The lib output is missing and the static output is present.
-              # We need to insert the lib output before the static output.
-              outputsBeforeStatic ++ [ "lib" ] ++ outputsFromStaticAndLater;
-        in
-        newOutputs;
-
       # Patch the nvcc.profile.
       # Syntax:
       # - `=` for assignment,
@@ -219,9 +194,6 @@ filterAndCreateOverrides {
         + ''
           echo "Running the cuda_nvcc postPatch"
           substituteInPlace bin/nvcc.profile \
-            --replace-fail \
-              '$(TOP)/lib' \
-              "''${!outputLib}/lib" \
             --replace-fail \
               '$(TOP)/$(_NVVM_BRANCH_)' \
               "''${!outputBin}/nvvm" \

From 1ac762130a12a366b7638e594d79ba9bf832a2fd Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Tue, 26 Mar 2024 13:57:34 +0000
Subject: [PATCH 27/34] cuda-modules/setup-hooks: wip

---
 .../mark-for-cudatoolkit-root-hook.sh           | 14 ++++++++++----
 .../setup-cuda-hook/setup-cuda-hook.sh          | 17 ++++++++++-------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh b/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh
index 92785ccf01ac1..475e19fb1db87 100644
--- a/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh
+++ b/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh
@@ -3,18 +3,24 @@
 guard=Sourcing
 reason=
 
-export NIX_DEBUG=1
+# export NIX_DEBUG=1
 
 # Only run the hook from nativeBuildInputs.
 # See the table under https://nixos.org/manual/nixpkgs/unstable/#dependency-propagation for information
 # about the different target combinations and their offsets.
-if (( "${hostOffset:?}" != -1 && "${targetOffset:?}" != 0 )); then
+
+# Skip setup hook if we're neither a build-time dep, nor, temporarily, doing a
+# native compile.
+if [[ -n ${strictDeps-} ]]; then
+    guard=Skipping
+    reason=" because strictDeps is set"
+elif (( "${hostOffset:?}" < 0 )); then
     guard=Skipping
-    reason=" because the hook is not in nativeBuildInputs"
+    reason=" because the hook is not in buildInputs"
 fi
 
 if (( "${NIX_DEBUG:-0}" >= 1 )); then
-    echo "$guard hostOffset=$hostOffset targetOffset=$targetOffset mark-for-cudatoolkit-root-hook$reason" >&2
+    echo "$guard hostOffset=$hostOffset targetOffset=${targetOffset:?} mark-for-cudatoolkit-root-hook$reason" >&2
 else
     echo "$guard mark-for-cudatoolkit-root-hook$reason" >&2
 fi
diff --git a/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh b/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh
index 99e6e100cf332..4c42a6dc26b48 100644
--- a/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh
+++ b/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh
@@ -3,24 +3,27 @@
 guard=Sourcing
 reason=
 
-export NIX_DEBUG=1
+# export NIX_DEBUG=1
 
 # Only run the hook from buildInputs: outside executables like cuda_nvcc, most
 # CUDA dependencies are needed at runtime, not build-time.
 # See the table under https://nixos.org/manual/nixpkgs/unstable/#dependency-propagation for information
 # about the different target combinations and their offsets.
-if (( "${hostOffset:?}" != -1 && "${targetOffset:?}" != 0 )); then
+# Skip setup hook if we're neither a build-time dep, nor, temporarily, doing a
+# native compile.
+if [[ -n ${strictDeps-} ]]; then
     guard=Skipping
-    reason=" because the hook is not in nativeBuildInputs"
-fi
-
-if [[ -n ${cudaSetupHookOnce-} ]]; then
+    reason=" because strictDeps is set"
+elif (( "${hostOffset:?}" < 0 )); then
+    guard=Skipping
+    reason=" because the hook is not in buildInputs"
+elif [[ -n ${cudaSetupHookOnce-} ]]; then
     guard=Skipping
     reason=" because the hook has been propagated more than once"
 fi
 
 if (( "${NIX_DEBUG:-0}" >= 1 )); then
-    echo "$guard hostOffset=$hostOffset targetOffset=$targetOffset setup-cuda-hook$reason" >&2
+    echo "$guard hostOffset=$hostOffset targetOffset=${targetOffset:?} setup-cuda-hook$reason" >&2
 else
     echo "$guard setup-cuda-hook$reason" >&2
 fi

From a026c059ffc813cece0a81bcb91184878bbb2ce3 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 27 Mar 2024 03:02:25 +0000
Subject: [PATCH 28/34] cuda-modules/cuda/overrides: cuda_nvcc should not
 include references to cuda_cudart

---
 .../cuda-modules/cuda/overrides.nix           | 69 +++++++++----------
 1 file changed, 32 insertions(+), 37 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index d1f65fdd2a804..f0bcc8cbdd1ed 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -88,8 +88,13 @@ filterAndCreateOverrides {
         ++ lib.lists.optionals (cudaAtLeast "12.0") [ libnvjitlink.lib ];
     };
 
+  # TODO(@connorbaker): cuda_cudart.dev depends on crt/host_config.h, which is from
+  # cuda_nvcc.dev. It would be nice to be able to encode that.
   cuda_cudart =
-    { buildPackages, lib }:
+    { addDriverRunpath, lib }:
+    let
+      inherit (addDriverRunpath.__spliced.buildHost or addDriverRunpath) driverLink;
+    in
     prevAttrs: {
       # Remove once cuda-find-redist-features has a special case for libcuda
       outputs =
@@ -105,7 +110,7 @@ filterAndCreateOverrides {
           while IFS= read -r -d $'\0' path ; do
             sed -i \
               -e "s|^libdir\s*=.*/lib\$|libdir=''${!outputLib}/lib/stubs|" \
-              -e "s|^Libs\s*:\(.*\)\$|Libs: \1 -Wl,-rpath,${buildPackages.addDriverRunpath.driverLink}/lib|" \
+              -e "s|^Libs\s*:\(.*\)\$|Libs: \1 -Wl,-rpath,${driverLink}/lib|" \
               "$path"
           done < <(find -iname 'cuda-*.pc' -print0)
         ''
@@ -165,6 +170,11 @@ filterAndCreateOverrides {
       lib,
       setupCudaHook,
     }:
+    let
+      # CC must come from the host environment, not the target environment because it is
+      # used at build time.
+      inherit (backendStdenv.__spliced.buildHost or backendStdenv) cc;
+    in
     prevAttrs: {
       # Patch the nvcc.profile.
       # Syntax:
@@ -180,16 +190,6 @@ filterAndCreateOverrides {
       # backend-stdenv.nix
 
       postPatch =
-        let
-          # CC must come from the host environment, not the target environment because it is
-          # used at build time.
-          ccBin = lib.getBin (backendStdenv.__spliced.buildHost.cc or backendStdenv.cc);
-          # CUDA runtime libraries must come from the host/target environment because they
-          # are used at runtime, not build time (outside of linking).
-          cudartStatic = (cuda_cudart.__spliced.hostTarget or cuda_cudart).static;
-          cudartLib = lib.getLib (cuda_cudart.__spliced.hostTarget or cuda_cudart);
-          cudartDev = lib.getDev (cuda_cudart.__spliced.hostTarget or cuda_cudart);
-        in
         (prevAttrs.postPatch or "")
         + ''
           echo "Running the cuda_nvcc postPatch"
@@ -204,19 +204,23 @@ filterAndCreateOverrides {
           cat << EOF >> bin/nvcc.profile
 
           # Fix a compatible backend compiler
-          PATH += "${ccBin}/bin":
+          PATH += "${cc}/bin":
 
           # Expose the split-out nvvm
-          LIBRARIES =+ -L"''${!outputBin}/nvvm/lib"
-          INCLUDES =+ -I"''${!outputBin}/nvvm/include"
-
-          # Expose cudart and the libcuda stubs
-          LIBRARIES =+ -L"$static/lib" -L"${cudartStatic}/lib" -L"${cudartLib}/lib" -L"${cudartLib}/lib/stubs"
-          INCLUDES =+ -I"${cudartDev}/include"
+          LIBRARIES =+ "-L''${!outputBin}/nvvm/lib"
+          INCLUDES =+ "-I''${!outputBin}/nvvm/include"
           EOF
         '';
 
-      propagatedNativeBuildInputs = (prevAttrs.propagatedNativeBuildInputs or [ ]) ++ [ setupCudaHook ];
+      propagatedNativeBuildInputs = (prevAttrs.propagatedNativeBuildInputs or [ ]) ++ [ cc ];
+
+      # NOTE(@connorbaker):
+      # Though it might seem odd or counter-intuitive to add the setup hook to `propagatedBuildInputs` instead of
+      # `propagatedNativeBuildInputs`, it is necessary! If you move the setup hook from `propagatedBuildInputs` to
+      # `propagatedNativeBuildInputs`, it stops being propagated to downstream packages during their build because
+      # setup hooks in `propagatedNativeBuildInputs` are not designed to affect the runtime or build environment of
+      # dependencies; they are only meant to affect the build environment of the package that directly includes them.
+      propagatedBuildInputs = (prevAttrs.propagatedBuildInputs or [ ]) ++ [ setupCudaHook ];
 
       postInstall =
         (prevAttrs.postInstall or "")
@@ -261,23 +265,14 @@ filterAndCreateOverrides {
       qt5 ? null,
       qt6 ? null,
     }:
-    prevAttrs: {
-      nativeBuildInputs =
-        prevAttrs.nativeBuildInputs
-        ++ (
-          if (lib.strings.versionOlder prevAttrs.version "2022.2.0") then
-            [ qt5.wrapQtAppsHook ]
-          else
-            [ qt6.wrapQtAppsHook ]
-        );
-      buildInputs =
-        prevAttrs.buildInputs
-        ++ (
-          if (lib.strings.versionOlder prevAttrs.version "2022.2.0") then
-            [ qt5.qtwebview ]
-          else
-            [ qt6.qtwebview ]
-        );
+    prevAttrs:
+    let
+      qt = if lib.strings.versionOlder prevAttrs.version "2022.2.0" then qt5 else qt6;
+      inherit (qt) wrapQtAppsHook qtwebview;
+    in
+    {
+      nativeBuildInputs = prevAttrs.nativeBuildInputs ++ [ wrapQtAppsHook ];
+      buildInputs = prevAttrs.buildInputs ++ [ qtwebview ];
     };
 
   nsight_systems =

From b340c3ff8811e2a11f596a5343a3d30160196559 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 27 Mar 2024 03:22:08 +0000
Subject: [PATCH 29/34] cudaPackages.nccl: remove unneeded makeFlags, specify
 splicing, and enable structuredAttrs

---
 .../development/cuda-modules/nccl/default.nix | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/pkgs/development/cuda-modules/nccl/default.nix b/pkgs/development/cuda-modules/nccl/default.nix
index e3d10b79386f9..f7a3f78b9bd15 100644
--- a/pkgs/development/cuda-modules/nccl/default.nix
+++ b/pkgs/development/cuda-modules/nccl/default.nix
@@ -35,6 +35,7 @@ backendStdenv.mkDerivation (
     };
 
     strictDeps = true;
+    __structuredAttrs = true;
 
     outputs = [
       "out"
@@ -66,22 +67,20 @@ backendStdenv.mkDerivation (
 
     preConfigure = ''
       patchShebangs ./src/device/generate.py
-      makeFlagsArray+=(
-        "NVCC_GENCODE=${lib.concatStringsSep " " cudaFlags.gencode}"
-      )
     '';
 
+    # NOTE(@connorbaker): When referencing packages, make sure to use the spliced version corresponding to
+    # buildPackages instead of pkgs (the default).
     makeFlags =
-      ["PREFIX=$(out)"]
+      [
+        "PREFIX=$(out)"
+        "NVCC_GENCODE=${lib.concatStringsSep " " cudaFlags.gencode}"
+      ]
       ++ lib.optionals (lib.versionOlder cudaVersion "11.4") [
-        "CUDA_HOME=${cudatoolkit}"
-        "CUDA_LIB=${lib.getLib cudatoolkit}/lib"
-        "CUDA_INC=${lib.getDev cudatoolkit}/include"
+        "CUDA_HOME=${cudatoolkit.__spliced.buildHost or cudatoolkit}"
       ]
       ++ lib.optionals (lib.versionAtLeast cudaVersion "11.4") [
-        "CUDA_HOME=${cuda_nvcc}"
-        "CUDA_LIB=${lib.getLib cuda_cudart}/lib"
-        "CUDA_INC=${lib.getDev cuda_cudart}/include"
+        "CUDA_HOME=${cuda_nvcc.__spliced.buildHost or cuda_nvcc}"
       ];
 
     enableParallelBuilding = true;

From 0fa534c40570fb28c0408833f14c5a82626b748c Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 27 Mar 2024 03:26:27 +0000
Subject: [PATCH 30/34] cuda-modules/generic-builders/manifest: cleanup

---
 .../generic-builders/manifest.nix             | 62 +++++++------------
 1 file changed, 22 insertions(+), 40 deletions(-)

diff --git a/pkgs/development/cuda-modules/generic-builders/manifest.nix b/pkgs/development/cuda-modules/generic-builders/manifest.nix
index 591acfcc4c051..64850841d995b 100644
--- a/pkgs/development/cuda-modules/generic-builders/manifest.nix
+++ b/pkgs/development/cuda-modules/generic-builders/manifest.nix
@@ -38,7 +38,7 @@ let
     sourceTypes
     ;
 
-  inherit (backendStdenv) buildPlatform hostPlatform targetPlatform;
+  inherit (backendStdenv) hostPlatform;
 
   # Get the redist architectures for which package provides distributables.
   # These are used by meta.platforms.
@@ -157,39 +157,21 @@ backendStdenv.mkDerivation (
 
     # src :: Optional Derivation
     src =
-      # TODO(@connorbaker): Remove debugging lib.warn and inline this.
-      let
-        src =
-          trivial.mapNullable
-            (
-              { relative_path, sha256, ... }:
-              fetchurl {
-                url = "https://developer.download.nvidia.com/compute/${redistName}/redist/${relative_path}";
-                inherit sha256;
-              }
-            )
-            (redistribRelease.${hostPlatformRedistArch} or null);
-      in
-      lib.warn
-      ''
-        Info:
-        - redistName: ${redistName}
-        - hostPlatformRedistArch: ${hostPlatformRedistArch}
-        - pname: ${finalAttrs.pname}
-        - version: ${finalAttrs.version}
-        - outputs: ${builtins.toJSON finalAttrs.outputs}
-        - brokenConditions: ${builtins.toJSON finalAttrs.brokenConditions}
-        - badPlatformsConditions: ${builtins.toJSON finalAttrs.badPlatformsConditions}
-        - buildPlatform: ${buildPlatform.system}
-        - hostPlatform: ${hostPlatform.system}
-        - targetPlatform: ${targetPlatform.system}
-      ''
-      src;
+      trivial.mapNullable
+        (
+          { relative_path, sha256, ... }:
+          fetchurl {
+            url = "https://developer.download.nvidia.com/compute/${redistName}/redist/${relative_path}";
+            inherit sha256;
+          }
+        )
+        (redistribRelease.${hostPlatformRedistArch} or null);
 
     # Handle the pkg-config files:
     # 1. No FHS
     # 2. Location expected by the pkg-config wrapper
     # 3. Generate unversioned names too
+    # TODO(@connorbaker): Not all packages have a lib or dev output, so we should check for their existence.
     postPatch = ''
       for path in pkg-config pkgconfig ; do
         [[ -d "$path" ]] || continue
@@ -236,17 +218,6 @@ backendStdenv.mkDerivation (
         # Mark the CUDA toolkit root directory for the CUDA compatibility libraries
         markForCudatoolkitRootHook
         # (markForCudatoolkitRootHook.__spliced.buildHost or markForCudatoolkitRootHook)
-      ]
-      # autoAddCudaCompatRunpath depends on cuda_compat and would cause
-      # infinite recursion if applied to `cuda_compat` itself (beside the fact
-      # that it doesn't make sense in the first place)
-      ++ lib.optionals (pname != "cuda_compat" && flags.isJetsonBuild) [
-        # autoAddCudaCompatRunpath must appear AFTER autoAddDriverRunpath.
-        # See its documentation in ./setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix.
-        # NOTE(@connorbaker): Because autoAddCudaCompatRunpath is in nativeBuildInputs, it tries to use cuda_compat
-        # from buildPackages, but we need to use the one from targetPackages.
-        # We can either use autoAddCudaCompatRunpath.__spliced.hostTarget or move it to buildInputs.
-        (autoAddCudaCompatRunpath.__spliced.hostTarget or autoAddCudaCompatRunpath)
       ];
 
     buildInputs =
@@ -256,6 +227,17 @@ backendStdenv.mkDerivation (
         # nvcc forces us to use an older gcc
         # NB: We don't actually know if this is the right thing to do
         stdenv.cc.cc.lib
+      ]
+      # autoAddCudaCompatRunpath depends on cuda_compat and would cause
+      # infinite recursion if applied to `cuda_compat` itself (beside the fact
+      # that it doesn't make sense in the first place)
+      ++ lib.optionals (pname != "cuda_compat" && flags.isJetsonBuild) [
+        # autoAddCudaCompatRunpath must appear AFTER autoAddDriverRunpath.
+        # See its documentation in ./setup-hooks/auto-add-cuda-compat-runpath-hook/default.nix.
+        # NOTE(@connorbaker): If autoAddCudaCompatRunpath is in nativeBuildInputs, it tries to use cuda_compat
+        # from buildPackages, but we need to use the one from pkgs (pkgsHostTarget).
+        # We can either use autoAddCudaCompatRunpath.__spliced.hostTarget or move it to buildInputs.
+        autoAddCudaCompatRunpath
       ];
 
     # Picked up by autoPatchelf

From 1f077ac1323bad24214aca56ecf1ec72a27aaad6 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 27 Mar 2024 03:26:57 +0000
Subject: [PATCH 31/34] 
 cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook: rewrite

---
 .../mark-for-cudatoolkit-root-hook.sh         | 72 +++++++++++--------
 1 file changed, 44 insertions(+), 28 deletions(-)

diff --git a/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh b/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh
index 475e19fb1db87..67c4f5ecf51b0 100644
--- a/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh
+++ b/pkgs/development/cuda-modules/setup-hooks/mark-for-cudatoolkit-root-hook/mark-for-cudatoolkit-root-hook.sh
@@ -1,48 +1,64 @@
 # shellcheck shell=bash
 
-guard=Sourcing
-reason=
+# Guard helper function
+# Returns 0 (success) if the hook should be run, 1 (failure) otherwise.
+# This allows us to use short-circuit evaluation to avoid running the hook when it shouldn't be.
+markForCUDAToolkit_ROOTGuard() {
+    local -i hostOffset=${hostOffset:?}
+    local -i targetOffset=${targetOffset:?}
+    local fnName="mark-for-cudatoolkit-root-hook::markForCUDAToolkit_ROOTGuard hostOffset=$hostOffset targetOffset=$targetOffset"
+    local guard=Skipping
+    local reason="because the hook is not in nativeBuildInputs relative to the package being built"
 
-# export NIX_DEBUG=1
+    # `reason` is initialised above because expanding a declared-but-unset local aborts the shell when `set -u` is
+    # active, which would turn a skip into a failure. This hook only adds a stub file to the nix-support directory
+    # of the package being built, so that setup-cuda-hook can detect it and add the package to the CUDA toolkit
+    # root. Since it is not propagated, it should only ever be included in nativeBuildInputs.
+    if (( hostOffset == -1 && targetOffset == 0)); then
+        guard=Sourcing
+        reason="because the hook is in nativeBuildInputs relative to the package being built"
+    fi
+
+    echo "$fnName: $guard $reason" >&2
 
-# Only run the hook from nativeBuildInputs.
-# See the table under https://nixos.org/manual/nixpkgs/unstable/#dependency-propagation for information
-# about the different target combinations and their offsets.
+    # Recall that test commands return 0 for success and 1 for failure.
+    [[ "$guard" == Sourcing ]]
+    return $?
+}
 
-# Skip setup hook if we're neither a build-time dep, nor, temporarily, doing a
-# native compile.
-if [[ -v ${strictDeps-} ]]; then
-    guard=Skipping
-    reason=" because strictDeps is set"
-elif (( "${hostOffset:?}" < 0 )); then
-    guard=Skipping
-    reason=" because the hook is not in buildInputs"
-fi
+# Guard against calling the hook at the wrong time.
+markForCUDAToolkit_ROOTGuard || return 0
 
-if (( "${NIX_DEBUG:-0}" >= 1 )); then
-    echo "$guard hostOffset=$hostOffset targetOffset=${targetOffset:?} mark-for-cudatoolkit-root-hook$reason" >&2
-else
-    echo "$guard mark-for-cudatoolkit-root-hook$reason" >&2
-fi
+# Make a copy of the current offsets, so that we can use them in information messages; this is necessary because the
+# offsets are not consistently available in the environment during various phases of the build.
+declare -g snapshotHostOffset="${hostOffset:?}"
+declare -g snapshotTargetOffset="${targetOffset:?}"
 
-[[ "$guard" = Sourcing ]] || return 0
+markForCUDAToolkit_ROOTGetFnName() {
+    local qualifiedName="mark-for-cudatoolkit-root-hook::${1:?}"
+    local host="${hostOffset:-$snapshotHostOffset}"
+    local target="${targetOffset:-$snapshotTargetOffset}"
+    printf '%s hostOffset=%s targetOffset=%s\n' "$qualifiedName" "$host" "$target"
+}
 
 markForCUDAToolkit_ROOT() {
-    local fnName=mark-for-cudatoolkit-root-hook::markForCUDAToolkit_ROOT
-    echo "$fnName: Running" >&2
+    # Name function never needs to have return value checked.
+    # shellcheck disable=SC2155
+    local fnName="$(markForCUDAToolkit_ROOTGetFnName markForCUDAToolkit_ROOT)"
+    echo "$fnName: Running on ${prefix:?}" >&2
 
-    mkdir -p "${prefix:?}/nix-support"
     local markerPath="$prefix/nix-support/include-in-cudatoolkit-root"
+    mkdir -p "$(dirname "$markerPath")"
     if [[ -f "$markerPath" ]]; then
-        (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: $markerPath exists, skipping" >&2
-        return
+        (( ${NIX_DEBUG:-0} >= 1 )) && echo "$fnName: $markerPath exists, skipping" >&2
+        return 0
     fi
 
     # Always create the file, even if it's empty, since setup-cuda-hook relies on its existence.
     # However, only populate it if strictDeps is not set.
     touch "$markerPath"
-    if [[ -z ${strictDeps-} ]]; then
-        (( "${NIX_DEBUG:-0}" >= 1 )) || echo "$fnName: populating $markerPath" >&2
+    if [[ -z "${strictDeps-}" ]]; then
+        (( ${NIX_DEBUG:-0} >= 1 )) && echo "$fnName: populating $markerPath" >&2
         echo "${pname:?}-${output:?}" > "$markerPath"
     fi
 }

From 3fc9e9baf4786acc679ff11e4cd0f2377b6c2e96 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 27 Mar 2024 03:28:32 +0000
Subject: [PATCH 32/34] cudaPackages.saxpy: getDev/getLib would not always
 select the desired output

---
 pkgs/development/cuda-modules/saxpy/default.nix | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pkgs/development/cuda-modules/saxpy/default.nix b/pkgs/development/cuda-modules/saxpy/default.nix
index e8701f0fd2df5..158b4cf846cc2 100644
--- a/pkgs/development/cuda-modules/saxpy/default.nix
+++ b/pkgs/development/cuda-modules/saxpy/default.nix
@@ -16,7 +16,6 @@ let
     flags
     libcublas
     ;
-  inherit (lib) getDev getLib getOutput;
 in
 backendStdenv.mkDerivation {
   pname = "saxpy";
@@ -37,10 +36,9 @@ backendStdenv.mkDerivation {
   buildInputs =
     lib.optionals (cudaOlder "11.4") [cudatoolkit]
     ++ lib.optionals (cudaAtLeast "11.4") [
-      (getDev libcublas)
-      (getLib libcublas)
-      (getOutput "static" libcublas)
       cuda_cudart
+      libcublas.dev
+      libcublas.lib
     ]
     ++ lib.optionals (cudaAtLeast "12.0") [cuda_cccl];
 

From 7fced1173fea4a484a68003482d75c66709e82d2 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 27 Mar 2024 03:37:38 +0000
Subject: [PATCH 33/34] cuda-modules/setup-hooks/setup-cuda-hook: rewrite

---
 .../setup-cuda-hook/setup-cuda-hook.sh        | 123 +++++++++++-------
 1 file changed, 76 insertions(+), 47 deletions(-)

diff --git a/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh b/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh
index 4c42a6dc26b48..694a4b6cf0ef9 100644
--- a/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh
+++ b/pkgs/development/cuda-modules/setup-hooks/setup-cuda-hook/setup-cuda-hook.sh
@@ -1,72 +1,97 @@
 # shellcheck shell=bash
 
-guard=Sourcing
-reason=
-
-# export NIX_DEBUG=1
-
-# Only run the hook from buildInputs: outside executables like cuda_nvcc, most
-# CUDA dependencies are needed at runtime, not build-time.
-# See the table under https://nixos.org/manual/nixpkgs/unstable/#dependency-propagation for information
-# about the different target combinations and their offsets.
-# Skip setup hook if we're neither a build-time dep, nor, temporarily, doing a
-# native compile.
-if [[ -v ${strictDeps-} ]]; then
-    guard=Skipping
-    reason=" because strictDeps is set"
-elif (( "${hostOffset:?}" < 0 )); then
-    guard=Skipping
-    reason=" because the hook is not in buildInputs"
-elif [[ -n ${cudaSetupHookOnce-} ]]; then
-    guard=Skipping
-    reason=" because the hook has been propagated more than once"
-fi
-
-if (( "${NIX_DEBUG:-0}" >= 1 )); then
-    echo "$guard hostOffset=$hostOffset targetOffset=${targetOffset:?} setup-cuda-hook$reason" >&2
-else
-    echo "$guard setup-cuda-hook$reason" >&2
-fi
-
-[[ "$guard" = Sourcing ]] || return 0
+# Guard helper function
+# Returns 0 (success) if the hook should be run, 1 (failure) otherwise.
+# This allows us to use short-circuit evaluation to avoid running the hook when it shouldn't be.
+setupCudaHookGuard() {
+    local -i hostOffset=${hostOffset:?}
+    local -i targetOffset=${targetOffset:?}
+    local fnName="setup-cuda-hook::setupCudaHookGuard hostOffset=$hostOffset targetOffset=$targetOffset"
+    local guard=Skipping
+    local reason="because the hook is not in nativeBuildInputs relative to the package being built"
+
+    # The rest of this file resets the global bookkeeping arrays and appends to preConfigureHooks, so it must take
+    # effect at most once per build. cudaSetupHookOnce (declared right after this guard passes) is therefore checked
+    # before the offsets; otherwise a second sourcing with matching offsets would be accepted again.
+    # The offsets accepted below are the ones reported for nativeBuildInputs of the package being built.
+    if [[ -n "${cudaSetupHookOnce-}" ]]; then
+        guard=Skipping
+        reason="because the hook has been propagated more than once"
+    elif (( hostOffset == -1 && targetOffset == 0)); then
+        guard=Sourcing
+        reason="because the hook is in nativeBuildInputs relative to the package being built"
+    fi
+
+    echo "$fnName: $guard $reason" >&2
+
+    # Recall that test commands return 0 for success and 1 for failure.
+    [[ "$guard" == Sourcing ]]
+    return $?
+}
+
+# Guard against calling the hook at the wrong time.
+setupCudaHookGuard || return 0
 
 declare -g cudaSetupHookOnce=1
 declare -Ag cudaHostPathsSeen=()
 declare -Ag cudaOutputToPath=()
 
-extendCudaHostPathsSeen() {
-    local fnName=setup-cuda-hook::extendCudaHostPathsSeen
-    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: $1" >&2
+# Make a copy of the current offsets, so that we can use them in information messages; this is necessary because the
+# offsets are not consistently available in the environment during various phases of the build.
+declare -g snapshotHostOffset="${hostOffset:?}"
+declare -g snapshotTargetOffset="${targetOffset:?}"
 
+setupCudaHookGetFnName() {
+    local qualifiedName="setup-cuda-hook::${1:?}"
+    local host="${hostOffset:-$snapshotHostOffset}"
+    local target="${targetOffset:-$snapshotTargetOffset}"
+    printf '%s hostOffset=%s targetOffset=%s\n' "$qualifiedName" "$host" "$target"
+}
+
+extendCudaHostPathsSeen() {
+    # Name function never needs to have return value checked.
+    # shellcheck disable=SC2155
+    local fnName="$(setupCudaHookGetFnName extendCudaHostPathsSeen)"
     local markerPath="$1/nix-support/include-in-cudatoolkit-root"
+    (( ${NIX_DEBUG:-0} >= 1 )) && echo "$fnName: checking for existence of $markerPath" >&2
+
     if [[ ! -f "$markerPath" ]]; then
-        (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: skipping since $markerPath exists" >&2
-        return
+        (( ${NIX_DEBUG:-0} >= 1 )) && echo "$fnName: skipping since $markerPath does not exist" >&2
+        return 0
     fi
 
-    if [[ -v cudaHostPathsSeen[$1] ]]; then
-        (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: skipping since $1 has already been seen" >&2
-        return
+    if [[ -v cudaHostPathsSeen["$1"] ]]; then
+        (( ${NIX_DEBUG:-0} >= 1 )) && echo "$fnName: skipping since $1 has already been seen" >&2
+        return 0
     fi
 
     # Add the path to the list of CUDA host paths.
     cudaHostPathsSeen["$1"]=1
+    (( ${NIX_DEBUG:-0} >= 1 )) && echo "$fnName: added $1 to cudaHostPathsSeen" >&2
+
+    # Only attempt to read the file referenced by markerPath if strictDeps is not set; otherwise it is blank and we
+    # don't need to read it.
+    [[ -n "${strictDeps-}" ]] && return 0
 
     # E.g. cuda_cudart-lib
     local cudaOutputName
-    read -r cudaOutputName < "$markerPath"
+    # Fail gracefully if the file is empty. This may happen if the package was built with strictDeps set,
+    # but the current build does not have strictDeps set.
+    read -r cudaOutputName < "$markerPath" || return 0
 
-    [[ -z "$cudaOutputName" ]] && return
+    [[ -z "$cudaOutputName" ]] && return 0
 
     local oldPath="${cudaOutputToPath[$cudaOutputName]-}"
     [[ -n "$oldPath" ]] && echo "$fnName: warning: overwriting $cudaOutputName from $oldPath to $1" >&2
     cudaOutputToPath["$cudaOutputName"]="$1"
 }
-addEnvHooks "$targetOffset" extendCudaHostPathsSeen
+addEnvHooks "${targetOffset:?}" extendCudaHostPathsSeen
 
 setupCUDAToolkit_ROOT() {
-    local fnName=setup-cuda-hook::setupCUDAToolkit_ROOT
-    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
+    # Name function never needs to have return value checked.
+    # shellcheck disable=SC2155
+    local fnName="$(setupCudaHookGetFnName setupCUDAToolkit_ROOT)"
+    (( ${NIX_DEBUG:-0} >= 1 )) && echo "$fnName: cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
 
     for path in "${!cudaHostPathsSeen[@]}"; do
         addToSearchPathWithCustomDelimiter ";" CUDAToolkit_ROOT "$path"
@@ -81,10 +106,12 @@ setupCUDAToolkit_ROOT() {
 preConfigureHooks+=(setupCUDAToolkit_ROOT)
 
 setupCUDAToolkitCompilers() {
-    local fnName=setup-cuda-hook::setupCUDAToolkitCompilers
+    # Name function never needs to have return value checked.
+    # shellcheck disable=SC2155
+    local fnName="$(setupCudaHookGetFnName setupCUDAToolkitCompilers)"
     echo "$fnName: Running" >&2
 
-    [[ -n "${dontSetupCUDAToolkitCompilers-}" ]] && return
+    [[ -n "${dontSetupCUDAToolkitCompilers-}" ]] && return 0
 
     # Point NVCC at a compatible compiler
 
@@ -119,10 +146,12 @@ setupCUDAToolkitCompilers() {
 preConfigureHooks+=(setupCUDAToolkitCompilers)
 
 propagateCudaLibraries() {
-    local fnName=setup-cuda-hook::propagateCudaLibraries
-    (( "${NIX_DEBUG:-0}" >= 1 )) && echo "$fnName: cudaPropagateToOutput=$cudaPropagateToOutput cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
+    # Name function never needs to have return value checked.
+    # shellcheck disable=SC2155
+    local fnName="$(setupCudaHookGetFnName propagateCudaLibraries)"
+    (( ${NIX_DEBUG:-0} >= 1 )) && echo "$fnName: cudaPropagateToOutput=$cudaPropagateToOutput cudaHostPathsSeen=${!cudaHostPathsSeen[*]}" >&2
 
-    [[ -z "${cudaPropagateToOutput-}" ]] && return
+    [[ -z "${cudaPropagateToOutput-}" ]] && return 0
 
     mkdir -p "${!cudaPropagateToOutput}/nix-support"
     # One'd expect this should be propagated-bulid-build-deps, but that doesn't seem to work

From b43bf06bf052f326636df3f27415b14190461040 Mon Sep 17 00:00:00 2001
From: Connor Baker <connor.baker@tweag.io>
Date: Wed, 27 Mar 2024 19:35:19 +0000
Subject: [PATCH 34/34] cudaPackages.saxpy: attempt manually setting flags for
 cross

---
 .../cuda-modules/cuda/overrides.nix           |   4 +-
 .../cuda-modules/saxpy/default.nix            | 119 ++++++++++++++++--
 2 files changed, 110 insertions(+), 13 deletions(-)

diff --git a/pkgs/development/cuda-modules/cuda/overrides.nix b/pkgs/development/cuda-modules/cuda/overrides.nix
index f0bcc8cbdd1ed..fe902e68f4000 100644
--- a/pkgs/development/cuda-modules/cuda/overrides.nix
+++ b/pkgs/development/cuda-modules/cuda/overrides.nix
@@ -212,7 +212,7 @@ filterAndCreateOverrides {
           EOF
         '';
 
-      propagatedNativeBuildInputs = (prevAttrs.propagatedNativeBuildInputs or [ ]) ++ [ cc ];
+      # propagatedNativeBuildInputs = (prevAttrs.propagatedNativeBuildInputs or [ ]) ++ [ cc ];
 
       # NOTE(@connorbaker):
       # Though it might seem odd or counter-intuitive to add the setup hook to `propagatedBuildInputs` instead of
@@ -220,7 +220,7 @@ filterAndCreateOverrides {
       # `propagatedNativeBuildInputs`, it stops being propagated to downstream packages during their build because
       # setup hooks in `propagatedNativeBuildInputs` are not designed to affect the runtime or build environment of
       # dependencies; they are only meant to affect the build environment of the package that directly includes them.
-      propagatedBuildInputs = (prevAttrs.propagatedBuildInputs or [ ]) ++ [ setupCudaHook ];
+      # propagatedBuildInputs = (prevAttrs.propagatedBuildInputs or [ ]) ++ [ setupCudaHook ];
 
       postInstall =
         (prevAttrs.postInstall or "")
diff --git a/pkgs/development/cuda-modules/saxpy/default.nix b/pkgs/development/cuda-modules/saxpy/default.nix
index 158b4cf846cc2..b457ee34b0586 100644
--- a/pkgs/development/cuda-modules/saxpy/default.nix
+++ b/pkgs/development/cuda-modules/saxpy/default.nix
@@ -30,17 +30,114 @@ backendStdenv.mkDerivation {
       cmake
       autoAddDriverRunpath
     ]
-    ++ lib.optionals (cudaOlder "11.4") [cudatoolkit]
-    ++ lib.optionals (cudaAtLeast "11.4") [cuda_nvcc];
-
-  buildInputs =
-    lib.optionals (cudaOlder "11.4") [cudatoolkit]
-    ++ lib.optionals (cudaAtLeast "11.4") [
-      cuda_cudart
-      libcublas.dev
-      libcublas.lib
-    ]
-    ++ lib.optionals (cudaAtLeast "12.0") [cuda_cccl];
+    ++ lib.optionals (cudaOlder "11.4") [ cudatoolkit ]
+    ++ lib.optionals (cudaAtLeast "11.4") [ cuda_nvcc ];
+
+  # buildInputs =
+  #   lib.optionals (cudaOlder "11.4") [ cudatoolkit ]
+  #   ++ lib.optionals (cudaAtLeast "11.4") [
+  #     cuda_cudart
+  #     libcublas
+  #     # libcublas.dev
+  #     # libcublas.lib
+  #   ]
+  #   ++ lib.optionals (cudaAtLeast "12.0") [ cuda_cccl ];
+
+  # TODO: CMake tells us CUDA_HOST_COMPILER is an unused variable; CMAKE_CUDA_HOST_COMPILER is used and we can set it.
+  # TODO: CMake tells us CUDAToolkit_INCLUDE_DIR is an unused variable; CUDAToolkit_INCLUDE_DIRS is used and we can set it.
+  # TODO: What is the difference between CMAKE_CUDA_COMPILER and CMAKE_CUDA_HOST_COMPILER, or CUDACXX and CUDAHOSTCXX?
+  # TODO: The CUDA compiler source identification process used by CMake requires building and running a test program. This is not possible in a cross-compilation environment. We can use CMAKE_CUDA_FLAGS_INIT to get around it.
+  # TODO: Why aren't any of these correctly configured by the environment?
+  # TODO: See whether CUDAToolkit_INCLUDE_DIR etc is necessary, or just the LIBRARY_PATH and LD_LIBRARY_PATH.
+  # TODO: /nix/store/j2y057vz3i19yh4zjsan1s3q256q15rd-binutils-2.41/bin/ld: /nix/store/gh1azxmwdisz1q92h1hw20w9l72gwza7-libcublas-aarch64-unknown-linux-gnu-12.2.5.6-lib/lib/libcublas.so: error adding symbols: file in wrong format
+  preConfigure =
+    let
+      inherit (backendStdenv.__spliced.buildHost or backendStdenv) cc;
+      ccFullPath = "${cc}/bin/${cc.targetPrefix}c++";
+      ccRoot = "${cc}";
+      nvccBuildHost = cuda_nvcc.__spliced.buildHost or cuda_nvcc;
+      cudartBuildHost = cuda_cudart.__spliced.buildHost or cuda_cudart;
+      # __spliced may be absent (presumably when not cross-compiling), so every lookup falls back to the package itself.
+      cudartHostTarget = cuda_cudart.__spliced.hostTarget or cuda_cudart;
+      ccclHostTarget = cuda_cccl.__spliced.hostTarget or cuda_cccl;
+      libcublasHostTarget = libcublas.__spliced.hostTarget or libcublas;
+    in
+    # Working (until linker error)
+    # export NVCC_PREPEND_FLAGS+=" -I${cudartHostTarget}/include -I${ccclHostTarget}/include -L${cudartHostTarget}/lib -L${ccclHostTarget}/lib"
+    # export LIBRARY_PATH+="${cudartHostTarget}/lib"
+    # export LD_LIBRARY_PATH+="${cudartHostTarget}/lib"
+    # export CPATH="$CUDAToolkit_INCLUDE_DIRS"
+    #
+    # Ripped from setup-cuda-hook::setupCUDAToolkitCompilers, added logging
+    ''
+      # Name function never needs to have return value checked.
+      # shellcheck disable=SC2155
+
+      for path in "${cudartHostTarget}" "${ccclHostTarget}" "${libcublasHostTarget}" "${nvccBuildHost}"; do
+        if [[ -d "$path" ]]; then
+          echo "Adding $path to CUDAToolkit search path"
+          addToSearchPathWithCustomDelimiter ";" CUDAToolkit_ROOT "$path"
+          echo "CUDAToolkit_ROOT is now $CUDAToolkit_ROOT"
+        else
+          echo "Skipping $path as it is not a directory"
+        fi
+
+        if [[ -d "$path/include" ]]; then
+          echo "Adding $path/include to CUDAToolkit search path"
+          addToSearchPathWithCustomDelimiter ";" CUDAToolkit_INCLUDE_DIRS "$path/include"
+          echo "CUDAToolkit_INCLUDE_DIRS is now $CUDAToolkit_INCLUDE_DIRS"
+        else
+          echo "Skipping $path/include as it is not a directory"
+        fi
+      done
+
+      export cmakeFlagsArray+=(
+        -DCUDAToolkit_INCLUDE_DIRS="''${CUDAToolkit_INCLUDE_DIRS:-}"
+        -DCUDAToolkit_ROOT="''${CUDAToolkit_ROOT:-}"
+      )
+    ''
+    # Try to export the include dirs to CPATH, replacing the semicolons with colons
+    + ''
+      export CPATH="''${CUDAToolkit_INCLUDE_DIRS//;/:}"
+      echo "CPATH is now $CPATH"
+    ''
+    # Ripped from setup-cuda-hook::setupCUDAToolkitCompilers
+    + ''
+      # Point NVCC at a compatible compiler
+
+      # For CMake-based projects:
+      # https://cmake.org/cmake/help/latest/module/FindCUDA.html#input-variables
+      # https://cmake.org/cmake/help/latest/envvar/CUDAHOSTCXX.html
+      # https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_HOST_COMPILER.html
+
+      export cmakeFlagsArray+=(
+        -DCMAKE_CUDA_HOST_COMPILER="${ccFullPath}"
+      )
+
+      # For non-CMake projects:
+      # We prepend --compiler-bindir to nvcc flags.
+      # Downstream packages can override these, because NVCC
+      # uses the last --compiler-bindir it gets on the command line.
+      # FIXME: this results in "incompatible redefinition" warnings.
+      # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#compiler-bindir-directory-ccbin
+      export CUDAHOSTCXX="${ccFullPath}"
+
+      export NVCC_PREPEND_FLAGS+=" --compiler-bindir=${ccRoot}/bin"
+
+      # NOTE: We set -Xfatbin=-compress-all, which reduces the size of the compiled
+      # binaries. If binaries grow over 2GB, they will fail to link. This is a problem for us, as
+      # the default set of CUDA capabilities we build can regularly cause this to occur (for
+      # example, with Magma).
+      #
+      # @SomeoneSerge: original comment was made by @ConnorBaker in .../cudatoolkit/common.nix
+      export NVCC_PREPEND_FLAGS+=" -Xfatbin=-compress-all"
+    ''
+    # Try to get around compiler initialization via CMAKE_CUDA_FLAGS_INIT
+    + ''
+      export cmakeFlagsArray+=(
+        -DCMAKE_CUDA_FLAGS_INIT="-L${cudartBuildHost}/lib -I${cudartBuildHost}/include"
+      )
+    '';
 
   cmakeFlags = [
     (lib.cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (