{
  _cuda,
  backendStdenv,
  buildPackages,
  cmake,
  cuda_cccl,
  cuda_cudart,
  cuda_nvcc,
  cuda_nvml_dev,
  cuda_nvtx,
  cudaAtLeast,
  cudaMajorMinorVersion,
  cudaNamePrefix,
  fetchFromGitHub,
  flags,
  gdrcopy,
  lib,
  libfabric,
  mpi,
  nccl,
  ninja,
  pmix,
  python3Packages,
  rdma-core,
  ucx,
  # passthru.updateScript
  gitUpdater,
}:
let
  inherit (lib)
    cmakeBool
    cmakeFeature
    getBin
    getDev
    getExe
    getLib
    licenses
    maintainers
    teams
    ;
in
backendStdenv.mkDerivation (finalAttrs: {
  __structuredAttrs = true;
  strictDeps = true;

  # NOTE: Depends on the CUDA package set, so use cudaNamePrefix.
  name = "${cudaNamePrefix}-${finalAttrs.pname}-${finalAttrs.version}";

  pname = "libnvshmem";
  version = "3.4.5-0";

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "nvshmem";
    tag = "v${finalAttrs.version}";
    hash = "sha256-RHZzjDMYlL7vAVP1/UXM/Pt4bhajeWdCi3ihICeD2mc=";
  };

  outputs = [ "out" ];

  nativeBuildInputs = [
    cuda_nvcc
    cmake
    ninja
    # NOTE: mpi is in nativeBuildInputs because it contains compilers and is only discoverable by CMake
    # when it is a nativeBuildInput.
    mpi
    # NOTE: Python is required even if not building nvshmem4py:
    # https://github.com/NVIDIA/nvshmem/blob/131da55f643ac87c810ba0bc51d359258bf433a1/CMakeLists.txt#L173
    python3Packages.python
  ];

  # NOTE: Hardcoded standard versions mean CMake doesn't respect the values we provide, so we need to patch the files.
  postPatch = ''
    for standardName in {CXX,CUDA}_STANDARD; do
      while IFS= read -r cmakeFileToPatch; do
        nixLog "patching $PWD/$cmakeFileToPatch to fix $standardName"
        substituteInPlace "$PWD/$cmakeFileToPatch" \
          --replace-fail \
            "$standardName 11" \
            "$standardName 17"
      done < <(grep --recursive --files-with-matches "$standardName 11")
    done
    unset -v cmakeFileToPatch
    unset -v standardName
  '';

  enableParallelBuilding = true;

  buildInputs = [
    cuda_cccl
    cuda_cudart
    cuda_nvml_dev
    cuda_nvtx
    gdrcopy
    libfabric
    nccl
    pmix
    rdma-core
    ucx
  ];

  # NOTE: This *must* be an environment variable: NVIDIA saw fit to *configure and build CMake projects* while
  # *inside* a CMake build and didn't correctly thread arguments through, so the environment is the only way to
  # pass configuration to the nested builds.
  env.CUDA_HOME = (getBin cuda_nvcc).outPath;

  # https://docs.nvidia.com/nvshmem/release-notes-install-guide/install-guide/nvshmem-install-proc.html#other-distributions
  cmakeFlags = [
    (cmakeFeature "NVSHMEM_PREFIX" (placeholder "out"))
    (cmakeFeature "CUDA_HOME" (getBin cuda_nvcc).outPath)
    (cmakeFeature "CMAKE_CUDA_COMPILER" (getExe cuda_nvcc))
    (cmakeFeature "CMAKE_CUDA_ARCHITECTURES" flags.cmakeCudaArchitecturesString)

    (cmakeBool "NVSHMEM_USE_NCCL" true)
    (cmakeFeature "NCCL_HOME" (getDev nccl).outPath)

    (cmakeBool "NVSHMEM_USE_GDRCOPY" true)
    (cmakeFeature "GDRCOPY_HOME" (getDev gdrcopy).outPath)

    # NOTE: Make sure to use mpi from buildPackages to match the spliced version created through nativeBuildInputs.
    (cmakeBool "NVSHMEM_MPI_SUPPORT" true)
    (cmakeFeature "MPI_HOME" (getLib buildPackages.mpi).outPath)

    # TODO: Doesn't UCX need to be built with some argument when we want to use it with libnvshmem?
    (cmakeBool "NVSHMEM_UCX_SUPPORT" true)
    (cmakeFeature "UCX_HOME" (getDev ucx).outPath)

    (cmakeBool "NVSHMEM_LIBFABRIC_SUPPORT" true)
    (cmakeFeature "LIBFABRIC_HOME" (getDev libfabric).outPath)

    (cmakeBool "NVSHMEM_IBGDA_SUPPORT" true) # NOTE: no corresponding _HOME variable for IBGDA.
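    # As I understand it, IBGDA (InfiniBand GPUDirect Async) lets the GPU issue InfiniBand operations
    # directly rather than going through a CPU proxy thread; it is built from sources vendored in this
    # repository, which is presumably why there is no _HOME variable pointing at an external package.
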
    (cmakeBool "NVSHMEM_PMIX_SUPPORT" true)
    (cmakeFeature "PMIX_HOME" (getDev pmix).outPath)

    (cmakeBool "NVSHMEM_BUILD_TESTS" true)
    (cmakeBool "NVSHMEM_BUILD_EXAMPLES" true)
    (cmakeBool "NVSHMEM_BUILD_DEB_PACKAGE" false)
    (cmakeBool "NVSHMEM_BUILD_RPM_PACKAGE" false)

    # TODO: Looks like a nightmare to package and depends on things we haven't packaged yet:
    # https://github.com/NVIDIA/nvshmem/tree/131da55f643ac87c810ba0bc51d359258bf433a1/nvshmem4py
    (cmakeBool "NVSHMEM_BUILD_PYTHON_LIB" false)

    # NOTE: Unsupported because it requires Clang.
    (cmakeBool "NVSHMEM_BUILD_BITCODE_LIBRARY" false)
  ];

  postInstall = ''
    nixLog "moving top-level files in $out to $out/share"
    mv -v "$out"/{changelog,git_commit.txt,License.txt,version.txt} "$out/share/"
  '';

  doCheck = false;

  passthru = {
    updateScript = gitUpdater {
      inherit (finalAttrs) pname version;
      rev-prefix = "v";
    };
    brokenAssertions = [
      # CUDA pre-11.7 yields macro/type errors in src/include/internal/host_transport/cudawrap.h.
      {
        message = "NVSHMEM does not support CUDA releases earlier than 11.7 (found ${cudaMajorMinorVersion})";
        assertion = cudaAtLeast "11.7";
      }
    ];
  };

  meta = {
    description = "Parallel programming interface for NVIDIA GPUs based on OpenSHMEM";
    homepage = "https://github.com/NVIDIA/nvshmem";
    broken = _cuda.lib._mkMetaBroken finalAttrs;
    # NOTE: There are many licenses:
    # https://github.com/NVIDIA/nvshmem/blob/7dd48c9fd7aa2134264400802881269b7822bd2f/License.txt
    license = licenses.nvidiaCudaRedist;
    platforms = [
      "aarch64-linux"
      "x86_64-linux"
    ];
    maintainers = [ maintainers.connorbaker ];
    teams = [ teams.cuda ];
  };
})
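# A minimal consumption sketch, assuming this derivation is exposed in a CUDA package set as
# `cudaPackages.libnvshmem` (the attribute path is an assumption, not something this file establishes):
#
#   { cudaPackages }:
#   cudaPackages.backendStdenv.mkDerivation {
#     pname = "nvshmem-hello";
#     version = "0.0.1";
#     src = ./.;
#     nativeBuildInputs = [ cudaPackages.cuda_nvcc ];
#     buildInputs = [ cudaPackages.libnvshmem ];
#   }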