{
  _cuda,
  addDriverRunpath,
  backendStdenv,
  cmake,
  cuda_cudart,
  cuda_nvcc,
  cuda_nvrtc,
  cudaNamePrefix,
  cudnn,
  fetchFromGitHub,
  flags,
  gtest,
  lib,
  libcublas,
  libcurand,
  ninja,
  python3Packages,
  # Options
  pythonSupport ? true,
  enableF16C ? false,
  enableTools ? false,
  # passthru.updateScript
  gitUpdater,
}:
let
  inherit (_cuda.lib) _mkMetaBadPlatforms;
  inherit (lib) licenses maintainers teams;
  inherit (lib.asserts) assertMsg;
  inherit (lib.attrsets) getBin;
  inherit (lib.lists) all optionals;
  inherit (lib.strings)
    cmakeBool
    cmakeFeature
    optionalString
    versionAtLeast
    ;
  inherit (lib.trivial) flip;
in
# TODO: Tests.
assert assertMsg (!enableTools) "enableTools is not yet implemented";
backendStdenv.mkDerivation (finalAttrs: {
  __structuredAttrs = true;
  strictDeps = true;

  # NOTE: Depends on the CUDA package set, so use cudaNamePrefix.
  name = "${cudaNamePrefix}-${finalAttrs.pname}-${finalAttrs.version}";
  pname = "cutlass";
  version = "3.9.2";

  src = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v${finalAttrs.version}";
    hash = "sha256-teziPNA9csYvhkG5t2ht8W8x5+1YGGbHm8VKx4JoxgI=";
  };

  # TODO: As a header-only library, we should make sure we have an `include` directory or similar which is not a
  # superset of the `out` (`bin`) or `dev` outputs (whih is what the multiple-outputs setup hook does by default).
  outputs = [ "out" ] ++ optionals pythonSupport [ "dist" ];

  nativeBuildInputs = [
    cuda_nvcc
    cmake
    ninja
    python3Packages.python # Python is always required
  ]
  ++ optionals pythonSupport (
    with python3Packages;
    [
      build
      pythonOutputDistHook
      setuptools
    ]
  );

  postPatch =
    # Prepend some commands to the CUDA.cmake file so it can find the CUDA libraries using CMake's FindCUDAToolkit
    # module. These target names are used throughout the project; I (@connorbaker) did not choose them.
    ''
      nixLog "patching CUDA.cmake to use FindCUDAToolkit"
      mv ./CUDA.cmake ./_CUDA_Append.cmake
      cat > ./_CUDA_Prepend.cmake <<'EOF'
      find_package(CUDAToolkit REQUIRED)
      foreach(_target cudart cuda_driver nvrtc)
        if (NOT TARGET CUDA::''${_target})
          message(FATAL_ERROR "''${_target} Not Found")
        endif()
        message(STATUS "''${_target} library: ''${CUDA_''${_target}_LIBRARY}")
        add_library(''${_target} ALIAS CUDA::''${_target})
      endforeach()
      EOF
      cat ./_CUDA_Prepend.cmake ./_CUDA_Append.cmake > ./CUDA.cmake
    ''
    # Patch cutlass to use the provided NVCC.
    # '_CUDA_INSTALL_PATH = os.getenv("CUDA_INSTALL_PATH", _cuda_install_path_from_nvcc())' \
    # '_CUDA_INSTALL_PATH = "${getBin cuda_nvcc}"'
    + ''
      nixLog "patching python bindings to make cuda_install_path fail"
      substituteInPlace ./python/cutlass/__init__.py \
        --replace-fail \
          'def cuda_install_path():' \
      '
      def cuda_install_path():
          raise RuntimeException("not supported with Nixpkgs CUDA packaging")
      '
    ''
    # Patch the python bindings to use environment variables set by Nixpkgs.
    # https://github.com/NVIDIA/cutlass/blob/e94e888df3551224738bfa505787b515eae8352f/python/cutlass/backend/compiler.py#L80
    # https://github.com/NVIDIA/cutlass/blob/e94e888df3551224738bfa505787b515eae8352f/python/cutlass/backend/compiler.py#L81
    # https://github.com/NVIDIA/cutlass/blob/e94e888df3551224738bfa505787b515eae8352f/python/cutlass/backend/compiler.py#L317
    # https://github.com/NVIDIA/cutlass/blob/e94e888df3551224738bfa505787b515eae8352f/python/cutlass/backend/compiler.py#L319
    # https://github.com/NVIDIA/cutlass/blob/e94e888df3551224738bfa505787b515eae8352f/python/cutlass/backend/compiler.py#L344
    # https://github.com/NVIDIA/cutlass/blob/e94e888df3551224738bfa505787b515eae8352f/python/cutlass/backend/compiler.py#L360
    + ''
      nixLog "patching python bindings to use environment variables"
      substituteInPlace ./python/cutlass/backend/compiler.py \
        --replace-fail \
          'self.include_paths = include_paths' \
          'self.include_paths = include_paths + [root + "/include" for root in os.getenv("CUDAToolkit_ROOT").split(";")]' \
        --replace-fail \
          'self.flags = flags' \
          'self.flags = flags + ["-L" + root + "/lib" for root in os.getenv("CUDAToolkit_ROOT").split(";")]' \
        --replace-fail \
          "\''${cuda_install_path}/bin/nvcc" \
          '${getBin cuda_nvcc}/bin/nvcc' \
        --replace-fail \
          '"cuda_install_path": cuda_install_path(),' \
          "" \
        --replace-fail \
          'f"{cuda_install_path()}/bin/nvcc"' \
          '"${getBin cuda_nvcc}/bin/nvcc"' \
        --replace-fail \
          'cuda_install_path() + "/include",' \
          ""
    '';

  enableParallelBuilding = true;

  buildInputs = [
    cuda_cudart
    cuda_nvrtc
    libcurand
  ]
  ++ optionals enableTools [
    cudnn
    libcublas
  ];

  cmakeFlags = [
    (cmakeFeature "CUTLASS_NVCC_ARCHS" flags.cmakeCudaArchitecturesString)
    (cmakeBool "CUTLASS_ENABLE_EXAMPLES" false)

    # Tests.
    (cmakeBool "CUTLASS_ENABLE_TESTS" finalAttrs.doCheck)
    (cmakeBool "CUTLASS_ENABLE_GTEST_UNIT_TESTS" finalAttrs.doCheck)
    (cmakeBool "CUTLASS_USE_SYSTEM_GOOGLETEST" true)

    # NOTE: Both CUDNN and CUBLAS can be used by the examples and the profiler. Since they are large dependencies, they
    #       are disabled by default.
    (cmakeBool "CUTLASS_ENABLE_TOOLS" enableTools)
    (cmakeBool "CUTLASS_ENABLE_CUBLAS" enableTools)
    (cmakeBool "CUTLASS_ENABLE_CUDNN" enableTools)

    # NOTE: Requires x86_64 and hardware support.
    (cmakeBool "CUTLASS_ENABLE_F16C" enableF16C)

    # TODO: Unity builds are supposed to reduce build time, but this seems to just reduce the number of tasks
    # generated?
    # NOTE: Good explanation of unity builds:
    #       https://www.methodpark.de/blog/how-to-speed-up-clang-tidy-with-unity-builds.
    (cmakeBool "CUTLASS_UNITY_BUILD_ENABLED" false)
  ];

  postBuild = lib.optionalString pythonSupport ''
    pushd "$NIX_BUILD_TOP/$sourceRoot"
    nixLog "building Python wheel"
    pyproject-build \
      --no-isolation \
      --outdir "$NIX_BUILD_TOP/$sourceRoot/''${cmakeBuildDir:?}/dist/" \
      --wheel
    popd >/dev/null
  '';

  doCheck = false;

  checkInputs = [ gtest ];

  # NOTE: Because the test cases immediately create and try to run the binaries, we don't have an opportunity
  # to patch them with autoAddDriverRunpath. To get around this, we add the driver runpath to the environment.
  # TODO: This would break Jetson when using cuda_compat, as it must come first.
  preCheck = optionalString finalAttrs.doCheck ''
    export LD_LIBRARY_PATH="$(readlink -mnv "${addDriverRunpath.driverLink}/lib")"
  '';

  # This is *not* a derivation you want to build on a small machine.
  requiredSystemFeatures = optionals finalAttrs.doCheck [
    "big-parallel"
    "cuda"
  ];

  passthru = {
    updateScript = gitUpdater {
      inherit (finalAttrs) pname version;
      rev-prefix = "v";
    };
    # TODO:
    # tests.test = cutlass.overrideAttrs { doCheck = true; };

    # Include required architectures in compatibility check.
    # https://github.com/NVIDIA/cutlass/tree/main?tab=readme-ov-file#compatibility
    platformAssertions = [
      {
        message = "all capabilities are >= 7.0 (${builtins.toJSON flags.cudaCapabilities})";
        assertion = all (flip versionAtLeast "7.0") flags.cudaCapabilities;
      }
    ];
  };

  meta = {
    description = "CUDA Templates for Linear Algebra Subroutines";
    homepage = "https://github.com/NVIDIA/cutlass";
    license = licenses.asl20;
    platforms = [
      "aarch64-linux"
      "x86_64-linux"
    ];
    badPlatforms = _mkMetaBadPlatforms finalAttrs;
    maintainers = [ maintainers.connorbaker ];
    teams = [ teams.cuda ];
  };
})