# NOTE: At runtime, FlashInfer will fall back to PyTorch’s JIT compilation if a
# requested kernel wasn’t pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to point at a CUDA toolkit installation
# (e.g. `cudaPackages.cudatoolkit`).
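#
# A minimal, hypothetical sketch of a consumer dev shell (assuming the package
# is exposed as `python3Packages.flashinfer`); the exact consumer setup is up
# to the user:
#
#   pkgs.mkShell {
#     packages = [ pkgs.python3Packages.flashinfer ];
#     shellHook = ''
#       export CUDA_HOME=${pkgs.cudaPackages.cudatoolkit}
#     '';
#   }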
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,
  setuptools,
  cudaPackages,
  cmake,
  ninja,
  numpy,
  torch,
}:

let
  pname = "flashinfer";
  version = "0.2.5";

  src_cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    # Pin the revision recorded by the `3rdparty/cutlass` submodule in the
    # flashinfer source tree.
    rev = "df8a550d3917b0e97f416b2ed8c2d786f7f686a3";
    hash = "sha256-d4czDoEv0Focf1bJHOVGX4BDS/h5O7RPoM/RrujhgFQ=";
  };

in
buildPythonPackage {
  format = "setuptools";
  inherit pname version;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    hash = "sha256-YrYfatkI9DQkFEEGiF8CK/bTafaNga4Ufyt+882C0bQ=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    (lib.getBin cudaPackages.cuda_nvcc)
  ];
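  # The Python build (setup.py) drives the native compilation itself, so skip
  # the generic cmake configure hook that having cmake in nativeBuildInputs
  # would otherwise enable.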
  dontUseCmakeConfigure = true;

  buildInputs = [
    cudaPackages.cuda_cudart
    cudaPackages.libcublas
    cudaPackages.cuda_cccl
    cudaPackages.libcurand
  ];
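
  # Upstream tracks cutlass as a git submodule, which fetchFromGitHub does not
  # pull in here, so the separately pinned source is linked into place.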
  postPatch = ''
    rmdir 3rdparty/cutlass
    ln -s ${src_cutlass} 3rdparty/cutlass
  '';

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch’s JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also published as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';
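
  # Restrict the pre-compiled kernels to the CUDA capabilities supported by
  # this torch build.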
  TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  dependencies = [
    numpy
    torch
  ];

  meta = with lib; {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = licenses.asl20;
    maintainers = with maintainers; [ breakds ];
  };
}