# NOTE: At runtime, FlashInfer will fall back to PyTorch’s JIT compilation if a
# requested kernel wasn’t pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to point at a CUDA toolkit such as
# `cudatoolkit`.
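#
# A minimal sketch of exposing the toolkit to FlashInfer's JIT path from a
# downstream dev shell (hypothetical consumer code, not part of this package):
#
#   pkgs.mkShell {
#     packages = [ pkgs.python3Packages.flashinfer pkgs.cudaPackages.cudatoolkit ];
#     CUDA_HOME = "${pkgs.cudaPackages.cudatoolkit}";
#   }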
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,
  setuptools,
  cudaPackages,
  cmake,
  ninja,
  numpy,
  torch,
}:

let
  pname = "flashinfer";
  version = "0.2.5";

  src_cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    # Pin the revision recorded for the `3rdparty/cutlass` submodule in the
    # flashinfer source tree.
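    # (That commit can be read off with `git submodule status 3rdparty/cutlass`
    # in a checkout of the matching flashinfer tag.)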
    rev = "df8a550d3917b0e97f416b2ed8c2d786f7f686a3";
    hash = "sha256-d4czDoEv0Focf1bJHOVGX4BDS/h5O7RPoM/RrujhgFQ=";
  };

in
buildPythonPackage {
  inherit pname version;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    hash = "sha256-YrYfatkI9DQkFEEGiF8CK/bTafaNga4Ufyt+882C0bQ=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    (lib.getBin cudaPackages.cuda_nvcc)
  ];
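  # cmake and ninja appear to be needed only as tools for flashinfer's own
  # build scripts, not for a stdenv-driven CMake build, so keep the cmake
  # setup hook from running its configure phase.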
  dontUseCmakeConfigure = true;

  buildInputs = [
    cudaPackages.cuda_cudart
    cudaPackages.libcublas
    cudaPackages.cuda_cccl
    cudaPackages.libcurand
  ];

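  # `src` is fetched without submodules, so 3rdparty/cutlass is an empty
  # directory; swap it for the CUTLASS source pinned above.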
  postPatch = ''
    rmdir 3rdparty/cutlass
    ln -s ${src_cutlass} 3rdparty/cutlass
  '';

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch’s JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
  '';

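  # A hypothetical override (a sketch, not something upstream documents) for
  # consumers who prefer the lighter JIT-only build, where kernels are
  # compiled lazily at runtime instead of ahead of time:
  #
  #   flashinfer.overridePythonAttrs (old: { preConfigure = ""; })
  #
  # Below, kernels are pre-compiled only for the GPU architectures that the
  # given torch was built for, e.g. "8.0;8.6;9.0".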
  TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  dependencies = [
    numpy
    torch
  ];

  meta = with lib; {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = licenses.asl20;
    maintainers = with maintainers; [ breakds ];
  };
}