# NOTE: At runtime, FlashInfer will fall back to PyTorch’s JIT compilation if a
# requested kernel wasn’t pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to point at a CUDA toolkit installation
# (e.g. `cudaPackages.cudatoolkit`).
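#
# A minimal, hypothetical sketch of a consumer dev shell (assuming the package
# is exposed as `python3Packages.flashinfer`); the exact consumer setup is up
# to the user:
#
#   pkgs.mkShell {
#     packages = [ pkgs.python3Packages.flashinfer ];
#     shellHook = ''
#       export CUDA_HOME=${pkgs.cudaPackages.cudatoolkit}
#     '';
#   }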
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,
  setuptools,
  cudaPackages,
  cmake,
  ninja,
  numpy,
  torch,
}:

let
  pname = "flashinfer";
  version = "0.2.5";

  src_cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    # Pin the revision recorded by the `3rdparty/cutlass` submodule in the
    # flashinfer source tree.
    rev = "df8a550d3917b0e97f416b2ed8c2d786f7f686a3";
    hash = "sha256-d4czDoEv0Focf1bJHOVGX4BDS/h5O7RPoM/RrujhgFQ=";
  };

in
buildPythonPackage {
  format = "setuptools";
  inherit pname version;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    hash = "sha256-YrYfatkI9DQkFEEGiF8CK/bTafaNga4Ufyt+882C0bQ=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    (lib.getBin cudaPackages.cuda_nvcc)
  ];
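  # The Python build (setup.py) drives the native compilation itself, so skip
  # the generic cmake configure hook that having cmake in nativeBuildInputs
  # would otherwise enable.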
  dontUseCmakeConfigure = true;

  buildInputs = [
    cudaPackages.cuda_cudart
    cudaPackages.libcublas
    cudaPackages.cuda_cccl
    cudaPackages.libcurand
  ];
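
  # Upstream tracks cutlass as a git submodule, which fetchFromGitHub does not
  # pull in here, so the separately pinned source is linked into place.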
  postPatch = ''
    rmdir 3rdparty/cutlass
    ln -s ${src_cutlass} 3rdparty/cutlass
  '';

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch’s JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also published as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';
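
  # Restrict the pre-compiled kernels to the CUDA capabilities supported by
  # this torch build.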
  TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  dependencies = [
    numpy
    torch
  ];

  meta = with lib; {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = licenses.asl20;
    maintainers = with maintainers; [ breakds ];
  };
}