# NOTE: At runtime, FlashInfer will fall back to PyTorch’s JIT compilation if a
# requested kernel wasn’t pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to point at a CUDA toolkit such as
# `cudatoolkit`.
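#
# A minimal sketch of exposing the toolkit to FlashInfer's JIT path from a
# downstream dev shell (hypothetical consumer code, not part of this package):
#
#   pkgs.mkShell {
#     packages = [ pkgs.python3Packages.flashinfer pkgs.cudaPackages.cudatoolkit ];
#     CUDA_HOME = "${pkgs.cudaPackages.cudatoolkit}";
#   }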
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,
  setuptools,
  cudaPackages,
  cmake,
  ninja,
  numpy,
  torch,
}:

let
  pname = "flashinfer";
  version = "0.2.5";

  src_cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    # Pin the revision recorded for the `3rdparty/cutlass` submodule in the
    # flashinfer source tree.
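    # (That commit can be read off with `git submodule status 3rdparty/cutlass`
    # in a checkout of the matching flashinfer tag.)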
    rev = "df8a550d3917b0e97f416b2ed8c2d786f7f686a3";
    hash = "sha256-d4czDoEv0Focf1bJHOVGX4BDS/h5O7RPoM/RrujhgFQ=";
  };

in
buildPythonPackage {
  inherit pname version;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    hash = "sha256-YrYfatkI9DQkFEEGiF8CK/bTafaNga4Ufyt+882C0bQ=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    (lib.getBin cudaPackages.cuda_nvcc)
  ];
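  # cmake and ninja appear to be needed only as tools for flashinfer's own
  # build scripts, not for a stdenv-driven CMake build, so keep the cmake
  # setup hook from running its configure phase.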
  dontUseCmakeConfigure = true;

  buildInputs = [
    cudaPackages.cuda_cudart
    cudaPackages.libcublas
    cudaPackages.cuda_cccl
    cudaPackages.libcurand
  ];

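  # `src` is fetched without submodules, so 3rdparty/cutlass is an empty
  # directory; swap it for the CUTLASS source pinned above.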
  postPatch = ''
    rmdir 3rdparty/cutlass
    ln -s ${src_cutlass} 3rdparty/cutlass
  '';

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch’s JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
  '';

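  # A hypothetical override (a sketch, not something upstream documents) for
  # consumers who prefer the lighter JIT-only build, where kernels are
  # compiled lazily at runtime instead of ahead of time:
  #
  #   flashinfer.overridePythonAttrs (old: { preConfigure = ""; })
  #
  # Below, kernels are pre-compiled only for the GPU architectures that the
  # given torch was built for, e.g. "8.0;8.6;9.0".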
  TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  dependencies = [
    numpy
    torch
  ];

  meta = with lib; {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = licenses.asl20;
    maintainers = with maintainers; [ breakds ];
  };
}