# NOTE: At runtime, FlashInfer will fall back to PyTorch's JIT compilation if a
# requested kernel wasn't pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to `cudatoolkit`.
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,
  setuptools,
  cudaPackages,
  cmake,
  ninja,
  numpy,
  torch,
}:

let
  pname = "flashinfer";
  version = "0.2.5";

  src_cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    # Using the revision pinned by the submodule in flashinfer's `3rdparty`.
    rev = "df8a550d3917b0e97f416b2ed8c2d786f7f686a3";
    hash = "sha256-d4czDoEv0Focf1bJHOVGX4BDS/h5O7RPoM/RrujhgFQ=";
  };

in
buildPythonPackage {
  format = "setuptools";
  inherit pname version;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    hash = "sha256-YrYfatkI9DQkFEEGiF8CK/bTafaNga4Ufyt+882C0bQ=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    (lib.getBin cudaPackages.cuda_nvcc)
  ];
  dontUseCmakeConfigure = true;

  buildInputs = [
    cudaPackages.cuda_cudart
    cudaPackages.libcublas
    cudaPackages.cuda_cccl
    cudaPackages.libcurand
  ];

  postPatch = ''
    rmdir 3rdparty/cutlass
    ln -s ${src_cutlass} 3rdparty/cutlass
  '';

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch's JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  dependencies = [
    numpy
    torch
  ];

  meta = with lib; {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such
      as FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM
      serving and inference, and delivers state-of-the-art performance across
      diverse scenarios.
    '';
    license = licenses.asl20;
    maintainers = with maintainers; [ breakds ];
  };
}
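
# A minimal sketch (not part of this derivation) of how a downstream consumer
# might satisfy the CUDA_HOME requirement from the NOTE above, e.g. in a dev
# shell. `pkgs`, `pkgs.python3`, `ps.flashinfer` and `pkgs.cudaPackages.cudatoolkit`
# are assumed to be available in the caller's scope:
#
#   pkgs.mkShell {
#     packages = [ (pkgs.python3.withPackages (ps: [ ps.flashinfer ])) ];
#     # FlashInfer's JIT fallback invokes nvcc, so point CUDA_HOME at the toolkit.
#     shellHook = ''
#       export CUDA_HOME=${pkgs.cudaPackages.cudatoolkit}
#     '';
#   }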