# NOTE: At runtime, FlashInfer will fall back to PyTorch's JIT compilation if a
# requested kernel wasn't pre-compiled in AOT mode, and JIT compilation always
# requires the CUDA toolkit (via nvcc) to be available.
#
# This means that if you plan to use flashinfer, you will need to set the
# environment variable `CUDA_HOME` to `cudatoolkit`.
{
  lib,
  config,
  buildPythonPackage,
  fetchFromGitHub,
  setuptools,
  cudaPackages,
  cmake,
  ninja,
  numpy,
  torch,
}:

let
  pname = "flashinfer";
  version = "0.2.5";

  src_cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    # Pinned to the revision of the submodule in flashinfer's `3rdparty`.
    rev = "df8a550d3917b0e97f416b2ed8c2d786f7f686a3";
    hash = "sha256-d4czDoEv0Focf1bJHOVGX4BDS/h5O7RPoM/RrujhgFQ=";
  };

in
buildPythonPackage {
  inherit pname version;

  src = fetchFromGitHub {
    owner = "flashinfer-ai";
    repo = "flashinfer";
    tag = "v${version}";
    hash = "sha256-YrYfatkI9DQkFEEGiF8CK/bTafaNga4Ufyt+882C0bQ=";
  };

  build-system = [ setuptools ];

  nativeBuildInputs = [
    cmake
    ninja
    (lib.getBin cudaPackages.cuda_nvcc)
  ];
  dontUseCmakeConfigure = true;

  buildInputs = [
    cudaPackages.cuda_cudart
    cudaPackages.libcublas
    cudaPackages.cuda_cccl
    cudaPackages.libcurand
  ];

  postPatch = ''
    rmdir 3rdparty/cutlass
    ln -s ${src_cutlass} 3rdparty/cutlass
  '';

  # FlashInfer offers two installation modes:
  #
  # JIT mode: CUDA kernels are compiled at runtime using PyTorch's JIT, with
  # compiled kernels cached for future use. JIT mode allows fast installation,
  # as no CUDA kernels are pre-compiled, making it ideal for development and
  # testing. The JIT version is also available as an sdist on PyPI.
  #
  # AOT mode: Core CUDA kernels are pre-compiled and included in the library,
  # reducing runtime compilation overhead. If a required kernel is not
  # pre-compiled, it will be compiled at runtime using JIT. AOT mode is
  # recommended for production environments.
  #
  # Here we opt for the AOT version.
  preConfigure = ''
    export FLASHINFER_ENABLE_AOT=1
    export TORCH_NVCC_FLAGS="--maxrregcount=64"
  '';

  TORCH_CUDA_ARCH_LIST = lib.concatStringsSep ";" torch.cudaCapabilities;

  dependencies = [
    numpy
    torch
  ];

  meta = with lib; {
    broken = !torch.cudaSupport || !config.cudaSupport;
    homepage = "https://flashinfer.ai/";
    description = "Library and kernel generator for Large Language Models";
    longDescription = ''
      FlashInfer is a library and kernel generator for Large Language Models
      that provides high-performance implementations of LLM GPU kernels such as
      FlashAttention, PageAttention and LoRA. FlashInfer focuses on LLM serving
      and inference, and delivers state-of-the-art performance across diverse
      scenarios.
    '';
    license = licenses.asl20;
    maintainers = with maintainers; [ breakds ];
  };
}
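# Usage sketch (illustrative only, not part of the package): per the NOTE at the
# top, FlashInfer's runtime JIT fallback needs `CUDA_HOME` to point at the CUDA
# toolkit so PyTorch's JIT can invoke nvcc. A minimal development shell could
# look like the following, assuming `pkgs` is a nixpkgs instance with
# `config.cudaSupport = true` and that this file is exposed as
# `python3Packages.flashinfer`:
#
#   pkgs.mkShell {
#     packages = [ (pkgs.python3.withPackages (ps: [ ps.flashinfer ps.torch ])) ];
#     # Environment variable used to locate the CUDA toolkit (and nvcc) at runtime.
#     CUDA_HOME = "${pkgs.cudaPackages.cudatoolkit}";
#   }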