nixpkgs mirror (for testing) github.com/NixOS/nixpkgs
nix
at flake-libs 438 lines 11 kB view raw
{
  lib,
  stdenv,
  python,
  buildPythonPackage,
  pythonRelaxDepsHook,
  fetchFromGitHub,
  symlinkJoin,
  autoAddDriverRunpath,

  # build system
  cmake,
  jinja2,
  ninja,
  packaging,
  setuptools,
  setuptools-scm,
  wheel,

  # dependencies
  which,
  torch,
  outlines,
  psutil,
  ray,
  pandas,
  pyarrow,
  sentencepiece,
  numpy,
  transformers,
  xformers,
  xgrammar,
  numba,
  fastapi,
  uvicorn,
  pydantic,
  aioprometheus,
  pynvml,
  openai,
  pyzmq,
  tiktoken,
  torchaudio,
  torchvision,
  py-cpuinfo,
  lm-format-enforcer,
  prometheus-fastapi-instrumentator,
  cupy,
  gguf,
  einops,
  importlib-metadata,
  partial-json-parser,
  compressed-tensors,
  mistral-common,
  msgspec,
  numactl,
  tokenizers,
  oneDNN,
  blake3,
  depyf,
  opencv-python-headless,
  cachetools,
  llguidance,
  python-json-logger,
  python-multipart,
  llvmPackages,

  cudaSupport ? torch.cudaSupport,
  cudaPackages ? { },
  rocmSupport ? torch.rocmSupport,
  rocmPackages ? { },
  gpuTargets ? [ ],
}:

let
  inherit (lib)
    lists
    strings
    trivial
    ;

  inherit (cudaPackages) flags;

  # Returns pkg if it is non-null and available on the host platform, else null.
  shouldUsePkg =
    pkg: if pkg != null && lib.meta.availableOn stdenv.hostPlatform pkg then pkg else null;

  # see CMakeLists.txt, grepping for GIT_TAG near cutlass
  # https://github.com/vllm-project/vllm/blob/v${version}/CMakeLists.txt
  cutlass = fetchFromGitHub {
    owner = "NVIDIA";
    repo = "cutlass";
    tag = "v3.8.0";
    hash = "sha256-oIzlbKRdOh6gp6nRZ8udLSqleBFoFtgM7liCBlHZLOk=";
  };

  flashmla = stdenv.mkDerivation {
    pname = "flashmla";
    # https://github.com/vllm-project/FlashMLA/blob/${src.rev}/setup.py
    version = "1.0.0";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/flashmla.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "FlashMLA";
      rev = "575f7724b9762f265bbee5889df9c7d630801845";
      hash = "sha256-8WrKMl0olr0nYV4FRJfwSaJ0F5gWQpssoFMjr9tbHBk=";
    };

    dontConfigure = true;

    # flashmla normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  vllm-flash-attn = stdenv.mkDerivation {
    pname = "vllm-flash-attn";
    # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
    version = "2.7.2.post1";

    # grep for GIT_TAG in the following file
    # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
    src = fetchFromGitHub {
      owner = "vllm-project";
      repo = "flash-attention";
      rev = "dc9d410b3e2d6534a4c70724c2515f4def670a22";
      hash = "sha256-ZQ0bOBIb+8IMmya8dmimKQ17KTBplX81IirdnBJpX5M=";
    };

    dontConfigure = true;

    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
    buildPhase = ''
      rm -rf csrc/cutlass
      ln -sf ${cutlass} csrc/cutlass
    '';

    installPhase = ''
      cp -rva . $out
    '';
  };

  cpuSupport = !cudaSupport && !rocmSupport;

  # https://github.com/pytorch/pytorch/blob/v2.6.0/torch/utils/cpp_extension.py#L2046-L2048
  supportedTorchCudaCapabilities =
    let
      real = [
        "3.5"
        "3.7"
        "5.0"
        "5.2"
        "5.3"
        "6.0"
        "6.1"
        "6.2"
        "7.0"
        "7.2"
        "7.5"
        "8.0"
        "8.6"
        "8.7"
        "8.9"
        "9.0"
        "9.0a"
        "10.0"
      ];
      ptx = lists.map (x: "${x}+PTX") real;
    in
    real ++ ptx;

  # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements
  # of the first list *from* the second list. That means:
  # lists.subtractLists a b = b - a

  # For CUDA
  supportedCudaCapabilities = lists.intersectLists flags.cudaCapabilities supportedTorchCudaCapabilities;
  unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities flags.cudaCapabilities;

  isCudaJetson = cudaSupport && cudaPackages.flags.isJetsonBuild;

  # Abort the evaluation (via trivial.throwIf) when no supported GPU targets remain,
  # listing the requested-but-unsupported targets in the error; otherwise pass the
  # supported list through unchanged.
  gpuArchWarner =
    supported: unsupported:
    trivial.throwIf (supported == [ ]) (
      "No supported GPU targets specified. Requested GPU targets: "
      + strings.concatStringsSep ", " unsupported
    ) supported;

  # Create the gpuTargetString.
  gpuTargetString = strings.concatStringsSep ";" (
    if gpuTargets != [ ] then
      # If gpuTargets is specified, it always takes priority.
      gpuTargets
    else if cudaSupport then
      gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities
    else if rocmSupport then
      rocmPackages.clr.gpuTargets
    else
      throw "No GPU targets specified"
  );

  mergedCudaLibraries = with cudaPackages; [
    cuda_cudart # cuda_runtime.h, -lcudart
    cuda_cccl
    libcusparse # cusparse.h
    libcusolver # cusolverDn.h
    cuda_nvtx
    cuda_nvrtc
    libcublas
  ];

  # Some packages are not available on all platforms
  nccl = shouldUsePkg (cudaPackages.nccl or null);

  getAllOutputs = p: [
    (lib.getBin p)
    (lib.getLib p)
    (lib.getDev p)
  ];

in

buildPythonPackage rec {
  pname = "vllm";
  version = "0.8.3";
  pyproject = true;

  stdenv = torch.stdenv;

  src = fetchFromGitHub {
    owner = "vllm-project";
    repo = "vllm";
    tag = "v${version}";
    hash = "sha256-LiEBkVwJTT4WoCTk9pI0ykTjmv1pDMzksmFwVktoxMY=";
  };

  patches = [
    ./0002-setup.py-nix-support-respect-cmakeFlags.patch
    ./0003-propagate-pythonpath.patch
    ./0004-drop-lsmod.patch
  ];

  postPatch =
    ''
      # pythonRelaxDeps does not cover build-system
      substituteInPlace pyproject.toml \
        --replace-fail "torch ==" "torch >="

      # Ignore the python version check because it hard-codes minor versions and
      # lags behind `ray`'s python interpreter support
      substituteInPlace CMakeLists.txt \
        --replace-fail \
          'set(PYTHON_SUPPORTED_VERSIONS' \
          'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
    ''
    + lib.optionalString (nccl == null) ''
      # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
      substituteInPlace vllm/distributed/parallel_state.py \
        --replace-fail '"nccl"' '"gloo"'
    '';

  nativeBuildInputs =
    [
      which
    ]
    ++ lib.optionals rocmSupport [
      rocmPackages.hipcc
    ]
    ++ lib.optionals cudaSupport [
      cudaPackages.cuda_nvcc
      autoAddDriverRunpath
    ]
    ++ lib.optionals isCudaJetson [
      cudaPackages.autoAddCudaCompatRunpath
    ];

  build-system = [
    cmake
    jinja2
    ninja
    packaging
    setuptools
    setuptools-scm
    torch
  ];

  buildInputs =
    lib.optionals cpuSupport [
      oneDNN
    ]
    ++ lib.optionals (cpuSupport && stdenv.isLinux) [
      numactl
    ]
    ++ lib.optionals cudaSupport (
      mergedCudaLibraries
      ++ (with cudaPackages; [
        nccl
        cudnn
        libcufile
      ])
    )
    ++ lib.optionals rocmSupport (
      with rocmPackages;
      [
        clr
        rocthrust
        rocprim
        hipsparse
        hipblas
      ]
    )
    ++ lib.optionals stdenv.cc.isClang [
      llvmPackages.openmp
    ];

  dependencies =
    [
      aioprometheus
      blake3
      cachetools
      depyf
      fastapi
      llguidance
      lm-format-enforcer
      numpy
      openai
      opencv-python-headless
      outlines
      pandas
      prometheus-fastapi-instrumentator
      psutil
      py-cpuinfo
      pyarrow
      pydantic
      python-json-logger
      python-multipart
      pyzmq
      ray
      sentencepiece
      tiktoken
      tokenizers
      msgspec
      gguf
      einops
      importlib-metadata
      partial-json-parser
      compressed-tensors
      mistral-common
      torch
      torchaudio
      torchvision
      transformers
      uvicorn
      xformers
      xgrammar
      numba
    ]
    ++ uvicorn.optional-dependencies.standard
    ++ aioprometheus.optional-dependencies.starlette
    ++ lib.optionals cudaSupport [
      cupy
      pynvml
    ];

  dontUseCmakeConfigure = true;
  cmakeFlags =
    [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
      (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
    ]
    ++ lib.optionals cudaSupport [
      (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
      (lib.cmakeFeature "CUTLASS_NVCC_ARCHS_ENABLED" "${cudaPackages.flags.cmakeCudaArchitecturesString}")
      (lib.cmakeFeature "CUDA_TOOLKIT_ROOT_DIR" "${symlinkJoin {
        name = "cuda-merged-${cudaPackages.cudaMajorMinorVersion}";
        paths = builtins.concatMap getAllOutputs mergedCudaLibraries;
      }}")
      (lib.cmakeFeature "CAFFE2_USE_CUDNN" "ON")
      (lib.cmakeFeature "CAFFE2_USE_CUFILE" "ON")
      (lib.cmakeFeature "CUTLASS_ENABLE_CUBLAS" "ON")
    ]
    ++ lib.optionals cpuSupport [
      (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_ONEDNN" "${lib.getDev oneDNN}")
    ];

  env =
    lib.optionalAttrs cudaSupport {
      VLLM_TARGET_DEVICE = "cuda";
      CUDA_HOME = "${lib.getDev cudaPackages.cuda_nvcc}";
    }
    // lib.optionalAttrs rocmSupport {
      VLLM_TARGET_DEVICE = "rocm";
      # Otherwise it tries to enumerate host supported ROCM gfx archs, and that is not possible due to sandboxing.
      PYTORCH_ROCM_ARCH = lib.strings.concatStringsSep ";" rocmPackages.clr.gpuTargets;
      ROCM_HOME = "${rocmPackages.clr}";
    }
    // lib.optionalAttrs cpuSupport {
      VLLM_TARGET_DEVICE = "cpu";
    };

  preConfigure = ''
    # See: https://github.com/vllm-project/vllm/blob/v0.7.1/setup.py#L75-L109
    # There's also NVCC_THREADS but Nix/Nixpkgs doesn't really have this concept.
    export MAX_JOBS="$NIX_BUILD_CORES"
  '';

  pythonRelaxDeps = true;

  pythonImportsCheck = [ "vllm" ];

  # updates the cutlass fetcher instead
  passthru.skipBulkUpdate = true;

  meta = with lib; {
    description = "High-throughput and memory-efficient inference and serving engine for LLMs";
    changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
    homepage = "https://github.com/vllm-project/vllm";
    license = licenses.asl20;
    maintainers = with maintainers; [
      happysalada
      lach
    ];
    badPlatforms = [
      # CMake Error at cmake/cpu_extension.cmake:78 (find_isa):
      #   find_isa Function invoked with incorrect arguments for function named:
      #   find_isa
      "x86_64-darwin"
    ];
  };
}