python312Packages.vllm: 0.8.3 -> 0.9.0.1 (#414949)

authored by Gaétan Lepage, committed by GitHub (commits 5ad453fb, 4ce0d860)

3 files changed: +76 -28
pkgs/development/python-modules/bitsandbytes/default.nix (+5 -5)
···

 let
   pname = "bitsandbytes";
-  version = "0.45.1";
+  version = "0.46.0";

   inherit (torch) cudaPackages cudaSupport;
   inherit (cudaPackages) cudaMajorMinorVersion;
···
   pyproject = true;

   src = fetchFromGitHub {
-    owner = "TimDettmers";
+    owner = "bitsandbytes-foundation";
     repo = "bitsandbytes";
     tag = version;
-    hash = "sha256-MZ+3mUXaAhRb+rBtE+eQqT3XdtFxlWJc/CmTEwQkKSA=";
+    hash = "sha256-q1ltNYO5Ex6F2bfCcsekdsWjzXoal7g4n/LIHVGuj+k=";
   };

   # By default, which library is loaded depends on the result of `torch.cuda.is_available()`.
···

   meta = {
     description = "8-bit CUDA functions for PyTorch";
-    homepage = "https://github.com/TimDettmers/bitsandbytes";
-    changelog = "https://github.com/TimDettmers/bitsandbytes/releases/tag/${version}";
+    homepage = "https://github.com/bitsandbytes-foundation/bitsandbytes";
+    changelog = "https://github.com/bitsandbytes-foundation/bitsandbytes/releases/tag/${version}";
     license = lib.licenses.mit;
     maintainers = with lib.maintainers; [ bcdarwin ];
   };
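The bitsandbytes upstream moved from the TimDettmers account to the bitsandbytes-foundation organization, so the owner, homepage and changelog URLs change along with the version and hash. If a downstream project needs to stay on the previous release temporarily, the removed values can be reused in an override; a minimal, untested sketch, where the attribute name and the `pkgs`/`python312Packages` bindings are assumptions and all source values come from the removed lines above:

  # hypothetical override pinning bitsandbytes back to 0.45.1 (untested sketch)
  bitsandbytes-old = python312Packages.bitsandbytes.overridePythonAttrs (old: rec {
    version = "0.45.1";
    src = pkgs.fetchFromGitHub {
      owner = "TimDettmers"; # previous upstream owner
      repo = "bitsandbytes";
      tag = version;
      hash = "sha256-MZ+3mUXaAhRb+rBtE+eQqT3XdtFxlWJc/CmTEwQkKSA="; # previous hash
    };
  });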
pkgs/development/python-modules/vllm/0005-drop-intel-reqs.patch (new file, +12)
+diff --git a/requirements/cpu.txt b/requirements/cpu.txt
+index 121330158..d41918883 100644
+--- a/requirements/cpu.txt
++++ b/requirements/cpu.txt
+@@ -20,7 +20,3 @@ datasets # for benchmark scripts
+
+ # cpu cannot use triton 3.3.0
+ triton==3.2.0; platform_machine == "x86_64"
+-
+-# Intel Extension for PyTorch, only for x86_64 CPUs
+-intel-openmp==2024.2.1; platform_machine == "x86_64"
+-intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
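This new patch trims the tail of vLLM's requirements/cpu.txt so the CPU code path no longer pulls in intel-openmp and intel_extension_for_pytorch. Carrying it as a patch file keeps the change reviewable and makes it fail loudly if the requirements file changes upstream; a roughly equivalent inline alternative, as an untested sketch that the PR did not take (it also leaves the preceding blank line in place), would be a postPatch substitution:

  # hypothetical alternative to the patch file: strip the Intel requirements in postPatch
  # (untested sketch; the committed patch above is the authoritative change)
  postPatch = ''
    sed -i \
      -e '/^# Intel Extension for PyTorch/d' \
      -e '/^intel-openmp==/d' \
      -e '/^intel_extension_for_pytorch==/d' \
      requirements/cpu.txt
  '';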
pkgs/development/python-modules/vllm/default.nix (+59 -23)
···
   stdenv,
   python,
   buildPythonPackage,
-  pythonRelaxDepsHook,
   fetchFromGitHub,
+  fetchpatch,
   symlinkJoin,
   autoAddDriverRunpath,

···
   packaging,
   setuptools,
   setuptools-scm,
-  wheel,

   # dependencies
   which,
···
   python-json-logger,
   python-multipart,
   llvmPackages,
+  opentelemetry-sdk,
+  opentelemetry-api,
+  opentelemetry-exporter-otlp,
+  bitsandbytes,
+  flashinfer,
+
+  # internal dependency - for overriding in overlays
+  vllm-flash-attn ? null,

   cudaSupport ? torch.cudaSupport,
   cudaPackages ? { },
···
   cutlass = fetchFromGitHub {
     owner = "NVIDIA";
     repo = "cutlass";
-    tag = "v3.8.0";
-    hash = "sha256-oIzlbKRdOh6gp6nRZ8udLSqleBFoFtgM7liCBlHZLOk=";
+    tag = "v3.9.2";
+    hash = "sha256-teziPNA9csYvhkG5t2ht8W8x5+1YGGbHm8VKx4JoxgI=";
   };

   flashmla = stdenv.mkDerivation {
···
     '';
   };

-  vllm-flash-attn = stdenv.mkDerivation {
+  vllm-flash-attn' = lib.defaultTo (stdenv.mkDerivation {
     pname = "vllm-flash-attn";
     # https://github.com/vllm-project/flash-attention/blob/${src.rev}/vllm_flash_attn/__init__.py
-    version = "2.7.2.post1";
+    version = "2.7.4.post1";

     # grep for GIT_TAG in the following file
     # https://github.com/vllm-project/vllm/blob/v${version}/cmake/external_projects/vllm_flash_attn.cmake
     src = fetchFromGitHub {
       owner = "vllm-project";
       repo = "flash-attention";
-      rev = "dc9d410b3e2d6534a4c70724c2515f4def670a22";
-      hash = "sha256-ZQ0bOBIb+8IMmya8dmimKQ17KTBplX81IirdnBJpX5M=";
+      rev = "8798f27777fb57f447070301bf33a9f9c607f491";
+      hash = "sha256-UTUvATGN1NU/Bc8qo078q6bEgILLmlrjL7Yk2iAJhg4=";
     };

     dontConfigure = true;

-    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass
-    buildPhase = ''
-      rm -rf csrc/cutlass
-      ln -sf ${cutlass} csrc/cutlass
-    '';
+    # vllm-flash-attn normally relies on `git submodule update` to fetch cutlass and composable_kernel
+    buildPhase =
+      ''
+        rm -rf csrc/cutlass
+        ln -sf ${cutlass} csrc/cutlass
+      ''
+      + lib.optionalString (rocmSupport) ''
+        rm -rf csrc/composable_kernel;
+        ln -sf ${rocmPackages.composable_kernel} csrc/composable_kernel
+      '';

     installPhase = ''
       cp -rva . $out
     '';
-  };
+  }) vllm-flash-attn;

   cpuSupport = !cudaSupport && !rocmSupport;

-  # https://github.com/pytorch/pytorch/blob/v2.6.0/torch/utils/cpp_extension.py#L2046-L2048
+  # https://github.com/pytorch/pytorch/blob/v2.7.0/torch/utils/cpp_extension.py#L2343-L2345
   supportedTorchCudaCapabilities =
     let
       real = [
···
         "9.0"
         "9.0a"
         "10.0"
+        "10.0a"
+        "10.1"
+        "10.1a"
+        "12.0"
+        "12.0a"
       ];
       ptx = lists.map (x: "${x}+PTX") real;
     in
···

 buildPythonPackage rec {
   pname = "vllm";
-  version = "0.8.3";
+  version = "0.9.0.1";
   pyproject = true;

   stdenv = torch.stdenv;
···
     owner = "vllm-project";
     repo = "vllm";
     tag = "v${version}";
-    hash = "sha256-LiEBkVwJTT4WoCTk9pI0ykTjmv1pDMzksmFwVktoxMY=";
+    hash = "sha256-gNe/kdsDQno8Fd6mo29feWmbyC0c2+kljlVxY4v7R9U=";
   };

   patches = [
+    (fetchpatch {
+      name = "remove-unused-opentelemetry-semantic-conventions-ai-dep.patch";
+      url = "https://github.com/vllm-project/vllm/commit/6a5d7e45f52c3a13de43b8b4fa9033e3b342ebd2.patch";
+      hash = "sha256-KYthqu+6XwsYYd80PtfrMMjuRV9+ionccr7EbjE4jJE=";
+    })
     ./0002-setup.py-nix-support-respect-cmakeFlags.patch
     ./0003-propagate-pythonpath.patch
     ./0004-drop-lsmod.patch
+    ./0005-drop-intel-reqs.patch
   ];

   postPatch =
···
         --replace-fail \
           'set(PYTHON_SUPPORTED_VERSIONS' \
           'set(PYTHON_SUPPORTED_VERSIONS "${lib.versions.majorMinor python.version}"'
+
+      # Pass build environment PYTHONPATH to vLLM's Python configuration scripts
+      substituteInPlace CMakeLists.txt \
+        --replace-fail '$PYTHONPATH' '$ENV{PYTHONPATH}'
     ''
     + lib.optionalString (nccl == null) ''
       # On platforms where NCCL is not supported (e.g. Jetson), substitute Gloo (provided by Torch)
···
       xformers
       xgrammar
       numba
+      opentelemetry-sdk
+      opentelemetry-api
+      opentelemetry-exporter-otlp
+      bitsandbytes
     ]
     ++ uvicorn.optional-dependencies.standard
     ++ aioprometheus.optional-dependencies.starlette
     ++ lib.optionals cudaSupport [
       cupy
       pynvml
+      flashinfer
     ];

   dontUseCmakeConfigure = true;
···
     [
       (lib.cmakeFeature "FETCHCONTENT_SOURCE_DIR_CUTLASS" "${lib.getDev cutlass}")
       (lib.cmakeFeature "FLASH_MLA_SRC_DIR" "${lib.getDev flashmla}")
-      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn}")
+      (lib.cmakeFeature "VLLM_FLASH_ATTN_SRC_DIR" "${lib.getDev vllm-flash-attn'}")
     ]
     ++ lib.optionals cudaSupport [
       (lib.cmakeFeature "TORCH_CUDA_ARCH_LIST" "${gpuTargetString}")
···
   pythonImportsCheck = [ "vllm" ];

-  # updates the cutlass fetcher instead
-  passthru.skipBulkUpdate = true;
+  passthru = {
+    # make internal dependency available to overlays
+    vllm-flash-attn = vllm-flash-attn';
+    # updates the cutlass fetcher instead
+    skipBulkUpdate = true;
+  };

-  meta = with lib; {
+  meta = {
     description = "High-throughput and memory-efficient inference and serving engine for LLMs";
     changelog = "https://github.com/vllm-project/vllm/releases/tag/v${version}";
     homepage = "https://github.com/vllm-project/vllm";
-    license = licenses.asl20;
-    maintainers = with maintainers; [
+    license = lib.licenses.asl20;
+    maintainers = with lib.maintainers; [
       happysalada
       lach
     ];
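Besides the version bump, this refactor makes the bundled flash-attention source an overridable input: the new `vllm-flash-attn ? null` argument falls back to the in-file stdenv.mkDerivation via `lib.defaultTo`, and the chosen derivation is re-exported through `passthru.vllm-flash-attn`. An overlay or downstream expression can therefore swap in a different pinned checkout without copying the whole file; a minimal, untested sketch, where the attribute name, `pkgs`, and the revision/hash placeholders are assumptions:

  # hypothetical override: build vLLM against a different vllm-flash-attn checkout
  vllm-custom = python312Packages.vllm.override {
    vllm-flash-attn = python312Packages.vllm.vllm-flash-attn.overrideAttrs (old: {
      src = pkgs.fetchFromGitHub {
        owner = "vllm-project";
        repo = "flash-attention";
        rev = "<pinned revision>"; # placeholder
        hash = "<source hash>";    # placeholder
      };
    });
  };

When the argument is left null, the default derivation defined above is used, so existing consumers are unaffected.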