Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'close-range-cloexec-unshare-v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux

Pull close_range fix from Christian Brauner:
"syzbot reported a bug when asking close_range() to unshare the file
descriptor table and make all fds close-on-exec.

If CLOSE_RANGE_UNSHARE is specified and the caller's file descriptor
table is currently shared, the caller will receive a private file
descriptor table before the kernel operates on the requested range.

For the case where the caller has requested all file descriptors to be
actually closed via e.g. close_range(3, ~0U, CLOSE_RANGE_UNSHARE) the
kernel knows that the caller does not need any of the file descriptors
anymore and will optimize the close operation by only copying all
files in the range from 0 to 3 and no others.

However, if the caller requested CLOSE_RANGE_CLOEXEC together with
CLOSE_RANGE_UNSHARE the caller wants to still make use of the file
descriptors so the kernel needs to copy all of them and can't
optimize.

The original patch didn't account for this and thus could cause oopses
as evidenced by the syzbot report because it assumed that all fds had
been copied. Fix this by handling the CLOSE_RANGE_CLOEXEC case and
copying all fds if the two flags are specified together.

This should've been caught in the selftests but the original patch
didn't cover this case and I didn't catch it during review. So in
addition to the bugfix I'm also adding selftests. They will reliably
reproduce the bug on a non-fixed kernel and allow us to catch
regressions and verify correct behavior.

Note, the kernel selftest tree contained a bunch of changes that made
the original selftest fail to compile so there are small fixups in
here to make it compile without warnings"

* tag 'close-range-cloexec-unshare-v5.11' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
selftests/core: add regression test for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC
selftests/core: add test for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC
selftests/core: handle missing syscall number for close_range
selftests/core: fix close_range_test build after XFAIL removal
close_range: unshare all fds for CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC

+278 -7
+3 -1
fs/file.c
··· 694 694 * If the requested range is greater than the current maximum, 695 695 * we're closing everything so only copy all file descriptors 696 696 * beneath the lowest file descriptor. 697 + * If the caller requested all fds to be made cloexec copy all 698 + * of the file descriptors since they still want to use them. 697 699 */ 698 - if (max_fd >= cur_max) 700 + if (!(flags & CLOSE_RANGE_CLOEXEC) && (max_fd >= cur_max)) 699 701 max_unshare_fds = fd; 700 702 701 703 ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
+275 -6
tools/testing/selftests/core/close_range_test.c
··· 17 17 #include "../clone3/clone3_selftests.h" 18 18 19 19 #ifndef __NR_close_range 20 - #define __NR_close_range -1 20 + #if defined __alpha__ 21 + #define __NR_close_range 546 22 + #elif defined _MIPS_SIM 23 + #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ 24 + #define __NR_close_range (436 + 4000) 25 + #endif 26 + #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ 27 + #define __NR_close_range (436 + 6000) 28 + #endif 29 + #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ 30 + #define __NR_close_range (436 + 5000) 31 + #endif 32 + #elif defined __ia64__ 33 + #define __NR_close_range (436 + 1024) 34 + #else 35 + #define __NR_close_range 436 36 + #endif 21 37 #endif 22 38 23 39 #ifndef CLOSE_RANGE_UNSHARE ··· 118 102 int i, ret, status; 119 103 pid_t pid; 120 104 int open_fds[101]; 121 - struct clone_args args = { 105 + struct __clone_args args = { 122 106 .flags = CLONE_FILES, 123 107 .exit_signal = SIGCHLD, 124 108 }; ··· 207 191 int i, ret, status; 208 192 pid_t pid; 209 193 int open_fds[101]; 210 - struct clone_args args = { 194 + struct __clone_args args = { 211 195 .flags = CLONE_FILES, 212 196 .exit_signal = SIGCHLD, 213 197 }; ··· 257 241 fd = open("/dev/null", O_RDONLY); 258 242 ASSERT_GE(fd, 0) { 259 243 if (errno == ENOENT) 260 - XFAIL(return, "Skipping test since /dev/null does not exist"); 244 + SKIP(return, "Skipping test since /dev/null does not exist"); 261 245 } 262 246 263 247 open_fds[i] = fd; ··· 266 250 ret = sys_close_range(1000, 1000, CLOSE_RANGE_CLOEXEC); 267 251 if (ret < 0) { 268 252 if (errno == ENOSYS) 269 - XFAIL(return, "close_range() syscall not supported"); 253 + SKIP(return, "close_range() syscall not supported"); 270 254 if (errno == EINVAL) 271 - XFAIL(return, "close_range() doesn't support CLOSE_RANGE_CLOEXEC"); 255 + SKIP(return, "close_range() doesn't support CLOSE_RANGE_CLOEXEC"); 272 256 } 273 257 274 258 /* Ensure the FD_CLOEXEC bit is set also with a resource limit in place. 
*/ ··· 313 297 } 314 298 } 315 299 300 + TEST(close_range_cloexec_unshare) 301 + { 302 + int i, ret; 303 + int open_fds[101]; 304 + struct rlimit rlimit; 305 + 306 + for (i = 0; i < ARRAY_SIZE(open_fds); i++) { 307 + int fd; 308 + 309 + fd = open("/dev/null", O_RDONLY); 310 + ASSERT_GE(fd, 0) { 311 + if (errno == ENOENT) 312 + SKIP(return, "Skipping test since /dev/null does not exist"); 313 + } 314 + 315 + open_fds[i] = fd; 316 + } 317 + 318 + ret = sys_close_range(1000, 1000, CLOSE_RANGE_CLOEXEC); 319 + if (ret < 0) { 320 + if (errno == ENOSYS) 321 + SKIP(return, "close_range() syscall not supported"); 322 + if (errno == EINVAL) 323 + SKIP(return, "close_range() doesn't support CLOSE_RANGE_CLOEXEC"); 324 + } 325 + 326 + /* Ensure the FD_CLOEXEC bit is set also with a resource limit in place. */ 327 + ASSERT_EQ(0, getrlimit(RLIMIT_NOFILE, &rlimit)); 328 + rlimit.rlim_cur = 25; 329 + ASSERT_EQ(0, setrlimit(RLIMIT_NOFILE, &rlimit)); 330 + 331 + /* Set close-on-exec for two ranges: [0-50] and [75-100]. */ 332 + ret = sys_close_range(open_fds[0], open_fds[50], 333 + CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_UNSHARE); 334 + ASSERT_EQ(0, ret); 335 + ret = sys_close_range(open_fds[75], open_fds[100], 336 + CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_UNSHARE); 337 + ASSERT_EQ(0, ret); 338 + 339 + for (i = 0; i <= 50; i++) { 340 + int flags = fcntl(open_fds[i], F_GETFD); 341 + 342 + EXPECT_GT(flags, -1); 343 + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); 344 + } 345 + 346 + for (i = 51; i <= 74; i++) { 347 + int flags = fcntl(open_fds[i], F_GETFD); 348 + 349 + EXPECT_GT(flags, -1); 350 + EXPECT_EQ(flags & FD_CLOEXEC, 0); 351 + } 352 + 353 + for (i = 75; i <= 100; i++) { 354 + int flags = fcntl(open_fds[i], F_GETFD); 355 + 356 + EXPECT_GT(flags, -1); 357 + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); 358 + } 359 + 360 + /* Test a common pattern. 
*/ 361 + ret = sys_close_range(3, UINT_MAX, 362 + CLOSE_RANGE_CLOEXEC | CLOSE_RANGE_UNSHARE); 363 + for (i = 0; i <= 100; i++) { 364 + int flags = fcntl(open_fds[i], F_GETFD); 365 + 366 + EXPECT_GT(flags, -1); 367 + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); 368 + } 369 + } 370 + 371 + /* 372 + * Regression test for syzbot+96cfd2b22b3213646a93@syzkaller.appspotmail.com 373 + */ 374 + TEST(close_range_cloexec_syzbot) 375 + { 376 + int fd1, fd2, fd3, flags, ret, status; 377 + pid_t pid; 378 + struct __clone_args args = { 379 + .flags = CLONE_FILES, 380 + .exit_signal = SIGCHLD, 381 + }; 382 + 383 + /* Create a huge gap in the fd table. */ 384 + fd1 = open("/dev/null", O_RDWR); 385 + EXPECT_GT(fd1, 0); 386 + 387 + fd2 = dup2(fd1, 1000); 388 + EXPECT_GT(fd2, 0); 389 + 390 + pid = sys_clone3(&args, sizeof(args)); 391 + ASSERT_GE(pid, 0); 392 + 393 + if (pid == 0) { 394 + ret = sys_close_range(3, ~0U, CLOSE_RANGE_CLOEXEC); 395 + if (ret) 396 + exit(EXIT_FAILURE); 397 + 398 + /* 399 + * We now have a private file descriptor table and all 400 + * our open fds should still be open but made 401 + * close-on-exec. 402 + */ 403 + flags = fcntl(fd1, F_GETFD); 404 + EXPECT_GT(flags, -1); 405 + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); 406 + 407 + flags = fcntl(fd2, F_GETFD); 408 + EXPECT_GT(flags, -1); 409 + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); 410 + 411 + fd3 = dup2(fd1, 42); 412 + EXPECT_GT(fd3, 0); 413 + 414 + /* 415 + * Duplicating the file descriptor must remove the 416 + * FD_CLOEXEC flag. 417 + */ 418 + flags = fcntl(fd3, F_GETFD); 419 + EXPECT_GT(flags, -1); 420 + EXPECT_EQ(flags & FD_CLOEXEC, 0); 421 + 422 + exit(EXIT_SUCCESS); 423 + } 424 + 425 + EXPECT_EQ(waitpid(pid, &status, 0), pid); 426 + EXPECT_EQ(true, WIFEXITED(status)); 427 + EXPECT_EQ(0, WEXITSTATUS(status)); 428 + 429 + /* 430 + * We had a shared file descriptor table before along with requesting 431 + * close-on-exec so the original fds must not be close-on-exec. 
432 + */ 433 + flags = fcntl(fd1, F_GETFD); 434 + EXPECT_GT(flags, -1); 435 + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); 436 + 437 + flags = fcntl(fd2, F_GETFD); 438 + EXPECT_GT(flags, -1); 439 + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); 440 + 441 + fd3 = dup2(fd1, 42); 442 + EXPECT_GT(fd3, 0); 443 + 444 + flags = fcntl(fd3, F_GETFD); 445 + EXPECT_GT(flags, -1); 446 + EXPECT_EQ(flags & FD_CLOEXEC, 0); 447 + 448 + EXPECT_EQ(close(fd1), 0); 449 + EXPECT_EQ(close(fd2), 0); 450 + EXPECT_EQ(close(fd3), 0); 451 + } 452 + 453 + /* 454 + * Regression test for syzbot+96cfd2b22b3213646a93@syzkaller.appspotmail.com 455 + */ 456 + TEST(close_range_cloexec_unshare_syzbot) 457 + { 458 + int i, fd1, fd2, fd3, flags, ret, status; 459 + pid_t pid; 460 + struct __clone_args args = { 461 + .flags = CLONE_FILES, 462 + .exit_signal = SIGCHLD, 463 + }; 464 + 465 + /* 466 + * Create a huge gap in the fd table. When we now call 467 + * CLOSE_RANGE_UNSHARE with a shared fd table and and with ~0U as upper 468 + * bound the kernel will only copy up to fd1 file descriptors into the 469 + * new fd table. If the kernel is buggy and doesn't handle 470 + * CLOSE_RANGE_CLOEXEC correctly it will not have copied all file 471 + * descriptors and we will oops! 472 + * 473 + * On a buggy kernel this should immediately oops. But let's loop just 474 + * to be sure. 475 + */ 476 + fd1 = open("/dev/null", O_RDWR); 477 + EXPECT_GT(fd1, 0); 478 + 479 + fd2 = dup2(fd1, 1000); 480 + EXPECT_GT(fd2, 0); 481 + 482 + for (i = 0; i < 100; i++) { 483 + 484 + pid = sys_clone3(&args, sizeof(args)); 485 + ASSERT_GE(pid, 0); 486 + 487 + if (pid == 0) { 488 + ret = sys_close_range(3, ~0U, CLOSE_RANGE_UNSHARE | 489 + CLOSE_RANGE_CLOEXEC); 490 + if (ret) 491 + exit(EXIT_FAILURE); 492 + 493 + /* 494 + * We now have a private file descriptor table and all 495 + * our open fds should still be open but made 496 + * close-on-exec. 
497 + */ 498 + flags = fcntl(fd1, F_GETFD); 499 + EXPECT_GT(flags, -1); 500 + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); 501 + 502 + flags = fcntl(fd2, F_GETFD); 503 + EXPECT_GT(flags, -1); 504 + EXPECT_EQ(flags & FD_CLOEXEC, FD_CLOEXEC); 505 + 506 + fd3 = dup2(fd1, 42); 507 + EXPECT_GT(fd3, 0); 508 + 509 + /* 510 + * Duplicating the file descriptor must remove the 511 + * FD_CLOEXEC flag. 512 + */ 513 + flags = fcntl(fd3, F_GETFD); 514 + EXPECT_GT(flags, -1); 515 + EXPECT_EQ(flags & FD_CLOEXEC, 0); 516 + 517 + EXPECT_EQ(close(fd1), 0); 518 + EXPECT_EQ(close(fd2), 0); 519 + EXPECT_EQ(close(fd3), 0); 520 + 521 + exit(EXIT_SUCCESS); 522 + } 523 + 524 + EXPECT_EQ(waitpid(pid, &status, 0), pid); 525 + EXPECT_EQ(true, WIFEXITED(status)); 526 + EXPECT_EQ(0, WEXITSTATUS(status)); 527 + } 528 + 529 + /* 530 + * We created a private file descriptor table before along with 531 + * requesting close-on-exec so the original fds must not be 532 + * close-on-exec. 533 + */ 534 + flags = fcntl(fd1, F_GETFD); 535 + EXPECT_GT(flags, -1); 536 + EXPECT_EQ(flags & FD_CLOEXEC, 0); 537 + 538 + flags = fcntl(fd2, F_GETFD); 539 + EXPECT_GT(flags, -1); 540 + EXPECT_EQ(flags & FD_CLOEXEC, 0); 541 + 542 + fd3 = dup2(fd1, 42); 543 + EXPECT_GT(fd3, 0); 544 + 545 + flags = fcntl(fd3, F_GETFD); 546 + EXPECT_GT(flags, -1); 547 + EXPECT_EQ(flags & FD_CLOEXEC, 0); 548 + 549 + EXPECT_EQ(close(fd1), 0); 550 + EXPECT_EQ(close(fd2), 0); 551 + EXPECT_EQ(close(fd3), 0); 552 + } 316 553 317 554 TEST_HARNESS_MAIN