this repo has no description
at fixPythonPipStalling 1003 lines 26 kB view raw
1/* 2This file is part of Darling. 3 4Copyright (C) 2017 Lubos Dolezel 5 6Darling is free software: you can redistribute it and/or modify 7it under the terms of the GNU General Public License as published by 8the Free Software Foundation, either version 3 of the License, or 9(at your option) any later version. 10 11Darling is distributed in the hope that it will be useful, 12but WITHOUT ANY WARRANTY; without even the implied warranty of 13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14GNU General Public License for more details. 15 16You should have received a copy of the GNU General Public License 17along with Darling. If not, see <http://www.gnu.org/licenses/>. 18*/ 19 20#include <sys/types.h> 21#include <sys/stat.h> 22#include <fcntl.h> 23#include <sys/mman.h> 24#include <unistd.h> 25#include <stdio.h> 26#include <string.h> 27#include <errno.h> 28#include <stdlib.h> 29#include <stdint.h> 30#include <stdbool.h> 31#include <mach-o/loader.h> 32#include <mach-o/fat.h> 33#include <dlfcn.h> 34#include <endian.h> 35#include "commpage.h" 36#include "loader.h" 37#include <sys/resource.h> 38#include <sys/prctl.h> 39#include <sys/socket.h> 40#include <sys/un.h> 41#include <darlingserver/rpc.h> 42#include <sys/ptrace.h> 43#include <pthread.h> 44#include <sys/utsname.h> 45 46#ifndef PAGE_SIZE 47# define PAGE_SIZE 4096 48#endif 49#define PAGE_ALIGN(x) (x & ~(PAGE_SIZE-1)) 50 51static const char* dyld_path = INSTALL_PREFIX "/libexec/usr/lib/dyld"; 52 53struct sockaddr_un __dserver_socket_address_data = { 54 .sun_family = AF_UNIX, 55 .sun_path = "\0", 56}; 57 58int __dserver_main_thread_socket_fd = -1; 59int __dserver_process_lifetime_pipe_fd = -1; 60 61// The idea of mldr is to load dyld_path into memory and set up the stack 62// as described in dyldStartup.S. 63// After that, we pass control over to dyld. 64// 65// Additionally, mldr providers access to native platforms libdl.so APIs (ELF loader). 66 67#ifdef __x86_64__ 68static void load64(int fd, bool expect_dylinker, struct load_results* lr); 69static void reexec32(char** argv); 70#endif 71static void load32(int fd, bool expect_dylinker, struct load_results* lr); 72static void load_fat(int fd, cpu_type_t cpu, bool expect_dylinker, char** argv, struct load_results* lr); 73static void load(const char* path, cpu_type_t cpu, bool expect_dylinker, char** argv, struct load_results* lr); 74static int native_prot(int prot); 75static void setup_space(struct load_results* lr, bool is_64_bit); 76static void process_special_env(struct load_results* lr); 77static void start_thread(struct load_results* lr); 78static bool is_kernel_at_least(int major, int minor); 79static void* compatible_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset); 80#ifdef __x86_64__ 81static void setup_stack64(const char* filepath, struct load_results* lr); 82#endif 83static void setup_stack32(const char* filepath, struct load_results* lr); 84 85// this is called when argv[0] specifies an interpreter and we need to "unexpand" it (i.e. convert it from a Linux path to a vchrooted path) 86static void vchroot_unexpand_interpreter(struct load_results* lr); 87 88// UUID of the main executable 89uint8_t exe_uuid[16]; 90 91// globally visible for debugging/core-dumping purposes 92// however, this should not be relied on; a pointer to this should passed around to whoever needs the load_results structure 93__attribute__((used)) 94struct load_results mldr_load_results = {0}; 95 96static uint32_t stack_size = 0; 97 98static const char* const skip_env_vars[] = { 99 "__mldr_bprefs=", 100 "__mldr_sockpath=", 101 "__mldr_lifetime_pipe", 102}; 103 104void* __mldr_main_stack_top = NULL; 105 106static int kernel_major = -1; 107static int kernel_minor = -1; 108 109int main(int argc, char** argv, char** envp) 110{ 111 void** sp; 112 int pushCount = 0; 113 char *filename, *p = NULL; 114 size_t arg_strings_total_size_after = 0; 115 size_t orig_argv0_len = 0; 116 const char* orig_argv1 = NULL; 117 118 mldr_load_results.kernfd = -1; 119 mldr_load_results.argc = argc; 120 mldr_load_results.argv = argv; 121 122 while (envp[mldr_load_results.envc] != NULL) { 123 ++mldr_load_results.envc; 124 } 125 mldr_load_results.envp = envp; 126 127 // sys_execve() passes the original file path appended to the mldr path in argv[0]. 128 if (argc > 0) 129 p = strchr(argv[0], '!'); 130 131 if (argc <= 1) 132 { 133 if (p == NULL) { 134 fprintf(stderr, "mldr is part of Darling. It is not to be executed directly.\n"); 135 return 1; 136 } 137 else 138 { 139 fprintf(stderr, "mldr: warning: Executing with no argv[0]. Continuing anyway, but this is probably a bug.\n"); 140 } 141 } 142 143 if (p != NULL) 144 { 145 filename = (char*) __builtin_alloca(strlen(argv[0])+1); 146 strcpy(filename, p + 1); 147 } 148 else 149 { 150 filename = (char*) __builtin_alloca(strlen(argv[1])+1); 151 strcpy(filename, argv[1]); 152 } 153 154 // allow any process to ptrace us 155 // the only process we really care about being able to do this is the server, 156 // but we can't just use the server's PID, since it lies outside our PID namespace. 157 ptrace(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0); 158 159 process_special_env(&mldr_load_results); 160 161#ifdef __i386__ 162 load(filename, CPU_TYPE_X86, false, argv, &mldr_load_results); // accept i386 only 163#else 164 load(filename, 0, false, argv, &mldr_load_results); 165#endif 166 167 // this was previously necessary when we were loading the binary from the LKM 168 // (presumably because the break was detected incorrectly) 169 // but this shouldn't be necessary for loading Mach-O's from userspace (the heap space should already be set up properly). 170 // see https://github.com/darlinghq/darling/issues/469 for the issue this originally fixed in the LKM 171#if 0 172 if (prctl(PR_SET_MM, PR_SET_MM_BRK, PAGE_ALIGN(mldr_load_results.vm_addr_max), 0, 0) < 0) { 173 fprintf(stderr, "Failed to set BRK value\n"); 174 return 1; 175 } 176 177 if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, PAGE_ALIGN(mldr_load_results.vm_addr_max), 0, 0) < 0) { 178 fprintf(stderr, "Failed to set BRK start\n"); 179 return 1; 180 } 181#endif 182 183 // adjust argv (remove mldr's argv[0]) 184 // NOTE: this code assumes that the current argv array points to contiguous strings. 185 // this is not necessarily true, although AFAIK this is always true on Linux. 186 // also note: we do it this way (moving the string contents in addition to the pointers) 187 // so that Linux sees our modified argv array without having to use PR_SET_MM_ARG_START 188 // and PR_SET_MM_ARG_END (since those require CAP_SYS_RESOURCE) 189 190 --mldr_load_results.argc; 191 192 orig_argv0_len = strlen(mldr_load_results.argv[0]) + 1; 193 orig_argv1 = mldr_load_results.argv[1]; 194 195 for (size_t i = 0; i < mldr_load_results.argc; ++i) { 196 mldr_load_results.argv[i] = mldr_load_results.argv[0] + arg_strings_total_size_after; 197 arg_strings_total_size_after += strlen(mldr_load_results.argv[i + 1]) + 1; 198 } 199 mldr_load_results.argv[mldr_load_results.argc] = NULL; 200 201 memmove(mldr_load_results.argv[0], orig_argv1, arg_strings_total_size_after); 202 memset(mldr_load_results.argv[0] + arg_strings_total_size_after, 0, orig_argv0_len); 203 204 if (p == NULL) { 205 vchroot_unexpand_interpreter(&mldr_load_results); 206 } 207 208 // adjust envp (remove special mldr variables) 209 // NOTE: same as for argv; here we assume the envp strings are contiguous 210 for (size_t i = 0; i < mldr_load_results.envc; ++i) { 211 if (!mldr_load_results.envp[i]) { 212 mldr_load_results.envc = i; 213 break; 214 } 215 216 size_t len = strlen(mldr_load_results.envp[i]) + 1; 217 218 // Don't pass these special env vars down to userland 219 #define SKIP_VAR(_name) \ 220 (len > sizeof(_name) - 1 && strncmp(mldr_load_results.envp[i], _name, sizeof(_name) - 1) == 0) 221 222 if ( 223 SKIP_VAR("__mldr_bprefs=") || 224 SKIP_VAR("__mldr_sockpath=") 225 ) { 226 size_t len_after = 0; 227 const char* orig_envp_i_plus_one = mldr_load_results.envp[i + 1]; 228 229 --mldr_load_results.envc; 230 231 for (size_t j = i; j < mldr_load_results.envc; ++j) { 232 mldr_load_results.envp[j] = mldr_load_results.envp[i] + len_after; 233 len_after += strlen(mldr_load_results.envp[j + 1]) + 1; 234 } 235 mldr_load_results.envp[mldr_load_results.envc] = NULL; 236 237 memmove(mldr_load_results.envp[i], orig_envp_i_plus_one, len_after); 238 memset(mldr_load_results.envp[i] + len_after, 0, len); 239 240 // we have to check this index again because it now points to a different string 241 --i; 242 continue; 243 } 244 } 245 246 if (mldr_load_results._32on64) 247 setup_stack32(filename, &mldr_load_results); 248 else 249#ifdef __x86_64__ 250 setup_stack64(filename, &mldr_load_results); 251#elif __aarch64__ 252 #error TODO: aarch64 253#else 254 abort(); 255#endif 256 257 int status = dserver_rpc_set_dyld_info(mldr_load_results.dyld_all_image_location, mldr_load_results.dyld_all_image_size); 258 if (status < 0) { 259 fprintf(stderr, "Failed to tell darlingserver about our dyld info\n"); 260 exit(1); 261 } 262 263 if (dserver_rpc_set_executable_path(filename, strlen(filename)) < 0) { 264 fprintf(stderr, "Failed to tell darlingserver about our executable path\n"); 265 exit(1); 266 } 267 268 __mldr_main_stack_top = (void*)mldr_load_results.stack_top; 269 270 start_thread(&mldr_load_results); 271 272 __builtin_unreachable(); 273} 274 275void load(const char* path, cpu_type_t forced_arch, bool expect_dylinker, char** argv, struct load_results* lr) 276{ 277 int fd; 278 uint32_t magic; 279 280 fd = open(path, O_RDONLY); 281 if (fd == -1) 282 { 283 fprintf(stderr, "Cannot open %s: %s\n", path, strerror(errno)); 284 exit(1); 285 } 286 287 // We need to read argv[1] and detect whether it's a 32 or 64-bit application. 288 // Then load the appropriate version of dyld from the fat file. 289 // In case the to-be-executed executable contains both, we prefer the 64-bit version, 290 // unless a special property has been passed to sys_posix_spawn() to force the 32-bit 291 // version. See posix_spawnattr_setbinpref_np(). 292 293 if (read(fd, &magic, sizeof(magic)) != sizeof(magic)) 294 { 295 fprintf(stderr, "Cannot read the file header of %s.\n", path); 296 exit(1); 297 } 298 299 if (magic == MH_MAGIC_64 || magic == MH_CIGAM_64) 300 { 301#ifdef __x86_64__ 302 lseek(fd, 0, SEEK_SET); 303 load64(fd, expect_dylinker, lr); 304#else 305 abort(); 306#endif 307 } 308 else if (magic == MH_MAGIC || magic == MH_CIGAM) 309 { 310#if !__x86_64__ 311 lseek(fd, 0, SEEK_SET); 312 load32(fd, expect_dylinker, lr); 313#else 314 // Re-run self as mldr32 315 reexec32(argv); 316#endif 317 } 318 else if (magic == FAT_MAGIC || magic == FAT_CIGAM) 319 { 320 lseek(fd, 0, SEEK_SET); 321 load_fat(fd, forced_arch, expect_dylinker, argv, lr); 322 } 323 else 324 { 325 fprintf(stderr, "Unknown file format: %s.\n", path); 326 exit(1); 327 } 328 329 close(fd); 330} 331 332static void load_fat(int fd, cpu_type_t forced_arch, bool expect_dylinker, char** argv, struct load_results* lr) { 333 struct fat_header fhdr; 334 struct fat_arch best_arch = {0}; 335 int bpref_index = -1; 336 337 best_arch.cputype = CPU_TYPE_ANY; 338 339 if (read(fd, &fhdr, sizeof(fhdr)) != sizeof(fhdr)) 340 { 341 fprintf(stderr, "Cannot read fat file header.\n"); 342 exit(1); 343 } 344 345 const bool swap = fhdr.magic == FAT_CIGAM; 346 347#define SWAP32(x) x = __bswap_32(x) 348 349 if (swap) 350 SWAP32(fhdr.nfat_arch); 351 352 uint32_t i; 353 for (i = 0; i < fhdr.nfat_arch; i++) 354 { 355 struct fat_arch arch; 356 357 if (read(fd, &arch, sizeof(arch)) != sizeof(arch)) 358 { 359 fprintf(stderr, "Cannot read fat_arch header.\n"); 360 exit(1); 361 } 362 363 if (swap) 364 { 365 SWAP32(arch.cputype); 366 SWAP32(arch.cpusubtype); 367 SWAP32(arch.offset); 368 SWAP32(arch.size); 369 SWAP32(arch.align); 370 } 371 372 if (!forced_arch) 373 { 374 int j; 375 for (j = 0; j < 4; j++) 376 { 377 if (lr->bprefs[j] && arch.cputype == lr->bprefs[j]) 378 { 379 if (bpref_index == -1 || bpref_index > j) 380 { 381 best_arch = arch; 382 bpref_index = j; 383 break; 384 } 385 } 386 } 387 388 if (bpref_index == -1) 389 { 390#if defined(__x86_64__) 391 if (arch.cputype == CPU_TYPE_X86_64) 392 best_arch = arch; 393 else if (best_arch.cputype == CPU_TYPE_ANY && arch.cputype == CPU_TYPE_X86) 394 best_arch = arch; 395#elif defined(__i386__) 396 if (arch.cputype == CPU_TYPE_X86) 397 best_arch = arch; 398#elif defined (__aarch64__) 399 #error TODO: arm 400#else 401 #error Unsupported CPU architecture 402#endif 403 } 404 } 405 else 406 { 407 if (arch.cputype == forced_arch) 408 best_arch = arch; 409 } 410 } 411 412 if (best_arch.cputype == CPU_TYPE_ANY) 413 { 414 fprintf(stderr, "No supported architecture found in fat binary.\n"); 415 exit(1); 416 } 417 418 if (lseek(fd, best_arch.offset, SEEK_SET) == -1) 419 { 420 fprintf(stderr, "Cannot seek to selected arch in fat binary.\n"); 421 exit(1); 422 } 423 424 if (best_arch.cputype & CPU_ARCH_ABI64) { 425#ifdef __x86_64__ 426 load64(fd, expect_dylinker, lr); 427#elif __aarch64__ 428 #error TODO: aarch64 429#else 430 abort(); 431#endif 432 } else { 433#if !__x86_64__ 434 load32(fd, expect_dylinker, lr); 435#else 436 // Re-run self as mldr32 437 reexec32(argv); 438#endif 439 } 440}; 441 442#ifdef __x86_64__ 443#define GEN_64BIT 444#include "loader.c" 445#include "stack.c" 446#undef GEN_64BIT 447#endif 448 449#define GEN_32BIT 450#include "loader.c" 451#include "stack.c" 452#undef GEN_32BIT 453 454int native_prot(int prot) 455{ 456 int protOut = 0; 457 458 if (prot & VM_PROT_READ) 459 protOut |= PROT_READ; 460 if (prot & VM_PROT_WRITE) 461 protOut |= PROT_WRITE; 462 if (prot & VM_PROT_EXECUTE) 463 protOut |= PROT_EXEC; 464 465 return protOut; 466} 467 468static void reexec32(char** argv) 469{ 470 char selfpath[1024]; 471 ssize_t len; 472 473 len = readlink("/proc/self/exe", selfpath, sizeof(selfpath)-3); 474 if (len == -1) 475 { 476 perror("Cannot readlink /proc/self/exe"); 477 abort(); 478 } 479 480 selfpath[len] = '\0'; 481 strcat(selfpath, "32"); 482 483 execv(selfpath, argv); 484 485 perror("Cannot re-execute as 32-bit process"); 486 abort(); 487} 488 489// Given that there's no proper way of passing special parameters to the binary loader 490// via execve(), we must do this via env variables 491static void process_special_env(struct load_results* lr) { 492 const char* str; 493 static char root_path[4096]; 494 495 lr->bprefs[0] = lr->bprefs[1] = lr->bprefs[2] = lr->bprefs[3] = 0; 496 str = getenv("__mldr_bprefs"); 497 498 if (str != NULL) { 499 sscanf(str, "%x,%x,%x,%x", &lr->bprefs[0], &lr->bprefs[1], &lr->bprefs[2], &lr->bprefs[3]); 500 } 501 502 str = getenv("__mldr_sockpath"); 503 504 if (str != NULL) { 505 if (strlen(str) > sizeof(__dserver_socket_address_data.sun_path) - 1) { 506 fprintf(stderr, "darlingserver socket path is too long\n"); 507 exit(1); 508 } 509 strncpy(__dserver_socket_address_data.sun_path, str, sizeof(__dserver_socket_address_data.sun_path) - 1); 510 __dserver_socket_address_data.sun_path[sizeof(__dserver_socket_address_data.sun_path) - 1] = '\0'; 511 512 lr->socket_path = __dserver_socket_address_data.sun_path; 513 } 514 515 lr->lifetime_pipe = -1; 516 str = getenv("__mldr_lifetime_pipe"); 517 518 if (str != NULL) { 519 sscanf(str, "%i", &lr->lifetime_pipe); 520 } 521 522 str = getenv("DYLD_ROOT_PATH"); 523 524 if (str != NULL && lr->root_path == NULL) { 525 strncpy(root_path, str, sizeof(root_path) - 1); 526 root_path[sizeof(root_path) - 1] = '\0'; 527 lr->root_path = root_path; 528 lr->root_path_length = strlen(lr->root_path); 529 } 530}; 531 532static void unset_special_env() { 533 unsetenv("__mldr_bprefs"); 534 unsetenv("__mldr_sockpath"); 535 unsetenv("__mldr_lifetime_pipe"); 536}; 537 538typedef struct socket_bitmap { 539 pthread_mutex_t mutex; 540 /** 541 * This is always next lowest available index. 542 * If this is equal to #bit_length, then the bitmap is full. 543 */ 544 size_t next_index; 545 uint8_t* bits; 546 size_t bit_length; 547 int highest; 548} socket_bitmap_t; 549 550static socket_bitmap_t socket_bitmap = { 551 .mutex = PTHREAD_MUTEX_INITIALIZER, 552 .next_index = 0, 553 .bits = NULL, 554 .bit_length = 0, 555 .highest = -1, 556}; 557 558static int socket_bitmap_get(socket_bitmap_t* bitmap) { 559 int fd = -1; 560 bool updated = false; 561 562 pthread_mutex_lock(&bitmap->mutex); 563 564 if (bitmap->highest == -1) { 565 // we need to initialize this bitmap 566 struct rlimit limit; 567 568 if (getrlimit(RLIMIT_NOFILE, &limit) < 0) { 569 goto out; 570 } 571 572 if (limit.rlim_cur == RLIM_INFINITY) { 573 // just default to 1024 574 limit.rlim_cur = 1024; 575 } 576 577 bitmap->highest = limit.rlim_cur - 1; 578 } 579 580 if (bitmap->next_index >= bitmap->bit_length) { 581 // we need to grow the bitmap 582 583 if ((bitmap->bit_length % 8) == 0) { 584 // we need to allocate an additional byte 585 586 void* ptr = realloc(bitmap->bits, (bitmap->bit_length / 8) + 1); 587 if (!ptr) { 588 goto out; 589 } 590 591 bitmap->bits = ptr; 592 593 bitmap->bits[bitmap->bit_length / 8] = 0; 594 } else { 595 // we just need to increment the bit length 596 } 597 598 ++bitmap->bit_length; 599 } 600 601 fd = bitmap->highest - bitmap->next_index; 602 603 bitmap->bits[bitmap->next_index / 8] |= 1 << (bitmap->next_index % 8); 604 605 // update the next available index 606 for (size_t i = bitmap->next_index + 1; i < bitmap->bit_length; ++i) { 607 size_t byte = i / 8; 608 uint8_t bit = i % 8; 609 610 if (bit == 0) { 611 // check the entire byte at once so we can avoid unnecessary iteration 612 if (bitmap->bits[byte] == 0xff) { 613 // this byte is full, skip it 614 i += 7; 615 continue; 616 } 617 } 618 619 if ((bitmap->bits[byte] & (1 << bit)) == 0) { 620 // this index is unused 621 bitmap->next_index = i; 622 updated = true; 623 break; 624 } 625 } 626 627 if (!updated) { 628 // all of our entries are currently in-use 629 bitmap->next_index = bitmap->bit_length; 630 } 631 632out: 633 pthread_mutex_unlock(&bitmap->mutex); 634 635 return fd; 636}; 637 638static void socket_bitmap_put(socket_bitmap_t* bitmap, int socket) { 639 size_t index; 640 641 pthread_mutex_lock(&bitmap->mutex); 642 643 index = bitmap->highest - socket; 644 645 bitmap->bits[index / 8] &= ~(1 << (index % 8)); 646 647 if (index < bitmap->next_index) { 648 bitmap->next_index = index; 649 } 650 651 if (index == bitmap->bit_length - 1) { 652 // we can shrink the bitmap 653 size_t old_byte_size = (bitmap->bit_length + 7) / 8; 654 size_t new_byte_size = old_byte_size; 655 656 while (bitmap->bit_length > 0) { 657 size_t index = bitmap->bit_length - 1; 658 659 if ((bitmap->bit_length % 8) == 0) { 660 // check the entire byte at once to avoid unnecessary iteration 661 if (bitmap->bits[(bitmap->bit_length / 8) - 1] == 0) { 662 // remove this entire byte 663 bitmap->bit_length -= 8; 664 continue; 665 } 666 } 667 668 if ((bitmap->bits[index / 8] & (1 << (index % 8))) == 0) { 669 // this bit is in-use, so we can't shrink any further 670 break; 671 } 672 673 --bitmap->bit_length; 674 } 675 676 new_byte_size = (bitmap->bit_length + 7) / 8; 677 678 if (old_byte_size != new_byte_size) { 679 // we can free one or more bytes from the bitmap 680 void* ptr = realloc(bitmap->bits, new_byte_size); 681 if (!ptr) { 682 goto out; 683 } 684 685 bitmap->bits = ptr; 686 } 687 } 688 689out: 690 pthread_mutex_unlock(&bitmap->mutex); 691}; 692 693int __mldr_create_rpc_socket(void) { 694 int pre_fd = -1; 695 int fd = -1; 696 697 pre_fd = socket(AF_UNIX, SOCK_DGRAM, 0); 698 if (pre_fd < 0) { 699 goto err_out; 700 } 701 702 fd = socket_bitmap_get(&socket_bitmap); 703 if (fd < 0) { 704 goto err_out; 705 } 706 707 if (dup2(pre_fd, fd) < 0) { 708 // we have to put it away ourselves here because `fd` is not yet valid, so we can't close() it in the error handler 709 socket_bitmap_put(&socket_bitmap, fd); 710 fd = -1; 711 goto err_out; 712 } 713 714 close(pre_fd); 715 pre_fd = -1; 716 717 // `fd` now contains the socket with the desired FD number returned by `socket_bitmap_get` 718 719 int fd_flags = fcntl(fd, F_GETFD); 720 if (fd_flags < 0) { 721 goto err_out; 722 } 723 if (fcntl(fd, F_SETFD, fd_flags | FD_CLOEXEC) < 0) { 724 goto err_out; 725 } 726 727 sa_family_t family = AF_UNIX; 728 if (bind(fd, (const struct sockaddr*)&family, sizeof(family)) < 0) { 729 goto err_out; 730 } 731 732out: 733 return fd; 734 735err_out: 736 if (fd >= 0) { 737 socket_bitmap_put(&socket_bitmap, fd); 738 close(fd); 739 } 740 741 if (pre_fd >= 0) { 742 close(pre_fd); 743 } 744 745 return -1; 746}; 747 748void __mldr_close_rpc_socket(int socket) { 749 close(socket); 750 socket_bitmap_put(&socket_bitmap, socket); 751}; 752 753int __mldr_create_process_lifetime_pipe(int* fds) { 754 // These pipes are not required for Linux 5.3 or newer, 755 // we already have pidfd_open. 756 if (is_kernel_at_least(5, 3)) { 757 fds[0] = fds[1] = -1; 758 return 0; 759 } 760 761 int pre_fds[2]; 762 if (pipe(pre_fds) == -1) { 763 goto err_out; 764 } 765 766 for (int i = 0; i < 2; ++i) { 767 fds[i] = socket_bitmap_get(&socket_bitmap); 768 if (fds[i] < 0) { 769 goto err_out; 770 } 771 772 if (dup2(pre_fds[i], fds[i]) < 0) { 773 socket_bitmap_put(&socket_bitmap, fds[i]); 774 fds[i] = -1; 775 goto err_out; 776 } 777 778 close(pre_fds[i]); 779 pre_fds[i] = -1; 780 } 781 782 return 0; 783 784err_out: 785 for (int i = 0; i < 2; ++i) { 786 if (fds[i] >= 0) { 787 socket_bitmap_put(&socket_bitmap, fds[i]); 788 close(fds[i]); 789 } 790 791 if (pre_fds[i] >= 0) { 792 close(pre_fds[i]); 793 } 794 } 795 796 return -1; 797} 798 799void __mldr_close_process_lifetime_pipe(int fd) { 800 if (fd != -1) { 801 close(fd); 802 socket_bitmap_put(&socket_bitmap, fd); 803 } 804} 805 806static void setup_space(struct load_results* lr, bool is_64_bit) { 807 commpage_setup(is_64_bit); 808 809 // Using the default stack top would cause the stack to be placed just above the commpage 810 // and would collide with it eventually. 811 // Instead, we manually allocate a new stack below the commpage. 812#if __x86_64__ 813 lr->stack_top = commpage_address(true); 814#elif __i386__ 815 lr->stack_top = commpage_address(false); 816#else 817 #error Unsupported architecture 818#endif 819 820 struct rlimit limit; 821 getrlimit(RLIMIT_STACK, &limit); 822 // allocate a few pages 16 pages if it's less than the limit; otherwise, allocate the limit 823 unsigned long size = PAGE_SIZE * 16; 824 if (limit.rlim_cur != RLIM_INFINITY && limit.rlim_cur < size) { 825 size = limit.rlim_cur; 826 } 827 828 if (compatible_mmap((void*)(lr->stack_top - size), size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE | MAP_GROWSDOWN, -1, 0) == MAP_FAILED) { 829 fprintf(stderr, "Failed to allocate stack of %lu bytes: %d (%s)\n", size, errno, strerror(errno)); 830 exit(1); 831 } 832 833 unset_special_env(); 834 835 lr->kernfd = __mldr_create_rpc_socket(); 836 if (lr->kernfd < 0) { 837 fprintf(stderr, "Failed to create socket\n"); 838 exit(1); 839 } 840 841 __dserver_main_thread_socket_fd = lr->kernfd; 842 843 int lifetime_pipe[2]; 844 845 // this process is created using exec from another Darling process. 846 // darlingserver should already have the read pipe, so we don't need 847 // to check that in. 848 if (lr->lifetime_pipe != -1) { 849 lifetime_pipe[1] = socket_bitmap_get(&socket_bitmap); 850 851 if (lr->lifetime_pipe != lifetime_pipe[1]) { 852 // move the existing pipe to a higher fd number, and invalidate 853 // the old fd to prevent interfering with fds provided by 854 // socket_bitmap_get 855 if (dup2(lr->lifetime_pipe, lifetime_pipe[1]) == -1) { 856 fprintf(stderr, "Failed to dup process lifetime pipe: %d (%s)\n", errno, strerror(errno)); 857 exit(1); 858 } 859 close(lr->lifetime_pipe); 860 } 861 862 lifetime_pipe[0] = -1; 863 } else { 864 if (__mldr_create_process_lifetime_pipe(lifetime_pipe) == -1) { 865 fprintf(stderr, "Failed to create process lifetime pipe: %d (%s)\n", errno, strerror(errno)); 866 exit(1); 867 } 868 } 869 870 lr->lifetime_pipe = lifetime_pipe[1]; 871 872 // store the write end of the pipe; the read end is sent to darlingserver. 873 __dserver_process_lifetime_pipe_fd = lifetime_pipe[1]; 874 875 int dummy_stack_variable; 876 if (dserver_rpc_checkin(false, &dummy_stack_variable, lifetime_pipe[0]) < 0) { 877 fprintf(stderr, "Failed to checkin with darlingserver\n"); 878 exit(1); 879 } 880 881 // keep our write end while closing the unused read end. 882 __mldr_close_process_lifetime_pipe(lifetime_pipe[0]); 883 884 if (!lr->root_path) { 885 static char vchroot_buffer[4096]; 886 uint64_t vchroot_path_length = 0; 887 888 int code = dserver_rpc_vchroot_path(vchroot_buffer, sizeof(vchroot_buffer), &vchroot_path_length); 889 if (code < 0) { 890 fprintf(stderr, "Failed to retrieve vchroot path from darlingserver: %d\n", code); 891 exit(1); 892 } 893 894 if (vchroot_path_length >= sizeof(vchroot_buffer)) { 895 fprintf(stderr, "Vchroot path is too large for buffer\n"); 896 exit(1); 897 } else if (vchroot_path_length > 0) { 898 lr->root_path = vchroot_buffer; 899 lr->root_path_length = vchroot_path_length; 900 } 901 } 902}; 903 904static void start_thread(struct load_results* lr) { 905#ifdef __x86_64__ 906 __asm__ volatile( 907 "mov %1, %%rsp\n" 908 "jmpq *%0" 909 :: 910 "m"(lr->entry_point), 911 "r"(lr->stack_top) 912 : 913 ); 914#elif defined(__i386__) 915 __asm__ volatile( 916 "mov %1, %%esp\n" 917 "jmp *%0" 918 :: 919 "m"(lr->entry_point), 920 "r"(lr->stack_top) 921 : 922 ); 923#elif defined(__arm__) 924 __asm__ volatile( 925 "mov sp, %1\n" 926 "bx %0" 927 :: 928 "r"(lr->entry_point), 929 "r"(lr->stack_top) 930 : 931 ); 932#else 933# error Unsupported platform! 934#endif 935}; 936 937static bool is_kernel_at_least(int major, int minor) { 938 if (kernel_major == -1) { 939 struct utsname uname_info; 940 if (uname(&uname_info) == -1) { 941 return false; 942 } 943 kernel_major = 0; 944 kernel_minor = 0; 945 size_t pos = 0; 946 while (uname_info.release[pos] != '\0' && uname_info.release[pos] != '.') { 947 kernel_major = kernel_major * 10 + uname_info.release[pos] - '0'; 948 ++pos; 949 } 950 ++pos; 951 while (uname_info.release[pos] != '\0' && uname_info.release[pos] != '.') { 952 kernel_minor = kernel_minor * 10 + uname_info.release[pos] - '0'; 953 ++pos; 954 } 955 } 956 957 if (major != kernel_major) { 958 return kernel_major > major; 959 } 960 961 return kernel_minor >= minor; 962} 963 964void* compatible_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) { 965 // MAP_FIXED_NOREPLACE is not supported on WSL1 (Linux < 4.17). 966 bool fixed_noreplace_hack = false; 967 if ((flags & MAP_FIXED_NOREPLACE) && !is_kernel_at_least(4, 17)) { 968 flags &= ~MAP_FIXED_NOREPLACE; 969 fixed_noreplace_hack = true; 970 } 971 void* result = mmap(addr, length, prot, flags, fd, offset); 972 // MAP_GROWSDOWN is not supported on WSL1. See https://github.com/microsoft/WSL/issues/8095. 973 if ((result == (void*)MAP_FAILED) && (flags & MAP_GROWSDOWN) && (errno == EOPNOTSUPP)) { 974 result = mmap(addr, length, prot, (flags & ~MAP_GROWSDOWN), fd, offset); 975 } 976 if (fixed_noreplace_hack) { 977 if (result != addr && result != (void*)MAP_FAILED) { 978 errno = ESRCH; 979 munmap(addr, length); 980 return MAP_FAILED; 981 } 982 } 983 return result; 984} 985 986static void vchroot_unexpand_interpreter(struct load_results* lr) { 987 static char unexpanded[4096]; 988 size_t length; 989 990 if (lr->root_path) { 991 length = strlen(lr->argv[0]); 992 993 if (strncmp(lr->argv[0], lr->root_path, lr->root_path_length) == 0) { 994 memmove(unexpanded, lr->argv[0] + lr->root_path_length, length - lr->root_path_length + 1); 995 } else { 996 // FIXME: potential buffer overflow 997 memmove(unexpanded + sizeof(SYSTEM_ROOT) - 1, lr->argv[0], length + 1); 998 memcpy(unexpanded, SYSTEM_ROOT, sizeof(SYSTEM_ROOT) - 1); 999 } 1000 1001 lr->argv[0] = unexpanded; 1002 } 1003};