this repo has no description
1/*
2This file is part of Darling.
3
4Copyright (C) 2017 Lubos Dolezel
5
6Darling is free software: you can redistribute it and/or modify
7it under the terms of the GNU General Public License as published by
8the Free Software Foundation, either version 3 of the License, or
9(at your option) any later version.
10
11Darling is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
16You should have received a copy of the GNU General Public License
17along with Darling. If not, see <http://www.gnu.org/licenses/>.
18*/
19
20#include <sys/types.h>
21#include <sys/stat.h>
22#include <fcntl.h>
23#include <sys/mman.h>
24#include <unistd.h>
25#include <stdio.h>
26#include <string.h>
27#include <errno.h>
28#include <stdlib.h>
29#include <stdint.h>
30#include <stdbool.h>
31#include <mach-o/loader.h>
32#include <mach-o/fat.h>
33#include <dlfcn.h>
34#include <endian.h>
35#include "commpage.h"
36#include "loader.h"
37#include <sys/resource.h>
38#include <sys/prctl.h>
39#include <sys/socket.h>
40#include <sys/un.h>
41#include <darlingserver/rpc.h>
42#include <sys/ptrace.h>
43#include <pthread.h>
44#include <sys/utsname.h>
45
#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif
// Rounds `x` down to the start of the page that contains it.
// Both the argument and the whole expansion are parenthesized so that uses such as
// `PAGE_ALIGN(a | b)` or `PAGE_ALIGN(x) == y` group the way they read.
#define PAGE_ALIGN(x) ((x) & ~(PAGE_SIZE-1))
50
51static const char* dyld_path = INSTALL_PREFIX "/libexec/usr/lib/dyld";
52
53struct sockaddr_un __dserver_socket_address_data = {
54 .sun_family = AF_UNIX,
55 .sun_path = "\0",
56};
57
58int __dserver_main_thread_socket_fd = -1;
59int __dserver_process_lifetime_pipe_fd = -1;
60
61// The idea of mldr is to load dyld_path into memory and set up the stack
62// as described in dyldStartup.S.
63// After that, we pass control over to dyld.
64//
// Additionally, mldr provides access to the native platform's libdl.so APIs (ELF loader).
66
67#ifdef __x86_64__
68static void load64(int fd, bool expect_dylinker, struct load_results* lr);
69static void reexec32(char** argv);
70#endif
71static void load32(int fd, bool expect_dylinker, struct load_results* lr);
72static void load_fat(int fd, cpu_type_t cpu, bool expect_dylinker, char** argv, struct load_results* lr);
73static void load(const char* path, cpu_type_t cpu, bool expect_dylinker, char** argv, struct load_results* lr);
74static int native_prot(int prot);
75static void setup_space(struct load_results* lr, bool is_64_bit);
76static void process_special_env(struct load_results* lr);
77static void start_thread(struct load_results* lr);
78static bool is_kernel_at_least(int major, int minor);
79static void* compatible_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
80#ifdef __x86_64__
81static void setup_stack64(const char* filepath, struct load_results* lr);
82#endif
83static void setup_stack32(const char* filepath, struct load_results* lr);
84
85// this is called when argv[0] specifies an interpreter and we need to "unexpand" it (i.e. convert it from a Linux path to a vchrooted path)
86static void vchroot_unexpand_interpreter(struct load_results* lr);
87
88// UUID of the main executable
89uint8_t exe_uuid[16];
90
91// globally visible for debugging/core-dumping purposes
// however, this should not be relied on; a pointer to this should be passed around to whoever needs the load_results structure
93__attribute__((used))
94struct load_results mldr_load_results = {0};
95
96static uint32_t stack_size = 0;
97
98static const char* const skip_env_vars[] = {
99 "__mldr_bprefs=",
100 "__mldr_sockpath=",
101 "__mldr_lifetime_pipe",
102};
103
104void* __mldr_main_stack_top = NULL;
105
106static int kernel_major = -1;
107static int kernel_minor = -1;
108
109int main(int argc, char** argv, char** envp)
110{
111 void** sp;
112 int pushCount = 0;
113 char *filename, *p = NULL;
114 size_t arg_strings_total_size_after = 0;
115 size_t orig_argv0_len = 0;
116 const char* orig_argv1 = NULL;
117
118 mldr_load_results.kernfd = -1;
119 mldr_load_results.argc = argc;
120 mldr_load_results.argv = argv;
121
122 while (envp[mldr_load_results.envc] != NULL) {
123 ++mldr_load_results.envc;
124 }
125 mldr_load_results.envp = envp;
126
127 // sys_execve() passes the original file path appended to the mldr path in argv[0].
128 if (argc > 0)
129 p = strchr(argv[0], '!');
130
131 if (argc <= 1)
132 {
133 if (p == NULL) {
134 fprintf(stderr, "mldr is part of Darling. It is not to be executed directly.\n");
135 return 1;
136 }
137 else
138 {
139 fprintf(stderr, "mldr: warning: Executing with no argv[0]. Continuing anyway, but this is probably a bug.\n");
140 }
141 }
142
143 if (p != NULL)
144 {
145 filename = (char*) __builtin_alloca(strlen(argv[0])+1);
146 strcpy(filename, p + 1);
147 }
148 else
149 {
150 filename = (char*) __builtin_alloca(strlen(argv[1])+1);
151 strcpy(filename, argv[1]);
152 }
153
154 // allow any process to ptrace us
155 // the only process we really care about being able to do this is the server,
156 // but we can't just use the server's PID, since it lies outside our PID namespace.
157 ptrace(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0);
158
159 process_special_env(&mldr_load_results);
160
161#ifdef __i386__
162 load(filename, CPU_TYPE_X86, false, argv, &mldr_load_results); // accept i386 only
163#else
164 load(filename, 0, false, argv, &mldr_load_results);
165#endif
166
167 // this was previously necessary when we were loading the binary from the LKM
168 // (presumably because the break was detected incorrectly)
169 // but this shouldn't be necessary for loading Mach-O's from userspace (the heap space should already be set up properly).
170 // see https://github.com/darlinghq/darling/issues/469 for the issue this originally fixed in the LKM
171#if 0
172 if (prctl(PR_SET_MM, PR_SET_MM_BRK, PAGE_ALIGN(mldr_load_results.vm_addr_max), 0, 0) < 0) {
173 fprintf(stderr, "Failed to set BRK value\n");
174 return 1;
175 }
176
177 if (prctl(PR_SET_MM, PR_SET_MM_START_BRK, PAGE_ALIGN(mldr_load_results.vm_addr_max), 0, 0) < 0) {
178 fprintf(stderr, "Failed to set BRK start\n");
179 return 1;
180 }
181#endif
182
183 // adjust argv (remove mldr's argv[0])
184 // NOTE: this code assumes that the current argv array points to contiguous strings.
185 // this is not necessarily true, although AFAIK this is always true on Linux.
186 // also note: we do it this way (moving the string contents in addition to the pointers)
187 // so that Linux sees our modified argv array without having to use PR_SET_MM_ARG_START
188 // and PR_SET_MM_ARG_END (since those require CAP_SYS_RESOURCE)
189
190 --mldr_load_results.argc;
191
192 orig_argv0_len = strlen(mldr_load_results.argv[0]) + 1;
193 orig_argv1 = mldr_load_results.argv[1];
194
195 for (size_t i = 0; i < mldr_load_results.argc; ++i) {
196 mldr_load_results.argv[i] = mldr_load_results.argv[0] + arg_strings_total_size_after;
197 arg_strings_total_size_after += strlen(mldr_load_results.argv[i + 1]) + 1;
198 }
199 mldr_load_results.argv[mldr_load_results.argc] = NULL;
200
201 memmove(mldr_load_results.argv[0], orig_argv1, arg_strings_total_size_after);
202 memset(mldr_load_results.argv[0] + arg_strings_total_size_after, 0, orig_argv0_len);
203
204 if (p == NULL) {
205 vchroot_unexpand_interpreter(&mldr_load_results);
206 }
207
208 // adjust envp (remove special mldr variables)
209 // NOTE: same as for argv; here we assume the envp strings are contiguous
210 for (size_t i = 0; i < mldr_load_results.envc; ++i) {
211 if (!mldr_load_results.envp[i]) {
212 mldr_load_results.envc = i;
213 break;
214 }
215
216 size_t len = strlen(mldr_load_results.envp[i]) + 1;
217
218 // Don't pass these special env vars down to userland
219 #define SKIP_VAR(_name) \
220 (len > sizeof(_name) - 1 && strncmp(mldr_load_results.envp[i], _name, sizeof(_name) - 1) == 0)
221
222 if (
223 SKIP_VAR("__mldr_bprefs=") ||
224 SKIP_VAR("__mldr_sockpath=")
225 ) {
226 size_t len_after = 0;
227 const char* orig_envp_i_plus_one = mldr_load_results.envp[i + 1];
228
229 --mldr_load_results.envc;
230
231 for (size_t j = i; j < mldr_load_results.envc; ++j) {
232 mldr_load_results.envp[j] = mldr_load_results.envp[i] + len_after;
233 len_after += strlen(mldr_load_results.envp[j + 1]) + 1;
234 }
235 mldr_load_results.envp[mldr_load_results.envc] = NULL;
236
237 memmove(mldr_load_results.envp[i], orig_envp_i_plus_one, len_after);
238 memset(mldr_load_results.envp[i] + len_after, 0, len);
239
240 // we have to check this index again because it now points to a different string
241 --i;
242 continue;
243 }
244 }
245
246 if (mldr_load_results._32on64)
247 setup_stack32(filename, &mldr_load_results);
248 else
249#ifdef __x86_64__
250 setup_stack64(filename, &mldr_load_results);
251#elif __aarch64__
252 #error TODO: aarch64
253#else
254 abort();
255#endif
256
257 int status = dserver_rpc_set_dyld_info(mldr_load_results.dyld_all_image_location, mldr_load_results.dyld_all_image_size);
258 if (status < 0) {
259 fprintf(stderr, "Failed to tell darlingserver about our dyld info\n");
260 exit(1);
261 }
262
263 if (dserver_rpc_set_executable_path(filename, strlen(filename)) < 0) {
264 fprintf(stderr, "Failed to tell darlingserver about our executable path\n");
265 exit(1);
266 }
267
268 __mldr_main_stack_top = (void*)mldr_load_results.stack_top;
269
270 start_thread(&mldr_load_results);
271
272 __builtin_unreachable();
273}
274
275void load(const char* path, cpu_type_t forced_arch, bool expect_dylinker, char** argv, struct load_results* lr)
276{
277 int fd;
278 uint32_t magic;
279
280 fd = open(path, O_RDONLY);
281 if (fd == -1)
282 {
283 fprintf(stderr, "Cannot open %s: %s\n", path, strerror(errno));
284 exit(1);
285 }
286
287 // We need to read argv[1] and detect whether it's a 32 or 64-bit application.
288 // Then load the appropriate version of dyld from the fat file.
289 // In case the to-be-executed executable contains both, we prefer the 64-bit version,
290 // unless a special property has been passed to sys_posix_spawn() to force the 32-bit
291 // version. See posix_spawnattr_setbinpref_np().
292
293 if (read(fd, &magic, sizeof(magic)) != sizeof(magic))
294 {
295 fprintf(stderr, "Cannot read the file header of %s.\n", path);
296 exit(1);
297 }
298
299 if (magic == MH_MAGIC_64 || magic == MH_CIGAM_64)
300 {
301#ifdef __x86_64__
302 lseek(fd, 0, SEEK_SET);
303 load64(fd, expect_dylinker, lr);
304#else
305 abort();
306#endif
307 }
308 else if (magic == MH_MAGIC || magic == MH_CIGAM)
309 {
310#if !__x86_64__
311 lseek(fd, 0, SEEK_SET);
312 load32(fd, expect_dylinker, lr);
313#else
314 // Re-run self as mldr32
315 reexec32(argv);
316#endif
317 }
318 else if (magic == FAT_MAGIC || magic == FAT_CIGAM)
319 {
320 lseek(fd, 0, SEEK_SET);
321 load_fat(fd, forced_arch, expect_dylinker, argv, lr);
322 }
323 else
324 {
325 fprintf(stderr, "Unknown file format: %s.\n", path);
326 exit(1);
327 }
328
329 close(fd);
330}
331
332static void load_fat(int fd, cpu_type_t forced_arch, bool expect_dylinker, char** argv, struct load_results* lr) {
333 struct fat_header fhdr;
334 struct fat_arch best_arch = {0};
335 int bpref_index = -1;
336
337 best_arch.cputype = CPU_TYPE_ANY;
338
339 if (read(fd, &fhdr, sizeof(fhdr)) != sizeof(fhdr))
340 {
341 fprintf(stderr, "Cannot read fat file header.\n");
342 exit(1);
343 }
344
345 const bool swap = fhdr.magic == FAT_CIGAM;
346
347#define SWAP32(x) x = __bswap_32(x)
348
349 if (swap)
350 SWAP32(fhdr.nfat_arch);
351
352 uint32_t i;
353 for (i = 0; i < fhdr.nfat_arch; i++)
354 {
355 struct fat_arch arch;
356
357 if (read(fd, &arch, sizeof(arch)) != sizeof(arch))
358 {
359 fprintf(stderr, "Cannot read fat_arch header.\n");
360 exit(1);
361 }
362
363 if (swap)
364 {
365 SWAP32(arch.cputype);
366 SWAP32(arch.cpusubtype);
367 SWAP32(arch.offset);
368 SWAP32(arch.size);
369 SWAP32(arch.align);
370 }
371
372 if (!forced_arch)
373 {
374 int j;
375 for (j = 0; j < 4; j++)
376 {
377 if (lr->bprefs[j] && arch.cputype == lr->bprefs[j])
378 {
379 if (bpref_index == -1 || bpref_index > j)
380 {
381 best_arch = arch;
382 bpref_index = j;
383 break;
384 }
385 }
386 }
387
388 if (bpref_index == -1)
389 {
390#if defined(__x86_64__)
391 if (arch.cputype == CPU_TYPE_X86_64)
392 best_arch = arch;
393 else if (best_arch.cputype == CPU_TYPE_ANY && arch.cputype == CPU_TYPE_X86)
394 best_arch = arch;
395#elif defined(__i386__)
396 if (arch.cputype == CPU_TYPE_X86)
397 best_arch = arch;
398#elif defined (__aarch64__)
399 #error TODO: arm
400#else
401 #error Unsupported CPU architecture
402#endif
403 }
404 }
405 else
406 {
407 if (arch.cputype == forced_arch)
408 best_arch = arch;
409 }
410 }
411
412 if (best_arch.cputype == CPU_TYPE_ANY)
413 {
414 fprintf(stderr, "No supported architecture found in fat binary.\n");
415 exit(1);
416 }
417
418 if (lseek(fd, best_arch.offset, SEEK_SET) == -1)
419 {
420 fprintf(stderr, "Cannot seek to selected arch in fat binary.\n");
421 exit(1);
422 }
423
424 if (best_arch.cputype & CPU_ARCH_ABI64) {
425#ifdef __x86_64__
426 load64(fd, expect_dylinker, lr);
427#elif __aarch64__
428 #error TODO: aarch64
429#else
430 abort();
431#endif
432 } else {
433#if !__x86_64__
434 load32(fd, expect_dylinker, lr);
435#else
436 // Re-run self as mldr32
437 reexec32(argv);
438#endif
439 }
440};
441
442#ifdef __x86_64__
443#define GEN_64BIT
444#include "loader.c"
445#include "stack.c"
446#undef GEN_64BIT
447#endif
448
449#define GEN_32BIT
450#include "loader.c"
451#include "stack.c"
452#undef GEN_32BIT
453
454int native_prot(int prot)
455{
456 int protOut = 0;
457
458 if (prot & VM_PROT_READ)
459 protOut |= PROT_READ;
460 if (prot & VM_PROT_WRITE)
461 protOut |= PROT_WRITE;
462 if (prot & VM_PROT_EXECUTE)
463 protOut |= PROT_EXEC;
464
465 return protOut;
466}
467
// Replaces the current process with the 32-bit build of mldr, which is expected to live
// next to this executable under the same name with "32" appended. `argv` is passed on
// unchanged. Never returns: aborts if the path cannot be resolved or the exec fails.
static void reexec32(char** argv)
{
	static const char suffix[] = "32";
	char selfpath[1024];

	// reserve room for the suffix and its terminating NUL
	const ssize_t len = readlink("/proc/self/exe", selfpath, sizeof(selfpath) - sizeof(suffix));
	if (len == -1)
	{
		perror("Cannot readlink /proc/self/exe");
		abort();
	}

	// readlink() does not NUL-terminate; copying the suffix with its NUL does both jobs
	memcpy(selfpath + len, suffix, sizeof(suffix));

	execv(selfpath, argv);

	perror("Cannot re-execute as 32-bit process");
	abort();
}
488
489// Given that there's no proper way of passing special parameters to the binary loader
490// via execve(), we must do this via env variables
491static void process_special_env(struct load_results* lr) {
492 const char* str;
493 static char root_path[4096];
494
495 lr->bprefs[0] = lr->bprefs[1] = lr->bprefs[2] = lr->bprefs[3] = 0;
496 str = getenv("__mldr_bprefs");
497
498 if (str != NULL) {
499 sscanf(str, "%x,%x,%x,%x", &lr->bprefs[0], &lr->bprefs[1], &lr->bprefs[2], &lr->bprefs[3]);
500 }
501
502 str = getenv("__mldr_sockpath");
503
504 if (str != NULL) {
505 if (strlen(str) > sizeof(__dserver_socket_address_data.sun_path) - 1) {
506 fprintf(stderr, "darlingserver socket path is too long\n");
507 exit(1);
508 }
509 strncpy(__dserver_socket_address_data.sun_path, str, sizeof(__dserver_socket_address_data.sun_path) - 1);
510 __dserver_socket_address_data.sun_path[sizeof(__dserver_socket_address_data.sun_path) - 1] = '\0';
511
512 lr->socket_path = __dserver_socket_address_data.sun_path;
513 }
514
515 lr->lifetime_pipe = -1;
516 str = getenv("__mldr_lifetime_pipe");
517
518 if (str != NULL) {
519 sscanf(str, "%i", &lr->lifetime_pipe);
520 }
521
522 str = getenv("DYLD_ROOT_PATH");
523
524 if (str != NULL && lr->root_path == NULL) {
525 strncpy(root_path, str, sizeof(root_path) - 1);
526 root_path[sizeof(root_path) - 1] = '\0';
527 lr->root_path = root_path;
528 lr->root_path_length = strlen(lr->root_path);
529 }
530};
531
// Removes the loader-internal variables from this process's environment.
static void unset_special_env() {
	static const char* const internal_vars[] = {
		"__mldr_bprefs",
		"__mldr_sockpath",
		"__mldr_lifetime_pipe",
	};

	for (size_t i = 0; i < sizeof(internal_vars) / sizeof(internal_vars[0]); ++i) {
		unsetenv(internal_vars[i]);
	}
}
537
/**
 * Allocator for descriptor *numbers* taken from the top of the descriptor table.
 *
 * Bit `i` of the bitmap stands for descriptor number `highest - i`; a set bit means
 * the number is handed out. The bitmap only hands out numbers, the callers are the
 * ones that dup2() an actual descriptor onto them and close() it again.
 */
typedef struct socket_bitmap {
	/** Protects every other member. */
	pthread_mutex_t mutex;
	/**
	 * This is always next lowest available index.
	 * If this is equal to #bit_length, then the bitmap is full.
	 */
	size_t next_index;
	/** Bit storage, `(bit_length + 7) / 8` bytes; NULL while `bit_length` is 0. */
	uint8_t* bits;
	/** Number of bits currently tracked. */
	size_t bit_length;
	/** Descriptor number of index 0 (RLIMIT_NOFILE - 1); -1 until first use. */
	int highest;
} socket_bitmap_t;

static socket_bitmap_t socket_bitmap = {
	.mutex = PTHREAD_MUTEX_INITIALIZER,
	.next_index = 0,
	.bits = NULL,
	.bit_length = 0,
	.highest = -1,
};

/**
 * Reserves the highest descriptor number that is not handed out yet.
 *
 * @return The reserved descriptor number, or -1 if the descriptor limit cannot be
 *         determined, every number down to 0 is taken, or memory is exhausted.
 */
static int socket_bitmap_get(socket_bitmap_t* bitmap) {
	int fd = -1;
	bool updated = false;

	pthread_mutex_lock(&bitmap->mutex);

	if (bitmap->highest == -1) {
		// we need to initialize this bitmap
		struct rlimit limit;

		if (getrlimit(RLIMIT_NOFILE, &limit) < 0) {
			goto out;
		}

		if (limit.rlim_cur == RLIM_INFINITY) {
			// just default to 1024
			limit.rlim_cur = 1024;
		}

		bitmap->highest = limit.rlim_cur - 1;
	}

	if (bitmap->highest < 0 || bitmap->next_index > (size_t)bitmap->highest) {
		// index `highest` maps to descriptor 0; anything beyond that would be a
		// negative descriptor, so report exhaustion instead of marking a bogus slot
		goto out;
	}

	if (bitmap->next_index >= bitmap->bit_length) {
		// we need to grow the bitmap

		if ((bitmap->bit_length % 8) == 0) {
			// we need to allocate an additional byte

			void* ptr = realloc(bitmap->bits, (bitmap->bit_length / 8) + 1);
			if (!ptr) {
				goto out;
			}

			bitmap->bits = ptr;

			bitmap->bits[bitmap->bit_length / 8] = 0;
		}
		// otherwise the last byte still has room; its spare bits are always zero

		++bitmap->bit_length;
	}

	fd = bitmap->highest - (int)bitmap->next_index;

	bitmap->bits[bitmap->next_index / 8] |= 1 << (bitmap->next_index % 8);

	// update the next available index
	for (size_t i = bitmap->next_index + 1; i < bitmap->bit_length; ++i) {
		size_t byte = i / 8;
		uint8_t bit = i % 8;

		if (bit == 0) {
			// check the entire byte at once so we can avoid unnecessary iteration
			if (bitmap->bits[byte] == 0xff) {
				// this byte is full, skip it
				i += 7;
				continue;
			}
		}

		if ((bitmap->bits[byte] & (1 << bit)) == 0) {
			// this index is unused
			bitmap->next_index = i;
			updated = true;
			break;
		}
	}

	if (!updated) {
		// all of our entries are currently in-use
		bitmap->next_index = bitmap->bit_length;
	}

out:
	pthread_mutex_unlock(&bitmap->mutex);

	return fd;
}

/**
 * Returns a descriptor number obtained from socket_bitmap_get() to the pool and
 * shrinks the bitmap when trailing entries become unused.
 *
 * Numbers that are not tracked by the bitmap are ignored.
 */
static void socket_bitmap_put(socket_bitmap_t* bitmap, int socket) {
	size_t index;

	pthread_mutex_lock(&bitmap->mutex);

	if (bitmap->bits == NULL || socket < 0 || socket > bitmap->highest) {
		goto out;
	}

	index = (size_t)(bitmap->highest - socket);
	if (index >= bitmap->bit_length) {
		goto out;
	}

	bitmap->bits[index / 8] &= ~(1 << (index % 8));

	if (index < bitmap->next_index) {
		bitmap->next_index = index;
	}

	if (index == bitmap->bit_length - 1) {
		// we can shrink the bitmap
		size_t old_byte_size = (bitmap->bit_length + 7) / 8;
		size_t new_byte_size;

		while (bitmap->bit_length > 0) {
			size_t last = bitmap->bit_length - 1;

			if ((bitmap->bit_length % 8) == 0 && bitmap->bits[last / 8] == 0) {
				// the whole trailing byte is unused; remove it in one step
				bitmap->bit_length -= 8;
				continue;
			}

			if ((bitmap->bits[last / 8] & (1 << (last % 8))) != 0) {
				// this bit is in-use, so we can't shrink any further
				break;
			}

			--bitmap->bit_length;
		}

		new_byte_size = (bitmap->bit_length + 7) / 8;

		if (new_byte_size == 0) {
			// realloc(ptr, 0) is not portable and may leave a dangling pointer behind
			free(bitmap->bits);
			bitmap->bits = NULL;
		} else if (old_byte_size != new_byte_size) {
			// we can free one or more bytes from the bitmap;
			// if shrinking fails the old (larger) allocation simply stays in use
			void* ptr = realloc(bitmap->bits, new_byte_size);
			if (ptr) {
				bitmap->bits = ptr;
			}
		}
	}

out:
	pthread_mutex_unlock(&bitmap->mutex);
}
692
693int __mldr_create_rpc_socket(void) {
694 int pre_fd = -1;
695 int fd = -1;
696
697 pre_fd = socket(AF_UNIX, SOCK_DGRAM, 0);
698 if (pre_fd < 0) {
699 goto err_out;
700 }
701
702 fd = socket_bitmap_get(&socket_bitmap);
703 if (fd < 0) {
704 goto err_out;
705 }
706
707 if (dup2(pre_fd, fd) < 0) {
708 // we have to put it away ourselves here because `fd` is not yet valid, so we can't close() it in the error handler
709 socket_bitmap_put(&socket_bitmap, fd);
710 fd = -1;
711 goto err_out;
712 }
713
714 close(pre_fd);
715 pre_fd = -1;
716
717 // `fd` now contains the socket with the desired FD number returned by `socket_bitmap_get`
718
719 int fd_flags = fcntl(fd, F_GETFD);
720 if (fd_flags < 0) {
721 goto err_out;
722 }
723 if (fcntl(fd, F_SETFD, fd_flags | FD_CLOEXEC) < 0) {
724 goto err_out;
725 }
726
727 sa_family_t family = AF_UNIX;
728 if (bind(fd, (const struct sockaddr*)&family, sizeof(family)) < 0) {
729 goto err_out;
730 }
731
732out:
733 return fd;
734
735err_out:
736 if (fd >= 0) {
737 socket_bitmap_put(&socket_bitmap, fd);
738 close(fd);
739 }
740
741 if (pre_fd >= 0) {
742 close(pre_fd);
743 }
744
745 return -1;
746};
747
748void __mldr_close_rpc_socket(int socket) {
749 close(socket);
750 socket_bitmap_put(&socket_bitmap, socket);
751};
752
753int __mldr_create_process_lifetime_pipe(int* fds) {
754 // These pipes are not required for Linux 5.3 or newer,
755 // we already have pidfd_open.
756 if (is_kernel_at_least(5, 3)) {
757 fds[0] = fds[1] = -1;
758 return 0;
759 }
760
761 int pre_fds[2];
762 if (pipe(pre_fds) == -1) {
763 goto err_out;
764 }
765
766 for (int i = 0; i < 2; ++i) {
767 fds[i] = socket_bitmap_get(&socket_bitmap);
768 if (fds[i] < 0) {
769 goto err_out;
770 }
771
772 if (dup2(pre_fds[i], fds[i]) < 0) {
773 socket_bitmap_put(&socket_bitmap, fds[i]);
774 fds[i] = -1;
775 goto err_out;
776 }
777
778 close(pre_fds[i]);
779 pre_fds[i] = -1;
780 }
781
782 return 0;
783
784err_out:
785 for (int i = 0; i < 2; ++i) {
786 if (fds[i] >= 0) {
787 socket_bitmap_put(&socket_bitmap, fds[i]);
788 close(fds[i]);
789 }
790
791 if (pre_fds[i] >= 0) {
792 close(pre_fds[i]);
793 }
794 }
795
796 return -1;
797}
798
799void __mldr_close_process_lifetime_pipe(int fd) {
800 if (fd != -1) {
801 close(fd);
802 socket_bitmap_put(&socket_bitmap, fd);
803 }
804}
805
806static void setup_space(struct load_results* lr, bool is_64_bit) {
807 commpage_setup(is_64_bit);
808
809 // Using the default stack top would cause the stack to be placed just above the commpage
810 // and would collide with it eventually.
811 // Instead, we manually allocate a new stack below the commpage.
812#if __x86_64__
813 lr->stack_top = commpage_address(true);
814#elif __i386__
815 lr->stack_top = commpage_address(false);
816#else
817 #error Unsupported architecture
818#endif
819
820 struct rlimit limit;
821 getrlimit(RLIMIT_STACK, &limit);
822 // allocate a few pages 16 pages if it's less than the limit; otherwise, allocate the limit
823 unsigned long size = PAGE_SIZE * 16;
824 if (limit.rlim_cur != RLIM_INFINITY && limit.rlim_cur < size) {
825 size = limit.rlim_cur;
826 }
827
828 if (compatible_mmap((void*)(lr->stack_top - size), size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE | MAP_GROWSDOWN, -1, 0) == MAP_FAILED) {
829 fprintf(stderr, "Failed to allocate stack of %lu bytes: %d (%s)\n", size, errno, strerror(errno));
830 exit(1);
831 }
832
833 unset_special_env();
834
835 lr->kernfd = __mldr_create_rpc_socket();
836 if (lr->kernfd < 0) {
837 fprintf(stderr, "Failed to create socket\n");
838 exit(1);
839 }
840
841 __dserver_main_thread_socket_fd = lr->kernfd;
842
843 int lifetime_pipe[2];
844
845 // this process is created using exec from another Darling process.
846 // darlingserver should already have the read pipe, so we don't need
847 // to check that in.
848 if (lr->lifetime_pipe != -1) {
849 lifetime_pipe[1] = socket_bitmap_get(&socket_bitmap);
850
851 if (lr->lifetime_pipe != lifetime_pipe[1]) {
852 // move the existing pipe to a higher fd number, and invalidate
853 // the old fd to prevent interfering with fds provided by
854 // socket_bitmap_get
855 if (dup2(lr->lifetime_pipe, lifetime_pipe[1]) == -1) {
856 fprintf(stderr, "Failed to dup process lifetime pipe: %d (%s)\n", errno, strerror(errno));
857 exit(1);
858 }
859 close(lr->lifetime_pipe);
860 }
861
862 lifetime_pipe[0] = -1;
863 } else {
864 if (__mldr_create_process_lifetime_pipe(lifetime_pipe) == -1) {
865 fprintf(stderr, "Failed to create process lifetime pipe: %d (%s)\n", errno, strerror(errno));
866 exit(1);
867 }
868 }
869
870 lr->lifetime_pipe = lifetime_pipe[1];
871
872 // store the write end of the pipe; the read end is sent to darlingserver.
873 __dserver_process_lifetime_pipe_fd = lifetime_pipe[1];
874
875 int dummy_stack_variable;
876 if (dserver_rpc_checkin(false, &dummy_stack_variable, lifetime_pipe[0]) < 0) {
877 fprintf(stderr, "Failed to checkin with darlingserver\n");
878 exit(1);
879 }
880
881 // keep our write end while closing the unused read end.
882 __mldr_close_process_lifetime_pipe(lifetime_pipe[0]);
883
884 if (!lr->root_path) {
885 static char vchroot_buffer[4096];
886 uint64_t vchroot_path_length = 0;
887
888 int code = dserver_rpc_vchroot_path(vchroot_buffer, sizeof(vchroot_buffer), &vchroot_path_length);
889 if (code < 0) {
890 fprintf(stderr, "Failed to retrieve vchroot path from darlingserver: %d\n", code);
891 exit(1);
892 }
893
894 if (vchroot_path_length >= sizeof(vchroot_buffer)) {
895 fprintf(stderr, "Vchroot path is too large for buffer\n");
896 exit(1);
897 } else if (vchroot_path_length > 0) {
898 lr->root_path = vchroot_buffer;
899 lr->root_path_length = vchroot_path_length;
900 }
901 }
902};
903
// Hands control to the loaded program: points the stack pointer at lr->stack_top and
// jumps to lr->entry_point. Control does not come back to the caller.
static void start_thread(struct load_results* lr) {
#ifdef __x86_64__
 // %0 = entry point (memory operand), %1 = new stack top (register operand)
 __asm__ volatile(
 "mov %1, %%rsp\n"
 "jmpq *%0"
 ::
 "m"(lr->entry_point),
 "r"(lr->stack_top)
 :
 );
#elif defined(__i386__)
 // same sequence using the 32-bit stack pointer
 __asm__ volatile(
 "mov %1, %%esp\n"
 "jmp *%0"
 ::
 "m"(lr->entry_point),
 "r"(lr->stack_top)
 :
 );
#elif defined(__arm__)
 // here both the entry point and the stack top are passed in registers
 __asm__ volatile(
 "mov sp, %1\n"
 "bx %0"
 ::
 "r"(lr->entry_point),
 "r"(lr->stack_top)
 :
 );
#else
# error Unsupported platform!
#endif
};
936
937static bool is_kernel_at_least(int major, int minor) {
938 if (kernel_major == -1) {
939 struct utsname uname_info;
940 if (uname(&uname_info) == -1) {
941 return false;
942 }
943 kernel_major = 0;
944 kernel_minor = 0;
945 size_t pos = 0;
946 while (uname_info.release[pos] != '\0' && uname_info.release[pos] != '.') {
947 kernel_major = kernel_major * 10 + uname_info.release[pos] - '0';
948 ++pos;
949 }
950 ++pos;
951 while (uname_info.release[pos] != '\0' && uname_info.release[pos] != '.') {
952 kernel_minor = kernel_minor * 10 + uname_info.release[pos] - '0';
953 ++pos;
954 }
955 }
956
957 if (major != kernel_major) {
958 return kernel_major > major;
959 }
960
961 return kernel_minor >= minor;
962}
963
964void* compatible_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) {
965 // MAP_FIXED_NOREPLACE is not supported on WSL1 (Linux < 4.17).
966 bool fixed_noreplace_hack = false;
967 if ((flags & MAP_FIXED_NOREPLACE) && !is_kernel_at_least(4, 17)) {
968 flags &= ~MAP_FIXED_NOREPLACE;
969 fixed_noreplace_hack = true;
970 }
971 void* result = mmap(addr, length, prot, flags, fd, offset);
972 // MAP_GROWSDOWN is not supported on WSL1. See https://github.com/microsoft/WSL/issues/8095.
973 if ((result == (void*)MAP_FAILED) && (flags & MAP_GROWSDOWN) && (errno == EOPNOTSUPP)) {
974 result = mmap(addr, length, prot, (flags & ~MAP_GROWSDOWN), fd, offset);
975 }
976 if (fixed_noreplace_hack) {
977 if (result != addr && result != (void*)MAP_FAILED) {
978 errno = ESRCH;
979 munmap(addr, length);
980 return MAP_FAILED;
981 }
982 }
983 return result;
984}
985
986static void vchroot_unexpand_interpreter(struct load_results* lr) {
987 static char unexpanded[4096];
988 size_t length;
989
990 if (lr->root_path) {
991 length = strlen(lr->argv[0]);
992
993 if (strncmp(lr->argv[0], lr->root_path, lr->root_path_length) == 0) {
994 memmove(unexpanded, lr->argv[0] + lr->root_path_length, length - lr->root_path_length + 1);
995 } else {
996 // FIXME: potential buffer overflow
997 memmove(unexpanded + sizeof(SYSTEM_ROOT) - 1, lr->argv[0], length + 1);
998 memcpy(unexpanded, SYSTEM_ROOT, sizeof(SYSTEM_ROOT) - 1);
999 }
1000
1001 lr->argv[0] = unexpanded;
1002 }
1003};