/* This file is part of Darling. Copyright (C) 2015-2018 Lubos Dolezel Darling is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Darling is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Darling. If not, see <http://www.gnu.org/licenses/>. */ #include "threads.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "dthreads.h" #include extern int __mldr_create_rpc_socket(void); extern void __mldr_close_rpc_socket(int socket); // The point of this file is to build macOS threads on top of the native libc's threads; // otherwise it would not be possible to make native calls from these threads.
static __thread jmp_buf t_jmpbuf; static __thread void* t_freeaddr; static __thread size_t t_freesize; static __thread int t_server_socket = -1; static __thread darling_thread_create_callbacks_t t_callbacks = NULL; typedef void (*thread_ep)(void**, int, ...); struct arg_struct { thread_ep entry_point; uintptr_t real_entry_point; uintptr_t arg1; // `user_arg` for normal threads; `keventlist` for workqueues uintptr_t arg2; // `stack_addr` for normal threads; `flags` for workqueues uintptr_t arg3; // `flags` for normal threads; `nkevents` for workqueues union { void* _backwards_compat; // kept around to avoid modifying assembly int port; }; unsigned long pth_obj_size; void* pth; darling_thread_create_callbacks_t callbacks; uintptr_t stack_bottom; uintptr_t stack_addr; bool is_workqueue; }; static void* darling_thread_entry(void* p); #ifndef PTHREAD_STACK_MIN # define PTHREAD_STACK_MIN 16384 #endif #define DEFAULT_DTHREAD_GUARD_SIZE 0x1000 static inline void *align_16(uintptr_t ptr) { return (void *) ((uintptr_t) ptr & ~(uintptr_t) 15); } static dthread_t dthread_structure_init(dthread_t dthread, size_t guard_size, void* stack_addr, size_t stack_size, void* base_addr, size_t total_size) { // the pthread signature is the address of the pthread XORed with the "pointer munge" token passed in by the kernel // since the LKM doesn't pass in a token, it's always zero, so the signature is equal to just the address dthread->sig = (uintptr_t)dthread; dthread->tsd[DTHREAD_TSD_SLOT_PTHREAD_SELF] = dthread; dthread->tsd[DTHREAD_TSD_SLOT_ERRNO] = &dthread->err_no; dthread->tsd[DTHREAD_TSD_SLOT_PTHREAD_QOS_CLASS] = (void*)(uintptr_t)(DTHREAD_DEFAULT_PRIORITY); dthread->tsd[DTHREAD_TSD_SLOT_PTR_MUNGE] = 0; dthread->tl_has_custom_stack = 0; dthread->lock = (darwin_os_unfair_lock){0}; dthread->stackaddr = stack_addr; dthread->stackbottom = (char*)stack_addr - stack_size; dthread->freeaddr = base_addr; dthread->freesize = total_size; dthread->guardsize = guard_size; dthread->cancel_state 
= DTHREAD_CANCEL_ENABLE | DTHREAD_CANCEL_DEFERRED; // technically, these next values are defaults; we don't have a way to get more info from the user // // it's not too important since the only cases where we initialize the dthread structure ourselves is when we're working with workqueues, // and those initialize their own dthread structures when they get them dthread->tl_joinable = 1; dthread->inherit = DTHREAD_INHERIT_SCHED; dthread->tl_policy = DARWIN_POLICY_TIMESHARE; return dthread; }; static dthread_t dthread_structure_allocate(size_t stack_size, size_t guard_size, void** stack_addr) { size_t total_size = guard_size + stack_size + sizeof(struct _dthread); // allocate our stack, guard page, and dthread structure void* base_addr = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); // protect our guard page mprotect(base_addr, guard_size, PROT_NONE); /** * memory layout of newly allocated block: * * [base_addr] [base_addr + total_size] * -------------------------------------------- * | guard page | stack | dthread | */ // stack_addr points to the top of the stack (i.e. 
the highest address) *stack_addr = ((char*)base_addr) + stack_size + guard_size; // the dthread sits above the stack // (and by "above", i mean the lowest address of the dthread is the highest address of the stack) dthread_t dthread = (dthread_t)*stack_addr; // zero-out the entrire dthread structure memset(dthread, 0, sizeof(struct _dthread)); return dthread_structure_init(dthread, guard_size, *stack_addr, stack_size, base_addr, total_size); }; void* __darling_thread_create(unsigned long stack_size, unsigned long pth_obj_size, void* entry_point, uintptr_t real_entry_point, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, darling_thread_create_callbacks_t callbacks, void* pth) { struct arg_struct args = { .entry_point = (thread_ep)entry_point, .real_entry_point = real_entry_point, .arg1 = arg1, .arg2 = arg2, .arg3 = arg3, .port = 0, .pth_obj_size = pth_obj_size, .pth = NULL, // set later on .callbacks = callbacks, .stack_addr = 0, // set later on .is_workqueue = real_entry_point == 0, // our `workq_kernreturn` sets `real_entry_point` to NULL; `bsdthread_create` actually passes a value }; pthread_attr_t attr; pthread_t nativeLibcThread; pthread_attr_init(&attr); //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); // pthread_attr_setstacksize(&attr, stack_size); // in some cases, we're already given a pthread object, stack, and guard page; // in those cases, just use what we're given (it also contains more information) // // otherwise, allocate them ourselves if (pth == NULL || args.is_workqueue) { pth = dthread_structure_allocate(stack_size, DEFAULT_DTHREAD_GUARD_SIZE, (void**)&args.stack_addr); } else if (!args.is_workqueue) { // `arg2` is `stack_addr` for normal threads args.stack_addr = arg2; } args.stack_bottom = args.stack_addr - stack_size; // pthread_attr_setstack is buggy. The documentation states we should provide the lowest // address of the stack, yet some versions regard it as the highest address instead. 
// Therefore it's better to just make the pthread stack as small as possible and then switch // to our own stack instead. //pthread_attr_setstack(&attr, ((char*)pth) + pth_obj_size, stack_size - pth_obj_size - 0x1000); // std::cout << "Allocated stack at " << pth << ", size " << stack_size << std::endl; pthread_attr_setstacksize(&attr, 4096); //pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); args.pth = pth; pthread_create(&nativeLibcThread, &attr, darling_thread_entry, &args); pthread_attr_destroy(&attr); while (args.pth != NULL) sched_yield(); return pth; } static void* darling_thread_entry(void* p) { struct arg_struct* in_args = (struct arg_struct*) p; struct arg_struct args; memcpy(&args, in_args, sizeof(args)); dthread_t dthread = args.pth; uintptr_t* flags = args.is_workqueue ? &args.arg2 : &args.arg3; // create a new dserver RPC socket int new_rpc_fd = __mldr_create_rpc_socket(); if (new_rpc_fd < 0) { // we can't do anything if we don't get our own separate connection to darlingserver fprintf(stderr, "Failed to create socket\n"); abort(); } // guard the new RPC FD args.callbacks->rpc_guard(new_rpc_fd); // the socket is ready; assign it now t_server_socket = new_rpc_fd; t_callbacks = args.callbacks; // libpthread now expects the kernel to set the TSD // so, since we're pretending to be the kernel handling threads... args.callbacks->thread_set_tsd_base(&dthread->tsd[0], 0); *flags |= args.is_workqueue ? 
DWQ_FLAG_THREAD_TSD_BASE_SET : DTHREAD_START_TSD_BASE_SET; // let's check-in with darlingserver on this new thread int dummy_stack_variable; // the lifetime pipe fd is ignored as the process should already have been registered if (dserver_rpc_explicit_checkin(t_server_socket, false, &dummy_stack_variable, -1) < 0) { // we can't do ANYTHING if darlingserver doesn't acknowledge us successfully abort(); } int thread_self_port = args.callbacks->thread_self_trap(); dthread->tsd[DTHREAD_TSD_SLOT_MACH_THREAD_SELF] = (void*)(intptr_t)thread_self_port; args.port = thread_self_port; in_args->pth = NULL; if (setjmp(t_jmpbuf)) { // Terminate the Linux thread munmap(t_freeaddr, t_freesize); pthread_detach(pthread_self()); return NULL; } void *stack_ptr = align_16(args.stack_addr); // No additional function calls should occur beyond this point. Otherwise, we will risk our // registers being call-clobbered. I recommend reading the following doc for more details: // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html #if __x86_64__ register void* arg1 asm("rdi") = args.pth; register int arg2 asm("esi") = args.port; register uintptr_t arg3 asm("rdx") = args.real_entry_point; register uintptr_t arg4 asm("rcx") = args.arg1; register uintptr_t arg5 asm("r8") = args.arg2; register uintptr_t arg6 asm("r9") = args.arg3; #elif __i386__ uintptr_t arg3 = args.real_entry_point; #endif if (arg3 == 0) { arg3 = (long) args.stack_bottom; } #ifdef __x86_64__ asm volatile( // Zero out the frame base register. "xorq %%rbp, %%rbp\n" // Switch to the new stack. "movq %[stack_ptr], %%rsp\n" // Push a fake return address. "pushq $0\n" // Jump to the entry point. "jmp *%[entry_point]" :: // Function arguments "r"(arg1),"r"(arg2),"r"(arg3),"r"(arg4),"r"(arg5),"r"(arg6), [entry_point] "r"(args.entry_point), [stack_ptr] "r"(stack_ptr) ); #elif defined(__i386__) // args in eax, ebx, ecx, edx, edi, esi __asm__ __volatile__ ( // Zero out the frame base register. 
"xorl %%ebp, %%ebp\n" // Switch to the new stack. "movl %[stack_ptr], %%esp\n" // Make sure stack is 16 aligned (before we push the fake return address) "sub $8, %%esp\n" // Unlike x86_64, all function arguments must be stored in the stack "pushl 16(%[args])\n" // 6th argument | args.arg3 "pushl 12(%[args])\n" // 5th argument | args.arg2 "pushl 8(%[args])\n" // 4th argument | args.arg1 "pushl %[arg3]\n" // 3rd argument | args3 "pushl 20(%[args])\n" // 2nd argument | args.port "pushl 28(%[args])\n" // 1st argument | args.pth // Push a fake return address. "pushl $0\n" // Jump to the entry point. "jmp *%[entry_point]" :: // Function arguments to push to the stack. [args] "r"(&args), [arg3]"r"(arg3), [entry_point] "r"(args.entry_point), [stack_ptr] "r"(stack_ptr) ); #else #error Not implemented // args.entry_point(args.pth, args.port, args.real_entry_point, args.arg1, args.arg2, args.arg3); #endif __builtin_unreachable(); } int __darling_thread_terminate(void* stackaddr, unsigned long freesize, unsigned long pthobj_size) { int checkout_result = 0; if (t_server_socket != -1) { checkout_result = dserver_rpc_explicit_checkout(t_server_socket, -1, false); } else { checkout_result = dserver_rpc_checkout(-1, false); } if (checkout_result < 0) { // failing to check-out is not fatal. // it's not ideal, but it's not fatal. #define CHECKOUT_FAILURE_MESSAGE "Failed to checkout" if (t_server_socket != -1) { dserver_rpc_explicit_kprintf(t_server_socket, CHECKOUT_FAILURE_MESSAGE, sizeof(CHECKOUT_FAILURE_MESSAGE) - 1); } else { dserver_rpc_kprintf(CHECKOUT_FAILURE_MESSAGE, sizeof(CHECKOUT_FAILURE_MESSAGE) - 1); } } // close the RPC FD (if necessary) // it should already have been unguarded by our caller if (t_server_socket != -1) { __mldr_close_rpc_socket(t_server_socket); } if (getpid() == syscall(SYS_gettid)) { // dispatch_main() calls pthread_exit(NULL) on the main thread, // which turns our process into a zombie on Linux. // Let's just hang around forever. 
sigset_t mask; memset(&mask, 0, sizeof(mask)); while (1) sigsuspend(&mask); } t_freeaddr = stackaddr; t_freesize = freesize; longjmp(t_jmpbuf, 1); __builtin_unreachable(); } extern void* __mldr_main_stack_top; void* __darling_thread_get_stack(void) { return __mldr_main_stack_top; } extern int __dserver_main_thread_socket_fd; int __darling_thread_rpc_socket(void) { if (t_server_socket == -1) { if (getpid() == syscall(SYS_gettid)) { // this is the main thread t_server_socket = __dserver_main_thread_socket_fd; } else { // threads should already have a per-thread socket assigned when they're created abort(); } } return t_server_socket; }; void __darling_thread_rpc_socket_refresh(void) { int new_rpc_fd = __mldr_create_rpc_socket(); if (new_rpc_fd < 0) { abort(); } t_server_socket = new_rpc_fd; // if this is the main thread, also update the socket used by mldr if (getpid() == syscall(SYS_gettid)) { __dserver_main_thread_socket_fd = t_server_socket; } };