src/startup/mldr/elfcalls/threads.c at fixPythonPipStalling

overby.me / darling-nix
fork atom
this repo has no description
fork atom
darling-nix / src / startup / mldr / elfcalls / threads.c
at fixPythonPipStalling 409 lines 13 kB view raw
wrap content
Thomas A [mldr] Rewrite darling_thread_entry Inline Assembly 2y ago
c061846e
  1/*
  2This file is part of Darling.
  3
  4Copyright (C) 2015-2018 Lubos Dolezel
  5
  6Darling is free software: you can redistribute it and/or modify
  7it under the terms of the GNU General Public License as published by
  8the Free Software Foundation, either version 3 of the License, or
  9(at your option) any later version.
 10
 11Darling is distributed in the hope that it will be useful,
 12but WITHOUT ANY WARRANTY; without even the implied warranty of
 13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 14GNU General Public License for more details.
 15
 16You should have received a copy of the GNU General Public License
 17along with Darling.  If not, see <http://www.gnu.org/licenses/>.
 18*/
 19
 20#include "threads.h"
 21#include <pthread.h>
 22#include <sys/mman.h>
 23#include <semaphore.h>
 24#include <string.h>
 25#include <stdbool.h>
 26#include <stdlib.h>
 27#include <signal.h>
 28#include <unistd.h>
 29#include <sys/syscall.h>
 30#include <setjmp.h>
 31#include <sys/syscall.h>
 32#include <sys/socket.h>
 33#include <stdio.h>
 34#include <fcntl.h>
 35
 36#include "dthreads.h"
 37
 38#include <darlingserver/rpc.h>
 39
 40extern int __mldr_create_rpc_socket(void);
 41extern void __mldr_close_rpc_socket(int socket);
 42
// The point of this file is to build macOS threads on top of native libc's threads;
// otherwise it would not be possible to make native calls from these threads.
 45
// Per-thread state (every variable below is `__thread`, i.e. one instance per native thread).

// jump target armed by setjmp() in darling_thread_entry(); __darling_thread_terminate() longjmp()s to it
static __thread jmp_buf t_jmpbuf;
// address and size of the region that darling_thread_entry() munmap()s after that longjmp();
// both are filled in by __darling_thread_terminate()
static __thread void* t_freeaddr;
static __thread size_t t_freesize;
// this thread's darlingserver RPC socket; stays -1 until one is assigned
static __thread int t_server_socket = -1;
// callbacks handed to __darling_thread_create(); stored by darling_thread_entry()
static __thread darling_thread_create_callbacks_t t_callbacks = NULL;
 51
// Signature used to store the entry point that darling_thread_entry() jumps to.
typedef void (*thread_ep)(void**, int, ...);

// Start-up parameters handed from __darling_thread_create() to darling_thread_entry().
//
// NOTE: the i386 inline assembly in darling_thread_entry() reads `arg1`, `arg2`, `arg3`,
// `port` and `pth` through hard-coded byte offsets from the start of this structure,
// so the order and size of the members up to and including `pth` must not change.
struct arg_struct
{
	thread_ep entry_point; // address jumped to once the thread has switched to its own stack
	uintptr_t real_entry_point; // 0 marks a workqueue thread (see `is_workqueue`)
	uintptr_t arg1; // `user_arg` for normal threads; `keventlist` for workqueues
	uintptr_t arg2; // `stack_addr` for normal threads; `flags` for workqueues
	uintptr_t arg3; // `flags` for normal threads; `nkevents` for workqueues
	union {
		void* _backwards_compat; // kept around to avoid modifying assembly
		int port; // filled in by darling_thread_entry() with the result of `thread_self_trap()`
	};
	unsigned long pth_obj_size;
	// the thread's dthread structure; the copy owned by the creating thread is also the
	// start-up handshake: darling_thread_entry() sets it to NULL once it has copied the arguments
	void* pth;
	darling_thread_create_callbacks_t callbacks;
	uintptr_t stack_bottom; // `stack_addr - stack_size`, computed in __darling_thread_create()
	uintptr_t stack_addr; // top (highest address) of the stack the thread switches to
	bool is_workqueue; // true when `real_entry_point` is 0
};
 71
 72static void* darling_thread_entry(void* p);
 73
// Fallback value, used only when the system headers do not provide PTHREAD_STACK_MIN.
#ifndef PTHREAD_STACK_MIN
#	define PTHREAD_STACK_MIN 16384
#endif

// Guard size (in bytes) that __darling_thread_create() passes to dthread_structure_allocate().
#define DEFAULT_DTHREAD_GUARD_SIZE 0x1000
 79
 80static inline void *align_16(uintptr_t ptr) {
 81	return (void *) ((uintptr_t) ptr & ~(uintptr_t) 15);
 82}
 83
 84static dthread_t dthread_structure_init(dthread_t dthread, size_t guard_size, void* stack_addr, size_t stack_size, void* base_addr, size_t total_size) {
 85	// the pthread signature is the address of the pthread XORed with the "pointer munge" token passed in by the kernel
 86	// since the LKM doesn't pass in a token, it's always zero, so the signature is equal to just the address
 87	dthread->sig = (uintptr_t)dthread;
 88
 89	dthread->tsd[DTHREAD_TSD_SLOT_PTHREAD_SELF] = dthread;
 90	dthread->tsd[DTHREAD_TSD_SLOT_ERRNO] = &dthread->err_no;
 91	dthread->tsd[DTHREAD_TSD_SLOT_PTHREAD_QOS_CLASS] = (void*)(uintptr_t)(DTHREAD_DEFAULT_PRIORITY);
 92	dthread->tsd[DTHREAD_TSD_SLOT_PTR_MUNGE] = 0;
 93	dthread->tl_has_custom_stack = 0;
 94	dthread->lock = (darwin_os_unfair_lock){0};
 95
 96	dthread->stackaddr = stack_addr;
 97	dthread->stackbottom = (char*)stack_addr - stack_size;
 98	dthread->freeaddr = base_addr;
 99	dthread->freesize = total_size;
100	dthread->guardsize = guard_size;
101
102	dthread->cancel_state = DTHREAD_CANCEL_ENABLE | DTHREAD_CANCEL_DEFERRED;
103
104	// technically, these next values are defaults; we don't have a way to get more info from the user
105	//
106	// it's not too important since the only cases where we initialize the dthread structure ourselves is when we're working with workqueues,
107	// and those initialize their own dthread structures when they get them
108
109	dthread->tl_joinable = 1;
110	dthread->inherit = DTHREAD_INHERIT_SCHED;
111	dthread->tl_policy = DARWIN_POLICY_TIMESHARE;
112
113	return dthread;
114};
115
116static dthread_t dthread_structure_allocate(size_t stack_size, size_t guard_size, void** stack_addr) {
117	size_t total_size = guard_size + stack_size + sizeof(struct _dthread);
118
119	// allocate our stack, guard page, and dthread structure
120	void* base_addr = mmap(NULL, total_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
121
122	// protect our guard page
123	mprotect(base_addr, guard_size, PROT_NONE);
124
125	/**
126	 * memory layout of newly allocated block:
127	 *
128	 * [base_addr]         [base_addr + total_size]
129	 * --------------------------------------------
130	 * | guard page |       stack       | dthread |
131	 */
132
133	// stack_addr points to the top of the stack (i.e. the highest address)
134	*stack_addr = ((char*)base_addr) + stack_size + guard_size;
135
136	// the dthread sits above the stack
137	// (and by "above", i mean the lowest address of the dthread is the highest address of the stack)
138	dthread_t dthread = (dthread_t)*stack_addr;
139	// zero-out the entrire dthread structure
140	memset(dthread, 0, sizeof(struct _dthread));
141
142	return dthread_structure_init(dthread, guard_size, *stack_addr, stack_size, base_addr, total_size);
143};
144
145void* __darling_thread_create(unsigned long stack_size, unsigned long pth_obj_size,
146				void* entry_point, uintptr_t real_entry_point,
147				uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
148				darling_thread_create_callbacks_t callbacks, void* pth)
149{
150	struct arg_struct args = {
151		.entry_point      = (thread_ep)entry_point,
152		.real_entry_point = real_entry_point,
153		.arg1             = arg1,
154		.arg2             = arg2,
155		.arg3             = arg3,
156		.port             = 0,
157		.pth_obj_size     = pth_obj_size,
158		.pth              = NULL, // set later on
159		.callbacks        = callbacks,
160		.stack_addr       = 0, // set later on
161		.is_workqueue     = real_entry_point == 0, // our `workq_kernreturn` sets `real_entry_point` to NULL; `bsdthread_create` actually passes a value
162	};
163	pthread_attr_t attr;
164	pthread_t nativeLibcThread;
165
166	pthread_attr_init(&attr);
167	//pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
168	// pthread_attr_setstacksize(&attr, stack_size);
169
170	// in some cases, we're already given a pthread object, stack, and guard page;
171	// in those cases, just use what we're given (it also contains more information)
172	//
173	// otherwise, allocate them ourselves
174	if (pth == NULL || args.is_workqueue) {
175		pth = dthread_structure_allocate(stack_size, DEFAULT_DTHREAD_GUARD_SIZE, (void**)&args.stack_addr);
176	} else if (!args.is_workqueue) {
177		// `arg2` is `stack_addr` for normal threads
178		args.stack_addr = arg2;
179	}
180
181	args.stack_bottom = args.stack_addr - stack_size;
182
183	// pthread_attr_setstack is buggy. The documentation states we should provide the lowest
184	// address of the stack, yet some versions regard it as the highest address instead.
185	// Therefore it's better to just make the pthread stack as small as possible and then switch
186	// to our own stack instead.
187	//pthread_attr_setstack(&attr, ((char*)pth) + pth_obj_size, stack_size - pth_obj_size - 0x1000);
188
189	// std::cout << "Allocated stack at " << pth << ", size " << stack_size << std::endl;
190
191	pthread_attr_setstacksize(&attr, 4096);
192
193	//pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
194
195	args.pth = pth;
196	pthread_create(&nativeLibcThread, &attr, darling_thread_entry, &args);
197	pthread_attr_destroy(&attr);
198
199	while (args.pth != NULL)
200		sched_yield();
201
202	return pth;
203}
204
/**
 * Start routine of every native thread created by __darling_thread_create().
 *
 * It copies the start-up arguments, opens and guards a per-thread darlingserver RPC socket,
 * sets the TSD base, checks in with darlingserver, releases the creating thread, arms the
 * setjmp() target used by __darling_thread_terminate(), and finally switches to the thread's
 * own stack and jumps to `entry_point`. The jump never returns here; the only way back is the
 * longjmp() from __darling_thread_terminate(), which makes this function unmap the memory
 * recorded in t_freeaddr/t_freesize, detach the native thread and return NULL.
 *
 * @param p  pointer to the creating thread's `struct arg_struct`
 * @return NULL (only after the thread has been terminated)
 */
static void* darling_thread_entry(void* p)
{
	struct arg_struct* in_args = (struct arg_struct*) p;
	struct arg_struct args;

	// take a private copy: `in_args` points into the creating thread's stack frame, which
	// goes away once that thread sees `in_args->pth == NULL` (see below)
	memcpy(&args, in_args, sizeof(args));

	dthread_t dthread = args.pth;
	// the flags argument is `arg2` for workqueue threads and `arg3` for normal threads
	uintptr_t* flags = args.is_workqueue ? &args.arg2 : &args.arg3;

	// create a new dserver RPC socket
	int new_rpc_fd = __mldr_create_rpc_socket();
	if (new_rpc_fd < 0) {
		// we can't do anything if we don't get our own separate connection to darlingserver
		fprintf(stderr, "Failed to create socket\n");
		abort();
	}

	// guard the new RPC FD
	args.callbacks->rpc_guard(new_rpc_fd);

	// the socket is ready; assign it now
	t_server_socket = new_rpc_fd;
	t_callbacks = args.callbacks;

	// libpthread now expects the kernel to set the TSD
	// so, since we're pretending to be the kernel handling threads...
	args.callbacks->thread_set_tsd_base(&dthread->tsd[0], 0);
	// tell the entry point (through its flags argument) that the TSD base is already set
	*flags |= args.is_workqueue ? DWQ_FLAG_THREAD_TSD_BASE_SET : DTHREAD_START_TSD_BASE_SET;

	// let's check-in with darlingserver on this new thread
	// NOTE(review): only the address of this variable is used; presumably it gives darlingserver
	// an address on this thread's current stack — confirm against dserver_rpc_explicit_checkin
	int dummy_stack_variable;
	// the lifetime pipe fd is ignored as the process should already have been registered
	if (dserver_rpc_explicit_checkin(t_server_socket, false, &dummy_stack_variable, -1) < 0) {
		// we can't do ANYTHING if darlingserver doesn't acknowledge us successfully
		abort();
	}

	int thread_self_port = args.callbacks->thread_self_trap();
	dthread->tsd[DTHREAD_TSD_SLOT_MACH_THREAD_SELF] = (void*)(intptr_t)thread_self_port;
	args.port = thread_self_port;

	// release the creating thread, which is spinning until this field becomes NULL;
	// `in_args` must not be touched after this line
	in_args->pth = NULL;

	// setjmp() returns non-zero only when __darling_thread_terminate() longjmp()s back here
	if (setjmp(t_jmpbuf))
	{
		// Terminate the Linux thread
		munmap(t_freeaddr, t_freesize);
		pthread_detach(pthread_self());
		return NULL;
	}

	void *stack_ptr = align_16(args.stack_addr);

	// No additional function calls should occur beyond this point. Otherwise, we will risk our
	// registers being call-clobbered. I recommend reading the following doc for more details:
	// https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html
#if __x86_64__
	// pin each entry-point argument to the register named in its asm() specifier
	register void*     arg1 asm("rdi") = args.pth;
	register int       arg2 asm("esi") = args.port;
	register uintptr_t arg3 asm("rdx") = args.real_entry_point;
	register uintptr_t arg4 asm("rcx") = args.arg1;
	register uintptr_t arg5 asm("r8")  = args.arg2;
	register uintptr_t arg6 asm("r9")  = args.arg3;
#elif __i386__
	uintptr_t arg3 = args.real_entry_point;
#endif

	// `real_entry_point` is 0 for workqueue threads; they receive the stack bottom
	// as their third argument instead
	if (arg3 == 0) {
		arg3 = (long) args.stack_bottom;
	}

#ifdef __x86_64__
	asm volatile(
		// Zero out the frame base register.
		"xorq %%rbp, %%rbp\n"
		// Switch to the new stack.
		"movq %[stack_ptr], %%rsp\n"
		// Push a fake return address.
		"pushq $0\n"
		// Jump to the entry point.
		"jmp *%[entry_point]" ::
		
		// Function arguments
		"r"(arg1),"r"(arg2),"r"(arg3),"r"(arg4),"r"(arg5),"r"(arg6),
		
		[entry_point] "r"(args.entry_point),
		[stack_ptr] "r"(stack_ptr)
	);
#elif defined(__i386__) // all six arguments are pushed onto the new stack
	__asm__ __volatile__ (
		// Zero out the frame base register.
		"xorl %%ebp, %%ebp\n"
		// Switch to the new stack.
		"movl %[stack_ptr], %%esp\n"
		// Make sure stack is 16 aligned (before we push the fake return address)
		"sub $8, %%esp\n"
		// Unlike x86_64, all function arguments must be stored in the stack
		// (the numeric displacements are byte offsets into `struct arg_struct`)
		"pushl   16(%[args])\n"		// 6th argument | args.arg3
		"pushl   12(%[args])\n"		// 5th argument | args.arg2
		"pushl   8(%[args])\n"		// 4th argument | args.arg1
		"pushl   %[arg3]\n"			// 3rd argument | arg3 (real entry point, or stack bottom)
		"pushl   20(%[args])\n"		// 2nd argument | args.port
		"pushl   28(%[args])\n"		// 1st argument | args.pth
		// Push a fake return address.
		"pushl $0\n"
		// Jump to the entry point.
		"jmp *%[entry_point]" ::
		
		// Function arguments to push to the stack.
		[args] "r"(&args), [arg3]"r"(arg3),

		[entry_point] "r"(args.entry_point),
		[stack_ptr] "r"(stack_ptr)
	);
#else
	#error Not implemented
	// args.entry_point(args.pth, args.port, args.real_entry_point, args.arg1, args.arg2, args.arg3);
#endif
	// the assembly above jumps away and never falls through
	__builtin_unreachable();
}
326
327int __darling_thread_terminate(void* stackaddr,
328				unsigned long freesize, unsigned long pthobj_size)
329{
330	int checkout_result = 0;
331
332	if (t_server_socket != -1) {
333		checkout_result = dserver_rpc_explicit_checkout(t_server_socket, -1, false);
334	} else {
335		checkout_result = dserver_rpc_checkout(-1, false);
336	}
337
338	if (checkout_result < 0) {
339		// failing to check-out is not fatal.
340		// it's not ideal, but it's not fatal.
341		#define CHECKOUT_FAILURE_MESSAGE "Failed to checkout"
342		if (t_server_socket != -1) {
343			dserver_rpc_explicit_kprintf(t_server_socket, CHECKOUT_FAILURE_MESSAGE, sizeof(CHECKOUT_FAILURE_MESSAGE) - 1);
344		} else {
345			dserver_rpc_kprintf(CHECKOUT_FAILURE_MESSAGE, sizeof(CHECKOUT_FAILURE_MESSAGE) - 1);
346		}
347	}
348
349	// close the RPC FD (if necessary)
350	// it should already have been unguarded by our caller
351	if (t_server_socket != -1) {
352		__mldr_close_rpc_socket(t_server_socket);
353	}
354
355	if (getpid() == syscall(SYS_gettid))
356	{
357		// dispatch_main() calls pthread_exit(NULL) on the main thread,
358		// which turns our process into a zombie on Linux.
359		// Let's just hang around forever.
360		sigset_t mask;
361		memset(&mask, 0, sizeof(mask));
362
363		while (1)
364			sigsuspend(&mask);
365	}
366
367	t_freeaddr = stackaddr;
368	t_freesize = freesize;
369
370	longjmp(t_jmpbuf, 1);
371
372	__builtin_unreachable();
373}
374
375extern void* __mldr_main_stack_top;
376
377void* __darling_thread_get_stack(void)
378{
379	return __mldr_main_stack_top;
380}
381
382extern int __dserver_main_thread_socket_fd;
383
384int __darling_thread_rpc_socket(void) {
385	if (t_server_socket == -1) {
386		if (getpid() == syscall(SYS_gettid)) {
387			// this is the main thread
388			t_server_socket = __dserver_main_thread_socket_fd;
389		} else {
390			// threads should already have a per-thread socket assigned when they're created
391			abort();
392		}
393	}
394	return t_server_socket;
395};
396
397void __darling_thread_rpc_socket_refresh(void) {
398	int new_rpc_fd = __mldr_create_rpc_socket();
399	if (new_rpc_fd < 0) {
400		abort();
401	}
402
403	t_server_socket = new_rpc_fd;
404
405	// if this is the main thread, also update the socket used by mldr
406	if (getpid() == syscall(SYS_gettid)) {
407		__dserver_main_thread_socket_fd = t_server_socket;
408	}
409};