// Serenity Operating System
1/*
2 * Copyright (c) 2018-2021, Andreas Kling <kling@serenityos.org>
3 * Copyright (c) 2022, the SerenityOS developers.
4 *
5 * SPDX-License-Identifier: BSD-2-Clause
6 */
7
8#include <AK/ScopeGuard.h>
9#include <AK/TemporaryChange.h>
10#include <Kernel/Arch/CPU.h>
11#include <Kernel/Debug.h>
12#include <Kernel/FileSystem/Custody.h>
13#include <Kernel/FileSystem/OpenFileDescription.h>
14#include <Kernel/FileSystem/VirtualFileSystem.h>
15#include <Kernel/Memory/MemoryManager.h>
16#include <Kernel/Memory/Region.h>
17#include <Kernel/Memory/SharedInodeVMObject.h>
18#include <Kernel/Panic.h>
19#include <Kernel/PerformanceManager.h>
20#include <Kernel/Process.h>
21#include <Kernel/Random.h>
22#include <Kernel/Scheduler.h>
23#include <Kernel/Time/TimeManagement.h>
24#include <LibELF/AuxiliaryVector.h>
25#include <LibELF/Image.h>
26#include <LibELF/Validation.h>
27
28namespace Kernel {
29
30extern Memory::Region* g_signal_trampoline_region;
31
// Result of mapping an ELF object into a fresh address space.
// NOTE: Member order must match the brace-initialization at the end of load_elf_object().
struct LoadResult {
    // The address space the object was loaded into; ownership transfers to the caller.
    OwnPtr<Memory::AddressSpace> space;
    // Address the object's image starts at (vaddr of the PT_LOAD segment with file offset 0).
    FlatPtr load_base { 0 };
    // Entry point, already adjusted by the load offset.
    FlatPtr entry_eip { 0 };
    // Size of the executable file in bytes.
    size_t size { 0 };
    // Master TLS image region (only set when the kernel allocated TLS, i.e. static executables).
    LockWeakPtr<Memory::Region> tls_region;
    size_t tls_size { 0 };
    size_t tls_alignment { 0 };
    // The main thread's userspace stack region.
    LockWeakPtr<Memory::Region> stack_region;
};
42
// Number of entries in the auxiliary vector placed on the new process's stack.
// Must stay in sync with the number of initializers in generate_auxiliary_vector() below.
static constexpr size_t auxiliary_vector_size = 15;
static Array<ELF::AuxiliaryValue, auxiliary_vector_size> generate_auxiliary_vector(FlatPtr load_base, FlatPtr entry_eip, UserID uid, UserID euid, GroupID gid, GroupID egid, StringView executable_path, Optional<Process::ScopedDescriptionAllocation> const& main_program_fd_allocation);
45
46static bool validate_stack_size(Vector<NonnullOwnPtr<KString>> const& arguments, Vector<NonnullOwnPtr<KString>>& environment, Array<ELF::AuxiliaryValue, auxiliary_vector_size> const& auxiliary)
47{
48 size_t total_arguments_size = 0;
49 size_t total_environment_size = 0;
50 size_t total_auxiliary_size = 0;
51
52 for (auto const& a : arguments)
53 total_arguments_size += a->length() + 1;
54 for (auto const& e : environment)
55 total_environment_size += e->length() + 1;
56 for (auto const& v : auxiliary) {
57 if (!v.optional_string.is_empty())
58 total_auxiliary_size += round_up_to_power_of_two(v.optional_string.length() + 1, sizeof(FlatPtr));
59
60 if (v.auxv.a_type == ELF::AuxiliaryValue::Random)
61 total_auxiliary_size += round_up_to_power_of_two(16, sizeof(FlatPtr));
62 }
63
64 total_arguments_size += sizeof(char*) * (arguments.size() + 1);
65 total_environment_size += sizeof(char*) * (environment.size() + 1);
66 total_auxiliary_size += sizeof(auxv_t) * auxiliary.size();
67
68 if (total_arguments_size > Process::max_arguments_size)
69 return false;
70
71 if (total_environment_size > Process::max_environment_size)
72 return false;
73
74 if (total_auxiliary_size > Process::max_auxiliary_size)
75 return false;
76
77 return true;
78}
79
// Builds the initial userspace stack for the new main thread inside the given
// stack region: argument/environment strings and auxv payloads first, then the
// auxv records, envp array, argv array, and finally a 16-byte-aligned stack
// pointer. Also stashes argc/argv/envp in the architecture's first three
// argument registers. Returns the new userspace stack pointer.
static ErrorOr<FlatPtr> make_userspace_context_for_main_thread([[maybe_unused]] ThreadRegisters& regs, Memory::Region& region, Vector<NonnullOwnPtr<KString>> const& arguments,
    Vector<NonnullOwnPtr<KString>> const& environment, Array<ELF::AuxiliaryValue, auxiliary_vector_size> auxiliary_values)
{
    FlatPtr new_sp = region.range().end().get();

    // Add some bits of randomness to the user stack pointer.
    new_sp -= round_up_to_power_of_two(get_fast_random<u32>() % 4096, 16);

    // Push a single machine word. copy_to_user() cannot fail here since the
    // stack region was just allocated; VERIFY documents that invariant.
    auto push_on_new_stack = [&new_sp](FlatPtr value) {
        new_sp -= sizeof(FlatPtr);
        Userspace<FlatPtr*> stack_ptr = new_sp;
        auto result = copy_to_user(stack_ptr, &value);
        VERIFY(!result.is_error());
    };

    // Push a whole auxv_t record.
    auto push_aux_value_on_new_stack = [&new_sp](auxv_t value) {
        new_sp -= sizeof(auxv_t);
        Userspace<auxv_t*> stack_ptr = new_sp;
        auto result = copy_to_user(stack_ptr, &value);
        VERIFY(!result.is_error());
    };

    // Push a string with its null terminator, padded up to word size.
    auto push_string_on_new_stack = [&new_sp](StringView string) {
        new_sp -= round_up_to_power_of_two(string.length() + 1, sizeof(FlatPtr));
        Userspace<FlatPtr*> stack_ptr = new_sp;
        auto result = copy_to_user(stack_ptr, string.characters_without_null_termination(), string.length() + 1);
        VERIFY(!result.is_error());
    };

    // Push the argument strings and remember where each one landed.
    Vector<FlatPtr> argv_entries;
    for (auto const& argument : arguments) {
        push_string_on_new_stack(argument->view());
        TRY(argv_entries.try_append(new_sp));
    }

    // Same for the environment strings.
    Vector<FlatPtr> env_entries;
    for (auto const& variable : environment) {
        push_string_on_new_stack(variable->view());
        TRY(env_entries.try_append(new_sp));
    }

    // Push auxv payloads (strings, and 16 bytes of entropy for AT_RANDOM) and
    // patch each auxv entry to point at its payload.
    for (auto& value : auxiliary_values) {
        if (!value.optional_string.is_empty()) {
            push_string_on_new_stack(value.optional_string);
            value.auxv.a_un.a_ptr = (void*)new_sp;
        }
        if (value.auxv.a_type == ELF::AuxiliaryValue::Random) {
            u8 random_bytes[16] {};
            get_fast_random_bytes({ random_bytes, sizeof(random_bytes) });
            push_string_on_new_stack({ random_bytes, sizeof(random_bytes) });
            value.auxv.a_un.a_ptr = (void*)new_sp;
        }
    }

    // Push auxv records in reverse so they appear in order in memory.
    for (ssize_t i = auxiliary_values.size() - 1; i >= 0; --i) {
        auto& value = auxiliary_values[i];
        push_aux_value_on_new_stack(value.auxv);
    }

    // envp: null-terminated array of pointers to the environment strings.
    push_on_new_stack(0);
    for (ssize_t i = env_entries.size() - 1; i >= 0; --i)
        push_on_new_stack(env_entries[i]);
    FlatPtr envp = new_sp;

    // argv: null-terminated array of pointers to the argument strings.
    push_on_new_stack(0);
    for (ssize_t i = argv_entries.size() - 1; i >= 0; --i)
        push_on_new_stack(argv_entries[i]);
    FlatPtr argv = new_sp;

    // NOTE: The stack needs to be 16-byte aligned.
    new_sp -= new_sp % 16;

    // Pass argc/argv/envp in the architecture's argument registers.
#if ARCH(X86_64)
    regs.rdi = argv_entries.size();
    regs.rsi = argv;
    regs.rdx = envp;
#elif ARCH(AARCH64)
    regs.x[0] = argv_entries.size();
    regs.x[1] = argv;
    regs.x[2] = envp;
#else
#    error Unknown architecture
#endif

    VERIFY(new_sp % 16 == 0);

    // FIXME: The way we're setting up the stack and passing arguments to the entry point isn't ABI-compliant
    return new_sp;
}
169
// Virtual address range [start, end) that an ELF object's PT_LOAD segments require.
struct RequiredLoadRange {
    FlatPtr start { 0 };
    FlatPtr end { 0 };
};
174
175static ErrorOr<RequiredLoadRange> get_required_load_range(OpenFileDescription& program_description)
176{
177 auto& inode = *(program_description.inode());
178 auto vmobject = TRY(Memory::SharedInodeVMObject::try_create_with_inode(inode));
179
180 size_t executable_size = inode.size();
181 size_t rounded_executable_size = TRY(Memory::page_round_up(executable_size));
182 auto region = TRY(MM.allocate_kernel_region_with_vmobject(*vmobject, rounded_executable_size, "ELF memory range calculation"sv, Memory::Region::Access::Read));
183 auto elf_image = ELF::Image(region->vaddr().as_ptr(), executable_size);
184 if (!elf_image.is_valid()) {
185 return EINVAL;
186 }
187
188 RequiredLoadRange range {};
189 elf_image.for_each_program_header([&range](auto const& pheader) {
190 if (pheader.type() != PT_LOAD)
191 return;
192
193 auto region_start = (FlatPtr)pheader.vaddr().as_ptr();
194 auto region_end = region_start + pheader.size_in_memory();
195 if (range.start == 0 || region_start < range.start)
196 range.start = region_start;
197 if (range.end == 0 || region_end > range.end)
198 range.end = region_end;
199 });
200
201 VERIFY(range.end > range.start);
202 return range;
203};
204
// Picks the (randomized) base offset at which an ELF object will be loaded.
// ET_DYN (PIE) executables can go anywhere in the load range. For ET_EXEC
// executables the offset applies to the *interpreter*, so it is chosen to
// avoid the fixed address range the main program requires.
static ErrorOr<FlatPtr> get_load_offset(const ElfW(Ehdr) & main_program_header, OpenFileDescription& main_program_description, OpenFileDescription* interpreter_description)
{
    constexpr FlatPtr load_range_start = 0x08000000;
    constexpr FlatPtr load_range_size = 65536 * PAGE_SIZE; // 2**16 * PAGE_SIZE = 256MB
    constexpr FlatPtr minimum_load_offset_randomization_size = 10 * MiB;

    // Page-aligned random address within [start, start + size).
    auto random_load_offset_in_range([](auto start, auto size) {
        return Memory::page_round_down(start + get_good_random<FlatPtr>() % size);
    });

    // Position-independent executables can be placed anywhere.
    if (main_program_header.e_type == ET_DYN) {
        return random_load_offset_in_range(load_range_start, load_range_size);
    }

    if (main_program_header.e_type != ET_EXEC)
        return EINVAL;

    auto main_program_load_range = TRY(get_required_load_range(main_program_description));

    RequiredLoadRange selected_range {};

    if (interpreter_description) {
        auto interpreter_load_range = TRY(get_required_load_range(*interpreter_description));

        auto interpreter_size_in_memory = interpreter_load_range.end - interpreter_load_range.start;
        auto interpreter_load_range_end = load_range_start + load_range_size - interpreter_size_in_memory;

        // No intersection
        if (main_program_load_range.end < load_range_start || main_program_load_range.start > interpreter_load_range_end)
            return random_load_offset_in_range(load_range_start, load_range_size);

        // The main program splits the usable range in two; the interpreter goes
        // in whichever part is larger.
        RequiredLoadRange first_available_part = { load_range_start, main_program_load_range.start };
        RequiredLoadRange second_available_part = { main_program_load_range.end, interpreter_load_range_end };

        // Select larger part
        if (first_available_part.end - first_available_part.start > second_available_part.end - second_available_part.start)
            selected_range = first_available_part;
        else
            selected_range = second_available_part;
    } else
        selected_range = main_program_load_range;

    // If main program is too big and leaves us without enough space for adequate loader randomization
    if (selected_range.end - selected_range.start < minimum_load_offset_randomization_size)
        return E2BIG;

    return random_load_offset_in_range(selected_range.start, selected_range.end - selected_range.start);
}
253
// Whether load_elf_object() should allocate a master TLS region from PT_TLS
// (Yes for statically linked main programs; the dynamic loader handles TLS otherwise).
enum class ShouldAllocateTls {
    No,
    Yes,
};
258
// Whether regions mapped by load_elf_object() are marked as syscall regions
// (Yes only for the dynamic loader).
enum class ShouldAllowSyscalls {
    No,
    Yes,
};
263
// Maps an ELF object (main program or dynamic loader) from object_description
// into new_space at load_offset:
// - non-writable PT_LOAD segments are mapped directly from the file's shared
//   VM object; writable ones get private, copied regions;
// - PT_TLS (only when should_allocate_tls is Yes) produces the master TLS image;
// - PT_GNU_STACK overrides the default main-thread stack size.
// Regions are flagged as syscall regions when should_allow_syscalls is Yes.
// NOTE: switches the CPU into new_space; on success its ownership moves into
// the returned LoadResult.
static ErrorOr<LoadResult> load_elf_object(NonnullOwnPtr<Memory::AddressSpace> new_space, OpenFileDescription& object_description,
    FlatPtr load_offset, ShouldAllocateTls should_allocate_tls, ShouldAllowSyscalls should_allow_syscalls)
{
    auto& inode = *(object_description.inode());
    auto vmobject = TRY(Memory::SharedInodeVMObject::try_create_with_inode(inode));

    // Refuse to execute a binary that is currently mapped writable anywhere (ETXTBSY).
    if (vmobject->writable_mappings()) {
        dbgln("Refusing to execute a write-mapped program");
        return ETXTBSY;
    }

    size_t executable_size = inode.size();
    size_t rounded_executable_size = TRY(Memory::page_round_up(executable_size));

    // Map the whole executable into kernel memory so we can parse headers and copy segment data.
    auto executable_region = TRY(MM.allocate_kernel_region_with_vmobject(*vmobject, rounded_executable_size, "ELF loading"sv, Memory::Region::Access::Read));
    auto elf_image = ELF::Image(executable_region->vaddr().as_ptr(), executable_size);

    if (!elf_image.is_valid())
        return ENOEXEC;

    Memory::Region* master_tls_region { nullptr };
    size_t master_tls_size = 0;
    size_t master_tls_alignment = 0;
    FlatPtr load_base_address = 0;
    size_t stack_size = 0;

    auto elf_name = TRY(object_description.pseudo_path());
    VERIFY(!Processor::in_critical());

    // Enter the new address space so the copy_to_user() calls below write into it.
    Memory::MemoryManager::enter_address_space(*new_space);

    // PT_TLS: allocate and fill the master TLS image.
    auto load_tls_section = [&](auto& program_header) -> ErrorOr<void> {
        VERIFY(should_allocate_tls == ShouldAllocateTls::Yes);
        VERIFY(program_header.size_in_memory());

        if (!elf_image.is_within_image(program_header.raw_data(), program_header.size_in_image())) {
            dbgln("Shenanigans! ELF PT_TLS header sneaks outside of executable.");
            return ENOEXEC;
        }

        auto region_name = TRY(KString::formatted("{} (master-tls)", elf_name));
        master_tls_region = TRY(new_space->allocate_region(Memory::RandomizeVirtualAddress::Yes, {}, program_header.size_in_memory(), PAGE_SIZE, region_name->view(), PROT_READ | PROT_WRITE, AllocationStrategy::Reserve));
        master_tls_size = program_header.size_in_memory();
        master_tls_alignment = program_header.alignment();

        TRY(copy_to_user(master_tls_region->vaddr().as_ptr(), program_header.raw_data(), program_header.size_in_image()));
        return {};
    };

    auto load_writable_section = [&](auto& program_header) -> ErrorOr<void> {
        // Writable section: create a copy in memory.
        VERIFY(program_header.alignment() % PAGE_SIZE == 0);

        if (!elf_image.is_within_image(program_header.raw_data(), program_header.size_in_image())) {
            dbgln("Shenanigans! Writable ELF PT_LOAD header sneaks outside of executable.");
            return ENOEXEC;
        }

        int prot = 0;
        if (program_header.is_readable())
            prot |= PROT_READ;
        if (program_header.is_writable())
            prot |= PROT_WRITE;
        auto region_name = TRY(KString::formatted("{} (data-{}{})", elf_name, program_header.is_readable() ? "r" : "", program_header.is_writable() ? "w" : ""));

        auto range_base = VirtualAddress { Memory::page_round_down(program_header.vaddr().offset(load_offset).get()) };
        size_t rounded_range_end = TRY(Memory::page_round_up(program_header.vaddr().offset(load_offset).offset(program_header.size_in_memory()).get()));
        auto range_end = VirtualAddress { rounded_range_end };

        auto region = TRY(new_space->allocate_region(Memory::RandomizeVirtualAddress::Yes, range_base, range_end.get() - range_base.get(), PAGE_SIZE, region_name->view(), prot, AllocationStrategy::Reserve));

        // It's not always the case with PIE executables (and very well shouldn't be) that the
        // virtual address in the program header matches the one we end up giving the process.
        // In order to copy the data image correctly into memory, we need to copy the data starting at
        // the right initial page offset into the pages allocated for the elf_alloc-XX section.
        // FIXME: There's an opportunity to munmap, or at least mprotect, the padding space between
        // the .text and .data PT_LOAD sections of the executable.
        // Accessing it would definitely be a bug.
        auto page_offset = program_header.vaddr();
        page_offset.mask(~PAGE_MASK);
        TRY(copy_to_user((u8*)region->vaddr().as_ptr() + page_offset.get(), program_header.raw_data(), program_header.size_in_image()));
        return {};
    };

    auto load_section = [&](auto& program_header) -> ErrorOr<void> {
        if (program_header.size_in_memory() == 0)
            return {};

        if (program_header.is_writable())
            return load_writable_section(program_header);

        // Non-writable section: map the executable itself in memory.
        VERIFY(program_header.alignment() % PAGE_SIZE == 0);
        int prot = 0;
        if (program_header.is_readable())
            prot |= PROT_READ;
        if (program_header.is_writable())
            prot |= PROT_WRITE;
        if (program_header.is_executable())
            prot |= PROT_EXEC;

        auto range_base = VirtualAddress { Memory::page_round_down(program_header.vaddr().offset(load_offset).get()) };
        size_t rounded_range_end = TRY(Memory::page_round_up(program_header.vaddr().offset(load_offset).offset(program_header.size_in_memory()).get()));
        auto range_end = VirtualAddress { rounded_range_end };
        auto region = TRY(new_space->allocate_region_with_vmobject(Memory::RandomizeVirtualAddress::Yes, range_base, range_end.get() - range_base.get(), program_header.alignment(), *vmobject, program_header.offset(), elf_name->view(), prot, true));

        if (should_allow_syscalls == ShouldAllowSyscalls::Yes)
            region->set_syscall_region(true);
        // The segment at file offset 0 defines where the object's image starts.
        if (program_header.offset() == 0)
            load_base_address = (FlatPtr)region->vaddr().as_ptr();
        return {};
    };

    // Dispatch on program header type; headers we don't handle are ignored.
    auto load_elf_program_header = [&](auto& program_header) -> ErrorOr<void> {
        if (program_header.type() == PT_TLS)
            return load_tls_section(program_header);

        if (program_header.type() == PT_LOAD)
            return load_section(program_header);

        if (program_header.type() == PT_GNU_STACK) {
            stack_size = program_header.size_in_memory();
        }

        // NOTE: We ignore other program header types.
        return {};
    };

    // Process every program header, stopping at the first error.
    TRY([&] {
        ErrorOr<void> result;
        elf_image.for_each_program_header([&](ELF::Image::ProgramHeader const& program_header) {
            result = load_elf_program_header(program_header);
            return result.is_error() ? IterationDecision::Break : IterationDecision::Continue;
        });
        return result;
    }());

    // No PT_GNU_STACK (or a zero one): fall back to the default stack size.
    if (stack_size == 0) {
        stack_size = Thread::default_userspace_stack_size;
    }

    if (!elf_image.entry().offset(load_offset).get()) {
        dbgln("do_exec: Failure loading program, entry pointer is invalid! {})", elf_image.entry().offset(load_offset));
        return ENOEXEC;
    }

    auto* stack_region = TRY(new_space->allocate_region(Memory::RandomizeVirtualAddress::Yes, {}, stack_size, PAGE_SIZE, "Stack (Main thread)"sv, PROT_READ | PROT_WRITE, AllocationStrategy::Reserve));
    stack_region->set_stack(true);

    return LoadResult {
        move(new_space),
        load_base_address,
        elf_image.entry().offset(load_offset).get(),
        executable_size,
        TRY(AK::try_make_weak_ptr_if_nonnull(master_tls_region)),
        master_tls_size,
        master_tls_alignment,
        TRY(stack_region->try_make_weak_ptr())
    };
}
424
// Loads the program — or, for dynamically linked executables, its interpreter —
// into a brand-new address space and returns the LoadResult describing it.
ErrorOr<LoadResult>
Process::load(NonnullRefPtr<OpenFileDescription> main_program_description,
    RefPtr<OpenFileDescription> interpreter_description, const ElfW(Ehdr) & main_program_header)
{
    auto new_space = TRY(Memory::AddressSpace::try_create(nullptr));

    // load_elf_object() switches into the new address space; make sure we are
    // back in this process's own address space when we leave, on every path.
    ScopeGuard space_guard([&]() {
        Memory::MemoryManager::enter_process_address_space(*this);
    });

    auto load_offset = TRY(get_load_offset(main_program_header, main_program_description, interpreter_description));

    if (interpreter_description.is_null()) {
        // Statically linked: load the main program directly; the kernel sets up TLS.
        auto load_result = TRY(load_elf_object(move(new_space), main_program_description, load_offset, ShouldAllocateTls::Yes, ShouldAllowSyscalls::No));
        m_master_tls_region = load_result.tls_region;
        m_master_tls_size = load_result.tls_size;
        m_master_tls_alignment = load_result.tls_alignment;
        return load_result;
    }

    // Dynamically linked: only the interpreter (dynamic loader) is mapped here.
    // It maps the main program itself, so its regions may issue syscalls.
    auto interpreter_load_result = TRY(load_elf_object(move(new_space), *interpreter_description, load_offset, ShouldAllocateTls::No, ShouldAllowSyscalls::Yes));

    // TLS allocation will be done in userspace by the loader
    VERIFY(!interpreter_load_result.tls_region);
    VERIFY(!interpreter_load_result.tls_alignment);
    VERIFY(!interpreter_load_result.tls_size);

    return interpreter_load_result;
}
454
455void Process::clear_signal_handlers_for_exec()
456{
457 // Comments are as they are presented in the POSIX specification, but slightly out of order.
458 for (size_t signal = 0; signal < m_signal_action_data.size(); signal++) {
459 // Except for SIGCHLD, signals set to be ignored by the calling process image shall be set to be ignored by the new process image.
460 // If the SIGCHLD signal is set to be ignored by the calling process image, it is unspecified whether the SIGCHLD signal is set
461 // to be ignored or to the default action in the new process image.
462 if (signal != SIGCHLD && m_signal_action_data[signal].handler_or_sigaction.get() == reinterpret_cast<FlatPtr>(SIG_IGN)) {
463 m_signal_action_data[signal] = {};
464 m_signal_action_data[signal].handler_or_sigaction.set(reinterpret_cast<FlatPtr>(SIG_IGN));
465 continue;
466 }
467
468 // Signals set to the default action in the calling process image shall be set to the default action in the new process image.
469 // Signals set to be caught by the calling process image shall be set to the default action in the new process image.
470 m_signal_action_data[signal] = {};
471 }
472}
473
474ErrorOr<void> Process::do_exec(NonnullRefPtr<OpenFileDescription> main_program_description, Vector<NonnullOwnPtr<KString>> arguments, Vector<NonnullOwnPtr<KString>> environment,
475 RefPtr<OpenFileDescription> interpreter_description, Thread*& new_main_thread, InterruptsState& previous_interrupts_state, const ElfW(Ehdr) & main_program_header)
476{
477 VERIFY(is_user_process());
478 VERIFY(!Processor::in_critical());
479 auto main_program_metadata = main_program_description->metadata();
480 // NOTE: Don't allow running SUID binaries at all if we are in a jail.
481 TRY(Process::current().jail().with([&](auto const& my_jail) -> ErrorOr<void> {
482 if (my_jail && (main_program_metadata.is_setuid() || main_program_metadata.is_setgid())) {
483 return Error::from_errno(EPERM);
484 }
485 return {};
486 }));
487
488 // Although we *could* handle a pseudo_path here, trying to execute something that doesn't have
489 // a custody (e.g. BlockDevice or RandomDevice) is pretty suspicious anyway.
490 auto path = TRY(main_program_description->original_absolute_path());
491
492 dbgln_if(EXEC_DEBUG, "do_exec: {}", path);
493
494 auto last_part = path->view().find_last_split_view('/');
495
496 auto new_process_name = TRY(KString::try_create(last_part));
497 auto new_main_thread_name = TRY(new_process_name->try_clone());
498
499 auto load_result = TRY(load(main_program_description, interpreter_description, main_program_header));
500
501 // NOTE: We don't need the interpreter executable description after this point.
502 // We destroy it here to prevent it from getting destroyed when we return from this function.
503 // That's important because when we're returning from this function, we're in a very delicate
504 // state where we can't block (e.g by trying to acquire a mutex in description teardown.)
505 bool has_interpreter = interpreter_description;
506 interpreter_description = nullptr;
507
508 auto* signal_trampoline_region = TRY(load_result.space->allocate_region_with_vmobject(Memory::RandomizeVirtualAddress::Yes, {}, PAGE_SIZE, PAGE_SIZE, g_signal_trampoline_region->vmobject(), 0, "Signal trampoline"sv, PROT_READ | PROT_EXEC, true));
509 signal_trampoline_region->set_syscall_region(true);
510
511 // (For dynamically linked executable) Allocate an FD for passing the main executable to the dynamic loader.
512 Optional<ScopedDescriptionAllocation> main_program_fd_allocation;
513 if (has_interpreter)
514 main_program_fd_allocation = TRY(allocate_fd());
515
516 auto old_credentials = this->credentials();
517 auto new_credentials = old_credentials;
518 auto old_process_attached_jail = m_attached_jail.with([&](auto& jail) -> RefPtr<Jail> { return jail; });
519 auto old_scoped_list = m_jail_process_list.with([&](auto& list) -> RefPtr<ProcessList> { return list; });
520
521 bool executable_is_setid = false;
522
523 if (!(main_program_description->custody()->mount_flags() & MS_NOSUID)) {
524 auto new_euid = old_credentials->euid();
525 auto new_egid = old_credentials->egid();
526 auto new_suid = old_credentials->suid();
527 auto new_sgid = old_credentials->sgid();
528
529 if (main_program_metadata.is_setuid()) {
530 executable_is_setid = true;
531 new_euid = main_program_metadata.uid;
532 new_suid = main_program_metadata.uid;
533 }
534 if (main_program_metadata.is_setgid()) {
535 executable_is_setid = true;
536 new_egid = main_program_metadata.gid;
537 new_sgid = main_program_metadata.gid;
538 }
539
540 if (executable_is_setid) {
541 new_credentials = TRY(Credentials::create(
542 old_credentials->uid(),
543 old_credentials->gid(),
544 new_euid,
545 new_egid,
546 new_suid,
547 new_sgid,
548 old_credentials->extra_gids(),
549 old_credentials->sid(),
550 old_credentials->pgid()));
551 }
552 }
553
554 // We commit to the new executable at this point. There is no turning back!
555
556 // Prevent other processes from attaching to us with ptrace while we're doing this.
557 MutexLocker ptrace_locker(ptrace_lock());
558
559 // Disable profiling temporarily in case it's running on this process.
560 auto was_profiling = m_profiling;
561 TemporaryChange profiling_disabler(m_profiling, false);
562
563 kill_threads_except_self();
564
565 with_mutable_protected_data([&](auto& protected_data) {
566 protected_data.credentials = move(new_credentials);
567 protected_data.dumpable = !executable_is_setid;
568 protected_data.executable_is_setid = executable_is_setid;
569 });
570
571 // We make sure to enter the new address space before destroying the old one.
572 // This ensures that the process always has a valid page directory.
573 Memory::MemoryManager::enter_address_space(*load_result.space);
574
575 m_space.with([&](auto& space) { space = load_result.space.release_nonnull(); });
576
577 m_executable.with([&](auto& executable) { executable = main_program_description->custody(); });
578 m_arguments = move(arguments);
579 m_attached_jail.with([&](auto& jail) {
580 jail = old_process_attached_jail;
581 });
582
583 m_jail_process_list.with([&](auto& list) {
584 list = old_scoped_list;
585 });
586
587 m_environment = move(environment);
588
589 TRY(m_unveil_data.with([&](auto& unveil_data) -> ErrorOr<void> {
590 TRY(m_exec_unveil_data.with([&](auto& exec_unveil_data) -> ErrorOr<void> {
591 // Note: If we have exec unveil data being waiting to be dispatched
592 // to the current execve'd program, then we apply the unveil data and
593 // ensure it is locked in the new program.
594 if (exec_unveil_data.state == VeilState::Dropped) {
595 unveil_data.state = VeilState::LockedInherited;
596 exec_unveil_data.state = VeilState::None;
597 unveil_data.paths = TRY(exec_unveil_data.paths.deep_copy());
598 } else {
599 unveil_data.state = VeilState::None;
600 exec_unveil_data.state = VeilState::None;
601 unveil_data.paths.clear();
602 unveil_data.paths.set_metadata({ TRY(KString::try_create("/"sv)), UnveilAccess::None, false });
603 }
604 exec_unveil_data.paths.clear();
605 exec_unveil_data.paths.set_metadata({ TRY(KString::try_create("/"sv)), UnveilAccess::None, false });
606 return {};
607 }));
608 return {};
609 }));
610
611 m_coredump_properties.for_each([](auto& property) {
612 property = {};
613 });
614
615 auto* current_thread = Thread::current();
616 current_thread->reset_signals_for_exec();
617
618 clear_signal_handlers_for_exec();
619
620 clear_futex_queues_on_exec();
621
622 m_fds.with_exclusive([&](auto& fds) {
623 fds.change_each([&](auto& file_description_metadata) {
624 if (file_description_metadata.is_valid() && file_description_metadata.flags() & FD_CLOEXEC)
625 file_description_metadata = {};
626 });
627 });
628
629 if (main_program_fd_allocation.has_value()) {
630 main_program_description->set_readable(true);
631 m_fds.with_exclusive([&](auto& fds) { fds[main_program_fd_allocation->fd].set(move(main_program_description), FD_CLOEXEC); });
632 }
633
634 new_main_thread = nullptr;
635 if (¤t_thread->process() == this) {
636 new_main_thread = current_thread;
637 } else {
638 for_each_thread([&](auto& thread) {
639 new_main_thread = &thread;
640 return IterationDecision::Break;
641 });
642 }
643 VERIFY(new_main_thread);
644
645 auto credentials = this->credentials();
646 auto auxv = generate_auxiliary_vector(load_result.load_base, load_result.entry_eip, credentials->uid(), credentials->euid(), credentials->gid(), credentials->egid(), path->view(), main_program_fd_allocation);
647
648 // FIXME: How much stack space does process startup need?
649 if (!validate_stack_size(m_arguments, m_environment, auxv))
650 return E2BIG;
651
652 // NOTE: We create the new stack before disabling interrupts since it will zero-fault
653 // and we don't want to deal with faults after this point.
654 auto new_userspace_sp = TRY(make_userspace_context_for_main_thread(new_main_thread->regs(), *load_result.stack_region.unsafe_ptr(), m_arguments, m_environment, move(auxv)));
655
656 set_name(move(new_process_name));
657 new_main_thread->set_name(move(new_main_thread_name));
658
659 if (wait_for_tracer_at_next_execve()) {
660 // Make sure we release the ptrace lock here or the tracer will block forever.
661 ptrace_locker.unlock();
662 Thread::current()->send_urgent_signal_to_self(SIGSTOP);
663 } else {
664 // Unlock regardless before disabling interrupts.
665 // Ensure we always unlock after checking ptrace status to avoid TOCTOU ptrace issues
666 ptrace_locker.unlock();
667 }
668
669 // We enter a critical section here because we don't want to get interrupted between do_exec()
670 // and Processor::assume_context() or the next context switch.
671 // If we used an InterruptDisabler that calls enable_interrupts() on exit, we might timer tick'd too soon in exec().
672 Processor::enter_critical();
673 previous_interrupts_state = processor_interrupts_state();
674 Processor::disable_interrupts();
675
676 // NOTE: Be careful to not trigger any page faults below!
677
678 with_mutable_protected_data([&](auto& protected_data) {
679 protected_data.promises = protected_data.execpromises.load();
680 protected_data.has_promises = protected_data.has_execpromises.load();
681
682 protected_data.execpromises = 0;
683 protected_data.has_execpromises = false;
684
685 protected_data.signal_trampoline = signal_trampoline_region->vaddr();
686
687 // FIXME: PID/TID ISSUE
688 protected_data.pid = new_main_thread->tid().value();
689 });
690
691 auto tsr_result = new_main_thread->make_thread_specific_region({});
692 if (tsr_result.is_error()) {
693 // FIXME: We cannot fail this late. Refactor this so the allocation happens before we commit to the new executable.
694 VERIFY_NOT_REACHED();
695 }
696 new_main_thread->reset_fpu_state();
697
698 auto& regs = new_main_thread->m_regs;
699 address_space().with([&](auto& space) {
700 regs.set_exec_state(load_result.entry_eip, new_userspace_sp, *space);
701 });
702
703 {
704 TemporaryChange profiling_disabler(m_profiling, was_profiling);
705 PerformanceManager::add_process_exec_event(*this);
706 }
707
708 u32 lock_count_to_restore;
709 [[maybe_unused]] auto rc = big_lock().force_unlock_exclusive_if_locked(lock_count_to_restore);
710 VERIFY_INTERRUPTS_DISABLED();
711 VERIFY(Processor::in_critical());
712 return {};
713}
714
// Builds the auxiliary vector (auxv) handed to the new program on its initial
// stack. The entry count must stay in sync with auxiliary_vector_size; the
// string/random payloads are materialized on the stack later by
// make_userspace_context_for_main_thread().
static Array<ELF::AuxiliaryValue, auxiliary_vector_size> generate_auxiliary_vector(FlatPtr load_base, FlatPtr entry_eip, UserID uid, UserID euid, GroupID gid, GroupID egid, StringView executable_path, Optional<Process::ScopedDescriptionAllocation> const& main_program_fd_allocation)
{
    return { {
        // PHDR/EXECFD
        // PH*
        { ELF::AuxiliaryValue::PageSize, PAGE_SIZE },
        { ELF::AuxiliaryValue::BaseAddress, (void*)load_base },

        { ELF::AuxiliaryValue::Entry, (void*)entry_eip },
        // NOTELF
        { ELF::AuxiliaryValue::Uid, (long)uid.value() },
        { ELF::AuxiliaryValue::EUid, (long)euid.value() },
        { ELF::AuxiliaryValue::Gid, (long)gid.value() },
        { ELF::AuxiliaryValue::EGid, (long)egid.value() },

        { ELF::AuxiliaryValue::Platform, Processor::platform_string() },
        // FIXME: This is platform specific
#if ARCH(X86_64)
        { ELF::AuxiliaryValue::HwCap, (long)CPUID(1).edx() },
#elif ARCH(AARCH64)
        { ELF::AuxiliaryValue::HwCap, (long)0 },
#else
#    error "Unknown architecture"
#endif

        { ELF::AuxiliaryValue::ClockTick, (long)TimeManagement::the().ticks_per_second() },

        // FIXME: Also take into account things like extended filesystem permissions? That's what linux does...
        { ELF::AuxiliaryValue::Secure, ((uid != euid) || (gid != egid)) ? 1 : 0 },

        // AT_RANDOM: the pointer is filled in when the 16-byte payload is pushed on the stack.
        { ELF::AuxiliaryValue::Random, nullptr },

        { ELF::AuxiliaryValue::ExecFilename, executable_path },

        // Hand the main program's fd to the dynamic loader, when there is one.
        main_program_fd_allocation.has_value() ? ELF::AuxiliaryValue { ELF::AuxiliaryValue::ExecFileDescriptor, main_program_fd_allocation->fd } : ELF::AuxiliaryValue { ELF::AuxiliaryValue::Ignore, 0L },

        { ELF::AuxiliaryValue::Null, 0L },
    } };
}
754
755static ErrorOr<Vector<NonnullOwnPtr<KString>>> find_shebang_interpreter_for_executable(char const first_page[], size_t nread)
756{
757 int word_start = 2;
758 size_t word_length = 0;
759 if (nread > 2 && first_page[0] == '#' && first_page[1] == '!') {
760 Vector<NonnullOwnPtr<KString>> interpreter_words;
761
762 for (size_t i = 2; i < nread; ++i) {
763 if (first_page[i] == '\n') {
764 break;
765 }
766
767 if (first_page[i] != ' ') {
768 ++word_length;
769 }
770
771 if (first_page[i] == ' ') {
772 if (word_length > 0) {
773 auto word = TRY(KString::try_create(StringView { &first_page[word_start], word_length }));
774 TRY(interpreter_words.try_append(move(word)));
775 }
776 word_length = 0;
777 word_start = i + 1;
778 }
779 }
780
781 if (word_length > 0) {
782 auto word = TRY(KString::try_create(StringView { &first_page[word_start], word_length }));
783 TRY(interpreter_words.try_append(move(word)));
784 }
785
786 if (!interpreter_words.is_empty())
787 return interpreter_words;
788 }
789
790 return ENOEXEC;
791}
792
// Determines which program interpreter (if any) should be used to load the main
// executable whose ELF header is given. Returns an open description of the
// interpreter named by the image's PT_INTERP entry, or nullptr when the image is
// to be loaded directly. Fails with ENOEXEC for malformed images and with ELOOP
// if the interpreter itself requests an interpreter.
ErrorOr<RefPtr<OpenFileDescription>> Process::find_elf_interpreter_for_executable(StringView path, ElfW(Ehdr) const& main_executable_header, size_t main_executable_header_size, size_t file_size)
{
    // Not using ErrorOr here because we'll want to do the same thing in userspace in the RTLD
    StringBuilder interpreter_path_builder;
    if (!TRY(ELF::validate_program_headers(main_executable_header, file_size, { &main_executable_header, main_executable_header_size }, &interpreter_path_builder))) {
        dbgln("exec({}): File has invalid ELF Program headers", path);
        return ENOEXEC;
    }
    // validate_program_headers() fills in the PT_INTERP path (if present).
    auto interpreter_path = interpreter_path_builder.string_view();

    if (!interpreter_path.is_empty()) {
        dbgln_if(EXEC_DEBUG, "exec({}): Using program interpreter {}", path, interpreter_path);
        // Open the interpreter with the current process's credentials, relative
        // to the current working directory.
        auto interpreter_description = TRY(VirtualFileSystem::the().open(credentials(), interpreter_path, O_EXEC, 0, current_directory()));
        auto interp_metadata = interpreter_description->metadata();

        VERIFY(interpreter_description->inode());

        // Validate the program interpreter as a valid elf binary.
        // If your program interpreter is a #! file or something, it's time to stop playing games :)
        if (interp_metadata.size < (int)sizeof(ElfW(Ehdr)))
            return ENOEXEC;

        // Read up to one page of the interpreter so its ELF header and program
        // headers can be validated before committing to it.
        char first_page[PAGE_SIZE] = {};
        auto first_page_buffer = UserOrKernelBuffer::for_kernel_buffer((u8*)&first_page);
        auto nread = TRY(interpreter_description->read(first_page_buffer, sizeof(first_page)));

        if (nread < sizeof(ElfW(Ehdr)))
            return ENOEXEC;

        auto* elf_header = (ElfW(Ehdr)*)first_page;
        if (!ELF::validate_elf_header(*elf_header, interp_metadata.size)) {
            dbgln("exec({}): Interpreter ({}) has invalid ELF header", path, interpreter_path);
            return ENOEXEC;
        }

        // Not using ErrorOr here because we'll want to do the same thing in userspace in the RTLD
        StringBuilder interpreter_interpreter_path_builder;
        if (!TRY(ELF::validate_program_headers(*elf_header, interp_metadata.size, { first_page, nread }, &interpreter_interpreter_path_builder))) {
            dbgln("exec({}): Interpreter ({}) has invalid ELF Program headers", path, interpreter_path);
            return ENOEXEC;
        }
        auto interpreter_interpreter_path = interpreter_interpreter_path_builder.string_view();

        // Refuse chained interpreters: the interpreter must be self-relocating.
        if (!interpreter_interpreter_path.is_empty()) {
            dbgln("exec({}): Interpreter ({}) has its own interpreter ({})! No thank you!", path, interpreter_path, interpreter_interpreter_path);
            return ELOOP;
        }

        return interpreter_description;
    }

    // No PT_INTERP: decide based on the image's e_type.
    if (main_executable_header.e_type == ET_REL) {
        // We can't exec an ET_REL, that's just an object file from the compiler
        return ENOEXEC;
    }
    if (main_executable_header.e_type == ET_DYN) {
        // If it's ET_DYN with no PT_INTERP, then it's a dynamic executable responsible
        // for its own relocation (i.e. it's /usr/lib/Loader.so)
        if (path != "/usr/lib/Loader.so")
            dbgln("exec({}): WARNING - Dynamic ELF executable without a PT_INTERP header, and isn't /usr/lib/Loader.so", path);
        return nullptr;
    }

    // No interpreter, but, path refers to a valid elf image
    return nullptr;
}
859
// Replaces the current process image with the executable at `path`.
// Supports #! (shebang) scripts by prepending the interpreter words to the
// argument list and recursing, and ELF images by delegating to do_exec()
// after program-interpreter discovery. `recursion_depth` bounds shebang
// chains (an interpreter that is itself a script, etc.) at 3 levels.
// On success this does not return to the caller in the usual sense; the
// new image takes over (see sys$execve below).
ErrorOr<void> Process::exec(NonnullOwnPtr<KString> path, Vector<NonnullOwnPtr<KString>> arguments, Vector<NonnullOwnPtr<KString>> environment, Thread*& new_main_thread, InterruptsState& previous_interrupts_state, int recursion_depth)
{
    if (recursion_depth > 2) {
        dbgln("exec({}): SHENANIGANS! recursed too far trying to find #! interpreter", path);
        return ELOOP;
    }

    // Open the file to check what kind of binary format it is
    // Currently supported formats:
    //    - #! interpreted file
    //    - ELF32
    //        * ET_EXEC binary that just gets loaded
    //        * ET_DYN binary that requires a program interpreter
    //
    auto description = TRY(VirtualFileSystem::the().open(credentials(), path->view(), O_EXEC, 0, current_directory()));
    auto metadata = description->metadata();

    if (!metadata.is_regular_file())
        return EACCES;

    // Always gonna need at least 3 bytes. these are for #!X
    if (metadata.size < 3)
        return ENOEXEC;

    VERIFY(description->inode());

    // Read the first page of the program into memory so we can validate the binfmt of it
    char first_page[PAGE_SIZE];
    auto first_page_buffer = UserOrKernelBuffer::for_kernel_buffer((u8*)&first_page);
    auto nread = TRY(description->read(first_page_buffer, sizeof(first_page)));

    // 1) #! interpreted file
    auto shebang_result = find_shebang_interpreter_for_executable(first_page, nread);
    if (!shebang_result.is_error()) {
        auto shebang_words = shebang_result.release_value();
        auto shebang_path = TRY(shebang_words.first()->try_clone());
        // Replace argv[0] with the script path, then prepend the interpreter
        // words, so the interpreter sees: <interp> [interp-args] <script> ...
        // NOTE(review): this indexes arguments[0] unconditionally; sys$execve
        // enforces at least one argument — confirm the same holds for any
        // in-kernel callers of exec().
        arguments[0] = move(path);
        TRY(arguments.try_prepend(move(shebang_words)));
        // Recurse with the interpreter as the new executable path, bumping
        // the depth so nested shebang chains terminate with ELOOP.
        return exec(move(shebang_path), move(arguments), move(environment), new_main_thread, previous_interrupts_state, ++recursion_depth);
    }

    // #2) ELF32 for i386

    if (nread < sizeof(ElfW(Ehdr)))
        return ENOEXEC;
    auto const* main_program_header = (ElfW(Ehdr)*)first_page;

    if (!ELF::validate_elf_header(*main_program_header, metadata.size)) {
        dbgln("exec({}): File has invalid ELF header", path);
        return ENOEXEC;
    }

    // `nread` is passed as the header-bytes-available size so program header
    // validation never reads past what was actually read from disk.
    auto interpreter_description = TRY(find_elf_interpreter_for_executable(path->view(), *main_program_header, nread, metadata.size));
    return do_exec(move(description), move(arguments), move(environment), move(interpreter_description), new_main_thread, previous_interrupts_state, *main_program_header);
}
915
// The execve(2) syscall: copies the path/argv/envp arrays in from userspace,
// performs the exec, and — on success — context-switches directly into the new
// image's main thread instead of returning through the normal syscall path.
ErrorOr<FlatPtr> Process::sys$execve(Userspace<Syscall::SC_execve_params const*> user_params)
{
    VERIFY_PROCESS_BIG_LOCK_ACQUIRED(this);
    TRY(require_promise(Pledge::exec));

    Thread* new_main_thread = nullptr;
    InterruptsState previous_interrupts_state = InterruptsState::Enabled;

    // NOTE: Be extremely careful with allocating any kernel memory in this function.
    //       On success, the kernel stack will be lost.
    //       The explicit block scope below is specifically placed to minimize the number
    //       of stack locals in this function.
    {
        auto params = TRY(copy_typed_from_user(user_params));

        if (params.arguments.length > ARG_MAX || params.environment.length > ARG_MAX)
            return E2BIG;

        // NOTE: The caller is expected to always pass at least one argument by convention,
        //       the program path that was passed as params.path.
        if (params.arguments.length == 0)
            return EINVAL;

        auto path = TRY(get_syscall_path_argument(params.path));

        // Copies a userspace {pointer, length} string array into a vector of
        // kernel-owned KStrings, guarding against size overflow.
        auto copy_user_strings = [](auto const& list, auto& output) -> ErrorOr<void> {
            if (!list.length)
                return {};
            Checked<size_t> size = sizeof(*list.strings);
            size *= list.length;
            if (size.has_overflow())
                return EOVERFLOW;
            Vector<Syscall::StringArgument, 32> strings;
            TRY(strings.try_resize(list.length));
            TRY(copy_from_user(strings.data(), list.strings, size.value()));
            for (size_t i = 0; i < list.length; ++i) {
                auto string = TRY(try_copy_kstring_from_user(strings[i]));
                TRY(output.try_append(move(string)));
            }
            return {};
        };

        Vector<NonnullOwnPtr<KString>> arguments;
        TRY(copy_user_strings(params.arguments, arguments));

        Vector<NonnullOwnPtr<KString>> environment;
        TRY(copy_user_strings(params.environment, environment));

        TRY(exec(move(path), move(arguments), move(environment), new_main_thread, previous_interrupts_state));
    }

    // NOTE: If we're here, the exec has succeeded and we've got a new executable image!
    //       We will not return normally from this function. Instead, the next time we
    //       get scheduled, it'll be at the entry point of the new executable.

    VERIFY_INTERRUPTS_DISABLED();
    VERIFY(Processor::in_critical());

    auto* current_thread = Thread::current();
    if (current_thread == new_main_thread) {
        // We need to enter the scheduler lock before changing the state
        // and it will be released after the context switch into that
        // thread. We should also still be in our critical section
        VERIFY(!g_scheduler_lock.is_locked_by_current_processor());
        VERIFY(Processor::in_critical() == 1);
        g_scheduler_lock.lock();
        current_thread->set_state(Thread::State::Running);
#if ARCH(X86_64)
        // Reconstruct RFLAGS.IF (bit 9, 0x200) from the saved interrupts state
        // so the new image starts with the expected interrupt enable state.
        FlatPtr prev_flags = previous_interrupts_state == InterruptsState::Enabled ? 0x200 : 0;
        Processor::assume_context(*current_thread, prev_flags);
        VERIFY_NOT_REACHED();
#elif ARCH(AARCH64)
        TODO_AARCH64();
#else
#    error Unknown architecture
#endif
    }

    // NOTE: This code path is taken in the non-syscall case, i.e when the kernel spawns
    //       a userspace process directly (such as /bin/SystemServer on startup)

    restore_processor_interrupts_state(previous_interrupts_state);
    Processor::leave_critical();
    return 0;
}
1001
1002}