// Serenity Operating System
1/*
2 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 *
8 * 1. Redistributions of source code must retain the above copyright notice, this
9 * list of conditions and the following disclaimer.
10 *
11 * 2. Redistributions in binary form must reproduce the above copyright notice,
12 * this list of conditions and the following disclaimer in the documentation
13 * and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <AK/QuickSort.h>
28#include <AK/TemporaryChange.h>
29#include <Kernel/FileSystem/FileDescription.h>
30#include <Kernel/Net/Socket.h>
31#include <Kernel/Process.h>
32#include <Kernel/Profiling.h>
33#include <Kernel/RTC.h>
34#include <Kernel/Scheduler.h>
35#include <Kernel/Time/TimeManagement.h>
36#include <Kernel/TimerQueue.h>
37
38//#define LOG_EVERY_CONTEXT_SWITCH
39//#define SCHEDULER_DEBUG
40//#define SCHEDULER_RUNNABLE_DEBUG
41
42namespace Kernel {
43
// Global scheduler bookkeeping (per-state thread lists, see
// thread_list_for_state); allocated once in Scheduler::initialize().
SchedulerData* g_scheduler_data;
45
// Register a freshly created thread with the scheduler. New threads start on
// the non-runnable list; a later state change moves them via
// update_state_for_thread().
void Scheduler::init_thread(Thread& thread)
{
    g_scheduler_data->m_nonrunnable_threads.append(thread);
}
50
51void Scheduler::update_state_for_thread(Thread& thread)
52{
53 ASSERT_INTERRUPTS_DISABLED();
54 auto& list = g_scheduler_data->thread_list_for_state(thread.state());
55
56 if (list.contains(thread))
57 return;
58
59 list.append(thread);
60}
61
62static u32 time_slice_for(const Thread& thread)
63{
64 // One time slice unit == 1ms
65 if (&thread == g_colonel)
66 return 1;
67 return 10;
68}
69
// Current time since boot as a timeval.
// NOTE(review): the tv_usec computation assumes one tick == 1ms
// (ticks_this_second() * 1000); confirm against TimeManagement's configured
// tick rate.
timeval Scheduler::time_since_boot()
{
    return { TimeManagement::the().seconds_since_boot(), (suseconds_t)TimeManagement::the().ticks_this_second() * 1000 };
}
74
Thread* g_finalizer;               // presumably the kernel thread that finalizes dying threads -- defined elsewhere; verify
Thread* g_colonel;                 // the idle kernel thread (gets the minimal time slice)
WaitQueue* g_finalizer_wait_queue; // queue the finalizer sleeps on (allocated in initialize())
bool g_finalizer_has_work;         // flag signalling the finalizer has pending work
static Process* s_colonel_process; // process that owns g_colonel (created in initialize())
u64 g_uptime;                      // scheduler ticks since boot; incremented in timer_tick()

// Selector + TSS used to redirect hardware task switches when iret'ing into a
// newly scheduled process; see prepare_for_iret_to_new_process().
struct TaskRedirectionData {
    u16 selector;
    TSS32 tss;
};
static TaskRedirectionData s_redirection;
static bool s_active; // true while pick_next() is running (re-entrancy guard)
88
// True while the scheduler is inside pick_next() (guarded there by a
// TemporaryChange on s_active).
bool Scheduler::is_active()
{
    return s_active;
}
93
// Blocks the current thread until `joinee` exits; the exit value is written
// through `joinee_exit_value`.
Thread::JoinBlocker::JoinBlocker(Thread& joinee, void*& joinee_exit_value)
    : m_joinee(joinee)
    , m_joinee_exit_value(joinee_exit_value)
{
    // Only one thread may join a given thread at a time.
    ASSERT(m_joinee.m_joiner == nullptr);
    // Link joiner and joinee both ways; the joiner's m_joinee is cleared
    // when the joinee exits (see should_unblock).
    m_joinee.m_joiner = Thread::current;
    Thread::current->m_joinee = &joinee;
}
102
103bool Thread::JoinBlocker::should_unblock(Thread& joiner, time_t, long)
104{
105 return !joiner.m_joinee;
106}
107
// Common base for blockers that wait on a FileDescription (accept/connect/
// read/write); simply remembers the description being waited on.
Thread::FileDescriptionBlocker::FileDescriptionBlocker(const FileDescription& description)
    : m_blocked_description(description)
{
}
112
// Accessor for the description this blocker waits on; used by the concrete
// blockers' should_unblock() implementations.
const FileDescription& Thread::FileDescriptionBlocker::blocked_description() const
{
    return m_blocked_description;
}
117
// Blocks a thread in accept() until the listening socket has a pending
// connection.
Thread::AcceptBlocker::AcceptBlocker(const FileDescription& description)
    : FileDescriptionBlocker(description)
{
}
122
123bool Thread::AcceptBlocker::should_unblock(Thread&, time_t, long)
124{
125 auto& socket = *blocked_description().socket();
126 return socket.can_accept();
127}
128
// Blocks a thread in connect() until the socket's setup has completed.
Thread::ConnectBlocker::ConnectBlocker(const FileDescription& description)
    : FileDescriptionBlocker(description)
{
}
133
134bool Thread::ConnectBlocker::should_unblock(Thread&, time_t, long)
135{
136 auto& socket = *blocked_description().socket();
137 return socket.setup_state() == Socket::SetupState::Completed;
138}
139
140Thread::WriteBlocker::WriteBlocker(const FileDescription& description)
141 : FileDescriptionBlocker(description)
142{
143 if (description.is_socket()) {
144 auto& socket = *description.socket();
145 if (socket.has_send_timeout()) {
146 timeval deadline = Scheduler::time_since_boot();
147 deadline.tv_sec += socket.send_timeout().tv_sec;
148 deadline.tv_usec += socket.send_timeout().tv_usec;
149 deadline.tv_sec += (socket.send_timeout().tv_usec / 1000000) * 1;
150 deadline.tv_usec %= 1000000;
151 m_deadline = deadline;
152 }
153 }
154}
155
156bool Thread::WriteBlocker::should_unblock(Thread&, time_t now_sec, long now_usec)
157{
158 if (m_deadline.has_value()) {
159 bool timed_out = now_sec > m_deadline.value().tv_sec || (now_sec == m_deadline.value().tv_sec && now_usec >= m_deadline.value().tv_usec);
160 return timed_out || blocked_description().can_write();
161 }
162 return blocked_description().can_write();
163}
164
165Thread::ReadBlocker::ReadBlocker(const FileDescription& description)
166 : FileDescriptionBlocker(description)
167{
168 if (description.is_socket()) {
169 auto& socket = *description.socket();
170 if (socket.has_receive_timeout()) {
171 timeval deadline = Scheduler::time_since_boot();
172 deadline.tv_sec += socket.receive_timeout().tv_sec;
173 deadline.tv_usec += socket.receive_timeout().tv_usec;
174 deadline.tv_sec += (socket.receive_timeout().tv_usec / 1000000) * 1;
175 deadline.tv_usec %= 1000000;
176 m_deadline = deadline;
177 }
178 }
179}
180
181bool Thread::ReadBlocker::should_unblock(Thread&, time_t now_sec, long now_usec)
182{
183 if (m_deadline.has_value()) {
184 bool timed_out = now_sec > m_deadline.value().tv_sec || (now_sec == m_deadline.value().tv_sec && now_usec >= m_deadline.value().tv_usec);
185 return timed_out || blocked_description().can_read();
186 }
187 return blocked_description().can_read();
188}
189
// Blocks until an arbitrary caller-supplied condition becomes true.
// `state_string` labels the blocked state (e.g. for process listings).
Thread::ConditionBlocker::ConditionBlocker(const char* state_string, Function<bool()>&& condition)
    : m_block_until_condition(move(condition))
    , m_state_string(state_string)
{
    // A null condition would make should_unblock() crash; reject it up front.
    ASSERT(m_block_until_condition);
}
196
// Unblock as soon as the caller-supplied condition evaluates to true.
bool Thread::ConditionBlocker::should_unblock(Thread&, time_t, long)
{
    return m_block_until_condition();
}
201
// Blocks until g_uptime reaches `wakeup_time` -- an absolute deadline
// expressed in scheduler ticks (see should_unblock).
Thread::SleepBlocker::SleepBlocker(u64 wakeup_time)
    : m_wakeup_time(wakeup_time)
{
}
206
207bool Thread::SleepBlocker::should_unblock(Thread&, time_t, long)
208{
209 return m_wakeup_time <= g_uptime;
210}
211
// Blocks a thread in select(): remembers the watched fd sets and the
// (optional) absolute timeout.
Thread::SelectBlocker::SelectBlocker(const timeval& tv, bool select_has_timeout, const FDVector& read_fds, const FDVector& write_fds, const FDVector& except_fds)
    : m_select_timeout(tv)
    , m_select_has_timeout(select_has_timeout)
    , m_select_read_fds(read_fds)
    , m_select_write_fds(write_fds)
    , m_select_exceptional_fds(except_fds)
{
}
220
// select() semantics: unblock when the timeout expires or when any watched
// fd becomes ready.
// NOTE(review): assumes every fd in the vectors is a valid index into
// process.m_fds -- presumably validated by the select() syscall; verify.
bool Thread::SelectBlocker::should_unblock(Thread& thread, time_t now_sec, long now_usec)
{
    if (m_select_has_timeout) {
        if (now_sec > m_select_timeout.tv_sec || (now_sec == m_select_timeout.tv_sec && now_usec >= m_select_timeout.tv_usec))
            return true;
    }

    auto& process = thread.process();
    for (int fd : m_select_read_fds) {
        // Skip fds that have been closed since the select() started.
        if (!process.m_fds[fd])
            continue;
        if (process.m_fds[fd].description->can_read())
            return true;
    }
    for (int fd : m_select_write_fds) {
        if (!process.m_fds[fd])
            continue;
        if (process.m_fds[fd].description->can_write())
            return true;
    }
    // FIXME: m_select_exceptional_fds is stored but never checked here, so
    // exceptional conditions never wake this blocker.

    return false;
}
244
// Blocks a thread in waitpid(). `waitee_pid` is an in-out reference: -1 means
// "wait for any child", and on wakeup it receives the pid of the child that
// satisfied the wait (see should_unblock).
Thread::WaitBlocker::WaitBlocker(int wait_options, pid_t& waitee_pid)
    : m_wait_options(wait_options)
    , m_waitee_pid(waitee_pid)
{
}
250
// Implements the blocking part of waitpid(): wake when a matching child has
// exited (WEXITED) or stopped (WSTOPPED).
bool Thread::WaitBlocker::should_unblock(Thread& thread, time_t, long)
{
    bool should_unblock = false;
    if (m_waitee_pid != -1) {
        // Waiting on a specific pid that no longer exists: unblock so the
        // syscall can report the error to the caller.
        auto* peer = Process::from_pid(m_waitee_pid);
        if (!peer)
            return true;
    }
    thread.process().for_each_child([&](Process& child) {
        if (m_waitee_pid != -1 && m_waitee_pid != child.pid())
            return IterationDecision::Continue;

        bool child_exited = child.is_dead();
        bool child_stopped = false;
        if (child.thread_count()) {
            // NOTE(review): only one arbitrary thread of the child is
            // inspected; assumes its Stopped state represents the whole
            // process -- confirm.
            auto& child_thread = child.any_thread();
            if (child_thread.state() == Thread::State::Stopped && !child_thread.has_pending_signal(SIGCONT))
                child_stopped = true;
        }

        bool wait_finished = ((m_wait_options & WEXITED) && child_exited)
            || ((m_wait_options & WSTOPPED) && child_stopped);

        if (!wait_finished)
            return IterationDecision::Continue;

        // Report back (through the reference member) which child satisfied
        // the wait.
        m_waitee_pid = child.pid();
        should_unblock = true;
        return IterationDecision::Break;
    });
    return should_unblock;
}
283
// Blocker with no wakeup condition of its own; the thread stays blocked until
// something else explicitly unblocks it. `reason` records why it blocked.
Thread::SemiPermanentBlocker::SemiPermanentBlocker(Reason reason)
    : m_reason(reason)
{
}
288
// Never self-unblocks; see the class comment on the constructor above.
bool Thread::SemiPermanentBlocker::should_unblock(Thread&, time_t, long)
{
    // someone else has to unblock us
    return false;
}
294
// Called by the scheduler on threads that are blocked for some reason.
// Make a decision as to whether to unblock them or not.
void Thread::consider_unblock(time_t now_sec, long now_usec)
{
    switch (state()) {
    case Thread::Invalid:
    case Thread::Runnable:
    case Thread::Running:
    case Thread::Dead:
    case Thread::Stopped:
    case Thread::Queued:
    case Thread::Dying:
        /* don't know, don't care */
        return;
    case Thread::Blocked:
        // A blocked thread must have a blocker; ask it whether its wait
        // condition is now satisfied.
        ASSERT(m_blocker != nullptr);
        if (m_blocker->should_unblock(*this, now_sec, now_usec))
            unblock();
        return;
    case Thread::Skip1SchedulerPass:
        // Countdown states: skip this pass, then become runnable next pass.
        set_state(Thread::Skip0SchedulerPasses);
        return;
    case Thread::Skip0SchedulerPasses:
        set_state(Thread::Runnable);
        return;
    }
}
322
// Core scheduling pass: unblock waiters whose conditions are met, reap dead
// orphaned processes, dispatch pending signals, then pick the runnable thread
// with the highest effective priority and context-switch to it. Returns what
// context_switch() returns (false means we stayed on the same thread).
bool Scheduler::pick_next()
{
    ASSERT_INTERRUPTS_DISABLED();
    ASSERT(!s_active);

    // Re-entrancy guard; also what is_active() reports.
    TemporaryChange<bool> change(s_active, true);

    ASSERT(s_active);

    if (!Thread::current) {
        // XXX: The first ever context_switch() goes to the idle process.
        // This to setup a reliable place we can return to.
        return context_switch(*g_colonel);
    }

    auto now = time_since_boot();

    auto now_sec = now.tv_sec;
    auto now_usec = now.tv_usec;

    // Check and unblock threads whose wait conditions have been met.
    Scheduler::for_each_nonrunnable([&](Thread& thread) {
        thread.consider_unblock(now_sec, now_usec);
        return IterationDecision::Continue;
    });

    // Reap dead processes whose parent is gone (nobody will wait() on them),
    // and deliver SIGALRM to processes whose alarm deadline has passed.
    Process::for_each([&](Process& process) {
        if (process.is_dead()) {
            if (Process::current->pid() != process.pid() && (!process.ppid() || !Process::from_pid(process.ppid()))) {
                auto name = process.name();
                auto pid = process.pid();
                auto exit_status = Process::reap(process);
                dbg() << "Scheduler: Reaped unparented process " << name << "(" << pid << "), exit status: " << exit_status.si_status;
            }
            return IterationDecision::Continue;
        }
        if (process.m_alarm_deadline && g_uptime > process.m_alarm_deadline) {
            process.m_alarm_deadline = 0;
            process.send_signal(SIGALRM, nullptr);
        }
        return IterationDecision::Continue;
    });

    // Dispatch any pending signals.
    Thread::for_each_living([](Thread& thread) -> IterationDecision {
        if (!thread.has_unmasked_pending_signals())
            return IterationDecision::Continue;
        // FIXME: It would be nice if the Scheduler didn't have to worry about who is "current"
        // For now, avoid dispatching signals to "current" and do it in a scheduling pass
        // while some other process is interrupted. Otherwise a mess will be made.
        if (&thread == Thread::current)
            return IterationDecision::Continue;
        // We know how to interrupt blocked processes, but if they are just executing
        // at some random point in the kernel, let them continue.
        // Before returning to userspace from a syscall, we will block a thread if it has any
        // pending unmasked signals, allowing it to be dispatched then.
        if (thread.in_kernel() && !thread.is_blocked() && !thread.is_stopped())
            return IterationDecision::Continue;
        // NOTE: dispatch_one_pending_signal() may unblock the process.
        bool was_blocked = thread.is_blocked();
        if (thread.dispatch_one_pending_signal() == ShouldUnblockThread::No)
            return IterationDecision::Continue;
        if (was_blocked) {
            dbg() << "Unblock " << thread << " due to signal";
            ASSERT(thread.m_blocker != nullptr);
            thread.m_blocker->set_interrupted_by_signal();
            thread.unblock();
        }
        return IterationDecision::Continue;
    });

#ifdef SCHEDULER_RUNNABLE_DEBUG
    dbg() << "Non-runnables:";
    Scheduler::for_each_nonrunnable([](Thread& thread) -> IterationDecision {
        dbg() << " " << String::format("%-12s", thread.state_string()) << " " << thread << " @ " << String::format("%w", thread.tss().cs) << ":" << String::format("%x", thread.tss().eip);
        return IterationDecision::Continue;
    });

    dbg() << "Runnables:";
    Scheduler::for_each_runnable([](Thread& thread) -> IterationDecision {
        dbg() << " " << String::format("%3u", thread.effective_priority()) << "/" << String::format("%2u", thread.priority()) << " " << String::format("%-12s", thread.state_string()) << " " << thread << " @ " << String::format("%w", thread.tss().cs) << ":" << String::format("%x", thread.tss().eip);
        return IterationDecision::Continue;
    });
#endif

    // Collect runnables and sort by descending effective priority
    // (base priority plus accumulated anti-starvation boost).
    Vector<Thread*, 128> sorted_runnables;
    for_each_runnable([&sorted_runnables](auto& thread) {
        sorted_runnables.append(&thread);
        return IterationDecision::Continue;
    });
    quick_sort(sorted_runnables, [](auto& a, auto& b) { return a->effective_priority() >= b->effective_priority(); });

    Thread* thread_to_schedule = nullptr;

    for (auto* thread : sorted_runnables) {
        if (thread->process().is_being_inspected())
            continue;

        // During exec(), only the exec'ing thread of that process may run.
        if (thread->process().exec_tid() && thread->process().exec_tid() != thread->tid())
            continue;

        ASSERT(thread->state() == Thread::Runnable || thread->state() == Thread::Running);

        if (!thread_to_schedule) {
            // The chosen thread loses its anti-starvation boost.
            thread->m_extra_priority = 0;
            thread_to_schedule = thread;
        } else {
            // Passed-over threads gain priority so they eventually run.
            thread->m_extra_priority++;
        }
    }

    // Nothing schedulable: fall back to the idle thread.
    if (!thread_to_schedule)
        thread_to_schedule = g_colonel;

#ifdef SCHEDULER_DEBUG
    dbg() << "Scheduler: Switch to " << *thread_to_schedule << " @ " << String::format("%04x:%08x", thread_to_schedule->tss().cs, thread_to_schedule->tss().eip);
#endif

    return context_switch(*thread_to_schedule);
}
443
// Donate the remainder of the current thread's time slice to `beneficiary`
// (e.g. to hand the CPU to a lock holder). Falls back to a plain yield() if
// the beneficiary can't run or too few ticks remain.
bool Scheduler::donate_to(Thread* beneficiary, const char* reason)
{
    InterruptDisabler disabler;
    if (!Thread::is_thread(beneficiary))
        return false;

    (void)reason; // only referenced by the SCHEDULER_DEBUG logging below
    unsigned ticks_left = Thread::current->ticks_left();
    if (!beneficiary || beneficiary->state() != Thread::Runnable || ticks_left <= 1)
        return yield();

    // Keep at least one tick for ourselves; cap the donation at the
    // beneficiary's normal slice.
    unsigned ticks_to_donate = min(ticks_left - 1, time_slice_for(*beneficiary));
#ifdef SCHEDULER_DEBUG
    dbg() << "Scheduler: Donating " << ticks_to_donate << " ticks to " << *beneficiary << ", reason=" << reason;
#endif
    context_switch(*beneficiary);
    beneficiary->set_ticks_left(ticks_to_donate);
    switch_now();
    return false;
}
464
465bool Scheduler::yield()
466{
467 InterruptDisabler disabler;
468 ASSERT(Thread::current);
469 if (!pick_next())
470 return false;
471 switch_now();
472 return true;
473}
474
475void Scheduler::pick_next_and_switch_now()
476{
477 bool someone_wants_to_run = pick_next();
478 ASSERT(someone_wants_to_run);
479 switch_now();
480}
481
// Perform the actual hardware task switch into Thread::current: mark its TSS
// descriptor "available" (type 9) again, re-enable interrupts, and far-jump
// through the thread's far pointer.
void Scheduler::switch_now()
{
    Descriptor& descriptor = get_gdt_entry(Thread::current->selector());
    descriptor.type = 9;
    asm("sti\n"
        "ljmp *(%%eax)\n" ::"a"(&Thread::current->far_ptr()));
}
489
// Make `thread` the current thread. Returns true when a switch was set up,
// false when `thread` was already current. This updates bookkeeping, FPU
// state and the GDT only; the hardware task switch itself happens later
// (switch_now() or the iret redirection in timer_tick()).
bool Scheduler::context_switch(Thread& thread)
{
    thread.set_ticks_left(time_slice_for(thread));
    thread.did_schedule();

    if (Thread::current == &thread)
        return false;

    if (Thread::current) {
        // If the last process hasn't blocked (still marked as running),
        // mark it as runnable for the next round.
        if (Thread::current->state() == Thread::Running)
            Thread::current->set_state(Thread::Runnable);

        // Save the outgoing thread's FPU/SSE state.
        asm volatile("fxsave %0"
                     : "=m"(Thread::current->fpu_state()));

#ifdef LOG_EVERY_CONTEXT_SWITCH
        dbg() << "Scheduler: " << *Thread::current << " -> " << thread << " [" << thread.priority() << "] " << String::format("%w", thread.tss().cs) << ":" << String::format("%x", thread.tss().eip);
#endif
    }

    Thread::current = &thread;
    Process::current = &thread.process();

    thread.set_state(Thread::Running);

    // Restore the incoming thread's FPU/SSE state.
    asm volatile("fxrstor %0" ::"m"(Thread::current->fpu_state()));

    // Lazily allocate a GDT selector + 32-bit TSS descriptor for this thread
    // on its first ever scheduling.
    if (!thread.selector()) {
        thread.set_selector(gdt_alloc_entry());
        auto& descriptor = get_gdt_entry(thread.selector());
        descriptor.set_base(&thread.tss());
        descriptor.set_limit(sizeof(TSS32));
        descriptor.dpl = 0;
        descriptor.segment_present = 1;
        descriptor.granularity = 0;
        descriptor.zero = 0;
        descriptor.operation_size = 1;
        descriptor.descriptor_type = 0;
    }

    // Point the thread-specific (TLS) descriptor at this thread's data, if any.
    if (!thread.thread_specific_data().is_null()) {
        auto& descriptor = thread_specific_descriptor();
        descriptor.set_base(thread.thread_specific_data().as_ptr());
        descriptor.set_limit(sizeof(ThreadSpecificData*));
    }

    auto& descriptor = get_gdt_entry(thread.selector());
    descriptor.type = 11; // Busy TSS
    return true;
}
542
// Build a 32-bit available-TSS descriptor (type 9) for the redirection TSS
// used by prepare_for_iret_to_new_process(), then reload the GDT.
static void initialize_redirection()
{
    auto& descriptor = get_gdt_entry(s_redirection.selector);
    descriptor.set_base(&s_redirection.tss);
    descriptor.set_limit(sizeof(TSS32));
    descriptor.dpl = 0;
    descriptor.segment_present = 1;
    descriptor.granularity = 0;
    descriptor.zero = 0;
    descriptor.operation_size = 1;
    descriptor.descriptor_type = 0;
    descriptor.type = 9;
    flush_gdt();
}
557
// Arrange for the upcoming iret (with the NT flag set; see timer_tick) to
// task-switch into the newly chosen thread: point the redirection TSS's
// backlink at Thread::current's selector and load the redirection TSS into TR.
void Scheduler::prepare_for_iret_to_new_process()
{
    auto& descriptor = get_gdt_entry(s_redirection.selector);
    descriptor.type = 9;
    s_redirection.tss.backlink = Thread::current->selector();
    load_task_register(s_redirection.selector);
}
565
// Switch TR to the redirection TSS before `thread`'s own TSS is modified.
// This ensures that a currently running process modifying its own TSS
// in order to yield() and end up somewhere else doesn't just end up
// right after the yield().
void Scheduler::prepare_to_modify_tss(Thread& thread)
{
    if (Thread::current == &thread)
        load_task_register(s_redirection.selector);
}
574
// The kernel idle process (owner of g_colonel), created in initialize().
Process* Scheduler::colonel()
{
    return s_colonel_process;
}
579
// One-time scheduler bring-up: allocate global data structures, set up the
// task-redirection TSS, and create the "colonel" idle process/thread.
void Scheduler::initialize()
{
    g_scheduler_data = new SchedulerData;
    g_finalizer_wait_queue = new WaitQueue;
    g_finalizer_has_work = false;
    s_redirection.selector = gdt_alloc_entry();
    initialize_redirection();
    s_colonel_process = Process::create_kernel_process(g_colonel, "colonel", nullptr);
    // The idle thread must never outrank a real runnable thread.
    g_colonel->set_priority(THREAD_PRIORITY_MIN);
    load_task_register(s_redirection.selector);
}
591
// Timer interrupt handler tail: advances uptime, updates the shared info page
// timestamp, samples the profiler, fires timers, and preempts the current
// thread once its time slice is used up.
void Scheduler::timer_tick(const RegisterState& regs)
{
    if (!Thread::current)
        return;

    ++g_uptime;

    // NOTE(review): tv_usec assumes one tick == 1ms (ticks * 1000) -- confirm
    // against TimeManagement's tick rate.
    timeval tv;
    tv.tv_sec = TimeManagement::the().epoch_time();
    tv.tv_usec = TimeManagement::the().ticks_this_second() * 1000;
    Process::update_info_page_timestamp(tv);

    // Record a profiling sample (pid/tid/timestamp/backtrace) if enabled.
    if (Process::current->is_profiling()) {
        SmapDisabler disabler;
        auto backtrace = Thread::current->raw_backtrace(regs.ebp);
        auto& sample = Profiling::next_sample_slot();
        sample.pid = Process::current->pid();
        sample.tid = Thread::current->tid();
        sample.timestamp = g_uptime;
        for (size_t i = 0; i < min(backtrace.size(), Profiling::max_stack_frame_count); ++i) {
            sample.frames[i] = backtrace[i];
        }
    }

    TimerQueue::the().fire();

    // tick() returns true while the thread still has slice left; no preemption.
    if (Thread::current->tick())
        return;

    auto& outgoing_tss = Thread::current->tss();

    if (!pick_next())
        return;

    // Save the interrupted register state into the outgoing thread's TSS so
    // the hardware task switch resumes it exactly where it was interrupted.
    outgoing_tss.gs = regs.gs;
    outgoing_tss.fs = regs.fs;
    outgoing_tss.es = regs.es;
    outgoing_tss.ds = regs.ds;
    outgoing_tss.edi = regs.edi;
    outgoing_tss.esi = regs.esi;
    outgoing_tss.ebp = regs.ebp;
    outgoing_tss.ebx = regs.ebx;
    outgoing_tss.edx = regs.edx;
    outgoing_tss.ecx = regs.ecx;
    outgoing_tss.eax = regs.eax;
    outgoing_tss.eip = regs.eip;
    outgoing_tss.cs = regs.cs;
    outgoing_tss.eflags = regs.eflags;

    // Compute process stack pointer.
    // Add 16 for CS, EIP, EFLAGS, exception code (interrupt mechanic)
    outgoing_tss.esp = regs.esp + 16;
    outgoing_tss.ss = regs.ss;

    // If interrupted in userspace (CPL != 0), use the userspace stack pushed
    // by the CPU instead of the kernel-side values.
    if ((outgoing_tss.cs & 3) != 0) {
        outgoing_tss.ss = regs.userspace_ss;
        outgoing_tss.esp = regs.userspace_esp;
    }
    prepare_for_iret_to_new_process();

    // Set the NT (nested task) flag.
    asm(
        "pushf\n"
        "orl $0x00004000, (%esp)\n"
        "popf\n");
}
658
// Set by stop_idling() to kick the idle loop into yielding.
static bool s_should_stop_idling = false;
660
661void Scheduler::stop_idling()
662{
663 if (Thread::current != g_colonel)
664 return;
665
666 s_should_stop_idling = true;
667}
668
669void Scheduler::idle_loop()
670{
671 for (;;) {
672 asm("hlt");
673 if (s_should_stop_idling) {
674 s_should_stop_idling = false;
675 yield();
676 }
677 }
678}
679
680}