Serenity Operating System
/*
 * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
6
7#include <AK/AnyOf.h>
8#include <AK/GenericLexer.h>
9#include <AK/RefPtr.h>
10#include <AK/Singleton.h>
11#include <AK/StringBuilder.h>
12#include <Kernel/API/POSIX/errno.h>
13#include <Kernel/Debug.h>
14#include <Kernel/Devices/BlockDevice.h>
15#include <Kernel/Devices/DeviceManagement.h>
16#include <Kernel/FileSystem/Custody.h>
17#include <Kernel/FileSystem/FileBackedFileSystem.h>
18#include <Kernel/FileSystem/FileSystem.h>
19#include <Kernel/FileSystem/OpenFileDescription.h>
20#include <Kernel/FileSystem/VirtualFileSystem.h>
21#include <Kernel/KLexicalPath.h>
22#include <Kernel/KSyms.h>
23#include <Kernel/Process.h>
24#include <Kernel/Sections.h>
25
26namespace Kernel {
27
28static Singleton<VirtualFileSystem> s_the;
29static constexpr int root_mount_flags = 0;
30
31UNMAP_AFTER_INIT void VirtualFileSystem::initialize()
32{
33 s_the.ensure_instance();
34}
35
36VirtualFileSystem& VirtualFileSystem::the()
37{
38 return *s_the;
39}
40
41UNMAP_AFTER_INIT VirtualFileSystem::VirtualFileSystem()
42{
43}
44
45UNMAP_AFTER_INIT VirtualFileSystem::~VirtualFileSystem() = default;
46
47InodeIdentifier VirtualFileSystem::root_inode_id() const
48{
49 VERIFY(m_root_inode);
50 return m_root_inode->identifier();
51}
52
53bool VirtualFileSystem::mount_point_exists_at_inode(InodeIdentifier inode_identifier)
54{
55 return m_mounts.with([&](auto& mounts) -> bool {
56 return any_of(mounts, [&inode_identifier](auto const& existing_mount) {
57 return existing_mount.host() && existing_mount.host()->identifier() == inode_identifier;
58 });
59 });
60}
61
62ErrorOr<void> VirtualFileSystem::mount(FileSystem& fs, Custody& mount_point, int flags)
63{
64 auto new_mount = TRY(adopt_nonnull_own_or_enomem(new (nothrow) Mount(fs, &mount_point, flags)));
65 return m_mounts.with([&](auto& mounts) -> ErrorOr<void> {
66 auto& inode = mount_point.inode();
67 dbgln("VirtualFileSystem: FileSystemID {}, Mounting {} at inode {} with flags {}",
68 fs.fsid(),
69 fs.class_name(),
70 inode.identifier(),
71 flags);
72 if (mount_point_exists_at_inode(inode.identifier())) {
73 dbgln("VirtualFileSystem: Mounting unsuccessful - inode {} is already a mount-point.", inode.identifier());
74 return EBUSY;
75 }
76 // Note: Actually add a mount for the filesystem and increment the filesystem mounted count
77 new_mount->guest_fs().mounted_count({}).with([&](auto& mounted_count) {
78 mounted_count++;
79
80 // When this is the first time this FileSystem is mounted,
81 // begin managing the FileSystem by adding it to the list of
82 // managed file systems. This is symmetric with
83 // VirtualFileSystem::unmount()'s `remove()` calls (which remove
84 // the FileSystem once it is no longer mounted).
85 if (mounted_count == 1) {
86 m_file_systems_list.with([&](auto& fs_list) {
87 fs_list.append(fs);
88 });
89 if (fs.is_file_backed()) {
90 auto& file_backed_fs = static_cast<FileBackedFileSystem&>(fs);
91 m_file_backed_file_systems_list.with([&](auto& fs_list) {
92 fs_list.append(file_backed_fs);
93 });
94 }
95 }
96 });
97
98 // NOTE: Leak the mount pointer so it can be added to the mount list, but it won't be
99 // deleted after being added.
100 mounts.append(*new_mount.leak_ptr());
101 return {};
102 });
103}
104
105ErrorOr<void> VirtualFileSystem::bind_mount(Custody& source, Custody& mount_point, int flags)
106{
107 auto new_mount = TRY(adopt_nonnull_own_or_enomem(new (nothrow) Mount(source.inode(), mount_point, flags)));
108 return m_mounts.with([&](auto& mounts) -> ErrorOr<void> {
109 auto& inode = mount_point.inode();
110 dbgln("VirtualFileSystem: Bind-mounting inode {} at inode {}", source.inode().identifier(), inode.identifier());
111 if (mount_point_exists_at_inode(inode.identifier())) {
112 dbgln("VirtualFileSystem: Bind-mounting unsuccessful - inode {} is already a mount-point.",
113 mount_point.inode().identifier());
114 return EBUSY;
115 }
116
117 // NOTE: Leak the mount pointer so it can be added to the mount list, but it won't be
118 // deleted after being added.
119 mounts.append(*new_mount.leak_ptr());
120 return {};
121 });
122}
123
124ErrorOr<void> VirtualFileSystem::remount(Custody& mount_point, int new_flags)
125{
126 dbgln("VirtualFileSystem: Remounting inode {}", mount_point.inode().identifier());
127
128 auto* mount = find_mount_for_guest(mount_point.inode().identifier());
129 if (!mount)
130 return ENODEV;
131
132 mount->set_flags(new_flags);
133 return {};
134}
135
136void VirtualFileSystem::sync_filesystems()
137{
138 Vector<NonnullLockRefPtr<FileSystem>, 32> file_systems;
139 m_file_systems_list.with([&](auto const& list) {
140 for (auto& fs : list)
141 file_systems.append(fs);
142 });
143
144 for (auto& fs : file_systems)
145 fs->flush_writes();
146}
147
148void VirtualFileSystem::lock_all_filesystems()
149{
150 Vector<NonnullLockRefPtr<FileSystem>, 32> file_systems;
151 m_file_systems_list.with([&](auto const& list) {
152 for (auto& fs : list)
153 file_systems.append(fs);
154 });
155
156 for (auto& fs : file_systems)
157 fs->m_lock.lock();
158}
159
160ErrorOr<void> VirtualFileSystem::unmount(Custody& mountpoint_custody)
161{
162 auto& guest_inode = mountpoint_custody.inode();
163 auto custody_path = TRY(mountpoint_custody.try_serialize_absolute_path());
164 dbgln("VirtualFileSystem: unmount called with inode {} on mountpoint {}", guest_inode.identifier(), custody_path->view());
165
166 return m_mounts.with([&](auto& mounts) -> ErrorOr<void> {
167 for (auto& mount : mounts) {
168 if (&mount.guest() != &guest_inode)
169 continue;
170 auto mountpoint_path = TRY(mount.absolute_path());
171 if (custody_path->view() != mountpoint_path->view())
172 continue;
173 NonnullRefPtr<FileSystem> fs = mount.guest_fs();
174 TRY(fs->prepare_to_unmount());
175 fs->mounted_count({}).with([&](auto& mounted_count) {
176 VERIFY(mounted_count > 0);
177 if (mounted_count == 1) {
178 dbgln("VirtualFileSystem: Unmounting file system {} for the last time...", fs->fsid());
179 m_file_systems_list.with([&](auto& list) {
180 list.remove(*fs);
181 });
182 if (fs->is_file_backed()) {
183 dbgln("VirtualFileSystem: Unmounting file backed file system {} for the last time...", fs->fsid());
184 auto& file_backed_fs = static_cast<FileBackedFileSystem&>(*fs);
185 m_file_backed_file_systems_list.with([&](auto& list) {
186 list.remove(file_backed_fs);
187 });
188 }
189 } else {
190 mounted_count--;
191 }
192 });
193 dbgln("VirtualFileSystem: Unmounting file system {}...", fs->fsid());
194 mount.m_vfs_list_node.remove();
195 // Note: This is balanced by a `new` statement that is happening in various places before inserting the Mount object to the list.
196 delete &mount;
197 return {};
198 }
199 dbgln("VirtualFileSystem: Nothing mounted on inode {}", guest_inode.identifier());
200 return ENODEV;
201 });
202}
203
204ErrorOr<void> VirtualFileSystem::mount_root(FileSystem& fs)
205{
206 if (m_root_inode) {
207 dmesgln("VirtualFileSystem: mount_root can't mount another root");
208 return EEXIST;
209 }
210
211 auto new_mount = TRY(adopt_nonnull_own_or_enomem(new (nothrow) Mount(fs, nullptr, root_mount_flags)));
212 auto& root_inode = fs.root_inode();
213 if (!root_inode.is_directory()) {
214 dmesgln("VirtualFileSystem: root inode ({}) for / is not a directory :(", root_inode.identifier());
215 return ENOTDIR;
216 }
217
218 m_root_inode = root_inode;
219 if (fs.is_file_backed()) {
220 auto pseudo_path = TRY(static_cast<FileBackedFileSystem&>(fs).file_description().pseudo_path());
221 dmesgln("VirtualFileSystem: mounted root({}) from {} ({})", fs.fsid(), fs.class_name(), pseudo_path);
222 m_file_backed_file_systems_list.with([&](auto& list) {
223 list.append(static_cast<FileBackedFileSystem&>(fs));
224 });
225 } else {
226 dmesgln("VirtualFileSystem: mounted root({}) from {}", fs.fsid(), fs.class_name());
227 }
228
229 m_file_systems_list.with([&](auto& fs_list) {
230 fs_list.append(fs);
231 });
232
233 fs.mounted_count({}).with([&](auto& mounted_count) {
234 mounted_count++;
235 });
236
237 // Note: Actually add a mount for the filesystem and increment the filesystem mounted count
238 m_mounts.with([&](auto& mounts) {
239 // NOTE: Leak the mount pointer so it can be added to the mount list, but it won't be
240 // deleted after being added.
241 mounts.append(*new_mount.leak_ptr());
242 });
243
244 RefPtr<Custody> new_root_custody = TRY(Custody::try_create(nullptr, ""sv, *m_root_inode, root_mount_flags));
245 m_root_custody.with([&](auto& root_custody) {
246 swap(root_custody, new_root_custody);
247 });
248 return {};
249}
250
251auto VirtualFileSystem::find_mount_for_host(InodeIdentifier id) -> Mount*
252{
253 return m_mounts.with([&](auto& mounts) -> Mount* {
254 for (auto& mount : mounts) {
255 if (mount.host() && mount.host()->identifier() == id)
256 return &mount;
257 }
258 return nullptr;
259 });
260}
261
262auto VirtualFileSystem::find_mount_for_guest(InodeIdentifier id) -> Mount*
263{
264 return m_mounts.with([&](auto& mounts) -> Mount* {
265 for (auto& mount : mounts) {
266 if (mount.guest().identifier() == id)
267 return &mount;
268 }
269 return nullptr;
270 });
271}
272
273bool VirtualFileSystem::is_vfs_root(InodeIdentifier inode) const
274{
275 return inode == root_inode_id();
276}
277
278ErrorOr<void> VirtualFileSystem::traverse_directory_inode(Inode& dir_inode, Function<ErrorOr<void>(FileSystem::DirectoryEntryView const&)> callback)
279{
280 return dir_inode.traverse_as_directory([&](auto& entry) -> ErrorOr<void> {
281 InodeIdentifier resolved_inode;
282 if (auto mount = find_mount_for_host(entry.inode))
283 resolved_inode = mount->guest().identifier();
284 else
285 resolved_inode = entry.inode;
286
287 // FIXME: This is now broken considering chroot and bind mounts.
288 bool is_root_inode = dir_inode.identifier() == dir_inode.fs().root_inode().identifier();
289 if (is_root_inode && !is_vfs_root(dir_inode.identifier()) && entry.name == "..") {
290 auto mount = find_mount_for_guest(dir_inode.identifier());
291 VERIFY(mount);
292 VERIFY(mount->host());
293 resolved_inode = mount->host()->identifier();
294 }
295 TRY(callback({ entry.name, resolved_inode, entry.file_type }));
296 return {};
297 });
298}
299
300ErrorOr<void> VirtualFileSystem::utime(Credentials const& credentials, StringView path, Custody& base, time_t atime, time_t mtime)
301{
302 auto custody = TRY(resolve_path(credentials, path, base));
303 auto& inode = custody->inode();
304 if (!credentials.is_superuser() && inode.metadata().uid != credentials.euid())
305 return EACCES;
306 if (custody->is_readonly())
307 return EROFS;
308
309 TRY(inode.update_timestamps(Time::from_timespec({ atime, 0 }), {}, Time::from_timespec({ mtime, 0 })));
310 return {};
311}
312
313ErrorOr<void> VirtualFileSystem::utimensat(Credentials const& credentials, StringView path, Custody& base, timespec const& atime, timespec const& mtime, int options)
314{
315 auto custody = TRY(resolve_path(credentials, path, base, nullptr, options));
316 auto& inode = custody->inode();
317 if (!credentials.is_superuser() && inode.metadata().uid != credentials.euid())
318 return EACCES;
319 if (custody->is_readonly())
320 return EROFS;
321
322 // NOTE: A standard ext2 inode cannot store nanosecond timestamps.
323 TRY(inode.update_timestamps(
324 (atime.tv_nsec != UTIME_OMIT) ? Time::from_timespec(atime) : Optional<Time> {},
325 {},
326 (mtime.tv_nsec != UTIME_OMIT) ? Time::from_timespec(mtime) : Optional<Time> {}));
327
328 return {};
329}
330
331ErrorOr<InodeMetadata> VirtualFileSystem::lookup_metadata(Credentials const& credentials, StringView path, Custody& base, int options)
332{
333 auto custody = TRY(resolve_path(credentials, path, base, nullptr, options));
334 return custody->inode().metadata();
335}
336
337ErrorOr<NonnullLockRefPtr<FileBackedFileSystem>> VirtualFileSystem::find_already_existing_or_create_file_backed_file_system(OpenFileDescription& description, Function<ErrorOr<NonnullLockRefPtr<FileSystem>>(OpenFileDescription&)> callback)
338{
339 return TRY(m_file_backed_file_systems_list.with([&](auto& list) -> ErrorOr<NonnullLockRefPtr<FileBackedFileSystem>> {
340 for (auto& node : list) {
341 if (&node.file_description() == &description) {
342 return node;
343 }
344 if (&node.file() == &description.file()) {
345 return node;
346 }
347 }
348 auto fs = TRY(callback(description));
349
350 // The created FileSystem is only added to the file_systems_lists
351 // when the FS has been successfully initialized and mounted
352 // (in VirtualFileSystem::mount()). This prevents file systems which
353 // fail to initialize or mount from existing in the list when the
354 // FileSystem is destroyed after failure.
355 return static_ptr_cast<FileBackedFileSystem>(fs);
356 }));
357}
358
359ErrorOr<NonnullRefPtr<OpenFileDescription>> VirtualFileSystem::open(Credentials const& credentials, StringView path, int options, mode_t mode, Custody& base, Optional<UidAndGid> owner)
360{
361 return open(Process::current(), credentials, path, options, mode, base, owner);
362}
363
364ErrorOr<NonnullRefPtr<OpenFileDescription>> VirtualFileSystem::open(Process const& process, Credentials const& credentials, StringView path, int options, mode_t mode, Custody& base, Optional<UidAndGid> owner)
365{
366 if ((options & O_CREAT) && (options & O_DIRECTORY))
367 return EINVAL;
368
369 RefPtr<Custody> parent_custody;
370 auto custody_or_error = resolve_path(process, credentials, path, base, &parent_custody, options);
371 if (custody_or_error.is_error()) {
372 // NOTE: ENOENT with a non-null parent custody signals us that the immediate parent
373 // of the file exists, but the file itself does not.
374 if ((options & O_CREAT) && custody_or_error.error().code() == ENOENT && parent_custody)
375 return create(process, credentials, path, options, mode, *parent_custody, move(owner));
376 return custody_or_error.release_error();
377 }
378
379 if ((options & O_CREAT) && (options & O_EXCL))
380 return EEXIST;
381
382 auto& custody = *custody_or_error.value();
383 auto& inode = custody.inode();
384 auto metadata = inode.metadata();
385
386 if (metadata.is_regular_file() && (custody.mount_flags() & MS_NOREGULAR))
387 return EACCES;
388
389 if ((options & O_DIRECTORY) && !metadata.is_directory())
390 return ENOTDIR;
391
392 bool should_truncate_file = false;
393
394 if ((options & O_RDONLY) && !metadata.may_read(credentials))
395 return EACCES;
396
397 if (options & O_WRONLY) {
398 if (!metadata.may_write(credentials))
399 return EACCES;
400 if (metadata.is_directory())
401 return EISDIR;
402 should_truncate_file = options & O_TRUNC;
403 }
404 if (options & O_EXEC) {
405 if (!metadata.may_execute(credentials) || (custody.mount_flags() & MS_NOEXEC))
406 return EACCES;
407 }
408
409 if (metadata.is_fifo()) {
410 auto fifo = TRY(inode.fifo());
411 if (options & O_WRONLY) {
412 auto description = TRY(fifo->open_direction_blocking(FIFO::Direction::Writer));
413 description->set_rw_mode(options);
414 description->set_file_flags(options);
415 description->set_original_inode({}, inode);
416 return description;
417 } else if (options & O_RDONLY) {
418 auto description = TRY(fifo->open_direction_blocking(FIFO::Direction::Reader));
419 description->set_rw_mode(options);
420 description->set_file_flags(options);
421 description->set_original_inode({}, inode);
422 return description;
423 }
424 return EINVAL;
425 }
426
427 if (metadata.is_device()) {
428 if (custody.mount_flags() & MS_NODEV)
429 return EACCES;
430 auto device = DeviceManagement::the().get_device(metadata.major_device, metadata.minor_device);
431 if (device == nullptr) {
432 return ENODEV;
433 }
434 auto description = TRY(device->open(options));
435 description->set_original_inode({}, inode);
436 description->set_original_custody({}, custody);
437 return description;
438 }
439
440 // Check for read-only FS. Do this after handling devices, but before modifying the inode in any way.
441 if ((options & O_WRONLY) && custody.is_readonly())
442 return EROFS;
443
444 if (should_truncate_file) {
445 TRY(inode.truncate(0));
446 TRY(inode.update_timestamps({}, {}, kgettimeofday()));
447 }
448 auto description = TRY(OpenFileDescription::try_create(custody));
449 description->set_rw_mode(options);
450 description->set_file_flags(options);
451 return description;
452}
453
454ErrorOr<void> VirtualFileSystem::mknod(Credentials const& credentials, StringView path, mode_t mode, dev_t dev, Custody& base)
455{
456 if (!is_regular_file(mode) && !is_block_device(mode) && !is_character_device(mode) && !is_fifo(mode) && !is_socket(mode))
457 return EINVAL;
458
459 RefPtr<Custody> parent_custody;
460 auto existing_file_or_error = resolve_path(credentials, path, base, &parent_custody);
461 if (!existing_file_or_error.is_error())
462 return EEXIST;
463 if (!parent_custody)
464 return ENOENT;
465 if (existing_file_or_error.error().code() != ENOENT)
466 return existing_file_or_error.release_error();
467 auto& parent_inode = parent_custody->inode();
468 if (!parent_inode.metadata().may_write(credentials))
469 return EACCES;
470 if (parent_custody->is_readonly())
471 return EROFS;
472
473 auto basename = KLexicalPath::basename(path);
474 dbgln_if(VFS_DEBUG, "VirtualFileSystem::mknod: '{}' mode={} dev={} in {}", basename, mode, dev, parent_inode.identifier());
475 (void)TRY(parent_inode.create_child(basename, mode, dev, credentials.euid(), credentials.egid()));
476 return {};
477}
478
479ErrorOr<NonnullRefPtr<OpenFileDescription>> VirtualFileSystem::create(Credentials const& credentials, StringView path, int options, mode_t mode, Custody& parent_custody, Optional<UidAndGid> owner)
480{
481 return create(Process::current(), credentials, path, options, mode, parent_custody, owner);
482}
483
484ErrorOr<NonnullRefPtr<OpenFileDescription>> VirtualFileSystem::create(Process const& process, Credentials const& credentials, StringView path, int options, mode_t mode, Custody& parent_custody, Optional<UidAndGid> owner)
485{
486 auto basename = KLexicalPath::basename(path);
487 auto parent_path = TRY(parent_custody.try_serialize_absolute_path());
488 auto full_path = TRY(KLexicalPath::try_join(parent_path->view(), basename));
489 TRY(validate_path_against_process_veil(process, full_path->view(), options));
490
491 if (!is_socket(mode) && !is_fifo(mode) && !is_block_device(mode) && !is_character_device(mode)) {
492 // Turn it into a regular file. (This feels rather hackish.)
493 mode |= 0100000;
494 }
495
496 auto& parent_inode = parent_custody.inode();
497 if (!parent_inode.metadata().may_write(credentials))
498 return EACCES;
499 if (parent_custody.is_readonly())
500 return EROFS;
501 if (is_regular_file(mode) && (parent_custody.mount_flags() & MS_NOREGULAR))
502 return EACCES;
503
504 dbgln_if(VFS_DEBUG, "VirtualFileSystem::create: '{}' in {}", basename, parent_inode.identifier());
505 auto uid = owner.has_value() ? owner.value().uid : credentials.euid();
506 auto gid = owner.has_value() ? owner.value().gid : credentials.egid();
507
508 auto inode = TRY(parent_inode.create_child(basename, mode, 0, uid, gid));
509 auto custody = TRY(Custody::try_create(&parent_custody, basename, inode, parent_custody.mount_flags()));
510
511 auto description = TRY(OpenFileDescription::try_create(move(custody)));
512 description->set_rw_mode(options);
513 description->set_file_flags(options);
514 return description;
515}
516
517ErrorOr<void> VirtualFileSystem::mkdir(Credentials const& credentials, StringView path, mode_t mode, Custody& base)
518{
519 // Unlike in basically every other case, where it's only the last
520 // path component (the one being created) that is allowed not to
521 // exist, POSIX allows mkdir'ed path to have trailing slashes.
522 // Let's handle that case by trimming any trailing slashes.
523 path = path.trim("/"sv, TrimMode::Right);
524 if (path.is_empty()) {
525 // NOTE: This means the path was a series of slashes, which resolves to "/".
526 path = "/"sv;
527 }
528
529 RefPtr<Custody> parent_custody;
530 // FIXME: The errors returned by resolve_path_without_veil can leak information about paths that are not unveiled,
531 // e.g. when the error is EACCESS or similar.
532 auto result = resolve_path_without_veil(credentials, path, base, &parent_custody);
533 if (!result.is_error())
534 return EEXIST;
535 else if (!parent_custody)
536 return result.release_error();
537 // NOTE: If resolve_path fails with a non-null parent custody, the error should be ENOENT.
538 VERIFY(result.error().code() == ENOENT);
539
540 TRY(validate_path_against_process_veil(*parent_custody, O_CREAT));
541 auto& parent_inode = parent_custody->inode();
542 if (!parent_inode.metadata().may_write(credentials))
543 return EACCES;
544 if (parent_custody->is_readonly())
545 return EROFS;
546
547 auto basename = KLexicalPath::basename(path);
548 dbgln_if(VFS_DEBUG, "VirtualFileSystem::mkdir: '{}' in {}", basename, parent_inode.identifier());
549 (void)TRY(parent_inode.create_child(basename, S_IFDIR | mode, 0, credentials.euid(), credentials.egid()));
550 return {};
551}
552
553ErrorOr<void> VirtualFileSystem::access(Credentials const& credentials, StringView path, int mode, Custody& base, AccessFlags access_flags)
554{
555 auto should_follow_symlinks = !has_flag(access_flags, AccessFlags::DoNotFollowSymlinks);
556 auto custody = TRY(resolve_path(credentials, path, base, nullptr, should_follow_symlinks ? 0 : O_NOFOLLOW_NOERROR));
557
558 auto& inode = custody->inode();
559 auto metadata = inode.metadata();
560 auto use_effective_ids = has_flag(access_flags, AccessFlags::EffectiveAccess) ? UseEffectiveIDs::Yes : UseEffectiveIDs::No;
561 if (mode & R_OK) {
562 if (!metadata.may_read(credentials, use_effective_ids))
563 return EACCES;
564 }
565 if (mode & W_OK) {
566 if (!metadata.may_write(credentials, use_effective_ids))
567 return EACCES;
568 if (custody->is_readonly())
569 return EROFS;
570 }
571 if (mode & X_OK) {
572 if (!metadata.may_execute(credentials, use_effective_ids))
573 return EACCES;
574 }
575 return {};
576}
577
578ErrorOr<NonnullRefPtr<Custody>> VirtualFileSystem::open_directory(Credentials const& credentials, StringView path, Custody& base)
579{
580 auto custody = TRY(resolve_path(credentials, path, base));
581 auto& inode = custody->inode();
582 if (!inode.is_directory())
583 return ENOTDIR;
584 if (!inode.metadata().may_execute(credentials))
585 return EACCES;
586 return custody;
587}
588
589ErrorOr<void> VirtualFileSystem::chmod(Credentials const& credentials, Custody& custody, mode_t mode)
590{
591 auto& inode = custody.inode();
592
593 if (credentials.euid() != inode.metadata().uid && !credentials.is_superuser())
594 return EPERM;
595 if (custody.is_readonly())
596 return EROFS;
597
598 // Only change the permission bits.
599 mode = (inode.mode() & ~07777u) | (mode & 07777u);
600 return inode.chmod(mode);
601}
602
603ErrorOr<void> VirtualFileSystem::chmod(Credentials const& credentials, StringView path, mode_t mode, Custody& base, int options)
604{
605 auto custody = TRY(resolve_path(credentials, path, base, nullptr, options));
606 return chmod(credentials, custody, mode);
607}
608
609ErrorOr<void> VirtualFileSystem::rename(Credentials const& credentials, Custody& old_base, StringView old_path, Custody& new_base, StringView new_path)
610{
611 RefPtr<Custody> old_parent_custody;
612 auto old_custody = TRY(resolve_path(credentials, old_path, old_base, &old_parent_custody, O_NOFOLLOW_NOERROR));
613 auto& old_inode = old_custody->inode();
614
615 RefPtr<Custody> new_parent_custody;
616 auto new_custody_or_error = resolve_path(credentials, new_path, new_base, &new_parent_custody);
617 if (new_custody_or_error.is_error()) {
618 if (new_custody_or_error.error().code() != ENOENT || !new_parent_custody)
619 return new_custody_or_error.release_error();
620 }
621
622 if (!old_parent_custody || !new_parent_custody) {
623 return EPERM;
624 }
625
626 if (!new_custody_or_error.is_error()) {
627 auto& new_inode = new_custody_or_error.value()->inode();
628
629 if (old_inode.index() != new_inode.index() && old_inode.is_directory() && new_inode.is_directory()) {
630 size_t child_count = 0;
631 TRY(new_inode.traverse_as_directory([&child_count](auto&) -> ErrorOr<void> {
632 ++child_count;
633 return {};
634 }));
635 if (child_count > 2)
636 return ENOTEMPTY;
637 }
638 }
639
640 auto& old_parent_inode = old_parent_custody->inode();
641 auto& new_parent_inode = new_parent_custody->inode();
642
643 if (&old_parent_inode.fs() != &new_parent_inode.fs())
644 return EXDEV;
645
646 for (auto* new_ancestor = new_parent_custody.ptr(); new_ancestor; new_ancestor = new_ancestor->parent()) {
647 if (&old_inode == &new_ancestor->inode())
648 return EDIRINTOSELF;
649 }
650
651 if (!new_parent_inode.metadata().may_write(credentials))
652 return EACCES;
653
654 if (!old_parent_inode.metadata().may_write(credentials))
655 return EACCES;
656
657 if (old_parent_inode.metadata().is_sticky()) {
658 if (!credentials.is_superuser() && old_parent_inode.metadata().uid != credentials.euid() && old_inode.metadata().uid != credentials.euid())
659 return EACCES;
660 }
661
662 if (old_parent_custody->is_readonly() || new_parent_custody->is_readonly())
663 return EROFS;
664
665 auto old_basename = KLexicalPath::basename(old_path);
666 if (old_basename.is_empty() || old_basename == "."sv || old_basename == ".."sv)
667 return EINVAL;
668
669 auto new_basename = KLexicalPath::basename(new_path);
670 if (new_basename.is_empty() || new_basename == "."sv || new_basename == ".."sv)
671 return EINVAL;
672
673 if (old_basename == new_basename && old_parent_inode.index() == new_parent_inode.index())
674 return {};
675
676 if (!new_custody_or_error.is_error()) {
677 auto& new_custody = *new_custody_or_error.value();
678 auto& new_inode = new_custody.inode();
679 // When the source/dest inodes are the same (in other words,
680 // when `old_path` and `new_path` are the same), perform a no-op
681 // and return success.
682 // Linux (`vfs_rename()`) and OpenBSD (`dorenameat()`) appear to have
683 // this same no-op behavior.
684 if (&new_inode == &old_inode)
685 return {};
686 if (new_parent_inode.metadata().is_sticky()) {
687 if (!credentials.is_superuser() && new_inode.metadata().uid != credentials.euid())
688 return EACCES;
689 }
690 if (new_inode.is_directory() && !old_inode.is_directory())
691 return EISDIR;
692 TRY(new_parent_inode.remove_child(new_basename));
693 }
694
695 TRY(new_parent_inode.add_child(old_inode, new_basename, old_inode.mode()));
696 TRY(old_parent_inode.remove_child(old_basename));
697
698 // If the inode that we moved is a directory and we changed parent
699 // directories, then we also have to make .. point to the new parent inode,
700 // because .. is its own inode.
701 if (old_inode.is_directory() && old_parent_inode.index() != new_parent_inode.index()) {
702 TRY(old_inode.replace_child(".."sv, new_parent_inode));
703 }
704
705 return {};
706}
707
708ErrorOr<void> VirtualFileSystem::chown(Credentials const& credentials, Custody& custody, UserID a_uid, GroupID a_gid)
709{
710 auto& inode = custody.inode();
711 auto metadata = inode.metadata();
712
713 if (credentials.euid() != metadata.uid && !credentials.is_superuser())
714 return EPERM;
715
716 UserID new_uid = metadata.uid;
717 GroupID new_gid = metadata.gid;
718
719 if (a_uid != (uid_t)-1) {
720 if (credentials.euid() != a_uid && !credentials.is_superuser())
721 return EPERM;
722 new_uid = a_uid;
723 }
724 if (a_gid != (gid_t)-1) {
725 if (!credentials.in_group(a_gid) && !credentials.is_superuser())
726 return EPERM;
727 new_gid = a_gid;
728 }
729
730 if (custody.is_readonly())
731 return EROFS;
732
733 dbgln_if(VFS_DEBUG, "VirtualFileSystem::chown(): inode {} <- uid={} gid={}", inode.identifier(), new_uid, new_gid);
734
735 if (metadata.is_setuid() || metadata.is_setgid()) {
736 dbgln_if(VFS_DEBUG, "VirtualFileSystem::chown(): Stripping SUID/SGID bits from {}", inode.identifier());
737 TRY(inode.chmod(metadata.mode & ~(04000 | 02000)));
738 }
739
740 return inode.chown(new_uid, new_gid);
741}
742
743ErrorOr<void> VirtualFileSystem::chown(Credentials const& credentials, StringView path, UserID a_uid, GroupID a_gid, Custody& base, int options)
744{
745 auto custody = TRY(resolve_path(credentials, path, base, nullptr, options));
746 return chown(credentials, custody, a_uid, a_gid);
747}
748
749static bool hard_link_allowed(Credentials const& credentials, Inode const& inode)
750{
751 auto metadata = inode.metadata();
752
753 if (credentials.euid() == metadata.uid)
754 return true;
755
756 if (metadata.is_regular_file()
757 && !metadata.is_setuid()
758 && !(metadata.is_setgid() && metadata.mode & S_IXGRP)
759 && metadata.may_write(credentials)) {
760 return true;
761 }
762
763 return false;
764}
765
766ErrorOr<void> VirtualFileSystem::link(Credentials const& credentials, StringView old_path, StringView new_path, Custody& base)
767{
768 // NOTE: To prevent unveil bypass by creating an hardlink after unveiling a path as read-only,
769 // check that if write permission is allowed by the veil info on the old_path.
770 auto old_custody = TRY(resolve_path(credentials, old_path, base, nullptr, O_RDWR));
771 auto& old_inode = old_custody->inode();
772
773 RefPtr<Custody> parent_custody;
774 auto new_custody_or_error = resolve_path(credentials, new_path, base, &parent_custody);
775 if (!new_custody_or_error.is_error())
776 return EEXIST;
777
778 if (!parent_custody)
779 return ENOENT;
780
781 auto& parent_inode = parent_custody->inode();
782
783 if (parent_inode.fsid() != old_inode.fsid())
784 return EXDEV;
785
786 if (!parent_inode.metadata().may_write(credentials))
787 return EACCES;
788
789 if (old_inode.is_directory())
790 return EPERM;
791
792 if (parent_custody->is_readonly())
793 return EROFS;
794
795 if (!hard_link_allowed(credentials, old_inode))
796 return EPERM;
797
798 return parent_inode.add_child(old_inode, KLexicalPath::basename(new_path), old_inode.mode());
799}
800
801ErrorOr<void> VirtualFileSystem::unlink(Credentials const& credentials, StringView path, Custody& base)
802{
803 RefPtr<Custody> parent_custody;
804 auto custody = TRY(resolve_path(credentials, path, base, &parent_custody, O_NOFOLLOW_NOERROR | O_UNLINK_INTERNAL));
805 auto& inode = custody->inode();
806
807 if (inode.is_directory())
808 return EISDIR;
809
810 // We have just checked that the inode is not a directory, and thus it's not
811 // the root. So it should have a parent. Note that this would be invalidated
812 // if we were to support bind-mounting regular files on top of the root.
813 VERIFY(parent_custody);
814
815 auto& parent_inode = parent_custody->inode();
816 if (!parent_inode.metadata().may_write(credentials))
817 return EACCES;
818
819 if (parent_inode.metadata().is_sticky()) {
820 if (!credentials.is_superuser() && parent_inode.metadata().uid != credentials.euid() && inode.metadata().uid != credentials.euid())
821 return EACCES;
822 }
823
824 if (parent_custody->is_readonly())
825 return EROFS;
826
827 return parent_inode.remove_child(KLexicalPath::basename(path));
828}
829
830ErrorOr<void> VirtualFileSystem::symlink(Credentials const& credentials, StringView target, StringView linkpath, Custody& base)
831{
832 // NOTE: Check that the actual target (if it exists right now) is unveiled and prevent creating symlinks on veiled paths!
833 if (auto target_custody_or_error = resolve_path_without_veil(credentials, target, base, nullptr, O_RDWR, 0); !target_custody_or_error.is_error()) {
834 auto target_custody = target_custody_or_error.release_value();
835 TRY(validate_path_against_process_veil(*target_custody, O_RDWR));
836 }
837
838 RefPtr<Custody> parent_custody;
839 auto existing_custody_or_error = resolve_path(credentials, linkpath, base, &parent_custody, O_RDWR);
840 if (!existing_custody_or_error.is_error())
841 return EEXIST;
842 if (!parent_custody)
843 return ENOENT;
844
845 // NOTE: VERY IMPORTANT! We prevent creating symlinks in case the program didn't unveil the parent_custody
846 // path! For example, say the program wanted to create a symlink in /tmp/symlink to /tmp/test/pointee_symlink
847 // and unveiled the /tmp/test/ directory path beforehand, but not the /tmp directory path - the symlink syscall will
848 // fail here because we can't create the symlink in a parent directory path we didn't unveil beforehand.
849 TRY(validate_path_against_process_veil(*parent_custody, O_RDWR));
850
851 if (existing_custody_or_error.is_error() && existing_custody_or_error.error().code() != ENOENT)
852 return existing_custody_or_error.release_error();
853 auto& parent_inode = parent_custody->inode();
854 if (!parent_inode.metadata().may_write(credentials))
855 return EACCES;
856 if (parent_custody->is_readonly())
857 return EROFS;
858
859 auto basename = KLexicalPath::basename(linkpath);
860 dbgln_if(VFS_DEBUG, "VirtualFileSystem::symlink: '{}' (-> '{}') in {}", basename, target, parent_inode.identifier());
861
862 auto inode = TRY(parent_inode.create_child(basename, S_IFLNK | 0644, 0, credentials.euid(), credentials.egid()));
863
864 auto target_buffer = UserOrKernelBuffer::for_kernel_buffer(const_cast<u8*>((u8 const*)target.characters_without_null_termination()));
865 TRY(inode->write_bytes(0, target.length(), target_buffer, nullptr));
866 return {};
867}
868
869// https://pubs.opengroup.org/onlinepubs/9699919799/functions/rmdir.html
870ErrorOr<void> VirtualFileSystem::rmdir(Credentials const& credentials, StringView path, Custody& base)
871{
872 RefPtr<Custody> parent_custody;
873 auto custody = TRY(resolve_path(credentials, path, base, &parent_custody));
874 auto& inode = custody->inode();
875
876 auto last_component = KLexicalPath::basename(path);
877
878 // [EINVAL] The path argument contains a last component that is dot.
879 if (last_component == "."sv)
880 return EINVAL;
881
882 // [ENOTDIR] A component of path names an existing file that is neither a directory
883 // nor a symbolic link to a directory.
884 if (!inode.is_directory())
885 return ENOTDIR;
886
887 // [EBUSY] The directory to be removed is currently in use by the system or some process
888 // and the implementation considers this to be an error.
889 // NOTE: If there is no parent, that means we're trying to rmdir the root directory!
890 if (!parent_custody)
891 return EBUSY;
892
893 auto& parent_inode = parent_custody->inode();
894 auto parent_metadata = parent_inode.metadata();
895
896 // [EACCES] Search permission is denied on a component of the path prefix,
897 // or write permission is denied on the parent directory of the directory to be removed.
898 if (!parent_metadata.may_write(credentials))
899 return EACCES;
900
901 if (parent_metadata.is_sticky()) {
902 // [EACCES] The S_ISVTX flag is set on the directory containing the file referred to by the path argument
903 // and the process does not satisfy the criteria specified in XBD Directory Protection.
904 if (!credentials.is_superuser()
905 && inode.metadata().uid != credentials.euid()
906 && parent_metadata.uid != credentials.euid()) {
907 return EACCES;
908 }
909 }
910
911 size_t child_count = 0;
912 TRY(inode.traverse_as_directory([&child_count](auto&) -> ErrorOr<void> {
913 ++child_count;
914 return {};
915 }));
916
917 // [ENOTEMPTY] The path argument names a directory that is not an empty directory,
918 // or there are hard links to the directory other than dot or a single entry in dot-dot.
919 if (child_count != 2)
920 return ENOTEMPTY;
921
922 // [EROFS] The directory entry to be removed resides on a read-only file system.
923 if (custody->is_readonly())
924 return EROFS;
925
926 TRY(inode.remove_child("."sv));
927 TRY(inode.remove_child(".."sv));
928
929 return parent_inode.remove_child(KLexicalPath::basename(path));
930}
931
932ErrorOr<void> VirtualFileSystem::for_each_mount(Function<ErrorOr<void>(Mount const&)> callback) const
933{
934 return m_mounts.with([&](auto& mounts) -> ErrorOr<void> {
935 for (auto& mount : mounts)
936 TRY(callback(mount));
937 return {};
938 });
939}
940
941void VirtualFileSystem::sync()
942{
943 FileSystem::sync();
944}
945
946NonnullRefPtr<Custody> VirtualFileSystem::root_custody()
947{
948 return m_root_custody.with([](auto& root_custody) -> NonnullRefPtr<Custody> { return *root_custody; });
949}
950
951UnveilNode const& VirtualFileSystem::find_matching_unveiled_path(Process const& process, StringView path)
952{
953 VERIFY(process.veil_state() != VeilState::None);
954 return process.unveil_data().with([&](auto const& unveil_data) -> UnveilNode const& {
955 auto path_parts = KLexicalPath::parts(path);
956 return unveil_data.paths.traverse_until_last_accessible_node(path_parts.begin(), path_parts.end());
957 });
958}
959
960ErrorOr<void> VirtualFileSystem::validate_path_against_process_veil(Custody const& custody, int options)
961{
962 return validate_path_against_process_veil(Process::current(), custody, options);
963}
964
965ErrorOr<void> VirtualFileSystem::validate_path_against_process_veil(Process const& process, Custody const& custody, int options)
966{
967 if (process.veil_state() == VeilState::None)
968 return {};
969 auto absolute_path = TRY(custody.try_serialize_absolute_path());
970 return validate_path_against_process_veil(process, absolute_path->view(), options);
971}
972
973ErrorOr<void> VirtualFileSystem::validate_path_against_process_veil(Process const& process, StringView path, int options)
974{
975 if (process.veil_state() == VeilState::None)
976 return {};
977
978 VERIFY(path.starts_with('/'));
979 VERIFY(!path.contains("/../"sv) && !path.ends_with("/.."sv));
980 VERIFY(!path.contains("/./"sv) && !path.ends_with("/."sv));
981
982#ifdef SKIP_PATH_VALIDATION_FOR_COVERAGE_INSTRUMENTATION
983 // Skip veil validation against profile data when coverage is enabled for userspace
984 // so that all processes can write out coverage data even with veils in place
985 if (KLexicalPath::basename(path).ends_with(".profraw"sv))
986 return {};
987#endif
988
989 auto& unveiled_path = find_matching_unveiled_path(process, path);
990 if (unveiled_path.permissions() == UnveilAccess::None) {
991 dbgln("Rejecting path '{}' since it hasn't been unveiled.", path);
992 dump_backtrace();
993 return ENOENT;
994 }
995
996 if (options & O_CREAT) {
997 if (!(unveiled_path.permissions() & UnveilAccess::CreateOrRemove)) {
998 dbgln("Rejecting path '{}' since it hasn't been unveiled with 'c' permission.", path);
999 dump_backtrace();
1000 return EACCES;
1001 }
1002 }
1003 if (options & O_UNLINK_INTERNAL) {
1004 if (!(unveiled_path.permissions() & UnveilAccess::CreateOrRemove)) {
1005 dbgln("Rejecting path '{}' for unlink since it hasn't been unveiled with 'c' permission.", path);
1006 dump_backtrace();
1007 return EACCES;
1008 }
1009 return {};
1010 }
1011 if (options & O_RDONLY) {
1012 if (options & O_DIRECTORY) {
1013 if (!(unveiled_path.permissions() & (UnveilAccess::Read | UnveilAccess::Browse))) {
1014 dbgln("Rejecting path '{}' since it hasn't been unveiled with 'r' or 'b' permissions.", path);
1015 dump_backtrace();
1016 return EACCES;
1017 }
1018 } else {
1019 if (!(unveiled_path.permissions() & UnveilAccess::Read)) {
1020 dbgln("Rejecting path '{}' since it hasn't been unveiled with 'r' permission.", path);
1021 dump_backtrace();
1022 return EACCES;
1023 }
1024 }
1025 }
1026 if (options & O_WRONLY) {
1027 if (!(unveiled_path.permissions() & UnveilAccess::Write)) {
1028 dbgln("Rejecting path '{}' since it hasn't been unveiled with 'w' permission.", path);
1029 dump_backtrace();
1030 return EACCES;
1031 }
1032 }
1033 if (options & O_EXEC) {
1034 if (!(unveiled_path.permissions() & UnveilAccess::Execute)) {
1035 dbgln("Rejecting path '{}' since it hasn't been unveiled with 'x' permission.", path);
1036 dump_backtrace();
1037 return EACCES;
1038 }
1039 }
1040 return {};
1041}
1042
1043ErrorOr<void> VirtualFileSystem::validate_path_against_process_veil(StringView path, int options)
1044{
1045 return validate_path_against_process_veil(Process::current(), path, options);
1046}
1047
1048ErrorOr<NonnullRefPtr<Custody>> VirtualFileSystem::resolve_path(Credentials const& credentials, StringView path, NonnullRefPtr<Custody> base, RefPtr<Custody>* out_parent, int options, int symlink_recursion_level)
1049{
1050 return resolve_path(Process::current(), credentials, path, base, out_parent, options, symlink_recursion_level);
1051}
1052
1053ErrorOr<NonnullRefPtr<Custody>> VirtualFileSystem::resolve_path(Process const& process, Credentials const& credentials, StringView path, NonnullRefPtr<Custody> base, RefPtr<Custody>* out_parent, int options, int symlink_recursion_level)
1054{
1055 // FIXME: The errors returned by resolve_path_without_veil can leak information about paths that are not unveiled,
1056 // e.g. when the error is EACCESS or similar.
1057 auto custody = TRY(resolve_path_without_veil(credentials, path, base, out_parent, options, symlink_recursion_level));
1058 if (auto result = validate_path_against_process_veil(process, *custody, options); result.is_error()) {
1059 if (out_parent)
1060 out_parent->clear();
1061 return result.release_error();
1062 }
1063 return custody;
1064}
1065
1066static bool safe_to_follow_symlink(Credentials const& credentials, Inode const& inode, InodeMetadata const& parent_metadata)
1067{
1068 auto metadata = inode.metadata();
1069 if (credentials.euid() == metadata.uid)
1070 return true;
1071
1072 if (!(parent_metadata.is_sticky() && parent_metadata.mode & S_IWOTH))
1073 return true;
1074
1075 if (metadata.uid == parent_metadata.uid)
1076 return true;
1077
1078 return false;
1079}
1080
1081ErrorOr<NonnullRefPtr<Custody>> VirtualFileSystem::resolve_path_without_veil(Credentials const& credentials, StringView path, NonnullRefPtr<Custody> base, RefPtr<Custody>* out_parent, int options, int symlink_recursion_level)
1082{
1083 if (symlink_recursion_level >= symlink_recursion_limit)
1084 return ELOOP;
1085
1086 if (path.is_empty())
1087 return EINVAL;
1088
1089 GenericLexer path_lexer(path);
1090
1091 NonnullRefPtr<Custody> custody = path[0] == '/' ? root_custody() : base;
1092 bool extra_iteration = path[path.length() - 1] == '/';
1093
1094 while (!path_lexer.is_eof() || extra_iteration) {
1095 if (path_lexer.is_eof())
1096 extra_iteration = false;
1097 auto part = path_lexer.consume_until('/');
1098 path_lexer.ignore();
1099
1100 Custody& parent = custody;
1101 auto parent_metadata = parent.inode().metadata();
1102 if (!parent_metadata.is_directory())
1103 return ENOTDIR;
1104 // Ensure the current user is allowed to resolve paths inside this directory.
1105 if (!parent_metadata.may_execute(credentials))
1106 return EACCES;
1107
1108 bool have_more_parts = !path_lexer.is_eof() || extra_iteration;
1109
1110 if (part == "..") {
1111 // If we encounter a "..", take a step back, but don't go beyond the root.
1112 if (custody->parent())
1113 custody = *custody->parent();
1114 continue;
1115 } else if (part == "." || part.is_empty()) {
1116 continue;
1117 }
1118
1119 // Okay, let's look up this part.
1120 auto child_or_error = parent.inode().lookup(part);
1121 if (child_or_error.is_error()) {
1122 if (out_parent) {
1123 // ENOENT with a non-null parent custody signals to caller that
1124 // we found the immediate parent of the file, but the file itself
1125 // does not exist yet.
1126 *out_parent = have_more_parts ? nullptr : &parent;
1127 }
1128 return child_or_error.release_error();
1129 }
1130 auto child_inode = child_or_error.release_value();
1131
1132 int mount_flags_for_child = parent.mount_flags();
1133
1134 // See if there's something mounted on the child; in that case
1135 // we would need to return the guest inode, not the host inode.
1136 if (auto mount = find_mount_for_host(child_inode->identifier())) {
1137 child_inode = mount->guest();
1138 mount_flags_for_child = mount->flags();
1139 }
1140
1141 custody = TRY(Custody::try_create(&parent, part, *child_inode, mount_flags_for_child));
1142
1143 if (child_inode->metadata().is_symlink()) {
1144 if (!have_more_parts) {
1145 if (options & O_NOFOLLOW)
1146 return ELOOP;
1147 if (options & O_NOFOLLOW_NOERROR)
1148 break;
1149 }
1150
1151 if (!safe_to_follow_symlink(credentials, *child_inode, parent_metadata))
1152 return EACCES;
1153
1154 TRY(validate_path_against_process_veil(*custody, options));
1155
1156 auto symlink_target = TRY(child_inode->resolve_as_link(credentials, parent, out_parent, options, symlink_recursion_level + 1));
1157 if (!have_more_parts)
1158 return symlink_target;
1159
1160 // Now, resolve the remaining path relative to the symlink target.
1161 // We prepend a "." to it to ensure that it's not empty and that
1162 // any initial slashes it might have get interpreted properly.
1163 StringBuilder remaining_path;
1164 TRY(remaining_path.try_append('.'));
1165 TRY(remaining_path.try_append(path.substring_view_starting_after_substring(part)));
1166
1167 return resolve_path_without_veil(credentials, remaining_path.string_view(), symlink_target, out_parent, options, symlink_recursion_level + 1);
1168 }
1169 }
1170
1171 if (out_parent)
1172 *out_parent = custody->parent();
1173 return custody;
1174}
1175}