Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

proc: faster open/read/close with "permanent" files

Now that "struct proc_ops" exists, we can start putting stuff there which
could not fly with VFS "struct file_operations"...

Most of the fs/proc/inode.c file is dedicated to making open/read/.../close
reliable in the event of disappearing /proc entries, which usually happens
when a module is being removed. Files like /proc/cpuinfo, which never
disappear, simply do not need such protection.

Save 2 atomic ops, 1 allocation, 1 free per open/read/close sequence for such
"permanent" files.

Enable "permanent" flag for

/proc/cpuinfo
/proc/kmsg
/proc/modules
/proc/slabinfo
/proc/stat
/proc/sysvipc/*
/proc/swaps

More will come once I figure out a foolproof way to prevent out-of-tree module
authors from marking their stuff "permanent" for performance reasons
when it is not.

This should help with scalability: benchmark is "read /proc/cpuinfo R times
by N threads scattered over the system".

N R t, s (before) t, s (after)
-----------------------------------------------------
64 4096 1.582458 1.530502 -3.2%
256 4096 6.371926 6.125168 -3.9%
1024 4096 25.64888 24.47528 -4.6%

Benchmark source:

#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

/* Number of online CPUs as reported by sysconf(); worker n is pinned to CPU (n % NR_CPUS). */
const int NR_CPUS = sysconf(_SC_NPROCESSORS_ONLN);
/* Number of worker threads, taken from argv[1]. */
int N;
/* Path of the file every worker opens and reads, taken from argv[2]. */
const char *filename;
/* Number of open/read/close iterations per worker, taken from argv[3]. */
int R;

/* Start flag: workers spin while it is 0; main() stores 1 to release them. */
int xxx = 0;

/*
 * Restrict the calling thread to CPU number @n.
 *
 * Builds a CPU mask containing only @n and hands it to sched_setaffinity()
 * for pid 0. The return value of sched_setaffinity() is passed through
 * unchanged.
 */
int glue(int n)
{
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(n, &mask);

	const int rc = sched_setaffinity(0, sizeof(mask), &mask);
	return rc;
}

/*
 * Worker body for thread number @n.
 *
 * Pins itself to CPU (n % NR_CPUS), spins until the start flag `xxx` becomes
 * non-zero, then performs R open/read/close cycles on `filename`.
 *
 * If open() fails the worker reports it and stops: continuing would only
 * time failing read(-1)/close(-1) calls and produce a misleading result.
 */
void f(int n)
{
	glue(n % NR_CPUS);

	/*
	 * Busy-wait on the start flag so that all workers begin together.
	 * NOTE(review): the volatile access is not a real synchronization
	 * primitive; std::atomic would be the portable choice, but that
	 * requires changing the flag's declaration as well.
	 */
	while (*(volatile int *)&xxx == 0) {
	}

	for (int i = 0; i < R; i++) {
		int fd = open(filename, O_RDONLY);
		if (fd < 0) {
			std::cerr << "cannot open " << filename << '\n';
			return;
		}

		char buf[4096];
		ssize_t rv = read(fd, buf, sizeof(buf));
		/* Empty asm consumes rv so the compiler cannot drop the result. */
		asm volatile ("" :: "g" (rv));
		close(fd);
	}
}

/*
 * Benchmark driver.
 *
 * Usage: prog N /proc/filename R
 *
 * Spawns N worker threads running f(), releases them all at once through the
 * start flag `xxx`, waits for them to finish and prints the elapsed wall time
 * in seconds on stdout. Returns 1 on bad arguments, 0 otherwise.
 */
int main(int argc, char *argv[])
{
	if (argc < 4) {
		std::cerr << "usage: " << argv[0] << ' ' << "N /proc/filename R\n";
		return 1;
	}

	N = atoi(argv[1]);
	filename = argv[2];
	R = atoi(argv[3]);
	/* A negative N would turn into a huge size_t in reserve() and throw. */
	if (N < 0 || R < 0) {
		std::cerr << "N and R must be non-negative\n";
		return 1;
	}

	/* Pin the main thread to the first CPU that accepts the affinity mask. */
	for (int i = 0; i < NR_CPUS; i++) {
		if (glue(i) == 0)
			break;
	}

	std::vector<std::thread> T;
	T.reserve(N);
	for (int i = 0; i < N; i++) {
		T.emplace_back(f, i);
	}

	/* steady_clock is monotonic, so the interval cannot be skewed by clock adjustments. */
	auto t0 = std::chrono::steady_clock::now();
	{
		/* Release the spinning workers, then wait for all of them. */
		*(volatile int *)&xxx = 1;
		for (auto& t: T) {
			t.join();
		}
	}
	auto t1 = std::chrono::steady_clock::now();
	std::chrono::duration<double> dt = t1 - t0;
	std::cout << dt.count() << '\n';

	return 0;
}

P.S.:
An explicit randomization marker is added because adding a non-function-pointer
member will silently disable structure layout randomization.

[akpm@linux-foundation.org: coding style fixes]
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Joe Perches <joe@perches.com>
Link: http://lkml.kernel.org/r/20200222201539.GA22576@avx2
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Alexey Dobriyan and committed by
Linus Torvalds
d919b33d 904f394e

+194 -54
+1
fs/proc/cpuinfo.c
··· 17 17 } 18 18 19 19 static const struct proc_ops cpuinfo_proc_ops = { 20 + .proc_flags = PROC_ENTRY_PERMANENT, 20 21 .proc_open = cpuinfo_open, 21 22 .proc_read = seq_read, 22 23 .proc_lseek = seq_lseek,
+28 -3
fs/proc/generic.c
··· 531 531 return p; 532 532 } 533 533 534 + static inline void pde_set_flags(struct proc_dir_entry *pde) 535 + { 536 + if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) 537 + pde->flags |= PROC_ENTRY_PERMANENT; 538 + } 539 + 534 540 struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, 535 541 struct proc_dir_entry *parent, 536 542 const struct proc_ops *proc_ops, void *data) ··· 547 541 if (!p) 548 542 return NULL; 549 543 p->proc_ops = proc_ops; 544 + pde_set_flags(p); 550 545 return proc_register(parent, p); 551 546 } 552 547 EXPORT_SYMBOL(proc_create_data); ··· 579 572 } 580 573 581 574 static const struct proc_ops proc_seq_ops = { 575 + /* not permanent -- can call into arbitrary seq_operations */ 582 576 .proc_open = proc_seq_open, 583 577 .proc_read = seq_read, 584 578 .proc_lseek = seq_lseek, ··· 610 602 } 611 603 612 604 static const struct proc_ops proc_single_ops = { 605 + /* not permanent -- can call into arbitrary ->single_show */ 613 606 .proc_open = proc_single_open, 614 607 .proc_read = seq_read, 615 608 .proc_lseek = seq_lseek, ··· 671 662 672 663 de = pde_subdir_find(parent, fn, len); 673 664 if (de) { 674 - rb_erase(&de->subdir_node, &parent->subdir); 675 - if (S_ISDIR(de->mode)) { 676 - parent->nlink--; 665 + if (unlikely(pde_is_permanent(de))) { 666 + WARN(1, "removing permanent /proc entry '%s'", de->name); 667 + de = NULL; 668 + } else { 669 + rb_erase(&de->subdir_node, &parent->subdir); 670 + if (S_ISDIR(de->mode)) 671 + parent->nlink--; 677 672 } 678 673 } 679 674 write_unlock(&proc_subdir_lock); ··· 713 700 write_unlock(&proc_subdir_lock); 714 701 return -ENOENT; 715 702 } 703 + if (unlikely(pde_is_permanent(root))) { 704 + write_unlock(&proc_subdir_lock); 705 + WARN(1, "removing permanent /proc entry '%s/%s'", 706 + root->parent->name, root->name); 707 + return -EINVAL; 708 + } 716 709 rb_erase(&root->subdir_node, &parent->subdir); 717 710 718 711 de = root; 719 712 while (1) { 720 713 next = pde_subdir_first(de); 
721 714 if (next) { 715 + if (unlikely(pde_is_permanent(root))) { 716 + write_unlock(&proc_subdir_lock); 717 + WARN(1, "removing permanent /proc entry '%s/%s'", 718 + next->parent->name, next->name); 719 + return -EINVAL; 720 + } 722 721 rb_erase(&next->subdir_node, &de->subdir); 723 722 de = next; 724 723 continue;
+137 -50
fs/proc/inode.c
··· 259 259 spin_unlock(&de->pde_unload_lock); 260 260 } 261 261 262 + static loff_t pde_lseek(struct proc_dir_entry *pde, struct file *file, loff_t offset, int whence) 263 + { 264 + typeof_member(struct proc_ops, proc_lseek) lseek; 265 + 266 + lseek = pde->proc_ops->proc_lseek; 267 + if (!lseek) 268 + lseek = default_llseek; 269 + return lseek(file, offset, whence); 270 + } 271 + 262 272 static loff_t proc_reg_llseek(struct file *file, loff_t offset, int whence) 263 273 { 264 274 struct proc_dir_entry *pde = PDE(file_inode(file)); 265 275 loff_t rv = -EINVAL; 266 - if (use_pde(pde)) { 267 - typeof_member(struct proc_ops, proc_lseek) lseek; 268 276 269 - lseek = pde->proc_ops->proc_lseek; 270 - if (!lseek) 271 - lseek = default_llseek; 272 - rv = lseek(file, offset, whence); 277 + if (pde_is_permanent(pde)) { 278 + return pde_lseek(pde, file, offset, whence); 279 + } else if (use_pde(pde)) { 280 + rv = pde_lseek(pde, file, offset, whence); 273 281 unuse_pde(pde); 274 282 } 275 283 return rv; 284 + } 285 + 286 + static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) 287 + { 288 + typeof_member(struct proc_ops, proc_read) read; 289 + 290 + read = pde->proc_ops->proc_read; 291 + if (read) 292 + return read(file, buf, count, ppos); 293 + return -EIO; 276 294 } 277 295 278 296 static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) 279 297 { 280 298 struct proc_dir_entry *pde = PDE(file_inode(file)); 281 299 ssize_t rv = -EIO; 282 - if (use_pde(pde)) { 283 - typeof_member(struct proc_ops, proc_read) read; 284 300 285 - read = pde->proc_ops->proc_read; 286 - if (read) 287 - rv = read(file, buf, count, ppos); 301 + if (pde_is_permanent(pde)) { 302 + return pde_read(pde, file, buf, count, ppos); 303 + } else if (use_pde(pde)) { 304 + rv = pde_read(pde, file, buf, count, ppos); 288 305 unuse_pde(pde); 289 306 } 290 307 return rv; 308 + } 309 + 310 + static ssize_t 
pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) 311 + { 312 + typeof_member(struct proc_ops, proc_write) write; 313 + 314 + write = pde->proc_ops->proc_write; 315 + if (write) 316 + return write(file, buf, count, ppos); 317 + return -EIO; 291 318 } 292 319 293 320 static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) 294 321 { 295 322 struct proc_dir_entry *pde = PDE(file_inode(file)); 296 323 ssize_t rv = -EIO; 297 - if (use_pde(pde)) { 298 - typeof_member(struct proc_ops, proc_write) write; 299 324 300 - write = pde->proc_ops->proc_write; 301 - if (write) 302 - rv = write(file, buf, count, ppos); 325 + if (pde_is_permanent(pde)) { 326 + return pde_write(pde, file, buf, count, ppos); 327 + } else if (use_pde(pde)) { 328 + rv = pde_write(pde, file, buf, count, ppos); 303 329 unuse_pde(pde); 304 330 } 305 331 return rv; 332 + } 333 + 334 + static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) 335 + { 336 + typeof_member(struct proc_ops, proc_poll) poll; 337 + 338 + poll = pde->proc_ops->proc_poll; 339 + if (poll) 340 + return poll(file, pts); 341 + return DEFAULT_POLLMASK; 306 342 } 307 343 308 344 static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) 309 345 { 310 346 struct proc_dir_entry *pde = PDE(file_inode(file)); 311 347 __poll_t rv = DEFAULT_POLLMASK; 312 - if (use_pde(pde)) { 313 - typeof_member(struct proc_ops, proc_poll) poll; 314 348 315 - poll = pde->proc_ops->proc_poll; 316 - if (poll) 317 - rv = poll(file, pts); 349 + if (pde_is_permanent(pde)) { 350 + return pde_poll(pde, file, pts); 351 + } else if (use_pde(pde)) { 352 + rv = pde_poll(pde, file, pts); 318 353 unuse_pde(pde); 319 354 } 320 355 return rv; 356 + } 357 + 358 + static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) 359 + { 360 + typeof_member(struct 
proc_ops, proc_ioctl) ioctl; 361 + 362 + ioctl = pde->proc_ops->proc_ioctl; 363 + if (ioctl) 364 + return ioctl(file, cmd, arg); 365 + return -ENOTTY; 321 366 } 322 367 323 368 static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 324 369 { 325 370 struct proc_dir_entry *pde = PDE(file_inode(file)); 326 371 long rv = -ENOTTY; 327 - if (use_pde(pde)) { 328 - typeof_member(struct proc_ops, proc_ioctl) ioctl; 329 372 330 - ioctl = pde->proc_ops->proc_ioctl; 331 - if (ioctl) 332 - rv = ioctl(file, cmd, arg); 373 + if (pde_is_permanent(pde)) { 374 + return pde_ioctl(pde, file, cmd, arg); 375 + } else if (use_pde(pde)) { 376 + rv = pde_ioctl(pde, file, cmd, arg); 333 377 unuse_pde(pde); 334 378 } 335 379 return rv; 336 380 } 337 381 338 382 #ifdef CONFIG_COMPAT 383 + static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) 384 + { 385 + typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; 386 + 387 + compat_ioctl = pde->proc_ops->proc_compat_ioctl; 388 + if (compat_ioctl) 389 + return compat_ioctl(file, cmd, arg); 390 + return -ENOTTY; 391 + } 392 + 339 393 static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 340 394 { 341 395 struct proc_dir_entry *pde = PDE(file_inode(file)); 342 396 long rv = -ENOTTY; 343 - if (use_pde(pde)) { 344 - typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; 345 - 346 - compat_ioctl = pde->proc_ops->proc_compat_ioctl; 347 - if (compat_ioctl) 348 - rv = compat_ioctl(file, cmd, arg); 397 + if (pde_is_permanent(pde)) { 398 + return pde_compat_ioctl(pde, file, cmd, arg); 399 + } else if (use_pde(pde)) { 400 + rv = pde_compat_ioctl(pde, file, cmd, arg); 349 401 unuse_pde(pde); 350 402 } 351 403 return rv; 352 404 } 353 405 #endif 354 406 407 + static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) 408 + { 409 + typeof_member(struct proc_ops, proc_mmap) 
mmap; 410 + 411 + mmap = pde->proc_ops->proc_mmap; 412 + if (mmap) 413 + return mmap(file, vma); 414 + return -EIO; 415 + } 416 + 355 417 static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) 356 418 { 357 419 struct proc_dir_entry *pde = PDE(file_inode(file)); 358 420 int rv = -EIO; 359 - if (use_pde(pde)) { 360 - typeof_member(struct proc_ops, proc_mmap) mmap; 361 421 362 - mmap = pde->proc_ops->proc_mmap; 363 - if (mmap) 364 - rv = mmap(file, vma); 422 + if (pde_is_permanent(pde)) { 423 + return pde_mmap(pde, file, vma); 424 + } else if (use_pde(pde)) { 425 + rv = pde_mmap(pde, file, vma); 365 426 unuse_pde(pde); 366 427 } 367 428 return rv; 429 + } 430 + 431 + static unsigned long 432 + pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned long orig_addr, 433 + unsigned long len, unsigned long pgoff, 434 + unsigned long flags) 435 + { 436 + typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; 437 + 438 + get_area = pde->proc_ops->proc_get_unmapped_area; 439 + #ifdef CONFIG_MMU 440 + if (!get_area) 441 + get_area = current->mm->get_unmapped_area; 442 + #endif 443 + if (get_area) 444 + return get_area(file, orig_addr, len, pgoff, flags); 445 + return orig_addr; 368 446 } 369 447 370 448 static unsigned long ··· 453 375 struct proc_dir_entry *pde = PDE(file_inode(file)); 454 376 unsigned long rv = -EIO; 455 377 456 - if (use_pde(pde)) { 457 - typeof_member(struct proc_ops, proc_get_unmapped_area) get_area; 458 - 459 - get_area = pde->proc_ops->proc_get_unmapped_area; 460 - #ifdef CONFIG_MMU 461 - if (!get_area) 462 - get_area = current->mm->get_unmapped_area; 463 - #endif 464 - 465 - if (get_area) 466 - rv = get_area(file, orig_addr, len, pgoff, flags); 467 - else 468 - rv = orig_addr; 378 + if (pde_is_permanent(pde)) { 379 + return pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); 380 + } else if (use_pde(pde)) { 381 + rv = pde_get_unmapped_area(pde, file, orig_addr, len, pgoff, flags); 469 382 
unuse_pde(pde); 470 383 } 471 384 return rv; ··· 469 400 typeof_member(struct proc_ops, proc_open) open; 470 401 typeof_member(struct proc_ops, proc_release) release; 471 402 struct pde_opener *pdeo; 403 + 404 + if (pde_is_permanent(pde)) { 405 + open = pde->proc_ops->proc_open; 406 + if (open) 407 + rv = open(inode, file); 408 + return rv; 409 + } 472 410 473 411 /* 474 412 * Ensure that ··· 526 450 { 527 451 struct proc_dir_entry *pde = PDE(inode); 528 452 struct pde_opener *pdeo; 453 + 454 + if (pde_is_permanent(pde)) { 455 + typeof_member(struct proc_ops, proc_release) release; 456 + 457 + release = pde->proc_ops->proc_release; 458 + if (release) { 459 + return release(inode, file); 460 + } 461 + return 0; 462 + } 463 + 529 464 spin_lock(&pde->pde_unload_lock); 530 465 list_for_each_entry(pdeo, &pde->pde_openers, lh) { 531 466 if (pdeo->file == file) {
+6
fs/proc/internal.h
··· 61 61 struct rb_node subdir_node; 62 62 char *name; 63 63 umode_t mode; 64 + u8 flags; 64 65 u8 namelen; 65 66 char inline_name[]; 66 67 } __randomize_layout; ··· 73 72 sizeof(struct proc_dir_entry) < 512 ? 512 : \ 74 73 0) 75 74 #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) 75 + 76 + static inline bool pde_is_permanent(const struct proc_dir_entry *pde) 77 + { 78 + return pde->flags & PROC_ENTRY_PERMANENT; 79 + } 76 80 77 81 extern struct kmem_cache *proc_dir_entry_cache; 78 82 void pde_free(struct proc_dir_entry *pde);
+1
fs/proc/kmsg.c
··· 50 50 51 51 52 52 static const struct proc_ops kmsg_proc_ops = { 53 + .proc_flags = PROC_ENTRY_PERMANENT, 53 54 .proc_read = kmsg_read, 54 55 .proc_poll = kmsg_poll, 55 56 .proc_open = kmsg_open,
+1
fs/proc/stat.c
··· 224 224 } 225 225 226 226 static const struct proc_ops stat_proc_ops = { 227 + .proc_flags = PROC_ENTRY_PERMANENT, 227 228 .proc_open = stat_open, 228 229 .proc_read = seq_read, 229 230 .proc_lseek = seq_lseek,
+16 -1
include/linux/proc_fs.h
··· 5 5 #ifndef _LINUX_PROC_FS_H 6 6 #define _LINUX_PROC_FS_H 7 7 8 + #include <linux/compiler.h> 8 9 #include <linux/types.h> 9 10 #include <linux/fs.h> 10 11 ··· 13 12 struct seq_file; 14 13 struct seq_operations; 15 14 15 + enum { 16 + /* 17 + * All /proc entries using this ->proc_ops instance are never removed. 18 + * 19 + * If in doubt, ignore this flag. 20 + */ 21 + #ifdef MODULE 22 + PROC_ENTRY_PERMANENT = 0U, 23 + #else 24 + PROC_ENTRY_PERMANENT = 1U << 0, 25 + #endif 26 + }; 27 + 16 28 struct proc_ops { 29 + unsigned int proc_flags; 17 30 int (*proc_open)(struct inode *, struct file *); 18 31 ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); 19 32 ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); ··· 40 25 #endif 41 26 int (*proc_mmap)(struct file *, struct vm_area_struct *); 42 27 unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 43 - }; 28 + } __randomize_layout; 44 29 45 30 #ifdef CONFIG_PROC_FS 46 31
+1
ipc/util.c
··· 885 885 } 886 886 887 887 static const struct proc_ops sysvipc_proc_ops = { 888 + .proc_flags = PROC_ENTRY_PERMANENT, 888 889 .proc_open = sysvipc_proc_open, 889 890 .proc_read = seq_read, 890 891 .proc_lseek = seq_lseek,
+1
kernel/module.c
··· 4355 4355 } 4356 4356 4357 4357 static const struct proc_ops modules_proc_ops = { 4358 + .proc_flags = PROC_ENTRY_PERMANENT, 4358 4359 .proc_open = modules_open, 4359 4360 .proc_read = seq_read, 4360 4361 .proc_lseek = seq_lseek,
+1
mm/slab_common.c
··· 1581 1581 } 1582 1582 1583 1583 static const struct proc_ops slabinfo_proc_ops = { 1584 + .proc_flags = PROC_ENTRY_PERMANENT, 1584 1585 .proc_open = slabinfo_open, 1585 1586 .proc_read = seq_read, 1586 1587 .proc_write = slabinfo_write,
+1
mm/swapfile.c
··· 2797 2797 } 2798 2798 2799 2799 static const struct proc_ops swaps_proc_ops = { 2800 + .proc_flags = PROC_ENTRY_PERMANENT, 2800 2801 .proc_open = swaps_open, 2801 2802 .proc_read = seq_read, 2802 2803 .proc_lseek = seq_lseek,