Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs file updates from Christian Brauner:
"This is the work to cleanup and shrink struct file significantly.

Right now, (focusing on x86) struct file is 232 bytes. After this
series struct file will be 184 bytes aka 3 cachelines and a spare 8
bytes for future extensions at the end of the struct.

With struct file being as ubiquitous as it is this should make a
difference for file heavy workloads and allow further optimizations in
the future.

- struct fown_struct was embedded into struct file letting it take up
32 bytes in total when really it shouldn't even be embedded in
struct file in the first place. Instead, actual users of struct
fown_struct now allocate the struct on demand. This frees up 24
bytes.

- Move struct file_ra_state into the union containing the cleanup hooks
and move f_iocb_flags out of the union. This closes a 4 byte hole
we created earlier and brings struct file to 192 bytes. Which means
struct file is 3 cachelines and we managed to shrink it by 40
bytes.

- Reorder struct file so that nothing crosses a cacheline.

I suspect that in the future we will end up reordering some members
to mitigate false sharing issues or just because someone does
actually provide really good perf data.

- Shrinking struct file to 192 bytes is only part of the work.

Files use a slab that is SLAB_TYPESAFE_BY_RCU and when a kmem cache
is created with SLAB_TYPESAFE_BY_RCU the free pointer must be
located outside of the object because the cache doesn't know what
part of the memory can safely be overwritten as it may be needed to
prevent object recycling.

That has the consequence that SLAB_TYPESAFE_BY_RCU may end up
adding a new cacheline.

So this also contains work to add a new kmem_cache_create_rcu()
function that allows the caller to specify an offset where the
freelist pointer is supposed to be placed. Thus avoiding the
implicit addition of a fourth cacheline.

- And finally this removes the f_version member in struct file.

The f_version member isn't particularly well-defined. It is mainly
used as a cookie to detect concurrent seeks when iterating
directories. But it is also abused by some subsystems for
completely unrelated things.

It is mostly a directory and filesystem specific thing that doesn't
really need to live in struct file and with its wonky semantics it
really lacks a specific function.

For pipes, f_version is (ab)used to defer poll notifications until
a write has happened. And struct pipe_inode_info is used by
multiple struct files in their ->private_data so there's no chance
of pushing that down into file->private_data without introducing
another pointer indirection.

But pipes don't rely on f_pos_lock so this adds a union into struct
file encompassing f_pos_lock and a pipe specific f_pipe member that
pipes can use. This union of course can be extended to other file
types and is similar to what we do in struct inode already"

* tag 'vfs-6.12.file' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: (26 commits)
fs: remove f_version
pipe: use f_pipe
fs: add f_pipe
ubifs: store cookie in private data
ufs: store cookie in private data
udf: store cookie in private data
proc: store cookie in private data
ocfs2: store cookie in private data
input: remove f_version abuse
ext4: store cookie in private data
ext2: store cookie in private data
affs: store cookie in private data
fs: add generic_llseek_cookie()
fs: use must_set_pos()
fs: add must_set_pos()
fs: add vfs_setpos_cookie()
s390: remove unused f_version
ceph: remove unused f_version
adi: remove unused f_version
mm: Removed @freeptr_offset to prevent doc warning
...

+754 -281
-1
drivers/char/adi.c
··· 190 190 191 191 if (offset != file->f_pos) { 192 192 file->f_pos = offset; 193 - file->f_version = 0; 194 193 ret = offset; 195 194 } 196 195
+22 -25
drivers/input/input.c
··· 1079 1079 wake_up(&input_devices_poll_wait); 1080 1080 } 1081 1081 1082 + struct input_seq_state { 1083 + unsigned short pos; 1084 + bool mutex_acquired; 1085 + int input_devices_state; 1086 + }; 1087 + 1082 1088 static __poll_t input_proc_devices_poll(struct file *file, poll_table *wait) 1083 1089 { 1090 + struct seq_file *seq = file->private_data; 1091 + struct input_seq_state *state = seq->private; 1092 + 1084 1093 poll_wait(file, &input_devices_poll_wait, wait); 1085 - if (file->f_version != input_devices_state) { 1086 - file->f_version = input_devices_state; 1094 + if (state->input_devices_state != input_devices_state) { 1095 + state->input_devices_state = input_devices_state; 1087 1096 return EPOLLIN | EPOLLRDNORM; 1088 1097 } 1089 1098 1090 1099 return 0; 1091 1100 } 1092 1101 1093 - union input_seq_state { 1094 - struct { 1095 - unsigned short pos; 1096 - bool mutex_acquired; 1097 - }; 1098 - void *p; 1099 - }; 1100 - 1101 1102 static void *input_devices_seq_start(struct seq_file *seq, loff_t *pos) 1102 1103 { 1103 - union input_seq_state *state = (union input_seq_state *)&seq->private; 1104 + struct input_seq_state *state = seq->private; 1104 1105 int error; 1105 - 1106 - /* We need to fit into seq->private pointer */ 1107 - BUILD_BUG_ON(sizeof(union input_seq_state) != sizeof(seq->private)); 1108 1106 1109 1107 error = mutex_lock_interruptible(&input_mutex); 1110 1108 if (error) { ··· 1122 1124 1123 1125 static void input_seq_stop(struct seq_file *seq, void *v) 1124 1126 { 1125 - union input_seq_state *state = (union input_seq_state *)&seq->private; 1127 + struct input_seq_state *state = seq->private; 1126 1128 1127 1129 if (state->mutex_acquired) 1128 1130 mutex_unlock(&input_mutex); ··· 1208 1210 1209 1211 static int input_proc_devices_open(struct inode *inode, struct file *file) 1210 1212 { 1211 - return seq_open(file, &input_devices_seq_ops); 1213 + return seq_open_private(file, &input_devices_seq_ops, 1214 + sizeof(struct input_seq_state)); 1212 
1215 } 1213 1216 1214 1217 static const struct proc_ops input_devices_proc_ops = { ··· 1217 1218 .proc_poll = input_proc_devices_poll, 1218 1219 .proc_read = seq_read, 1219 1220 .proc_lseek = seq_lseek, 1220 - .proc_release = seq_release, 1221 + .proc_release = seq_release_private, 1221 1222 }; 1222 1223 1223 1224 static void *input_handlers_seq_start(struct seq_file *seq, loff_t *pos) 1224 1225 { 1225 - union input_seq_state *state = (union input_seq_state *)&seq->private; 1226 + struct input_seq_state *state = seq->private; 1226 1227 int error; 1227 - 1228 - /* We need to fit into seq->private pointer */ 1229 - BUILD_BUG_ON(sizeof(union input_seq_state) != sizeof(seq->private)); 1230 1228 1231 1229 error = mutex_lock_interruptible(&input_mutex); 1232 1230 if (error) { ··· 1239 1243 1240 1244 static void *input_handlers_seq_next(struct seq_file *seq, void *v, loff_t *pos) 1241 1245 { 1242 - union input_seq_state *state = (union input_seq_state *)&seq->private; 1246 + struct input_seq_state *state = seq->private; 1243 1247 1244 1248 state->pos = *pos + 1; 1245 1249 return seq_list_next(v, &input_handler_list, pos); ··· 1248 1252 static int input_handlers_seq_show(struct seq_file *seq, void *v) 1249 1253 { 1250 1254 struct input_handler *handler = container_of(v, struct input_handler, node); 1251 - union input_seq_state *state = (union input_seq_state *)&seq->private; 1255 + struct input_seq_state *state = seq->private; 1252 1256 1253 1257 seq_printf(seq, "N: Number=%u Name=%s", state->pos, handler->name); 1254 1258 if (handler->filter) ··· 1269 1273 1270 1274 static int input_proc_handlers_open(struct inode *inode, struct file *file) 1271 1275 { 1272 - return seq_open(file, &input_handlers_seq_ops); 1276 + return seq_open_private(file, &input_handlers_seq_ops, 1277 + sizeof(struct input_seq_state)); 1273 1278 } 1274 1279 1275 1280 static const struct proc_ops input_handlers_proc_ops = { 1276 1281 .proc_open = input_proc_handlers_open, 1277 1282 .proc_read = 
seq_read, 1278 1283 .proc_lseek = seq_lseek, 1279 - .proc_release = seq_release, 1284 + .proc_release = seq_release_private, 1280 1285 }; 1281 1286 1282 1287 static int __init input_proc_init(void)
+6
drivers/net/tun.c
··· 3452 3452 struct tun_file *tfile = file->private_data; 3453 3453 int ret; 3454 3454 3455 + if (on) { 3456 + ret = file_f_owner_allocate(file); 3457 + if (ret) 3458 + goto out; 3459 + } 3460 + 3455 3461 if ((ret = fasync_helper(fd, file, on, &tfile->fasync)) < 0) 3456 3462 goto out; 3457 3463
-3
drivers/s390/char/hmcdrv_dev.c
··· 186 186 if (pos < 0) 187 187 return -EINVAL; 188 188 189 - if (fp->f_pos != pos) 190 - ++fp->f_version; 191 - 192 189 fp->f_pos = pos; 193 190 return pos; 194 191 }
+6
drivers/tty/tty_io.c
··· 2225 2225 if (tty_paranoia_check(tty, file_inode(filp), "tty_fasync")) 2226 2226 goto out; 2227 2227 2228 + if (on) { 2229 + retval = file_f_owner_allocate(filp); 2230 + if (retval) 2231 + goto out; 2232 + } 2233 + 2228 2234 retval = fasync_helper(fd, filp, on, &tty->fasync); 2229 2235 if (retval <= 0) 2230 2236 goto out;
+38 -6
fs/affs/dir.c
··· 17 17 #include <linux/iversion.h> 18 18 #include "affs.h" 19 19 20 + struct affs_dir_data { 21 + unsigned long ino; 22 + u64 cookie; 23 + }; 24 + 20 25 static int affs_readdir(struct file *, struct dir_context *); 21 26 27 + static loff_t affs_dir_llseek(struct file *file, loff_t offset, int whence) 28 + { 29 + struct affs_dir_data *data = file->private_data; 30 + 31 + return generic_llseek_cookie(file, offset, whence, &data->cookie); 32 + } 33 + 34 + static int affs_dir_open(struct inode *inode, struct file *file) 35 + { 36 + struct affs_dir_data *data; 37 + 38 + data = kzalloc(sizeof(struct affs_dir_data), GFP_KERNEL); 39 + if (!data) 40 + return -ENOMEM; 41 + file->private_data = data; 42 + return 0; 43 + } 44 + 45 + static int affs_dir_release(struct inode *inode, struct file *file) 46 + { 47 + kfree(file->private_data); 48 + return 0; 49 + } 50 + 22 51 const struct file_operations affs_dir_operations = { 52 + .open = affs_dir_open, 23 53 .read = generic_read_dir, 24 - .llseek = generic_file_llseek, 54 + .llseek = affs_dir_llseek, 25 55 .iterate_shared = affs_readdir, 26 56 .fsync = affs_file_fsync, 57 + .release = affs_dir_release, 27 58 }; 28 59 29 60 /* ··· 76 45 affs_readdir(struct file *file, struct dir_context *ctx) 77 46 { 78 47 struct inode *inode = file_inode(file); 48 + struct affs_dir_data *data = file->private_data; 79 49 struct super_block *sb = inode->i_sb; 80 50 struct buffer_head *dir_bh = NULL; 81 51 struct buffer_head *fh_bh = NULL; ··· 91 59 pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos); 92 60 93 61 if (ctx->pos < 2) { 94 - file->private_data = (void *)0; 62 + data->ino = 0; 95 63 if (!dir_emit_dots(file, ctx)) 96 64 return 0; 97 65 } ··· 112 80 /* If the directory hasn't changed since the last call to readdir(), 113 81 * we can jump directly to where we left off. 
114 82 */ 115 - ino = (u32)(long)file->private_data; 116 - if (ino && inode_eq_iversion(inode, file->f_version)) { 83 + ino = data->ino; 84 + if (ino && inode_eq_iversion(inode, data->cookie)) { 117 85 pr_debug("readdir() left off=%d\n", ino); 118 86 goto inside; 119 87 } ··· 163 131 } while (ino); 164 132 } 165 133 done: 166 - file->f_version = inode_query_iversion(inode); 167 - file->private_data = (void *)(long)ino; 134 + data->cookie = inode_query_iversion(inode); 135 + data->ino = ino; 168 136 affs_brelse(fh_bh); 169 137 170 138 out_brelse_dir:
-1
fs/ceph/dir.c
··· 707 707 708 708 if (offset != file->f_pos) { 709 709 file->f_pos = offset; 710 - file->f_version = 0; 711 710 dfi->file_info.flags &= ~CEPH_F_ATEND; 712 711 } 713 712 retval = offset;
+25 -3
fs/ext2/dir.c
··· 263 263 unsigned long n = pos >> PAGE_SHIFT; 264 264 unsigned long npages = dir_pages(inode); 265 265 unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); 266 - bool need_revalidate = !inode_eq_iversion(inode, file->f_version); 266 + bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data); 267 267 bool has_filetype; 268 268 269 269 if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) ··· 290 290 offset = ext2_validate_entry(kaddr, offset, chunk_mask); 291 291 ctx->pos = (n<<PAGE_SHIFT) + offset; 292 292 } 293 - file->f_version = inode_query_iversion(inode); 293 + *(u64 *)file->private_data = inode_query_iversion(inode); 294 294 need_revalidate = false; 295 295 } 296 296 de = (ext2_dirent *)(kaddr+offset); ··· 703 703 return 0; 704 704 } 705 705 706 + static int ext2_dir_open(struct inode *inode, struct file *file) 707 + { 708 + file->private_data = kzalloc(sizeof(u64), GFP_KERNEL); 709 + if (!file->private_data) 710 + return -ENOMEM; 711 + return 0; 712 + } 713 + 714 + static int ext2_dir_release(struct inode *inode, struct file *file) 715 + { 716 + kfree(file->private_data); 717 + return 0; 718 + } 719 + 720 + static loff_t ext2_dir_llseek(struct file *file, loff_t offset, int whence) 721 + { 722 + return generic_llseek_cookie(file, offset, whence, 723 + (u64 *)file->private_data); 724 + } 725 + 706 726 const struct file_operations ext2_dir_operations = { 707 - .llseek = generic_file_llseek, 727 + .open = ext2_dir_open, 728 + .release = ext2_dir_release, 729 + .llseek = ext2_dir_llseek, 708 730 .read = generic_read_dir, 709 731 .iterate_shared = ext2_readdir, 710 732 .unlocked_ioctl = ext2_ioctl,
+27 -21
fs/ext4/dir.c
··· 133 133 struct super_block *sb = inode->i_sb; 134 134 struct buffer_head *bh = NULL; 135 135 struct fscrypt_str fstr = FSTR_INIT(NULL, 0); 136 + struct dir_private_info *info = file->private_data; 136 137 137 138 err = fscrypt_prepare_readdir(inode); 138 139 if (err) ··· 230 229 * readdir(2), then we might be pointing to an invalid 231 230 * dirent right now. Scan from the start of the block 232 231 * to make sure. */ 233 - if (!inode_eq_iversion(inode, file->f_version)) { 232 + if (!inode_eq_iversion(inode, info->cookie)) { 234 233 for (i = 0; i < sb->s_blocksize && i < offset; ) { 235 234 de = (struct ext4_dir_entry_2 *) 236 235 (bh->b_data + i); ··· 250 249 offset = i; 251 250 ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1)) 252 251 | offset; 253 - file->f_version = inode_query_iversion(inode); 252 + info->cookie = inode_query_iversion(inode); 254 253 } 255 254 256 255 while (ctx->pos < inode->i_size ··· 385 384 static loff_t ext4_dir_llseek(struct file *file, loff_t offset, int whence) 386 385 { 387 386 struct inode *inode = file->f_mapping->host; 387 + struct dir_private_info *info = file->private_data; 388 388 int dx_dir = is_dx_dir(inode); 389 389 loff_t ret, htree_max = ext4_get_htree_eof(file); 390 390 ··· 394 392 htree_max, htree_max); 395 393 else 396 394 ret = ext4_llseek(file, offset, whence); 397 - file->f_version = inode_peek_iversion(inode) - 1; 395 + info->cookie = inode_peek_iversion(inode) - 1; 398 396 return ret; 399 397 } 400 398 ··· 431 429 *root = RB_ROOT; 432 430 } 433 431 434 - 435 - static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, 436 - loff_t pos) 432 + static void ext4_htree_init_dir_info(struct file *filp, loff_t pos) 437 433 { 438 - struct dir_private_info *p; 434 + struct dir_private_info *p = filp->private_data; 439 435 440 - p = kzalloc(sizeof(*p), GFP_KERNEL); 441 - if (!p) 442 - return NULL; 443 - p->curr_hash = pos2maj_hash(filp, pos); 444 - p->curr_minor_hash = pos2min_hash(filp, pos); 445 - return 
p; 436 + if (is_dx_dir(file_inode(filp)) && !p->initialized) { 437 + p->curr_hash = pos2maj_hash(filp, pos); 438 + p->curr_minor_hash = pos2min_hash(filp, pos); 439 + p->initialized = true; 440 + } 446 441 } 447 442 448 443 void ext4_htree_free_dir_info(struct dir_private_info *p) ··· 551 552 struct fname *fname; 552 553 int ret = 0; 553 554 554 - if (!info) { 555 - info = ext4_htree_create_dir_info(file, ctx->pos); 556 - if (!info) 557 - return -ENOMEM; 558 - file->private_data = info; 559 - } 555 + ext4_htree_init_dir_info(file, ctx->pos); 560 556 561 557 if (ctx->pos == ext4_get_htree_eof(file)) 562 558 return 0; /* EOF */ ··· 584 590 * cached entries. 585 591 */ 586 592 if ((!info->curr_node) || 587 - !inode_eq_iversion(inode, file->f_version)) { 593 + !inode_eq_iversion(inode, info->cookie)) { 588 594 info->curr_node = NULL; 589 595 free_rb_tree_fname(&info->root); 590 - file->f_version = inode_query_iversion(inode); 596 + info->cookie = inode_query_iversion(inode); 591 597 ret = ext4_htree_fill_tree(file, info->curr_hash, 592 598 info->curr_minor_hash, 593 599 &info->next_hash); ··· 658 664 return 0; 659 665 } 660 666 667 + static int ext4_dir_open(struct inode *inode, struct file *file) 668 + { 669 + struct dir_private_info *info; 670 + 671 + info = kzalloc(sizeof(*info), GFP_KERNEL); 672 + if (!info) 673 + return -ENOMEM; 674 + file->private_data = info; 675 + return 0; 676 + } 677 + 661 678 const struct file_operations ext4_dir_operations = { 679 + .open = ext4_dir_open, 662 680 .llseek = ext4_dir_llseek, 663 681 .read = generic_read_dir, 664 682 .iterate_shared = ext4_readdir,
+2
fs/ext4/ext4.h
··· 2553 2553 __u32 curr_hash; 2554 2554 __u32 curr_minor_hash; 2555 2555 __u32 next_hash; 2556 + u64 cookie; 2557 + bool initialized; 2556 2558 }; 2557 2559 2558 2560 /* calculate the first block number of the group */
+4 -3
fs/ext4/inline.c
··· 1460 1460 struct ext4_iloc iloc; 1461 1461 void *dir_buf = NULL; 1462 1462 int dotdot_offset, dotdot_size, extra_offset, extra_size; 1463 + struct dir_private_info *info = file->private_data; 1463 1464 1464 1465 ret = ext4_get_inode_loc(inode, &iloc); 1465 1466 if (ret) ··· 1504 1503 extra_size = extra_offset + inline_size; 1505 1504 1506 1505 /* 1507 - * If the version has changed since the last call to 1506 + * If the cookie has changed since the last call to 1508 1507 * readdir(2), then we might be pointing to an invalid 1509 1508 * dirent right now. Scan from the start of the inline 1510 1509 * dir to make sure. 1511 1510 */ 1512 - if (!inode_eq_iversion(inode, file->f_version)) { 1511 + if (!inode_eq_iversion(inode, info->cookie)) { 1513 1512 for (i = 0; i < extra_size && i < offset;) { 1514 1513 /* 1515 1514 * "." is with offset 0 and ··· 1541 1540 } 1542 1541 offset = i; 1543 1542 ctx->pos = offset; 1544 - file->f_version = inode_query_iversion(inode); 1543 + info->cookie = inode_query_iversion(inode); 1545 1544 } 1546 1545 1547 1546 while (ctx->pos < extra_size) {
+132 -34
fs/fcntl.c
··· 33 33 #include <asm/siginfo.h> 34 34 #include <linux/uaccess.h> 35 35 36 + #include "internal.h" 37 + 36 38 #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) 37 39 38 40 static int setfl(int fd, struct file * filp, unsigned int arg) ··· 89 87 return error; 90 88 } 91 89 90 + /* 91 + * Allocate an file->f_owner struct if it doesn't exist, handling racing 92 + * allocations correctly. 93 + */ 94 + int file_f_owner_allocate(struct file *file) 95 + { 96 + struct fown_struct *f_owner; 97 + 98 + f_owner = file_f_owner(file); 99 + if (f_owner) 100 + return 0; 101 + 102 + f_owner = kzalloc(sizeof(struct fown_struct), GFP_KERNEL); 103 + if (!f_owner) 104 + return -ENOMEM; 105 + 106 + rwlock_init(&f_owner->lock); 107 + f_owner->file = file; 108 + /* If someone else raced us, drop our allocation. */ 109 + if (unlikely(cmpxchg(&file->f_owner, NULL, f_owner))) 110 + kfree(f_owner); 111 + return 0; 112 + } 113 + EXPORT_SYMBOL(file_f_owner_allocate); 114 + 115 + void file_f_owner_release(struct file *file) 116 + { 117 + struct fown_struct *f_owner; 118 + 119 + f_owner = file_f_owner(file); 120 + if (f_owner) { 121 + put_pid(f_owner->pid); 122 + kfree(f_owner); 123 + } 124 + } 125 + 92 126 static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, 93 127 int force) 94 128 { 95 - write_lock_irq(&filp->f_owner.lock); 96 - if (force || !filp->f_owner.pid) { 97 - put_pid(filp->f_owner.pid); 98 - filp->f_owner.pid = get_pid(pid); 99 - filp->f_owner.pid_type = type; 129 + struct fown_struct *f_owner; 130 + 131 + f_owner = file_f_owner(filp); 132 + if (WARN_ON_ONCE(!f_owner)) 133 + return; 134 + 135 + write_lock_irq(&f_owner->lock); 136 + if (force || !f_owner->pid) { 137 + put_pid(f_owner->pid); 138 + f_owner->pid = get_pid(pid); 139 + f_owner->pid_type = type; 100 140 101 141 if (pid) { 102 142 const struct cred *cred = current_cred(); 103 - filp->f_owner.uid = cred->uid; 104 - filp->f_owner.euid = cred->euid; 143 + f_owner->uid = 
cred->uid; 144 + f_owner->euid = cred->euid; 105 145 } 106 146 } 107 - write_unlock_irq(&filp->f_owner.lock); 147 + write_unlock_irq(&f_owner->lock); 108 148 } 109 149 110 150 void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, ··· 163 119 struct pid *pid = NULL; 164 120 int ret = 0; 165 121 122 + might_sleep(); 123 + 166 124 type = PIDTYPE_TGID; 167 125 if (who < 0) { 168 126 /* avoid overflow below */ ··· 174 128 type = PIDTYPE_PGID; 175 129 who = -who; 176 130 } 131 + 132 + ret = file_f_owner_allocate(filp); 133 + if (ret) 134 + return ret; 177 135 178 136 rcu_read_lock(); 179 137 if (who) { ··· 202 152 pid_t f_getown(struct file *filp) 203 153 { 204 154 pid_t pid = 0; 155 + struct fown_struct *f_owner; 205 156 206 - read_lock_irq(&filp->f_owner.lock); 157 + f_owner = file_f_owner(filp); 158 + if (!f_owner) 159 + return pid; 160 + 161 + read_lock_irq(&f_owner->lock); 207 162 rcu_read_lock(); 208 - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { 209 - pid = pid_vnr(filp->f_owner.pid); 210 - if (filp->f_owner.pid_type == PIDTYPE_PGID) 163 + if (pid_task(f_owner->pid, f_owner->pid_type)) { 164 + pid = pid_vnr(f_owner->pid); 165 + if (f_owner->pid_type == PIDTYPE_PGID) 211 166 pid = -pid; 212 167 } 213 168 rcu_read_unlock(); 214 - read_unlock_irq(&filp->f_owner.lock); 169 + read_unlock_irq(&f_owner->lock); 215 170 return pid; 216 171 } 217 172 ··· 249 194 return -EINVAL; 250 195 } 251 196 197 + ret = file_f_owner_allocate(filp); 198 + if (ret) 199 + return ret; 200 + 252 201 rcu_read_lock(); 253 202 pid = find_vpid(owner.pid); 254 203 if (owner.pid && !pid) ··· 269 210 struct f_owner_ex __user *owner_p = (void __user *)arg; 270 211 struct f_owner_ex owner = {}; 271 212 int ret = 0; 213 + struct fown_struct *f_owner; 214 + enum pid_type pid_type = PIDTYPE_PID; 272 215 273 - read_lock_irq(&filp->f_owner.lock); 274 - rcu_read_lock(); 275 - if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) 276 - owner.pid = 
pid_vnr(filp->f_owner.pid); 277 - rcu_read_unlock(); 278 - switch (filp->f_owner.pid_type) { 216 + f_owner = file_f_owner(filp); 217 + if (f_owner) { 218 + read_lock_irq(&f_owner->lock); 219 + rcu_read_lock(); 220 + if (pid_task(f_owner->pid, f_owner->pid_type)) 221 + owner.pid = pid_vnr(f_owner->pid); 222 + rcu_read_unlock(); 223 + pid_type = f_owner->pid_type; 224 + } 225 + 226 + switch (pid_type) { 279 227 case PIDTYPE_PID: 280 228 owner.type = F_OWNER_TID; 281 229 break; ··· 300 234 ret = -EINVAL; 301 235 break; 302 236 } 303 - read_unlock_irq(&filp->f_owner.lock); 237 + if (f_owner) 238 + read_unlock_irq(&f_owner->lock); 304 239 305 240 if (!ret) { 306 241 ret = copy_to_user(owner_p, &owner, sizeof(owner)); ··· 315 248 static int f_getowner_uids(struct file *filp, unsigned long arg) 316 249 { 317 250 struct user_namespace *user_ns = current_user_ns(); 251 + struct fown_struct *f_owner; 318 252 uid_t __user *dst = (void __user *)arg; 319 - uid_t src[2]; 253 + uid_t src[2] = {0, 0}; 320 254 int err; 321 255 322 - read_lock_irq(&filp->f_owner.lock); 323 - src[0] = from_kuid(user_ns, filp->f_owner.uid); 324 - src[1] = from_kuid(user_ns, filp->f_owner.euid); 325 - read_unlock_irq(&filp->f_owner.lock); 256 + f_owner = file_f_owner(filp); 257 + if (f_owner) { 258 + read_lock_irq(&f_owner->lock); 259 + src[0] = from_kuid(user_ns, f_owner->uid); 260 + src[1] = from_kuid(user_ns, f_owner->euid); 261 + read_unlock_irq(&f_owner->lock); 262 + } 326 263 327 264 err = put_user(src[0], &dst[0]); 328 265 err |= put_user(src[1], &dst[1]); ··· 420 349 return !!(filp->f_mode & FMODE_CREATED); 421 350 } 422 351 352 + static int f_owner_sig(struct file *filp, int signum, bool setsig) 353 + { 354 + int ret = 0; 355 + struct fown_struct *f_owner; 356 + 357 + might_sleep(); 358 + 359 + if (setsig) { 360 + if (!valid_signal(signum)) 361 + return -EINVAL; 362 + 363 + ret = file_f_owner_allocate(filp); 364 + if (ret) 365 + return ret; 366 + } 367 + 368 + f_owner = file_f_owner(filp); 369 
+ if (setsig) 370 + f_owner->signum = signum; 371 + else if (f_owner) 372 + ret = f_owner->signum; 373 + return ret; 374 + } 375 + 423 376 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, 424 377 struct file *filp) 425 378 { ··· 525 430 err = f_getowner_uids(filp, arg); 526 431 break; 527 432 case F_GETSIG: 528 - err = filp->f_owner.signum; 433 + err = f_owner_sig(filp, 0, false); 529 434 break; 530 435 case F_SETSIG: 531 - /* arg == 0 restores default behaviour. */ 532 - if (!valid_signal(argi)) { 533 - break; 534 - } 535 - err = 0; 536 - filp->f_owner.signum = argi; 436 + err = f_owner_sig(filp, argi, true); 537 437 break; 538 438 case F_GETLEASE: 539 439 err = fcntl_getlease(filp); ··· 944 854 do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type); 945 855 } 946 856 947 - int send_sigurg(struct fown_struct *fown) 857 + int send_sigurg(struct file *file) 948 858 { 859 + struct fown_struct *fown; 949 860 struct task_struct *p; 950 861 enum pid_type type; 951 862 struct pid *pid; 952 863 unsigned long flags; 953 864 int ret = 0; 954 865 866 + fown = file_f_owner(file); 867 + if (!fown) 868 + return 0; 869 + 955 870 read_lock_irqsave(&fown->lock, flags); 956 871 957 872 type = fown->pid_type; ··· 1132 1037 } 1133 1038 read_lock_irqsave(&fa->fa_lock, flags); 1134 1039 if (fa->fa_file) { 1135 - fown = &fa->fa_file->f_owner; 1040 + fown = file_f_owner(fa->fa_file); 1041 + if (!fown) 1042 + goto next; 1136 1043 /* Don't send SIGURG to processes which have not set a 1137 1044 queued signum: SIGURG has its own default signalling 1138 1045 mechanism. */ 1139 1046 if (!(sig == SIGURG && fown->signum == 0)) 1140 1047 send_sigio(fown, fa->fa_fd, band); 1141 1048 } 1049 + next: 1142 1050 read_unlock_irqrestore(&fa->fa_lock, flags); 1143 1051 fa = rcu_dereference(fa->fa_next); 1144 1052 }
+11 -5
fs/file_table.c
··· 156 156 return error; 157 157 } 158 158 159 - rwlock_init(&f->f_owner.lock); 160 159 spin_lock_init(&f->f_lock); 160 + /* 161 + * Note that f_pos_lock is only used for files raising 162 + * FMODE_ATOMIC_POS and directories. Other files such as pipes 163 + * don't need it and since f_pos_lock is in a union may reuse 164 + * the space for other purposes. They are expected to initialize 165 + * the respective member when opening the file. 166 + */ 161 167 mutex_init(&f->f_pos_lock); 162 168 f->f_flags = flags; 163 169 f->f_mode = OPEN_FMODE(flags); ··· 434 428 cdev_put(inode->i_cdev); 435 429 } 436 430 fops_put(file->f_op); 437 - put_pid(file->f_owner.pid); 431 + file_f_owner_release(file); 438 432 put_file_access(file); 439 433 dput(dentry); 440 434 if (unlikely(mode & FMODE_NEED_UNMOUNT)) ··· 521 515 522 516 void __init files_init(void) 523 517 { 524 - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, 525 - SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | 526 - SLAB_PANIC | SLAB_ACCOUNT, NULL); 518 + filp_cachep = kmem_cache_create_rcu("filp", sizeof(struct file), 519 + offsetof(struct file, f_freeptr), 520 + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); 527 521 percpu_counter_init(&nr_files, 0, GFP_KERNEL); 528 522 } 529 523
+1
fs/internal.h
··· 337 337 { 338 338 return path->mnt->mnt_root == path->dentry; 339 339 } 340 + void file_f_owner_release(struct file *file);
+5 -1
fs/locks.c
··· 1451 1451 struct file *filp = fl->c.flc_file; 1452 1452 1453 1453 f_delown(filp); 1454 - filp->f_owner.signum = 0; 1454 + file_f_owner(filp)->signum = 0; 1455 1455 fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync); 1456 1456 if (fl->fl_fasync != NULL) { 1457 1457 printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); ··· 1782 1782 1783 1783 lease = *flp; 1784 1784 trace_generic_add_lease(inode, lease); 1785 + 1786 + error = file_f_owner_allocate(filp); 1787 + if (error) 1788 + return error; 1785 1789 1786 1790 /* Note that arg is never F_UNLCK here */ 1787 1791 ctx = locks_get_lock_context(inode, arg);
+5 -1
fs/notify/dnotify/dnotify.c
··· 110 110 prev = &dn->dn_next; 111 111 continue; 112 112 } 113 - fown = &dn->dn_filp->f_owner; 113 + fown = file_f_owner(dn->dn_filp); 114 114 send_sigio(fown, dn->dn_fd, POLL_MSG); 115 115 if (dn->dn_mask & FS_DN_MULTISHOT) 116 116 prev = &dn->dn_next; ··· 315 315 error = -ENOMEM; 316 316 goto out_err; 317 317 } 318 + 319 + error = file_f_owner_allocate(filp); 320 + if (error) 321 + goto out_err; 318 322 319 323 /* set up the new_fsn_mark and new_dn_mark */ 320 324 new_fsn_mark = &new_dn_mark->fsn_mark;
+2 -1
fs/ocfs2/dir.c
··· 1932 1932 { 1933 1933 int error = 0; 1934 1934 struct inode *inode = file_inode(file); 1935 + struct ocfs2_file_private *fp = file->private_data; 1935 1936 int lock_level = 0; 1936 1937 1937 1938 trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); ··· 1953 1952 goto bail_nolock; 1954 1953 } 1955 1954 1956 - error = ocfs2_dir_foreach_blk(inode, &file->f_version, ctx, false); 1955 + error = ocfs2_dir_foreach_blk(inode, &fp->cookie, ctx, false); 1957 1956 1958 1957 ocfs2_inode_unlock(inode, lock_level); 1959 1958 if (error)
+9 -2
fs/ocfs2/file.c
··· 2751 2751 return remapped > 0 ? remapped : ret; 2752 2752 } 2753 2753 2754 + static loff_t ocfs2_dir_llseek(struct file *file, loff_t offset, int whence) 2755 + { 2756 + struct ocfs2_file_private *fp = file->private_data; 2757 + 2758 + return generic_llseek_cookie(file, offset, whence, &fp->cookie); 2759 + } 2760 + 2754 2761 const struct inode_operations ocfs2_file_iops = { 2755 2762 .setattr = ocfs2_setattr, 2756 2763 .getattr = ocfs2_getattr, ··· 2805 2798 2806 2799 WRAP_DIR_ITER(ocfs2_readdir) // FIXME! 2807 2800 const struct file_operations ocfs2_dops = { 2808 - .llseek = generic_file_llseek, 2801 + .llseek = ocfs2_dir_llseek, 2809 2802 .read = generic_read_dir, 2810 2803 .iterate_shared = shared_ocfs2_readdir, 2811 2804 .fsync = ocfs2_sync_file, ··· 2851 2844 }; 2852 2845 2853 2846 const struct file_operations ocfs2_dops_no_plocks = { 2854 - .llseek = generic_file_llseek, 2847 + .llseek = ocfs2_dir_llseek, 2855 2848 .read = generic_read_dir, 2856 2849 .iterate_shared = shared_ocfs2_readdir, 2857 2850 .fsync = ocfs2_sync_file,
+1
fs/ocfs2/file.h
··· 20 20 enum ocfs2_alloc_restarted; 21 21 22 22 struct ocfs2_file_private { 23 + u64 cookie; 23 24 struct file *fp_file; 24 25 struct mutex fp_mutex; 25 26 struct ocfs2_lock_res fp_flock;
+5 -3
fs/pipe.c
··· 686 686 if (filp->f_mode & FMODE_READ) { 687 687 if (!pipe_empty(head, tail)) 688 688 mask |= EPOLLIN | EPOLLRDNORM; 689 - if (!pipe->writers && filp->f_version != pipe->w_counter) 689 + if (!pipe->writers && filp->f_pipe != pipe->w_counter) 690 690 mask |= EPOLLHUP; 691 691 } 692 692 ··· 945 945 } 946 946 947 947 f->private_data = inode->i_pipe; 948 + f->f_pipe = 0; 948 949 949 950 res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), 950 951 &pipefifo_fops); ··· 955 954 return PTR_ERR(res[0]); 956 955 } 957 956 res[0]->private_data = inode->i_pipe; 957 + res[0]->f_pipe = 0; 958 958 res[1] = f; 959 959 stream_open(inode, res[0]); 960 960 stream_open(inode, res[1]); ··· 1110 1108 bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; 1111 1109 int ret; 1112 1110 1113 - filp->f_version = 0; 1111 + filp->f_pipe = 0; 1114 1112 1115 1113 spin_lock(&inode->i_lock); 1116 1114 if (inode->i_pipe) { ··· 1157 1155 if ((filp->f_flags & O_NONBLOCK)) { 1158 1156 /* suppress EPOLLHUP until we have 1159 1157 * seen a writer */ 1160 - filp->f_version = pipe->w_counter; 1158 + filp->f_pipe = pipe->w_counter; 1161 1159 } else { 1162 1160 if (wait_for_partner(pipe, &pipe->w_counter)) 1163 1161 goto err_rd;
+24 -6
fs/proc/base.c
··· 3868 3868 if (!dir_emit_dots(file, ctx)) 3869 3869 return 0; 3870 3870 3871 - /* f_version caches the tgid value that the last readdir call couldn't 3872 - * return. lseek aka telldir automagically resets f_version to 0. 3871 + /* We cache the tgid value that the last readdir call couldn't 3872 + * return and lseek resets it to 0. 3873 3873 */ 3874 3874 ns = proc_pid_ns(inode->i_sb); 3875 - tid = (int)file->f_version; 3876 - file->f_version = 0; 3875 + tid = (int)(intptr_t)file->private_data; 3876 + file->private_data = NULL; 3877 3877 for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); 3878 3878 task; 3879 3879 task = next_tid(task), ctx->pos++) { ··· 3888 3888 proc_task_instantiate, task, NULL)) { 3889 3889 /* returning this tgid failed, save it as the first 3890 3890 * pid for the next readir call */ 3891 - file->f_version = (u64)tid; 3891 + file->private_data = (void *)(intptr_t)tid; 3892 3892 put_task_struct(task); 3893 3893 break; 3894 3894 } ··· 3913 3913 return 0; 3914 3914 } 3915 3915 3916 + /* 3917 + * proc_task_readdir() set @file->private_data to a positive integer 3918 + * value, so casting that to u64 is safe. generic_llseek_cookie() will 3919 + * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is 3920 + * here to catch any unexpected change in behavior either in 3921 + * proc_task_readdir() or generic_llseek_cookie(). 
3922 + */ 3923 + static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence) 3924 + { 3925 + u64 cookie = (u64)(intptr_t)file->private_data; 3926 + loff_t off; 3927 + 3928 + off = generic_llseek_cookie(file, offset, whence, &cookie); 3929 + WARN_ON_ONCE(cookie > INT_MAX); 3930 + file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */ 3931 + return off; 3932 + } 3933 + 3916 3934 static const struct inode_operations proc_task_inode_operations = { 3917 3935 .lookup = proc_task_lookup, 3918 3936 .getattr = proc_task_getattr, ··· 3941 3923 static const struct file_operations proc_task_operations = { 3942 3924 .read = generic_read_dir, 3943 3925 .iterate_shared = proc_task_readdir, 3944 - .llseek = generic_file_llseek, 3926 + .llseek = proc_dir_llseek, 3945 3927 }; 3946 3928 3947 3929 void __init set_proc_pid_nlink(void)
+142 -49
fs/read_write.c
··· 40 40 } 41 41 42 42 /** 43 + * vfs_setpos_cookie - update the file offset for lseek and reset cookie 44 + * @file: file structure in question 45 + * @offset: file offset to seek to 46 + * @maxsize: maximum file size 47 + * @cookie: cookie to reset 48 + * 49 + * Update the file offset to the value specified by @offset if the given 50 + * offset is valid and it is not equal to the current file offset and 51 + * reset the specified cookie to indicate that a seek happened. 52 + * 53 + * Return the specified offset on success and -EINVAL on invalid offset. 54 + */ 55 + static loff_t vfs_setpos_cookie(struct file *file, loff_t offset, 56 + loff_t maxsize, u64 *cookie) 57 + { 58 + if (offset < 0 && !unsigned_offsets(file)) 59 + return -EINVAL; 60 + if (offset > maxsize) 61 + return -EINVAL; 62 + 63 + if (offset != file->f_pos) { 64 + file->f_pos = offset; 65 + if (cookie) 66 + *cookie = 0; 67 + } 68 + return offset; 69 + } 70 + 71 + /** 43 72 * vfs_setpos - update the file offset for lseek 44 73 * @file: file structure in question 45 74 * @offset: file offset to seek to ··· 82 53 */ 83 54 loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) 84 55 { 85 - if (offset < 0 && !unsigned_offsets(file)) 86 - return -EINVAL; 87 - if (offset > maxsize) 88 - return -EINVAL; 89 - 90 - if (offset != file->f_pos) { 91 - file->f_pos = offset; 92 - file->f_version = 0; 93 - } 94 - return offset; 56 + return vfs_setpos_cookie(file, offset, maxsize, NULL); 95 57 } 96 58 EXPORT_SYMBOL(vfs_setpos); 59 + 60 + /** 61 + * must_set_pos - check whether f_pos has to be updated 62 + * @file: file to seek on 63 + * @offset: offset to use 64 + * @whence: type of seek operation 65 + * @eof: end of file 66 + * 67 + * Check whether f_pos needs to be updated and update @offset according 68 + * to @whence. 69 + * 70 + * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be 71 + * updated, and negative error code on failure. 
72 + */ 73 + static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof) 74 + { 75 + switch (whence) { 76 + case SEEK_END: 77 + *offset += eof; 78 + break; 79 + case SEEK_CUR: 80 + /* 81 + * Here we special-case the lseek(fd, 0, SEEK_CUR) 82 + * position-querying operation. Avoid rewriting the "same" 83 + * f_pos value back to the file because a concurrent read(), 84 + * write() or lseek() might have altered it 85 + */ 86 + if (*offset == 0) { 87 + *offset = file->f_pos; 88 + return 0; 89 + } 90 + break; 91 + case SEEK_DATA: 92 + /* 93 + * In the generic case the entire file is data, so as long as 94 + * offset isn't at the end of the file then the offset is data. 95 + */ 96 + if ((unsigned long long)*offset >= eof) 97 + return -ENXIO; 98 + break; 99 + case SEEK_HOLE: 100 + /* 101 + * There is a virtual hole at the end of the file, so as long as 102 + * offset isn't i_size or larger, return i_size. 103 + */ 104 + if ((unsigned long long)*offset >= eof) 105 + return -ENXIO; 106 + *offset = eof; 107 + break; 108 + } 109 + 110 + return 1; 111 + } 97 112 98 113 /** 99 114 * generic_file_llseek_size - generic llseek implementation for regular files ··· 159 86 generic_file_llseek_size(struct file *file, loff_t offset, int whence, 160 87 loff_t maxsize, loff_t eof) 161 88 { 162 - switch (whence) { 163 - case SEEK_END: 164 - offset += eof; 165 - break; 166 - case SEEK_CUR: 167 - /* 168 - * Here we special-case the lseek(fd, 0, SEEK_CUR) 169 - * position-querying operation. Avoid rewriting the "same" 170 - * f_pos value back to the file because a concurrent read(), 171 - * write() or lseek() might have altered it 172 - */ 173 - if (offset == 0) 174 - return file->f_pos; 175 - /* 176 - * f_lock protects against read/modify/write race with other 177 - * SEEK_CURs. Note that parallel writes and reads behave 178 - * like SEEK_SET. 
179 - */ 180 - spin_lock(&file->f_lock); 181 - offset = vfs_setpos(file, file->f_pos + offset, maxsize); 182 - spin_unlock(&file->f_lock); 89 + int ret; 90 + 91 + ret = must_set_pos(file, &offset, whence, eof); 92 + if (ret < 0) 93 + return ret; 94 + if (ret == 0) 183 95 return offset; 184 - case SEEK_DATA: 96 + 97 + if (whence == SEEK_CUR) { 185 98 /* 186 - * In the generic case the entire file is data, so as long as 187 - * offset isn't at the end of the file then the offset is data. 99 + * f_lock protects against read/modify/write race with 100 + * other SEEK_CURs. Note that parallel writes and reads 101 + * behave like SEEK_SET. 188 102 */ 189 - if ((unsigned long long)offset >= eof) 190 - return -ENXIO; 191 - break; 192 - case SEEK_HOLE: 193 - /* 194 - * There is a virtual hole at the end of the file, so as long as 195 - * offset isn't i_size or larger, return i_size. 196 - */ 197 - if ((unsigned long long)offset >= eof) 198 - return -ENXIO; 199 - offset = eof; 200 - break; 103 + guard(spinlock)(&file->f_lock); 104 + return vfs_setpos(file, file->f_pos + offset, maxsize); 201 105 } 202 106 203 107 return vfs_setpos(file, offset, maxsize); 204 108 } 205 109 EXPORT_SYMBOL(generic_file_llseek_size); 110 + 111 + /** 112 + * generic_llseek_cookie - versioned llseek implementation 113 + * @file: file structure to seek on 114 + * @offset: file offset to seek to 115 + * @whence: type of seek 116 + * @cookie: cookie to update 117 + * 118 + * See generic_file_llseek for a general description and locking assumptions. 119 + * 120 + * In contrast to generic_file_llseek, this function also resets a 121 + * specified cookie to indicate a seek took place. 
122 + */ 123 + loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, 124 + u64 *cookie) 125 + { 126 + struct inode *inode = file->f_mapping->host; 127 + loff_t maxsize = inode->i_sb->s_maxbytes; 128 + loff_t eof = i_size_read(inode); 129 + int ret; 130 + 131 + if (WARN_ON_ONCE(!cookie)) 132 + return -EINVAL; 133 + 134 + /* 135 + * Require that this is only used for directories that guarantee 136 + * synchronization between readdir and seek so that an update to 137 + * @cookie is correctly synchronized with concurrent readdir. 138 + */ 139 + if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS))) 140 + return -EINVAL; 141 + 142 + ret = must_set_pos(file, &offset, whence, eof); 143 + if (ret < 0) 144 + return ret; 145 + if (ret == 0) 146 + return offset; 147 + 148 + /* No need to hold f_lock because we know that f_pos_lock is held. */ 149 + if (whence == SEEK_CUR) 150 + return vfs_setpos_cookie(file, file->f_pos + offset, maxsize, cookie); 151 + 152 + return vfs_setpos_cookie(file, offset, maxsize, cookie); 153 + } 154 + EXPORT_SYMBOL(generic_llseek_cookie); 206 155 207 156 /** 208 157 * generic_file_llseek - generic llseek implementation for regular files ··· 365 270 } 366 271 retval = -EINVAL; 367 272 if (offset >= 0 || unsigned_offsets(file)) { 368 - if (offset != file->f_pos) { 273 + if (offset != file->f_pos) 369 274 file->f_pos = offset; 370 - file->f_version = 0; 371 - } 372 275 retval = offset; 373 276 } 374 277 out:
+46 -18
fs/ubifs/dir.c
··· 555 555 return 0; 556 556 } 557 557 558 + struct ubifs_dir_data { 559 + struct ubifs_dent_node *dent; 560 + u64 cookie; 561 + }; 562 + 558 563 /* 559 564 * The classical Unix view for directory is that it is a linear array of 560 565 * (name, inode number) entries. Linux/VFS assumes this model as well. ··· 587 582 struct inode *dir = file_inode(file); 588 583 struct ubifs_info *c = dir->i_sb->s_fs_info; 589 584 bool encrypted = IS_ENCRYPTED(dir); 585 + struct ubifs_dir_data *data = file->private_data; 590 586 591 587 dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); 592 588 ··· 610 604 fstr_real_len = fstr.len; 611 605 } 612 606 613 - if (file->f_version == 0) { 607 + if (data->cookie == 0) { 614 608 /* 615 - * The file was seek'ed, which means that @file->private_data 609 + * The file was seek'ed, which means that @data->dent 616 610 * is now invalid. This may also be just the first 617 611 * 'ubifs_readdir()' invocation, in which case 618 - * @file->private_data is NULL, and the below code is 612 + * @data->dent is NULL, and the below code is 619 613 * basically a no-op. 620 614 */ 621 - kfree(file->private_data); 622 - file->private_data = NULL; 615 + kfree(data->dent); 616 + data->dent = NULL; 623 617 } 624 618 625 619 /* 626 - * 'generic_file_llseek()' unconditionally sets @file->f_version to 627 - * zero, and we use this for detecting whether the file was seek'ed. 620 + * 'ubifs_dir_llseek()' sets @data->cookie to zero, and we use this 621 + * for detecting whether the file was seek'ed. 628 622 */ 629 - file->f_version = 1; 623 + data->cookie = 1; 630 624 631 625 /* File positions 0 and 1 correspond to "." and ".." 
*/ 632 626 if (ctx->pos < 2) { 633 - ubifs_assert(c, !file->private_data); 627 + ubifs_assert(c, !data->dent); 634 628 if (!dir_emit_dots(file, ctx)) { 635 629 if (encrypted) 636 630 fscrypt_fname_free_buffer(&fstr); ··· 647 641 } 648 642 649 643 ctx->pos = key_hash_flash(c, &dent->key); 650 - file->private_data = dent; 644 + data->dent = dent; 651 645 } 652 646 653 - dent = file->private_data; 647 + dent = data->dent; 654 648 if (!dent) { 655 649 /* 656 650 * The directory was seek'ed to and is now readdir'ed. ··· 664 658 goto out; 665 659 } 666 660 ctx->pos = key_hash_flash(c, &dent->key); 667 - file->private_data = dent; 661 + data->dent = dent; 668 662 } 669 663 670 664 while (1) { ··· 707 701 goto out; 708 702 } 709 703 710 - kfree(file->private_data); 704 + kfree(data->dent); 711 705 ctx->pos = key_hash_flash(c, &dent->key); 712 - file->private_data = dent; 706 + data->dent = dent; 713 707 cond_resched(); 714 708 } 715 709 716 710 out: 717 - kfree(file->private_data); 718 - file->private_data = NULL; 711 + kfree(data->dent); 712 + data->dent = NULL; 719 713 720 714 if (encrypted) 721 715 fscrypt_fname_free_buffer(&fstr); ··· 739 733 /* Free saved readdir() state when the directory is closed */ 740 734 static int ubifs_dir_release(struct inode *dir, struct file *file) 741 735 { 742 - kfree(file->private_data); 736 + struct ubifs_dir_data *data = file->private_data; 737 + 738 + kfree(data->dent); 739 + kfree(data); 743 740 file->private_data = NULL; 744 741 return 0; 745 742 } ··· 1721 1712 return 0; 1722 1713 } 1723 1714 1715 + static int ubifs_dir_open(struct inode *inode, struct file *file) 1716 + { 1717 + struct ubifs_dir_data *data; 1718 + 1719 + data = kzalloc(sizeof(struct ubifs_dir_data), GFP_KERNEL); 1720 + if (!data) 1721 + return -ENOMEM; 1722 + file->private_data = data; 1723 + return 0; 1724 + } 1725 + 1726 + static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence) 1727 + { 1728 + struct ubifs_dir_data *data = 
file->private_data; 1729 + 1730 + return generic_llseek_cookie(file, offset, whence, &data->cookie); 1731 + } 1732 + 1724 1733 const struct inode_operations ubifs_dir_inode_operations = { 1725 1734 .lookup = ubifs_lookup, 1726 1735 .create = ubifs_create, ··· 1759 1732 }; 1760 1733 1761 1734 const struct file_operations ubifs_dir_operations = { 1762 - .llseek = generic_file_llseek, 1735 + .open = ubifs_dir_open, 1736 + .llseek = ubifs_dir_llseek, 1763 1737 .release = ubifs_dir_release, 1764 1738 .read = generic_read_dir, 1765 1739 .iterate_shared = ubifs_readdir,
+25 -3
fs/udf/dir.c
··· 60 60 * identifying beginning of dir entry (names are under user control), 61 61 * we need to scan the directory from the beginning. 62 62 */ 63 - if (!inode_eq_iversion(dir, file->f_version)) { 63 + if (!inode_eq_iversion(dir, *(u64 *)file->private_data)) { 64 64 emit_pos = nf_pos; 65 65 nf_pos = 0; 66 66 } else { ··· 122 122 udf_fiiter_release(&iter); 123 123 out: 124 124 if (pos_valid) 125 - file->f_version = inode_query_iversion(dir); 125 + *(u64 *)file->private_data = inode_query_iversion(dir); 126 126 kfree(fname); 127 127 128 128 return ret; 129 129 } 130 130 131 + static int udf_dir_open(struct inode *inode, struct file *file) 132 + { 133 + file->private_data = kzalloc(sizeof(u64), GFP_KERNEL); 134 + if (!file->private_data) 135 + return -ENOMEM; 136 + return 0; 137 + } 138 + 139 + static int udf_dir_release(struct inode *inode, struct file *file) 140 + { 141 + kfree(file->private_data); 142 + return 0; 143 + } 144 + 145 + static loff_t udf_dir_llseek(struct file *file, loff_t offset, int whence) 146 + { 147 + return generic_llseek_cookie(file, offset, whence, 148 + (u64 *)file->private_data); 149 + } 150 + 131 151 /* readdir and lookup functions */ 132 152 const struct file_operations udf_dir_operations = { 133 - .llseek = generic_file_llseek, 153 + .open = udf_dir_open, 154 + .release = udf_dir_release, 155 + .llseek = udf_dir_llseek, 134 156 .read = generic_read_dir, 135 157 .iterate_shared = udf_readdir, 136 158 .unlocked_ioctl = udf_ioctl,
+25 -3
fs/ufs/dir.c
··· 416 416 unsigned long n = pos >> PAGE_SHIFT; 417 417 unsigned long npages = dir_pages(inode); 418 418 unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); 419 - bool need_revalidate = !inode_eq_iversion(inode, file->f_version); 419 + bool need_revalidate = !inode_eq_iversion(inode, *(u64 *)file->private_data); 420 420 unsigned flags = UFS_SB(sb)->s_flags; 421 421 422 422 UFSD("BEGIN\n"); ··· 442 442 offset = ufs_validate_entry(sb, kaddr, offset, chunk_mask); 443 443 ctx->pos = (n<<PAGE_SHIFT) + offset; 444 444 } 445 - file->f_version = inode_query_iversion(inode); 445 + *(u64 *)file->private_data = inode_query_iversion(inode); 446 446 need_revalidate = false; 447 447 } 448 448 de = (struct ufs_dir_entry *)(kaddr+offset); ··· 627 627 return 0; 628 628 } 629 629 630 + static int ufs_dir_open(struct inode *inode, struct file *file) 631 + { 632 + file->private_data = kzalloc(sizeof(u64), GFP_KERNEL); 633 + if (!file->private_data) 634 + return -ENOMEM; 635 + return 0; 636 + } 637 + 638 + static int ufs_dir_release(struct inode *inode, struct file *file) 639 + { 640 + kfree(file->private_data); 641 + return 0; 642 + } 643 + 644 + static loff_t ufs_dir_llseek(struct file *file, loff_t offset, int whence) 645 + { 646 + return generic_llseek_cookie(file, offset, whence, 647 + (u64 *)file->private_data); 648 + } 649 + 630 650 const struct file_operations ufs_dir_operations = { 651 + .open = ufs_dir_open, 652 + .release = ufs_dir_release, 631 653 .read = generic_read_dir, 632 654 .iterate_shared = ufs_readdir, 633 655 .fsync = generic_file_fsync, 634 - .llseek = generic_file_llseek, 656 + .llseek = ufs_dir_llseek, 635 657 };
+67 -41
include/linux/fs.h
··· 963 963 } 964 964 965 965 struct fown_struct { 966 + struct file *file; /* backpointer for security modules */ 966 967 rwlock_t lock; /* protects pid, uid, euid fields */ 967 968 struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ 968 969 enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ ··· 1003 1002 index < ra->start + ra->size); 1004 1003 } 1005 1004 1006 - /* 1007 - * f_{lock,count,pos_lock} members can be highly contended and share 1008 - * the same cacheline. f_{lock,mode} are very frequently used together 1009 - * and so share the same cacheline as well. The read-mostly 1010 - * f_{path,inode,op} are kept on a separate cacheline. 1005 + /** 1006 + * struct file - Represents a file 1007 + * @f_count: reference count 1008 + * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. 1009 + * @f_mode: FMODE_* flags often used in hotpaths 1010 + * @f_op: file operations 1011 + * @f_mapping: Contents of a cacheable, mappable object. 1012 + * @private_data: filesystem or driver specific data 1013 + * @f_inode: cached inode 1014 + * @f_flags: file flags 1015 + * @f_iocb_flags: iocb flags 1016 + * @f_cred: stashed credentials of creator/opener 1017 + * @f_path: path of the file 1018 + * @f_pos_lock: lock protecting file position 1019 + * @f_pipe: specific to pipes 1020 + * @f_pos: file position 1021 + * @f_security: LSM security context of this file 1022 + * @f_owner: file owner 1023 + * @f_wb_err: writeback error 1024 + * @f_sb_err: per sb writeback errors 1025 + * @f_ep: link of all epoll hooks for this file 1026 + * @f_task_work: task work entry point 1027 + * @f_llist: work queue entrypoint 1028 + * @f_ra: file's readahead state 1029 + * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) 1011 1030 */ 1012 1031 struct file { 1013 - union { 1014 - /* fput() uses task work when closing and freeing file (default). 
*/ 1015 - struct callback_head f_task_work; 1016 - /* fput() must use workqueue (most kernel threads). */ 1017 - struct llist_node f_llist; 1018 - unsigned int f_iocb_flags; 1019 - }; 1020 - 1021 - /* 1022 - * Protects f_ep, f_flags. 1023 - * Must not be taken from IRQ context. 1024 - */ 1025 - spinlock_t f_lock; 1026 - fmode_t f_mode; 1027 - atomic_long_t f_count; 1028 - struct mutex f_pos_lock; 1029 - loff_t f_pos; 1030 - unsigned int f_flags; 1031 - struct fown_struct f_owner; 1032 - const struct cred *f_cred; 1033 - struct file_ra_state f_ra; 1034 - struct path f_path; 1035 - struct inode *f_inode; /* cached value */ 1032 + atomic_long_t f_count; 1033 + spinlock_t f_lock; 1034 + fmode_t f_mode; 1036 1035 const struct file_operations *f_op; 1037 - 1038 - u64 f_version; 1036 + struct address_space *f_mapping; 1037 + void *private_data; 1038 + struct inode *f_inode; 1039 + unsigned int f_flags; 1040 + unsigned int f_iocb_flags; 1041 + const struct cred *f_cred; 1042 + /* --- cacheline 1 boundary (64 bytes) --- */ 1043 + struct path f_path; 1044 + union { 1045 + /* regular files (with FMODE_ATOMIC_POS) and directories */ 1046 + struct mutex f_pos_lock; 1047 + /* pipes */ 1048 + u64 f_pipe; 1049 + }; 1050 + loff_t f_pos; 1039 1051 #ifdef CONFIG_SECURITY 1040 - void *f_security; 1052 + void *f_security; 1041 1053 #endif 1042 - /* needed for tty driver, and maybe others */ 1043 - void *private_data; 1044 - 1054 + /* --- cacheline 2 boundary (128 bytes) --- */ 1055 + struct fown_struct *f_owner; 1056 + errseq_t f_wb_err; 1057 + errseq_t f_sb_err; 1045 1058 #ifdef CONFIG_EPOLL 1046 - /* Used by fs/eventpoll.c to link all the hooks to this file */ 1047 - struct hlist_head *f_ep; 1048 - #endif /* #ifdef CONFIG_EPOLL */ 1049 - struct address_space *f_mapping; 1050 - errseq_t f_wb_err; 1051 - errseq_t f_sb_err; /* for syncfs */ 1059 + struct hlist_head *f_ep; 1060 + #endif 1061 + union { 1062 + struct callback_head f_task_work; 1063 + struct llist_node f_llist; 1064 + 
struct file_ra_state f_ra; 1065 + freeptr_t f_freeptr; 1066 + }; 1067 + /* --- cacheline 3 boundary (192 bytes) --- */ 1052 1068 } __randomize_layout 1053 1069 __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ 1054 1070 ··· 1109 1091 #define OFFSET_MAX type_max(loff_t) 1110 1092 #define OFFT_OFFSET_MAX type_max(off_t) 1111 1093 #endif 1094 + 1095 + int file_f_owner_allocate(struct file *file); 1096 + static inline struct fown_struct *file_f_owner(const struct file *file) 1097 + { 1098 + return READ_ONCE(file->f_owner); 1099 + } 1112 1100 1113 1101 extern void send_sigio(struct fown_struct *fown, int fd, int band); 1114 1102 ··· 1164 1140 extern int f_setown(struct file *filp, int who, int force); 1165 1141 extern void f_delown(struct file *filp); 1166 1142 extern pid_t f_getown(struct file *filp); 1167 - extern int send_sigurg(struct fown_struct *fown); 1143 + extern int send_sigurg(struct file *file); 1168 1144 1169 1145 /* 1170 1146 * sb->s_flags. Note that these mirror the equivalent MS_* flags where ··· 3231 3207 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); 3232 3208 extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, 3233 3209 int whence, loff_t maxsize, loff_t eof); 3210 + loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, 3211 + u64 *cookie); 3234 3212 extern loff_t fixed_size_llseek(struct file *file, loff_t offset, 3235 3213 int whence, loff_t size); 3236 3214 extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
+9
include/linux/slab.h
··· 213 213 #endif 214 214 215 215 /* 216 + * freeptr_t represents a SLUB freelist pointer, which might be encoded 217 + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. 218 + */ 219 + typedef struct { unsigned long v; } freeptr_t; 220 + 221 + /* 216 222 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. 217 223 * 218 224 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault. ··· 248 242 slab_flags_t flags, 249 243 unsigned int useroffset, unsigned int usersize, 250 244 void (*ctor)(void *)); 245 + struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, 246 + unsigned int freeptr_offset, 247 + slab_flags_t flags); 251 248 void kmem_cache_destroy(struct kmem_cache *s); 252 249 int kmem_cache_shrink(struct kmem_cache *s); 253 250
+2
mm/slab.h
··· 261 261 unsigned int object_size; /* Object size without metadata */ 262 262 struct reciprocal_value reciprocal_size; 263 263 unsigned int offset; /* Free pointer offset */ 264 + /* Specific free pointer requested (if not UINT_MAX) */ 265 + unsigned int rcu_freeptr_offset; 264 266 #ifdef CONFIG_SLUB_CPU_PARTIAL 265 267 /* Number of per cpu partial objects to keep around */ 266 268 unsigned int cpu_partial;
+97 -41
mm/slab_common.c
··· 202 202 } 203 203 204 204 static struct kmem_cache *create_cache(const char *name, 205 - unsigned int object_size, unsigned int align, 206 - slab_flags_t flags, unsigned int useroffset, 207 - unsigned int usersize, void (*ctor)(void *), 208 - struct kmem_cache *root_cache) 205 + unsigned int object_size, unsigned int freeptr_offset, 206 + unsigned int align, slab_flags_t flags, 207 + unsigned int useroffset, unsigned int usersize, 208 + void (*ctor)(void *)) 209 209 { 210 210 struct kmem_cache *s; 211 211 int err; 212 212 213 213 if (WARN_ON(useroffset + usersize > object_size)) 214 214 useroffset = usersize = 0; 215 + 216 + /* If a custom freelist pointer is requested make sure it's sane. */ 217 + err = -EINVAL; 218 + if (freeptr_offset != UINT_MAX && 219 + (freeptr_offset >= object_size || !(flags & SLAB_TYPESAFE_BY_RCU) || 220 + !IS_ALIGNED(freeptr_offset, sizeof(freeptr_t)))) 221 + goto out; 215 222 216 223 err = -ENOMEM; 217 224 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); ··· 227 220 228 221 s->name = name; 229 222 s->size = s->object_size = object_size; 223 + s->rcu_freeptr_offset = freeptr_offset; 230 224 s->align = align; 231 225 s->ctor = ctor; 232 226 #ifdef CONFIG_HARDENED_USERCOPY 233 227 s->useroffset = useroffset; 234 228 s->usersize = usersize; 235 229 #endif 236 - 237 230 err = __kmem_cache_create(s, flags); 238 231 if (err) 239 232 goto out_free_cache; ··· 248 241 return ERR_PTR(err); 249 242 } 250 243 251 - /** 252 - * kmem_cache_create_usercopy - Create a cache with a region suitable 253 - * for copying to userspace 254 - * @name: A string which is used in /proc/slabinfo to identify this cache. 255 - * @size: The size of objects to be created in this cache. 256 - * @align: The required alignment for the objects. 257 - * @flags: SLAB flags 258 - * @useroffset: Usercopy region offset 259 - * @usersize: Usercopy region size 260 - * @ctor: A constructor for the objects. 
261 - * 262 - * Cannot be called within a interrupt, but can be interrupted. 263 - * The @ctor is run when new pages are allocated by the cache. 264 - * 265 - * The flags are 266 - * 267 - * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 268 - * to catch references to uninitialised memory. 269 - * 270 - * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check 271 - * for buffer overruns. 272 - * 273 - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 274 - * cacheline. This can be beneficial if you're counting cycles as closely 275 - * as davem. 276 - * 277 - * Return: a pointer to the cache on success, NULL on failure. 278 - */ 279 - struct kmem_cache * 280 - kmem_cache_create_usercopy(const char *name, 281 - unsigned int size, unsigned int align, 282 - slab_flags_t flags, 244 + static struct kmem_cache * 245 + do_kmem_cache_create_usercopy(const char *name, 246 + unsigned int size, unsigned int freeptr_offset, 247 + unsigned int align, slab_flags_t flags, 283 248 unsigned int useroffset, unsigned int usersize, 284 249 void (*ctor)(void *)) 285 250 { ··· 311 332 goto out_unlock; 312 333 } 313 334 314 - s = create_cache(cache_name, size, 335 + s = create_cache(cache_name, size, freeptr_offset, 315 336 calculate_alignment(flags, align, size), 316 - flags, useroffset, usersize, ctor, NULL); 337 + flags, useroffset, usersize, ctor); 317 338 if (IS_ERR(s)) { 318 339 err = PTR_ERR(s); 319 340 kfree_const(cache_name); ··· 334 355 return NULL; 335 356 } 336 357 return s; 358 + } 359 + 360 + /** 361 + * kmem_cache_create_usercopy - Create a cache with a region suitable 362 + * for copying to userspace 363 + * @name: A string which is used in /proc/slabinfo to identify this cache. 364 + * @size: The size of objects to be created in this cache. 365 + * @align: The required alignment for the objects. 
366 + * @flags: SLAB flags 367 + * @useroffset: Usercopy region offset 368 + * @usersize: Usercopy region size 369 + * @ctor: A constructor for the objects. 370 + * 371 + * Cannot be called within a interrupt, but can be interrupted. 372 + * The @ctor is run when new pages are allocated by the cache. 373 + * 374 + * The flags are 375 + * 376 + * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 377 + * to catch references to uninitialised memory. 378 + * 379 + * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check 380 + * for buffer overruns. 381 + * 382 + * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 383 + * cacheline. This can be beneficial if you're counting cycles as closely 384 + * as davem. 385 + * 386 + * Return: a pointer to the cache on success, NULL on failure. 387 + */ 388 + struct kmem_cache * 389 + kmem_cache_create_usercopy(const char *name, unsigned int size, 390 + unsigned int align, slab_flags_t flags, 391 + unsigned int useroffset, unsigned int usersize, 392 + void (*ctor)(void *)) 393 + { 394 + return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, 395 + useroffset, usersize, ctor); 337 396 } 338 397 EXPORT_SYMBOL(kmem_cache_create_usercopy); 339 398 ··· 404 387 kmem_cache_create(const char *name, unsigned int size, unsigned int align, 405 388 slab_flags_t flags, void (*ctor)(void *)) 406 389 { 407 - return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, 408 - ctor); 390 + return do_kmem_cache_create_usercopy(name, size, UINT_MAX, align, flags, 391 + 0, 0, ctor); 409 392 } 410 393 EXPORT_SYMBOL(kmem_cache_create); 394 + 395 + /** 396 + * kmem_cache_create_rcu - Create a SLAB_TYPESAFE_BY_RCU cache. 397 + * @name: A string which is used in /proc/slabinfo to identify this cache. 398 + * @size: The size of objects to be created in this cache. 
399 + * @freeptr_offset: The offset into the memory to the free pointer 400 + * @flags: SLAB flags 401 + * 402 + * Cannot be called within an interrupt, but can be interrupted. 403 + * 404 + * See kmem_cache_create() for an explanation of possible @flags. 405 + * 406 + * By default SLAB_TYPESAFE_BY_RCU caches place the free pointer outside 407 + * of the object. This might cause the object to grow in size. Callers 408 + * that have a reason to avoid this can specify a custom free pointer 409 + * offset in their struct where the free pointer will be placed. 410 + * 411 + * Note that placing the free pointer inside the object requires the 412 + * caller to ensure that no fields are invalidated that are required to 413 + * guard against object recycling (See SLAB_TYPESAFE_BY_RCU for 414 + * details.). 415 + * 416 + * Using zero as a value for @freeptr_offset is valid. To request no 417 + * offset UINT_MAX must be specified. 418 + * 419 + * Note that @ctor isn't supported with custom free pointers as a @ctor 420 + * requires an external free pointer. 421 + * 422 + * Return: a pointer to the cache on success, NULL on failure. 423 + */ 424 + struct kmem_cache *kmem_cache_create_rcu(const char *name, unsigned int size, 425 + unsigned int freeptr_offset, 426 + slab_flags_t flags) 427 + { 428 + return do_kmem_cache_create_usercopy(name, size, freeptr_offset, 0, 429 + flags | SLAB_TYPESAFE_BY_RCU, 0, 0, 430 + NULL); 431 + } 432 + EXPORT_SYMBOL(kmem_cache_create_rcu); 411 433 412 434 static struct kmem_cache *kmem_buckets_cache __ro_after_init; 413 435
+13 -7
mm/slub.c
··· 466 466 *******************************************************************/ 467 467 468 468 /* 469 - * freeptr_t represents a SLUB freelist pointer, which might be encoded 470 - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. 471 - */ 472 - typedef struct { unsigned long v; } freeptr_t; 473 - 474 - /* 475 469 * Returns freelist pointer (ptr). With hardening, this is obfuscated 476 470 * with an XOR of the address where the pointer is held and a per-cache 477 471 * random number. ··· 3919 3925 /* 3920 3926 * If the object has been wiped upon free, make sure it's fully initialized by 3921 3927 * zeroing out freelist pointer. 3928 + * 3929 + * Note that we also wipe custom freelist pointers specified via 3930 + * s->rcu_freeptr_offset. 3922 3931 */ 3923 3932 static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, 3924 3933 void *obj) ··· 5145 5148 #endif 5146 5149 } 5147 5150 5151 + /* Was a valid freeptr offset requested? */ 5152 + static inline bool has_freeptr_offset(const struct kmem_cache *s) 5153 + { 5154 + return s->rcu_freeptr_offset != UINT_MAX; 5155 + } 5156 + 5148 5157 /* 5149 5158 * calculate_sizes() determines the order and the distribution of data within 5150 5159 * a slab object. ··· 5196 5193 */ 5197 5194 s->inuse = size; 5198 5195 5199 - if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor || 5196 + if (((flags & SLAB_TYPESAFE_BY_RCU) && !has_freeptr_offset(s)) || 5197 + (flags & SLAB_POISON) || s->ctor || 5200 5198 ((flags & SLAB_RED_ZONE) && 5201 5199 (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { 5202 5200 /* ··· 5218 5214 */ 5219 5215 s->offset = size; 5220 5216 size += sizeof(void *); 5217 + } else if ((flags & SLAB_TYPESAFE_BY_RCU) && has_freeptr_offset(s)) { 5218 + s->offset = s->rcu_freeptr_offset; 5221 5219 } else { 5222 5220 /* 5223 5221 * Store freelist pointer near middle of object to keep
+1 -1
net/core/sock.c
··· 3497 3497 void sk_send_sigurg(struct sock *sk) 3498 3498 { 3499 3499 if (sk->sk_socket && sk->sk_socket->file) 3500 - if (send_sigurg(&sk->sk_socket->file->f_owner)) 3500 + if (send_sigurg(sk->sk_socket->file)) 3501 3501 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3502 3502 } 3503 3503 EXPORT_SYMBOL(sk_send_sigurg);
+1 -1
security/selinux/hooks.c
··· 3950 3950 struct file_security_struct *fsec; 3951 3951 3952 3952 /* struct fown_struct is never outside the context of a struct file */ 3953 - file = container_of(fown, struct file, f_owner); 3953 + file = fown->file; 3954 3954 3955 3955 fsec = selinux_file(file); 3956 3956
+1 -1
security/smack/smack_lsm.c
··· 1950 1950 /* 1951 1951 * struct fown_struct is never outside the context of a struct file 1952 1952 */ 1953 - file = container_of(fown, struct file, f_owner); 1953 + file = fown->file; 1954 1954 1955 1955 /* we don't log here as rc can be overriden */ 1956 1956 blob = smack_file(file);