Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'fsnotify_for_v5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs

Pull fsnotify updates from Jan Kara:

- support for limited fanotify functionality for unprivileged users

- faster merging of fanotify events

- a few smaller fsnotify improvements

* tag 'fsnotify_for_v5.13-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs:
shmem: allow reporting fanotify events with file handles on tmpfs
fs: introduce a wrapper uuid_to_fsid()
fanotify_user: use upper_32_bits() to verify mask
fanotify: support limited functionality for unprivileged users
fanotify: configurable limits via sysfs
fanotify: limit number of event merge attempts
fsnotify: use hash table for faster events merge
fanotify: mix event info and pid into merge key hash
fanotify: reduce event objectid to 29-bit hash
fsnotify: allow fsnotify_{peek,remove}_first_event with empty queue

+478 -167
+1 -4
fs/ext2/super.c
··· 1399 1399 struct super_block *sb = dentry->d_sb; 1400 1400 struct ext2_sb_info *sbi = EXT2_SB(sb); 1401 1401 struct ext2_super_block *es = sbi->s_es; 1402 - u64 fsid; 1403 1402 1404 1403 spin_lock(&sbi->s_lock); 1405 1404 ··· 1452 1453 buf->f_ffree = ext2_count_free_inodes(sb); 1453 1454 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree); 1454 1455 buf->f_namelen = EXT2_NAME_LEN; 1455 - fsid = le64_to_cpup((void *)es->s_uuid) ^ 1456 - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 1457 - buf->f_fsid = u64_to_fsid(fsid); 1456 + buf->f_fsid = uuid_to_fsid(es->s_uuid); 1458 1457 spin_unlock(&sbi->s_lock); 1459 1458 return 0; 1460 1459 }
+1 -4
fs/ext4/super.c
··· 6153 6153 struct ext4_sb_info *sbi = EXT4_SB(sb); 6154 6154 struct ext4_super_block *es = sbi->s_es; 6155 6155 ext4_fsblk_t overhead = 0, resv_blocks; 6156 - u64 fsid; 6157 6156 s64 bfree; 6158 6157 resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters)); 6159 6158 ··· 6173 6174 buf->f_files = le32_to_cpu(es->s_inodes_count); 6174 6175 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); 6175 6176 buf->f_namelen = EXT4_NAME_LEN; 6176 - fsid = le64_to_cpup((void *)es->s_uuid) ^ 6177 - le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 6178 - buf->f_fsid = u64_to_fsid(fsid); 6177 + buf->f_fsid = uuid_to_fsid(es->s_uuid); 6179 6178 6180 6179 #ifdef CONFIG_QUOTA 6181 6180 if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
+121 -49
fs/notify/fanotify/fanotify.c
··· 14 14 #include <linux/audit.h> 15 15 #include <linux/sched/mm.h> 16 16 #include <linux/statfs.h> 17 + #include <linux/stringhash.h> 17 18 18 19 #include "fanotify.h" 19 20 ··· 23 22 return p1->mnt == p2->mnt && p1->dentry == p2->dentry; 24 23 } 25 24 25 + static unsigned int fanotify_hash_path(const struct path *path) 26 + { 27 + return hash_ptr(path->dentry, FANOTIFY_EVENT_HASH_BITS) ^ 28 + hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS); 29 + } 30 + 26 31 static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1, 27 32 __kernel_fsid_t *fsid2) 28 33 { 29 34 return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1]; 35 + } 36 + 37 + static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid) 38 + { 39 + return hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS) ^ 40 + hash_32(fsid->val[1], FANOTIFY_EVENT_HASH_BITS); 30 41 } 31 42 32 43 static bool fanotify_fh_equal(struct fanotify_fh *fh1, ··· 49 36 50 37 return !fh1->len || 51 38 !memcmp(fanotify_fh_buf(fh1), fanotify_fh_buf(fh2), fh1->len); 39 + } 40 + 41 + static unsigned int fanotify_hash_fh(struct fanotify_fh *fh) 42 + { 43 + long salt = (long)fh->type | (long)fh->len << 8; 44 + 45 + /* 46 + * full_name_hash() works long by long, so it handles fh buf optimally. 
47 + */ 48 + return full_name_hash((void *)salt, fanotify_fh_buf(fh), fh->len); 52 49 } 53 50 54 51 static bool fanotify_fid_event_equal(struct fanotify_fid_event *ffe1, ··· 111 88 return fanotify_info_equal(info1, info2); 112 89 } 113 90 114 - static bool fanotify_should_merge(struct fsnotify_event *old_fsn, 115 - struct fsnotify_event *new_fsn) 91 + static bool fanotify_should_merge(struct fanotify_event *old, 92 + struct fanotify_event *new) 116 93 { 117 - struct fanotify_event *old, *new; 94 + pr_debug("%s: old=%p new=%p\n", __func__, old, new); 118 95 119 - pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn); 120 - old = FANOTIFY_E(old_fsn); 121 - new = FANOTIFY_E(new_fsn); 122 - 123 - if (old_fsn->objectid != new_fsn->objectid || 96 + if (old->hash != new->hash || 124 97 old->type != new->type || old->pid != new->pid) 125 98 return false; 126 99 ··· 148 129 return false; 149 130 } 150 131 151 - /* and the list better be locked by something too! */ 152 - static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) 153 - { 154 - struct fsnotify_event *test_event; 155 - struct fanotify_event *new; 132 + /* Limit event merges to limit CPU overhead per event */ 133 + #define FANOTIFY_MAX_MERGE_EVENTS 128 156 134 157 - pr_debug("%s: list=%p event=%p\n", __func__, list, event); 158 - new = FANOTIFY_E(event); 135 + /* and the list better be locked by something too! 
*/ 136 + static int fanotify_merge(struct fsnotify_group *group, 137 + struct fsnotify_event *event) 138 + { 139 + struct fanotify_event *old, *new = FANOTIFY_E(event); 140 + unsigned int bucket = fanotify_event_hash_bucket(group, new); 141 + struct hlist_head *hlist = &group->fanotify_data.merge_hash[bucket]; 142 + int i = 0; 143 + 144 + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, 145 + group, event, bucket); 159 146 160 147 /* 161 148 * Don't merge a permission event with any other event so that we know ··· 171 146 if (fanotify_is_perm_event(new->mask)) 172 147 return 0; 173 148 174 - list_for_each_entry_reverse(test_event, list, list) { 175 - if (fanotify_should_merge(test_event, event)) { 176 - FANOTIFY_E(test_event)->mask |= new->mask; 149 + hlist_for_each_entry(old, hlist, merge_list) { 150 + if (++i > FANOTIFY_MAX_MERGE_EVENTS) 151 + break; 152 + if (fanotify_should_merge(old, new)) { 153 + old->mask |= new->mask; 177 154 return 1; 178 155 } 179 156 } ··· 211 184 return ret; 212 185 } 213 186 /* Event not yet reported? Just remove it. */ 214 - if (event->state == FAN_EVENT_INIT) 187 + if (event->state == FAN_EVENT_INIT) { 215 188 fsnotify_remove_queued_event(group, &event->fae.fse); 189 + /* Permission events are not supposed to be hashed */ 190 + WARN_ON_ONCE(!hlist_unhashed(&event->fae.merge_list)); 191 + } 216 192 /* 217 193 * Event may be also answered in case signal delivery raced 218 194 * with wakeup. In that case we have nothing to do besides ··· 359 329 * Return 0 on failure to encode. 
360 330 */ 361 331 static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, 362 - unsigned int fh_len, gfp_t gfp) 332 + unsigned int fh_len, unsigned int *hash, 333 + gfp_t gfp) 363 334 { 364 335 int dwords, type = 0; 365 336 char *ext_buf = NULL; ··· 402 371 403 372 fh->type = type; 404 373 fh->len = fh_len; 374 + 375 + /* Mix fh into event merge key */ 376 + *hash ^= fanotify_hash_fh(fh); 405 377 406 378 return FANOTIFY_FH_HDR_LEN + fh_len; 407 379 ··· 459 425 } 460 426 461 427 static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, 428 + unsigned int *hash, 462 429 gfp_t gfp) 463 430 { 464 431 struct fanotify_path_event *pevent; ··· 470 435 471 436 pevent->fae.type = FANOTIFY_EVENT_TYPE_PATH; 472 437 pevent->path = *path; 438 + *hash ^= fanotify_hash_path(path); 473 439 path_get(path); 474 440 475 441 return &pevent->fae; ··· 496 460 497 461 static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id, 498 462 __kernel_fsid_t *fsid, 463 + unsigned int *hash, 499 464 gfp_t gfp) 500 465 { 501 466 struct fanotify_fid_event *ffe; ··· 507 470 508 471 ffe->fae.type = FANOTIFY_EVENT_TYPE_FID; 509 472 ffe->fsid = *fsid; 473 + *hash ^= fanotify_hash_fsid(fsid); 510 474 fanotify_encode_fh(&ffe->object_fh, id, fanotify_encode_fh_len(id), 511 - gfp); 475 + hash, gfp); 512 476 513 477 return &ffe->fae; 514 478 } 515 479 516 480 static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, 517 481 __kernel_fsid_t *fsid, 518 - const struct qstr *file_name, 482 + const struct qstr *name, 519 483 struct inode *child, 484 + unsigned int *hash, 520 485 gfp_t gfp) 521 486 { 522 487 struct fanotify_name_event *fne; ··· 531 492 size = sizeof(*fne) + FANOTIFY_FH_HDR_LEN + dir_fh_len; 532 493 if (child_fh_len) 533 494 size += FANOTIFY_FH_HDR_LEN + child_fh_len; 534 - if (file_name) 535 - size += file_name->len + 1; 495 + if (name) 496 + size += name->len + 1; 536 497 fne = kmalloc(size, gfp); 537 498 if (!fne) 538 499 
return NULL; 539 500 540 501 fne->fae.type = FANOTIFY_EVENT_TYPE_FID_NAME; 541 502 fne->fsid = *fsid; 503 + *hash ^= fanotify_hash_fsid(fsid); 542 504 info = &fne->info; 543 505 fanotify_info_init(info); 544 506 dfh = fanotify_info_dir_fh(info); 545 - info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, 0); 507 + info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, hash, 0); 546 508 if (child_fh_len) { 547 509 ffh = fanotify_info_file_fh(info); 548 - info->file_fh_totlen = fanotify_encode_fh(ffh, child, child_fh_len, 0); 510 + info->file_fh_totlen = fanotify_encode_fh(ffh, child, 511 + child_fh_len, hash, 0); 549 512 } 550 - if (file_name) 551 - fanotify_info_copy_name(info, file_name); 513 + if (name) { 514 + long salt = name->len; 515 + 516 + fanotify_info_copy_name(info, name); 517 + *hash ^= full_name_hash((void *)salt, name->name, name->len); 518 + } 552 519 553 520 pr_debug("%s: ino=%lu size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n", 554 521 __func__, id->i_ino, size, dir_fh_len, child_fh_len, ··· 578 533 struct mem_cgroup *old_memcg; 579 534 struct inode *child = NULL; 580 535 bool name_event = false; 536 + unsigned int hash = 0; 537 + bool ondir = mask & FAN_ONDIR; 538 + struct pid *pid; 581 539 582 540 if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) { 583 541 /* ··· 588 540 * report the child fid for events reported on a non-dir child 589 541 * in addition to reporting the parent fid and maybe child name. 
590 542 */ 591 - if ((fid_mode & FAN_REPORT_FID) && 592 - id != dirid && !(mask & FAN_ONDIR)) 543 + if ((fid_mode & FAN_REPORT_FID) && id != dirid && !ondir) 593 544 child = id; 594 545 595 546 id = dirid; ··· 609 562 if (!(fid_mode & FAN_REPORT_NAME)) { 610 563 name_event = !!child; 611 564 file_name = NULL; 612 - } else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || 613 - !(mask & FAN_ONDIR)) { 565 + } else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || !ondir) { 614 566 name_event = true; 615 567 } 616 568 } ··· 632 586 event = fanotify_alloc_perm_event(path, gfp); 633 587 } else if (name_event && (file_name || child)) { 634 588 event = fanotify_alloc_name_event(id, fsid, file_name, child, 635 - gfp); 589 + &hash, gfp); 636 590 } else if (fid_mode) { 637 - event = fanotify_alloc_fid_event(id, fsid, gfp); 591 + event = fanotify_alloc_fid_event(id, fsid, &hash, gfp); 638 592 } else { 639 - event = fanotify_alloc_path_event(path, gfp); 593 + event = fanotify_alloc_path_event(path, &hash, gfp); 640 594 } 641 595 642 596 if (!event) 643 597 goto out; 644 598 645 - /* 646 - * Use the victim inode instead of the watching inode as the id for 647 - * event queue, so event reported on parent is merged with event 648 - * reported on child when both directory and child watches exist. 649 - */ 650 - fanotify_init_event(event, (unsigned long)id, mask); 651 599 if (FAN_GROUP_FLAG(group, FAN_REPORT_TID)) 652 - event->pid = get_pid(task_pid(current)); 600 + pid = get_pid(task_pid(current)); 653 601 else 654 - event->pid = get_pid(task_tgid(current)); 602 + pid = get_pid(task_tgid(current)); 603 + 604 + /* Mix event info, FAN_ONDIR flag and pid into event merge key */ 605 + hash ^= hash_long((unsigned long)pid | ondir, FANOTIFY_EVENT_HASH_BITS); 606 + fanotify_init_event(event, hash, mask); 607 + event->pid = pid; 655 608 656 609 out: 657 610 set_active_memcg(old_memcg); ··· 688 643 } 689 644 690 645 return fsid; 646 + } 647 + 648 + /* 649 + * Add an event to hash table for faster merge. 
650 + */ 651 + static void fanotify_insert_event(struct fsnotify_group *group, 652 + struct fsnotify_event *fsn_event) 653 + { 654 + struct fanotify_event *event = FANOTIFY_E(fsn_event); 655 + unsigned int bucket = fanotify_event_hash_bucket(group, event); 656 + struct hlist_head *hlist = &group->fanotify_data.merge_hash[bucket]; 657 + 658 + assert_spin_locked(&group->notification_lock); 659 + 660 + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, 661 + group, event, bucket); 662 + 663 + hlist_add_head(&event->merge_list, hlist); 691 664 } 692 665 693 666 static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, ··· 778 715 } 779 716 780 717 fsn_event = &event->fse; 781 - ret = fsnotify_add_event(group, fsn_event, fanotify_merge); 718 + ret = fsnotify_add_event(group, fsn_event, fanotify_merge, 719 + fanotify_is_hashed_event(mask) ? 720 + fanotify_insert_event : NULL); 782 721 if (ret) { 783 722 /* Permission events shouldn't be merged */ 784 723 BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS); ··· 801 736 802 737 static void fanotify_free_group_priv(struct fsnotify_group *group) 803 738 { 804 - struct user_struct *user; 805 - 806 - user = group->fanotify_data.user; 807 - atomic_dec(&user->fanotify_listeners); 808 - free_uid(user); 739 + kfree(group->fanotify_data.merge_hash); 740 + if (group->fanotify_data.ucounts) 741 + dec_ucount(group->fanotify_data.ucounts, 742 + UCOUNT_FANOTIFY_GROUPS); 809 743 } 810 744 811 745 static void fanotify_free_path_event(struct fanotify_event *event) ··· 860 796 } 861 797 } 862 798 799 + static void fanotify_freeing_mark(struct fsnotify_mark *mark, 800 + struct fsnotify_group *group) 801 + { 802 + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) 803 + dec_ucount(group->fanotify_data.ucounts, UCOUNT_FANOTIFY_MARKS); 804 + } 805 + 863 806 static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) 864 807 { 865 808 kmem_cache_free(fanotify_mark_cache, fsn_mark); ··· 876 805 .handle_event = 
fanotify_handle_event, 877 806 .free_group_priv = fanotify_free_group_priv, 878 807 .free_event = fanotify_free_event, 808 + .freeing_mark = fanotify_freeing_mark, 879 809 .free_mark = fanotify_free_mark, 880 810 };
+43 -3
fs/notify/fanotify/fanotify.h
··· 3 3 #include <linux/path.h> 4 4 #include <linux/slab.h> 5 5 #include <linux/exportfs.h> 6 + #include <linux/hashtable.h> 6 7 7 8 extern struct kmem_cache *fanotify_mark_cache; 8 9 extern struct kmem_cache *fanotify_fid_event_cachep; ··· 116 115 info->name_len = 0; 117 116 } 118 117 118 + static inline unsigned int fanotify_info_len(struct fanotify_info *info) 119 + { 120 + return info->dir_fh_totlen + info->file_fh_totlen + info->name_len; 121 + } 122 + 119 123 static inline void fanotify_info_copy_name(struct fanotify_info *info, 120 124 const struct qstr *name) 121 125 { ··· 141 135 FANOTIFY_EVENT_TYPE_PATH, 142 136 FANOTIFY_EVENT_TYPE_PATH_PERM, 143 137 FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ 138 + __FANOTIFY_EVENT_TYPE_NUM 144 139 }; 140 + 141 + #define FANOTIFY_EVENT_TYPE_BITS \ 142 + (ilog2(__FANOTIFY_EVENT_TYPE_NUM - 1) + 1) 143 + #define FANOTIFY_EVENT_HASH_BITS \ 144 + (32 - FANOTIFY_EVENT_TYPE_BITS) 145 145 146 146 struct fanotify_event { 147 147 struct fsnotify_event fse; 148 + struct hlist_node merge_list; /* List for hashed merge */ 148 149 u32 mask; 149 - enum fanotify_event_type type; 150 + struct { 151 + unsigned int type : FANOTIFY_EVENT_TYPE_BITS; 152 + unsigned int hash : FANOTIFY_EVENT_HASH_BITS; 153 + }; 150 154 struct pid *pid; 151 155 }; 152 156 153 157 static inline void fanotify_init_event(struct fanotify_event *event, 154 - unsigned long id, u32 mask) 158 + unsigned int hash, u32 mask) 155 159 { 156 - fsnotify_init_event(&event->fse, id); 160 + fsnotify_init_event(&event->fse); 161 + INIT_HLIST_NODE(&event->merge_list); 162 + event->hash = hash; 157 163 event->mask = mask; 158 164 event->pid = NULL; 159 165 } ··· 301 283 return &FANOTIFY_PERM(event)->path; 302 284 else 303 285 return NULL; 286 + } 287 + 288 + /* 289 + * Use 128 size hash table to speed up events merge. 
290 + */ 291 + #define FANOTIFY_HTABLE_BITS (7) 292 + #define FANOTIFY_HTABLE_SIZE (1 << FANOTIFY_HTABLE_BITS) 293 + #define FANOTIFY_HTABLE_MASK (FANOTIFY_HTABLE_SIZE - 1) 294 + 295 + /* 296 + * Permission events and overflow event do not get merged - don't hash them. 297 + */ 298 + static inline bool fanotify_is_hashed_event(u32 mask) 299 + { 300 + return !fanotify_is_perm_event(mask) && !(mask & FS_Q_OVERFLOW); 301 + } 302 + 303 + static inline unsigned int fanotify_event_hash_bucket( 304 + struct fsnotify_group *group, 305 + struct fanotify_event *event) 306 + { 307 + return event->hash & FANOTIFY_HTABLE_MASK; 304 308 }
+186 -33
fs/notify/fanotify/fanotify_user.c
··· 27 27 #include "fanotify.h" 28 28 29 29 #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 30 - #define FANOTIFY_DEFAULT_MAX_MARKS 8192 31 - #define FANOTIFY_DEFAULT_MAX_LISTENERS 128 30 + #define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192 31 + #define FANOTIFY_DEFAULT_MAX_GROUPS 128 32 + 33 + /* 34 + * Legacy fanotify marks limits (8192) is per group and we introduced a tunable 35 + * limit of marks per user, similar to inotify. Effectively, the legacy limit 36 + * of fanotify marks per user is <max marks per group> * <max groups per user>. 37 + * This default limit (1M) also happens to match the increased limit of inotify 38 + * max_user_watches since v5.10. 39 + */ 40 + #define FANOTIFY_DEFAULT_MAX_USER_MARKS \ 41 + (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS) 42 + 43 + /* 44 + * Most of the memory cost of adding an inode mark is pinning the marked inode. 45 + * The size of the filesystem inode struct is not uniform across filesystems, 46 + * so double the size of a VFS inode is used as a conservative approximation. 
47 + */ 48 + #define INODE_MARK_COST (2 * sizeof(struct inode)) 49 + 50 + /* configurable via /proc/sys/fs/fanotify/ */ 51 + static int fanotify_max_queued_events __read_mostly; 52 + 53 + #ifdef CONFIG_SYSCTL 54 + 55 + #include <linux/sysctl.h> 56 + 57 + struct ctl_table fanotify_table[] = { 58 + { 59 + .procname = "max_user_groups", 60 + .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], 61 + .maxlen = sizeof(int), 62 + .mode = 0644, 63 + .proc_handler = proc_dointvec_minmax, 64 + .extra1 = SYSCTL_ZERO, 65 + }, 66 + { 67 + .procname = "max_user_marks", 68 + .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS], 69 + .maxlen = sizeof(int), 70 + .mode = 0644, 71 + .proc_handler = proc_dointvec_minmax, 72 + .extra1 = SYSCTL_ZERO, 73 + }, 74 + { 75 + .procname = "max_queued_events", 76 + .data = &fanotify_max_queued_events, 77 + .maxlen = sizeof(int), 78 + .mode = 0644, 79 + .proc_handler = proc_dointvec_minmax, 80 + .extra1 = SYSCTL_ZERO 81 + }, 82 + { } 83 + }; 84 + #endif /* CONFIG_SYSCTL */ 32 85 33 86 /* 34 87 * All flags that may be specified in parameter event_f_flags of fanotify_init. ··· 143 90 } 144 91 145 92 /* 93 + * Remove an hashed event from merge hash table. 94 + */ 95 + static void fanotify_unhash_event(struct fsnotify_group *group, 96 + struct fanotify_event *event) 97 + { 98 + assert_spin_locked(&group->notification_lock); 99 + 100 + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, 101 + group, event, fanotify_event_hash_bucket(group, event)); 102 + 103 + if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list))) 104 + return; 105 + 106 + hlist_del_init(&event->merge_list); 107 + } 108 + 109 + /* 146 110 * Get an fanotify notification event if one exists and is small 147 111 * enough to fit in "count". Return an error pointer if the count 148 112 * is not large enough. 
When permission event is dequeued, its state is ··· 170 100 { 171 101 size_t event_size = FAN_EVENT_METADATA_LEN; 172 102 struct fanotify_event *event = NULL; 103 + struct fsnotify_event *fsn_event; 173 104 unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); 174 105 175 106 pr_debug("%s: group=%p count=%zd\n", __func__, group, count); 176 107 177 108 spin_lock(&group->notification_lock); 178 - if (fsnotify_notify_queue_is_empty(group)) 109 + fsn_event = fsnotify_peek_first_event(group); 110 + if (!fsn_event) 179 111 goto out; 180 112 181 - if (fid_mode) { 182 - event_size += fanotify_event_info_len(fid_mode, 183 - FANOTIFY_E(fsnotify_peek_first_event(group))); 184 - } 113 + event = FANOTIFY_E(fsn_event); 114 + if (fid_mode) 115 + event_size += fanotify_event_info_len(fid_mode, event); 185 116 186 117 if (event_size > count) { 187 118 event = ERR_PTR(-EINVAL); 188 119 goto out; 189 120 } 190 - event = FANOTIFY_E(fsnotify_remove_first_event(group)); 121 + 122 + /* 123 + * Held the notification_lock the whole time, so this is the 124 + * same event we peeked above. 125 + */ 126 + fsnotify_remove_first_event(group); 191 127 if (fanotify_is_perm_event(event->mask)) 192 128 FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; 129 + if (fanotify_is_hashed_event(event->mask)) 130 + fanotify_unhash_event(group, event); 193 131 out: 194 132 spin_unlock(&group->notification_lock); 195 133 return event; ··· 419 341 metadata.reserved = 0; 420 342 metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; 421 343 metadata.pid = pid_vnr(event->pid); 344 + /* 345 + * For an unprivileged listener, event->pid can be used to identify the 346 + * events generated by the listener process itself, without disclosing 347 + * the pids of other processes. 
348 + */ 349 + if (!capable(CAP_SYS_ADMIN) && 350 + task_tgid(current) != event->pid) 351 + metadata.pid = 0; 422 352 423 353 if (path && path->mnt && path->dentry) { 424 354 fd = create_fd(group, path, &f); ··· 659 573 static int fanotify_release(struct inode *ignored, struct file *file) 660 574 { 661 575 struct fsnotify_group *group = file->private_data; 576 + struct fsnotify_event *fsn_event; 662 577 663 578 /* 664 579 * Stop new events from arriving in the notification queue. since ··· 688 601 * dequeue them and set the response. They will be freed once the 689 602 * response is consumed and fanotify_get_response() returns. 690 603 */ 691 - while (!fsnotify_notify_queue_is_empty(group)) { 692 - struct fanotify_event *event; 604 + while ((fsn_event = fsnotify_remove_first_event(group))) { 605 + struct fanotify_event *event = FANOTIFY_E(fsn_event); 693 606 694 - event = FANOTIFY_E(fsnotify_remove_first_event(group)); 695 607 if (!(event->mask & FANOTIFY_PERM_EVENTS)) { 696 608 spin_unlock(&group->notification_lock); 697 - fsnotify_destroy_event(group, &event->fse); 609 + fsnotify_destroy_event(group, fsn_event); 698 610 } else { 699 611 finish_permission_event(group, FANOTIFY_PERM(event), 700 612 FAN_ALLOW); ··· 908 822 unsigned int type, 909 823 __kernel_fsid_t *fsid) 910 824 { 825 + struct ucounts *ucounts = group->fanotify_data.ucounts; 911 826 struct fsnotify_mark *mark; 912 827 int ret; 913 828 914 - if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks) 829 + /* 830 + * Enforce per user marks limits per user in all containing user ns. 831 + * A group with FAN_UNLIMITED_MARKS does not contribute to mark count 832 + * in the limited groups account. 
833 + */ 834 + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) && 835 + !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) 915 836 return ERR_PTR(-ENOSPC); 916 837 917 838 mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); 918 - if (!mark) 919 - return ERR_PTR(-ENOMEM); 839 + if (!mark) { 840 + ret = -ENOMEM; 841 + goto out_dec_ucounts; 842 + } 920 843 921 844 fsnotify_init_mark(mark, group); 922 845 ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid); 923 846 if (ret) { 924 847 fsnotify_put_mark(mark); 925 - return ERR_PTR(ret); 848 + goto out_dec_ucounts; 926 849 } 927 850 928 851 return mark; 852 + 853 + out_dec_ucounts: 854 + if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS)) 855 + dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS); 856 + return ERR_PTR(ret); 929 857 } 930 858 931 859 ··· 1019 919 return &oevent->fse; 1020 920 } 1021 921 922 + static struct hlist_head *fanotify_alloc_merge_hash(void) 923 + { 924 + struct hlist_head *hash; 925 + 926 + hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS, 927 + GFP_KERNEL_ACCOUNT); 928 + if (!hash) 929 + return NULL; 930 + 931 + __hash_init(hash, FANOTIFY_HTABLE_SIZE); 932 + 933 + return hash; 934 + } 935 + 1022 936 /* fanotify syscalls */ 1023 937 SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) 1024 938 { 1025 939 struct fsnotify_group *group; 1026 940 int f_flags, fd; 1027 - struct user_struct *user; 1028 941 unsigned int fid_mode = flags & FANOTIFY_FID_BITS; 1029 942 unsigned int class = flags & FANOTIFY_CLASS_BITS; 1030 943 1031 944 pr_debug("%s: flags=%x event_f_flags=%x\n", 1032 945 __func__, flags, event_f_flags); 1033 946 1034 - if (!capable(CAP_SYS_ADMIN)) 1035 - return -EPERM; 947 + if (!capable(CAP_SYS_ADMIN)) { 948 + /* 949 + * An unprivileged user can setup an fanotify group with 950 + * limited functionality - an unprivileged group is limited to 951 + * notification events with file handles and it cannot use 952 + * unlimited queue/marks. 
953 + */ 954 + if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode) 955 + return -EPERM; 956 + } 1036 957 1037 958 #ifdef CONFIG_AUDITSYSCALL 1038 959 if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) ··· 1084 963 if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) 1085 964 return -EINVAL; 1086 965 1087 - user = get_current_user(); 1088 - if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { 1089 - free_uid(user); 1090 - return -EMFILE; 1091 - } 1092 - 1093 966 f_flags = O_RDWR | FMODE_NONOTIFY; 1094 967 if (flags & FAN_CLOEXEC) 1095 968 f_flags |= O_CLOEXEC; ··· 1093 978 /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ 1094 979 group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops); 1095 980 if (IS_ERR(group)) { 1096 - free_uid(user); 1097 981 return PTR_ERR(group); 1098 982 } 1099 983 1100 - group->fanotify_data.user = user; 984 + /* Enforce groups limits per user in all containing user ns */ 985 + group->fanotify_data.ucounts = inc_ucount(current_user_ns(), 986 + current_euid(), 987 + UCOUNT_FANOTIFY_GROUPS); 988 + if (!group->fanotify_data.ucounts) { 989 + fd = -EMFILE; 990 + goto out_destroy_group; 991 + } 992 + 1101 993 group->fanotify_data.flags = flags; 1102 - atomic_inc(&user->fanotify_listeners); 1103 994 group->memcg = get_mem_cgroup_from_mm(current->mm); 995 + 996 + group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); 997 + if (!group->fanotify_data.merge_hash) { 998 + fd = -ENOMEM; 999 + goto out_destroy_group; 1000 + } 1104 1001 1105 1002 group->overflow_event = fanotify_alloc_overflow_event(); 1106 1003 if (unlikely(!group->overflow_event)) { ··· 1146 1019 goto out_destroy_group; 1147 1020 group->max_events = UINT_MAX; 1148 1021 } else { 1149 - group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS; 1022 + group->max_events = fanotify_max_queued_events; 1150 1023 } 1151 1024 1152 1025 if (flags & FAN_UNLIMITED_MARKS) { 1153 1026 fd = -EPERM; 1154 1027 if 
(!capable(CAP_SYS_ADMIN)) 1155 1028 goto out_destroy_group; 1156 - group->fanotify_data.max_marks = UINT_MAX; 1157 - } else { 1158 - group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS; 1159 1029 } 1160 1030 1161 1031 if (flags & FAN_ENABLE_AUDIT) { ··· 1250 1126 __func__, fanotify_fd, flags, dfd, pathname, mask); 1251 1127 1252 1128 /* we only use the lower 32 bits as of right now. */ 1253 - if (mask & ((__u64)0xffffffff << 32)) 1129 + if (upper_32_bits(mask)) 1254 1130 return -EINVAL; 1255 1131 1256 1132 if (flags & ~FANOTIFY_MARK_FLAGS) ··· 1303 1179 if (unlikely(f.file->f_op != &fanotify_fops)) 1304 1180 goto fput_and_out; 1305 1181 group = f.file->private_data; 1182 + 1183 + /* 1184 + * An unprivileged user is not allowed to watch a mount point nor 1185 + * a filesystem. 1186 + */ 1187 + ret = -EPERM; 1188 + if (!capable(CAP_SYS_ADMIN) && 1189 + mark_type != FAN_MARK_INODE) 1190 + goto fput_and_out; 1306 1191 1307 1192 /* 1308 1193 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not ··· 1445 1312 */ 1446 1313 static int __init fanotify_user_setup(void) 1447 1314 { 1315 + struct sysinfo si; 1316 + int max_marks; 1317 + 1318 + si_meminfo(&si); 1319 + /* 1320 + * Allow up to 1% of addressable memory to be accounted for per user 1321 + * marks limited to the range [8192, 1048576]. mount and sb marks are 1322 + * a lot cheaper than inode marks, but there is no reason for a user 1323 + * to have many of those, so calculate by the cost of inode marks. 
1324 + */ 1325 + max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) / 1326 + INODE_MARK_COST; 1327 + max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS, 1328 + FANOTIFY_DEFAULT_MAX_USER_MARKS); 1329 + 1448 1330 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); 1449 1331 BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); 1450 1332 ··· 1473 1325 fanotify_perm_event_cachep = 1474 1326 KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); 1475 1327 } 1328 + 1329 + fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; 1330 + init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = 1331 + FANOTIFY_DEFAULT_MAX_GROUPS; 1332 + init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks; 1476 1333 1477 1334 return 0; 1478 1335 }
+2 -1
fs/notify/fdinfo.c
··· 144 144 struct fsnotify_group *group = f->private_data; 145 145 146 146 seq_printf(m, "fanotify flags:%x event-flags:%x\n", 147 - group->fanotify_data.flags, group->fanotify_data.f_flags); 147 + group->fanotify_data.flags, 148 + group->fanotify_data.f_flags); 148 149 149 150 show_fdinfo(m, f, fanotify_fdinfo); 150 151 }
-1
fs/notify/group.c
··· 122 122 123 123 /* set to 0 when there a no external references to this group */ 124 124 refcount_set(&group->refcnt, 1); 125 - atomic_set(&group->num_marks, 0); 126 125 atomic_set(&group->user_waits, 0); 127 126 128 127 spin_lock_init(&group->notification_lock);
+5 -4
fs/notify/inotify/inotify_fsnotify.c
··· 46 46 return false; 47 47 } 48 48 49 - static int inotify_merge(struct list_head *list, 50 - struct fsnotify_event *event) 49 + static int inotify_merge(struct fsnotify_group *group, 50 + struct fsnotify_event *event) 51 51 { 52 + struct list_head *list = &group->notification_list; 52 53 struct fsnotify_event *last_event; 53 54 54 55 last_event = list_entry(list->prev, struct fsnotify_event, list); ··· 108 107 mask &= ~IN_ISDIR; 109 108 110 109 fsn_event = &event->fse; 111 - fsnotify_init_event(fsn_event, 0); 110 + fsnotify_init_event(fsn_event); 112 111 event->mask = mask; 113 112 event->wd = i_mark->wd; 114 113 event->sync_cookie = cookie; ··· 116 115 if (len) 117 116 strcpy(event->name, name->name); 118 117 119 - ret = fsnotify_add_event(group, fsn_event, inotify_merge); 118 + ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL); 120 119 if (ret) { 121 120 /* Our event wasn't used in the end. Free it. */ 122 121 fsnotify_destroy_event(group, fsn_event);
+3 -4
fs/notify/inotify/inotify_user.c
··· 146 146 size_t event_size = sizeof(struct inotify_event); 147 147 struct fsnotify_event *event; 148 148 149 - if (fsnotify_notify_queue_is_empty(group)) 150 - return NULL; 151 - 152 149 event = fsnotify_peek_first_event(group); 150 + if (!event) 151 + return NULL; 153 152 154 153 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 155 154 ··· 641 642 return ERR_PTR(-ENOMEM); 642 643 } 643 644 group->overflow_event = &oevent->fse; 644 - fsnotify_init_event(group->overflow_event, 0); 645 + fsnotify_init_event(group->overflow_event); 645 646 oevent->mask = FS_Q_OVERFLOW; 646 647 oevent->wd = -1; 647 648 oevent->sync_cookie = 0;
-4
fs/notify/mark.c
··· 391 391 list_del_init(&mark->g_list); 392 392 spin_unlock(&mark->lock); 393 393 394 - atomic_dec(&group->num_marks); 395 - 396 394 /* Drop mark reference acquired in fsnotify_add_mark_locked() */ 397 395 fsnotify_put_mark(mark); 398 396 } ··· 654 656 mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; 655 657 656 658 list_add(&mark->g_list, &group->marks_list); 657 - atomic_inc(&group->num_marks); 658 659 fsnotify_get_mark(mark); /* for g_list */ 659 660 spin_unlock(&mark->lock); 660 661 ··· 671 674 FSNOTIFY_MARK_FLAG_ATTACHED); 672 675 list_del_init(&mark->g_list); 673 676 spin_unlock(&mark->lock); 674 - atomic_dec(&group->num_marks); 675 677 676 678 fsnotify_put_mark(mark); 677 679 return ret;
+38 -34
fs/notify/notification.c
··· 47 47 } 48 48 EXPORT_SYMBOL_GPL(fsnotify_get_cookie); 49 49 50 - /* return true if the notify queue is empty, false otherwise */ 51 - bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) 52 - { 53 - assert_spin_locked(&group->notification_lock); 54 - return list_empty(&group->notification_list) ? true : false; 55 - } 56 - 57 50 void fsnotify_destroy_event(struct fsnotify_group *group, 58 51 struct fsnotify_event *event) 59 52 { ··· 68 75 } 69 76 70 77 /* 71 - * Add an event to the group notification queue. The group can later pull this 72 - * event off the queue to deal with. The function returns 0 if the event was 73 - * added to the queue, 1 if the event was merged with some other queued event, 78 + * Try to add an event to the notification queue. 79 + * The group can later pull this event off the queue to deal with. 80 + * The group can use the @merge hook to merge the event with a queued event. 81 + * The group can use the @insert hook to insert the event into hash table. 82 + * The function returns: 83 + * 0 if the event was added to a queue 84 + * 1 if the event was merged with some other queued event 74 85 * 2 if the event was not queued - either the queue of events has overflown 75 86 * or the group is shutting down.
76 87 */ 77 88 int fsnotify_add_event(struct fsnotify_group *group, 78 89 struct fsnotify_event *event, 79 - int (*merge)(struct list_head *, 80 - struct fsnotify_event *)) 90 + int (*merge)(struct fsnotify_group *, 91 + struct fsnotify_event *), 92 + void (*insert)(struct fsnotify_group *, 93 + struct fsnotify_event *)) 81 94 { 82 95 int ret = 0; 83 96 struct list_head *list = &group->notification_list; ··· 110 111 } 111 112 112 113 if (!list_empty(list) && merge) { 113 - ret = merge(list, event); 114 + ret = merge(group, event); 114 115 if (ret) { 115 116 spin_unlock(&group->notification_lock); 116 117 return ret; ··· 120 121 queue: 121 122 group->q_len++; 122 123 list_add_tail(&event->list, list); 124 + if (insert) 125 + insert(group, event); 123 126 spin_unlock(&group->notification_lock); 124 127 125 128 wake_up(&group->notification_waitq); ··· 142 141 } 143 142 144 143 /* 145 - * Remove and return the first event from the notification list. It is the 146 - * responsibility of the caller to destroy the obtained event 147 - */ 148 - struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) 149 - { 150 - struct fsnotify_event *event; 151 - 152 - assert_spin_locked(&group->notification_lock); 153 - 154 - pr_debug("%s: group=%p\n", __func__, group); 155 - 156 - event = list_first_entry(&group->notification_list, 157 - struct fsnotify_event, list); 158 - fsnotify_remove_queued_event(group, event); 159 - return event; 160 - } 161 - 162 - /* 163 - * This will not remove the event, that must be done with 164 - * fsnotify_remove_first_event() 144 + * Return the first event on the notification list without removing it. 145 + * Returns NULL if the list is empty. 
165 146 */ 166 147 struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group) 167 148 { 168 149 assert_spin_locked(&group->notification_lock); 169 150 151 + if (fsnotify_notify_queue_is_empty(group)) 152 + return NULL; 153 + 170 154 return list_first_entry(&group->notification_list, 171 155 struct fsnotify_event, list); 156 + } 157 + 158 + /* 159 + * Remove and return the first event from the notification list. It is the 160 + * responsibility of the caller to destroy the obtained event 161 + */ 162 + struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group) 163 + { 164 + struct fsnotify_event *event = fsnotify_peek_first_event(group); 165 + 166 + if (!event) 167 + return NULL; 168 + 169 + pr_debug("%s: group=%p event=%p\n", __func__, group, event); 170 + 171 + fsnotify_remove_queued_event(group, event); 172 + 173 + return event; 172 174 } 173 175 174 176 /*
+1 -4
fs/zonefs/super.c
··· 1177 1177 struct super_block *sb = dentry->d_sb; 1178 1178 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 1179 1179 enum zonefs_ztype t; 1180 - u64 fsid; 1181 1180 1182 1181 buf->f_type = ZONEFS_MAGIC; 1183 1182 buf->f_bsize = sb->s_blocksize; ··· 1199 1200 1200 1201 spin_unlock(&sbi->s_lock); 1201 1202 1202 - fsid = le64_to_cpup((void *)sbi->s_uuid.b) ^ 1203 - le64_to_cpup((void *)sbi->s_uuid.b + sizeof(u64)); 1204 - buf->f_fsid = u64_to_fsid(fsid); 1203 + buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b); 1205 1204 1206 1205 return 0; 1207 1206 }
+31 -5
include/linux/fanotify.h
··· 2 2 #ifndef _LINUX_FANOTIFY_H 3 3 #define _LINUX_FANOTIFY_H 4 4 5 + #include <linux/sysctl.h> 5 6 #include <uapi/linux/fanotify.h> 7 + 8 + extern struct ctl_table fanotify_table[]; /* for sysctl */ 6 9 7 10 #define FAN_GROUP_FLAG(group, flag) \ 8 11 ((group)->fanotify_data.flags & (flag)) ··· 18 15 * these constant, the programs may break if re-compiled with new uapi headers 19 16 * and then run on an old kernel. 20 17 */ 21 - #define FANOTIFY_CLASS_BITS (FAN_CLASS_NOTIF | FAN_CLASS_CONTENT | \ 18 + 19 + /* Group classes where permission events are allowed */ 20 + #define FANOTIFY_PERM_CLASSES (FAN_CLASS_CONTENT | \ 22 21 FAN_CLASS_PRE_CONTENT) 22 + 23 + #define FANOTIFY_CLASS_BITS (FAN_CLASS_NOTIF | FANOTIFY_PERM_CLASSES) 23 24 24 25 #define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DFID_NAME) 25 26 26 - #define FANOTIFY_INIT_FLAGS (FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \ 27 - FAN_REPORT_TID | \ 28 - FAN_CLOEXEC | FAN_NONBLOCK | \ 29 - FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS) 27 + /* 28 + * fanotify_init() flags that require CAP_SYS_ADMIN. 29 + * We do not allow unprivileged groups to request permission events. 30 + * We do not allow unprivileged groups to get other process pid in events. 31 + * We do not allow unprivileged groups to use unlimited resources. 32 + */ 33 + #define FANOTIFY_ADMIN_INIT_FLAGS (FANOTIFY_PERM_CLASSES | \ 34 + FAN_REPORT_TID | \ 35 + FAN_UNLIMITED_QUEUE | \ 36 + FAN_UNLIMITED_MARKS) 37 + 38 + /* 39 + * fanotify_init() flags that are allowed for user without CAP_SYS_ADMIN. 40 + * FAN_CLASS_NOTIF is the only class we allow for unprivileged group. 41 + * We do not allow unprivileged groups to get file descriptors in events, 42 + * so one of the flags for reporting file handles is required.
43 + */ 44 + #define FANOTIFY_USER_INIT_FLAGS (FAN_CLASS_NOTIF | \ 45 + FANOTIFY_FID_BITS | \ 46 + FAN_CLOEXEC | FAN_NONBLOCK) 47 + 48 + #define FANOTIFY_INIT_FLAGS (FANOTIFY_ADMIN_INIT_FLAGS | \ 49 + FANOTIFY_USER_INIT_FLAGS) 30 50 31 51 #define FANOTIFY_MARK_TYPE_BITS (FAN_MARK_INODE | FAN_MARK_MOUNT | \ 32 52 FAN_MARK_FILESYSTEM)
+16 -13
include/linux/fsnotify_backend.h
··· 167 167 */ 168 168 struct fsnotify_event { 169 169 struct list_head list; 170 - unsigned long objectid; /* identifier for queue merges */ 171 170 }; 172 171 173 172 /* ··· 206 207 207 208 /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ 208 209 struct mutex mark_mutex; /* protect marks_list */ 209 - atomic_t num_marks; /* 1 for each mark and 1 for not being 210 - * past the point of no return when freeing 211 - * a group */ 212 210 atomic_t user_waits; /* Number of tasks waiting for user 213 211 * response */ 214 212 struct list_head marks_list; /* all inode marks for this group */ ··· 230 234 #endif 231 235 #ifdef CONFIG_FANOTIFY 232 236 struct fanotify_group_private_data { 237 + /* Hash table of events for merge */ 238 + struct hlist_head *merge_hash; 233 239 /* allows a group to block waiting for a userspace response */ 234 240 struct list_head access_list; 235 241 wait_queue_head_t access_waitq; 236 242 int flags; /* flags from fanotify_init() */ 237 243 int f_flags; /* event_f_flags from fanotify_init() */ 238 - unsigned int max_marks; 239 - struct user_struct *user; 244 + struct ucounts *ucounts; 240 245 } fanotify_data; 241 246 #endif /* CONFIG_FANOTIFY */ 242 247 }; ··· 484 487 /* attach the event to the group notification queue */ 485 488 extern int fsnotify_add_event(struct fsnotify_group *group, 486 489 struct fsnotify_event *event, 487 - int (*merge)(struct list_head *, 488 - struct fsnotify_event *)); 490 + int (*merge)(struct fsnotify_group *, 491 + struct fsnotify_event *), 492 + void (*insert)(struct fsnotify_group *, 493 + struct fsnotify_event *)); 489 494 /* Queue overflow event to a notification group */ 490 495 static inline void fsnotify_queue_overflow(struct fsnotify_group *group) 491 496 { 492 - fsnotify_add_event(group, group->overflow_event, NULL); 497 + fsnotify_add_event(group, group->overflow_event, NULL, NULL); 493 498 } 494 499 495 - /* true if the group notification queue is empty */ 500 
+ static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group) 501 + { 502 + assert_spin_locked(&group->notification_lock); 503 + 504 + return list_empty(&group->notification_list); 505 + } 506 + 496 507 extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); 497 508 /* return, but do not dequeue the first event on the notification queue */ 498 509 extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); ··· 581 576 extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); 582 577 extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); 583 578 584 - static inline void fsnotify_init_event(struct fsnotify_event *event, 585 - unsigned long objectid) 579 + static inline void fsnotify_init_event(struct fsnotify_event *event) 586 580 { 587 581 INIT_LIST_HEAD(&event->list); 588 - event->objectid = objectid; 589 582 } 590 583 591 584 #else
-3
include/linux/sched/user.h
··· 14 14 refcount_t __count; /* reference count */ 15 15 atomic_t processes; /* How many processes does this user have? */ 16 16 atomic_t sigpending; /* How many pending signals does this user have? */ 17 - #ifdef CONFIG_FANOTIFY 18 - atomic_t fanotify_listeners; 19 - #endif 20 17 #ifdef CONFIG_EPOLL 21 18 atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ 22 19 #endif
+8
include/linux/statfs.h
··· 4 4 5 5 #include <linux/types.h> 6 6 #include <asm/statfs.h> 7 + #include <asm/byteorder.h> 7 8 8 9 struct kstatfs { 9 10 long f_type; ··· 49 48 static inline __kernel_fsid_t u64_to_fsid(u64 v) 50 49 { 51 50 return (__kernel_fsid_t){.val = {(u32)v, (u32)(v>>32)}}; 51 + } 52 + 53 + /* Fold 16 bytes uuid to 64 bit fsid */ 54 + static inline __kernel_fsid_t uuid_to_fsid(__u8 *uuid) 55 + { 56 + return u64_to_fsid(le64_to_cpup((void *)uuid) ^ 57 + le64_to_cpup((void *)(uuid + sizeof(u64)))); 52 58 } 53 59 54 60 #endif
+4
include/linux/user_namespace.h
··· 50 50 UCOUNT_INOTIFY_INSTANCES, 51 51 UCOUNT_INOTIFY_WATCHES, 52 52 #endif 53 + #ifdef CONFIG_FANOTIFY 54 + UCOUNT_FANOTIFY_GROUPS, 55 + UCOUNT_FANOTIFY_MARKS, 56 + #endif 53 57 UCOUNT_COUNTS, 54 58 }; 55 59
+11 -1
kernel/sysctl.c
··· 148 148 #ifdef CONFIG_INOTIFY_USER 149 149 #include <linux/inotify.h> 150 150 #endif 151 + #ifdef CONFIG_FANOTIFY 152 + #include <linux/fanotify.h> 153 + #endif 151 154 152 155 #ifdef CONFIG_PROC_SYSCTL 153 156 ··· 3167 3164 .mode = 0555, 3168 3165 .child = inotify_table, 3169 3166 }, 3170 - #endif 3167 + #endif 3168 + #ifdef CONFIG_FANOTIFY 3169 + { 3170 + .procname = "fanotify", 3171 + .mode = 0555, 3172 + .child = fanotify_table, 3173 + }, 3174 + #endif 3171 3175 #ifdef CONFIG_EPOLL 3172 3176 { 3173 3177 .procname = "epoll",
+4
kernel/ucount.c
··· 74 74 UCOUNT_ENTRY("max_inotify_instances"), 75 75 UCOUNT_ENTRY("max_inotify_watches"), 76 76 #endif 77 + #ifdef CONFIG_FANOTIFY 78 + UCOUNT_ENTRY("max_fanotify_groups"), 79 + UCOUNT_ENTRY("max_fanotify_marks"), 80 + #endif 77 81 { } 78 82 }; 79 83 #endif /* CONFIG_SYSCTL */
+3
mm/shmem.c
··· 2846 2846 buf->f_ffree = sbinfo->free_inodes; 2847 2847 } 2848 2848 /* else leave those fields 0 like simple_statfs */ 2849 + 2850 + buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); 2851 + 2849 2852 return 0; 2850 2853 } 2851 2854