Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fuse: allow batching of FORGET requests

Terje Malmedal reports that a fuse filesystem with 32 million inodes
on a machine with lots of memory can take up to 30 minutes to process
FORGET requests when all those inodes are evicted from the icache.

To solve this, create a BATCH_FORGET request that allows up to about
8000 FORGET requests to be sent in a single message.

This request is only sent if userspace supports interface version 7.16
or later; otherwise the kernel falls back to sending individual FORGET
messages.

Reported-by: Terje Malmedal <terje.malmedal@usit.uio.no>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>

+97 -14
+81 -11
fs/fuse/dev.c
··· 254 254 void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, 255 255 u64 nodeid, u64 nlookup) 256 256 { 257 - forget->nodeid = nodeid; 258 - forget->nlookup = nlookup; 257 + forget->forget_one.nodeid = nodeid; 258 + forget->forget_one.nlookup = nlookup; 259 259 260 260 spin_lock(&fc->lock); 261 261 fc->forget_list_tail->next = forget; ··· 974 974 return err ? err : reqsize; 975 975 } 976 976 977 - static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc) 977 + static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, 978 + unsigned max, 979 + unsigned *countp) 978 980 { 979 - struct fuse_forget_link *forget = fc->forget_list_head.next; 981 + struct fuse_forget_link *head = fc->forget_list_head.next; 982 + struct fuse_forget_link **newhead = &head; 983 + unsigned count; 980 984 981 - fc->forget_list_head.next = forget->next; 985 + for (count = 0; *newhead != NULL && count < max; count++) 986 + newhead = &(*newhead)->next; 987 + 988 + fc->forget_list_head.next = *newhead; 989 + *newhead = NULL; 982 990 if (fc->forget_list_head.next == NULL) 983 991 fc->forget_list_tail = &fc->forget_list_head; 984 992 985 - return forget; 993 + if (countp != NULL) 994 + *countp = count; 995 + 996 + return head; 986 997 } 987 998 988 999 static int fuse_read_single_forget(struct fuse_conn *fc, ··· 1002 991 __releases(fc->lock) 1003 992 { 1004 993 int err; 1005 - struct fuse_forget_link *forget = dequeue_forget(fc); 994 + struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); 1006 995 struct fuse_forget_in arg = { 1007 - .nlookup = forget->nlookup, 996 + .nlookup = forget->forget_one.nlookup, 1008 997 }; 1009 998 struct fuse_in_header ih = { 1010 999 .opcode = FUSE_FORGET, 1011 - .nodeid = forget->nodeid, 1000 + .nodeid = forget->forget_one.nodeid, 1012 1001 .unique = fuse_get_unique(fc), 1013 1002 .len = sizeof(ih) + sizeof(arg), 1014 1003 }; ··· 1027 1016 return err; 1028 1017 1029 1018 return ih.len; 1019 + } 1020 + 1021 + 
static int fuse_read_batch_forget(struct fuse_conn *fc, 1022 + struct fuse_copy_state *cs, size_t nbytes) 1023 + __releases(fc->lock) 1024 + { 1025 + int err; 1026 + unsigned max_forgets; 1027 + unsigned count; 1028 + struct fuse_forget_link *head; 1029 + struct fuse_batch_forget_in arg = { .count = 0 }; 1030 + struct fuse_in_header ih = { 1031 + .opcode = FUSE_BATCH_FORGET, 1032 + .unique = fuse_get_unique(fc), 1033 + .len = sizeof(ih) + sizeof(arg), 1034 + }; 1035 + 1036 + if (nbytes < ih.len) { 1037 + spin_unlock(&fc->lock); 1038 + return -EINVAL; 1039 + } 1040 + 1041 + max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); 1042 + head = dequeue_forget(fc, max_forgets, &count); 1043 + spin_unlock(&fc->lock); 1044 + 1045 + arg.count = count; 1046 + ih.len += count * sizeof(struct fuse_forget_one); 1047 + err = fuse_copy_one(cs, &ih, sizeof(ih)); 1048 + if (!err) 1049 + err = fuse_copy_one(cs, &arg, sizeof(arg)); 1050 + 1051 + while (head) { 1052 + struct fuse_forget_link *forget = head; 1053 + 1054 + if (!err) { 1055 + err = fuse_copy_one(cs, &forget->forget_one, 1056 + sizeof(forget->forget_one)); 1057 + } 1058 + head = forget->next; 1059 + kfree(forget); 1060 + } 1061 + 1062 + fuse_copy_finish(cs); 1063 + 1064 + if (err) 1065 + return err; 1066 + 1067 + return ih.len; 1068 + } 1069 + 1070 + static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, 1071 + size_t nbytes) 1072 + __releases(fc->lock) 1073 + { 1074 + if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) 1075 + return fuse_read_single_forget(fc, cs, nbytes); 1076 + else 1077 + return fuse_read_batch_forget(fc, cs, nbytes); 1030 1078 } 1031 1079 1032 1080 /* ··· 1128 1058 1129 1059 if (forget_pending(fc)) { 1130 1060 if (list_empty(&fc->pending) || fc->forget_batch-- > 0) 1131 - return fuse_read_single_forget(fc, cs, nbytes); 1061 + return fuse_read_forget(fc, cs, nbytes); 1132 1062 1133 1063 if (fc->forget_batch <= -8) 1134 1064 fc->forget_batch = 16; ··· 1907 
1837 end_requests(fc, &fc->pending); 1908 1838 end_requests(fc, &fc->processing); 1909 1839 while (forget_pending(fc)) 1910 - kfree(dequeue_forget(fc)); 1840 + kfree(dequeue_forget(fc, 1, NULL)); 1911 1841 } 1912 1842 1913 1843 /*
+1 -2
fs/fuse/fuse_i.h
··· 55 55 56 56 /* One forget request */ 57 57 struct fuse_forget_link { 58 - u64 nodeid; 59 - u64 nlookup; 58 + struct fuse_forget_one forget_one; 60 59 struct fuse_forget_link *next; 61 60 }; 62 61
+15 -1
include/linux/fuse.h
··· 41 41 * 7.15 42 42 * - add store notify 43 43 * - add retrieve notify 44 + * 45 + * 7.16 46 + * - add BATCH_FORGET request 44 47 */ 45 48 46 49 #ifndef _LINUX_FUSE_H ··· 75 72 #define FUSE_KERNEL_VERSION 7 76 73 77 74 /** Minor version number of this interface */ 78 - #define FUSE_KERNEL_MINOR_VERSION 15 75 + #define FUSE_KERNEL_MINOR_VERSION 16 79 76 80 77 /** The node ID of the root inode */ 81 78 #define FUSE_ROOT_ID 1 ··· 259 256 FUSE_IOCTL = 39, 260 257 FUSE_POLL = 40, 261 258 FUSE_NOTIFY_REPLY = 41, 259 + FUSE_BATCH_FORGET = 42, 262 260 263 261 /* CUSE specific operations */ 264 262 CUSE_INIT = 4096, ··· 292 288 293 289 struct fuse_forget_in { 294 290 __u64 nlookup; 291 + }; 292 + 293 + struct fuse_forget_one { 294 + __u64 nodeid; 295 + __u64 nlookup; 296 + }; 297 + 298 + struct fuse_batch_forget_in { 299 + __u32 count; 300 + __u32 dummy; 295 301 }; 296 302 297 303 struct fuse_getattr_in {