Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

fuse: separate queue for FORGET requests

Terje Malmedal reports that a fuse filesystem with 32 million inodes
on a machine with lots of memory can go unresponsive for up to 30
minutes when all those inodes are evicted from the icache.

The reason is that FORGET messages, sent when the inode is evicted,
are queued up together with regular filesystem requests, and while the
huge queue of FORGET messages is processed no other filesystem
operation can proceed.

Since a full fuse request structure is allocated for each inode, these
take up quite a bit of memory as well.

To solve these issues, create a slim 'fuse_forget_link' structure
containing just the minimum of information required to send the FORGET
request and chain these on a separate queue.

When userspace is asking for a request, make sure that FORGET and
non-FORGET requests are selected fairly: for each 8 non-FORGET
requests, allow 16 FORGET requests. This will make sure FORGETs do not
pile up, yet other requests are also allowed to proceed while the
queued FORGETs are processed.

Reported-by: Terje Malmedal <terje.malmedal@usit.uio.no>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>

+133 -64
+77 -9
fs/fuse/dev.c
··· 251 251 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 252 252 } 253 253 254 + void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, 255 + u64 nodeid, u64 nlookup) 256 + { 257 + forget->nodeid = nodeid; 258 + forget->nlookup = nlookup; 259 + 260 + spin_lock(&fc->lock); 261 + fc->forget_list_tail->next = forget; 262 + fc->forget_list_tail = forget; 263 + wake_up(&fc->waitq); 264 + kill_fasync(&fc->fasync, SIGIO, POLL_IN); 265 + spin_unlock(&fc->lock); 266 + } 267 + 254 268 static void flush_bg_queue(struct fuse_conn *fc) 255 269 { 256 270 while (fc->active_background < fc->max_background && ··· 450 436 req->out.h.error = -ENOTCONN; 451 437 request_end(fc, req); 452 438 } 453 - } 454 - 455 - void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) 456 - { 457 - req->isreply = 0; 458 - fuse_request_send_nowait(fc, req); 459 439 } 460 440 461 441 void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) ··· 904 896 return err; 905 897 } 906 898 899 + static int forget_pending(struct fuse_conn *fc) 900 + { 901 + return fc->forget_list_head.next != NULL; 902 + } 903 + 907 904 static int request_pending(struct fuse_conn *fc) 908 905 { 909 - return !list_empty(&fc->pending) || !list_empty(&fc->interrupts); 906 + return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || 907 + forget_pending(fc); 910 908 } 911 909 912 910 /* Wait until a request is available on the pending list */ ··· 974 960 return err ? 
err : reqsize; 975 961 } 976 962 963 + static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc) 964 + { 965 + struct fuse_forget_link *forget = fc->forget_list_head.next; 966 + 967 + fc->forget_list_head.next = forget->next; 968 + if (fc->forget_list_head.next == NULL) 969 + fc->forget_list_tail = &fc->forget_list_head; 970 + 971 + return forget; 972 + } 973 + 974 + static int fuse_read_single_forget(struct fuse_conn *fc, 975 + struct fuse_copy_state *cs, 976 + size_t nbytes) 977 + __releases(fc->lock) 978 + { 979 + int err; 980 + struct fuse_forget_link *forget = dequeue_forget(fc); 981 + struct fuse_forget_in arg = { 982 + .nlookup = forget->nlookup, 983 + }; 984 + struct fuse_in_header ih = { 985 + .opcode = FUSE_FORGET, 986 + .nodeid = forget->nodeid, 987 + .unique = fuse_get_unique(fc), 988 + .len = sizeof(ih) + sizeof(arg), 989 + }; 990 + 991 + spin_unlock(&fc->lock); 992 + kfree(forget); 993 + if (nbytes < ih.len) 994 + return -EINVAL; 995 + 996 + err = fuse_copy_one(cs, &ih, sizeof(ih)); 997 + if (!err) 998 + err = fuse_copy_one(cs, &arg, sizeof(arg)); 999 + fuse_copy_finish(cs); 1000 + 1001 + if (err) 1002 + return err; 1003 + 1004 + return ih.len; 1005 + } 1006 + 977 1007 /* 978 1008 * Read a single request into the userspace filesystem's buffer. 
This 979 1009 * function waits until a request is available, then removes it from ··· 1054 996 req = list_entry(fc->interrupts.next, struct fuse_req, 1055 997 intr_entry); 1056 998 return fuse_read_interrupt(fc, cs, nbytes, req); 999 + } 1000 + 1001 + if (forget_pending(fc)) { 1002 + if (list_empty(&fc->pending) || fc->forget_batch-- > 0) 1003 + return fuse_read_single_forget(fc, cs, nbytes); 1004 + 1005 + if (fc->forget_batch <= -8) 1006 + fc->forget_batch = 16; 1057 1007 } 1058 1008 1059 1009 req = list_entry(fc->pending.next, struct fuse_req, list); ··· 1156 1090 if (!fc) 1157 1091 return -EPERM; 1158 1092 1159 - bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1093 + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); 1160 1094 if (!bufs) 1161 1095 return -ENOMEM; 1162 1096 ··· 1692 1626 if (!fc) 1693 1627 return -EPERM; 1694 1628 1695 - bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); 1629 + bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); 1696 1630 if (!bufs) 1697 1631 return -ENOMEM; 1698 1632 ··· 1836 1770 flush_bg_queue(fc); 1837 1771 end_requests(fc, &fc->pending); 1838 1772 end_requests(fc, &fc->processing); 1773 + while (forget_pending(fc)) 1774 + kfree(dequeue_forget(fc)); 1839 1775 } 1840 1776 1841 1777 /*
+26 -27
fs/fuse/dir.c
··· 10 10 11 11 #include <linux/pagemap.h> 12 12 #include <linux/file.h> 13 - #include <linux/gfp.h> 14 13 #include <linux/sched.h> 15 14 #include <linux/namei.h> 15 + #include <linux/slab.h> 16 16 17 17 #if BITS_PER_LONG >= 64 18 18 static inline void fuse_dentry_settime(struct dentry *entry, u64 time) ··· 165 165 struct fuse_entry_out outarg; 166 166 struct fuse_conn *fc; 167 167 struct fuse_req *req; 168 - struct fuse_req *forget_req; 168 + struct fuse_forget_link *forget; 169 169 struct dentry *parent; 170 170 u64 attr_version; 171 171 ··· 178 178 if (IS_ERR(req)) 179 179 return 0; 180 180 181 - forget_req = fuse_get_req(fc); 182 - if (IS_ERR(forget_req)) { 181 + forget = fuse_alloc_forget(); 182 + if (!forget) { 183 183 fuse_put_request(fc, req); 184 184 return 0; 185 185 } ··· 199 199 if (!err) { 200 200 struct fuse_inode *fi = get_fuse_inode(inode); 201 201 if (outarg.nodeid != get_node_id(inode)) { 202 - fuse_send_forget(fc, forget_req, 203 - outarg.nodeid, 1); 202 + fuse_queue_forget(fc, forget, outarg.nodeid, 1); 204 203 return 0; 205 204 } 206 205 spin_lock(&fc->lock); 207 206 fi->nlookup++; 208 207 spin_unlock(&fc->lock); 209 208 } 210 - fuse_put_request(fc, forget_req); 209 + kfree(forget); 211 210 if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT) 212 211 return 0; 213 212 ··· 258 259 { 259 260 struct fuse_conn *fc = get_fuse_conn_super(sb); 260 261 struct fuse_req *req; 261 - struct fuse_req *forget_req; 262 + struct fuse_forget_link *forget; 262 263 u64 attr_version; 263 264 int err; 264 265 ··· 272 273 if (IS_ERR(req)) 273 274 goto out; 274 275 275 - forget_req = fuse_get_req(fc); 276 - err = PTR_ERR(forget_req); 277 - if (IS_ERR(forget_req)) { 276 + forget = fuse_alloc_forget(); 277 + err = -ENOMEM; 278 + if (!forget) { 278 279 fuse_put_request(fc, req); 279 280 goto out; 280 281 } ··· 300 301 attr_version); 301 302 err = -ENOMEM; 302 303 if (!*inode) { 303 - fuse_send_forget(fc, forget_req, outarg->nodeid, 1); 304 + fuse_queue_forget(fc, 
forget, outarg->nodeid, 1); 304 305 goto out; 305 306 } 306 307 err = 0; 307 308 308 309 out_put_forget: 309 - fuse_put_request(fc, forget_req); 310 + kfree(forget); 310 311 out: 311 312 return err; 312 313 } ··· 373 374 struct inode *inode; 374 375 struct fuse_conn *fc = get_fuse_conn(dir); 375 376 struct fuse_req *req; 376 - struct fuse_req *forget_req; 377 + struct fuse_forget_link *forget; 377 378 struct fuse_create_in inarg; 378 379 struct fuse_open_out outopen; 379 380 struct fuse_entry_out outentry; ··· 387 388 if (flags & O_DIRECT) 388 389 return -EINVAL; 389 390 390 - forget_req = fuse_get_req(fc); 391 - if (IS_ERR(forget_req)) 392 - return PTR_ERR(forget_req); 391 + forget = fuse_alloc_forget(); 392 + if (!forget) 393 + return -ENOMEM; 393 394 394 395 req = fuse_get_req(fc); 395 396 err = PTR_ERR(req); ··· 447 448 if (!inode) { 448 449 flags &= ~(O_CREAT | O_EXCL | O_TRUNC); 449 450 fuse_sync_release(ff, flags); 450 - fuse_send_forget(fc, forget_req, outentry.nodeid, 1); 451 + fuse_queue_forget(fc, forget, outentry.nodeid, 1); 451 452 return -ENOMEM; 452 453 } 453 - fuse_put_request(fc, forget_req); 454 + kfree(forget); 454 455 d_instantiate(entry, inode); 455 456 fuse_change_entry_timeout(entry, &outentry); 456 457 fuse_invalidate_attr(dir); ··· 468 469 out_put_request: 469 470 fuse_put_request(fc, req); 470 471 out_put_forget_req: 471 - fuse_put_request(fc, forget_req); 472 + kfree(forget); 472 473 return err; 473 474 } 474 475 ··· 482 483 struct fuse_entry_out outarg; 483 484 struct inode *inode; 484 485 int err; 485 - struct fuse_req *forget_req; 486 + struct fuse_forget_link *forget; 486 487 487 - forget_req = fuse_get_req(fc); 488 - if (IS_ERR(forget_req)) { 488 + forget = fuse_alloc_forget(); 489 + if (!forget) { 489 490 fuse_put_request(fc, req); 490 - return PTR_ERR(forget_req); 491 + return -ENOMEM; 491 492 } 492 493 493 494 memset(&outarg, 0, sizeof(outarg)); ··· 514 515 inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, 515 516 
&outarg.attr, entry_attr_timeout(&outarg), 0); 516 517 if (!inode) { 517 - fuse_send_forget(fc, forget_req, outarg.nodeid, 1); 518 + fuse_queue_forget(fc, forget, outarg.nodeid, 1); 518 519 return -ENOMEM; 519 520 } 520 - fuse_put_request(fc, forget_req); 521 + kfree(forget); 521 522 522 523 if (S_ISDIR(inode->i_mode)) { 523 524 struct dentry *alias; ··· 540 541 return 0; 541 542 542 543 out_put_forget_req: 543 - fuse_put_request(fc, forget_req); 544 + kfree(forget); 544 545 return err; 545 546 } 546 547
+19 -9
fs/fuse/fuse_i.h
··· 53 53 extern unsigned max_user_bgreq; 54 54 extern unsigned max_user_congthresh; 55 55 56 + /* One forget request */ 57 + struct fuse_forget_link { 58 + u64 nodeid; 59 + u64 nlookup; 60 + struct fuse_forget_link *next; 61 + }; 62 + 56 63 /** FUSE inode */ 57 64 struct fuse_inode { 58 65 /** Inode data */ ··· 73 66 u64 nlookup; 74 67 75 68 /** The request used for sending the FORGET message */ 76 - struct fuse_req *forget_req; 69 + struct fuse_forget_link *forget; 77 70 78 71 /** Time in jiffies until the file attributes are valid */ 79 72 u64 i_time; ··· 262 255 263 256 /** Data for asynchronous requests */ 264 257 union { 265 - struct fuse_forget_in forget_in; 266 258 struct { 267 259 struct fuse_release_in in; 268 260 struct path path; ··· 374 368 375 369 /** Pending interrupts */ 376 370 struct list_head interrupts; 371 + 372 + /** Queue of pending forgets */ 373 + struct fuse_forget_link forget_list_head; 374 + struct fuse_forget_link *forget_list_tail; 375 + 376 + /** Batching of FORGET requests (positive indicates FORGET batch) */ 377 + int forget_batch; 377 378 378 379 /** Flag indicating if connection is blocked. This will be 379 380 the case before the INIT reply is received, and if there ··· 556 543 /** 557 544 * Send FORGET command 558 545 */ 559 - void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 560 - u64 nodeid, u64 nlookup); 546 + void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, 547 + u64 nodeid, u64 nlookup); 548 + 549 + struct fuse_forget_link *fuse_alloc_forget(void); 561 550 562 551 /** 563 552 * Initialize READ or READDIR request ··· 669 654 * Send a request (synchronous) 670 655 */ 671 656 void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); 672 - 673 - /** 674 - * Send a request with no reply 675 - */ 676 - void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); 677 657 678 658 /** 679 659 * Send a request in the background
+11 -19
fs/fuse/inode.c
··· 71 71 unsigned blksize; 72 72 }; 73 73 74 + struct fuse_forget_link *fuse_alloc_forget() 75 + { 76 + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); 77 + } 78 + 74 79 static struct inode *fuse_alloc_inode(struct super_block *sb) 75 80 { 76 81 struct inode *inode; ··· 95 90 INIT_LIST_HEAD(&fi->queued_writes); 96 91 INIT_LIST_HEAD(&fi->writepages); 97 92 init_waitqueue_head(&fi->page_waitq); 98 - fi->forget_req = fuse_request_alloc(); 99 - if (!fi->forget_req) { 93 + fi->forget = fuse_alloc_forget(); 94 + if (!fi->forget) { 100 95 kmem_cache_free(fuse_inode_cachep, inode); 101 96 return NULL; 102 97 } ··· 109 104 struct fuse_inode *fi = get_fuse_inode(inode); 110 105 BUG_ON(!list_empty(&fi->write_files)); 111 106 BUG_ON(!list_empty(&fi->queued_writes)); 112 - if (fi->forget_req) 113 - fuse_request_free(fi->forget_req); 107 + kfree(fi->forget); 114 108 kmem_cache_free(fuse_inode_cachep, inode); 115 - } 116 - 117 - void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, 118 - u64 nodeid, u64 nlookup) 119 - { 120 - struct fuse_forget_in *inarg = &req->misc.forget_in; 121 - inarg->nlookup = nlookup; 122 - req->in.h.opcode = FUSE_FORGET; 123 - req->in.h.nodeid = nodeid; 124 - req->in.numargs = 1; 125 - req->in.args[0].size = sizeof(struct fuse_forget_in); 126 - req->in.args[0].value = inarg; 127 - fuse_request_send_noreply(fc, req); 128 109 } 129 110 130 111 static void fuse_evict_inode(struct inode *inode) ··· 120 129 if (inode->i_sb->s_flags & MS_ACTIVE) { 121 130 struct fuse_conn *fc = get_fuse_conn(inode); 122 131 struct fuse_inode *fi = get_fuse_inode(inode); 123 - fuse_send_forget(fc, fi->forget_req, fi->nodeid, fi->nlookup); 124 - fi->forget_req = NULL; 132 + fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); 133 + fi->forget = NULL; 125 134 } 126 135 } 127 136 ··· 525 534 INIT_LIST_HEAD(&fc->interrupts); 526 535 INIT_LIST_HEAD(&fc->bg_queue); 527 536 INIT_LIST_HEAD(&fc->entry); 537 + fc->forget_list_tail = &fc->forget_list_head; 
528 538 atomic_set(&fc->num_waiting, 0); 529 539 fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; 530 540 fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;