Merge tag 'ceph-for-5.13-rc1' of git://github.com/ceph/ceph-client

+1

fs/ceph/Kconfig

··· 6 6 select LIBCRC32C 7 7 select CRYPTO_AES 8 8 select CRYPTO 9 + select NETFS_SUPPORT 9 10 default n 10 11 help 11 12 Choose Y or M here to include support for mounting the

+272 -390

fs/ceph/addr.c

··· 12 12 #include <linux/signal.h> 13 13 #include <linux/iversion.h> 14 14 #include <linux/ktime.h> 15 + #include <linux/netfs.h> 15 16 16 17 #include "super.h" 17 18 #include "mds_client.h" ··· 61 60 #define CONGESTION_OFF_THRESH(congestion_kb) \ 62 61 (CONGESTION_ON_THRESH(congestion_kb) - \ 63 62 (CONGESTION_ON_THRESH(congestion_kb) >> 2)) 63 + 64 + static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, 65 + struct page *page, void **_fsdata); 64 66 65 67 static inline struct ceph_snap_context *page_snap_context(struct page *page) 66 68 { ··· 128 124 * PagePrivate so that we get invalidatepage callback. 129 125 */ 130 126 BUG_ON(PagePrivate(page)); 131 - page->private = (unsigned long)snapc; 132 - SetPagePrivate(page); 127 + attach_page_private(page, snapc); 133 128 134 129 ret = __set_page_dirty_nobuffers(page); 135 130 WARN_ON(!PageLocked(page)); ··· 147 144 { 148 145 struct inode *inode; 149 146 struct ceph_inode_info *ci; 150 - struct ceph_snap_context *snapc = page_snap_context(page); 147 + struct ceph_snap_context *snapc; 148 + 149 + wait_on_page_fscache(page); 151 150 152 151 inode = page->mapping->host; 153 152 ci = ceph_inode(inode); 154 153 155 - if (offset != 0 || length != PAGE_SIZE) { 154 + if (offset != 0 || length != thp_size(page)) { 156 155 dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", 157 156 inode, page, page->index, offset, length); 158 157 return; 159 158 } 160 - 161 - ceph_invalidate_fscache_page(inode, page); 162 159 163 160 WARN_ON(!PageLocked(page)); 164 161 if (!PagePrivate(page)) ··· 167 164 dout("%p invalidatepage %p idx %lu full dirty page\n", 168 165 inode, page, page->index); 169 166 167 + snapc = detach_page_private(page); 170 168 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 171 169 ceph_put_snap_context(snapc); 172 - page->private = 0; 173 - ClearPagePrivate(page); 174 170 } 175 171 176 - static int ceph_releasepage(struct page *page, gfp_t g) 172 + static int ceph_releasepage(struct page *page, gfp_t gfp) 177 173 { 178 174 dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host, 179 175 page, page->index, PageDirty(page) ? "" : "not "); 180 176 181 - /* Can we release the page from the cache? */ 182 - if (!ceph_release_fscache_page(page, g)) 183 - return 0; 184 - 177 + if (PageFsCache(page)) { 178 + if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) 179 + return 0; 180 + wait_on_page_fscache(page); 181 + } 185 182 return !PagePrivate(page); 186 183 } 187 184 188 - /* read a single page, without unlocking it. */ 189 - static int ceph_do_readpage(struct file *filp, struct page *page) 185 + static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) 190 186 { 191 - struct inode *inode = file_inode(filp); 187 + struct inode *inode = rreq->mapping->host; 188 + struct ceph_inode_info *ci = ceph_inode(inode); 189 + struct ceph_file_layout *lo = &ci->i_layout; 190 + u32 blockoff; 191 + u64 blockno; 192 + 193 + /* Expand the start downward */ 194 + blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff); 195 + rreq->start = blockno * lo->stripe_unit; 196 + rreq->len += blockoff; 197 + 198 + /* Now, round up the length to the next block */ 199 + rreq->len = roundup(rreq->len, lo->stripe_unit); 200 + } 201 + 202 + static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq) 203 + { 204 + struct inode *inode = subreq->rreq->mapping->host; 205 + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 206 + struct ceph_inode_info *ci = ceph_inode(inode); 207 + u64 objno, objoff; 208 + u32 xlen; 209 + 210 + /* Truncate the extent at the end of the current block */ 211 + ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, 212 + &objno, &objoff, &xlen); 213 + subreq->len = min(xlen, fsc->mount_options->rsize); 214 + return true; 215 + } 216 + 217 + static void finish_netfs_read(struct ceph_osd_request *req) 218 + { 219 + struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); 220 + struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); 221 + struct netfs_read_subrequest *subreq = req->r_priv; 222 + int num_pages; 223 + int err = req->r_result; 224 + 225 + ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, 226 + req->r_end_latency, err); 227 + 228 + dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result, 229 + subreq->len, i_size_read(req->r_inode)); 230 + 231 + /* no object means success but no data */ 232 + if (err == -ENOENT) 233 + err = 0; 234 + else if (err == -EBLOCKLISTED) 235 + fsc->blocklisted = true; 236 + 237 + if (err >= 0 && err < subreq->len) 238 + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); 239 + 240 + netfs_subreq_terminated(subreq, err, true); 241 + 242 + num_pages = calc_pages_for(osd_data->alignment, osd_data->length); 243 + ceph_put_page_vector(osd_data->pages, num_pages, false); 244 + iput(req->r_inode); 245 + } 246 + 247 + static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) 248 + { 249 + struct netfs_read_request *rreq = subreq->rreq; 250 + struct inode *inode = rreq->mapping->host; 192 251 struct ceph_inode_info *ci = ceph_inode(inode); 193 252 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 194 - struct ceph_osd_client *osdc = &fsc->client->osdc; 195 253 struct ceph_osd_request *req; 196 254 struct ceph_vino vino = ceph_vino(inode); 255 + struct iov_iter iter; 256 + struct page **pages; 257 + size_t page_off; 197 258 int err = 0; 198 - u64 off = page_offset(page); 199 - u64 len = PAGE_SIZE; 259 + u64 len = subreq->len; 200 260 201 - if (off >= i_size_read(inode)) { 202 - zero_user_segment(page, 0, PAGE_SIZE); 203 - SetPageUptodate(page); 204 - return 0; 261 + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, 262 + 0, 1, CEPH_OSD_OP_READ, 263 + CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, 264 + NULL, ci->i_truncate_seq, ci->i_truncate_size, false); 265 + if (IS_ERR(req)) { 266 + err = PTR_ERR(req); 267 + req = NULL; 268 + goto out; 205 269 } 270 + 271 + dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); 272 + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); 273 + err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off); 274 + if (err < 0) { 275 + dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); 276 + goto out; 277 + } 278 + 279 + /* should always give us a page-aligned read */ 280 + WARN_ON_ONCE(page_off); 281 + len = err; 282 + 283 + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); 284 + req->r_callback = finish_netfs_read; 285 + req->r_priv = subreq; 286 + req->r_inode = inode; 287 + ihold(inode); 288 + 289 + err = ceph_osdc_start_request(req->r_osdc, req, false); 290 + if (err) 291 + iput(inode); 292 + out: 293 + ceph_osdc_put_request(req); 294 + if (err) 295 + netfs_subreq_terminated(subreq, err, false); 296 + dout("%s: result %d\n", __func__, err); 297 + } 298 + 299 + static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file) 300 + { 301 + } 302 + 303 + static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) 304 + { 305 + struct inode *inode = mapping->host; 306 + struct ceph_inode_info *ci = ceph_inode(inode); 307 + int got = (uintptr_t)priv; 308 + 309 + if (got) 310 + ceph_put_cap_refs(ci, got); 311 + } 312 + 313 + const struct netfs_read_request_ops ceph_netfs_read_ops = { 314 + .init_rreq = ceph_init_rreq, 315 + .is_cache_enabled = ceph_is_cache_enabled, 316 + .begin_cache_operation = ceph_begin_cache_operation, 317 + .issue_op = ceph_netfs_issue_op, 318 + .expand_readahead = ceph_netfs_expand_readahead, 319 + .clamp_length = ceph_netfs_clamp_length, 320 + .check_write_begin = ceph_netfs_check_write_begin, 321 + .cleanup = ceph_readahead_cleanup, 322 + }; 323 + 324 + /* read a single page, without unlocking it. */ 325 + static int ceph_readpage(struct file *file, struct page *page) 326 + { 327 + struct inode *inode = file_inode(file); 328 + struct ceph_inode_info *ci = ceph_inode(inode); 329 + struct ceph_vino vino = ceph_vino(inode); 330 + u64 off = page_offset(page); 331 + u64 len = thp_size(page); 206 332 207 333 if (ci->i_inline_version != CEPH_INLINE_NONE) { 208 334 /* 209 335 * Uptodate inline data should have been added 210 336 * into page cache while getting Fcr caps. 211 337 */ 212 - if (off == 0) 338 + if (off == 0) { 339 + unlock_page(page); 213 340 return -EINVAL; 214 - zero_user_segment(page, 0, PAGE_SIZE); 341 + } 342 + zero_user_segment(page, 0, thp_size(page)); 215 343 SetPageUptodate(page); 344 + unlock_page(page); 216 345 return 0; 217 346 } 218 347 219 - err = ceph_readpage_from_fscache(inode, page); 220 - if (err == 0) 221 - return -EINPROGRESS; 222 - 223 348 dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", 224 - vino.ino, vino.snap, filp, off, len, page, page->index); 225 - req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1, 226 - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 227 - ci->i_truncate_seq, ci->i_truncate_size, 228 - false); 229 - if (IS_ERR(req)) 230 - return PTR_ERR(req); 349 + vino.ino, vino.snap, file, off, len, page, page->index); 231 350 232 - osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); 233 - 234 - err = ceph_osdc_start_request(osdc, req, false); 235 - if (!err) 236 - err = ceph_osdc_wait_request(osdc, req); 237 - 238 - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, 239 - req->r_end_latency, err); 240 - 241 - ceph_osdc_put_request(req); 242 - dout("readpage result %d\n", err); 243 - 244 - if (err == -ENOENT) 245 - err = 0; 246 - if (err < 0) { 247 - ceph_fscache_readpage_cancel(inode, page); 248 - if (err == -EBLOCKLISTED) 249 - fsc->blocklisted = true; 250 - goto out; 251 - } 252 - if (err < PAGE_SIZE) 253 - /* zero fill remainder of page */ 254 - zero_user_segment(page, err, PAGE_SIZE); 255 - else 256 - flush_dcache_page(page); 257 - 258 - SetPageUptodate(page); 259 - ceph_readpage_to_fscache(inode, page); 260 - 261 - out: 262 - return err < 0 ? err : 0; 351 + return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL); 263 352 } 264 353 265 - static int ceph_readpage(struct file *filp, struct page *page) 354 + static void ceph_readahead(struct readahead_control *ractl) 266 355 { 267 - int r = ceph_do_readpage(filp, page); 268 - if (r != -EINPROGRESS) 269 - unlock_page(page); 270 - else 271 - r = 0; 272 - return r; 273 - } 274 - 275 - /* 276 - * Finish an async read(ahead) op. 277 - */ 278 - static void finish_read(struct ceph_osd_request *req) 279 - { 280 - struct inode *inode = req->r_inode; 281 - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 282 - struct ceph_osd_data *osd_data; 283 - int rc = req->r_result <= 0 ? req->r_result : 0; 284 - int bytes = req->r_result >= 0 ? req->r_result : 0; 285 - int num_pages; 286 - int i; 287 - 288 - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); 289 - if (rc == -EBLOCKLISTED) 290 - ceph_inode_to_client(inode)->blocklisted = true; 291 - 292 - /* unlock all pages, zeroing any data we didn't read */ 293 - osd_data = osd_req_op_extent_osd_data(req, 0); 294 - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); 295 - num_pages = calc_pages_for((u64)osd_data->alignment, 296 - (u64)osd_data->length); 297 - for (i = 0; i < num_pages; i++) { 298 - struct page *page = osd_data->pages[i]; 299 - 300 - if (rc < 0 && rc != -ENOENT) { 301 - ceph_fscache_readpage_cancel(inode, page); 302 - goto unlock; 303 - } 304 - if (bytes < (int)PAGE_SIZE) { 305 - /* zero (remainder of) page */ 306 - int s = bytes < 0 ? 0 : bytes; 307 - zero_user_segment(page, s, PAGE_SIZE); 308 - } 309 - dout("finish_read %p uptodate %p idx %lu\n", inode, page, 310 - page->index); 311 - flush_dcache_page(page); 312 - SetPageUptodate(page); 313 - ceph_readpage_to_fscache(inode, page); 314 - unlock: 315 - unlock_page(page); 316 - put_page(page); 317 - bytes -= PAGE_SIZE; 318 - } 319 - 320 - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, 321 - req->r_end_latency, rc); 322 - 323 - kfree(osd_data->pages); 324 - } 325 - 326 - /* 327 - * start an async read(ahead) operation. return nr_pages we submitted 328 - * a read for on success, or negative error code. 329 - */ 330 - static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, 331 - struct list_head *page_list, int max) 332 - { 333 - struct ceph_osd_client *osdc = 334 - &ceph_inode_to_client(inode)->client->osdc; 335 - struct ceph_inode_info *ci = ceph_inode(inode); 336 - struct page *page = lru_to_page(page_list); 337 - struct ceph_vino vino; 338 - struct ceph_osd_request *req; 339 - u64 off; 340 - u64 len; 341 - int i; 342 - struct page **pages; 343 - pgoff_t next_index; 344 - int nr_pages = 0; 356 + struct inode *inode = file_inode(ractl->file); 357 + struct ceph_file_info *fi = ractl->file->private_data; 358 + struct ceph_rw_context *rw_ctx; 345 359 int got = 0; 346 360 int ret = 0; 347 361 348 - if (!rw_ctx) { 349 - /* caller of readpages does not hold buffer and read caps 350 - * (fadvise, madvise and readahead cases) */ 351 - int want = CEPH_CAP_FILE_CACHE; 352 - ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, 353 - true, &got); 354 - if (ret < 0) { 355 - dout("start_read %p, error getting cap\n", inode); 356 - } else if (!(got & want)) { 357 - dout("start_read %p, no cache cap\n", inode); 358 - ret = 0; 359 - } 360 - if (ret <= 0) { 361 - if (got) 362 - ceph_put_cap_refs(ci, got); 363 - while (!list_empty(page_list)) { 364 - page = lru_to_page(page_list); 365 - list_del(&page->lru); 366 - put_page(page); 367 - } 368 - return ret; 369 - } 370 - } 371 - 372 - off = (u64) page_offset(page); 373 - 374 - /* count pages */ 375 - next_index = page->index; 376 - list_for_each_entry_reverse(page, page_list, lru) { 377 - if (page->index != next_index) 378 - break; 379 - nr_pages++; 380 - next_index++; 381 - if (max && nr_pages == max) 382 - break; 383 - } 384 - len = nr_pages << PAGE_SHIFT; 385 - dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, 386 - off, len); 387 - vino = ceph_vino(inode); 388 - req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 389 - 0, 1, CEPH_OSD_OP_READ, 390 - CEPH_OSD_FLAG_READ, NULL, 391 - ci->i_truncate_seq, ci->i_truncate_size, 392 - false); 393 - if (IS_ERR(req)) { 394 - ret = PTR_ERR(req); 395 - goto out; 396 - } 397 - 398 - /* build page vector */ 399 - nr_pages = calc_pages_for(0, len); 400 - pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); 401 - if (!pages) { 402 - ret = -ENOMEM; 403 - goto out_put; 404 - } 405 - for (i = 0; i < nr_pages; ++i) { 406 - page = list_entry(page_list->prev, struct page, lru); 407 - BUG_ON(PageLocked(page)); 408 - list_del(&page->lru); 409 - 410 - dout("start_read %p adding %p idx %lu\n", inode, page, 411 - page->index); 412 - if (add_to_page_cache_lru(page, &inode->i_data, page->index, 413 - GFP_KERNEL)) { 414 - ceph_fscache_uncache_page(inode, page); 415 - put_page(page); 416 - dout("start_read %p add_to_page_cache failed %p\n", 417 - inode, page); 418 - nr_pages = i; 419 - if (nr_pages > 0) { 420 - len = nr_pages << PAGE_SHIFT; 421 - osd_req_op_extent_update(req, 0, len); 422 - break; 423 - } 424 - goto out_pages; 425 - } 426 - pages[i] = page; 427 - } 428 - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); 429 - req->r_callback = finish_read; 430 - req->r_inode = inode; 431 - 432 - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); 433 - ret = ceph_osdc_start_request(osdc, req, false); 434 - if (ret < 0) 435 - goto out_pages; 436 - ceph_osdc_put_request(req); 437 - 438 - /* After adding locked pages to page cache, the inode holds cache cap. 439 - * So we can drop our cap refs. */ 440 - if (got) 441 - ceph_put_cap_refs(ci, got); 442 - 443 - return nr_pages; 444 - 445 - out_pages: 446 - for (i = 0; i < nr_pages; ++i) { 447 - ceph_fscache_readpage_cancel(inode, pages[i]); 448 - unlock_page(pages[i]); 449 - } 450 - ceph_put_page_vector(pages, nr_pages, false); 451 - out_put: 452 - ceph_osdc_put_request(req); 453 - out: 454 - if (got) 455 - ceph_put_cap_refs(ci, got); 456 - return ret; 457 - } 458 - 459 - 460 - /* 461 - * Read multiple pages. Leave pages we don't read + unlock in page_list; 462 - * the caller (VM) cleans them up. 463 - */ 464 - static int ceph_readpages(struct file *file, struct address_space *mapping, 465 - struct list_head *page_list, unsigned nr_pages) 466 - { 467 - struct inode *inode = file_inode(file); 468 - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 469 - struct ceph_file_info *fi = file->private_data; 470 - struct ceph_rw_context *rw_ctx; 471 - int rc = 0; 472 - int max = 0; 473 - 474 362 if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) 475 - return -EINVAL; 476 - 477 - rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, 478 - &nr_pages); 479 - 480 - if (rc == 0) 481 - goto out; 363 + return; 482 364 483 365 rw_ctx = ceph_find_rw_context(fi); 484 - max = fsc->mount_options->rsize >> PAGE_SHIFT; 485 - dout("readpages %p file %p ctx %p nr_pages %d max %d\n", 486 - inode, file, rw_ctx, nr_pages, max); 487 - while (!list_empty(page_list)) { 488 - rc = start_read(inode, rw_ctx, page_list, max); 489 - if (rc < 0) 490 - goto out; 491 - } 492 - out: 493 - ceph_fscache_readpages_cancel(inode, page_list); 366 + if (!rw_ctx) { 367 + /* 368 + * readahead callers do not necessarily hold Fcb caps 369 + * (e.g. fadvise, madvise). 370 + */ 371 + int want = CEPH_CAP_FILE_CACHE; 494 372 495 - dout("readpages %p file %p ret %d\n", inode, file, rc); 496 - return rc; 373 + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); 374 + if (ret < 0) 375 + dout("start_read %p, error getting cap\n", inode); 376 + else if (!(got & want)) 377 + dout("start_read %p, no cache cap\n", inode); 378 + 379 + if (ret <= 0) 380 + return; 381 + } 382 + netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got); 497 383 } 498 384 499 385 struct ceph_writeback_ctl ··· 477 585 spin_unlock(&ci->i_ceph_lock); 478 586 WARN_ON(!found); 479 587 } 480 - if (end > page_offset(page) + PAGE_SIZE) 481 - end = page_offset(page) + PAGE_SIZE; 588 + if (end > page_offset(page) + thp_size(page)) 589 + end = page_offset(page) + thp_size(page); 482 590 return end > start ? end - start : 0; 483 591 } 484 592 ··· 496 604 struct ceph_snap_context *snapc, *oldest; 497 605 loff_t page_off = page_offset(page); 498 606 int err; 499 - loff_t len = PAGE_SIZE; 607 + loff_t len = thp_size(page); 500 608 struct ceph_writeback_ctl ceph_wbc; 501 609 struct ceph_osd_client *osdc = &fsc->client->osdc; 502 610 struct ceph_osd_request *req; ··· 524 632 /* is this a partial page at end of file? */ 525 633 if (page_off >= ceph_wbc.i_size) { 526 634 dout("%p page eof %llu\n", page, ceph_wbc.i_size); 527 - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); 635 + page->mapping->a_ops->invalidatepage(page, 0, thp_size(page)); 528 636 return 0; 529 637 } 530 638 ··· 550 658 } 551 659 552 660 /* it may be a short write due to an object boundary */ 553 - WARN_ON_ONCE(len > PAGE_SIZE); 661 + WARN_ON_ONCE(len > thp_size(page)); 554 662 osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); 555 663 dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); 556 664 ··· 559 667 if (!err) 560 668 err = ceph_osdc_wait_request(osdc, req); 561 669 562 - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, 670 + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 563 671 req->r_end_latency, err); 564 672 565 673 ceph_osdc_put_request(req); ··· 587 695 dout("writepage cleaned page %p\n", page); 588 696 err = 0; /* vfs expects us to return 0 */ 589 697 } 590 - page->private = 0; 591 - ClearPagePrivate(page); 698 + oldest = detach_page_private(page); 699 + WARN_ON_ONCE(oldest != snapc); 592 700 end_page_writeback(page); 593 701 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 594 702 ceph_put_snap_context(snapc); /* page's reference */ ··· 647 755 ceph_clear_error_write(ci); 648 756 } 649 757 650 - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, 758 + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 651 759 req->r_end_latency, rc); 652 760 653 761 /* ··· 680 788 clear_bdi_congested(inode_to_bdi(inode), 681 789 BLK_RW_ASYNC); 682 790 683 - ceph_put_snap_context(page_snap_context(page)); 684 - page->private = 0; 685 - ClearPagePrivate(page); 686 - dout("unlocking %p\n", page); 791 + ceph_put_snap_context(detach_page_private(page)); 687 792 end_page_writeback(page); 793 + dout("unlocking %p\n", page); 688 794 689 795 if (remove_page) 690 796 generic_error_remove_page(inode->i_mapping, ··· 839 949 page_offset(page) >= i_size_read(inode)) && 840 950 clear_page_dirty_for_io(page)) 841 951 mapping->a_ops->invalidatepage(page, 842 - 0, PAGE_SIZE); 952 + 0, thp_size(page)); 843 953 unlock_page(page); 844 954 continue; 845 955 } ··· 928 1038 pages[locked_pages++] = page; 929 1039 pvec.pages[i] = NULL; 930 1040 931 - len += PAGE_SIZE; 1041 + len += thp_size(page); 932 1042 } 933 1043 934 1044 /* did we get anything? */ ··· 977 1087 BUG_ON(IS_ERR(req)); 978 1088 } 979 1089 BUG_ON(len < page_offset(pages[locked_pages - 1]) + 980 - PAGE_SIZE - offset); 1090 + thp_size(page) - offset); 981 1091 982 1092 req->r_callback = writepages_finish; 983 1093 req->r_inode = inode; ··· 1007 1117 } 1008 1118 1009 1119 set_page_writeback(pages[i]); 1010 - len += PAGE_SIZE; 1120 + len += thp_size(page); 1011 1121 } 1012 1122 1013 1123 if (ceph_wbc.size_stable) { ··· 1016 1126 /* writepages_finish() clears writeback pages 1017 1127 * according to the data length, so make sure 1018 1128 * data length covers all locked pages */ 1019 - u64 min_len = len + 1 - PAGE_SIZE; 1129 + u64 min_len = len + 1 - thp_size(page); 1020 1130 len = get_writepages_data_length(inode, pages[i - 1], 1021 1131 offset); 1022 1132 len = max(len, min_len); ··· 1192 1302 return NULL; 1193 1303 } 1194 1304 1305 + static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, 1306 + struct page *page, void **_fsdata) 1307 + { 1308 + struct inode *inode = file_inode(file); 1309 + struct ceph_inode_info *ci = ceph_inode(inode); 1310 + struct ceph_snap_context *snapc; 1311 + 1312 + snapc = ceph_find_incompatible(page); 1313 + if (snapc) { 1314 + int r; 1315 + 1316 + unlock_page(page); 1317 + put_page(page); 1318 + if (IS_ERR(snapc)) 1319 + return PTR_ERR(snapc); 1320 + 1321 + ceph_queue_writeback(inode); 1322 + r = wait_event_killable(ci->i_cap_wq, 1323 + context_is_writeable_or_written(inode, snapc)); 1324 + ceph_put_snap_context(snapc); 1325 + return r == 0 ? -EAGAIN : r; 1326 + } 1327 + return 0; 1328 + } 1329 + 1195 1330 /* 1196 1331 * We are only allowed to write into/dirty the page if the page is 1197 1332 * clean, or already dirty within the same snap context. ··· 1227 1312 { 1228 1313 struct inode *inode = file_inode(file); 1229 1314 struct ceph_inode_info *ci = ceph_inode(inode); 1230 - struct ceph_snap_context *snapc; 1231 1315 struct page *page = NULL; 1232 1316 pgoff_t index = pos >> PAGE_SHIFT; 1233 - int pos_in_page = pos & ~PAGE_MASK; 1234 - int r = 0; 1317 + int r; 1235 1318 1236 - dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); 1237 - 1238 - for (;;) { 1319 + /* 1320 + * Uninlining should have already been done and everything updated, EXCEPT 1321 + * for inline_version sent to the MDS. 1322 + */ 1323 + if (ci->i_inline_version != CEPH_INLINE_NONE) { 1239 1324 page = grab_cache_page_write_begin(mapping, index, flags); 1240 - if (!page) { 1241 - r = -ENOMEM; 1242 - break; 1243 - } 1325 + if (!page) 1326 + return -ENOMEM; 1244 1327 1245 - snapc = ceph_find_incompatible(page); 1246 - if (snapc) { 1247 - if (IS_ERR(snapc)) { 1248 - r = PTR_ERR(snapc); 1249 - break; 1328 + /* 1329 + * The inline_version on a new inode is set to 1. If that's the 1330 + * case, then the page is brand new and isn't yet Uptodate. 1331 + */ 1332 + r = 0; 1333 + if (index == 0 && ci->i_inline_version != 1) { 1334 + if (!PageUptodate(page)) { 1335 + WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n", 1336 + ci->i_inline_version); 1337 + r = -EINVAL; 1250 1338 } 1251 - unlock_page(page); 1252 - put_page(page); 1253 - page = NULL; 1254 - ceph_queue_writeback(inode); 1255 - r = wait_event_killable(ci->i_cap_wq, 1256 - context_is_writeable_or_written(inode, snapc)); 1257 - ceph_put_snap_context(snapc); 1258 - if (r != 0) 1259 - break; 1260 - continue; 1339 + goto out; 1261 1340 } 1262 - 1263 - if (PageUptodate(page)) { 1264 - dout(" page %p already uptodate\n", page); 1265 - break; 1266 - } 1267 - 1268 - /* 1269 - * In some cases we don't need to read at all: 1270 - * - full page write 1271 - * - write that lies completely beyond EOF 1272 - * - write that covers the the page from start to EOF or beyond it 1273 - */ 1274 - if ((pos_in_page == 0 && len == PAGE_SIZE) || 1275 - (pos >= i_size_read(inode)) || 1276 - (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) { 1277 - zero_user_segments(page, 0, pos_in_page, 1278 - pos_in_page + len, PAGE_SIZE); 1279 - break; 1280 - } 1281 - 1282 - /* 1283 - * We need to read it. If we get back -EINPROGRESS, then the page was 1284 - * handed off to fscache and it will be unlocked when the read completes. 1285 - * Refind the page in that case so we can reacquire the page lock. Otherwise 1286 - * we got a hard error or the read was completed synchronously. 1287 - */ 1288 - r = ceph_do_readpage(file, page); 1289 - if (r != -EINPROGRESS) 1290 - break; 1341 + zero_user_segment(page, 0, thp_size(page)); 1342 + SetPageUptodate(page); 1343 + goto out; 1291 1344 } 1292 1345 1346 + r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &page, NULL, 1347 + &ceph_netfs_read_ops, NULL); 1348 + out: 1349 + if (r == 0) 1350 + wait_on_page_fscache(page); 1293 1351 if (r < 0) { 1294 - if (page) { 1295 - unlock_page(page); 1352 + if (page) 1296 1353 put_page(page); 1297 - } 1298 1354 } else { 1355 + WARN_ON_ONCE(!PageLocked(page)); 1299 1356 *pagep = page; 1300 1357 } 1301 1358 return r; ··· 1325 1438 1326 1439 const struct address_space_operations ceph_aops = { 1327 1440 .readpage = ceph_readpage, 1328 - .readpages = ceph_readpages, 1441 + .readahead = ceph_readahead, 1329 1442 .writepage = ceph_writepage, 1330 1443 .writepages = ceph_writepages_start, 1331 1444 .write_begin = ceph_write_begin, ··· 1357 1470 struct inode *inode = file_inode(vma->vm_file); 1358 1471 struct ceph_inode_info *ci = ceph_inode(inode); 1359 1472 struct ceph_file_info *fi = vma->vm_file->private_data; 1360 - struct page *pinned_page = NULL; 1361 1473 loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT; 1362 1474 int want, got, err; 1363 1475 sigset_t oldset; ··· 1364 1478 1365 1479 ceph_block_sigs(&oldset); 1366 1480 1367 - dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", 1368 - inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); 1481 + dout("filemap_fault %p %llx.%llx %llu trying to get caps\n", 1482 + inode, ceph_vinop(inode), off); 1369 1483 if (fi->fmode & CEPH_FILE_MODE_LAZY) 1370 1484 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1371 1485 else 1372 1486 want = CEPH_CAP_FILE_CACHE; 1373 1487 1374 1488 got = 0; 1375 - err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, 1376 - &got, &pinned_page); 1489 + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got); 1377 1490 if (err < 0) 1378 1491 goto out_restore; 1379 1492 1380 - dout("filemap_fault %p %llu~%zd got cap refs on %s\n", 1381 - inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); 1493 + dout("filemap_fault %p %llu got cap refs on %s\n", 1494 + inode, off, ceph_cap_string(got)); 1382 1495 1383 1496 if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || 1384 1497 ci->i_inline_version == CEPH_INLINE_NONE) { ··· 1385 1500 ceph_add_rw_context(fi, &rw_ctx); 1386 1501 ret = filemap_fault(vmf); 1387 1502 ceph_del_rw_context(fi, &rw_ctx); 1388 - dout("filemap_fault %p %llu~%zd drop cap refs %s ret %x\n", 1389 - inode, off, (size_t)PAGE_SIZE, 1390 - ceph_cap_string(got), ret); 1503 + dout("filemap_fault %p %llu drop cap refs %s ret %x\n", 1504 + inode, off, ceph_cap_string(got), ret); 1391 1505 } else 1392 1506 err = -EAGAIN; 1393 1507 1394 - if (pinned_page) 1395 - put_page(pinned_page); 1396 1508 ceph_put_cap_refs(ci, got); 1397 1509 1398 1510 if (err != -EAGAIN) ··· 1424 1542 vmf->page = page; 1425 1543 ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; 1426 1544 out_inline: 1427 - dout("filemap_fault %p %llu~%zd read inline data ret %x\n", 1428 - inode, off, (size_t)PAGE_SIZE, ret); 1545 + dout("filemap_fault %p %llu read inline data ret %x\n", 1546 + inode, off, ret); 1429 1547 } 1430 1548 out_restore: 1431 1549 ceph_restore_sigs(&oldset); ··· 1435 1553 return ret; 1436 1554 } 1437 1555 1438 - /* 1439 - * Reuse write_begin here for simplicity. 1440 - */ 1441 1556 static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) 1442 1557 { 1443 1558 struct vm_area_struct *vma = vmf->vma; ··· 1470 1591 goto out_free; 1471 1592 } 1472 1593 1473 - if (off + PAGE_SIZE <= size) 1474 - len = PAGE_SIZE; 1594 + if (off + thp_size(page) <= size) 1595 + len = thp_size(page); 1475 1596 else 1476 - len = size & ~PAGE_MASK; 1597 + len = offset_in_thp(page, size); 1477 1598 1478 1599 dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", 1479 1600 inode, ceph_vinop(inode), off, len, size); ··· 1483 1604 want = CEPH_CAP_FILE_BUFFER; 1484 1605 1485 1606 got = 0; 1486 - err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, 1487 - &got, NULL); 1607 + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got); 1488 1608 if (err < 0) 1489 1609 goto out_free; 1490 1610 ··· 1710 1832 if (!err) 1711 1833 err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1712 1834 1713 - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, 1835 + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 1714 1836 req->r_end_latency, err); 1715 1837 1716 1838 out_put: ··· 1934 2056 struct ceph_string *pool_ns; 1935 2057 s64 pool; 1936 2058 int ret, flags; 2059 + 2060 + /* Only need to do this for regular files */ 2061 + if (!S_ISREG(inode->i_mode)) 2062 + return 0; 1937 2063 1938 2064 if (ci->i_vino.snap != CEPH_NOSNAP) { 1939 2065 /*

-125

fs/ceph/cache.c

··· 173 173 174 174 ci->fscache = NULL; 175 175 176 - fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); 177 176 fscache_relinquish_cookie(cookie, &ci->i_vino, false); 178 177 } 179 178 ··· 193 194 dout("fscache_file_set_cookie %p %p disabling cache\n", 194 195 inode, filp); 195 196 fscache_disable_cookie(ci->fscache, &ci->i_vino, false); 196 - fscache_uncache_all_inode_pages(ci->fscache, inode); 197 197 } else { 198 198 fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode), 199 199 ceph_fscache_can_enable, inode); ··· 201 203 inode, filp); 202 204 } 203 205 } 204 - } 205 - 206 - static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error) 207 - { 208 - if (!error) 209 - SetPageUptodate(page); 210 - 211 - unlock_page(page); 212 - } 213 - 214 - static inline bool cache_valid(struct ceph_inode_info *ci) 215 - { 216 - return ci->i_fscache_gen == ci->i_rdcache_gen; 217 - } 218 - 219 - 220 - /* Atempt to read from the fscache, 221 - * 222 - * This function is called from the readpage_nounlock context. DO NOT attempt to 223 - * unlock the page here (or in the callback). 224 - */ 225 - int ceph_readpage_from_fscache(struct inode *inode, struct page *page) 226 - { 227 - struct ceph_inode_info *ci = ceph_inode(inode); 228 - int ret; 229 - 230 - if (!cache_valid(ci)) 231 - return -ENOBUFS; 232 - 233 - ret = fscache_read_or_alloc_page(ci->fscache, page, 234 - ceph_readpage_from_fscache_complete, NULL, 235 - GFP_KERNEL); 236 - 237 - switch (ret) { 238 - case 0: /* Page found */ 239 - dout("page read submitted\n"); 240 - return 0; 241 - case -ENOBUFS: /* Pages were not found, and can't be */ 242 - case -ENODATA: /* Pages were not found */ 243 - dout("page/inode not in cache\n"); 244 - return ret; 245 - default: 246 - dout("%s: unknown error ret = %i\n", __func__, ret); 247 - return ret; 248 - } 249 - } 250 - 251 - int ceph_readpages_from_fscache(struct inode *inode, 252 - struct address_space *mapping, 253 - struct list_head *pages, 254 - unsigned *nr_pages) 255 - { 256 - struct ceph_inode_info *ci = ceph_inode(inode); 257 - int ret; 258 - 259 - if (!cache_valid(ci)) 260 - return -ENOBUFS; 261 - 262 - ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, 263 - ceph_readpage_from_fscache_complete, 264 - NULL, mapping_gfp_mask(mapping)); 265 - 266 - switch (ret) { 267 - case 0: /* All pages found */ 268 - dout("all-page read submitted\n"); 269 - return 0; 270 - case -ENOBUFS: /* Some pages were not found, and can't be */ 271 - case -ENODATA: /* some pages were not found */ 272 - dout("page/inode not in cache\n"); 273 - return ret; 274 - default: 275 - dout("%s: unknown error ret = %i\n", __func__, ret); 276 - return ret; 277 - } 278 - } 279 - 280 - void ceph_readpage_to_fscache(struct inode *inode, struct page *page) 281 - { 282 - struct ceph_inode_info *ci = ceph_inode(inode); 283 - int ret; 284 - 285 - if (!PageFsCache(page)) 286 - return; 287 - 288 - if (!cache_valid(ci)) 289 - return; 290 - 291 - ret = fscache_write_page(ci->fscache, page, i_size_read(inode), 292 - GFP_KERNEL); 293 - if (ret) 294 - fscache_uncache_page(ci->fscache, page); 295 - } 296 - 297 - void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) 298 - { 299 - struct ceph_inode_info *ci = ceph_inode(inode); 300 - 301 - if (!PageFsCache(page)) 302 - return; 303 - 304 - fscache_wait_on_page_write(ci->fscache, page); 305 - fscache_uncache_page(ci->fscache, page); 306 206 } 307 207 308 208 void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) ··· 224 328 __fscache_relinquish_cookie(fsc->fscache, NULL, false); 225 329 } 226 330 fsc->fscache = NULL; 227 - } 228 - 229 - /* 230 - * caller should hold CEPH_CAP_FILE_{RD,CACHE} 231 - */ 232 - void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) 233 - { 234 - if (cache_valid(ci)) 235 - return; 236 - 237 - /* resue i_truncate_mutex. There should be no pending 238 - * truncate while the caller holds CEPH_CAP_FILE_RD */ 239 - mutex_lock(&ci->i_truncate_mutex); 240 - if (!cache_valid(ci)) { 241 - if (fscache_check_consistency(ci->fscache, &ci->i_vino)) 242 - fscache_invalidate(ci->fscache); 243 - spin_lock(&ci->i_ceph_lock); 244 - ci->i_fscache_gen = ci->i_rdcache_gen; 245 - spin_unlock(&ci->i_ceph_lock); 246 - } 247 - mutex_unlock(&ci->i_truncate_mutex); 248 331 }

+26 -79

fs/ceph/cache.h

··· 9 9 #ifndef _CEPH_CACHE_H 10 10 #define _CEPH_CACHE_H 11 11 12 + #include <linux/netfs.h> 13 + 12 14 #ifdef CONFIG_CEPH_FSCACHE 13 15 14 16 extern struct fscache_netfs ceph_cache_netfs; ··· 31 29 struct address_space *mapping, 32 30 struct list_head *pages, 33 31 unsigned *nr_pages); 34 - void ceph_readpage_to_fscache(struct inode *inode, struct page *page); 35 - void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); 36 32 37 33 static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) 38 34 { 39 35 ci->fscache = NULL; 40 - ci->i_fscache_gen = 0; 36 + } 37 + 38 + static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) 39 + { 40 + return ci->fscache; 41 41 } 42 42 43 43 static inline void ceph_fscache_invalidate(struct inode *inode) ··· 47 43 fscache_invalidate(ceph_inode(inode)->fscache); 48 44 } 49 45 50 - static inline void ceph_fscache_uncache_page(struct inode *inode, 51 - struct page *page) 46 + static inline bool ceph_is_cache_enabled(struct inode *inode) 52 47 { 53 - struct ceph_inode_info *ci = ceph_inode(inode); 54 - return fscache_uncache_page(ci->fscache, page); 48 + struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode)); 49 + 50 + if (!cookie) 51 + return false; 52 + return fscache_cookie_enabled(cookie); 55 53 } 56 54 57 - static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) 55 + static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) 58 56 { 59 - struct inode* inode = page->mapping->host; 60 - struct ceph_inode_info *ci = ceph_inode(inode); 61 - return fscache_maybe_release_page(ci->fscache, page, gfp); 62 - } 57 + struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); 63 58 64 - static inline void ceph_fscache_readpage_cancel(struct inode *inode, 65 - struct page *page) 66 - { 67 - struct ceph_inode_info *ci = ceph_inode(inode); 68 - if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) 69 - __fscache_uncache_page(ci->fscache, page); 59 + return fscache_begin_read_operation(rreq, cookie); 70 60 } 71 - 72 - static inline void ceph_fscache_readpages_cancel(struct inode *inode, 73 - struct list_head *pages) 74 - { 75 - struct ceph_inode_info *ci = ceph_inode(inode); 76 - return fscache_readpages_cancel(ci->fscache, pages); 77 - } 78 - 79 - static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) 80 - { 81 - ci->i_fscache_gen = ci->i_rdcache_gen - 1; 82 - } 83 - 84 61 #else 85 62 86 63 static inline int ceph_fscache_register(void) ··· 87 102 { 88 103 } 89 104 105 + static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) 106 + { 107 + return NULL; 108 + } 109 + 90 110 static inline void ceph_fscache_register_inode_cookie(struct inode *inode) 91 111 { 92 112 } ··· 105 115 { 106 116 } 107 117 108 - static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) 109 - { 110 - } 111 - 112 - static inline void ceph_fscache_uncache_page(struct inode *inode, 113 - struct page *pages) 114 - { 115 - } 116 - 117 - static inline int ceph_readpage_from_fscache(struct inode* inode, 118 - struct page *page) 119 - { 120 - return -ENOBUFS; 121 - } 122 - 123 - static inline int ceph_readpages_from_fscache(struct inode *inode, 124 - struct address_space *mapping, 125 - struct list_head *pages, 126 - unsigned *nr_pages) 127 - { 128 - return -ENOBUFS; 129 - } 130 - 131 - static inline void ceph_readpage_to_fscache(struct inode *inode, 132 - struct page *page) 133 - { 134 - } 135 - 136 118 static inline void ceph_fscache_invalidate(struct inode *inode) 137 119 { 138 120 } 139 121 140 - static inline void ceph_invalidate_fscache_page(struct inode *inode, 141 - struct page *page) 122 + static inline bool ceph_is_cache_enabled(struct inode *inode) 142 123 { 124 + return false; 143 125 } 144 126 145 - static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) 127 + static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) 146 128 { 147 - return 1; 129 + return -ENOBUFS; 148 130 } 149 - 150 - static inline void ceph_fscache_readpage_cancel(struct inode *inode, 151 - struct page *page) 152 - { 153 - } 154 - 155 - static inline void ceph_fscache_readpages_cancel(struct inode *inode, 156 - struct list_head *pages) 157 - { 158 - } 159 - 160 - static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) 161 - { 162 - } 163 - 164 131 #endif 165 132 166 - #endif 133 + #endif /* _CEPH_CACHE_H */

+9 -18

fs/ceph/caps.c

··· 1390 1390 arg->flush_tid = flush_tid; 1391 1391 arg->oldest_flush_tid = oldest_flush_tid; 1392 1392 1393 - arg->size = inode->i_size; 1393 + arg->size = i_size_read(inode); 1394 1394 ci->i_reported_size = arg->size; 1395 1395 arg->max_size = ci->i_wanted_max_size; 1396 1396 if (cap == ci->i_auth_cap) { ··· 1867 1867 u32 invalidating_gen = ci->i_rdcache_gen; 1868 1868 1869 1869 spin_unlock(&ci->i_ceph_lock); 1870 + ceph_fscache_invalidate(inode); 1870 1871 invalidate_mapping_pages(&inode->i_data, 0, -1); 1871 1872 spin_lock(&ci->i_ceph_lock); 1872 1873 ··· 1885 1884 1886 1885 bool __ceph_should_report_size(struct ceph_inode_info *ci) 1887 1886 { 1888 - loff_t size = ci->vfs_inode.i_size; 1887 + loff_t size = i_size_read(&ci->vfs_inode); 1889 1888 /* mds will adjust max size according to the reported size */ 1890 1889 if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) 1891 1890 return false; ··· 2731 2730 *got = need | want; 2732 2731 else 2733 2732 *got = need; 2734 - if (S_ISREG(inode->i_mode) && 2735 - (need & CEPH_CAP_FILE_RD) && 2736 - !(*got & CEPH_CAP_FILE_CACHE)) 2737 - ceph_disable_fscache_readpage(ci); 2738 2733 ceph_take_cap_refs(ci, *got, true); 2739 2734 ret = 1; 2740 2735 } ··· 2855 2858 * due to a small max_size, make sure we check_max_size (and possibly 2856 2859 * ask the mds) so we don't get hung up indefinitely. 2857 2860 */ 2858 - int ceph_get_caps(struct file *filp, int need, int want, 2859 - loff_t endoff, int *got, struct page **pinned_page) 2861 + int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) 2860 2862 { 2861 2863 struct ceph_file_info *fi = filp->private_data; 2862 2864 struct inode *inode = file_inode(filp); ··· 2953 2957 struct page *page = 2954 2958 find_get_page(inode->i_mapping, 0); 2955 2959 if (page) { 2956 - if (PageUptodate(page)) { 2957 - *pinned_page = page; 2958 - break; 2959 - } 2960 + bool uptodate = PageUptodate(page); 2961 + 2960 2962 put_page(page); 2963 + if (uptodate) 2964 + break; 2961 2965 } 2962 2966 /* 2963 2967 * drop cap refs first because getattr while ··· 2979 2983 } 2980 2984 break; 2981 2985 } 2982 - 2983 - if (S_ISREG(ci->vfs_inode.i_mode) && 2984 - (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) 2985 - ceph_fscache_revalidate_cookie(ci); 2986 - 2987 2986 *got = _got; 2988 2987 return 0; 2989 2988 } ··· 3299 3308 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 3300 3309 inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); 3301 3310 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 3302 - inode->i_size); 3311 + i_size_read(inode)); 3303 3312 3304 3313 3305 3314 /*

+6 -6

fs/ceph/debugfs.c

··· 162 162 seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); 163 163 seq_printf(s, "-----------------------------------------------------------------------------------\n"); 164 164 165 - spin_lock(&m->read_latency_lock); 165 + spin_lock(&m->read_metric_lock); 166 166 total = m->total_reads; 167 167 sum = m->read_latency_sum; 168 168 avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; 169 169 min = m->read_latency_min; 170 170 max = m->read_latency_max; 171 171 sq = m->read_latency_sq_sum; 172 - spin_unlock(&m->read_latency_lock); 172 + spin_unlock(&m->read_metric_lock); 173 173 CEPH_METRIC_SHOW("read", total, avg, min, max, sq); 174 174 175 - spin_lock(&m->write_latency_lock); 175 + spin_lock(&m->write_metric_lock); 176 176 total = m->total_writes; 177 177 sum = m->write_latency_sum; 178 178 avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; 179 179 min = m->write_latency_min; 180 180 max = m->write_latency_max; 181 181 sq = m->write_latency_sq_sum; 182 - spin_unlock(&m->write_latency_lock); 182 + spin_unlock(&m->write_metric_lock); 183 183 CEPH_METRIC_SHOW("write", total, avg, min, max, sq); 184 184 185 - spin_lock(&m->metadata_latency_lock); 185 + spin_lock(&m->metadata_metric_lock); 186 186 total = m->total_metadatas; 187 187 sum = m->metadata_latency_sum; 188 188 avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; 189 189 min = m->metadata_latency_min; 190 190 max = m->metadata_latency_max; 191 191 sq = m->metadata_latency_sq_sum; 192 - spin_unlock(&m->metadata_latency_lock); 192 + spin_unlock(&m->metadata_metric_lock); 193 193 CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq); 194 194 195 195 seq_printf(s, "\n");

+21 -13

fs/ceph/dir.c

··· 631 631 switch (whence) { 632 632 case SEEK_CUR: 633 633 offset += file->f_pos; 634 + break; 634 635 case SEEK_SET: 635 636 break; 636 637 case SEEK_END: 637 638 retval = -EOPNOTSUPP; 639 + goto out; 638 640 default: 639 641 goto out; 640 642 } ··· 667 665 /* 668 666 * Handle lookups for the hidden .snap directory. 669 667 */ 670 - int ceph_handle_snapdir(struct ceph_mds_request *req, 671 - struct dentry *dentry, int err) 668 + struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, 669 + struct dentry *dentry, int err) 672 670 { 673 671 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 674 672 struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ ··· 676 674 /* .snap dir? */ 677 675 if (err == -ENOENT && 678 676 ceph_snap(parent) == CEPH_NOSNAP && 679 - strcmp(dentry->d_name.name, 680 - fsc->mount_options->snapdir_name) == 0) { 677 + strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { 678 + struct dentry *res; 681 679 struct inode *inode = ceph_get_snapdir(parent); 682 - if (IS_ERR(inode)) 683 - return PTR_ERR(inode); 684 - dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n", 685 - dentry, dentry, inode); 686 - BUG_ON(!d_unhashed(dentry)); 687 - d_add(dentry, inode); 688 - err = 0; 680 + 681 + res = d_splice_alias(inode, dentry); 682 + dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n", 683 + dentry, dentry, inode, res); 684 + if (res) 685 + dentry = res; 689 686 } 690 - return err; 687 + return dentry; 691 688 } 692 689 693 690 /* ··· 742 741 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 743 742 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); 744 743 struct ceph_mds_request *req; 744 + struct dentry *res; 745 745 int op; 746 746 int mask; 747 747 int err; ··· 793 791 req->r_parent = dir; 794 792 set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); 795 793 err = ceph_mdsc_do_request(mdsc, NULL, req); 796 - err = ceph_handle_snapdir(req, dentry, err); 794 + res = ceph_handle_snapdir(req, dentry, err); 795 + if (IS_ERR(res)) { 796 + err = PTR_ERR(res); 797 + } else { 798 + dentry = res; 799 + err = 0; 800 + } 797 801 dentry = ceph_finish_lookup(req, dentry, err); 798 802 ceph_mdsc_put_request(req); /* will dput(dentry) */ 799 803 dout("lookup result=%p\n", dentry);

+11 -1

fs/ceph/export.c

··· 129 129 130 130 vino.ino = ino; 131 131 vino.snap = CEPH_NOSNAP; 132 + 133 + if (ceph_vino_is_reserved(vino)) 134 + return ERR_PTR(-ESTALE); 135 + 132 136 inode = ceph_find_inode(sb, vino); 133 137 if (!inode) { 134 138 struct ceph_mds_request *req; ··· 182 178 return ERR_CAST(inode); 183 179 /* We need LINK caps to reliably check i_nlink */ 184 180 err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false); 185 - if (err) 181 + if (err) { 182 + iput(inode); 186 183 return ERR_PTR(err); 184 + } 187 185 /* -ESTALE if inode as been unlinked and no file is open */ 188 186 if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { 189 187 iput(inode); ··· 218 212 vino.ino = sfh->ino; 219 213 vino.snap = sfh->snapid; 220 214 } 215 + 216 + if (ceph_vino_is_reserved(vino)) 217 + return ERR_PTR(-ESTALE); 218 + 221 219 inode = ceph_find_inode(sb, vino); 222 220 if (inode) 223 221 return d_obtain_alias(inode);

+24 -28

fs/ceph/file.c

··· 739 739 err = ceph_mdsc_do_request(mdsc, 740 740 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 741 741 req); 742 - err = ceph_handle_snapdir(req, dentry, err); 743 - if (err) 742 + dentry = ceph_handle_snapdir(req, dentry, err); 743 + if (IS_ERR(dentry)) { 744 + err = PTR_ERR(dentry); 744 745 goto out_req; 746 + } 747 + err = 0; 745 748 746 749 if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 747 750 err = ceph_handle_notrace_create(dir, dentry); ··· 895 892 if (!ret) 896 893 ret = ceph_osdc_wait_request(osdc, req); 897 894 898 - ceph_update_read_latency(&fsc->mdsc->metric, 895 + ceph_update_read_metrics(&fsc->mdsc->metric, 899 896 req->r_start_latency, 900 897 req->r_end_latency, 901 898 ret); ··· 1037 1034 dout("ceph_aio_complete_req %p rc %d bytes %u\n", 1038 1035 inode, rc, osd_data->bvec_pos.iter.bi_size); 1039 1036 1040 - /* r_start_latency == 0 means the request was not submitted */ 1041 - if (req->r_start_latency) { 1042 - if (aio_req->write) 1043 - ceph_update_write_latency(metric, req->r_start_latency, 1044 - req->r_end_latency, rc); 1045 - else 1046 - ceph_update_read_latency(metric, req->r_start_latency, 1047 - req->r_end_latency, rc); 1048 - } 1049 - 1050 1037 if (rc == -EOLDSNAPC) { 1051 1038 struct ceph_aio_work *aio_work; 1052 1039 BUG_ON(!aio_req->write); ··· 1077 1084 iov_iter_advance(&i, rc); 1078 1085 iov_iter_zero(zlen, &i); 1079 1086 } 1087 + } 1088 + 1089 + /* r_start_latency == 0 means the request was not submitted */ 1090 + if (req->r_start_latency) { 1091 + if (aio_req->write) 1092 + ceph_update_write_metrics(metric, req->r_start_latency, 1093 + req->r_end_latency, rc); 1094 + else 1095 + ceph_update_read_metrics(metric, req->r_start_latency, 1096 + req->r_end_latency, rc); 1080 1097 } 1081 1098 1082 1099 put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, ··· 1293 1290 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1294 1291 1295 1292 if (write) 1296 - ceph_update_write_latency(metric, req->r_start_latency, 1293 + ceph_update_write_metrics(metric, req->r_start_latency, 1297 1294 req->r_end_latency, ret); 1298 1295 else 1299 - ceph_update_read_latency(metric, req->r_start_latency, 1296 + ceph_update_read_metrics(metric, req->r_start_latency, 1300 1297 req->r_end_latency, ret); 1301 1298 1302 1299 size = i_size_read(inode); ··· 1470 1467 if (!ret) 1471 1468 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1472 1469 1473 - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, 1470 + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 1474 1471 req->r_end_latency, ret); 1475 1472 out: 1476 1473 ceph_osdc_put_request(req); ··· 1513 1510 size_t len = iov_iter_count(to); 1514 1511 struct inode *inode = file_inode(filp); 1515 1512 struct ceph_inode_info *ci = ceph_inode(inode); 1516 - struct page *pinned_page = NULL; 1517 1513 bool direct_lock = iocb->ki_flags & IOCB_DIRECT; 1518 1514 ssize_t ret; 1519 1515 int want, got = 0; ··· 1531 1529 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1532 1530 else 1533 1531 want = CEPH_CAP_FILE_CACHE; 1534 - ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, 1535 - &got, &pinned_page); 1532 + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); 1536 1533 if (ret < 0) { 1537 1534 if (iocb->ki_flags & IOCB_DIRECT) 1538 1535 ceph_end_io_direct(inode); ··· 1572 1571 1573 1572 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 1574 1573 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 1575 - if (pinned_page) { 1576 - put_page(pinned_page); 1577 - pinned_page = NULL; 1578 - } 1579 1574 ceph_put_cap_refs(ci, got); 1580 1575 1581 1576 if (direct_lock) ··· 1750 1753 else 1751 1754 want = CEPH_CAP_FILE_BUFFER; 1752 1755 got = 0; 1753 - err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, 1754 - &got, NULL); 1756 + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); 1755 1757 if (err < 0) 1756 1758 goto out; 1757 1759 ··· 2079 2083 else 2080 2084 want = CEPH_CAP_FILE_BUFFER; 2081 2085 2082 - ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); 2086 + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got); 2083 2087 if (ret < 0) 2084 2088 goto unlock; 2085 2089 ··· 2117 2121 2118 2122 retry_caps: 2119 2123 ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 2120 - dst_endoff, dst_got, NULL); 2124 + dst_endoff, dst_got); 2121 2125 if (ret < 0) 2122 2126 return ret; 2123 2127 ··· 2139 2143 return ret; 2140 2144 } 2141 2145 ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, 2142 - CEPH_CAP_FILE_SHARED, -1, src_got, NULL); 2146 + CEPH_CAP_FILE_SHARED, -1, src_got); 2143 2147 if (ret < 0) 2144 2148 return ret; 2145 2149 /*... drop src_ci caps too, and retry */

+21 -15

fs/ceph/inode.c

··· 56 56 { 57 57 struct inode *inode; 58 58 59 + if (ceph_vino_is_reserved(vino)) 60 + return ERR_PTR(-EREMOTEIO); 61 + 59 62 inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, 60 63 ceph_set_ino_cb, &vino); 61 64 if (!inode) ··· 102 99 inode->i_mtime = parent->i_mtime; 103 100 inode->i_ctime = parent->i_ctime; 104 101 inode->i_atime = parent->i_atime; 105 - inode->i_op = &ceph_snapdir_iops; 106 - inode->i_fop = &ceph_snapdir_fops; 107 - ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ 108 102 ci->i_rbytes = 0; 109 103 ci->i_btime = ceph_inode(parent)->i_btime; 110 104 111 - if (inode->i_state & I_NEW) 105 + if (inode->i_state & I_NEW) { 106 + inode->i_op = &ceph_snapdir_iops; 107 + inode->i_fop = &ceph_snapdir_fops; 108 + ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ 112 109 unlock_new_inode(inode); 110 + } 113 111 114 112 return inode; 115 113 } ··· 632 628 { 633 629 struct ceph_inode_info *ci = ceph_inode(inode); 634 630 int queue_trunc = 0; 631 + loff_t isize = i_size_read(inode); 635 632 636 633 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || 637 - (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { 638 - dout("size %lld -> %llu\n", inode->i_size, size); 634 + (truncate_seq == ci->i_truncate_seq && size > isize)) { 635 + dout("size %lld -> %llu\n", isize, size); 639 636 if (size > 0 && S_ISDIR(inode->i_mode)) { 640 637 pr_err("fill_file_size non-zero size for directory\n"); 641 638 size = 0; ··· 930 925 ci->i_rfiles = le64_to_cpu(info->rfiles); 931 926 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); 932 927 ci->i_dir_pin = iinfo->dir_pin; 928 + ci->i_rsnaps = iinfo->rsnaps; 933 929 ceph_decode_timespec64(&ci->i_rctime, &info->rctime); 934 930 } 935 931 } ··· 1824 1818 bool ret; 1825 1819 1826 1820 spin_lock(&ci->i_ceph_lock); 1827 - dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); 1821 + dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); 1828 1822 i_size_write(inode, size); 1829 1823 inode->i_blocks = calc_inode_blocks(size); 1830 1824 ··· 1900 1894 orig_gen = ci->i_rdcache_gen; 1901 1895 spin_unlock(&ci->i_ceph_lock); 1902 1896 1897 + ceph_fscache_invalidate(inode); 1903 1898 if (invalidate_inode_pages2(inode->i_mapping) < 0) { 1904 1899 pr_err("invalidate_pages %p fails\n", inode); 1905 1900 } ··· 2131 2124 } 2132 2125 } 2133 2126 if (ia_valid & ATTR_SIZE) { 2134 - dout("setattr %p size %lld -> %lld\n", inode, 2135 - inode->i_size, attr->ia_size); 2136 - if ((issued & CEPH_CAP_FILE_EXCL) && 2137 - attr->ia_size > inode->i_size) { 2127 + loff_t isize = i_size_read(inode); 2128 + 2129 + dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size); 2130 + if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > isize) { 2138 2131 i_size_write(inode, attr->ia_size); 2139 2132 inode->i_blocks = calc_inode_blocks(attr->ia_size); 2140 2133 ci->i_reported_size = attr->ia_size; 2141 2134 dirtied |= CEPH_CAP_FILE_EXCL; 2142 2135 ia_valid |= ATTR_MTIME; 2143 2136 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || 2144 - attr->ia_size != inode->i_size) { 2137 + attr->ia_size != isize) { 2145 2138 req->r_args.setattr.size = cpu_to_le64(attr->ia_size); 2146 - req->r_args.setattr.old_size = 2147 - cpu_to_le64(inode->i_size); 2139 + req->r_args.setattr.old_size = cpu_to_le64(isize); 2148 2140 mask |= CEPH_SETATTR_SIZE; 2149 2141 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | 2150 2142 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; ··· 2253 2247 return err; 2254 2248 2255 2249 if ((attr->ia_valid & ATTR_SIZE) && 2256 - attr->ia_size > max(inode->i_size, fsc->max_file_size)) 2250 + attr->ia_size > max(i_size_read(inode), fsc->max_file_size)) 2257 2251 return -EFBIG; 2258 2252 2259 2253 if ((attr->ia_valid & ATTR_SIZE) &&

+1 -1

fs/ceph/io.c

··· 118 118 } 119 119 120 120 /** 121 - * ceph_end_io_direct - declare the file is being used for direct i/o 121 + * ceph_start_io_direct - declare the file is being used for direct i/o 122 122 * @inode: file inode 123 123 * 124 124 * Declare that a direct I/O operation is about to start, and ensure

+17 -3

fs/ceph/mds_client.c

··· 176 176 memset(&info->snap_btime, 0, sizeof(info->snap_btime)); 177 177 } 178 178 179 + /* snapshot count, remains zero for v<=3 */ 180 + if (struct_v >= 4) { 181 + ceph_decode_64_safe(p, end, info->rsnaps, bad); 182 + } else { 183 + info->rsnaps = 0; 184 + } 185 + 179 186 *p = end; 180 187 } else { 181 188 if (features & CEPH_FEATURE_MDS_INLINE_DATA) { ··· 221 214 } 222 215 223 216 info->dir_pin = -ENODATA; 224 - /* info->snap_btime remains zero */ 217 + /* info->snap_btime and info->rsnaps remain zero */ 225 218 } 226 219 return 0; 227 220 bad: ··· 440 433 441 434 ceph_decode_64_safe(p, end, start, bad); 442 435 ceph_decode_64_safe(p, end, len, bad); 436 + 437 + /* Don't accept a delegation of system inodes */ 438 + if (start < CEPH_INO_SYSTEM_BASE) { 439 + pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n", 440 + start, len); 441 + continue; 442 + } 443 443 while (len--) { 444 444 int err = xa_insert(&s->s_delegated_inos, ino = start++, 445 445 DELEGATED_INO_AVAILABLE, ··· 3320 3306 /* kick calling process */ 3321 3307 complete_request(mdsc, req); 3322 3308 3323 - ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency, 3309 + ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency, 3324 3310 req->r_end_latency, err); 3325 3311 out: 3326 3312 ceph_mdsc_put_request(req); ··· 3794 3780 rec.v1.cap_id = cpu_to_le64(cap->cap_id); 3795 3781 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); 3796 3782 rec.v1.issued = cpu_to_le32(cap->issued); 3797 - rec.v1.size = cpu_to_le64(inode->i_size); 3783 + rec.v1.size = cpu_to_le64(i_size_read(inode)); 3798 3784 ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); 3799 3785 ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); 3800 3786 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);

+1

fs/ceph/mds_client.h

··· 88 88 s32 dir_pin; 89 89 struct ceph_timespec btime; 90 90 struct ceph_timespec snap_btime; 91 + u64 rsnaps; 91 92 u64 change_attr; 92 93 }; 93 94

+49 -13

fs/ceph/metric.c

··· 17 17 struct ceph_metric_write_latency *write; 18 18 struct ceph_metric_metadata_latency *meta; 19 19 struct ceph_metric_dlease *dlease; 20 + struct ceph_opened_files *files; 21 + struct ceph_pinned_icaps *icaps; 22 + struct ceph_opened_inodes *inodes; 20 23 struct ceph_client_metric *m = &mdsc->metric; 21 24 u64 nr_caps = atomic64_read(&m->total_caps); 22 25 struct ceph_msg *msg; ··· 29 26 s32 len; 30 27 31 28 len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) 32 - + sizeof(*meta) + sizeof(*dlease); 29 + + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) 30 + + sizeof(*icaps) + sizeof(*inodes); 33 31 34 32 msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); 35 33 if (!msg) { ··· 97 93 dlease->hit = cpu_to_le64(percpu_counter_sum(&m->d_lease_hit)); 98 94 dlease->mis = cpu_to_le64(percpu_counter_sum(&m->d_lease_mis)); 99 95 dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries)); 96 + items++; 97 + 98 + sum = percpu_counter_sum(&m->total_inodes); 99 + 100 + /* encode the opened files metric */ 101 + files = (struct ceph_opened_files *)(dlease + 1); 102 + files->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES); 103 + files->ver = 1; 104 + files->compat = 1; 105 + files->data_len = cpu_to_le32(sizeof(*files) - 10); 106 + files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files)); 107 + files->total = cpu_to_le64(sum); 108 + items++; 109 + 110 + /* encode the pinned icaps metric */ 111 + icaps = (struct ceph_pinned_icaps *)(files + 1); 112 + icaps->type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS); 113 + icaps->ver = 1; 114 + icaps->compat = 1; 115 + icaps->data_len = cpu_to_le32(sizeof(*icaps) - 10); 116 + icaps->pinned_icaps = cpu_to_le64(nr_caps); 117 + icaps->total = cpu_to_le64(sum); 118 + items++; 119 + 120 + /* encode the opened inodes metric */ 121 + inodes = (struct ceph_opened_inodes *)(icaps + 1); 122 + inodes->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES); 123 + inodes->ver = 1; 124 + inodes->compat = 1; 125 + inodes->data_len = cpu_to_le32(sizeof(*inodes) - 10); 126 + inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes)); 127 + inodes->total = cpu_to_le64(sum); 100 128 items++; 101 129 102 130 put_unaligned_le32(items, &head->num); ··· 219 183 if (ret) 220 184 goto err_i_caps_mis; 221 185 222 - spin_lock_init(&m->read_latency_lock); 186 + spin_lock_init(&m->read_metric_lock); 223 187 m->read_latency_sq_sum = 0; 224 188 m->read_latency_min = KTIME_MAX; 225 189 m->read_latency_max = 0; 226 190 m->total_reads = 0; 227 191 m->read_latency_sum = 0; 228 192 229 - spin_lock_init(&m->write_latency_lock); 193 + spin_lock_init(&m->write_metric_lock); 230 194 m->write_latency_sq_sum = 0; 231 195 m->write_latency_min = KTIME_MAX; 232 196 m->write_latency_max = 0; 233 197 m->total_writes = 0; 234 198 m->write_latency_sum = 0; 235 199 236 - spin_lock_init(&m->metadata_latency_lock); 200 + spin_lock_init(&m->metadata_metric_lock); 237 201 m->metadata_latency_sq_sum = 0; 238 202 m->metadata_latency_min = KTIME_MAX; 239 203 m->metadata_latency_max = 0; ··· 310 274 *sq_sump += sq; 311 275 } 312 276 313 - void ceph_update_read_latency(struct ceph_client_metric *m, 277 + void ceph_update_read_metrics(struct ceph_client_metric *m, 314 278 ktime_t r_start, ktime_t r_end, 315 279 int rc) 316 280 { ··· 319 283 if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) 320 284 return; 321 285 322 - spin_lock(&m->read_latency_lock); 286 + spin_lock(&m->read_metric_lock); 323 287 __update_latency(&m->total_reads, &m->read_latency_sum, 324 288 &m->read_latency_min, &m->read_latency_max, 325 289 &m->read_latency_sq_sum, lat); 326 - spin_unlock(&m->read_latency_lock); 290 + spin_unlock(&m->read_metric_lock); 327 291 } 328 292 329 - void ceph_update_write_latency(struct ceph_client_metric *m, 293 + void ceph_update_write_metrics(struct ceph_client_metric *m, 330 294 ktime_t r_start, ktime_t r_end, 331 295 int rc) 332 296 { ··· 335 299 if (unlikely(rc && rc != -ETIMEDOUT)) 336 300 return; 337 301 338 - spin_lock(&m->write_latency_lock); 302 + spin_lock(&m->write_metric_lock); 339 303 __update_latency(&m->total_writes, &m->write_latency_sum, 340 304 &m->write_latency_min, &m->write_latency_max, 341 305 &m->write_latency_sq_sum, lat); 342 - spin_unlock(&m->write_latency_lock); 306 + spin_unlock(&m->write_metric_lock); 343 307 } 344 308 345 - void ceph_update_metadata_latency(struct ceph_client_metric *m, 309 + void ceph_update_metadata_metrics(struct ceph_client_metric *m, 346 310 ktime_t r_start, ktime_t r_end, 347 311 int rc) 348 312 { ··· 351 315 if (unlikely(rc && rc != -ENOENT)) 352 316 return; 353 317 354 - spin_lock(&m->metadata_latency_lock); 318 + spin_lock(&m->metadata_metric_lock); 355 319 __update_latency(&m->total_metadatas, &m->metadata_latency_sum, 356 320 &m->metadata_latency_min, &m->metadata_latency_max, 357 321 &m->metadata_latency_sq_sum, lat); 358 - spin_unlock(&m->metadata_latency_lock); 322 + spin_unlock(&m->metadata_metric_lock); 359 323 }

+49 -7

fs/ceph/metric.h

··· 14 14 CLIENT_METRIC_TYPE_WRITE_LATENCY, 15 15 CLIENT_METRIC_TYPE_METADATA_LATENCY, 16 16 CLIENT_METRIC_TYPE_DENTRY_LEASE, 17 + CLIENT_METRIC_TYPE_OPENED_FILES, 18 + CLIENT_METRIC_TYPE_PINNED_ICAPS, 19 + CLIENT_METRIC_TYPE_OPENED_INODES, 17 20 18 - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE, 21 + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_OPENED_INODES, 19 22 }; 20 23 21 24 /* ··· 31 28 CLIENT_METRIC_TYPE_WRITE_LATENCY, \ 32 29 CLIENT_METRIC_TYPE_METADATA_LATENCY, \ 33 30 CLIENT_METRIC_TYPE_DENTRY_LEASE, \ 31 + CLIENT_METRIC_TYPE_OPENED_FILES, \ 32 + CLIENT_METRIC_TYPE_PINNED_ICAPS, \ 33 + CLIENT_METRIC_TYPE_OPENED_INODES, \ 34 34 \ 35 35 CLIENT_METRIC_TYPE_MAX, \ 36 36 } ··· 100 94 __le64 total; 101 95 } __packed; 102 96 97 + /* metric opened files header */ 98 + struct ceph_opened_files { 99 + __le32 type; /* ceph metric type */ 100 + 101 + __u8 ver; 102 + __u8 compat; 103 + 104 + __le32 data_len; /* length of sizeof(opened_files + total) */ 105 + __le64 opened_files; 106 + __le64 total; 107 + } __packed; 108 + 109 + /* metric pinned i_caps header */ 110 + struct ceph_pinned_icaps { 111 + __le32 type; /* ceph metric type */ 112 + 113 + __u8 ver; 114 + __u8 compat; 115 + 116 + __le32 data_len; /* length of sizeof(pinned_icaps + total) */ 117 + __le64 pinned_icaps; 118 + __le64 total; 119 + } __packed; 120 + 121 + /* metric opened inodes header */ 122 + struct ceph_opened_inodes { 123 + __le32 type; /* ceph metric type */ 124 + 125 + __u8 ver; 126 + __u8 compat; 127 + 128 + __le32 data_len; /* length of sizeof(opened_inodes + total) */ 129 + __le64 opened_inodes; 130 + __le64 total; 131 + } __packed; 132 + 103 133 struct ceph_metric_head { 104 134 __le32 num; /* the number of metrics that will be sent */ 105 135 } __packed; ··· 150 108 struct percpu_counter i_caps_hit; 151 109 struct percpu_counter i_caps_mis; 152 110 153 - spinlock_t read_latency_lock; 111 + spinlock_t read_metric_lock; 154 112 u64 total_reads; 155 113 ktime_t read_latency_sum; 156 114 ktime_t read_latency_sq_sum; 157 115 ktime_t read_latency_min; 158 116 ktime_t read_latency_max; 159 117 160 - spinlock_t write_latency_lock; 118 + spinlock_t write_metric_lock; 161 119 u64 total_writes; 162 120 ktime_t write_latency_sum; 163 121 ktime_t write_latency_sq_sum; 164 122 ktime_t write_latency_min; 165 123 ktime_t write_latency_max; 166 124 167 - spinlock_t metadata_latency_lock; 125 + spinlock_t metadata_metric_lock; 168 126 u64 total_metadatas; 169 127 ktime_t metadata_latency_sum; 170 128 ktime_t metadata_latency_sq_sum; ··· 204 162 percpu_counter_inc(&m->i_caps_mis); 205 163 } 206 164 207 - extern void ceph_update_read_latency(struct ceph_client_metric *m, 165 + extern void ceph_update_read_metrics(struct ceph_client_metric *m, 208 166 ktime_t r_start, ktime_t r_end, 209 167 int rc); 210 - extern void ceph_update_write_latency(struct ceph_client_metric *m, 168 + extern void ceph_update_write_metrics(struct ceph_client_metric *m, 211 169 ktime_t r_start, ktime_t r_end, 212 170 int rc); 213 - extern void ceph_update_metadata_latency(struct ceph_client_metric *m, 171 + extern void ceph_update_metadata_metrics(struct ceph_client_metric *m, 214 172 ktime_t r_start, ktime_t r_end, 215 173 int rc); 216 174 #endif /* _FS_CEPH_MDS_METRIC_H */

+1 -1

fs/ceph/snap.c

··· 605 605 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 606 606 607 607 BUG_ON(capsnap->writing); 608 - capsnap->size = inode->i_size; 608 + capsnap->size = i_size_read(inode); 609 609 capsnap->mtime = inode->i_mtime; 610 610 capsnap->atime = inode->i_atime; 611 611 capsnap->ctime = inode->i_ctime;

+28 -4

fs/ceph/super.h

··· 21 21 #include <linux/ceph/libceph.h> 22 22 23 23 #ifdef CONFIG_CEPH_FSCACHE 24 + #define FSCACHE_USE_NEW_IO_API 24 25 #include <linux/fscache.h> 25 26 #endif 26 27 ··· 334 333 335 334 /* for dirs */ 336 335 struct timespec64 i_rctime; 337 - u64 i_rbytes, i_rfiles, i_rsubdirs; 336 + u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps; 338 337 u64 i_files, i_subdirs; 339 338 340 339 /* quotas */ ··· 428 427 429 428 #ifdef CONFIG_CEPH_FSCACHE 430 429 struct fscache_cookie *fscache; 431 - u32 i_fscache_gen; 432 430 #endif 433 431 errseq_t i_meta_err; 434 432 ··· 529 529 ci->i_vino.snap == pvino->snap; 530 530 } 531 531 532 + /* 533 + * The MDS reserves a set of inodes for its own usage. These should never 534 + * be accessible by clients, and so the MDS has no reason to ever hand these 535 + * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE. 536 + * 537 + * These come from src/mds/mdstypes.h in the ceph sources. 538 + */ 539 + #define CEPH_MAX_MDS 0x100 540 + #define CEPH_NUM_STRAY 10 541 + #define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) 542 + #define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) 543 + 544 + static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) 545 + { 546 + if (vino.ino < CEPH_INO_SYSTEM_BASE && 547 + vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { 548 + WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino); 549 + return true; 550 + } 551 + return false; 552 + } 532 553 533 554 static inline struct inode *ceph_find_inode(struct super_block *sb, 534 555 struct ceph_vino vino) 535 556 { 557 + if (ceph_vino_is_reserved(vino)) 558 + return NULL; 559 + 536 560 /* 537 561 * NB: The hashval will be run through the fs/inode.c hash function 538 562 * anyway, so there is no need to squash the inode number down to ··· 1180 1156 int mds, int drop, int unless); 1181 1157 1182 1158 extern int ceph_get_caps(struct file *filp, int need, int want, 1183 - loff_t endoff, int *got, struct page **pinned_page); 1159 + loff_t endoff, int *got); 1184 1160 extern int ceph_try_get_caps(struct inode *inode, 1185 1161 int need, int want, bool nonblock, int *got); 1186 1162 ··· 1217 1193 1218 1194 extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); 1219 1195 extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 1220 - extern int ceph_handle_snapdir(struct ceph_mds_request *req, 1196 + extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, 1221 1197 struct dentry *dentry, int err); 1222 1198 extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 1223 1199 struct dentry *dentry, int err);

+7

fs/ceph/xattr.c

··· 233 233 return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs); 234 234 } 235 235 236 + static ssize_t ceph_vxattrcb_dir_rsnaps(struct ceph_inode_info *ci, char *val, 237 + size_t size) 238 + { 239 + return ceph_fmt_xattr(val, size, "%lld", ci->i_rsnaps); 240 + } 241 + 236 242 static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, 237 243 size_t size) 238 244 { ··· 390 384 XATTR_RSTAT_FIELD(dir, rentries), 391 385 XATTR_RSTAT_FIELD(dir, rfiles), 392 386 XATTR_RSTAT_FIELD(dir, rsubdirs), 387 + XATTR_RSTAT_FIELD(dir, rsnaps), 393 388 XATTR_RSTAT_FIELD(dir, rbytes), 394 389 XATTR_RSTAT_FIELD(dir, rctime), 395 390 {

+23 -13

net/ceph/auth.c

··· 36 36 } 37 37 } 38 38 39 + static void set_global_id(struct ceph_auth_client *ac, u64 global_id) 40 + { 41 + dout("%s global_id %llu\n", __func__, global_id); 42 + 43 + if (!global_id) 44 + pr_err("got zero global_id\n"); 45 + 46 + if (ac->global_id && global_id != ac->global_id) 47 + pr_err("global_id changed from %llu to %llu\n", ac->global_id, 48 + global_id); 49 + 50 + ac->global_id = global_id; 51 + } 52 + 39 53 /* 40 54 * setup, teardown. 41 55 */ ··· 236 222 237 223 payload_end = payload + payload_len; 238 224 239 - if (global_id && ac->global_id != global_id) { 240 - dout(" set global_id %lld -> %lld\n", ac->global_id, global_id); 241 - ac->global_id = global_id; 242 - } 243 - 244 225 if (ac->negotiating) { 245 226 /* server does not support our protocols? */ 246 227 if (!protocol && result < 0) { ··· 262 253 263 254 ret = ac->ops->handle_reply(ac, result, payload, payload_end, 264 255 NULL, NULL, NULL, NULL); 265 - if (ret == -EAGAIN) 256 + if (ret == -EAGAIN) { 266 257 ret = build_request(ac, true, reply_buf, reply_len); 267 - else if (ret) 258 + goto out; 259 + } else if (ret) { 268 260 pr_err("auth protocol '%s' mauth authentication failed: %d\n", 269 261 ceph_auth_proto_name(ac->protocol), result); 262 + goto out; 263 + } 264 + 265 + set_global_id(ac, global_id); 270 266 271 267 out: 272 268 mutex_unlock(&ac->mutex); ··· 498 484 int ret; 499 485 500 486 mutex_lock(&ac->mutex); 501 - if (global_id && ac->global_id != global_id) { 502 - dout("%s global_id %llu -> %llu\n", __func__, ac->global_id, 503 - global_id); 504 - ac->global_id = global_id; 505 - } 506 - 507 487 ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len, 508 488 session_key, session_key_len, 509 489 con_secret, con_secret_len); 490 + if (!ret) 491 + set_global_id(ac, global_id); 510 492 mutex_unlock(&ac->mutex); 511 493 return ret; 512 494 }

+1 -1

net/ceph/auth_x.c

··· 526 526 if (ret < 0) 527 527 return ret; 528 528 529 - auth->struct_v = 2; /* nautilus+ */ 529 + auth->struct_v = 3; /* nautilus+ */ 530 530 auth->key = 0; 531 531 for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++) 532 532 auth->key ^= *(__le64 *)u;

+14 -6

net/ceph/decode.c

··· 4 4 #include <linux/inet.h> 5 5 6 6 #include <linux/ceph/decode.h> 7 + #include <linux/ceph/messenger.h> /* for ceph_pr_addr() */ 7 8 8 9 static int 9 10 ceph_decode_entity_addr_versioned(void **p, void *end, ··· 111 110 } 112 111 113 112 ceph_decode_32_safe(p, end, addr_cnt, e_inval); 113 + dout("%s addr_cnt %d\n", __func__, addr_cnt); 114 114 115 115 found = false; 116 116 for (i = 0; i < addr_cnt; i++) { ··· 119 117 if (ret) 120 118 return ret; 121 119 120 + dout("%s i %d addr %s\n", __func__, i, ceph_pr_addr(&tmp_addr)); 122 121 if (tmp_addr.type == my_type) { 123 122 if (found) { 124 123 pr_err("another match of type %d in addrvec\n", ··· 131 128 found = true; 132 129 } 133 130 } 134 - if (!found && addr_cnt != 0) { 135 - pr_err("no match of type %d in addrvec\n", 136 - le32_to_cpu(my_type)); 137 - return -ENOENT; 138 - } 139 131 140 - return 0; 132 + if (found) 133 + return 0; 134 + 135 + if (!addr_cnt) 136 + return 0; /* normal -- e.g. unused OSD id/slot */ 137 + 138 + if (addr_cnt == 1 && !memchr_inv(&tmp_addr, 0, sizeof(tmp_addr))) 139 + return 0; /* weird but effectively the same as !addr_cnt */ 140 + 141 + pr_err("no match of type %d in addrvec\n", le32_to_cpu(my_type)); 142 + return -ENOENT; 141 143 142 144 e_inval: 143 145 return -EINVAL;