Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://ceph.newdream.net/git/ceph-client

* 'for-linus' of git://ceph.newdream.net/git/ceph-client:
libceph: fix double-free of page vector
ceph: fix 32-bit ino numbers
libceph: force resend of osd requests if we skip an osdmap
ceph: use kernel DNS resolver
ceph: fix ceph_monc_init memory leak
ceph: let the set_layout ioctl set single traits
Revert "ceph: don't truncate dirty pages in invalidate work thread"
ceph: replace leading spaces with tabs
libceph: warn on msg allocation failures
libceph: don't complain on msgpool alloc failures
libceph: always preallocate mon connection
libceph: create messenger with client
ceph: document ioctls
ceph: implement (optional) max read size
ceph: rename rsize -> rasize
ceph: make readpages fully async

+485 -257
+1 -1
drivers/block/rbd.c
··· 260 260 kref_init(&rbdc->kref); 261 261 INIT_LIST_HEAD(&rbdc->node); 262 262 263 - rbdc->client = ceph_create_client(opt, rbdc); 263 + rbdc->client = ceph_create_client(opt, rbdc, 0, 0); 264 264 if (IS_ERR(rbdc->client)) 265 265 goto out_rbdc; 266 266 opt = NULL; /* Now rbdc->client is responsible for opt */
+126 -73
fs/ceph/addr.c
··· 228 228 } 229 229 230 230 /* 231 - * Build a vector of contiguous pages from the provided page list. 231 + * Finish an async read(ahead) op. 232 232 */ 233 - static struct page **page_vector_from_list(struct list_head *page_list, 234 - unsigned *nr_pages) 233 + static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) 235 234 { 235 + struct inode *inode = req->r_inode; 236 + struct ceph_osd_reply_head *replyhead; 237 + int rc, bytes; 238 + int i; 239 + 240 + /* parse reply */ 241 + replyhead = msg->front.iov_base; 242 + WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); 243 + rc = le32_to_cpu(replyhead->result); 244 + bytes = le32_to_cpu(msg->hdr.data_len); 245 + 246 + dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); 247 + 248 + /* unlock all pages, zeroing any data we didn't read */ 249 + for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { 250 + struct page *page = req->r_pages[i]; 251 + 252 + if (bytes < (int)PAGE_CACHE_SIZE) { 253 + /* zero (remainder of) page */ 254 + int s = bytes < 0 ? 0 : bytes; 255 + zero_user_segment(page, s, PAGE_CACHE_SIZE); 256 + } 257 + dout("finish_read %p uptodate %p idx %lu\n", inode, page, 258 + page->index); 259 + flush_dcache_page(page); 260 + SetPageUptodate(page); 261 + unlock_page(page); 262 + page_cache_release(page); 263 + } 264 + kfree(req->r_pages); 265 + } 266 + 267 + /* 268 + * start an async read(ahead) operation. return nr_pages we submitted 269 + * a read for on success, or negative error code. 
270 + */ 271 + static int start_read(struct inode *inode, struct list_head *page_list, int max) 272 + { 273 + struct ceph_osd_client *osdc = 274 + &ceph_inode_to_client(inode)->client->osdc; 275 + struct ceph_inode_info *ci = ceph_inode(inode); 276 + struct page *page = list_entry(page_list->prev, struct page, lru); 277 + struct ceph_osd_request *req; 278 + u64 off; 279 + u64 len; 280 + int i; 236 281 struct page **pages; 237 - struct page *page; 238 - int next_index, contig_pages = 0; 282 + pgoff_t next_index; 283 + int nr_pages = 0; 284 + int ret; 285 + 286 + off = page->index << PAGE_CACHE_SHIFT; 287 + 288 + /* count pages */ 289 + next_index = page->index; 290 + list_for_each_entry_reverse(page, page_list, lru) { 291 + if (page->index != next_index) 292 + break; 293 + nr_pages++; 294 + next_index++; 295 + if (max && nr_pages == max) 296 + break; 297 + } 298 + len = nr_pages << PAGE_CACHE_SHIFT; 299 + dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, 300 + off, len); 301 + 302 + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), 303 + off, &len, 304 + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 305 + NULL, 0, 306 + ci->i_truncate_seq, ci->i_truncate_size, 307 + NULL, false, 1, 0); 308 + if (!req) 309 + return -ENOMEM; 239 310 240 311 /* build page vector */ 241 - pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS); 312 + nr_pages = len >> PAGE_CACHE_SHIFT; 313 + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); 314 + ret = -ENOMEM; 242 315 if (!pages) 243 - return ERR_PTR(-ENOMEM); 244 - 245 - BUG_ON(list_empty(page_list)); 246 - next_index = list_entry(page_list->prev, struct page, lru)->index; 247 - list_for_each_entry_reverse(page, page_list, lru) { 248 - if (page->index == next_index) { 249 - dout("readpages page %d %p\n", contig_pages, page); 250 - pages[contig_pages] = page; 251 - contig_pages++; 252 - next_index++; 253 - } else { 254 - break; 316 + goto out; 317 + for (i = 0; i < nr_pages; ++i) { 318 + page = 
list_entry(page_list->prev, struct page, lru); 319 + BUG_ON(PageLocked(page)); 320 + list_del(&page->lru); 321 + 322 + dout("start_read %p adding %p idx %lu\n", inode, page, 323 + page->index); 324 + if (add_to_page_cache_lru(page, &inode->i_data, page->index, 325 + GFP_NOFS)) { 326 + page_cache_release(page); 327 + dout("start_read %p add_to_page_cache failed %p\n", 328 + inode, page); 329 + nr_pages = i; 330 + goto out_pages; 255 331 } 332 + pages[i] = page; 256 333 } 257 - *nr_pages = contig_pages; 258 - return pages; 334 + req->r_pages = pages; 335 + req->r_num_pages = nr_pages; 336 + req->r_callback = finish_read; 337 + req->r_inode = inode; 338 + 339 + dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); 340 + ret = ceph_osdc_start_request(osdc, req, false); 341 + if (ret < 0) 342 + goto out_pages; 343 + ceph_osdc_put_request(req); 344 + return nr_pages; 345 + 346 + out_pages: 347 + ceph_release_page_vector(pages, nr_pages); 348 + out: 349 + ceph_osdc_put_request(req); 350 + return ret; 259 351 } 352 + 260 353 261 354 /* 262 355 * Read multiple pages. 
Leave pages we don't read + unlock in page_list; ··· 359 266 struct list_head *page_list, unsigned nr_pages) 360 267 { 361 268 struct inode *inode = file->f_dentry->d_inode; 362 - struct ceph_inode_info *ci = ceph_inode(inode); 363 - struct ceph_osd_client *osdc = 364 - &ceph_inode_to_client(inode)->client->osdc; 269 + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 365 270 int rc = 0; 366 - struct page **pages; 367 - loff_t offset; 368 - u64 len; 271 + int max = 0; 369 272 370 - dout("readpages %p file %p nr_pages %d\n", 371 - inode, file, nr_pages); 273 + if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) 274 + max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) 275 + >> PAGE_SHIFT; 372 276 373 - pages = page_vector_from_list(page_list, &nr_pages); 374 - if (IS_ERR(pages)) 375 - return PTR_ERR(pages); 376 - 377 - /* guess read extent */ 378 - offset = pages[0]->index << PAGE_CACHE_SHIFT; 379 - len = nr_pages << PAGE_CACHE_SHIFT; 380 - rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 381 - offset, &len, 382 - ci->i_truncate_seq, ci->i_truncate_size, 383 - pages, nr_pages, 0); 384 - if (rc == -ENOENT) 385 - rc = 0; 386 - if (rc < 0) 387 - goto out; 388 - 389 - for (; !list_empty(page_list) && len > 0; 390 - rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { 391 - struct page *page = 392 - list_entry(page_list->prev, struct page, lru); 393 - 394 - list_del(&page->lru); 395 - 396 - if (rc < (int)PAGE_CACHE_SIZE) { 397 - /* zero (remainder of) page */ 398 - int s = rc < 0 ? 
0 : rc; 399 - zero_user_segment(page, s, PAGE_CACHE_SIZE); 400 - } 401 - 402 - if (add_to_page_cache_lru(page, mapping, page->index, 403 - GFP_NOFS)) { 404 - page_cache_release(page); 405 - dout("readpages %p add_to_page_cache failed %p\n", 406 - inode, page); 407 - continue; 408 - } 409 - dout("readpages %p adding %p idx %lu\n", inode, page, 410 - page->index); 411 - flush_dcache_page(page); 412 - SetPageUptodate(page); 413 - unlock_page(page); 414 - page_cache_release(page); 277 + dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, 278 + max); 279 + while (!list_empty(page_list)) { 280 + rc = start_read(inode, page_list, max); 281 + if (rc < 0) 282 + goto out; 283 + BUG_ON(rc == 0); 415 284 } 416 - rc = 0; 417 - 418 285 out: 419 - kfree(pages); 286 + dout("readpages %p file %p ret %d\n", inode, file, rc); 420 287 return rc; 421 288 } 422 289
+1 -1
fs/ceph/caps.c
··· 945 945 seq, issue_seq, mseq, follows, size, max_size, 946 946 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 947 947 948 - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS); 948 + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); 949 949 if (!msg) 950 950 return -ENOMEM; 951 951
+1 -45
fs/ceph/inode.c
··· 9 9 #include <linux/namei.h> 10 10 #include <linux/writeback.h> 11 11 #include <linux/vmalloc.h> 12 - #include <linux/pagevec.h> 13 12 14 13 #include "super.h" 15 14 #include "mds_client.h" ··· 1363 1364 } 1364 1365 1365 1366 /* 1366 - * invalidate any pages that are not dirty or under writeback. this 1367 - * includes pages that are clean and mapped. 1368 - */ 1369 - static void ceph_invalidate_nondirty_pages(struct address_space *mapping) 1370 - { 1371 - struct pagevec pvec; 1372 - pgoff_t next = 0; 1373 - int i; 1374 - 1375 - pagevec_init(&pvec, 0); 1376 - while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 1377 - for (i = 0; i < pagevec_count(&pvec); i++) { 1378 - struct page *page = pvec.pages[i]; 1379 - pgoff_t index; 1380 - int skip_page = 1381 - (PageDirty(page) || PageWriteback(page)); 1382 - 1383 - if (!skip_page) 1384 - skip_page = !trylock_page(page); 1385 - 1386 - /* 1387 - * We really shouldn't be looking at the ->index of an 1388 - * unlocked page. But we're not allowed to lock these 1389 - * pages. So we rely upon nobody altering the ->index 1390 - * of this (pinned-by-us) page. 1391 - */ 1392 - index = page->index; 1393 - if (index > next) 1394 - next = index; 1395 - next++; 1396 - 1397 - if (skip_page) 1398 - continue; 1399 - 1400 - generic_error_remove_page(mapping, page); 1401 - unlock_page(page); 1402 - } 1403 - pagevec_release(&pvec); 1404 - cond_resched(); 1405 - } 1406 - } 1407 - 1408 - /* 1409 1367 * Invalidate inode pages in a worker thread. (This can't be done 1410 1368 * in the message handler context.) 1411 1369 */ ··· 1385 1429 orig_gen = ci->i_rdcache_gen; 1386 1430 spin_unlock(&inode->i_lock); 1387 1431 1388 - ceph_invalidate_nondirty_pages(inode->i_mapping); 1432 + truncate_inode_pages(&inode->i_data, 0); 1389 1433 1390 1434 spin_lock(&inode->i_lock); 1391 1435 if (orig_gen == ci->i_rdcache_gen &&
+28 -6
fs/ceph/ioctl.c
··· 42 42 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 43 43 struct ceph_mds_request *req; 44 44 struct ceph_ioctl_layout l; 45 + struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode); 46 + struct ceph_ioctl_layout nl; 45 47 int err, i; 46 48 47 - /* copy and validate */ 48 49 if (copy_from_user(&l, arg, sizeof(l))) 49 50 return -EFAULT; 50 51 51 - if ((l.object_size & ~PAGE_MASK) || 52 - (l.stripe_unit & ~PAGE_MASK) || 53 - !l.stripe_unit || 54 - (l.object_size && 55 - (unsigned)l.object_size % (unsigned)l.stripe_unit)) 52 + /* validate changed params against current layout */ 53 + err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT); 54 + if (!err) { 55 + nl.stripe_unit = ceph_file_layout_su(ci->i_layout); 56 + nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); 57 + nl.object_size = ceph_file_layout_object_size(ci->i_layout); 58 + nl.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool); 59 + nl.preferred_osd = 60 + (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred); 61 + } else 62 + return err; 63 + 64 + if (l.stripe_count) 65 + nl.stripe_count = l.stripe_count; 66 + if (l.stripe_unit) 67 + nl.stripe_unit = l.stripe_unit; 68 + if (l.object_size) 69 + nl.object_size = l.object_size; 70 + if (l.data_pool) 71 + nl.data_pool = l.data_pool; 72 + if (l.preferred_osd) 73 + nl.preferred_osd = l.preferred_osd; 74 + 75 + if ((nl.object_size & ~PAGE_MASK) || 76 + (nl.stripe_unit & ~PAGE_MASK) || 77 + ((unsigned)nl.object_size % (unsigned)nl.stripe_unit)) 56 78 return -EINVAL; 57 79 58 80 /* make sure it's a valid data pool */
+54 -1
fs/ceph/ioctl.h
··· 6 6 7 7 #define CEPH_IOCTL_MAGIC 0x97 8 8 9 - /* just use u64 to align sanely on all archs */ 9 + /* 10 + * CEPH_IOC_GET_LAYOUT - get file layout or dir layout policy 11 + * CEPH_IOC_SET_LAYOUT - set file layout 12 + * CEPH_IOC_SET_LAYOUT_POLICY - set dir layout policy 13 + * 14 + * The file layout specifies how file data is striped over objects in 15 + * the distributed object store, which object pool they belong to (if 16 + * it differs from the default), and an optional 'preferred osd' to 17 + * store them on. 18 + * 19 + * Files get a new layout based on the policy set on the containing 20 + * directory or one of its ancestors. The GET_LAYOUT ioctl will let 21 + * you examine the layout for a file or the policy on a directory. 22 + * 23 + * SET_LAYOUT will let you set a layout on a newly created file. This 24 + * only works immediately after the file is created and before any 25 + * data is written to it. 26 + * 27 + * SET_LAYOUT_POLICY will let you set a layout policy (default layout) 28 + * on a directory that will apply to any new files created in that 29 + * directory (or any child directory that doesn't specify a layout of 30 + * its own). 31 + */ 32 + 33 + /* use u64 to align sanely on all archs */ 10 34 struct ceph_ioctl_layout { 11 35 __u64 stripe_unit, stripe_count, object_size; 12 36 __u64 data_pool; ··· 45 21 struct ceph_ioctl_layout) 46 22 47 23 /* 24 + * CEPH_IOC_GET_DATALOC - get location of file data in the cluster 25 + * 48 26 * Extract identity, address of the OSD and object storing a given 49 27 * file offset. 50 28 */ ··· 65 39 #define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ 66 40 struct ceph_ioctl_dataloc) 67 41 42 + /* 43 + * CEPH_IOC_LAZYIO - relax consistency 44 + * 45 + * Normally Ceph switches to synchronous IO when multiple clients have 46 + * the file open (and one or more for write). Reads and writes bypass the 47 + * page cache and go directly to the OSD. 
Setting this flag on a file 48 + * descriptor will allow buffered IO for this file in cases where the 49 + * application knows it won't interfere with other nodes (or doesn't 50 + * care). 51 + */ 68 52 #define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4) 53 + 54 + /* 55 + * CEPH_IOC_SYNCIO - force synchronous IO 56 + * 57 + * This ioctl sets a file flag that forces the synchronous IO that 58 + * bypasses the page cache, even if it is not necessary. This is 59 + * essentially the opposite behavior of IOC_LAZYIO. This forces the 60 + * same read/write path as a file opened by multiple clients when one 61 + * or more of those clients is opened for write. 62 + * 63 + * Note that this type of sync IO takes a different path than a file 64 + * opened with O_SYNC/D_SYNC (writes hit the page cache and are 65 + * immediately flushed on page boundaries). It is very similar to 66 + * O_DIRECT (writes bypass the page cache) except that O_DIRECT writes 67 + * are not copied (user page must remain stable) and O_DIRECT writes 68 + * have alignment restrictions (on the buffer and file offset). 69 + */ 69 70 #define CEPH_IOC_SYNCIO _IO(CEPH_IOCTL_MAGIC, 5) 70 71 71 72 #endif
+6 -5
fs/ceph/mds_client.c
··· 764 764 struct ceph_msg *msg; 765 765 struct ceph_mds_session_head *h; 766 766 767 - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS); 767 + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, 768 + false); 768 769 if (!msg) { 769 770 pr_err("create_session_msg ENOMEM creating msg\n"); 770 771 return NULL; ··· 1241 1240 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1242 1241 spin_unlock(&session->s_cap_lock); 1243 1242 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1244 - GFP_NOFS); 1243 + GFP_NOFS, false); 1245 1244 if (!msg) 1246 1245 goto out_unlocked; 1247 1246 dout("add_cap_releases %p msg %p now %d\n", session, msg, ··· 1653 1652 if (req->r_old_dentry_drop) 1654 1653 len += req->r_old_dentry->d_name.len; 1655 1654 1656 - msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS); 1655 + msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); 1657 1656 if (!msg) { 1658 1657 msg = ERR_PTR(-ENOMEM); 1659 1658 goto out_free2; ··· 2519 2518 goto fail_nopagelist; 2520 2519 ceph_pagelist_init(pagelist); 2521 2520 2522 - reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS); 2521 + reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); 2523 2522 if (!reply) 2524 2523 goto fail_nomsg; 2525 2524 ··· 2832 2831 dnamelen = dentry->d_name.len; 2833 2832 len += dnamelen; 2834 2833 2835 - msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS); 2834 + msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS, false); 2836 2835 if (!msg) 2837 2836 return; 2838 2837 lease = msg->front.iov_base;
+36 -25
fs/ceph/super.c
··· 114 114 enum { 115 115 Opt_wsize, 116 116 Opt_rsize, 117 + Opt_rasize, 117 118 Opt_caps_wanted_delay_min, 118 119 Opt_caps_wanted_delay_max, 119 120 Opt_cap_release_safety, ··· 137 136 static match_table_t fsopt_tokens = { 138 137 {Opt_wsize, "wsize=%d"}, 139 138 {Opt_rsize, "rsize=%d"}, 139 + {Opt_rasize, "rasize=%d"}, 140 140 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 141 141 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 142 142 {Opt_cap_release_safety, "cap_release_safety=%d"}, ··· 197 195 break; 198 196 case Opt_rsize: 199 197 fsopt->rsize = intval; 198 + break; 199 + case Opt_rasize: 200 + fsopt->rasize = intval; 200 201 break; 201 202 case Opt_caps_wanted_delay_min: 202 203 fsopt->caps_wanted_delay_min = intval; ··· 294 289 295 290 dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name); 296 291 297 - fsopt->sb_flags = flags; 298 - fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; 292 + fsopt->sb_flags = flags; 293 + fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; 299 294 300 - fsopt->rsize = CEPH_RSIZE_DEFAULT; 301 - fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 295 + fsopt->rsize = CEPH_RSIZE_DEFAULT; 296 + fsopt->rasize = CEPH_RASIZE_DEFAULT; 297 + fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 302 298 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; 303 299 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 304 - fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; 305 - fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 306 - fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 307 - fsopt->congestion_kb = default_congestion_kb(); 308 - 309 - /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 310 - err = -EINVAL; 311 - if (!dev_name) 312 - goto out; 313 - *path = strstr(dev_name, ":/"); 314 - if (*path == NULL) { 315 - pr_err("device name is missing path (no :/ in %s)\n", 316 - dev_name); 317 - goto out; 318 - } 300 + fsopt->cap_release_safety = 
CEPH_CAP_RELEASE_SAFETY_DEFAULT; 301 + fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; 302 + fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; 303 + fsopt->congestion_kb = default_congestion_kb(); 304 + 305 + /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 306 + err = -EINVAL; 307 + if (!dev_name) 308 + goto out; 309 + *path = strstr(dev_name, ":/"); 310 + if (*path == NULL) { 311 + pr_err("device name is missing path (no :/ in %s)\n", 312 + dev_name); 313 + goto out; 314 + } 319 315 dev_name_end = *path; 320 316 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); 321 317 ··· 382 376 seq_printf(m, ",wsize=%d", fsopt->wsize); 383 377 if (fsopt->rsize != CEPH_RSIZE_DEFAULT) 384 378 seq_printf(m, ",rsize=%d", fsopt->rsize); 379 + if (fsopt->rasize != CEPH_RASIZE_DEFAULT) 380 + seq_printf(m, ",rasize=%d", fsopt->rsize); 385 381 if (fsopt->congestion_kb != default_congestion_kb()) 386 382 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); 387 383 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT) ··· 430 422 struct ceph_options *opt) 431 423 { 432 424 struct ceph_fs_client *fsc; 425 + const unsigned supported_features = 426 + CEPH_FEATURE_FLOCK | 427 + CEPH_FEATURE_DIRLAYOUTHASH; 428 + const unsigned required_features = 0; 433 429 int err = -ENOMEM; 434 430 435 431 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL); 436 432 if (!fsc) 437 433 return ERR_PTR(-ENOMEM); 438 434 439 - fsc->client = ceph_create_client(opt, fsc); 435 + fsc->client = ceph_create_client(opt, fsc, supported_features, 436 + required_features); 440 437 if (IS_ERR(fsc->client)) { 441 438 err = PTR_ERR(fsc->client); 442 439 goto fail; 443 440 } 444 441 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 445 - fsc->client->supported_features |= CEPH_FEATURE_FLOCK | 446 - CEPH_FEATURE_DIRLAYOUTHASH; 447 442 fsc->client->monc.want_mdsmap = 1; 448 443 449 444 fsc->mount_options = fsopt; ··· 785 774 { 786 775 int err; 787 776 788 - /* set ra_pages 
based on rsize mount option? */ 789 - if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) 777 + /* set ra_pages based on rasize mount option? */ 778 + if (fsc->mount_options->rasize >= PAGE_CACHE_SIZE) 790 779 fsc->backing_dev_info.ra_pages = 791 - (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) 780 + (fsc->mount_options->rasize + PAGE_CACHE_SIZE - 1) 792 781 >> PAGE_SHIFT; 793 782 else 794 783 fsc->backing_dev_info.ra_pages =
+11 -8
fs/ceph/super.h
··· 36 36 #define ceph_test_mount_opt(fsc, opt) \ 37 37 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) 38 38 39 - #define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */ 39 + #define CEPH_RSIZE_DEFAULT 0 /* max read size */ 40 + #define CEPH_RASIZE_DEFAULT (8192*1024) /* readahead */ 40 41 #define CEPH_MAX_READDIR_DEFAULT 1024 41 42 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) 42 43 #define CEPH_SNAPDIRNAME_DEFAULT ".snap" ··· 46 45 int flags; 47 46 int sb_flags; 48 47 49 - int wsize; 50 - int rsize; /* max readahead */ 48 + int wsize; /* max write size */ 49 + int rsize; /* max read size */ 50 + int rasize; /* max readahead */ 51 51 int congestion_kb; /* max writeback in flight */ 52 52 int caps_wanted_delay_min, caps_wanted_delay_max; 53 53 int cap_release_safety; ··· 346 344 * x86_64+ino32 64 32 347 345 * x86_64 64 64 348 346 */ 349 - static inline u32 ceph_ino_to_ino32(ino_t ino) 347 + static inline u32 ceph_ino_to_ino32(__u64 vino) 350 348 { 351 - ino ^= ino >> (sizeof(ino) * 8 - 32); 349 + u32 ino = vino & 0xffffffff; 350 + ino ^= vino >> 32; 352 351 if (!ino) 353 352 ino = 1; 354 353 return ino; ··· 360 357 */ 361 358 static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) 362 359 { 363 - ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ 364 360 #if BITS_PER_LONG == 32 365 - ino = ceph_ino_to_ino32(ino); 361 + return ceph_ino_to_ino32(vino.ino); 362 + #else 363 + return (ino_t)vino.ino; 366 364 #endif 367 - return ino; 368 365 } 369 366 370 367 /*
+3 -1
include/linux/ceph/libceph.h
··· 215 215 extern int ceph_compare_options(struct ceph_options *new_opt, 216 216 struct ceph_client *client); 217 217 extern struct ceph_client *ceph_create_client(struct ceph_options *opt, 218 - void *private); 218 + void *private, 219 + unsigned supported_features, 220 + unsigned required_features); 219 221 extern u64 ceph_client_id(struct ceph_client *client); 220 222 extern void ceph_destroy_client(struct ceph_client *client); 221 223 extern int __ceph_open_session(struct ceph_client *client,
+2 -1
include/linux/ceph/messenger.h
··· 237 237 extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); 238 238 extern void ceph_con_put(struct ceph_connection *con); 239 239 240 - extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags); 240 + extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, 241 + bool can_fail); 241 242 extern void ceph_msg_kfree(struct ceph_msg *m); 242 243 243 244
+14
net/ceph/Kconfig
··· 27 27 28 28 If unsure, say N. 29 29 30 + config CEPH_LIB_USE_DNS_RESOLVER 31 + bool "Use in-kernel support for DNS lookup" 32 + depends on CEPH_LIB 33 + select DNS_RESOLVER 34 + default n 35 + help 36 + If you say Y here, hostnames (e.g. monitor addresses) will 37 + be resolved using the CONFIG_DNS_RESOLVER facility. 38 + 39 + For information on how to use CONFIG_DNS_RESOLVER consult 40 + Documentation/networking/dns_resolver.txt 41 + 42 + If unsure, say N. 43 +
+23 -22
net/ceph/ceph_common.c
··· 432 432 /* 433 433 * create a fresh client instance 434 434 */ 435 - struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) 435 + struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, 436 + unsigned supported_features, 437 + unsigned required_features) 436 438 { 437 439 struct ceph_client *client; 440 + struct ceph_entity_addr *myaddr = NULL; 438 441 int err = -ENOMEM; 439 442 440 443 client = kzalloc(sizeof(*client), GFP_KERNEL); ··· 452 449 client->auth_err = 0; 453 450 454 451 client->extra_mon_dispatch = NULL; 455 - client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT; 456 - client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT; 452 + client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT | 453 + supported_features; 454 + client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT | 455 + required_features; 457 456 458 - client->msgr = NULL; 457 + /* msgr */ 458 + if (ceph_test_opt(client, MYIP)) 459 + myaddr = &client->options->my_addr; 460 + client->msgr = ceph_messenger_create(myaddr, 461 + client->supported_features, 462 + client->required_features); 463 + if (IS_ERR(client->msgr)) { 464 + err = PTR_ERR(client->msgr); 465 + goto fail; 466 + } 467 + client->msgr->nocrc = ceph_test_opt(client, NOCRC); 459 468 460 469 /* subsystems */ 461 470 err = ceph_monc_init(&client->monc, client); 462 471 if (err < 0) 463 - goto fail; 472 + goto fail_msgr; 464 473 err = ceph_osdc_init(&client->osdc, client); 465 474 if (err < 0) 466 475 goto fail_monc; ··· 481 466 482 467 fail_monc: 483 468 ceph_monc_stop(&client->monc); 469 + fail_msgr: 470 + ceph_messenger_destroy(client->msgr); 484 471 fail: 485 472 kfree(client); 486 473 return ERR_PTR(err); ··· 507 490 508 491 ceph_debugfs_client_cleanup(client); 509 492 510 - if (client->msgr) 511 - ceph_messenger_destroy(client->msgr); 493 + ceph_messenger_destroy(client->msgr); 512 494 513 495 ceph_destroy_options(client->options); 514 496 ··· 530 514 */ 531 515 
int __ceph_open_session(struct ceph_client *client, unsigned long started) 532 516 { 533 - struct ceph_entity_addr *myaddr = NULL; 534 517 int err; 535 518 unsigned long timeout = client->options->mount_timeout * HZ; 536 - 537 - /* initialize the messenger */ 538 - if (client->msgr == NULL) { 539 - if (ceph_test_opt(client, MYIP)) 540 - myaddr = &client->options->my_addr; 541 - client->msgr = ceph_messenger_create(myaddr, 542 - client->supported_features, 543 - client->required_features); 544 - if (IS_ERR(client->msgr)) { 545 - client->msgr = NULL; 546 - return PTR_ERR(client->msgr); 547 - } 548 - client->msgr->nocrc = ceph_test_opt(client, NOCRC); 549 - } 550 519 551 520 /* open session, and wait for mon and osd maps */ 552 521 err = ceph_monc_open_session(&client->monc);
+114 -16
net/ceph/messenger.c
··· 11 11 #include <linux/string.h> 12 12 #include <linux/bio.h> 13 13 #include <linux/blkdev.h> 14 + #include <linux/dns_resolver.h> 14 15 #include <net/tcp.h> 15 16 16 17 #include <linux/ceph/libceph.h> ··· 1079 1078 } 1080 1079 1081 1080 /* 1081 + * Unlike other *_pton function semantics, zero indicates success. 1082 + */ 1083 + static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, 1084 + char delim, const char **ipend) 1085 + { 1086 + struct sockaddr_in *in4 = (void *)ss; 1087 + struct sockaddr_in6 *in6 = (void *)ss; 1088 + 1089 + memset(ss, 0, sizeof(*ss)); 1090 + 1091 + if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) { 1092 + ss->ss_family = AF_INET; 1093 + return 0; 1094 + } 1095 + 1096 + if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) { 1097 + ss->ss_family = AF_INET6; 1098 + return 0; 1099 + } 1100 + 1101 + return -EINVAL; 1102 + } 1103 + 1104 + /* 1105 + * Extract hostname string and resolve using kernel DNS facility. 1106 + */ 1107 + #ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER 1108 + static int ceph_dns_resolve_name(const char *name, size_t namelen, 1109 + struct sockaddr_storage *ss, char delim, const char **ipend) 1110 + { 1111 + const char *end, *delim_p; 1112 + char *colon_p, *ip_addr = NULL; 1113 + int ip_len, ret; 1114 + 1115 + /* 1116 + * The end of the hostname occurs immediately preceding the delimiter or 1117 + * the port marker (':') where the delimiter takes precedence. 1118 + */ 1119 + delim_p = memchr(name, delim, namelen); 1120 + colon_p = memchr(name, ':', namelen); 1121 + 1122 + if (delim_p && colon_p) 1123 + end = delim_p < colon_p ? 
delim_p : colon_p; 1124 + else if (!delim_p && colon_p) 1125 + end = colon_p; 1126 + else { 1127 + end = delim_p; 1128 + if (!end) /* case: hostname:/ */ 1129 + end = name + namelen; 1130 + } 1131 + 1132 + if (end <= name) 1133 + return -EINVAL; 1134 + 1135 + /* do dns_resolve upcall */ 1136 + ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL); 1137 + if (ip_len > 0) 1138 + ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL); 1139 + else 1140 + ret = -ESRCH; 1141 + 1142 + kfree(ip_addr); 1143 + 1144 + *ipend = end; 1145 + 1146 + pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, 1147 + ret, ret ? "failed" : ceph_pr_addr(ss)); 1148 + 1149 + return ret; 1150 + } 1151 + #else 1152 + static inline int ceph_dns_resolve_name(const char *name, size_t namelen, 1153 + struct sockaddr_storage *ss, char delim, const char **ipend) 1154 + { 1155 + return -EINVAL; 1156 + } 1157 + #endif 1158 + 1159 + /* 1160 + * Parse a server name (IP or hostname). If a valid IP address is not found 1161 + * then try to extract a hostname to resolve using userspace DNS upcall. 1162 + */ 1163 + static int ceph_parse_server_name(const char *name, size_t namelen, 1164 + struct sockaddr_storage *ss, char delim, const char **ipend) 1165 + { 1166 + int ret; 1167 + 1168 + ret = ceph_pton(name, namelen, ss, delim, ipend); 1169 + if (ret) 1170 + ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend); 1171 + 1172 + return ret; 1173 + } 1174 + 1175 + /* 1082 1176 * Parse an ip[:port] list into an addr array. Use the default 1083 1177 * monitor port if a port isn't specified. 
1084 1178 */ ··· 1181 1085 struct ceph_entity_addr *addr, 1182 1086 int max_count, int *count) 1183 1087 { 1184 - int i; 1088 + int i, ret = -EINVAL; 1185 1089 const char *p = c; 1186 1090 1187 1091 dout("parse_ips on '%.*s'\n", (int)(end-c), c); 1188 1092 for (i = 0; i < max_count; i++) { 1189 1093 const char *ipend; 1190 1094 struct sockaddr_storage *ss = &addr[i].in_addr; 1191 - struct sockaddr_in *in4 = (void *)ss; 1192 - struct sockaddr_in6 *in6 = (void *)ss; 1193 1095 int port; 1194 1096 char delim = ','; 1195 1097 ··· 1196 1102 p++; 1197 1103 } 1198 1104 1199 - memset(ss, 0, sizeof(*ss)); 1200 - if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, 1201 - delim, &ipend)) 1202 - ss->ss_family = AF_INET; 1203 - else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, 1204 - delim, &ipend)) 1205 - ss->ss_family = AF_INET6; 1206 - else 1105 + ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend); 1106 + if (ret) 1207 1107 goto bad; 1108 + ret = -EINVAL; 1109 + 1208 1110 p = ipend; 1209 1111 1210 1112 if (delim == ']') { ··· 1245 1155 1246 1156 bad: 1247 1157 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); 1248 - return -EINVAL; 1158 + return ret; 1249 1159 } 1250 1160 EXPORT_SYMBOL(ceph_parse_ips); 1251 1161 ··· 2371 2281 * construct a new message with given type, size 2372 2282 * the new msg has a ref count of 1. 
2373 2283 */ 2374 - struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags) 2284 + struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, 2285 + bool can_fail) 2375 2286 { 2376 2287 struct ceph_msg *m; 2377 2288 ··· 2424 2333 m->front.iov_base = kmalloc(front_len, flags); 2425 2334 } 2426 2335 if (m->front.iov_base == NULL) { 2427 - pr_err("msg_new can't allocate %d bytes\n", 2336 + dout("ceph_msg_new can't allocate %d bytes\n", 2428 2337 front_len); 2429 2338 goto out2; 2430 2339 } ··· 2439 2348 out2: 2440 2349 ceph_msg_put(m); 2441 2350 out: 2442 - pr_err("msg_new can't create type %d front %d\n", type, front_len); 2351 + if (!can_fail) { 2352 + pr_err("msg_new can't create type %d front %d\n", type, 2353 + front_len); 2354 + WARN_ON(1); 2355 + } else { 2356 + dout("msg_new can't create type %d front %d\n", type, 2357 + front_len); 2358 + } 2443 2359 return NULL; 2444 2360 } 2445 2361 EXPORT_SYMBOL(ceph_msg_new); ··· 2496 2398 } 2497 2399 if (!msg) { 2498 2400 *skip = 0; 2499 - msg = ceph_msg_new(type, front_len, GFP_NOFS); 2401 + msg = ceph_msg_new(type, front_len, GFP_NOFS, false); 2500 2402 if (!msg) { 2501 2403 pr_err("unable to allocate msg type %d len %d\n", 2502 2404 type, front_len);
+43 -36
net/ceph/mon_client.c
··· 116 116 */ 117 117 static void __close_session(struct ceph_mon_client *monc) 118 118 { 119 - if (monc->con) { 120 - dout("__close_session closing mon%d\n", monc->cur_mon); 121 - ceph_con_revoke(monc->con, monc->m_auth); 122 - ceph_con_close(monc->con); 123 - monc->cur_mon = -1; 124 - monc->pending_auth = 0; 125 - ceph_auth_reset(monc->auth); 126 - } 119 + dout("__close_session closing mon%d\n", monc->cur_mon); 120 + ceph_con_revoke(monc->con, monc->m_auth); 121 + ceph_con_close(monc->con); 122 + monc->cur_mon = -1; 123 + monc->pending_auth = 0; 124 + ceph_auth_reset(monc->auth); 127 125 } 128 126 129 127 /* ··· 300 302 */ 301 303 int ceph_monc_open_session(struct ceph_mon_client *monc) 302 304 { 303 - if (!monc->con) { 304 - monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); 305 - if (!monc->con) 306 - return -ENOMEM; 307 - ceph_con_init(monc->client->msgr, monc->con); 308 - monc->con->private = monc; 309 - monc->con->ops = &mon_con_ops; 310 - } 311 - 312 305 mutex_lock(&monc->mutex); 313 306 __open_session(monc); 314 307 __schedule_delayed(monc); ··· 517 528 init_completion(&req->completion); 518 529 519 530 err = -ENOMEM; 520 - req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS); 531 + req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, 532 + true); 521 533 if (!req->request) 522 534 goto out; 523 - req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS); 535 + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, 536 + true); 524 537 if (!req->reply) 525 538 goto out; 526 539 ··· 617 626 init_completion(&req->completion); 618 627 619 628 err = -ENOMEM; 620 - req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS); 629 + req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, 630 + true); 621 631 if (!req->request) 622 632 goto out; 623 - req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS); 633 + req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, 634 + true); 
624 635 if (!req->reply) 625 636 goto out; 626 637 ··· 748 755 if (err) 749 756 goto out; 750 757 751 - monc->con = NULL; 758 + /* connection */ 759 + monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL); 760 + if (!monc->con) 761 + goto out_monmap; 762 + ceph_con_init(monc->client->msgr, monc->con); 763 + monc->con->private = monc; 764 + monc->con->ops = &mon_con_ops; 752 765 753 766 /* authentication */ 754 767 monc->auth = ceph_auth_init(cl->options->name, 755 768 cl->options->key); 756 - if (IS_ERR(monc->auth)) 757 - return PTR_ERR(monc->auth); 769 + if (IS_ERR(monc->auth)) { 770 + err = PTR_ERR(monc->auth); 771 + goto out_con; 772 + } 758 773 monc->auth->want_keys = 759 774 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 760 775 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; ··· 771 770 err = -ENOMEM; 772 771 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, 773 772 sizeof(struct ceph_mon_subscribe_ack), 774 - GFP_NOFS); 773 + GFP_NOFS, true); 775 774 if (!monc->m_subscribe_ack) 776 - goto out_monmap; 775 + goto out_auth; 777 776 778 - monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS); 777 + monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, 778 + true); 779 779 if (!monc->m_subscribe) 780 780 goto out_subscribe_ack; 781 781 782 - monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS); 782 + monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS, 783 + true); 783 784 if (!monc->m_auth_reply) 784 785 goto out_subscribe; 785 786 786 - monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS); 787 + monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true); 787 788 monc->pending_auth = 0; 788 789 if (!monc->m_auth) 789 790 goto out_auth_reply; ··· 811 808 ceph_msg_put(monc->m_subscribe); 812 809 out_subscribe_ack: 813 810 ceph_msg_put(monc->m_subscribe_ack); 811 + out_auth: 812 + ceph_auth_destroy(monc->auth); 813 + out_con: 814 + monc->con->ops->put(monc->con); 814 815 
out_monmap: 815 816 kfree(monc->monmap); 816 817 out: ··· 829 822 830 823 mutex_lock(&monc->mutex); 831 824 __close_session(monc); 832 - if (monc->con) { 833 - monc->con->private = NULL; 834 - monc->con->ops->put(monc->con); 835 - monc->con = NULL; 836 - } 825 + 826 + monc->con->private = NULL; 827 + monc->con->ops->put(monc->con); 828 + monc->con = NULL; 829 + 837 830 mutex_unlock(&monc->mutex); 838 831 839 832 ceph_auth_destroy(monc->auth); ··· 980 973 case CEPH_MSG_MON_MAP: 981 974 case CEPH_MSG_MDS_MAP: 982 975 case CEPH_MSG_OSD_MAP: 983 - m = ceph_msg_new(type, front_len, GFP_NOFS); 976 + m = ceph_msg_new(type, front_len, GFP_NOFS, false); 984 977 break; 985 978 } 986 979 ··· 1007 1000 if (!con->private) 1008 1001 goto out; 1009 1002 1010 - if (monc->con && !monc->hunting) 1003 + if (!monc->hunting) 1011 1004 pr_info("mon%d %s session lost, " 1012 1005 "hunting for new mon\n", monc->cur_mon, 1013 1006 ceph_pr_addr(&monc->con->peer_addr.in_addr));
+2 -2
net/ceph/msgpool.c
··· 12 12 struct ceph_msgpool *pool = arg; 13 13 struct ceph_msg *msg; 14 14 15 - msg = ceph_msg_new(0, pool->front_len, gfp_mask); 15 + msg = ceph_msg_new(0, pool->front_len, gfp_mask, true); 16 16 if (!msg) { 17 17 dout("msgpool_alloc %s failed\n", pool->name); 18 18 } else { ··· 61 61 WARN_ON(1); 62 62 63 63 /* try to alloc a fresh message */ 64 - return ceph_msg_new(0, front_len, GFP_NOFS); 64 + return ceph_msg_new(0, front_len, GFP_NOFS, false); 65 65 } 66 66 67 67 msg = mempool_alloc(pool->pool, GFP_NOFS);
+20 -14
net/ceph/osd_client.c
··· 227 227 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 228 228 else 229 229 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 230 - OSD_OPREPLY_FRONT_LEN, gfp_flags); 230 + OSD_OPREPLY_FRONT_LEN, gfp_flags, true); 231 231 if (!msg) { 232 232 ceph_osdc_put_request(req); 233 233 return NULL; ··· 250 250 if (use_mempool) 251 251 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 252 252 else 253 - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags); 253 + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); 254 254 if (!msg) { 255 255 ceph_osdc_put_request(req); 256 256 return NULL; ··· 943 943 * Caller should hold map_sem for read and request_mutex. 944 944 */ 945 945 static int __map_request(struct ceph_osd_client *osdc, 946 - struct ceph_osd_request *req) 946 + struct ceph_osd_request *req, int force_resend) 947 947 { 948 948 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; 949 949 struct ceph_pg pgid; ··· 967 967 num = err; 968 968 } 969 969 970 - if ((req->r_osd && req->r_osd->o_osd == o && 970 + if ((!force_resend && 971 + req->r_osd && req->r_osd->o_osd == o && 971 972 req->r_sent >= req->r_osd->o_incarnation && 972 973 req->r_num_pg_osds == num && 973 974 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || ··· 1290 1289 * 1291 1290 * Caller should hold map_sem for read and request_mutex. 1292 1291 */ 1293 - static void kick_requests(struct ceph_osd_client *osdc) 1292 + static void kick_requests(struct ceph_osd_client *osdc, int force_resend) 1294 1293 { 1295 1294 struct ceph_osd_request *req, *nreq; 1296 1295 struct rb_node *p; 1297 1296 int needmap = 0; 1298 1297 int err; 1299 1298 1300 - dout("kick_requests\n"); 1299 + dout("kick_requests %s\n", force_resend ? 
" (force resend)" : ""); 1301 1300 mutex_lock(&osdc->request_mutex); 1302 1301 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { 1303 1302 req = rb_entry(p, struct ceph_osd_request, r_node); 1304 - err = __map_request(osdc, req); 1303 + err = __map_request(osdc, req, force_resend); 1305 1304 if (err < 0) 1306 1305 continue; /* error */ 1307 1306 if (req->r_osd == NULL) { ··· 1319 1318 r_linger_item) { 1320 1319 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); 1321 1320 1322 - err = __map_request(osdc, req); 1321 + err = __map_request(osdc, req, force_resend); 1323 1322 if (err == 0) 1324 1323 continue; /* no change and no osd was specified */ 1325 1324 if (err < 0) ··· 1396 1395 ceph_osdmap_destroy(osdc->osdmap); 1397 1396 osdc->osdmap = newmap; 1398 1397 } 1399 - kick_requests(osdc); 1398 + kick_requests(osdc, 0); 1400 1399 reset_changed_osds(osdc); 1401 1400 } else { 1402 1401 dout("ignoring incremental map %u len %d\n", ··· 1424 1423 "older than our %u\n", epoch, maplen, 1425 1424 osdc->osdmap->epoch); 1426 1425 } else { 1426 + int skipped_map = 0; 1427 + 1427 1428 dout("taking full map %u len %d\n", epoch, maplen); 1428 1429 newmap = osdmap_decode(&p, p+maplen); 1429 1430 if (IS_ERR(newmap)) { ··· 1435 1432 BUG_ON(!newmap); 1436 1433 oldmap = osdc->osdmap; 1437 1434 osdc->osdmap = newmap; 1438 - if (oldmap) 1435 + if (oldmap) { 1436 + if (oldmap->epoch + 1 < newmap->epoch) 1437 + skipped_map = 1; 1439 1438 ceph_osdmap_destroy(oldmap); 1440 - kick_requests(osdc); 1439 + } 1440 + kick_requests(osdc, skipped_map); 1441 1441 } 1442 1442 p += maplen; 1443 1443 nr_maps--; ··· 1713 1707 * the request still han't been touched yet. 
1714 1708 */ 1715 1709 if (req->r_sent == 0) { 1716 - rc = __map_request(osdc, req); 1710 + rc = __map_request(osdc, req, 0); 1717 1711 if (rc < 0) { 1718 1712 if (nofail) { 1719 1713 dout("osdc_start_request failed map, " ··· 2038 2032 if (front > req->r_reply->front.iov_len) { 2039 2033 pr_warning("get_reply front %d > preallocated %d\n", 2040 2034 front, (int)req->r_reply->front.iov_len); 2041 - m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS); 2035 + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); 2042 2036 if (!m) 2043 2037 goto out; 2044 2038 ceph_msg_put(req->r_reply); ··· 2086 2080 switch (type) { 2087 2081 case CEPH_MSG_OSD_MAP: 2088 2082 case CEPH_MSG_WATCH_NOTIFY: 2089 - return ceph_msg_new(type, front, GFP_NOFS); 2083 + return ceph_msg_new(type, front, GFP_NOFS, false); 2090 2084 case CEPH_MSG_OSD_OPREPLY: 2091 2085 return get_reply(con, hdr, skip); 2092 2086 default: