Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ceph-for-4.17-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"The big ticket items are:

- support for rbd "fancy" striping (myself).

The striping feature bit is now fully implemented, allowing mapping
v2 images with non-default striping patterns. This completes
support for --image-format 2.

- CephFS quota support (Luis Henriques and Zheng Yan).

This set is based on the new SnapRealm code in the upcoming v13.y.z
("Mimic") release. Quota handling will be rejected on older
filesystems.

- memory usage improvements in CephFS (Chengguang Xu).

Directory specific bits have been split out of ceph_file_info and
some effort went into improving cap reservation code to avoid OOM
crashes.

Also included are a bunch of assorted fixes all over the place from
Chengguang and others"

* tag 'ceph-for-4.17-rc1' of git://github.com/ceph/ceph-client: (67 commits)
ceph: quota: report root dir quota usage in statfs
ceph: quota: add counter for snaprealms with quota
ceph: quota: cache inode pointer in ceph_snap_realm
ceph: fix root quota realm check
ceph: don't check quota for snap inode
ceph: quota: update MDS when max_bytes is approaching
ceph: quota: support for ceph.quota.max_bytes
ceph: quota: don't allow cross-quota renames
ceph: quota: support for ceph.quota.max_files
ceph: quota: add initial infrastructure to support cephfs quotas
rbd: remove VLA usage
rbd: fix spelling mistake: "reregisteration" -> "reregistration"
ceph: rename function drop_leases() to a more descriptive name
ceph: fix invalid point dereference for error case in mdsc destroy
ceph: return proper bool type to caller instead of pointer
ceph: optimize memory usage
ceph: optimize mds session register
libceph, ceph: add __init attribution to init funcitons
ceph: filter out used flags when printing unused open flags
ceph: don't wait on writeback when there is no more dirty pages
...

+2669 -2031
+16
Documentation/filesystems/ceph.txt
··· 62 62 the identification of large disk space consumers relatively quick, as 63 63 no 'du' or similar recursive scan of the file system is required. 64 64 65 + Finally, Ceph also allows quotas to be set on any directory in the system. 66 + The quota can restrict the number of bytes or the number of files stored 67 + beneath that point in the directory hierarchy. Quotas can be set using 68 + extended attributes 'ceph.quota.max_files' and 'ceph.quota.max_bytes', eg: 69 + 70 + setfattr -n ceph.quota.max_bytes -v 100000000 /some/dir 71 + getfattr -n ceph.quota.max_bytes /some/dir 72 + 73 + A limitation of the current quotas implementation is that it relies on the 74 + cooperation of the client mounting the file system to stop writers when a 75 + limit is reached. A modified or adversarial client cannot be prevented 76 + from writing as much data as it needs. 65 77 66 78 Mount Syntax 67 79 ============ ··· 148 136 149 137 noasyncreaddir 150 138 Do not use the dcache as above for readdir. 139 + 140 + noquotadf 141 + Report overall filesystem usage in statfs instead of using the root 142 + directory quota. 151 143 152 144 More Information 153 145 ================
+1083 -1575
drivers/block/rbd.c
··· 32 32 #include <linux/ceph/osd_client.h> 33 33 #include <linux/ceph/mon_client.h> 34 34 #include <linux/ceph/cls_lock_client.h> 35 + #include <linux/ceph/striper.h> 35 36 #include <linux/ceph/decode.h> 36 37 #include <linux/parser.h> 37 38 #include <linux/bsearch.h> ··· 201 200 }; 202 201 203 202 struct rbd_img_request; 204 - typedef void (*rbd_img_callback_t)(struct rbd_img_request *); 205 - 206 - #define BAD_WHICH U32_MAX /* Good which or bad which, which? */ 207 - 208 - struct rbd_obj_request; 209 - typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *); 210 203 211 204 enum obj_request_type { 212 - OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 205 + OBJ_REQUEST_NODATA = 1, 206 + OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ 207 + OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ 208 + OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */ 213 209 }; 214 210 215 211 enum obj_operation_type { 212 + OBJ_OP_READ = 1, 216 213 OBJ_OP_WRITE, 217 - OBJ_OP_READ, 218 214 OBJ_OP_DISCARD, 219 215 }; 220 216 221 - enum obj_req_flags { 222 - OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 223 - OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 224 - OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */ 225 - OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */ 217 + /* 218 + * Writes go through the following state machine to deal with 219 + * layering: 220 + * 221 + * need copyup 222 + * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP 223 + * | ^ | 224 + * v \------------------------------/ 225 + * done 226 + * ^ 227 + * | 228 + * RBD_OBJ_WRITE_FLAT 229 + * 230 + * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether 231 + * there is a parent or not. 
232 + */ 233 + enum rbd_obj_write_state { 234 + RBD_OBJ_WRITE_FLAT = 1, 235 + RBD_OBJ_WRITE_GUARD, 236 + RBD_OBJ_WRITE_COPYUP, 226 237 }; 227 238 228 239 struct rbd_obj_request { 229 - u64 object_no; 230 - u64 offset; /* object start byte */ 231 - u64 length; /* bytes from offset */ 232 - unsigned long flags; 233 - 234 - /* 235 - * An object request associated with an image will have its 236 - * img_data flag set; a standalone object request will not. 237 - * 238 - * A standalone object request will have which == BAD_WHICH 239 - * and a null obj_request pointer. 240 - * 241 - * An object request initiated in support of a layered image 242 - * object (to check for its existence before a write) will 243 - * have which == BAD_WHICH and a non-null obj_request pointer. 244 - * 245 - * Finally, an object request for rbd image data will have 246 - * which != BAD_WHICH, and will have a non-null img_request 247 - * pointer. The value of which will be in the range 248 - * 0..(img_request->obj_request_count-1). 
249 - */ 240 + struct ceph_object_extent ex; 250 241 union { 251 - struct rbd_obj_request *obj_request; /* STAT op */ 242 + bool tried_parent; /* for reads */ 243 + enum rbd_obj_write_state write_state; /* for writes */ 244 + }; 245 + 246 + struct rbd_img_request *img_request; 247 + struct ceph_file_extent *img_extents; 248 + u32 num_img_extents; 249 + 250 + union { 251 + struct ceph_bio_iter bio_pos; 252 252 struct { 253 - struct rbd_img_request *img_request; 254 - u64 img_offset; 255 - /* links for img_request->obj_requests list */ 256 - struct list_head links; 253 + struct ceph_bvec_iter bvec_pos; 254 + u32 bvec_count; 255 + u32 bvec_idx; 257 256 }; 258 257 }; 259 - u32 which; /* posn image request list */ 260 - 261 - enum obj_request_type type; 262 - union { 263 - struct bio *bio_list; 264 - struct { 265 - struct page **pages; 266 - u32 page_count; 267 - }; 268 - }; 269 - struct page **copyup_pages; 270 - u32 copyup_page_count; 258 + struct bio_vec *copyup_bvecs; 259 + u32 copyup_bvec_count; 271 260 272 261 struct ceph_osd_request *osd_req; 273 262 274 263 u64 xferred; /* bytes transferred */ 275 264 int result; 276 265 277 - rbd_obj_callback_t callback; 278 - 279 266 struct kref kref; 280 267 }; 281 268 282 269 enum img_req_flags { 283 - IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 284 270 IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 285 271 IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 286 - IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ 287 272 }; 288 273 289 274 struct rbd_img_request { 290 275 struct rbd_device *rbd_dev; 291 - u64 offset; /* starting image byte offset */ 292 - u64 length; /* byte count from offset */ 276 + enum obj_operation_type op_type; 277 + enum obj_request_type data_type; 293 278 unsigned long flags; 294 279 union { 295 280 u64 snap_id; /* for reads */ ··· 285 298 struct request *rq; /* block request */ 286 299 struct rbd_obj_request *obj_request; /* obj req initiator */ 287 
300 }; 288 - struct page **copyup_pages; 289 - u32 copyup_page_count; 290 - spinlock_t completion_lock;/* protects next_completion */ 291 - u32 next_completion; 292 - rbd_img_callback_t callback; 301 + spinlock_t completion_lock; 293 302 u64 xferred;/* aggregate bytes transferred */ 294 303 int result; /* first nonzero obj_request result */ 295 304 305 + struct list_head object_extents; /* obj_req.ex structs */ 296 306 u32 obj_request_count; 297 - struct list_head obj_requests; /* rbd_obj_request structs */ 307 + u32 pending_count; 298 308 299 309 struct kref kref; 300 310 }; 301 311 302 312 #define for_each_obj_request(ireq, oreq) \ 303 - list_for_each_entry(oreq, &(ireq)->obj_requests, links) 304 - #define for_each_obj_request_from(ireq, oreq) \ 305 - list_for_each_entry_from(oreq, &(ireq)->obj_requests, links) 313 + list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item) 306 314 #define for_each_obj_request_safe(ireq, oreq, n) \ 307 - list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links) 315 + list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item) 308 316 309 317 enum rbd_watch_state { 310 318 RBD_WATCH_STATE_UNREGISTERED, ··· 415 433 static struct kmem_cache *rbd_img_request_cache; 416 434 static struct kmem_cache *rbd_obj_request_cache; 417 435 418 - static struct bio_set *rbd_bio_clone; 419 - 420 436 static int rbd_major; 421 437 static DEFINE_IDA(rbd_dev_id_ida); 422 438 ··· 427 447 module_param(single_major, bool, S_IRUGO); 428 448 MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); 429 449 430 - static int rbd_img_request_submit(struct rbd_img_request *img_request); 431 - 432 450 static ssize_t rbd_add(struct bus_type *bus, const char *buf, 433 451 size_t count); 434 452 static ssize_t rbd_remove(struct bus_type *bus, const char *buf, ··· 436 458 static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf, 437 459 size_t count); 438 460 static int 
rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth); 439 - static void rbd_spec_put(struct rbd_spec *spec); 440 461 441 462 static int rbd_dev_id_to_minor(int dev_id) 442 463 { ··· 554 577 # define rbd_assert(expr) ((void) 0) 555 578 #endif /* !RBD_DEBUG */ 556 579 557 - static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request); 558 - static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request); 559 - static void rbd_img_parent_read(struct rbd_obj_request *obj_request); 560 580 static void rbd_dev_remove_parent(struct rbd_device *rbd_dev); 561 581 562 582 static int rbd_dev_refresh(struct rbd_device *rbd_dev); ··· 831 857 } 832 858 833 859 /* 834 - * Get a ceph client with specific addr and configuration, if one does 835 - * not exist create it. Either way, ceph_opts is consumed by this 836 - * function. 837 - */ 838 - static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 839 - { 840 - struct rbd_client *rbdc; 841 - 842 - mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 843 - rbdc = rbd_client_find(ceph_opts); 844 - if (rbdc) /* using an existing client */ 845 - ceph_destroy_options(ceph_opts); 846 - else 847 - rbdc = rbd_client_create(ceph_opts); 848 - mutex_unlock(&client_mutex); 849 - 850 - return rbdc; 851 - } 852 - 853 - /* 854 860 * Destroy ceph client 855 861 * 856 862 * Caller must hold rbd_client_list_lock. 
··· 856 902 { 857 903 if (rbdc) 858 904 kref_put(&rbdc->kref, rbd_client_release); 905 + } 906 + 907 + static int wait_for_latest_osdmap(struct ceph_client *client) 908 + { 909 + u64 newest_epoch; 910 + int ret; 911 + 912 + ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch); 913 + if (ret) 914 + return ret; 915 + 916 + if (client->osdc.osdmap->epoch >= newest_epoch) 917 + return 0; 918 + 919 + ceph_osdc_maybe_request_map(&client->osdc); 920 + return ceph_monc_wait_osdmap(&client->monc, newest_epoch, 921 + client->options->mount_timeout); 922 + } 923 + 924 + /* 925 + * Get a ceph client with specific addr and configuration, if one does 926 + * not exist create it. Either way, ceph_opts is consumed by this 927 + * function. 928 + */ 929 + static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts) 930 + { 931 + struct rbd_client *rbdc; 932 + int ret; 933 + 934 + mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING); 935 + rbdc = rbd_client_find(ceph_opts); 936 + if (rbdc) { 937 + ceph_destroy_options(ceph_opts); 938 + 939 + /* 940 + * Using an existing client. Make sure ->pg_pools is up to 941 + * date before we look up the pool id in do_rbd_add(). 
942 + */ 943 + ret = wait_for_latest_osdmap(rbdc->client); 944 + if (ret) { 945 + rbd_warn(NULL, "failed to get latest osdmap: %d", ret); 946 + rbd_put_client(rbdc); 947 + rbdc = ERR_PTR(ret); 948 + } 949 + } else { 950 + rbdc = rbd_client_create(ceph_opts); 951 + } 952 + mutex_unlock(&client_mutex); 953 + 954 + return rbdc; 859 955 } 860 956 861 957 static bool rbd_image_format_valid(u32 image_format) ··· 1227 1223 rbd_dev->mapping.features = 0; 1228 1224 } 1229 1225 1230 - static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset) 1226 + static void zero_bvec(struct bio_vec *bv) 1231 1227 { 1232 - u64 segment_size = rbd_obj_bytes(&rbd_dev->header); 1233 - 1234 - return offset & (segment_size - 1); 1235 - } 1236 - 1237 - static u64 rbd_segment_length(struct rbd_device *rbd_dev, 1238 - u64 offset, u64 length) 1239 - { 1240 - u64 segment_size = rbd_obj_bytes(&rbd_dev->header); 1241 - 1242 - offset &= segment_size - 1; 1243 - 1244 - rbd_assert(length <= U64_MAX - offset); 1245 - if (offset + length > segment_size) 1246 - length = segment_size - offset; 1247 - 1248 - return length; 1249 - } 1250 - 1251 - /* 1252 - * bio helpers 1253 - */ 1254 - 1255 - static void bio_chain_put(struct bio *chain) 1256 - { 1257 - struct bio *tmp; 1258 - 1259 - while (chain) { 1260 - tmp = chain; 1261 - chain = chain->bi_next; 1262 - bio_put(tmp); 1263 - } 1264 - } 1265 - 1266 - /* 1267 - * zeros a bio chain, starting at specific offset 1268 - */ 1269 - static void zero_bio_chain(struct bio *chain, int start_ofs) 1270 - { 1271 - struct bio_vec bv; 1272 - struct bvec_iter iter; 1273 - unsigned long flags; 1274 1228 void *buf; 1275 - int pos = 0; 1229 + unsigned long flags; 1276 1230 1277 - while (chain) { 1278 - bio_for_each_segment(bv, chain, iter) { 1279 - if (pos + bv.bv_len > start_ofs) { 1280 - int remainder = max(start_ofs - pos, 0); 1281 - buf = bvec_kmap_irq(&bv, &flags); 1282 - memset(buf + remainder, 0, 1283 - bv.bv_len - remainder); 1284 - 
flush_dcache_page(bv.bv_page); 1285 - bvec_kunmap_irq(buf, &flags); 1286 - } 1287 - pos += bv.bv_len; 1288 - } 1289 - 1290 - chain = chain->bi_next; 1291 - } 1231 + buf = bvec_kmap_irq(bv, &flags); 1232 + memset(buf, 0, bv->bv_len); 1233 + flush_dcache_page(bv->bv_page); 1234 + bvec_kunmap_irq(buf, &flags); 1292 1235 } 1293 1236 1294 - /* 1295 - * similar to zero_bio_chain(), zeros data defined by a page array, 1296 - * starting at the given byte offset from the start of the array and 1297 - * continuing up to the given end offset. The pages array is 1298 - * assumed to be big enough to hold all bytes up to the end. 1299 - */ 1300 - static void zero_pages(struct page **pages, u64 offset, u64 end) 1237 + static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes) 1301 1238 { 1302 - struct page **page = &pages[offset >> PAGE_SHIFT]; 1239 + struct ceph_bio_iter it = *bio_pos; 1303 1240 1304 - rbd_assert(end > offset); 1305 - rbd_assert(end - offset <= (u64)SIZE_MAX); 1306 - while (offset < end) { 1307 - size_t page_offset; 1308 - size_t length; 1309 - unsigned long flags; 1310 - void *kaddr; 1311 - 1312 - page_offset = offset & ~PAGE_MASK; 1313 - length = min_t(size_t, PAGE_SIZE - page_offset, end - offset); 1314 - local_irq_save(flags); 1315 - kaddr = kmap_atomic(*page); 1316 - memset(kaddr + page_offset, 0, length); 1317 - flush_dcache_page(*page); 1318 - kunmap_atomic(kaddr); 1319 - local_irq_restore(flags); 1320 - 1321 - offset += length; 1322 - page++; 1323 - } 1241 + ceph_bio_iter_advance(&it, off); 1242 + ceph_bio_iter_advance_step(&it, bytes, ({ 1243 + zero_bvec(&bv); 1244 + })); 1324 1245 } 1325 1246 1326 - /* 1327 - * Clone a portion of a bio, starting at the given byte offset 1328 - * and continuing for the number of bytes indicated. 
1329 - */ 1330 - static struct bio *bio_clone_range(struct bio *bio_src, 1331 - unsigned int offset, 1332 - unsigned int len, 1333 - gfp_t gfpmask) 1247 + static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) 1334 1248 { 1335 - struct bio *bio; 1249 + struct ceph_bvec_iter it = *bvec_pos; 1336 1250 1337 - bio = bio_clone_fast(bio_src, gfpmask, rbd_bio_clone); 1338 - if (!bio) 1339 - return NULL; /* ENOMEM */ 1340 - 1341 - bio_advance(bio, offset); 1342 - bio->bi_iter.bi_size = len; 1343 - 1344 - return bio; 1251 + ceph_bvec_iter_advance(&it, off); 1252 + ceph_bvec_iter_advance_step(&it, bytes, ({ 1253 + zero_bvec(&bv); 1254 + })); 1345 1255 } 1346 1256 1347 1257 /* 1348 - * Clone a portion of a bio chain, starting at the given byte offset 1349 - * into the first bio in the source chain and continuing for the 1350 - * number of bytes indicated. The result is another bio chain of 1351 - * exactly the given length, or a null pointer on error. 1258 + * Zero a range in @obj_req data buffer defined by a bio (list) or 1259 + * (private) bio_vec array. 1352 1260 * 1353 - * The bio_src and offset parameters are both in-out. On entry they 1354 - * refer to the first source bio and the offset into that bio where 1355 - * the start of data to be cloned is located. 1356 - * 1357 - * On return, bio_src is updated to refer to the bio in the source 1358 - * chain that contains first un-cloned byte, and *offset will 1359 - * contain the offset of that byte within that bio. 1261 + * @off is relative to the start of the data buffer. 
1360 1262 */ 1361 - static struct bio *bio_chain_clone_range(struct bio **bio_src, 1362 - unsigned int *offset, 1363 - unsigned int len, 1364 - gfp_t gfpmask) 1263 + static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, 1264 + u32 bytes) 1365 1265 { 1366 - struct bio *bi = *bio_src; 1367 - unsigned int off = *offset; 1368 - struct bio *chain = NULL; 1369 - struct bio **end; 1370 - 1371 - /* Build up a chain of clone bios up to the limit */ 1372 - 1373 - if (!bi || off >= bi->bi_iter.bi_size || !len) 1374 - return NULL; /* Nothing to clone */ 1375 - 1376 - end = &chain; 1377 - while (len) { 1378 - unsigned int bi_size; 1379 - struct bio *bio; 1380 - 1381 - if (!bi) { 1382 - rbd_warn(NULL, "bio_chain exhausted with %u left", len); 1383 - goto out_err; /* EINVAL; ran out of bio's */ 1384 - } 1385 - bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len); 1386 - bio = bio_clone_range(bi, off, bi_size, gfpmask); 1387 - if (!bio) 1388 - goto out_err; /* ENOMEM */ 1389 - 1390 - *end = bio; 1391 - end = &bio->bi_next; 1392 - 1393 - off += bi_size; 1394 - if (off == bi->bi_iter.bi_size) { 1395 - bi = bi->bi_next; 1396 - off = 0; 1397 - } 1398 - len -= bi_size; 1266 + switch (obj_req->img_request->data_type) { 1267 + case OBJ_REQUEST_BIO: 1268 + zero_bios(&obj_req->bio_pos, off, bytes); 1269 + break; 1270 + case OBJ_REQUEST_BVECS: 1271 + case OBJ_REQUEST_OWN_BVECS: 1272 + zero_bvecs(&obj_req->bvec_pos, off, bytes); 1273 + break; 1274 + default: 1275 + rbd_assert(0); 1399 1276 } 1400 - *bio_src = bi; 1401 - *offset = off; 1402 - 1403 - return chain; 1404 - out_err: 1405 - bio_chain_put(chain); 1406 - 1407 - return NULL; 1408 - } 1409 - 1410 - /* 1411 - * The default/initial value for all object request flags is 0. For 1412 - * each flag, once its value is set to 1 it is never reset to 0 1413 - * again. 
1414 - */ 1415 - static void obj_request_img_data_set(struct rbd_obj_request *obj_request) 1416 - { 1417 - if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) { 1418 - struct rbd_device *rbd_dev; 1419 - 1420 - rbd_dev = obj_request->img_request->rbd_dev; 1421 - rbd_warn(rbd_dev, "obj_request %p already marked img_data", 1422 - obj_request); 1423 - } 1424 - } 1425 - 1426 - static bool obj_request_img_data_test(struct rbd_obj_request *obj_request) 1427 - { 1428 - smp_mb(); 1429 - return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0; 1430 - } 1431 - 1432 - static void obj_request_done_set(struct rbd_obj_request *obj_request) 1433 - { 1434 - if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) { 1435 - struct rbd_device *rbd_dev = NULL; 1436 - 1437 - if (obj_request_img_data_test(obj_request)) 1438 - rbd_dev = obj_request->img_request->rbd_dev; 1439 - rbd_warn(rbd_dev, "obj_request %p already marked done", 1440 - obj_request); 1441 - } 1442 - } 1443 - 1444 - static bool obj_request_done_test(struct rbd_obj_request *obj_request) 1445 - { 1446 - smp_mb(); 1447 - return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0; 1448 - } 1449 - 1450 - /* 1451 - * This sets the KNOWN flag after (possibly) setting the EXISTS 1452 - * flag. The latter is set based on the "exists" value provided. 1453 - * 1454 - * Note that for our purposes once an object exists it never goes 1455 - * away again. It's possible that the response from two existence 1456 - * checks are separated by the creation of the target object, and 1457 - * the first ("doesn't exist") response arrives *after* the second 1458 - * ("does exist"). In that case we ignore the second one. 
1459 - */ 1460 - static void obj_request_existence_set(struct rbd_obj_request *obj_request, 1461 - bool exists) 1462 - { 1463 - if (exists) 1464 - set_bit(OBJ_REQ_EXISTS, &obj_request->flags); 1465 - set_bit(OBJ_REQ_KNOWN, &obj_request->flags); 1466 - smp_mb(); 1467 - } 1468 - 1469 - static bool obj_request_known_test(struct rbd_obj_request *obj_request) 1470 - { 1471 - smp_mb(); 1472 - return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0; 1473 - } 1474 - 1475 - static bool obj_request_exists_test(struct rbd_obj_request *obj_request) 1476 - { 1477 - smp_mb(); 1478 - return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0; 1479 - } 1480 - 1481 - static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request) 1482 - { 1483 - struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 1484 - 1485 - return obj_request->img_offset < 1486 - round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header)); 1487 - } 1488 - 1489 - static void rbd_obj_request_get(struct rbd_obj_request *obj_request) 1490 - { 1491 - dout("%s: obj %p (was %d)\n", __func__, obj_request, 1492 - kref_read(&obj_request->kref)); 1493 - kref_get(&obj_request->kref); 1494 1277 } 1495 1278 1496 1279 static void rbd_obj_request_destroy(struct kref *kref); ··· 1296 1505 kref_get(&img_request->kref); 1297 1506 } 1298 1507 1299 - static bool img_request_child_test(struct rbd_img_request *img_request); 1300 - static void rbd_parent_request_destroy(struct kref *kref); 1301 1508 static void rbd_img_request_destroy(struct kref *kref); 1302 1509 static void rbd_img_request_put(struct rbd_img_request *img_request) 1303 1510 { 1304 1511 rbd_assert(img_request != NULL); 1305 1512 dout("%s: img %p (was %d)\n", __func__, img_request, 1306 1513 kref_read(&img_request->kref)); 1307 - if (img_request_child_test(img_request)) 1308 - kref_put(&img_request->kref, rbd_parent_request_destroy); 1309 - else 1310 - kref_put(&img_request->kref, rbd_img_request_destroy); 1514 + 
kref_put(&img_request->kref, rbd_img_request_destroy); 1311 1515 } 1312 1516 1313 1517 static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request, ··· 1312 1526 1313 1527 /* Image request now owns object's original reference */ 1314 1528 obj_request->img_request = img_request; 1315 - obj_request->which = img_request->obj_request_count; 1316 - rbd_assert(!obj_request_img_data_test(obj_request)); 1317 - obj_request_img_data_set(obj_request); 1318 - rbd_assert(obj_request->which != BAD_WHICH); 1319 1529 img_request->obj_request_count++; 1320 - list_add_tail(&obj_request->links, &img_request->obj_requests); 1321 - dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 1322 - obj_request->which); 1530 + img_request->pending_count++; 1531 + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 1323 1532 } 1324 1533 1325 1534 static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request, 1326 1535 struct rbd_obj_request *obj_request) 1327 1536 { 1328 - rbd_assert(obj_request->which != BAD_WHICH); 1329 - 1330 - dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request, 1331 - obj_request->which); 1332 - list_del(&obj_request->links); 1537 + dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 1538 + list_del(&obj_request->ex.oe_item); 1333 1539 rbd_assert(img_request->obj_request_count > 0); 1334 1540 img_request->obj_request_count--; 1335 - rbd_assert(obj_request->which == img_request->obj_request_count); 1336 - obj_request->which = BAD_WHICH; 1337 - rbd_assert(obj_request_img_data_test(obj_request)); 1338 1541 rbd_assert(obj_request->img_request == img_request); 1339 - obj_request->img_request = NULL; 1340 - obj_request->callback = NULL; 1341 1542 rbd_obj_request_put(obj_request); 1342 1543 } 1343 - 1344 - static bool obj_request_type_valid(enum obj_request_type type) 1345 - { 1346 - switch (type) { 1347 - case OBJ_REQUEST_NODATA: 1348 - case OBJ_REQUEST_BIO: 1349 - case 
OBJ_REQUEST_PAGES: 1350 - return true; 1351 - default: 1352 - return false; 1353 - } 1354 - } 1355 - 1356 - static void rbd_img_obj_callback(struct rbd_obj_request *obj_request); 1357 1544 1358 1545 static void rbd_obj_request_submit(struct rbd_obj_request *obj_request) 1359 1546 { 1360 1547 struct ceph_osd_request *osd_req = obj_request->osd_req; 1361 1548 1362 1549 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__, 1363 - obj_request, obj_request->object_no, obj_request->offset, 1364 - obj_request->length, osd_req); 1365 - if (obj_request_img_data_test(obj_request)) { 1366 - WARN_ON(obj_request->callback != rbd_img_obj_callback); 1367 - rbd_img_request_get(obj_request->img_request); 1368 - } 1550 + obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off, 1551 + obj_request->ex.oe_len, osd_req); 1369 1552 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1370 - } 1371 - 1372 - static void rbd_img_request_complete(struct rbd_img_request *img_request) 1373 - { 1374 - 1375 - dout("%s: img %p\n", __func__, img_request); 1376 - 1377 - /* 1378 - * If no error occurred, compute the aggregate transfer 1379 - * count for the image request. We could instead use 1380 - * atomic64_cmpxchg() to update it as each object request 1381 - * completes; not clear which way is better off hand. 1382 - */ 1383 - if (!img_request->result) { 1384 - struct rbd_obj_request *obj_request; 1385 - u64 xferred = 0; 1386 - 1387 - for_each_obj_request(img_request, obj_request) 1388 - xferred += obj_request->xferred; 1389 - img_request->xferred = xferred; 1390 - } 1391 - 1392 - if (img_request->callback) 1393 - img_request->callback(img_request); 1394 - else 1395 - rbd_img_request_put(img_request); 1396 1553 } 1397 1554 1398 1555 /* ··· 1343 1614 * is conditionally set to 1 at image request initialization time 1344 1615 * and currently never change thereafter. 
1345 1616 */ 1346 - static void img_request_write_set(struct rbd_img_request *img_request) 1347 - { 1348 - set_bit(IMG_REQ_WRITE, &img_request->flags); 1349 - smp_mb(); 1350 - } 1351 - 1352 - static bool img_request_write_test(struct rbd_img_request *img_request) 1353 - { 1354 - smp_mb(); 1355 - return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 1356 - } 1357 - 1358 - /* 1359 - * Set the discard flag when the img_request is an discard request 1360 - */ 1361 - static void img_request_discard_set(struct rbd_img_request *img_request) 1362 - { 1363 - set_bit(IMG_REQ_DISCARD, &img_request->flags); 1364 - smp_mb(); 1365 - } 1366 - 1367 - static bool img_request_discard_test(struct rbd_img_request *img_request) 1368 - { 1369 - smp_mb(); 1370 - return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; 1371 - } 1372 - 1373 - static void img_request_child_set(struct rbd_img_request *img_request) 1374 - { 1375 - set_bit(IMG_REQ_CHILD, &img_request->flags); 1376 - smp_mb(); 1377 - } 1378 - 1379 - static void img_request_child_clear(struct rbd_img_request *img_request) 1380 - { 1381 - clear_bit(IMG_REQ_CHILD, &img_request->flags); 1382 - smp_mb(); 1383 - } 1384 - 1385 - static bool img_request_child_test(struct rbd_img_request *img_request) 1386 - { 1387 - smp_mb(); 1388 - return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0; 1389 - } 1390 - 1391 1617 static void img_request_layered_set(struct rbd_img_request *img_request) 1392 1618 { 1393 1619 set_bit(IMG_REQ_LAYERED, &img_request->flags); ··· 1361 1677 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1362 1678 } 1363 1679 1364 - static enum obj_operation_type 1365 - rbd_img_request_op_type(struct rbd_img_request *img_request) 1680 + static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req) 1366 1681 { 1367 - if (img_request_write_test(img_request)) 1368 - return OBJ_OP_WRITE; 1369 - else if (img_request_discard_test(img_request)) 1370 - return OBJ_OP_DISCARD; 1371 - else 1372 - return OBJ_OP_READ; 
1682 + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1683 + 1684 + return !obj_req->ex.oe_off && 1685 + obj_req->ex.oe_len == rbd_dev->layout.object_size; 1373 1686 } 1374 1687 1375 - static void 1376 - rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 1688 + static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req) 1377 1689 { 1378 - u64 xferred = obj_request->xferred; 1379 - u64 length = obj_request->length; 1690 + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 1380 1691 1381 - dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 1382 - obj_request, obj_request->img_request, obj_request->result, 1383 - xferred, length); 1384 - /* 1385 - * ENOENT means a hole in the image. We zero-fill the entire 1386 - * length of the request. A short read also implies zero-fill 1387 - * to the end of the request. An error requires the whole 1388 - * length of the request to be reported finished with an error 1389 - * to the block layer. In each case we update the xferred 1390 - * count to indicate the whole request was satisfied. 
1391 - */ 1392 - rbd_assert(obj_request->type != OBJ_REQUEST_NODATA); 1393 - if (obj_request->result == -ENOENT) { 1394 - if (obj_request->type == OBJ_REQUEST_BIO) 1395 - zero_bio_chain(obj_request->bio_list, 0); 1396 - else 1397 - zero_pages(obj_request->pages, 0, length); 1398 - obj_request->result = 0; 1399 - } else if (xferred < length && !obj_request->result) { 1400 - if (obj_request->type == OBJ_REQUEST_BIO) 1401 - zero_bio_chain(obj_request->bio_list, xferred); 1402 - else 1403 - zero_pages(obj_request->pages, xferred, length); 1692 + return obj_req->ex.oe_off + obj_req->ex.oe_len == 1693 + rbd_dev->layout.object_size; 1694 + } 1695 + 1696 + static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req) 1697 + { 1698 + return ceph_file_extents_bytes(obj_req->img_extents, 1699 + obj_req->num_img_extents); 1700 + } 1701 + 1702 + static bool rbd_img_is_write(struct rbd_img_request *img_req) 1703 + { 1704 + switch (img_req->op_type) { 1705 + case OBJ_OP_READ: 1706 + return false; 1707 + case OBJ_OP_WRITE: 1708 + case OBJ_OP_DISCARD: 1709 + return true; 1710 + default: 1711 + rbd_assert(0); 1404 1712 } 1405 - obj_request->xferred = length; 1406 - obj_request_done_set(obj_request); 1407 1713 } 1408 1714 1409 - static void rbd_obj_request_complete(struct rbd_obj_request *obj_request) 1410 - { 1411 - dout("%s: obj %p cb %p\n", __func__, obj_request, 1412 - obj_request->callback); 1413 - obj_request->callback(obj_request); 1414 - } 1415 - 1416 - static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err) 1417 - { 1418 - obj_request->result = err; 1419 - obj_request->xferred = 0; 1420 - /* 1421 - * kludge - mirror rbd_obj_request_submit() to match a put in 1422 - * rbd_img_obj_callback() 1423 - */ 1424 - if (obj_request_img_data_test(obj_request)) { 1425 - WARN_ON(obj_request->callback != rbd_img_obj_callback); 1426 - rbd_img_request_get(obj_request->img_request); 1427 - } 1428 - obj_request_done_set(obj_request); 1429 - 
rbd_obj_request_complete(obj_request); 1430 - } 1431 - 1432 - static void rbd_osd_read_callback(struct rbd_obj_request *obj_request) 1433 - { 1434 - struct rbd_img_request *img_request = NULL; 1435 - struct rbd_device *rbd_dev = NULL; 1436 - bool layered = false; 1437 - 1438 - if (obj_request_img_data_test(obj_request)) { 1439 - img_request = obj_request->img_request; 1440 - layered = img_request && img_request_layered_test(img_request); 1441 - rbd_dev = img_request->rbd_dev; 1442 - } 1443 - 1444 - dout("%s: obj %p img %p result %d %llu/%llu\n", __func__, 1445 - obj_request, img_request, obj_request->result, 1446 - obj_request->xferred, obj_request->length); 1447 - if (layered && obj_request->result == -ENOENT && 1448 - obj_request->img_offset < rbd_dev->parent_overlap) 1449 - rbd_img_parent_read(obj_request); 1450 - else if (img_request) 1451 - rbd_img_obj_request_read_callback(obj_request); 1452 - else 1453 - obj_request_done_set(obj_request); 1454 - } 1455 - 1456 - static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) 1457 - { 1458 - dout("%s: obj %p result %d %llu\n", __func__, obj_request, 1459 - obj_request->result, obj_request->length); 1460 - /* 1461 - * There is no such thing as a successful short write. Set 1462 - * it to our originally-requested length. 1463 - */ 1464 - obj_request->xferred = obj_request->length; 1465 - obj_request_done_set(obj_request); 1466 - } 1467 - 1468 - static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) 1469 - { 1470 - dout("%s: obj %p result %d %llu\n", __func__, obj_request, 1471 - obj_request->result, obj_request->length); 1472 - /* 1473 - * There is no such thing as a successful short discard. Set 1474 - * it to our originally-requested length. 
1475 - */ 1476 - obj_request->xferred = obj_request->length; 1477 - /* discarding a non-existent object is not a problem */ 1478 - if (obj_request->result == -ENOENT) 1479 - obj_request->result = 0; 1480 - obj_request_done_set(obj_request); 1481 - } 1482 - 1483 - /* 1484 - * For a simple stat call there's nothing to do. We'll do more if 1485 - * this is part of a write sequence for a layered image. 1486 - */ 1487 - static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request) 1488 - { 1489 - dout("%s: obj %p\n", __func__, obj_request); 1490 - obj_request_done_set(obj_request); 1491 - } 1492 - 1493 - static void rbd_osd_call_callback(struct rbd_obj_request *obj_request) 1494 - { 1495 - dout("%s: obj %p\n", __func__, obj_request); 1496 - 1497 - if (obj_request_img_data_test(obj_request)) 1498 - rbd_osd_copyup_callback(obj_request); 1499 - else 1500 - obj_request_done_set(obj_request); 1501 - } 1715 + static void rbd_obj_handle_request(struct rbd_obj_request *obj_req); 1502 1716 1503 1717 static void rbd_osd_req_callback(struct ceph_osd_request *osd_req) 1504 1718 { 1505 - struct rbd_obj_request *obj_request = osd_req->r_priv; 1506 - u16 opcode; 1719 + struct rbd_obj_request *obj_req = osd_req->r_priv; 1507 1720 1508 - dout("%s: osd_req %p\n", __func__, osd_req); 1509 - rbd_assert(osd_req == obj_request->osd_req); 1510 - if (obj_request_img_data_test(obj_request)) { 1511 - rbd_assert(obj_request->img_request); 1512 - rbd_assert(obj_request->which != BAD_WHICH); 1513 - } else { 1514 - rbd_assert(obj_request->which == BAD_WHICH); 1515 - } 1721 + dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req, 1722 + osd_req->r_result, obj_req); 1723 + rbd_assert(osd_req == obj_req->osd_req); 1516 1724 1517 - if (osd_req->r_result < 0) 1518 - obj_request->result = osd_req->r_result; 1725 + obj_req->result = osd_req->r_result < 0 ? 
osd_req->r_result : 0; 1726 + if (!obj_req->result && !rbd_img_is_write(obj_req->img_request)) 1727 + obj_req->xferred = osd_req->r_result; 1728 + else 1729 + /* 1730 + * Writes aren't allowed to return a data payload. In some 1731 + * guarded write cases (e.g. stat + zero on an empty object) 1732 + * a stat response makes it through, but we don't care. 1733 + */ 1734 + obj_req->xferred = 0; 1519 1735 1520 - /* 1521 - * We support a 64-bit length, but ultimately it has to be 1522 - * passed to the block layer, which just supports a 32-bit 1523 - * length field. 1524 - */ 1525 - obj_request->xferred = osd_req->r_ops[0].outdata_len; 1526 - rbd_assert(obj_request->xferred < (u64)UINT_MAX); 1527 - 1528 - opcode = osd_req->r_ops[0].op; 1529 - switch (opcode) { 1530 - case CEPH_OSD_OP_READ: 1531 - rbd_osd_read_callback(obj_request); 1532 - break; 1533 - case CEPH_OSD_OP_SETALLOCHINT: 1534 - rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE || 1535 - osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL); 1536 - /* fall through */ 1537 - case CEPH_OSD_OP_WRITE: 1538 - case CEPH_OSD_OP_WRITEFULL: 1539 - rbd_osd_write_callback(obj_request); 1540 - break; 1541 - case CEPH_OSD_OP_STAT: 1542 - rbd_osd_stat_callback(obj_request); 1543 - break; 1544 - case CEPH_OSD_OP_DELETE: 1545 - case CEPH_OSD_OP_TRUNCATE: 1546 - case CEPH_OSD_OP_ZERO: 1547 - rbd_osd_discard_callback(obj_request); 1548 - break; 1549 - case CEPH_OSD_OP_CALL: 1550 - rbd_osd_call_callback(obj_request); 1551 - break; 1552 - default: 1553 - rbd_warn(NULL, "unexpected OSD op: object_no %016llx opcode %d", 1554 - obj_request->object_no, opcode); 1555 - break; 1556 - } 1557 - 1558 - if (obj_request_done_test(obj_request)) 1559 - rbd_obj_request_complete(obj_request); 1736 + rbd_obj_handle_request(obj_req); 1560 1737 } 1561 1738 1562 1739 static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request) 1563 1740 { 1564 1741 struct ceph_osd_request *osd_req = obj_request->osd_req; 1565 1742 1566 - 
rbd_assert(obj_request_img_data_test(obj_request)); 1743 + osd_req->r_flags = CEPH_OSD_FLAG_READ; 1567 1744 osd_req->r_snapid = obj_request->img_request->snap_id; 1568 1745 } 1569 1746 ··· 1432 1887 { 1433 1888 struct ceph_osd_request *osd_req = obj_request->osd_req; 1434 1889 1890 + osd_req->r_flags = CEPH_OSD_FLAG_WRITE; 1435 1891 ktime_get_real_ts(&osd_req->r_mtime); 1436 - osd_req->r_data_offset = obj_request->offset; 1892 + osd_req->r_data_offset = obj_request->ex.oe_off; 1437 1893 } 1438 1894 1439 1895 static struct ceph_osd_request * 1440 - __rbd_osd_req_create(struct rbd_device *rbd_dev, 1441 - struct ceph_snap_context *snapc, 1442 - int num_ops, unsigned int flags, 1443 - struct rbd_obj_request *obj_request) 1896 + rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) 1444 1897 { 1898 + struct rbd_img_request *img_req = obj_req->img_request; 1899 + struct rbd_device *rbd_dev = img_req->rbd_dev; 1445 1900 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 1446 1901 struct ceph_osd_request *req; 1447 1902 const char *name_format = rbd_dev->image_format == 1 ? 1448 1903 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT; 1449 1904 1450 - req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO); 1905 + req = ceph_osdc_alloc_request(osdc, 1906 + (rbd_img_is_write(img_req) ? 
img_req->snapc : NULL), 1907 + num_ops, false, GFP_NOIO); 1451 1908 if (!req) 1452 1909 return NULL; 1453 1910 1454 - req->r_flags = flags; 1455 1911 req->r_callback = rbd_osd_req_callback; 1456 - req->r_priv = obj_request; 1912 + req->r_priv = obj_req; 1457 1913 1458 1914 req->r_base_oloc.pool = rbd_dev->layout.pool_id; 1459 1915 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, 1460 - rbd_dev->header.object_prefix, obj_request->object_no)) 1916 + rbd_dev->header.object_prefix, obj_req->ex.oe_objno)) 1461 1917 goto err_req; 1462 1918 1463 1919 if (ceph_osdc_alloc_messages(req, GFP_NOIO)) ··· 1471 1925 return NULL; 1472 1926 } 1473 1927 1474 - /* 1475 - * Create an osd request. A read request has one osd op (read). 1476 - * A write request has either one (watch) or two (hint+write) osd ops. 1477 - * (All rbd data writes are prefixed with an allocation hint op, but 1478 - * technically osd watch is a write request, hence this distinction.) 1479 - */ 1480 - static struct ceph_osd_request *rbd_osd_req_create( 1481 - struct rbd_device *rbd_dev, 1482 - enum obj_operation_type op_type, 1483 - unsigned int num_ops, 1484 - struct rbd_obj_request *obj_request) 1485 - { 1486 - struct ceph_snap_context *snapc = NULL; 1487 - 1488 - if (obj_request_img_data_test(obj_request) && 1489 - (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { 1490 - struct rbd_img_request *img_request = obj_request->img_request; 1491 - if (op_type == OBJ_OP_WRITE) { 1492 - rbd_assert(img_request_write_test(img_request)); 1493 - } else { 1494 - rbd_assert(img_request_discard_test(img_request)); 1495 - } 1496 - snapc = img_request->snapc; 1497 - } 1498 - 1499 - rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); 1500 - 1501 - return __rbd_osd_req_create(rbd_dev, snapc, num_ops, 1502 - (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ? 
1503 - CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request); 1504 - } 1505 - 1506 - /* 1507 - * Create a copyup osd request based on the information in the object 1508 - * request supplied. A copyup request has two or three osd ops, a 1509 - * copyup method call, potentially a hint op, and a write or truncate 1510 - * or zero op. 1511 - */ 1512 - static struct ceph_osd_request * 1513 - rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 1514 - { 1515 - struct rbd_img_request *img_request; 1516 - int num_osd_ops = 3; 1517 - 1518 - rbd_assert(obj_request_img_data_test(obj_request)); 1519 - img_request = obj_request->img_request; 1520 - rbd_assert(img_request); 1521 - rbd_assert(img_request_write_test(img_request) || 1522 - img_request_discard_test(img_request)); 1523 - 1524 - if (img_request_discard_test(img_request)) 1525 - num_osd_ops = 2; 1526 - 1527 - return __rbd_osd_req_create(img_request->rbd_dev, 1528 - img_request->snapc, num_osd_ops, 1529 - CEPH_OSD_FLAG_WRITE, obj_request); 1530 - } 1531 - 1532 1928 static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req) 1533 1929 { 1534 1930 ceph_osdc_put_request(osd_req); 1535 1931 } 1536 1932 1537 - static struct rbd_obj_request * 1538 - rbd_obj_request_create(enum obj_request_type type) 1933 + static struct rbd_obj_request *rbd_obj_request_create(void) 1539 1934 { 1540 1935 struct rbd_obj_request *obj_request; 1541 - 1542 - rbd_assert(obj_request_type_valid(type)); 1543 1936 1544 1937 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO); 1545 1938 if (!obj_request) 1546 1939 return NULL; 1547 1940 1548 - obj_request->which = BAD_WHICH; 1549 - obj_request->type = type; 1550 - INIT_LIST_HEAD(&obj_request->links); 1941 + ceph_object_extent_init(&obj_request->ex); 1551 1942 kref_init(&obj_request->kref); 1552 1943 1553 1944 dout("%s %p\n", __func__, obj_request); ··· 1494 2011 static void rbd_obj_request_destroy(struct kref *kref) 1495 2012 { 1496 2013 struct rbd_obj_request *obj_request; 
2014 + u32 i; 1497 2015 1498 2016 obj_request = container_of(kref, struct rbd_obj_request, kref); 1499 2017 1500 2018 dout("%s: obj %p\n", __func__, obj_request); 1501 2019 1502 - rbd_assert(obj_request->img_request == NULL); 1503 - rbd_assert(obj_request->which == BAD_WHICH); 1504 - 1505 2020 if (obj_request->osd_req) 1506 2021 rbd_osd_req_destroy(obj_request->osd_req); 1507 2022 1508 - rbd_assert(obj_request_type_valid(obj_request->type)); 1509 - switch (obj_request->type) { 2023 + switch (obj_request->img_request->data_type) { 1510 2024 case OBJ_REQUEST_NODATA: 1511 - break; /* Nothing to do */ 1512 2025 case OBJ_REQUEST_BIO: 1513 - if (obj_request->bio_list) 1514 - bio_chain_put(obj_request->bio_list); 2026 + case OBJ_REQUEST_BVECS: 2027 + break; /* Nothing to do */ 2028 + case OBJ_REQUEST_OWN_BVECS: 2029 + kfree(obj_request->bvec_pos.bvecs); 1515 2030 break; 1516 - case OBJ_REQUEST_PAGES: 1517 - /* img_data requests don't own their page array */ 1518 - if (obj_request->pages && 1519 - !obj_request_img_data_test(obj_request)) 1520 - ceph_release_page_vector(obj_request->pages, 1521 - obj_request->page_count); 1522 - break; 2031 + default: 2032 + rbd_assert(0); 2033 + } 2034 + 2035 + kfree(obj_request->img_extents); 2036 + if (obj_request->copyup_bvecs) { 2037 + for (i = 0; i < obj_request->copyup_bvec_count; i++) { 2038 + if (obj_request->copyup_bvecs[i].bv_page) 2039 + __free_page(obj_request->copyup_bvecs[i].bv_page); 2040 + } 2041 + kfree(obj_request->copyup_bvecs); 1523 2042 } 1524 2043 1525 2044 kmem_cache_free(rbd_obj_request_cache, obj_request); ··· 1596 2111 */ 1597 2112 static struct rbd_img_request *rbd_img_request_create( 1598 2113 struct rbd_device *rbd_dev, 1599 - u64 offset, u64 length, 1600 2114 enum obj_operation_type op_type, 1601 2115 struct ceph_snap_context *snapc) 1602 2116 { ··· 1606 2122 return NULL; 1607 2123 1608 2124 img_request->rbd_dev = rbd_dev; 1609 - img_request->offset = offset; 1610 - img_request->length = length; 1611 - if 
(op_type == OBJ_OP_DISCARD) { 1612 - img_request_discard_set(img_request); 1613 - img_request->snapc = snapc; 1614 - } else if (op_type == OBJ_OP_WRITE) { 1615 - img_request_write_set(img_request); 1616 - img_request->snapc = snapc; 1617 - } else { 2125 + img_request->op_type = op_type; 2126 + if (!rbd_img_is_write(img_request)) 1618 2127 img_request->snap_id = rbd_dev->spec->snap_id; 1619 - } 2128 + else 2129 + img_request->snapc = snapc; 2130 + 1620 2131 if (rbd_dev_parent_get(rbd_dev)) 1621 2132 img_request_layered_set(img_request); 1622 2133 1623 2134 spin_lock_init(&img_request->completion_lock); 1624 - INIT_LIST_HEAD(&img_request->obj_requests); 2135 + INIT_LIST_HEAD(&img_request->object_extents); 1625 2136 kref_init(&img_request->kref); 1626 2137 1627 - dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 1628 - obj_op_name(op_type), offset, length, img_request); 1629 - 2138 + dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, 2139 + obj_op_name(op_type), img_request); 1630 2140 return img_request; 1631 2141 } 1632 2142 ··· 1643 2165 rbd_dev_parent_put(img_request->rbd_dev); 1644 2166 } 1645 2167 1646 - if (img_request_write_test(img_request) || 1647 - img_request_discard_test(img_request)) 2168 + if (rbd_img_is_write(img_request)) 1648 2169 ceph_put_snap_context(img_request->snapc); 1649 2170 1650 2171 kmem_cache_free(rbd_img_request_cache, img_request); 1651 2172 } 1652 2173 1653 - static struct rbd_img_request *rbd_parent_request_create( 1654 - struct rbd_obj_request *obj_request, 1655 - u64 img_offset, u64 length) 2174 + static void prune_extents(struct ceph_file_extent *img_extents, 2175 + u32 *num_img_extents, u64 overlap) 1656 2176 { 1657 - struct rbd_img_request *parent_request; 1658 - struct rbd_device *rbd_dev; 2177 + u32 cnt = *num_img_extents; 1659 2178 1660 - rbd_assert(obj_request->img_request); 1661 - rbd_dev = obj_request->img_request->rbd_dev; 2179 + /* drop extents completely beyond the overlap */ 2180 + while (cnt && 
img_extents[cnt - 1].fe_off >= overlap) 2181 + cnt--; 1662 2182 1663 - parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, 1664 - length, OBJ_OP_READ, NULL); 1665 - if (!parent_request) 1666 - return NULL; 2183 + if (cnt) { 2184 + struct ceph_file_extent *ex = &img_extents[cnt - 1]; 1667 2185 1668 - img_request_child_set(parent_request); 1669 - rbd_obj_request_get(obj_request); 1670 - parent_request->obj_request = obj_request; 1671 - 1672 - return parent_request; 1673 - } 1674 - 1675 - static void rbd_parent_request_destroy(struct kref *kref) 1676 - { 1677 - struct rbd_img_request *parent_request; 1678 - struct rbd_obj_request *orig_request; 1679 - 1680 - parent_request = container_of(kref, struct rbd_img_request, kref); 1681 - orig_request = parent_request->obj_request; 1682 - 1683 - parent_request->obj_request = NULL; 1684 - rbd_obj_request_put(orig_request); 1685 - img_request_child_clear(parent_request); 1686 - 1687 - rbd_img_request_destroy(kref); 1688 - } 1689 - 1690 - static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) 1691 - { 1692 - struct rbd_img_request *img_request; 1693 - unsigned int xferred; 1694 - int result; 1695 - bool more; 1696 - 1697 - rbd_assert(obj_request_img_data_test(obj_request)); 1698 - img_request = obj_request->img_request; 1699 - 1700 - rbd_assert(obj_request->xferred <= (u64)UINT_MAX); 1701 - xferred = (unsigned int)obj_request->xferred; 1702 - result = obj_request->result; 1703 - if (result) { 1704 - struct rbd_device *rbd_dev = img_request->rbd_dev; 1705 - enum obj_operation_type op_type; 1706 - 1707 - if (img_request_discard_test(img_request)) 1708 - op_type = OBJ_OP_DISCARD; 1709 - else if (img_request_write_test(img_request)) 1710 - op_type = OBJ_OP_WRITE; 1711 - else 1712 - op_type = OBJ_OP_READ; 1713 - 1714 - rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 1715 - obj_op_name(op_type), obj_request->length, 1716 - obj_request->img_offset, obj_request->offset); 1717 - rbd_warn(rbd_dev, " result 
%d xferred %x", 1718 - result, xferred); 1719 - if (!img_request->result) 1720 - img_request->result = result; 1721 - /* 1722 - * Need to end I/O on the entire obj_request worth of 1723 - * bytes in case of error. 1724 - */ 1725 - xferred = obj_request->length; 2186 + /* trim final overlapping extent */ 2187 + if (ex->fe_off + ex->fe_len > overlap) 2188 + ex->fe_len = overlap - ex->fe_off; 1726 2189 } 1727 2190 1728 - if (img_request_child_test(img_request)) { 1729 - rbd_assert(img_request->obj_request != NULL); 1730 - more = obj_request->which < img_request->obj_request_count - 1; 1731 - } else { 1732 - blk_status_t status = errno_to_blk_status(result); 1733 - 1734 - rbd_assert(img_request->rq != NULL); 1735 - 1736 - more = blk_update_request(img_request->rq, status, xferred); 1737 - if (!more) 1738 - __blk_mq_end_request(img_request->rq, status); 1739 - } 1740 - 1741 - return more; 1742 - } 1743 - 1744 - static void rbd_img_obj_callback(struct rbd_obj_request *obj_request) 1745 - { 1746 - struct rbd_img_request *img_request; 1747 - u32 which = obj_request->which; 1748 - bool more = true; 1749 - 1750 - rbd_assert(obj_request_img_data_test(obj_request)); 1751 - img_request = obj_request->img_request; 1752 - 1753 - dout("%s: img %p obj %p\n", __func__, img_request, obj_request); 1754 - rbd_assert(img_request != NULL); 1755 - rbd_assert(img_request->obj_request_count > 0); 1756 - rbd_assert(which != BAD_WHICH); 1757 - rbd_assert(which < img_request->obj_request_count); 1758 - 1759 - spin_lock_irq(&img_request->completion_lock); 1760 - if (which != img_request->next_completion) 1761 - goto out; 1762 - 1763 - for_each_obj_request_from(img_request, obj_request) { 1764 - rbd_assert(more); 1765 - rbd_assert(which < img_request->obj_request_count); 1766 - 1767 - if (!obj_request_done_test(obj_request)) 1768 - break; 1769 - more = rbd_img_obj_end_request(obj_request); 1770 - which++; 1771 - } 1772 - 1773 - rbd_assert(more ^ (which == img_request->obj_request_count)); 1774 - 
img_request->next_completion = which; 1775 - out: 1776 - spin_unlock_irq(&img_request->completion_lock); 1777 - rbd_img_request_put(img_request); 1778 - 1779 - if (!more) 1780 - rbd_img_request_complete(img_request); 2191 + *num_img_extents = cnt; 1781 2192 } 1782 2193 1783 2194 /* 1784 - * Add individual osd ops to the given ceph_osd_request and prepare 1785 - * them for submission. num_ops is the current number of 1786 - * osd operations already to the object request. 2195 + * Determine the byte range(s) covered by either just the object extent 2196 + * or the entire object in the parent image. 1787 2197 */ 1788 - static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, 1789 - struct ceph_osd_request *osd_request, 1790 - enum obj_operation_type op_type, 1791 - unsigned int num_ops) 2198 + static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req, 2199 + bool entire) 1792 2200 { 1793 - struct rbd_img_request *img_request = obj_request->img_request; 1794 - struct rbd_device *rbd_dev = img_request->rbd_dev; 1795 - u64 object_size = rbd_obj_bytes(&rbd_dev->header); 1796 - u64 offset = obj_request->offset; 1797 - u64 length = obj_request->length; 1798 - u64 img_end; 1799 - u16 opcode; 1800 - 1801 - if (op_type == OBJ_OP_DISCARD) { 1802 - if (!offset && length == object_size && 1803 - (!img_request_layered_test(img_request) || 1804 - !obj_request_overlaps_parent(obj_request))) { 1805 - opcode = CEPH_OSD_OP_DELETE; 1806 - } else if ((offset + length == object_size)) { 1807 - opcode = CEPH_OSD_OP_TRUNCATE; 1808 - } else { 1809 - down_read(&rbd_dev->header_rwsem); 1810 - img_end = rbd_dev->header.image_size; 1811 - up_read(&rbd_dev->header_rwsem); 1812 - 1813 - if (obj_request->img_offset + length == img_end) 1814 - opcode = CEPH_OSD_OP_TRUNCATE; 1815 - else 1816 - opcode = CEPH_OSD_OP_ZERO; 1817 - } 1818 - } else if (op_type == OBJ_OP_WRITE) { 1819 - if (!offset && length == object_size) 1820 - opcode = CEPH_OSD_OP_WRITEFULL; 1821 - else 1822 - 
opcode = CEPH_OSD_OP_WRITE; 1823 - osd_req_op_alloc_hint_init(osd_request, num_ops, 1824 - object_size, object_size); 1825 - num_ops++; 1826 - } else { 1827 - opcode = CEPH_OSD_OP_READ; 1828 - } 1829 - 1830 - if (opcode == CEPH_OSD_OP_DELETE) 1831 - osd_req_op_init(osd_request, num_ops, opcode, 0); 1832 - else 1833 - osd_req_op_extent_init(osd_request, num_ops, opcode, 1834 - offset, length, 0, 0); 1835 - 1836 - if (obj_request->type == OBJ_REQUEST_BIO) 1837 - osd_req_op_extent_osd_data_bio(osd_request, num_ops, 1838 - obj_request->bio_list, length); 1839 - else if (obj_request->type == OBJ_REQUEST_PAGES) 1840 - osd_req_op_extent_osd_data_pages(osd_request, num_ops, 1841 - obj_request->pages, length, 1842 - offset & ~PAGE_MASK, false, false); 1843 - 1844 - /* Discards are also writes */ 1845 - if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) 1846 - rbd_osd_req_format_write(obj_request); 1847 - else 1848 - rbd_osd_req_format_read(obj_request); 1849 - } 1850 - 1851 - /* 1852 - * Split up an image request into one or more object requests, each 1853 - * to a different object. The "type" parameter indicates whether 1854 - * "data_desc" is the pointer to the head of a list of bio 1855 - * structures, or the base of a page array. In either case this 1856 - * function assumes data_desc describes memory sufficient to hold 1857 - * all data described by the image request. 
1858 - */ 1859 - static int rbd_img_request_fill(struct rbd_img_request *img_request, 1860 - enum obj_request_type type, 1861 - void *data_desc) 1862 - { 1863 - struct rbd_device *rbd_dev = img_request->rbd_dev; 1864 - struct rbd_obj_request *obj_request = NULL; 1865 - struct rbd_obj_request *next_obj_request; 1866 - struct bio *bio_list = NULL; 1867 - unsigned int bio_offset = 0; 1868 - struct page **pages = NULL; 1869 - enum obj_operation_type op_type; 1870 - u64 img_offset; 1871 - u64 resid; 1872 - 1873 - dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 1874 - (int)type, data_desc); 1875 - 1876 - img_offset = img_request->offset; 1877 - resid = img_request->length; 1878 - rbd_assert(resid > 0); 1879 - op_type = rbd_img_request_op_type(img_request); 1880 - 1881 - if (type == OBJ_REQUEST_BIO) { 1882 - bio_list = data_desc; 1883 - rbd_assert(img_offset == 1884 - bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 1885 - } else if (type == OBJ_REQUEST_PAGES) { 1886 - pages = data_desc; 1887 - } 1888 - 1889 - while (resid) { 1890 - struct ceph_osd_request *osd_req; 1891 - u64 object_no = img_offset >> rbd_dev->header.obj_order; 1892 - u64 offset = rbd_segment_offset(rbd_dev, img_offset); 1893 - u64 length = rbd_segment_length(rbd_dev, img_offset, resid); 1894 - 1895 - obj_request = rbd_obj_request_create(type); 1896 - if (!obj_request) 1897 - goto out_unwind; 1898 - 1899 - obj_request->object_no = object_no; 1900 - obj_request->offset = offset; 1901 - obj_request->length = length; 1902 - 1903 - /* 1904 - * set obj_request->img_request before creating the 1905 - * osd_request so that it gets the right snapc 1906 - */ 1907 - rbd_img_obj_request_add(img_request, obj_request); 1908 - 1909 - if (type == OBJ_REQUEST_BIO) { 1910 - unsigned int clone_size; 1911 - 1912 - rbd_assert(length <= (u64)UINT_MAX); 1913 - clone_size = (unsigned int)length; 1914 - obj_request->bio_list = 1915 - bio_chain_clone_range(&bio_list, 1916 - &bio_offset, 1917 - clone_size, 1918 - 
GFP_NOIO); 1919 - if (!obj_request->bio_list) 1920 - goto out_unwind; 1921 - } else if (type == OBJ_REQUEST_PAGES) { 1922 - unsigned int page_count; 1923 - 1924 - obj_request->pages = pages; 1925 - page_count = (u32)calc_pages_for(offset, length); 1926 - obj_request->page_count = page_count; 1927 - if ((offset + length) & ~PAGE_MASK) 1928 - page_count--; /* more on last page */ 1929 - pages += page_count; 1930 - } 1931 - 1932 - osd_req = rbd_osd_req_create(rbd_dev, op_type, 1933 - (op_type == OBJ_OP_WRITE) ? 2 : 1, 1934 - obj_request); 1935 - if (!osd_req) 1936 - goto out_unwind; 1937 - 1938 - obj_request->osd_req = osd_req; 1939 - obj_request->callback = rbd_img_obj_callback; 1940 - obj_request->img_offset = img_offset; 1941 - 1942 - rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); 1943 - 1944 - img_offset += length; 1945 - resid -= length; 1946 - } 1947 - 1948 - return 0; 1949 - 1950 - out_unwind: 1951 - for_each_obj_request_safe(img_request, obj_request, next_obj_request) 1952 - rbd_img_obj_request_del(img_request, obj_request); 1953 - 1954 - return -ENOMEM; 1955 - } 1956 - 1957 - static void 1958 - rbd_osd_copyup_callback(struct rbd_obj_request *obj_request) 1959 - { 1960 - struct rbd_img_request *img_request; 1961 - struct rbd_device *rbd_dev; 1962 - struct page **pages; 1963 - u32 page_count; 1964 - 1965 - dout("%s: obj %p\n", __func__, obj_request); 1966 - 1967 - rbd_assert(obj_request->type == OBJ_REQUEST_BIO || 1968 - obj_request->type == OBJ_REQUEST_NODATA); 1969 - rbd_assert(obj_request_img_data_test(obj_request)); 1970 - img_request = obj_request->img_request; 1971 - rbd_assert(img_request); 1972 - 1973 - rbd_dev = img_request->rbd_dev; 1974 - rbd_assert(rbd_dev); 1975 - 1976 - pages = obj_request->copyup_pages; 1977 - rbd_assert(pages != NULL); 1978 - obj_request->copyup_pages = NULL; 1979 - page_count = obj_request->copyup_page_count; 1980 - rbd_assert(page_count); 1981 - obj_request->copyup_page_count = 0; 1982 - 
ceph_release_page_vector(pages, page_count); 1983 - 1984 - /* 1985 - * We want the transfer count to reflect the size of the 1986 - * original write request. There is no such thing as a 1987 - * successful short write, so if the request was successful 1988 - * we can just set it to the originally-requested length. 1989 - */ 1990 - if (!obj_request->result) 1991 - obj_request->xferred = obj_request->length; 1992 - 1993 - obj_request_done_set(obj_request); 1994 - } 1995 - 1996 - static void 1997 - rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) 1998 - { 1999 - struct rbd_obj_request *orig_request; 2000 - struct ceph_osd_request *osd_req; 2001 - struct rbd_device *rbd_dev; 2002 - struct page **pages; 2003 - enum obj_operation_type op_type; 2004 - u32 page_count; 2005 - int img_result; 2006 - u64 parent_length; 2007 - 2008 - rbd_assert(img_request_child_test(img_request)); 2009 - 2010 - /* First get what we need from the image request */ 2011 - 2012 - pages = img_request->copyup_pages; 2013 - rbd_assert(pages != NULL); 2014 - img_request->copyup_pages = NULL; 2015 - page_count = img_request->copyup_page_count; 2016 - rbd_assert(page_count); 2017 - img_request->copyup_page_count = 0; 2018 - 2019 - orig_request = img_request->obj_request; 2020 - rbd_assert(orig_request != NULL); 2021 - rbd_assert(obj_request_type_valid(orig_request->type)); 2022 - img_result = img_request->result; 2023 - parent_length = img_request->length; 2024 - rbd_assert(img_result || parent_length == img_request->xferred); 2025 - rbd_img_request_put(img_request); 2026 - 2027 - rbd_assert(orig_request->img_request); 2028 - rbd_dev = orig_request->img_request->rbd_dev; 2029 - rbd_assert(rbd_dev); 2030 - 2031 - /* 2032 - * If the overlap has become 0 (most likely because the 2033 - * image has been flattened) we need to free the pages 2034 - * and re-submit the original write request. 
2035 - */ 2036 - if (!rbd_dev->parent_overlap) { 2037 - ceph_release_page_vector(pages, page_count); 2038 - rbd_obj_request_submit(orig_request); 2039 - return; 2040 - } 2041 - 2042 - if (img_result) 2043 - goto out_err; 2044 - 2045 - /* 2046 - * The original osd request is of no use to use any more. 2047 - * We need a new one that can hold the three ops in a copyup 2048 - * request. Allocate the new copyup osd request for the 2049 - * original request, and release the old one. 2050 - */ 2051 - img_result = -ENOMEM; 2052 - osd_req = rbd_osd_req_create_copyup(orig_request); 2053 - if (!osd_req) 2054 - goto out_err; 2055 - rbd_osd_req_destroy(orig_request->osd_req); 2056 - orig_request->osd_req = osd_req; 2057 - orig_request->copyup_pages = pages; 2058 - orig_request->copyup_page_count = page_count; 2059 - 2060 - /* Initialize the copyup op */ 2061 - 2062 - osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup"); 2063 - osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 2064 - false, false); 2065 - 2066 - /* Add the other op(s) */ 2067 - 2068 - op_type = rbd_img_request_op_type(orig_request->img_request); 2069 - rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); 2070 - 2071 - /* All set, send it off. */ 2072 - 2073 - rbd_obj_request_submit(orig_request); 2074 - return; 2075 - 2076 - out_err: 2077 - ceph_release_page_vector(pages, page_count); 2078 - rbd_obj_request_error(orig_request, img_result); 2079 - } 2080 - 2081 - /* 2082 - * Read from the parent image the range of data that covers the 2083 - * entire target of the given object request. This is used for 2084 - * satisfying a layered image write request when the target of an 2085 - * object request from the image request does not exist. 2086 - * 2087 - * A page array big enough to hold the returned data is allocated 2088 - * and supplied to rbd_img_request_fill() as the "data descriptor." 
2089 - * When the read completes, this page array will be transferred to 2090 - * the original object request for the copyup operation. 2091 - * 2092 - * If an error occurs, it is recorded as the result of the original 2093 - * object request in rbd_img_obj_exists_callback(). 2094 - */ 2095 - static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request) 2096 - { 2097 - struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 2098 - struct rbd_img_request *parent_request = NULL; 2099 - u64 img_offset; 2100 - u64 length; 2101 - struct page **pages = NULL; 2102 - u32 page_count; 2103 - int result; 2104 - 2105 - rbd_assert(rbd_dev->parent != NULL); 2106 - 2107 - /* 2108 - * Determine the byte range covered by the object in the 2109 - * child image to which the original request was to be sent. 2110 - */ 2111 - img_offset = obj_request->img_offset - obj_request->offset; 2112 - length = rbd_obj_bytes(&rbd_dev->header); 2113 - 2114 - /* 2115 - * There is no defined parent data beyond the parent 2116 - * overlap, so limit what we read at that boundary if 2117 - * necessary. 2118 - */ 2119 - if (img_offset + length > rbd_dev->parent_overlap) { 2120 - rbd_assert(img_offset < rbd_dev->parent_overlap); 2121 - length = rbd_dev->parent_overlap - img_offset; 2122 - } 2123 - 2124 - /* 2125 - * Allocate a page array big enough to receive the data read 2126 - * from the parent. 
2127 - */ 2128 - page_count = (u32)calc_pages_for(0, length); 2129 - pages = ceph_alloc_page_vector(page_count, GFP_NOIO); 2130 - if (IS_ERR(pages)) { 2131 - result = PTR_ERR(pages); 2132 - pages = NULL; 2133 - goto out_err; 2134 - } 2135 - 2136 - result = -ENOMEM; 2137 - parent_request = rbd_parent_request_create(obj_request, 2138 - img_offset, length); 2139 - if (!parent_request) 2140 - goto out_err; 2141 - 2142 - result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages); 2143 - if (result) 2144 - goto out_err; 2145 - 2146 - parent_request->copyup_pages = pages; 2147 - parent_request->copyup_page_count = page_count; 2148 - parent_request->callback = rbd_img_obj_parent_read_full_callback; 2149 - 2150 - result = rbd_img_request_submit(parent_request); 2151 - if (!result) 2152 - return 0; 2153 - 2154 - parent_request->copyup_pages = NULL; 2155 - parent_request->copyup_page_count = 0; 2156 - out_err: 2157 - if (pages) 2158 - ceph_release_page_vector(pages, page_count); 2159 - if (parent_request) 2160 - rbd_img_request_put(parent_request); 2161 - return result; 2162 - } 2163 - 2164 - static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request) 2165 - { 2166 - struct rbd_obj_request *orig_request; 2167 - struct rbd_device *rbd_dev; 2168 - int result; 2169 - 2170 - rbd_assert(!obj_request_img_data_test(obj_request)); 2171 - 2172 - /* 2173 - * All we need from the object request is the original 2174 - * request and the result of the STAT op. Grab those, then 2175 - * we're done with the request. 
2176 - */ 2177 - orig_request = obj_request->obj_request; 2178 - obj_request->obj_request = NULL; 2179 - rbd_obj_request_put(orig_request); 2180 - rbd_assert(orig_request); 2181 - rbd_assert(orig_request->img_request); 2182 - 2183 - result = obj_request->result; 2184 - obj_request->result = 0; 2185 - 2186 - dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__, 2187 - obj_request, orig_request, result, 2188 - obj_request->xferred, obj_request->length); 2189 - rbd_obj_request_put(obj_request); 2190 - 2191 - /* 2192 - * If the overlap has become 0 (most likely because the 2193 - * image has been flattened) we need to re-submit the 2194 - * original request. 2195 - */ 2196 - rbd_dev = orig_request->img_request->rbd_dev; 2197 - if (!rbd_dev->parent_overlap) { 2198 - rbd_obj_request_submit(orig_request); 2199 - return; 2200 - } 2201 - 2202 - /* 2203 - * Our only purpose here is to determine whether the object 2204 - * exists, and we don't want to treat the non-existence as 2205 - * an error. If something else comes back, transfer the 2206 - * error to the original request and complete it now. 2207 - */ 2208 - if (!result) { 2209 - obj_request_existence_set(orig_request, true); 2210 - } else if (result == -ENOENT) { 2211 - obj_request_existence_set(orig_request, false); 2212 - } else { 2213 - goto fail_orig_request; 2214 - } 2215 - 2216 - /* 2217 - * Resubmit the original request now that we have recorded 2218 - * whether the target object exists. 
2219 - */ 2220 - result = rbd_img_obj_request_submit(orig_request); 2221 - if (result) 2222 - goto fail_orig_request; 2223 - 2224 - return; 2225 - 2226 - fail_orig_request: 2227 - rbd_obj_request_error(orig_request, result); 2228 - } 2229 - 2230 - static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) 2231 - { 2232 - struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 2233 - struct rbd_obj_request *stat_request; 2234 - struct page **pages; 2235 - u32 page_count; 2236 - size_t size; 2201 + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 2237 2202 int ret; 2238 2203 2239 - stat_request = rbd_obj_request_create(OBJ_REQUEST_PAGES); 2240 - if (!stat_request) 2204 + if (!rbd_dev->parent_overlap) 2205 + return 0; 2206 + 2207 + ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno, 2208 + entire ? 0 : obj_req->ex.oe_off, 2209 + entire ? rbd_dev->layout.object_size : 2210 + obj_req->ex.oe_len, 2211 + &obj_req->img_extents, 2212 + &obj_req->num_img_extents); 2213 + if (ret) 2214 + return ret; 2215 + 2216 + prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 2217 + rbd_dev->parent_overlap); 2218 + return 0; 2219 + } 2220 + 2221 + static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) 2222 + { 2223 + switch (obj_req->img_request->data_type) { 2224 + case OBJ_REQUEST_BIO: 2225 + osd_req_op_extent_osd_data_bio(obj_req->osd_req, which, 2226 + &obj_req->bio_pos, 2227 + obj_req->ex.oe_len); 2228 + break; 2229 + case OBJ_REQUEST_BVECS: 2230 + case OBJ_REQUEST_OWN_BVECS: 2231 + rbd_assert(obj_req->bvec_pos.iter.bi_size == 2232 + obj_req->ex.oe_len); 2233 + rbd_assert(obj_req->bvec_idx == obj_req->bvec_count); 2234 + osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, 2235 + &obj_req->bvec_pos); 2236 + break; 2237 + default: 2238 + rbd_assert(0); 2239 + } 2240 + } 2241 + 2242 + static int rbd_obj_setup_read(struct rbd_obj_request *obj_req) 2243 + { 2244 + obj_req->osd_req = 
rbd_osd_req_create(obj_req, 1); 2245 + if (!obj_req->osd_req) 2241 2246 return -ENOMEM; 2242 2247 2243 - stat_request->object_no = obj_request->object_no; 2248 + osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ, 2249 + obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 2250 + rbd_osd_req_setup_data(obj_req, 0); 2244 2251 2245 - stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, 2246 - stat_request); 2247 - if (!stat_request->osd_req) { 2248 - ret = -ENOMEM; 2249 - goto fail_stat_request; 2250 - } 2252 + rbd_osd_req_format_read(obj_req); 2253 + return 0; 2254 + } 2255 + 2256 + static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req, 2257 + unsigned int which) 2258 + { 2259 + struct page **pages; 2251 2260 2252 2261 /* 2253 2262 * The response data for a STAT call consists of: ··· 1744 2779 * le32 tv_nsec; 1745 2780 * } mtime; 1746 2781 */ 1747 - size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32); 1748 - page_count = (u32)calc_pages_for(0, size); 1749 - pages = ceph_alloc_page_vector(page_count, GFP_NOIO); 1750 - if (IS_ERR(pages)) { 1751 - ret = PTR_ERR(pages); 1752 - goto fail_stat_request; 1753 - } 2782 + pages = ceph_alloc_page_vector(1, GFP_NOIO); 2783 + if (IS_ERR(pages)) 2784 + return PTR_ERR(pages); 1754 2785 1755 - osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0); 1756 - osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0, 1757 - false, false); 1758 - 1759 - rbd_obj_request_get(obj_request); 1760 - stat_request->obj_request = obj_request; 1761 - stat_request->pages = pages; 1762 - stat_request->page_count = page_count; 1763 - stat_request->callback = rbd_img_obj_exists_callback; 1764 - 1765 - rbd_obj_request_submit(stat_request); 2786 + osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0); 2787 + osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages, 2788 + 8 + sizeof(struct ceph_timespec), 2789 + 0, false, true); 1766 2790 return 0; 1767 - 1768 - fail_stat_request: 1769 
- rbd_obj_request_put(stat_request); 1770 - return ret; 1771 2791 } 1772 2792 1773 - static bool img_obj_request_simple(struct rbd_obj_request *obj_request) 2793 + static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req, 2794 + unsigned int which) 1774 2795 { 1775 - struct rbd_img_request *img_request = obj_request->img_request; 1776 - struct rbd_device *rbd_dev = img_request->rbd_dev; 2796 + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 2797 + u16 opcode; 1777 2798 1778 - /* Reads */ 1779 - if (!img_request_write_test(img_request) && 1780 - !img_request_discard_test(img_request)) 1781 - return true; 2799 + osd_req_op_alloc_hint_init(obj_req->osd_req, which++, 2800 + rbd_dev->layout.object_size, 2801 + rbd_dev->layout.object_size); 1782 2802 1783 - /* Non-layered writes */ 1784 - if (!img_request_layered_test(img_request)) 1785 - return true; 2803 + if (rbd_obj_is_entire(obj_req)) 2804 + opcode = CEPH_OSD_OP_WRITEFULL; 2805 + else 2806 + opcode = CEPH_OSD_OP_WRITE; 1786 2807 1787 - /* 1788 - * Layered writes outside of the parent overlap range don't 1789 - * share any data with the parent. 1790 - */ 1791 - if (!obj_request_overlaps_parent(obj_request)) 1792 - return true; 2808 + osd_req_op_extent_init(obj_req->osd_req, which, opcode, 2809 + obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0); 2810 + rbd_osd_req_setup_data(obj_req, which++); 1793 2811 1794 - /* 1795 - * Entire-object layered writes - we will overwrite whatever 1796 - * parent data there is anyway. 1797 - */ 1798 - if (!obj_request->offset && 1799 - obj_request->length == rbd_obj_bytes(&rbd_dev->header)) 1800 - return true; 1801 - 1802 - /* 1803 - * If the object is known to already exist, its parent data has 1804 - * already been copied. 
1805 - */ 1806 - if (obj_request_known_test(obj_request) && 1807 - obj_request_exists_test(obj_request)) 1808 - return true; 1809 - 1810 - return false; 2812 + rbd_assert(which == obj_req->osd_req->r_num_ops); 2813 + rbd_osd_req_format_write(obj_req); 1811 2814 } 1812 2815 1813 - static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 2816 + static int rbd_obj_setup_write(struct rbd_obj_request *obj_req) 1814 2817 { 1815 - rbd_assert(obj_request_img_data_test(obj_request)); 1816 - rbd_assert(obj_request_type_valid(obj_request->type)); 1817 - rbd_assert(obj_request->img_request); 2818 + unsigned int num_osd_ops, which = 0; 2819 + int ret; 1818 2820 1819 - if (img_obj_request_simple(obj_request)) { 1820 - rbd_obj_request_submit(obj_request); 1821 - return 0; 2821 + /* reverse map the entire object onto the parent */ 2822 + ret = rbd_obj_calc_img_extents(obj_req, true); 2823 + if (ret) 2824 + return ret; 2825 + 2826 + if (obj_req->num_img_extents) { 2827 + obj_req->write_state = RBD_OBJ_WRITE_GUARD; 2828 + num_osd_ops = 3; /* stat + setallochint + write/writefull */ 2829 + } else { 2830 + obj_req->write_state = RBD_OBJ_WRITE_FLAT; 2831 + num_osd_ops = 2; /* setallochint + write/writefull */ 2832 + } 2833 + 2834 + obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 2835 + if (!obj_req->osd_req) 2836 + return -ENOMEM; 2837 + 2838 + if (obj_req->num_img_extents) { 2839 + ret = __rbd_obj_setup_stat(obj_req, which++); 2840 + if (ret) 2841 + return ret; 2842 + } 2843 + 2844 + __rbd_obj_setup_write(obj_req, which); 2845 + return 0; 2846 + } 2847 + 2848 + static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req, 2849 + unsigned int which) 2850 + { 2851 + u16 opcode; 2852 + 2853 + if (rbd_obj_is_entire(obj_req)) { 2854 + if (obj_req->num_img_extents) { 2855 + osd_req_op_init(obj_req->osd_req, which++, 2856 + CEPH_OSD_OP_CREATE, 0); 2857 + opcode = CEPH_OSD_OP_TRUNCATE; 2858 + } else { 2859 + osd_req_op_init(obj_req->osd_req, which++, 2860 
+ CEPH_OSD_OP_DELETE, 0); 2861 + opcode = 0; 2862 + } 2863 + } else if (rbd_obj_is_tail(obj_req)) { 2864 + opcode = CEPH_OSD_OP_TRUNCATE; 2865 + } else { 2866 + opcode = CEPH_OSD_OP_ZERO; 2867 + } 2868 + 2869 + if (opcode) 2870 + osd_req_op_extent_init(obj_req->osd_req, which++, opcode, 2871 + obj_req->ex.oe_off, obj_req->ex.oe_len, 2872 + 0, 0); 2873 + 2874 + rbd_assert(which == obj_req->osd_req->r_num_ops); 2875 + rbd_osd_req_format_write(obj_req); 2876 + } 2877 + 2878 + static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req) 2879 + { 2880 + unsigned int num_osd_ops, which = 0; 2881 + int ret; 2882 + 2883 + /* reverse map the entire object onto the parent */ 2884 + ret = rbd_obj_calc_img_extents(obj_req, true); 2885 + if (ret) 2886 + return ret; 2887 + 2888 + if (rbd_obj_is_entire(obj_req)) { 2889 + obj_req->write_state = RBD_OBJ_WRITE_FLAT; 2890 + if (obj_req->num_img_extents) 2891 + num_osd_ops = 2; /* create + truncate */ 2892 + else 2893 + num_osd_ops = 1; /* delete */ 2894 + } else { 2895 + if (obj_req->num_img_extents) { 2896 + obj_req->write_state = RBD_OBJ_WRITE_GUARD; 2897 + num_osd_ops = 2; /* stat + truncate/zero */ 2898 + } else { 2899 + obj_req->write_state = RBD_OBJ_WRITE_FLAT; 2900 + num_osd_ops = 1; /* truncate/zero */ 2901 + } 2902 + } 2903 + 2904 + obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 2905 + if (!obj_req->osd_req) 2906 + return -ENOMEM; 2907 + 2908 + if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) { 2909 + ret = __rbd_obj_setup_stat(obj_req, which++); 2910 + if (ret) 2911 + return ret; 2912 + } 2913 + 2914 + __rbd_obj_setup_discard(obj_req, which); 2915 + return 0; 2916 + } 2917 + 2918 + /* 2919 + * For each object request in @img_req, allocate an OSD request, add 2920 + * individual OSD ops and prepare them for submission. The number of 2921 + * OSD ops depends on op_type and the overlap point (if any). 
2922 + */ 2923 + static int __rbd_img_fill_request(struct rbd_img_request *img_req) 2924 + { 2925 + struct rbd_obj_request *obj_req; 2926 + int ret; 2927 + 2928 + for_each_obj_request(img_req, obj_req) { 2929 + switch (img_req->op_type) { 2930 + case OBJ_OP_READ: 2931 + ret = rbd_obj_setup_read(obj_req); 2932 + break; 2933 + case OBJ_OP_WRITE: 2934 + ret = rbd_obj_setup_write(obj_req); 2935 + break; 2936 + case OBJ_OP_DISCARD: 2937 + ret = rbd_obj_setup_discard(obj_req); 2938 + break; 2939 + default: 2940 + rbd_assert(0); 2941 + } 2942 + if (ret) 2943 + return ret; 2944 + } 2945 + 2946 + return 0; 2947 + } 2948 + 2949 + union rbd_img_fill_iter { 2950 + struct ceph_bio_iter bio_iter; 2951 + struct ceph_bvec_iter bvec_iter; 2952 + }; 2953 + 2954 + struct rbd_img_fill_ctx { 2955 + enum obj_request_type pos_type; 2956 + union rbd_img_fill_iter *pos; 2957 + union rbd_img_fill_iter iter; 2958 + ceph_object_extent_fn_t set_pos_fn; 2959 + ceph_object_extent_fn_t count_fn; 2960 + ceph_object_extent_fn_t copy_fn; 2961 + }; 2962 + 2963 + static struct ceph_object_extent *alloc_object_extent(void *arg) 2964 + { 2965 + struct rbd_img_request *img_req = arg; 2966 + struct rbd_obj_request *obj_req; 2967 + 2968 + obj_req = rbd_obj_request_create(); 2969 + if (!obj_req) 2970 + return NULL; 2971 + 2972 + rbd_img_obj_request_add(img_req, obj_req); 2973 + return &obj_req->ex; 2974 + } 2975 + 2976 + /* 2977 + * While su != os && sc == 1 is technically not fancy (it's the same 2978 + * layout as su == os && sc == 1), we can't use the nocopy path for it 2979 + * because ->set_pos_fn() should be called only once per object. 2980 + * ceph_file_to_extents() invokes action_fn once per stripe unit, so 2981 + * treat su != os && sc == 1 as fancy. 
2982 + */ 2983 + static bool rbd_layout_is_fancy(struct ceph_file_layout *l) 2984 + { 2985 + return l->stripe_unit != l->object_size; 2986 + } 2987 + 2988 + static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req, 2989 + struct ceph_file_extent *img_extents, 2990 + u32 num_img_extents, 2991 + struct rbd_img_fill_ctx *fctx) 2992 + { 2993 + u32 i; 2994 + int ret; 2995 + 2996 + img_req->data_type = fctx->pos_type; 2997 + 2998 + /* 2999 + * Create object requests and set each object request's starting 3000 + * position in the provided bio (list) or bio_vec array. 3001 + */ 3002 + fctx->iter = *fctx->pos; 3003 + for (i = 0; i < num_img_extents; i++) { 3004 + ret = ceph_file_to_extents(&img_req->rbd_dev->layout, 3005 + img_extents[i].fe_off, 3006 + img_extents[i].fe_len, 3007 + &img_req->object_extents, 3008 + alloc_object_extent, img_req, 3009 + fctx->set_pos_fn, &fctx->iter); 3010 + if (ret) 3011 + return ret; 3012 + } 3013 + 3014 + return __rbd_img_fill_request(img_req); 3015 + } 3016 + 3017 + /* 3018 + * Map a list of image extents to a list of object extents, create the 3019 + * corresponding object requests (normally each to a different object, 3020 + * but not always) and add them to @img_req. For each object request, 3021 + * set up its data descriptor to point to the corresponding chunk(s) of 3022 + * @fctx->pos data buffer. 3023 + * 3024 + * Because ceph_file_to_extents() will merge adjacent object extents 3025 + * together, each object request's data descriptor may point to multiple 3026 + * different chunks of @fctx->pos data buffer. 3027 + * 3028 + * @fctx->pos data buffer is assumed to be large enough. 
3029 + */ 3030 + static int rbd_img_fill_request(struct rbd_img_request *img_req, 3031 + struct ceph_file_extent *img_extents, 3032 + u32 num_img_extents, 3033 + struct rbd_img_fill_ctx *fctx) 3034 + { 3035 + struct rbd_device *rbd_dev = img_req->rbd_dev; 3036 + struct rbd_obj_request *obj_req; 3037 + u32 i; 3038 + int ret; 3039 + 3040 + if (fctx->pos_type == OBJ_REQUEST_NODATA || 3041 + !rbd_layout_is_fancy(&rbd_dev->layout)) 3042 + return rbd_img_fill_request_nocopy(img_req, img_extents, 3043 + num_img_extents, fctx); 3044 + 3045 + img_req->data_type = OBJ_REQUEST_OWN_BVECS; 3046 + 3047 + /* 3048 + * Create object requests and determine ->bvec_count for each object 3049 + * request. Note that ->bvec_count sum over all object requests may 3050 + * be greater than the number of bio_vecs in the provided bio (list) 3051 + * or bio_vec array because when mapped, those bio_vecs can straddle 3052 + * stripe unit boundaries. 3053 + */ 3054 + fctx->iter = *fctx->pos; 3055 + for (i = 0; i < num_img_extents; i++) { 3056 + ret = ceph_file_to_extents(&rbd_dev->layout, 3057 + img_extents[i].fe_off, 3058 + img_extents[i].fe_len, 3059 + &img_req->object_extents, 3060 + alloc_object_extent, img_req, 3061 + fctx->count_fn, &fctx->iter); 3062 + if (ret) 3063 + return ret; 3064 + } 3065 + 3066 + for_each_obj_request(img_req, obj_req) { 3067 + obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count, 3068 + sizeof(*obj_req->bvec_pos.bvecs), 3069 + GFP_NOIO); 3070 + if (!obj_req->bvec_pos.bvecs) 3071 + return -ENOMEM; 1822 3072 } 1823 3073 1824 3074 /* 1825 - * It's a layered write. The target object might exist but 1826 - * we may not know that yet. If we know it doesn't exist, 1827 - * start by reading the data for the full target object from 1828 - * the parent so we can use it for a copyup to the target. 3075 + * Fill in each object request's private bio_vec array, splitting and 3076 + * rearranging the provided bio_vecs in stripe unit chunks as needed. 
1829 3077 */ 1830 - if (obj_request_known_test(obj_request)) 1831 - return rbd_img_obj_parent_read_full(obj_request); 3078 + fctx->iter = *fctx->pos; 3079 + for (i = 0; i < num_img_extents; i++) { 3080 + ret = ceph_iterate_extents(&rbd_dev->layout, 3081 + img_extents[i].fe_off, 3082 + img_extents[i].fe_len, 3083 + &img_req->object_extents, 3084 + fctx->copy_fn, &fctx->iter); 3085 + if (ret) 3086 + return ret; 3087 + } 1832 3088 1833 - /* We don't know whether the target exists. Go find out. */ 1834 - 1835 - return rbd_img_obj_exists_submit(obj_request); 3089 + return __rbd_img_fill_request(img_req); 1836 3090 } 1837 3091 1838 - static int rbd_img_request_submit(struct rbd_img_request *img_request) 3092 + static int rbd_img_fill_nodata(struct rbd_img_request *img_req, 3093 + u64 off, u64 len) 3094 + { 3095 + struct ceph_file_extent ex = { off, len }; 3096 + union rbd_img_fill_iter dummy; 3097 + struct rbd_img_fill_ctx fctx = { 3098 + .pos_type = OBJ_REQUEST_NODATA, 3099 + .pos = &dummy, 3100 + }; 3101 + 3102 + return rbd_img_fill_request(img_req, &ex, 1, &fctx); 3103 + } 3104 + 3105 + static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 3106 + { 3107 + struct rbd_obj_request *obj_req = 3108 + container_of(ex, struct rbd_obj_request, ex); 3109 + struct ceph_bio_iter *it = arg; 3110 + 3111 + dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 3112 + obj_req->bio_pos = *it; 3113 + ceph_bio_iter_advance(it, bytes); 3114 + } 3115 + 3116 + static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 3117 + { 3118 + struct rbd_obj_request *obj_req = 3119 + container_of(ex, struct rbd_obj_request, ex); 3120 + struct ceph_bio_iter *it = arg; 3121 + 3122 + dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 3123 + ceph_bio_iter_advance_step(it, bytes, ({ 3124 + obj_req->bvec_count++; 3125 + })); 3126 + 3127 + } 3128 + 3129 + static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 
3130 + { 3131 + struct rbd_obj_request *obj_req = 3132 + container_of(ex, struct rbd_obj_request, ex); 3133 + struct ceph_bio_iter *it = arg; 3134 + 3135 + dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); 3136 + ceph_bio_iter_advance_step(it, bytes, ({ 3137 + obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 3138 + obj_req->bvec_pos.iter.bi_size += bv.bv_len; 3139 + })); 3140 + } 3141 + 3142 + static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, 3143 + struct ceph_file_extent *img_extents, 3144 + u32 num_img_extents, 3145 + struct ceph_bio_iter *bio_pos) 3146 + { 3147 + struct rbd_img_fill_ctx fctx = { 3148 + .pos_type = OBJ_REQUEST_BIO, 3149 + .pos = (union rbd_img_fill_iter *)bio_pos, 3150 + .set_pos_fn = set_bio_pos, 3151 + .count_fn = count_bio_bvecs, 3152 + .copy_fn = copy_bio_bvecs, 3153 + }; 3154 + 3155 + return rbd_img_fill_request(img_req, img_extents, num_img_extents, 3156 + &fctx); 3157 + } 3158 + 3159 + static int rbd_img_fill_from_bio(struct rbd_img_request *img_req, 3160 + u64 off, u64 len, struct bio *bio) 3161 + { 3162 + struct ceph_file_extent ex = { off, len }; 3163 + struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter }; 3164 + 3165 + return __rbd_img_fill_from_bio(img_req, &ex, 1, &it); 3166 + } 3167 + 3168 + static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) 3169 + { 3170 + struct rbd_obj_request *obj_req = 3171 + container_of(ex, struct rbd_obj_request, ex); 3172 + struct ceph_bvec_iter *it = arg; 3173 + 3174 + obj_req->bvec_pos = *it; 3175 + ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes); 3176 + ceph_bvec_iter_advance(it, bytes); 3177 + } 3178 + 3179 + static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 3180 + { 3181 + struct rbd_obj_request *obj_req = 3182 + container_of(ex, struct rbd_obj_request, ex); 3183 + struct ceph_bvec_iter *it = arg; 3184 + 3185 + ceph_bvec_iter_advance_step(it, bytes, ({ 3186 + obj_req->bvec_count++; 3187 + })); 3188 + } 
3189 + 3190 + static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) 3191 + { 3192 + struct rbd_obj_request *obj_req = 3193 + container_of(ex, struct rbd_obj_request, ex); 3194 + struct ceph_bvec_iter *it = arg; 3195 + 3196 + ceph_bvec_iter_advance_step(it, bytes, ({ 3197 + obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; 3198 + obj_req->bvec_pos.iter.bi_size += bv.bv_len; 3199 + })); 3200 + } 3201 + 3202 + static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 3203 + struct ceph_file_extent *img_extents, 3204 + u32 num_img_extents, 3205 + struct ceph_bvec_iter *bvec_pos) 3206 + { 3207 + struct rbd_img_fill_ctx fctx = { 3208 + .pos_type = OBJ_REQUEST_BVECS, 3209 + .pos = (union rbd_img_fill_iter *)bvec_pos, 3210 + .set_pos_fn = set_bvec_pos, 3211 + .count_fn = count_bvecs, 3212 + .copy_fn = copy_bvecs, 3213 + }; 3214 + 3215 + return rbd_img_fill_request(img_req, img_extents, num_img_extents, 3216 + &fctx); 3217 + } 3218 + 3219 + static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 3220 + struct ceph_file_extent *img_extents, 3221 + u32 num_img_extents, 3222 + struct bio_vec *bvecs) 3223 + { 3224 + struct ceph_bvec_iter it = { 3225 + .bvecs = bvecs, 3226 + .iter = { .bi_size = ceph_file_extents_bytes(img_extents, 3227 + num_img_extents) }, 3228 + }; 3229 + 3230 + return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents, 3231 + &it); 3232 + } 3233 + 3234 + static void rbd_img_request_submit(struct rbd_img_request *img_request) 1839 3235 { 1840 3236 struct rbd_obj_request *obj_request; 1841 - struct rbd_obj_request *next_obj_request; 1842 - int ret = 0; 1843 3237 1844 3238 dout("%s: img %p\n", __func__, img_request); 1845 3239 1846 3240 rbd_img_request_get(img_request); 1847 - for_each_obj_request_safe(img_request, obj_request, next_obj_request) { 1848 - ret = rbd_img_obj_request_submit(obj_request); 1849 - if (ret) 1850 - goto out_put_ireq; 1851 - } 3241 + for_each_obj_request(img_request, 
obj_request) 3242 + rbd_obj_request_submit(obj_request); 1852 3243 1853 - out_put_ireq: 1854 3244 rbd_img_request_put(img_request); 1855 - return ret; 1856 3245 } 1857 3246 1858 - static void rbd_img_parent_read_callback(struct rbd_img_request *img_request) 3247 + static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) 1859 3248 { 1860 - struct rbd_obj_request *obj_request; 1861 - struct rbd_device *rbd_dev; 1862 - u64 obj_end; 1863 - u64 img_xferred; 1864 - int img_result; 3249 + struct rbd_img_request *img_req = obj_req->img_request; 3250 + struct rbd_img_request *child_img_req; 3251 + int ret; 1865 3252 1866 - rbd_assert(img_request_child_test(img_request)); 3253 + child_img_req = rbd_img_request_create(img_req->rbd_dev->parent, 3254 + OBJ_OP_READ, NULL); 3255 + if (!child_img_req) 3256 + return -ENOMEM; 1867 3257 1868 - /* First get what we need from the image request and release it */ 3258 + __set_bit(IMG_REQ_CHILD, &child_img_req->flags); 3259 + child_img_req->obj_request = obj_req; 1869 3260 1870 - obj_request = img_request->obj_request; 1871 - img_xferred = img_request->xferred; 1872 - img_result = img_request->result; 1873 - rbd_img_request_put(img_request); 3261 + if (!rbd_img_is_write(img_req)) { 3262 + switch (img_req->data_type) { 3263 + case OBJ_REQUEST_BIO: 3264 + ret = __rbd_img_fill_from_bio(child_img_req, 3265 + obj_req->img_extents, 3266 + obj_req->num_img_extents, 3267 + &obj_req->bio_pos); 3268 + break; 3269 + case OBJ_REQUEST_BVECS: 3270 + case OBJ_REQUEST_OWN_BVECS: 3271 + ret = __rbd_img_fill_from_bvecs(child_img_req, 3272 + obj_req->img_extents, 3273 + obj_req->num_img_extents, 3274 + &obj_req->bvec_pos); 3275 + break; 3276 + default: 3277 + rbd_assert(0); 3278 + } 3279 + } else { 3280 + ret = rbd_img_fill_from_bvecs(child_img_req, 3281 + obj_req->img_extents, 3282 + obj_req->num_img_extents, 3283 + obj_req->copyup_bvecs); 3284 + } 3285 + if (ret) { 3286 + rbd_img_request_put(child_img_req); 3287 + return ret; 3288 + } 3289 + 
3290 + rbd_img_request_submit(child_img_req); 3291 + return 0; 3292 + } 3293 + 3294 + static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req) 3295 + { 3296 + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 3297 + int ret; 3298 + 3299 + if (obj_req->result == -ENOENT && 3300 + rbd_dev->parent_overlap && !obj_req->tried_parent) { 3301 + /* reverse map this object extent onto the parent */ 3302 + ret = rbd_obj_calc_img_extents(obj_req, false); 3303 + if (ret) { 3304 + obj_req->result = ret; 3305 + return true; 3306 + } 3307 + 3308 + if (obj_req->num_img_extents) { 3309 + obj_req->tried_parent = true; 3310 + ret = rbd_obj_read_from_parent(obj_req); 3311 + if (ret) { 3312 + obj_req->result = ret; 3313 + return true; 3314 + } 3315 + return false; 3316 + } 3317 + } 1874 3318 1875 3319 /* 1876 - * If the overlap has become 0 (most likely because the 1877 - * image has been flattened) we need to re-submit the 1878 - * original request. 3320 + * -ENOENT means a hole in the image -- zero-fill the entire 3321 + * length of the request. A short read also implies zero-fill 3322 + * to the end of the request. In both cases we update xferred 3323 + * count to indicate the whole request was satisfied. 
1879 3324 */ 1880 - rbd_assert(obj_request); 1881 - rbd_assert(obj_request->img_request); 1882 - rbd_dev = obj_request->img_request->rbd_dev; 1883 - if (!rbd_dev->parent_overlap) { 1884 - rbd_obj_request_submit(obj_request); 3325 + if (obj_req->result == -ENOENT || 3326 + (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) { 3327 + rbd_assert(!obj_req->xferred || !obj_req->result); 3328 + rbd_obj_zero_range(obj_req, obj_req->xferred, 3329 + obj_req->ex.oe_len - obj_req->xferred); 3330 + obj_req->result = 0; 3331 + obj_req->xferred = obj_req->ex.oe_len; 3332 + } 3333 + 3334 + return true; 3335 + } 3336 + 3337 + /* 3338 + * copyup_bvecs pages are never highmem pages 3339 + */ 3340 + static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes) 3341 + { 3342 + struct ceph_bvec_iter it = { 3343 + .bvecs = bvecs, 3344 + .iter = { .bi_size = bytes }, 3345 + }; 3346 + 3347 + ceph_bvec_iter_advance_step(&it, bytes, ({ 3348 + if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0, 3349 + bv.bv_len)) 3350 + return false; 3351 + })); 3352 + return true; 3353 + } 3354 + 3355 + static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) 3356 + { 3357 + unsigned int num_osd_ops = obj_req->osd_req->r_num_ops; 3358 + 3359 + dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes); 3360 + rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT); 3361 + rbd_osd_req_destroy(obj_req->osd_req); 3362 + 3363 + /* 3364 + * Create a copyup request with the same number of OSD ops as 3365 + * the original request. The original request was stat + op(s), 3366 + * the new copyup request will be copyup + the same op(s). 3367 + */ 3368 + obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops); 3369 + if (!obj_req->osd_req) 3370 + return -ENOMEM; 3371 + 3372 + /* 3373 + * Only send non-zero copyup data to save some I/O and network 3374 + * bandwidth -- zero copyup data is equivalent to the object not 3375 + * existing. 
3376 + */ 3377 + if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) { 3378 + dout("%s obj_req %p detected zeroes\n", __func__, obj_req); 3379 + bytes = 0; 3380 + } 3381 + 3382 + osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd", 3383 + "copyup"); 3384 + osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0, 3385 + obj_req->copyup_bvecs, bytes); 3386 + 3387 + switch (obj_req->img_request->op_type) { 3388 + case OBJ_OP_WRITE: 3389 + __rbd_obj_setup_write(obj_req, 1); 3390 + break; 3391 + case OBJ_OP_DISCARD: 3392 + rbd_assert(!rbd_obj_is_entire(obj_req)); 3393 + __rbd_obj_setup_discard(obj_req, 1); 3394 + break; 3395 + default: 3396 + rbd_assert(0); 3397 + } 3398 + 3399 + rbd_obj_request_submit(obj_req); 3400 + return 0; 3401 + } 3402 + 3403 + static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap) 3404 + { 3405 + u32 i; 3406 + 3407 + rbd_assert(!obj_req->copyup_bvecs); 3408 + obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap); 3409 + obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count, 3410 + sizeof(*obj_req->copyup_bvecs), 3411 + GFP_NOIO); 3412 + if (!obj_req->copyup_bvecs) 3413 + return -ENOMEM; 3414 + 3415 + for (i = 0; i < obj_req->copyup_bvec_count; i++) { 3416 + unsigned int len = min(obj_overlap, (u64)PAGE_SIZE); 3417 + 3418 + obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO); 3419 + if (!obj_req->copyup_bvecs[i].bv_page) 3420 + return -ENOMEM; 3421 + 3422 + obj_req->copyup_bvecs[i].bv_offset = 0; 3423 + obj_req->copyup_bvecs[i].bv_len = len; 3424 + obj_overlap -= len; 3425 + } 3426 + 3427 + rbd_assert(!obj_overlap); 3428 + return 0; 3429 + } 3430 + 3431 + static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req) 3432 + { 3433 + struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev; 3434 + int ret; 3435 + 3436 + rbd_assert(obj_req->num_img_extents); 3437 + prune_extents(obj_req->img_extents, &obj_req->num_img_extents, 3438 + rbd_dev->parent_overlap); 3439 + if 
(!obj_req->num_img_extents) { 3440 + /* 3441 + * The overlap has become 0 (most likely because the 3442 + * image has been flattened). Use rbd_obj_issue_copyup() 3443 + * to re-submit the original write request -- the copyup 3444 + * operation itself will be a no-op, since someone must 3445 + * have populated the child object while we weren't 3446 + * looking. Move to WRITE_FLAT state as we'll be done 3447 + * with the operation once the null copyup completes. 3448 + */ 3449 + obj_req->write_state = RBD_OBJ_WRITE_FLAT; 3450 + return rbd_obj_issue_copyup(obj_req, 0); 3451 + } 3452 + 3453 + ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req)); 3454 + if (ret) 3455 + return ret; 3456 + 3457 + obj_req->write_state = RBD_OBJ_WRITE_COPYUP; 3458 + return rbd_obj_read_from_parent(obj_req); 3459 + } 3460 + 3461 + static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req) 3462 + { 3463 + int ret; 3464 + 3465 + again: 3466 + switch (obj_req->write_state) { 3467 + case RBD_OBJ_WRITE_GUARD: 3468 + rbd_assert(!obj_req->xferred); 3469 + if (obj_req->result == -ENOENT) { 3470 + /* 3471 + * The target object doesn't exist. Read the data for 3472 + * the entire target object up to the overlap point (if 3473 + * any) from the parent, so we can use it for a copyup. 3474 + */ 3475 + ret = rbd_obj_handle_write_guard(obj_req); 3476 + if (ret) { 3477 + obj_req->result = ret; 3478 + return true; 3479 + } 3480 + return false; 3481 + } 3482 + /* fall through */ 3483 + case RBD_OBJ_WRITE_FLAT: 3484 + if (!obj_req->result) 3485 + /* 3486 + * There is no such thing as a successful short 3487 + * write -- indicate the whole request was satisfied. 
3488 + */ 3489 + obj_req->xferred = obj_req->ex.oe_len; 3490 + return true; 3491 + case RBD_OBJ_WRITE_COPYUP: 3492 + obj_req->write_state = RBD_OBJ_WRITE_GUARD; 3493 + if (obj_req->result) 3494 + goto again; 3495 + 3496 + rbd_assert(obj_req->xferred); 3497 + ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred); 3498 + if (ret) { 3499 + obj_req->result = ret; 3500 + return true; 3501 + } 3502 + return false; 3503 + default: 3504 + rbd_assert(0); 3505 + } 3506 + } 3507 + 3508 + /* 3509 + * Returns true if @obj_req is completed, or false otherwise. 3510 + */ 3511 + static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req) 3512 + { 3513 + switch (obj_req->img_request->op_type) { 3514 + case OBJ_OP_READ: 3515 + return rbd_obj_handle_read(obj_req); 3516 + case OBJ_OP_WRITE: 3517 + return rbd_obj_handle_write(obj_req); 3518 + case OBJ_OP_DISCARD: 3519 + if (rbd_obj_handle_write(obj_req)) { 3520 + /* 3521 + * Hide -ENOENT from delete/truncate/zero -- discarding 3522 + * a non-existent object is not a problem. 3523 + */ 3524 + if (obj_req->result == -ENOENT) { 3525 + obj_req->result = 0; 3526 + obj_req->xferred = obj_req->ex.oe_len; 3527 + } 3528 + return true; 3529 + } 3530 + return false; 3531 + default: 3532 + rbd_assert(0); 3533 + } 3534 + } 3535 + 3536 + static void rbd_obj_end_request(struct rbd_obj_request *obj_req) 3537 + { 3538 + struct rbd_img_request *img_req = obj_req->img_request; 3539 + 3540 + rbd_assert((!obj_req->result && 3541 + obj_req->xferred == obj_req->ex.oe_len) || 3542 + (obj_req->result < 0 && !obj_req->xferred)); 3543 + if (!obj_req->result) { 3544 + img_req->xferred += obj_req->xferred; 1885 3545 return; 1886 3546 } 1887 3547 1888 - obj_request->result = img_result; 1889 - if (obj_request->result) 1890 - goto out; 1891 - 1892 - /* 1893 - * We need to zero anything beyond the parent overlap 1894 - * boundary. 
Since rbd_img_obj_request_read_callback() 1895 - * will zero anything beyond the end of a short read, an 1896 - * easy way to do this is to pretend the data from the 1897 - * parent came up short--ending at the overlap boundary. 1898 - */ 1899 - rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length); 1900 - obj_end = obj_request->img_offset + obj_request->length; 1901 - if (obj_end > rbd_dev->parent_overlap) { 1902 - u64 xferred = 0; 1903 - 1904 - if (obj_request->img_offset < rbd_dev->parent_overlap) 1905 - xferred = rbd_dev->parent_overlap - 1906 - obj_request->img_offset; 1907 - 1908 - obj_request->xferred = min(img_xferred, xferred); 1909 - } else { 1910 - obj_request->xferred = img_xferred; 3548 + rbd_warn(img_req->rbd_dev, 3549 + "%s at objno %llu %llu~%llu result %d xferred %llu", 3550 + obj_op_name(img_req->op_type), obj_req->ex.oe_objno, 3551 + obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result, 3552 + obj_req->xferred); 3553 + if (!img_req->result) { 3554 + img_req->result = obj_req->result; 3555 + img_req->xferred = 0; 1911 3556 } 1912 - out: 1913 - rbd_img_obj_request_read_callback(obj_request); 1914 - rbd_obj_request_complete(obj_request); 1915 3557 } 1916 3558 1917 - static void rbd_img_parent_read(struct rbd_obj_request *obj_request) 3559 + static void rbd_img_end_child_request(struct rbd_img_request *img_req) 1918 3560 { 1919 - struct rbd_img_request *img_request; 1920 - int result; 3561 + struct rbd_obj_request *obj_req = img_req->obj_request; 1921 3562 1922 - rbd_assert(obj_request_img_data_test(obj_request)); 1923 - rbd_assert(obj_request->img_request != NULL); 1924 - rbd_assert(obj_request->result == (s32) -ENOENT); 1925 - rbd_assert(obj_request_type_valid(obj_request->type)); 3563 + rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags)); 3564 + rbd_assert((!img_req->result && 3565 + img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) || 3566 + (img_req->result < 0 && !img_req->xferred)); 1926 3567 1927 - /* 
rbd_read_finish(obj_request, obj_request->length); */ 1928 - img_request = rbd_parent_request_create(obj_request, 1929 - obj_request->img_offset, 1930 - obj_request->length); 1931 - result = -ENOMEM; 1932 - if (!img_request) 1933 - goto out_err; 3568 + obj_req->result = img_req->result; 3569 + obj_req->xferred = img_req->xferred; 3570 + rbd_img_request_put(img_req); 3571 + } 1934 3572 1935 - if (obj_request->type == OBJ_REQUEST_BIO) 1936 - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, 1937 - obj_request->bio_list); 1938 - else 1939 - result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES, 1940 - obj_request->pages); 1941 - if (result) 1942 - goto out_err; 3573 + static void rbd_img_end_request(struct rbd_img_request *img_req) 3574 + { 3575 + rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags)); 3576 + rbd_assert((!img_req->result && 3577 + img_req->xferred == blk_rq_bytes(img_req->rq)) || 3578 + (img_req->result < 0 && !img_req->xferred)); 1943 3579 1944 - img_request->callback = rbd_img_parent_read_callback; 1945 - result = rbd_img_request_submit(img_request); 1946 - if (result) 1947 - goto out_err; 3580 + blk_mq_end_request(img_req->rq, 3581 + errno_to_blk_status(img_req->result)); 3582 + rbd_img_request_put(img_req); 3583 + } 1948 3584 1949 - return; 1950 - out_err: 1951 - if (img_request) 1952 - rbd_img_request_put(img_request); 1953 - obj_request->result = result; 1954 - obj_request->xferred = 0; 1955 - obj_request_done_set(obj_request); 3585 + static void rbd_obj_handle_request(struct rbd_obj_request *obj_req) 3586 + { 3587 + struct rbd_img_request *img_req; 3588 + 3589 + again: 3590 + if (!__rbd_obj_handle_request(obj_req)) 3591 + return; 3592 + 3593 + img_req = obj_req->img_request; 3594 + spin_lock(&img_req->completion_lock); 3595 + rbd_obj_end_request(obj_req); 3596 + rbd_assert(img_req->pending_count); 3597 + if (--img_req->pending_count) { 3598 + spin_unlock(&img_req->completion_lock); 3599 + return; 3600 + } 3601 + 3602 + 
spin_unlock(&img_req->completion_lock); 3603 + if (test_bit(IMG_REQ_CHILD, &img_req->flags)) { 3604 + obj_req = img_req->obj_request; 3605 + rbd_img_end_child_request(img_req); 3606 + goto again; 3607 + } 3608 + rbd_img_end_request(img_req); 1956 3609 } 1957 3610 1958 3611 static const struct rbd_client_id rbd_empty_cid; ··· 2674 3091 { 2675 3092 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2676 3093 struct rbd_client_id cid = rbd_get_cid(rbd_dev); 2677 - int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN; 2678 - char buf[buf_size]; 3094 + char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN]; 3095 + int buf_size = sizeof(buf); 2679 3096 void *p = buf; 2680 3097 2681 3098 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op); ··· 3193 3610 u64 notify_id, u64 cookie, s32 *result) 3194 3611 { 3195 3612 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 3196 - int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN; 3197 - char buf[buf_size]; 3613 + char buf[4 + CEPH_ENCODING_START_BLK_LEN]; 3614 + int buf_size = sizeof(buf); 3198 3615 int ret; 3199 3616 3200 3617 if (result) { ··· 3470 3887 3471 3888 ret = rbd_dev_refresh(rbd_dev); 3472 3889 if (ret) 3473 - rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret); 3890 + rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret); 3474 3891 } 3475 3892 3476 3893 /* ··· 3653 4070 } 3654 4071 } 3655 4072 3656 - img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, 3657 - snapc); 4073 + img_request = rbd_img_request_create(rbd_dev, op_type, snapc); 3658 4074 if (!img_request) { 3659 4075 result = -ENOMEM; 3660 4076 goto err_unlock; ··· 3662 4080 snapc = NULL; /* img_request consumes a ref */ 3663 4081 3664 4082 if (op_type == OBJ_OP_DISCARD) 3665 - result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, 3666 - NULL); 4083 + result = rbd_img_fill_nodata(img_request, offset, length); 3667 4084 else 3668 - result = rbd_img_request_fill(img_request, 
OBJ_REQUEST_BIO, 3669 - rq->bio); 4085 + result = rbd_img_fill_from_bio(img_request, offset, length, 4086 + rq->bio); 3670 4087 if (result) 3671 4088 goto err_img_request; 3672 4089 3673 - result = rbd_img_request_submit(img_request); 3674 - if (result) 3675 - goto err_img_request; 3676 - 4090 + rbd_img_request_submit(img_request); 3677 4091 if (must_be_locked) 3678 4092 up_read(&rbd_dev->lock_rwsem); 3679 4093 return; ··· 3947 4369 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE); 3948 4370 q->limits.max_sectors = queue_max_hw_sectors(q); 3949 4371 blk_queue_max_segments(q, USHRT_MAX); 3950 - blk_queue_max_segment_size(q, segment_size); 4372 + blk_queue_max_segment_size(q, UINT_MAX); 3951 4373 blk_queue_io_min(q, segment_size); 3952 4374 blk_queue_io_opt(q, segment_size); 3953 4375 ··· 4635 5057 } __attribute__ ((packed)) striping_info_buf = { 0 }; 4636 5058 size_t size = sizeof (striping_info_buf); 4637 5059 void *p; 4638 - u64 obj_size; 4639 - u64 stripe_unit; 4640 - u64 stripe_count; 4641 5060 int ret; 4642 5061 4643 5062 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, ··· 4646 5071 if (ret < size) 4647 5072 return -ERANGE; 4648 5073 4649 - /* 4650 - * We don't actually support the "fancy striping" feature 4651 - * (STRIPINGV2) yet, but if the striping sizes are the 4652 - * defaults the behavior is the same as before. So find 4653 - * out, and only fail if the image has non-default values. 
4654 - */ 4655 - ret = -EINVAL; 4656 - obj_size = rbd_obj_bytes(&rbd_dev->header); 4657 5074 p = &striping_info_buf; 4658 - stripe_unit = ceph_decode_64(&p); 4659 - if (stripe_unit != obj_size) { 4660 - rbd_warn(rbd_dev, "unsupported stripe unit " 4661 - "(got %llu want %llu)", 4662 - stripe_unit, obj_size); 4663 - return -EINVAL; 4664 - } 4665 - stripe_count = ceph_decode_64(&p); 4666 - if (stripe_count != 1) { 4667 - rbd_warn(rbd_dev, "unsupported stripe count " 4668 - "(got %llu want 1)", stripe_count); 4669 - return -EINVAL; 4670 - } 4671 - rbd_dev->header.stripe_unit = stripe_unit; 4672 - rbd_dev->header.stripe_count = stripe_count; 4673 - 5075 + rbd_dev->header.stripe_unit = ceph_decode_64(&p); 5076 + rbd_dev->header.stripe_count = ceph_decode_64(&p); 4674 5077 return 0; 4675 5078 } 4676 5079 ··· 5206 5653 return ret; 5207 5654 } 5208 5655 5209 - /* 5210 - * Return pool id (>= 0) or a negative error code. 5211 - */ 5212 - static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 5213 - { 5214 - struct ceph_options *opts = rbdc->client->options; 5215 - u64 newest_epoch; 5216 - int tries = 0; 5217 - int ret; 5218 - 5219 - again: 5220 - ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 5221 - if (ret == -ENOENT && tries++ < 1) { 5222 - ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap", 5223 - &newest_epoch); 5224 - if (ret < 0) 5225 - return ret; 5226 - 5227 - if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 5228 - ceph_osdc_maybe_request_map(&rbdc->client->osdc); 5229 - (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 5230 - newest_epoch, 5231 - opts->mount_timeout); 5232 - goto again; 5233 - } else { 5234 - /* the osdmap we have is new enough */ 5235 - return -ENOENT; 5236 - } 5237 - } 5238 - 5239 - return ret; 5240 - } 5241 - 5242 5656 static void rbd_dev_image_unlock(struct rbd_device *rbd_dev) 5243 5657 { 5244 5658 down_write(&rbd_dev->lock_rwsem); ··· 5634 6114 } 5635 6115 5636 6116 /* pick the pool */ 
5637 - rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 6117 + rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name); 5638 6118 if (rc < 0) { 5639 6119 if (rc == -ENOENT) 5640 6120 pr_info("pool %s does not exist\n", spec->pool_name); ··· 5886 6366 if (!rbd_obj_request_cache) 5887 6367 goto out_err; 5888 6368 5889 - rbd_assert(!rbd_bio_clone); 5890 - rbd_bio_clone = bioset_create(BIO_POOL_SIZE, 0, 0); 5891 - if (!rbd_bio_clone) 5892 - goto out_err_clone; 5893 - 5894 6369 return 0; 5895 6370 5896 - out_err_clone: 5897 - kmem_cache_destroy(rbd_obj_request_cache); 5898 - rbd_obj_request_cache = NULL; 5899 6371 out_err: 5900 6372 kmem_cache_destroy(rbd_img_request_cache); 5901 6373 rbd_img_request_cache = NULL; ··· 5903 6391 rbd_assert(rbd_img_request_cache); 5904 6392 kmem_cache_destroy(rbd_img_request_cache); 5905 6393 rbd_img_request_cache = NULL; 5906 - 5907 - rbd_assert(rbd_bio_clone); 5908 - bioset_free(rbd_bio_clone); 5909 - rbd_bio_clone = NULL; 5910 6394 } 5911 6395 5912 6396 static int __init rbd_init(void)
+1 -1
fs/ceph/Makefile
··· 6 6 obj-$(CONFIG_CEPH_FS) += ceph.o 7 7 8 8 ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 9 - export.o caps.o snap.o xattr.o \ 9 + export.o caps.o snap.o xattr.o quota.o \ 10 10 mds_client.o mdsmap.o strings.o ceph_frag.o \ 11 11 debugfs.o 12 12
+32 -31
fs/ceph/addr.c
··· 15 15 #include "mds_client.h" 16 16 #include "cache.h" 17 17 #include <linux/ceph/osd_client.h> 18 + #include <linux/ceph/striper.h> 18 19 19 20 /* 20 21 * Ceph address space ops. ··· 439 438 { 440 439 struct inode *inode = file_inode(file); 441 440 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 442 - struct ceph_file_info *ci = file->private_data; 441 + struct ceph_file_info *fi = file->private_data; 443 442 struct ceph_rw_context *rw_ctx; 444 443 int rc = 0; 445 444 int max = 0; ··· 453 452 if (rc == 0) 454 453 goto out; 455 454 456 - rw_ctx = ceph_find_rw_context(ci); 455 + rw_ctx = ceph_find_rw_context(fi); 457 456 max = fsc->mount_options->rsize >> PAGE_SHIFT; 458 457 dout("readpages %p file %p ctx %p nr_pages %d max %d\n", 459 458 inode, file, rw_ctx, nr_pages, max); ··· 801 800 struct ceph_osd_request *req = NULL; 802 801 struct ceph_writeback_ctl ceph_wbc; 803 802 bool should_loop, range_whole = false; 804 - bool stop, done = false; 803 + bool done = false; 805 804 806 805 dout("writepages_start %p (mode=%s)\n", inode, 807 806 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : ··· 857 856 * in that range can be associated with newer snapc. 
858 857 * They are not writeable until we write all dirty pages 859 858 * associated with 'snapc' get written */ 860 - if (index > 0 || wbc->sync_mode != WB_SYNC_NONE) 859 + if (index > 0) 861 860 should_loop = true; 862 861 dout(" non-head snapc, range whole\n"); 863 862 } ··· 865 864 ceph_put_snap_context(last_snapc); 866 865 last_snapc = snapc; 867 866 868 - stop = false; 869 - while (!stop && index <= end) { 867 + while (!done && index <= end) { 870 868 int num_ops = 0, op_idx; 871 869 unsigned i, pvec_pages, max_pages, locked_pages = 0; 872 870 struct page **pages = NULL, **data_pages; ··· 898 898 unlock_page(page); 899 899 continue; 900 900 } 901 - if (strip_unit_end && (page->index > strip_unit_end)) { 902 - dout("end of strip unit %p\n", page); 901 + /* only if matching snap context */ 902 + pgsnapc = page_snap_context(page); 903 + if (pgsnapc != snapc) { 904 + dout("page snapc %p %lld != oldest %p %lld\n", 905 + pgsnapc, pgsnapc->seq, snapc, snapc->seq); 906 + if (!should_loop && 907 + !ceph_wbc.head_snapc && 908 + wbc->sync_mode != WB_SYNC_NONE) 909 + should_loop = true; 903 910 unlock_page(page); 904 - break; 911 + continue; 905 912 } 906 913 if (page_offset(page) >= ceph_wbc.i_size) { 907 914 dout("%p page eof %llu\n", 908 915 page, ceph_wbc.i_size); 909 - /* not done if range_cyclic */ 910 - stop = true; 916 + if (ceph_wbc.size_stable || 917 + page_offset(page) >= i_size_read(inode)) 918 + mapping->a_ops->invalidatepage(page, 919 + 0, PAGE_SIZE); 920 + unlock_page(page); 921 + continue; 922 + } 923 + if (strip_unit_end && (page->index > strip_unit_end)) { 924 + dout("end of strip unit %p\n", page); 911 925 unlock_page(page); 912 926 break; 913 927 } ··· 933 919 } 934 920 dout("waiting on writeback %p\n", page); 935 921 wait_on_page_writeback(page); 936 - } 937 - 938 - /* only if matching snap context */ 939 - pgsnapc = page_snap_context(page); 940 - if (pgsnapc != snapc) { 941 - dout("page snapc %p %lld != oldest %p %lld\n", 942 - pgsnapc, 
pgsnapc->seq, snapc, snapc->seq); 943 - unlock_page(page); 944 - continue; 945 922 } 946 923 947 924 if (!clear_page_dirty_for_io(page)) { ··· 950 945 if (locked_pages == 0) { 951 946 u64 objnum; 952 947 u64 objoff; 948 + u32 xlen; 953 949 954 950 /* prepare async write request */ 955 951 offset = (u64)page_offset(page); 956 - len = wsize; 957 - 958 - rc = ceph_calc_file_object_mapping(&ci->i_layout, 959 - offset, len, 960 - &objnum, &objoff, 961 - &len); 962 - if (rc < 0) { 963 - unlock_page(page); 964 - break; 965 - } 952 + ceph_calc_file_object_mapping(&ci->i_layout, 953 + offset, wsize, 954 + &objnum, &objoff, 955 + &xlen); 956 + len = xlen; 966 957 967 958 num_ops = 1; 968 959 strip_unit_end = page->index + ··· 1147 1146 * we tagged for writeback prior to entering this loop. 1148 1147 */ 1149 1148 if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) 1150 - done = stop = true; 1149 + done = true; 1151 1150 1152 1151 release_pvec_pages: 1153 1152 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
+2 -2
fs/ceph/cache.c
··· 51 51 .type = FSCACHE_COOKIE_TYPE_INDEX, 52 52 }; 53 53 54 - int ceph_fscache_register(void) 54 + int __init ceph_fscache_register(void) 55 55 { 56 56 return fscache_register_netfs(&ceph_cache_netfs); 57 57 } ··· 135 135 if (memcmp(data, &aux, sizeof(aux)) != 0) 136 136 return FSCACHE_CHECKAUX_OBSOLETE; 137 137 138 - dout("ceph inode 0x%p cached okay", ci); 138 + dout("ceph inode 0x%p cached okay\n", ci); 139 139 return FSCACHE_CHECKAUX_OKAY; 140 140 } 141 141
+102 -36
fs/ceph/caps.c
··· 184 184 mdsc->caps_avail_count); 185 185 spin_unlock(&mdsc->caps_list_lock); 186 186 187 - for (i = have; i < need; i++) { 188 - retry: 187 + for (i = have; i < need; ) { 189 188 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 190 - if (!cap) { 191 - if (!trimmed) { 192 - for (j = 0; j < mdsc->max_sessions; j++) { 193 - s = __ceph_lookup_mds_session(mdsc, j); 194 - if (!s) 195 - continue; 196 - mutex_unlock(&mdsc->mutex); 197 - 198 - mutex_lock(&s->s_mutex); 199 - max_caps = s->s_nr_caps - (need - i); 200 - ceph_trim_caps(mdsc, s, max_caps); 201 - mutex_unlock(&s->s_mutex); 202 - 203 - ceph_put_mds_session(s); 204 - mutex_lock(&mdsc->mutex); 205 - } 206 - trimmed = true; 207 - goto retry; 208 - } else { 209 - pr_warn("reserve caps ctx=%p ENOMEM " 210 - "need=%d got=%d\n", 211 - ctx, need, have + alloc); 212 - goto out_nomem; 213 - } 189 + if (cap) { 190 + list_add(&cap->caps_item, &newcaps); 191 + alloc++; 192 + i++; 193 + continue; 214 194 } 215 - list_add(&cap->caps_item, &newcaps); 216 - alloc++; 195 + 196 + if (!trimmed) { 197 + for (j = 0; j < mdsc->max_sessions; j++) { 198 + s = __ceph_lookup_mds_session(mdsc, j); 199 + if (!s) 200 + continue; 201 + mutex_unlock(&mdsc->mutex); 202 + 203 + mutex_lock(&s->s_mutex); 204 + max_caps = s->s_nr_caps - (need - i); 205 + ceph_trim_caps(mdsc, s, max_caps); 206 + mutex_unlock(&s->s_mutex); 207 + 208 + ceph_put_mds_session(s); 209 + mutex_lock(&mdsc->mutex); 210 + } 211 + trimmed = true; 212 + 213 + spin_lock(&mdsc->caps_list_lock); 214 + if (mdsc->caps_avail_count) { 215 + int more_have; 216 + if (mdsc->caps_avail_count >= need - i) 217 + more_have = need - i; 218 + else 219 + more_have = mdsc->caps_avail_count; 220 + 221 + i += more_have; 222 + have += more_have; 223 + mdsc->caps_avail_count -= more_have; 224 + mdsc->caps_reserve_count += more_have; 225 + 226 + } 227 + spin_unlock(&mdsc->caps_list_lock); 228 + 229 + continue; 230 + } 231 + 232 + pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", 233 + ctx, 
need, have + alloc); 234 + goto out_nomem; 217 235 } 218 236 BUG_ON(have + alloc != need); 219 237 ··· 252 234 return 0; 253 235 254 236 out_nomem: 255 - while (!list_empty(&newcaps)) { 256 - cap = list_first_entry(&newcaps, 257 - struct ceph_cap, caps_item); 258 - list_del(&cap->caps_item); 259 - kmem_cache_free(ceph_cap_cachep, cap); 260 - } 261 237 262 238 spin_lock(&mdsc->caps_list_lock); 263 239 mdsc->caps_avail_count += have; 264 240 mdsc->caps_reserve_count -= have; 241 + 242 + while (!list_empty(&newcaps)) { 243 + cap = list_first_entry(&newcaps, 244 + struct ceph_cap, caps_item); 245 + list_del(&cap->caps_item); 246 + 247 + /* Keep some preallocated caps around (ceph_min_count), to 248 + * avoid lots of free/alloc churn. */ 249 + if (mdsc->caps_avail_count >= 250 + mdsc->caps_reserve_count + mdsc->caps_min_count) { 251 + kmem_cache_free(ceph_cap_cachep, cap); 252 + } else { 253 + mdsc->caps_avail_count++; 254 + mdsc->caps_total_count++; 255 + list_add(&cap->caps_item, &mdsc->caps_list); 256 + } 257 + } 258 + 265 259 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 266 260 mdsc->caps_reserve_count + 267 261 mdsc->caps_avail_count); ··· 284 254 int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 285 255 struct ceph_cap_reservation *ctx) 286 256 { 257 + int i; 258 + struct ceph_cap *cap; 259 + 287 260 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); 288 261 if (ctx->count) { 289 262 spin_lock(&mdsc->caps_list_lock); 290 263 BUG_ON(mdsc->caps_reserve_count < ctx->count); 291 264 mdsc->caps_reserve_count -= ctx->count; 292 - mdsc->caps_avail_count += ctx->count; 265 + if (mdsc->caps_avail_count >= 266 + mdsc->caps_reserve_count + mdsc->caps_min_count) { 267 + mdsc->caps_total_count -= ctx->count; 268 + for (i = 0; i < ctx->count; i++) { 269 + cap = list_first_entry(&mdsc->caps_list, 270 + struct ceph_cap, caps_item); 271 + list_del(&cap->caps_item); 272 + kmem_cache_free(ceph_cap_cachep, cap); 273 + } 274 + } else { 275 + 
mdsc->caps_avail_count += ctx->count; 276 + } 293 277 ctx->count = 0; 294 278 dout("unreserve caps %d = %d used + %d resv + %d avail\n", 295 279 mdsc->caps_total_count, mdsc->caps_use_count, ··· 329 285 mdsc->caps_use_count++; 330 286 mdsc->caps_total_count++; 331 287 spin_unlock(&mdsc->caps_list_lock); 288 + } else { 289 + spin_lock(&mdsc->caps_list_lock); 290 + if (mdsc->caps_avail_count) { 291 + BUG_ON(list_empty(&mdsc->caps_list)); 292 + 293 + mdsc->caps_avail_count--; 294 + mdsc->caps_use_count++; 295 + cap = list_first_entry(&mdsc->caps_list, 296 + struct ceph_cap, caps_item); 297 + list_del(&cap->caps_item); 298 + 299 + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + 300 + mdsc->caps_reserve_count + mdsc->caps_avail_count); 301 + } 302 + spin_unlock(&mdsc->caps_list_lock); 332 303 } 304 + 333 305 return cap; 334 306 } 335 307 ··· 401 341 { 402 342 struct ceph_mds_client *mdsc = fsc->mdsc; 403 343 344 + spin_lock(&mdsc->caps_list_lock); 345 + 404 346 if (total) 405 347 *total = mdsc->caps_total_count; 406 348 if (avail) ··· 413 351 *reserved = mdsc->caps_reserve_count; 414 352 if (min) 415 353 *min = mdsc->caps_min_count; 354 + 355 + spin_unlock(&mdsc->caps_list_lock); 416 356 } 417 357 418 358 /* ··· 703 639 } 704 640 705 641 spin_lock(&realm->inodes_with_caps_lock); 706 - ci->i_snap_realm = realm; 707 642 list_add(&ci->i_snap_realm_item, 708 643 &realm->inodes_with_caps); 644 + ci->i_snap_realm = realm; 645 + if (realm->ino == ci->i_vino.ino) 646 + realm->inode = inode; 709 647 spin_unlock(&realm->inodes_with_caps_lock); 710 648 711 649 if (oldrealm)
+4 -4
fs/ceph/debugfs.c
··· 260 260 goto out; 261 261 262 262 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", 263 - 0600, 263 + 0400, 264 264 fsc->client->debugfs_dir, 265 265 fsc, 266 266 &mdsmap_show_fops); ··· 268 268 goto out; 269 269 270 270 fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions", 271 - 0600, 271 + 0400, 272 272 fsc->client->debugfs_dir, 273 273 fsc, 274 274 &mds_sessions_show_fops); ··· 276 276 goto out; 277 277 278 278 fsc->debugfs_mdsc = debugfs_create_file("mdsc", 279 - 0600, 279 + 0400, 280 280 fsc->client->debugfs_dir, 281 281 fsc, 282 282 &mdsc_show_fops); ··· 292 292 goto out; 293 293 294 294 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 295 - 0600, 295 + 0400, 296 296 fsc->client->debugfs_dir, 297 297 fsc, 298 298 &dentry_lru_show_fops);
+112 -92
fs/ceph/dir.c
··· 101 101 * regardless of what dir changes take place on the 102 102 * server. 103 103 */ 104 - static int note_last_dentry(struct ceph_file_info *fi, const char *name, 104 + static int note_last_dentry(struct ceph_dir_file_info *dfi, const char *name, 105 105 int len, unsigned next_offset) 106 106 { 107 107 char *buf = kmalloc(len+1, GFP_KERNEL); 108 108 if (!buf) 109 109 return -ENOMEM; 110 - kfree(fi->last_name); 111 - fi->last_name = buf; 112 - memcpy(fi->last_name, name, len); 113 - fi->last_name[len] = 0; 114 - fi->next_offset = next_offset; 115 - dout("note_last_dentry '%s'\n", fi->last_name); 110 + kfree(dfi->last_name); 111 + dfi->last_name = buf; 112 + memcpy(dfi->last_name, name, len); 113 + dfi->last_name[len] = 0; 114 + dfi->next_offset = next_offset; 115 + dout("note_last_dentry '%s'\n", dfi->last_name); 116 116 return 0; 117 117 } 118 118 ··· 174 174 static int __dcache_readdir(struct file *file, struct dir_context *ctx, 175 175 int shared_gen) 176 176 { 177 - struct ceph_file_info *fi = file->private_data; 177 + struct ceph_dir_file_info *dfi = file->private_data; 178 178 struct dentry *parent = file->f_path.dentry; 179 179 struct inode *dir = d_inode(parent); 180 180 struct dentry *dentry, *last = NULL; ··· 221 221 bool emit_dentry = false; 222 222 dentry = __dcache_find_get_entry(parent, idx++, &cache_ctl); 223 223 if (!dentry) { 224 - fi->flags |= CEPH_F_ATEND; 224 + dfi->file_info.flags |= CEPH_F_ATEND; 225 225 err = 0; 226 226 break; 227 227 } ··· 272 272 if (last) { 273 273 int ret; 274 274 di = ceph_dentry(last); 275 - ret = note_last_dentry(fi, last->d_name.name, last->d_name.len, 275 + ret = note_last_dentry(dfi, last->d_name.name, last->d_name.len, 276 276 fpos_off(di->offset) + 1); 277 277 if (ret < 0) 278 278 err = ret; 279 279 dput(last); 280 280 /* last_name no longer match cache index */ 281 - if (fi->readdir_cache_idx >= 0) { 282 - fi->readdir_cache_idx = -1; 283 - fi->dir_release_count = 0; 281 + if (dfi->readdir_cache_idx >= 0) { 
282 + dfi->readdir_cache_idx = -1; 283 + dfi->dir_release_count = 0; 284 284 } 285 285 } 286 286 return err; 287 287 } 288 288 289 - static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) 289 + static bool need_send_readdir(struct ceph_dir_file_info *dfi, loff_t pos) 290 290 { 291 - if (!fi->last_readdir) 291 + if (!dfi->last_readdir) 292 292 return true; 293 293 if (is_hash_order(pos)) 294 - return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); 294 + return !ceph_frag_contains_value(dfi->frag, fpos_hash(pos)); 295 295 else 296 - return fi->frag != fpos_frag(pos); 296 + return dfi->frag != fpos_frag(pos); 297 297 } 298 298 299 299 static int ceph_readdir(struct file *file, struct dir_context *ctx) 300 300 { 301 - struct ceph_file_info *fi = file->private_data; 301 + struct ceph_dir_file_info *dfi = file->private_data; 302 302 struct inode *inode = file_inode(file); 303 303 struct ceph_inode_info *ci = ceph_inode(inode); 304 304 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); ··· 309 309 struct ceph_mds_reply_info_parsed *rinfo; 310 310 311 311 dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); 312 - if (fi->flags & CEPH_F_ATEND) 312 + if (dfi->file_info.flags & CEPH_F_ATEND) 313 313 return 0; 314 314 315 315 /* always start with . and .. */ ··· 350 350 /* proceed with a normal readdir */ 351 351 more: 352 352 /* do we have the correct frag content buffered? */ 353 - if (need_send_readdir(fi, ctx->pos)) { 353 + if (need_send_readdir(dfi, ctx->pos)) { 354 354 struct ceph_mds_request *req; 355 355 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 
356 356 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 357 357 358 358 /* discard old result, if any */ 359 - if (fi->last_readdir) { 360 - ceph_mdsc_put_request(fi->last_readdir); 361 - fi->last_readdir = NULL; 359 + if (dfi->last_readdir) { 360 + ceph_mdsc_put_request(dfi->last_readdir); 361 + dfi->last_readdir = NULL; 362 362 } 363 363 364 364 if (is_hash_order(ctx->pos)) { ··· 372 372 } 373 373 374 374 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", 375 - ceph_vinop(inode), frag, fi->last_name); 375 + ceph_vinop(inode), frag, dfi->last_name); 376 376 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 377 377 if (IS_ERR(req)) 378 378 return PTR_ERR(req); ··· 388 388 __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); 389 389 req->r_inode_drop = CEPH_CAP_FILE_EXCL; 390 390 } 391 - if (fi->last_name) { 392 - req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); 391 + if (dfi->last_name) { 392 + req->r_path2 = kstrdup(dfi->last_name, GFP_KERNEL); 393 393 if (!req->r_path2) { 394 394 ceph_mdsc_put_request(req); 395 395 return -ENOMEM; ··· 399 399 cpu_to_le32(fpos_hash(ctx->pos)); 400 400 } 401 401 402 - req->r_dir_release_cnt = fi->dir_release_count; 403 - req->r_dir_ordered_cnt = fi->dir_ordered_count; 404 - req->r_readdir_cache_idx = fi->readdir_cache_idx; 405 - req->r_readdir_offset = fi->next_offset; 402 + req->r_dir_release_cnt = dfi->dir_release_count; 403 + req->r_dir_ordered_cnt = dfi->dir_ordered_count; 404 + req->r_readdir_cache_idx = dfi->readdir_cache_idx; 405 + req->r_readdir_offset = dfi->next_offset; 406 406 req->r_args.readdir.frag = cpu_to_le32(frag); 407 407 req->r_args.readdir.flags = 408 408 cpu_to_le16(CEPH_READDIR_REPLY_BITFLAGS); ··· 426 426 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { 427 427 frag = le32_to_cpu(rinfo->dir_dir->frag); 428 428 if (!rinfo->hash_order) { 429 - fi->next_offset = req->r_readdir_offset; 429 + dfi->next_offset = req->r_readdir_offset; 430 430 /* adjust ctx->pos to beginning of frag */ 431 431 ctx->pos 
= ceph_make_fpos(frag, 432 - fi->next_offset, 432 + dfi->next_offset, 433 433 false); 434 434 } 435 435 } 436 436 437 - fi->frag = frag; 438 - fi->last_readdir = req; 437 + dfi->frag = frag; 438 + dfi->last_readdir = req; 439 439 440 440 if (test_bit(CEPH_MDS_R_DID_PREPOPULATE, &req->r_req_flags)) { 441 - fi->readdir_cache_idx = req->r_readdir_cache_idx; 442 - if (fi->readdir_cache_idx < 0) { 441 + dfi->readdir_cache_idx = req->r_readdir_cache_idx; 442 + if (dfi->readdir_cache_idx < 0) { 443 443 /* preclude from marking dir ordered */ 444 - fi->dir_ordered_count = 0; 444 + dfi->dir_ordered_count = 0; 445 445 } else if (ceph_frag_is_leftmost(frag) && 446 - fi->next_offset == 2) { 446 + dfi->next_offset == 2) { 447 447 /* note dir version at start of readdir so 448 448 * we can tell if any dentries get dropped */ 449 - fi->dir_release_count = req->r_dir_release_cnt; 450 - fi->dir_ordered_count = req->r_dir_ordered_cnt; 449 + dfi->dir_release_count = req->r_dir_release_cnt; 450 + dfi->dir_ordered_count = req->r_dir_ordered_cnt; 451 451 } 452 452 } else { 453 - dout("readdir !did_prepopulate"); 453 + dout("readdir !did_prepopulate\n"); 454 454 /* disable readdir cache */ 455 - fi->readdir_cache_idx = -1; 455 + dfi->readdir_cache_idx = -1; 456 456 /* preclude from marking dir complete */ 457 - fi->dir_release_count = 0; 457 + dfi->dir_release_count = 0; 458 458 } 459 459 460 460 /* note next offset and last dentry name */ ··· 463 463 rinfo->dir_entries + (rinfo->dir_nr-1); 464 464 unsigned next_offset = req->r_reply_info.dir_end ? 
465 465 2 : (fpos_off(rde->offset) + 1); 466 - err = note_last_dentry(fi, rde->name, rde->name_len, 466 + err = note_last_dentry(dfi, rde->name, rde->name_len, 467 467 next_offset); 468 468 if (err) 469 469 return err; 470 470 } else if (req->r_reply_info.dir_end) { 471 - fi->next_offset = 2; 471 + dfi->next_offset = 2; 472 472 /* keep last name */ 473 473 } 474 474 } 475 475 476 - rinfo = &fi->last_readdir->r_reply_info; 476 + rinfo = &dfi->last_readdir->r_reply_info; 477 477 dout("readdir frag %x num %d pos %llx chunk first %llx\n", 478 - fi->frag, rinfo->dir_nr, ctx->pos, 478 + dfi->frag, rinfo->dir_nr, ctx->pos, 479 479 rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); 480 480 481 481 i = 0; ··· 519 519 ctx->pos++; 520 520 } 521 521 522 - ceph_mdsc_put_request(fi->last_readdir); 523 - fi->last_readdir = NULL; 522 + ceph_mdsc_put_request(dfi->last_readdir); 523 + dfi->last_readdir = NULL; 524 524 525 - if (fi->next_offset > 2) { 526 - frag = fi->frag; 525 + if (dfi->next_offset > 2) { 526 + frag = dfi->frag; 527 527 goto more; 528 528 } 529 529 530 530 /* more frags? 
*/ 531 - if (!ceph_frag_is_rightmost(fi->frag)) { 532 - frag = ceph_frag_next(fi->frag); 531 + if (!ceph_frag_is_rightmost(dfi->frag)) { 532 + frag = ceph_frag_next(dfi->frag); 533 533 if (is_hash_order(ctx->pos)) { 534 534 loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), 535 - fi->next_offset, true); 535 + dfi->next_offset, true); 536 536 if (new_pos > ctx->pos) 537 537 ctx->pos = new_pos; 538 538 /* keep last_name */ 539 539 } else { 540 - ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); 541 - kfree(fi->last_name); 542 - fi->last_name = NULL; 540 + ctx->pos = ceph_make_fpos(frag, dfi->next_offset, 541 + false); 542 + kfree(dfi->last_name); 543 + dfi->last_name = NULL; 543 544 } 544 545 dout("readdir next frag is %x\n", frag); 545 546 goto more; 546 547 } 547 - fi->flags |= CEPH_F_ATEND; 548 + dfi->file_info.flags |= CEPH_F_ATEND; 548 549 549 550 /* 550 551 * if dir_release_count still matches the dir, no dentries 551 552 * were released during the whole readdir, and we should have 552 553 * the complete dir contents in our cache. 
553 554 */ 554 - if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) { 555 + if (atomic64_read(&ci->i_release_count) == 556 + dfi->dir_release_count) { 555 557 spin_lock(&ci->i_ceph_lock); 556 - if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { 558 + if (dfi->dir_ordered_count == 559 + atomic64_read(&ci->i_ordered_count)) { 557 560 dout(" marking %p complete and ordered\n", inode); 558 561 /* use i_size to track number of entries in 559 562 * readdir cache */ 560 - BUG_ON(fi->readdir_cache_idx < 0); 561 - i_size_write(inode, fi->readdir_cache_idx * 563 + BUG_ON(dfi->readdir_cache_idx < 0); 564 + i_size_write(inode, dfi->readdir_cache_idx * 562 565 sizeof(struct dentry*)); 563 566 } else { 564 567 dout(" marking %p complete\n", inode); 565 568 } 566 - __ceph_dir_set_complete(ci, fi->dir_release_count, 567 - fi->dir_ordered_count); 569 + __ceph_dir_set_complete(ci, dfi->dir_release_count, 570 + dfi->dir_ordered_count); 568 571 spin_unlock(&ci->i_ceph_lock); 569 572 } 570 573 ··· 575 572 return 0; 576 573 } 577 574 578 - static void reset_readdir(struct ceph_file_info *fi) 575 + static void reset_readdir(struct ceph_dir_file_info *dfi) 579 576 { 580 - if (fi->last_readdir) { 581 - ceph_mdsc_put_request(fi->last_readdir); 582 - fi->last_readdir = NULL; 577 + if (dfi->last_readdir) { 578 + ceph_mdsc_put_request(dfi->last_readdir); 579 + dfi->last_readdir = NULL; 583 580 } 584 - kfree(fi->last_name); 585 - fi->last_name = NULL; 586 - fi->dir_release_count = 0; 587 - fi->readdir_cache_idx = -1; 588 - fi->next_offset = 2; /* compensate for . and .. */ 589 - fi->flags &= ~CEPH_F_ATEND; 581 + kfree(dfi->last_name); 582 + dfi->last_name = NULL; 583 + dfi->dir_release_count = 0; 584 + dfi->readdir_cache_idx = -1; 585 + dfi->next_offset = 2; /* compensate for . and .. 
*/ 586 + dfi->file_info.flags &= ~CEPH_F_ATEND; 590 587 } 591 588 592 589 /* 593 590 * discard buffered readdir content on seekdir(0), or seek to new frag, 594 591 * or seek prior to current chunk 595 592 */ 596 - static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) 593 + static bool need_reset_readdir(struct ceph_dir_file_info *dfi, loff_t new_pos) 597 594 { 598 595 struct ceph_mds_reply_info_parsed *rinfo; 599 596 loff_t chunk_offset; ··· 602 599 if (is_hash_order(new_pos)) { 603 600 /* no need to reset last_name for a forward seek when 604 601 * dentries are sotred in hash order */ 605 - } else if (fi->frag != fpos_frag(new_pos)) { 602 + } else if (dfi->frag != fpos_frag(new_pos)) { 606 603 return true; 607 604 } 608 - rinfo = fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; 605 + rinfo = dfi->last_readdir ? &dfi->last_readdir->r_reply_info : NULL; 609 606 if (!rinfo || !rinfo->dir_nr) 610 607 return true; 611 608 chunk_offset = rinfo->dir_entries[0].offset; ··· 615 612 616 613 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) 617 614 { 618 - struct ceph_file_info *fi = file->private_data; 615 + struct ceph_dir_file_info *dfi = file->private_data; 619 616 struct inode *inode = file->f_mapping->host; 620 617 loff_t retval; 621 618 ··· 633 630 } 634 631 635 632 if (offset >= 0) { 636 - if (need_reset_readdir(fi, offset)) { 633 + if (need_reset_readdir(dfi, offset)) { 637 634 dout("dir_llseek dropping %p content\n", file); 638 - reset_readdir(fi); 635 + reset_readdir(dfi); 639 636 } else if (is_hash_order(offset) && offset > file->f_pos) { 640 637 /* for hash offset, we don't know if a forward seek 641 638 * is within same frag */ 642 - fi->dir_release_count = 0; 643 - fi->readdir_cache_idx = -1; 639 + dfi->dir_release_count = 0; 640 + dfi->readdir_cache_idx = -1; 644 641 } 645 642 646 643 if (offset != file->f_pos) { 647 644 file->f_pos = offset; 648 645 file->f_version = 0; 649 - fi->flags &= ~CEPH_F_ATEND; 
646 + dfi->file_info.flags &= ~CEPH_F_ATEND; 650 647 } 651 648 retval = offset; 652 649 } ··· 827 824 if (ceph_snap(dir) != CEPH_NOSNAP) 828 825 return -EROFS; 829 826 827 + if (ceph_quota_is_max_files_exceeded(dir)) 828 + return -EDQUOT; 829 + 830 830 err = ceph_pre_init_acls(dir, &mode, &acls); 831 831 if (err < 0) 832 832 return err; ··· 883 877 if (ceph_snap(dir) != CEPH_NOSNAP) 884 878 return -EROFS; 885 879 880 + if (ceph_quota_is_max_files_exceeded(dir)) 881 + return -EDQUOT; 882 + 886 883 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); 887 884 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); 888 885 if (IS_ERR(req)) { ··· 932 923 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); 933 924 op = CEPH_MDS_OP_MKDIR; 934 925 } else { 926 + goto out; 927 + } 928 + 929 + if (op == CEPH_MDS_OP_MKDIR && 930 + ceph_quota_is_max_files_exceeded(dir)) { 931 + err = -EDQUOT; 935 932 goto out; 936 933 } 937 934 ··· 1080 1065 else 1081 1066 return -EROFS; 1082 1067 } 1068 + /* don't allow cross-quota renames */ 1069 + if ((old_dir != new_dir) && 1070 + (!ceph_quota_is_same_realm(old_dir, new_dir))) 1071 + return -EXDEV; 1072 + 1083 1073 dout("rename dir %p dentry %p to dir %p dentry %p\n", 1084 1074 old_dir, old_dentry, new_dir, new_dentry); 1085 1075 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); ··· 1371 1351 static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, 1372 1352 loff_t *ppos) 1373 1353 { 1374 - struct ceph_file_info *cf = file->private_data; 1354 + struct ceph_dir_file_info *dfi = file->private_data; 1375 1355 struct inode *inode = file_inode(file); 1376 1356 struct ceph_inode_info *ci = ceph_inode(inode); 1377 1357 int left; ··· 1380 1360 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1381 1361 return -EISDIR; 1382 1362 1383 - if (!cf->dir_info) { 1384 - cf->dir_info = kmalloc(bufsize, GFP_KERNEL); 1385 - if (!cf->dir_info) 1363 + if (!dfi->dir_info) { 1364 + 
dfi->dir_info = kmalloc(bufsize, GFP_KERNEL); 1365 + if (!dfi->dir_info) 1386 1366 return -ENOMEM; 1387 - cf->dir_info_len = 1388 - snprintf(cf->dir_info, bufsize, 1367 + dfi->dir_info_len = 1368 + snprintf(dfi->dir_info, bufsize, 1389 1369 "entries: %20lld\n" 1390 1370 " files: %20lld\n" 1391 1371 " subdirs: %20lld\n" ··· 1405 1385 (long)ci->i_rctime.tv_nsec); 1406 1386 } 1407 1387 1408 - if (*ppos >= cf->dir_info_len) 1388 + if (*ppos >= dfi->dir_info_len) 1409 1389 return 0; 1410 - size = min_t(unsigned, size, cf->dir_info_len-*ppos); 1411 - left = copy_to_user(buf, cf->dir_info + *ppos, size); 1390 + size = min_t(unsigned, size, dfi->dir_info_len-*ppos); 1391 + left = copy_to_user(buf, dfi->dir_info + *ppos, size); 1412 1392 if (left == size) 1413 1393 return -EFAULT; 1414 1394 *ppos += (size - left);
+87 -30
fs/ceph/file.c
··· 30 30 break; 31 31 } 32 32 33 + flags &= ~O_ACCMODE; 34 + 33 35 #define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } 34 36 35 37 ceph_sys2wire(O_CREAT); ··· 43 41 #undef ceph_sys2wire 44 42 45 43 if (flags) 46 - dout("unused open flags: %x", flags); 44 + dout("unused open flags: %x\n", flags); 47 45 48 46 return cpu_to_le32(wire_flags); 49 47 } ··· 161 159 return req; 162 160 } 163 161 162 + static int ceph_init_file_info(struct inode *inode, struct file *file, 163 + int fmode, bool isdir) 164 + { 165 + struct ceph_file_info *fi; 166 + 167 + dout("%s %p %p 0%o (%s)\n", __func__, inode, file, 168 + inode->i_mode, isdir ? "dir" : "regular"); 169 + BUG_ON(inode->i_fop->release != ceph_release); 170 + 171 + if (isdir) { 172 + struct ceph_dir_file_info *dfi = 173 + kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); 174 + if (!dfi) { 175 + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 176 + return -ENOMEM; 177 + } 178 + 179 + file->private_data = dfi; 180 + fi = &dfi->file_info; 181 + dfi->next_offset = 2; 182 + dfi->readdir_cache_idx = -1; 183 + } else { 184 + fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 185 + if (!fi) { 186 + ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 187 + return -ENOMEM; 188 + } 189 + 190 + file->private_data = fi; 191 + } 192 + 193 + fi->fmode = fmode; 194 + spin_lock_init(&fi->rw_contexts_lock); 195 + INIT_LIST_HEAD(&fi->rw_contexts); 196 + 197 + return 0; 198 + } 199 + 164 200 /* 165 201 * initialize private struct file data. 
166 202 * if we fail, clean up by dropping fmode reference on the ceph_inode 167 203 */ 168 204 static int ceph_init_file(struct inode *inode, struct file *file, int fmode) 169 205 { 170 - struct ceph_file_info *cf; 171 206 int ret = 0; 172 207 173 208 switch (inode->i_mode & S_IFMT) { ··· 212 173 ceph_fscache_register_inode_cookie(inode); 213 174 ceph_fscache_file_set_cookie(inode, file); 214 175 case S_IFDIR: 215 - dout("init_file %p %p 0%o (regular)\n", inode, file, 216 - inode->i_mode); 217 - cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 218 - if (!cf) { 219 - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 220 - return -ENOMEM; 221 - } 222 - cf->fmode = fmode; 223 - 224 - spin_lock_init(&cf->rw_contexts_lock); 225 - INIT_LIST_HEAD(&cf->rw_contexts); 226 - 227 - cf->next_offset = 2; 228 - cf->readdir_cache_idx = -1; 229 - file->private_data = cf; 230 - BUG_ON(inode->i_fop->release != ceph_release); 176 + ret = ceph_init_file_info(inode, file, fmode, 177 + S_ISDIR(inode->i_mode)); 178 + if (ret) 179 + return ret; 231 180 break; 232 181 233 182 case S_IFLNK: ··· 305 278 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 306 279 struct ceph_mds_client *mdsc = fsc->mdsc; 307 280 struct ceph_mds_request *req; 308 - struct ceph_file_info *cf = file->private_data; 281 + struct ceph_file_info *fi = file->private_data; 309 282 int err; 310 283 int flags, fmode, wanted; 311 284 312 - if (cf) { 285 + if (fi) { 313 286 dout("open file %p is already opened\n", file); 314 287 return 0; 315 288 } ··· 402 375 struct ceph_mds_request *req; 403 376 struct dentry *dn; 404 377 struct ceph_acls_info acls = {}; 405 - int mask; 378 + int mask; 406 379 int err; 407 380 408 381 dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", ··· 413 386 return -ENAMETOOLONG; 414 387 415 388 if (flags & O_CREAT) { 389 + if (ceph_quota_is_max_files_exceeded(dir)) 390 + return -EDQUOT; 416 391 err = ceph_pre_init_acls(dir, &mode, &acls); 417 392 if (err < 0) 418 
393 return err; ··· 489 460 int ceph_release(struct inode *inode, struct file *file) 490 461 { 491 462 struct ceph_inode_info *ci = ceph_inode(inode); 492 - struct ceph_file_info *cf = file->private_data; 493 463 494 - dout("release inode %p file %p\n", inode, file); 495 - ceph_put_fmode(ci, cf->fmode); 496 - if (cf->last_readdir) 497 - ceph_mdsc_put_request(cf->last_readdir); 498 - kfree(cf->last_name); 499 - kfree(cf->dir_info); 500 - WARN_ON(!list_empty(&cf->rw_contexts)); 501 - kmem_cache_free(ceph_file_cachep, cf); 464 + if (S_ISDIR(inode->i_mode)) { 465 + struct ceph_dir_file_info *dfi = file->private_data; 466 + dout("release inode %p dir file %p\n", inode, file); 467 + WARN_ON(!list_empty(&dfi->file_info.rw_contexts)); 468 + 469 + ceph_put_fmode(ci, dfi->file_info.fmode); 470 + 471 + if (dfi->last_readdir) 472 + ceph_mdsc_put_request(dfi->last_readdir); 473 + kfree(dfi->last_name); 474 + kfree(dfi->dir_info); 475 + kmem_cache_free(ceph_dir_file_cachep, dfi); 476 + } else { 477 + struct ceph_file_info *fi = file->private_data; 478 + dout("release inode %p regular file %p\n", inode, file); 479 + WARN_ON(!list_empty(&fi->rw_contexts)); 480 + 481 + ceph_put_fmode(ci, fi->fmode); 482 + kmem_cache_free(ceph_file_cachep, fi); 483 + } 502 484 503 485 /* wake up anyone waiting for caps on this inode */ 504 486 wake_up_all(&ci->i_cap_wq); ··· 1378 1338 1379 1339 pos = iocb->ki_pos; 1380 1340 count = iov_iter_count(from); 1341 + if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) { 1342 + err = -EDQUOT; 1343 + goto out; 1344 + } 1345 + 1381 1346 err = file_remove_privs(file); 1382 1347 if (err) 1383 1348 goto out; ··· 1464 1419 1465 1420 if (written >= 0) { 1466 1421 int dirty; 1422 + 1467 1423 spin_lock(&ci->i_ceph_lock); 1468 1424 ci->i_inline_version = CEPH_INLINE_NONE; 1469 1425 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, ··· 1472 1426 spin_unlock(&ci->i_ceph_lock); 1473 1427 if (dirty) 1474 1428 __mark_inode_dirty(inode, dirty); 1429 + if 
(ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) 1430 + ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL); 1475 1431 } 1476 1432 1477 1433 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", ··· 1716 1668 goto unlock; 1717 1669 } 1718 1670 1671 + if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) && 1672 + ceph_quota_is_max_bytes_exceeded(inode, offset + length)) { 1673 + ret = -EDQUOT; 1674 + goto unlock; 1675 + } 1676 + 1719 1677 if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && 1720 1678 !(mode & FALLOC_FL_PUNCH_HOLE)) { 1721 1679 ret = -ENOSPC; ··· 1770 1716 spin_unlock(&ci->i_ceph_lock); 1771 1717 if (dirty) 1772 1718 __mark_inode_dirty(inode, dirty); 1719 + if ((endoff > size) && 1720 + ceph_quota_is_max_bytes_approaching(inode, endoff)) 1721 + ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL); 1773 1722 } 1774 1723 1775 1724 ceph_put_cap_refs(ci, got);
+15 -11
fs/ceph/inode.c
··· 441 441 atomic64_set(&ci->i_complete_seq[1], 0); 442 442 ci->i_symlink = NULL; 443 443 444 + ci->i_max_bytes = 0; 445 + ci->i_max_files = 0; 446 + 444 447 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); 445 448 RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL); 446 449 ··· 539 536 540 537 ceph_queue_caps_release(inode); 541 538 539 + if (__ceph_has_any_quota(ci)) 540 + ceph_adjust_quota_realms_count(inode, false); 541 + 542 542 /* 543 543 * we may still have a snap_realm reference if there are stray 544 544 * caps in i_snap_caps. ··· 554 548 dout(" dropping residual ref to snap realm %p\n", realm); 555 549 spin_lock(&realm->inodes_with_caps_lock); 556 550 list_del_init(&ci->i_snap_realm_item); 551 + ci->i_snap_realm = NULL; 552 + if (realm->ino == ci->i_vino.ino) 553 + realm->inode = NULL; 557 554 spin_unlock(&realm->inodes_with_caps_lock); 558 555 ceph_put_snap_realm(mdsc, realm); 559 556 } ··· 798 789 ci->i_version = le64_to_cpu(info->version); 799 790 inode->i_rdev = le32_to_cpu(info->rdev); 800 791 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 792 + 793 + __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); 801 794 802 795 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && 803 796 (issued & CEPH_CAP_AUTH_EXCL) == 0) { ··· 1878 1867 * possibly truncate them.. so write AND block! 
1879 1868 */ 1880 1869 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { 1881 - struct ceph_cap_snap *capsnap; 1882 - to = ci->i_truncate_size; 1883 - list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 1884 - // MDS should have revoked Frw caps 1885 - WARN_ON_ONCE(capsnap->writing); 1886 - if (capsnap->dirty_pages && capsnap->size > to) 1887 - to = capsnap->size; 1888 - } 1889 1870 spin_unlock(&ci->i_ceph_lock); 1890 1871 dout("__do_pending_vmtruncate %p flushing snaps first\n", 1891 1872 inode); 1892 - 1893 - truncate_pagecache(inode, to); 1894 - 1895 1873 filemap_write_and_wait_range(&inode->i_data, 0, 1896 1874 inode->i_sb->s_maxbytes); 1897 1875 goto retry; ··· 2151 2151 err = setattr_prepare(dentry, attr); 2152 2152 if (err != 0) 2153 2153 return err; 2154 + 2155 + if ((attr->ia_valid & ATTR_SIZE) && 2156 + ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) 2157 + return -EDQUOT; 2154 2158 2155 2159 err = __ceph_setattr(inode, attr); 2156 2160
+4 -9
fs/ceph/ioctl.c
··· 5 5 #include "super.h" 6 6 #include "mds_client.h" 7 7 #include "ioctl.h" 8 - 8 + #include <linux/ceph/striper.h> 9 9 10 10 /* 11 11 * ioctls ··· 185 185 &ceph_sb_to_client(inode->i_sb)->client->osdc; 186 186 struct ceph_object_locator oloc; 187 187 CEPH_DEFINE_OID_ONSTACK(oid); 188 - u64 len = 1, olen; 188 + u32 xlen; 189 189 u64 tmp; 190 190 struct ceph_pg pgid; 191 191 int r; ··· 195 195 return -EFAULT; 196 196 197 197 down_read(&osdc->lock); 198 - r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, 199 - &dl.object_no, &dl.object_offset, 200 - &olen); 201 - if (r < 0) { 202 - up_read(&osdc->lock); 203 - return -EIO; 204 - } 198 + ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, 1, 199 + &dl.object_no, &dl.object_offset, &xlen); 205 200 dl.file_offset -= dl.object_offset; 206 201 dl.object_size = ci->i_layout.object_size; 207 202 dl.block_size = ci->i_layout.stripe_unit;
+10 -10
fs/ceph/locks.c
··· 95 95 owner = secure_addr(fl->fl_owner); 96 96 97 97 dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, " 98 - "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type, 98 + "start: %llu, length: %llu, wait: %d, type: %d\n", (int)lock_type, 99 99 (int)operation, owner, (u64)fl->fl_pid, fl->fl_start, length, 100 100 wait, fl->fl_type); 101 101 ··· 132 132 } 133 133 ceph_mdsc_put_request(req); 134 134 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, " 135 - "length: %llu, wait: %d, type: %d, err code %d", (int)lock_type, 135 + "length: %llu, wait: %d, type: %d, err code %d\n", (int)lock_type, 136 136 (int)operation, (u64)fl->fl_pid, fl->fl_start, 137 137 length, wait, fl->fl_type, err); 138 138 return err; ··· 226 226 if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) 227 227 return -ENOLCK; 228 228 229 - dout("ceph_lock, fl_owner: %p", fl->fl_owner); 229 + dout("ceph_lock, fl_owner: %p\n", fl->fl_owner); 230 230 231 231 /* set wait bit as appropriate, then make command as Ceph expects it*/ 232 232 if (IS_GETLK(cmd)) ··· 264 264 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); 265 265 if (!err) { 266 266 if (op == CEPH_MDS_OP_SETFILELOCK) { 267 - dout("mds locked, locking locally"); 267 + dout("mds locked, locking locally\n"); 268 268 err = posix_lock_file(file, fl, NULL); 269 269 if (err) { 270 270 /* undo! This should only happen if ··· 272 272 * deadlock. 
*/ 273 273 ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, 274 274 CEPH_LOCK_UNLOCK, 0, fl); 275 - dout("got %d on posix_lock_file, undid lock", 275 + dout("got %d on posix_lock_file, undid lock\n", 276 276 err); 277 277 } 278 278 } ··· 294 294 if (fl->fl_type & LOCK_MAND) 295 295 return -EOPNOTSUPP; 296 296 297 - dout("ceph_flock, fl_file: %p", fl->fl_file); 297 + dout("ceph_flock, fl_file: %p\n", fl->fl_file); 298 298 299 299 spin_lock(&ci->i_ceph_lock); 300 300 if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { ··· 329 329 ceph_lock_message(CEPH_LOCK_FLOCK, 330 330 CEPH_MDS_OP_SETFILELOCK, 331 331 inode, CEPH_LOCK_UNLOCK, 0, fl); 332 - dout("got %d on locks_lock_file_wait, undid lock", err); 332 + dout("got %d on locks_lock_file_wait, undid lock\n", err); 333 333 } 334 334 } 335 335 return err; ··· 356 356 ++(*flock_count); 357 357 spin_unlock(&ctx->flc_lock); 358 358 } 359 - dout("counted %d flock locks and %d fcntl locks", 359 + dout("counted %d flock locks and %d fcntl locks\n", 360 360 *flock_count, *fcntl_count); 361 361 } 362 362 ··· 384 384 cephlock->type = CEPH_LOCK_UNLOCK; 385 385 break; 386 386 default: 387 - dout("Have unknown lock type %d", lock->fl_type); 387 + dout("Have unknown lock type %d\n", lock->fl_type); 388 388 err = -EINVAL; 389 389 } 390 390 ··· 407 407 int seen_flock = 0; 408 408 int l = 0; 409 409 410 - dout("encoding %d flock and %d fcntl locks", num_flock_locks, 410 + dout("encoding %d flock and %d fcntl locks\n", num_flock_locks, 411 411 num_fcntl_locks); 412 412 413 413 if (!ctx)
+58 -29
fs/ceph/mds_client.c
··· 100 100 } else 101 101 info->inline_version = CEPH_INLINE_NONE; 102 102 103 + if (features & CEPH_FEATURE_MDS_QUOTA) { 104 + u8 struct_v, struct_compat; 105 + u32 struct_len; 106 + 107 + /* 108 + * both struct_v and struct_compat are expected to be >= 1 109 + */ 110 + ceph_decode_8_safe(p, end, struct_v, bad); 111 + ceph_decode_8_safe(p, end, struct_compat, bad); 112 + if (!struct_v || !struct_compat) 113 + goto bad; 114 + ceph_decode_32_safe(p, end, struct_len, bad); 115 + ceph_decode_need(p, end, struct_len, bad); 116 + ceph_decode_64_safe(p, end, info->max_bytes, bad); 117 + ceph_decode_64_safe(p, end, info->max_files, bad); 118 + } else { 119 + info->max_bytes = 0; 120 + info->max_files = 0; 121 + } 122 + 103 123 info->pool_ns_len = 0; 104 124 info->pool_ns_data = NULL; 105 125 if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { ··· 404 384 refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref)); 405 385 return s; 406 386 } else { 407 - dout("mdsc get_session %p 0 -- FAIL", s); 387 + dout("mdsc get_session %p 0 -- FAIL\n", s); 408 388 return NULL; 409 389 } 410 390 } ··· 439 419 440 420 static bool __have_session(struct ceph_mds_client *mdsc, int mds) 441 421 { 442 - if (mds >= mdsc->max_sessions) 422 + if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) 443 423 return false; 444 - return mdsc->sessions[mds]; 424 + else 425 + return true; 445 426 } 446 427 447 428 static int __verify_registered_session(struct ceph_mds_client *mdsc, ··· 469 448 s = kzalloc(sizeof(*s), GFP_NOFS); 470 449 if (!s) 471 450 return ERR_PTR(-ENOMEM); 451 + 452 + if (mds >= mdsc->max_sessions) { 453 + int newmax = 1 << get_count_order(mds + 1); 454 + struct ceph_mds_session **sa; 455 + 456 + dout("%s: realloc to %d\n", __func__, newmax); 457 + sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); 458 + if (!sa) 459 + goto fail_realloc; 460 + if (mdsc->sessions) { 461 + memcpy(sa, mdsc->sessions, 462 + mdsc->max_sessions * sizeof(void *)); 463 + kfree(mdsc->sessions); 464 + } 465 + 
mdsc->sessions = sa; 466 + mdsc->max_sessions = newmax; 467 + } 468 + 469 + dout("%s: mds%d\n", __func__, mds); 472 470 s->s_mdsc = mdsc; 473 471 s->s_mds = mds; 474 472 s->s_state = CEPH_MDS_SESSION_NEW; ··· 516 476 INIT_LIST_HEAD(&s->s_cap_releases); 517 477 INIT_LIST_HEAD(&s->s_cap_flushing); 518 478 519 - dout("register_session mds%d\n", mds); 520 - if (mds >= mdsc->max_sessions) { 521 - int newmax = 1 << get_count_order(mds+1); 522 - struct ceph_mds_session **sa; 523 - 524 - dout("register_session realloc to %d\n", newmax); 525 - sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); 526 - if (!sa) 527 - goto fail_realloc; 528 - if (mdsc->sessions) { 529 - memcpy(sa, mdsc->sessions, 530 - mdsc->max_sessions * sizeof(void *)); 531 - kfree(mdsc->sessions); 532 - } 533 - mdsc->sessions = sa; 534 - mdsc->max_sessions = newmax; 535 - } 536 479 mdsc->sessions[mds] = s; 537 480 atomic_inc(&mdsc->num_sessions); 538 481 refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */ ··· 2554 2531 * Otherwise we just have to return an ESTALE 2555 2532 */ 2556 2533 if (result == -ESTALE) { 2557 - dout("got ESTALE on request %llu", req->r_tid); 2534 + dout("got ESTALE on request %llu\n", req->r_tid); 2558 2535 req->r_resend_mds = -1; 2559 2536 if (req->r_direct_mode != USE_AUTH_MDS) { 2560 - dout("not using auth, setting for that now"); 2537 + dout("not using auth, setting for that now\n"); 2561 2538 req->r_direct_mode = USE_AUTH_MDS; 2562 2539 __do_request(mdsc, req); 2563 2540 mutex_unlock(&mdsc->mutex); ··· 2565 2542 } else { 2566 2543 int mds = __choose_mds(mdsc, req); 2567 2544 if (mds >= 0 && mds != req->r_session->s_mds) { 2568 - dout("but auth changed, so resending"); 2545 + dout("but auth changed, so resending\n"); 2569 2546 __do_request(mdsc, req); 2570 2547 mutex_unlock(&mdsc->mutex); 2571 2548 goto out; 2572 2549 } 2573 2550 } 2574 - dout("have to return ESTALE on request %llu", req->r_tid); 2551 + dout("have to return ESTALE on request %llu\n", req->r_tid); 2575 
2552 } 2576 2553 2577 2554 ··· 3493 3470 } 3494 3471 3495 3472 /* 3496 - * drop all leases (and dentry refs) in preparation for umount 3473 + * lock unlock sessions, to wait ongoing session activities 3497 3474 */ 3498 - static void drop_leases(struct ceph_mds_client *mdsc) 3475 + static void lock_unlock_sessions(struct ceph_mds_client *mdsc) 3499 3476 { 3500 3477 int i; 3501 3478 3502 - dout("drop_leases\n"); 3503 3479 mutex_lock(&mdsc->mutex); 3504 3480 for (i = 0; i < mdsc->max_sessions; i++) { 3505 3481 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); ··· 3594 3572 if (!mdsc) 3595 3573 return -ENOMEM; 3596 3574 mdsc->fsc = fsc; 3597 - fsc->mdsc = mdsc; 3598 3575 mutex_init(&mdsc->mutex); 3599 3576 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 3600 3577 if (!mdsc->mdsmap) { ··· 3601 3580 return -ENOMEM; 3602 3581 } 3603 3582 3583 + fsc->mdsc = mdsc; 3604 3584 init_completion(&mdsc->safe_umount_waiters); 3605 3585 init_waitqueue_head(&mdsc->session_close_wq); 3606 3586 INIT_LIST_HEAD(&mdsc->waiting_for_map); ··· 3609 3587 atomic_set(&mdsc->num_sessions, 0); 3610 3588 mdsc->max_sessions = 0; 3611 3589 mdsc->stopping = 0; 3590 + atomic64_set(&mdsc->quotarealms_count, 0); 3612 3591 mdsc->last_snap_seq = 0; 3613 3592 init_rwsem(&mdsc->snap_rwsem); 3614 3593 mdsc->snap_realms = RB_ROOT; ··· 3683 3660 dout("pre_umount\n"); 3684 3661 mdsc->stopping = 1; 3685 3662 3686 - drop_leases(mdsc); 3663 + lock_unlock_sessions(mdsc); 3687 3664 ceph_flush_dirty_caps(mdsc); 3688 3665 wait_requests(mdsc); 3689 3666 ··· 3880 3857 { 3881 3858 struct ceph_mds_client *mdsc = fsc->mdsc; 3882 3859 dout("mdsc_destroy %p\n", mdsc); 3860 + 3861 + if (!mdsc) 3862 + return; 3883 3863 3884 3864 /* flush out any connection work with references to us */ 3885 3865 ceph_msgr_flush(); ··· 4102 4076 break; 4103 4077 case CEPH_MSG_CLIENT_LEASE: 4104 4078 handle_lease(mdsc, s, msg); 4079 + break; 4080 + case CEPH_MSG_CLIENT_QUOTA: 4081 + ceph_handle_quota(mdsc, s, msg); 4105 
4082 break; 4106 4083 4107 4084 default:
+4
fs/ceph/mds_client.h
··· 49 49 char *inline_data; 50 50 u32 pool_ns_len; 51 51 char *pool_ns_data; 52 + u64 max_bytes; 53 + u64 max_files; 52 54 }; 53 55 54 56 struct ceph_mds_reply_dir_entry { ··· 313 311 atomic_t num_sessions; 314 312 int max_sessions; /* len of s_mds_sessions */ 315 313 int stopping; /* true if shutting down */ 314 + 315 + atomic64_t quotarealms_count; /* # realms with quota */ 316 316 317 317 /* 318 318 * snap_rwsem will cover cap linkage into snaprealms, and
+361
fs/ceph/quota.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * quota.c - CephFS quota 4 + * 5 + * Copyright (C) 2017-2018 SUSE 6 + * 7 + * This program is free software; you can redistribute it and/or 8 + * modify it under the terms of the GNU General Public License 9 + * as published by the Free Software Foundation; either version 2 10 + * of the License, or (at your option) any later version. 11 + * 12 + * This program is distributed in the hope that it will be useful, 13 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 + * GNU General Public License for more details. 16 + * 17 + * You should have received a copy of the GNU General Public License 18 + * along with this program; if not, see <http://www.gnu.org/licenses/>. 19 + */ 20 + 21 + #include <linux/statfs.h> 22 + 23 + #include "super.h" 24 + #include "mds_client.h" 25 + 26 + void ceph_adjust_quota_realms_count(struct inode *inode, bool inc) 27 + { 28 + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 29 + if (inc) 30 + atomic64_inc(&mdsc->quotarealms_count); 31 + else 32 + atomic64_dec(&mdsc->quotarealms_count); 33 + } 34 + 35 + static inline bool ceph_has_realms_with_quotas(struct inode *inode) 36 + { 37 + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 38 + return atomic64_read(&mdsc->quotarealms_count) > 0; 39 + } 40 + 41 + void ceph_handle_quota(struct ceph_mds_client *mdsc, 42 + struct ceph_mds_session *session, 43 + struct ceph_msg *msg) 44 + { 45 + struct super_block *sb = mdsc->fsc->sb; 46 + struct ceph_mds_quota *h = msg->front.iov_base; 47 + struct ceph_vino vino; 48 + struct inode *inode; 49 + struct ceph_inode_info *ci; 50 + 51 + if (msg->front.iov_len != sizeof(*h)) { 52 + pr_err("%s corrupt message mds%d len %d\n", __func__, 53 + session->s_mds, (int)msg->front.iov_len); 54 + ceph_msg_dump(msg); 55 + return; 56 + } 57 + 58 + /* increment msg sequence number */ 59 + 
mutex_lock(&session->s_mutex); 60 + session->s_seq++; 61 + mutex_unlock(&session->s_mutex); 62 + 63 + /* lookup inode */ 64 + vino.ino = le64_to_cpu(h->ino); 65 + vino.snap = CEPH_NOSNAP; 66 + inode = ceph_find_inode(sb, vino); 67 + if (!inode) { 68 + pr_warn("Failed to find inode %llu\n", vino.ino); 69 + return; 70 + } 71 + ci = ceph_inode(inode); 72 + 73 + spin_lock(&ci->i_ceph_lock); 74 + ci->i_rbytes = le64_to_cpu(h->rbytes); 75 + ci->i_rfiles = le64_to_cpu(h->rfiles); 76 + ci->i_rsubdirs = le64_to_cpu(h->rsubdirs); 77 + __ceph_update_quota(ci, le64_to_cpu(h->max_bytes), 78 + le64_to_cpu(h->max_files)); 79 + spin_unlock(&ci->i_ceph_lock); 80 + 81 + iput(inode); 82 + } 83 + 84 + /* 85 + * This function walks through the snaprealm for an inode and returns the 86 + * ceph_snap_realm for the first snaprealm that has quotas set (either max_files 87 + * or max_bytes). If the root is reached, return the root ceph_snap_realm 88 + * instead. 89 + * 90 + * Note that the caller is responsible for calling ceph_put_snap_realm() on the 91 + * returned realm. 92 + */ 93 + static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc, 94 + struct inode *inode) 95 + { 96 + struct ceph_inode_info *ci = NULL; 97 + struct ceph_snap_realm *realm, *next; 98 + struct inode *in; 99 + bool has_quota; 100 + 101 + if (ceph_snap(inode) != CEPH_NOSNAP) 102 + return NULL; 103 + 104 + realm = ceph_inode(inode)->i_snap_realm; 105 + if (realm) 106 + ceph_get_snap_realm(mdsc, realm); 107 + else 108 + pr_err_ratelimited("get_quota_realm: ino (%llx.%llx) " 109 + "null i_snap_realm\n", ceph_vinop(inode)); 110 + while (realm) { 111 + spin_lock(&realm->inodes_with_caps_lock); 112 + in = realm->inode ? 
igrab(realm->inode) : NULL; 113 + spin_unlock(&realm->inodes_with_caps_lock); 114 + if (!in) 115 + break; 116 + 117 + ci = ceph_inode(in); 118 + has_quota = __ceph_has_any_quota(ci); 119 + iput(in); 120 + 121 + next = realm->parent; 122 + if (has_quota || !next) 123 + return realm; 124 + 125 + ceph_get_snap_realm(mdsc, next); 126 + ceph_put_snap_realm(mdsc, realm); 127 + realm = next; 128 + } 129 + if (realm) 130 + ceph_put_snap_realm(mdsc, realm); 131 + 132 + return NULL; 133 + } 134 + 135 + bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) 136 + { 137 + struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc; 138 + struct ceph_snap_realm *old_realm, *new_realm; 139 + bool is_same; 140 + 141 + down_read(&mdsc->snap_rwsem); 142 + old_realm = get_quota_realm(mdsc, old); 143 + new_realm = get_quota_realm(mdsc, new); 144 + is_same = (old_realm == new_realm); 145 + up_read(&mdsc->snap_rwsem); 146 + 147 + if (old_realm) 148 + ceph_put_snap_realm(mdsc, old_realm); 149 + if (new_realm) 150 + ceph_put_snap_realm(mdsc, new_realm); 151 + 152 + return is_same; 153 + } 154 + 155 + enum quota_check_op { 156 + QUOTA_CHECK_MAX_FILES_OP, /* check quota max_files limit */ 157 + QUOTA_CHECK_MAX_BYTES_OP, /* check quota max_files limit */ 158 + QUOTA_CHECK_MAX_BYTES_APPROACHING_OP /* check if quota max_files 159 + limit is approaching */ 160 + }; 161 + 162 + /* 163 + * check_quota_exceeded() will walk up the snaprealm hierarchy and, for each 164 + * realm, it will execute quota check operation defined by the 'op' parameter. 165 + * The snaprealm walk is interrupted if the quota check detects that the quota 166 + * is exceeded or if the root inode is reached. 
167 + */ 168 + static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, 169 + loff_t delta) 170 + { 171 + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 172 + struct ceph_inode_info *ci; 173 + struct ceph_snap_realm *realm, *next; 174 + struct inode *in; 175 + u64 max, rvalue; 176 + bool exceeded = false; 177 + 178 + if (ceph_snap(inode) != CEPH_NOSNAP) 179 + return false; 180 + 181 + down_read(&mdsc->snap_rwsem); 182 + realm = ceph_inode(inode)->i_snap_realm; 183 + if (realm) 184 + ceph_get_snap_realm(mdsc, realm); 185 + else 186 + pr_err_ratelimited("check_quota_exceeded: ino (%llx.%llx) " 187 + "null i_snap_realm\n", ceph_vinop(inode)); 188 + while (realm) { 189 + spin_lock(&realm->inodes_with_caps_lock); 190 + in = realm->inode ? igrab(realm->inode) : NULL; 191 + spin_unlock(&realm->inodes_with_caps_lock); 192 + if (!in) 193 + break; 194 + 195 + ci = ceph_inode(in); 196 + spin_lock(&ci->i_ceph_lock); 197 + if (op == QUOTA_CHECK_MAX_FILES_OP) { 198 + max = ci->i_max_files; 199 + rvalue = ci->i_rfiles + ci->i_rsubdirs; 200 + } else { 201 + max = ci->i_max_bytes; 202 + rvalue = ci->i_rbytes; 203 + } 204 + spin_unlock(&ci->i_ceph_lock); 205 + switch (op) { 206 + case QUOTA_CHECK_MAX_FILES_OP: 207 + exceeded = (max && (rvalue >= max)); 208 + break; 209 + case QUOTA_CHECK_MAX_BYTES_OP: 210 + exceeded = (max && (rvalue + delta > max)); 211 + break; 212 + case QUOTA_CHECK_MAX_BYTES_APPROACHING_OP: 213 + if (max) { 214 + if (rvalue >= max) 215 + exceeded = true; 216 + else { 217 + /* 218 + * when we're writing more than 1/16th 219 + * of the available space 220 + */ 221 + exceeded = 222 + (((max - rvalue) >> 4) < delta); 223 + } 224 + } 225 + break; 226 + default: 227 + /* Shouldn't happen */ 228 + pr_warn("Invalid quota check op (%d)\n", op); 229 + exceeded = true; /* Just break the loop */ 230 + } 231 + iput(in); 232 + 233 + next = realm->parent; 234 + if (exceeded || !next) 235 + break; 236 + ceph_get_snap_realm(mdsc, next); 
237 + ceph_put_snap_realm(mdsc, realm); 238 + realm = next; 239 + } 240 + ceph_put_snap_realm(mdsc, realm); /* NOTE(review): realm is still NULL here when i_snap_realm was NULL above (the loop is skipped); confirm ceph_put_snap_realm() tolerates a NULL realm */ 241 + up_read(&mdsc->snap_rwsem); 242 + 243 + return exceeded; 244 + } 245 + 246 + /* 247 + * ceph_quota_is_max_files_exceeded - check if we can create a new file 248 + * @inode: directory where a new file is being created 249 + * 250 + * This function returns true if a max_files quota has already been reached, 251 + * i.e. the quota does not allow a new file to be created; it returns false otherwise. It is necessary to walk through the snaprealm hierarchy (until the 252 + * FS root) to check all realms with quotas set. 253 + */ 254 + bool ceph_quota_is_max_files_exceeded(struct inode *inode) 255 + { 256 + if (!ceph_has_realms_with_quotas(inode)) 257 + return false; 258 + 259 + WARN_ON(!S_ISDIR(inode->i_mode)); 260 + 261 + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_FILES_OP, 0); 262 + } 263 + 264 + /* 265 + * ceph_quota_is_max_bytes_exceeded - check if we can write to a file 266 + * @inode: inode being written 267 + * @newsize: new size if write succeeds 268 + * 269 + * This function returns true if growing the file to @newsize would exceed a 270 + * max_bytes quota; it returns false otherwise. 271 + */ 272 + bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, loff_t newsize) 273 + { 274 + loff_t size = i_size_read(inode); 275 + 276 + if (!ceph_has_realms_with_quotas(inode)) 277 + return false; 278 + 279 + /* return immediately if we're decreasing file size */ 280 + if (newsize <= size) 281 + return false; 282 + 283 + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_OP, (newsize - size)); 284 + } 285 + 286 + /* 287 + * ceph_quota_is_max_bytes_approaching - check if we're reaching max_bytes 288 + * @inode: inode being written 289 + * @newsize: new size if write succeeds 290 + * 291 + * This function returns true if the new file size @newsize will be consuming 292 + * more than 1/16th of the available quota space; it returns false otherwise. 
293 + */ 294 + bool ceph_quota_is_max_bytes_approaching(struct inode *inode, loff_t newsize) 295 + { 296 + loff_t size = ceph_inode(inode)->i_reported_size; 297 + 298 + if (!ceph_has_realms_with_quotas(inode)) 299 + return false; 300 + 301 + /* return immediately if we're decreasing file size */ 302 + if (newsize <= size) 303 + return false; 304 + 305 + return check_quota_exceeded(inode, QUOTA_CHECK_MAX_BYTES_APPROACHING_OP, 306 + (newsize - size)); 307 + } 308 + 309 + /* 310 + * ceph_quota_update_statfs - if root has quota update statfs with quota status 311 + * @fsc: filesystem client instance 312 + * @buf: statfs to update 313 + * 314 + * If the mounted filesystem root has max_bytes quota set, update the filesystem 315 + * statistics with the quota status. 316 + * 317 + * This function returns true if the stats have been updated, false otherwise. 318 + */ 319 + bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) 320 + { 321 + struct ceph_mds_client *mdsc = fsc->mdsc; 322 + struct ceph_inode_info *ci; 323 + struct ceph_snap_realm *realm; 324 + struct inode *in; 325 + u64 total = 0, used, free; 326 + bool is_updated = false; 327 + 328 + down_read(&mdsc->snap_rwsem); 329 + realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root)); 330 + up_read(&mdsc->snap_rwsem); 331 + if (!realm) 332 + return false; 333 + 334 + spin_lock(&realm->inodes_with_caps_lock); 335 + in = realm->inode ? igrab(realm->inode) : NULL; 336 + spin_unlock(&realm->inodes_with_caps_lock); 337 + if (in) { 338 + ci = ceph_inode(in); 339 + spin_lock(&ci->i_ceph_lock); 340 + if (ci->i_max_bytes) { 341 + total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT; 342 + used = ci->i_rbytes >> CEPH_BLOCK_SHIFT; 343 + /* It is possible for a quota to be exceeded. 344 + * Report 'zero' in that case 345 + */ 346 + free = total > used ? 
total - used : 0; 347 + } 348 + spin_unlock(&ci->i_ceph_lock); 349 + if (total) { 350 + buf->f_blocks = total; 351 + buf->f_bfree = free; 352 + buf->f_bavail = free; 353 + is_updated = true; 354 + } 355 + iput(in); 356 + } 357 + ceph_put_snap_realm(mdsc, realm); 358 + 359 + return is_updated; 360 + } 361 +
+2
fs/ceph/snap.c
··· 931 931 list_add(&ci->i_snap_realm_item, 932 932 &realm->inodes_with_caps); 933 933 ci->i_snap_realm = realm; 934 + if (realm->ino == ci->i_vino.ino) 935 + realm->inode = inode; 934 936 spin_unlock(&realm->inodes_with_caps_lock); 935 937 936 938 spin_unlock(&ci->i_ceph_lock);
+39 -11
fs/ceph/super.c
··· 76 76 */ 77 77 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; 78 78 buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; 79 - buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); 80 - buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 81 - buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 79 + 80 + /* 81 + * By default use root quota for stats; fallback to overall filesystem 82 + * usage if using 'noquotadf' mount option or if the root dir doesn't 83 + * have max_bytes quota set. 84 + */ 85 + if (ceph_test_mount_opt(fsc, NOQUOTADF) || 86 + !ceph_quota_update_statfs(fsc, buf)) { 87 + buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); 88 + buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 89 + buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 90 + } 82 91 83 92 buf->f_files = le64_to_cpu(st.num_objects); 84 93 buf->f_ffree = -1; ··· 160 151 Opt_acl, 161 152 #endif 162 153 Opt_noacl, 154 + Opt_quotadf, 155 + Opt_noquotadf, 163 156 }; 164 157 165 158 static match_table_t fsopt_tokens = { ··· 198 187 {Opt_acl, "acl"}, 199 188 #endif 200 189 {Opt_noacl, "noacl"}, 190 + {Opt_quotadf, "quotadf"}, 191 + {Opt_noquotadf, "noquotadf"}, 201 192 {-1, NULL} 202 193 }; 203 194 ··· 327 314 break; 328 315 case Opt_fscache: 329 316 fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; 317 + kfree(fsopt->fscache_uniq); 318 + fsopt->fscache_uniq = NULL; 330 319 break; 331 320 case Opt_nofscache: 332 321 fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; 322 + kfree(fsopt->fscache_uniq); 323 + fsopt->fscache_uniq = NULL; 333 324 break; 334 325 case Opt_poolperm: 335 326 fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; 336 - printk ("pool perm"); 337 327 break; 338 328 case Opt_nopoolperm: 339 329 fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; ··· 346 330 break; 347 331 case Opt_norequire_active_mds: 348 332 fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; 333 + break; 334 + case Opt_quotadf: 335 + fsopt->flags &= ~CEPH_MOUNT_OPT_NOQUOTADF; 336 + break; 337 + case 
Opt_noquotadf: 338 + fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; 349 339 break; 350 340 #ifdef CONFIG_CEPH_FS_POSIX_ACL 351 341 case Opt_acl: ··· 535 513 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) 536 514 seq_puts(m, ",nodcache"); 537 515 if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) { 538 - if (fsopt->fscache_uniq) 539 - seq_printf(m, ",fsc=%s", fsopt->fscache_uniq); 540 - else 541 - seq_puts(m, ",fsc"); 516 + seq_show_option(m, "fsc", fsopt->fscache_uniq); 542 517 } 543 518 if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) 544 519 seq_puts(m, ",nopoolperm"); 520 + if (fsopt->flags & CEPH_MOUNT_OPT_NOQUOTADF) 521 + seq_puts(m, ",noquotadf"); 545 522 546 523 #ifdef CONFIG_CEPH_FS_POSIX_ACL 547 524 if (fsopt->sb_flags & SB_POSIXACL) ··· 550 529 #endif 551 530 552 531 if (fsopt->mds_namespace) 553 - seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace); 532 + seq_show_option(m, "mds_namespace", fsopt->mds_namespace); 554 533 if (fsopt->wsize) 555 534 seq_printf(m, ",wsize=%d", fsopt->wsize); 556 535 if (fsopt->rsize != CEPH_MAX_READ_SIZE) ··· 700 679 struct kmem_cache *ceph_cap_flush_cachep; 701 680 struct kmem_cache *ceph_dentry_cachep; 702 681 struct kmem_cache *ceph_file_cachep; 682 + struct kmem_cache *ceph_dir_file_cachep; 703 683 704 684 static void ceph_inode_init_once(void *foo) 705 685 { ··· 720 698 if (!ceph_inode_cachep) 721 699 return -ENOMEM; 722 700 723 - ceph_cap_cachep = KMEM_CACHE(ceph_cap, 724 - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); 701 + ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD); 725 702 if (!ceph_cap_cachep) 726 703 goto bad_cap; 727 704 ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, ··· 737 716 if (!ceph_file_cachep) 738 717 goto bad_file; 739 718 719 + ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD); 720 + if (!ceph_dir_file_cachep) 721 + goto bad_dir_file; 722 + 740 723 error = ceph_fscache_register(); 741 724 if (error) 742 725 goto bad_fscache; ··· 748 723 return 0; 749 724 750 725 bad_fscache: 726 + 
kmem_cache_destroy(ceph_dir_file_cachep); 727 + bad_dir_file: 751 728 kmem_cache_destroy(ceph_file_cachep); 752 729 bad_file: 753 730 kmem_cache_destroy(ceph_dentry_cachep); ··· 775 748 kmem_cache_destroy(ceph_cap_flush_cachep); 776 749 kmem_cache_destroy(ceph_dentry_cachep); 777 750 kmem_cache_destroy(ceph_file_cachep); 751 + kmem_cache_destroy(ceph_dir_file_cachep); 778 752 779 753 ceph_fscache_unregister(); 780 754 }
+42
fs/ceph/super.h
··· 39 39 #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ 40 40 #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ 41 41 #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ 42 + #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ 42 43 43 44 #define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE 44 45 ··· 310 309 struct timespec i_rctime; 311 310 u64 i_rbytes, i_rfiles, i_rsubdirs; 312 311 u64 i_files, i_subdirs; 312 + 313 + /* quotas */ 314 + u64 i_max_bytes, i_max_files; 313 315 314 316 struct rb_root i_fragtree; 315 317 int i_fragtree_nsplits; ··· 675 671 676 672 spinlock_t rw_contexts_lock; 677 673 struct list_head rw_contexts; 674 + }; 675 + 676 + struct ceph_dir_file_info { 677 + struct ceph_file_info file_info; 678 678 679 679 /* readdir: position within the dir */ 680 680 u32 frag; ··· 756 748 */ 757 749 struct ceph_snap_realm { 758 750 u64 ino; 751 + struct inode *inode; 759 752 atomic_t nref; 760 753 struct rb_node node; 761 754 ··· 1074 1065 /* debugfs.c */ 1075 1066 extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); 1076 1067 extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client); 1068 + 1069 + /* quota.c */ 1070 + static inline bool __ceph_has_any_quota(struct ceph_inode_info *ci) 1071 + { 1072 + return ci->i_max_files || ci->i_max_bytes; 1073 + } 1074 + 1075 + extern void ceph_adjust_quota_realms_count(struct inode *inode, bool inc); 1076 + 1077 + static inline void __ceph_update_quota(struct ceph_inode_info *ci, 1078 + u64 max_bytes, u64 max_files) 1079 + { 1080 + bool had_quota, has_quota; 1081 + had_quota = __ceph_has_any_quota(ci); 1082 + ci->i_max_bytes = max_bytes; 1083 + ci->i_max_files = max_files; 1084 + has_quota = __ceph_has_any_quota(ci); 1085 + 1086 + if (had_quota != has_quota) 1087 + ceph_adjust_quota_realms_count(&ci->vfs_inode, has_quota); 1088 + } 1089 + 1090 + extern void ceph_handle_quota(struct ceph_mds_client *mdsc, 1091 + struct 
ceph_mds_session *session, 1092 + struct ceph_msg *msg); 1093 + extern bool ceph_quota_is_max_files_exceeded(struct inode *inode); 1094 + extern bool ceph_quota_is_same_realm(struct inode *old, struct inode *new); 1095 + extern bool ceph_quota_is_max_bytes_exceeded(struct inode *inode, 1096 + loff_t newlen); 1097 + extern bool ceph_quota_is_max_bytes_approaching(struct inode *inode, 1098 + loff_t newlen); 1099 + extern bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, 1100 + struct kstatfs *buf); 1077 1101 1078 1102 #endif /* _FS_CEPH_SUPER_H */
+44
fs/ceph/xattr.c
··· 224 224 (long)ci->i_rctime.tv_nsec); 225 225 } 226 226 227 + /* quotas */ 228 + 229 + static bool ceph_vxattrcb_quota_exists(struct ceph_inode_info *ci) 230 + { 231 + return (ci->i_max_files || ci->i_max_bytes); 232 + } 233 + 234 + static size_t ceph_vxattrcb_quota(struct ceph_inode_info *ci, char *val, 235 + size_t size) 236 + { 237 + return snprintf(val, size, "max_bytes=%llu max_files=%llu", 238 + ci->i_max_bytes, ci->i_max_files); 239 + } 240 + 241 + static size_t ceph_vxattrcb_quota_max_bytes(struct ceph_inode_info *ci, 242 + char *val, size_t size) 243 + { 244 + return snprintf(val, size, "%llu", ci->i_max_bytes); 245 + } 246 + 247 + static size_t ceph_vxattrcb_quota_max_files(struct ceph_inode_info *ci, 248 + char *val, size_t size) 249 + { 250 + return snprintf(val, size, "%llu", ci->i_max_files); 251 + } 227 252 228 253 #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name 229 254 #define CEPH_XATTR_NAME2(_type, _name, _name2) \ ··· 271 246 .readonly = false, \ 272 247 .hidden = true, \ 273 248 .exists_cb = ceph_vxattrcb_layout_exists, \ 249 + } 250 + #define XATTR_QUOTA_FIELD(_type, _name) \ 251 + { \ 252 + .name = CEPH_XATTR_NAME(_type, _name), \ 253 + .name_size = sizeof(CEPH_XATTR_NAME(_type, _name)), \ 254 + .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ 255 + .readonly = false, \ 256 + .hidden = true, \ 257 + .exists_cb = ceph_vxattrcb_quota_exists, \ 274 258 } 275 259 276 260 static struct ceph_vxattr ceph_dir_vxattrs[] = { ··· 304 270 XATTR_NAME_CEPH(dir, rsubdirs), 305 271 XATTR_NAME_CEPH(dir, rbytes), 306 272 XATTR_NAME_CEPH(dir, rctime), 273 + { 274 + .name = "ceph.quota", 275 + .name_size = sizeof("ceph.quota"), 276 + .getxattr_cb = ceph_vxattrcb_quota, 277 + .readonly = false, 278 + .hidden = true, 279 + .exists_cb = ceph_vxattrcb_quota_exists, 280 + }, 281 + XATTR_QUOTA_FIELD(quota, max_bytes), 282 + XATTR_QUOTA_FIELD(quota, max_files), 307 283 { .name = NULL, 0 } /* Required table terminator */ 308 284 }; 309 
285 static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
+1
include/linux/ceph/ceph_features.h
··· 204 204 CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ 205 205 CEPH_FEATURE_MSGR_KEEPALIVE2 | \ 206 206 CEPH_FEATURE_OSD_POOLRESEND | \ 207 + CEPH_FEATURE_MDS_QUOTA | \ 207 208 CEPH_FEATURE_CRUSH_V4 | \ 208 209 CEPH_FEATURE_NEW_OSDOP_ENCODING | \ 209 210 CEPH_FEATURE_SERVER_JEWEL | \
+17
include/linux/ceph/ceph_fs.h
··· 134 134 #define CEPH_MSG_CLIENT_LEASE 0x311 135 135 #define CEPH_MSG_CLIENT_SNAP 0x312 136 136 #define CEPH_MSG_CLIENT_CAPRELEASE 0x313 137 + #define CEPH_MSG_CLIENT_QUOTA 0x314 137 138 138 139 /* pool ops */ 139 140 #define CEPH_MSG_POOLOP_REPLY 48 ··· 807 806 __le32 num_prior_parent_snaps; 808 807 } __attribute__ ((packed)); 809 808 /* followed by my snap list, then prior parent snap list */ 809 + 810 + /* 811 + * quotas 812 + */ 813 + struct ceph_mds_quota { 814 + __le64 ino; /* ino */ 815 + struct ceph_timespec rctime; 816 + __le64 rbytes; /* dir stats */ 817 + __le64 rfiles; 818 + __le64 rsubdirs; 819 + __u8 struct_v; /* compat */ 820 + __u8 struct_compat; 821 + __le32 struct_len; 822 + __le64 max_bytes; /* quota max. bytes */ 823 + __le64 max_files; /* quota max. files */ 824 + } __attribute__ ((packed)); 810 825 811 826 #endif
+1
include/linux/ceph/libceph.h
··· 262 262 extern struct kmem_cache *ceph_cap_flush_cachep; 263 263 extern struct kmem_cache *ceph_dentry_cachep; 264 264 extern struct kmem_cache *ceph_file_cachep; 265 + extern struct kmem_cache *ceph_dir_file_cachep; 265 266 266 267 /* ceph_common.c */ 267 268 extern bool libceph_compatible(void *data);
+93 -8
include/linux/ceph/messenger.h
··· 76 76 #ifdef CONFIG_BLOCK 77 77 CEPH_MSG_DATA_BIO, /* data source/destination is a bio list */ 78 78 #endif /* CONFIG_BLOCK */ 79 + CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */ 79 80 }; 80 81 81 82 static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) ··· 88 87 #ifdef CONFIG_BLOCK 89 88 case CEPH_MSG_DATA_BIO: 90 89 #endif /* CONFIG_BLOCK */ 90 + case CEPH_MSG_DATA_BVECS: 91 91 return true; 92 92 default: 93 93 return false; 94 94 } 95 95 } 96 + 97 + #ifdef CONFIG_BLOCK 98 + 99 + struct ceph_bio_iter { 100 + struct bio *bio; 101 + struct bvec_iter iter; 102 + }; 103 + 104 + #define __ceph_bio_iter_advance_step(it, n, STEP) do { \ 105 + unsigned int __n = (n), __cur_n; \ 106 + \ 107 + while (__n) { \ 108 + BUG_ON(!(it)->iter.bi_size); \ 109 + __cur_n = min((it)->iter.bi_size, __n); \ 110 + (void)(STEP); \ 111 + bio_advance_iter((it)->bio, &(it)->iter, __cur_n); \ 112 + if (!(it)->iter.bi_size && (it)->bio->bi_next) { \ 113 + dout("__ceph_bio_iter_advance_step next bio\n"); \ 114 + (it)->bio = (it)->bio->bi_next; \ 115 + (it)->iter = (it)->bio->bi_iter; \ 116 + } \ 117 + __n -= __cur_n; \ 118 + } \ 119 + } while (0) 120 + 121 + /* 122 + * Advance @it by @n bytes. 123 + */ 124 + #define ceph_bio_iter_advance(it, n) \ 125 + __ceph_bio_iter_advance_step(it, n, 0) 126 + 127 + /* 128 + * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec. 
129 + */ 130 + #define ceph_bio_iter_advance_step(it, n, BVEC_STEP) \ 131 + __ceph_bio_iter_advance_step(it, n, ({ \ 132 + struct bio_vec bv; \ 133 + struct bvec_iter __cur_iter; \ 134 + \ 135 + __cur_iter = (it)->iter; \ 136 + __cur_iter.bi_size = __cur_n; \ 137 + __bio_for_each_segment(bv, (it)->bio, __cur_iter, __cur_iter) \ 138 + (void)(BVEC_STEP); \ 139 + })) 140 + 141 + #endif /* CONFIG_BLOCK */ 142 + 143 + struct ceph_bvec_iter { 144 + struct bio_vec *bvecs; 145 + struct bvec_iter iter; 146 + }; 147 + 148 + #define __ceph_bvec_iter_advance_step(it, n, STEP) do { \ 149 + BUG_ON((n) > (it)->iter.bi_size); \ 150 + (void)(STEP); \ 151 + bvec_iter_advance((it)->bvecs, &(it)->iter, (n)); \ 152 + } while (0) 153 + 154 + /* 155 + * Advance @it by @n bytes. 156 + */ 157 + #define ceph_bvec_iter_advance(it, n) \ 158 + __ceph_bvec_iter_advance_step(it, n, 0) 159 + 160 + /* 161 + * Advance @it by @n bytes, executing BVEC_STEP for each bio_vec. 162 + */ 163 + #define ceph_bvec_iter_advance_step(it, n, BVEC_STEP) \ 164 + __ceph_bvec_iter_advance_step(it, n, ({ \ 165 + struct bio_vec bv; \ 166 + struct bvec_iter __cur_iter; \ 167 + \ 168 + __cur_iter = (it)->iter; \ 169 + __cur_iter.bi_size = (n); \ 170 + for_each_bvec(bv, (it)->bvecs, __cur_iter, __cur_iter) \ 171 + (void)(BVEC_STEP); \ 172 + })) 173 + 174 + #define ceph_bvec_iter_shorten(it, n) do { \ 175 + BUG_ON((n) > (it)->iter.bi_size); \ 176 + (it)->iter.bi_size = (n); \ 177 + } while (0) 96 178 97 179 struct ceph_msg_data { 98 180 struct list_head links; /* ceph_msg->data */ ··· 183 99 union { 184 100 #ifdef CONFIG_BLOCK 185 101 struct { 186 - struct bio *bio; 187 - size_t bio_length; 102 + struct ceph_bio_iter bio_pos; 103 + u32 bio_length; 188 104 }; 189 105 #endif /* CONFIG_BLOCK */ 106 + struct ceph_bvec_iter bvec_pos; 190 107 struct { 191 108 struct page **pages; /* NOT OWNER. 
*/ 192 109 size_t length; /* total # bytes */ ··· 207 122 bool need_crc; /* crc update needed */ 208 123 union { 209 124 #ifdef CONFIG_BLOCK 210 - struct { /* bio */ 211 - struct bio *bio; /* bio from list */ 212 - struct bvec_iter bvec_iter; 213 - }; 125 + struct ceph_bio_iter bio_iter; 214 126 #endif /* CONFIG_BLOCK */ 127 + struct bvec_iter bvec_iter; 215 128 struct { /* pages */ 216 129 unsigned int page_offset; /* offset in page */ 217 130 unsigned short page_index; /* index in array */ ··· 373 290 extern void ceph_msg_data_add_pagelist(struct ceph_msg *msg, 374 291 struct ceph_pagelist *pagelist); 375 292 #ifdef CONFIG_BLOCK 376 - extern void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, 377 - size_t length); 293 + void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, 294 + u32 length); 378 295 #endif /* CONFIG_BLOCK */ 296 + void ceph_msg_data_add_bvecs(struct ceph_msg *msg, 297 + struct ceph_bvec_iter *bvec_pos); 379 298 380 299 extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, 381 300 bool can_fail);
+14 -5
include/linux/ceph/osd_client.h
··· 57 57 #ifdef CONFIG_BLOCK 58 58 CEPH_OSD_DATA_TYPE_BIO, 59 59 #endif /* CONFIG_BLOCK */ 60 + CEPH_OSD_DATA_TYPE_BVECS, 60 61 }; 61 62 62 63 struct ceph_osd_data { ··· 73 72 struct ceph_pagelist *pagelist; 74 73 #ifdef CONFIG_BLOCK 75 74 struct { 76 - struct bio *bio; /* list of bios */ 77 - size_t bio_length; /* total in list */ 75 + struct ceph_bio_iter bio_pos; 76 + u32 bio_length; 78 77 }; 79 78 #endif /* CONFIG_BLOCK */ 79 + struct ceph_bvec_iter bvec_pos; 80 80 }; 81 81 }; 82 82 ··· 407 405 unsigned int which, 408 406 struct ceph_pagelist *pagelist); 409 407 #ifdef CONFIG_BLOCK 410 - extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, 411 - unsigned int which, 412 - struct bio *bio, size_t bio_length); 408 + void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, 409 + unsigned int which, 410 + struct ceph_bio_iter *bio_pos, 411 + u32 bio_length); 413 412 #endif /* CONFIG_BLOCK */ 413 + void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, 414 + unsigned int which, 415 + struct ceph_bvec_iter *bvec_pos); 414 416 415 417 extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, 416 418 unsigned int which, ··· 424 418 struct page **pages, u64 length, 425 419 u32 alignment, bool pages_from_pool, 426 420 bool own_pages); 421 + void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, 422 + unsigned int which, 423 + struct bio_vec *bvecs, u32 bytes); 427 424 extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, 428 425 unsigned int which, 429 426 struct page **pages, u64 length,
-6
include/linux/ceph/osdmap.h
··· 5 5 #include <linux/rbtree.h> 6 6 #include <linux/ceph/types.h> 7 7 #include <linux/ceph/decode.h> 8 - #include <linux/ceph/ceph_fs.h> 9 8 #include <linux/crush/crush.h> 10 9 11 10 /* ··· 278 279 bool ceph_osds_changed(const struct ceph_osds *old_acting, 279 280 const struct ceph_osds *new_acting, 280 281 bool any_change); 281 - 282 - /* calculate mapping of a file extent to an object */ 283 - extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 284 - u64 off, u64 len, 285 - u64 *bno, u64 *oxoff, u64 *oxlen); 286 282 287 283 int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, 288 284 const struct ceph_object_id *oid,
+69
include/linux/ceph/striper.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_CEPH_STRIPER_H 3 + #define _LINUX_CEPH_STRIPER_H 4 + 5 + #include <linux/list.h> 6 + #include <linux/types.h> 7 + 8 + struct ceph_file_layout; 9 + 10 + void ceph_calc_file_object_mapping(struct ceph_file_layout *l, 11 + u64 off, u64 len, 12 + u64 *objno, u64 *objoff, u32 *xlen); 13 + 14 + struct ceph_object_extent { 15 + struct list_head oe_item; 16 + u64 oe_objno; 17 + u64 oe_off; 18 + u64 oe_len; 19 + }; 20 + 21 + static inline void ceph_object_extent_init(struct ceph_object_extent *ex) 22 + { 23 + INIT_LIST_HEAD(&ex->oe_item); 24 + } 25 + 26 + /* 27 + * Called for each mapped stripe unit. 28 + * 29 + * @bytes: number of bytes mapped, i.e. the minimum of the full length 30 + * requested (file extent length) or the remainder of the stripe 31 + * unit within an object 32 + */ 33 + typedef void (*ceph_object_extent_fn_t)(struct ceph_object_extent *ex, 34 + u32 bytes, void *arg); 35 + 36 + int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len, 37 + struct list_head *object_extents, 38 + struct ceph_object_extent *alloc_fn(void *arg), 39 + void *alloc_arg, 40 + ceph_object_extent_fn_t action_fn, 41 + void *action_arg); 42 + int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len, 43 + struct list_head *object_extents, 44 + ceph_object_extent_fn_t action_fn, 45 + void *action_arg); 46 + 47 + struct ceph_file_extent { 48 + u64 fe_off; 49 + u64 fe_len; 50 + }; 51 + 52 + static inline u64 ceph_file_extents_bytes(struct ceph_file_extent *file_extents, 53 + u32 num_file_extents) 54 + { 55 + u64 bytes = 0; 56 + u32 i; 57 + 58 + for (i = 0; i < num_file_extents; i++) 59 + bytes += file_extents[i].fe_len; 60 + 61 + return bytes; 62 + } 63 + 64 + int ceph_extent_to_file(struct ceph_file_layout *l, 65 + u64 objno, u64 objoff, u64 objlen, 66 + struct ceph_file_extent **file_extents, 67 + u32 *num_file_extents); 68 + 69 + #endif
+1
net/ceph/Makefile
··· 8 8 mon_client.o \ 9 9 cls_lock_client.o \ 10 10 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ 11 + striper.o \ 11 12 debugfs.o \ 12 13 auth.o auth_none.o \ 13 14 crypto.o armor.o \
+7 -1
net/ceph/ceph_common.c
··· 72 72 case CEPH_MSG_MON_GET_VERSION: return "mon_get_version"; 73 73 case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply"; 74 74 case CEPH_MSG_MDS_MAP: return "mds_map"; 75 + case CEPH_MSG_FS_MAP_USER: return "fs_map_user"; 75 76 case CEPH_MSG_CLIENT_SESSION: return "client_session"; 76 77 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect"; 77 78 case CEPH_MSG_CLIENT_REQUEST: return "client_request"; ··· 80 79 case CEPH_MSG_CLIENT_REPLY: return "client_reply"; 81 80 case CEPH_MSG_CLIENT_CAPS: return "client_caps"; 82 81 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release"; 82 + case CEPH_MSG_CLIENT_QUOTA: return "client_quota"; 83 83 case CEPH_MSG_CLIENT_SNAP: return "client_snap"; 84 84 case CEPH_MSG_CLIENT_LEASE: return "client_lease"; 85 + case CEPH_MSG_POOLOP_REPLY: return "poolop_reply"; 86 + case CEPH_MSG_POOLOP: return "poolop"; 87 + case CEPH_MSG_MON_COMMAND: return "mon_command"; 88 + case CEPH_MSG_MON_COMMAND_ACK: return "mon_command_ack"; 85 89 case CEPH_MSG_OSD_MAP: return "osd_map"; 86 90 case CEPH_MSG_OSD_OP: return "osd_op"; 87 91 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; ··· 223 217 224 218 if (i == 16) 225 219 err = 0; 226 - dout("parse_fsid ret %d got fsid %pU", err, fsid); 220 + dout("parse_fsid ret %d got fsid %pU\n", err, fsid); 227 221 return err; 228 222 } 229 223
+4 -2
net/ceph/crypto.c
··· 347 347 .destroy = ceph_key_destroy, 348 348 }; 349 349 350 - int ceph_crypto_init(void) { 350 + int __init ceph_crypto_init(void) 351 + { 351 352 return register_key_type(&key_type_ceph); 352 353 } 353 354 354 - void ceph_crypto_shutdown(void) { 355 + void ceph_crypto_shutdown(void) 356 + { 355 357 unregister_key_type(&key_type_ceph); 356 358 }
+7 -10
net/ceph/debugfs.c
··· 389 389 CEPH_DEFINE_SHOW_FUNC(osdc_show) 390 390 CEPH_DEFINE_SHOW_FUNC(client_options_show) 391 391 392 - int ceph_debugfs_init(void) 392 + int __init ceph_debugfs_init(void) 393 393 { 394 394 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); 395 395 if (!ceph_debugfs_dir) ··· 418 418 goto out; 419 419 420 420 client->monc.debugfs_file = debugfs_create_file("monc", 421 - 0600, 421 + 0400, 422 422 client->debugfs_dir, 423 423 client, 424 424 &monc_show_fops); ··· 426 426 goto out; 427 427 428 428 client->osdc.debugfs_file = debugfs_create_file("osdc", 429 - 0600, 429 + 0400, 430 430 client->debugfs_dir, 431 431 client, 432 432 &osdc_show_fops); ··· 434 434 goto out; 435 435 436 436 client->debugfs_monmap = debugfs_create_file("monmap", 437 - 0600, 437 + 0400, 438 438 client->debugfs_dir, 439 439 client, 440 440 &monmap_show_fops); ··· 442 442 goto out; 443 443 444 444 client->debugfs_osdmap = debugfs_create_file("osdmap", 445 - 0600, 445 + 0400, 446 446 client->debugfs_dir, 447 447 client, 448 448 &osdmap_show_fops); ··· 450 450 goto out; 451 451 452 452 client->debugfs_options = debugfs_create_file("client_options", 453 - 0600, 453 + 0400, 454 454 client->debugfs_dir, 455 455 client, 456 456 &client_options_show_fops); ··· 477 477 478 478 #else /* CONFIG_DEBUG_FS */ 479 479 480 - int ceph_debugfs_init(void) 480 + int __init ceph_debugfs_init(void) 481 481 { 482 482 return 0; 483 483 } ··· 496 496 } 497 497 498 498 #endif /* CONFIG_DEBUG_FS */ 499 - 500 - EXPORT_SYMBOL(ceph_debugfs_init); 501 - EXPORT_SYMBOL(ceph_debugfs_cleanup);
+122 -72
net/ceph/messenger.c
··· 277 277 ceph_msgr_slab_exit(); 278 278 } 279 279 280 - int ceph_msgr_init(void) 280 + int __init ceph_msgr_init(void) 281 281 { 282 282 if (ceph_msgr_slab_init()) 283 283 return -ENOMEM; ··· 299 299 300 300 return -ENOMEM; 301 301 } 302 - EXPORT_SYMBOL(ceph_msgr_init); 303 302 304 303 void ceph_msgr_exit(void) 305 304 { ··· 306 307 307 308 _ceph_msgr_exit(); 308 309 } 309 - EXPORT_SYMBOL(ceph_msgr_exit); 310 310 311 311 void ceph_msgr_flush(void) 312 312 { ··· 837 839 size_t length) 838 840 { 839 841 struct ceph_msg_data *data = cursor->data; 840 - struct bio *bio; 842 + struct ceph_bio_iter *it = &cursor->bio_iter; 841 843 842 - BUG_ON(data->type != CEPH_MSG_DATA_BIO); 844 + cursor->resid = min_t(size_t, length, data->bio_length); 845 + *it = data->bio_pos; 846 + if (cursor->resid < it->iter.bi_size) 847 + it->iter.bi_size = cursor->resid; 843 848 844 - bio = data->bio; 845 - BUG_ON(!bio); 846 - 847 - cursor->resid = min(length, data->bio_length); 848 - cursor->bio = bio; 849 - cursor->bvec_iter = bio->bi_iter; 850 - cursor->last_piece = 851 - cursor->resid <= bio_iter_len(bio, cursor->bvec_iter); 849 + BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); 850 + cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); 852 851 } 853 852 854 853 static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, 855 854 size_t *page_offset, 856 855 size_t *length) 857 856 { 858 - struct ceph_msg_data *data = cursor->data; 859 - struct bio *bio; 860 - struct bio_vec bio_vec; 857 + struct bio_vec bv = bio_iter_iovec(cursor->bio_iter.bio, 858 + cursor->bio_iter.iter); 861 859 862 - BUG_ON(data->type != CEPH_MSG_DATA_BIO); 863 - 864 - bio = cursor->bio; 865 - BUG_ON(!bio); 866 - 867 - bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); 868 - 869 - *page_offset = (size_t) bio_vec.bv_offset; 870 - BUG_ON(*page_offset >= PAGE_SIZE); 871 - if (cursor->last_piece) /* pagelist offset is always 0 */ 872 - *length = cursor->resid; 873 - else 874 - 
*length = (size_t) bio_vec.bv_len; 875 - BUG_ON(*length > cursor->resid); 876 - BUG_ON(*page_offset + *length > PAGE_SIZE); 877 - 878 - return bio_vec.bv_page; 860 + *page_offset = bv.bv_offset; 861 + *length = bv.bv_len; 862 + return bv.bv_page; 879 863 } 880 864 881 865 static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, 882 866 size_t bytes) 883 867 { 884 - struct bio *bio; 885 - struct bio_vec bio_vec; 868 + struct ceph_bio_iter *it = &cursor->bio_iter; 886 869 887 - BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); 888 - 889 - bio = cursor->bio; 890 - BUG_ON(!bio); 891 - 892 - bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); 893 - 894 - /* Advance the cursor offset */ 895 - 896 - BUG_ON(cursor->resid < bytes); 870 + BUG_ON(bytes > cursor->resid); 871 + BUG_ON(bytes > bio_iter_len(it->bio, it->iter)); 897 872 cursor->resid -= bytes; 873 + bio_advance_iter(it->bio, &it->iter, bytes); 898 874 899 - bio_advance_iter(bio, &cursor->bvec_iter, bytes); 875 + if (!cursor->resid) { 876 + BUG_ON(!cursor->last_piece); 877 + return false; /* no more data */ 878 + } 900 879 901 - if (bytes < bio_vec.bv_len) 880 + if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done)) 902 881 return false; /* more bytes to process in this segment */ 903 882 904 - /* Move on to the next segment, and possibly the next bio */ 905 - 906 - if (!cursor->bvec_iter.bi_size) { 907 - bio = bio->bi_next; 908 - cursor->bio = bio; 909 - if (bio) 910 - cursor->bvec_iter = bio->bi_iter; 911 - else 912 - memset(&cursor->bvec_iter, 0, 913 - sizeof(cursor->bvec_iter)); 883 + if (!it->iter.bi_size) { 884 + it->bio = it->bio->bi_next; 885 + it->iter = it->bio->bi_iter; 886 + if (cursor->resid < it->iter.bi_size) 887 + it->iter.bi_size = cursor->resid; 914 888 } 915 889 916 - if (!cursor->last_piece) { 917 - BUG_ON(!cursor->resid); 918 - BUG_ON(!bio); 919 - /* A short read is OK, so use <= rather than == */ 920 - if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter)) 921 - 
cursor->last_piece = true; 922 - } 923 - 890 + BUG_ON(cursor->last_piece); 891 + BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); 892 + cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); 924 893 return true; 925 894 } 926 895 #endif /* CONFIG_BLOCK */ 896 + 897 + static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor, 898 + size_t length) 899 + { 900 + struct ceph_msg_data *data = cursor->data; 901 + struct bio_vec *bvecs = data->bvec_pos.bvecs; 902 + 903 + cursor->resid = min_t(size_t, length, data->bvec_pos.iter.bi_size); 904 + cursor->bvec_iter = data->bvec_pos.iter; 905 + cursor->bvec_iter.bi_size = cursor->resid; 906 + 907 + BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); 908 + cursor->last_piece = 909 + cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter); 910 + } 911 + 912 + static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor, 913 + size_t *page_offset, 914 + size_t *length) 915 + { 916 + struct bio_vec bv = bvec_iter_bvec(cursor->data->bvec_pos.bvecs, 917 + cursor->bvec_iter); 918 + 919 + *page_offset = bv.bv_offset; 920 + *length = bv.bv_len; 921 + return bv.bv_page; 922 + } 923 + 924 + static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor, 925 + size_t bytes) 926 + { 927 + struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs; 928 + 929 + BUG_ON(bytes > cursor->resid); 930 + BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter)); 931 + cursor->resid -= bytes; 932 + bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes); 933 + 934 + if (!cursor->resid) { 935 + BUG_ON(!cursor->last_piece); 936 + return false; /* no more data */ 937 + } 938 + 939 + if (!bytes || cursor->bvec_iter.bi_bvec_done) 940 + return false; /* more bytes to process in this segment */ 941 + 942 + BUG_ON(cursor->last_piece); 943 + BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); 944 + cursor->last_piece = 945 + cursor->resid == bvec_iter_len(bvecs, 
cursor->bvec_iter); 946 + return true; 947 + } 927 948 928 949 /* 929 950 * For a page array, a piece comes from the first page in the array ··· 1127 1110 ceph_msg_data_bio_cursor_init(cursor, length); 1128 1111 break; 1129 1112 #endif /* CONFIG_BLOCK */ 1113 + case CEPH_MSG_DATA_BVECS: 1114 + ceph_msg_data_bvecs_cursor_init(cursor, length); 1115 + break; 1130 1116 case CEPH_MSG_DATA_NONE: 1131 1117 default: 1132 1118 /* BUG(); */ ··· 1178 1158 page = ceph_msg_data_bio_next(cursor, page_offset, length); 1179 1159 break; 1180 1160 #endif /* CONFIG_BLOCK */ 1161 + case CEPH_MSG_DATA_BVECS: 1162 + page = ceph_msg_data_bvecs_next(cursor, page_offset, length); 1163 + break; 1181 1164 case CEPH_MSG_DATA_NONE: 1182 1165 default: 1183 1166 page = NULL; 1184 1167 break; 1185 1168 } 1169 + 1186 1170 BUG_ON(!page); 1187 1171 BUG_ON(*page_offset + *length > PAGE_SIZE); 1188 1172 BUG_ON(!*length); 1173 + BUG_ON(*length > cursor->resid); 1189 1174 if (last_piece) 1190 1175 *last_piece = cursor->last_piece; 1191 1176 ··· 1219 1194 new_piece = ceph_msg_data_bio_advance(cursor, bytes); 1220 1195 break; 1221 1196 #endif /* CONFIG_BLOCK */ 1197 + case CEPH_MSG_DATA_BVECS: 1198 + new_piece = ceph_msg_data_bvecs_advance(cursor, bytes); 1199 + break; 1222 1200 case CEPH_MSG_DATA_NONE: 1223 1201 default: 1224 1202 BUG(); ··· 1603 1575 * been revoked, so use the zero page. 1604 1576 */ 1605 1577 crc = do_datacrc ? 
le32_to_cpu(msg->footer.data_crc) : 0; 1606 - while (cursor->resid) { 1578 + while (cursor->total_resid) { 1607 1579 struct page *page; 1608 1580 size_t page_offset; 1609 1581 size_t length; 1610 1582 bool last_piece; 1611 1583 int ret; 1584 + 1585 + if (!cursor->resid) { 1586 + ceph_msg_data_advance(cursor, 0); 1587 + continue; 1588 + } 1612 1589 1613 1590 page = ceph_msg_data_next(cursor, &page_offset, &length, 1614 1591 &last_piece); ··· 2330 2297 2331 2298 if (do_datacrc) 2332 2299 crc = con->in_data_crc; 2333 - while (cursor->resid) { 2300 + while (cursor->total_resid) { 2301 + if (!cursor->resid) { 2302 + ceph_msg_data_advance(cursor, 0); 2303 + continue; 2304 + } 2305 + 2334 2306 page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); 2335 2307 ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); 2336 2308 if (ret <= 0) { ··· 3300 3262 EXPORT_SYMBOL(ceph_msg_data_add_pagelist); 3301 3263 3302 3264 #ifdef CONFIG_BLOCK 3303 - void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, 3304 - size_t length) 3265 + void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, 3266 + u32 length) 3305 3267 { 3306 3268 struct ceph_msg_data *data; 3307 3269 3308 - BUG_ON(!bio); 3309 - 3310 3270 data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); 3311 3271 BUG_ON(!data); 3312 - data->bio = bio; 3272 + data->bio_pos = *bio_pos; 3313 3273 data->bio_length = length; 3314 3274 3315 3275 list_add_tail(&data->links, &msg->data); ··· 3315 3279 } 3316 3280 EXPORT_SYMBOL(ceph_msg_data_add_bio); 3317 3281 #endif /* CONFIG_BLOCK */ 3282 + 3283 + void ceph_msg_data_add_bvecs(struct ceph_msg *msg, 3284 + struct ceph_bvec_iter *bvec_pos) 3285 + { 3286 + struct ceph_msg_data *data; 3287 + 3288 + data = ceph_msg_data_create(CEPH_MSG_DATA_BVECS); 3289 + BUG_ON(!data); 3290 + data->bvec_pos = *bvec_pos; 3291 + 3292 + list_add_tail(&data->links, &msg->data); 3293 + msg->data_length += bvec_pos->iter.bi_size; 3294 + } 3295 + 
EXPORT_SYMBOL(ceph_msg_data_add_bvecs); 3318 3296 3319 3297 /* 3320 3298 * construct a new message with given type, size
+1 -1
net/ceph/mon_client.c
··· 60 60 num_mon = ceph_decode_32(&p); 61 61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); 62 62 63 - if (num_mon >= CEPH_MAX_MON) 63 + if (num_mon > CEPH_MAX_MON) 64 64 goto bad; 65 65 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); 66 66 if (m == NULL)
+53 -14
net/ceph/osd_client.c
··· 20 20 #include <linux/ceph/decode.h> 21 21 #include <linux/ceph/auth.h> 22 22 #include <linux/ceph/pagelist.h> 23 + #include <linux/ceph/striper.h> 23 24 24 25 #define OSD_OPREPLY_FRONT_LEN 512 25 26 ··· 104 103 u64 *objnum, u64 *objoff, u64 *objlen) 105 104 { 106 105 u64 orig_len = *plen; 107 - int r; 106 + u32 xlen; 108 107 109 108 /* object extent? */ 110 - r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum, 111 - objoff, objlen); 112 - if (r < 0) 113 - return r; 109 + ceph_calc_file_object_mapping(layout, off, orig_len, objnum, 110 + objoff, &xlen); 111 + *objlen = xlen; 114 112 if (*objlen < orig_len) { 115 113 *plen = *objlen; 116 114 dout(" skipping last %llu, final file extent %llu~%llu\n", ··· 117 117 } 118 118 119 119 dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen); 120 - 121 120 return 0; 122 121 } 123 122 ··· 147 148 148 149 #ifdef CONFIG_BLOCK 149 150 static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, 150 - struct bio *bio, size_t bio_length) 151 + struct ceph_bio_iter *bio_pos, 152 + u32 bio_length) 151 153 { 152 154 osd_data->type = CEPH_OSD_DATA_TYPE_BIO; 153 - osd_data->bio = bio; 155 + osd_data->bio_pos = *bio_pos; 154 156 osd_data->bio_length = bio_length; 155 157 } 156 158 #endif /* CONFIG_BLOCK */ 159 + 160 + static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data, 161 + struct ceph_bvec_iter *bvec_pos) 162 + { 163 + osd_data->type = CEPH_OSD_DATA_TYPE_BVECS; 164 + osd_data->bvec_pos = *bvec_pos; 165 + } 157 166 158 167 #define osd_req_op_data(oreq, whch, typ, fld) \ 159 168 ({ \ ··· 225 218 226 219 #ifdef CONFIG_BLOCK 227 220 void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, 228 - unsigned int which, struct bio *bio, size_t bio_length) 221 + unsigned int which, 222 + struct ceph_bio_iter *bio_pos, 223 + u32 bio_length) 229 224 { 230 225 struct ceph_osd_data *osd_data; 231 226 232 227 osd_data = osd_req_op_data(osd_req, which, extent, osd_data); 233 - 
ceph_osd_data_bio_init(osd_data, bio, bio_length); 228 + ceph_osd_data_bio_init(osd_data, bio_pos, bio_length); 234 229 } 235 230 EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); 236 231 #endif /* CONFIG_BLOCK */ 232 + 233 + void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, 234 + unsigned int which, 235 + struct ceph_bvec_iter *bvec_pos) 236 + { 237 + struct ceph_osd_data *osd_data; 238 + 239 + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); 240 + ceph_osd_data_bvecs_init(osd_data, bvec_pos); 241 + } 242 + EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos); 237 243 238 244 static void osd_req_op_cls_request_info_pagelist( 239 245 struct ceph_osd_request *osd_req, ··· 285 265 } 286 266 EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); 287 267 268 + void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, 269 + unsigned int which, 270 + struct bio_vec *bvecs, u32 bytes) 271 + { 272 + struct ceph_osd_data *osd_data; 273 + struct ceph_bvec_iter it = { 274 + .bvecs = bvecs, 275 + .iter = { .bi_size = bytes }, 276 + }; 277 + 278 + osd_data = osd_req_op_data(osd_req, which, cls, request_data); 279 + ceph_osd_data_bvecs_init(osd_data, &it); 280 + osd_req->r_ops[which].cls.indata_len += bytes; 281 + osd_req->r_ops[which].indata_len += bytes; 282 + } 283 + EXPORT_SYMBOL(osd_req_op_cls_request_data_bvecs); 284 + 288 285 void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, 289 286 unsigned int which, struct page **pages, u64 length, 290 287 u32 alignment, bool pages_from_pool, bool own_pages) ··· 327 290 case CEPH_OSD_DATA_TYPE_BIO: 328 291 return (u64)osd_data->bio_length; 329 292 #endif /* CONFIG_BLOCK */ 293 + case CEPH_OSD_DATA_TYPE_BVECS: 294 + return osd_data->bvec_pos.iter.bi_size; 330 295 default: 331 296 WARN(true, "unrecognized data type %d\n", (int)osd_data->type); 332 297 return 0; ··· 867 828 ceph_msg_data_add_pagelist(msg, osd_data->pagelist); 868 829 #ifdef CONFIG_BLOCK 869 830 } else if 
(osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { 870 - ceph_msg_data_add_bio(msg, osd_data->bio, length); 831 + ceph_msg_data_add_bio(msg, &osd_data->bio_pos, length); 871 832 #endif 833 + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) { 834 + ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos); 872 835 } else { 873 836 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); 874 837 } ··· 5106 5065 } 5107 5066 EXPORT_SYMBOL(ceph_osdc_writepages); 5108 5067 5109 - int ceph_osdc_setup(void) 5068 + int __init ceph_osdc_setup(void) 5110 5069 { 5111 5070 size_t size = sizeof(struct ceph_osd_request) + 5112 5071 CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op); ··· 5117 5076 5118 5077 return ceph_osd_request_cache ? 0 : -ENOMEM; 5119 5078 } 5120 - EXPORT_SYMBOL(ceph_osdc_setup); 5121 5079 5122 5080 void ceph_osdc_cleanup(void) 5123 5081 { ··· 5124 5084 kmem_cache_destroy(ceph_osd_request_cache); 5125 5085 ceph_osd_request_cache = NULL; 5126 5086 } 5127 - EXPORT_SYMBOL(ceph_osdc_cleanup); 5128 5087 5129 5088 /* 5130 5089 * handle incoming message
-71
net/ceph/osdmap.c
··· 4 4 5 5 #include <linux/module.h> 6 6 #include <linux/slab.h> 7 - #include <asm/div64.h> 8 7 9 8 #include <linux/ceph/libceph.h> 10 9 #include <linux/ceph/osdmap.h> ··· 2138 2139 2139 2140 return false; 2140 2141 } 2141 - 2142 - /* 2143 - * calculate file layout from given offset, length. 2144 - * fill in correct oid, logical length, and object extent 2145 - * offset, length. 2146 - * 2147 - * for now, we write only a single su, until we can 2148 - * pass a stride back to the caller. 2149 - */ 2150 - int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, 2151 - u64 off, u64 len, 2152 - u64 *ono, 2153 - u64 *oxoff, u64 *oxlen) 2154 - { 2155 - u32 osize = layout->object_size; 2156 - u32 su = layout->stripe_unit; 2157 - u32 sc = layout->stripe_count; 2158 - u32 bl, stripeno, stripepos, objsetno; 2159 - u32 su_per_object; 2160 - u64 t, su_offset; 2161 - 2162 - dout("mapping %llu~%llu osize %u fl_su %u\n", off, len, 2163 - osize, su); 2164 - if (su == 0 || sc == 0) 2165 - goto invalid; 2166 - su_per_object = osize / su; 2167 - if (su_per_object == 0) 2168 - goto invalid; 2169 - dout("osize %u / su %u = su_per_object %u\n", osize, su, 2170 - su_per_object); 2171 - 2172 - if ((su & ~PAGE_MASK) != 0) 2173 - goto invalid; 2174 - 2175 - /* bl = *off / su; */ 2176 - t = off; 2177 - do_div(t, su); 2178 - bl = t; 2179 - dout("off %llu / su %u = bl %u\n", off, su, bl); 2180 - 2181 - stripeno = bl / sc; 2182 - stripepos = bl % sc; 2183 - objsetno = stripeno / su_per_object; 2184 - 2185 - *ono = objsetno * sc + stripepos; 2186 - dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono); 2187 - 2188 - /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ 2189 - t = off; 2190 - su_offset = do_div(t, su); 2191 - *oxoff = su_offset + (stripeno % su_per_object) * su; 2192 - 2193 - /* 2194 - * Calculate the length of the extent being written to the selected 2195 - * object. 
This is the minimum of the full length requested (len) or 2196 - * the remainder of the current stripe being written to. 2197 - */ 2198 - *oxlen = min_t(u64, len, su - su_offset); 2199 - 2200 - dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); 2201 - return 0; 2202 - 2203 - invalid: 2204 - dout(" invalid layout\n"); 2205 - *ono = 0; 2206 - *oxoff = 0; 2207 - *oxlen = 0; 2208 - return -EINVAL; 2209 - } 2210 - EXPORT_SYMBOL(ceph_calc_file_object_mapping); 2211 2142 2212 2143 /* 2213 2144 * Map an object into a PG.
+261
net/ceph/striper.c
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + #include <linux/ceph/ceph_debug.h> 4 + 5 + #include <linux/math64.h> 6 + #include <linux/slab.h> 7 + 8 + #include <linux/ceph/striper.h> 9 + #include <linux/ceph/types.h> 10 + 11 + /* 12 + * Map a file extent to a stripe unit within an object. 13 + * Fill in objno, offset into object, and object extent length (i.e. the 14 + * number of bytes mapped, less than or equal to @l->stripe_unit). 15 + * 16 + * Example for stripe_count = 3, stripes_per_object = 4: 17 + * 18 + * blockno | 0 3 6 9 | 1 4 7 10 | 2 5 8 11 | 12 15 18 21 | 13 16 19 19 + * stripeno | 0 1 2 3 | 0 1 2 3 | 0 1 2 3 | 4 5 6 7 | 4 5 6 20 + * stripepos | 0 | 1 | 2 | 0 | 1 21 + * objno | 0 | 1 | 2 | 3 | 4 22 + * objsetno | 0 | 1 23 + */ 24 + void ceph_calc_file_object_mapping(struct ceph_file_layout *l, 25 + u64 off, u64 len, 26 + u64 *objno, u64 *objoff, u32 *xlen) 27 + { 28 + u32 stripes_per_object = l->object_size / l->stripe_unit; 29 + u64 blockno; /* which su in the file (i.e. globally) */ 30 + u32 blockoff; /* offset into su */ 31 + u64 stripeno; /* which stripe */ 32 + u32 stripepos; /* which su in the stripe, 33 + which object in the object set */ 34 + u64 objsetno; /* which object set */ 35 + u32 objsetpos; /* which stripe in the object set */ 36 + 37 + blockno = div_u64_rem(off, l->stripe_unit, &blockoff); 38 + stripeno = div_u64_rem(blockno, l->stripe_count, &stripepos); 39 + objsetno = div_u64_rem(stripeno, stripes_per_object, &objsetpos); 40 + 41 + *objno = objsetno * l->stripe_count + stripepos; 42 + *objoff = objsetpos * l->stripe_unit + blockoff; 43 + *xlen = min_t(u64, len, l->stripe_unit - blockoff); 44 + } 45 + EXPORT_SYMBOL(ceph_calc_file_object_mapping); 46 + 47 + /* 48 + * Return the last extent with given objno (@object_extents is sorted 49 + * by objno). If not found, return NULL and set @add_pos so that the 50 + * new extent can be added with list_add(add_pos, new_ex). 
51 + */ 52 + static struct ceph_object_extent * 53 + lookup_last(struct list_head *object_extents, u64 objno, 54 + struct list_head **add_pos) 55 + { 56 + struct list_head *pos; 57 + 58 + list_for_each_prev(pos, object_extents) { 59 + struct ceph_object_extent *ex = 60 + list_entry(pos, typeof(*ex), oe_item); 61 + 62 + if (ex->oe_objno == objno) 63 + return ex; 64 + 65 + if (ex->oe_objno < objno) 66 + break; 67 + } 68 + 69 + *add_pos = pos; 70 + return NULL; 71 + } 72 + 73 + static struct ceph_object_extent * 74 + lookup_containing(struct list_head *object_extents, u64 objno, 75 + u64 objoff, u32 xlen) 76 + { 77 + struct ceph_object_extent *ex; 78 + 79 + list_for_each_entry(ex, object_extents, oe_item) { 80 + if (ex->oe_objno == objno && 81 + ex->oe_off <= objoff && 82 + ex->oe_off + ex->oe_len >= objoff + xlen) /* paranoia */ 83 + return ex; 84 + 85 + if (ex->oe_objno > objno) 86 + break; 87 + } 88 + 89 + return NULL; 90 + } 91 + 92 + /* 93 + * Map a file extent to a sorted list of object extents. 94 + * 95 + * We want only one (or as few as possible) object extents per object. 96 + * Adjacent object extents will be merged together, each returned object 97 + * extent may reverse map to multiple different file extents. 98 + * 99 + * Call @alloc_fn for each new object extent and @action_fn for each 100 + * mapped stripe unit, whether it was merged into an already allocated 101 + * object extent or started a new object extent. 102 + * 103 + * Newly allocated object extents are added to @object_extents. 104 + * To keep @object_extents sorted, successive calls to this function 105 + * must map successive file extents (i.e. the list of file extents that 106 + * are mapped using the same @object_extents must be sorted). 107 + * 108 + * The caller is responsible for @object_extents. 
109 + */ 110 + int ceph_file_to_extents(struct ceph_file_layout *l, u64 off, u64 len, 111 + struct list_head *object_extents, 112 + struct ceph_object_extent *alloc_fn(void *arg), 113 + void *alloc_arg, 114 + ceph_object_extent_fn_t action_fn, 115 + void *action_arg) 116 + { 117 + struct ceph_object_extent *last_ex, *ex; 118 + 119 + while (len) { 120 + struct list_head *add_pos = NULL; 121 + u64 objno, objoff; 122 + u32 xlen; 123 + 124 + ceph_calc_file_object_mapping(l, off, len, &objno, &objoff, 125 + &xlen); 126 + 127 + last_ex = lookup_last(object_extents, objno, &add_pos); 128 + if (!last_ex || last_ex->oe_off + last_ex->oe_len != objoff) { 129 + ex = alloc_fn(alloc_arg); 130 + if (!ex) 131 + return -ENOMEM; 132 + 133 + ex->oe_objno = objno; 134 + ex->oe_off = objoff; 135 + ex->oe_len = xlen; 136 + if (action_fn) 137 + action_fn(ex, xlen, action_arg); 138 + 139 + if (!last_ex) 140 + list_add(&ex->oe_item, add_pos); 141 + else 142 + list_add(&ex->oe_item, &last_ex->oe_item); 143 + } else { 144 + last_ex->oe_len += xlen; 145 + if (action_fn) 146 + action_fn(last_ex, xlen, action_arg); 147 + } 148 + 149 + off += xlen; 150 + len -= xlen; 151 + } 152 + 153 + for (last_ex = list_first_entry(object_extents, typeof(*ex), oe_item), 154 + ex = list_next_entry(last_ex, oe_item); 155 + &ex->oe_item != object_extents; 156 + last_ex = ex, ex = list_next_entry(ex, oe_item)) { 157 + if (last_ex->oe_objno > ex->oe_objno || 158 + (last_ex->oe_objno == ex->oe_objno && 159 + last_ex->oe_off + last_ex->oe_len >= ex->oe_off)) { 160 + WARN(1, "%s: object_extents list not sorted!\n", 161 + __func__); 162 + return -EINVAL; 163 + } 164 + } 165 + 166 + return 0; 167 + } 168 + EXPORT_SYMBOL(ceph_file_to_extents); 169 + 170 + /* 171 + * A stripped down, non-allocating version of ceph_file_to_extents(), 172 + * for when @object_extents is already populated. 
173 + */ 174 + int ceph_iterate_extents(struct ceph_file_layout *l, u64 off, u64 len, 175 + struct list_head *object_extents, 176 + ceph_object_extent_fn_t action_fn, 177 + void *action_arg) 178 + { 179 + while (len) { 180 + struct ceph_object_extent *ex; 181 + u64 objno, objoff; 182 + u32 xlen; 183 + 184 + ceph_calc_file_object_mapping(l, off, len, &objno, &objoff, 185 + &xlen); 186 + 187 + ex = lookup_containing(object_extents, objno, objoff, xlen); 188 + if (!ex) { 189 + WARN(1, "%s: objno %llu %llu~%u not found!\n", 190 + __func__, objno, objoff, xlen); 191 + return -EINVAL; 192 + } 193 + 194 + action_fn(ex, xlen, action_arg); 195 + 196 + off += xlen; 197 + len -= xlen; 198 + } 199 + 200 + return 0; 201 + } 202 + EXPORT_SYMBOL(ceph_iterate_extents); 203 + 204 + /* 205 + * Reverse map an object extent to a sorted list of file extents. 206 + * 207 + * On success, the caller is responsible for: 208 + * 209 + * kfree(file_extents) 210 + */ 211 + int ceph_extent_to_file(struct ceph_file_layout *l, 212 + u64 objno, u64 objoff, u64 objlen, 213 + struct ceph_file_extent **file_extents, 214 + u32 *num_file_extents) 215 + { 216 + u32 stripes_per_object = l->object_size / l->stripe_unit; 217 + u64 blockno; /* which su */ 218 + u32 blockoff; /* offset into su */ 219 + u64 stripeno; /* which stripe */ 220 + u32 stripepos; /* which su in the stripe, 221 + which object in the object set */ 222 + u64 objsetno; /* which object set */ 223 + u32 i = 0; 224 + 225 + if (!objlen) { 226 + *file_extents = NULL; 227 + *num_file_extents = 0; 228 + return 0; 229 + } 230 + 231 + *num_file_extents = DIV_ROUND_UP_ULL(objoff + objlen, l->stripe_unit) - 232 + DIV_ROUND_DOWN_ULL(objoff, l->stripe_unit); 233 + *file_extents = kmalloc_array(*num_file_extents, sizeof(**file_extents), 234 + GFP_NOIO); 235 + if (!*file_extents) 236 + return -ENOMEM; 237 + 238 + div_u64_rem(objoff, l->stripe_unit, &blockoff); 239 + while (objlen) { 240 + u64 off, len; 241 + 242 + objsetno = div_u64_rem(objno, 
l->stripe_count, &stripepos); 243 + stripeno = div_u64(objoff, l->stripe_unit) + 244 + objsetno * stripes_per_object; 245 + blockno = stripeno * l->stripe_count + stripepos; 246 + off = blockno * l->stripe_unit + blockoff; 247 + len = min_t(u64, objlen, l->stripe_unit - blockoff); 248 + 249 + (*file_extents)[i].fe_off = off; 250 + (*file_extents)[i].fe_len = len; 251 + 252 + blockoff = 0; 253 + objoff += len; 254 + objlen -= len; 255 + i++; 256 + } 257 + 258 + BUG_ON(i != *num_file_extents); 259 + return 0; 260 + } 261 + EXPORT_SYMBOL(ceph_extent_to_file);