/* Source snapshot: v3.1, 410 lines, 11 kB (raw view) */
1/* 2 * linux/fs/nfs/blocklayout/blocklayoutdev.c 3 * 4 * Device operations for the pnfs nfs4 file layout driver. 5 * 6 * Copyright (c) 2006 The Regents of the University of Michigan. 7 * All rights reserved. 8 * 9 * Andy Adamson <andros@citi.umich.edu> 10 * Fred Isaman <iisaman@umich.edu> 11 * 12 * permission is granted to use, copy, create derivative works and 13 * redistribute this software and such derivative works for any purpose, 14 * so long as the name of the university of michigan is not used in 15 * any advertising or publicity pertaining to the use or distribution 16 * of this software without specific, written prior authorization. if 17 * the above copyright notice or any other identification of the 18 * university of michigan is included in any copy of any portion of 19 * this software, then the disclaimer below must also be included. 20 * 21 * this software is provided as is, without representation from the 22 * university of michigan as to its fitness for any purpose, and without 23 * warranty by the university of michigan of any kind, either express 24 * or implied, including without limitation the implied warranties of 25 * merchantability and fitness for a particular purpose. the regents 26 * of the university of michigan shall not be liable for any damages, 27 * including special, indirect, incidental, or consequential damages, 28 * with respect to any claim arising out or in connection with the use 29 * of the software, even if it has been or is hereafter advised of the 30 * possibility of such damages. 
31 */ 32#include <linux/module.h> 33#include <linux/buffer_head.h> /* __bread */ 34 35#include <linux/genhd.h> 36#include <linux/blkdev.h> 37#include <linux/hash.h> 38 39#include "blocklayout.h" 40 41#define NFSDBG_FACILITY NFSDBG_PNFS_LD 42 43static int decode_sector_number(__be32 **rp, sector_t *sp) 44{ 45 uint64_t s; 46 47 *rp = xdr_decode_hyper(*rp, &s); 48 if (s & 0x1ff) { 49 printk(KERN_WARNING "%s: sector not aligned\n", __func__); 50 return -1; 51 } 52 *sp = s >> SECTOR_SHIFT; 53 return 0; 54} 55 56/* Open a block_device by device number. */ 57struct block_device *nfs4_blkdev_get(dev_t dev) 58{ 59 struct block_device *bd; 60 61 dprintk("%s enter\n", __func__); 62 bd = blkdev_get_by_dev(dev, FMODE_READ, NULL); 63 if (IS_ERR(bd)) 64 goto fail; 65 return bd; 66fail: 67 dprintk("%s failed to open device : %ld\n", 68 __func__, PTR_ERR(bd)); 69 return NULL; 70} 71 72/* 73 * Release the block device 74 */ 75int nfs4_blkdev_put(struct block_device *bdev) 76{ 77 dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), 78 MINOR(bdev->bd_dev)); 79 return blkdev_put(bdev, FMODE_READ); 80} 81 82/* 83 * Shouldn't there be a rpc_generic_upcall() to do this for us? 
84 */ 85ssize_t bl_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, 86 char __user *dst, size_t buflen) 87{ 88 char *data = (char *)msg->data + msg->copied; 89 size_t mlen = min(msg->len - msg->copied, buflen); 90 unsigned long left; 91 92 left = copy_to_user(dst, data, mlen); 93 if (left == mlen) { 94 msg->errno = -EFAULT; 95 return -EFAULT; 96 } 97 98 mlen -= left; 99 msg->copied += mlen; 100 msg->errno = 0; 101 return mlen; 102} 103 104static struct bl_dev_msg bl_mount_reply; 105 106ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, 107 size_t mlen) 108{ 109 if (mlen != sizeof (struct bl_dev_msg)) 110 return -EINVAL; 111 112 if (copy_from_user(&bl_mount_reply, src, mlen) != 0) 113 return -EFAULT; 114 115 wake_up(&bl_wq); 116 117 return mlen; 118} 119 120void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) 121{ 122 if (msg->errno >= 0) 123 return; 124 wake_up(&bl_wq); 125} 126 127/* 128 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. 129 */ 130struct pnfs_block_dev * 131nfs4_blk_decode_device(struct nfs_server *server, 132 struct pnfs_device *dev) 133{ 134 struct pnfs_block_dev *rv = NULL; 135 struct block_device *bd = NULL; 136 struct rpc_pipe_msg msg; 137 struct bl_msg_hdr bl_msg = { 138 .type = BL_DEVICE_MOUNT, 139 .totallen = dev->mincount, 140 }; 141 uint8_t *dataptr; 142 DECLARE_WAITQUEUE(wq, current); 143 struct bl_dev_msg *reply = &bl_mount_reply; 144 int offset, len, i; 145 146 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); 147 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, 148 dev->mincount); 149 150 memset(&msg, 0, sizeof(msg)); 151 msg.data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); 152 if (!msg.data) { 153 rv = ERR_PTR(-ENOMEM); 154 goto out; 155 } 156 157 memcpy(msg.data, &bl_msg, sizeof(bl_msg)); 158 dataptr = (uint8_t *) msg.data; 159 len = dev->mincount; 160 offset = sizeof(bl_msg); 161 for (i = 0; len > 0; i++) { 162 memcpy(&dataptr[offset], 
page_address(dev->pages[i]), 163 len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); 164 len -= PAGE_CACHE_SIZE; 165 offset += PAGE_CACHE_SIZE; 166 } 167 msg.len = sizeof(bl_msg) + dev->mincount; 168 169 dprintk("%s CALLING USERSPACE DAEMON\n", __func__); 170 add_wait_queue(&bl_wq, &wq); 171 if (rpc_queue_upcall(bl_device_pipe->d_inode, &msg) < 0) { 172 remove_wait_queue(&bl_wq, &wq); 173 goto out; 174 } 175 176 set_current_state(TASK_UNINTERRUPTIBLE); 177 schedule(); 178 __set_current_state(TASK_RUNNING); 179 remove_wait_queue(&bl_wq, &wq); 180 181 if (reply->status != BL_DEVICE_REQUEST_PROC) { 182 dprintk("%s failed to open device: %d\n", 183 __func__, reply->status); 184 rv = ERR_PTR(-EINVAL); 185 goto out; 186 } 187 188 bd = nfs4_blkdev_get(MKDEV(reply->major, reply->minor)); 189 if (IS_ERR(bd)) { 190 dprintk("%s failed to open device : %ld\n", 191 __func__, PTR_ERR(bd)); 192 goto out; 193 } 194 195 rv = kzalloc(sizeof(*rv), GFP_NOFS); 196 if (!rv) { 197 rv = ERR_PTR(-ENOMEM); 198 goto out; 199 } 200 201 rv->bm_mdev = bd; 202 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); 203 dprintk("%s Created device %s with bd_block_size %u\n", 204 __func__, 205 bd->bd_disk->disk_name, 206 bd->bd_block_size); 207 208out: 209 kfree(msg.data); 210 return rv; 211} 212 213/* Map deviceid returned by the server to constructed block_device */ 214static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, 215 struct nfs4_deviceid *id) 216{ 217 struct block_device *rv = NULL; 218 struct block_mount_id *mid; 219 struct pnfs_block_dev *dev; 220 221 dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); 222 mid = BLK_ID(lo); 223 spin_lock(&mid->bm_lock); 224 list_for_each_entry(dev, &mid->bm_devlist, bm_node) { 225 if (memcmp(id->data, dev->bm_mdevid.data, 226 NFS4_DEVICEID4_SIZE) == 0) { 227 rv = dev->bm_mdev; 228 goto out; 229 } 230 } 231 out: 232 spin_unlock(&mid->bm_lock); 233 dprintk("%s returning %p\n", __func__, rv); 234 return rv; 235} 236 237/* 
Tracks info needed to ensure extents in layout obey constraints of spec */ 238struct layout_verification { 239 u32 mode; /* R or RW */ 240 u64 start; /* Expected start of next non-COW extent */ 241 u64 inval; /* Start of INVAL coverage */ 242 u64 cowread; /* End of COW read coverage */ 243}; 244 245/* Verify the extent meets the layout requirements of the pnfs-block draft, 246 * section 2.3.1. 247 */ 248static int verify_extent(struct pnfs_block_extent *be, 249 struct layout_verification *lv) 250{ 251 if (lv->mode == IOMODE_READ) { 252 if (be->be_state == PNFS_BLOCK_READWRITE_DATA || 253 be->be_state == PNFS_BLOCK_INVALID_DATA) 254 return -EIO; 255 if (be->be_f_offset != lv->start) 256 return -EIO; 257 lv->start += be->be_length; 258 return 0; 259 } 260 /* lv->mode == IOMODE_RW */ 261 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { 262 if (be->be_f_offset != lv->start) 263 return -EIO; 264 if (lv->cowread > lv->start) 265 return -EIO; 266 lv->start += be->be_length; 267 lv->inval = lv->start; 268 return 0; 269 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 270 if (be->be_f_offset != lv->start) 271 return -EIO; 272 lv->start += be->be_length; 273 return 0; 274 } else if (be->be_state == PNFS_BLOCK_READ_DATA) { 275 if (be->be_f_offset > lv->start) 276 return -EIO; 277 if (be->be_f_offset < lv->inval) 278 return -EIO; 279 if (be->be_f_offset < lv->cowread) 280 return -EIO; 281 /* It looks like you might want to min this with lv->start, 282 * but you really don't. 
283 */ 284 lv->inval = lv->inval + be->be_length; 285 lv->cowread = be->be_f_offset + be->be_length; 286 return 0; 287 } else 288 return -EIO; 289} 290 291/* XDR decode pnfs_block_layout4 structure */ 292int 293nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, 294 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) 295{ 296 struct pnfs_block_layout *bl = BLK_LO2EXT(lo); 297 int i, status = -EIO; 298 uint32_t count; 299 struct pnfs_block_extent *be = NULL, *save; 300 struct xdr_stream stream; 301 struct xdr_buf buf; 302 struct page *scratch; 303 __be32 *p; 304 struct layout_verification lv = { 305 .mode = lgr->range.iomode, 306 .start = lgr->range.offset >> SECTOR_SHIFT, 307 .inval = lgr->range.offset >> SECTOR_SHIFT, 308 .cowread = lgr->range.offset >> SECTOR_SHIFT, 309 }; 310 LIST_HEAD(extents); 311 312 dprintk("---> %s\n", __func__); 313 314 scratch = alloc_page(gfp_flags); 315 if (!scratch) 316 return -ENOMEM; 317 318 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); 319 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); 320 321 p = xdr_inline_decode(&stream, 4); 322 if (unlikely(!p)) 323 goto out_err; 324 325 count = be32_to_cpup(p++); 326 327 dprintk("%s enter, number of extents %i\n", __func__, count); 328 p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); 329 if (unlikely(!p)) 330 goto out_err; 331 332 /* Decode individual extents, putting them in temporary 333 * staging area until whole layout is decoded to make error 334 * recovery easier. 
335 */ 336 for (i = 0; i < count; i++) { 337 be = bl_alloc_extent(); 338 if (!be) { 339 status = -ENOMEM; 340 goto out_err; 341 } 342 memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); 343 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); 344 be->be_mdev = translate_devid(lo, &be->be_devid); 345 if (!be->be_mdev) 346 goto out_err; 347 348 /* The next three values are read in as bytes, 349 * but stored as 512-byte sector lengths 350 */ 351 if (decode_sector_number(&p, &be->be_f_offset) < 0) 352 goto out_err; 353 if (decode_sector_number(&p, &be->be_length) < 0) 354 goto out_err; 355 if (decode_sector_number(&p, &be->be_v_offset) < 0) 356 goto out_err; 357 be->be_state = be32_to_cpup(p++); 358 if (be->be_state == PNFS_BLOCK_INVALID_DATA) 359 be->be_inval = &bl->bl_inval; 360 if (verify_extent(be, &lv)) { 361 dprintk("%s verify failed\n", __func__); 362 goto out_err; 363 } 364 list_add_tail(&be->be_node, &extents); 365 } 366 if (lgr->range.offset + lgr->range.length != 367 lv.start << SECTOR_SHIFT) { 368 dprintk("%s Final length mismatch\n", __func__); 369 be = NULL; 370 goto out_err; 371 } 372 if (lv.start < lv.cowread) { 373 dprintk("%s Final uncovered COW extent\n", __func__); 374 be = NULL; 375 goto out_err; 376 } 377 /* Extents decoded properly, now try to merge them in to 378 * existing layout extents. 379 */ 380 spin_lock(&bl->bl_ext_lock); 381 list_for_each_entry_safe(be, save, &extents, be_node) { 382 list_del(&be->be_node); 383 status = bl_add_merge_extent(bl, be); 384 if (status) { 385 spin_unlock(&bl->bl_ext_lock); 386 /* This is a fairly catastrophic error, as the 387 * entire layout extent lists are now corrupted. 388 * We should have some way to distinguish this. 
389 */ 390 be = NULL; 391 goto out_err; 392 } 393 } 394 spin_unlock(&bl->bl_ext_lock); 395 status = 0; 396 out: 397 __free_page(scratch); 398 dprintk("%s returns %i\n", __func__, status); 399 return status; 400 401 out_err: 402 bl_put_extent(be); 403 while (!list_empty(&extents)) { 404 be = list_first_entry(&extents, struct pnfs_block_extent, 405 be_node); 406 list_del(&be->be_node); 407 bl_put_extent(be); 408 } 409 goto out; 410}