Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
at v2.6.18-rc2 401 lines 12 kB view raw
1/* 2 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved. 3 * 4 * This software is available to you under a choice of one of two 5 * licenses. You may choose to be licensed under the terms of the GNU 6 * General Public License (GPL) Version 2, available from the file 7 * COPYING in the main directory of this source tree, or the 8 * OpenIB.org BSD license below: 9 * 10 * Redistribution and use in source and binary forms, with or 11 * without modification, are permitted provided that the following 12 * conditions are met: 13 * 14 * - Redistributions of source code must retain the above 15 * copyright notice, this list of conditions and the following 16 * disclaimer. 17 * 18 * - Redistributions in binary form must reproduce the above 19 * copyright notice, this list of conditions and the following 20 * disclaimer in the documentation and/or other materials 21 * provided with the distribution. 22 * 23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 * SOFTWARE. 31 * 32 * $Id: iser_memory.c 6964 2006-05-07 11:11:43Z ogerlitz $ 33 */ 34#include <linux/module.h> 35#include <linux/kernel.h> 36#include <linux/slab.h> 37#include <linux/mm.h> 38#include <asm/io.h> 39#include <asm/scatterlist.h> 40#include <linux/scatterlist.h> 41 42#include "iscsi_iser.h" 43 44#define ISER_KMALLOC_THRESHOLD 0x20000 /* 128K - kmalloc limit */ 45/** 46 * Decrements the reference count for the 47 * registered buffer & releases it 48 * 49 * returns 0 if released, 1 if deferred 50 */ 51int iser_regd_buff_release(struct iser_regd_buf *regd_buf) 52{ 53 struct device *dma_device; 54 55 if ((atomic_read(&regd_buf->ref_count) == 0) || 56 atomic_dec_and_test(&regd_buf->ref_count)) { 57 /* if we used the dma mr, unreg is just NOP */ 58 if (regd_buf->reg.rkey != 0) 59 iser_unreg_mem(&regd_buf->reg); 60 61 if (regd_buf->dma_addr) { 62 dma_device = regd_buf->device->ib_device->dma_device; 63 dma_unmap_single(dma_device, 64 regd_buf->dma_addr, 65 regd_buf->data_size, 66 regd_buf->direction); 67 } 68 /* else this regd buf is associated with task which we */ 69 /* dma_unmap_single/sg later */ 70 return 0; 71 } else { 72 iser_dbg("Release deferred, regd.buff: 0x%p\n", regd_buf); 73 return 1; 74 } 75} 76 77/** 78 * iser_reg_single - fills registered buffer descriptor with 79 * registration information 80 */ 81void iser_reg_single(struct iser_device *device, 82 struct iser_regd_buf *regd_buf, 83 enum dma_data_direction direction) 84{ 85 dma_addr_t dma_addr; 86 87 dma_addr = dma_map_single(device->ib_device->dma_device, 88 regd_buf->virt_addr, 89 regd_buf->data_size, direction); 90 BUG_ON(dma_mapping_error(dma_addr)); 91 92 regd_buf->reg.lkey = device->mr->lkey; 93 regd_buf->reg.rkey = 0; /* indicate there's no need to unreg */ 94 regd_buf->reg.len = regd_buf->data_size; 95 regd_buf->reg.va = dma_addr; 96 97 regd_buf->dma_addr = dma_addr; 98 regd_buf->direction = direction; 99} 100 101/** 102 * iser_start_rdma_unaligned_sg 103 */ 104int iser_start_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask, 105 enum iser_data_dir cmd_dir) 106{ 107 int dma_nents; 108 struct device *dma_device; 109 char *mem = NULL; 110 struct iser_data_buf *data = &iser_ctask->data[cmd_dir]; 111 unsigned long cmd_data_len = data->data_len; 112 113 if (cmd_data_len > ISER_KMALLOC_THRESHOLD) 114 mem = (void *)__get_free_pages(GFP_NOIO, 115 long_log2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); 116 else 117 mem = kmalloc(cmd_data_len, GFP_NOIO); 118 119 if (mem == NULL) { 120 iser_err("Failed to allocate mem size %d %d for copying sglist\n", 121 data->size,(int)cmd_data_len); 122 return -ENOMEM; 123 } 124 125 if (cmd_dir == ISER_DIR_OUT) { 126 /* copy the unaligned sg the buffer which is used for RDMA */ 127 struct scatterlist *sg = (struct scatterlist *)data->buf; 128 int i; 129 char *p, *from; 130 131 for (p = mem, i = 0; i < data->size; i++) { 132 from = kmap_atomic(sg[i].page, KM_USER0); 133 memcpy(p, 134 from + sg[i].offset, 135 sg[i].length); 136 kunmap_atomic(from, KM_USER0); 137 p += sg[i].length; 138 } 139 } 140 141 sg_init_one(&iser_ctask->data_copy[cmd_dir].sg_single, mem, cmd_data_len); 142 iser_ctask->data_copy[cmd_dir].buf = 143 &iser_ctask->data_copy[cmd_dir].sg_single; 144 iser_ctask->data_copy[cmd_dir].size = 1; 145 146 iser_ctask->data_copy[cmd_dir].copy_buf = mem; 147 148 dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device; 149 150 if (cmd_dir == ISER_DIR_OUT) 151 dma_nents = dma_map_sg(dma_device, 152 &iser_ctask->data_copy[cmd_dir].sg_single, 153 1, DMA_TO_DEVICE); 154 else 155 dma_nents = dma_map_sg(dma_device, 156 &iser_ctask->data_copy[cmd_dir].sg_single, 157 1, DMA_FROM_DEVICE); 158 159 BUG_ON(dma_nents == 0); 160 161 iser_ctask->data_copy[cmd_dir].dma_nents = dma_nents; 162 return 0; 163} 164 165/** 166 * iser_finalize_rdma_unaligned_sg 167 */ 168void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_cmd_task *iser_ctask, 169 enum iser_data_dir cmd_dir) 170{ 171 struct device *dma_device; 172 struct iser_data_buf *mem_copy; 173 unsigned long cmd_data_len; 174 175 dma_device = iser_ctask->iser_conn->ib_conn->device->ib_device->dma_device; 176 mem_copy = &iser_ctask->data_copy[cmd_dir]; 177 178 if (cmd_dir == ISER_DIR_OUT) 179 dma_unmap_sg(dma_device, &mem_copy->sg_single, 1, 180 DMA_TO_DEVICE); 181 else 182 dma_unmap_sg(dma_device, &mem_copy->sg_single, 1, 183 DMA_FROM_DEVICE); 184 185 if (cmd_dir == ISER_DIR_IN) { 186 char *mem; 187 struct scatterlist *sg; 188 unsigned char *p, *to; 189 unsigned int sg_size; 190 int i; 191 192 /* copy back read RDMA to unaligned sg */ 193 mem = mem_copy->copy_buf; 194 195 sg = (struct scatterlist *)iser_ctask->data[ISER_DIR_IN].buf; 196 sg_size = iser_ctask->data[ISER_DIR_IN].size; 197 198 for (p = mem, i = 0; i < sg_size; i++){ 199 to = kmap_atomic(sg[i].page, KM_SOFTIRQ0); 200 memcpy(to + sg[i].offset, 201 p, 202 sg[i].length); 203 kunmap_atomic(to, KM_SOFTIRQ0); 204 p += sg[i].length; 205 } 206 } 207 208 cmd_data_len = iser_ctask->data[cmd_dir].data_len; 209 210 if (cmd_data_len > ISER_KMALLOC_THRESHOLD) 211 free_pages((unsigned long)mem_copy->copy_buf, 212 long_log2(roundup_pow_of_two(cmd_data_len)) - PAGE_SHIFT); 213 else 214 kfree(mem_copy->copy_buf); 215 216 mem_copy->copy_buf = NULL; 217} 218 219/** 220 * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses 221 * and returns the length of resulting physical address array (may be less than 222 * the original due to possible compaction). 223 * 224 * we build a "page vec" under the assumption that the SG meets the RDMA 225 * alignment requirements. Other then the first and last SG elements, all 226 * the "internal" elements can be compacted into a list whose elements are 227 * dma addresses of physical pages. The code supports also the weird case 228 * where --few fragments of the same page-- are present in the SG as 229 * consecutive elements. Also, it handles one entry SG. 230 */ 231static int iser_sg_to_page_vec(struct iser_data_buf *data, 232 struct iser_page_vec *page_vec) 233{ 234 struct scatterlist *sg = (struct scatterlist *)data->buf; 235 dma_addr_t first_addr, last_addr, page; 236 int start_aligned, end_aligned; 237 unsigned int cur_page = 0; 238 unsigned long total_sz = 0; 239 int i; 240 241 /* compute the offset of first element */ 242 page_vec->offset = (u64) sg[0].offset; 243 244 for (i = 0; i < data->dma_nents; i++) { 245 total_sz += sg_dma_len(&sg[i]); 246 247 first_addr = sg_dma_address(&sg[i]); 248 last_addr = first_addr + sg_dma_len(&sg[i]); 249 250 start_aligned = !(first_addr & ~PAGE_MASK); 251 end_aligned = !(last_addr & ~PAGE_MASK); 252 253 /* continue to collect page fragments till aligned or SG ends */ 254 while (!end_aligned && (i + 1 < data->dma_nents)) { 255 i++; 256 total_sz += sg_dma_len(&sg[i]); 257 last_addr = sg_dma_address(&sg[i]) + sg_dma_len(&sg[i]); 258 end_aligned = !(last_addr & ~PAGE_MASK); 259 } 260 261 first_addr = first_addr & PAGE_MASK; 262 263 for (page = first_addr; page < last_addr; page += PAGE_SIZE) 264 page_vec->pages[cur_page++] = page; 265 266 } 267 page_vec->data_size = total_sz; 268 iser_dbg("page_vec->data_size:%d cur_page %d\n", page_vec->data_size,cur_page); 269 return cur_page; 270} 271 272#define MASK_4K ((1UL << 12) - 1) /* 0xFFF */ 273#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & MASK_4K) == 0) 274 275/** 276 * iser_data_buf_aligned_len - Tries to determine the maximal correctly aligned 277 * for RDMA sub-list of a scatter-gather list of memory buffers, and returns 278 * the number of entries which are aligned correctly. Supports the case where 279 * consecutive SG elements are actually fragments of the same physcial page. 280 */ 281static unsigned int iser_data_buf_aligned_len(struct iser_data_buf *data) 282{ 283 struct scatterlist *sg; 284 dma_addr_t end_addr, next_addr; 285 int i, cnt; 286 unsigned int ret_len = 0; 287 288 sg = (struct scatterlist *)data->buf; 289 290 for (cnt = 0, i = 0; i < data->dma_nents; i++, cnt++) { 291 /* iser_dbg("Checking sg iobuf [%d]: phys=0x%08lX " 292 "offset: %ld sz: %ld\n", i, 293 (unsigned long)page_to_phys(sg[i].page), 294 (unsigned long)sg[i].offset, 295 (unsigned long)sg[i].length); */ 296 end_addr = sg_dma_address(&sg[i]) + 297 sg_dma_len(&sg[i]); 298 /* iser_dbg("Checking sg iobuf end address " 299 "0x%08lX\n", end_addr); */ 300 if (i + 1 < data->dma_nents) { 301 next_addr = sg_dma_address(&sg[i+1]); 302 /* are i, i+1 fragments of the same page? */ 303 if (end_addr == next_addr) 304 continue; 305 else if (!IS_4K_ALIGNED(end_addr)) { 306 ret_len = cnt + 1; 307 break; 308 } 309 } 310 } 311 if (i == data->dma_nents) 312 ret_len = cnt; /* loop ended */ 313 iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n", 314 ret_len, data->dma_nents, data); 315 return ret_len; 316} 317 318static void iser_data_buf_dump(struct iser_data_buf *data) 319{ 320 struct scatterlist *sg = (struct scatterlist *)data->buf; 321 int i; 322 323 for (i = 0; i < data->size; i++) 324 iser_err("sg[%d] dma_addr:0x%lX page:0x%p " 325 "off:%d sz:%d dma_len:%d\n", 326 i, (unsigned long)sg_dma_address(&sg[i]), 327 sg[i].page, sg[i].offset, 328 sg[i].length,sg_dma_len(&sg[i])); 329} 330 331static void iser_dump_page_vec(struct iser_page_vec *page_vec) 332{ 333 int i; 334 335 iser_err("page vec length %d data size %d\n", 336 page_vec->length, page_vec->data_size); 337 for (i = 0; i < page_vec->length; i++) 338 iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]); 339} 340 341static void iser_page_vec_build(struct iser_data_buf *data, 342 struct iser_page_vec *page_vec) 343{ 344 int page_vec_len = 0; 345 346 page_vec->length = 0; 347 page_vec->offset = 0; 348 349 iser_dbg("Translating sg sz: %d\n", data->dma_nents); 350 page_vec_len = iser_sg_to_page_vec(data,page_vec); 351 iser_dbg("sg len %d page_vec_len %d\n", data->dma_nents,page_vec_len); 352 353 page_vec->length = page_vec_len; 354 355 if (page_vec_len * PAGE_SIZE < page_vec->data_size) { 356 iser_err("page_vec too short to hold this SG\n"); 357 iser_data_buf_dump(data); 358 iser_dump_page_vec(page_vec); 359 BUG(); 360 } 361} 362 363/** 364 * iser_reg_rdma_mem - Registers memory intended for RDMA, 365 * obtaining rkey and va 366 * 367 * returns 0 on success, errno code on failure 368 */ 369int iser_reg_rdma_mem(struct iscsi_iser_cmd_task *iser_ctask, 370 enum iser_data_dir cmd_dir) 371{ 372 struct iser_conn *ib_conn = iser_ctask->iser_conn->ib_conn; 373 struct iser_data_buf *mem = &iser_ctask->data[cmd_dir]; 374 struct iser_regd_buf *regd_buf; 375 int aligned_len; 376 int err; 377 378 regd_buf = &iser_ctask->rdma_regd[cmd_dir]; 379 380 aligned_len = iser_data_buf_aligned_len(mem); 381 if (aligned_len != mem->size) { 382 iser_err("rdma alignment violation %d/%d aligned\n", 383 aligned_len, mem->size); 384 iser_data_buf_dump(mem); 385 /* allocate copy buf, if we are writing, copy the */ 386 /* unaligned scatterlist, dma map the copy */ 387 if (iser_start_rdma_unaligned_sg(iser_ctask, cmd_dir) != 0) 388 return -ENOMEM; 389 mem = &iser_ctask->data_copy[cmd_dir]; 390 } 391 392 iser_page_vec_build(mem, ib_conn->page_vec); 393 err = iser_reg_page_vec(ib_conn, ib_conn->page_vec, &regd_buf->reg); 394 if (err) 395 return err; 396 397 /* take a reference on this regd buf such that it will not be released * 398 * (eg in send dto completion) before we get the scsi response */ 399 atomic_inc(&regd_buf->ref_count); 400 return 0; 401}