Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'for-4.18/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper updates from Mike Snitzer:

- Adjust various DM structure members to improve alignment relative to
4.18 block's mempool_t and bioset changes.

- Add DM writecache target that offers writeback caching to persistent
memory or SSD.

- Small DM core error message change to give context for why a DM table
type transition wasn't allowed.

* tag 'for-4.18/dm-changes-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
dm: add writecache target
dm: adjust structure members to improve alignment
dm: report which conflicting type caused error during table_load()

+2472 -80
+68
Documentation/device-mapper/writecache.txt
The writecache target caches writes on persistent memory or on SSD. It
doesn't cache reads because reads are supposed to be cached in the page
cache in normal RAM.

When the device is constructed, the first sector should be zeroed or the
first sector should contain a valid superblock from a previous invocation.

Constructor parameters:
1. type of the cache device - "p" or "s"
	p - persistent memory
	s - SSD
2. the underlying device that will be cached
3. the cache device
4. block size (4096 is recommended; the maximum block size is the page
   size)
5. the number of optional parameters (the parameters with an argument
   count as two)
	high_watermark n	(default: 50)
		start writeback when the number of used blocks reaches
		this watermark
	low_watermark x		(default: 45)
		stop writeback when the number of used blocks drops below
		this watermark
	writeback_jobs n	(default: unlimited)
		limit the number of blocks that are in flight during
		writeback. Setting this value reduces writeback
		throughput, but it may improve latency of read requests
	autocommit_blocks n	(default: 64 for pmem, 65536 for ssd)
		when the application writes this number of blocks without
		issuing the FLUSH request, the blocks are automatically
		committed
	autocommit_time ms	(default: 1000)
		autocommit time in milliseconds. The data is automatically
		committed if this time passes and no FLUSH request is
		received
	fua			(by default on)
		applicable only to persistent memory - use the FUA flag
		when writing data from persistent memory back to the
		underlying device
	nofua
		applicable only to persistent memory - don't use the FUA
		flag when writing back data and send the FLUSH request
		afterwards
		- some underlying devices perform better with fua, some
		  with nofua. The user should test it

Status:
1. error indicator - 0 if there was no error, otherwise the error number
2. the number of blocks
3. the number of free blocks
4. the number of blocks under writeback

Messages:
	flush
		flush the cache device. The message returns successfully
		if the cache device was flushed without an error
	flush_on_suspend
		flush the cache device on next suspend. Use this message
		when you are going to remove the cache device. The proper
		sequence for removing the cache device is:
		1. send the "flush_on_suspend" message
		2. load an inactive table with a linear target that maps
		   to the underlying device
		3. suspend the device
		4. ask for status and verify that there are no errors
		5. resume the device, so that it will use the linear
		   target
		6. the cache device is now inactive and it can be deleted
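As a usage illustration only (not part of this commit), the sketch below uses libdevmapper to load a writecache table matching the parameters described above; the device paths, device size, and parameter values are hypothetical and must be adapted to the real origin and cache devices.

	/*
	 * Hypothetical example: build with "cc wc-create.c -ldevmapper".
	 * Creates a device named "wc" backed by /dev/sdb (origin) with
	 * /dev/pmem0 as the cache, 4096-byte blocks and two optional
	 * parameters, i.e. the table "0 <origin_sectors> writecache p
	 * /dev/sdb /dev/pmem0 4096 4 high_watermark 60 autocommit_time 2000".
	 */
	#include <stdio.h>
	#include <stdint.h>
	#include <libdevmapper.h>

	int main(void)
	{
		/* size of the origin device in 512-byte sectors (example value) */
		uint64_t origin_sectors = 209715200;
		struct dm_task *dmt = dm_task_create(DM_DEVICE_CREATE);

		if (!dmt)
			return 1;

		if (!dm_task_set_name(dmt, "wc") ||
		    !dm_task_add_target(dmt, 0, origin_sectors, "writecache",
					"p /dev/sdb /dev/pmem0 4096 4 "
					"high_watermark 60 autocommit_time 2000") ||
		    !dm_task_run(dmt)) {
			fprintf(stderr, "creating the writecache device failed\n");
			dm_task_destroy(dmt);
			return 1;
		}

		dm_task_destroy(dmt);
		return 0;
	}

The "flush" and "flush_on_suspend" messages described above can be sent the same way with a DM_DEVICE_TARGET_MSG task and dm_task_set_message(), or from the shell with "dmsetup message <device> 0 flush".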
+11
drivers/md/Kconfig
···
	  of less memory utilization, improved performance and increased
	  adaptability in the face of changing workloads.

+config DM_WRITECACHE
+	tristate "Writecache target"
+	depends on BLK_DEV_DM
+	---help---
+	   The writecache target caches writes on persistent memory or SSD.
+	   It is intended for databases or other programs that need extremely
+	   low commit latency.
+
+	   The writecache target doesn't cache reads because reads are supposed
+	   to be cached in standard RAM.
+
 config DM_ERA
 	tristate "Era target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM
+1
drivers/md/Makefile
···
 obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
 obj-$(CONFIG_DM_INTEGRITY)	+= dm-integrity.o
 obj-$(CONFIG_DM_ZONED)		+= dm-zoned.o
+obj-$(CONFIG_DM_WRITECACHE)	+= dm-writecache.o

 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
+1 -1
drivers/md/dm-bio-prison-v1.c
···

 struct dm_bio_prison {
 	spinlock_t lock;
-	mempool_t cell_pool;
 	struct rb_root cells;
+	mempool_t cell_pool;
 };

 static struct kmem_cache *_cell_cache;
+1 -1
drivers/md/dm-bio-prison-v2.c
···
 	struct workqueue_struct *wq;

 	spinlock_t lock;
-	mempool_t cell_pool;
 	struct rb_root cells;
+	mempool_t cell_pool;
 };

 static struct kmem_cache *_cell_cache;
+33 -30
drivers/md/dm-cache-target.c
···

 struct cache {
 	struct dm_target *ti;
-	struct dm_target_callbacks callbacks;
+	spinlock_t lock;
+
+	/*
+	 * Fields for converting from sectors to blocks.
+	 */
+	int sectors_per_block_shift;
+	sector_t sectors_per_block;

 	struct dm_cache_metadata *cmd;

···
 	dm_cblock_t cache_size;

 	/*
-	 * Fields for converting from sectors to blocks.
+	 * Invalidation fields.
 	 */
-	sector_t sectors_per_block;
-	int sectors_per_block_shift;
+	spinlock_t invalidation_lock;
+	struct list_head invalidation_requests;

-	spinlock_t lock;
-	struct bio_list deferred_bios;
 	sector_t migration_threshold;
 	wait_queue_head_t migration_wait;
 	atomic_t nr_allocated_migrations;
···
 	 */
 	atomic_t nr_io_migrations;

+	struct bio_list deferred_bios;
+
 	struct rw_semaphore quiesce_lock;

-	/*
-	 * cache_size entries, dirty if set
-	 */
-	atomic_t nr_dirty;
-	unsigned long *dirty_bitset;
+	struct dm_target_callbacks callbacks;

 	/*
 	 * origin_blocks entries, discarded if set.
···
 	const char **ctr_args;

 	struct dm_kcopyd_client *copier;
-	struct workqueue_struct *wq;
 	struct work_struct deferred_bio_worker;
 	struct work_struct migration_worker;
+	struct workqueue_struct *wq;
 	struct delayed_work waker;
 	struct dm_bio_prison_v2 *prison;
-	struct bio_set bs;

-	mempool_t migration_pool;
+	/*
+	 * cache_size entries, dirty if set
+	 */
+	unsigned long *dirty_bitset;
+	atomic_t nr_dirty;

-	struct dm_cache_policy *policy;
 	unsigned policy_nr_args;
-
-	bool need_tick_bio:1;
-	bool sized:1;
-	bool invalidate:1;
-	bool commit_requested:1;
-	bool loaded_mappings:1;
-	bool loaded_discards:1;
+	struct dm_cache_policy *policy;

 	/*
 	 * Cache features such as write-through.
···

 	struct cache_stats stats;

-	/*
-	 * Invalidation fields.
-	 */
-	spinlock_t invalidation_lock;
-	struct list_head invalidation_requests;
+	bool need_tick_bio:1;
+	bool sized:1;
+	bool invalidate:1;
+	bool commit_requested:1;
+	bool loaded_mappings:1;
+	bool loaded_discards:1;
+
+	struct rw_semaphore background_work_lock;
+
+	struct batcher committer;
+	struct work_struct commit_ws;

 	struct io_tracker tracker;

-	struct work_struct commit_ws;
-	struct batcher committer;
+	mempool_t migration_pool;

-	struct rw_semaphore background_work_lock;
+	struct bio_set bs;
 };

 struct per_bio_data {
+19 -19
drivers/md/dm-core.h
···
 struct mapped_device {
 	struct mutex suspend_lock;

+	struct mutex table_devices_lock;
+	struct list_head table_devices;
+
 	/*
 	 * The current mapping (struct dm_table *).
 	 * Use dm_get_live_table{_fast} or take suspend_lock for
···
 	 */
 	void __rcu *map;

-	struct list_head table_devices;
-	struct mutex table_devices_lock;
-
 	unsigned long flags;

-	struct request_queue *queue;
-	int numa_node_id;
-
-	enum dm_queue_mode type;
 	/* Protect queue and type against concurrent access. */
 	struct mutex type_lock;
+	enum dm_queue_mode type;
+
+	int numa_node_id;
+	struct request_queue *queue;

 	atomic_t holders;
 	atomic_t open_count;
···
 	struct dm_target *immutable_target;
 	struct target_type *immutable_target_type;

+	char name[16];
 	struct gendisk *disk;
 	struct dax_device *dax_dev;
-	char name[16];
-
-	void *interface_ptr;

 	/*
 	 * A list of ios that arrived while we were suspended.
 	 */
-	atomic_t pending[2];
-	wait_queue_head_t wait;
 	struct work_struct work;
+	wait_queue_head_t wait;
+	atomic_t pending[2];
 	spinlock_t deferred_lock;
 	struct bio_list deferred;
+
+	void *interface_ptr;

 	/*
 	 * Event handling.
···
 	unsigned internal_suspend_count;

 	/*
-	 * Processing queue (flush)
-	 */
-	struct workqueue_struct *wq;
-
-	/*
 	 * io objects are allocated from here.
 	 */
 	struct bio_set io_bs;
 	struct bio_set bs;
+
+	/*
+	 * Processing queue (flush)
+	 */
+	struct workqueue_struct *wq;

 	/*
 	 * freeze/thaw support require holding onto a super block
···
 	/* forced geometry settings */
 	struct hd_geometry geometry;

-	struct block_device *bdev;
-
 	/* kobject and completion */
 	struct dm_kobject_holder kobj_holder;
+
+	struct block_device *bdev;

 	/* zero-length flush that will be cloned and submitted to targets */
 	struct bio flush_bio;
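All of the member reorderings in this series follow the same pattern: since the 4.18 block layer changes embed mempool_t and struct bio_set by value rather than by pointer, those large members are moved toward the end of each structure and the small, frequently accessed fields (locks, lists, counters) are kept together near the front so they share cache lines. A toy, non-kernel sketch of how the effect shows up in field offsets (the struct names and the 192-byte stand-in size are invented for illustration):

	/* Standalone illustration; compile with any C compiler and run. */
	#include <stdio.h>
	#include <stddef.h>

	struct big_pool { char opaque[192]; };	/* stand-in for an embedded mempool_t/bio_set */

	struct split_layout {			/* hot fields separated by the big member */
		int lock;
		struct big_pool pool;
		void *tree;
	};

	struct grouped_layout {			/* hot fields adjacent, big member last */
		int lock;
		void *tree;
		struct big_pool pool;
	};

	int main(void)
	{
		printf("split:   lock@%zu tree@%zu (different cache lines)\n",
		       offsetof(struct split_layout, lock),
		       offsetof(struct split_layout, tree));
		printf("grouped: lock@%zu tree@%zu (same cache line)\n",
		       offsetof(struct grouped_layout, lock),
		       offsetof(struct grouped_layout, tree));
		return 0;
	}

In practice such layouts are usually checked with pahole(1) against the compiled objects rather than with offsetof() prints, but the principle is the same.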
+13 -13
drivers/md/dm-crypt.c
···
 	struct dm_dev *dev;
 	sector_t start;

-	/*
-	 * pool for per bio private data, crypto requests,
-	 * encryption requeusts/buffer pages and integrity tags
-	 */
-	mempool_t req_pool;
-	mempool_t page_pool;
-	mempool_t tag_pool;
-	unsigned tag_pool_max_sectors;
-
 	struct percpu_counter n_allocated_pages;
-
-	struct bio_set bs;
-	struct mutex bio_alloc_lock;

 	struct workqueue_struct *io_queue;
 	struct workqueue_struct *crypt_queue;

-	struct task_struct *write_thread;
 	wait_queue_head_t write_thread_wait;
+	struct task_struct *write_thread;
 	struct rb_root write_tree;

 	char *cipher;
···
 	unsigned int integrity_tag_size;
 	unsigned int integrity_iv_size;
 	unsigned int on_disk_tag_size;
+
+	/*
+	 * pool for per bio private data, crypto requests,
+	 * encryption requeusts/buffer pages and integrity tags
+	 */
+	unsigned tag_pool_max_sectors;
+	mempool_t tag_pool;
+	mempool_t req_pool;
+	mempool_t page_pool;
+
+	struct bio_set bs;
+	struct mutex bio_alloc_lock;

 	u8 *authenc_key; /* space for keys in authenc() format (if used) */
 	u8 key[0];
+2 -1
drivers/md/dm-ioctl.c
···
 			goto err_unlock_md_type;
 		}
 	} else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) {
-		DMWARN("can't change device type after initial table load.");
+		DMWARN("can't change device type (old=%u vs new=%u) after initial table load.",
+		       dm_get_md_type(md), dm_table_get_type(t));
 		r = -EINVAL;
 		goto err_unlock_md_type;
 	}
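The two %u values in the new warning are dm_queue_mode numbers. As a decoding aid, this is the enumeration from include/linux/device-mapper.h as it stood around this release (reproduced here for reference; later kernels drop some of these entries):

	/* include/linux/device-mapper.h (4.18-era), for decoding the old=/new= values */
	enum dm_queue_mode {
		DM_TYPE_NONE		 = 0,
		DM_TYPE_BIO_BASED	 = 1,
		DM_TYPE_REQUEST_BASED	 = 2,
		DM_TYPE_MQ_REQUEST_BASED = 3,
		DM_TYPE_DAX_BIO_BASED	 = 4,
		DM_TYPE_NVME_BIO_BASED	 = 5,
	};

So, for example, "old=1 vs new=2" would indicate an attempt to replace a bio-based table with a request-based one.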
+2 -1
drivers/md/dm-kcopyd.c
···
 	struct dm_io_client *io_client;

 	wait_queue_head_t destroyq;
-	atomic_t nr_jobs;

 	mempool_t job_pool;

···
 	struct work_struct kcopyd_work;

 	struct dm_kcopyd_throttle *throttle;
+
+	atomic_t nr_jobs;

 	/*
 	 * We maintain three lists of jobs:
+12 -11
drivers/md/dm-region-hash.c
···

 	/* hash table */
 	rwlock_t hash_lock;
-	mempool_t region_pool;
 	unsigned mask;
 	unsigned nr_buckets;
 	unsigned prime;
 	unsigned shift;
 	struct list_head *buckets;

-	unsigned max_recovery; /* Max # of regions to recover in parallel */
-
-	spinlock_t region_lock;
-	atomic_t recovery_in_flight;
-	struct semaphore recovery_count;
-	struct list_head clean_regions;
-	struct list_head quiesced_regions;
-	struct list_head recovered_regions;
-	struct list_head failed_recovered_regions;
-
 	/*
 	 * If there was a flush failure no regions can be marked clean.
 	 */
 	int flush_failure;
+
+	unsigned max_recovery; /* Max # of regions to recover in parallel */
+
+	spinlock_t region_lock;
+	atomic_t recovery_in_flight;
+	struct list_head clean_regions;
+	struct list_head quiesced_regions;
+	struct list_head recovered_regions;
+	struct list_head failed_recovered_regions;
+	struct semaphore recovery_count;
+
+	mempool_t region_pool;

 	void *context;
 	sector_t target_begin;
+3 -2
drivers/md/dm-thin.c
···
 	struct dm_bio_prison *prison;
 	struct dm_kcopyd_client *copier;

+	struct work_struct worker;
 	struct workqueue_struct *wq;
 	struct throttle throttle;
-	struct work_struct worker;
 	struct delayed_work waker;
 	struct delayed_work no_space_timeout;

···
 	struct dm_deferred_set *all_io_ds;

 	struct dm_thin_new_mapping *next_mapping;
-	mempool_t mapping_pool;

 	process_bio_fn process_bio;
 	process_bio_fn process_discard;
···
 	process_mapping_fn process_prepared_discard_pt2;

 	struct dm_bio_prison_cell **cell_sort_array;
+
+	mempool_t mapping_pool;
 };

 static enum pool_mode get_pool_mode(struct pool *pool);
+2305
drivers/md/dm-writecache.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2018 Red Hat. All rights reserved. 4 + * 5 + * This file is released under the GPL. 6 + */ 7 + 8 + #include <linux/device-mapper.h> 9 + #include <linux/module.h> 10 + #include <linux/init.h> 11 + #include <linux/vmalloc.h> 12 + #include <linux/kthread.h> 13 + #include <linux/dm-io.h> 14 + #include <linux/dm-kcopyd.h> 15 + #include <linux/dax.h> 16 + #include <linux/pfn_t.h> 17 + #include <linux/libnvdimm.h> 18 + 19 + #define DM_MSG_PREFIX "writecache" 20 + 21 + #define HIGH_WATERMARK 50 22 + #define LOW_WATERMARK 45 23 + #define MAX_WRITEBACK_JOBS 0 24 + #define ENDIO_LATENCY 16 25 + #define WRITEBACK_LATENCY 64 26 + #define AUTOCOMMIT_BLOCKS_SSD 65536 27 + #define AUTOCOMMIT_BLOCKS_PMEM 64 28 + #define AUTOCOMMIT_MSEC 1000 29 + 30 + #define BITMAP_GRANULARITY 65536 31 + #if BITMAP_GRANULARITY < PAGE_SIZE 32 + #undef BITMAP_GRANULARITY 33 + #define BITMAP_GRANULARITY PAGE_SIZE 34 + #endif 35 + 36 + #if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER) 37 + #define DM_WRITECACHE_HAS_PMEM 38 + #endif 39 + 40 + #ifdef DM_WRITECACHE_HAS_PMEM 41 + #define pmem_assign(dest, src) \ 42 + do { \ 43 + typeof(dest) uniq = (src); \ 44 + memcpy_flushcache(&(dest), &uniq, sizeof(dest)); \ 45 + } while (0) 46 + #else 47 + #define pmem_assign(dest, src) ((dest) = (src)) 48 + #endif 49 + 50 + #if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && defined(DM_WRITECACHE_HAS_PMEM) 51 + #define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 52 + #endif 53 + 54 + #define MEMORY_SUPERBLOCK_MAGIC 0x23489321 55 + #define MEMORY_SUPERBLOCK_VERSION 1 56 + 57 + struct wc_memory_entry { 58 + __le64 original_sector; 59 + __le64 seq_count; 60 + }; 61 + 62 + struct wc_memory_superblock { 63 + union { 64 + struct { 65 + __le32 magic; 66 + __le32 version; 67 + __le32 block_size; 68 + __le32 pad; 69 + __le64 n_blocks; 70 + __le64 seq_count; 71 + }; 72 + __le64 padding[8]; 73 + }; 74 + struct wc_memory_entry entries[0]; 75 + }; 76 + 77 + struct wc_entry { 78 + struct rb_node rb_node; 79 + struct list_head lru; 80 + unsigned short wc_list_contiguous; 81 + bool write_in_progress 82 + #if BITS_PER_LONG == 64 83 + :1 84 + #endif 85 + ; 86 + unsigned long index 87 + #if BITS_PER_LONG == 64 88 + :47 89 + #endif 90 + ; 91 + #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 92 + uint64_t original_sector; 93 + uint64_t seq_count; 94 + #endif 95 + }; 96 + 97 + #ifdef DM_WRITECACHE_HAS_PMEM 98 + #define WC_MODE_PMEM(wc) ((wc)->pmem_mode) 99 + #define WC_MODE_FUA(wc) ((wc)->writeback_fua) 100 + #else 101 + #define WC_MODE_PMEM(wc) false 102 + #define WC_MODE_FUA(wc) false 103 + #endif 104 + #define WC_MODE_SORT_FREELIST(wc) (!WC_MODE_PMEM(wc)) 105 + 106 + struct dm_writecache { 107 + struct mutex lock; 108 + struct list_head lru; 109 + union { 110 + struct list_head freelist; 111 + struct { 112 + struct rb_root freetree; 113 + struct wc_entry *current_free; 114 + }; 115 + }; 116 + struct rb_root tree; 117 + 118 + size_t freelist_size; 119 + size_t writeback_size; 120 + size_t freelist_high_watermark; 121 + size_t freelist_low_watermark; 122 + 123 + unsigned uncommitted_blocks; 124 + unsigned autocommit_blocks; 125 + unsigned max_writeback_jobs; 126 + 127 + int error; 128 + 129 + unsigned long autocommit_jiffies; 130 + struct timer_list autocommit_timer; 131 + struct wait_queue_head freelist_wait; 132 + 133 + atomic_t bio_in_progress[2]; 134 + struct wait_queue_head bio_in_progress_wait[2]; 135 + 136 + struct dm_target *ti; 137 + struct dm_dev *dev; 138 + struct dm_dev 
*ssd_dev; 139 + void *memory_map; 140 + uint64_t memory_map_size; 141 + size_t metadata_sectors; 142 + size_t n_blocks; 143 + uint64_t seq_count; 144 + void *block_start; 145 + struct wc_entry *entries; 146 + unsigned block_size; 147 + unsigned char block_size_bits; 148 + 149 + bool pmem_mode:1; 150 + bool writeback_fua:1; 151 + 152 + bool overwrote_committed:1; 153 + bool memory_vmapped:1; 154 + 155 + bool high_wm_percent_set:1; 156 + bool low_wm_percent_set:1; 157 + bool max_writeback_jobs_set:1; 158 + bool autocommit_blocks_set:1; 159 + bool autocommit_time_set:1; 160 + bool writeback_fua_set:1; 161 + bool flush_on_suspend:1; 162 + 163 + unsigned writeback_all; 164 + struct workqueue_struct *writeback_wq; 165 + struct work_struct writeback_work; 166 + struct work_struct flush_work; 167 + 168 + struct dm_io_client *dm_io; 169 + 170 + raw_spinlock_t endio_list_lock; 171 + struct list_head endio_list; 172 + struct task_struct *endio_thread; 173 + 174 + struct task_struct *flush_thread; 175 + struct bio_list flush_list; 176 + 177 + struct dm_kcopyd_client *dm_kcopyd; 178 + unsigned long *dirty_bitmap; 179 + unsigned dirty_bitmap_size; 180 + 181 + struct bio_set bio_set; 182 + mempool_t copy_pool; 183 + }; 184 + 185 + #define WB_LIST_INLINE 16 186 + 187 + struct writeback_struct { 188 + struct list_head endio_entry; 189 + struct dm_writecache *wc; 190 + struct wc_entry **wc_list; 191 + unsigned wc_list_n; 192 + unsigned page_offset; 193 + struct page *page; 194 + struct wc_entry *wc_list_inline[WB_LIST_INLINE]; 195 + struct bio bio; 196 + }; 197 + 198 + struct copy_struct { 199 + struct list_head endio_entry; 200 + struct dm_writecache *wc; 201 + struct wc_entry *e; 202 + unsigned n_entries; 203 + int error; 204 + }; 205 + 206 + DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle, 207 + "A percentage of time allocated for data copying"); 208 + 209 + static void wc_lock(struct dm_writecache *wc) 210 + { 211 + mutex_lock(&wc->lock); 212 + } 213 + 214 + static void wc_unlock(struct dm_writecache *wc) 215 + { 216 + mutex_unlock(&wc->lock); 217 + } 218 + 219 + #ifdef DM_WRITECACHE_HAS_PMEM 220 + static int persistent_memory_claim(struct dm_writecache *wc) 221 + { 222 + int r; 223 + loff_t s; 224 + long p, da; 225 + pfn_t pfn; 226 + int id; 227 + struct page **pages; 228 + 229 + wc->memory_vmapped = false; 230 + 231 + if (!wc->ssd_dev->dax_dev) { 232 + r = -EOPNOTSUPP; 233 + goto err1; 234 + } 235 + s = wc->memory_map_size; 236 + p = s >> PAGE_SHIFT; 237 + if (!p) { 238 + r = -EINVAL; 239 + goto err1; 240 + } 241 + if (p != s >> PAGE_SHIFT) { 242 + r = -EOVERFLOW; 243 + goto err1; 244 + } 245 + 246 + id = dax_read_lock(); 247 + 248 + da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn); 249 + if (da < 0) { 250 + wc->memory_map = NULL; 251 + r = da; 252 + goto err2; 253 + } 254 + if (!pfn_t_has_page(pfn)) { 255 + wc->memory_map = NULL; 256 + r = -EOPNOTSUPP; 257 + goto err2; 258 + } 259 + if (da != p) { 260 + long i; 261 + wc->memory_map = NULL; 262 + pages = kvmalloc(p * sizeof(struct page *), GFP_KERNEL); 263 + if (!pages) { 264 + r = -ENOMEM; 265 + goto err2; 266 + } 267 + i = 0; 268 + do { 269 + long daa; 270 + void *dummy_addr; 271 + daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i, 272 + &dummy_addr, &pfn); 273 + if (daa <= 0) { 274 + r = daa ? 
daa : -EINVAL; 275 + goto err3; 276 + } 277 + if (!pfn_t_has_page(pfn)) { 278 + r = -EOPNOTSUPP; 279 + goto err3; 280 + } 281 + while (daa-- && i < p) { 282 + pages[i++] = pfn_t_to_page(pfn); 283 + pfn.val++; 284 + } 285 + } while (i < p); 286 + wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL); 287 + if (!wc->memory_map) { 288 + r = -ENOMEM; 289 + goto err3; 290 + } 291 + kvfree(pages); 292 + wc->memory_vmapped = true; 293 + } 294 + 295 + dax_read_unlock(id); 296 + return 0; 297 + err3: 298 + kvfree(pages); 299 + err2: 300 + dax_read_unlock(id); 301 + err1: 302 + return r; 303 + } 304 + #else 305 + static int persistent_memory_claim(struct dm_writecache *wc) 306 + { 307 + BUG(); 308 + } 309 + #endif 310 + 311 + static void persistent_memory_release(struct dm_writecache *wc) 312 + { 313 + if (wc->memory_vmapped) 314 + vunmap(wc->memory_map); 315 + } 316 + 317 + static struct page *persistent_memory_page(void *addr) 318 + { 319 + if (is_vmalloc_addr(addr)) 320 + return vmalloc_to_page(addr); 321 + else 322 + return virt_to_page(addr); 323 + } 324 + 325 + static unsigned persistent_memory_page_offset(void *addr) 326 + { 327 + return (unsigned long)addr & (PAGE_SIZE - 1); 328 + } 329 + 330 + static void persistent_memory_flush_cache(void *ptr, size_t size) 331 + { 332 + if (is_vmalloc_addr(ptr)) 333 + flush_kernel_vmap_range(ptr, size); 334 + } 335 + 336 + static void persistent_memory_invalidate_cache(void *ptr, size_t size) 337 + { 338 + if (is_vmalloc_addr(ptr)) 339 + invalidate_kernel_vmap_range(ptr, size); 340 + } 341 + 342 + static struct wc_memory_superblock *sb(struct dm_writecache *wc) 343 + { 344 + return wc->memory_map; 345 + } 346 + 347 + static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e) 348 + { 349 + if (is_power_of_2(sizeof(struct wc_entry)) && 0) 350 + return &sb(wc)->entries[e - wc->entries]; 351 + else 352 + return &sb(wc)->entries[e->index]; 353 + } 354 + 355 + static void *memory_data(struct dm_writecache *wc, struct wc_entry *e) 356 + { 357 + return (char *)wc->block_start + (e->index << wc->block_size_bits); 358 + } 359 + 360 + static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e) 361 + { 362 + return wc->metadata_sectors + 363 + ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT)); 364 + } 365 + 366 + static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e) 367 + { 368 + #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 369 + return e->original_sector; 370 + #else 371 + return le64_to_cpu(memory_entry(wc, e)->original_sector); 372 + #endif 373 + } 374 + 375 + static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e) 376 + { 377 + #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 378 + return e->seq_count; 379 + #else 380 + return le64_to_cpu(memory_entry(wc, e)->seq_count); 381 + #endif 382 + } 383 + 384 + static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e) 385 + { 386 + #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 387 + e->seq_count = -1; 388 + #endif 389 + pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1)); 390 + } 391 + 392 + static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e, 393 + uint64_t original_sector, uint64_t seq_count) 394 + { 395 + struct wc_memory_entry me; 396 + #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 397 + e->original_sector = original_sector; 398 + e->seq_count = seq_count; 399 + #endif 400 + me.original_sector = cpu_to_le64(original_sector); 401 + me.seq_count = 
cpu_to_le64(seq_count); 402 + pmem_assign(*memory_entry(wc, e), me); 403 + } 404 + 405 + #define writecache_error(wc, err, msg, arg...) \ 406 + do { \ 407 + if (!cmpxchg(&(wc)->error, 0, err)) \ 408 + DMERR(msg, ##arg); \ 409 + wake_up(&(wc)->freelist_wait); \ 410 + } while (0) 411 + 412 + #define writecache_has_error(wc) (unlikely(READ_ONCE((wc)->error))) 413 + 414 + static void writecache_flush_all_metadata(struct dm_writecache *wc) 415 + { 416 + if (!WC_MODE_PMEM(wc)) 417 + memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size); 418 + } 419 + 420 + static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size) 421 + { 422 + if (!WC_MODE_PMEM(wc)) 423 + __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY, 424 + wc->dirty_bitmap); 425 + } 426 + 427 + static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev); 428 + 429 + struct io_notify { 430 + struct dm_writecache *wc; 431 + struct completion c; 432 + atomic_t count; 433 + }; 434 + 435 + static void writecache_notify_io(unsigned long error, void *context) 436 + { 437 + struct io_notify *endio = context; 438 + 439 + if (unlikely(error != 0)) 440 + writecache_error(endio->wc, -EIO, "error writing metadata"); 441 + BUG_ON(atomic_read(&endio->count) <= 0); 442 + if (atomic_dec_and_test(&endio->count)) 443 + complete(&endio->c); 444 + } 445 + 446 + static void ssd_commit_flushed(struct dm_writecache *wc) 447 + { 448 + struct dm_io_region region; 449 + struct dm_io_request req; 450 + struct io_notify endio = { 451 + wc, 452 + COMPLETION_INITIALIZER_ONSTACK(endio.c), 453 + ATOMIC_INIT(1), 454 + }; 455 + unsigned bitmap_bits = wc->dirty_bitmap_size * BITS_PER_LONG; 456 + unsigned i = 0; 457 + 458 + while (1) { 459 + unsigned j; 460 + i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i); 461 + if (unlikely(i == bitmap_bits)) 462 + break; 463 + j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i); 464 + 465 + region.bdev = wc->ssd_dev->bdev; 466 + region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT); 467 + region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT); 468 + 469 + if (unlikely(region.sector >= wc->metadata_sectors)) 470 + break; 471 + if (unlikely(region.sector + region.count > wc->metadata_sectors)) 472 + region.count = wc->metadata_sectors - region.sector; 473 + 474 + atomic_inc(&endio.count); 475 + req.bi_op = REQ_OP_WRITE; 476 + req.bi_op_flags = REQ_SYNC; 477 + req.mem.type = DM_IO_VMA; 478 + req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY; 479 + req.client = wc->dm_io; 480 + req.notify.fn = writecache_notify_io; 481 + req.notify.context = &endio; 482 + 483 + /* writing via async dm-io (implied by notify.fn above) won't return an error */ 484 + (void) dm_io(&req, 1, &region, NULL); 485 + i = j; 486 + } 487 + 488 + writecache_notify_io(0, &endio); 489 + wait_for_completion_io(&endio.c); 490 + 491 + writecache_disk_flush(wc, wc->ssd_dev); 492 + 493 + memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); 494 + } 495 + 496 + static void writecache_commit_flushed(struct dm_writecache *wc) 497 + { 498 + if (WC_MODE_PMEM(wc)) 499 + wmb(); 500 + else 501 + ssd_commit_flushed(wc); 502 + } 503 + 504 + static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) 505 + { 506 + int r; 507 + struct dm_io_region region; 508 + struct dm_io_request req; 509 + 510 + region.bdev = dev->bdev; 511 + region.sector = 0; 512 + region.count = 0; 513 + req.bi_op = REQ_OP_WRITE; 514 + req.bi_op_flags = REQ_PREFLUSH; 515 + 
req.mem.type = DM_IO_KMEM; 516 + req.mem.ptr.addr = NULL; 517 + req.client = wc->dm_io; 518 + req.notify.fn = NULL; 519 + 520 + r = dm_io(&req, 1, &region, NULL); 521 + if (unlikely(r)) 522 + writecache_error(wc, r, "error flushing metadata: %d", r); 523 + } 524 + 525 + static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) 526 + { 527 + wait_event(wc->bio_in_progress_wait[direction], 528 + !atomic_read(&wc->bio_in_progress[direction])); 529 + } 530 + 531 + #define WFE_RETURN_FOLLOWING 1 532 + #define WFE_LOWEST_SEQ 2 533 + 534 + static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, 535 + uint64_t block, int flags) 536 + { 537 + struct wc_entry *e; 538 + struct rb_node *node = wc->tree.rb_node; 539 + 540 + if (unlikely(!node)) 541 + return NULL; 542 + 543 + while (1) { 544 + e = container_of(node, struct wc_entry, rb_node); 545 + if (read_original_sector(wc, e) == block) 546 + break; 547 + node = (read_original_sector(wc, e) >= block ? 548 + e->rb_node.rb_left : e->rb_node.rb_right); 549 + if (unlikely(!node)) { 550 + if (!(flags & WFE_RETURN_FOLLOWING)) { 551 + return NULL; 552 + } 553 + if (read_original_sector(wc, e) >= block) { 554 + break; 555 + } else { 556 + node = rb_next(&e->rb_node); 557 + if (unlikely(!node)) { 558 + return NULL; 559 + } 560 + e = container_of(node, struct wc_entry, rb_node); 561 + break; 562 + } 563 + } 564 + } 565 + 566 + while (1) { 567 + struct wc_entry *e2; 568 + if (flags & WFE_LOWEST_SEQ) 569 + node = rb_prev(&e->rb_node); 570 + else 571 + node = rb_next(&e->rb_node); 572 + if (!node) 573 + return e; 574 + e2 = container_of(node, struct wc_entry, rb_node); 575 + if (read_original_sector(wc, e2) != block) 576 + return e; 577 + e = e2; 578 + } 579 + } 580 + 581 + static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins) 582 + { 583 + struct wc_entry *e; 584 + struct rb_node **node = &wc->tree.rb_node, *parent = NULL; 585 + 586 + while (*node) { 587 + e = container_of(*node, struct wc_entry, rb_node); 588 + parent = &e->rb_node; 589 + if (read_original_sector(wc, e) > read_original_sector(wc, ins)) 590 + node = &parent->rb_left; 591 + else 592 + node = &parent->rb_right; 593 + } 594 + rb_link_node(&ins->rb_node, parent, node); 595 + rb_insert_color(&ins->rb_node, &wc->tree); 596 + list_add(&ins->lru, &wc->lru); 597 + } 598 + 599 + static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e) 600 + { 601 + list_del(&e->lru); 602 + rb_erase(&e->rb_node, &wc->tree); 603 + } 604 + 605 + static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e) 606 + { 607 + if (WC_MODE_SORT_FREELIST(wc)) { 608 + struct rb_node **node = &wc->freetree.rb_node, *parent = NULL; 609 + if (unlikely(!*node)) 610 + wc->current_free = e; 611 + while (*node) { 612 + parent = *node; 613 + if (&e->rb_node < *node) 614 + node = &parent->rb_left; 615 + else 616 + node = &parent->rb_right; 617 + } 618 + rb_link_node(&e->rb_node, parent, node); 619 + rb_insert_color(&e->rb_node, &wc->freetree); 620 + } else { 621 + list_add_tail(&e->lru, &wc->freelist); 622 + } 623 + wc->freelist_size++; 624 + } 625 + 626 + static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) 627 + { 628 + struct wc_entry *e; 629 + 630 + if (WC_MODE_SORT_FREELIST(wc)) { 631 + struct rb_node *next; 632 + if (unlikely(!wc->current_free)) 633 + return NULL; 634 + e = wc->current_free; 635 + next = rb_next(&e->rb_node); 636 + rb_erase(&e->rb_node, &wc->freetree); 637 + if (unlikely(!next)) 638 + next 
= rb_first(&wc->freetree); 639 + wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL; 640 + } else { 641 + if (unlikely(list_empty(&wc->freelist))) 642 + return NULL; 643 + e = container_of(wc->freelist.next, struct wc_entry, lru); 644 + list_del(&e->lru); 645 + } 646 + wc->freelist_size--; 647 + if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) 648 + queue_work(wc->writeback_wq, &wc->writeback_work); 649 + 650 + return e; 651 + } 652 + 653 + static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e) 654 + { 655 + writecache_unlink(wc, e); 656 + writecache_add_to_freelist(wc, e); 657 + clear_seq_count(wc, e); 658 + writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 659 + if (unlikely(waitqueue_active(&wc->freelist_wait))) 660 + wake_up(&wc->freelist_wait); 661 + } 662 + 663 + static void writecache_wait_on_freelist(struct dm_writecache *wc) 664 + { 665 + DEFINE_WAIT(wait); 666 + 667 + prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE); 668 + wc_unlock(wc); 669 + io_schedule(); 670 + finish_wait(&wc->freelist_wait, &wait); 671 + wc_lock(wc); 672 + } 673 + 674 + static void writecache_poison_lists(struct dm_writecache *wc) 675 + { 676 + /* 677 + * Catch incorrect access to these values while the device is suspended. 678 + */ 679 + memset(&wc->tree, -1, sizeof wc->tree); 680 + wc->lru.next = LIST_POISON1; 681 + wc->lru.prev = LIST_POISON2; 682 + wc->freelist.next = LIST_POISON1; 683 + wc->freelist.prev = LIST_POISON2; 684 + } 685 + 686 + static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e) 687 + { 688 + writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 689 + if (WC_MODE_PMEM(wc)) 690 + writecache_flush_region(wc, memory_data(wc, e), wc->block_size); 691 + } 692 + 693 + static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e) 694 + { 695 + return read_seq_count(wc, e) < wc->seq_count; 696 + } 697 + 698 + static void writecache_flush(struct dm_writecache *wc) 699 + { 700 + struct wc_entry *e, *e2; 701 + bool need_flush_after_free; 702 + 703 + wc->uncommitted_blocks = 0; 704 + del_timer(&wc->autocommit_timer); 705 + 706 + if (list_empty(&wc->lru)) 707 + return; 708 + 709 + e = container_of(wc->lru.next, struct wc_entry, lru); 710 + if (writecache_entry_is_committed(wc, e)) { 711 + if (wc->overwrote_committed) { 712 + writecache_wait_for_ios(wc, WRITE); 713 + writecache_disk_flush(wc, wc->ssd_dev); 714 + wc->overwrote_committed = false; 715 + } 716 + return; 717 + } 718 + while (1) { 719 + writecache_flush_entry(wc, e); 720 + if (unlikely(e->lru.next == &wc->lru)) 721 + break; 722 + e2 = container_of(e->lru.next, struct wc_entry, lru); 723 + if (writecache_entry_is_committed(wc, e2)) 724 + break; 725 + e = e2; 726 + cond_resched(); 727 + } 728 + writecache_commit_flushed(wc); 729 + 730 + writecache_wait_for_ios(wc, WRITE); 731 + 732 + wc->seq_count++; 733 + pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); 734 + writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count); 735 + writecache_commit_flushed(wc); 736 + 737 + wc->overwrote_committed = false; 738 + 739 + need_flush_after_free = false; 740 + while (1) { 741 + /* Free another committed entry with lower seq-count */ 742 + struct rb_node *rb_node = rb_prev(&e->rb_node); 743 + 744 + if (rb_node) { 745 + e2 = container_of(rb_node, struct wc_entry, rb_node); 746 + if (read_original_sector(wc, e2) == 
read_original_sector(wc, e) && 747 + likely(!e2->write_in_progress)) { 748 + writecache_free_entry(wc, e2); 749 + need_flush_after_free = true; 750 + } 751 + } 752 + if (unlikely(e->lru.prev == &wc->lru)) 753 + break; 754 + e = container_of(e->lru.prev, struct wc_entry, lru); 755 + cond_resched(); 756 + } 757 + 758 + if (need_flush_after_free) 759 + writecache_commit_flushed(wc); 760 + } 761 + 762 + static void writecache_flush_work(struct work_struct *work) 763 + { 764 + struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work); 765 + 766 + wc_lock(wc); 767 + writecache_flush(wc); 768 + wc_unlock(wc); 769 + } 770 + 771 + static void writecache_autocommit_timer(struct timer_list *t) 772 + { 773 + struct dm_writecache *wc = from_timer(wc, t, autocommit_timer); 774 + if (!writecache_has_error(wc)) 775 + queue_work(wc->writeback_wq, &wc->flush_work); 776 + } 777 + 778 + static void writecache_schedule_autocommit(struct dm_writecache *wc) 779 + { 780 + if (!timer_pending(&wc->autocommit_timer)) 781 + mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies); 782 + } 783 + 784 + static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end) 785 + { 786 + struct wc_entry *e; 787 + bool discarded_something = false; 788 + 789 + e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ); 790 + if (unlikely(!e)) 791 + return; 792 + 793 + while (read_original_sector(wc, e) < end) { 794 + struct rb_node *node = rb_next(&e->rb_node); 795 + 796 + if (likely(!e->write_in_progress)) { 797 + if (!discarded_something) { 798 + writecache_wait_for_ios(wc, READ); 799 + writecache_wait_for_ios(wc, WRITE); 800 + discarded_something = true; 801 + } 802 + writecache_free_entry(wc, e); 803 + } 804 + 805 + if (!node) 806 + break; 807 + 808 + e = container_of(node, struct wc_entry, rb_node); 809 + } 810 + 811 + if (discarded_something) 812 + writecache_commit_flushed(wc); 813 + } 814 + 815 + static bool writecache_wait_for_writeback(struct dm_writecache *wc) 816 + { 817 + if (wc->writeback_size) { 818 + writecache_wait_on_freelist(wc); 819 + return true; 820 + } 821 + return false; 822 + } 823 + 824 + static void writecache_suspend(struct dm_target *ti) 825 + { 826 + struct dm_writecache *wc = ti->private; 827 + bool flush_on_suspend; 828 + 829 + del_timer_sync(&wc->autocommit_timer); 830 + 831 + wc_lock(wc); 832 + writecache_flush(wc); 833 + flush_on_suspend = wc->flush_on_suspend; 834 + if (flush_on_suspend) { 835 + wc->flush_on_suspend = false; 836 + wc->writeback_all++; 837 + queue_work(wc->writeback_wq, &wc->writeback_work); 838 + } 839 + wc_unlock(wc); 840 + 841 + flush_workqueue(wc->writeback_wq); 842 + 843 + wc_lock(wc); 844 + if (flush_on_suspend) 845 + wc->writeback_all--; 846 + while (writecache_wait_for_writeback(wc)); 847 + 848 + if (WC_MODE_PMEM(wc)) 849 + persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); 850 + 851 + writecache_poison_lists(wc); 852 + 853 + wc_unlock(wc); 854 + } 855 + 856 + static int writecache_alloc_entries(struct dm_writecache *wc) 857 + { 858 + size_t b; 859 + 860 + if (wc->entries) 861 + return 0; 862 + wc->entries = vmalloc(sizeof(struct wc_entry) * wc->n_blocks); 863 + if (!wc->entries) 864 + return -ENOMEM; 865 + for (b = 0; b < wc->n_blocks; b++) { 866 + struct wc_entry *e = &wc->entries[b]; 867 + e->index = b; 868 + e->write_in_progress = false; 869 + } 870 + 871 + return 0; 872 + } 873 + 874 + static void writecache_resume(struct dm_target *ti) 875 + { 876 + struct dm_writecache *wc = 
ti->private; 877 + size_t b; 878 + bool need_flush = false; 879 + __le64 sb_seq_count; 880 + int r; 881 + 882 + wc_lock(wc); 883 + 884 + if (WC_MODE_PMEM(wc)) 885 + persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); 886 + 887 + wc->tree = RB_ROOT; 888 + INIT_LIST_HEAD(&wc->lru); 889 + if (WC_MODE_SORT_FREELIST(wc)) { 890 + wc->freetree = RB_ROOT; 891 + wc->current_free = NULL; 892 + } else { 893 + INIT_LIST_HEAD(&wc->freelist); 894 + } 895 + wc->freelist_size = 0; 896 + 897 + r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t)); 898 + if (r) { 899 + writecache_error(wc, r, "hardware memory error when reading superblock: %d", r); 900 + sb_seq_count = cpu_to_le64(0); 901 + } 902 + wc->seq_count = le64_to_cpu(sb_seq_count); 903 + 904 + #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS 905 + for (b = 0; b < wc->n_blocks; b++) { 906 + struct wc_entry *e = &wc->entries[b]; 907 + struct wc_memory_entry wme; 908 + if (writecache_has_error(wc)) { 909 + e->original_sector = -1; 910 + e->seq_count = -1; 911 + continue; 912 + } 913 + r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry)); 914 + if (r) { 915 + writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d", 916 + (unsigned long)b, r); 917 + e->original_sector = -1; 918 + e->seq_count = -1; 919 + } else { 920 + e->original_sector = le64_to_cpu(wme.original_sector); 921 + e->seq_count = le64_to_cpu(wme.seq_count); 922 + } 923 + } 924 + #endif 925 + for (b = 0; b < wc->n_blocks; b++) { 926 + struct wc_entry *e = &wc->entries[b]; 927 + if (!writecache_entry_is_committed(wc, e)) { 928 + if (read_seq_count(wc, e) != -1) { 929 + erase_this: 930 + clear_seq_count(wc, e); 931 + need_flush = true; 932 + } 933 + writecache_add_to_freelist(wc, e); 934 + } else { 935 + struct wc_entry *old; 936 + 937 + old = writecache_find_entry(wc, read_original_sector(wc, e), 0); 938 + if (!old) { 939 + writecache_insert_entry(wc, e); 940 + } else { 941 + if (read_seq_count(wc, old) == read_seq_count(wc, e)) { 942 + writecache_error(wc, -EINVAL, 943 + "two identical entries, position %llu, sector %llu, sequence %llu", 944 + (unsigned long long)b, (unsigned long long)read_original_sector(wc, e), 945 + (unsigned long long)read_seq_count(wc, e)); 946 + } 947 + if (read_seq_count(wc, old) > read_seq_count(wc, e)) { 948 + goto erase_this; 949 + } else { 950 + writecache_free_entry(wc, old); 951 + writecache_insert_entry(wc, e); 952 + need_flush = true; 953 + } 954 + } 955 + } 956 + cond_resched(); 957 + } 958 + 959 + if (need_flush) { 960 + writecache_flush_all_metadata(wc); 961 + writecache_commit_flushed(wc); 962 + } 963 + 964 + wc_unlock(wc); 965 + } 966 + 967 + static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 968 + { 969 + if (argc != 1) 970 + return -EINVAL; 971 + 972 + wc_lock(wc); 973 + if (dm_suspended(wc->ti)) { 974 + wc_unlock(wc); 975 + return -EBUSY; 976 + } 977 + if (writecache_has_error(wc)) { 978 + wc_unlock(wc); 979 + return -EIO; 980 + } 981 + 982 + writecache_flush(wc); 983 + wc->writeback_all++; 984 + queue_work(wc->writeback_wq, &wc->writeback_work); 985 + wc_unlock(wc); 986 + 987 + flush_workqueue(wc->writeback_wq); 988 + 989 + wc_lock(wc); 990 + wc->writeback_all--; 991 + if (writecache_has_error(wc)) { 992 + wc_unlock(wc); 993 + return -EIO; 994 + } 995 + wc_unlock(wc); 996 + 997 + return 0; 998 + } 999 + 1000 + static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc) 1001 + { 1002 + if (argc != 1) 
1003 + return -EINVAL; 1004 + 1005 + wc_lock(wc); 1006 + wc->flush_on_suspend = true; 1007 + wc_unlock(wc); 1008 + 1009 + return 0; 1010 + } 1011 + 1012 + static int writecache_message(struct dm_target *ti, unsigned argc, char **argv, 1013 + char *result, unsigned maxlen) 1014 + { 1015 + int r = -EINVAL; 1016 + struct dm_writecache *wc = ti->private; 1017 + 1018 + if (!strcasecmp(argv[0], "flush")) 1019 + r = process_flush_mesg(argc, argv, wc); 1020 + else if (!strcasecmp(argv[0], "flush_on_suspend")) 1021 + r = process_flush_on_suspend_mesg(argc, argv, wc); 1022 + else 1023 + DMERR("unrecognised message received: %s", argv[0]); 1024 + 1025 + return r; 1026 + } 1027 + 1028 + static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data) 1029 + { 1030 + void *buf; 1031 + unsigned long flags; 1032 + unsigned size; 1033 + int rw = bio_data_dir(bio); 1034 + unsigned remaining_size = wc->block_size; 1035 + 1036 + do { 1037 + struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter); 1038 + buf = bvec_kmap_irq(&bv, &flags); 1039 + size = bv.bv_len; 1040 + if (unlikely(size > remaining_size)) 1041 + size = remaining_size; 1042 + 1043 + if (rw == READ) { 1044 + int r; 1045 + r = memcpy_mcsafe(buf, data, size); 1046 + flush_dcache_page(bio_page(bio)); 1047 + if (unlikely(r)) { 1048 + writecache_error(wc, r, "hardware memory error when reading data: %d", r); 1049 + bio->bi_status = BLK_STS_IOERR; 1050 + } 1051 + } else { 1052 + flush_dcache_page(bio_page(bio)); 1053 + memcpy_flushcache(data, buf, size); 1054 + } 1055 + 1056 + bvec_kunmap_irq(buf, &flags); 1057 + 1058 + data = (char *)data + size; 1059 + remaining_size -= size; 1060 + bio_advance(bio, size); 1061 + } while (unlikely(remaining_size)); 1062 + } 1063 + 1064 + static int writecache_flush_thread(void *data) 1065 + { 1066 + struct dm_writecache *wc = data; 1067 + 1068 + while (1) { 1069 + struct bio *bio; 1070 + 1071 + wc_lock(wc); 1072 + bio = bio_list_pop(&wc->flush_list); 1073 + if (!bio) { 1074 + set_current_state(TASK_INTERRUPTIBLE); 1075 + wc_unlock(wc); 1076 + 1077 + if (unlikely(kthread_should_stop())) { 1078 + set_current_state(TASK_RUNNING); 1079 + break; 1080 + } 1081 + 1082 + schedule(); 1083 + continue; 1084 + } 1085 + 1086 + if (bio_op(bio) == REQ_OP_DISCARD) { 1087 + writecache_discard(wc, bio->bi_iter.bi_sector, 1088 + bio_end_sector(bio)); 1089 + wc_unlock(wc); 1090 + bio_set_dev(bio, wc->dev->bdev); 1091 + generic_make_request(bio); 1092 + } else { 1093 + writecache_flush(wc); 1094 + wc_unlock(wc); 1095 + if (writecache_has_error(wc)) 1096 + bio->bi_status = BLK_STS_IOERR; 1097 + bio_endio(bio); 1098 + } 1099 + } 1100 + 1101 + return 0; 1102 + } 1103 + 1104 + static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio) 1105 + { 1106 + if (bio_list_empty(&wc->flush_list)) 1107 + wake_up_process(wc->flush_thread); 1108 + bio_list_add(&wc->flush_list, bio); 1109 + } 1110 + 1111 + static int writecache_map(struct dm_target *ti, struct bio *bio) 1112 + { 1113 + struct wc_entry *e; 1114 + struct dm_writecache *wc = ti->private; 1115 + 1116 + bio->bi_private = NULL; 1117 + 1118 + wc_lock(wc); 1119 + 1120 + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { 1121 + if (writecache_has_error(wc)) 1122 + goto unlock_error; 1123 + if (WC_MODE_PMEM(wc)) { 1124 + writecache_flush(wc); 1125 + if (writecache_has_error(wc)) 1126 + goto unlock_error; 1127 + goto unlock_submit; 1128 + } else { 1129 + writecache_offload_bio(wc, bio); 1130 + goto unlock_return; 1131 + } 1132 + } 1133 + 1134 + bio->bi_iter.bi_sector = 
dm_target_offset(ti, bio->bi_iter.bi_sector); 1135 + 1136 + if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) & 1137 + (wc->block_size / 512 - 1)) != 0)) { 1138 + DMERR("I/O is not aligned, sector %llu, size %u, block size %u", 1139 + (unsigned long long)bio->bi_iter.bi_sector, 1140 + bio->bi_iter.bi_size, wc->block_size); 1141 + goto unlock_error; 1142 + } 1143 + 1144 + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { 1145 + if (writecache_has_error(wc)) 1146 + goto unlock_error; 1147 + if (WC_MODE_PMEM(wc)) { 1148 + writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio)); 1149 + goto unlock_remap_origin; 1150 + } else { 1151 + writecache_offload_bio(wc, bio); 1152 + goto unlock_return; 1153 + } 1154 + } 1155 + 1156 + if (bio_data_dir(bio) == READ) { 1157 + read_next_block: 1158 + e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING); 1159 + if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) { 1160 + if (WC_MODE_PMEM(wc)) { 1161 + bio_copy_block(wc, bio, memory_data(wc, e)); 1162 + if (bio->bi_iter.bi_size) 1163 + goto read_next_block; 1164 + goto unlock_submit; 1165 + } else { 1166 + dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); 1167 + bio_set_dev(bio, wc->ssd_dev->bdev); 1168 + bio->bi_iter.bi_sector = cache_sector(wc, e); 1169 + if (!writecache_entry_is_committed(wc, e)) 1170 + writecache_wait_for_ios(wc, WRITE); 1171 + goto unlock_remap; 1172 + } 1173 + } else { 1174 + if (e) { 1175 + sector_t next_boundary = 1176 + read_original_sector(wc, e) - bio->bi_iter.bi_sector; 1177 + if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) { 1178 + dm_accept_partial_bio(bio, next_boundary); 1179 + } 1180 + } 1181 + goto unlock_remap_origin; 1182 + } 1183 + } else { 1184 + do { 1185 + if (writecache_has_error(wc)) 1186 + goto unlock_error; 1187 + e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0); 1188 + if (e) { 1189 + if (!writecache_entry_is_committed(wc, e)) 1190 + goto bio_copy; 1191 + if (!WC_MODE_PMEM(wc) && !e->write_in_progress) { 1192 + wc->overwrote_committed = true; 1193 + goto bio_copy; 1194 + } 1195 + } 1196 + e = writecache_pop_from_freelist(wc); 1197 + if (unlikely(!e)) { 1198 + writecache_wait_on_freelist(wc); 1199 + continue; 1200 + } 1201 + write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count); 1202 + writecache_insert_entry(wc, e); 1203 + wc->uncommitted_blocks++; 1204 + bio_copy: 1205 + if (WC_MODE_PMEM(wc)) { 1206 + bio_copy_block(wc, bio, memory_data(wc, e)); 1207 + } else { 1208 + dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT); 1209 + bio_set_dev(bio, wc->ssd_dev->bdev); 1210 + bio->bi_iter.bi_sector = cache_sector(wc, e); 1211 + if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) { 1212 + wc->uncommitted_blocks = 0; 1213 + queue_work(wc->writeback_wq, &wc->flush_work); 1214 + } else { 1215 + writecache_schedule_autocommit(wc); 1216 + } 1217 + goto unlock_remap; 1218 + } 1219 + } while (bio->bi_iter.bi_size); 1220 + 1221 + if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) 1222 + writecache_flush(wc); 1223 + else 1224 + writecache_schedule_autocommit(wc); 1225 + goto unlock_submit; 1226 + } 1227 + 1228 + unlock_remap_origin: 1229 + bio_set_dev(bio, wc->dev->bdev); 1230 + wc_unlock(wc); 1231 + return DM_MAPIO_REMAPPED; 1232 + 1233 + unlock_remap: 1234 + /* make sure that writecache_end_io decrements bio_in_progress: */ 1235 + bio->bi_private = (void *)1; 1236 + atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]); 1237 + 
wc_unlock(wc); 1238 + return DM_MAPIO_REMAPPED; 1239 + 1240 + unlock_submit: 1241 + wc_unlock(wc); 1242 + bio_endio(bio); 1243 + return DM_MAPIO_SUBMITTED; 1244 + 1245 + unlock_return: 1246 + wc_unlock(wc); 1247 + return DM_MAPIO_SUBMITTED; 1248 + 1249 + unlock_error: 1250 + wc_unlock(wc); 1251 + bio_io_error(bio); 1252 + return DM_MAPIO_SUBMITTED; 1253 + } 1254 + 1255 + static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status) 1256 + { 1257 + struct dm_writecache *wc = ti->private; 1258 + 1259 + if (bio->bi_private != NULL) { 1260 + int dir = bio_data_dir(bio); 1261 + if (atomic_dec_and_test(&wc->bio_in_progress[dir])) 1262 + if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir]))) 1263 + wake_up(&wc->bio_in_progress_wait[dir]); 1264 + } 1265 + return 0; 1266 + } 1267 + 1268 + static int writecache_iterate_devices(struct dm_target *ti, 1269 + iterate_devices_callout_fn fn, void *data) 1270 + { 1271 + struct dm_writecache *wc = ti->private; 1272 + 1273 + return fn(ti, wc->dev, 0, ti->len, data); 1274 + } 1275 + 1276 + static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits) 1277 + { 1278 + struct dm_writecache *wc = ti->private; 1279 + 1280 + if (limits->logical_block_size < wc->block_size) 1281 + limits->logical_block_size = wc->block_size; 1282 + 1283 + if (limits->physical_block_size < wc->block_size) 1284 + limits->physical_block_size = wc->block_size; 1285 + 1286 + if (limits->io_min < wc->block_size) 1287 + limits->io_min = wc->block_size; 1288 + } 1289 + 1290 + 1291 + static void writecache_writeback_endio(struct bio *bio) 1292 + { 1293 + struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio); 1294 + struct dm_writecache *wc = wb->wc; 1295 + unsigned long flags; 1296 + 1297 + raw_spin_lock_irqsave(&wc->endio_list_lock, flags); 1298 + if (unlikely(list_empty(&wc->endio_list))) 1299 + wake_up_process(wc->endio_thread); 1300 + list_add_tail(&wb->endio_entry, &wc->endio_list); 1301 + raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags); 1302 + } 1303 + 1304 + static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr) 1305 + { 1306 + struct copy_struct *c = ptr; 1307 + struct dm_writecache *wc = c->wc; 1308 + 1309 + c->error = likely(!(read_err | write_err)) ? 
0 : -EIO; 1310 + 1311 + raw_spin_lock_irq(&wc->endio_list_lock); 1312 + if (unlikely(list_empty(&wc->endio_list))) 1313 + wake_up_process(wc->endio_thread); 1314 + list_add_tail(&c->endio_entry, &wc->endio_list); 1315 + raw_spin_unlock_irq(&wc->endio_list_lock); 1316 + } 1317 + 1318 + static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list) 1319 + { 1320 + unsigned i; 1321 + struct writeback_struct *wb; 1322 + struct wc_entry *e; 1323 + unsigned long n_walked = 0; 1324 + 1325 + do { 1326 + wb = list_entry(list->next, struct writeback_struct, endio_entry); 1327 + list_del(&wb->endio_entry); 1328 + 1329 + if (unlikely(wb->bio.bi_status != BLK_STS_OK)) 1330 + writecache_error(wc, blk_status_to_errno(wb->bio.bi_status), 1331 + "write error %d", wb->bio.bi_status); 1332 + i = 0; 1333 + do { 1334 + e = wb->wc_list[i]; 1335 + BUG_ON(!e->write_in_progress); 1336 + e->write_in_progress = false; 1337 + INIT_LIST_HEAD(&e->lru); 1338 + if (!writecache_has_error(wc)) 1339 + writecache_free_entry(wc, e); 1340 + BUG_ON(!wc->writeback_size); 1341 + wc->writeback_size--; 1342 + n_walked++; 1343 + if (unlikely(n_walked >= ENDIO_LATENCY)) { 1344 + writecache_commit_flushed(wc); 1345 + wc_unlock(wc); 1346 + wc_lock(wc); 1347 + n_walked = 0; 1348 + } 1349 + } while (++i < wb->wc_list_n); 1350 + 1351 + if (wb->wc_list != wb->wc_list_inline) 1352 + kfree(wb->wc_list); 1353 + bio_put(&wb->bio); 1354 + } while (!list_empty(list)); 1355 + } 1356 + 1357 + static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list) 1358 + { 1359 + struct copy_struct *c; 1360 + struct wc_entry *e; 1361 + 1362 + do { 1363 + c = list_entry(list->next, struct copy_struct, endio_entry); 1364 + list_del(&c->endio_entry); 1365 + 1366 + if (unlikely(c->error)) 1367 + writecache_error(wc, c->error, "copy error"); 1368 + 1369 + e = c->e; 1370 + do { 1371 + BUG_ON(!e->write_in_progress); 1372 + e->write_in_progress = false; 1373 + INIT_LIST_HEAD(&e->lru); 1374 + if (!writecache_has_error(wc)) 1375 + writecache_free_entry(wc, e); 1376 + 1377 + BUG_ON(!wc->writeback_size); 1378 + wc->writeback_size--; 1379 + e++; 1380 + } while (--c->n_entries); 1381 + mempool_free(c, &wc->copy_pool); 1382 + } while (!list_empty(list)); 1383 + } 1384 + 1385 + static int writecache_endio_thread(void *data) 1386 + { 1387 + struct dm_writecache *wc = data; 1388 + 1389 + while (1) { 1390 + struct list_head list; 1391 + 1392 + raw_spin_lock_irq(&wc->endio_list_lock); 1393 + if (!list_empty(&wc->endio_list)) 1394 + goto pop_from_list; 1395 + set_current_state(TASK_INTERRUPTIBLE); 1396 + raw_spin_unlock_irq(&wc->endio_list_lock); 1397 + 1398 + if (unlikely(kthread_should_stop())) { 1399 + set_current_state(TASK_RUNNING); 1400 + break; 1401 + } 1402 + 1403 + schedule(); 1404 + 1405 + continue; 1406 + 1407 + pop_from_list: 1408 + list = wc->endio_list; 1409 + list.next->prev = list.prev->next = &list; 1410 + INIT_LIST_HEAD(&wc->endio_list); 1411 + raw_spin_unlock_irq(&wc->endio_list_lock); 1412 + 1413 + if (!WC_MODE_FUA(wc)) 1414 + writecache_disk_flush(wc, wc->dev); 1415 + 1416 + wc_lock(wc); 1417 + 1418 + if (WC_MODE_PMEM(wc)) { 1419 + __writecache_endio_pmem(wc, &list); 1420 + } else { 1421 + __writecache_endio_ssd(wc, &list); 1422 + writecache_wait_for_ios(wc, READ); 1423 + } 1424 + 1425 + writecache_commit_flushed(wc); 1426 + 1427 + wc_unlock(wc); 1428 + } 1429 + 1430 + return 0; 1431 + } 1432 + 1433 + static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp) 1434 + { 1435 + struct 
dm_writecache *wc = wb->wc; 1436 + unsigned block_size = wc->block_size; 1437 + void *address = memory_data(wc, e); 1438 + 1439 + persistent_memory_flush_cache(address, block_size); 1440 + return bio_add_page(&wb->bio, persistent_memory_page(address), 1441 + block_size, persistent_memory_page_offset(address)) != 0; 1442 + } 1443 + 1444 + struct writeback_list { 1445 + struct list_head list; 1446 + size_t size; 1447 + }; 1448 + 1449 + static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl) 1450 + { 1451 + if (unlikely(wc->max_writeback_jobs)) { 1452 + if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) { 1453 + wc_lock(wc); 1454 + while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs) 1455 + writecache_wait_on_freelist(wc); 1456 + wc_unlock(wc); 1457 + } 1458 + } 1459 + cond_resched(); 1460 + } 1461 + 1462 + static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl) 1463 + { 1464 + struct wc_entry *e, *f; 1465 + struct bio *bio; 1466 + struct writeback_struct *wb; 1467 + unsigned max_pages; 1468 + 1469 + while (wbl->size) { 1470 + wbl->size--; 1471 + e = container_of(wbl->list.prev, struct wc_entry, lru); 1472 + list_del(&e->lru); 1473 + 1474 + max_pages = e->wc_list_contiguous; 1475 + 1476 + bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set); 1477 + wb = container_of(bio, struct writeback_struct, bio); 1478 + wb->wc = wc; 1479 + wb->bio.bi_end_io = writecache_writeback_endio; 1480 + bio_set_dev(&wb->bio, wc->dev->bdev); 1481 + wb->bio.bi_iter.bi_sector = read_original_sector(wc, e); 1482 + wb->page_offset = PAGE_SIZE; 1483 + if (max_pages <= WB_LIST_INLINE || 1484 + unlikely(!(wb->wc_list = kmalloc(max_pages * sizeof(struct wc_entry *), 1485 + GFP_NOIO | __GFP_NORETRY | 1486 + __GFP_NOMEMALLOC | __GFP_NOWARN)))) { 1487 + wb->wc_list = wb->wc_list_inline; 1488 + max_pages = WB_LIST_INLINE; 1489 + } 1490 + 1491 + BUG_ON(!wc_add_block(wb, e, GFP_NOIO)); 1492 + 1493 + wb->wc_list[0] = e; 1494 + wb->wc_list_n = 1; 1495 + 1496 + while (wbl->size && wb->wc_list_n < max_pages) { 1497 + f = container_of(wbl->list.prev, struct wc_entry, lru); 1498 + if (read_original_sector(wc, f) != 1499 + read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT)) 1500 + break; 1501 + if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN)) 1502 + break; 1503 + wbl->size--; 1504 + list_del(&f->lru); 1505 + wb->wc_list[wb->wc_list_n++] = f; 1506 + e = f; 1507 + } 1508 + bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA); 1509 + if (writecache_has_error(wc)) { 1510 + bio->bi_status = BLK_STS_IOERR; 1511 + bio_endio(&wb->bio); 1512 + } else { 1513 + submit_bio(&wb->bio); 1514 + } 1515 + 1516 + __writeback_throttle(wc, wbl); 1517 + } 1518 + } 1519 + 1520 + static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl) 1521 + { 1522 + struct wc_entry *e, *f; 1523 + struct dm_io_region from, to; 1524 + struct copy_struct *c; 1525 + 1526 + while (wbl->size) { 1527 + unsigned n_sectors; 1528 + 1529 + wbl->size--; 1530 + e = container_of(wbl->list.prev, struct wc_entry, lru); 1531 + list_del(&e->lru); 1532 + 1533 + n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT); 1534 + 1535 + from.bdev = wc->ssd_dev->bdev; 1536 + from.sector = cache_sector(wc, e); 1537 + from.count = n_sectors; 1538 + to.bdev = wc->dev->bdev; 1539 + to.sector = read_original_sector(wc, e); 1540 + to.count = n_sectors; 1541 + 1542 + c = mempool_alloc(&wc->copy_pool, GFP_NOIO); 1543 
+ c->wc = wc; 1544 + c->e = e; 1545 + c->n_entries = e->wc_list_contiguous; 1546 + 1547 + while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) { 1548 + wbl->size--; 1549 + f = container_of(wbl->list.prev, struct wc_entry, lru); 1550 + BUG_ON(f != e + 1); 1551 + list_del(&f->lru); 1552 + e = f; 1553 + } 1554 + 1555 + dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); 1556 + 1557 + __writeback_throttle(wc, wbl); 1558 + } 1559 + } 1560 + 1561 + static void writecache_writeback(struct work_struct *work) 1562 + { 1563 + struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work); 1564 + struct blk_plug plug; 1565 + struct wc_entry *e, *f, *g; 1566 + struct rb_node *node, *next_node; 1567 + struct list_head skipped; 1568 + struct writeback_list wbl; 1569 + unsigned long n_walked; 1570 + 1571 + wc_lock(wc); 1572 + restart: 1573 + if (writecache_has_error(wc)) { 1574 + wc_unlock(wc); 1575 + return; 1576 + } 1577 + 1578 + if (unlikely(wc->writeback_all)) { 1579 + if (writecache_wait_for_writeback(wc)) 1580 + goto restart; 1581 + } 1582 + 1583 + if (wc->overwrote_committed) { 1584 + writecache_wait_for_ios(wc, WRITE); 1585 + } 1586 + 1587 + n_walked = 0; 1588 + INIT_LIST_HEAD(&skipped); 1589 + INIT_LIST_HEAD(&wbl.list); 1590 + wbl.size = 0; 1591 + while (!list_empty(&wc->lru) && 1592 + (wc->writeback_all || 1593 + wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark)) { 1594 + 1595 + n_walked++; 1596 + if (unlikely(n_walked > WRITEBACK_LATENCY) && 1597 + likely(!wc->writeback_all) && likely(!dm_suspended(wc->ti))) { 1598 + queue_work(wc->writeback_wq, &wc->writeback_work); 1599 + break; 1600 + } 1601 + 1602 + e = container_of(wc->lru.prev, struct wc_entry, lru); 1603 + BUG_ON(e->write_in_progress); 1604 + if (unlikely(!writecache_entry_is_committed(wc, e))) { 1605 + writecache_flush(wc); 1606 + } 1607 + node = rb_prev(&e->rb_node); 1608 + if (node) { 1609 + f = container_of(node, struct wc_entry, rb_node); 1610 + if (unlikely(read_original_sector(wc, f) == 1611 + read_original_sector(wc, e))) { 1612 + BUG_ON(!f->write_in_progress); 1613 + list_del(&e->lru); 1614 + list_add(&e->lru, &skipped); 1615 + cond_resched(); 1616 + continue; 1617 + } 1618 + } 1619 + wc->writeback_size++; 1620 + list_del(&e->lru); 1621 + list_add(&e->lru, &wbl.list); 1622 + wbl.size++; 1623 + e->write_in_progress = true; 1624 + e->wc_list_contiguous = 1; 1625 + 1626 + f = e; 1627 + 1628 + while (1) { 1629 + next_node = rb_next(&f->rb_node); 1630 + if (unlikely(!next_node)) 1631 + break; 1632 + g = container_of(next_node, struct wc_entry, rb_node); 1633 + if (read_original_sector(wc, g) == 1634 + read_original_sector(wc, f)) { 1635 + f = g; 1636 + continue; 1637 + } 1638 + if (read_original_sector(wc, g) != 1639 + read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT)) 1640 + break; 1641 + if (unlikely(g->write_in_progress)) 1642 + break; 1643 + if (unlikely(!writecache_entry_is_committed(wc, g))) 1644 + break; 1645 + 1646 + if (!WC_MODE_PMEM(wc)) { 1647 + if (g != f + 1) 1648 + break; 1649 + } 1650 + 1651 + n_walked++; 1652 + //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all)) 1653 + // break; 1654 + 1655 + wc->writeback_size++; 1656 + list_del(&g->lru); 1657 + list_add(&g->lru, &wbl.list); 1658 + wbl.size++; 1659 + g->write_in_progress = true; 1660 + g->wc_list_contiguous = BIO_MAX_PAGES; 1661 + f = g; 1662 + e->wc_list_contiguous++; 1663 + if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES)) 1664 + break; 1665 + } 1666 + 
cond_resched(); 1667 + } 1668 + 1669 + if (!list_empty(&skipped)) { 1670 + list_splice_tail(&skipped, &wc->lru); 1671 + /* 1672 + * If we didn't do any progress, we must wait until some 1673 + * writeback finishes to avoid burning CPU in a loop 1674 + */ 1675 + if (unlikely(!wbl.size)) 1676 + writecache_wait_for_writeback(wc); 1677 + } 1678 + 1679 + wc_unlock(wc); 1680 + 1681 + blk_start_plug(&plug); 1682 + 1683 + if (WC_MODE_PMEM(wc)) 1684 + __writecache_writeback_pmem(wc, &wbl); 1685 + else 1686 + __writecache_writeback_ssd(wc, &wbl); 1687 + 1688 + blk_finish_plug(&plug); 1689 + 1690 + if (unlikely(wc->writeback_all)) { 1691 + wc_lock(wc); 1692 + while (writecache_wait_for_writeback(wc)); 1693 + wc_unlock(wc); 1694 + } 1695 + } 1696 + 1697 + static int calculate_memory_size(uint64_t device_size, unsigned block_size, 1698 + size_t *n_blocks_p, size_t *n_metadata_blocks_p) 1699 + { 1700 + uint64_t n_blocks, offset; 1701 + struct wc_entry e; 1702 + 1703 + n_blocks = device_size; 1704 + do_div(n_blocks, block_size + sizeof(struct wc_memory_entry)); 1705 + 1706 + while (1) { 1707 + if (!n_blocks) 1708 + return -ENOSPC; 1709 + /* Verify the following entries[n_blocks] won't overflow */ 1710 + if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) / 1711 + sizeof(struct wc_memory_entry))) 1712 + return -EFBIG; 1713 + offset = offsetof(struct wc_memory_superblock, entries[n_blocks]); 1714 + offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1); 1715 + if (offset + n_blocks * block_size <= device_size) 1716 + break; 1717 + n_blocks--; 1718 + } 1719 + 1720 + /* check if the bit field overflows */ 1721 + e.index = n_blocks; 1722 + if (e.index != n_blocks) 1723 + return -EFBIG; 1724 + 1725 + if (n_blocks_p) 1726 + *n_blocks_p = n_blocks; 1727 + if (n_metadata_blocks_p) 1728 + *n_metadata_blocks_p = offset >> __ffs(block_size); 1729 + return 0; 1730 + } 1731 + 1732 + static int init_memory(struct dm_writecache *wc) 1733 + { 1734 + size_t b; 1735 + int r; 1736 + 1737 + r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL); 1738 + if (r) 1739 + return r; 1740 + 1741 + r = writecache_alloc_entries(wc); 1742 + if (r) 1743 + return r; 1744 + 1745 + for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++) 1746 + pmem_assign(sb(wc)->padding[b], cpu_to_le64(0)); 1747 + pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION)); 1748 + pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size)); 1749 + pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks)); 1750 + pmem_assign(sb(wc)->seq_count, cpu_to_le64(0)); 1751 + 1752 + for (b = 0; b < wc->n_blocks; b++) 1753 + write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); 1754 + 1755 + writecache_flush_all_metadata(wc); 1756 + writecache_commit_flushed(wc); 1757 + pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC)); 1758 + writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic); 1759 + writecache_commit_flushed(wc); 1760 + 1761 + return 0; 1762 + } 1763 + 1764 + static void writecache_dtr(struct dm_target *ti) 1765 + { 1766 + struct dm_writecache *wc = ti->private; 1767 + 1768 + if (!wc) 1769 + return; 1770 + 1771 + if (wc->endio_thread) 1772 + kthread_stop(wc->endio_thread); 1773 + 1774 + if (wc->flush_thread) 1775 + kthread_stop(wc->flush_thread); 1776 + 1777 + bioset_exit(&wc->bio_set); 1778 + 1779 + mempool_exit(&wc->copy_pool); 1780 + 1781 + if (wc->writeback_wq) 1782 + destroy_workqueue(wc->writeback_wq); 1783 + 1784 + if (wc->dev) 1785 + dm_put_device(ti, wc->dev); 1786 + 
1787 + if (wc->ssd_dev) 1788 + dm_put_device(ti, wc->ssd_dev); 1789 + 1790 + if (wc->entries) 1791 + vfree(wc->entries); 1792 + 1793 + if (wc->memory_map) { 1794 + if (WC_MODE_PMEM(wc)) 1795 + persistent_memory_release(wc); 1796 + else 1797 + vfree(wc->memory_map); 1798 + } 1799 + 1800 + if (wc->dm_kcopyd) 1801 + dm_kcopyd_client_destroy(wc->dm_kcopyd); 1802 + 1803 + if (wc->dm_io) 1804 + dm_io_client_destroy(wc->dm_io); 1805 + 1806 + if (wc->dirty_bitmap) 1807 + vfree(wc->dirty_bitmap); 1808 + 1809 + kfree(wc); 1810 + } 1811 + 1812 + static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) 1813 + { 1814 + struct dm_writecache *wc; 1815 + struct dm_arg_set as; 1816 + const char *string; 1817 + unsigned opt_params; 1818 + size_t offset, data_size; 1819 + int i, r; 1820 + char dummy; 1821 + int high_wm_percent = HIGH_WATERMARK; 1822 + int low_wm_percent = LOW_WATERMARK; 1823 + uint64_t x; 1824 + struct wc_memory_superblock s; 1825 + 1826 + static struct dm_arg _args[] = { 1827 + {0, 10, "Invalid number of feature args"}, 1828 + }; 1829 + 1830 + as.argc = argc; 1831 + as.argv = argv; 1832 + 1833 + wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL); 1834 + if (!wc) { 1835 + ti->error = "Cannot allocate writecache structure"; 1836 + r = -ENOMEM; 1837 + goto bad; 1838 + } 1839 + ti->private = wc; 1840 + wc->ti = ti; 1841 + 1842 + mutex_init(&wc->lock); 1843 + writecache_poison_lists(wc); 1844 + init_waitqueue_head(&wc->freelist_wait); 1845 + timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0); 1846 + 1847 + for (i = 0; i < 2; i++) { 1848 + atomic_set(&wc->bio_in_progress[i], 0); 1849 + init_waitqueue_head(&wc->bio_in_progress_wait[i]); 1850 + } 1851 + 1852 + wc->dm_io = dm_io_client_create(); 1853 + if (IS_ERR(wc->dm_io)) { 1854 + r = PTR_ERR(wc->dm_io); 1855 + ti->error = "Unable to allocate dm-io client"; 1856 + wc->dm_io = NULL; 1857 + goto bad; 1858 + } 1859 + 1860 + wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1); 1861 + if (!wc->writeback_wq) { 1862 + r = -ENOMEM; 1863 + ti->error = "Could not allocate writeback workqueue"; 1864 + goto bad; 1865 + } 1866 + INIT_WORK(&wc->writeback_work, writecache_writeback); 1867 + INIT_WORK(&wc->flush_work, writecache_flush_work); 1868 + 1869 + raw_spin_lock_init(&wc->endio_list_lock); 1870 + INIT_LIST_HEAD(&wc->endio_list); 1871 + wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio"); 1872 + if (IS_ERR(wc->endio_thread)) { 1873 + r = PTR_ERR(wc->endio_thread); 1874 + wc->endio_thread = NULL; 1875 + ti->error = "Couldn't spawn endio thread"; 1876 + goto bad; 1877 + } 1878 + wake_up_process(wc->endio_thread); 1879 + 1880 + /* 1881 + * Parse the mode (pmem or ssd) 1882 + */ 1883 + string = dm_shift_arg(&as); 1884 + if (!string) 1885 + goto bad_arguments; 1886 + 1887 + if (!strcasecmp(string, "s")) { 1888 + wc->pmem_mode = false; 1889 + } else if (!strcasecmp(string, "p")) { 1890 + #ifdef DM_WRITECACHE_HAS_PMEM 1891 + wc->pmem_mode = true; 1892 + wc->writeback_fua = true; 1893 + #else 1894 + /* 1895 + * If the architecture doesn't support persistent memory or 1896 + * the kernel doesn't support any DAX drivers, this driver can 1897 + * only be used in SSD-only mode.
1898 + */ 1899 + r = -EOPNOTSUPP; 1900 + ti->error = "Persistent memory or DAX not supported on this system"; 1901 + goto bad; 1902 + #endif 1903 + } else { 1904 + goto bad_arguments; 1905 + } 1906 + 1907 + if (WC_MODE_PMEM(wc)) { 1908 + r = bioset_init(&wc->bio_set, BIO_POOL_SIZE, 1909 + offsetof(struct writeback_struct, bio), 1910 + BIOSET_NEED_BVECS); 1911 + if (r) { 1912 + ti->error = "Could not allocate bio set"; 1913 + goto bad; 1914 + } 1915 + } else { 1916 + r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct)); 1917 + if (r) { 1918 + ti->error = "Could not allocate mempool"; 1919 + goto bad; 1920 + } 1921 + } 1922 + 1923 + /* 1924 + * Parse the origin data device 1925 + */ 1926 + string = dm_shift_arg(&as); 1927 + if (!string) 1928 + goto bad_arguments; 1929 + r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev); 1930 + if (r) { 1931 + ti->error = "Origin data device lookup failed"; 1932 + goto bad; 1933 + } 1934 + 1935 + /* 1936 + * Parse cache data device (be it pmem or ssd) 1937 + */ 1938 + string = dm_shift_arg(&as); 1939 + if (!string) 1940 + goto bad_arguments; 1941 + 1942 + r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev); 1943 + if (r) { 1944 + ti->error = "Cache data device lookup failed"; 1945 + goto bad; 1946 + } 1947 + wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode); 1948 + 1949 + if (WC_MODE_PMEM(wc)) { 1950 + r = persistent_memory_claim(wc); 1951 + if (r) { 1952 + ti->error = "Unable to map persistent memory for cache"; 1953 + goto bad; 1954 + } 1955 + } 1956 + 1957 + /* 1958 + * Parse the cache block size 1959 + */ 1960 + string = dm_shift_arg(&as); 1961 + if (!string) 1962 + goto bad_arguments; 1963 + if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 || 1964 + wc->block_size < 512 || wc->block_size > PAGE_SIZE || 1965 + (wc->block_size & (wc->block_size - 1))) { 1966 + r = -EINVAL; 1967 + ti->error = "Invalid block size"; 1968 + goto bad; 1969 + } 1970 + wc->block_size_bits = __ffs(wc->block_size); 1971 + 1972 + wc->max_writeback_jobs = MAX_WRITEBACK_JOBS; 1973 + wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? 
AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM; 1974 + wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC); 1975 + 1976 + /* 1977 + * Parse optional arguments 1978 + */ 1979 + r = dm_read_arg_group(_args, &as, &opt_params, &ti->error); 1980 + if (r) 1981 + goto bad; 1982 + 1983 + while (opt_params) { 1984 + string = dm_shift_arg(&as), opt_params--; 1985 + if (!strcasecmp(string, "high_watermark") && opt_params >= 1) { 1986 + string = dm_shift_arg(&as), opt_params--; 1987 + if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1) 1988 + goto invalid_optional; 1989 + if (high_wm_percent < 0 || high_wm_percent > 100) 1990 + goto invalid_optional; 1991 + wc->high_wm_percent_set = true; 1992 + } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) { 1993 + string = dm_shift_arg(&as), opt_params--; 1994 + if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1) 1995 + goto invalid_optional; 1996 + if (low_wm_percent < 0 || low_wm_percent > 100) 1997 + goto invalid_optional; 1998 + wc->low_wm_percent_set = true; 1999 + } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) { 2000 + string = dm_shift_arg(&as), opt_params--; 2001 + if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1) 2002 + goto invalid_optional; 2003 + wc->max_writeback_jobs_set = true; 2004 + } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) { 2005 + string = dm_shift_arg(&as), opt_params--; 2006 + if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1) 2007 + goto invalid_optional; 2008 + wc->autocommit_blocks_set = true; 2009 + } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) { 2010 + unsigned autocommit_msecs; 2011 + string = dm_shift_arg(&as), opt_params--; 2012 + if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1) 2013 + goto invalid_optional; 2014 + if (autocommit_msecs > 3600000) 2015 + goto invalid_optional; 2016 + wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); 2017 + wc->autocommit_time_set = true; 2018 + } else if (!strcasecmp(string, "fua")) { 2019 + if (WC_MODE_PMEM(wc)) { 2020 + wc->writeback_fua = true; 2021 + wc->writeback_fua_set = true; 2022 + } else goto invalid_optional; 2023 + } else if (!strcasecmp(string, "nofua")) { 2024 + if (WC_MODE_PMEM(wc)) { 2025 + wc->writeback_fua = false; 2026 + wc->writeback_fua_set = true; 2027 + } else goto invalid_optional; 2028 + } else { 2029 + invalid_optional: 2030 + r = -EINVAL; 2031 + ti->error = "Invalid optional argument"; 2032 + goto bad; 2033 + } 2034 + } 2035 + 2036 + if (high_wm_percent < low_wm_percent) { 2037 + r = -EINVAL; 2038 + ti->error = "High watermark must be greater than or equal to low watermark"; 2039 + goto bad; 2040 + } 2041 + 2042 + if (!WC_MODE_PMEM(wc)) { 2043 + struct dm_io_region region; 2044 + struct dm_io_request req; 2045 + size_t n_blocks, n_metadata_blocks; 2046 + uint64_t n_bitmap_bits; 2047 + 2048 + bio_list_init(&wc->flush_list); 2049 + wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush"); 2050 + if (IS_ERR(wc->flush_thread)) { 2051 + r = PTR_ERR(wc->flush_thread); 2052 + wc->flush_thread = NULL; 2053 + ti->error = "Couldn't spawn flush thread"; 2054 + goto bad; 2055 + } 2056 + wake_up_process(wc->flush_thread); 2057 + 2058 + r = calculate_memory_size(wc->memory_map_size, wc->block_size, 2059 + &n_blocks, &n_metadata_blocks); 2060 + if (r) { 2061 + ti->error = "Invalid device size"; 2062 + goto bad; 2063 + } 2064 + 2065 + n_bitmap_bits = (((uint64_t)n_metadata_blocks <<
wc->block_size_bits) + 2066 + BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY; 2067 + /* this is limitation of test_bit functions */ 2068 + if (n_bitmap_bits > 1U << 31) { 2069 + r = -EFBIG; 2070 + ti->error = "Invalid device size"; 2071 + goto bad; 2072 + } 2073 + 2074 + wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits); 2075 + if (!wc->memory_map) { 2076 + r = -ENOMEM; 2077 + ti->error = "Unable to allocate memory for metadata"; 2078 + goto bad; 2079 + } 2080 + 2081 + wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2082 + if (IS_ERR(wc->dm_kcopyd)) { 2083 + r = PTR_ERR(wc->dm_kcopyd); 2084 + ti->error = "Unable to allocate dm-kcopyd client"; 2085 + wc->dm_kcopyd = NULL; 2086 + goto bad; 2087 + } 2088 + 2089 + wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT); 2090 + wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) / 2091 + BITS_PER_LONG * sizeof(unsigned long); 2092 + wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size); 2093 + if (!wc->dirty_bitmap) { 2094 + r = -ENOMEM; 2095 + ti->error = "Unable to allocate dirty bitmap"; 2096 + goto bad; 2097 + } 2098 + 2099 + region.bdev = wc->ssd_dev->bdev; 2100 + region.sector = 0; 2101 + region.count = wc->metadata_sectors; 2102 + req.bi_op = REQ_OP_READ; 2103 + req.bi_op_flags = REQ_SYNC; 2104 + req.mem.type = DM_IO_VMA; 2105 + req.mem.ptr.vma = (char *)wc->memory_map; 2106 + req.client = wc->dm_io; 2107 + req.notify.fn = NULL; 2108 + 2109 + r = dm_io(&req, 1, &region, NULL); 2110 + if (r) { 2111 + ti->error = "Unable to read metadata"; 2112 + goto bad; 2113 + } 2114 + } 2115 + 2116 + r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock)); 2117 + if (r) { 2118 + ti->error = "Hardware memory error when reading superblock"; 2119 + goto bad; 2120 + } 2121 + if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) { 2122 + r = init_memory(wc); 2123 + if (r) { 2124 + ti->error = "Unable to initialize device"; 2125 + goto bad; 2126 + } 2127 + r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock)); 2128 + if (r) { 2129 + ti->error = "Hardware memory error when reading superblock"; 2130 + goto bad; 2131 + } 2132 + } 2133 + 2134 + if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) { 2135 + ti->error = "Invalid magic in the superblock"; 2136 + r = -EINVAL; 2137 + goto bad; 2138 + } 2139 + 2140 + if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) { 2141 + ti->error = "Invalid version in the superblock"; 2142 + r = -EINVAL; 2143 + goto bad; 2144 + } 2145 + 2146 + if (le32_to_cpu(s.block_size) != wc->block_size) { 2147 + ti->error = "Block size does not match superblock"; 2148 + r = -EINVAL; 2149 + goto bad; 2150 + } 2151 + 2152 + wc->n_blocks = le64_to_cpu(s.n_blocks); 2153 + 2154 + offset = wc->n_blocks * sizeof(struct wc_memory_entry); 2155 + if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) { 2156 + overflow: 2157 + ti->error = "Overflow in size calculation"; 2158 + r = -EINVAL; 2159 + goto bad; 2160 + } 2161 + offset += sizeof(struct wc_memory_superblock); 2162 + if (offset < sizeof(struct wc_memory_superblock)) 2163 + goto overflow; 2164 + offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1); 2165 + data_size = wc->n_blocks * (size_t)wc->block_size; 2166 + if (!offset || (data_size / wc->block_size != wc->n_blocks) || 2167 + (offset + data_size < offset)) 2168 + goto overflow; 2169 + if (offset + data_size > wc->memory_map_size) { 2170 + ti->error = "Memory area is too small"; 2171 + r = -EINVAL; 2172 + goto bad; 
2173 + } 2174 + 2175 + wc->metadata_sectors = offset >> SECTOR_SHIFT; 2176 + wc->block_start = (char *)sb(wc) + offset; 2177 + 2178 + x = (uint64_t)wc->n_blocks * (100 - high_wm_percent); 2179 + x += 50; 2180 + do_div(x, 100); 2181 + wc->freelist_high_watermark = x; 2182 + x = (uint64_t)wc->n_blocks * (100 - low_wm_percent); 2183 + x += 50; 2184 + do_div(x, 100); 2185 + wc->freelist_low_watermark = x; 2186 + 2187 + r = writecache_alloc_entries(wc); 2188 + if (r) { 2189 + ti->error = "Cannot allocate memory"; 2190 + goto bad; 2191 + } 2192 + 2193 + ti->num_flush_bios = 1; 2194 + ti->flush_supported = true; 2195 + ti->num_discard_bios = 1; 2196 + 2197 + if (WC_MODE_PMEM(wc)) 2198 + persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size); 2199 + 2200 + return 0; 2201 + 2202 + bad_arguments: 2203 + r = -EINVAL; 2204 + ti->error = "Bad arguments"; 2205 + bad: 2206 + writecache_dtr(ti); 2207 + return r; 2208 + } 2209 + 2210 + static void writecache_status(struct dm_target *ti, status_type_t type, 2211 + unsigned status_flags, char *result, unsigned maxlen) 2212 + { 2213 + struct dm_writecache *wc = ti->private; 2214 + unsigned extra_args; 2215 + unsigned sz = 0; 2216 + uint64_t x; 2217 + 2218 + switch (type) { 2219 + case STATUSTYPE_INFO: 2220 + DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc), 2221 + (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size, 2222 + (unsigned long long)wc->writeback_size); 2223 + break; 2224 + case STATUSTYPE_TABLE: 2225 + DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's', 2226 + wc->dev->name, wc->ssd_dev->name, wc->block_size); 2227 + extra_args = 0; 2228 + if (wc->high_wm_percent_set) 2229 + extra_args += 2; 2230 + if (wc->low_wm_percent_set) 2231 + extra_args += 2; 2232 + if (wc->max_writeback_jobs_set) 2233 + extra_args += 2; 2234 + if (wc->autocommit_blocks_set) 2235 + extra_args += 2; 2236 + if (wc->autocommit_time_set) 2237 + extra_args += 2; 2238 + if (wc->writeback_fua_set) 2239 + extra_args++; 2240 + 2241 + DMEMIT("%u", extra_args); 2242 + if (wc->high_wm_percent_set) { 2243 + x = (uint64_t)wc->freelist_high_watermark * 100; 2244 + x += wc->n_blocks / 2; 2245 + do_div(x, (size_t)wc->n_blocks); 2246 + DMEMIT(" high_watermark %u", 100 - (unsigned)x); 2247 + } 2248 + if (wc->low_wm_percent_set) { 2249 + x = (uint64_t)wc->freelist_low_watermark * 100; 2250 + x += wc->n_blocks / 2; 2251 + do_div(x, (size_t)wc->n_blocks); 2252 + DMEMIT(" low_watermark %u", 100 - (unsigned)x); 2253 + } 2254 + if (wc->max_writeback_jobs_set) 2255 + DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs); 2256 + if (wc->autocommit_blocks_set) 2257 + DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); 2258 + if (wc->autocommit_time_set) 2259 + DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies)); 2260 + if (wc->writeback_fua_set) 2261 + DMEMIT(" %sfua", wc->writeback_fua ? 
"" : "no"); 2262 + break; 2263 + } 2264 + } 2265 + 2266 + static struct target_type writecache_target = { 2267 + .name = "writecache", 2268 + .version = {1, 0, 0}, 2269 + .module = THIS_MODULE, 2270 + .ctr = writecache_ctr, 2271 + .dtr = writecache_dtr, 2272 + .status = writecache_status, 2273 + .postsuspend = writecache_suspend, 2274 + .resume = writecache_resume, 2275 + .message = writecache_message, 2276 + .map = writecache_map, 2277 + .end_io = writecache_end_io, 2278 + .iterate_devices = writecache_iterate_devices, 2279 + .io_hints = writecache_io_hints, 2280 + }; 2281 + 2282 + static int __init dm_writecache_init(void) 2283 + { 2284 + int r; 2285 + 2286 + r = dm_register_target(&writecache_target); 2287 + if (r < 0) { 2288 + DMERR("register failed %d", r); 2289 + return r; 2290 + } 2291 + 2292 + return 0; 2293 + } 2294 + 2295 + static void __exit dm_writecache_exit(void) 2296 + { 2297 + dm_unregister_target(&writecache_target); 2298 + } 2299 + 2300 + module_init(dm_writecache_init); 2301 + module_exit(dm_writecache_exit); 2302 + 2303 + MODULE_DESCRIPTION(DM_NAME " writecache target"); 2304 + MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>"); 2305 + MODULE_LICENSE("GPL");
+1 -1
drivers/md/dm-zoned-target.c
··· 52 52 struct dmz_reclaim *reclaim; 53 53 54 54 /* For chunk work */ 55 - struct mutex chunk_lock; 56 55 struct radix_tree_root chunk_rxtree; 57 56 struct workqueue_struct *chunk_wq; 57 + struct mutex chunk_lock; 58 58 59 59 /* For cloned BIOs to zones */ 60 60 struct bio_set bio_set;
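The dm-zoned hunk above is part of the "dm: adjust structure members to improve alignment" change: with bio_set and mempool_t now embedded by value in 4.18, the order of neighbouring members determines how much padding the compiler inserts and which members end up on the same cache line. The toy example below (generic fields, not the actual struct dmz_target) shows the effect of member ordering alone on an LP64 machine; the exact sizes printed are compiler- and architecture-dependent.

#include <stdio.h>

/* Interleaving small and pointer-sized members forces padding holes. */
struct interleaved {
	char a;		/* 1 byte + 7 bytes of padding before 'b' */
	long b;
	char c;		/* 1 byte + 7 bytes of padding before 'd' */
	long d;
};

/* Grouping members by size leaves only tail padding. */
struct grouped {
	long b;
	long d;
	char a;
	char c;
};

int main(void)
{
	/* typically 32 vs. 24 bytes on LP64 */
	printf("interleaved: %zu bytes\n", sizeof(struct interleaved));
	printf("grouped:     %zu bytes\n", sizeof(struct grouped));
	return 0;
}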