Merge branch 'for-6.12/block' into for-6.12/io_uring-discard

+2 -3

MAINTAINERS

··· 3786 3786 F: fs/befs/ 3787 3787 3788 3788 BFQ I/O SCHEDULER 3789 - M: Paolo Valente <paolo.valente@unimore.it> 3790 - M: Jens Axboe <axboe@kernel.dk> 3789 + M: Yu Kuai <yukuai3@huawei.com> 3791 3790 L: linux-block@vger.kernel.org 3792 - S: Maintained 3791 + S: Odd Fixes 3793 3792 F: Documentation/block/bfq-iosched.rst 3794 3793 F: block/bfq-* 3795 3794

+1 -7

block/bfq-cgroup.c

··· 679 679 bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); 680 680 bfqg_and_blkg_put(old_parent); 681 681 682 - if (entity->parent && 683 - entity->parent->last_bfqq_created == bfqq) 684 - entity->parent->last_bfqq_created = NULL; 685 - else if (bfqd->last_bfqq_created == bfqq) 686 - bfqd->last_bfqq_created = NULL; 687 - 682 + bfq_reassign_last_bfqq(bfqq, NULL); 688 683 entity->parent = bfqg->my_entity; 689 684 entity->sched_data = &bfqg->sched_data; 690 685 /* pin down bfqg and its associated blkg */ ··· 736 741 */ 737 742 bfq_put_cooperator(sync_bfqq); 738 743 bic_set_bfqq(bic, NULL, true, act_idx); 739 - bfq_release_process_ref(bfqd, sync_bfqq); 740 744 } 741 745 } 742 746

+113 -93

block/bfq-iosched.c

··· 2911 2911 struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; 2912 2912 2913 2913 /* if a merge has already been setup, then proceed with that first */ 2914 - if (bfqq->new_bfqq) 2915 - return bfqq->new_bfqq; 2914 + new_bfqq = bfqq->new_bfqq; 2915 + if (new_bfqq) { 2916 + while (new_bfqq->new_bfqq) 2917 + new_bfqq = new_bfqq->new_bfqq; 2918 + return new_bfqq; 2919 + } 2916 2920 2917 2921 /* 2918 2922 * Check delayed stable merge for rotational or non-queueing ··· 3097 3093 } 3098 3094 3099 3095 3100 - static void 3101 - bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq) 3096 + void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, 3097 + struct bfq_queue *new_bfqq) 3102 3098 { 3103 3099 if (cur_bfqq->entity.parent && 3104 3100 cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq) ··· 3129 3125 bfq_put_queue(bfqq); 3130 3126 } 3131 3127 3132 - static void 3133 - bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, 3134 - struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) 3128 + static struct bfq_queue *bfq_merge_bfqqs(struct bfq_data *bfqd, 3129 + struct bfq_io_cq *bic, 3130 + struct bfq_queue *bfqq) 3135 3131 { 3132 + struct bfq_queue *new_bfqq = bfqq->new_bfqq; 3133 + 3136 3134 bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", 3137 3135 (unsigned long)new_bfqq->pid); 3138 3136 /* Save weight raising and idle window of the merged queues */ ··· 3228 3222 bfq_reassign_last_bfqq(bfqq, new_bfqq); 3229 3223 3230 3224 bfq_release_process_ref(bfqd, bfqq); 3225 + 3226 + return new_bfqq; 3231 3227 } 3232 3228 3233 3229 static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, ··· 3265 3257 * fulfilled, i.e., bic can be redirected to new_bfqq 3266 3258 * and bfqq can be put. 3267 3259 */ 3268 - bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, 3269 - new_bfqq); 3270 - /* 3271 - * If we get here, bio will be queued into new_queue, 3272 - * so use new_bfqq to decide whether bio and rq can be 3273 - * merged. 
3274 - */ 3275 - bfqq = new_bfqq; 3260 + while (bfqq != new_bfqq) 3261 + bfqq = bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq); 3276 3262 3277 3263 /* 3278 3264 * Change also bfqd->bio_bfqq, as ··· 5434 5432 bfq_put_queue(__bfqq); 5435 5433 __bfqq = next; 5436 5434 } 5435 + 5436 + bfq_release_process_ref(bfqq->bfqd, bfqq); 5437 5437 } 5438 5438 5439 5439 static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ··· 5448 5444 bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); 5449 5445 5450 5446 bfq_put_cooperator(bfqq); 5451 - 5452 - bfq_release_process_ref(bfqd, bfqq); 5453 5447 } 5454 5448 5455 5449 static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, ··· 5703 5701 * state before killing it. 5704 5702 */ 5705 5703 bfqq->bic = bic; 5706 - bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); 5707 - 5708 - return new_bfqq; 5704 + return bfq_merge_bfqqs(bfqd, bic, bfqq); 5709 5705 } 5710 5706 5711 5707 /* ··· 6158 6158 bool waiting, idle_timer_disabled = false; 6159 6159 6160 6160 if (new_bfqq) { 6161 + struct bfq_queue *old_bfqq = bfqq; 6161 6162 /* 6162 6163 * Release the request's reference to the old bfqq 6163 6164 * and make sure one is taken to the shared queue. ··· 6175 6174 * new_bfqq. 
6176 6175 */ 6177 6176 if (bic_to_bfqq(RQ_BIC(rq), true, 6178 - bfq_actuator_index(bfqd, rq->bio)) == bfqq) 6179 - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), 6180 - bfqq, new_bfqq); 6177 + bfq_actuator_index(bfqd, rq->bio)) == bfqq) { 6178 + while (bfqq != new_bfqq) 6179 + bfqq = bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq); 6180 + } 6181 6181 6182 - bfq_clear_bfqq_just_created(bfqq); 6182 + bfq_clear_bfqq_just_created(old_bfqq); 6183 6183 /* 6184 6184 * rq is about to be enqueued into new_bfqq, 6185 6185 * release rq reference on bfqq 6186 6186 */ 6187 - bfq_put_queue(bfqq); 6187 + bfq_put_queue(old_bfqq); 6188 6188 rq->elv.priv[1] = new_bfqq; 6189 - bfqq = new_bfqq; 6190 6189 } 6191 6190 6192 6191 bfq_update_io_thinktime(bfqd, bfqq); ··· 6724 6723 { 6725 6724 bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); 6726 6725 6727 - if (bfqq_process_refs(bfqq) == 1) { 6726 + if (bfqq_process_refs(bfqq) == 1 && !bfqq->new_bfqq) { 6728 6727 bfqq->pid = current->pid; 6729 6728 bfq_clear_bfqq_coop(bfqq); 6730 6729 bfq_clear_bfqq_split_coop(bfqq); ··· 6734 6733 bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); 6735 6734 6736 6735 bfq_put_cooperator(bfqq); 6737 - 6738 - bfq_release_process_ref(bfqq->bfqd, bfqq); 6739 6736 return NULL; 6740 6737 } 6741 6738 6742 - static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, 6743 - struct bfq_io_cq *bic, 6744 - struct bio *bio, 6745 - bool split, bool is_sync, 6746 - bool *new_queue) 6739 + static struct bfq_queue * 6740 + __bfq_get_bfqq_handle_split(struct bfq_data *bfqd, struct bfq_io_cq *bic, 6741 + struct bio *bio, bool split, bool is_sync, 6742 + bool *new_queue) 6747 6743 { 6748 6744 unsigned int act_idx = bfq_actuator_index(bfqd, bio); 6749 6745 struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); ··· 6819 6821 rq->elv.priv[0] = rq->elv.priv[1] = NULL; 6820 6822 } 6821 6823 6824 + static struct bfq_queue *bfq_waker_bfqq(struct bfq_queue *bfqq) 6825 + { 6826 + struct bfq_queue *new_bfqq = bfqq->new_bfqq; 6827 + 
struct bfq_queue *waker_bfqq = bfqq->waker_bfqq; 6828 + 6829 + if (!waker_bfqq) 6830 + return NULL; 6831 + 6832 + while (new_bfqq) { 6833 + if (new_bfqq == waker_bfqq) { 6834 + /* 6835 + * If waker_bfqq is in the merge chain, and current 6836 + * is the only process. 6837 + */ 6838 + if (bfqq_process_refs(waker_bfqq) == 1) 6839 + return NULL; 6840 + break; 6841 + } 6842 + 6843 + new_bfqq = new_bfqq->new_bfqq; 6844 + } 6845 + 6846 + return waker_bfqq; 6847 + } 6848 + 6849 + static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, 6850 + struct bfq_io_cq *bic, 6851 + struct bio *bio, 6852 + unsigned int idx, 6853 + bool is_sync) 6854 + { 6855 + struct bfq_queue *waker_bfqq; 6856 + struct bfq_queue *bfqq; 6857 + bool new_queue = false; 6858 + 6859 + bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, 6860 + &new_queue); 6861 + if (unlikely(new_queue)) 6862 + return bfqq; 6863 + 6864 + /* If the queue was seeky for too long, break it apart. */ 6865 + if (!bfq_bfqq_coop(bfqq) || !bfq_bfqq_split_coop(bfqq) || 6866 + bic->bfqq_data[idx].stably_merged) 6867 + return bfqq; 6868 + 6869 + waker_bfqq = bfq_waker_bfqq(bfqq); 6870 + 6871 + /* Update bic before losing reference to bfqq */ 6872 + if (bfq_bfqq_in_large_burst(bfqq)) 6873 + bic->bfqq_data[idx].saved_in_large_burst = true; 6874 + 6875 + bfqq = bfq_split_bfqq(bic, bfqq); 6876 + if (bfqq) { 6877 + bfq_bfqq_resume_state(bfqq, bfqd, bic, true); 6878 + return bfqq; 6879 + } 6880 + 6881 + bfqq = __bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); 6882 + if (unlikely(bfqq == &bfqd->oom_bfqq)) 6883 + return bfqq; 6884 + 6885 + bfq_bfqq_resume_state(bfqq, bfqd, bic, false); 6886 + bfqq->waker_bfqq = waker_bfqq; 6887 + bfqq->tentative_waker_bfqq = NULL; 6888 + 6889 + /* 6890 + * If the waker queue disappears, then new_bfqq->waker_bfqq must be 6891 + * reset. So insert new_bfqq into the 6892 + * woken_list of the waker. See 6893 + * bfq_check_waker for details. 
6894 + */ 6895 + if (waker_bfqq) 6896 + hlist_add_head(&bfqq->woken_list_node, 6897 + &bfqq->waker_bfqq->woken_list); 6898 + 6899 + return bfqq; 6900 + } 6901 + 6822 6902 /* 6823 6903 * If needed, init rq, allocate bfq data structures associated with 6824 6904 * rq, and increment reference counters in the destination bfq_queue ··· 6928 6852 struct bfq_io_cq *bic; 6929 6853 const int is_sync = rq_is_sync(rq); 6930 6854 struct bfq_queue *bfqq; 6931 - bool new_queue = false; 6932 - bool bfqq_already_existing = false, split = false; 6933 6855 unsigned int a_idx = bfq_actuator_index(bfqd, bio); 6934 6856 6935 6857 if (unlikely(!rq->elv.icq)) ··· 6944 6870 return RQ_BFQQ(rq); 6945 6871 6946 6872 bic = icq_to_bic(rq->elv.icq); 6947 - 6948 6873 bfq_check_ioprio_change(bic, bio); 6949 - 6950 6874 bfq_bic_update_cgroup(bic, bio); 6951 - 6952 - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, 6953 - &new_queue); 6954 - 6955 - if (likely(!new_queue)) { 6956 - /* If the queue was seeky for too long, break it apart. */ 6957 - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && 6958 - !bic->bfqq_data[a_idx].stably_merged) { 6959 - struct bfq_queue *old_bfqq = bfqq; 6960 - 6961 - /* Update bic before losing reference to bfqq */ 6962 - if (bfq_bfqq_in_large_burst(bfqq)) 6963 - bic->bfqq_data[a_idx].saved_in_large_burst = 6964 - true; 6965 - 6966 - bfqq = bfq_split_bfqq(bic, bfqq); 6967 - split = true; 6968 - 6969 - if (!bfqq) { 6970 - bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, 6971 - true, is_sync, 6972 - NULL); 6973 - if (unlikely(bfqq == &bfqd->oom_bfqq)) 6974 - bfqq_already_existing = true; 6975 - } else 6976 - bfqq_already_existing = true; 6977 - 6978 - if (!bfqq_already_existing) { 6979 - bfqq->waker_bfqq = old_bfqq->waker_bfqq; 6980 - bfqq->tentative_waker_bfqq = NULL; 6981 - 6982 - /* 6983 - * If the waker queue disappears, then 6984 - * new_bfqq->waker_bfqq must be 6985 - * reset. So insert new_bfqq into the 6986 - * woken_list of the waker. 
See 6987 - * bfq_check_waker for details. 6988 - */ 6989 - if (bfqq->waker_bfqq) 6990 - hlist_add_head(&bfqq->woken_list_node, 6991 - &bfqq->waker_bfqq->woken_list); 6992 - } 6993 - } 6994 - } 6875 + bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, a_idx, is_sync); 6995 6876 6996 6877 bfqq_request_allocated(bfqq); 6997 6878 bfqq->ref++; ··· 6963 6934 * addition, if the queue has also just been split, we have to 6964 6935 * resume its state. 6965 6936 */ 6966 - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { 6937 + if (likely(bfqq != &bfqd->oom_bfqq) && !bfqq->new_bfqq && 6938 + bfqq_process_refs(bfqq) == 1) 6967 6939 bfqq->bic = bic; 6968 - if (split) { 6969 - /* 6970 - * The queue has just been split from a shared 6971 - * queue: restore the idle window and the 6972 - * possible weight raising period. 6973 - */ 6974 - bfq_bfqq_resume_state(bfqq, bfqd, bic, 6975 - bfqq_already_existing); 6976 - } 6977 - } 6978 6940 6979 6941 /* 6980 6942 * Consider bfqq as possibly belonging to a burst of newly

+2 -6

block/bfq-iosched.h

··· 1156 1156 void bfq_add_bfqq_busy(struct bfq_queue *bfqq); 1157 1157 void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); 1158 1158 void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); 1159 + void bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, 1160 + struct bfq_queue *new_bfqq); 1159 1161 1160 1162 /* --------------- end of interface of B-WF2Q+ ---------------- */ 1161 1163 ··· 1185 1183 "%s " fmt, pid_str, ##args); \ 1186 1184 } while (0) 1187 1185 1188 - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ 1189 - blk_add_cgroup_trace_msg((bfqd)->queue, \ 1190 - &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \ 1191 - } while (0) 1192 - 1193 1186 #else /* CONFIG_BFQ_GROUP_IOSCHED */ 1194 1187 1195 1188 #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ··· 1194 1197 bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ 1195 1198 blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \ 1196 1199 } while (0) 1197 - #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) 1198 1200 1199 1201 #endif /* CONFIG_BFQ_GROUP_IOSCHED */ 1200 1202

+87 -25

block/bio.c

··· 931 931 if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) 932 932 return false; 933 933 934 - *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); 934 + *same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) & 935 + PAGE_MASK)); 935 936 if (!*same_page) { 936 937 if (IS_ENABLED(CONFIG_KMSAN)) 937 938 return false; ··· 1015 1014 bio->bi_vcnt++; 1016 1015 bio->bi_iter.bi_size += len; 1017 1016 return len; 1017 + } 1018 + 1019 + /** 1020 + * bio_add_hw_folio - attempt to add a folio to a bio with hw constraints 1021 + * @q: the target queue 1022 + * @bio: destination bio 1023 + * @folio: folio to add 1024 + * @len: vec entry length 1025 + * @offset: vec entry offset in the folio 1026 + * @max_sectors: maximum number of sectors that can be added 1027 + * @same_page: return if the segment has been merged inside the same folio 1028 + * 1029 + * Add a folio to a bio while respecting the hardware max_sectors, max_segment 1030 + * and gap limitations. 1031 + */ 1032 + int bio_add_hw_folio(struct request_queue *q, struct bio *bio, 1033 + struct folio *folio, size_t len, size_t offset, 1034 + unsigned int max_sectors, bool *same_page) 1035 + { 1036 + if (len > UINT_MAX || offset > UINT_MAX) 1037 + return 0; 1038 + return bio_add_hw_page(q, bio, folio_page(folio, 0), len, offset, 1039 + max_sectors, same_page); 1018 1040 } 1019 1041 1020 1042 /** ··· 1190 1166 struct folio_iter fi; 1191 1167 1192 1168 bio_for_each_folio_all(fi, bio) { 1193 - struct page *page; 1194 1169 size_t nr_pages; 1195 1170 1196 1171 if (mark_dirty) { ··· 1197 1174 folio_mark_dirty(fi.folio); 1198 1175 folio_unlock(fi.folio); 1199 1176 } 1200 - page = folio_page(fi.folio, fi.offset / PAGE_SIZE); 1201 1177 nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - 1202 1178 fi.offset / PAGE_SIZE + 1; 1203 - do { 1204 - bio_release_page(bio, page++); 1205 - } while (--nr_pages != 0); 1179 + unpin_user_folio(fi.folio, nr_pages); 1206 1180 } 1207 1181 } 1208 1182 
EXPORT_SYMBOL_GPL(__bio_release_pages); ··· 1224 1204 bio_set_flag(bio, BIO_CLONED); 1225 1205 } 1226 1206 1227 - static int bio_iov_add_page(struct bio *bio, struct page *page, 1228 - unsigned int len, unsigned int offset) 1207 + static int bio_iov_add_folio(struct bio *bio, struct folio *folio, size_t len, 1208 + size_t offset) 1229 1209 { 1230 1210 bool same_page = false; 1231 1211 ··· 1234 1214 1235 1215 if (bio->bi_vcnt > 0 && 1236 1216 bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], 1237 - page, len, offset, &same_page)) { 1217 + folio_page(folio, 0), len, offset, 1218 + &same_page)) { 1238 1219 bio->bi_iter.bi_size += len; 1239 - if (same_page) 1240 - bio_release_page(bio, page); 1220 + if (same_page && bio_flagged(bio, BIO_PAGE_PINNED)) 1221 + unpin_user_folio(folio, 1); 1241 1222 return 0; 1242 1223 } 1243 - __bio_add_page(bio, page, len, offset); 1224 + bio_add_folio_nofail(bio, folio, len, offset); 1244 1225 return 0; 1245 1226 } 1246 1227 1247 - static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, 1248 - unsigned int len, unsigned int offset) 1228 + static int bio_iov_add_zone_append_folio(struct bio *bio, struct folio *folio, 1229 + size_t len, size_t offset) 1249 1230 { 1250 1231 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 1251 1232 bool same_page = false; 1252 1233 1253 - if (bio_add_hw_page(q, bio, page, len, offset, 1234 + if (bio_add_hw_folio(q, bio, folio, len, offset, 1254 1235 queue_max_zone_append_sectors(q), &same_page) != len) 1255 1236 return -EINVAL; 1256 - if (same_page) 1257 - bio_release_page(bio, page); 1237 + if (same_page && bio_flagged(bio, BIO_PAGE_PINNED)) 1238 + unpin_user_folio(folio, 1); 1258 1239 return 0; 1240 + } 1241 + 1242 + static unsigned int get_contig_folio_len(unsigned int *num_pages, 1243 + struct page **pages, unsigned int i, 1244 + struct folio *folio, size_t left, 1245 + size_t offset) 1246 + { 1247 + size_t bytes = left; 1248 + size_t contig_sz = min_t(size_t, PAGE_SIZE - 
offset, bytes); 1249 + unsigned int j; 1250 + 1251 + /* 1252 + * We might COW a single page in the middle of 1253 + * a large folio, so we have to check that all 1254 + * pages belong to the same folio. 1255 + */ 1256 + bytes -= contig_sz; 1257 + for (j = i + 1; j < i + *num_pages; j++) { 1258 + size_t next = min_t(size_t, PAGE_SIZE, bytes); 1259 + 1260 + if (page_folio(pages[j]) != folio || 1261 + pages[j] != pages[j - 1] + 1) { 1262 + break; 1263 + } 1264 + contig_sz += next; 1265 + bytes -= next; 1266 + } 1267 + *num_pages = j - i; 1268 + 1269 + return contig_sz; 1259 1270 } 1260 1271 1261 1272 #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) ··· 1308 1257 unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; 1309 1258 struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; 1310 1259 struct page **pages = (struct page **)bv; 1311 - ssize_t size, left; 1312 - unsigned len, i = 0; 1313 - size_t offset; 1260 + ssize_t size; 1261 + unsigned int num_pages, i = 0; 1262 + size_t offset, folio_offset, left, len; 1314 1263 int ret = 0; 1315 1264 1316 1265 /* ··· 1350 1299 goto out; 1351 1300 } 1352 1301 1353 - for (left = size, i = 0; left > 0; left -= len, i++) { 1302 + for (left = size, i = 0; left > 0; left -= len, i += num_pages) { 1354 1303 struct page *page = pages[i]; 1304 + struct folio *folio = page_folio(page); 1355 1305 1356 - len = min_t(size_t, PAGE_SIZE - offset, left); 1306 + folio_offset = ((size_t)folio_page_idx(folio, page) << 1307 + PAGE_SHIFT) + offset; 1308 + 1309 + len = min(folio_size(folio) - folio_offset, left); 1310 + 1311 + num_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); 1312 + 1313 + if (num_pages > 1) 1314 + len = get_contig_folio_len(&num_pages, pages, i, 1315 + folio, left, offset); 1316 + 1357 1317 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 1358 - ret = bio_iov_add_zone_append_page(bio, page, len, 1359 - offset); 1318 + ret = bio_iov_add_zone_append_folio(bio, folio, len, 1319 + folio_offset); 1360 1320 if 
(ret) 1361 1321 break; 1362 1322 } else 1363 - bio_iov_add_page(bio, page, len, offset); 1323 + bio_iov_add_folio(bio, folio, len, folio_offset); 1364 1324 1365 1325 offset = 0; 1366 1326 }

+13 -10

block/blk-cgroup.c

··· 1458 1458 struct request_queue *q = disk->queue; 1459 1459 struct blkcg_gq *new_blkg, *blkg; 1460 1460 bool preloaded; 1461 - int ret; 1462 1461 1463 1462 new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); 1464 1463 if (!new_blkg) ··· 1477 1478 if (preloaded) 1478 1479 radix_tree_preload_end(); 1479 1480 1480 - ret = blk_ioprio_init(disk); 1481 - if (ret) 1482 - goto err_destroy_all; 1483 - 1484 1481 return 0; 1485 1482 1486 - err_destroy_all: 1487 - blkg_destroy_all(disk); 1488 - return ret; 1489 1483 err_unlock: 1490 1484 spin_unlock_irq(&q->queue_lock); 1491 1485 if (preloaded) ··· 1545 1553 1546 1554 if (blkcg_policy_enabled(q, pol)) 1547 1555 return 0; 1556 + 1557 + /* 1558 + * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn, 1559 + * for example, ioprio. Such policy will work on blkcg level, not disk 1560 + * level, and don't need to be activated. 1561 + */ 1562 + if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn)) 1563 + return -EINVAL; 1548 1564 1549 1565 if (queue_is_mq(q)) 1550 1566 blk_mq_freeze_queue(q); ··· 1733 1733 goto err_unlock; 1734 1734 } 1735 1735 1736 - /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ 1736 + /* 1737 + * Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs, and policy 1738 + * without pd_alloc_fn/pd_free_fn can't be activated. 1739 + */ 1737 1740 if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || 1738 - (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) 1741 + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) 1739 1742 goto err_unlock; 1740 1743 1741 1744 /* register @pol */

-1

block/blk-cgroup.h

··· 485 485 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, 486 486 struct blkcg_policy *pol) { return NULL; } 487 487 static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } 488 - static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } 489 488 static inline void blkg_get(struct blkcg_gq *blkg) { } 490 489 static inline void blkg_put(struct blkcg_gq *blkg) { } 491 490 static inline void blkcg_bio_issue_init(struct bio *bio) { }

+5 -3

block/blk-iocost.c

··· 2076 2076 struct ioc_now *now) 2077 2077 { 2078 2078 struct ioc_gq *iocg; 2079 - u64 dur, usage_pct, nr_cycles; 2079 + u64 dur, usage_pct, nr_cycles, nr_cycles_shift; 2080 2080 2081 2081 /* if no debtor, reset the cycle */ 2082 2082 if (!nr_debtors) { ··· 2138 2138 old_debt = iocg->abs_vdebt; 2139 2139 old_delay = iocg->delay; 2140 2140 2141 + nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1); 2141 2142 if (iocg->abs_vdebt) 2142 - iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1; 2143 + iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1; 2144 + 2143 2145 if (iocg->delay) 2144 - iocg->delay = iocg->delay >> nr_cycles ?: 1; 2146 + iocg->delay = iocg->delay >> nr_cycles_shift ?: 1; 2145 2147 2146 2148 iocg_kick_waitq(iocg, true, now); 2147 2149

+1 -56

block/blk-ioprio.c

··· 50 50 static struct blkcg_policy ioprio_policy; 51 51 52 52 /** 53 - * struct ioprio_blkg - Per (cgroup, request queue) data. 54 - * @pd: blkg_policy_data structure. 55 - */ 56 - struct ioprio_blkg { 57 - struct blkg_policy_data pd; 58 - }; 59 - 60 - /** 61 53 * struct ioprio_blkcg - Per cgroup data. 62 54 * @cpd: blkcg_policy_data structure. 63 55 * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>. ··· 58 66 struct blkcg_policy_data cpd; 59 67 enum prio_policy prio_policy; 60 68 }; 61 - 62 - static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) 63 - { 64 - return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL; 65 - } 66 69 67 70 static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) 68 71 { ··· 69 82 ioprio_blkcg_from_css(struct cgroup_subsys_state *css) 70 83 { 71 84 return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); 72 - } 73 - 74 - static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio) 75 - { 76 - struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy); 77 - 78 - if (!pd) 79 - return NULL; 80 - 81 - return blkcg_to_ioprio_blkcg(pd->blkg->blkcg); 82 85 } 83 86 84 87 static int ioprio_show_prio_policy(struct seq_file *sf, void *v) ··· 93 116 return ret; 94 117 blkcg->prio_policy = ret; 95 118 return nbytes; 96 - } 97 - 98 - static struct blkg_policy_data * 99 - ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) 100 - { 101 - struct ioprio_blkg *ioprio_blkg; 102 - 103 - ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp); 104 - if (!ioprio_blkg) 105 - return NULL; 106 - 107 - return &ioprio_blkg->pd; 108 - } 109 - 110 - static void ioprio_free_pd(struct blkg_policy_data *pd) 111 - { 112 - struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd); 113 - 114 - kfree(ioprio_blkg); 115 119 } 116 120 117 121 static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) ··· 137 179 138 180 .cpd_alloc_fn = ioprio_alloc_cpd, 139 181 .cpd_free_fn = ioprio_free_cpd, 140 
- 141 - .pd_alloc_fn = ioprio_alloc_pd, 142 - .pd_free_fn = ioprio_free_pd, 143 182 }; 144 183 145 184 void blkcg_set_ioprio(struct bio *bio) 146 185 { 147 - struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); 186 + struct ioprio_blkcg *blkcg = blkcg_to_ioprio_blkcg(bio->bi_blkg->blkcg); 148 187 u16 prio; 149 188 150 189 if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) ··· 172 217 IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); 173 218 if (prio > bio->bi_ioprio) 174 219 bio->bi_ioprio = prio; 175 - } 176 - 177 - void blk_ioprio_exit(struct gendisk *disk) 178 - { 179 - blkcg_deactivate_policy(disk, &ioprio_policy); 180 - } 181 - 182 - int blk_ioprio_init(struct gendisk *disk) 183 - { 184 - return blkcg_activate_policy(disk, &ioprio_policy); 185 220 } 186 221 187 222 static int __init ioprio_init(void)

-9

block/blk-ioprio.h

··· 9 9 struct bio; 10 10 11 11 #ifdef CONFIG_BLK_CGROUP_IOPRIO 12 - int blk_ioprio_init(struct gendisk *disk); 13 - void blk_ioprio_exit(struct gendisk *disk); 14 12 void blkcg_set_ioprio(struct bio *bio); 15 13 #else 16 - static inline int blk_ioprio_init(struct gendisk *disk) 17 - { 18 - return 0; 19 - } 20 - static inline void blk_ioprio_exit(struct gendisk *disk) 21 - { 22 - } 23 14 static inline void blkcg_set_ioprio(struct bio *bio) 24 15 { 25 16 }

+74 -88

block/blk-merge.c

··· 105 105 return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; 106 106 } 107 107 108 - static struct bio *bio_split_discard(struct bio *bio, 109 - const struct queue_limits *lim, 110 - unsigned *nsegs, struct bio_set *bs) 108 + static struct bio *bio_submit_split(struct bio *bio, int split_sectors) 109 + { 110 + if (unlikely(split_sectors < 0)) { 111 + bio->bi_status = errno_to_blk_status(split_sectors); 112 + bio_endio(bio); 113 + return NULL; 114 + } 115 + 116 + if (split_sectors) { 117 + struct bio *split; 118 + 119 + split = bio_split(bio, split_sectors, GFP_NOIO, 120 + &bio->bi_bdev->bd_disk->bio_split); 121 + split->bi_opf |= REQ_NOMERGE; 122 + blkcg_bio_issue_init(split); 123 + bio_chain(split, bio); 124 + trace_block_split(split, bio->bi_iter.bi_sector); 125 + WARN_ON_ONCE(bio_zone_write_plugging(bio)); 126 + submit_bio_noacct(bio); 127 + return split; 128 + } 129 + 130 + return bio; 131 + } 132 + 133 + struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, 134 + unsigned *nsegs) 111 135 { 112 136 unsigned int max_discard_sectors, granularity; 113 137 sector_t tmp; ··· 145 121 min(lim->max_discard_sectors, bio_allowed_max_sectors(lim)); 146 122 max_discard_sectors -= max_discard_sectors % granularity; 147 123 if (unlikely(!max_discard_sectors)) 148 - return NULL; 124 + return bio; 149 125 150 126 if (bio_sectors(bio) <= max_discard_sectors) 151 - return NULL; 127 + return bio; 152 128 153 129 split_sectors = max_discard_sectors; 154 130 ··· 163 139 if (split_sectors > tmp) 164 140 split_sectors -= tmp; 165 141 166 - return bio_split(bio, split_sectors, GFP_NOIO, bs); 142 + return bio_submit_split(bio, split_sectors); 167 143 } 168 144 169 - static struct bio *bio_split_write_zeroes(struct bio *bio, 170 - const struct queue_limits *lim, 171 - unsigned *nsegs, struct bio_set *bs) 145 + struct bio *bio_split_write_zeroes(struct bio *bio, 146 + const struct queue_limits *lim, unsigned *nsegs) 172 147 { 173 148 *nsegs = 
0; 174 149 if (!lim->max_write_zeroes_sectors) 175 - return NULL; 150 + return bio; 176 151 if (bio_sectors(bio) <= lim->max_write_zeroes_sectors) 177 - return NULL; 178 - return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs); 152 + return bio; 153 + return bio_submit_split(bio, lim->max_write_zeroes_sectors); 179 154 } 180 155 181 156 static inline unsigned int blk_boundary_sectors(const struct queue_limits *lim, ··· 297 274 } 298 275 299 276 /** 300 - * bio_split_rw - split a bio in two bios 277 + * bio_split_rw_at - check if and where to split a read/write bio 301 278 * @bio: [in] bio to be split 302 279 * @lim: [in] queue limits to split based on 303 280 * @segs: [out] number of segments in the bio with the first half of the sectors 304 - * @bs: [in] bio set to allocate the clone from 305 281 * @max_bytes: [in] maximum number of bytes per bio 306 282 * 307 - * Clone @bio, update the bi_iter of the clone to represent the first sectors 308 - * of @bio and update @bio->bi_iter to represent the remaining sectors. The 309 - * following is guaranteed for the cloned bio: 310 - * - That it has at most @max_bytes worth of data 311 - * - That it has at most queue_max_segments(@q) segments. 312 - * 313 - * Except for discard requests the cloned bio will point at the bi_io_vec of 314 - * the original bio. It is the responsibility of the caller to ensure that the 315 - * original bio is not freed before the cloned bio. The caller is also 316 - * responsible for ensuring that @bs is only destroyed after processing of the 317 - * split bio has finished. 283 + * Find out if @bio needs to be split to fit the queue limits in @lim and a 284 + * maximum size of @max_bytes. Returns a negative error number if @bio can't be 285 + * split, 0 if the bio doesn't have to be split, or a positive sector offset if 286 + * @bio needs to be split. 
318 287 */ 319 - struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, 320 - unsigned *segs, struct bio_set *bs, unsigned max_bytes) 288 + int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, 289 + unsigned *segs, unsigned max_bytes) 321 290 { 322 291 struct bio_vec bv, bvprv, *bvprvp = NULL; 323 292 struct bvec_iter iter; ··· 339 324 } 340 325 341 326 *segs = nsegs; 342 - return NULL; 327 + return 0; 343 328 split: 344 - if (bio->bi_opf & REQ_ATOMIC) { 345 - bio->bi_status = BLK_STS_INVAL; 346 - bio_endio(bio); 347 - return ERR_PTR(-EINVAL); 348 - } 329 + if (bio->bi_opf & REQ_ATOMIC) 330 + return -EINVAL; 331 + 349 332 /* 350 333 * We can't sanely support splitting for a REQ_NOWAIT bio. End it 351 334 * with EAGAIN if splitting is required and return an error pointer. 352 335 */ 353 - if (bio->bi_opf & REQ_NOWAIT) { 354 - bio->bi_status = BLK_STS_AGAIN; 355 - bio_endio(bio); 356 - return ERR_PTR(-EAGAIN); 357 - } 336 + if (bio->bi_opf & REQ_NOWAIT) 337 + return -EAGAIN; 358 338 359 339 *segs = nsegs; 360 340 ··· 366 356 * big IO can be trival, disable iopoll when split needed. 367 357 */ 368 358 bio_clear_polled(bio); 369 - return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); 359 + return bytes >> SECTOR_SHIFT; 370 360 } 371 - EXPORT_SYMBOL_GPL(bio_split_rw); 361 + EXPORT_SYMBOL_GPL(bio_split_rw_at); 372 362 373 - /** 374 - * __bio_split_to_limits - split a bio to fit the queue limits 375 - * @bio: bio to be split 376 - * @lim: queue limits to split based on 377 - * @nr_segs: returns the number of segments in the returned bio 378 - * 379 - * Check if @bio needs splitting based on the queue limits, and if so split off 380 - * a bio fitting the limits from the beginning of @bio and return it. @bio is 381 - * shortened to the remainder and re-submitted. 382 - * 383 - * The split bio is allocated from @q->bio_split, which is provided by the 384 - * block layer. 
385 - */ 386 - struct bio *__bio_split_to_limits(struct bio *bio, 387 - const struct queue_limits *lim, 388 - unsigned int *nr_segs) 363 + struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, 364 + unsigned *nr_segs) 389 365 { 390 - struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split; 391 - struct bio *split; 366 + return bio_submit_split(bio, 367 + bio_split_rw_at(bio, lim, nr_segs, 368 + get_max_io_size(bio, lim) << SECTOR_SHIFT)); 369 + } 392 370 393 - switch (bio_op(bio)) { 394 - case REQ_OP_DISCARD: 395 - case REQ_OP_SECURE_ERASE: 396 - split = bio_split_discard(bio, lim, nr_segs, bs); 397 - break; 398 - case REQ_OP_WRITE_ZEROES: 399 - split = bio_split_write_zeroes(bio, lim, nr_segs, bs); 400 - break; 401 - default: 402 - split = bio_split_rw(bio, lim, nr_segs, bs, 403 - get_max_io_size(bio, lim) << SECTOR_SHIFT); 404 - if (IS_ERR(split)) 405 - return NULL; 406 - break; 407 - } 371 + /* 372 + * REQ_OP_ZONE_APPEND bios must never be split by the block layer. 373 + * 374 + * But we want the nr_segs calculation provided by bio_split_rw_at, and having 375 + * a good sanity check that the submitter built the bio correctly is nice to 376 + * have as well. 
377 + */ 378 + struct bio *bio_split_zone_append(struct bio *bio, 379 + const struct queue_limits *lim, unsigned *nr_segs) 380 + { 381 + unsigned int max_sectors = queue_limits_max_zone_append_sectors(lim); 382 + int split_sectors; 408 383 409 - if (split) { 410 - /* there isn't chance to merge the split bio */ 411 - split->bi_opf |= REQ_NOMERGE; 412 - 413 - blkcg_bio_issue_init(split); 414 - bio_chain(split, bio); 415 - trace_block_split(split, bio->bi_iter.bi_sector); 416 - WARN_ON_ONCE(bio_zone_write_plugging(bio)); 417 - submit_bio_noacct(bio); 418 - return split; 419 - } 420 - return bio; 384 + split_sectors = bio_split_rw_at(bio, lim, nr_segs, 385 + max_sectors << SECTOR_SHIFT); 386 + if (WARN_ON_ONCE(split_sectors > 0)) 387 + split_sectors = -EINVAL; 388 + return bio_submit_split(bio, split_sectors); 421 389 } 422 390 423 391 /** ··· 414 426 const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits; 415 427 unsigned int nr_segs; 416 428 417 - if (bio_may_exceed_limits(bio, lim)) 418 - return __bio_split_to_limits(bio, lim, &nr_segs); 419 - return bio; 429 + return __bio_split_to_limits(bio, lim, &nr_segs); 420 430 } 421 431 EXPORT_SYMBOL(bio_split_to_limits); 422 432

+8 -6

block/blk-mq.c

··· 2753 2753 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 2754 2754 { 2755 2755 struct request *rq; 2756 + unsigned int depth; 2756 2757 2757 2758 /* 2758 2759 * We may have been called recursively midway through handling ··· 2764 2763 */ 2765 2764 if (plug->rq_count == 0) 2766 2765 return; 2766 + depth = plug->rq_count; 2767 2767 plug->rq_count = 0; 2768 2768 2769 2769 if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { ··· 2772 2770 2773 2771 rq = rq_list_peek(&plug->mq_list); 2774 2772 q = rq->q; 2773 + trace_block_unplug(q, depth, true); 2775 2774 2776 2775 /* 2777 2776 * Peek first request and see if we have a ->queue_rqs() hook. ··· 2942 2939 struct blk_plug *plug = current->plug; 2943 2940 const int is_sync = op_is_sync(bio->bi_opf); 2944 2941 struct blk_mq_hw_ctx *hctx; 2945 - unsigned int nr_segs = 1; 2942 + unsigned int nr_segs; 2946 2943 struct request *rq; 2947 2944 blk_status_t ret; 2948 2945 ··· 2984 2981 goto queue_exit; 2985 2982 } 2986 2983 2987 - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { 2988 - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); 2989 - if (!bio) 2990 - goto queue_exit; 2991 - } 2984 + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); 2985 + if (!bio) 2986 + goto queue_exit; 2987 + 2992 2988 if (!bio_integrity_prep(bio)) 2993 2989 goto queue_exit; 2994 2990

+1 -1

block/blk-rq-qos.c

··· 263 263 has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, 264 264 TASK_UNINTERRUPTIBLE); 265 265 do { 266 - /* The memory barrier in set_task_state saves us here. */ 266 + /* The memory barrier in set_current_state saves us here. */ 267 267 if (data.got_token) 268 268 break; 269 269 if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {

+41 -26

block/blk-throttle.c

··· 1584 1584 spin_unlock_irq(&q->queue_lock); 1585 1585 } 1586 1586 1587 + static bool tg_within_limit(struct throtl_grp *tg, struct bio *bio, bool rw) 1588 + { 1589 + /* throtl is FIFO - if bios are already queued, should queue */ 1590 + if (tg->service_queue.nr_queued[rw]) 1591 + return false; 1592 + 1593 + return tg_may_dispatch(tg, bio, NULL); 1594 + } 1595 + 1596 + static void tg_dispatch_in_debt(struct throtl_grp *tg, struct bio *bio, bool rw) 1597 + { 1598 + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) 1599 + tg->carryover_bytes[rw] -= throtl_bio_data_size(bio); 1600 + tg->carryover_ios[rw]--; 1601 + } 1602 + 1587 1603 bool __blk_throtl_bio(struct bio *bio) 1588 1604 { 1589 1605 struct request_queue *q = bdev_get_queue(bio->bi_bdev); ··· 1616 1600 sq = &tg->service_queue; 1617 1601 1618 1602 while (true) { 1619 - if (tg->last_low_overflow_time[rw] == 0) 1620 - tg->last_low_overflow_time[rw] = jiffies; 1621 - /* throtl is FIFO - if bios are already queued, should queue */ 1622 - if (sq->nr_queued[rw]) 1623 - break; 1603 + if (tg_within_limit(tg, bio, rw)) { 1604 + /* within limits, let's charge and dispatch directly */ 1605 + throtl_charge_bio(tg, bio); 1624 1606 1625 - /* if above limits, break to queue */ 1626 - if (!tg_may_dispatch(tg, bio, NULL)) { 1627 - tg->last_low_overflow_time[rw] = jiffies; 1607 + /* 1608 + * We need to trim slice even when bios are not being 1609 + * queued otherwise it might happen that a bio is not 1610 + * queued for a long time and slice keeps on extending 1611 + * and trim is not called for a long time. Now if limits 1612 + * are reduced suddenly we take into account all the IO 1613 + * dispatched so far at new low rate and * newly queued 1614 + * IO gets a really long dispatch time. 1615 + * 1616 + * So keep on trimming slice even if bio is not queued. 
1617 + */ 1618 + throtl_trim_slice(tg, rw); 1619 + } else if (bio_issue_as_root_blkg(bio)) { 1620 + /* 1621 + * IOs which may cause priority inversions are 1622 + * dispatched directly, even if they're over limit. 1623 + * Debts are handled by carryover_bytes/ios while 1624 + * calculating wait time. 1625 + */ 1626 + tg_dispatch_in_debt(tg, bio, rw); 1627 + } else { 1628 + /* if above limits, break to queue */ 1628 1629 break; 1629 1630 } 1630 - 1631 - /* within limits, let's charge and dispatch directly */ 1632 - throtl_charge_bio(tg, bio); 1633 - 1634 - /* 1635 - * We need to trim slice even when bios are not being queued 1636 - * otherwise it might happen that a bio is not queued for 1637 - * a long time and slice keeps on extending and trim is not 1638 - * called for a long time. Now if limits are reduced suddenly 1639 - * we take into account all the IO dispatched so far at new 1640 - * low rate and * newly queued IO gets a really long dispatch 1641 - * time. 1642 - * 1643 - * So keep on trimming slice even if bio is not queued. 1644 - */ 1645 - throtl_trim_slice(tg, rw); 1646 1631 1647 1632 /* 1648 1633 * @bio passed through this layer without being throttled. ··· 1666 1649 tg_bps_limit(tg, rw), 1667 1650 tg->io_disp[rw], tg_iops_limit(tg, rw), 1668 1651 sq->nr_queued[READ], sq->nr_queued[WRITE]); 1669 - 1670 - tg->last_low_overflow_time[rw] = jiffies; 1671 1652 1672 1653 td->nr_queued[rw]++; 1673 1654 throtl_add_bio_tg(bio, qn, tg);

-2

block/blk-throttle.h

··· 106 106 /* Number of bio's dispatched in current slice */ 107 107 unsigned int io_disp[2]; 108 108 109 - unsigned long last_low_overflow_time[2]; 110 - 111 109 uint64_t last_bytes_disp[2]; 112 110 unsigned int last_io_disp[2]; 113 111

+60 -22

block/blk.h

··· 331 331 ssize_t part_timeout_store(struct device *, struct device_attribute *, 332 332 const char *, size_t); 333 333 334 - static inline bool bio_may_exceed_limits(struct bio *bio, 335 - const struct queue_limits *lim) 336 - { 337 - switch (bio_op(bio)) { 338 - case REQ_OP_DISCARD: 339 - case REQ_OP_SECURE_ERASE: 340 - case REQ_OP_WRITE_ZEROES: 341 - return true; /* non-trivial splitting decisions */ 342 - default: 343 - break; 344 - } 334 + struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim, 335 + unsigned *nsegs); 336 + struct bio *bio_split_write_zeroes(struct bio *bio, 337 + const struct queue_limits *lim, unsigned *nsegs); 338 + struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, 339 + unsigned *nr_segs); 340 + struct bio *bio_split_zone_append(struct bio *bio, 341 + const struct queue_limits *lim, unsigned *nr_segs); 345 342 346 - /* 347 - * All drivers must accept single-segments bios that are <= PAGE_SIZE. 348 - * This is a quick and dirty check that relies on the fact that 349 - * bi_io_vec[0] is always valid if a bio has data. The check might 350 - * lead to occasional false negatives when bios are cloned, but compared 351 - * to the performance impact of cloned bios themselves the loop below 352 - * doesn't matter anyway. 353 - */ 343 + /* 344 + * All drivers must accept single-segments bios that are smaller than PAGE_SIZE. 345 + * 346 + * This is a quick and dirty check that relies on the fact that bi_io_vec[0] is 347 + * always valid if a bio has data. The check might lead to occasional false 348 + * positives when bios are cloned, but compared to the performance impact of 349 + * cloned bios themselves the loop below doesn't matter anyway. 
350 + */ 351 + static inline bool bio_may_need_split(struct bio *bio, 352 + const struct queue_limits *lim) 353 + { 354 354 return lim->chunk_sectors || bio->bi_vcnt != 1 || 355 355 bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; 356 356 } 357 357 358 - struct bio *__bio_split_to_limits(struct bio *bio, 359 - const struct queue_limits *lim, 360 - unsigned int *nr_segs); 358 + /** 359 + * __bio_split_to_limits - split a bio to fit the queue limits 360 + * @bio: bio to be split 361 + * @lim: queue limits to split based on 362 + * @nr_segs: returns the number of segments in the returned bio 363 + * 364 + * Check if @bio needs splitting based on the queue limits, and if so split off 365 + * a bio fitting the limits from the beginning of @bio and return it. @bio is 366 + * shortened to the remainder and re-submitted. 367 + * 368 + * The split bio is allocated from @q->bio_split, which is provided by the 369 + * block layer. 370 + */ 371 + static inline struct bio *__bio_split_to_limits(struct bio *bio, 372 + const struct queue_limits *lim, unsigned int *nr_segs) 373 + { 374 + switch (bio_op(bio)) { 375 + case REQ_OP_READ: 376 + case REQ_OP_WRITE: 377 + if (bio_may_need_split(bio, lim)) 378 + return bio_split_rw(bio, lim, nr_segs); 379 + *nr_segs = 1; 380 + return bio; 381 + case REQ_OP_ZONE_APPEND: 382 + return bio_split_zone_append(bio, lim, nr_segs); 383 + case REQ_OP_DISCARD: 384 + case REQ_OP_SECURE_ERASE: 385 + return bio_split_discard(bio, lim, nr_segs); 386 + case REQ_OP_WRITE_ZEROES: 387 + return bio_split_write_zeroes(bio, lim, nr_segs); 388 + default: 389 + /* other operations can't be split */ 390 + *nr_segs = 0; 391 + return bio; 392 + } 393 + } 394 + 361 395 int ll_back_merge_fn(struct request *req, struct bio *bio, 362 396 unsigned int nr_segs); 363 397 bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, ··· 572 538 573 539 int bio_add_hw_page(struct request_queue *q, struct bio *bio, 574 540 struct page *page, unsigned 
int len, unsigned int offset, 541 + unsigned int max_sectors, bool *same_page); 542 + 543 + int bio_add_hw_folio(struct request_queue *q, struct bio *bio, 544 + struct folio *folio, size_t len, size_t offset, 575 545 unsigned int max_sectors, bool *same_page); 576 546 577 547 /*

+5 -4

block/ioctl.c

··· 126 126 return -EINVAL; 127 127 128 128 filemap_invalidate_lock(bdev->bd_mapping); 129 - err = truncate_bdev_range(bdev, mode, start, start + len - 1); 129 + err = truncate_bdev_range(bdev, mode, start, end - 1); 130 130 if (err) 131 131 goto fail; 132 132 ··· 163 163 static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, 164 164 void __user *argp) 165 165 { 166 - uint64_t start, len; 166 + uint64_t start, len, end; 167 167 uint64_t range[2]; 168 168 int err; 169 169 ··· 178 178 len = range[1]; 179 179 if ((start & 511) || (len & 511)) 180 180 return -EINVAL; 181 - if (start + len > bdev_nr_bytes(bdev)) 181 + if (check_add_overflow(start, len, &end) || 182 + end > bdev_nr_bytes(bdev)) 182 183 return -EINVAL; 183 184 184 185 filemap_invalidate_lock(bdev->bd_mapping); 185 - err = truncate_bdev_range(bdev, mode, start, start + len - 1); 186 + err = truncate_bdev_range(bdev, mode, start, end - 1); 186 187 if (!err) 187 188 err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, 188 189 GFP_KERNEL);

+2 -6

block/t10-pi.c

··· 8 8 #include <linux/blk-integrity.h> 9 9 #include <linux/crc-t10dif.h> 10 10 #include <linux/crc64.h> 11 - #include <linux/module.h> 12 11 #include <net/checksum.h> 13 12 #include <asm/unaligned.h> 14 13 #include "blk.h" ··· 239 240 } 240 241 } 241 242 242 - static bool ext_pi_ref_escape(u8 *ref_tag) 243 + static bool ext_pi_ref_escape(const u8 ref_tag[6]) 243 244 { 244 - static u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; 245 + static const u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; 245 246 246 247 return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0; 247 248 } ··· 471 472 else 472 473 t10_pi_type1_complete(rq, nr_bytes); 473 474 } 474 - 475 - MODULE_DESCRIPTION("T10 Protection Information module"); 476 - MODULE_LICENSE("GPL");

-11

drivers/block/drbd/drbd_int.h

··· 297 297 unsigned long flags; 298 298 }; 299 299 300 - /* Prototype declaration of function defined in drbd_receiver.c */ 301 - int drbdd_init(struct drbd_thread *); 302 - int drbd_asender(struct drbd_thread *); 303 - 304 300 /* drbd_epoch flag bits */ 305 301 enum { 306 302 DE_HAVE_BARRIER_NUMBER, ··· 860 864 struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */ 861 865 struct list_head net_ee; /* zero-copy network send in progress */ 862 866 863 - int next_barrier_nr; 864 867 struct list_head resync_reads; 865 868 atomic_t pp_in_use; /* allocated from page pool */ 866 869 atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */ ··· 1385 1390 extern void do_submit(struct work_struct *ws); 1386 1391 extern void __drbd_make_request(struct drbd_device *, struct bio *); 1387 1392 void drbd_submit_bio(struct bio *bio); 1388 - extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); 1389 - extern int is_valid_ar_handle(struct drbd_request *, sector_t); 1390 - 1391 1393 1392 1394 /* drbd_nl.c */ 1393 1395 ··· 1466 1474 extern int w_send_write_hint(struct drbd_work *, int); 1467 1475 extern int w_send_dblock(struct drbd_work *, int); 1468 1476 extern int w_send_read_req(struct drbd_work *, int); 1469 - extern int w_e_reissue(struct drbd_work *, int); 1470 1477 extern int w_restart_disk_io(struct drbd_work *, int); 1471 1478 extern int w_send_out_of_sync(struct drbd_work *, int); 1472 1479 ··· 1479 1488 sector_t start, unsigned int nr_sectors, int flags); 1480 1489 extern int drbd_receiver(struct drbd_thread *thi); 1481 1490 extern int drbd_ack_receiver(struct drbd_thread *thi); 1482 - extern void drbd_send_ping_wf(struct work_struct *ws); 1483 1491 extern void drbd_send_acks_wf(struct work_struct *ws); 1484 1492 extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); 1485 1493 extern bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector, ··· 1494 1504 #define 
drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0) 1495 1505 #define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1) 1496 1506 extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool); 1497 - extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); 1498 1507 extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); 1499 1508 extern int drbd_connected(struct drbd_peer_device *); 1500 1509

+1 -1

drivers/block/drbd/drbd_main.c

··· 1550 1550 * put_page(); and would cause either a VM_BUG directly, or 1551 1551 * __page_cache_release a page that would actually still be referenced 1552 1552 * by someone, leading to some obscure delayed Oops somewhere else. */ 1553 - if (!drbd_disable_sendpage && sendpage_ok(page)) 1553 + if (!drbd_disable_sendpage && sendpages_ok(page, len, offset)) 1554 1554 msg.msg_flags |= MSG_NOSIGNAL | MSG_SPLICE_PAGES; 1555 1555 1556 1556 drbd_update_congested(peer_device->connection);

+1 -1

drivers/block/drbd/drbd_state.c

··· 876 876 ns.disk == D_OUTDATED) 877 877 rv = SS_CONNECTED_OUTDATES; 878 878 879 - else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && 879 + else if (nc && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && 880 880 (nc->verify_alg[0] == 0)) 881 881 rv = SS_NO_VERIFY_ALG; 882 882

+1 -18

drivers/block/mtip32xx/mtip32xx.c

··· 2269 2269 .llseek = no_llseek, 2270 2270 }; 2271 2271 2272 - static int mtip_hw_debugfs_init(struct driver_data *dd) 2272 + static void mtip_hw_debugfs_init(struct driver_data *dd) 2273 2273 { 2274 - if (!dfs_parent) 2275 - return -1; 2276 - 2277 2274 dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent); 2278 - if (IS_ERR_OR_NULL(dd->dfs_node)) { 2279 - dev_warn(&dd->pdev->dev, 2280 - "Error creating node %s under debugfs\n", 2281 - dd->disk->disk_name); 2282 - dd->dfs_node = NULL; 2283 - return -1; 2284 - } 2285 - 2286 2275 debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops); 2287 2276 debugfs_create_file("registers", 0444, dd->dfs_node, dd, 2288 2277 &mtip_regs_fops); 2289 - 2290 - return 0; 2291 2278 } 2292 2279 2293 2280 static void mtip_hw_debugfs_exit(struct driver_data *dd) ··· 4030 4043 mtip_major = error; 4031 4044 4032 4045 dfs_parent = debugfs_create_dir("rssd", NULL); 4033 - if (IS_ERR_OR_NULL(dfs_parent)) { 4034 - pr_warn("Error creating debugfs parent\n"); 4035 - dfs_parent = NULL; 4036 - } 4037 4046 4038 4047 /* Register our PCI operations. */ 4039 4048 error = pci_register_driver(&mtip_pci_driver);

+26 -2

drivers/block/nbd.c

··· 181 181 { 182 182 struct request *req = blk_mq_rq_from_pdu(cmd); 183 183 184 + lockdep_assert_held(&cmd->lock); 185 + 186 + /* 187 + * Clear INFLIGHT flag so that this cmd won't be completed in 188 + * normal completion path 189 + * 190 + * INFLIGHT flag will be set when the cmd is queued to nbd next 191 + * time. 192 + */ 193 + __clear_bit(NBD_CMD_INFLIGHT, &cmd->flags); 194 + 184 195 if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags)) 185 196 blk_mq_requeue_request(req, true); 186 197 } ··· 350 339 351 340 lim = queue_limits_start_update(nbd->disk->queue); 352 341 if (nbd->config->flags & NBD_FLAG_SEND_TRIM) 353 - lim.max_hw_discard_sectors = UINT_MAX; 342 + lim.max_hw_discard_sectors = UINT_MAX >> SECTOR_SHIFT; 354 343 else 355 344 lim.max_hw_discard_sectors = 0; 356 345 if (!(nbd->config->flags & NBD_FLAG_SEND_FLUSH)) { ··· 361 350 lim.features |= BLK_FEAT_WRITE_CACHE; 362 351 lim.features &= ~BLK_FEAT_FUA; 363 352 } 353 + if (nbd->config->flags & NBD_FLAG_ROTATIONAL) 354 + lim.features |= BLK_FEAT_ROTATIONAL; 355 + if (nbd->config->flags & NBD_FLAG_SEND_WRITE_ZEROES) 356 + lim.max_write_zeroes_sectors = UINT_MAX >> SECTOR_SHIFT; 357 + 364 358 lim.logical_block_size = blksize; 365 359 lim.physical_block_size = blksize; 366 360 error = queue_limits_commit_update(nbd->disk->queue, &lim); ··· 434 418 return NBD_CMD_WRITE; 435 419 case REQ_OP_READ: 436 420 return NBD_CMD_READ; 421 + case REQ_OP_WRITE_ZEROES: 422 + return NBD_CMD_WRITE_ZEROES; 437 423 default: 438 424 return U32_MAX; 439 425 } ··· 506 488 nbd_mark_nsock_dead(nbd, nsock, 1); 507 489 mutex_unlock(&nsock->tx_lock); 508 490 } 509 - mutex_unlock(&cmd->lock); 510 491 nbd_requeue_cmd(cmd); 492 + mutex_unlock(&cmd->lock); 511 493 nbd_config_put(nbd); 512 494 return BLK_EH_DONE; 513 495 } ··· 652 634 653 635 if (req->cmd_flags & REQ_FUA) 654 636 nbd_cmd_flags |= NBD_CMD_FLAG_FUA; 637 + if ((req->cmd_flags & REQ_NOUNMAP) && (type == NBD_CMD_WRITE_ZEROES)) 638 + nbd_cmd_flags |= NBD_CMD_FLAG_NO_HOLE; 655 
639 656 640 /* We did a partial send previously, and we at least sent the whole 657 641 * request struct, so just go and send the rest of the pages in the ··· 1723 1703 seq_puts(s, "NBD_FLAG_SEND_FUA\n"); 1724 1704 if (flags & NBD_FLAG_SEND_TRIM) 1725 1705 seq_puts(s, "NBD_FLAG_SEND_TRIM\n"); 1706 + if (flags & NBD_FLAG_SEND_WRITE_ZEROES) 1707 + seq_puts(s, "NBD_FLAG_SEND_WRITE_ZEROES\n"); 1708 + if (flags & NBD_FLAG_ROTATIONAL) 1709 + seq_puts(s, "NBD_FLAG_ROTATIONAL\n"); 1726 1710 1727 1711 return 0; 1728 1712 }

-2

drivers/block/pktcdvd.c

··· 498 498 if (!pkt_debugfs_root) 499 499 return; 500 500 pd->dfs_d_root = debugfs_create_dir(pd->disk->disk_name, pkt_debugfs_root); 501 - if (!pd->dfs_d_root) 502 - return; 503 501 504 502 pd->dfs_f_info = debugfs_create_file("info", 0444, pd->dfs_d_root, 505 503 pd, &pkt_seq_fops);

+9 -2

drivers/block/rnbd/rnbd-srv.c

··· 149 149 rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); 150 150 if (bio_add_page(bio, virt_to_page(data), datalen, 151 151 offset_in_page(data)) != datalen) { 152 - rnbd_srv_err(sess_dev, "Failed to map data to bio\n"); 152 + rnbd_srv_err_rl(sess_dev, "Failed to map data to bio\n"); 153 153 err = -EINVAL; 154 154 goto bio_put; 155 155 } 156 156 157 + bio->bi_opf = rnbd_to_bio_flags(le32_to_cpu(msg->rw)); 158 + if (bio_has_data(bio) && 159 + bio->bi_iter.bi_size != le32_to_cpu(msg->bi_size)) { 160 + rnbd_srv_err_rl(sess_dev, "Datalen mismatch: bio bi_size (%u), bi_size (%u)\n", 161 + bio->bi_iter.bi_size, msg->bi_size); 162 + err = -EINVAL; 163 + goto bio_put; 164 + } 157 165 bio->bi_end_io = rnbd_dev_bi_end_io; 158 166 bio->bi_private = priv; 159 167 bio->bi_iter.bi_sector = le64_to_cpu(msg->sector); 160 - bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size); 161 168 prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR || 162 169 usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio); 163 170 bio_set_prio(bio, prio);

+46 -16

drivers/block/ublk_drv.c

··· 71 71 struct llist_node node; 72 72 73 73 struct kref ref; 74 - __u64 sector; 75 - __u32 operation; 76 - __u32 nr_zones; 77 74 }; 78 75 79 76 struct ublk_uring_cmd_pdu { ··· 211 214 212 215 #ifdef CONFIG_BLK_DEV_ZONED 213 216 217 + struct ublk_zoned_report_desc { 218 + __u64 sector; 219 + __u32 operation; 220 + __u32 nr_zones; 221 + }; 222 + 223 + static DEFINE_XARRAY(ublk_zoned_report_descs); 224 + 225 + static int ublk_zoned_insert_report_desc(const struct request *req, 226 + struct ublk_zoned_report_desc *desc) 227 + { 228 + return xa_insert(&ublk_zoned_report_descs, (unsigned long)req, 229 + desc, GFP_KERNEL); 230 + } 231 + 232 + static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc( 233 + const struct request *req) 234 + { 235 + return xa_erase(&ublk_zoned_report_descs, (unsigned long)req); 236 + } 237 + 238 + static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc( 239 + const struct request *req) 240 + { 241 + return xa_load(&ublk_zoned_report_descs, (unsigned long)req); 242 + } 243 + 214 244 static int ublk_get_nr_zones(const struct ublk_device *ub) 215 245 { 216 246 const struct ublk_param_basic *p = &ub->params.basic; ··· 332 308 unsigned int zones_in_request = 333 309 min_t(unsigned int, remaining_zones, max_zones_per_request); 334 310 struct request *req; 335 - struct ublk_rq_data *pdu; 311 + struct ublk_zoned_report_desc desc; 336 312 blk_status_t status; 337 313 338 314 memset(buffer, 0, buffer_length); ··· 343 319 goto out; 344 320 } 345 321 346 - pdu = blk_mq_rq_to_pdu(req); 347 - pdu->operation = UBLK_IO_OP_REPORT_ZONES; 348 - pdu->sector = sector; 349 - pdu->nr_zones = zones_in_request; 322 + desc.operation = UBLK_IO_OP_REPORT_ZONES; 323 + desc.sector = sector; 324 + desc.nr_zones = zones_in_request; 325 + ret = ublk_zoned_insert_report_desc(req, &desc); 326 + if (ret) 327 + goto free_req; 350 328 351 329 ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length, 352 330 GFP_KERNEL); 353 - if (ret) { 354 - 
blk_mq_free_request(req); 355 - goto out; 356 - } 331 + if (ret) 332 + goto erase_desc; 357 333 358 334 status = blk_execute_rq(req, 0); 359 335 ret = blk_status_to_errno(status); 336 + erase_desc: 337 + ublk_zoned_erase_report_desc(req); 338 + free_req: 360 339 blk_mq_free_request(req); 361 340 if (ret) 362 341 goto out; ··· 393 366 { 394 367 struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); 395 368 struct ublk_io *io = &ubq->ios[req->tag]; 396 - struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req); 369 + struct ublk_zoned_report_desc *desc; 397 370 u32 ublk_op; 398 371 399 372 switch (req_op(req)) { ··· 416 389 ublk_op = UBLK_IO_OP_ZONE_RESET_ALL; 417 390 break; 418 391 case REQ_OP_DRV_IN: 419 - ublk_op = pdu->operation; 392 + desc = ublk_zoned_get_report_desc(req); 393 + if (!desc) 394 + return BLK_STS_IOERR; 395 + ublk_op = desc->operation; 420 396 switch (ublk_op) { 421 397 case UBLK_IO_OP_REPORT_ZONES: 422 398 iod->op_flags = ublk_op | ublk_req_build_flags(req); 423 - iod->nr_zones = pdu->nr_zones; 424 - iod->start_sector = pdu->sector; 399 + iod->nr_zones = desc->nr_zones; 400 + iod->start_sector = desc->sector; 425 401 return BLK_STS_OK; 426 402 default: 427 403 return BLK_STS_IOERR;

+10 -6

drivers/block/zram/zram_drv.c

··· 59 59 60 60 static int zram_slot_trylock(struct zram *zram, u32 index) 61 61 { 62 - return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); 62 + return spin_trylock(&zram->table[index].lock); 63 63 } 64 64 65 65 static void zram_slot_lock(struct zram *zram, u32 index) 66 66 { 67 - bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); 67 + spin_lock(&zram->table[index].lock); 68 68 } 69 69 70 70 static void zram_slot_unlock(struct zram *zram, u32 index) 71 71 { 72 - bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); 72 + spin_unlock(&zram->table[index].lock); 73 73 } 74 74 75 75 static inline bool init_done(struct zram *zram) ··· 1211 1211 1212 1212 static bool zram_meta_alloc(struct zram *zram, u64 disksize) 1213 1213 { 1214 - size_t num_pages; 1214 + size_t num_pages, index; 1215 1215 1216 1216 num_pages = disksize >> PAGE_SHIFT; 1217 1217 zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table))); ··· 1226 1226 1227 1227 if (!huge_class_size) 1228 1228 huge_class_size = zs_huge_class_size(zram->mem_pool); 1229 + 1230 + for (index = 0; index < num_pages; index++) 1231 + spin_lock_init(&zram->table[index].lock); 1229 1232 return true; 1230 1233 } 1231 1234 ··· 1286 1283 zram_set_handle(zram, index, 0); 1287 1284 zram_set_obj_size(zram, index, 0); 1288 1285 WARN_ON_ONCE(zram->table[index].flags & 1289 - ~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB)); 1286 + ~(1UL << ZRAM_UNDER_WB)); 1290 1287 } 1291 1288 1292 1289 /* ··· 2404 2401 2405 2402 static int __init zram_init(void) 2406 2403 { 2404 + struct zram_table_entry zram_te; 2407 2405 int ret; 2408 2406 2409 - BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG); 2407 + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > sizeof(zram_te.flags) * 8); 2410 2408 2411 2409 ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", 2412 2410 zcomp_cpu_up_prepare, zcomp_cpu_dead);

+3 -4

drivers/block/zram/zram_drv.h

··· 45 45 46 46 /* Flags for zram pages (table[page_no].flags) */ 47 47 enum zram_pageflags { 48 - /* zram slot is locked */ 49 - ZRAM_LOCK = ZRAM_FLAG_SHIFT, 50 - ZRAM_SAME, /* Page consists the same element */ 48 + ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */ 51 49 ZRAM_WB, /* page is stored on backing_device */ 52 50 ZRAM_UNDER_WB, /* page is under writeback */ 53 51 ZRAM_HUGE, /* Incompressible page */ ··· 66 68 unsigned long handle; 67 69 unsigned long element; 68 70 }; 69 - unsigned long flags; 71 + unsigned int flags; 72 + spinlock_t lock; 70 73 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME 71 74 ktime_t ac_time; 72 75 #endif

+5 -2

drivers/md/dm-raid.c

··· 3949 3949 /* Try loading the bitmap unless "raid0", which does not have one */ 3950 3950 if (!rs_is_raid0(rs) && 3951 3951 !test_and_set_bit(RT_FLAG_RS_BITMAP_LOADED, &rs->runtime_flags)) { 3952 - r = md_bitmap_load(&rs->md); 3952 + struct mddev *mddev = &rs->md; 3953 + 3954 + r = mddev->bitmap_ops->load(mddev); 3953 3955 if (r) 3954 3956 DMERR("Failed to load bitmap"); 3955 3957 } ··· 4068 4066 mddev->bitmap_info.chunksize != to_bytes(rs->requested_bitmap_chunk_sectors)))) { 4069 4067 int chunksize = to_bytes(rs->requested_bitmap_chunk_sectors) ?: mddev->bitmap_info.chunksize; 4070 4068 4071 - r = md_bitmap_resize(mddev->bitmap, mddev->dev_sectors, chunksize, 0); 4069 + r = mddev->bitmap_ops->resize(mddev, mddev->dev_sectors, 4070 + chunksize, false); 4072 4071 if (r) 4073 4072 DMERR("Failed to resize bitmap"); 4074 4073 }

+439 -143

drivers/md/md-bitmap.c

··· 32 32 #include "md.h" 33 33 #include "md-bitmap.h" 34 34 35 + #define BITMAP_MAJOR_LO 3 36 + /* version 4 insists the bitmap is in little-endian order 37 + * with version 3, it is host-endian which is non-portable 38 + * Version 5 is currently set only for clustered devices 39 + */ 40 + #define BITMAP_MAJOR_HI 4 41 + #define BITMAP_MAJOR_CLUSTERED 5 42 + #define BITMAP_MAJOR_HOSTENDIAN 3 43 + 44 + /* 45 + * in-memory bitmap: 46 + * 47 + * Use 16 bit block counters to track pending writes to each "chunk". 48 + * The 2 high order bits are special-purpose, the first is a flag indicating 49 + * whether a resync is needed. The second is a flag indicating whether a 50 + * resync is active. 51 + * This means that the counter is actually 14 bits: 52 + * 53 + * +--------+--------+------------------------------------------------+ 54 + * | resync | resync | counter | 55 + * | needed | active | | 56 + * | (0-1) | (0-1) | (0-16383) | 57 + * +--------+--------+------------------------------------------------+ 58 + * 59 + * The "resync needed" bit is set when: 60 + * a '1' bit is read from storage at startup. 61 + * a write request fails on some drives 62 + * a resync is aborted on a chunk with 'resync active' set 63 + * It is cleared (and resync-active set) when a resync starts across all drives 64 + * of the chunk. 65 + * 66 + * 67 + * The "resync active" bit is set when: 68 + * a resync is started on all drives, and resync_needed is set. 69 + * resync_needed will be cleared (as long as resync_active wasn't already set). 70 + * It is cleared when a resync completes. 71 + * 72 + * The counter counts pending write requests, plus the on-disk bit. 73 + * When the counter is '1' and the resync bits are clear, the on-disk 74 + * bit can be cleared as well, thus setting the counter to 0. 75 + * When we set a bit, or in the counter (to start a write), if the fields is 76 + * 0, we first set the disk bit and set the counter to 1. 
77 + * 78 + * If the counter is 0, the on-disk bit is clear and the stripe is clean 79 + * Anything that dirties the stripe pushes the counter to 2 (at least) 80 + * and sets the on-disk bit (lazily). 81 + * If a periodic sweep find the counter at 2, it is decremented to 1. 82 + * If the sweep find the counter at 1, the on-disk bit is cleared and the 83 + * counter goes to zero. 84 + * 85 + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block 86 + * counters as a fallback when "page" memory cannot be allocated: 87 + * 88 + * Normal case (page memory allocated): 89 + * 90 + * page pointer (32-bit) 91 + * 92 + * [ ] ------+ 93 + * | 94 + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) 95 + * c1 c2 c2048 96 + * 97 + * Hijacked case (page memory allocation failed): 98 + * 99 + * hijacked page pointer (32-bit) 100 + * 101 + * [ ][ ] (no page memory allocated) 102 + * counter #1 (16-bit) counter #2 (16-bit) 103 + * 104 + */ 105 + 106 + #define PAGE_BITS (PAGE_SIZE << 3) 107 + #define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) 108 + 109 + #define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) 110 + #define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) 111 + #define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) 112 + 113 + /* how many counters per page? 
*/ 114 + #define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) 115 + /* same, except a shift value for more efficient bitops */ 116 + #define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) 117 + /* same, except a mask value for more efficient bitops */ 118 + #define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) 119 + 120 + #define BITMAP_BLOCK_SHIFT 9 121 + 122 + /* 123 + * bitmap structures: 124 + */ 125 + 126 + /* the in-memory bitmap is represented by bitmap_pages */ 127 + struct bitmap_page { 128 + /* 129 + * map points to the actual memory page 130 + */ 131 + char *map; 132 + /* 133 + * in emergencies (when map cannot be alloced), hijack the map 134 + * pointer and use it as two counters itself 135 + */ 136 + unsigned int hijacked:1; 137 + /* 138 + * If any counter in this page is '1' or '2' - and so could be 139 + * cleared then that page is marked as 'pending' 140 + */ 141 + unsigned int pending:1; 142 + /* 143 + * count of dirty bits on the page 144 + */ 145 + unsigned int count:30; 146 + }; 147 + 148 + /* the main bitmap structure - one per mddev */ 149 + struct bitmap { 150 + 151 + struct bitmap_counts { 152 + spinlock_t lock; 153 + struct bitmap_page *bp; 154 + /* total number of pages in the bitmap */ 155 + unsigned long pages; 156 + /* number of pages not yet allocated */ 157 + unsigned long missing_pages; 158 + /* chunksize = 2^chunkshift (for bitops) */ 159 + unsigned long chunkshift; 160 + /* total number of data chunks for the array */ 161 + unsigned long chunks; 162 + } counts; 163 + 164 + struct mddev *mddev; /* the md device that the bitmap is for */ 165 + 166 + __u64 events_cleared; 167 + int need_sync; 168 + 169 + struct bitmap_storage { 170 + /* backing disk file */ 171 + struct file *file; 172 + /* cached copy of the bitmap file superblock */ 173 + struct page *sb_page; 174 + unsigned long sb_index; 175 + /* list of cache pages for the file */ 176 + struct page **filemap; 177 + /* attributes associated filemap pages */ 178 + unsigned 
long *filemap_attr; 179 + /* number of pages in the file */ 180 + unsigned long file_pages; 181 + /* total bytes in the bitmap */ 182 + unsigned long bytes; 183 + } storage; 184 + 185 + unsigned long flags; 186 + 187 + int allclean; 188 + 189 + atomic_t behind_writes; 190 + /* highest actual value at runtime */ 191 + unsigned long behind_writes_used; 192 + 193 + /* 194 + * the bitmap daemon - periodically wakes up and sweeps the bitmap 195 + * file, cleaning up bits and flushing out pages to disk as necessary 196 + */ 197 + unsigned long daemon_lastrun; /* jiffies of last run */ 198 + /* 199 + * when we lasted called end_sync to update bitmap with resync 200 + * progress. 201 + */ 202 + unsigned long last_end_sync; 203 + 204 + /* pending writes to the bitmap file */ 205 + atomic_t pending_writes; 206 + wait_queue_head_t write_wait; 207 + wait_queue_head_t overflow_wait; 208 + wait_queue_head_t behind_wait; 209 + 210 + struct kernfs_node *sysfs_can_clear; 211 + /* slot offset for clustered env */ 212 + int cluster_slot; 213 + }; 214 + 215 + static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, 216 + int chunksize, bool init); 217 + 35 218 static inline char *bmname(struct bitmap *bitmap) 36 219 { 37 220 return bitmap->mddev ? 
mdname(bitmap->mddev) : "mdX"; 221 + } 222 + 223 + static bool __bitmap_enabled(struct bitmap *bitmap) 224 + { 225 + return bitmap->storage.filemap && 226 + !test_bit(BITMAP_STALE, &bitmap->flags); 227 + } 228 + 229 + static bool bitmap_enabled(struct mddev *mddev) 230 + { 231 + struct bitmap *bitmap = mddev->bitmap; 232 + 233 + if (!bitmap) 234 + return false; 235 + 236 + return __bitmap_enabled(bitmap); 38 237 } 39 238 40 239 /* ··· 671 472 672 473 673 474 /* update the event counter and sync the superblock to disk */ 674 - void md_bitmap_update_sb(struct bitmap *bitmap) 475 + static void bitmap_update_sb(void *data) 675 476 { 676 477 bitmap_super_t *sb; 478 + struct bitmap *bitmap = data; 677 479 678 480 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ 679 481 return; ··· 710 510 write_sb_page(bitmap, bitmap->storage.sb_index, 711 511 bitmap->storage.sb_page, 1); 712 512 } 713 - EXPORT_SYMBOL(md_bitmap_update_sb); 714 513 715 - /* print out the bitmap file superblock */ 716 - void md_bitmap_print_sb(struct bitmap *bitmap) 514 + static void bitmap_print_sb(struct bitmap *bitmap) 717 515 { 718 516 bitmap_super_t *sb; 719 517 ··· 958 760 bitmap->mddev->bitmap_info.space > sectors_reserved) 959 761 bitmap->mddev->bitmap_info.space = sectors_reserved; 960 762 } else { 961 - md_bitmap_print_sb(bitmap); 763 + bitmap_print_sb(bitmap); 962 764 if (bitmap->cluster_slot < 0) 963 765 md_cluster_stop(bitmap->mddev); 964 766 } ··· 1091 893 static void md_bitmap_file_kick(struct bitmap *bitmap) 1092 894 { 1093 895 if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { 1094 - md_bitmap_update_sb(bitmap); 896 + bitmap_update_sb(bitmap); 1095 897 1096 898 if (bitmap->storage.file) { 1097 899 pr_warn("%s: kicking failed bitmap file %pD4 from array!\n", ··· 1226 1028 /* this gets called when the md device is ready to unplug its underlying 1227 1029 * (slave) device queues -- before we let any writes go down, we need to 1228 1030 * sync the dirty pages of the bitmap 
file to disk */ 1229 - void md_bitmap_unplug(struct bitmap *bitmap) 1031 + static void __bitmap_unplug(struct bitmap *bitmap) 1230 1032 { 1231 1033 unsigned long i; 1232 1034 int dirty, need_write; 1233 1035 int writing = 0; 1234 1036 1235 - if (!md_bitmap_enabled(bitmap)) 1037 + if (!__bitmap_enabled(bitmap)) 1236 1038 return; 1237 1039 1238 1040 /* look at each page to see if there are any set bits that need to be ··· 1258 1060 if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) 1259 1061 md_bitmap_file_kick(bitmap); 1260 1062 } 1261 - EXPORT_SYMBOL(md_bitmap_unplug); 1262 1063 1263 1064 struct bitmap_unplug_work { 1264 1065 struct work_struct work; ··· 1270 1073 struct bitmap_unplug_work *unplug_work = 1271 1074 container_of(work, struct bitmap_unplug_work, work); 1272 1075 1273 - md_bitmap_unplug(unplug_work->bitmap); 1076 + __bitmap_unplug(unplug_work->bitmap); 1274 1077 complete(unplug_work->done); 1275 1078 } 1276 1079 1277 - void md_bitmap_unplug_async(struct bitmap *bitmap) 1080 + static void bitmap_unplug_async(struct bitmap *bitmap) 1278 1081 { 1279 1082 DECLARE_COMPLETION_ONSTACK(done); 1280 1083 struct bitmap_unplug_work unplug_work; ··· 1286 1089 queue_work(md_bitmap_wq, &unplug_work.work); 1287 1090 wait_for_completion(&done); 1288 1091 } 1289 - EXPORT_SYMBOL(md_bitmap_unplug_async); 1092 + 1093 + static void bitmap_unplug(struct mddev *mddev, bool sync) 1094 + { 1095 + struct bitmap *bitmap = mddev->bitmap; 1096 + 1097 + if (!bitmap) 1098 + return; 1099 + 1100 + if (sync) 1101 + __bitmap_unplug(bitmap); 1102 + else 1103 + bitmap_unplug_async(bitmap); 1104 + } 1290 1105 1291 1106 static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); 1292 1107 ··· 1435 1226 return ret; 1436 1227 } 1437 1228 1438 - void md_bitmap_write_all(struct bitmap *bitmap) 1229 + /* just flag bitmap pages as needing to be written. 
*/ 1230 + static void bitmap_write_all(struct mddev *mddev) 1439 1231 { 1440 - /* We don't actually write all bitmap blocks here, 1441 - * just flag them as needing to be written 1442 - */ 1443 1232 int i; 1233 + struct bitmap *bitmap = mddev->bitmap; 1444 1234 1445 1235 if (!bitmap || !bitmap->storage.filemap) 1446 1236 return; 1237 + 1238 + /* Only one copy, so nothing needed */ 1447 1239 if (bitmap->storage.file) 1448 - /* Only one copy, so nothing needed */ 1449 1240 return; 1450 1241 1451 1242 for (i = 0; i < bitmap->storage.file_pages; i++) 1452 - set_page_attr(bitmap, i, 1453 - BITMAP_PAGE_NEEDWRITE); 1243 + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); 1454 1244 bitmap->allclean = 0; 1455 1245 } 1456 1246 ··· 1498 1290 * bitmap daemon -- periodically wakes up to clean bits and flush pages 1499 1291 * out to disk 1500 1292 */ 1501 - void md_bitmap_daemon_work(struct mddev *mddev) 1293 + static void bitmap_daemon_work(struct mddev *mddev) 1502 1294 { 1503 1295 struct bitmap *bitmap; 1504 1296 unsigned long j; ··· 1669 1461 &(bitmap->bp[page].map[pageoff]); 1670 1462 } 1671 1463 1672 - int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) 1464 + static int bitmap_startwrite(struct mddev *mddev, sector_t offset, 1465 + unsigned long sectors, bool behind) 1673 1466 { 1467 + struct bitmap *bitmap = mddev->bitmap; 1468 + 1674 1469 if (!bitmap) 1675 1470 return 0; 1676 1471 ··· 1734 1523 } 1735 1524 return 0; 1736 1525 } 1737 - EXPORT_SYMBOL(md_bitmap_startwrite); 1738 1526 1739 - void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, 1740 - unsigned long sectors, int success, int behind) 1527 + static void bitmap_endwrite(struct mddev *mddev, sector_t offset, 1528 + unsigned long sectors, bool success, bool behind) 1741 1529 { 1530 + struct bitmap *bitmap = mddev->bitmap; 1531 + 1742 1532 if (!bitmap) 1743 1533 return; 1534 + 1744 1535 if (behind) { 1745 1536 if (atomic_dec_and_test(&bitmap->behind_writes)) 
1746 1537 wake_up(&bitmap->behind_wait); ··· 1789 1576 sectors = 0; 1790 1577 } 1791 1578 } 1792 - EXPORT_SYMBOL(md_bitmap_endwrite); 1793 1579 1794 - static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, 1795 - int degraded) 1580 + static bool __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, 1581 + sector_t *blocks, bool degraded) 1796 1582 { 1797 1583 bitmap_counter_t *bmc; 1798 - int rv; 1584 + bool rv; 1585 + 1799 1586 if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ 1800 1587 *blocks = 1024; 1801 - return 1; /* always resync if no bitmap */ 1588 + return true; /* always resync if no bitmap */ 1802 1589 } 1803 1590 spin_lock_irq(&bitmap->counts.lock); 1591 + 1592 + rv = false; 1804 1593 bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0); 1805 - rv = 0; 1806 1594 if (bmc) { 1807 1595 /* locked */ 1808 - if (RESYNC(*bmc)) 1809 - rv = 1; 1810 - else if (NEEDED(*bmc)) { 1811 - rv = 1; 1596 + if (RESYNC(*bmc)) { 1597 + rv = true; 1598 + } else if (NEEDED(*bmc)) { 1599 + rv = true; 1812 1600 if (!degraded) { /* don't set/clear bits if degraded */ 1813 1601 *bmc |= RESYNC_MASK; 1814 1602 *bmc &= ~NEEDED_MASK; ··· 1817 1603 } 1818 1604 } 1819 1605 spin_unlock_irq(&bitmap->counts.lock); 1606 + 1820 1607 return rv; 1821 1608 } 1822 1609 1823 - int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, 1824 - int degraded) 1610 + static bool bitmap_start_sync(struct mddev *mddev, sector_t offset, 1611 + sector_t *blocks, bool degraded) 1825 1612 { 1826 1613 /* bitmap_start_sync must always report on multiples of whole 1827 1614 * pages, otherwise resync (which is very PAGE_SIZE based) will ··· 1831 1616 * At least PAGE_SIZE>>9 blocks are covered. 1832 1617 * Return the 'or' of the result. 
1833 1618 */ 1834 - int rv = 0; 1619 + bool rv = false; 1835 1620 sector_t blocks1; 1836 1621 1837 1622 *blocks = 0; 1838 1623 while (*blocks < (PAGE_SIZE>>9)) { 1839 - rv |= __bitmap_start_sync(bitmap, offset, 1624 + rv |= __bitmap_start_sync(mddev->bitmap, offset, 1840 1625 &blocks1, degraded); 1841 1626 offset += blocks1; 1842 1627 *blocks += blocks1; 1843 1628 } 1629 + 1844 1630 return rv; 1845 1631 } 1846 - EXPORT_SYMBOL(md_bitmap_start_sync); 1847 1632 1848 - void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) 1633 + static void __bitmap_end_sync(struct bitmap *bitmap, sector_t offset, 1634 + sector_t *blocks, bool aborted) 1849 1635 { 1850 1636 bitmap_counter_t *bmc; 1851 1637 unsigned long flags; ··· 1875 1659 unlock: 1876 1660 spin_unlock_irqrestore(&bitmap->counts.lock, flags); 1877 1661 } 1878 - EXPORT_SYMBOL(md_bitmap_end_sync); 1879 1662 1880 - void md_bitmap_close_sync(struct bitmap *bitmap) 1663 + static void bitmap_end_sync(struct mddev *mddev, sector_t offset, 1664 + sector_t *blocks) 1665 + { 1666 + __bitmap_end_sync(mddev->bitmap, offset, blocks, true); 1667 + } 1668 + 1669 + static void bitmap_close_sync(struct mddev *mddev) 1881 1670 { 1882 1671 /* Sync has finished, and any bitmap chunks that weren't synced 1883 1672 * properly have been aborted. 
It remains to us to clear the ··· 1890 1669 */ 1891 1670 sector_t sector = 0; 1892 1671 sector_t blocks; 1672 + struct bitmap *bitmap = mddev->bitmap; 1673 + 1893 1674 if (!bitmap) 1894 1675 return; 1676 + 1895 1677 while (sector < bitmap->mddev->resync_max_sectors) { 1896 - md_bitmap_end_sync(bitmap, sector, &blocks, 0); 1678 + __bitmap_end_sync(bitmap, sector, &blocks, false); 1897 1679 sector += blocks; 1898 1680 } 1899 1681 } 1900 - EXPORT_SYMBOL(md_bitmap_close_sync); 1901 1682 1902 - void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) 1683 + static void bitmap_cond_end_sync(struct mddev *mddev, sector_t sector, 1684 + bool force) 1903 1685 { 1904 1686 sector_t s = 0; 1905 1687 sector_t blocks; 1688 + struct bitmap *bitmap = mddev->bitmap; 1906 1689 1907 1690 if (!bitmap) 1908 1691 return; ··· 1925 1700 sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); 1926 1701 s = 0; 1927 1702 while (s < sector && s < bitmap->mddev->resync_max_sectors) { 1928 - md_bitmap_end_sync(bitmap, s, &blocks, 0); 1703 + __bitmap_end_sync(bitmap, s, &blocks, false); 1929 1704 s += blocks; 1930 1705 } 1931 1706 bitmap->last_end_sync = jiffies; 1932 1707 sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); 1933 1708 } 1934 - EXPORT_SYMBOL(md_bitmap_cond_end_sync); 1935 1709 1936 - void md_bitmap_sync_with_cluster(struct mddev *mddev, 1937 - sector_t old_lo, sector_t old_hi, 1938 - sector_t new_lo, sector_t new_hi) 1710 + static void bitmap_sync_with_cluster(struct mddev *mddev, 1711 + sector_t old_lo, sector_t old_hi, 1712 + sector_t new_lo, sector_t new_hi) 1939 1713 { 1940 1714 struct bitmap *bitmap = mddev->bitmap; 1941 1715 sector_t sector, blocks = 0; 1942 1716 1943 1717 for (sector = old_lo; sector < new_lo; ) { 1944 - md_bitmap_end_sync(bitmap, sector, &blocks, 0); 1718 + __bitmap_end_sync(bitmap, sector, &blocks, false); 1945 1719 sector += blocks; 1946 1720 } 1947 1721 WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); 
1948 1722 1949 1723 for (sector = old_hi; sector < new_hi; ) { 1950 - md_bitmap_start_sync(bitmap, sector, &blocks, 0); 1724 + bitmap_start_sync(mddev, sector, &blocks, false); 1951 1725 sector += blocks; 1952 1726 } 1953 1727 WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); 1954 1728 } 1955 - EXPORT_SYMBOL(md_bitmap_sync_with_cluster); 1956 1729 1957 1730 static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) 1958 1731 { ··· 1979 1756 } 1980 1757 1981 1758 /* dirty the memory and file bits for bitmap chunks "s" to "e" */ 1982 - void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) 1759 + static void bitmap_dirty_bits(struct mddev *mddev, unsigned long s, 1760 + unsigned long e) 1983 1761 { 1984 1762 unsigned long chunk; 1763 + struct bitmap *bitmap = mddev->bitmap; 1764 + 1765 + if (!bitmap) 1766 + return; 1985 1767 1986 1768 for (chunk = s; chunk <= e; chunk++) { 1987 1769 sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; 1770 + 1988 1771 md_bitmap_set_memory_bits(bitmap, sec, 1); 1989 1772 md_bitmap_file_set_bit(bitmap, sec); 1990 1773 if (sec < bitmap->mddev->recovery_cp) ··· 2002 1773 } 2003 1774 } 2004 1775 2005 - /* 2006 - * flush out any pending updates 2007 - */ 2008 - void md_bitmap_flush(struct mddev *mddev) 1776 + static void bitmap_flush(struct mddev *mddev) 2009 1777 { 2010 1778 struct bitmap *bitmap = mddev->bitmap; 2011 1779 long sleep; ··· 2015 1789 */ 2016 1790 sleep = mddev->bitmap_info.daemon_sleep * 2; 2017 1791 bitmap->daemon_lastrun -= sleep; 2018 - md_bitmap_daemon_work(mddev); 1792 + bitmap_daemon_work(mddev); 2019 1793 bitmap->daemon_lastrun -= sleep; 2020 - md_bitmap_daemon_work(mddev); 1794 + bitmap_daemon_work(mddev); 2021 1795 bitmap->daemon_lastrun -= sleep; 2022 - md_bitmap_daemon_work(mddev); 1796 + bitmap_daemon_work(mddev); 2023 1797 if (mddev->bitmap_info.external) 2024 1798 md_super_wait(mddev); 2025 - md_bitmap_update_sb(bitmap); 
1799 + bitmap_update_sb(bitmap); 2026 1800 } 2027 1801 2028 - /* 2029 - * free memory that was allocated 2030 - */ 2031 - void md_bitmap_free(struct bitmap *bitmap) 1802 + static void md_bitmap_free(void *data) 2032 1803 { 2033 1804 unsigned long k, pages; 2034 1805 struct bitmap_page *bp; 1806 + struct bitmap *bitmap = data; 2035 1807 2036 1808 if (!bitmap) /* there was no bitmap */ 2037 1809 return; ··· 2060 1836 kfree(bp); 2061 1837 kfree(bitmap); 2062 1838 } 2063 - EXPORT_SYMBOL(md_bitmap_free); 2064 1839 2065 - void md_bitmap_wait_behind_writes(struct mddev *mddev) 1840 + static void bitmap_wait_behind_writes(struct mddev *mddev) 2066 1841 { 2067 1842 struct bitmap *bitmap = mddev->bitmap; 2068 1843 ··· 2075 1852 } 2076 1853 } 2077 1854 2078 - void md_bitmap_destroy(struct mddev *mddev) 1855 + static void bitmap_destroy(struct mddev *mddev) 2079 1856 { 2080 1857 struct bitmap *bitmap = mddev->bitmap; 2081 1858 2082 1859 if (!bitmap) /* there was no bitmap */ 2083 1860 return; 2084 1861 2085 - md_bitmap_wait_behind_writes(mddev); 1862 + bitmap_wait_behind_writes(mddev); 2086 1863 if (!mddev->serialize_policy) 2087 1864 mddev_destroy_serial_pool(mddev, NULL); 2088 1865 ··· 2101 1878 * if this returns an error, bitmap_destroy must be called to do clean up 2102 1879 * once mddev->bitmap is set 2103 1880 */ 2104 - struct bitmap *md_bitmap_create(struct mddev *mddev, int slot) 1881 + static struct bitmap *__bitmap_create(struct mddev *mddev, int slot) 2105 1882 { 2106 1883 struct bitmap *bitmap; 2107 1884 sector_t blocks = mddev->resync_max_sectors; ··· 2171 1948 goto error; 2172 1949 2173 1950 bitmap->daemon_lastrun = jiffies; 2174 - err = md_bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); 1951 + err = __bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1952 + true); 2175 1953 if (err) 2176 1954 goto error; 2177 1955 ··· 2189 1965 return ERR_PTR(err); 2190 1966 } 2191 1967 2192 - int md_bitmap_load(struct mddev *mddev) 1968 + static int 
bitmap_create(struct mddev *mddev, int slot) 1969 + { 1970 + struct bitmap *bitmap = __bitmap_create(mddev, slot); 1971 + 1972 + if (IS_ERR(bitmap)) 1973 + return PTR_ERR(bitmap); 1974 + 1975 + mddev->bitmap = bitmap; 1976 + return 0; 1977 + } 1978 + 1979 + static int bitmap_load(struct mddev *mddev) 2193 1980 { 2194 1981 int err = 0; 2195 1982 sector_t start = 0; ··· 2224 1989 */ 2225 1990 while (sector < mddev->resync_max_sectors) { 2226 1991 sector_t blocks; 2227 - md_bitmap_start_sync(bitmap, sector, &blocks, 0); 1992 + bitmap_start_sync(mddev, sector, &blocks, false); 2228 1993 sector += blocks; 2229 1994 } 2230 - md_bitmap_close_sync(bitmap); 1995 + bitmap_close_sync(mddev); 2231 1996 2232 1997 if (mddev->degraded == 0 2233 1998 || bitmap->events_cleared == mddev->events) ··· 2249 2014 mddev_set_timeout(mddev, mddev->bitmap_info.daemon_sleep, true); 2250 2015 md_wakeup_thread(mddev->thread); 2251 2016 2252 - md_bitmap_update_sb(bitmap); 2017 + bitmap_update_sb(bitmap); 2253 2018 2254 2019 if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) 2255 2020 err = -EIO; 2256 2021 out: 2257 2022 return err; 2258 2023 } 2259 - EXPORT_SYMBOL_GPL(md_bitmap_load); 2260 2024 2261 2025 /* caller need to free returned bitmap with md_bitmap_free() */ 2262 - struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) 2026 + static void *bitmap_get_from_slot(struct mddev *mddev, int slot) 2263 2027 { 2264 2028 int rv = 0; 2265 2029 struct bitmap *bitmap; 2266 2030 2267 - bitmap = md_bitmap_create(mddev, slot); 2031 + bitmap = __bitmap_create(mddev, slot); 2268 2032 if (IS_ERR(bitmap)) { 2269 2033 rv = PTR_ERR(bitmap); 2270 2034 return ERR_PTR(rv); ··· 2277 2043 2278 2044 return bitmap; 2279 2045 } 2280 - EXPORT_SYMBOL(get_bitmap_from_slot); 2281 2046 2282 2047 /* Loads the bitmap associated with slot and copies the resync information 2283 2048 * to our bitmap 2284 2049 */ 2285 - int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, 2286 - sector_t *low, sector_t 
*high, bool clear_bits) 2050 + static int bitmap_copy_from_slot(struct mddev *mddev, int slot, sector_t *low, 2051 + sector_t *high, bool clear_bits) 2287 2052 { 2288 2053 int rv = 0, i, j; 2289 2054 sector_t block, lo = 0, hi = 0; 2290 2055 struct bitmap_counts *counts; 2291 2056 struct bitmap *bitmap; 2292 2057 2293 - bitmap = get_bitmap_from_slot(mddev, slot); 2058 + bitmap = bitmap_get_from_slot(mddev, slot); 2294 2059 if (IS_ERR(bitmap)) { 2295 2060 pr_err("%s can't get bitmap from slot %d\n", __func__, slot); 2296 2061 return -1; ··· 2309 2076 } 2310 2077 2311 2078 if (clear_bits) { 2312 - md_bitmap_update_sb(bitmap); 2079 + bitmap_update_sb(bitmap); 2313 2080 /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs 2314 2081 * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ 2315 2082 for (i = 0; i < bitmap->storage.file_pages; i++) 2316 2083 if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) 2317 2084 set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); 2318 - md_bitmap_unplug(bitmap); 2085 + __bitmap_unplug(bitmap); 2319 2086 } 2320 - md_bitmap_unplug(mddev->bitmap); 2087 + __bitmap_unplug(mddev->bitmap); 2321 2088 *low = lo; 2322 2089 *high = hi; 2323 2090 md_bitmap_free(bitmap); 2324 2091 2325 2092 return rv; 2326 2093 } 2327 - EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); 2328 2094 2329 - 2330 - void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap) 2095 + static void bitmap_set_pages(void *data, unsigned long pages) 2331 2096 { 2332 - unsigned long chunk_kb; 2333 - struct bitmap_counts *counts; 2097 + struct bitmap *bitmap = data; 2334 2098 2335 - if (!bitmap) 2336 - return; 2337 - 2338 - counts = &bitmap->counts; 2339 - 2340 - chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; 2341 - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 2342 - "%lu%s chunk", 2343 - counts->pages - counts->missing_pages, 2344 - counts->pages, 2345 - (counts->pages - counts->missing_pages) 2346 - << (PAGE_SHIFT - 10), 2347 - chunk_kb ? 
chunk_kb : bitmap->mddev->bitmap_info.chunksize, 2348 - chunk_kb ? "KB" : "B"); 2349 - if (bitmap->storage.file) { 2350 - seq_printf(seq, ", file: "); 2351 - seq_file_path(seq, bitmap->storage.file, " \t\n"); 2352 - } 2353 - 2354 - seq_printf(seq, "\n"); 2099 + bitmap->counts.pages = pages; 2355 2100 } 2356 2101 2357 - int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, 2358 - int chunksize, int init) 2102 + static int bitmap_get_stats(void *data, struct md_bitmap_stats *stats) 2103 + { 2104 + struct bitmap_storage *storage; 2105 + struct bitmap_counts *counts; 2106 + struct bitmap *bitmap = data; 2107 + bitmap_super_t *sb; 2108 + 2109 + if (!bitmap) 2110 + return -ENOENT; 2111 + 2112 + sb = kmap_local_page(bitmap->storage.sb_page); 2113 + stats->sync_size = le64_to_cpu(sb->sync_size); 2114 + kunmap_local(sb); 2115 + 2116 + counts = &bitmap->counts; 2117 + stats->missing_pages = counts->missing_pages; 2118 + stats->pages = counts->pages; 2119 + 2120 + storage = &bitmap->storage; 2121 + stats->file_pages = storage->file_pages; 2122 + stats->file = storage->file; 2123 + 2124 + stats->behind_writes = atomic_read(&bitmap->behind_writes); 2125 + stats->behind_wait = wq_has_sleeper(&bitmap->behind_wait); 2126 + stats->events_cleared = bitmap->events_cleared; 2127 + return 0; 2128 + } 2129 + 2130 + static int __bitmap_resize(struct bitmap *bitmap, sector_t blocks, 2131 + int chunksize, bool init) 2359 2132 { 2360 2133 /* If chunk_size is 0, choose an appropriate chunk size. 2361 2134 * Then possibly allocate new storage space. 
··· 2559 2320 spin_unlock_irq(&bitmap->counts.lock); 2560 2321 2561 2322 if (!init) { 2562 - md_bitmap_unplug(bitmap); 2323 + __bitmap_unplug(bitmap); 2563 2324 bitmap->mddev->pers->quiesce(bitmap->mddev, 0); 2564 2325 } 2565 2326 ret = 0; 2566 2327 err: 2567 2328 return ret; 2568 2329 } 2569 - EXPORT_SYMBOL_GPL(md_bitmap_resize); 2330 + 2331 + static int bitmap_resize(struct mddev *mddev, sector_t blocks, int chunksize, 2332 + bool init) 2333 + { 2334 + struct bitmap *bitmap = mddev->bitmap; 2335 + 2336 + if (!bitmap) 2337 + return 0; 2338 + 2339 + return __bitmap_resize(bitmap, blocks, chunksize, init); 2340 + } 2570 2341 2571 2342 static ssize_t 2572 2343 location_show(struct mddev *mddev, char *page) ··· 2616 2367 goto out; 2617 2368 } 2618 2369 2619 - md_bitmap_destroy(mddev); 2370 + bitmap_destroy(mddev); 2620 2371 mddev->bitmap_info.offset = 0; 2621 2372 if (mddev->bitmap_info.file) { 2622 2373 struct file *f = mddev->bitmap_info.file; ··· 2626 2377 } else { 2627 2378 /* No bitmap, OK to set a location */ 2628 2379 long long offset; 2629 - struct bitmap *bitmap; 2630 2380 2631 2381 if (strncmp(buf, "none", 4) == 0) 2632 2382 /* nothing to be done */; ··· 2652 2404 } 2653 2405 2654 2406 mddev->bitmap_info.offset = offset; 2655 - bitmap = md_bitmap_create(mddev, -1); 2656 - if (IS_ERR(bitmap)) { 2657 - rv = PTR_ERR(bitmap); 2407 + rv = bitmap_create(mddev, -1); 2408 + if (rv) 2658 2409 goto out; 2659 - } 2660 2410 2661 - mddev->bitmap = bitmap; 2662 - rv = md_bitmap_load(mddev); 2411 + rv = bitmap_load(mddev); 2663 2412 if (rv) { 2664 2413 mddev->bitmap_info.offset = 0; 2665 - md_bitmap_destroy(mddev); 2414 + bitmap_destroy(mddev); 2666 2415 goto out; 2667 2416 } 2668 2417 } ··· 2695 2450 static ssize_t 2696 2451 space_store(struct mddev *mddev, const char *buf, size_t len) 2697 2452 { 2453 + struct bitmap *bitmap; 2698 2454 unsigned long sectors; 2699 2455 int rv; 2700 2456 ··· 2706 2460 if (sectors == 0) 2707 2461 return -EINVAL; 2708 2462 2709 - if 
(mddev->bitmap && 2710 - sectors < (mddev->bitmap->storage.bytes + 511) >> 9) 2463 + bitmap = mddev->bitmap; 2464 + if (bitmap && sectors < (bitmap->storage.bytes + 511) >> 9) 2711 2465 return -EFBIG; /* Bitmap is too big for this small space */ 2712 2466 2713 2467 /* could make sure it isn't too big, but that isn't really ··· 2815 2569 mddev_create_serial_pool(mddev, rdev); 2816 2570 } 2817 2571 if (old_mwb != backlog) 2818 - md_bitmap_update_sb(mddev->bitmap); 2572 + bitmap_update_sb(mddev->bitmap); 2819 2573 2820 2574 mddev_unlock_and_resume(mddev); 2821 2575 return len; ··· 2884 2638 static ssize_t can_clear_show(struct mddev *mddev, char *page) 2885 2639 { 2886 2640 int len; 2641 + struct bitmap *bitmap; 2642 + 2887 2643 spin_lock(&mddev->lock); 2888 - if (mddev->bitmap) 2889 - len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? 2890 - "false" : "true")); 2644 + bitmap = mddev->bitmap; 2645 + if (bitmap) 2646 + len = sprintf(page, "%s\n", (bitmap->need_sync ? "false" : 2647 + "true")); 2891 2648 else 2892 2649 len = sprintf(page, "\n"); 2893 2650 spin_unlock(&mddev->lock); ··· 2899 2650 2900 2651 static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) 2901 2652 { 2902 - if (mddev->bitmap == NULL) 2653 + struct bitmap *bitmap = mddev->bitmap; 2654 + 2655 + if (!bitmap) 2903 2656 return -ENOENT; 2904 - if (strncmp(buf, "false", 5) == 0) 2905 - mddev->bitmap->need_sync = 1; 2906 - else if (strncmp(buf, "true", 4) == 0) { 2657 + 2658 + if (strncmp(buf, "false", 5) == 0) { 2659 + bitmap->need_sync = 1; 2660 + return len; 2661 + } 2662 + 2663 + if (strncmp(buf, "true", 4) == 0) { 2907 2664 if (mddev->degraded) 2908 2665 return -EBUSY; 2909 - mddev->bitmap->need_sync = 0; 2910 - } else 2911 - return -EINVAL; 2912 - return len; 2666 + bitmap->need_sync = 0; 2667 + return len; 2668 + } 2669 + 2670 + return -EINVAL; 2913 2671 } 2914 2672 2915 2673 static struct md_sysfs_entry bitmap_can_clear = ··· 2926 2670 behind_writes_used_show(struct 
mddev *mddev, char *page) 2927 2671 { 2928 2672 ssize_t ret; 2673 + struct bitmap *bitmap; 2674 + 2929 2675 spin_lock(&mddev->lock); 2930 - if (mddev->bitmap == NULL) 2676 + bitmap = mddev->bitmap; 2677 + if (!bitmap) 2931 2678 ret = sprintf(page, "0\n"); 2932 2679 else 2933 - ret = sprintf(page, "%lu\n", 2934 - mddev->bitmap->behind_writes_used); 2680 + ret = sprintf(page, "%lu\n", bitmap->behind_writes_used); 2935 2681 spin_unlock(&mddev->lock); 2682 + 2936 2683 return ret; 2937 2684 } 2938 2685 2939 2686 static ssize_t 2940 2687 behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) 2941 2688 { 2942 - if (mddev->bitmap) 2943 - mddev->bitmap->behind_writes_used = 0; 2689 + struct bitmap *bitmap = mddev->bitmap; 2690 + 2691 + if (bitmap) 2692 + bitmap->behind_writes_used = 0; 2944 2693 return len; 2945 2694 } 2946 2695 ··· 2968 2707 .name = "bitmap", 2969 2708 .attrs = md_bitmap_attrs, 2970 2709 }; 2710 + 2711 + static struct bitmap_operations bitmap_ops = { 2712 + .enabled = bitmap_enabled, 2713 + .create = bitmap_create, 2714 + .resize = bitmap_resize, 2715 + .load = bitmap_load, 2716 + .destroy = bitmap_destroy, 2717 + .flush = bitmap_flush, 2718 + .write_all = bitmap_write_all, 2719 + .dirty_bits = bitmap_dirty_bits, 2720 + .unplug = bitmap_unplug, 2721 + .daemon_work = bitmap_daemon_work, 2722 + .wait_behind_writes = bitmap_wait_behind_writes, 2723 + 2724 + .startwrite = bitmap_startwrite, 2725 + .endwrite = bitmap_endwrite, 2726 + .start_sync = bitmap_start_sync, 2727 + .end_sync = bitmap_end_sync, 2728 + .cond_end_sync = bitmap_cond_end_sync, 2729 + .close_sync = bitmap_close_sync, 2730 + 2731 + .update_sb = bitmap_update_sb, 2732 + .get_stats = bitmap_get_stats, 2733 + 2734 + .sync_with_cluster = bitmap_sync_with_cluster, 2735 + .get_from_slot = bitmap_get_from_slot, 2736 + .copy_from_slot = bitmap_copy_from_slot, 2737 + .set_pages = bitmap_set_pages, 2738 + .free = md_bitmap_free, 2739 + }; 2740 + 2741 + void 
mddev_set_bitmap_ops(struct mddev *mddev) 2742 + { 2743 + mddev->bitmap_ops = &bitmap_ops; 2744 + }

+44 -216

drivers/md/md-bitmap.h

··· 7 7 #ifndef BITMAP_H 8 8 #define BITMAP_H 1 9 9 10 - #define BITMAP_MAJOR_LO 3 11 - /* version 4 insists the bitmap is in little-endian order 12 - * with version 3, it is host-endian which is non-portable 13 - * Version 5 is currently set only for clustered devices 14 - */ 15 - #define BITMAP_MAJOR_HI 4 16 - #define BITMAP_MAJOR_CLUSTERED 5 17 - #define BITMAP_MAJOR_HOSTENDIAN 3 18 - 19 - /* 20 - * in-memory bitmap: 21 - * 22 - * Use 16 bit block counters to track pending writes to each "chunk". 23 - * The 2 high order bits are special-purpose, the first is a flag indicating 24 - * whether a resync is needed. The second is a flag indicating whether a 25 - * resync is active. 26 - * This means that the counter is actually 14 bits: 27 - * 28 - * +--------+--------+------------------------------------------------+ 29 - * | resync | resync | counter | 30 - * | needed | active | | 31 - * | (0-1) | (0-1) | (0-16383) | 32 - * +--------+--------+------------------------------------------------+ 33 - * 34 - * The "resync needed" bit is set when: 35 - * a '1' bit is read from storage at startup. 36 - * a write request fails on some drives 37 - * a resync is aborted on a chunk with 'resync active' set 38 - * It is cleared (and resync-active set) when a resync starts across all drives 39 - * of the chunk. 40 - * 41 - * 42 - * The "resync active" bit is set when: 43 - * a resync is started on all drives, and resync_needed is set. 44 - * resync_needed will be cleared (as long as resync_active wasn't already set). 45 - * It is cleared when a resync completes. 46 - * 47 - * The counter counts pending write requests, plus the on-disk bit. 48 - * When the counter is '1' and the resync bits are clear, the on-disk 49 - * bit can be cleared as well, thus setting the counter to 0. 50 - * When we set a bit, or in the counter (to start a write), if the fields is 51 - * 0, we first set the disk bit and set the counter to 1. 
52 - * 53 - * If the counter is 0, the on-disk bit is clear and the stripe is clean 54 - * Anything that dirties the stripe pushes the counter to 2 (at least) 55 - * and sets the on-disk bit (lazily). 56 - * If a periodic sweep find the counter at 2, it is decremented to 1. 57 - * If the sweep find the counter at 1, the on-disk bit is cleared and the 58 - * counter goes to zero. 59 - * 60 - * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block 61 - * counters as a fallback when "page" memory cannot be allocated: 62 - * 63 - * Normal case (page memory allocated): 64 - * 65 - * page pointer (32-bit) 66 - * 67 - * [ ] ------+ 68 - * | 69 - * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) 70 - * c1 c2 c2048 71 - * 72 - * Hijacked case (page memory allocation failed): 73 - * 74 - * hijacked page pointer (32-bit) 75 - * 76 - * [ ][ ] (no page memory allocated) 77 - * counter #1 (16-bit) counter #2 (16-bit) 78 - * 79 - */ 80 - 81 - #ifdef __KERNEL__ 82 - 83 - #define PAGE_BITS (PAGE_SIZE << 3) 84 - #define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) 10 + #define BITMAP_MAGIC 0x6d746962 85 11 86 12 typedef __u16 bitmap_counter_t; 87 13 #define COUNTER_BITS 16 ··· 17 91 #define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) 18 92 #define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) 19 93 #define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) 20 - #define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) 21 - #define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) 22 - #define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) 23 - 24 - /* how many counters per page? 
*/ 25 - #define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) 26 - /* same, except a shift value for more efficient bitops */ 27 - #define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) 28 - /* same, except a mask value for more efficient bitops */ 29 - #define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) 30 - 31 - #define BITMAP_BLOCK_SHIFT 9 32 - 33 - #endif 34 - 35 - /* 36 - * bitmap structures: 37 - */ 38 - 39 - #define BITMAP_MAGIC 0x6d746962 40 94 41 95 /* use these for bitmap->flags and bitmap->sb->state bit-fields */ 42 96 enum bitmap_state { ··· 58 152 * devices. For raid10 it is the size of the array. 59 153 */ 60 154 61 - #ifdef __KERNEL__ 155 + struct md_bitmap_stats { 156 + u64 events_cleared; 157 + int behind_writes; 158 + bool behind_wait; 62 159 63 - /* the in-memory bitmap is represented by bitmap_pages */ 64 - struct bitmap_page { 65 - /* 66 - * map points to the actual memory page 67 - */ 68 - char *map; 69 - /* 70 - * in emergencies (when map cannot be alloced), hijack the map 71 - * pointer and use it as two counters itself 72 - */ 73 - unsigned int hijacked:1; 74 - /* 75 - * If any counter in this page is '1' or '2' - and so could be 76 - * cleared then that page is marked as 'pending' 77 - */ 78 - unsigned int pending:1; 79 - /* 80 - * count of dirty bits on the page 81 - */ 82 - unsigned int count:30; 160 + unsigned long missing_pages; 161 + unsigned long file_pages; 162 + unsigned long sync_size; 163 + unsigned long pages; 164 + struct file *file; 83 165 }; 84 166 85 - /* the main bitmap structure - one per mddev */ 86 - struct bitmap { 167 + struct bitmap_operations { 168 + bool (*enabled)(struct mddev *mddev); 169 + int (*create)(struct mddev *mddev, int slot); 170 + int (*resize)(struct mddev *mddev, sector_t blocks, int chunksize, 171 + bool init); 87 172 88 - struct bitmap_counts { 89 - spinlock_t lock; 90 - struct bitmap_page *bp; 91 - unsigned long pages; /* total number of pages 92 - * in the bitmap */ 93 - unsigned long 
missing_pages; /* number of pages 94 - * not yet allocated */ 95 - unsigned long chunkshift; /* chunksize = 2^chunkshift 96 - * (for bitops) */ 97 - unsigned long chunks; /* Total number of data 98 - * chunks for the array */ 99 - } counts; 173 + int (*load)(struct mddev *mddev); 174 + void (*destroy)(struct mddev *mddev); 175 + void (*flush)(struct mddev *mddev); 176 + void (*write_all)(struct mddev *mddev); 177 + void (*dirty_bits)(struct mddev *mddev, unsigned long s, 178 + unsigned long e); 179 + void (*unplug)(struct mddev *mddev, bool sync); 180 + void (*daemon_work)(struct mddev *mddev); 181 + void (*wait_behind_writes)(struct mddev *mddev); 100 182 101 - struct mddev *mddev; /* the md device that the bitmap is for */ 183 + int (*startwrite)(struct mddev *mddev, sector_t offset, 184 + unsigned long sectors, bool behind); 185 + void (*endwrite)(struct mddev *mddev, sector_t offset, 186 + unsigned long sectors, bool success, bool behind); 187 + bool (*start_sync)(struct mddev *mddev, sector_t offset, 188 + sector_t *blocks, bool degraded); 189 + void (*end_sync)(struct mddev *mddev, sector_t offset, sector_t *blocks); 190 + void (*cond_end_sync)(struct mddev *mddev, sector_t sector, bool force); 191 + void (*close_sync)(struct mddev *mddev); 102 192 103 - __u64 events_cleared; 104 - int need_sync; 193 + void (*update_sb)(void *data); 194 + int (*get_stats)(void *data, struct md_bitmap_stats *stats); 105 195 106 - struct bitmap_storage { 107 - struct file *file; /* backing disk file */ 108 - struct page *sb_page; /* cached copy of the bitmap 109 - * file superblock */ 110 - unsigned long sb_index; 111 - struct page **filemap; /* list of cache pages for 112 - * the file */ 113 - unsigned long *filemap_attr; /* attributes associated 114 - * w/ filemap pages */ 115 - unsigned long file_pages; /* number of pages in the file*/ 116 - unsigned long bytes; /* total bytes in the bitmap */ 117 - } storage; 118 - 119 - unsigned long flags; 120 - 121 - int allclean; 122 - 
123 - atomic_t behind_writes; 124 - unsigned long behind_writes_used; /* highest actual value at runtime */ 125 - 126 - /* 127 - * the bitmap daemon - periodically wakes up and sweeps the bitmap 128 - * file, cleaning up bits and flushing out pages to disk as necessary 129 - */ 130 - unsigned long daemon_lastrun; /* jiffies of last run */ 131 - unsigned long last_end_sync; /* when we lasted called end_sync to 132 - * update bitmap with resync progress */ 133 - 134 - atomic_t pending_writes; /* pending writes to the bitmap file */ 135 - wait_queue_head_t write_wait; 136 - wait_queue_head_t overflow_wait; 137 - wait_queue_head_t behind_wait; 138 - 139 - struct kernfs_node *sysfs_can_clear; 140 - int cluster_slot; /* Slot offset for clustered env */ 196 + void (*sync_with_cluster)(struct mddev *mddev, 197 + sector_t old_lo, sector_t old_hi, 198 + sector_t new_lo, sector_t new_hi); 199 + void *(*get_from_slot)(struct mddev *mddev, int slot); 200 + int (*copy_from_slot)(struct mddev *mddev, int slot, sector_t *lo, 201 + sector_t *hi, bool clear_bits); 202 + void (*set_pages)(void *data, unsigned long pages); 203 + void (*free)(void *data); 141 204 }; 142 205 143 206 /* the bitmap API */ 144 - 145 - /* these are used only by md/bitmap */ 146 - struct bitmap *md_bitmap_create(struct mddev *mddev, int slot); 147 - int md_bitmap_load(struct mddev *mddev); 148 - void md_bitmap_flush(struct mddev *mddev); 149 - void md_bitmap_destroy(struct mddev *mddev); 150 - 151 - void md_bitmap_print_sb(struct bitmap *bitmap); 152 - void md_bitmap_update_sb(struct bitmap *bitmap); 153 - void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap); 154 - 155 - int md_bitmap_setallbits(struct bitmap *bitmap); 156 - void md_bitmap_write_all(struct bitmap *bitmap); 157 - 158 - void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); 159 - 160 - /* these are exported */ 161 - int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, 162 - unsigned long 
sectors, int behind); 163 - void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset, 164 - unsigned long sectors, int success, int behind); 165 - int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); 166 - void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); 167 - void md_bitmap_close_sync(struct bitmap *bitmap); 168 - void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); 169 - void md_bitmap_sync_with_cluster(struct mddev *mddev, 170 - sector_t old_lo, sector_t old_hi, 171 - sector_t new_lo, sector_t new_hi); 172 - 173 - void md_bitmap_unplug(struct bitmap *bitmap); 174 - void md_bitmap_unplug_async(struct bitmap *bitmap); 175 - void md_bitmap_daemon_work(struct mddev *mddev); 176 - 177 - int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, 178 - int chunksize, int init); 179 - struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); 180 - int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, 181 - sector_t *lo, sector_t *hi, bool clear_bits); 182 - void md_bitmap_free(struct bitmap *bitmap); 183 - void md_bitmap_wait_behind_writes(struct mddev *mddev); 184 - 185 - static inline bool md_bitmap_enabled(struct bitmap *bitmap) 186 - { 187 - return bitmap && bitmap->storage.filemap && 188 - !test_bit(BITMAP_STALE, &bitmap->flags); 189 - } 190 - 191 - #endif 207 + void mddev_set_bitmap_ops(struct mddev *mddev); 192 208 193 209 #endif

+53 -38

drivers/md/md-cluster.c

··· 317 317 str, ret); 318 318 goto clear_bit; 319 319 } 320 - ret = md_bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); 320 + ret = mddev->bitmap_ops->copy_from_slot(mddev, slot, &lo, &hi, true); 321 321 if (ret) { 322 322 pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); 323 323 goto clear_bit; ··· 497 497 * we don't want to trigger lots of WARN. 498 498 */ 499 499 if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) 500 - md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, 501 - cinfo->sync_hi, lo, hi); 500 + mddev->bitmap_ops->sync_with_cluster(mddev, cinfo->sync_low, 501 + cinfo->sync_hi, lo, hi); 502 502 cinfo->sync_low = lo; 503 503 cinfo->sync_hi = hi; 504 504 ··· 628 628 break; 629 629 case BITMAP_RESIZE: 630 630 if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) 631 - ret = md_bitmap_resize(mddev->bitmap, 632 - le64_to_cpu(msg->high), 0, 0); 631 + ret = mddev->bitmap_ops->resize(mddev, 632 + le64_to_cpu(msg->high), 633 + 0, false); 633 634 break; 634 635 default: 635 636 ret = -1; ··· 857 856 } 858 857 859 858 /* Read the disk bitmap sb and check if it needs recovery */ 860 - ret = md_bitmap_copy_from_slot(mddev, i, &lo, &hi, false); 859 + ret = mddev->bitmap_ops->copy_from_slot(mddev, i, &lo, &hi, false); 861 860 if (ret) { 862 861 pr_warn("md-cluster: Could not gather bitmaps from slot %d", i); 863 862 lockres_free(bm_lockres); ··· 1144 1143 1145 1144 static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize) 1146 1145 { 1147 - struct bitmap_counts *counts; 1148 - char str[64]; 1149 - struct dlm_lock_resource *bm_lockres; 1150 - struct bitmap *bitmap = mddev->bitmap; 1151 - unsigned long my_pages = bitmap->counts.pages; 1146 + void *bitmap = mddev->bitmap; 1147 + struct md_bitmap_stats stats; 1148 + unsigned long my_pages; 1152 1149 int i, rv; 1153 1150 1151 + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); 1152 + if (rv) 1153 + return rv; 1154 + 1155 + my_pages = stats.pages; 
1154 1156 /* 1155 1157 * We need to ensure all the nodes can grow to a larger 1156 1158 * bitmap size before make the reshaping. ··· 1163 1159 return rv; 1164 1160 1165 1161 for (i = 0; i < mddev->bitmap_info.nodes; i++) { 1162 + struct dlm_lock_resource *bm_lockres; 1163 + char str[64]; 1164 + 1166 1165 if (i == md_cluster_ops->slot_number(mddev)) 1167 1166 continue; 1168 1167 1169 - bitmap = get_bitmap_from_slot(mddev, i); 1168 + bitmap = mddev->bitmap_ops->get_from_slot(mddev, i); 1170 1169 if (IS_ERR(bitmap)) { 1171 1170 pr_err("can't get bitmap from slot %d\n", i); 1172 1171 bitmap = NULL; 1173 1172 goto out; 1174 1173 } 1175 - counts = &bitmap->counts; 1176 1174 1175 + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); 1176 + if (rv) 1177 + goto out; 1177 1178 /* 1178 1179 * If we can hold the bitmap lock of one node then 1179 1180 * the slot is not occupied, update the pages. ··· 1192 1183 bm_lockres->flags |= DLM_LKF_NOQUEUE; 1193 1184 rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); 1194 1185 if (!rv) 1195 - counts->pages = my_pages; 1186 + mddev->bitmap_ops->set_pages(bitmap, my_pages); 1196 1187 lockres_free(bm_lockres); 1197 1188 1198 - if (my_pages != counts->pages) 1189 + if (my_pages != stats.pages) 1199 1190 /* 1200 1191 * Let's revert the bitmap size if one node 1201 1192 * can't resize bitmap 1202 1193 */ 1203 1194 goto out; 1204 - md_bitmap_free(bitmap); 1195 + mddev->bitmap_ops->free(bitmap); 1205 1196 } 1206 1197 1207 1198 return 0; 1208 1199 out: 1209 - md_bitmap_free(bitmap); 1200 + mddev->bitmap_ops->free(bitmap); 1210 1201 update_bitmap_size(mddev, oldsize); 1211 1202 return -1; 1212 1203 } ··· 1216 1207 */ 1217 1208 static int cluster_check_sync_size(struct mddev *mddev) 1218 1209 { 1219 - int i, rv; 1220 - bitmap_super_t *sb; 1221 - unsigned long my_sync_size, sync_size = 0; 1222 - int node_num = mddev->bitmap_info.nodes; 1223 1210 int current_slot = md_cluster_ops->slot_number(mddev); 1224 - struct bitmap *bitmap = mddev->bitmap; 1225 - char 
str[64]; 1211 + int node_num = mddev->bitmap_info.nodes; 1226 1212 struct dlm_lock_resource *bm_lockres; 1213 + struct md_bitmap_stats stats; 1214 + void *bitmap = mddev->bitmap; 1215 + unsigned long sync_size = 0; 1216 + unsigned long my_sync_size; 1217 + char str[64]; 1218 + int i, rv; 1227 1219 1228 - sb = kmap_atomic(bitmap->storage.sb_page); 1229 - my_sync_size = sb->sync_size; 1230 - kunmap_atomic(sb); 1220 + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); 1221 + if (rv) 1222 + return rv; 1223 + 1224 + my_sync_size = stats.sync_size; 1231 1225 1232 1226 for (i = 0; i < node_num; i++) { 1233 1227 if (i == current_slot) 1234 1228 continue; 1235 1229 1236 - bitmap = get_bitmap_from_slot(mddev, i); 1230 + bitmap = mddev->bitmap_ops->get_from_slot(mddev, i); 1237 1231 if (IS_ERR(bitmap)) { 1238 1232 pr_err("can't get bitmap from slot %d\n", i); 1239 1233 return -1; ··· 1250 1238 bm_lockres = lockres_init(mddev, str, NULL, 1); 1251 1239 if (!bm_lockres) { 1252 1240 pr_err("md-cluster: Cannot initialize %s\n", str); 1253 - md_bitmap_free(bitmap); 1241 + mddev->bitmap_ops->free(bitmap); 1254 1242 return -1; 1255 1243 } 1256 1244 bm_lockres->flags |= DLM_LKF_NOQUEUE; 1257 1245 rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); 1258 1246 if (!rv) 1259 - md_bitmap_update_sb(bitmap); 1247 + mddev->bitmap_ops->update_sb(bitmap); 1260 1248 lockres_free(bm_lockres); 1261 1249 1262 - sb = kmap_atomic(bitmap->storage.sb_page); 1263 - if (sync_size == 0) 1264 - sync_size = sb->sync_size; 1265 - else if (sync_size != sb->sync_size) { 1266 - kunmap_atomic(sb); 1267 - md_bitmap_free(bitmap); 1250 + rv = mddev->bitmap_ops->get_stats(bitmap, &stats); 1251 + if (rv) { 1252 + mddev->bitmap_ops->free(bitmap); 1253 + return rv; 1254 + } 1255 + 1256 + if (sync_size == 0) { 1257 + sync_size = stats.sync_size; 1258 + } else if (sync_size != stats.sync_size) { 1259 + mddev->bitmap_ops->free(bitmap); 1268 1260 return -1; 1269 1261 } 1270 - kunmap_atomic(sb); 1271 - md_bitmap_free(bitmap); 1262 
+ mddev->bitmap_ops->free(bitmap); 1272 1263 } 1273 1264 1274 1265 return (my_sync_size == sync_size) ? 0 : -1; ··· 1600 1585 for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) { 1601 1586 if (sn == (cinfo->slot_number - 1)) 1602 1587 continue; 1603 - err = md_bitmap_copy_from_slot(mddev, sn, &lo, &hi, false); 1588 + err = mddev->bitmap_ops->copy_from_slot(mddev, sn, &lo, &hi, false); 1604 1589 if (err) { 1605 1590 pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn); 1606 1591 goto out;

+155 -185

drivers/md/md.c

··· 546 546 return 0; 547 547 } 548 548 549 - /* 550 - * Generic flush handling for md 551 - */ 552 - 553 - static void md_end_flush(struct bio *bio) 554 - { 555 - struct md_rdev *rdev = bio->bi_private; 556 - struct mddev *mddev = rdev->mddev; 557 - 558 - bio_put(bio); 559 - 560 - rdev_dec_pending(rdev, mddev); 561 - 562 - if (atomic_dec_and_test(&mddev->flush_pending)) 563 - /* The pre-request flush has finished */ 564 - queue_work(md_wq, &mddev->flush_work); 565 - } 566 - 567 - static void md_submit_flush_data(struct work_struct *ws); 568 - 569 - static void submit_flushes(struct work_struct *ws) 570 - { 571 - struct mddev *mddev = container_of(ws, struct mddev, flush_work); 572 - struct md_rdev *rdev; 573 - 574 - mddev->start_flush = ktime_get_boottime(); 575 - INIT_WORK(&mddev->flush_work, md_submit_flush_data); 576 - atomic_set(&mddev->flush_pending, 1); 577 - rcu_read_lock(); 578 - rdev_for_each_rcu(rdev, mddev) 579 - if (rdev->raid_disk >= 0 && 580 - !test_bit(Faulty, &rdev->flags)) { 581 - struct bio *bi; 582 - 583 - atomic_inc(&rdev->nr_pending); 584 - rcu_read_unlock(); 585 - bi = bio_alloc_bioset(rdev->bdev, 0, 586 - REQ_OP_WRITE | REQ_PREFLUSH, 587 - GFP_NOIO, &mddev->bio_set); 588 - bi->bi_end_io = md_end_flush; 589 - bi->bi_private = rdev; 590 - atomic_inc(&mddev->flush_pending); 591 - submit_bio(bi); 592 - rcu_read_lock(); 593 - } 594 - rcu_read_unlock(); 595 - if (atomic_dec_and_test(&mddev->flush_pending)) 596 - queue_work(md_wq, &mddev->flush_work); 597 - } 598 - 599 - static void md_submit_flush_data(struct work_struct *ws) 600 - { 601 - struct mddev *mddev = container_of(ws, struct mddev, flush_work); 602 - struct bio *bio = mddev->flush_bio; 603 - 604 - /* 605 - * must reset flush_bio before calling into md_handle_request to avoid a 606 - * deadlock, because other bios passed md_handle_request suspend check 607 - * could wait for this and below md_handle_request could wait for those 608 - * bios because of suspend check 609 - */ 610 - 
spin_lock_irq(&mddev->lock); 611 - mddev->prev_flush_start = mddev->start_flush; 612 - mddev->flush_bio = NULL; 613 - spin_unlock_irq(&mddev->lock); 614 - wake_up(&mddev->sb_wait); 615 - 616 - if (bio->bi_iter.bi_size == 0) { 617 - /* an empty barrier - all done */ 618 - bio_endio(bio); 619 - } else { 620 - bio->bi_opf &= ~REQ_PREFLUSH; 621 - 622 - /* 623 - * make_requst() will never return error here, it only 624 - * returns error in raid5_make_request() by dm-raid. 625 - * Since dm always splits data and flush operation into 626 - * two separate io, io size of flush submitted by dm 627 - * always is 0, make_request() will not be called here. 628 - */ 629 - if (WARN_ON_ONCE(!mddev->pers->make_request(mddev, bio))) 630 - bio_io_error(bio); 631 - } 632 - 633 - /* The pair is percpu_ref_get() from md_flush_request() */ 634 - percpu_ref_put(&mddev->active_io); 635 - } 636 - 637 - /* 638 - * Manages consolidation of flushes and submitting any flushes needed for 639 - * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is 640 - * being finished in another context. Returns false if the flushing is 641 - * complete but still needs the I/O portion of the bio to be processed. 642 - */ 643 549 bool md_flush_request(struct mddev *mddev, struct bio *bio) 644 550 { 645 - ktime_t req_start = ktime_get_boottime(); 646 - spin_lock_irq(&mddev->lock); 647 - /* flush requests wait until ongoing flush completes, 648 - * hence coalescing all the pending requests. 551 + struct md_rdev *rdev; 552 + struct bio *new; 553 + 554 + /* 555 + * md_flush_reqeust() should be called under md_handle_request() and 556 + * 'active_io' is already grabbed. Hence it's safe to get rdev directly 557 + * without rcu protection. 
649 558 */ 650 - wait_event_lock_irq(mddev->sb_wait, 651 - !mddev->flush_bio || 652 - ktime_before(req_start, mddev->prev_flush_start), 653 - mddev->lock); 654 - /* new request after previous flush is completed */ 655 - if (ktime_after(req_start, mddev->prev_flush_start)) { 656 - WARN_ON(mddev->flush_bio); 657 - /* 658 - * Grab a reference to make sure mddev_suspend() will wait for 659 - * this flush to be done. 660 - * 661 - * md_flush_reqeust() is called under md_handle_request() and 662 - * 'active_io' is already grabbed, hence percpu_ref_is_zero() 663 - * won't pass, percpu_ref_tryget_live() can't be used because 664 - * percpu_ref_kill() can be called by mddev_suspend() 665 - * concurrently. 666 - */ 667 - WARN_ON(percpu_ref_is_zero(&mddev->active_io)); 668 - percpu_ref_get(&mddev->active_io); 669 - mddev->flush_bio = bio; 670 - spin_unlock_irq(&mddev->lock); 671 - INIT_WORK(&mddev->flush_work, submit_flushes); 672 - queue_work(md_wq, &mddev->flush_work); 673 - return true; 559 + WARN_ON(percpu_ref_is_zero(&mddev->active_io)); 560 + 561 + rdev_for_each(rdev, mddev) { 562 + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) 563 + continue; 564 + 565 + new = bio_alloc_bioset(rdev->bdev, 0, 566 + REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO, 567 + &mddev->bio_set); 568 + bio_chain(new, bio); 569 + submit_bio(new); 674 570 } 675 571 676 - /* flush was performed for some other bio while we waited. 
*/ 677 - spin_unlock_irq(&mddev->lock); 678 - if (bio->bi_iter.bi_size == 0) { 679 - /* pure flush without data - all done */ 572 + if (bio_sectors(bio) == 0) { 680 573 bio_endio(bio); 681 574 return true; 682 575 } ··· 656 763 atomic_set(&mddev->openers, 0); 657 764 atomic_set(&mddev->sync_seq, 0); 658 765 spin_lock_init(&mddev->lock); 659 - atomic_set(&mddev->flush_pending, 0); 660 766 init_waitqueue_head(&mddev->sb_wait); 661 767 init_waitqueue_head(&mddev->recovery_wait); 662 768 mddev->reshape_position = MaxSector; ··· 664 772 mddev->resync_min = 0; 665 773 mddev->resync_max = MaxSector; 666 774 mddev->level = LEVEL_NONE; 775 + mddev_set_bitmap_ops(mddev); 667 776 668 777 INIT_WORK(&mddev->sync_work, md_start_sync); 669 778 INIT_WORK(&mddev->del_work, mddev_delayed_delete); ··· 1265 1372 return ret; 1266 1373 } 1267 1374 1375 + static u64 md_bitmap_events_cleared(struct mddev *mddev) 1376 + { 1377 + struct md_bitmap_stats stats; 1378 + int err; 1379 + 1380 + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 1381 + if (err) 1382 + return 0; 1383 + 1384 + return stats.events_cleared; 1385 + } 1386 + 1268 1387 /* 1269 1388 * validate_super for 0.90.0 1270 1389 * note: we are not using "freshest" for 0.9 superblock ··· 1369 1464 /* if adding to array with a bitmap, then we can accept an 1370 1465 * older device ... but not too old. 1371 1466 */ 1372 - if (ev1 < mddev->bitmap->events_cleared) 1467 + if (ev1 < md_bitmap_events_cleared(mddev)) 1373 1468 return 0; 1374 1469 if (ev1 < mddev->events) 1375 1470 set_bit(Bitmap_sync, &rdev->flags); ··· 1896 1991 /* If adding to array with a bitmap, then we can accept an 1897 1992 * older device, but not too old. 
1898 1993 */ 1899 - if (ev1 < mddev->bitmap->events_cleared) 1994 + if (ev1 < md_bitmap_events_cleared(mddev)) 1900 1995 return 0; 1901 1996 if (ev1 < mddev->events) 1902 1997 set_bit(Bitmap_sync, &rdev->flags); ··· 2228 2323 unsigned long long new_offset) 2229 2324 { 2230 2325 /* All necessary checks on new >= old have been done */ 2231 - struct bitmap *bitmap; 2232 2326 if (new_offset >= rdev->data_offset) 2233 2327 return 1; 2234 2328 ··· 2244 2340 */ 2245 2341 if (rdev->sb_start + (32+4)*2 > new_offset) 2246 2342 return 0; 2247 - bitmap = rdev->mddev->bitmap; 2248 - if (bitmap && !rdev->mddev->bitmap_info.file && 2249 - rdev->sb_start + rdev->mddev->bitmap_info.offset + 2250 - bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 2251 - return 0; 2343 + 2344 + if (!rdev->mddev->bitmap_info.file) { 2345 + struct mddev *mddev = rdev->mddev; 2346 + struct md_bitmap_stats stats; 2347 + int err; 2348 + 2349 + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 2350 + if (!err && rdev->sb_start + mddev->bitmap_info.offset + 2351 + stats.file_pages * (PAGE_SIZE >> 9) > new_offset) 2352 + return 0; 2353 + } 2354 + 2252 2355 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 2253 2356 return 0; 2254 2357 ··· 2731 2820 2732 2821 mddev_add_trace_msg(mddev, "md md_update_sb"); 2733 2822 rewrite: 2734 - md_bitmap_update_sb(mddev->bitmap); 2823 + mddev->bitmap_ops->update_sb(mddev->bitmap); 2735 2824 rdev_for_each(rdev, mddev) { 2736 2825 if (rdev->sb_loaded != 1) 2737 2826 continue; /* no noise on spare devices */ ··· 4053 4142 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 4054 4143 4055 4144 static ssize_t 4145 + new_level_show(struct mddev *mddev, char *page) 4146 + { 4147 + return sprintf(page, "%d\n", mddev->new_level); 4148 + } 4149 + 4150 + static ssize_t 4151 + new_level_store(struct mddev *mddev, const char *buf, size_t len) 4152 + { 4153 + unsigned int n; 4154 + int err; 4155 + 4156 + err = kstrtouint(buf, 10, &n); 4157 + if (err 
< 0) 4158 + return err; 4159 + err = mddev_lock(mddev); 4160 + if (err) 4161 + return err; 4162 + 4163 + mddev->new_level = n; 4164 + md_update_sb(mddev, 1); 4165 + 4166 + mddev_unlock(mddev); 4167 + return len; 4168 + } 4169 + static struct md_sysfs_entry md_new_level = 4170 + __ATTR(new_level, 0664, new_level_show, new_level_store); 4171 + 4172 + static ssize_t 4056 4173 layout_show(struct mddev *mddev, char *page) 4057 4174 { 4058 4175 /* just a number, not meaningful for all levels */ ··· 4619 4680 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4620 4681 while (*buf) { 4621 4682 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4622 - if (buf == end) break; 4683 + if (buf == end) 4684 + break; 4685 + 4623 4686 if (*end == '-') { /* range */ 4624 4687 buf = end + 1; 4625 4688 end_chunk = simple_strtoul(buf, &end, 0); 4626 - if (buf == end) break; 4689 + if (buf == end) 4690 + break; 4627 4691 } 4628 - if (*end && !isspace(*end)) break; 4629 - md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4692 + 4693 + if (*end && !isspace(*end)) 4694 + break; 4695 + 4696 + mddev->bitmap_ops->dirty_bits(mddev, chunk, end_chunk); 4630 4697 buf = skip_spaces(end); 4631 4698 } 4632 - md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4699 + mddev->bitmap_ops->unplug(mddev, true); /* flush the bits to disk */ 4633 4700 out: 4634 4701 mddev_unlock(mddev); 4635 4702 return len; ··· 5611 5666 5612 5667 static struct attribute *md_default_attrs[] = { 5613 5668 &md_level.attr, 5669 + &md_new_level.attr, 5614 5670 &md_layout.attr, 5615 5671 &md_raid_disks.attr, 5616 5672 &md_uuid.attr, ··· 6152 6206 } 6153 6207 if (err == 0 && pers->sync_request && 6154 6208 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 6155 - struct bitmap *bitmap; 6156 - 6157 - bitmap = md_bitmap_create(mddev, -1); 6158 - if (IS_ERR(bitmap)) { 6159 - err = PTR_ERR(bitmap); 6209 + err = mddev->bitmap_ops->create(mddev, -1); 6210 + if (err) 6160 6211 pr_warn("%s: 
failed to create bitmap (%d)\n", 6161 6212 mdname(mddev), err); 6162 - } else 6163 - mddev->bitmap = bitmap; 6164 - 6165 6213 } 6166 6214 if (err) 6167 6215 goto bitmap_abort; ··· 6225 6285 pers->free(mddev, mddev->private); 6226 6286 mddev->private = NULL; 6227 6287 module_put(pers->owner); 6228 - md_bitmap_destroy(mddev); 6288 + mddev->bitmap_ops->destroy(mddev); 6229 6289 abort: 6230 6290 bioset_exit(&mddev->io_clone_set); 6231 6291 exit_sync_set: ··· 6244 6304 err = md_run(mddev); 6245 6305 if (err) 6246 6306 goto out; 6247 - err = md_bitmap_load(mddev); 6307 + 6308 + err = mddev->bitmap_ops->load(mddev); 6248 6309 if (err) { 6249 - md_bitmap_destroy(mddev); 6310 + mddev->bitmap_ops->destroy(mddev); 6250 6311 goto out; 6251 6312 } 6252 6313 ··· 6391 6450 mddev->pers->quiesce(mddev, 1); 6392 6451 mddev->pers->quiesce(mddev, 0); 6393 6452 } 6394 - md_bitmap_flush(mddev); 6453 + 6454 + mddev->bitmap_ops->flush(mddev); 6395 6455 6396 6456 if (md_is_rdwr(mddev) && 6397 6457 ((!mddev->in_sync && !mddev_is_clustered(mddev)) || ··· 6419 6477 6420 6478 static void mddev_detach(struct mddev *mddev) 6421 6479 { 6422 - md_bitmap_wait_behind_writes(mddev); 6480 + mddev->bitmap_ops->wait_behind_writes(mddev); 6423 6481 if (mddev->pers && mddev->pers->quiesce && !is_md_suspended(mddev)) { 6424 6482 mddev->pers->quiesce(mddev, 1); 6425 6483 mddev->pers->quiesce(mddev, 0); ··· 6434 6492 static void __md_stop(struct mddev *mddev) 6435 6493 { 6436 6494 struct md_personality *pers = mddev->pers; 6437 - md_bitmap_destroy(mddev); 6495 + 6496 + mddev->bitmap_ops->destroy(mddev); 6438 6497 mddev_detach(mddev); 6439 6498 spin_lock(&mddev->lock); 6440 6499 mddev->pers = NULL; ··· 7213 7270 err = 0; 7214 7271 if (mddev->pers) { 7215 7272 if (fd >= 0) { 7216 - struct bitmap *bitmap; 7273 + err = mddev->bitmap_ops->create(mddev, -1); 7274 + if (!err) 7275 + err = mddev->bitmap_ops->load(mddev); 7217 7276 7218 - bitmap = md_bitmap_create(mddev, -1); 7219 - if (!IS_ERR(bitmap)) { 7220 - 
mddev->bitmap = bitmap; 7221 - err = md_bitmap_load(mddev); 7222 - } else 7223 - err = PTR_ERR(bitmap); 7224 7277 if (err) { 7225 - md_bitmap_destroy(mddev); 7278 + mddev->bitmap_ops->destroy(mddev); 7226 7279 fd = -1; 7227 7280 } 7228 7281 } else if (fd < 0) { 7229 - md_bitmap_destroy(mddev); 7282 + mddev->bitmap_ops->destroy(mddev); 7230 7283 } 7231 7284 } 7285 + 7232 7286 if (fd < 0) { 7233 7287 struct file *f = mddev->bitmap_info.file; 7234 7288 if (f) { ··· 7494 7554 goto err; 7495 7555 } 7496 7556 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 7497 - struct bitmap *bitmap; 7498 7557 /* add the bitmap */ 7499 7558 if (mddev->bitmap) { 7500 7559 rv = -EEXIST; ··· 7507 7568 mddev->bitmap_info.default_offset; 7508 7569 mddev->bitmap_info.space = 7509 7570 mddev->bitmap_info.default_space; 7510 - bitmap = md_bitmap_create(mddev, -1); 7511 - if (!IS_ERR(bitmap)) { 7512 - mddev->bitmap = bitmap; 7513 - rv = md_bitmap_load(mddev); 7514 - } else 7515 - rv = PTR_ERR(bitmap); 7571 + rv = mddev->bitmap_ops->create(mddev, -1); 7572 + if (!rv) 7573 + rv = mddev->bitmap_ops->load(mddev); 7574 + 7516 7575 if (rv) 7517 - md_bitmap_destroy(mddev); 7576 + mddev->bitmap_ops->destroy(mddev); 7518 7577 } else { 7519 - /* remove the bitmap */ 7520 - if (!mddev->bitmap) { 7521 - rv = -ENOENT; 7578 + struct md_bitmap_stats stats; 7579 + 7580 + rv = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 7581 + if (rv) 7522 7582 goto err; 7523 - } 7524 - if (mddev->bitmap->storage.file) { 7583 + 7584 + if (stats.file) { 7525 7585 rv = -EINVAL; 7526 7586 goto err; 7527 7587 } 7588 + 7528 7589 if (mddev->bitmap_info.nodes) { 7529 7590 /* hold PW on all the bitmap lock */ 7530 7591 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) { ··· 7539 7600 module_put(md_cluster_mod); 7540 7601 mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; 7541 7602 } 7542 - md_bitmap_destroy(mddev); 7603 + mddev->bitmap_ops->destroy(mddev); 7543 7604 mddev->bitmap_info.offset = 0; 7544 7605 } 7545 7606 } ··· 8309 
8370 spin_unlock(&all_mddevs_lock); 8310 8371 } 8311 8372 8373 + static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) 8374 + { 8375 + struct md_bitmap_stats stats; 8376 + unsigned long used_pages; 8377 + unsigned long chunk_kb; 8378 + int err; 8379 + 8380 + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 8381 + if (err) 8382 + return; 8383 + 8384 + chunk_kb = mddev->bitmap_info.chunksize >> 10; 8385 + used_pages = stats.pages - stats.missing_pages; 8386 + 8387 + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", 8388 + used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), 8389 + chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, 8390 + chunk_kb ? "KB" : "B"); 8391 + 8392 + if (stats.file) { 8393 + seq_puts(seq, ", file: "); 8394 + seq_file_path(seq, stats.file, " \t\n"); 8395 + } 8396 + 8397 + seq_putc(seq, '\n'); 8398 + } 8399 + 8312 8400 static int md_seq_show(struct seq_file *seq, void *v) 8313 8401 { 8314 8402 struct mddev *mddev; ··· 8356 8390 spin_unlock(&all_mddevs_lock); 8357 8391 spin_lock(&mddev->lock); 8358 8392 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 8359 - seq_printf(seq, "%s : %sactive", mdname(mddev), 8360 - mddev->pers ? 
"" : "in"); 8393 + seq_printf(seq, "%s : ", mdname(mddev)); 8361 8394 if (mddev->pers) { 8395 + if (test_bit(MD_BROKEN, &mddev->flags)) 8396 + seq_printf(seq, "broken"); 8397 + else 8398 + seq_printf(seq, "active"); 8362 8399 if (mddev->ro == MD_RDONLY) 8363 8400 seq_printf(seq, " (read-only)"); 8364 8401 if (mddev->ro == MD_AUTO_READ) 8365 8402 seq_printf(seq, " (auto-read-only)"); 8366 8403 seq_printf(seq, " %s", mddev->pers->name); 8404 + } else { 8405 + seq_printf(seq, "inactive"); 8367 8406 } 8368 8407 8369 8408 sectors = 0; ··· 8424 8453 } else 8425 8454 seq_printf(seq, "\n "); 8426 8455 8427 - md_bitmap_status(seq, mddev->bitmap); 8456 + md_bitmap_status(seq, mddev); 8428 8457 8429 8458 seq_printf(seq, "\n"); 8430 8459 } ··· 8639 8668 BUG_ON(mddev->ro == MD_RDONLY); 8640 8669 if (mddev->ro == MD_AUTO_READ) { 8641 8670 /* need to switch to read/write */ 8642 - flush_work(&mddev->sync_work); 8643 8671 mddev->ro = MD_RDWR; 8644 8672 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 8645 8673 md_wakeup_thread(mddev->thread); ··· 9476 9506 * stored on all devices. So make sure all bitmap pages get written. 9477 9507 */ 9478 9508 if (spares) 9479 - md_bitmap_write_all(mddev->bitmap); 9509 + mddev->bitmap_ops->write_all(mddev); 9480 9510 9481 9511 name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ? 9482 9512 "reshape" : "resync"; ··· 9564 9594 void md_check_recovery(struct mddev *mddev) 9565 9595 { 9566 9596 if (mddev->bitmap) 9567 - md_bitmap_daemon_work(mddev); 9597 + mddev->bitmap_ops->daemon_work(mddev); 9568 9598 9569 9599 if (signal_pending(current)) { 9570 9600 if (mddev->pers->sync_request && !mddev->external) { ··· 9935 9965 if (ret) 9936 9966 pr_info("md-cluster: resize failed\n"); 9937 9967 else 9938 - md_bitmap_update_sb(mddev->bitmap); 9968 + mddev->bitmap_ops->update_sb(mddev->bitmap); 9939 9969 } 9940 9970 9941 9971 /* Check for change of roles in the active devices */

+2 -11

drivers/md/md.h

··· 535 535 struct percpu_ref writes_pending; 536 536 int sync_checkers; /* # of threads checking writes_pending */ 537 537 538 - struct bitmap *bitmap; /* the bitmap for the device */ 538 + void *bitmap; /* the bitmap for the device */ 539 + struct bitmap_operations *bitmap_ops; 539 540 struct { 540 541 struct file *file; /* the bitmap file */ 541 542 loff_t offset; /* offset from superblock of ··· 572 571 */ 573 572 struct bio_set io_clone_set; 574 573 575 - /* Generic flush handling. 576 - * The last to finish preflush schedules a worker to submit 577 - * the rest of the request (without the REQ_PREFLUSH flag). 578 - */ 579 - struct bio *flush_bio; 580 - atomic_t flush_pending; 581 - ktime_t start_flush, prev_flush_start; /* prev_flush_start is when the previous completed 582 - * flush was started. 583 - */ 584 - struct work_struct flush_work; 585 574 struct work_struct event_work; /* used by dm to report failure event */ 586 575 mempool_t *serial_info_pool; 587 576 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);

+3 -6

drivers/md/raid1-10.c

··· 140 140 * If bitmap is not enabled, it's safe to submit the io directly, and 141 141 * this can get optimal performance. 142 142 */ 143 - if (!md_bitmap_enabled(mddev->bitmap)) { 143 + if (!mddev->bitmap_ops->enabled(mddev)) { 144 144 raid1_submit_write(bio); 145 145 return true; 146 146 } ··· 166 166 * while current io submission must wait for bitmap io to be done. In order to 167 167 * avoid such deadlock, submit bitmap io asynchronously. 168 168 */ 169 - static inline void raid1_prepare_flush_writes(struct bitmap *bitmap) 169 + static inline void raid1_prepare_flush_writes(struct mddev *mddev) 170 170 { 171 - if (current->bio_list) 172 - md_bitmap_unplug_async(bitmap); 173 - else 174 - md_bitmap_unplug(bitmap); 171 + mddev->bitmap_ops->unplug(mddev, current->bio_list == NULL); 175 172 } 176 173 177 174 /*

+45 -54

drivers/md/raid1.c

··· 411 411 412 412 static void close_write(struct r1bio *r1_bio) 413 413 { 414 + struct mddev *mddev = r1_bio->mddev; 415 + 414 416 /* it really is the end of this request */ 415 417 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 416 418 bio_free_pages(r1_bio->behind_master_bio); 417 419 bio_put(r1_bio->behind_master_bio); 418 420 r1_bio->behind_master_bio = NULL; 419 421 } 422 + 420 423 /* clear the bitmap if all writes complete successfully */ 421 - md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 422 - r1_bio->sectors, 423 - !test_bit(R1BIO_Degraded, &r1_bio->state), 424 - test_bit(R1BIO_BehindIO, &r1_bio->state)); 425 - md_write_end(r1_bio->mddev); 424 + mddev->bitmap_ops->endwrite(mddev, r1_bio->sector, r1_bio->sectors, 425 + !test_bit(R1BIO_Degraded, &r1_bio->state), 426 + test_bit(R1BIO_BehindIO, &r1_bio->state)); 427 + md_write_end(mddev); 426 428 } 427 429 428 430 static void r1_bio_write_done(struct r1bio *r1_bio) ··· 902 900 static void flush_bio_list(struct r1conf *conf, struct bio *bio) 903 901 { 904 902 /* flush any pending bitmap writes to disk before proceeding w/ I/O */ 905 - raid1_prepare_flush_writes(conf->mddev->bitmap); 903 + raid1_prepare_flush_writes(conf->mddev); 906 904 wake_up_barrier(conf); 907 905 908 906 while (bio) { /* submit pending writes */ ··· 1319 1317 struct r1conf *conf = mddev->private; 1320 1318 struct raid1_info *mirror; 1321 1319 struct bio *read_bio; 1322 - struct bitmap *bitmap = mddev->bitmap; 1323 1320 const enum req_op op = bio_op(bio); 1324 1321 const blk_opf_t do_sync = bio->bi_opf & REQ_SYNC; 1325 1322 int max_sectors; 1326 1323 int rdisk; 1327 1324 bool r1bio_existed = !!r1_bio; 1328 - char b[BDEVNAME_SIZE]; 1329 1325 1330 1326 /* 1331 1327 * If r1_bio is set, we are blocking the raid1d thread ··· 1331 1331 * emergency memory if needed. 1332 1332 */ 1333 1333 gfp_t gfp = r1_bio ? 
(GFP_NOIO | __GFP_HIGH) : GFP_NOIO; 1334 - 1335 - if (r1bio_existed) { 1336 - /* Need to get the block device name carefully */ 1337 - struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; 1338 - 1339 - if (rdev) 1340 - snprintf(b, sizeof(b), "%pg", rdev->bdev); 1341 - else 1342 - strcpy(b, "???"); 1343 - } 1344 1334 1345 1335 /* 1346 1336 * Still need barrier for READ in case that whole ··· 1353 1363 * used and no empty request is available. 1354 1364 */ 1355 1365 rdisk = read_balance(conf, r1_bio, &max_sectors); 1356 - 1357 1366 if (rdisk < 0) { 1358 1367 /* couldn't find anywhere to read from */ 1359 - if (r1bio_existed) { 1360 - pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", 1368 + if (r1bio_existed) 1369 + pr_crit_ratelimited("md/raid1:%s: %pg: unrecoverable I/O read error for block %llu\n", 1361 1370 mdname(mddev), 1362 - b, 1363 - (unsigned long long)r1_bio->sector); 1364 - } 1371 + conf->mirrors[r1_bio->read_disk].rdev->bdev, 1372 + r1_bio->sector); 1365 1373 raid_end_bio_io(r1_bio); 1366 1374 return; 1367 1375 } ··· 1371 1383 (unsigned long long)r1_bio->sector, 1372 1384 mirror->rdev->bdev); 1373 1385 1374 - if (test_bit(WriteMostly, &mirror->rdev->flags) && 1375 - bitmap) { 1386 + if (test_bit(WriteMostly, &mirror->rdev->flags)) { 1376 1387 /* 1377 1388 * Reading from a write-mostly device must take care not to 1378 1389 * over-take any writes that are 'behind' 1379 1390 */ 1380 1391 mddev_add_trace_msg(mddev, "raid1 wait behind writes"); 1381 - wait_event(bitmap->behind_wait, 1382 - atomic_read(&bitmap->behind_writes) == 0); 1392 + mddev->bitmap_ops->wait_behind_writes(mddev); 1383 1393 } 1384 1394 1385 1395 if (max_sectors < bio_sectors(bio)) { ··· 1418 1432 struct r1conf *conf = mddev->private; 1419 1433 struct r1bio *r1_bio; 1420 1434 int i, disks; 1421 - struct bitmap *bitmap = mddev->bitmap; 1422 1435 unsigned long flags; 1423 1436 struct md_rdev *blocked_rdev; 1424 1437 int first_clone; ··· 1570 1585 * 
at a time and thus needs a new bio that can fit the whole payload 1571 1586 * this bio in page sized chunks. 1572 1587 */ 1573 - if (write_behind && bitmap) 1588 + if (write_behind && mddev->bitmap) 1574 1589 max_sectors = min_t(int, max_sectors, 1575 1590 BIO_MAX_VECS * (PAGE_SIZE >> 9)); 1576 1591 if (max_sectors < bio_sectors(bio)) { ··· 1597 1612 continue; 1598 1613 1599 1614 if (first_clone) { 1615 + unsigned long max_write_behind = 1616 + mddev->bitmap_info.max_write_behind; 1617 + struct md_bitmap_stats stats; 1618 + int err; 1619 + 1600 1620 /* do behind I/O ? 1601 1621 * Not if there are too many, or cannot 1602 1622 * allocate memory, or a reader on WriteMostly 1603 1623 * is waiting for behind writes to flush */ 1604 - if (bitmap && write_behind && 1605 - (atomic_read(&bitmap->behind_writes) 1606 - < mddev->bitmap_info.max_write_behind) && 1607 - !waitqueue_active(&bitmap->behind_wait)) { 1624 + err = mddev->bitmap_ops->get_stats(mddev->bitmap, &stats); 1625 + if (!err && write_behind && !stats.behind_wait && 1626 + stats.behind_writes < max_write_behind) 1608 1627 alloc_behind_master_bio(r1_bio, bio); 1609 - } 1610 1628 1611 - md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, 1612 - test_bit(R1BIO_BehindIO, &r1_bio->state)); 1629 + mddev->bitmap_ops->startwrite( 1630 + mddev, r1_bio->sector, r1_bio->sectors, 1631 + test_bit(R1BIO_BehindIO, &r1_bio->state)); 1613 1632 first_clone = 0; 1614 1633 } 1615 1634 ··· 2031 2042 2032 2043 /* make sure these bits don't get cleared. 
*/ 2033 2044 do { 2034 - md_bitmap_end_sync(mddev->bitmap, s, &sync_blocks, 1); 2045 + mddev->bitmap_ops->end_sync(mddev, s, &sync_blocks); 2035 2046 s += sync_blocks; 2036 2047 sectors_to_go -= sync_blocks; 2037 2048 } while (sectors_to_go > 0); ··· 2760 2771 int wonly = -1; 2761 2772 int write_targets = 0, read_targets = 0; 2762 2773 sector_t sync_blocks; 2763 - int still_degraded = 0; 2774 + bool still_degraded = false; 2764 2775 int good_sectors = RESYNC_SECTORS; 2765 2776 int min_bad = 0; /* number of sectors that are bad in all devices */ 2766 2777 int idx = sector_to_idx(sector_nr); ··· 2777 2788 * We can find the current addess in mddev->curr_resync 2778 2789 */ 2779 2790 if (mddev->curr_resync < max_sector) /* aborted */ 2780 - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2781 - &sync_blocks, 1); 2791 + mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, 2792 + &sync_blocks); 2782 2793 else /* completed sync */ 2783 2794 conf->fullsync = 0; 2784 2795 2785 - md_bitmap_close_sync(mddev->bitmap); 2796 + mddev->bitmap_ops->close_sync(mddev); 2786 2797 close_sync(conf); 2787 2798 2788 2799 if (mddev_is_clustered(mddev)) { ··· 2802 2813 /* before building a request, check if we can skip these blocks.. 
2803 2814 * This call the bitmap_start_sync doesn't actually record anything 2804 2815 */ 2805 - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 2816 + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, true) && 2806 2817 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 2807 2818 /* We can skip this block, and probably several more */ 2808 2819 *skipped = 1; ··· 2820 2831 * sector_nr + two times RESYNC_SECTORS 2821 2832 */ 2822 2833 2823 - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, 2824 - mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); 2825 - 2834 + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, 2835 + mddev_is_clustered(mddev) && 2836 + (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); 2826 2837 2827 2838 if (raise_barrier(conf, sector_nr)) 2828 2839 return 0; ··· 2853 2864 if (rdev == NULL || 2854 2865 test_bit(Faulty, &rdev->flags)) { 2855 2866 if (i < conf->raid_disks) 2856 - still_degraded = 1; 2867 + still_degraded = true; 2857 2868 } else if (!test_bit(In_sync, &rdev->flags)) { 2858 2869 bio->bi_opf = REQ_OP_WRITE; 2859 2870 bio->bi_end_io = end_sync_write; ··· 2977 2988 if (len == 0) 2978 2989 break; 2979 2990 if (sync_blocks == 0) { 2980 - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, 2981 - &sync_blocks, still_degraded) && 2991 + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, 2992 + &sync_blocks, still_degraded) && 2982 2993 !conf->fullsync && 2983 2994 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 2984 2995 break; ··· 3302 3313 * worth it. 
3303 3314 */ 3304 3315 sector_t newsize = raid1_size(mddev, sectors, 0); 3316 + int ret; 3317 + 3305 3318 if (mddev->external_size && 3306 3319 mddev->array_sectors > newsize) 3307 3320 return -EINVAL; 3308 - if (mddev->bitmap) { 3309 - int ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); 3310 - if (ret) 3311 - return ret; 3312 - } 3321 + 3322 + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); 3323 + if (ret) 3324 + return ret; 3325 + 3313 3326 md_set_array_sectors(mddev, newsize); 3314 3327 if (sectors > mddev->dev_sectors && 3315 3328 mddev->recovery_cp > mddev->dev_sectors) {

+41 -34

drivers/md/raid10.c

··· 426 426 427 427 static void close_write(struct r10bio *r10_bio) 428 428 { 429 + struct mddev *mddev = r10_bio->mddev; 430 + 429 431 /* clear the bitmap if all writes complete successfully */ 430 - md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 431 - r10_bio->sectors, 432 - !test_bit(R10BIO_Degraded, &r10_bio->state), 433 - 0); 434 - md_write_end(r10_bio->mddev); 432 + mddev->bitmap_ops->endwrite(mddev, r10_bio->sector, r10_bio->sectors, 433 + !test_bit(R10BIO_Degraded, &r10_bio->state), 434 + false); 435 + md_write_end(mddev); 435 436 } 436 437 437 438 static void one_write_done(struct r10bio *r10_bio) ··· 885 884 __set_current_state(TASK_RUNNING); 886 885 887 886 blk_start_plug(&plug); 888 - raid1_prepare_flush_writes(conf->mddev->bitmap); 887 + raid1_prepare_flush_writes(conf->mddev); 889 888 wake_up(&conf->wait_barrier); 890 889 891 890 while (bio) { /* submit pending writes */ ··· 1101 1100 1102 1101 /* we aren't scheduling, so we can do the write-out directly. */ 1103 1102 bio = bio_list_get(&plug->pending); 1104 - raid1_prepare_flush_writes(mddev->bitmap); 1103 + raid1_prepare_flush_writes(mddev); 1105 1104 wake_up_barrier(conf); 1106 1105 1107 1106 while (bio) { /* submit pending writes */ ··· 1493 1492 md_account_bio(mddev, &bio); 1494 1493 r10_bio->master_bio = bio; 1495 1494 atomic_set(&r10_bio->remaining, 1); 1496 - md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); 1495 + mddev->bitmap_ops->startwrite(mddev, r10_bio->sector, r10_bio->sectors, 1496 + false); 1497 1497 1498 1498 for (i = 0; i < conf->copies; i++) { 1499 1499 if (r10_bio->devs[i].bio) ··· 2467 2465 s = PAGE_SIZE >> 9; 2468 2466 2469 2467 rdev = conf->mirrors[dr].rdev; 2470 - addr = r10_bio->devs[0].addr + sect, 2468 + addr = r10_bio->devs[0].addr + sect; 2471 2469 ok = sync_page_io(rdev, 2472 2470 addr, 2473 2471 s << 9, ··· 3194 3192 3195 3193 if (mddev->curr_resync < max_sector) { /* aborted */ 3196 3194 if (test_bit(MD_RECOVERY_SYNC, 
&mddev->recovery)) 3197 - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 3198 - &sync_blocks, 1); 3195 + mddev->bitmap_ops->end_sync(mddev, 3196 + mddev->curr_resync, 3197 + &sync_blocks); 3199 3198 else for (i = 0; i < conf->geo.raid_disks; i++) { 3200 3199 sector_t sect = 3201 3200 raid10_find_virt(conf, mddev->curr_resync, i); 3202 - md_bitmap_end_sync(mddev->bitmap, sect, 3203 - &sync_blocks, 1); 3201 + 3202 + mddev->bitmap_ops->end_sync(mddev, sect, 3203 + &sync_blocks); 3204 3204 } 3205 3205 } else { 3206 3206 /* completed sync */ ··· 3222 3218 } 3223 3219 conf->fullsync = 0; 3224 3220 } 3225 - md_bitmap_close_sync(mddev->bitmap); 3221 + mddev->bitmap_ops->close_sync(mddev); 3226 3222 close_sync(conf); 3227 3223 *skipped = 1; 3228 3224 return sectors_skipped; ··· 3291 3287 r10_bio = NULL; 3292 3288 3293 3289 for (i = 0 ; i < conf->geo.raid_disks; i++) { 3294 - int still_degraded; 3290 + bool still_degraded; 3295 3291 struct r10bio *rb2; 3296 3292 sector_t sect; 3297 - int must_sync; 3293 + bool must_sync; 3298 3294 int any_working; 3299 3295 struct raid10_info *mirror = &conf->mirrors[i]; 3300 3296 struct md_rdev *mrdev, *mreplace; ··· 3311 3307 if (!mrdev && !mreplace) 3312 3308 continue; 3313 3309 3314 - still_degraded = 0; 3310 + still_degraded = false; 3315 3311 /* want to reconstruct this device */ 3316 3312 rb2 = r10_bio; 3317 3313 sect = raid10_find_virt(conf, sector_nr, i); ··· 3324 3320 * we only need to recover the block if it is set in 3325 3321 * the bitmap 3326 3322 */ 3327 - must_sync = md_bitmap_start_sync(mddev->bitmap, sect, 3328 - &sync_blocks, 1); 3323 + must_sync = mddev->bitmap_ops->start_sync(mddev, sect, 3324 + &sync_blocks, 3325 + true); 3329 3326 if (sync_blocks < max_sync) 3330 3327 max_sync = sync_blocks; 3331 3328 if (!must_sync && ··· 3364 3359 struct md_rdev *rdev = conf->mirrors[j].rdev; 3365 3360 3366 3361 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) { 3367 - still_degraded = 1; 3362 + still_degraded = true; 3368 
3363 break; 3369 3364 } 3370 3365 } 3371 3366 3372 - must_sync = md_bitmap_start_sync(mddev->bitmap, sect, 3373 - &sync_blocks, still_degraded); 3367 + must_sync = mddev->bitmap_ops->start_sync(mddev, sect, 3368 + &sync_blocks, still_degraded); 3374 3369 3375 3370 any_working = 0; 3376 3371 for (j=0; j<conf->copies;j++) { ··· 3543 3538 * safety reason, which ensures curr_resync_completed is 3544 3539 * updated in bitmap_cond_end_sync. 3545 3540 */ 3546 - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, 3541 + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, 3547 3542 mddev_is_clustered(mddev) && 3548 3543 (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); 3549 3544 3550 - if (!md_bitmap_start_sync(mddev->bitmap, sector_nr, 3551 - &sync_blocks, mddev->degraded) && 3545 + if (!mddev->bitmap_ops->start_sync(mddev, sector_nr, 3546 + &sync_blocks, 3547 + mddev->degraded) && 3552 3548 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, 3553 3549 &mddev->recovery)) { 3554 3550 /* We can skip this block */ ··· 4196 4190 */ 4197 4191 struct r10conf *conf = mddev->private; 4198 4192 sector_t oldsize, size; 4193 + int ret; 4199 4194 4200 4195 if (mddev->reshape_position != MaxSector) 4201 4196 return -EBUSY; ··· 4209 4202 if (mddev->external_size && 4210 4203 mddev->array_sectors > size) 4211 4204 return -EINVAL; 4212 - if (mddev->bitmap) { 4213 - int ret = md_bitmap_resize(mddev->bitmap, size, 0, 0); 4214 - if (ret) 4215 - return ret; 4216 - } 4205 + 4206 + ret = mddev->bitmap_ops->resize(mddev, size, 0, false); 4207 + if (ret) 4208 + return ret; 4209 + 4217 4210 md_set_array_sectors(mddev, size); 4218 4211 if (sectors > mddev->dev_sectors && 4219 4212 mddev->recovery_cp > oldsize) { ··· 4479 4472 newsize = raid10_size(mddev, 0, conf->geo.raid_disks); 4480 4473 4481 4474 if (!mddev_is_clustered(mddev)) { 4482 - ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); 4475 + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); 4483 4476 if (ret) 4484 4477 goto 
abort; 4485 4478 else ··· 4494 4487 4495 4488 /* 4496 4489 * some node is already performing reshape, and no need to 4497 - * call md_bitmap_resize again since it should be called when 4490 + * call bitmap_ops->resize again since it should be called when 4498 4491 * receiving BITMAP_RESIZE msg 4499 4492 */ 4500 4493 if ((sb && (le32_to_cpu(sb->feature_map) & 4501 4494 MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) 4502 4495 goto out; 4503 4496 4504 - ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); 4497 + ret = mddev->bitmap_ops->resize(mddev, newsize, 0, false); 4505 4498 if (ret) 4506 4499 goto abort; 4507 4500 4508 4501 ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); 4509 4502 if (ret) { 4510 - md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); 4503 + mddev->bitmap_ops->resize(mddev, oldsize, 0, false); 4511 4504 goto abort; 4512 4505 } 4513 4506 }

+5 -9

drivers/md/raid5-cache.c

··· 313 313 if (sh->dev[i].written) { 314 314 set_bit(R5_UPTODATE, &sh->dev[i].flags); 315 315 r5c_return_dev_pending_writes(conf, &sh->dev[i]); 316 - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 317 - RAID5_STRIPE_SECTORS(conf), 318 - !test_bit(STRIPE_DEGRADED, &sh->state), 319 - 0); 316 + conf->mddev->bitmap_ops->endwrite(conf->mddev, 317 + sh->sector, RAID5_STRIPE_SECTORS(conf), 318 + !test_bit(STRIPE_DEGRADED, &sh->state), 319 + false); 320 320 } 321 321 } 322 322 } ··· 2798 2798 { 2799 2799 struct r5l_log *log = READ_ONCE(conf->log); 2800 2800 int i; 2801 - int do_wakeup = 0; 2802 2801 sector_t tree_index; 2803 2802 void __rcu **pslot; 2804 2803 uintptr_t refcount; ··· 2814 2815 for (i = sh->disks; i--; ) { 2815 2816 clear_bit(R5_InJournal, &sh->dev[i].flags); 2816 2817 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2817 - do_wakeup = 1; 2818 + wake_up_bit(&sh->dev[i].flags, R5_Overlap); 2818 2819 } 2819 2820 2820 2821 /* ··· 2826 2827 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2827 2828 if (atomic_dec_and_test(&conf->pending_full_writes)) 2828 2829 md_wakeup_thread(conf->mddev->thread); 2829 - 2830 - if (do_wakeup) 2831 - wake_up(&conf->wait_for_overlap); 2832 2830 2833 2831 spin_lock_irq(&log->stripe_in_journal_lock); 2834 2832 list_del_init(&sh->r5c);

+82 -75

drivers/md/raid5.c

··· 2337 2337 for (i = disks; i--; ) { 2338 2338 struct r5dev *dev = &sh->dev[i]; 2339 2339 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 2340 - wake_up(&sh->raid_conf->wait_for_overlap); 2340 + wake_up_bit(&dev->flags, R5_Overlap); 2341 2341 } 2342 2342 } 2343 2343 local_unlock(&conf->percpu->lock); ··· 3473 3473 * With PPL only writes to consecutive data chunks within a 3474 3474 * stripe are allowed because for a single stripe_head we can 3475 3475 * only have one PPL entry at a time, which describes one data 3476 - * range. Not really an overlap, but wait_for_overlap can be 3476 + * range. Not really an overlap, but R5_Overlap can be 3477 3477 * used to handle this. 3478 3478 */ 3479 3479 sector_t sector; ··· 3563 3563 */ 3564 3564 set_bit(STRIPE_BITMAP_PENDING, &sh->state); 3565 3565 spin_unlock_irq(&sh->stripe_lock); 3566 - md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, 3567 - RAID5_STRIPE_SECTORS(conf), 0); 3566 + conf->mddev->bitmap_ops->startwrite(conf->mddev, sh->sector, 3567 + RAID5_STRIPE_SECTORS(conf), false); 3568 3568 spin_lock_irq(&sh->stripe_lock); 3569 3569 clear_bit(STRIPE_BITMAP_PENDING, &sh->state); 3570 3570 if (!sh->batch_head) { ··· 3652 3652 log_stripe_write_finished(sh); 3653 3653 3654 3654 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3655 - wake_up(&conf->wait_for_overlap); 3655 + wake_up_bit(&sh->dev[i].flags, R5_Overlap); 3656 3656 3657 3657 while (bi && bi->bi_iter.bi_sector < 3658 3658 sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { ··· 3663 3663 bi = nextbi; 3664 3664 } 3665 3665 if (bitmap_end) 3666 - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3667 - RAID5_STRIPE_SECTORS(conf), 0, 0); 3666 + conf->mddev->bitmap_ops->endwrite(conf->mddev, 3667 + sh->sector, RAID5_STRIPE_SECTORS(conf), 3668 + false, false); 3668 3669 bitmap_end = 0; 3669 3670 /* and fail all 'written' */ 3670 3671 bi = sh->dev[i].written; ··· 3697 3696 sh->dev[i].toread = NULL; 3698 3697 spin_unlock_irq(&sh->stripe_lock); 3699 3698 if 
(test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 3700 - wake_up(&conf->wait_for_overlap); 3699 + wake_up_bit(&sh->dev[i].flags, R5_Overlap); 3701 3700 if (bi) 3702 3701 s->to_read--; 3703 3702 while (bi && bi->bi_iter.bi_sector < ··· 3710 3709 } 3711 3710 } 3712 3711 if (bitmap_end) 3713 - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 3714 - RAID5_STRIPE_SECTORS(conf), 0, 0); 3712 + conf->mddev->bitmap_ops->endwrite(conf->mddev, 3713 + sh->sector, RAID5_STRIPE_SECTORS(conf), 3714 + false, false); 3715 3715 /* If we were in the middle of a write the parity block might 3716 3716 * still be locked - so just clear all R5_LOCKED flags 3717 3717 */ ··· 3736 3734 BUG_ON(sh->batch_head); 3737 3735 clear_bit(STRIPE_SYNCING, &sh->state); 3738 3736 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 3739 - wake_up(&conf->wait_for_overlap); 3737 + wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); 3740 3738 s->syncing = 0; 3741 3739 s->replacing = 0; 3742 3740 /* There is nothing more to do for sync/check/repair. 
··· 4061 4059 bio_endio(wbi); 4062 4060 wbi = wbi2; 4063 4061 } 4064 - md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, 4065 - RAID5_STRIPE_SECTORS(conf), 4066 - !test_bit(STRIPE_DEGRADED, &sh->state), 4067 - 0); 4062 + conf->mddev->bitmap_ops->endwrite(conf->mddev, 4063 + sh->sector, RAID5_STRIPE_SECTORS(conf), 4064 + !test_bit(STRIPE_DEGRADED, &sh->state), 4065 + false); 4068 4066 if (head_sh->batch_head) { 4069 4067 sh = list_first_entry(&sh->batch_list, 4070 4068 struct stripe_head, ··· 4877 4875 { 4878 4876 struct stripe_head *sh, *next; 4879 4877 int i; 4880 - int do_wakeup = 0; 4881 4878 4882 4879 list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) { 4883 4880 ··· 4912 4911 spin_unlock_irq(&sh->stripe_lock); 4913 4912 for (i = 0; i < sh->disks; i++) { 4914 4913 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 4915 - do_wakeup = 1; 4914 + wake_up_bit(&sh->dev[i].flags, R5_Overlap); 4916 4915 sh->dev[i].flags = head_sh->dev[i].flags & 4917 4916 (~((1 << R5_WriteError) | (1 << R5_Overlap))); 4918 4917 } ··· 4926 4925 spin_unlock_irq(&head_sh->stripe_lock); 4927 4926 for (i = 0; i < head_sh->disks; i++) 4928 4927 if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags)) 4929 - do_wakeup = 1; 4928 + wake_up_bit(&head_sh->dev[i].flags, R5_Overlap); 4930 4929 if (head_sh->state & handle_flags) 4931 4930 set_bit(STRIPE_HANDLE, &head_sh->state); 4932 - 4933 - if (do_wakeup) 4934 - wake_up(&head_sh->raid_conf->wait_for_overlap); 4935 4931 } 4936 4932 4937 4933 static void handle_stripe(struct stripe_head *sh) ··· 5194 5196 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); 5195 5197 clear_bit(STRIPE_SYNCING, &sh->state); 5196 5198 if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) 5197 - wake_up(&conf->wait_for_overlap); 5199 + wake_up_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap); 5198 5200 } 5199 5201 5200 5202 /* If the failed drives are just a ReadError, then we might need ··· 5257 5259 } else if (s.expanded && 
!sh->reconstruct_state && s.locked == 0) { 5258 5260 clear_bit(STRIPE_EXPAND_READY, &sh->state); 5259 5261 atomic_dec(&conf->reshape_stripes); 5260 - wake_up(&conf->wait_for_overlap); 5262 + wake_up(&conf->wait_for_reshape); 5261 5263 md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); 5262 5264 } 5263 5265 ··· 5751 5753 int d; 5752 5754 again: 5753 5755 sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0); 5754 - prepare_to_wait(&conf->wait_for_overlap, &w, 5755 - TASK_UNINTERRUPTIBLE); 5756 5756 set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); 5757 5757 if (test_bit(STRIPE_SYNCING, &sh->state)) { 5758 5758 raid5_release_stripe(sh); 5759 - schedule(); 5759 + wait_on_bit(&sh->dev[sh->pd_idx].flags, R5_Overlap, 5760 + TASK_UNINTERRUPTIBLE); 5760 5761 goto again; 5761 5762 } 5762 5763 clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); ··· 5767 5770 set_bit(R5_Overlap, &sh->dev[d].flags); 5768 5771 spin_unlock_irq(&sh->stripe_lock); 5769 5772 raid5_release_stripe(sh); 5770 - schedule(); 5773 + wait_on_bit(&sh->dev[d].flags, R5_Overlap, 5774 + TASK_UNINTERRUPTIBLE); 5771 5775 goto again; 5772 5776 } 5773 5777 } 5774 5778 set_bit(STRIPE_DISCARD, &sh->state); 5775 - finish_wait(&conf->wait_for_overlap, &w); 5776 5779 sh->overwrite_disks = 0; 5777 5780 for (d = 0; d < conf->raid_disks; d++) { 5778 5781 if (d == sh->pd_idx || d == sh->qd_idx) ··· 5785 5788 } 5786 5789 spin_unlock_irq(&sh->stripe_lock); 5787 5790 if (conf->mddev->bitmap) { 5788 - for (d = 0; 5789 - d < conf->raid_disks - conf->max_degraded; 5791 + for (d = 0; d < conf->raid_disks - conf->max_degraded; 5790 5792 d++) 5791 - md_bitmap_startwrite(mddev->bitmap, 5792 - sh->sector, 5793 - RAID5_STRIPE_SECTORS(conf), 5794 - 0); 5793 + mddev->bitmap_ops->startwrite(mddev, sh->sector, 5794 + RAID5_STRIPE_SECTORS(conf), false); 5795 5795 sh->bm_seq = conf->seq_flush + 1; 5796 5796 set_bit(STRIPE_BIT_DELAY, &sh->state); 5797 5797 } ··· 5849 5855 struct bio *bi, int forwrite, int previous) 5850 5856 { 
5851 5857 int dd_idx; 5852 - int ret = 1; 5853 5858 5854 5859 spin_lock_irq(&sh->stripe_lock); 5855 5860 ··· 5864 5871 5865 5872 if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { 5866 5873 set_bit(R5_Overlap, &dev->flags); 5867 - ret = 0; 5868 - continue; 5874 + spin_unlock_irq(&sh->stripe_lock); 5875 + raid5_release_stripe(sh); 5876 + /* release batch_last before wait to avoid risk of deadlock */ 5877 + if (ctx->batch_last) { 5878 + raid5_release_stripe(ctx->batch_last); 5879 + ctx->batch_last = NULL; 5880 + } 5881 + md_wakeup_thread(conf->mddev->thread); 5882 + wait_on_bit(&dev->flags, R5_Overlap, TASK_UNINTERRUPTIBLE); 5883 + return 0; 5869 5884 } 5870 5885 } 5871 - 5872 - if (!ret) 5873 - goto out; 5874 5886 5875 5887 for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { 5876 5888 struct r5dev *dev = &sh->dev[dd_idx]; ··· 5892 5894 RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do); 5893 5895 } 5894 5896 5895 - out: 5896 5897 spin_unlock_irq(&sh->stripe_lock); 5897 - return ret; 5898 + return 1; 5898 5899 } 5899 5900 5900 5901 enum reshape_loc { ··· 5989 5992 goto out_release; 5990 5993 } 5991 5994 5992 - if (test_bit(STRIPE_EXPANDING, &sh->state) || 5993 - !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { 5994 - /* 5995 - * Stripe is busy expanding or add failed due to 5996 - * overlap. Flush everything and wait a while. 
5997 - */ 5995 + if (test_bit(STRIPE_EXPANDING, &sh->state)) { 5998 5996 md_wakeup_thread(mddev->thread); 5999 5997 ret = STRIPE_SCHEDULE_AND_RETRY; 6000 5998 goto out_release; 5999 + } 6000 + 6001 + if (!add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { 6002 + ret = STRIPE_RETRY; 6003 + goto out; 6001 6004 } 6002 6005 6003 6006 if (stripe_can_batch(sh)) { ··· 6070 6073 static bool raid5_make_request(struct mddev *mddev, struct bio * bi) 6071 6074 { 6072 6075 DEFINE_WAIT_FUNC(wait, woken_wake_function); 6076 + bool on_wq; 6073 6077 struct r5conf *conf = mddev->private; 6074 6078 sector_t logical_sector; 6075 6079 struct stripe_request_ctx ctx = {}; ··· 6144 6146 * sequential IO pattern. We don't bother with the optimization when 6145 6147 * reshaping as the performance benefit is not worth the complexity. 6146 6148 */ 6147 - if (likely(conf->reshape_progress == MaxSector)) 6149 + if (likely(conf->reshape_progress == MaxSector)) { 6148 6150 logical_sector = raid5_bio_lowest_chunk_sector(conf, bi); 6151 + on_wq = false; 6152 + } else { 6153 + add_wait_queue(&conf->wait_for_reshape, &wait); 6154 + on_wq = true; 6155 + } 6149 6156 s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf); 6150 6157 6151 - add_wait_queue(&conf->wait_for_overlap, &wait); 6152 6158 while (1) { 6153 6159 res = make_stripe_request(mddev, conf, &ctx, logical_sector, 6154 6160 bi); ··· 6163 6161 continue; 6164 6162 6165 6163 if (res == STRIPE_SCHEDULE_AND_RETRY) { 6164 + WARN_ON_ONCE(!on_wq); 6166 6165 /* 6167 6166 * Must release the reference to batch_last before 6168 6167 * scheduling and waiting for work to be done, ··· 6188 6185 logical_sector = ctx.first_sector + 6189 6186 (s << RAID5_STRIPE_SHIFT(conf)); 6190 6187 } 6191 - remove_wait_queue(&conf->wait_for_overlap, &wait); 6188 + if (unlikely(on_wq)) 6189 + remove_wait_queue(&conf->wait_for_reshape, &wait); 6192 6190 6193 6191 if (ctx.batch_last) 6194 6192 raid5_release_stripe(ctx.batch_last); ··· 6342 6338 : (safepos < 
writepos && readpos > writepos)) || 6343 6339 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 6344 6340 /* Cannot proceed until we've updated the superblock... */ 6345 - wait_event(conf->wait_for_overlap, 6341 + wait_event(conf->wait_for_reshape, 6346 6342 atomic_read(&conf->reshape_stripes)==0 6347 6343 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6348 6344 if (atomic_read(&conf->reshape_stripes) != 0) ··· 6368 6364 spin_lock_irq(&conf->device_lock); 6369 6365 conf->reshape_safe = mddev->reshape_position; 6370 6366 spin_unlock_irq(&conf->device_lock); 6371 - wake_up(&conf->wait_for_overlap); 6367 + wake_up(&conf->wait_for_reshape); 6372 6368 sysfs_notify_dirent_safe(mddev->sysfs_completed); 6373 6369 } 6374 6370 ··· 6451 6447 (sector_nr - mddev->curr_resync_completed) * 2 6452 6448 >= mddev->resync_max - mddev->curr_resync_completed) { 6453 6449 /* Cannot proceed until we've updated the superblock... */ 6454 - wait_event(conf->wait_for_overlap, 6450 + wait_event(conf->wait_for_reshape, 6455 6451 atomic_read(&conf->reshape_stripes) == 0 6456 6452 || test_bit(MD_RECOVERY_INTR, &mddev->recovery)); 6457 6453 if (atomic_read(&conf->reshape_stripes) != 0) ··· 6477 6473 spin_lock_irq(&conf->device_lock); 6478 6474 conf->reshape_safe = mddev->reshape_position; 6479 6475 spin_unlock_irq(&conf->device_lock); 6480 - wake_up(&conf->wait_for_overlap); 6476 + wake_up(&conf->wait_for_reshape); 6481 6477 sysfs_notify_dirent_safe(mddev->sysfs_completed); 6482 6478 } 6483 6479 ret: ··· 6490 6486 struct r5conf *conf = mddev->private; 6491 6487 struct stripe_head *sh; 6492 6488 sector_t sync_blocks; 6493 - int still_degraded = 0; 6489 + bool still_degraded = false; 6494 6490 int i; 6495 6491 6496 6492 if (sector_nr >= max_sector) { ··· 6502 6498 } 6503 6499 6504 6500 if (mddev->curr_resync < max_sector) /* aborted */ 6505 - md_bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 6506 - &sync_blocks, 1); 6501 + mddev->bitmap_ops->end_sync(mddev, mddev->curr_resync, 6502 + 
&sync_blocks); 6507 6503 else /* completed sync */ 6508 6504 conf->fullsync = 0; 6509 - md_bitmap_close_sync(mddev->bitmap); 6505 + mddev->bitmap_ops->close_sync(mddev); 6510 6506 6511 6507 return 0; 6512 6508 } 6513 6509 6514 6510 /* Allow raid5_quiesce to complete */ 6515 - wait_event(conf->wait_for_overlap, conf->quiesce != 2); 6511 + wait_event(conf->wait_for_reshape, conf->quiesce != 2); 6516 6512 6517 6513 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6518 6514 return reshape_request(mddev, sector_nr, skipped); ··· 6535 6531 } 6536 6532 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 6537 6533 !conf->fullsync && 6538 - !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 6534 + !mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, 6535 + true) && 6539 6536 sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { 6540 6537 /* we can skip this block, and probably more */ 6541 6538 do_div(sync_blocks, RAID5_STRIPE_SECTORS(conf)); ··· 6545 6540 return sync_blocks * RAID5_STRIPE_SECTORS(conf); 6546 6541 } 6547 6542 6548 - md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); 6543 + mddev->bitmap_ops->cond_end_sync(mddev, sector_nr, false); 6549 6544 6550 6545 sh = raid5_get_active_stripe(conf, NULL, sector_nr, 6551 6546 R5_GAS_NOBLOCK); ··· 6564 6559 struct md_rdev *rdev = conf->disks[i].rdev; 6565 6560 6566 6561 if (rdev == NULL || test_bit(Faulty, &rdev->flags)) 6567 - still_degraded = 1; 6562 + still_degraded = true; 6568 6563 } 6569 6564 6570 - md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 6565 + mddev->bitmap_ops->start_sync(mddev, sector_nr, &sync_blocks, 6566 + still_degraded); 6571 6567 6572 6568 set_bit(STRIPE_SYNC_REQUESTED, &sh->state); 6573 6569 set_bit(STRIPE_HANDLE, &sh->state); ··· 6773 6767 /* Now is a good time to flush some bitmap updates */ 6774 6768 conf->seq_flush++; 6775 6769 spin_unlock_irq(&conf->device_lock); 6776 - md_bitmap_unplug(mddev->bitmap); 6770 + 
mddev->bitmap_ops->unplug(mddev, true); 6777 6771 spin_lock_irq(&conf->device_lock); 6778 6772 conf->seq_write = conf->seq_flush; 6779 6773 activate_bit_delay(conf, conf->temp_inactive_list); ··· 7498 7492 7499 7493 init_waitqueue_head(&conf->wait_for_quiescent); 7500 7494 init_waitqueue_head(&conf->wait_for_stripe); 7501 - init_waitqueue_head(&conf->wait_for_overlap); 7495 + init_waitqueue_head(&conf->wait_for_reshape); 7502 7496 INIT_LIST_HEAD(&conf->handle_list); 7503 7497 INIT_LIST_HEAD(&conf->loprio_list); 7504 7498 INIT_LIST_HEAD(&conf->hold_list); ··· 8318 8312 */ 8319 8313 sector_t newsize; 8320 8314 struct r5conf *conf = mddev->private; 8315 + int ret; 8321 8316 8322 8317 if (raid5_has_log(conf) || raid5_has_ppl(conf)) 8323 8318 return -EINVAL; ··· 8327 8320 if (mddev->external_size && 8328 8321 mddev->array_sectors > newsize) 8329 8322 return -EINVAL; 8330 - if (mddev->bitmap) { 8331 - int ret = md_bitmap_resize(mddev->bitmap, sectors, 0, 0); 8332 - if (ret) 8333 - return ret; 8334 - } 8323 + 8324 + ret = mddev->bitmap_ops->resize(mddev, sectors, 0, false); 8325 + if (ret) 8326 + return ret; 8327 + 8335 8328 md_set_array_sectors(mddev, newsize); 8336 8329 if (sectors > mddev->dev_sectors && 8337 8330 mddev->recovery_cp > mddev->dev_sectors) { ··· 8557 8550 !test_bit(In_sync, &rdev->flags)) 8558 8551 rdev->recovery_offset = MaxSector; 8559 8552 spin_unlock_irq(&conf->device_lock); 8560 - wake_up(&conf->wait_for_overlap); 8553 + wake_up(&conf->wait_for_reshape); 8561 8554 8562 8555 mddev_update_io_opt(conf->mddev, 8563 8556 conf->raid_disks - conf->max_degraded); ··· 8621 8614 conf->quiesce = 1; 8622 8615 unlock_all_device_hash_locks_irq(conf); 8623 8616 /* allow reshape to continue */ 8624 - wake_up(&conf->wait_for_overlap); 8617 + wake_up(&conf->wait_for_reshape); 8625 8618 } else { 8626 8619 /* re-enable writes */ 8627 8620 lock_all_device_hash_locks_irq(conf); 8628 8621 conf->quiesce = 0; 8629 8622 wake_up(&conf->wait_for_quiescent); 8630 - 
wake_up(&conf->wait_for_overlap); 8623 + wake_up(&conf->wait_for_reshape); 8631 8624 unlock_all_device_hash_locks_irq(conf); 8632 8625 } 8633 8626 log_quiesce(conf, quiesce); ··· 8946 8939 { 8947 8940 struct r5conf *conf = mddev->private; 8948 8941 8949 - wake_up(&conf->wait_for_overlap); 8942 + wake_up(&conf->wait_for_reshape); 8950 8943 } 8951 8944 8952 8945 static struct md_personality raid6_personality =

+1 -1

drivers/md/raid5.h

··· 668 668 struct llist_head released_stripes; 669 669 wait_queue_head_t wait_for_quiescent; 670 670 wait_queue_head_t wait_for_stripe; 671 - wait_queue_head_t wait_for_overlap; 671 + wait_queue_head_t wait_for_reshape; 672 672 unsigned long cache_state; 673 673 struct shrinker *shrinker; 674 674 int pool_size; /* number of disks in stripeheads in pool */

+48 -10

drivers/nvme/common/keyring.c

··· 20 20 } 21 21 EXPORT_SYMBOL_GPL(nvme_keyring_id); 22 22 23 + static bool nvme_tls_psk_revoked(struct key *psk) 24 + { 25 + return test_bit(KEY_FLAG_REVOKED, &psk->flags) || 26 + test_bit(KEY_FLAG_INVALIDATED, &psk->flags); 27 + } 28 + 29 + struct key *nvme_tls_key_lookup(key_serial_t key_id) 30 + { 31 + struct key *key = key_lookup(key_id); 32 + 33 + if (IS_ERR(key)) { 34 + pr_err("key id %08x not found\n", key_id); 35 + return key; 36 + } 37 + if (nvme_tls_psk_revoked(key)) { 38 + pr_err("key id %08x revoked\n", key_id); 39 + return ERR_PTR(-EKEYREVOKED); 40 + } 41 + return key; 42 + } 43 + EXPORT_SYMBOL_GPL(nvme_tls_key_lookup); 44 + 23 45 static void nvme_tls_psk_describe(const struct key *key, struct seq_file *m) 24 46 { 25 47 seq_puts(m, key->description); ··· 58 36 pr_debug("%s: no key description\n", __func__); 59 37 return false; 60 38 } 61 - match_len = strlen(key->description); 62 - pr_debug("%s: id %s len %zd\n", __func__, key->description, match_len); 63 - 64 39 if (!match_data->raw_data) { 65 40 pr_debug("%s: no match data\n", __func__); 66 41 return false; 67 42 } 68 43 match_id = match_data->raw_data; 44 + match_len = strlen(match_id); 69 45 pr_debug("%s: match '%s' '%s' len %zd\n", 70 46 __func__, match_id, key->description, match_len); 71 47 return !memcmp(key->description, match_id, match_len); ··· 91 71 92 72 static struct key *nvme_tls_psk_lookup(struct key *keyring, 93 73 const char *hostnqn, const char *subnqn, 94 - int hmac, bool generated) 74 + u8 hmac, u8 psk_ver, bool generated) 95 75 { 96 76 char *identity; 97 77 size_t identity_len = (NVMF_NQN_SIZE) * 2 + 11; ··· 102 82 if (!identity) 103 83 return ERR_PTR(-ENOMEM); 104 84 105 - snprintf(identity, identity_len, "NVMe0%c%02d %s %s", 106 - generated ? 'G' : 'R', hmac, hostnqn, subnqn); 85 + snprintf(identity, identity_len, "NVMe%u%c%02u %s %s", 86 + psk_ver, generated ? 
'G' : 'R', hmac, hostnqn, subnqn); 107 87 108 88 if (!keyring) 109 89 keyring = nvme_keyring; ··· 127 107 /* 128 108 * NVMe PSK priority list 129 109 * 130 - * 'Retained' PSKs (ie 'generated == false') 131 - * should be preferred to 'generated' PSKs, 132 - * and SHA-384 should be preferred to SHA-256. 110 + * 'Retained' PSKs (ie 'generated == false') should be preferred to 'generated' 111 + * PSKs, PSKs with hash (psk_ver 1) should be preferred to PSKs without hash 112 + * (psk_ver 0), and SHA-384 should be preferred to SHA-256. 133 113 */ 134 114 static struct nvme_tls_psk_priority_list { 135 115 bool generated; 116 + u8 psk_ver; 136 117 enum nvme_tcp_tls_cipher cipher; 137 118 } nvme_tls_psk_prio[] = { 138 119 { .generated = false, 120 + .psk_ver = 1, 139 121 .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, 140 122 { .generated = false, 123 + .psk_ver = 1, 124 + .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, 125 + { .generated = false, 126 + .psk_ver = 0, 127 + .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, 128 + { .generated = false, 129 + .psk_ver = 0, 141 130 .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, 142 131 { .generated = true, 132 + .psk_ver = 1, 143 133 .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, 144 134 { .generated = true, 135 + .psk_ver = 1, 136 + .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, 137 + { .generated = true, 138 + .psk_ver = 0, 139 + .cipher = NVME_TCP_TLS_CIPHER_SHA384, }, 140 + { .generated = true, 141 + .psk_ver = 0, 145 142 .cipher = NVME_TCP_TLS_CIPHER_SHA256, }, 146 143 }; 147 144 ··· 174 137 175 138 for (prio = 0; prio < ARRAY_SIZE(nvme_tls_psk_prio); prio++) { 176 139 bool generated = nvme_tls_psk_prio[prio].generated; 140 + u8 ver = nvme_tls_psk_prio[prio].psk_ver; 177 141 enum nvme_tcp_tls_cipher cipher = nvme_tls_psk_prio[prio].cipher; 178 142 179 143 tls_key = nvme_tls_psk_lookup(keyring, hostnqn, subnqn, 180 - cipher, generated); 144 + cipher, ver, generated); 181 145 if (!IS_ERR(tls_key)) { 182 146 tls_key_id = tls_key->serial; 183 147 key_put(tls_key);

+1

drivers/nvme/host/Kconfig

··· 109 109 bool "NVMe over Fabrics In-Band Authentication in host side" 110 110 depends on NVME_CORE 111 111 select NVME_AUTH 112 + select NVME_KEYRING if NVME_TCP_TLS 112 113 help 113 114 This provides support for NVMe over Fabrics In-Band Authentication in 114 115 host side.

+41 -4

drivers/nvme/host/core.c

··· 4 4 * Copyright (c) 2011-2014, Intel Corporation. 5 5 */ 6 6 7 + #include <linux/async.h> 7 8 #include <linux/blkdev.h> 8 9 #include <linux/blk-mq.h> 9 10 #include <linux/blk-integrity.h> ··· 988 987 cmnd->rw.length = 989 988 cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1); 990 989 cmnd->rw.reftag = 0; 991 - cmnd->rw.apptag = 0; 992 - cmnd->rw.appmask = 0; 990 + cmnd->rw.lbat = 0; 991 + cmnd->rw.lbatm = 0; 993 992 994 993 if (ns->head->ms) { 995 994 /* ··· 4041 4040 } 4042 4041 } 4043 4042 4043 + /** 4044 + * struct async_scan_info - keeps track of controller & NSIDs to scan 4045 + * @ctrl: Controller on which namespaces are being scanned 4046 + * @next_nsid: Index of next NSID to scan in ns_list 4047 + * @ns_list: Pointer to list of NSIDs to scan 4048 + * 4049 + * Note: There is a single async_scan_info structure shared by all instances 4050 + * of nvme_scan_ns_async() scanning a given controller, so the atomic 4051 + * operations on next_nsid are critical to ensure each instance scans a unique 4052 + * NSID. 
4053 + */ 4054 + struct async_scan_info { 4055 + struct nvme_ctrl *ctrl; 4056 + atomic_t next_nsid; 4057 + __le32 *ns_list; 4058 + }; 4059 + 4060 + static void nvme_scan_ns_async(void *data, async_cookie_t cookie) 4061 + { 4062 + struct async_scan_info *scan_info = data; 4063 + int idx; 4064 + u32 nsid; 4065 + 4066 + idx = (u32)atomic_fetch_inc(&scan_info->next_nsid); 4067 + nsid = le32_to_cpu(scan_info->ns_list[idx]); 4068 + 4069 + nvme_scan_ns(scan_info->ctrl, nsid); 4070 + } 4071 + 4044 4072 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, 4045 4073 unsigned nsid) 4046 4074 { ··· 4096 4066 __le32 *ns_list; 4097 4067 u32 prev = 0; 4098 4068 int ret = 0, i; 4069 + ASYNC_DOMAIN(domain); 4070 + struct async_scan_info scan_info; 4099 4071 4100 4072 ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL); 4101 4073 if (!ns_list) 4102 4074 return -ENOMEM; 4103 4075 4076 + scan_info.ctrl = ctrl; 4077 + scan_info.ns_list = ns_list; 4104 4078 for (;;) { 4105 4079 struct nvme_command cmd = { 4106 4080 .identify.opcode = nvme_admin_identify, ··· 4120 4086 goto free; 4121 4087 } 4122 4088 4089 + atomic_set(&scan_info.next_nsid, 0); 4123 4090 for (i = 0; i < nr_entries; i++) { 4124 4091 u32 nsid = le32_to_cpu(ns_list[i]); 4125 4092 4126 4093 if (!nsid) /* end of the list? 
*/ 4127 4094 goto out; 4128 - nvme_scan_ns(ctrl, nsid); 4095 + async_schedule_domain(nvme_scan_ns_async, &scan_info, 4096 + &domain); 4129 4097 while (++prev < nsid) 4130 4098 nvme_ns_remove_by_nsid(ctrl, prev); 4131 4099 } 4100 + async_synchronize_full_domain(&domain); 4132 4101 } 4133 4102 out: 4134 4103 nvme_remove_invalid_namespaces(ctrl, prev); 4135 4104 free: 4105 + async_synchronize_full_domain(&domain); 4136 4106 kfree(ns_list); 4137 4107 return ret; 4138 4108 } ··· 4716 4678 4717 4679 if (!subsys || ctrl->instance != subsys->instance) 4718 4680 ida_free(&nvme_instance_ida, ctrl->instance); 4719 - key_put(ctrl->tls_key); 4720 4681 nvme_free_cels(ctrl); 4721 4682 nvme_mpath_uninit(ctrl); 4722 4683 cleanup_srcu_struct(&ctrl->srcu);

+1 -1

drivers/nvme/host/fabrics.c

··· 665 665 return ERR_PTR(-EINVAL); 666 666 } 667 667 668 - key = key_lookup(key_id); 668 + key = nvme_tls_key_lookup(key_id); 669 669 if (IS_ERR(key)) 670 670 pr_err("key id %08x not found\n", key_id); 671 671 else

+16 -10

drivers/nvme/host/ioctl.c

··· 4 4 * Copyright (c) 2017-2021 Christoph Hellwig. 5 5 */ 6 6 #include <linux/bio-integrity.h> 7 + #include <linux/blk-integrity.h> 7 8 #include <linux/ptrace.h> /* for force_successful_syscall_return */ 8 9 #include <linux/nvme_ioctl.h> 9 10 #include <linux/io_uring/cmd.h> ··· 120 119 struct request_queue *q = req->q; 121 120 struct nvme_ns *ns = q->queuedata; 122 121 struct block_device *bdev = ns ? ns->disk->part0 : NULL; 122 + bool supports_metadata = bdev && blk_get_integrity(bdev->bd_disk); 123 + bool has_metadata = meta_buffer && meta_len; 123 124 struct bio *bio = NULL; 124 125 int ret; 126 + 127 + if (has_metadata && !supports_metadata) 128 + return -EINVAL; 125 129 126 130 if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) { 127 131 struct iov_iter iter; ··· 149 143 goto out; 150 144 151 145 bio = req->bio; 152 - if (bdev) { 146 + if (bdev) 153 147 bio_set_dev(bio, bdev); 154 - if (meta_buffer && meta_len) { 155 - ret = bio_integrity_map_user(bio, meta_buffer, meta_len, 156 - meta_seed); 157 - if (ret) 158 - goto out_unmap; 159 - req->cmd_flags |= REQ_INTEGRITY; 160 - } 148 + 149 + if (has_metadata) { 150 + ret = bio_integrity_map_user(bio, meta_buffer, meta_len, 151 + meta_seed); 152 + if (ret) 153 + goto out_unmap; 154 + req->cmd_flags |= REQ_INTEGRITY; 161 155 } 162 156 163 157 return ret; ··· 266 260 c.rw.control = cpu_to_le16(io.control); 267 261 c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); 268 262 c.rw.reftag = cpu_to_le32(io.reftag); 269 - c.rw.apptag = cpu_to_le16(io.apptag); 270 - c.rw.appmask = cpu_to_le16(io.appmask); 263 + c.rw.lbat = cpu_to_le16(io.apptag); 264 + c.rw.lbatm = cpu_to_le16(io.appmask); 271 265 272 266 return nvme_submit_user_cmd(ns->queue, &c, io.addr, length, metadata, 273 267 meta_len, lower_32_bits(io.slba), NULL, 0, 0);

+1 -1

drivers/nvme/host/nvme.h

··· 372 372 struct nvme_dhchap_key *ctrl_key; 373 373 u16 transaction; 374 374 #endif 375 - struct key *tls_key; 375 + key_serial_t tls_pskid; 376 376 377 377 /* Power saving configuration */ 378 378 u64 ps_max_latency_us;

+4 -2

drivers/nvme/host/rdma.c

··· 1363 1363 if (control & NVME_RW_PRINFO_PRCHK_REF) 1364 1364 domain->sig.dif.ref_remap = true; 1365 1365 1366 - domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); 1367 - domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); 1366 + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); 1367 + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); 1368 1368 domain->sig.dif.app_escape = true; 1369 1369 if (pi_type == NVME_NS_DPS_PI_TYPE3) 1370 1370 domain->sig.dif.ref_escape = true; ··· 1876 1876 */ 1877 1877 priv.hrqsize = cpu_to_le16(queue->queue_size); 1878 1878 priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize); 1879 + /* cntlid should only be set when creating an I/O queue */ 1880 + priv.cntlid = cpu_to_le16(ctrl->ctrl.cntlid); 1879 1881 } 1880 1882 1881 1883 ret = rdma_connect_locked(queue->cm_id, &param);

+69 -21

drivers/nvme/host/sysfs.c

··· 664 664 nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store); 665 665 #endif 666 666 667 - #ifdef CONFIG_NVME_TCP_TLS 668 - static ssize_t tls_key_show(struct device *dev, 669 - struct device_attribute *attr, char *buf) 670 - { 671 - struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 672 - 673 - if (!ctrl->tls_key) 674 - return 0; 675 - return sysfs_emit(buf, "%08x", key_serial(ctrl->tls_key)); 676 - } 677 - static DEVICE_ATTR_RO(tls_key); 678 - #endif 679 - 680 667 static struct attribute *nvme_dev_attrs[] = { 681 668 &dev_attr_reset_controller.attr, 682 669 &dev_attr_rescan_controller.attr, ··· 690 703 #ifdef CONFIG_NVME_HOST_AUTH 691 704 &dev_attr_dhchap_secret.attr, 692 705 &dev_attr_dhchap_ctrl_secret.attr, 693 - #endif 694 - #ifdef CONFIG_NVME_TCP_TLS 695 - &dev_attr_tls_key.attr, 696 706 #endif 697 707 &dev_attr_adm_passthru_err_log_enabled.attr, 698 708 NULL ··· 721 737 if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts) 722 738 return 0; 723 739 #endif 724 - #ifdef CONFIG_NVME_TCP_TLS 725 - if (a == &dev_attr_tls_key.attr && 726 - (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp"))) 727 - return 0; 728 - #endif 729 740 730 741 return a->mode; 731 742 } ··· 731 752 }; 732 753 EXPORT_SYMBOL_GPL(nvme_dev_attrs_group); 733 754 755 + #ifdef CONFIG_NVME_TCP_TLS 756 + static ssize_t tls_key_show(struct device *dev, 757 + struct device_attribute *attr, char *buf) 758 + { 759 + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 760 + 761 + if (!ctrl->tls_pskid) 762 + return 0; 763 + return sysfs_emit(buf, "%08x\n", ctrl->tls_pskid); 764 + } 765 + static DEVICE_ATTR_RO(tls_key); 766 + 767 + static ssize_t tls_configured_key_show(struct device *dev, 768 + struct device_attribute *attr, char *buf) 769 + { 770 + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 771 + struct key *key = ctrl->opts->tls_key; 772 + 773 + return sysfs_emit(buf, "%08x\n", key_serial(key)); 774 + } 775 + static DEVICE_ATTR_RO(tls_configured_key); 776 + 777 + static ssize_t 
tls_keyring_show(struct device *dev, 778 + struct device_attribute *attr, char *buf) 779 + { 780 + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 781 + struct key *keyring = ctrl->opts->keyring; 782 + 783 + return sysfs_emit(buf, "%s\n", keyring->description); 784 + } 785 + static DEVICE_ATTR_RO(tls_keyring); 786 + 787 + static struct attribute *nvme_tls_attrs[] = { 788 + &dev_attr_tls_key.attr, 789 + &dev_attr_tls_configured_key.attr, 790 + &dev_attr_tls_keyring.attr, 791 + }; 792 + 793 + static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, 794 + struct attribute *a, int n) 795 + { 796 + struct device *dev = container_of(kobj, struct device, kobj); 797 + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); 798 + 799 + if (!ctrl->opts || strcmp(ctrl->opts->transport, "tcp")) 800 + return 0; 801 + 802 + if (a == &dev_attr_tls_key.attr && 803 + !ctrl->opts->tls) 804 + return 0; 805 + if (a == &dev_attr_tls_configured_key.attr && 806 + !ctrl->opts->tls_key) 807 + return 0; 808 + if (a == &dev_attr_tls_keyring.attr && 809 + !ctrl->opts->keyring) 810 + return 0; 811 + 812 + return a->mode; 813 + } 814 + 815 + const struct attribute_group nvme_tls_attrs_group = { 816 + .attrs = nvme_tls_attrs, 817 + .is_visible = nvme_tls_attrs_are_visible, 818 + }; 819 + #endif 820 + 734 821 const struct attribute_group *nvme_dev_attr_groups[] = { 735 822 &nvme_dev_attrs_group, 823 + #ifdef CONFIG_NVME_TCP_TLS 824 + &nvme_tls_attrs_group, 825 + #endif 736 826 NULL, 737 827 }; 738 828

+42 -15

drivers/nvme/host/tcp.c

··· 165 165 166 166 bool hdr_digest; 167 167 bool data_digest; 168 + bool tls_enabled; 168 169 struct ahash_request *rcv_hash; 169 170 struct ahash_request *snd_hash; 170 171 __le32 exp_ddgst; ··· 214 213 return queue - queue->ctrl->queues; 215 214 } 216 215 217 - static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl) 216 + /* 217 + * Check if the queue is TLS encrypted 218 + */ 219 + static inline bool nvme_tcp_queue_tls(struct nvme_tcp_queue *queue) 220 + { 221 + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) 222 + return 0; 223 + 224 + return queue->tls_enabled; 225 + } 226 + 227 + /* 228 + * Check if TLS is configured for the controller. 229 + */ 230 + static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl) 218 231 { 219 232 if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) 220 233 return 0; ··· 383 368 384 369 static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) 385 370 { 386 - return !nvme_tcp_tls(&queue->ctrl->ctrl) && 371 + return !nvme_tcp_queue_tls(queue) && 387 372 nvme_tcp_queue_has_pending(queue); 388 373 } 389 374 ··· 1066 1051 else 1067 1052 msg.msg_flags |= MSG_MORE; 1068 1053 1069 - if (!sendpage_ok(page)) 1054 + if (!sendpages_ok(page, len, offset)) 1070 1055 msg.msg_flags &= ~MSG_SPLICE_PAGES; 1071 1056 1072 1057 bvec_set_page(&bvec, page, len, offset); ··· 1442 1427 memset(&msg, 0, sizeof(msg)); 1443 1428 iov.iov_base = icresp; 1444 1429 iov.iov_len = sizeof(*icresp); 1445 - if (nvme_tcp_tls(&queue->ctrl->ctrl)) { 1430 + if (nvme_tcp_queue_tls(queue)) { 1446 1431 msg.msg_control = cbuf; 1447 1432 msg.msg_controllen = sizeof(cbuf); 1448 1433 } ··· 1454 1439 goto free_icresp; 1455 1440 } 1456 1441 ret = -ENOTCONN; 1457 - if (nvme_tcp_tls(&queue->ctrl->ctrl)) { 1442 + if (nvme_tcp_queue_tls(queue)) { 1458 1443 ctype = tls_get_record_type(queue->sock->sk, 1459 1444 (struct cmsghdr *)cbuf); 1460 1445 if (ctype != TLS_RECORD_TYPE_DATA) { ··· 1596 1581 goto out_complete; 1597 1582 } 1598 1583 1599 - tls_key = key_lookup(pskid); 1584 + tls_key = 
nvme_tls_key_lookup(pskid); 1600 1585 if (IS_ERR(tls_key)) { 1601 1586 dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n", 1602 1587 qid, pskid); 1603 1588 queue->tls_err = -ENOKEY; 1604 1589 } else { 1605 - ctrl->ctrl.tls_key = tls_key; 1590 + queue->tls_enabled = true; 1591 + if (qid == 0) 1592 + ctrl->ctrl.tls_pskid = key_serial(tls_key); 1593 + key_put(tls_key); 1606 1594 queue->tls_err = 0; 1607 1595 } 1608 1596 ··· 1786 1768 } 1787 1769 1788 1770 /* If PSKs are configured try to start TLS */ 1789 - if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) { 1771 + if (nvme_tcp_tls_configured(nctrl) && pskid) { 1790 1772 ret = nvme_tcp_start_tls(nctrl, queue, pskid); 1791 1773 if (ret) 1792 1774 goto err_init_connect; ··· 1847 1829 mutex_lock(&queue->queue_lock); 1848 1830 if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) 1849 1831 __nvme_tcp_stop_queue(queue); 1832 + /* Stopping the queue will disable TLS */ 1833 + queue->tls_enabled = false; 1850 1834 mutex_unlock(&queue->queue_lock); 1851 1835 } 1852 1836 ··· 1945 1925 int ret; 1946 1926 key_serial_t pskid = 0; 1947 1927 1948 - if (nvme_tcp_tls(ctrl)) { 1928 + if (nvme_tcp_tls_configured(ctrl)) { 1949 1929 if (ctrl->opts->tls_key) 1950 1930 pskid = key_serial(ctrl->opts->tls_key); 1951 - else 1931 + else { 1952 1932 pskid = nvme_tls_psk_default(ctrl->opts->keyring, 1953 1933 ctrl->opts->host->nqn, 1954 1934 ctrl->opts->subsysnqn); 1955 - if (!pskid) { 1956 - dev_err(ctrl->device, "no valid PSK found\n"); 1957 - return -ENOKEY; 1935 + if (!pskid) { 1936 + dev_err(ctrl->device, "no valid PSK found\n"); 1937 + return -ENOKEY; 1938 + } 1958 1939 } 1959 1940 } 1960 1941 ··· 1978 1957 { 1979 1958 int i, ret; 1980 1959 1981 - if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) { 1960 + if (nvme_tcp_tls_configured(ctrl) && !ctrl->tls_pskid) { 1982 1961 dev_err(ctrl->device, "no PSK negotiated\n"); 1983 1962 return -ENOKEY; 1984 1963 } 1964 + 1985 1965 for (i = 1; i < ctrl->queue_count; i++) { 1986 1966 ret = 
nvme_tcp_alloc_queue(ctrl, i, 1987 - key_serial(ctrl->tls_key)); 1967 + ctrl->tls_pskid); 1988 1968 if (ret) 1989 1969 goto out_free_queues; 1990 1970 } ··· 2166 2144 if (remove) 2167 2145 nvme_unquiesce_admin_queue(ctrl); 2168 2146 nvme_tcp_destroy_admin_queue(ctrl, remove); 2147 + if (ctrl->tls_pskid) { 2148 + dev_dbg(ctrl->device, "Wipe negotiated TLS_PSK %08x\n", 2149 + ctrl->tls_pskid); 2150 + ctrl->tls_pskid = 0; 2151 + } 2169 2152 } 2170 2153 2171 2154 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,

-2

drivers/nvme/target/admin-cmd.c

··· 1015 1015 1016 1016 if (nvme_is_fabrics(cmd)) 1017 1017 return nvmet_parse_fabrics_admin_cmd(req); 1018 - if (unlikely(!nvmet_check_auth_status(req))) 1019 - return NVME_SC_AUTH_REQUIRED | NVME_STATUS_DNR; 1020 1018 if (nvmet_is_disc_subsys(nvmet_req_subsys(req))) 1021 1019 return nvmet_parse_discovery_cmd(req); 1022 1020

+12

drivers/nvme/target/auth.c

··· 25 25 unsigned char key_hash; 26 26 char *dhchap_secret; 27 27 28 + if (!strlen(secret)) { 29 + if (set_ctrl) { 30 + kfree(host->dhchap_ctrl_secret); 31 + host->dhchap_ctrl_secret = NULL; 32 + host->dhchap_ctrl_key_hash = 0; 33 + } else { 34 + kfree(host->dhchap_secret); 35 + host->dhchap_secret = NULL; 36 + host->dhchap_key_hash = 0; 37 + } 38 + return 0; 39 + } 28 40 if (sscanf(secret, "DHHC-1:%hhd:%*s", &key_hash) != 1) 29 41 return -EINVAL; 30 42 if (key_hash > 3) {

+2 -2

drivers/nvme/target/rdma.c

··· 578 578 if (control & NVME_RW_PRINFO_PRCHK_REF) 579 579 domain->sig.dif.ref_remap = true; 580 580 581 - domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag); 582 - domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask); 581 + domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.lbat); 582 + domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.lbatm); 583 583 domain->sig.dif.app_escape = true; 584 584 if (pi_type == NVME_NS_DPS_PI_TYPE3) 585 585 domain->sig.dif.ref_escape = true;

+18 -12

fs/btrfs/bio.c

··· 73 73 74 74 static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, 75 75 struct btrfs_bio *orig_bbio, 76 - u64 map_length, bool use_append) 76 + u64 map_length) 77 77 { 78 78 struct btrfs_bio *bbio; 79 79 struct bio *bio; 80 80 81 - if (use_append) { 82 - unsigned int nr_segs; 83 - 84 - bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, 85 - &btrfs_clone_bioset, map_length); 86 - } else { 87 - bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, 88 - GFP_NOFS, &btrfs_clone_bioset); 89 - } 81 + bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS, 82 + &btrfs_clone_bioset); 90 83 bbio = btrfs_bio(bio); 91 84 btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); 92 85 bbio->inode = orig_bbio->inode; ··· 657 664 return true; 658 665 } 659 666 667 + static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length) 668 + { 669 + unsigned int nr_segs; 670 + int sector_offset; 671 + 672 + map_length = min(map_length, bbio->fs_info->max_zone_append_size); 673 + sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, 674 + &nr_segs, map_length); 675 + if (sector_offset) 676 + return sector_offset << SECTOR_SHIFT; 677 + return map_length; 678 + } 679 + 660 680 static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) 661 681 { 662 682 struct btrfs_inode *inode = bbio->inode; ··· 696 690 697 691 map_length = min(map_length, length); 698 692 if (use_append) 699 - map_length = min(map_length, fs_info->max_zone_append_size); 693 + map_length = btrfs_append_map_length(bbio, map_length); 700 694 701 695 if (map_length < length) { 702 - bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); 696 + bbio = btrfs_split_bio(fs_info, bbio, map_length); 703 697 bio = &bbio->bio; 704 698 } 705 699

+2 -2

include/linux/bio.h

··· 324 324 void bio_trim(struct bio *bio, sector_t offset, sector_t size); 325 325 extern struct bio *bio_split(struct bio *bio, int sectors, 326 326 gfp_t gfp, struct bio_set *bs); 327 - struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, 328 - unsigned *segs, struct bio_set *bs, unsigned max_bytes); 327 + int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, 328 + unsigned *segs, unsigned max_bytes); 329 329 330 330 /** 331 331 * bio_next_split - get next @sectors from a bio, splitting if necessary

+2 -1

include/linux/blkdev.h

··· 1187 1187 return q->limits.max_segment_size; 1188 1188 } 1189 1189 1190 - static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l) 1190 + static inline unsigned int 1191 + queue_limits_max_zone_append_sectors(const struct queue_limits *l) 1191 1192 { 1192 1193 unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); 1193 1194

+1

include/linux/mm.h

··· 1601 1601 void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, 1602 1602 bool make_dirty); 1603 1603 void unpin_user_pages(struct page **pages, unsigned long npages); 1604 + void unpin_user_folio(struct folio *folio, unsigned long npages); 1604 1605 void unpin_folios(struct folio **folios, unsigned long nfolios); 1605 1606 1606 1607 static inline bool is_cow_mapping(vm_flags_t flags)

+19

include/linux/net.h

··· 322 322 return !PageSlab(page) && page_count(page) >= 1; 323 323 } 324 324 325 + /* 326 + * Check sendpage_ok on contiguous pages. 327 + */ 328 + static inline bool sendpages_ok(struct page *page, size_t len, size_t offset) 329 + { 330 + struct page *p = page + (offset >> PAGE_SHIFT); 331 + size_t count = 0; 332 + 333 + while (count < len) { 334 + if (!sendpage_ok(p)) 335 + return false; 336 + 337 + p++; 338 + count += PAGE_SIZE; 339 + } 340 + 341 + return true; 342 + } 343 + 325 344 int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, 326 345 size_t num, size_t len); 327 346 int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,

+5 -1

include/linux/nvme-keyring.h

··· 12 12 const char *hostnqn, const char *subnqn); 13 13 14 14 key_serial_t nvme_keyring_id(void); 15 - 15 + struct key *nvme_tls_key_lookup(key_serial_t key_id); 16 16 #else 17 17 18 18 static inline key_serial_t nvme_tls_psk_default(struct key *keyring, ··· 23 23 static inline key_serial_t nvme_keyring_id(void) 24 24 { 25 25 return 0; 26 + } 27 + static inline struct key *nvme_tls_key_lookup(key_serial_t key_id) 28 + { 29 + return ERR_PTR(-ENOTSUPP); 26 30 } 27 31 #endif /* !CONFIG_NVME_KEYRING */ 28 32 #endif /* _NVME_KEYRING_H */

+5 -1

include/linux/nvme-rdma.h

··· 25 25 NVME_RDMA_CM_NO_RSC = 0x06, 26 26 NVME_RDMA_CM_INVALID_IRD = 0x07, 27 27 NVME_RDMA_CM_INVALID_ORD = 0x08, 28 + NVME_RDMA_CM_INVALID_CNTLID = 0x09, 28 29 }; 29 30 30 31 static inline const char *nvme_rdma_cm_msg(enum nvme_rdma_cm_status status) ··· 47 46 return "invalid IRD"; 48 47 case NVME_RDMA_CM_INVALID_ORD: 49 48 return "Invalid ORD"; 49 + case NVME_RDMA_CM_INVALID_CNTLID: 50 + return "invalid controller ID"; 50 51 default: 51 52 return "unrecognized reason"; 52 53 } ··· 67 64 __le16 qid; 68 65 __le16 hrqsize; 69 66 __le16 hsqsize; 70 - u8 rsvd[24]; 67 + __le16 cntlid; 68 + u8 rsvd[22]; 71 69 }; 72 70 73 71 /**

+4 -4

include/linux/nvme.h

··· 987 987 __le16 control; 988 988 __le32 dsmgmt; 989 989 __le32 reftag; 990 - __le16 apptag; 991 - __le16 appmask; 990 + __le16 lbat; 991 + __le16 lbatm; 992 992 }; 993 993 994 994 enum { ··· 1057 1057 __le16 control; 1058 1058 __le32 dsmgmt; 1059 1059 __le32 reftag; 1060 - __le16 apptag; 1061 - __le16 appmask; 1060 + __le16 lbat; 1061 + __le16 lbatm; 1062 1062 }; 1063 1063 1064 1064 enum nvme_zone_mgmt_action {

+6 -2

include/uapi/linux/nbd.h

··· 42 42 NBD_CMD_WRITE = 1, 43 43 NBD_CMD_DISC = 2, 44 44 NBD_CMD_FLUSH = 3, 45 - NBD_CMD_TRIM = 4 45 + NBD_CMD_TRIM = 4, 46 46 /* userspace defines additional extension commands */ 47 + NBD_CMD_WRITE_ZEROES = 6, 47 48 }; 48 49 49 50 /* values for flags field, these are server interaction specific. */ ··· 52 51 #define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ 53 52 #define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ 54 53 #define NBD_FLAG_SEND_FUA (1 << 3) /* send FUA (forced unit access) */ 55 - /* there is a gap here to match userspace */ 54 + #define NBD_FLAG_ROTATIONAL (1 << 4) /* device is rotational */ 56 55 #define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ 56 + #define NBD_FLAG_SEND_WRITE_ZEROES (1 << 6) /* supports WRITE_ZEROES */ 57 + /* there is a gap here to match userspace */ 57 58 #define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */ 58 59 59 60 /* values for cmd flags in the upper 16 bits of request type */ 60 61 #define NBD_CMD_FLAG_FUA (1 << 16) /* FUA (forced unit access) op */ 62 + #define NBD_CMD_FLAG_NO_HOLE (1 << 17) /* Do not punch a hole for WRITE_ZEROES */ 61 63 62 64 /* These are client behavior specific flags. */ 63 65 #define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on

+13

mm/gup.c

··· 416 416 EXPORT_SYMBOL(unpin_user_pages); 417 417 418 418 /** 419 + * unpin_user_folio() - release pages of a folio 420 + * @folio: pointer to folio to be released 421 + * @npages: number of pages of same folio 422 + * 423 + * Release npages of the folio 424 + */ 425 + void unpin_user_folio(struct folio *folio, unsigned long npages) 426 + { 427 + gup_put_folio(folio, npages, FOLL_PIN); 428 + } 429 + EXPORT_SYMBOL(unpin_user_folio); 430 + 431 + /** 419 432 * unpin_folios() - release an array of gup-pinned folios. 420 433 * @folios: array of folios to be marked dirty and released. 421 434 * @nfolios: number of folios in the @folios array.