md/r5cache: flush data only stripes in r5l_recovery_log()

For safer operation, all arrays start in write-through mode, which has been
better tested and is more mature. In fact, the write-through/write-back mode
setting is not persistent across array restarts, so we always start the array
in write-through mode. However, if recovery finds data-only stripes left over
from before the shutdown (from previous write-back mode), it is not safe to
start the array in write-through mode, as write-through mode cannot handle
stripes with data in the write-back cache. To solve this problem, we flush all
data-only stripes in r5l_recovery_log(). When r5l_recovery_log() returns, the
array starts with an empty cache in write-through mode.

This logic is implemented in r5c_recovery_flush_data_only_stripes():

1. enable write back cache
2. flush all stripes
3. wake up conf->mddev->thread
4. wait for all stripes to be flushed (reuse wait_for_quiescent)
5. disable write back cache

The wait in step 4 is woken up in release_inactive_stripe_list()
when conf->active_stripes reaches 0.

It is safe to wake up mddev->thread here because all the resources
required by the thread have been initialized.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>

authored by Song Liu and committed by Shaohua Li a85dd7b8 ba02684d

+45 -16
+5
drivers/md/md.c
··· 5291 5291 if (start_readonly && mddev->ro == 0) 5292 5292 mddev->ro = 2; /* read-only, but switch on first write */ 5293 5293 5294 + /* 5295 + * NOTE: some pers->run(), for example r5l_recovery_log(), wakes 5296 + * up mddev->thread. It is important to initialize critical 5297 + * resources for mddev->thread BEFORE calling pers->run(). 5298 + */ 5294 5299 err = pers->run(mddev); 5295 5300 if (err) 5296 5301 pr_warn("md: pers->run() failed ...\n");
+40 -16
drivers/md/raid5-cache.c
··· 2060 2060 r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, 2061 2061 struct r5l_recovery_ctx *ctx) 2062 2062 { 2063 - struct stripe_head *sh, *next; 2063 + struct stripe_head *sh; 2064 2064 struct mddev *mddev = log->rdev->mddev; 2065 2065 struct page *page; 2066 2066 sector_t next_checkpoint = MaxSector; ··· 2074 2074 2075 2075 WARN_ON(list_empty(&ctx->cached_list)); 2076 2076 2077 - list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { 2077 + list_for_each_entry(sh, &ctx->cached_list, lru) { 2078 2078 struct r5l_meta_block *mb; 2079 2079 int i; 2080 2080 int offset; ··· 2124 2124 ctx->pos = write_pos; 2125 2125 ctx->seq += 1; 2126 2126 next_checkpoint = sh->log_start; 2127 - list_del_init(&sh->lru); 2128 - raid5_release_stripe(sh); 2129 2127 } 2130 2128 log->next_checkpoint = next_checkpoint; 2131 2129 __free_page(page); 2132 2130 return 0; 2131 + } 2132 + 2133 + static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, 2134 + struct r5l_recovery_ctx *ctx) 2135 + { 2136 + struct mddev *mddev = log->rdev->mddev; 2137 + struct r5conf *conf = mddev->private; 2138 + struct stripe_head *sh, *next; 2139 + 2140 + if (ctx->data_only_stripes == 0) 2141 + return; 2142 + 2143 + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK; 2144 + 2145 + list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { 2146 + r5c_make_stripe_write_out(sh); 2147 + set_bit(STRIPE_HANDLE, &sh->state); 2148 + list_del_init(&sh->lru); 2149 + raid5_release_stripe(sh); 2150 + } 2151 + 2152 + md_wakeup_thread(conf->mddev->thread); 2153 + /* reuse conf->wait_for_quiescent in recovery */ 2154 + wait_event(conf->wait_for_quiescent, 2155 + atomic_read(&conf->active_stripes) == 0); 2156 + 2157 + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; 2133 2158 } 2134 2159 2135 2160 static int r5l_recovery_log(struct r5l_log *log) ··· 2183 2158 pos = ctx.pos; 2184 2159 ctx.seq += 10000; 2185 2160 2186 - if (ctx.data_only_stripes == 0) { 2187 - log->next_checkpoint = 
ctx.pos; 2188 - r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); 2189 - ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); 2190 - } 2191 2161 2192 2162 if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0)) 2193 2163 pr_debug("md/raid:%s: starting from clean shutdown\n", 2194 2164 mdname(mddev)); 2195 - else { 2165 + else 2196 2166 pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", 2197 2167 mdname(mddev), ctx.data_only_stripes, 2198 2168 ctx.data_parity_stripes); 2199 2169 2200 - if (ctx.data_only_stripes > 0) 2201 - if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { 2202 - pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 2203 - mdname(mddev)); 2204 - return -EIO; 2205 - } 2170 + if (ctx.data_only_stripes == 0) { 2171 + log->next_checkpoint = ctx.pos; 2172 + r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++); 2173 + ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS); 2174 + } else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) { 2175 + pr_err("md/raid:%s: failed to rewrite stripes to journal\n", 2176 + mdname(mddev)); 2177 + return -EIO; 2206 2178 } 2207 2179 2208 2180 log->log_start = ctx.pos; 2209 2181 log->seq = ctx.seq; 2210 2182 log->last_checkpoint = pos; 2211 2183 r5l_write_super(log, pos); 2184 + 2185 + r5c_recovery_flush_data_only_stripes(log, &ctx); 2212 2186 return 0; 2213 2187 } 2214 2188