Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

UBI: do not switch to R/O mode on read errors

This patch improves UBI error handling. ATM UBI switches to
R/O mode when the WL worker fails to read the source PEB.
This means that the upper layers (e.g., UBIFS) have no
chance to unmap the erroneous PEB and fix the error.
This patch changes this behaviour and makes UBI put PEBs
like this into a separate RB-tree, thus preventing the
WL worker from hitting the same read errors again and
again.

But there is a 10% limit on the maximum amount of PEBs like this.
If there are too many of them, UBI switches to R/O mode.

Additionally, this patch teaches UBI not to panic and
switch to R/O mode if after a PEB has been copied, the
target LEB cannot be read back. Instead, now UBI cancels
the operation and schedules the target PEB for torturing.

The error paths have been tested by injecting errors
into 'ubi_eba_copy_leb()'.

Signed-off-by: Artem Bityutskiy <Artem.Bityutskiy@nokia.com>

+74 -15
+9
drivers/mtd/ubi/build.c
··· 633 633 } 634 634 635 635 /* 636 + * Set maximum amount of physical erroneous eraseblocks to be 10%. 637 + * Erroneous PEB are those which have read errors. 638 + */ 639 + ubi->max_erroneous = ubi->peb_count / 10; 640 + if (ubi->max_erroneous < 16) 641 + ubi->max_erroneous = 16; 642 + dbg_msg("max_erroneous %d", ubi->max_erroneous); 643 + 644 + /* 636 645 * It may happen that EC and VID headers are situated in one minimal 637 646 * I/O unit. In this case we can only accept this UBI image in 638 647 * read-only mode.
+13 -6
drivers/mtd/ubi/eba.c
··· 419 419 * not implemented. 420 420 */ 421 421 if (err == UBI_IO_BAD_VID_HDR) { 422 - ubi_warn("bad VID header at PEB %d, LEB" 423 - "%d:%d", pnum, vol_id, lnum); 422 + ubi_warn("corrupted VID header at PEB " 423 + "%d, LEB %d:%d", pnum, vol_id, 424 + lnum); 424 425 err = -EBADMSG; 425 426 } else 426 427 ubi_ro_mode(ubi); ··· 1033 1032 if (err && err != UBI_IO_BITFLIPS) { 1034 1033 ubi_warn("error %d while reading data from PEB %d", 1035 1034 err, from); 1035 + if (err == -EIO) 1036 + err = MOVE_SOURCE_RD_ERR; 1036 1037 goto out_unlock_buf; 1037 1038 } 1038 1039 ··· 1081 1078 /* Read the VID header back and check if it was written correctly */ 1082 1079 err = ubi_io_read_vid_hdr(ubi, to, vid_hdr, 1); 1083 1080 if (err) { 1084 - if (err != UBI_IO_BITFLIPS) 1081 + if (err != UBI_IO_BITFLIPS) { 1085 1082 ubi_warn("cannot read VID header back from PEB %d", to); 1086 - else 1083 + if (err == -EIO) 1084 + err = MOVE_TARGET_RD_ERR; 1085 + } else 1087 1086 err = MOVE_CANCEL_BITFLIPS; 1088 1087 goto out_unlock_buf; 1089 1088 } ··· 1107 1102 1108 1103 err = ubi_io_read_data(ubi, ubi->peb_buf2, to, 0, aldata_size); 1109 1104 if (err) { 1110 - if (err != UBI_IO_BITFLIPS) 1105 + if (err != UBI_IO_BITFLIPS) { 1111 1106 ubi_warn("cannot read data back from PEB %d", 1112 1107 to); 1113 - else 1108 + if (err == -EIO) 1109 + err = MOVE_TARGET_RD_ERR; 1110 + } else 1114 1111 err = MOVE_CANCEL_BITFLIPS; 1115 1112 goto out_unlock_buf; 1116 1113 }
+14 -2
drivers/mtd/ubi/ubi.h
··· 105 105 * 106 106 * MOVE_CANCEL_RACE: canceled because the volume is being deleted, the source 107 107 * PEB was put meanwhile, or there is I/O on the source PEB 108 + * MOVE_SOURCE_RD_ERR: canceled because there was a read error from the source 109 + * PEB 110 + * MOVE_TARGET_RD_ERR: canceled because there was a read error from the target 111 + * PEB 108 112 * MOVE_TARGET_WR_ERR: canceled because there was a write error to the target 109 113 * PEB 110 114 * MOVE_CANCEL_BITFLIPS: canceled because a bit-flip was detected in the ··· 116 112 */ 117 113 enum { 118 114 MOVE_CANCEL_RACE = 1, 115 + MOVE_SOURCE_RD_ERR, 116 + MOVE_TARGET_RD_ERR, 119 117 MOVE_TARGET_WR_ERR, 120 118 MOVE_CANCEL_BITFLIPS, 121 119 }; ··· 340 334 * @alc_mutex: serializes "atomic LEB change" operations 341 335 * 342 336 * @used: RB-tree of used physical eraseblocks 337 + * @erroneous: RB-tree of erroneous used physical eraseblocks 343 338 * @free: RB-tree of free physical eraseblocks 344 339 * @scrub: RB-tree of physical eraseblocks which need scrubbing 345 340 * @pq: protection queue (contain physical eraseblocks which are temporarily 346 341 * protected from the wear-leveling worker) 347 342 * @pq_head: protection queue head 348 343 * @wl_lock: protects the @used, @free, @pq, @pq_head, @lookuptbl, @move_from, 349 - * @move_to, @move_to_put @erase_pending, @wl_scheduled and @works 350 - * fields 344 + * @move_to, @move_to_put @erase_pending, @wl_scheduled, @works and 345 + * @erroneous_peb_count fields 351 346 * @move_mutex: serializes eraseblock moves 352 347 * @work_sem: synchronizes the WL worker with use tasks 353 348 * @wl_scheduled: non-zero if the wear-leveling was scheduled ··· 368 361 * @peb_size: physical eraseblock size 369 362 * @bad_peb_count: count of bad physical eraseblocks 370 363 * @good_peb_count: count of good physical eraseblocks 364 + * @erroneous_peb_count: count of erroneous physical eraseblocks in @erroneous 365 + * @max_erroneous: maximum allowed amount of erroneous 
physical eraseblocks 371 366 * @min_io_size: minimal input/output unit size of the underlying MTD device 372 367 * @hdrs_min_io_size: minimal I/O unit size used for VID and EC headers 373 368 * @ro_mode: if the UBI device is in read-only mode ··· 427 418 428 419 /* Wear-leveling sub-system's stuff */ 429 420 struct rb_root used; 421 + struct rb_root erroneous; 430 422 struct rb_root free; 431 423 struct rb_root scrub; 432 424 struct list_head pq[UBI_PROT_QUEUE_LEN]; ··· 452 442 int peb_size; 453 443 int bad_peb_count; 454 444 int good_peb_count; 445 + int erroneous_peb_count; 446 + int max_erroneous; 455 447 int min_io_size; 456 448 int hdrs_min_io_size; 457 449 int ro_mode;
+38 -7
drivers/mtd/ubi/wl.c
··· 55 55 * 56 56 * As it was said, for the UBI sub-system all physical eraseblocks are either 57 57 * "free" or "used". Free eraseblock are kept in the @wl->free RB-tree, while 58 - * used eraseblocks are kept in @wl->used or @wl->scrub RB-trees, or 59 - * (temporarily) in the @wl->pq queue. 58 + * used eraseblocks are kept in @wl->used, @wl->erroneous, or @wl->scrub 59 + * RB-trees, as well as (temporarily) in the @wl->pq queue. 60 60 * 61 61 * When the WL sub-system returns a physical eraseblock, the physical 62 62 * eraseblock is protected from being moved for some "time". For this reason, ··· 83 83 * used. The former state corresponds to the @wl->free tree. The latter state 84 84 * is split up on several sub-states: 85 85 * o the WL movement is allowed (@wl->used tree); 86 + * o the WL movement is disallowed (@wl->erroneous) becouse the PEB is 87 + * erroneous - e.g., there was a read error; 86 88 * o the WL movement is temporarily prohibited (@wl->pq queue); 87 89 * o scrubbing is needed (@wl->scrub tree). 88 90 * ··· 655 653 static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk, 656 654 int cancel) 657 655 { 658 - int err, scrubbing = 0, torture = 0, protect = 0; 656 + int err, scrubbing = 0, torture = 0, protect = 0, erroneous = 0; 659 657 struct ubi_wl_entry *e1, *e2; 660 658 struct ubi_vid_hdr *vid_hdr; 661 659 ··· 771 769 goto out_not_moved; 772 770 } 773 771 774 - if (err == MOVE_CANCEL_BITFLIPS || 775 - err == MOVE_TARGET_WR_ERR) { 772 + if (err == MOVE_CANCEL_BITFLIPS || err == MOVE_TARGET_WR_ERR || 773 + err == MOVE_TARGET_RD_ERR) { 776 774 /* Target PEB bit-flips or write error, torture it */ 777 775 torture = 1; 776 + goto out_not_moved; 777 + } 778 + 779 + if (err == MOVE_SOURCE_RD_ERR) { 780 + /* 781 + * An error happened while reading the source PEB. Do 782 + * not switch to R/O mode in this case, and give the 783 + * upper layers a possibility to recover from this, 784 + * e.g. by unmapping corresponding LEB. 
Instead, just 785 + * put thie PEB to the @ubi->erroneus list to prevent 786 + * UBI from trying to move the over and over again. 787 + */ 788 + if (ubi->erroneous_peb_count > ubi->max_erroneous) { 789 + ubi_err("too many erroneous eraseblocks (%d)", 790 + ubi->erroneous_peb_count); 791 + goto out_error; 792 + } 793 + erroneous = 1; 778 794 goto out_not_moved; 779 795 } 780 796 ··· 852 832 spin_lock(&ubi->wl_lock); 853 833 if (protect) 854 834 prot_queue_add(ubi, e1); 855 - else if (scrubbing) 835 + else if (erroneous) { 836 + wl_tree_add(e1, &ubi->erroneous); 837 + ubi->erroneous_peb_count += 1; 838 + } else if (scrubbing) 856 839 wl_tree_add(e1, &ubi->scrub); 857 840 else 858 841 wl_tree_add(e1, &ubi->used); ··· 1139 1116 } else if (in_wl_tree(e, &ubi->scrub)) { 1140 1117 paranoid_check_in_wl_tree(e, &ubi->scrub); 1141 1118 rb_erase(&e->u.rb, &ubi->scrub); 1119 + } else if (in_wl_tree(e, &ubi->erroneous)) { 1120 + paranoid_check_in_wl_tree(e, &ubi->erroneous); 1121 + rb_erase(&e->u.rb, &ubi->erroneous); 1122 + ubi->erroneous_peb_count -= 1; 1123 + ubi_assert(ubi->erroneous_peb_count >= 0); 1124 + /* Erronious PEBs should be tortured */ 1125 + torture = 1; 1142 1126 } else { 1143 1127 err = prot_queue_del(ubi, e->pnum); 1144 1128 if (err) { ··· 1394 1364 struct ubi_scan_leb *seb, *tmp; 1395 1365 struct ubi_wl_entry *e; 1396 1366 1397 - ubi->used = ubi->free = ubi->scrub = RB_ROOT; 1367 + ubi->used = ubi->erroneous = ubi->free = ubi->scrub = RB_ROOT; 1398 1368 spin_lock_init(&ubi->wl_lock); 1399 1369 mutex_init(&ubi->move_mutex); 1400 1370 init_rwsem(&ubi->work_sem); ··· 1532 1502 cancel_pending(ubi); 1533 1503 protection_queue_destroy(ubi); 1534 1504 tree_destroy(&ubi->used); 1505 + tree_destroy(&ubi->erroneous); 1535 1506 tree_destroy(&ubi->free); 1536 1507 tree_destroy(&ubi->scrub); 1537 1508 kfree(ubi->lookuptbl);