Merge branch 'linux-next' of git://git.infradead.org/ubifs-2.6

* 'linux-next' of git://git.infradead.org/ubifs-2.6:
UBIFS: fix recovery bug
UBIFS: add R/O compatibility
UBIFS: fix compiler warnings
UBIFS: fully sort GCed nodes
UBIFS: fix commentaries
UBIFS: introduce a helpful variable
UBIFS: use KERN_CONT
UBIFS: fix lprops committing bug
UBIFS: fix bogus assertion
UBIFS: fix bug where page is marked uptodate when out of space
UBIFS: amend key_hash return value
UBIFS: improve find function interface
UBIFS: list usage cleanup
UBIFS: fix dbg_chk_lpt_sz()

+484 -267
+17 -20
fs/ubifs/budget.c
··· 194 } 195 196 /** 197 - * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. 198 * @c: UBIFS file-system description object 199 * 200 - * This function calculates and returns the number of eraseblocks which should 201 - * be kept for index usage. 202 */ 203 int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 204 { 205 - int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; 206 long long idx_size; 207 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 209 - 210 /* And make sure we have thrice the index size of space reserved */ 211 - idx_size = idx_size + (idx_size << 1); 212 - 213 /* 214 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' 215 * pair, nor similarly the two variables for the new index size, so we 216 * have to do this costly 64-bit division on fast-path. 217 */ 218 - idx_size += eff_leb_size - 1; 219 - idx_lebs = div_u64(idx_size, eff_leb_size); 220 /* 221 * The index head is not available for the in-the-gaps method, so add an 222 * extra LEB to compensate. ··· 307 * do_budget_space - reserve flash space for index and data growth. 308 * @c: UBIFS file-system description object 309 * 310 - * This function makes sure UBIFS has enough free eraseblocks for index growth 311 - * and data. 312 * 313 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index 314 * would take if it was consolidated and written to the flash. This guarantees 315 * that the "in-the-gaps" commit method always succeeds and UBIFS will always 316 * be able to commit dirty index. So this function basically adds amount of 317 * budgeted index space to the size of the current index, multiplies this by 3, 318 - * and makes sure this does not exceed the amount of free eraseblocks. 319 * 320 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 321 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might 322 * be large, because UBIFS does not do any index consolidation as long as 323 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs 324 * will contain a lot of dirt. 325 - * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be 326 - * consolidated to take up to @c->min_idx_lebs LEBs. 327 * 328 * This function returns zero in case of success, and %-ENOSPC in case of 329 * failure. ··· 692 * This function calculates amount of free space to report to user-space. 693 * 694 * Because UBIFS may introduce substantial overhead (the index, node headers, 695 - * alignment, wastage at the end of eraseblocks, etc), it cannot report real 696 - * amount of free flash space it has (well, because not all dirty space is 697 - * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, 698 - * it would bread user expectations about what free space is. Users seem to 699 - * accustomed to assume that if the file-system reports N bytes of free space, 700 - * they would be able to fit a file of N bytes to the FS. This almost works for 701 * traditional file-systems, because they have way less overhead than UBIFS. 702 * So, to keep users happy, UBIFS tries to take the overhead into account. 703 */
··· 194 } 195 196 /** 197 + * ubifs_calc_min_idx_lebs - calculate number of LEBs for the index. 198 * @c: UBIFS file-system description object 199 * 200 + * This function calculates and returns the number of LEBs which should be kept 201 + * for index usage. 202 */ 203 int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 204 { 205 + int idx_lebs; 206 long long idx_size; 207 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 209 /* And make sure we have thrice the index size of space reserved */ 210 + idx_size += idx_size << 1; 211 /* 212 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' 213 * pair, nor similarly the two variables for the new index size, so we 214 * have to do this costly 64-bit division on fast-path. 215 */ 216 + idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size); 217 /* 218 * The index head is not available for the in-the-gaps method, so add an 219 * extra LEB to compensate. ··· 310 * do_budget_space - reserve flash space for index and data growth. 311 * @c: UBIFS file-system description object 312 * 313 + * This function makes sure UBIFS has enough free LEBs for index growth and 314 + * data. 315 * 316 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index 317 * would take if it was consolidated and written to the flash. This guarantees 318 * that the "in-the-gaps" commit method always succeeds and UBIFS will always 319 * be able to commit dirty index. So this function basically adds amount of 320 * budgeted index space to the size of the current index, multiplies this by 3, 321 + * and makes sure this does not exceed the number of free LEBs. 322 * 323 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 324 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might 325 * be large, because UBIFS does not do any index consolidation as long as 326 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs 327 * will contain a lot of dirt. 328 + * o @c->min_idx_lebs is the number of LEBs the index presumably takes. IOW, 329 + * the index may be consolidated to take up to @c->min_idx_lebs LEBs. 330 * 331 * This function returns zero in case of success, and %-ENOSPC in case of 332 * failure. ··· 695 * This function calculates amount of free space to report to user-space. 696 * 697 * Because UBIFS may introduce substantial overhead (the index, node headers, 698 + * alignment, wastage at the end of LEBs, etc), it cannot report the real amount 699 + * of free flash space it has (well, because not all dirty space is reclaimable, 700 + * UBIFS does not actually know the real amount). If UBIFS did so, it would 701 + * break user expectations about what free space is. Users seem to be accustomed 702 + * to assuming that if the file-system reports N bytes of free space, they will 703 + * be able to fit a file of N bytes into the FS. This almost works for 704 * traditional file-systems, because they have way less overhead than UBIFS. 705 * So, to keep users happy, UBIFS tries to take the overhead into account. 706 */
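For reference, the arithmetic behind the reworked ubifs_calc_min_idx_lebs() can be modelled in plain user-space C. This is a sketch under the assumption that the function finishes by adding one extra LEB for the in-the-gaps method, as the final comment in the hunk indicates; the function name, variable names and test values below are illustrative, not taken from the kernel.

#include <stdio.h>

/*
 * Model of the new budgeting math: triple the index size, divide by the
 * effective index LEB size (c->idx_leb_size, i.e. leb_size minus the
 * worst-case index node size), rounding up, then add one LEB so that
 * the in-the-gaps commit method always has a head LEB to work with.
 */
static int calc_min_idx_lebs(long long idx_size, int idx_leb_size)
{
        idx_size += idx_size << 1;      /* thrice the index size */
        /* This division is what div_u64() does on the kernel fast path */
        return (int)((idx_size + idx_leb_size - 1) / idx_leb_size) + 1;
}

int main(void)
{
        int idx_leb_size = 130048;      /* e.g. 128 KiB LEB minus a 1 KiB index node */
        long long idx_size = 1000000;   /* old + budgeted + uncommitted index bytes */

        printf("min_idx_lebs = %d\n", calc_min_idx_lebs(idx_size, idx_leb_size));
        return 0;
}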
+3 -3
fs/ubifs/debug.c
··· 479 "bad or corrupted node)"); 480 else { 481 for (i = 0; i < nlen && dent->name[i]; i++) 482 - printk("%c", dent->name[i]); 483 } 484 - printk("\n"); 485 486 break; 487 } ··· 1214 1215 /* 1216 * Make sure the last key in our znode is less or 1217 - * equivalent than the the key in zbranch which goes 1218 * after our pointing zbranch. 1219 */ 1220 cmp = keys_cmp(c, max,
··· 479 "bad or corrupted node)"); 480 else { 481 for (i = 0; i < nlen && dent->name[i]; i++) 482 + printk(KERN_CONT "%c", dent->name[i]); 483 } 484 + printk(KERN_CONT "\n"); 485 486 break; 487 } ··· 1214 1215 /* 1216 * Make sure the last key in our znode is less or 1217 + * equivalent than the key in the zbranch which goes 1218 * after our pointing zbranch. 1219 */ 1220 cmp = keys_cmp(c, max,
+13 -3
fs/ubifs/file.c
··· 430 struct ubifs_inode *ui = ubifs_inode(inode); 431 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 433 struct page *page; 434 435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); ··· 445 446 if (!PageUptodate(page)) { 447 /* The page is not loaded from the flash */ 448 - if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 449 /* 450 * We change whole page so no need to load it. But we 451 * have to set the @PG_checked flag to make the further ··· 454 * the media. 455 */ 456 SetPageChecked(page); 457 - else { 458 err = do_readpage(page); 459 if (err) { 460 unlock_page(page); ··· 471 err = allocate_budget(c, page, ui, appending); 472 if (unlikely(err)) { 473 ubifs_assert(err == -ENOSPC); 474 /* 475 * Budgeting failed which means it would have to force 476 * write-back but didn't, because we set the @fast flag in the ··· 959 * whole index and correct all inode sizes, which is long an unacceptable. 960 * 961 * To prevent situations like this, UBIFS writes pages back only if they are 962 - * within last synchronized inode size, i.e. the the size which has been 963 * written to the flash media last time. Otherwise, UBIFS forces inode 964 * write-back, thus making sure the on-flash inode contains current inode size, 965 * and then keeps writing pages back.
··· 430 struct ubifs_inode *ui = ubifs_inode(inode); 431 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 433 + int skipped_read = 0; 434 struct page *page; 435 436 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); ··· 444 445 if (!PageUptodate(page)) { 446 /* The page is not loaded from the flash */ 447 + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { 448 /* 449 * We change whole page so no need to load it. But we 450 * have to set the @PG_checked flag to make the further ··· 453 * the media. 454 */ 455 SetPageChecked(page); 456 + skipped_read = 1; 457 + } else { 458 err = do_readpage(page); 459 if (err) { 460 unlock_page(page); ··· 469 err = allocate_budget(c, page, ui, appending); 470 if (unlikely(err)) { 471 ubifs_assert(err == -ENOSPC); 472 + /* 473 + * If we skipped reading the page because we were going to 474 + * write all of it, then it is not up to date. 475 + */ 476 + if (skipped_read) { 477 + ClearPageChecked(page); 478 + ClearPageUptodate(page); 479 + } 480 /* 481 * Budgeting failed which means it would have to force 482 * write-back but didn't, because we set the @fast flag in the ··· 949 * whole index and correcting all inode sizes, which is long and unacceptable. 950 * 951 * To prevent situations like this, UBIFS writes pages back only if they are 952 + * within the last synchronized inode size, i.e. the size which has been 953 * written to the flash media last time. Otherwise, UBIFS forces inode 954 * write-back, thus making sure the on-flash inode contains current inode size, 955 * and then keeps writing pages back.
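The whole-page test in this hunk is compact enough to deserve a gloss: pos & ~PAGE_CACHE_MASK extracts the offset within the page, so the read may be skipped only for a page-aligned, full-page write, and the new skipped_read bookkeeping undoes the page flags if budgeting then fails. A small user-space model of the test itself (assuming a 4 KiB page; the function name is hypothetical):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

/* True only when [pos, pos + len) covers one page exactly */
static int overwrites_whole_page(unsigned long long pos, unsigned long len)
{
        return !(pos & ~PAGE_MASK) && len == PAGE_SIZE;
}

int main(void)
{
        printf("%d\n", overwrites_whole_page(8192, 4096));  /* 1: aligned, full page */
        printf("%d\n", overwrites_whole_page(8192, 100));   /* 0: partial write */
        printf("%d\n", overwrites_whole_page(100, 4096));   /* 0: crosses pages */
        return 0;
}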
+6 -6
fs/ubifs/find.c
··· 478 * ubifs_find_free_space - find a data LEB with free space. 479 * @c: the UBIFS file-system description object 480 * @min_space: minimum amount of required free space 481 - * @free: contains amount of free space in the LEB on exit 482 * @squeeze: whether to try to find space in a non-empty LEB first 483 * 484 * This function looks for an LEB with at least @min_space bytes of free space. ··· 490 * failed to find a LEB with @min_space bytes of free space and other a negative 491 * error codes in case of failure. 492 */ 493 - int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 494 int squeeze) 495 { 496 const struct ubifs_lprops *lprops; ··· 558 spin_unlock(&c->space_lock); 559 } 560 561 - *free = lprops->free; 562 ubifs_release_lprops(c); 563 564 - if (*free == c->leb_size) { 565 /* 566 * Ensure that empty LEBs have been unmapped. They may not have 567 * been, for example, because of an unclean unmount. Also ··· 573 return err; 574 } 575 576 - dbg_find("found LEB %d, free %d", lnum, *free); 577 - ubifs_assert(*free >= min_space); 578 return lnum; 579 580 out:
··· 478 * ubifs_find_free_space - find a data LEB with free space. 479 * @c: the UBIFS file-system description object 480 * @min_space: minimum amount of required free space 481 + * @offs: contains the offset where free space starts on exit 482 * @squeeze: whether to try to find space in a non-empty LEB first 483 * 484 * This function looks for an LEB with at least @min_space bytes of free space. ··· 490 * failed to find a LEB with @min_space bytes of free space, and other negative 491 * error codes in case of failure. 492 */ 493 + int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, 494 int squeeze) 495 { 496 const struct ubifs_lprops *lprops; ··· 558 spin_unlock(&c->space_lock); 559 } 560 561 + *offs = c->leb_size - lprops->free; 562 ubifs_release_lprops(c); 563 564 + if (*offs == 0) { 565 /* 566 * Ensure that empty LEBs have been unmapped. They may not have 567 * been, for example, because of an unclean unmount. Also ··· 573 return err; 574 } 575 576 + dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs); 577 + ubifs_assert(*offs <= c->leb_size - min_space); 578 return lnum; 579 580 out:
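The interface change is mechanical but worth spelling out: callers used to receive the number of free bytes and derive the write offset themselves; now they receive the offset directly. A user-space sketch (illustrative leb_size and values, not from the kernel) shows how the old assertions translate into the new ones:

#include <assert.h>
#include <stdio.h>

static const int leb_size = 131072;     /* illustrative LEB size */

int main(void)
{
        int min_space = 512;
        int free_bytes = 2048;                   /* what the old API returned */
        int offs = leb_size - free_bytes;        /* what the new API returns */

        /* Old check: free >= min_space; new check, same condition: */
        assert(offs <= leb_size - min_space);

        /* Old check for an empty LEB: free == leb_size; new form: offs == 0 */
        assert((offs == 0) == (free_bytes == leb_size));

        printf("offs=%d free=%d\n", offs, leb_size - offs);
        return 0;
}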
+298 -134
fs/ubifs/gc.c
··· 47 * have to waste large pieces of free space at the end of LEB B, because nodes 48 * from LEB A would not fit. And the worst situation is when all nodes are of 49 * maximum size. So dark watermark is the amount of free + dirty space in LEB 50 - * which are guaranteed to be reclaimable. If LEB has less space, the GC migh 51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark 52 * watermark are "good" LEBs from GC's point of few. The other LEBs are not so 53 * good, and GC takes extra care when moving them. ··· 55 56 #include <linux/pagemap.h> 57 #include "ubifs.h" 58 - 59 - /* 60 - * GC tries to optimize the way it fit nodes to available space, and it sorts 61 - * nodes a little. The below constants are watermarks which define "large", 62 - * "medium", and "small" nodes. 63 - */ 64 - #define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4) 65 - #define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ 66 67 /* 68 * GC may need to move more than one LEB to make progress. The below constants ··· 108 } 109 110 /** 111 - * joinup - bring data nodes for an inode together. 112 - * @c: UBIFS file-system description object 113 - * @sleb: describes scanned LEB 114 - * @inum: inode number 115 - * @blk: block number 116 - * @data: list to which to add data nodes 117 * 118 - * This function looks at the first few nodes in the scanned LEB @sleb and adds 119 - * them to @data if they are data nodes from @inum and have a larger block 120 - * number than @blk. This function returns %0 on success and a negative error 121 - * code on failure. 122 */ 123 - static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, 124 - unsigned int blk, struct list_head *data) 125 { 126 - int err, cnt = 6, lnum = sleb->lnum, offs; 127 - struct ubifs_scan_node *snod, *tmp; 128 - union ubifs_key *key; 129 130 - list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 131 - key = &snod->key; 132 - if (key_inum(c, key) == inum && 133 - key_type(c, key) == UBIFS_DATA_KEY && 134 - key_block(c, key) > blk) { 135 - offs = snod->offs; 136 - err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); 137 - if (err < 0) 138 - return err; 139 - list_del(&snod->list); 140 - if (err) { 141 - list_add_tail(&snod->list, data); 142 - blk = key_block(c, key); 143 - } else 144 - kfree(snod); 145 - cnt = 6; 146 - } else if (--cnt == 0) 147 break; 148 } 149 - return 0; 150 } 151 152 /** 153 - * move_nodes - move nodes. 154 - * @c: UBIFS file-system description object 155 - * @sleb: describes nodes to move 156 * 157 - * This function moves valid nodes from data LEB described by @sleb to the GC 158 - * journal head. The obsolete nodes are dropped. 159 - * 160 - * When moving nodes we have to deal with classical bin-packing problem: the 161 - * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", 162 - * where the nodes in the @sleb->nodes list are the elements which should be 163 - * fit optimally to the bins. This function uses the "first fit decreasing" 164 - * strategy, although it does not really sort the nodes but just split them on 165 - * 3 classes - large, medium, and small, so they are roughly sorted. 166 - * 167 - * This function returns zero in case of success, %-EAGAIN if commit is 168 - * required, and other negative error codes in case of other failures. 
169 */ 170 - static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) 171 { 172 struct ubifs_scan_node *snod, *tmp; 173 - struct list_head data, large, medium, small; 174 - struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; 175 - int avail, err, min = INT_MAX; 176 - unsigned int blk = 0; 177 - ino_t inum = 0; 178 179 - INIT_LIST_HEAD(&data); 180 - INIT_LIST_HEAD(&large); 181 - INIT_LIST_HEAD(&medium); 182 - INIT_LIST_HEAD(&small); 183 184 - while (!list_empty(&sleb->nodes)) { 185 - struct list_head *lst = sleb->nodes.next; 186 - 187 - snod = list_entry(lst, struct ubifs_scan_node, list); 188 189 ubifs_assert(snod->type != UBIFS_IDX_NODE); 190 ubifs_assert(snod->type != UBIFS_REF_NODE); ··· 332 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 333 snod->offs, 0); 334 if (err < 0) 335 - goto out; 336 337 - list_del(lst); 338 if (!err) { 339 /* The node is obsolete, remove it from the list */ 340 kfree(snod); 341 continue; 342 } 343 344 - /* 345 - * Sort the list of nodes so that data nodes go first, large 346 - * nodes go second, and small nodes go last. 347 - */ 348 - if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { 349 - if (inum != key_inum(c, &snod->key)) { 350 - if (inum) { 351 - /* 352 - * Try to move data nodes from the same 353 - * inode together. 354 - */ 355 - err = joinup(c, sleb, inum, blk, &data); 356 - if (err) 357 - goto out; 358 - } 359 - inum = key_inum(c, &snod->key); 360 - blk = key_block(c, &snod->key); 361 - } 362 - list_add_tail(lst, &data); 363 - } else if (snod->len > MEDIUM_NODE_WM) 364 - list_add_tail(lst, &large); 365 - else if (snod->len > SMALL_NODE_WM) 366 - list_add_tail(lst, &medium); 367 - else 368 - list_add_tail(lst, &small); 369 370 - /* And find the smallest node */ 371 - if (snod->len < min) 372 - min = snod->len; 373 } 374 375 - /* 376 - * Join the tree lists so that we'd have one roughly sorted list 377 - * ('large' will be the head of the joined list). 378 - */ 379 - list_splice(&data, &large); 380 - list_splice(&medium, large.prev); 381 - list_splice(&small, large.prev); 382 383 if (wbuf->lnum == -1) { 384 /* ··· 406 */ 407 err = switch_gc_head(c); 408 if (err) 409 - goto out; 410 } 411 412 /* Write nodes to their new location. Use the first-fit strategy */ 413 while (1) { 414 - avail = c->leb_size - wbuf->offs - wbuf->used; 415 - list_for_each_entry_safe(snod, tmp, &large, list) { 416 - int new_lnum, new_offs; 417 418 if (avail < min) 419 break; 420 421 - if (snod->len > avail) 422 - /* This node does not fit */ 423 continue; 424 425 - cond_resched(); 426 - 427 - new_lnum = wbuf->lnum; 428 - new_offs = wbuf->offs + wbuf->used; 429 - err = ubifs_wbuf_write_nolock(wbuf, snod->node, 430 - snod->len); 431 if (err) 432 goto out; 433 - err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, 434 - snod->offs, new_lnum, new_offs, 435 - snod->len); 436 - if (err) 437 - goto out; 438 - 439 - avail = c->leb_size - wbuf->offs - wbuf->used; 440 - list_del(&snod->list); 441 - kfree(snod); 442 } 443 444 - if (list_empty(&large)) 445 break; 446 447 /* ··· 473 return 0; 474 475 out: 476 - list_for_each_entry_safe(snod, tmp, &large, list) { 477 - list_del(&snod->list); 478 - kfree(snod); 479 - } 480 return err; 481 } 482
··· 47 * have to waste large pieces of free space at the end of LEB B, because nodes 48 * from LEB A would not fit. And the worst situation is when all nodes are of 49 * maximum size. So dark watermark is the amount of free + dirty space in LEB 50 + * which are guaranteed to be reclaimable. If LEB has less space, the GC might 51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark 52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so 53 * good, and GC takes extra care when moving them. ··· 55 56 #include <linux/pagemap.h> 57 #include "ubifs.h" 58 59 /* 60 * GC may need to move more than one LEB to make progress. The below constants ··· 116 } 117 118 /** 119 + * list_sort - sort a list. 120 + * @priv: private data, passed to @cmp 121 + * @head: the list to sort 122 + * @cmp: the elements comparison function 123 * 124 + * This function has been implemented by Mark J Roberts <mjr@znex.org>. It 125 + * implements "merge sort", which has O(n log n) complexity. The list is sorted 126 + * in ascending order. 127 + * 128 + * The comparison function @cmp is supposed to return a negative value if @a is 129 + * less than @b, and a positive value if @a is greater than @b. If @a and @b are 130 + * equivalent, then it does not matter what this function returns. 131 */ 132 + static void list_sort(void *priv, struct list_head *head, 133 + int (*cmp)(void *priv, struct list_head *a, 134 + struct list_head *b)) 135 { 136 + struct list_head *p, *q, *e, *list, *tail, *oldhead; 137 + int insize, nmerges, psize, qsize, i; 138 139 + if (list_empty(head)) 140 + return; 141 + 142 + list = head->next; 143 + list_del(head); 144 + insize = 1; 145 + for (;;) { 146 + p = oldhead = list; 147 + list = tail = NULL; 148 + nmerges = 0; 149 + 150 + while (p) { 151 + nmerges++; 152 + q = p; 153 + psize = 0; 154 + for (i = 0; i < insize; i++) { 155 + psize++; 156 + q = q->next == oldhead ? NULL : q->next; 157 + if (!q) 158 + break; 159 + } 160 + 161 + qsize = insize; 162 + while (psize > 0 || (qsize > 0 && q)) { 163 + if (!psize) { 164 + e = q; 165 + q = q->next; 166 + qsize--; 167 + if (q == oldhead) 168 + q = NULL; 169 + } else if (!qsize || !q) { 170 + e = p; 171 + p = p->next; 172 + psize--; 173 + if (p == oldhead) 174 + p = NULL; 175 + } else if (cmp(priv, p, q) <= 0) { 176 + e = p; 177 + p = p->next; 178 + psize--; 179 + if (p == oldhead) 180 + p = NULL; 181 + } else { 182 + e = q; 183 + q = q->next; 184 + qsize--; 185 + if (q == oldhead) 186 + q = NULL; 187 + } 188 + if (tail) 189 + tail->next = e; 190 + else 191 + list = e; 192 + e->prev = tail; 193 + tail = e; 194 + } 195 + p = q; 196 + } 197 + 198 + tail->next = list; 199 + list->prev = tail; 200 + 201 + if (nmerges <= 1) 202 break; 203 + 204 + insize *= 2; 205 } 206 + 207 + head->next = list; 208 + head->prev = list->prev; 209 + list->prev->next = head; 210 + list->prev = head; 211 } 212 213 /** 214 + * data_nodes_cmp - compare 2 data nodes. 215 + * @priv: UBIFS file-system description object 216 + * @a: first data node 217 + * @b: second data node 218 * 219 + * This function compares data nodes @a and @b. Returns %1 if @a has greater 220 + * inode or block number, and %-1 otherwise. 
221 */ 222 + int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 223 + { 224 + ino_t inuma, inumb; 225 + struct ubifs_info *c = priv; 226 + struct ubifs_scan_node *sa, *sb; 227 + 228 + cond_resched(); 229 + sa = list_entry(a, struct ubifs_scan_node, list); 230 + sb = list_entry(b, struct ubifs_scan_node, list); 231 + ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY); 232 + ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY); 233 + 234 + inuma = key_inum(c, &sa->key); 235 + inumb = key_inum(c, &sb->key); 236 + 237 + if (inuma == inumb) { 238 + unsigned int blka = key_block(c, &sa->key); 239 + unsigned int blkb = key_block(c, &sb->key); 240 + 241 + if (blka <= blkb) 242 + return -1; 243 + } else if (inuma <= inumb) 244 + return -1; 245 + 246 + return 1; 247 + } 248 + 249 + /* 250 + * nondata_nodes_cmp - compare 2 non-data nodes. 251 + * @priv: UBIFS file-system description object 252 + * @a: first node 253 + * @b: second node 254 + * 255 + * This function compares nodes @a and @b. It makes sure that inode nodes go 256 + * first and are sorted by length in descending order. Directory entry nodes 257 + * go after inode nodes and are sorted in ascending hash value order. 258 + */ 259 + int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b) 260 + { 261 + int typea, typeb; 262 + ino_t inuma, inumb; 263 + struct ubifs_info *c = priv; 264 + struct ubifs_scan_node *sa, *sb; 265 + 266 + cond_resched(); 267 + sa = list_entry(a, struct ubifs_scan_node, list); 268 + sb = list_entry(b, struct ubifs_scan_node, list); 269 + typea = key_type(c, &sa->key); 270 + typeb = key_type(c, &sb->key); 271 + ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY); 272 + 273 + /* Inodes go before directory entries */ 274 + if (typea == UBIFS_INO_KEY) { 275 + if (typeb == UBIFS_INO_KEY) 276 + return sb->len - sa->len; 277 + return -1; 278 + } 279 + if (typeb == UBIFS_INO_KEY) 280 + return 1; 281 + 282 + ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY); 283 + inuma = key_inum(c, &sa->key); 284 + inumb = key_inum(c, &sb->key); 285 + 286 + if (inuma == inumb) { 287 + uint32_t hasha = key_hash(c, &sa->key); 288 + uint32_t hashb = key_hash(c, &sb->key); 289 + 290 + if (hasha <= hashb) 291 + return -1; 292 + } else if (inuma <= inumb) 293 + return -1; 294 + 295 + return 1; 296 + } 297 + 298 + /** 299 + * sort_nodes - sort nodes for GC. 300 + * @c: UBIFS file-system description object 301 + * @sleb: describes nodes to sort and contains the result on exit 302 + * @nondata: contains non-data nodes on exit 303 + * @min: minimum node size is returned here 304 + * 305 + * This function sorts the list of nodes to garbage collect. First of all, it 306 + * kills obsolete nodes and separates data and non-data nodes into the 307 + * @sleb->nodes and @nondata lists correspondingly. 308 + * 309 + * Data nodes are then sorted in block number order - this is important for 310 + * bulk-read; data nodes with lower inode number go before data nodes with 311 + * higher inode number, and data nodes with lower block number go before data 312 + * nodes with higher block number. 313 + * 314 + * Non-data nodes are sorted as follows. 315 + * o First go inode nodes - they are sorted in descending length order. 316 + * o Then go directory entry nodes - they are sorted in hash order, which 317 + * should supposedly optimize 'readdir()'. 
Direntry nodes with lower parent 318 + * inode number go before direntry nodes with higher parent inode number, 319 + * and direntry nodes with lower name hash values go before direntry nodes 320 + * with higher name hash values. 321 + * 322 + * This function returns zero in case of success and a negative error code in 323 + * case of failure. 324 + */ 325 + static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb, 326 + struct list_head *nondata, int *min) 327 { 328 struct ubifs_scan_node *snod, *tmp; 329 330 + *min = INT_MAX; 331 332 + /* Separate data nodes and non-data nodes */ 333 + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 334 + int err; 335 336 ubifs_assert(snod->type != UBIFS_IDX_NODE); 337 ubifs_assert(snod->type != UBIFS_REF_NODE); ··· 201 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 202 snod->offs, 0); 203 if (err < 0) 204 + return err; 205 206 if (!err) { 207 /* The node is obsolete, remove it from the list */ 208 + list_del(&snod->list); 209 kfree(snod); 210 continue; 211 } 212 213 + if (snod->len < *min) 214 + *min = snod->len; 215 216 + if (key_type(c, &snod->key) != UBIFS_DATA_KEY) 217 + list_move_tail(&snod->list, nondata); 218 } 219 220 + /* Sort data and non-data nodes */ 221 + list_sort(c, &sleb->nodes, &data_nodes_cmp); 222 + list_sort(c, nondata, &nondata_nodes_cmp); 223 + return 0; 224 + } 225 + 226 + /** 227 + * move_node - move a node. 228 + * @c: UBIFS file-system description object 229 + * @sleb: describes the LEB to move nodes from 230 + * @snod: the node to move 231 + * @wbuf: write-buffer to move node to 232 + * 233 + * This function moves node @snod to @wbuf, changes TNC correspondingly, and 234 + * destroys @snod. Returns zero in case of success and a negative error code in 235 + * case of failure. 236 + */ 237 + static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb, 238 + struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf) 239 + { 240 + int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used; 241 + 242 + cond_resched(); 243 + err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len); 244 + if (err) 245 + return err; 246 + 247 + err = ubifs_tnc_replace(c, &snod->key, sleb->lnum, 248 + snod->offs, new_lnum, new_offs, 249 + snod->len); 250 + list_del(&snod->list); 251 + kfree(snod); 252 + return err; 253 + } 254 + 255 + /** 256 + * move_nodes - move nodes. 257 + * @c: UBIFS file-system description object 258 + * @sleb: describes the LEB to move nodes from 259 + * 260 + * This function moves valid nodes from data LEB described by @sleb to the GC 261 + * journal head. This function returns zero in case of success, %-EAGAIN if 262 + * commit is required, and other negative error codes in case of other 263 + * failures. 264 + */ 265 + static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) 266 + { 267 + int err, min; 268 + LIST_HEAD(nondata); 269 + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; 270 271 if (wbuf->lnum == -1) { 272 /* ··· 256 */ 257 err = switch_gc_head(c); 258 if (err) 259 + return err; 260 } 261 + 262 + err = sort_nodes(c, sleb, &nondata, &min); 263 + if (err) 264 + goto out; 265 266 /* Write nodes to their new location. 
Use the first-fit strategy */ 267 while (1) { 268 + int avail; 269 + struct ubifs_scan_node *snod, *tmp; 270 271 + /* Move data nodes */ 272 + list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 273 + avail = c->leb_size - wbuf->offs - wbuf->used; 274 + if (snod->len > avail) 275 + /* 276 + * Do not skip data nodes in order to optimize 277 + * bulk-read. 278 + */ 279 + break; 280 + 281 + err = move_node(c, sleb, snod, wbuf); 282 + if (err) 283 + goto out; 284 + } 285 + 286 + /* Move non-data nodes */ 287 + list_for_each_entry_safe(snod, tmp, &nondata, list) { 288 + avail = c->leb_size - wbuf->offs - wbuf->used; 289 if (avail < min) 290 break; 291 292 + if (snod->len > avail) { 293 + /* 294 + * Keep going only if this is an inode with 295 + * some data. Otherwise stop and switch the GC 296 + * head. IOW, we assume that data-less inode 297 + * nodes and direntry nodes are roughly of the 298 + * same size. 299 + */ 300 + if (key_type(c, &snod->key) == UBIFS_DENT_KEY || 301 + snod->len == UBIFS_INO_NODE_SZ) 302 + break; 303 continue; 304 + } 305 306 + err = move_node(c, sleb, snod, wbuf); 307 if (err) 308 goto out; 309 } 310 311 + if (list_empty(&sleb->nodes) && list_empty(&nondata)) 312 break; 313 314 /* ··· 306 return 0; 307 308 out: 309 + list_splice_tail(&nondata, &sleb->nodes); 310 return err; 311 } 312
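The ordering imposed by data_nodes_cmp() is the interesting part of the new sort: first by inode number, then by block number, which is what keeps a file's data nodes adjacent for bulk-read after GC. The kernel comparator never returns 0 because list_sort() does not care how ties are broken, but a strict-order version of the same key works with standard qsort(), as in this user-space sketch (illustrative struct, not the real ubifs_scan_node):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the (inode number, block number) part of a data-node key */
struct data_key {
        unsigned long inum;
        unsigned int block;
};

/* Same total order as data_nodes_cmp(): inum first, then block */
static int data_key_cmp(const void *a, const void *b)
{
        const struct data_key *ka = a, *kb = b;

        if (ka->inum != kb->inum)
                return ka->inum < kb->inum ? -1 : 1;
        if (ka->block != kb->block)
                return ka->block < kb->block ? -1 : 1;
        return 0;
}

int main(void)
{
        struct data_key k[] = { {2, 7}, {1, 9}, {2, 0}, {1, 3} };
        size_t i, n = sizeof(k) / sizeof(k[0]);

        qsort(k, n, sizeof(k[0]), data_key_cmp);
        for (i = 0; i < n; i++)         /* prints 1:3 1:9 2:0 2:7 */
                printf("%lu:%u\n", k[i].inum, k[i].block);
        return 0;
}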
+3 -4
fs/ubifs/journal.c
··· 114 */ 115 static int reserve_space(struct ubifs_info *c, int jhead, int len) 116 { 117 - int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; 118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; 119 120 /* ··· 139 * Write buffer wasn't seek'ed or there is no enough space - look for an 140 * LEB with some empty space. 141 */ 142 - lnum = ubifs_find_free_space(c, len, &free, squeeze); 143 if (lnum >= 0) { 144 /* Found an LEB, add it to the journal head */ 145 - offs = c->leb_size - free; 146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs); 147 if (err) 148 goto out_return; ··· 1365 * @host: host inode 1366 * 1367 * This function writes the updated version of an extended attribute inode and 1368 - * the host inode tho the journal (to the base head). The host inode is written 1369 * after the extended attribute inode in order to guarantee that the extended 1370 * attribute will be flushed when the inode is synchronized by 'fsync()' and 1371 * consequently, the write-buffer is synchronized. This function returns zero
··· 114 */ 115 static int reserve_space(struct ubifs_info *c, int jhead, int len) 116 { 117 + int err = 0, err1, retries = 0, avail, lnum, offs, squeeze; 118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; 119 120 /* ··· 139 * Write buffer wasn't seek'ed or there is not enough space - look for an 140 * LEB with some empty space. 141 */ 142 + lnum = ubifs_find_free_space(c, len, &offs, squeeze); 143 if (lnum >= 0) { 144 /* Found an LEB, add it to the journal head */ 145 err = ubifs_add_bud_to_log(c, jhead, lnum, offs); 146 if (err) 147 goto out_return; ··· 1366 * @host: host inode 1367 * 1368 * This function writes the updated version of an extended attribute inode and 1369 + * the host inode to the journal (to the base head). The host inode is written 1370 * after the extended attribute inode in order to guarantee that the extended 1371 * attribute will be flushed when the inode is synchronized by 'fsync()' and 1372 * consequently, the write-buffer is synchronized. This function returns zero
+3 -3
fs/ubifs/key.h
··· 381 * @c: UBIFS file-system description object 382 * @key: the key to get hash from 383 */ 384 - static inline int key_hash(const struct ubifs_info *c, 385 - const union ubifs_key *key) 386 { 387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK; 388 } ··· 392 * @c: UBIFS file-system description object 393 * @k: the key to get hash from 394 */ 395 - static inline int key_hash_flash(const struct ubifs_info *c, const void *k) 396 { 397 const union ubifs_key *key = k; 398
··· 381 * @c: UBIFS file-system description object 382 * @key: the key to get hash from 383 */ 384 + static inline uint32_t key_hash(const struct ubifs_info *c, 385 + const union ubifs_key *key) 386 { 387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK; 388 } ··· 392 * @c: UBIFS file-system description object 393 * @k: the key to get hash from 394 */ 395 + static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k) 396 { 397 const union ubifs_key *key = k; 398
+2 -3
fs/ubifs/log.c
··· 239 } 240 241 /* 242 - * Make sure the the amount of space in buds will not exceed 243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time 244 * limits. 245 * ··· 367 bud->jhead, c->leb_size - bud->start, 368 c->cmt_bud_bytes); 369 rb_erase(p1, &c->buds); 370 - list_del(&bud->list); 371 /* 372 * If the commit does not finish, the recovery will need 373 * to replay the journal, in which case the old buds ··· 374 * commit i.e. do not allow them to be garbage 375 * collected. 376 */ 377 - list_add(&bud->list, &c->old_buds); 378 } 379 } 380 spin_unlock(&c->buds_lock);
··· 239 } 240 241 /* 242 + * Make sure the amount of space in buds will not exceed the 243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time 244 * limits. 245 * ··· 367 bud->jhead, c->leb_size - bud->start, 368 c->cmt_bud_bytes); 369 rb_erase(p1, &c->buds); 370 /* 371 * If the commit does not finish, the recovery will need 372 * to replay the journal, in which case the old buds ··· 375 * commit i.e. do not allow them to be garbage 376 * collected. 377 */ 378 + list_move(&bud->list, &c->old_buds); 379 } 380 } 381 spin_unlock(&c->buds_lock);
+19 -15
fs/ubifs/lpt_commit.c
··· 229 while (offs + len > c->leb_size) { 230 alen = ALIGN(offs, c->min_io_size); 231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 232 - dbg_chk_lpt_sz(c, 2, alen - offs); 233 err = alloc_lpt_leb(c, &lnum); 234 if (err) 235 goto no_space; ··· 272 if (offs + c->lsave_sz > c->leb_size) { 273 alen = ALIGN(offs, c->min_io_size); 274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 275 - dbg_chk_lpt_sz(c, 2, alen - offs); 276 err = alloc_lpt_leb(c, &lnum); 277 if (err) 278 goto no_space; ··· 292 if (offs + c->ltab_sz > c->leb_size) { 293 alen = ALIGN(offs, c->min_io_size); 294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 295 - dbg_chk_lpt_sz(c, 2, alen - offs); 296 err = alloc_lpt_leb(c, &lnum); 297 if (err) 298 goto no_space; ··· 416 alen, UBI_SHORTTERM); 417 if (err) 418 return err; 419 - dbg_chk_lpt_sz(c, 4, alen - wlen); 420 } 421 - dbg_chk_lpt_sz(c, 2, 0); 422 err = realloc_lpt_leb(c, &lnum); 423 if (err) 424 goto no_space; 425 - offs = 0; 426 - from = 0; 427 ubifs_assert(lnum >= c->lpt_first && 428 lnum <= c->lpt_last); 429 err = ubifs_leb_unmap(c, lnum); ··· 475 UBI_SHORTTERM); 476 if (err) 477 return err; 478 - dbg_chk_lpt_sz(c, 2, alen - wlen); 479 err = realloc_lpt_leb(c, &lnum); 480 if (err) 481 goto no_space; 482 - offs = 0; 483 ubifs_assert(lnum >= c->lpt_first && 484 lnum <= c->lpt_last); 485 err = ubifs_leb_unmap(c, lnum); ··· 502 UBI_SHORTTERM); 503 if (err) 504 return err; 505 - dbg_chk_lpt_sz(c, 2, alen - wlen); 506 err = realloc_lpt_leb(c, &lnum); 507 if (err) 508 goto no_space; 509 - offs = 0; 510 ubifs_assert(lnum >= c->lpt_first && 511 lnum <= c->lpt_last); 512 err = ubifs_leb_unmap(c, lnum); ··· 1754 /** 1755 * dbg_chk_lpt_sz - check LPT does not write more than LPT size. 1756 * @c: the UBIFS file-system description object 1757 - * @action: action 1758 * @len: length written 1759 * 1760 * This function returns %0 on success and a negative error code on failure. 1761 */ 1762 int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1763 { ··· 1921 lnum, offs); 1922 err = ubifs_unpack_nnode(c, buf, &nnode); 1923 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1924 - printk("%d:%d", nnode.nbranch[i].lnum, 1925 nnode.nbranch[i].offs); 1926 if (i != UBIFS_LPT_FANOUT - 1) 1927 - printk(", "); 1928 } 1929 - printk("\n"); 1930 break; 1931 } 1932 case UBIFS_LPT_LTAB:
··· 229 while (offs + len > c->leb_size) { 230 alen = ALIGN(offs, c->min_io_size); 231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 232 + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); 233 err = alloc_lpt_leb(c, &lnum); 234 if (err) 235 goto no_space; ··· 272 if (offs + c->lsave_sz > c->leb_size) { 273 alen = ALIGN(offs, c->min_io_size); 274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 275 + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); 276 err = alloc_lpt_leb(c, &lnum); 277 if (err) 278 goto no_space; ··· 292 if (offs + c->ltab_sz > c->leb_size) { 293 alen = ALIGN(offs, c->min_io_size); 294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 295 + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); 296 err = alloc_lpt_leb(c, &lnum); 297 if (err) 298 goto no_space; ··· 416 alen, UBI_SHORTTERM); 417 if (err) 418 return err; 419 } 420 + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); 421 err = realloc_lpt_leb(c, &lnum); 422 if (err) 423 goto no_space; 424 + offs = from = 0; 425 ubifs_assert(lnum >= c->lpt_first && 426 lnum <= c->lpt_last); 427 err = ubifs_leb_unmap(c, lnum); ··· 477 UBI_SHORTTERM); 478 if (err) 479 return err; 480 + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); 481 err = realloc_lpt_leb(c, &lnum); 482 if (err) 483 goto no_space; 484 + offs = from = 0; 485 ubifs_assert(lnum >= c->lpt_first && 486 lnum <= c->lpt_last); 487 err = ubifs_leb_unmap(c, lnum); ··· 504 UBI_SHORTTERM); 505 if (err) 506 return err; 507 + dbg_chk_lpt_sz(c, 2, c->leb_size - offs); 508 err = realloc_lpt_leb(c, &lnum); 509 if (err) 510 goto no_space; 511 + offs = from = 0; 512 ubifs_assert(lnum >= c->lpt_first && 513 lnum <= c->lpt_last); 514 err = ubifs_leb_unmap(c, lnum); ··· 1756 /** 1757 * dbg_chk_lpt_sz - check LPT does not write more than LPT size. 1758 * @c: the UBIFS file-system description object 1759 + * @action: what to do 1760 * @len: length written 1761 * 1762 * This function returns %0 on success and a negative error code on failure. 1763 + * The @action argument may be one of: 1764 + * o %0 - LPT debugging checking starts, initialize debugging variables; 1765 + * o %1 - wrote an LPT node, increase LPT size by @len bytes; 1766 + * o %2 - switched to a different LEB and wasted @len bytes; 1767 + * o %3 - check that we've written the right number of bytes. 1768 + * o %4 - wasted @len bytes; 1769 */ 1770 int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1771 { ··· 1917 lnum, offs); 1918 err = ubifs_unpack_nnode(c, buf, &nnode); 1919 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1920 + printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, 1921 nnode.nbranch[i].offs); 1922 if (i != UBIFS_LPT_FANOUT - 1) 1923 + printk(KERN_CONT ", "); 1924 } 1925 + printk(KERN_CONT "\n"); 1926 break; 1927 } 1928 case UBIFS_LPT_LTAB:
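The expanded dbg_chk_lpt_sz() comment describes a small accounting protocol, which can be modelled in user space. The sketch below is a hypothetical model of that protocol only, not the kernel function's actual internals; the point of the fix is that action %2 now charges the whole unused tail of the LEB (c->leb_size - offs) as wastage, rather than only the alignment slack (alen - offs):

#include <assert.h>

struct lpt_chk {
        long long written;      /* bytes written as LPT nodes (action 1) */
        long long wasted;       /* bytes lost to LEB switches etc (2, 4) */
};

static void chk_lpt_sz(struct lpt_chk *d, int action, int len,
                       long long expected)
{
        switch (action) {
        case 0:         /* LPT commit starts: reset the counters */
                d->written = d->wasted = 0;
                break;
        case 1:         /* wrote an LPT node of @len bytes */
                d->written += len;
                break;
        case 2:         /* switched LEBs, wasting the @len-byte tail */
        case 4:         /* other wastage of @len bytes */
                d->wasted += len;
                break;
        case 3:         /* done: everything written must be accounted */
                assert(d->written + d->wasted == expected);
                break;
        }
}

int main(void)
{
        struct lpt_chk d;

        chk_lpt_sz(&d, 0, 0, 0);        /* start */
        chk_lpt_sz(&d, 1, 4096, 0);     /* one LPT node */
        chk_lpt_sz(&d, 2, 1024, 0);     /* LEB switch wasted the tail */
        chk_lpt_sz(&d, 3, 0, 5120);     /* totals must add up */
        return 0;
}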
+23 -47
fs/ubifs/recovery.c
··· 425 * @lnum: LEB number of the LEB from which @buf was read 426 * @offs: offset from which @buf was read 427 * 428 - * This function scans @buf for more nodes and returns %0 is a node is found and 429 - * %1 if no more nodes are found. 430 */ 431 static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, 432 int lnum, int offs) 433 { 434 - int skip, next_offs = 0; 435 436 - if (len > UBIFS_DATA_NODE_SZ) { 437 - struct ubifs_ch *ch = buf; 438 - int dlen = le32_to_cpu(ch->len); 439 - 440 - if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && 441 - dlen <= UBIFS_MAX_DATA_NODE_SZ) 442 - /* The corrupt node looks like a data node */ 443 - next_offs = ALIGN(offs + dlen, 8); 444 } 445 - 446 - if (c->min_io_size == 1) 447 - skip = 8; 448 - else 449 - skip = ALIGN(offs + 1, c->min_io_size) - offs; 450 - 451 - offs += skip; 452 - buf += skip; 453 - len -= skip; 454 - while (len > 8) { 455 - struct ubifs_ch *ch = buf; 456 - uint32_t magic = le32_to_cpu(ch->magic); 457 - int ret; 458 - 459 - if (magic == UBIFS_NODE_MAGIC) { 460 - ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); 461 - if (ret == SCANNED_A_NODE || ret > 0) { 462 - /* 463 - * There is a small chance this is just data in 464 - * a data node, so check that possibility. e.g. 465 - * this is part of a file that itself contains 466 - * a UBIFS image. 467 - */ 468 - if (next_offs && offs + le32_to_cpu(ch->len) <= 469 - next_offs) 470 - continue; 471 - dbg_rcvry("unexpected node at %d:%d", lnum, 472 - offs); 473 - return 0; 474 - } 475 - } 476 - offs += 8; 477 - buf += 8; 478 - len -= 8; 479 - } 480 - return 1; 481 } 482 483 /**
··· 425 * @lnum: LEB number of the LEB from which @buf was read 426 * @offs: offset from which @buf was read 427 * 428 + * This function ensures that the corrupted node at @offs is the last thing 429 + * written to its LEB. This function returns %1 if no more data is found and 430 + * %0 if more data is found. 431 */ 432 static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, 433 int lnum, int offs) 434 { 435 + struct ubifs_ch *ch = buf; 436 + int skip, dlen = le32_to_cpu(ch->len); 437 438 + /* Check for empty space after the corrupt node's common header */ 439 + skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; 440 + if (is_empty(buf + skip, len - skip)) 441 + return 1; 442 + /* 443 + * The area after the common header size is not empty, so the common 444 + * header must be intact. Check it. 445 + */ 446 + if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { 447 + dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); 448 + return 0; 449 } 450 + /* Now that we know the corrupt node's length, we can skip over it */ 451 + skip = ALIGN(offs + dlen, c->min_io_size) - offs; 452 + /* After which there should be empty space */ 453 + if (is_empty(buf + skip, len - skip)) 454 + return 1; 455 + dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip); 456 + return 0; 457 } 458 459 /**
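The rewritten no_more_nodes() is easier to follow as a two-step test: if everything after the corrupt node's common header is erased flash, the corruption was the last write; otherwise the header must be intact, so its length field can be trusted to skip the whole node, after which erased flash is again required. A user-space model of that shape follows; header validation is stubbed out with an hdr_ok flag, and offsets are simplified to start at 0:

#include <stdio.h>

#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))
#define CH_SZ           24      /* UBIFS_CH_SZ, the common header size */

/* Erased flash reads back as all-0xFF bytes */
static int is_empty(const unsigned char *buf, int len)
{
        int i;

        for (i = 0; i < len; i++)
                if (buf[i] != 0xff)
                        return 0;
        return 1;
}

/* hdr_ok stands in for ubifs_check_node() finding an intact header */
static int no_more_nodes(const unsigned char *buf, int len, int node_len,
                         int min_io_size, int hdr_ok)
{
        int skip = ALIGN(CH_SZ, min_io_size);

        if (is_empty(buf + skip, len - skip))
                return 1;       /* corrupt node was the last thing written */
        if (!hdr_ok)
                return 0;       /* cannot trust the length: assume more data */
        skip = ALIGN(node_len, min_io_size);
        return is_empty(buf + skip, len - skip);
}

int main(void)
{
        unsigned char leb[512];
        int i;

        for (i = 0; i < 512; i++)
                leb[i] = 0xff;          /* erased LEB */
        leb[0] = 0x31;                  /* a lone corrupt node at offset 0 */
        printf("%d\n", no_more_nodes(leb, 512, 100, 8, 1)); /* prints 1 */
        return 0;
}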
+1 -1
fs/ubifs/replay.c
··· 143 dirty -= c->leb_size - lp->free; 144 /* 145 * If the replay order was perfect the dirty space would now be 146 - * zero. The order is not perfect because the the journal heads 147 * race with each other. This is not a problem but is does mean 148 * that the dirty space may temporarily exceed c->leb_size 149 * during the replay.
··· 143 dirty -= c->leb_size - lp->free; 144 /* 145 * If the replay order was perfect the dirty space would now be 146 + * zero. The order is not perfect because the journal heads 147 * race with each other. This is not a problem but it does mean 148 * that the dirty space may temporarily exceed c->leb_size 149 * during the replay.
+29 -7
fs/ubifs/sb.c
··· 193 if (tmp64 > DEFAULT_MAX_RP_SIZE) 194 tmp64 = DEFAULT_MAX_RP_SIZE; 195 sup->rp_size = cpu_to_le64(tmp64); 196 197 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); 198 kfree(sup); ··· 533 if (IS_ERR(sup)) 534 return PTR_ERR(sup); 535 536 /* 537 * The software supports all previous versions but not future versions, 538 * due to the unavailability of time-travelling equipment. 539 */ 540 - c->fmt_version = le32_to_cpu(sup->fmt_version); 541 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 542 - ubifs_err("on-flash format version is %d, but software only " 543 - "supports up to version %d", c->fmt_version, 544 - UBIFS_FORMAT_VERSION); 545 - err = -EINVAL; 546 - goto out; 547 } 548 549 if (c->fmt_version < 3) { ··· 646 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; 647 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; 648 c->main_first = c->leb_cnt - c->main_lebs; 649 - c->report_rp_size = ubifs_reported_space(c, c->rp_size); 650 651 err = validate_sb(c, sup); 652 out:
··· 193 if (tmp64 > DEFAULT_MAX_RP_SIZE) 194 tmp64 = DEFAULT_MAX_RP_SIZE; 195 sup->rp_size = cpu_to_le64(tmp64); 196 + sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); 197 198 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); 199 kfree(sup); ··· 532 if (IS_ERR(sup)) 533 return PTR_ERR(sup); 534 535 + c->fmt_version = le32_to_cpu(sup->fmt_version); 536 + c->ro_compat_version = le32_to_cpu(sup->ro_compat_version); 537 + 538 /* 539 * The software supports all previous versions but not future versions, 540 * due to the unavailability of time-travelling equipment. 541 */ 542 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 543 + struct super_block *sb = c->vfs_sb; 544 + int mounting_ro = sb->s_flags & MS_RDONLY; 545 + 546 + ubifs_assert(!c->ro_media || mounting_ro); 547 + if (!mounting_ro || 548 + c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { 549 + ubifs_err("on-flash format version is w%d/r%d, but " 550 + "software only supports up to version " 551 + "w%d/r%d", c->fmt_version, 552 + c->ro_compat_version, UBIFS_FORMAT_VERSION, 553 + UBIFS_RO_COMPAT_VERSION); 554 + if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { 555 + ubifs_msg("only R/O mounting is possible"); 556 + err = -EROFS; 557 + } else 558 + err = -EINVAL; 559 + goto out; 560 + } 561 + 562 + /* 563 + * The FS is mounted R/O, and the media format is 564 + * R/O-compatible with the UBIFS implementation, so we can 565 + * mount. 566 + */ 567 + c->rw_incompat = 1; 568 } 569 570 if (c->fmt_version < 3) { ··· 623 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; 624 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; 625 c->main_first = c->leb_cnt - c->main_lebs; 626 627 err = validate_sb(c, sup); 628 out:
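The version gate added here has three outcomes, which are easiest to see extracted into a pure function. Below is a user-space sketch of the same decision; the constants match this merge, and errno values stand in for the kernel's error returns. In the kernel, the successful R/O path additionally sets c->rw_incompat so a later remount to R/W is refused, as the super.c hunk further down shows.

#include <errno.h>
#include <stdio.h>

#define UBIFS_FORMAT_VERSION    4       /* newest R/W-supported format */
#define UBIFS_RO_COMPAT_VERSION 0       /* newest R/O-supported format */

/* 0: mount proceeds; -EROFS: retry read-only; -EINVAL: cannot mount */
static int check_format(int fmt_version, int ro_compat_version,
                        int mounting_ro)
{
        if (fmt_version <= UBIFS_FORMAT_VERSION)
                return 0;               /* fully supported, R/W is fine */
        if (ro_compat_version > UBIFS_RO_COMPAT_VERSION)
                return -EINVAL;         /* too new even for R/O mounting */
        /* Newer format, but R/O-compatible: allow only R/O mounts */
        return mounting_ro ? 0 : -EROFS;
}

int main(void)
{
        printf("%d\n", check_format(4, 0, 0));  /* 0: current format */
        printf("%d\n", check_format(5, 0, 1));  /* 0: newer, mounted R/O */
        printf("%d\n", check_format(5, 0, 0));  /* -EROFS: must be R/O */
        printf("%d\n", check_format(5, 1, 1));  /* -EINVAL: too new */
        return 0;
}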
+2 -4
fs/ubifs/shrinker.c
··· 206 * Move this one to the end of the list to provide some 207 * fairness. 208 */ 209 - list_del(&c->infos_list); 210 - list_add_tail(&c->infos_list, &ubifs_infos); 211 mutex_unlock(&c->umount_mutex); 212 if (freed >= nr) 213 break; ··· 262 } 263 264 if (i == 1) { 265 - list_del(&c->infos_list); 266 - list_add_tail(&c->infos_list, &ubifs_infos); 267 spin_unlock(&ubifs_infos_lock); 268 269 ubifs_request_bg_commit(c);
··· 206 * Move this one to the end of the list to provide some 207 * fairness. 208 */ 209 + list_move_tail(&c->infos_list, &ubifs_infos); 210 mutex_unlock(&c->umount_mutex); 211 if (freed >= nr) 212 break; ··· 263 } 264 265 if (i == 1) { 266 + list_move_tail(&c->infos_list, &ubifs_infos); 267 spin_unlock(&ubifs_infos_lock); 268 269 ubifs_request_bg_commit(c);
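This hunk and the log.c hunk above are the "list usage cleanup" commit: an open-coded list_del() followed by list_add()/list_add_tail() collapses into list_move()/list_move_tail(). The equivalence is plain when the helpers are written out, as in this user-space reimplementation in the style of <linux/list.h>:

#include <stdio.h>

struct list_head {
        struct list_head *next, *prev;
};

static void __list_del(struct list_head *prev, struct list_head *next)
{
        next->prev = prev;
        prev->next = next;
}

static void __list_add(struct list_head *entry,
                       struct list_head *prev, struct list_head *next)
{
        next->prev = entry;
        entry->next = next;
        entry->prev = prev;
        prev->next = entry;
}

/* list_move_tail(e, h) is exactly list_del(e) + list_add_tail(e, h) */
static void list_move_tail(struct list_head *entry, struct list_head *head)
{
        __list_del(entry->prev, entry->next);
        __list_add(entry, head->prev, head);
}

int main(void)
{
        struct list_head head = { &head, &head };
        struct list_head a, b;

        __list_add(&a, &head, head.next);       /* head -> a */
        __list_add(&b, &a, a.next);             /* head -> a -> b */
        list_move_tail(&a, &head);              /* head -> b -> a */
        printf("%s\n", head.next == &b && head.prev == &a ? "ok" : "bug");
        return 0;
}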
+27 -10
fs/ubifs/super.c
··· 421 seq_printf(s, ",no_chk_data_crc"); 422 423 if (c->mount_opts.override_compr) { 424 - seq_printf(s, ",compr="); 425 - seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); 426 } 427 428 return 0; ··· 700 if (err) 701 return err; 702 703 return 0; 704 } 705 ··· 718 long long tmp64; 719 720 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 721 722 /* 723 * Calculate total amount of FS blocks. This number is not used ··· 1204 goto out_cbuf; 1205 1206 /* Create background thread */ 1207 - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1208 if (IS_ERR(c->bgt)) { 1209 err = PTR_ERR(c->bgt); 1210 c->bgt = NULL; ··· 1321 else { 1322 c->need_recovery = 0; 1323 ubifs_msg("recovery completed"); 1324 - /* GC LEB has to be empty and taken at this point */ 1325 - ubifs_assert(c->lst.taken_empty_lebs == 1); 1326 } 1327 } else 1328 - ubifs_assert(c->lst.taken_empty_lebs == 1); 1329 1330 err = dbg_check_filesystem(c); 1331 if (err) ··· 1351 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1352 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " 1353 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); 1354 - ubifs_msg("media format: %d (latest is %d)", 1355 - c->fmt_version, UBIFS_FORMAT_VERSION); 1356 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); 1357 ubifs_msg("reserved for root: %llu bytes (%llu KiB)", 1358 c->report_rp_size, c->report_rp_size >> 10); ··· 1493 { 1494 int err, lnum; 1495 1496 mutex_lock(&c->umount_mutex); 1497 dbg_save_space_info(c); 1498 c->remounting_rw = 1; ··· 1571 ubifs_create_buds_lists(c); 1572 1573 /* Create background thread */ 1574 - c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1575 if (IS_ERR(c->bgt)) { 1576 err = PTR_ERR(c->bgt); 1577 c->bgt = NULL; ··· 1792 c->bu.buf = NULL; 1793 } 1794 1795 - ubifs_assert(c->lst.taken_empty_lebs == 1); 1796 return 0; 1797 } 1798
··· 421 seq_printf(s, ",no_chk_data_crc"); 422 423 if (c->mount_opts.override_compr) { 424 + seq_printf(s, ",compr=%s", 425 + ubifs_compr_name(c->mount_opts.compr_type)); 426 } 427 428 return 0; ··· 700 if (err) 701 return err; 702 703 + /* Initialize effective LEB size used in budgeting calculations */ 704 + c->idx_leb_size = c->leb_size - c->max_idx_node_sz; 705 return 0; 706 } 707 ··· 716 long long tmp64; 717 718 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 719 + c->report_rp_size = ubifs_reported_space(c, c->rp_size); 720 721 /* 722 * Calculate total amount of FS blocks. This number is not used ··· 1201 goto out_cbuf; 1202 1203 /* Create background thread */ 1204 + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); 1205 if (IS_ERR(c->bgt)) { 1206 err = PTR_ERR(c->bgt); 1207 c->bgt = NULL; ··· 1318 else { 1319 c->need_recovery = 0; 1320 ubifs_msg("recovery completed"); 1321 + /* 1322 + * GC LEB has to be empty and taken at this point. But 1323 + * the journal head LEBs may also be accounted as 1324 + * "empty taken" if they are empty. 1325 + */ 1326 + ubifs_assert(c->lst.taken_empty_lebs > 0); 1327 } 1328 } else 1329 + ubifs_assert(c->lst.taken_empty_lebs > 0); 1330 1331 err = dbg_check_filesystem(c); 1332 if (err) ··· 1344 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1345 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " 1346 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); 1347 + ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)", 1348 + c->fmt_version, c->ro_compat_version, 1349 + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); 1350 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); 1351 ubifs_msg("reserved for root: %llu bytes (%llu KiB)", 1352 c->report_rp_size, c->report_rp_size >> 10); ··· 1485 { 1486 int err, lnum; 1487 1488 + if (c->rw_incompat) { 1489 + ubifs_err("the file-system is not R/W-compatible"); 1490 + ubifs_msg("on-flash format version is w%d/r%d, but software " 1491 + "only supports up to version w%d/r%d", c->fmt_version, 1492 + c->ro_compat_version, UBIFS_FORMAT_VERSION, 1493 + UBIFS_RO_COMPAT_VERSION); 1494 + return -EROFS; 1495 + } 1496 + 1497 mutex_lock(&c->umount_mutex); 1498 dbg_save_space_info(c); 1499 c->remounting_rw = 1; ··· 1554 ubifs_create_buds_lists(c); 1555 1556 /* Create background thread */ 1557 + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); 1558 if (IS_ERR(c->bgt)) { 1559 err = PTR_ERR(c->bgt); 1560 c->bgt = NULL; ··· 1775 c->bu.buf = NULL; 1776 } 1777 1778 + ubifs_assert(c->lst.taken_empty_lebs > 0); 1779 return 0; 1780 } 1781
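One hunk here is a hardening fix rather than a cleanup: kthread_create() takes a printf-style format string, so passing c->bgt_name directly would let any '%' in the thread name be interpreted as a conversion specifier. Passing "%s" as the format is the standard cure (the same applies to the seq_printf() change above). The pattern in plain user-space C:

#include <stdio.h>

int main(void)
{
        /* Imagine this name arrived from somewhere less trustworthy */
        const char *name = "ubifs_bgt0_0";

        /*
         * Unsafe: the string itself becomes the format, so a stray
         * "%s" or "%n" inside it would be acted upon:
         *
         *      printf(name);
         *
         * Safe, mirroring the kthread_create(..., "%s", name) fix:
         */
        printf("%s\n", name);
        return 0;
}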
+1 -1
fs/ubifs/tnc.c
··· 1252 * splitting in the middle of the colliding sequence. Also, when 1253 * removing the leftmost key, we would have to correct the key of the 1254 * parent node, which would introduce additional complications. Namely, 1255 - * if we changed the the leftmost key of the parent znode, the garbage 1256 * collector would be unable to find it (GC is doing this when GC'ing 1257 * indexing LEBs). Although we already have an additional RB-tree where 1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
··· 1252 * splitting in the middle of the colliding sequence. Also, when 1253 * removing the leftmost key, we would have to correct the key of the 1254 * parent node, which would introduce additional complications. Namely, 1255 + * if we changed the leftmost key of the parent znode, the garbage 1256 * collector would be unable to find it (GC is doing this when GC'ing 1257 * indexing LEBs). Although we already have an additional RB-tree where 1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
+27 -3
fs/ubifs/ubifs-media.h
··· 36 /* UBIFS node magic number (must not have the padding byte first or last) */ 37 #define UBIFS_NODE_MAGIC 0x06101831 38 39 - /* UBIFS on-flash format version */ 40 #define UBIFS_FORMAT_VERSION 4 41 42 /* Minimum logical eraseblock size in bytes */ 43 #define UBIFS_MIN_LEB_SZ (15*1024) ··· 75 76 /* 77 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes 78 - * shorter than uncompressed data length, UBIFS preferes to leave this data 79 * node uncompress, because it'll be read faster. 80 */ 81 #define UBIFS_MIN_COMPRESS_DIFF 64 ··· 608 * @padding2: reserved for future, zeroes 609 * @time_gran: time granularity in nanoseconds 610 * @uuid: UUID generated when the file system image was created 611 */ 612 struct ubifs_sb_node { 613 struct ubifs_ch ch; ··· 635 __le64 rp_size; 636 __le32 time_gran; 637 __u8 uuid[16]; 638 - __u8 padding2[3972]; 639 } __attribute__ ((packed)); 640 641 /**
··· 36 /* UBIFS node magic number (must not have the padding byte first or last) */ 37 #define UBIFS_NODE_MAGIC 0x06101831 38 39 + /* 40 + * UBIFS on-flash format version. This version is increased when the on-flash 41 + * format changes. If this happens, UBIFS will support older versions as well, 42 + * but older UBIFS code will not support newer formats. Format changes will be 43 + * rare and made only when absolutely necessary, e.g. to fix a bug or to add 44 + * a new feature. 45 + * 46 + * UBIFS went into the mainline kernel with format version 4. The older formats 47 + * were development formats. 48 + */ 49 #define UBIFS_FORMAT_VERSION 4 50 + 51 + /* 52 + * Read-only compatibility version. If the UBIFS format is changed, older UBIFS 53 + * implementations will not be able to mount newer formats in read-write mode. 54 + * However, depending on the change, it may be possible to mount newer formats 55 + * in R/O mode. This is indicated by the R/O compatibility version which is 56 + * stored in the super-block. 57 + * 58 + * This is needed to support boot-loaders which only need R/O mounting. With 59 + * this flag it is possible to do UBIFS format changes without having to update 60 + * boot-loaders. 61 + */ 62 + #define UBIFS_RO_COMPAT_VERSION 0 63 64 /* Minimum logical eraseblock size in bytes */ 65 #define UBIFS_MIN_LEB_SZ (15*1024) ··· 53 54 /* 55 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes 56 + * shorter than uncompressed data length, UBIFS prefers to leave this data 57 * node uncompressed, because it'll be read faster. 58 */ 59 #define UBIFS_MIN_COMPRESS_DIFF 64 ··· 586 * @padding2: reserved for future, zeroes 587 * @time_gran: time granularity in nanoseconds 588 * @uuid: UUID generated when the file system image was created 589 + * @ro_compat_version: UBIFS R/O compatibility version 590 */ 591 struct ubifs_sb_node { 592 struct ubifs_ch ch; ··· 612 __le64 rp_size; 613 __le32 time_gran; 614 __u8 uuid[16]; 615 + __le32 ro_compat_version; 616 + __u8 padding2[3968]; 617 } __attribute__ ((packed)); 618 619 /**
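The layout change is deliberately size-neutral: the new __le32 field is carved out of the reserved padding2 area (3972 bytes becoming 4 + 3968), so the on-flash superblock node keeps its exact size and old images remain parseable. A compile-time check of that invariant, sketched in C11 with a gcc-style packed attribute like the kernel's; only the tail of the structure is modelled:

#include <stdint.h>

struct sb_tail_old {
        uint8_t  uuid[16];
        uint8_t  padding2[3972];
} __attribute__ ((packed));

struct sb_tail_new {
        uint8_t  uuid[16];
        uint32_t ro_compat_version;     /* the 4 bytes taken from padding */
        uint8_t  padding2[3968];
} __attribute__ ((packed));

/* The superblock node must not change size across format revisions */
_Static_assert(sizeof(struct sb_tail_old) == sizeof(struct sb_tail_new),
               "on-flash superblock size changed");

int main(void) { return 0; }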
+10 -3
fs/ubifs/ubifs.h
··· 934 * by @commit_sem 935 * @cnt_lock: protects @highest_inum and @max_sqnum counters 936 * @fmt_version: UBIFS on-flash format version 937 * @uuid: UUID from super block 938 * 939 * @lhead_lnum: log head logical eraseblock number ··· 967 * recovery) 968 * @bulk_read: enable bulk-reads 969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) 970 * 971 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 972 * @calc_idx_sz ··· 1017 * @min_io_shift: number of bits in @min_io_size minus one 1018 * @leb_size: logical eraseblock size in bytes 1019 * @half_leb_size: half LEB size 1020 * @leb_cnt: count of logical eraseblocks 1021 * @max_leb_cnt: maximum count of logical eraseblocks 1022 * @old_leb_cnt: count of logical eraseblocks before re-size ··· 1136 * previous commit start 1137 * @uncat_list: list of un-categorized LEBs 1138 * @empty_list: list of empty LEBs 1139 - * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) 1140 - * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) 1141 * @freeable_cnt: number of freeable LEBs in @freeable_list 1142 * 1143 * @ltab_lnum: LEB number of LPT's own lprops table ··· 1181 unsigned long long cmt_no; 1182 spinlock_t cnt_lock; 1183 int fmt_version; 1184 unsigned char uuid[16]; 1185 1186 int lhead_lnum; ··· 1210 unsigned int no_chk_data_crc:1; 1211 unsigned int bulk_read:1; 1212 unsigned int default_compr:2; 1213 1214 struct mutex tnc_mutex; 1215 struct ubifs_zbranch zroot; ··· 1259 int min_io_shift; 1260 int leb_size; 1261 int half_leb_size; 1262 int leb_cnt; 1263 int max_leb_cnt; 1264 int old_leb_cnt; ··· 1507 long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1508 1509 /* find.c */ 1510 - int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 1511 int squeeze); 1512 int ubifs_find_free_leb_for_idx(struct ubifs_info *c); 1513 int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,
··· 934 * by @commit_sem 935 * @cnt_lock: protects @highest_inum and @max_sqnum counters 936 * @fmt_version: UBIFS on-flash format version 937 + * @ro_compat_version: R/O compatibility version 938 * @uuid: UUID from super block 939 * 940 * @lhead_lnum: log head logical eraseblock number ··· 966 * recovery) 967 * @bulk_read: enable bulk-reads 968 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) 969 + * @rw_incompat: the media is not R/W compatible 970 * 971 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 972 * @calc_idx_sz ··· 1015 * @min_io_shift: number of bits in @min_io_size minus one 1016 * @leb_size: logical eraseblock size in bytes 1017 * @half_leb_size: half LEB size 1018 + * @idx_leb_size: how many bytes of an LEB are effectively available when it is 1019 + * used to store indexing nodes (@leb_size - @max_idx_node_sz) 1020 * @leb_cnt: count of logical eraseblocks 1021 * @max_leb_cnt: maximum count of logical eraseblocks 1022 * @old_leb_cnt: count of logical eraseblocks before re-size ··· 1132 * previous commit start 1133 * @uncat_list: list of un-categorized LEBs 1134 * @empty_list: list of empty LEBs 1135 + * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) 1136 + * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) 1137 * @freeable_cnt: number of freeable LEBs in @freeable_list 1138 * 1139 * @ltab_lnum: LEB number of LPT's own lprops table ··· 1177 unsigned long long cmt_no; 1178 spinlock_t cnt_lock; 1179 int fmt_version; 1180 + int ro_compat_version; 1181 unsigned char uuid[16]; 1182 1183 int lhead_lnum; ··· 1205 unsigned int no_chk_data_crc:1; 1206 unsigned int bulk_read:1; 1207 unsigned int default_compr:2; 1208 + unsigned int rw_incompat:1; 1209 1210 struct mutex tnc_mutex; 1211 struct ubifs_zbranch zroot; ··· 1253 int min_io_shift; 1254 int leb_size; 1255 int half_leb_size; 1256 + int idx_leb_size; 1257 int leb_cnt; 1258 int max_leb_cnt; 1259 int old_leb_cnt; ··· 1500 long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1501 1502 /* find.c */ 1503 + int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, 1504 int squeeze); 1505 int ubifs_find_free_leb_for_idx(struct ubifs_info *c); 1506 int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,