Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
"15 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
mm: numa: mark huge PTEs young when clearing NUMA hinting faults
mm: numa: slow PTE scan rate if migration failures occur
mm: numa: preserve PTE write permissions across a NUMA hinting fault
mm: numa: group related processes based on VMA flags instead of page table flags
hfsplus: fix B-tree corruption after insertion at position 0
MAINTAINERS: add Jan as DMI/SMBIOS support maintainer
fs/affs/file.c: unlock/release page on error
mm/page_alloc.c: call kernel_map_pages in unset_migratetype_isolate
mm/slub: fix lockups on PREEMPT && !SMP kernels
mm/memory hotplug: postpone the reset of obsolete pgdat
MAINTAINERS: correct rtc armada38x pattern entry
mm/pagewalk.c: prevent positive return value of walk_page_test() from being passed to callers
mm: fix anon_vma->degree underflow in anon_vma endless growing prevention
drivers/rtc/rtc-mrst: fix suspend/resume
aoe: update aoe maintainer information

+106 -71
+10 -3
MAINTAINERS
··· 1186 1186 L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) 1187 1187 S: Maintained 1188 1188 F: arch/arm/mach-mvebu/ 1189 - F: drivers/rtc/armada38x-rtc 1189 + F: drivers/rtc/rtc-armada38x.c 1190 1190 1191 1191 ARM/Marvell Berlin SoC support 1192 1192 M: Sebastian Hesselbarth <sebastian.hesselbarth@gmail.com> ··· 1675 1675 F: include/linux/platform_data/at24.h 1676 1676 1677 1677 ATA OVER ETHERNET (AOE) DRIVER 1678 - M: "Ed L. Cashin" <ecashin@coraid.com> 1679 - W: http://support.coraid.com/support/linux 1678 + M: "Ed L. Cashin" <ed.cashin@acm.org> 1679 + W: http://www.openaoe.org/ 1680 1680 S: Supported 1681 1681 F: Documentation/aoe/ 1682 1682 F: drivers/block/aoe/ ··· 3251 3251 S: Maintained 3252 3252 F: Documentation/hwmon/dme1737 3253 3253 F: drivers/hwmon/dme1737.c 3254 + 3255 + DMI/SMBIOS SUPPORT 3256 + M: Jean Delvare <jdelvare@suse.de> 3257 + S: Maintained 3258 + F: drivers/firmware/dmi-id.c 3259 + F: drivers/firmware/dmi_scan.c 3260 + F: include/linux/dmi.h 3254 3261 3255 3262 DOCKING STATION DRIVER 3256 3263 M: Shaohua Li <shaohua.li@intel.com>
+9 -8
drivers/rtc/rtc-mrst.c
··· 413 413 mrst->dev = NULL; 414 414 } 415 415 416 - #ifdef CONFIG_PM 417 - static int mrst_suspend(struct device *dev, pm_message_t mesg) 416 + #ifdef CONFIG_PM_SLEEP 417 + static int mrst_suspend(struct device *dev) 418 418 { 419 419 struct mrst_rtc *mrst = dev_get_drvdata(dev); 420 420 unsigned char tmp; ··· 453 453 */ 454 454 static inline int mrst_poweroff(struct device *dev) 455 455 { 456 - return mrst_suspend(dev, PMSG_HIBERNATE); 456 + return mrst_suspend(dev); 457 457 } 458 458 459 459 static int mrst_resume(struct device *dev) ··· 490 490 return 0; 491 491 } 492 492 493 + static SIMPLE_DEV_PM_OPS(mrst_pm_ops, mrst_suspend, mrst_resume); 494 + #define MRST_PM_OPS (&mrst_pm_ops) 495 + 493 496 #else 494 - #define mrst_suspend NULL 495 - #define mrst_resume NULL 497 + #define MRST_PM_OPS NULL 496 498 497 499 static inline int mrst_poweroff(struct device *dev) 498 500 { ··· 531 529 .remove = vrtc_mrst_platform_remove, 532 530 .shutdown = vrtc_mrst_platform_shutdown, 533 531 .driver = { 534 - .name = (char *) driver_name, 535 - .suspend = mrst_suspend, 536 - .resume = mrst_resume, 532 + .name = driver_name, 533 + .pm = MRST_PM_OPS, 537 534 } 538 535 }; 539 536
+12 -7
fs/affs/file.c
··· 699 699 boff = tmp % bsize; 700 700 if (boff) { 701 701 bh = affs_bread_ino(inode, bidx, 0); 702 - if (IS_ERR(bh)) 703 - return PTR_ERR(bh); 702 + if (IS_ERR(bh)) { 703 + written = PTR_ERR(bh); 704 + goto err_first_bh; 705 + } 704 706 tmp = min(bsize - boff, to - from); 705 707 BUG_ON(boff + tmp > bsize || tmp > bsize); 706 708 memcpy(AFFS_DATA(bh) + boff, data + from, tmp); ··· 714 712 bidx++; 715 713 } else if (bidx) { 716 714 bh = affs_bread_ino(inode, bidx - 1, 0); 717 - if (IS_ERR(bh)) 718 - return PTR_ERR(bh); 715 + if (IS_ERR(bh)) { 716 + written = PTR_ERR(bh); 717 + goto err_first_bh; 718 + } 719 719 } 720 720 while (from + bsize <= to) { 721 721 prev_bh = bh; 722 722 bh = affs_getemptyblk_ino(inode, bidx); 723 723 if (IS_ERR(bh)) 724 - goto out; 724 + goto err_bh; 725 725 memcpy(AFFS_DATA(bh), data + from, bsize); 726 726 if (buffer_new(bh)) { 727 727 AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); ··· 755 751 prev_bh = bh; 756 752 bh = affs_bread_ino(inode, bidx, 1); 757 753 if (IS_ERR(bh)) 758 - goto out; 754 + goto err_bh; 759 755 tmp = min(bsize, to - from); 760 756 BUG_ON(tmp > bsize); 761 757 memcpy(AFFS_DATA(bh), data + from, tmp); ··· 794 790 if (tmp > inode->i_size) 795 791 inode->i_size = AFFS_I(inode)->mmu_private = tmp; 796 792 793 + err_first_bh: 797 794 unlock_page(page); 798 795 page_cache_release(page); 799 796 800 797 return written; 801 798 802 - out: 799 + err_bh: 803 800 bh = prev_bh; 804 801 if (!written) 805 802 written = PTR_ERR(bh);
+11 -9
fs/hfsplus/brec.c
··· 131 131 hfs_bnode_write(node, entry, data_off + key_len, entry_len); 132 132 hfs_bnode_dump(node); 133 133 134 - if (new_node) { 135 - /* update parent key if we inserted a key 136 - * at the start of the first node 137 - */ 138 - if (!rec && new_node != node) 139 - hfs_brec_update_parent(fd); 134 + /* 135 + * update parent key if we inserted a key 136 + * at the start of the node and it is not the new node 137 + */ 138 + if (!rec && new_node != node) { 139 + hfs_bnode_read_key(node, fd->search_key, data_off + size); 140 + hfs_brec_update_parent(fd); 141 + } 140 142 143 + if (new_node) { 141 144 hfs_bnode_put(fd->bnode); 142 145 if (!new_node->parent) { 143 146 hfs_btree_inc_height(tree); ··· 170 167 } 171 168 goto again; 172 169 } 173 - 174 - if (!rec) 175 - hfs_brec_update_parent(fd); 176 170 177 171 return 0; 178 172 } ··· 370 370 if (IS_ERR(parent)) 371 371 return PTR_ERR(parent); 372 372 __hfs_brec_find(parent, fd, hfs_find_rec_by_key); 373 + if (fd->record < 0) 374 + return -ENOENT; 373 375 hfs_bnode_dump(parent); 374 376 rec = fd->record; 375 377
+5 -4
include/linux/sched.h
··· 1625 1625 1626 1626 /* 1627 1627 * numa_faults_locality tracks if faults recorded during the last 1628 - * scan window were remote/local. The task scan period is adapted 1629 - * based on the locality of the faults with different weights 1630 - * depending on whether they were shared or private faults 1628 + * scan window were remote/local or failed to migrate. The task scan 1629 + * period is adapted based on the locality of the faults with different 1630 + * weights depending on whether they were shared or private faults 1631 1631 */ 1632 - unsigned long numa_faults_locality[2]; 1632 + unsigned long numa_faults_locality[3]; 1633 1633 1634 1634 unsigned long numa_pages_migrated; 1635 1635 #endif /* CONFIG_NUMA_BALANCING */ ··· 1719 1719 #define TNF_NO_GROUP 0x02 1720 1720 #define TNF_SHARED 0x04 1721 1721 #define TNF_FAULT_LOCAL 0x08 1722 + #define TNF_MIGRATE_FAIL 0x10 1722 1723 1723 1724 #ifdef CONFIG_NUMA_BALANCING 1724 1725 extern void task_numa_fault(int last_node, int node, int pages, int flags);
+6 -2
kernel/sched/fair.c
··· 1609 1609 /* 1610 1610 * If there were no record hinting faults then either the task is 1611 1611 * completely idle or all activity is areas that are not of interest 1612 - * to automatic numa balancing. Scan slower 1612 + * to automatic numa balancing. Related to that, if there were failed 1613 + * migration then it implies we are migrating too quickly or the local 1614 + * node is overloaded. In either case, scan slower 1613 1615 */ 1614 - if (local + shared == 0) { 1616 + if (local + shared == 0 || p->numa_faults_locality[2]) { 1615 1617 p->numa_scan_period = min(p->numa_scan_period_max, 1616 1618 p->numa_scan_period << 1); 1617 1619 ··· 2082 2080 2083 2081 if (migrated) 2084 2082 p->numa_pages_migrated += pages; 2083 + if (flags & TNF_MIGRATE_FAIL) 2084 + p->numa_faults_locality[2] += pages; 2085 2085 2086 2086 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; 2087 2087 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
+13 -13
mm/huge_memory.c
··· 1260 1260 int target_nid, last_cpupid = -1; 1261 1261 bool page_locked; 1262 1262 bool migrated = false; 1263 + bool was_writable; 1263 1264 int flags = 0; 1264 1265 1265 1266 /* A PROT_NONE fault should not end up here */ ··· 1292 1291 flags |= TNF_FAULT_LOCAL; 1293 1292 } 1294 1293 1295 - /* 1296 - * Avoid grouping on DSO/COW pages in specific and RO pages 1297 - * in general, RO pages shouldn't hurt as much anyway since 1298 - * they can be in shared cache state. 1299 - * 1300 - * FIXME! This checks "pmd_dirty()" as an approximation of 1301 - * "is this a read-only page", since checking "pmd_write()" 1302 - * is even more broken. We haven't actually turned this into 1303 - * a writable page, so pmd_write() will always be false. 1304 - */ 1305 - if (!pmd_dirty(pmd)) 1294 + /* See similar comment in do_numa_page for explanation */ 1295 + if (!(vma->vm_flags & VM_WRITE)) 1306 1296 flags |= TNF_NO_GROUP; 1307 1297 1308 1298 /* ··· 1350 1358 if (migrated) { 1351 1359 flags |= TNF_MIGRATED; 1352 1360 page_nid = target_nid; 1353 - } 1361 + } else 1362 + flags |= TNF_MIGRATE_FAIL; 1354 1363 1355 1364 goto out; 1356 1365 clear_pmdnuma: 1357 1366 BUG_ON(!PageLocked(page)); 1367 + was_writable = pmd_write(pmd); 1358 1368 pmd = pmd_modify(pmd, vma->vm_page_prot); 1369 + pmd = pmd_mkyoung(pmd); 1370 + if (was_writable) 1371 + pmd = pmd_mkwrite(pmd); 1359 1372 set_pmd_at(mm, haddr, pmdp, pmd); 1360 1373 update_mmu_cache_pmd(vma, addr, pmdp); 1361 1374 unlock_page(page); ··· 1484 1487 1485 1488 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1486 1489 pmd_t entry; 1490 + bool preserve_write = prot_numa && pmd_write(*pmd); 1487 1491 ret = 1; 1488 1492 /* ··· 1500 1502 if (!prot_numa || !pmd_protnone(*pmd)) { 1501 1503 entry = pmdp_get_and_clear_notify(mm, addr, pmd); 1502 1504 entry = pmd_modify(entry, newprot); 1505 + if (preserve_write) 1506 + entry = pmd_mkwrite(entry); 1503 1507 ret = HPAGE_PMD_NR; 1504 1508 set_pmd_at(mm, addr, pmd, entry); 1505 - BUG_ON(pmd_write(entry)); 1509 + BUG_ON(!preserve_write && pmd_write(entry)); 1506 1510 } 1507 1511 spin_unlock(ptl); 1508 1512 }
+12 -10
mm/memory.c
··· 3035 3035 int last_cpupid; 3036 3036 int target_nid; 3037 3037 bool migrated = false; 3038 + bool was_writable = pte_write(pte); 3038 3039 int flags = 0; 3039 3040 3040 3041 /* A PROT_NONE fault should not end up here */ ··· 3060 3059 /* Make it present again */ 3061 3060 pte = pte_modify(pte, vma->vm_page_prot); 3062 3061 pte = pte_mkyoung(pte); 3062 + if (was_writable) 3063 + pte = pte_mkwrite(pte); 3063 3064 set_pte_at(mm, addr, ptep, pte); 3064 3065 update_mmu_cache(vma, addr, ptep); 3065 3066 ··· 3072 3069 } 3073 3070 3074 3071 /* 3075 - * Avoid grouping on DSO/COW pages in specific and RO pages 3076 - * in general, RO pages shouldn't hurt as much anyway since 3077 - * they can be in shared cache state. 3078 - * 3079 - * FIXME! This checks "pmd_dirty()" as an approximation of 3080 - * "is this a read-only page", since checking "pmd_write()" 3081 - * is even more broken. We haven't actually turned this into 3082 - * a writable page, so pmd_write() will always be false. 3072 + * Avoid grouping on RO pages in general. RO pages shouldn't hurt as 3073 + * much anyway since they can be in shared cache state. This misses 3074 + * the case where a mapping is writable but the process never writes 3075 + * to it but pte_write gets cleared during protection updates and 3076 + * pte_dirty has unpredictable behaviour between PTE scan updates, 3077 + * background writeback, dirty balancing and application behaviour. 3083 3078 */ 3084 - if (!pte_dirty(pte)) 3079 + if (!(vma->vm_flags & VM_WRITE)) 3085 3080 flags |= TNF_NO_GROUP; 3086 3081 3087 3082 /* ··· 3103 3102 if (migrated) { 3104 3103 page_nid = target_nid; 3105 3104 flags |= TNF_MIGRATED; 3106 - } 3105 + } else 3106 + flags |= TNF_MIGRATE_FAIL; 3107 3107 3108 3108 out: 3109 3109 if (page_nid != -1)
+4 -9
mm/memory_hotplug.c
··· 1092 1092 return NULL; 1093 1093 1094 1094 arch_refresh_nodedata(nid, pgdat); 1095 + } else { 1096 + /* Reset the nr_zones and classzone_idx to 0 before reuse */ 1097 + pgdat->nr_zones = 0; 1098 + pgdat->classzone_idx = 0; 1095 1099 } 1096 1100 1097 1101 /* we can use NODE_DATA(nid) from here */ ··· 1981 1977 if (is_vmalloc_addr(zone->wait_table)) 1982 1978 vfree(zone->wait_table); 1983 1979 } 1984 - 1985 - /* 1986 - * Since there is no way to guarentee the address of pgdat/zone is not 1987 - * on stack of any kernel threads or used by other kernel objects 1988 - * without reference counting or other symchronizing method, do not 1989 - * reset node_data and free pgdat here. Just reset it to 0 and reuse 1990 - * the memory when the node is online again. 1991 - */ 1992 - memset(pgdat, 0, sizeof(*pgdat)); 1993 1980 } 1994 1981 EXPORT_SYMBOL(try_offline_node); 1995 1982
+1 -3
mm/mmap.c
··· 774 774 775 775 importer->anon_vma = exporter->anon_vma; 776 776 error = anon_vma_clone(importer, exporter); 777 - if (error) { 778 - importer->anon_vma = NULL; 777 + if (error) 779 778 return error; 780 - } 781 779 } 782 780 } 783 781
+3
mm/mprotect.c
··· 75 75 oldpte = *pte; 76 76 if (pte_present(oldpte)) { 77 77 pte_t ptent; 78 + bool preserve_write = prot_numa && pte_write(oldpte); 78 79 79 80 /* 80 81 * Avoid trapping faults against the zero or KSM ··· 95 94 96 95 ptent = ptep_modify_prot_start(mm, addr, pte); 97 96 ptent = pte_modify(ptent, newprot); 97 + if (preserve_write) 98 + ptent = pte_mkwrite(ptent); 98 99 99 100 /* Avoid taking write faults for known dirty pages */ 100 101 if (dirty_accountable && pte_dirty(ptent) &&
+1
mm/page_isolation.c
··· 103 103 104 104 if (!is_migrate_isolate_page(buddy)) { 105 105 __isolate_free_page(page, order); 106 + kernel_map_pages(page, (1 << order), 1); 106 107 set_page_refcounted(page); 107 108 isolated_page = page; 108 109 }
+8 -1
mm/pagewalk.c
··· 265 265 vma = vma->vm_next; 266 266 267 267 err = walk_page_test(start, next, walk); 268 - if (err > 0) 268 + if (err > 0) { 269 + /* 270 + * positive return values are purely for 271 + * controlling the pagewalk, so should never 272 + * be passed to the callers. 273 + */ 274 + err = 0; 269 275 continue; 276 + } 270 277 if (err < 0) 271 278 break; 272 279 }
+7
mm/rmap.c
··· 287 287 return 0; 288 288 289 289 enomem_failure: 290 + /* 291 + * dst->anon_vma is dropped here otherwise its degree can be incorrectly 292 + * decremented in unlink_anon_vmas(). 293 + * We can safely do this because callers of anon_vma_clone() don't care 294 + * about dst->anon_vma if anon_vma_clone() failed. 295 + */ 296 + dst->anon_vma = NULL; 290 297 unlink_anon_vmas(dst); 291 298 return -ENOMEM; 292 299 }
+4 -2
mm/slub.c
··· 2449 2449 do { 2450 2450 tid = this_cpu_read(s->cpu_slab->tid); 2451 2451 c = raw_cpu_ptr(s->cpu_slab); 2452 - } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); 2452 + } while (IS_ENABLED(CONFIG_PREEMPT) && 2453 + unlikely(tid != READ_ONCE(c->tid))); 2453 2454 2454 2455 /* 2455 2456 * Irqless object alloc/free algorithm used here depends on sequence ··· 2719 2718 do { 2720 2719 tid = this_cpu_read(s->cpu_slab->tid); 2721 2720 c = raw_cpu_ptr(s->cpu_slab); 2722 - } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); 2721 + } while (IS_ENABLED(CONFIG_PREEMPT) && 2722 + unlikely(tid != READ_ONCE(c->tid))); 2723 2723 2724 2724 /* Same with comment on barrier() in slab_alloc_node() */ 2725 2725 barrier();