
Merge tag 'powerpc-4.7-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc updates from Michael Ellerman:
"Highlights:
- Support for Power ISA 3.0 (Power9) Radix Tree MMU from Aneesh Kumar K.V
- Live patching support for ppc64le (also merged via livepatching.git)

Various cleanups & minor fixes from:
- Aaro Koskinen, Alexey Kardashevskiy, Andrew Donnellan, Aneesh Kumar K.V,
Chris Smart, Daniel Axtens, Frederic Barrat, Gavin Shan, Ian Munsie,
Lennart Sorensen, Madhavan Srinivasan, Mahesh Salgaonkar, Markus Elfring,
Michael Ellerman, Oliver O'Halloran, Paul Gortmaker, Paul Mackerras,
Rashmica Gupta, Russell Currey, Suraj Jitindar Singh, Thiago Jung
Bauermann, Valentin Rothberg, Vipin K Parashar.

General:
- Update LMB associativity index during DLPAR add/remove from Nathan
Fontenot
- Fix branching to OOL handlers in relocatable kernel from Hari Bathini
- Add support for userspace Power9 copy/paste from Chris Smart
- Always use STRICT_MM_TYPECHECKS from Michael Ellerman
- Add mask of possible MMU features from Michael Ellerman

PCI:
- Enable pass through of NVLink to guests from Alexey Kardashevskiy
- Cleanups in preparation for powernv PCI hotplug from Gavin Shan
- Don't report error in eeh_pe_reset_and_recover() from Gavin Shan
- Restore initial state in eeh_pe_reset_and_recover() from Gavin Shan
- Revert "powerpc/eeh: Fix crash in eeh_add_device_early() on Cell"
from Guilherme G Piccoli
- Remove the dependency on EEH struct in DDW mechanism from Guilherme
G Piccoli

selftests:
- Test cp_abort during context switch from Chris Smart
- Add several tests for transactional memory support from Rashmica
Gupta

perf:
- Add support for sampling interrupt register state from Anju T
- Add support for unwinding perf-stackdump from Chandan Kumar

cxl:
- Configure the PSL for two CAPI ports on POWER8NVL from Philippe
Bergheaud
- Allow initialization on timebase sync failures from Frederic Barrat
- Increase timeout for detection of AFU mmio hang from Frederic
Barrat
- Handle num_of_processes larger than can fit in the SPA from Ian
Munsie
- Ensure PSL interrupt is configured for contexts with no AFU IRQs
from Ian Munsie
- Add kernel API to allow a context to operate with relocate disabled
from Ian Munsie
- Check periodically the coherent platform function's state from
Christophe Lombard

Freescale:
- Updates from Scott: "Contains 86xx fixes, minor device tree fixes,
an erratum workaround, and a kconfig dependency fix.""

* tag 'powerpc-4.7-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (192 commits)
powerpc/86xx: Fix PCI interrupt map definition
powerpc/86xx: Move pci1 definition to the include file
powerpc/fsl: Fix build of the dtb embedded kernel images
powerpc/fsl: Fix rcpm compatible string
powerpc/fsl: Remove FSL_SOC dependency from FSL_LBC
powerpc/fsl-pci: Add a workaround for PCI 5 errata
powerpc/fsl: Fix SPI compatible on t208xrdb and t1040rdb
powerpc/powernv/npu: Add PE to PHB's list
powerpc/powernv: Fix insufficient memory allocation
powerpc/iommu: Remove the dependency on EEH struct in DDW mechanism
Revert "powerpc/eeh: Fix crash in eeh_add_device_early() on Cell"
powerpc/eeh: Drop unnecessary label in eeh_pe_change_owner()
powerpc/eeh: Ignore handlers in eeh_pe_reset_and_recover()
powerpc/eeh: Restore initial state in eeh_pe_reset_and_recover()
powerpc/eeh: Don't report error in eeh_pe_reset_and_recover()
Revert "powerpc/powernv: Exclude root bus in pnv_pci_reset_secondary_bus()"
powerpc/powernv/npu: Enable NVLink pass through
powerpc/powernv/npu: Rework TCE Kill handling
powerpc/powernv/npu: Add set/unset window helpers
powerpc/powernv/ioda2: Export debug helper pe_level_printk()
...

+6926 -3146
+8
Documentation/ABI/testing/sysfs-class-cxl
···
233 233 0 = don't trust, the image may be different (default)
234 234 1 = trust that the image will not change.
235 235 Users: https://github.com/ibm-capi/libcxl
236 +
237 + What: /sys/class/cxl/<card>/psl_timebase_synced
238 + Date: March 2016
239 + Contact: linuxppc-dev@lists.ozlabs.org
240 + Description: read only
241 + Returns 1 if the psl timebase register is synchronized
242 + with the core timebase register, 0 otherwise.
243 + Users: https://github.com/ibm-capi/libcxl
+1 -1
Documentation/features/perf/perf-regs/arch-support.txt
···
27 27 | nios2: | TODO |
28 28 | openrisc: | TODO |
29 29 | parisc: | TODO |
30 - | powerpc: | TODO |
30 + | powerpc: | ok |
31 31 | s390: | TODO |
32 32 | score: | TODO |
33 33 | sh: | TODO |
+1 -1
Documentation/features/perf/perf-stackdump/arch-support.txt
···
27 27 | nios2: | TODO |
28 28 | openrisc: | TODO |
29 29 | parisc: | TODO |
30 - | powerpc: | TODO |
30 + | powerpc: | ok |
31 31 | s390: | TODO |
32 32 | score: | TODO |
33 33 | sh: | TODO |
+1 -1
Documentation/powerpc/eeh-pci-error-recovery.txt
···
12 12 The IBM POWER-based pSeries and iSeries computers include PCI bus
13 13 controller chips that have extended capabilities for detecting and
14 14 reporting a large variety of PCI bus error conditions. These features
15 - go under the name of "EEH", for "Extended Error Handling". The EEH
15 + go under the name of "EEH", for "Enhanced Error Handling". The EEH
16 16 hardware features allow PCI bus errors to be cleared and a PCI
17 17 card to be "rebooted", without also having to reboot the operating
18 18 system.
+13
MAINTAINERS
···
6675 6675 S: Supported
6676 6676 F: Documentation/powerpc/
6677 6677 F: arch/powerpc/
6678 + F: drivers/char/tpm/tpm_ibmvtpm*
6679 + F: drivers/crypto/nx/
6680 + F: drivers/crypto/vmx/
6681 + F: drivers/net/ethernet/ibm/ibmveth.*
6682 + F: drivers/net/ethernet/ibm/ibmvnic.*
6683 + F: drivers/pci/hotplug/rpa*
6684 + F: drivers/scsi/ibmvscsi/
6685 + N: opal
6686 + N: /pmac
6687 + N: powermac
6688 + N: powernv
6689 + N: [^a-z0-9]ps3
6690 + N: pseries
6678 6691
6679 6692 LINUX FOR POWER MACINTOSH
6680 6693 M: Benjamin Herrenschmidt <benh@kernel.crashing.org>
+4 -3
arch/powerpc/Kconfig
···
116 116 select GENERIC_ATOMIC64 if PPC32
117 117 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
118 118 select HAVE_PERF_EVENTS
119 + select HAVE_PERF_REGS
120 + select HAVE_PERF_USER_STACK_DUMP
119 121 select HAVE_REGS_AND_STACK_ACCESS_API
120 122 select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64
121 123 select ARCH_WANT_IPC_PARSE_VERSION
···
608 606
609 607 config FORCE_MAX_ZONEORDER
610 608 int "Maximum zone order"
611 - range 9 64 if PPC64 && PPC_64K_PAGES
609 + range 8 9 if PPC64 && PPC_64K_PAGES
612 610 default "9" if PPC64 && PPC_64K_PAGES
613 - range 13 64 if PPC64 && !PPC_64K_PAGES
611 + range 9 13 if PPC64 && !PPC_64K_PAGES
614 612 default "13" if PPC64 && !PPC_64K_PAGES
615 613 range 9 64 if PPC32 && PPC_16K_PAGES
616 614 default "9" if PPC32 && PPC_16K_PAGES
···
797 795
798 796 config FSL_LBC
799 797 bool "Freescale Local Bus support"
800 - depends on FSL_SOC
801 798 help
802 799 Enables reporting of errors from the Freescale local bus
803 800 controller. Also contains some common code used by
-8
arch/powerpc/Kconfig.debug
···
19 19 depends on !PPC_DISABLE_WERROR
20 20 default y
21 21
22 - config STRICT_MM_TYPECHECKS
23 - bool "Do extra type checking on mm types"
24 - default n
25 - help
26 - This option turns on extra type checking for some mm related types.
27 -
28 - If you don't know what this means, say N.
29 -
30 22 config PRINT_STACK_DEPTH
31 23 int "Stack depth to print" if DEBUG_KERNEL
32 24 default 64
+3 -3
arch/powerpc/boot/Makefile
···
362 362 $(obj)/cuImage.%: vmlinux $(obj)/%.dtb $(wrapperbits)
363 363 $(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb)
364 364
365 - $(obj)/cuImage.%: vmlinux $(obj)/fsl/%.dtb $(wrapperbits)
366 - $(call if_changed,wrap,cuboot-$*,,$(obj)/fsl/$*.dtb)
367 -
368 365 $(obj)/simpleImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits)
369 366 $(call if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
370 367
···
376 379
377 380 # Rule to build device tree blobs
378 381 $(obj)/%.dtb: $(src)/dts/%.dts FORCE
382 + $(call if_changed_dep,dtc)
383 +
384 + $(obj)/%.dtb: $(src)/dts/fsl/%.dts FORCE
379 385 $(call if_changed_dep,dtc)
380 386
381 387 # If there isn't a platform selected then just strip the vmlinux.
+4
arch/powerpc/boot/dts/fsl/gef_ppc9a.dts
···
211 211 0x0 0x00400000>;
212 212 };
213 213 };
214 +
215 + pci1: pcie@fef09000 {
216 + status = "disabled";
217 + };
214 218 };
215 219
216 220 /include/ "mpc8641si-post.dtsi"
-22
arch/powerpc/boot/dts/fsl/gef_sbc310.dts
···
24 24 model = "GEF_SBC310";
25 25 compatible = "gef,sbc310";
26 26
27 - aliases {
28 - pci1 = &pci1;
29 - };
30 -
31 27 memory {
32 28 device_type = "memory";
33 29 reg = <0x0 0x40000000>; // set by uboot
···
219 223 };
220 224
221 225 pci1: pcie@fef09000 {
222 - compatible = "fsl,mpc8641-pcie";
223 - device_type = "pci";
224 - #size-cells = <2>;
225 - #address-cells = <3>;
226 226 reg = <0xfef09000 0x1000>;
227 - bus-range = <0x0 0xff>;
228 227 ranges = <0x02000000 0x0 0xc0000000 0xc0000000 0x0 0x20000000
229 228 0x01000000 0x0 0x00000000 0xfe400000 0x0 0x00400000>;
230 - clock-frequency = <100000000>;
231 - interrupts = <0x19 0x2 0 0>;
232 - interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
233 - interrupt-map = <
234 - 0x0000 0x0 0x0 0x1 &mpic 0x4 0x2
235 - 0x0000 0x0 0x0 0x2 &mpic 0x5 0x2
236 - 0x0000 0x0 0x0 0x3 &mpic 0x6 0x2
237 - 0x0000 0x0 0x0 0x4 &mpic 0x7 0x2
238 - >;
239 229
240 230 pcie@0 {
241 - reg = <0 0 0 0 0>;
242 - #size-cells = <2>;
243 - #address-cells = <3>;
244 - device_type = "pci";
245 231 ranges = <0x02000000 0x0 0xc0000000
246 232 0x02000000 0x0 0xc0000000
247 233 0x0 0x20000000
+4
arch/powerpc/boot/dts/fsl/gef_sbc610.dts
···
209 209 0x0 0x00400000>;
210 210 };
211 211 };
212 +
213 + pci1: pcie@fef09000 {
214 + status = "disabled";
215 + };
212 216 };
213 217
214 218 /include/ "mpc8641si-post.dtsi"
+1 -23
arch/powerpc/boot/dts/fsl/mpc8641_hpcn.dts
···
15 15 model = "MPC8641HPCN";
16 16 compatible = "fsl,mpc8641hpcn";
17 17
18 - aliases {
19 - pci1 = &pci1;
20 - };
21 -
22 18 memory {
23 19 device_type = "memory";
24 20 reg = <0x00000000 0x40000000>; // 1G at 0x0
···
355 359 };
356 360
357 361 pci1: pcie@ffe09000 {
358 - compatible = "fsl,mpc8641-pcie";
359 - device_type = "pci";
360 - #size-cells = <2>;
361 - #address-cells = <3>;
362 362 reg = <0xffe09000 0x1000>;
363 - bus-range = <0 0xff>;
364 363 ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000
365 364 0x01000000 0x0 0x00000000 0xffc10000 0x0 0x00010000>;
366 - clock-frequency = <100000000>;
367 - interrupts = <25 2 0 0>;
368 - interrupt-map-mask = <0xf800 0 0 7>;
369 - interrupt-map = <
370 - /* IDSEL 0x0 */
371 - 0x0000 0 0 1 &mpic 4 1
372 - 0x0000 0 0 2 &mpic 5 1
373 - 0x0000 0 0 3 &mpic 6 1
374 - 0x0000 0 0 4 &mpic 7 1
375 - >;
365 +
376 366 pcie@0 {
377 - reg = <0 0 0 0 0>;
378 - #size-cells = <2>;
379 - #address-cells = <3>;
380 - device_type = "pci";
381 367 ranges = <0x02000000 0x0 0xa0000000
382 368 0x02000000 0x0 0xa0000000
383 369 0x0 0x20000000
+1 -23
arch/powerpc/boot/dts/fsl/mpc8641_hpcn_36b.dts
···
17 17 #address-cells = <2>;
18 18 #size-cells = <2>;
19 19
20 - aliases {
21 - pci1 = &pci1;
22 - };
23 -
24 20 memory {
25 21 device_type = "memory";
26 22 reg = <0x0 0x00000000 0x0 0x40000000>; // 1G at 0x0
···
322 326 };
323 327
324 328 pci1: pcie@fffe09000 {
325 - compatible = "fsl,mpc8641-pcie";
326 - device_type = "pci";
327 - #size-cells = <2>;
328 - #address-cells = <3>;
329 329 reg = <0x0f 0xffe09000 0x0 0x1000>;
330 - bus-range = <0x0 0xff>;
331 330 ranges = <0x02000000 0x0 0xe0000000 0x0c 0x20000000 0x0 0x20000000
332 331 0x01000000 0x0 0x00000000 0x0f 0xffc10000 0x0 0x00010000>;
333 - clock-frequency = <100000000>;
334 - interrupts = <25 2 0 0>;
335 - interrupt-map-mask = <0xf800 0 0 7>;
336 - interrupt-map = <
337 - /* IDSEL 0x0 */
338 - 0x0000 0 0 1 &mpic 4 1
339 - 0x0000 0 0 2 &mpic 5 1
340 - 0x0000 0 0 3 &mpic 6 1
341 - 0x0000 0 0 4 &mpic 7 1
342 - >;
332 +
343 333 pcie@0 {
344 - reg = <0 0 0 0 0>;
345 - #size-cells = <2>;
346 - #address-cells = <3>;
347 - device_type = "pci";
348 334 ranges = <0x02000000 0x0 0xe0000000
349 335 0x02000000 0x0 0xe0000000
350 336 0x0 0x20000000
+35 -8
arch/powerpc/boot/dts/fsl/mpc8641si-post.dtsi
···
102 102 bus-range = <0x0 0xff>;
103 103 clock-frequency = <100000000>;
104 104 interrupts = <24 2 0 0>;
105 - interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
106 -
107 - interrupt-map = <
108 - 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1
109 - 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1
110 - 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1
111 - 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1
112 - >;
113 105
114 106 pcie@0 {
115 107 reg = <0 0 0 0 0>;
108 + #interrupt-cells = <1>;
116 109 #size-cells = <2>;
117 110 #address-cells = <3>;
118 111 device_type = "pci";
112 + interrupts = <24 2 0 0>;
113 + interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
114 + interrupt-map = <
115 + 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 0x0 0x0
116 + 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 0x0 0x0
117 + 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 0x0 0x0
118 + 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1 0x0 0x0
119 + >;
120 + };
121 + };
122 +
123 + &pci1 {
124 + compatible = "fsl,mpc8641-pcie";
125 + device_type = "pci";
126 + #size-cells = <2>;
127 + #address-cells = <3>;
128 + bus-range = <0x0 0xff>;
129 + clock-frequency = <100000000>;
130 + interrupts = <25 2 0 0>;
131 +
132 + pcie@0 {
133 + reg = <0 0 0 0 0>;
134 + #interrupt-cells = <1>;
135 + #size-cells = <2>;
136 + #address-cells = <3>;
137 + device_type = "pci";
138 + interrupts = <25 2 0 0>;
139 + interrupt-map-mask = <0xf800 0x0 0x0 0x7>;
140 + interrupt-map = <
141 + 0x0000 0x0 0x0 0x1 &mpic 0x4 0x1 0x0 0x0
142 + 0x0000 0x0 0x0 0x2 &mpic 0x5 0x1 0x0 0x0
143 + 0x0000 0x0 0x0 0x3 &mpic 0x6 0x1 0x0 0x0
144 + 0x0000 0x0 0x0 0x4 &mpic 0x7 0x1 0x0 0x0
145 + >;
119 146 };
120 147 };
+1
arch/powerpc/boot/dts/fsl/mpc8641si-pre.dtsi
···
25 25 serial0 = &serial0;
26 26 serial1 = &serial1;
27 27 pci0 = &pci0;
28 + pci1 = &pci1;
28 29 };
29 30
30 31 cpus {
-23
arch/powerpc/boot/dts/fsl/sbc8641d.dts
···
19 19 model = "SBC8641D";
20 20 compatible = "wind,sbc8641";
21 21
22 - aliases {
23 - pci1 = &pci1;
24 - };
25 -
26 22 memory {
27 23 device_type = "memory";
28 24 reg = <0x00000000 0x20000000>; // 512M at 0x0
···
161 165 };
162 166
163 167 pci1: pcie@f8009000 {
164 - compatible = "fsl,mpc8641-pcie";
165 - device_type = "pci";
166 - #size-cells = <2>;
167 - #address-cells = <3>;
168 168 reg = <0xf8009000 0x1000>;
169 - bus-range = <0 0xff>;
170 169 ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000
171 170 0x01000000 0x0 0x00000000 0xe3000000 0x0 0x00100000>;
172 - clock-frequency = <100000000>;
173 - interrupts = <25 2 0 0>;
174 - interrupt-map-mask = <0xf800 0 0 7>;
175 - interrupt-map = <
176 - /* IDSEL 0x0 */
177 - 0x0000 0 0 1 &mpic 4 1
178 - 0x0000 0 0 2 &mpic 5 1
179 - 0x0000 0 0 3 &mpic 6 1
180 - 0x0000 0 0 4 &mpic 7 1
181 - >;
182 171
183 172 pcie@0 {
184 - reg = <0 0 0 0 0>;
185 - #size-cells = <2>;
186 - #address-cells = <3>;
187 - device_type = "pci";
188 173 ranges = <0x02000000 0x0 0xa0000000
189 174 0x02000000 0x0 0xa0000000
190 175 0x0 0x20000000
+1 -1
arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
···
263 263 };
264 264
265 265 rcpm: global-utilities@e2000 {
266 - compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.0";
266 + compatible = "fsl,t1023-rcpm", "fsl,qoriq-rcpm-2.1";
267 267 reg = <0xe2000 0x1000>;
268 268 };
269 269
+1 -1
arch/powerpc/boot/dts/fsl/t1040si-post.dtsi
···
472 472 };
473 473
474 474 rcpm: global-utilities@e2000 {
475 - compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.0";
475 + compatible = "fsl,t1040-rcpm", "fsl,qoriq-rcpm-2.1";
476 476 reg = <0xe2000 0x1000>;
477 477 };
478 478
+1 -1
arch/powerpc/boot/dts/fsl/t104xrdb.dtsi
···
109 109 flash@0 {
110 110 #address-cells = <1>;
111 111 #size-cells = <1>;
112 - compatible = "micron,n25q512a", "jedec,spi-nor";
112 + compatible = "micron,n25q512ax3", "jedec,spi-nor";
113 113 reg = <0>;
114 114 spi-max-frequency = <10000000>; /* input clock */
115 115 };
+1 -1
arch/powerpc/boot/dts/fsl/t208xrdb.dtsi
···
113 113 flash@0 {
114 114 #address-cells = <1>;
115 115 #size-cells = <1>;
116 - compatible = "micron,n25q512a", "jedec,spi-nor";
116 + compatible = "micron,n25q512ax3", "jedec,spi-nor";
117 117 reg = <0>;
118 118 spi-max-frequency = <10000000>; /* input clock */
119 119 };
-3
arch/powerpc/include/asm/book3s/32/hash.h
···
39 39 #define _PMD_PRESENT_MASK (PAGE_MASK)
40 40 #define _PMD_BAD (~PAGE_MASK)
41 41
42 - /* Hash table based platforms need atomic updates of the linux PTE */
43 - #define PTE_ATOMIC_UPDATES 1
44 -
45 42 #endif /* __KERNEL__ */
46 43 #endif /* _ASM_POWERPC_BOOK3S_32_HASH_H */
+3 -3
arch/powerpc/include/asm/book3s/32/mmu-hash.h
···
1 - #ifndef _ASM_POWERPC_MMU_HASH32_H_
2 - #define _ASM_POWERPC_MMU_HASH32_H_
1 + #ifndef _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_
2 + #define _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_
3 3 /*
4 4 * 32-bit hash table MMU support
5 5 */
···
90 90 #define mmu_virtual_psize MMU_PAGE_4K
91 91 #define mmu_linear_psize MMU_PAGE_256M
92 92
93 - #endif /* _ASM_POWERPC_MMU_HASH32_H_ */
93 + #endif /* _ASM_POWERPC_BOOK3S_32_MMU_HASH_H_ */
+109
arch/powerpc/include/asm/book3s/32/pgalloc.h
···
1 + #ifndef _ASM_POWERPC_BOOK3S_32_PGALLOC_H
2 + #define _ASM_POWERPC_BOOK3S_32_PGALLOC_H
3 +
4 + #include <linux/threads.h>
5 +
6 + /* For 32-bit, all levels of page tables are just drawn from get_free_page() */
7 + #define MAX_PGTABLE_INDEX_SIZE 0
8 +
9 + extern void __bad_pte(pmd_t *pmd);
10 +
11 + extern pgd_t *pgd_alloc(struct mm_struct *mm);
12 + extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
13 +
14 + /*
15 + * We don't have any real pmd's, and this code never triggers because
16 + * the pgd will always be present..
17 + */
18 + /* #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) */
19 + #define pmd_free(mm, x) do { } while (0)
20 + #define __pmd_free_tlb(tlb,x,a) do { } while (0)
21 + /* #define pgd_populate(mm, pmd, pte) BUG() */
22 +
23 + #ifndef CONFIG_BOOKE
24 +
25 + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
26 + pte_t *pte)
27 + {
28 + *pmdp = __pmd(__pa(pte) | _PMD_PRESENT);
29 + }
30 +
31 + static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
32 + pgtable_t pte_page)
33 + {
34 + *pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_PRESENT);
35 + }
36 +
37 + #define pmd_pgtable(pmd) pmd_page(pmd)
38 + #else
39 +
40 + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
41 + pte_t *pte)
42 + {
43 + *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT);
44 + }
45 +
46 + static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
47 + pgtable_t pte_page)
48 + {
49 + *pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT);
50 + }
51 +
52 + #define pmd_pgtable(pmd) pmd_page(pmd)
53 + #endif
54 +
55 + extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
56 + extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
57 +
58 + static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
59 + {
60 + free_page((unsigned long)pte);
61 + }
62 +
63 + static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
64 + {
65 + pgtable_page_dtor(ptepage);
66 + __free_page(ptepage);
67 + }
68 +
69 + static inline void pgtable_free(void *table, unsigned index_size)
70 + {
71 + BUG_ON(index_size); /* 32-bit doesn't use this */
72 + free_page((unsigned long)table);
73 + }
74 +
75 + #define check_pgt_cache() do { } while (0)
76 +
77 + #ifdef CONFIG_SMP
78 + static inline void pgtable_free_tlb(struct mmu_gather *tlb,
79 + void *table, int shift)
80 + {
81 + unsigned long pgf = (unsigned long)table;
82 + BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
83 + pgf |= shift;
84 + tlb_remove_table(tlb, (void *)pgf);
85 + }
86 +
87 + static inline void __tlb_remove_table(void *_table)
88 + {
89 + void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE);
90 + unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE;
91 +
92 + pgtable_free(table, shift);
93 + }
94 + #else
95 + static inline void pgtable_free_tlb(struct mmu_gather *tlb,
96 + void *table, int shift)
97 + {
98 + pgtable_free(table, shift);
99 + }
100 + #endif
101 +
102 + static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
103 + unsigned long address)
104 + {
105 + tlb_flush_pgtable(tlb, address);
106 + pgtable_page_dtor(table);
107 + pgtable_free_tlb(tlb, page_address(table), 0);
108 + }
109 + #endif /* _ASM_POWERPC_BOOK3S_32_PGALLOC_H */
+78 -66
arch/powerpc/include/asm/book3s/64/hash-4k.h
···
5 5 * for each page table entry. The PMD and PGD level use a 32b record for
6 6 * each entry by assuming that each entry is page aligned.
7 7 */
8 - #define PTE_INDEX_SIZE 9
9 - #define PMD_INDEX_SIZE 7
10 - #define PUD_INDEX_SIZE 9
11 - #define PGD_INDEX_SIZE 9
8 + #define H_PTE_INDEX_SIZE 9
9 + #define H_PMD_INDEX_SIZE 7
10 + #define H_PUD_INDEX_SIZE 9
11 + #define H_PGD_INDEX_SIZE 9
12 12
13 13 #ifndef __ASSEMBLY__
14 - #define PTE_TABLE_SIZE (sizeof(pte_t) << PTE_INDEX_SIZE)
15 - #define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE)
16 - #define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE)
17 - #define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE)
18 - #endif /* __ASSEMBLY__ */
19 -
20 - #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE)
21 - #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE)
22 - #define PTRS_PER_PUD (1 << PUD_INDEX_SIZE)
23 - #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE)
24 -
25 - /* PMD_SHIFT determines what a second-level page table entry can map */
26 - #define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE)
27 - #define PMD_SIZE (1UL << PMD_SHIFT)
28 - #define PMD_MASK (~(PMD_SIZE-1))
14 + #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE)
15 + #define H_PMD_TABLE_SIZE (sizeof(pmd_t) << H_PMD_INDEX_SIZE)
16 + #define H_PUD_TABLE_SIZE (sizeof(pud_t) << H_PUD_INDEX_SIZE)
17 + #define H_PGD_TABLE_SIZE (sizeof(pgd_t) << H_PGD_INDEX_SIZE)
29 18
30 19 /* With 4k base page size, hugepage PTEs go at the PMD level */
31 20 #define MIN_HUGEPTE_SHIFT PMD_SHIFT
32 21
33 - /* PUD_SHIFT determines what a third-level page table entry can map */
34 - #define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE)
35 - #define PUD_SIZE (1UL << PUD_SHIFT)
36 - #define PUD_MASK (~(PUD_SIZE-1))
37 -
38 - /* PGDIR_SHIFT determines what a fourth-level page table entry can map */
39 - #define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE)
40 - #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
41 - #define PGDIR_MASK (~(PGDIR_SIZE-1))
42 -
43 - /* Bits to mask out from a PMD to get to the PTE page */
44 - #define PMD_MASKED_BITS 0
45 - /* Bits to mask out from a PUD to get to the PMD page */
46 - #define PUD_MASKED_BITS 0
47 - /* Bits to mask out from a PGD to get to the PUD page */
48 - #define PGD_MASKED_BITS 0
49 -
50 22 /* PTE flags to conserve for HPTE identification */
51 - #define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | \
52 - _PAGE_F_SECOND | _PAGE_F_GIX)
53 -
54 - /* shift to put page number into pte */
55 - #define PTE_RPN_SHIFT (12)
56 - #define PTE_RPN_SIZE (45) /* gives 57-bit real addresses */
57 -
58 - #define _PAGE_4K_PFN 0
59 - #ifndef __ASSEMBLY__
23 + #define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \
24 + H_PAGE_F_SECOND | H_PAGE_F_GIX)
25 + /*
26 + * Not supported by 4k linux page size
27 + */
28 + #define H_PAGE_4K_PFN 0x0
29 + #define H_PAGE_THP_HUGE 0x0
30 + #define H_PAGE_COMBO 0x0
31 + #define H_PTE_FRAG_NR 0
32 + #define H_PTE_FRAG_SIZE_SHIFT 0
60 33 /*
61 34 * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range()
62 35 */
···
37 64 remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
38 65
39 66 #ifdef CONFIG_HUGETLB_PAGE
40 - /*
41 - * For 4k page size, we support explicit hugepage via hugepd
42 - */
43 - static inline int pmd_huge(pmd_t pmd)
44 - {
45 - return 0;
46 - }
47 -
48 - static inline int pud_huge(pud_t pud)
49 - {
50 - return 0;
51 - }
52 -
53 - static inline int pgd_huge(pgd_t pgd)
54 - {
55 - return 0;
56 - }
57 - #define pgd_huge pgd_huge
58 -
59 - static inline int hugepd_ok(hugepd_t hpd)
67 + static inline int hash__hugepd_ok(hugepd_t hpd)
60 68 {
61 69 /*
62 70 * if it is not a pte and have hugepd shift mask
···
48 94 return true;
49 95 return false;
50 96 }
51 - #define is_hugepd(hpd) (hugepd_ok(hpd))
97 + #endif
98 +
99 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE
100 +
101 + static inline char *get_hpte_slot_array(pmd_t *pmdp)
102 + {
103 + BUG();
104 + return NULL;
105 + }
106 +
107 + static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
108 + {
109 + BUG();
110 + return 0;
111 + }
112 +
113 + static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
114 + int index)
115 + {
116 + BUG();
117 + return 0;
118 + }
119 +
120 + static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
121 + unsigned int index, unsigned int hidx)
122 + {
123 + BUG();
124 + }
125 +
126 + static inline int hash__pmd_trans_huge(pmd_t pmd)
127 + {
128 + return 0;
129 + }
130 +
131 + static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
132 + {
133 + BUG();
134 + return 0;
135 + }
136 +
137 + static inline pmd_t hash__pmd_mkhuge(pmd_t pmd)
138 + {
139 + BUG();
140 + return pmd;
141 + }
142 +
143 + extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm,
144 + unsigned long addr, pmd_t *pmdp,
145 + unsigned long clr, unsigned long set);
146 + extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
147 + unsigned long address, pmd_t *pmdp);
148 + extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
149 + pgtable_t pgtable);
150 + extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
151 + extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
152 + unsigned long address, pmd_t *pmdp);
153 + extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
154 + unsigned long addr, pmd_t *pmdp);
155 + extern int hash__has_transparent_hugepage(void);
52 156 #endif
53 157
54 158 #endif /* !__ASSEMBLY__ */
+63 -154
arch/powerpc/include/asm/book3s/64/hash-64k.h
···
1 1 #ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H
2 2 #define _ASM_POWERPC_BOOK3S_64_HASH_64K_H
3 3
4 - #define PTE_INDEX_SIZE 8
5 - #define PMD_INDEX_SIZE 5
6 - #define PUD_INDEX_SIZE 5
7 - #define PGD_INDEX_SIZE 12
8 -
9 - #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE)
10 - #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE)
11 - #define PTRS_PER_PUD (1 << PUD_INDEX_SIZE)
12 - #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE)
4 + #define H_PTE_INDEX_SIZE 8
5 + #define H_PMD_INDEX_SIZE 5
6 + #define H_PUD_INDEX_SIZE 5
7 + #define H_PGD_INDEX_SIZE 12
13 8
14 9 /* With 4k base page size, hugepage PTEs go at the PMD level */
15 10 #define MIN_HUGEPTE_SHIFT PAGE_SHIFT
16 11
17 - /* PMD_SHIFT determines what a second-level page table entry can map */
18 - #define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE)
19 - #define PMD_SIZE (1UL << PMD_SHIFT)
20 - #define PMD_MASK (~(PMD_SIZE-1))
21 -
22 - /* PUD_SHIFT determines what a third-level page table entry can map */
23 - #define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE)
24 - #define PUD_SIZE (1UL << PUD_SHIFT)
25 - #define PUD_MASK (~(PUD_SIZE-1))
26 -
27 - /* PGDIR_SHIFT determines what a fourth-level page table entry can map */
28 - #define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE)
29 - #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
30 - #define PGDIR_MASK (~(PGDIR_SIZE-1))
31 -
32 - #define _PAGE_COMBO 0x00001000 /* this is a combo 4k page */
33 - #define _PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */
12 + #define H_PAGE_COMBO 0x00001000 /* this is a combo 4k page */
13 + #define H_PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */
34 14 /*
35 - * Used to track subpage group valid if _PAGE_COMBO is set
36 - * This overloads _PAGE_F_GIX and _PAGE_F_SECOND
15 + * We need to differentiate between explicit huge page and THP huge
16 + * page, since THP huge page also need to track real subpage details
37 17 */
38 - #define _PAGE_COMBO_VALID (_PAGE_F_GIX | _PAGE_F_SECOND)
18 + #define H_PAGE_THP_HUGE H_PAGE_4K_PFN
19 +
20 + /*
21 + * Used to track subpage group valid if H_PAGE_COMBO is set
22 + * This overloads H_PAGE_F_GIX and H_PAGE_F_SECOND
23 + */
24 + #define H_PAGE_COMBO_VALID (H_PAGE_F_GIX | H_PAGE_F_SECOND)
39 25
40 26 /* PTE flags to conserve for HPTE identification */
41 - #define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_F_SECOND | \
42 - _PAGE_F_GIX | _PAGE_HASHPTE | _PAGE_COMBO)
43 -
44 - /* Shift to put page number into pte.
45 - *
46 - * That gives us a max RPN of 41 bits, which means a max of 57 bits
47 - * of addressable physical space, or 53 bits for the special 4k PFNs.
48 - */
49 - #define PTE_RPN_SHIFT (16)
50 - #define PTE_RPN_SIZE (41)
51 -
27 + #define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \
28 + H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO)
52 29 /*
53 30 * we support 16 fragments per PTE page of 64K size.
54 31 */
55 - #define PTE_FRAG_NR 16
32 + #define H_PTE_FRAG_NR 16
56 33 /*
57 34 * We use a 2K PTE page fragment and another 2K for storing
58 35 * real_pte_t hash index
59 36 */
60 - #define PTE_FRAG_SIZE_SHIFT 12
37 + #define H_PTE_FRAG_SIZE_SHIFT 12
61 38 #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
62 39
63 - /* Bits to mask out from a PMD to get to the PTE page */
64 - #define PMD_MASKED_BITS 0xc0000000000000ffUL
65 - /* Bits to mask out from a PUD to get to the PMD page */
66 - #define PUD_MASKED_BITS 0xc0000000000000ffUL
67 - /* Bits to mask out from a PGD to get to the PUD page */
68 - #define PGD_MASKED_BITS 0xc0000000000000ffUL
69 -
70 40 #ifndef __ASSEMBLY__
41 + #include <asm/errno.h>
71 42
72 43 /*
73 44 * With 64K pages on hash table, we have a special PTE format that
···
54 83
55 84 rpte.pte = pte;
56 85 rpte.hidx = 0;
57 - if (pte_val(pte) & _PAGE_COMBO) {
86 + if (pte_val(pte) & H_PAGE_COMBO) {
58 87 /*
59 - * Make sure we order the hidx load against the _PAGE_COMBO
88 + * Make sure we order the hidx load against the H_PAGE_COMBO
60 89 * check. The store side ordering is done in __hash_page_4K
61 90 */
62 91 smp_rmb();
···
68 97
69 98 static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
70 99 {
71 - if ((pte_val(rpte.pte) & _PAGE_COMBO))
100 + if ((pte_val(rpte.pte) & H_PAGE_COMBO))
72 101 return (rpte.hidx >> (index<<2)) & 0xf;
73 - return (pte_val(rpte.pte) >> _PAGE_F_GIX_SHIFT) & 0xf;
102 + return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf;
74 103 }
75 104
76 105 #define __rpte_to_pte(r) ((r).pte)
···
93 122 #define pte_iterate_hashed_end() } while(0); } } while(0)
94 123
95 124 #define pte_pagesize_index(mm, addr, pte) \
96 - (((pte) & _PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K)
125 + (((pte) & H_PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K)
97 126
98 - #define remap_4k_pfn(vma, addr, pfn, prot) \
99 - (WARN_ON(((pfn) >= (1UL << PTE_RPN_SIZE))) ? -EINVAL : \
100 - remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, \
101 - __pgprot(pgprot_val((prot)) | _PAGE_4K_PFN)))
127 + extern int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
128 + unsigned long pfn, unsigned long size, pgprot_t);
129 + static inline int hash__remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr,
130 + unsigned long pfn, pgprot_t prot)
131 + {
132 + if (pfn > (PTE_RPN_MASK >> PAGE_SHIFT)) {
133 + WARN(1, "remap_4k_pfn called with wrong pfn value\n");
134 + return -EINVAL;
135 + }
136 + return remap_pfn_range(vma, addr, pfn, PAGE_SIZE,
137 + __pgprot(pgprot_val(prot) | H_PAGE_4K_PFN));
138 + }
102 139
103 - #define PTE_TABLE_SIZE PTE_FRAG_SIZE
140 + #define H_PTE_TABLE_SIZE PTE_FRAG_SIZE
104 141 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
105 - #define PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + (sizeof(unsigned long) << PMD_INDEX_SIZE))
142 + #define H_PMD_TABLE_SIZE ((sizeof(pmd_t) << PMD_INDEX_SIZE) + \
143 + (sizeof(unsigned long) << PMD_INDEX_SIZE))
106 144 #else
107 - #define PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE)
145 + #define H_PMD_TABLE_SIZE (sizeof(pmd_t) << PMD_INDEX_SIZE)
108 146 #endif
109 - #define PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE)
110 - #define PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE)
111 -
112 - #ifdef CONFIG_HUGETLB_PAGE
113 - /*
114 - * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
115 - * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD;
116 - *
117 - * Defined in such a way that we can optimize away code block at build time
118 - * if CONFIG_HUGETLB_PAGE=n.
119 - */
120 - static inline int pmd_huge(pmd_t pmd)
121 - {
122 - /*
123 - * leaf pte for huge page
124 - */
125 - return !!(pmd_val(pmd) & _PAGE_PTE);
126 - }
127 -
128 - static inline int pud_huge(pud_t pud)
129 - {
130 - /*
131 - * leaf pte for huge page
132 - */
133 - return !!(pud_val(pud) & _PAGE_PTE);
134 - }
135 -
136 - static inline int pgd_huge(pgd_t pgd)
137 - {
138 - /*
139 - * leaf pte for huge page
140 - */
141 - return !!(pgd_val(pgd) & _PAGE_PTE);
142 - }
143 - #define pgd_huge pgd_huge
144 -
145 - #ifdef CONFIG_DEBUG_VM
146 - extern int hugepd_ok(hugepd_t hpd);
147 - #define is_hugepd(hpd) (hugepd_ok(hpd))
148 - #else
149 - /*
150 - * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't
151 - * need to setup hugepage directory for them. Our pte and page directory format
152 - * enable us to have this enabled.
153 - */
154 - static inline int hugepd_ok(hugepd_t hpd)
155 - {
156 - return 0;
157 - }
158 - #define is_hugepd(pdep) 0
159 - #endif /* CONFIG_DEBUG_VM */
160 -
161 - #endif /* CONFIG_HUGETLB_PAGE */
147 + #define H_PUD_TABLE_SIZE (sizeof(pud_t) << PUD_INDEX_SIZE)
148 + #define H_PGD_TABLE_SIZE (sizeof(pgd_t) << PGD_INDEX_SIZE)
162 149
163 150 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
164 - extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
165 - unsigned long addr,
166 - pmd_t *pmdp,
167 - unsigned long clr,
168 - unsigned long set);
169 151 static inline char *get_hpte_slot_array(pmd_t *pmdp)
170 152 {
171 153 /*
···
177 253 * that for explicit huge pages.
178 254 *
179 255 */
180 - static inline int pmd_trans_huge(pmd_t pmd)
256 + static inline int hash__pmd_trans_huge(pmd_t pmd)
181 257 {
182 - return !!((pmd_val(pmd) & (_PAGE_PTE | _PAGE_THP_HUGE)) ==
183 - (_PAGE_PTE | _PAGE_THP_HUGE));
258 + return !!((pmd_val(pmd) & (_PAGE_PTE | H_PAGE_THP_HUGE)) ==
259 + (_PAGE_PTE | H_PAGE_THP_HUGE));
184 260 }
185 261
186 - static inline int pmd_large(pmd_t pmd)
262 + static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b)
187 263 {
188 - return !!(pmd_val(pmd) & _PAGE_PTE);
264 + return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0);
189 265 }
190 266
191 - static inline pmd_t pmd_mknotpresent(pmd_t pmd)
267 + static inline pmd_t hash__pmd_mkhuge(pmd_t pmd)
192 268 {
193 - return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT);
269 + return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE));
194 270 }
195 271
196 - #define __HAVE_ARCH_PMD_SAME
197 - static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
198 - {
199 - return (((pmd_val(pmd_a) ^ pmd_val(pmd_b)) & ~_PAGE_HPTEFLAGS) == 0);
200 - }
201 -
202 - static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
203 - unsigned long addr, pmd_t *pmdp)
204 - {
205 - unsigned long old;
206 -
207 - if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
208 - return 0;
209 - old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
210 - return ((old & _PAGE_ACCESSED) != 0);
211 - }
212 -
213 - #define __HAVE_ARCH_PMDP_SET_WRPROTECT
214 - static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr,
215 - pmd_t *pmdp)
216 - {
217 -
218 - if ((pmd_val(*pmdp) & _PAGE_RW) == 0)
219 - return;
220 -
221 - pmd_hugepage_update(mm, addr, pmdp, _PAGE_RW, 0);
222 - }
223 -
272 + extern unsigned long hash__pmd_hugepage_update(struct mm_struct *mm,
273 + unsigned long addr, pmd_t *pmdp,
274 + unsigned long clr, unsigned long set);
275 + extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma,
276 + unsigned long address, pmd_t *pmdp);
277 + extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
278 + pgtable_t pgtable);
279 + extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
280 + extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
281 + unsigned long address, pmd_t *pmdp);
282 + extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
283 + unsigned long addr, pmd_t *pmdp);
284 + extern int hash__has_transparent_hugepage(void);
224 285 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
225 286 #endif /* __ASSEMBLY__ */
226 287
+65 -420
arch/powerpc/include/asm/book3s/64/hash.h
··· 13 13 * We could create separate kernel read-only if we used the 3 PP bits 14 14 * combinations that newer processors provide but we currently don't. 15 15 */ 16 - #define _PAGE_BIT_SWAP_TYPE 0 17 - 18 - #define _PAGE_EXEC 0x00001 /* execute permission */ 19 - #define _PAGE_RW 0x00002 /* read & write access allowed */ 20 - #define _PAGE_READ 0x00004 /* read access allowed */ 21 - #define _PAGE_USER 0x00008 /* page may be accessed by userspace */ 22 - #define _PAGE_GUARDED 0x00010 /* G: guarded (side-effect) page */ 23 - /* M (memory coherence) is always set in the HPTE, so we don't need it here */ 24 - #define _PAGE_COHERENT 0x0 25 - #define _PAGE_NO_CACHE 0x00020 /* I: cache inhibit */ 26 - #define _PAGE_WRITETHRU 0x00040 /* W: cache write-through */ 27 - #define _PAGE_DIRTY 0x00080 /* C: page changed */ 28 - #define _PAGE_ACCESSED 0x00100 /* R: page referenced */ 29 - #define _PAGE_SPECIAL 0x00400 /* software: special page */ 30 - #define _PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ 31 - 32 - #ifdef CONFIG_MEM_SOFT_DIRTY 33 - #define _PAGE_SOFT_DIRTY 0x200 /* software: software dirty tracking */ 34 - #else 35 - #define _PAGE_SOFT_DIRTY 0x000 36 - #endif 37 - 38 - #define _PAGE_F_GIX_SHIFT 57 39 - #define _PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ 40 - #define _PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ 41 - #define _PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ 42 - #define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ 43 - #define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ 44 - 45 - /* 46 - * We need to differentiate between explicit huge page and THP huge 47 - * page, since THP huge page also need to track real subpage details 48 - */ 49 - #define _PAGE_THP_HUGE _PAGE_4K_PFN 50 - 51 - /* 52 - * set of bits not changed in pmd_modify. 
53 - */ 54 - #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ 55 - _PAGE_ACCESSED | _PAGE_THP_HUGE | _PAGE_PTE | \ 56 - _PAGE_SOFT_DIRTY) 57 - 16 + #define H_PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ 17 + #define H_PTE_NONE_MASK _PAGE_HPTEFLAGS 18 + #define H_PAGE_F_GIX_SHIFT 57 19 + #define H_PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ 20 + #define H_PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ 21 + #define H_PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ 58 22 59 23 #ifdef CONFIG_PPC_64K_PAGES 60 24 #include <asm/book3s/64/hash-64k.h> ··· 29 65 /* 30 66 * Size of EA range mapped by our pagetables. 31 67 */ 32 - #define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ 33 - PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) 34 - #define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE) 68 + #define H_PGTABLE_EADDR_SIZE (H_PTE_INDEX_SIZE + H_PMD_INDEX_SIZE + \ 69 + H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT) 70 + #define H_PGTABLE_RANGE (ASM_CONST(1) << H_PGTABLE_EADDR_SIZE) 35 71 36 72 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 37 - #define PMD_CACHE_INDEX (PMD_INDEX_SIZE + 1) 73 + /* 74 + * only with hash we need to use the second half of pmd page table 75 + * to store pointer to deposited pgtable_t 76 + */ 77 + #define H_PMD_CACHE_INDEX (H_PMD_INDEX_SIZE + 1) 38 78 #else 39 - #define PMD_CACHE_INDEX PMD_INDEX_SIZE 79 + #define H_PMD_CACHE_INDEX H_PMD_INDEX_SIZE 40 80 #endif 41 81 /* 42 82 * Define the address range of the kernel non-linear virtual area 43 83 */ 44 - #define KERN_VIRT_START ASM_CONST(0xD000000000000000) 45 - #define KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) 84 + #define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) 85 + #define H_KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) 46 86 47 87 /* 48 88 * The vmalloc space starts at the beginning of that region, and 49 89 * occupies half of it on hash CPUs and a quarter of it on Book3E 50 90 * (we keep a quarter for the virtual memmap) 
51 91 */ 52 - #define VMALLOC_START KERN_VIRT_START 53 - #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) 54 - #define VMALLOC_END (VMALLOC_START + VMALLOC_SIZE) 92 + #define H_VMALLOC_START H_KERN_VIRT_START 93 + #define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1) 94 + #define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) 55 95 56 96 /* 57 97 * Region IDs ··· 64 96 #define REGION_MASK (0xfUL << REGION_SHIFT) 65 97 #define REGION_ID(ea) (((unsigned long)(ea)) >> REGION_SHIFT) 66 98 67 - #define VMALLOC_REGION_ID (REGION_ID(VMALLOC_START)) 99 + #define VMALLOC_REGION_ID (REGION_ID(H_VMALLOC_START)) 68 100 #define KERNEL_REGION_ID (REGION_ID(PAGE_OFFSET)) 69 101 #define VMEMMAP_REGION_ID (0xfUL) /* Server only */ 70 102 #define USER_REGION_ID (0UL) ··· 73 105 * Defines the address of the vmemap area, in its own region on 74 106 * hash table CPUs. 75 107 */ 76 - #define VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) 108 + #define H_VMEMMAP_BASE (VMEMMAP_REGION_ID << REGION_SHIFT) 77 109 78 110 #ifdef CONFIG_PPC_MM_SLICES 79 111 #define HAVE_ARCH_UNMAPPED_AREA 80 112 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 81 113 #endif /* CONFIG_PPC_MM_SLICES */ 82 114 83 - /* No separate kernel read-only */ 84 - #define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ 85 - #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW 86 - #define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) 87 - 88 - /* Strong Access Ordering */ 89 - #define _PAGE_SAO (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT) 90 - 91 - /* No page size encoding in the linux PTE */ 92 - #define _PAGE_PSIZE 0 93 115 94 116 /* PTEIDX nibble */ 95 117 #define _PTEIDX_SECONDARY 0x8 96 118 #define _PTEIDX_GROUP_IX 0x7 97 119 98 - /* Hash table based platforms need atomic updates of the linux PTE */ 99 - #define PTE_ATOMIC_UPDATES 1 100 - #define _PTE_NONE_MASK _PAGE_HPTEFLAGS 101 - /* 102 - * The mask convered by the RPN must be a ULL on 32-bit platforms with 103 - * 64-bit PTEs 104 - */ 105 - #define 
PTE_RPN_MASK (((1UL << PTE_RPN_SIZE) - 1) << PTE_RPN_SHIFT) 106 - /* 107 - * _PAGE_CHG_MASK masks of bits that are to be preserved across 108 - * pgprot changes 109 - */ 110 - #define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ 111 - _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \ 112 - _PAGE_SOFT_DIRTY) 113 - /* 114 - * Mask of bits returned by pte_pgprot() 115 - */ 116 - #define PAGE_PROT_BITS (_PAGE_GUARDED | _PAGE_COHERENT | _PAGE_NO_CACHE | \ 117 - _PAGE_WRITETHRU | _PAGE_4K_PFN | \ 118 - _PAGE_USER | _PAGE_ACCESSED | \ 119 - _PAGE_RW | _PAGE_DIRTY | _PAGE_EXEC | \ 120 - _PAGE_SOFT_DIRTY) 121 - /* 122 - * We define 2 sets of base prot bits, one for basic pages (ie, 123 - * cacheable kernel and user pages) and one for non cacheable 124 - * pages. We always set _PAGE_COHERENT when SMP is enabled or 125 - * the processor might need it for DMA coherency. 126 - */ 127 - #define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) 128 - #define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) 129 - 130 - /* Permission masks used to generate the __P and __S table, 131 - * 132 - * Note:__pgprot is defined in arch/powerpc/include/asm/page.h 133 - * 134 - * Write permissions imply read permissions for now (we could make write-only 135 - * pages on BookE but we don't bother for now). 
Execute permission control is 136 - * possible on platforms that define _PAGE_EXEC 137 - * 138 - * Note due to the way vm flags are laid out, the bits are XWR 139 - */ 140 - #define PAGE_NONE __pgprot(_PAGE_BASE) 141 - #define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) 142 - #define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | \ 143 - _PAGE_EXEC) 144 - #define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER ) 145 - #define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) 146 - #define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER ) 147 - #define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) 148 - 149 - #define __P000 PAGE_NONE 150 - #define __P001 PAGE_READONLY 151 - #define __P010 PAGE_COPY 152 - #define __P011 PAGE_COPY 153 - #define __P100 PAGE_READONLY_X 154 - #define __P101 PAGE_READONLY_X 155 - #define __P110 PAGE_COPY_X 156 - #define __P111 PAGE_COPY_X 157 - 158 - #define __S000 PAGE_NONE 159 - #define __S001 PAGE_READONLY 160 - #define __S010 PAGE_SHARED 161 - #define __S011 PAGE_SHARED 162 - #define __S100 PAGE_READONLY_X 163 - #define __S101 PAGE_READONLY_X 164 - #define __S110 PAGE_SHARED_X 165 - #define __S111 PAGE_SHARED_X 166 - 167 - /* Permission masks used for kernel mappings */ 168 - #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) 169 - #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ 170 - _PAGE_NO_CACHE) 171 - #define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ 172 - _PAGE_NO_CACHE | _PAGE_GUARDED) 173 - #define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) 174 - #define PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) 175 - #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) 176 - 177 - /* Protection used for kernel text. We want the debuggers to be able to 178 - * set breakpoints anywhere, so don't write protect the kernel text 179 - * on platforms where such control is possible. 
180 - */ 181 - #if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) ||\ 182 - defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) 183 - #define PAGE_KERNEL_TEXT PAGE_KERNEL_X 184 - #else 185 - #define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX 186 - #endif 187 - 188 - /* Make modules code happy. We don't set RO yet */ 189 - #define PAGE_KERNEL_EXEC PAGE_KERNEL_X 190 - #define PAGE_AGP (PAGE_KERNEL_NC) 191 - 192 - #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) 193 - #define PUD_BAD_BITS (PMD_TABLE_SIZE-1) 120 + #define H_PMD_BAD_BITS (PTE_TABLE_SIZE-1) 121 + #define H_PUD_BAD_BITS (PMD_TABLE_SIZE-1) 194 122 195 123 #ifndef __ASSEMBLY__ 196 - #define pmd_bad(pmd) (pmd_val(pmd) & PMD_BAD_BITS) 197 - #define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS) 198 - 199 - #define pud_bad(pud) (pud_val(pud) & PUD_BAD_BITS) 200 - #define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) 201 - 202 - /* Pointers in the page table tree are physical addresses */ 203 - #define __pgtable_ptr_val(ptr) __pa(ptr) 204 - 205 - #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1)) 206 - #define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1)) 207 - #define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1)) 208 - #define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1)) 124 + #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) 125 + #define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) 126 + static inline int hash__pgd_bad(pgd_t pgd) 127 + { 128 + return (pgd_val(pgd) == 0); 129 + } 209 130 210 131 extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, 211 132 pte_t *ptep, unsigned long pte, int huge); 212 133 extern unsigned long htab_convert_pte_flags(unsigned long pteflags); 213 134 /* Atomic PTE updates */ 214 - static inline unsigned long pte_update(struct mm_struct *mm, 215 - unsigned long addr, 216 - pte_t *ptep, unsigned long clr, 217 - 
unsigned long set, 218 - int huge) 135 + static inline unsigned long hash__pte_update(struct mm_struct *mm, 136 + unsigned long addr, 137 + pte_t *ptep, unsigned long clr, 138 + unsigned long set, 139 + int huge) 219 140 { 220 - unsigned long old, tmp; 141 + __be64 old_be, tmp_be; 142 + unsigned long old; 221 143 222 144 __asm__ __volatile__( 223 145 "1: ldarx %0,0,%3 # pte_update\n\ 224 - andi. %1,%0,%6\n\ 146 + and. %1,%0,%6\n\ 225 147 bne- 1b \n\ 226 148 andc %1,%0,%4 \n\ 227 149 or %1,%1,%7\n\ 228 150 stdcx. %1,0,%3 \n\ 229 151 bne- 1b" 230 - : "=&r" (old), "=&r" (tmp), "=m" (*ptep) 231 - : "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set) 152 + : "=&r" (old_be), "=&r" (tmp_be), "=m" (*ptep) 153 + : "r" (ptep), "r" (cpu_to_be64(clr)), "m" (*ptep), 154 + "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) 232 155 : "cc" ); 233 156 /* huge pages use the old page table lock */ 234 157 if (!huge) 235 158 assert_pte_locked(mm, addr); 236 159 237 - if (old & _PAGE_HASHPTE) 160 + old = be64_to_cpu(old_be); 161 + if (old & H_PAGE_HASHPTE) 238 162 hpte_need_flush(mm, addr, ptep, old, huge); 239 163 240 164 return old; 241 165 } 242 166 243 - static inline int __ptep_test_and_clear_young(struct mm_struct *mm, 244 - unsigned long addr, pte_t *ptep) 245 - { 246 - unsigned long old; 247 - 248 - if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) 249 - return 0; 250 - old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); 251 - return (old & _PAGE_ACCESSED) != 0; 252 - } 253 - #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 254 - #define ptep_test_and_clear_young(__vma, __addr, __ptep) \ 255 - ({ \ 256 - int __r; \ 257 - __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \ 258 - __r; \ 259 - }) 260 - 261 - #define __HAVE_ARCH_PTEP_SET_WRPROTECT 262 - static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, 263 - pte_t *ptep) 264 - { 265 - 266 - if ((pte_val(*ptep) & _PAGE_RW) == 0) 267 - return; 268 - 269 - 
pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); 270 - } 271 - 272 - static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, 273 - unsigned long addr, pte_t *ptep) 274 - { 275 - if ((pte_val(*ptep) & _PAGE_RW) == 0) 276 - return; 277 - 278 - pte_update(mm, addr, ptep, _PAGE_RW, 0, 1); 279 - } 280 - 281 - /* 282 - * We currently remove entries from the hashtable regardless of whether 283 - * the entry was young or dirty. The generic routines only flush if the 284 - * entry was young or dirty which is not good enough. 285 - * 286 - * We should be more intelligent about this but for the moment we override 287 - * these functions and force a tlb flush unconditionally 288 - */ 289 - #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH 290 - #define ptep_clear_flush_young(__vma, __address, __ptep) \ 291 - ({ \ 292 - int __young = __ptep_test_and_clear_young((__vma)->vm_mm, __address, \ 293 - __ptep); \ 294 - __young; \ 295 - }) 296 - 297 - #define __HAVE_ARCH_PTEP_GET_AND_CLEAR 298 - static inline pte_t ptep_get_and_clear(struct mm_struct *mm, 299 - unsigned long addr, pte_t *ptep) 300 - { 301 - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); 302 - return __pte(old); 303 - } 304 - 305 - static inline void pte_clear(struct mm_struct *mm, unsigned long addr, 306 - pte_t * ptep) 307 - { 308 - pte_update(mm, addr, ptep, ~0UL, 0, 0); 309 - } 310 - 311 - 312 167 /* Set the dirty and/or accessed bits atomically in a linux PTE, this 313 168 * function doesn't need to flush the hash entry 314 169 */ 315 - static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) 170 + static inline void hash__ptep_set_access_flags(pte_t *ptep, pte_t entry) 316 171 { 317 - unsigned long bits = pte_val(entry) & 318 - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC | 319 - _PAGE_SOFT_DIRTY); 172 + __be64 old, tmp, val, mask; 320 173 321 - unsigned long old, tmp; 174 + mask = cpu_to_be64(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_READ | _PAGE_WRITE | 175 + _PAGE_EXEC | 
_PAGE_SOFT_DIRTY); 176 + 177 + val = pte_raw(entry) & mask; 322 178 323 179 __asm__ __volatile__( 324 180 "1: ldarx %0,0,%4\n\ 325 - andi. %1,%0,%6\n\ 181 + and. %1,%0,%6\n\ 326 182 bne- 1b \n\ 327 183 or %0,%3,%0\n\ 328 184 stdcx. %0,0,%4\n\ 329 185 bne- 1b" 330 186 :"=&r" (old), "=&r" (tmp), "=m" (*ptep) 331 - :"r" (bits), "r" (ptep), "m" (*ptep), "i" (_PAGE_BUSY) 187 + :"r" (val), "r" (ptep), "m" (*ptep), "r" (cpu_to_be64(H_PAGE_BUSY)) 332 188 :"cc"); 333 189 } 334 190 335 - static inline int pgd_bad(pgd_t pgd) 191 + static inline int hash__pte_same(pte_t pte_a, pte_t pte_b) 336 192 { 337 - return (pgd_val(pgd) == 0); 193 + return (((pte_raw(pte_a) ^ pte_raw(pte_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); 338 194 } 339 195 340 - #define __HAVE_ARCH_PTE_SAME 341 - #define pte_same(A,B) (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0) 342 - static inline unsigned long pgd_page_vaddr(pgd_t pgd) 196 + static inline int hash__pte_none(pte_t pte) 343 197 { 344 - return (unsigned long)__va(pgd_val(pgd) & ~PGD_MASKED_BITS); 345 - } 346 - 347 - 348 - /* Generic accessors to PTE bits */ 349 - static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} 350 - static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } 351 - static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } 352 - static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } 353 - static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; } 354 - static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } 355 - 356 - #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY 357 - static inline bool pte_soft_dirty(pte_t pte) 358 - { 359 - return !!(pte_val(pte) & _PAGE_SOFT_DIRTY); 360 - } 361 - static inline pte_t pte_mksoft_dirty(pte_t pte) 362 - { 363 - return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); 364 - } 365 - 366 - static inline pte_t pte_clear_soft_dirty(pte_t 
pte) 367 - { 368 - return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY); 369 - } 370 - #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ 371 - 372 - #ifdef CONFIG_NUMA_BALANCING 373 - /* 374 - * These work without NUMA balancing but the kernel does not care. See the 375 - * comment in include/asm-generic/pgtable.h . On powerpc, this will only 376 - * work for user pages and always return true for kernel pages. 377 - */ 378 - static inline int pte_protnone(pte_t pte) 379 - { 380 - return (pte_val(pte) & 381 - (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT; 382 - } 383 - #endif /* CONFIG_NUMA_BALANCING */ 384 - 385 - static inline int pte_present(pte_t pte) 386 - { 387 - return !!(pte_val(pte) & _PAGE_PRESENT); 388 - } 389 - 390 - /* Conversion functions: convert a page and protection to a page entry, 391 - * and a page entry and page directory to the page they refer to. 392 - * 393 - * Even if PTEs can be unsigned long long, a PFN is always an unsigned 394 - * long for now. 395 - */ 396 - static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) 397 - { 398 - return __pte((((pte_basic_t)(pfn) << PTE_RPN_SHIFT) & PTE_RPN_MASK) | 399 - pgprot_val(pgprot)); 400 - } 401 - 402 - static inline unsigned long pte_pfn(pte_t pte) 403 - { 404 - return (pte_val(pte) & PTE_RPN_MASK) >> PTE_RPN_SHIFT; 405 - } 406 - 407 - /* Generic modifiers for PTE bits */ 408 - static inline pte_t pte_wrprotect(pte_t pte) 409 - { 410 - return __pte(pte_val(pte) & ~_PAGE_RW); 411 - } 412 - 413 - static inline pte_t pte_mkclean(pte_t pte) 414 - { 415 - return __pte(pte_val(pte) & ~_PAGE_DIRTY); 416 - } 417 - 418 - static inline pte_t pte_mkold(pte_t pte) 419 - { 420 - return __pte(pte_val(pte) & ~_PAGE_ACCESSED); 421 - } 422 - 423 - static inline pte_t pte_mkwrite(pte_t pte) 424 - { 425 - return __pte(pte_val(pte) | _PAGE_RW); 426 - } 427 - 428 - static inline pte_t pte_mkdirty(pte_t pte) 429 - { 430 - return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY); 431 - } 432 - 433 - static inline pte_t 
pte_mkyoung(pte_t pte) 434 - { 435 - return __pte(pte_val(pte) | _PAGE_ACCESSED); 436 - } 437 - 438 - static inline pte_t pte_mkspecial(pte_t pte) 439 - { 440 - return __pte(pte_val(pte) | _PAGE_SPECIAL); 441 - } 442 - 443 - static inline pte_t pte_mkhuge(pte_t pte) 444 - { 445 - return pte; 446 - } 447 - 448 - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) 449 - { 450 - return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); 198 + return (pte_val(pte) & ~H_PTE_NONE_MASK) == 0; 451 199 } 452 200 453 201 /* This low level function performs the actual PTE insertion ··· 171 487 * an horrible mess that I'm not going to try to clean up now but 172 488 * I'm keeping it in one place rather than spread around 173 489 */ 174 - static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, 175 - pte_t *ptep, pte_t pte, int percpu) 490 + static inline void hash__set_pte_at(struct mm_struct *mm, unsigned long addr, 491 + pte_t *ptep, pte_t pte, int percpu) 176 492 { 177 493 /* 178 494 * Anything else just stores the PTE normally. That covers all 64-bit 179 495 * cases, and 32-bit non-hash with 32-bit PTEs. 180 496 */ 181 497 *ptep = pte; 182 - } 183 - 184 - /* 185 - * Macro to mark a page protection value as "uncacheable". 
186 - */ 187 - 188 - #define _PAGE_CACHE_CTL (_PAGE_COHERENT | _PAGE_GUARDED | _PAGE_NO_CACHE | \ 189 - _PAGE_WRITETHRU) 190 - 191 - #define pgprot_noncached pgprot_noncached 192 - static inline pgprot_t pgprot_noncached(pgprot_t prot) 193 - { 194 - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | 195 - _PAGE_NO_CACHE | _PAGE_GUARDED); 196 - } 197 - 198 - #define pgprot_noncached_wc pgprot_noncached_wc 199 - static inline pgprot_t pgprot_noncached_wc(pgprot_t prot) 200 - { 201 - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | 202 - _PAGE_NO_CACHE); 203 - } 204 - 205 - #define pgprot_cached pgprot_cached 206 - static inline pgprot_t pgprot_cached(pgprot_t prot) 207 - { 208 - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | 209 - _PAGE_COHERENT); 210 - } 211 - 212 - #define pgprot_cached_wthru pgprot_cached_wthru 213 - static inline pgprot_t pgprot_cached_wthru(pgprot_t prot) 214 - { 215 - return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | 216 - _PAGE_COHERENT | _PAGE_WRITETHRU); 217 - } 218 - 219 - #define pgprot_cached_noncoherent pgprot_cached_noncoherent 220 - static inline pgprot_t pgprot_cached_noncoherent(pgprot_t prot) 221 - { 222 - return __pgprot(pgprot_val(prot) & ~_PAGE_CACHE_CTL); 223 - } 224 - 225 - #define pgprot_writecombine pgprot_writecombine 226 - static inline pgprot_t pgprot_writecombine(pgprot_t prot) 227 - { 228 - return pgprot_noncached_wc(prot); 229 498 } 230 499 231 500 #ifdef CONFIG_TRANSPARENT_HUGEPAGE ··· 193 556 } 194 557 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 195 558 559 + 560 + extern int hash__map_kernel_page(unsigned long ea, unsigned long pa, 561 + unsigned long flags); 562 + extern int __meminit hash__vmemmap_create_mapping(unsigned long start, 563 + unsigned long page_size, 564 + unsigned long phys); 565 + extern void hash__vmemmap_remove_mapping(unsigned long start, 566 + unsigned long page_size); 196 567 #endif /* !__ASSEMBLY__ */ 197 568 #endif /* __KERNEL__ */ 198 569 #endif /* 
_ASM_POWERPC_BOOK3S_64_HASH_H */
+14
arch/powerpc/include/asm/book3s/64/hugetlb-radix.h
···
+#ifndef _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H
+#define _ASM_POWERPC_BOOK3S_64_HUGETLB_RADIX_H
+/*
+ * For radix we want generic code to handle hugetlb. But then if we want
+ * both hash and radix to be enabled together we need to work around the
+ * limitations.
+ */
+void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr);
+extern unsigned long
+radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+				 unsigned long len, unsigned long pgoff,
+				 unsigned long flags);
+#endif
+17 -62
arch/powerpc/include/asm/book3s/64/mmu-hash.h
···
-#ifndef _ASM_POWERPC_MMU_HASH64_H_
-#define _ASM_POWERPC_MMU_HASH64_H_
+#ifndef _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_
+#define _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_
 /*
  * PowerPC64 memory management structures
  *
···
 #define HPTE_V_SECONDARY	ASM_CONST(0x0000000000000002)
 #define HPTE_V_VALID		ASM_CONST(0x0000000000000001)
 
+/*
+ * ISA 3.0 has a different HPTE format.
+ */
+#define HPTE_R_3_0_SSIZE_SHIFT	58
 #define HPTE_R_PP0		ASM_CONST(0x8000000000000000)
 #define HPTE_R_TS		ASM_CONST(0x4000000000000000)
 #define HPTE_R_KEY_HI		ASM_CONST(0x3000000000000000)
···
 #define POWER7_TLB_SETS		128	/* # sets in POWER7 TLB */
 #define POWER8_TLB_SETS		512	/* # sets in POWER8 TLB */
 #define POWER9_TLB_SETS_HASH	256	/* # sets in POWER9 TLB Hash mode */
+#define POWER9_TLB_SETS_RADIX	128	/* # sets in POWER9 TLB Radix mode */
 
 #ifndef __ASSEMBLY__
 
···
 extern unsigned long htab_size_bytes;
 extern unsigned long htab_hash_mask;
 
-/*
- * Page size definition
- *
- * shift : is the "PAGE_SHIFT" value for that page size
- * sllp : is a bit mask with the value of SLB L || LP to be or'ed
- *        directly to a slbmte "vsid" value
- * penc : is the HPTE encoding mask for the "LP" field:
- *
- */
-struct mmu_psize_def
-{
-	unsigned int	shift;	/* number of bits */
-	int		penc[MMU_PAGE_COUNT];	/* HPTE encoding */
-	unsigned int	tlbiel;	/* tlbiel supported for that page size */
-	unsigned long	avpnm;	/* bits to mask out in AVPN in the HPTE */
-	unsigned long	sllp;	/* SLB L||LP (exact mask to use in slbmte) */
-};
-extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT];
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
···
 /*
  * The current system page and segment sizes
  */
-extern int mmu_linear_psize;
-extern int mmu_virtual_psize;
-extern int mmu_vmalloc_psize;
-extern int mmu_vmemmap_psize;
-extern int mmu_io_psize;
 extern int mmu_kernel_ssize;
 extern int mmu_highuser_ssize;
 extern u16 mmu_slb_size;
···
 	 */
 	v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm);
 	v <<= HPTE_V_AVPN_SHIFT;
-	v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
 	return v;
 }
···
  * aligned for the requested page size
  */
 static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize,
-					  int actual_psize)
+					  int actual_psize, int ssize)
 {
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		pa |= ((unsigned long) ssize) << HPTE_R_3_0_SSIZE_SHIFT;
+
 	/* A 4K page needs no special encoding */
 	if (actual_psize == MMU_PAGE_4K)
 		return pa & HPTE_R_RPN;
···
 	add	rt,rt,rx
 
 /* 4 bits per slice and we have one slice per 1TB */
-#define SLICE_ARRAY_SIZE	(PGTABLE_RANGE >> 41)
+#define SLICE_ARRAY_SIZE	(H_PGTABLE_RANGE >> 41)
 
 #ifndef __ASSEMBLY__
 
···
 static inline void subpage_prot_free(struct mm_struct *mm) {}
 static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
 #endif /* CONFIG_PPC_SUBPAGE_PROT */
-
-typedef unsigned long mm_context_id_t;
-struct spinlock;
-
-typedef struct {
-	mm_context_id_t id;
-	u16 user_psize;		/* page size index */
-
-#ifdef CONFIG_PPC_MM_SLICES
-	u64 low_slices_psize;	/* SLB page size encodings */
-	unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
-#else
-	u16 sllp;		/* SLB page size encoding */
-#endif
-	unsigned long vdso_base;
-#ifdef CONFIG_PPC_SUBPAGE_PROT
-	struct subpage_prot_table spt;
-#endif /* CONFIG_PPC_SUBPAGE_PROT */
-#ifdef CONFIG_PPC_ICSWX
-	struct spinlock *cop_lockp;	/* guard acop and cop_pid */
-	unsigned long acop;	/* mask of enabled coprocessor types */
-	unsigned int cop_pid;	/* pid value used with coprocessors */
-#endif /* CONFIG_PPC_ICSWX */
-#ifdef CONFIG_PPC_64K_PAGES
-	/* for 4K PTE fragment support */
-	void *pte_frag;
-#endif
-#ifdef CONFIG_SPAPR_TCE_IOMMU
-	struct list_head iommu_group_mem_list;
-#endif
-} mm_context_t;
-
 
 #if 0
 /*
···
 	/*
 	 * Bad address. We return VSID 0 for that
 	 */
-	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
+	if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE)
 		return 0;
 
 	if (ssize == MMU_SEGSIZE_256M)
···
 
 #endif /* __ASSEMBLY__ */
 
-#endif /* _ASM_POWERPC_MMU_HASH64_H_ */
+#endif /* _ASM_POWERPC_BOOK3S_64_MMU_HASH_H_ */
+137
arch/powerpc/include/asm/book3s/64/mmu.h
··· 1 + #ifndef _ASM_POWERPC_BOOK3S_64_MMU_H_ 2 + #define _ASM_POWERPC_BOOK3S_64_MMU_H_ 3 + 4 + #ifndef __ASSEMBLY__ 5 + /* 6 + * Page size definition 7 + * 8 + * shift : is the "PAGE_SHIFT" value for that page size 9 + * sllp : is a bit mask with the value of SLB L || LP to be or'ed 10 + * directly to a slbmte "vsid" value 11 + * penc : is the HPTE encoding mask for the "LP" field: 12 + * 13 + */ 14 + struct mmu_psize_def { 15 + unsigned int shift; /* number of bits */ 16 + int penc[MMU_PAGE_COUNT]; /* HPTE encoding */ 17 + unsigned int tlbiel; /* tlbiel supported for that page size */ 18 + unsigned long avpnm; /* bits to mask out in AVPN in the HPTE */ 19 + union { 20 + unsigned long sllp; /* SLB L||LP (exact mask to use in slbmte) */ 21 + unsigned long ap; /* Ap encoding used by PowerISA 3.0 */ 22 + }; 23 + }; 24 + extern struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT]; 25 + 26 + #define radix_enabled() mmu_has_feature(MMU_FTR_RADIX) 27 + 28 + #endif /* __ASSEMBLY__ */ 29 + 30 + /* 64-bit classic hash table MMU */ 31 + #include <asm/book3s/64/mmu-hash.h> 32 + 33 + #ifndef __ASSEMBLY__ 34 + /* 35 + * ISA 3.0 partiton and process table entry format 36 + */ 37 + struct prtb_entry { 38 + __be64 prtb0; 39 + __be64 prtb1; 40 + }; 41 + extern struct prtb_entry *process_tb; 42 + 43 + struct patb_entry { 44 + __be64 patb0; 45 + __be64 patb1; 46 + }; 47 + extern struct patb_entry *partition_tb; 48 + 49 + #define PATB_HR (1UL << 63) 50 + #define PATB_GR (1UL << 63) 51 + #define RPDB_MASK 0x0ffffffffffff00fUL 52 + #define RPDB_SHIFT (1UL << 8) 53 + /* 54 + * Limit process table to PAGE_SIZE table. This 55 + * also limit the max pid we can support. 56 + * MAX_USER_CONTEXT * 16 bytes of space. 57 + */ 58 + #define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4) 59 + /* 60 + * Power9 currently only support 64K partition table size. 
61 + */ 62 + #define PATB_SIZE_SHIFT 16 63 + 64 + typedef unsigned long mm_context_id_t; 65 + struct spinlock; 66 + 67 + typedef struct { 68 + mm_context_id_t id; 69 + u16 user_psize; /* page size index */ 70 + 71 + #ifdef CONFIG_PPC_MM_SLICES 72 + u64 low_slices_psize; /* SLB page size encodings */ 73 + unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; 74 + #else 75 + u16 sllp; /* SLB page size encoding */ 76 + #endif 77 + unsigned long vdso_base; 78 + #ifdef CONFIG_PPC_SUBPAGE_PROT 79 + struct subpage_prot_table spt; 80 + #endif /* CONFIG_PPC_SUBPAGE_PROT */ 81 + #ifdef CONFIG_PPC_ICSWX 82 + struct spinlock *cop_lockp; /* guard acop and cop_pid */ 83 + unsigned long acop; /* mask of enabled coprocessor types */ 84 + unsigned int cop_pid; /* pid value used with coprocessors */ 85 + #endif /* CONFIG_PPC_ICSWX */ 86 + #ifdef CONFIG_PPC_64K_PAGES 87 + /* for 4K PTE fragment support */ 88 + void *pte_frag; 89 + #endif 90 + #ifdef CONFIG_SPAPR_TCE_IOMMU 91 + struct list_head iommu_group_mem_list; 92 + #endif 93 + } mm_context_t; 94 + 95 + /* 96 + * The current system page and segment sizes 97 + */ 98 + extern int mmu_linear_psize; 99 + extern int mmu_virtual_psize; 100 + extern int mmu_vmalloc_psize; 101 + extern int mmu_vmemmap_psize; 102 + extern int mmu_io_psize; 103 + 104 + /* MMU initialization */ 105 + extern void radix_init_native(void); 106 + extern void hash__early_init_mmu(void); 107 + extern void radix__early_init_mmu(void); 108 + static inline void early_init_mmu(void) 109 + { 110 + if (radix_enabled()) 111 + return radix__early_init_mmu(); 112 + return hash__early_init_mmu(); 113 + } 114 + extern void hash__early_init_mmu_secondary(void); 115 + extern void radix__early_init_mmu_secondary(void); 116 + static inline void early_init_mmu_secondary(void) 117 + { 118 + if (radix_enabled()) 119 + return radix__early_init_mmu_secondary(); 120 + return hash__early_init_mmu_secondary(); 121 + } 122 + 123 + extern void hash__setup_initial_memory_limit(phys_addr_t 
first_memblock_base, 124 + phys_addr_t first_memblock_size); 125 + extern void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, 126 + phys_addr_t first_memblock_size); 127 + static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, 128 + phys_addr_t first_memblock_size) 129 + { 130 + if (radix_enabled()) 131 + return radix__setup_initial_memory_limit(first_memblock_base, 132 + first_memblock_size); 133 + return hash__setup_initial_memory_limit(first_memblock_base, 134 + first_memblock_size); 135 + } 136 + #endif /* __ASSEMBLY__ */ 137 + #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */
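The `early_init_mmu()`, `early_init_mmu_secondary()`, and `setup_initial_memory_limit()` wrappers added above all follow one pattern: a single runtime feature test, `radix_enabled()`, selects between the `hash__*` and `radix__*` backends. A simplified, self-contained sketch of that dispatch pattern (all symbols here are illustrative stand-ins, not the kernel's):

```c
#include <assert.h>
#include <stdbool.h>

/* Illustrative stand-in for mmu_has_feature(MMU_FTR_RADIX). */
static bool mmu_ftr_radix;
static inline bool radix_enabled(void) { return mmu_ftr_radix; }

static int hash__early_init_mmu(void)  { return 1; /* hash path taken */ }
static int radix__early_init_mmu(void) { return 2; /* radix path taken */ }

/* One branch at the call site picks the backend, mirroring the
 * static-inline wrappers in book3s/64/mmu.h. */
static inline int early_init_mmu(void)
{
	if (radix_enabled())
		return radix__early_init_mmu();
	return hash__early_init_mmu();
}
```

Because the wrappers are `static inline` and the feature test is a single bit check, the compiler can keep the dispatch overhead to one predictable branch on every call.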
+207
arch/powerpc/include/asm/book3s/64/pgalloc.h
··· 1 + #ifndef _ASM_POWERPC_BOOK3S_64_PGALLOC_H 2 + #define _ASM_POWERPC_BOOK3S_64_PGALLOC_H 3 + /* 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; either version 7 + * 2 of the License, or (at your option) any later version. 8 + */ 9 + 10 + #include <linux/slab.h> 11 + #include <linux/cpumask.h> 12 + #include <linux/percpu.h> 13 + 14 + struct vmemmap_backing { 15 + struct vmemmap_backing *list; 16 + unsigned long phys; 17 + unsigned long virt_addr; 18 + }; 19 + extern struct vmemmap_backing *vmemmap_list; 20 + 21 + /* 22 + * Functions that deal with pagetables that could be at any level of 23 + * the table need to be passed an "index_size" so they know how to 24 + * handle allocation. For PTE pages (which are linked to a struct 25 + * page for now, and drawn from the main get_free_pages() pool), the 26 + * allocation size will be (2^index_size * sizeof(pointer)) and 27 + * allocations are drawn from the kmem_cache in PGT_CACHE(index_size). 28 + * 29 + * The maximum index size needs to be big enough to allow any 30 + * pagetable sizes we need, but small enough to fit in the low bits of 31 + * any page table pointer. In other words all pagetables, even tiny 32 + * ones, must be aligned to allow at least enough low 0 bits to 33 + * contain this value. This value is also used as a mask, so it must 34 + * be one less than a power of two. 
35 + */ 36 + #define MAX_PGTABLE_INDEX_SIZE 0xf 37 + 38 + extern struct kmem_cache *pgtable_cache[]; 39 + #define PGT_CACHE(shift) ({ \ 40 + BUG_ON(!(shift)); \ 41 + pgtable_cache[(shift) - 1]; \ 42 + }) 43 + 44 + #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO 45 + 46 + extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); 47 + extern void pte_fragment_free(unsigned long *, int); 48 + extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); 49 + #ifdef CONFIG_SMP 50 + extern void __tlb_remove_table(void *_table); 51 + #endif 52 + 53 + static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm) 54 + { 55 + #ifdef CONFIG_PPC_64K_PAGES 56 + return (pgd_t *)__get_free_page(PGALLOC_GFP); 57 + #else 58 + struct page *page; 59 + page = alloc_pages(PGALLOC_GFP, 4); 60 + if (!page) 61 + return NULL; 62 + return (pgd_t *) page_address(page); 63 + #endif 64 + } 65 + 66 + static inline void radix__pgd_free(struct mm_struct *mm, pgd_t *pgd) 67 + { 68 + #ifdef CONFIG_PPC_64K_PAGES 69 + free_page((unsigned long)pgd); 70 + #else 71 + free_pages((unsigned long)pgd, 4); 72 + #endif 73 + } 74 + 75 + static inline pgd_t *pgd_alloc(struct mm_struct *mm) 76 + { 77 + if (radix_enabled()) 78 + return radix__pgd_alloc(mm); 79 + return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE), GFP_KERNEL); 80 + } 81 + 82 + static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) 83 + { 84 + if (radix_enabled()) 85 + return radix__pgd_free(mm, pgd); 86 + kmem_cache_free(PGT_CACHE(PGD_INDEX_SIZE), pgd); 87 + } 88 + 89 + static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) 90 + { 91 + pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS); 92 + } 93 + 94 + static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) 95 + { 96 + return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), 97 + GFP_KERNEL|__GFP_REPEAT); 98 + } 99 + 100 + static inline void pud_free(struct mm_struct *mm, pud_t *pud) 101 + { 102 
+ kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); 103 + } 104 + 105 + static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) 106 + { 107 + pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS); 108 + } 109 + 110 + static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, 111 + unsigned long address) 112 + { 113 + pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE); 114 + } 115 + 116 + static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) 117 + { 118 + return kmem_cache_alloc(PGT_CACHE(PMD_CACHE_INDEX), 119 + GFP_KERNEL|__GFP_REPEAT); 120 + } 121 + 122 + static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) 123 + { 124 + kmem_cache_free(PGT_CACHE(PMD_CACHE_INDEX), pmd); 125 + } 126 + 127 + static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, 128 + unsigned long address) 129 + { 130 + return pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX); 131 + } 132 + 133 + static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, 134 + pte_t *pte) 135 + { 136 + pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS); 137 + } 138 + 139 + static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, 140 + pgtable_t pte_page) 141 + { 142 + pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS); 143 + } 144 + 145 + static inline pgtable_t pmd_pgtable(pmd_t pmd) 146 + { 147 + return (pgtable_t)pmd_page_vaddr(pmd); 148 + } 149 + 150 + #ifdef CONFIG_PPC_4K_PAGES 151 + static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, 152 + unsigned long address) 153 + { 154 + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); 155 + } 156 + 157 + static inline pgtable_t pte_alloc_one(struct mm_struct *mm, 158 + unsigned long address) 159 + { 160 + struct page *page; 161 + pte_t *pte; 162 + 163 + pte = pte_alloc_one_kernel(mm, address); 164 + if (!pte) 165 + return NULL; 166 + page = virt_to_page(pte); 167 + if (!pgtable_page_ctor(page)) { 168 + __free_page(page); 169 + return NULL; 170 
+ } 171 + return pte; 172 + } 173 + #else /* if CONFIG_PPC_64K_PAGES */ 174 + 175 + static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, 176 + unsigned long address) 177 + { 178 + return (pte_t *)pte_fragment_alloc(mm, address, 1); 179 + } 180 + 181 + static inline pgtable_t pte_alloc_one(struct mm_struct *mm, 182 + unsigned long address) 183 + { 184 + return (pgtable_t)pte_fragment_alloc(mm, address, 0); 185 + } 186 + #endif 187 + 188 + static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) 189 + { 190 + pte_fragment_free((unsigned long *)pte, 1); 191 + } 192 + 193 + static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) 194 + { 195 + pte_fragment_free((unsigned long *)ptepage, 0); 196 + } 197 + 198 + static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, 199 + unsigned long address) 200 + { 201 + tlb_flush_pgtable(tlb, address); 202 + pgtable_free_tlb(tlb, table, 0); 203 + } 204 + 205 + #define check_pgt_cache() do { } while (0) 206 + 207 + #endif /* _ASM_POWERPC_BOOK3S_64_PGALLOC_H */
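The `MAX_PGTABLE_INDEX_SIZE` comment above describes a tagging trick: every page table is aligned well enough that its low bits are zero, so the index size can be stashed in those bits and recovered with a mask. A minimal sketch of that idea under stated assumptions (simplified; the kernel combines this with per-size kmem caches and RCU table freeing):

```c
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* Must be one less than a power of two, as the header requires. */
#define MAX_PGTABLE_INDEX_SIZE 0xf

/* Tag a sufficiently aligned table pointer with its index size. */
static void *tag_table(void *table, unsigned int index_size)
{
	assert(((uintptr_t)table & MAX_PGTABLE_INDEX_SIZE) == 0);
	assert(index_size <= MAX_PGTABLE_INDEX_SIZE);
	return (void *)((uintptr_t)table | index_size);
}

/* Recover the index size from the low bits... */
static unsigned int table_index_size(void *tagged)
{
	return (uintptr_t)tagged & MAX_PGTABLE_INDEX_SIZE;
}

/* ...and the original pointer by masking them off. */
static void *table_ptr(void *tagged)
{
	return (void *)((uintptr_t)tagged & ~(uintptr_t)MAX_PGTABLE_INDEX_SIZE);
}
```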
+53
arch/powerpc/include/asm/book3s/64/pgtable-4k.h
··· 1 + #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H 2 + #define _ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H 3 + /* 4 + * hash 4k can't share hugetlb and also doesn't support THP 5 + */ 6 + #ifndef __ASSEMBLY__ 7 + #ifdef CONFIG_HUGETLB_PAGE 8 + static inline int pmd_huge(pmd_t pmd) 9 + { 10 + /* 11 + * leaf pte for huge page 12 + */ 13 + if (radix_enabled()) 14 + return !!(pmd_val(pmd) & _PAGE_PTE); 15 + return 0; 16 + } 17 + 18 + static inline int pud_huge(pud_t pud) 19 + { 20 + /* 21 + * leaf pte for huge page 22 + */ 23 + if (radix_enabled()) 24 + return !!(pud_val(pud) & _PAGE_PTE); 25 + return 0; 26 + } 27 + 28 + static inline int pgd_huge(pgd_t pgd) 29 + { 30 + /* 31 + * leaf pte for huge page 32 + */ 33 + if (radix_enabled()) 34 + return !!(pgd_val(pgd) & _PAGE_PTE); 35 + return 0; 36 + } 37 + #define pgd_huge pgd_huge 38 + /* 39 + * With radix , we have hugepage ptes in the pud and pmd entries. We don't 40 + * need to setup hugepage directory for them. Our pte and page directory format 41 + * enable us to have this enabled. 42 + */ 43 + static inline int hugepd_ok(hugepd_t hpd) 44 + { 45 + if (radix_enabled()) 46 + return 0; 47 + return hash__hugepd_ok(hpd); 48 + } 49 + #define is_hugepd(hpd) (hugepd_ok(hpd)) 50 + #endif /* CONFIG_HUGETLB_PAGE */ 51 + #endif /* __ASSEMBLY__ */ 52 + 53 + #endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H */
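As the `pmd_huge()`/`pud_huge()`/`pgd_huge()` helpers above show, radix needs no separate hugepage directory: a huge mapping is simply a leaf entry at a higher level of the tree, flagged by `_PAGE_PTE`. The check reduces to one bit test; a standalone sketch of that check, with a simplified `pmd_t` (the `_PAGE_PTE` value matches the definition later in this patch, everything else is illustrative):

```c
#include <assert.h>
#include <stdbool.h>

#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */

typedef struct { unsigned long pmd; } pmd_t;
static inline unsigned long pmd_val(pmd_t p) { return p.pmd; }

/* A radix PMD entry maps a huge page iff it is a leaf PTE rather
 * than a pointer to a lower-level table. */
static inline bool pmd_is_leaf(pmd_t pmd)
{
	return !!(pmd_val(pmd) & _PAGE_PTE);
}
```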
+64
arch/powerpc/include/asm/book3s/64/pgtable-64k.h
··· 1 + #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H 2 + #define _ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H 3 + 4 + #ifndef __ASSEMBLY__ 5 + #ifdef CONFIG_HUGETLB_PAGE 6 + /* 7 + * We have PGD_INDEX_SIZE = 12 and PTE_INDEX_SIZE = 8, so that we can have 8 + * 16GB hugepage pte in PGD and 16MB hugepage pte at PMD; 9 + * 10 + * Defined in such a way that we can optimize away the code block at build time 11 + * if CONFIG_HUGETLB_PAGE=n. 12 + */ 13 + static inline int pmd_huge(pmd_t pmd) 14 + { 15 + /* 16 + * leaf pte for huge page 17 + */ 18 + return !!(pmd_val(pmd) & _PAGE_PTE); 19 + } 20 + 21 + static inline int pud_huge(pud_t pud) 22 + { 23 + /* 24 + * leaf pte for huge page 25 + */ 26 + return !!(pud_val(pud) & _PAGE_PTE); 27 + } 28 + 29 + static inline int pgd_huge(pgd_t pgd) 30 + { 31 + /* 32 + * leaf pte for huge page 33 + */ 34 + return !!(pgd_val(pgd) & _PAGE_PTE); 35 + } 36 + #define pgd_huge pgd_huge 37 + 38 + #ifdef CONFIG_DEBUG_VM 39 + extern int hugepd_ok(hugepd_t hpd); 40 + #define is_hugepd(hpd) (hugepd_ok(hpd)) 41 + #else 42 + /* 43 + * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't 44 + * need to set up a hugepage directory for them. Our pte and page directory format 45 + * enables this. 46 + */ 47 + static inline int hugepd_ok(hugepd_t hpd) 48 + { 49 + return 0; 50 + } 51 + #define is_hugepd(pdep) 0 52 + #endif /* CONFIG_DEBUG_VM */ 53 + 54 + #endif /* CONFIG_HUGETLB_PAGE */ 55 + 56 + static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, 57 + unsigned long pfn, pgprot_t prot) 58 + { 59 + if (radix_enabled()) 60 + BUG(); 61 + return hash__remap_4k_pfn(vma, addr, pfn, prot); 62 + } 63 + #endif /* __ASSEMBLY__ */ 64 + #endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_64K_H */
+743 -83
arch/powerpc/include/asm/book3s/64/pgtable.h
··· 1 1 #ifndef _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ 2 2 #define _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ 3 + 3 4 /* 4 - * This file contains the functions and defines necessary to modify and use 5 - * the ppc64 hashed page table. 5 + * Common bits between hash and Radix page table 6 6 */ 7 + #define _PAGE_BIT_SWAP_TYPE 0 8 + 9 + #define _PAGE_EXEC 0x00001 /* execute permission */ 10 + #define _PAGE_WRITE 0x00002 /* write access allowed */ 11 + #define _PAGE_READ 0x00004 /* read access allowed */ 12 + #define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) 13 + #define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) 14 + #define _PAGE_PRIVILEGED 0x00008 /* kernel access only */ 15 + #define _PAGE_SAO 0x00010 /* Strong access order */ 16 + #define _PAGE_NON_IDEMPOTENT 0x00020 /* non idempotent memory */ 17 + #define _PAGE_TOLERANT 0x00030 /* tolerant memory, cache inhibited */ 18 + #define _PAGE_DIRTY 0x00080 /* C: page changed */ 19 + #define _PAGE_ACCESSED 0x00100 /* R: page referenced */ 20 + /* 21 + * Software bits 22 + */ 23 + #define _RPAGE_SW0 0x2000000000000000UL 24 + #define _RPAGE_SW1 0x00800 25 + #define _RPAGE_SW2 0x00400 26 + #define _RPAGE_SW3 0x00200 27 + #ifdef CONFIG_MEM_SOFT_DIRTY 28 + #define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ 29 + #else 30 + #define _PAGE_SOFT_DIRTY 0x00000 31 + #endif 32 + #define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ 33 + 34 + 35 + #define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ 36 + #define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ 37 + /* 38 + * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE 39 + * Instead of fixing all of them, add an alternate define which 40 + * maps CI pte mapping. 41 + */ 42 + #define _PAGE_NO_CACHE _PAGE_TOLERANT 43 + /* 44 + * We support 57 bit real address in pte. 
Clear everything above 57, and 45 + * every thing below PAGE_SHIFT; 46 + */ 47 + #define PTE_RPN_MASK (((1UL << 57) - 1) & (PAGE_MASK)) 48 + /* 49 + * set of bits not changed in pmd_modify. Even though we have hash specific bits 50 + * in here, on radix we expect them to be zero. 51 + */ 52 + #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ 53 + _PAGE_ACCESSED | H_PAGE_THP_HUGE | _PAGE_PTE | \ 54 + _PAGE_SOFT_DIRTY) 55 + /* 56 + * user access blocked by key 57 + */ 58 + #define _PAGE_KERNEL_RW (_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY) 59 + #define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ) 60 + #define _PAGE_KERNEL_RWX (_PAGE_PRIVILEGED | _PAGE_DIRTY | \ 61 + _PAGE_RW | _PAGE_EXEC) 62 + /* 63 + * No page size encoding in the linux PTE 64 + */ 65 + #define _PAGE_PSIZE 0 66 + /* 67 + * _PAGE_CHG_MASK masks of bits that are to be preserved across 68 + * pgprot changes 69 + */ 70 + #define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ 71 + _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE | \ 72 + _PAGE_SOFT_DIRTY) 73 + /* 74 + * Mask of bits returned by pte_pgprot() 75 + */ 76 + #define PAGE_PROT_BITS (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \ 77 + H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \ 78 + _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY | _PAGE_EXEC | \ 79 + _PAGE_SOFT_DIRTY) 80 + /* 81 + * We define 2 sets of base prot bits, one for basic pages (ie, 82 + * cacheable kernel and user pages) and one for non cacheable 83 + * pages. We always set _PAGE_COHERENT when SMP is enabled or 84 + * the processor might need it for DMA coherency. 
85 + */ 86 + #define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) 87 + #define _PAGE_BASE (_PAGE_BASE_NC) 88 + 89 + /* Permission masks used to generate the __P and __S table, 90 + * 91 + * Note:__pgprot is defined in arch/powerpc/include/asm/page.h 92 + * 93 + * Write permissions imply read permissions for now (we could make write-only 94 + * pages on BookE but we don't bother for now). Execute permission control is 95 + * possible on platforms that define _PAGE_EXEC 96 + * 97 + * Note due to the way vm flags are laid out, the bits are XWR 98 + */ 99 + #define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED) 100 + #define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) 101 + #define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_RW | _PAGE_EXEC) 102 + #define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_READ) 103 + #define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) 104 + #define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ) 105 + #define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) 106 + 107 + #define __P000 PAGE_NONE 108 + #define __P001 PAGE_READONLY 109 + #define __P010 PAGE_COPY 110 + #define __P011 PAGE_COPY 111 + #define __P100 PAGE_READONLY_X 112 + #define __P101 PAGE_READONLY_X 113 + #define __P110 PAGE_COPY_X 114 + #define __P111 PAGE_COPY_X 115 + 116 + #define __S000 PAGE_NONE 117 + #define __S001 PAGE_READONLY 118 + #define __S010 PAGE_SHARED 119 + #define __S011 PAGE_SHARED 120 + #define __S100 PAGE_READONLY_X 121 + #define __S101 PAGE_READONLY_X 122 + #define __S110 PAGE_SHARED_X 123 + #define __S111 PAGE_SHARED_X 124 + 125 + /* Permission masks used for kernel mappings */ 126 + #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) 127 + #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ 128 + _PAGE_TOLERANT) 129 + #define PAGE_KERNEL_NCG __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | \ 130 + _PAGE_NON_IDEMPOTENT) 131 + #define PAGE_KERNEL_X __pgprot(_PAGE_BASE | _PAGE_KERNEL_RWX) 132 + #define 
PAGE_KERNEL_RO __pgprot(_PAGE_BASE | _PAGE_KERNEL_RO) 133 + #define PAGE_KERNEL_ROX __pgprot(_PAGE_BASE | _PAGE_KERNEL_ROX) 134 + 135 + /* 136 + * Protection used for kernel text. We want the debuggers to be able to 137 + * set breakpoints anywhere, so don't write protect the kernel text 138 + * on platforms where such control is possible. 139 + */ 140 + #if defined(CONFIG_KGDB) || defined(CONFIG_XMON) || defined(CONFIG_BDI_SWITCH) || \ 141 + defined(CONFIG_KPROBES) || defined(CONFIG_DYNAMIC_FTRACE) 142 + #define PAGE_KERNEL_TEXT PAGE_KERNEL_X 143 + #else 144 + #define PAGE_KERNEL_TEXT PAGE_KERNEL_ROX 145 + #endif 146 + 147 + /* Make modules code happy. We don't set RO yet */ 148 + #define PAGE_KERNEL_EXEC PAGE_KERNEL_X 149 + #define PAGE_AGP (PAGE_KERNEL_NC) 150 + 151 + #ifndef __ASSEMBLY__ 152 + /* 153 + * page table defines 154 + */ 155 + extern unsigned long __pte_index_size; 156 + extern unsigned long __pmd_index_size; 157 + extern unsigned long __pud_index_size; 158 + extern unsigned long __pgd_index_size; 159 + extern unsigned long __pmd_cache_index; 160 + #define PTE_INDEX_SIZE __pte_index_size 161 + #define PMD_INDEX_SIZE __pmd_index_size 162 + #define PUD_INDEX_SIZE __pud_index_size 163 + #define PGD_INDEX_SIZE __pgd_index_size 164 + #define PMD_CACHE_INDEX __pmd_cache_index 165 + /* 166 + * Because of use of pte fragments and THP, size of page table 167 + * are not always derived out of index size above. 
168 + */ 169 + extern unsigned long __pte_table_size; 170 + extern unsigned long __pmd_table_size; 171 + extern unsigned long __pud_table_size; 172 + extern unsigned long __pgd_table_size; 173 + #define PTE_TABLE_SIZE __pte_table_size 174 + #define PMD_TABLE_SIZE __pmd_table_size 175 + #define PUD_TABLE_SIZE __pud_table_size 176 + #define PGD_TABLE_SIZE __pgd_table_size 177 + 178 + extern unsigned long __pmd_val_bits; 179 + extern unsigned long __pud_val_bits; 180 + extern unsigned long __pgd_val_bits; 181 + #define PMD_VAL_BITS __pmd_val_bits 182 + #define PUD_VAL_BITS __pud_val_bits 183 + #define PGD_VAL_BITS __pgd_val_bits 184 + 185 + extern unsigned long __pte_frag_nr; 186 + #define PTE_FRAG_NR __pte_frag_nr 187 + extern unsigned long __pte_frag_size_shift; 188 + #define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift 189 + #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT) 190 + /* 191 + * Pgtable size used by swapper, init in asm code 192 + */ 193 + #define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE) 194 + 195 + #define PTRS_PER_PTE (1 << PTE_INDEX_SIZE) 196 + #define PTRS_PER_PMD (1 << PMD_INDEX_SIZE) 197 + #define PTRS_PER_PUD (1 << PUD_INDEX_SIZE) 198 + #define PTRS_PER_PGD (1 << PGD_INDEX_SIZE) 199 + 200 + /* PMD_SHIFT determines what a second-level page table entry can map */ 201 + #define PMD_SHIFT (PAGE_SHIFT + PTE_INDEX_SIZE) 202 + #define PMD_SIZE (1UL << PMD_SHIFT) 203 + #define PMD_MASK (~(PMD_SIZE-1)) 204 + 205 + /* PUD_SHIFT determines what a third-level page table entry can map */ 206 + #define PUD_SHIFT (PMD_SHIFT + PMD_INDEX_SIZE) 207 + #define PUD_SIZE (1UL << PUD_SHIFT) 208 + #define PUD_MASK (~(PUD_SIZE-1)) 209 + 210 + /* PGDIR_SHIFT determines what a fourth-level page table entry can map */ 211 + #define PGDIR_SHIFT (PUD_SHIFT + PUD_INDEX_SIZE) 212 + #define PGDIR_SIZE (1UL << PGDIR_SHIFT) 213 + #define PGDIR_MASK (~(PGDIR_SIZE-1)) 214 + 215 + /* Bits to mask out from a PMD to get to the PTE page */ 216 + #define PMD_MASKED_BITS 
0xc0000000000000ffUL 217 + /* Bits to mask out from a PUD to get to the PMD page */ 218 + #define PUD_MASKED_BITS 0xc0000000000000ffUL 219 + /* Bits to mask out from a PGD to get to the PUD page */ 220 + #define PGD_MASKED_BITS 0xc0000000000000ffUL 221 + 222 + extern unsigned long __vmalloc_start; 223 + extern unsigned long __vmalloc_end; 224 + #define VMALLOC_START __vmalloc_start 225 + #define VMALLOC_END __vmalloc_end 226 + 227 + extern unsigned long __kernel_virt_start; 228 + extern unsigned long __kernel_virt_size; 229 + #define KERN_VIRT_START __kernel_virt_start 230 + #define KERN_VIRT_SIZE __kernel_virt_size 231 + extern struct page *vmemmap; 232 + extern unsigned long ioremap_bot; 233 + #endif /* __ASSEMBLY__ */ 7 234 8 235 #include <asm/book3s/64/hash.h> 9 - #include <asm/barrier.h> 236 + #include <asm/book3s/64/radix.h> 10 237 238 + #ifdef CONFIG_PPC_64K_PAGES 239 + #include <asm/book3s/64/pgtable-64k.h> 240 + #else 241 + #include <asm/book3s/64/pgtable-4k.h> 242 + #endif 243 + 244 + #include <asm/barrier.h> 11 245 /* 12 246 * The second half of the kernel virtual space is used for IO mappings, 13 247 * it's itself carved into the PIO region (ISA and PHB IO space) and ··· 260 26 #define IOREMAP_BASE (PHB_IO_END) 261 27 #define IOREMAP_END (KERN_VIRT_START + KERN_VIRT_SIZE) 262 28 263 - #define vmemmap ((struct page *)VMEMMAP_BASE) 264 - 265 29 /* Advertise special mapping type for AGP */ 266 30 #define HAVE_PAGE_AGP 267 31 ··· 277 45 278 46 #define __real_pte(e,p) ((real_pte_t){(e)}) 279 47 #define __rpte_to_pte(r) ((r).pte) 280 - #define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >>_PAGE_F_GIX_SHIFT) 48 + #define __rpte_to_hidx(r,index) (pte_val(__rpte_to_pte(r)) >> H_PAGE_F_GIX_SHIFT) 281 49 282 50 #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift) \ 283 51 do { \ ··· 294 62 295 63 #endif /* __real_pte */ 296 64 65 + static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr, 66 + pte_t *ptep, unsigned long 
clr, 67 + unsigned long set, int huge) 68 + { 69 + if (radix_enabled()) 70 + return radix__pte_update(mm, addr, ptep, clr, set, huge); 71 + return hash__pte_update(mm, addr, ptep, clr, set, huge); 72 + } 73 + /* 74 + * For hash even if we have _PAGE_ACCESSED = 0, we do a pte_update. 75 + * We currently remove entries from the hashtable regardless of whether 76 + * the entry was young or dirty. 77 + * 78 + * We should be more intelligent about this but for the moment we override 79 + * these functions and force a tlb flush unconditionally 80 + * For radix: H_PAGE_HASHPTE should be zero. Hence we can use the same 81 + * function for both hash and radix. 82 + */ 83 + static inline int __ptep_test_and_clear_young(struct mm_struct *mm, 84 + unsigned long addr, pte_t *ptep) 85 + { 86 + unsigned long old; 87 + 88 + if ((pte_val(*ptep) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) 89 + return 0; 90 + old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); 91 + return (old & _PAGE_ACCESSED) != 0; 92 + } 93 + 94 + #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG 95 + #define ptep_test_and_clear_young(__vma, __addr, __ptep) \ 96 + ({ \ 97 + int __r; \ 98 + __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \ 99 + __r; \ 100 + }) 101 + 102 + #define __HAVE_ARCH_PTEP_SET_WRPROTECT 103 + static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, 104 + pte_t *ptep) 105 + { 106 + 107 + if ((pte_val(*ptep) & _PAGE_WRITE) == 0) 108 + return; 109 + 110 + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0); 111 + } 112 + 113 + static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, 114 + unsigned long addr, pte_t *ptep) 115 + { 116 + if ((pte_val(*ptep) & _PAGE_WRITE) == 0) 117 + return; 118 + 119 + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1); 120 + } 121 + 122 + #define __HAVE_ARCH_PTEP_GET_AND_CLEAR 123 + static inline pte_t ptep_get_and_clear(struct mm_struct *mm, 124 + unsigned long addr, pte_t *ptep) 125 + { 126 + unsigned long old = 
pte_update(mm, addr, ptep, ~0UL, 0, 0); 127 + return __pte(old); 128 + } 129 + 130 + static inline void pte_clear(struct mm_struct *mm, unsigned long addr, 131 + pte_t * ptep) 132 + { 133 + pte_update(mm, addr, ptep, ~0UL, 0, 0); 134 + } 135 + static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_WRITE);} 136 + static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } 137 + static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } 138 + static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } 139 + static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } 140 + 141 + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY 142 + static inline bool pte_soft_dirty(pte_t pte) 143 + { 144 + return !!(pte_val(pte) & _PAGE_SOFT_DIRTY); 145 + } 146 + static inline pte_t pte_mksoft_dirty(pte_t pte) 147 + { 148 + return __pte(pte_val(pte) | _PAGE_SOFT_DIRTY); 149 + } 150 + 151 + static inline pte_t pte_clear_soft_dirty(pte_t pte) 152 + { 153 + return __pte(pte_val(pte) & ~_PAGE_SOFT_DIRTY); 154 + } 155 + #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ 156 + 157 + #ifdef CONFIG_NUMA_BALANCING 158 + /* 159 + * These work without NUMA balancing but the kernel does not care. See the 160 + * comment in include/asm-generic/pgtable.h . On powerpc, this will only 161 + * work for user pages and always return true for kernel pages. 162 + */ 163 + static inline int pte_protnone(pte_t pte) 164 + { 165 + return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PRIVILEGED)) == 166 + (_PAGE_PRESENT | _PAGE_PRIVILEGED); 167 + } 168 + #endif /* CONFIG_NUMA_BALANCING */ 169 + 170 + static inline int pte_present(pte_t pte) 171 + { 172 + return !!(pte_val(pte) & _PAGE_PRESENT); 173 + } 174 + /* 175 + * Conversion functions: convert a page and protection to a page entry, 176 + * and a page entry and page directory to the page they refer to. 
177 + * 178 + * Even if PTEs can be unsigned long long, a PFN is always an unsigned 179 + * long for now. 180 + */ 181 + static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) 182 + { 183 + return __pte((((pte_basic_t)(pfn) << PAGE_SHIFT) & PTE_RPN_MASK) | 184 + pgprot_val(pgprot)); 185 + } 186 + 187 + static inline unsigned long pte_pfn(pte_t pte) 188 + { 189 + return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT; 190 + } 191 + 192 + /* Generic modifiers for PTE bits */ 193 + static inline pte_t pte_wrprotect(pte_t pte) 194 + { 195 + return __pte(pte_val(pte) & ~_PAGE_WRITE); 196 + } 197 + 198 + static inline pte_t pte_mkclean(pte_t pte) 199 + { 200 + return __pte(pte_val(pte) & ~_PAGE_DIRTY); 201 + } 202 + 203 + static inline pte_t pte_mkold(pte_t pte) 204 + { 205 + return __pte(pte_val(pte) & ~_PAGE_ACCESSED); 206 + } 207 + 208 + static inline pte_t pte_mkwrite(pte_t pte) 209 + { 210 + /* 211 + * write implies read, hence set both 212 + */ 213 + return __pte(pte_val(pte) | _PAGE_RW); 214 + } 215 + 216 + static inline pte_t pte_mkdirty(pte_t pte) 217 + { 218 + return __pte(pte_val(pte) | _PAGE_DIRTY | _PAGE_SOFT_DIRTY); 219 + } 220 + 221 + static inline pte_t pte_mkyoung(pte_t pte) 222 + { 223 + return __pte(pte_val(pte) | _PAGE_ACCESSED); 224 + } 225 + 226 + static inline pte_t pte_mkspecial(pte_t pte) 227 + { 228 + return __pte(pte_val(pte) | _PAGE_SPECIAL); 229 + } 230 + 231 + static inline pte_t pte_mkhuge(pte_t pte) 232 + { 233 + return pte; 234 + } 235 + 236 + static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) 237 + { 238 + /* FIXME!! 
check whether this need to be a conditional */ 239 + return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); 240 + } 241 + 242 + static inline bool pte_user(pte_t pte) 243 + { 244 + return !(pte_val(pte) & _PAGE_PRIVILEGED); 245 + } 246 + 247 + /* Encode and de-code a swap entry */ 248 + #define MAX_SWAPFILES_CHECK() do { \ 249 + BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ 250 + /* \ 251 + * Don't have overlapping bits with _PAGE_HPTEFLAGS \ 252 + * We filter HPTEFLAGS on set_pte. \ 253 + */ \ 254 + BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ 255 + BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ 256 + } while (0) 257 + /* 258 + * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; 259 + */ 260 + #define SWP_TYPE_BITS 5 261 + #define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ 262 + & ((1UL << SWP_TYPE_BITS) - 1)) 263 + #define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT) 264 + #define __swp_entry(type, offset) ((swp_entry_t) { \ 265 + ((type) << _PAGE_BIT_SWAP_TYPE) \ 266 + | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)}) 267 + /* 268 + * swp_entry_t must be independent of pte bits. We build a swp_entry_t from 269 + * swap type and offset we get from swap and convert that to pte to find a 270 + * matching pte in linux page table. 271 + * Clear bits not found in swap entries here. 
272 + */ 273 + #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) 274 + #define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) 275 + 276 + #ifdef CONFIG_MEM_SOFT_DIRTY 277 + #define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) 278 + #else 279 + #define _PAGE_SWP_SOFT_DIRTY 0UL 280 + #endif /* CONFIG_MEM_SOFT_DIRTY */ 281 + 282 + #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY 283 + static inline pte_t pte_swp_mksoft_dirty(pte_t pte) 284 + { 285 + return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); 286 + } 287 + static inline bool pte_swp_soft_dirty(pte_t pte) 288 + { 289 + return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY); 290 + } 291 + static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) 292 + { 293 + return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); 294 + } 295 + #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ 296 + 297 + static inline bool check_pte_access(unsigned long access, unsigned long ptev) 298 + { 299 + /* 300 + * This check for _PAGE_RWX and _PAGE_PRESENT bits 301 + */ 302 + if (access & ~ptev) 303 + return false; 304 + /* 305 + * This check for access to privilege space 306 + */ 307 + if ((access & _PAGE_PRIVILEGED) != (ptev & _PAGE_PRIVILEGED)) 308 + return false; 309 + 310 + return true; 311 + } 312 + /* 313 + * Generic functions with hash/radix callbacks 314 + */ 315 + 316 + static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry) 317 + { 318 + if (radix_enabled()) 319 + return radix__ptep_set_access_flags(ptep, entry); 320 + return hash__ptep_set_access_flags(ptep, entry); 321 + } 322 + 323 + #define __HAVE_ARCH_PTE_SAME 324 + static inline int pte_same(pte_t pte_a, pte_t pte_b) 325 + { 326 + if (radix_enabled()) 327 + return radix__pte_same(pte_a, pte_b); 328 + return hash__pte_same(pte_a, pte_b); 329 + } 330 + 331 + static inline int pte_none(pte_t pte) 332 + { 333 + if (radix_enabled()) 334 + return radix__pte_none(pte); 335 + return hash__pte_none(pte); 336 + } 337 + 338 + static inline void 
__set_pte_at(struct mm_struct *mm, unsigned long addr, 339 + pte_t *ptep, pte_t pte, int percpu) 340 + { 341 + if (radix_enabled()) 342 + return radix__set_pte_at(mm, addr, ptep, pte, percpu); 343 + return hash__set_pte_at(mm, addr, ptep, pte, percpu); 344 + } 345 + 346 + #define _PAGE_CACHE_CTL (_PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT) 347 + 348 + #define pgprot_noncached pgprot_noncached 349 + static inline pgprot_t pgprot_noncached(pgprot_t prot) 350 + { 351 + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | 352 + _PAGE_NON_IDEMPOTENT); 353 + } 354 + 355 + #define pgprot_noncached_wc pgprot_noncached_wc 356 + static inline pgprot_t pgprot_noncached_wc(pgprot_t prot) 357 + { 358 + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | 359 + _PAGE_TOLERANT); 360 + } 361 + 362 + #define pgprot_cached pgprot_cached 363 + static inline pgprot_t pgprot_cached(pgprot_t prot) 364 + { 365 + return __pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL)); 366 + } 367 + 368 + #define pgprot_writecombine pgprot_writecombine 369 + static inline pgprot_t pgprot_writecombine(pgprot_t prot) 370 + { 371 + return pgprot_noncached_wc(prot); 372 + } 373 + /* 374 + * check a pte mapping have cache inhibited property 375 + */ 376 + static inline bool pte_ci(pte_t pte) 377 + { 378 + unsigned long pte_v = pte_val(pte); 379 + 380 + if (((pte_v & _PAGE_CACHE_CTL) == _PAGE_TOLERANT) || 381 + ((pte_v & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)) 382 + return true; 383 + return false; 384 + } 385 + 297 386 static inline void pmd_set(pmd_t *pmdp, unsigned long val) 298 387 { 299 388 *pmdp = __pmd(val); ··· 627 74 628 75 #define pmd_none(pmd) (!pmd_val(pmd)) 629 76 #define pmd_present(pmd) (!pmd_none(pmd)) 77 + 78 + static inline int pmd_bad(pmd_t pmd) 79 + { 80 + if (radix_enabled()) 81 + return radix__pmd_bad(pmd); 82 + return hash__pmd_bad(pmd); 83 + } 630 84 631 85 static inline void pud_set(pud_t *pudp, unsigned long val) 632 86 { ··· 660 100 return __pud(pte_val(pte)); 661 101 } 662 102 
#define pud_write(pud) pte_write(pud_pte(pud)) 103 + 104 + static inline int pud_bad(pud_t pud) 105 + { 106 + if (radix_enabled()) 107 + return radix__pud_bad(pud); 108 + return hash__pud_bad(pud); 109 + } 110 + 111 + 663 112 #define pgd_write(pgd) pte_write(pgd_pte(pgd)) 664 113 static inline void pgd_set(pgd_t *pgdp, unsigned long val) 665 114 { ··· 693 124 return __pgd(pte_val(pte)); 694 125 } 695 126 127 + static inline int pgd_bad(pgd_t pgd) 128 + { 129 + if (radix_enabled()) 130 + return radix__pgd_bad(pgd); 131 + return hash__pgd_bad(pgd); 132 + } 133 + 696 134 extern struct page *pgd_page(pgd_t pgd); 135 + 136 + /* Pointers in the page table tree are physical addresses */ 137 + #define __pgtable_ptr_val(ptr) __pa(ptr) 138 + 139 + #define pmd_page_vaddr(pmd) __va(pmd_val(pmd) & ~PMD_MASKED_BITS) 140 + #define pud_page_vaddr(pud) __va(pud_val(pud) & ~PUD_MASKED_BITS) 141 + #define pgd_page_vaddr(pgd) __va(pgd_val(pgd) & ~PGD_MASKED_BITS) 142 + 143 + #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1)) 144 + #define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1)) 145 + #define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1)) 146 + #define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1)) 697 147 698 148 /* 699 149 * Find an entry in a page-table-directory. We combine the address region ··· 744 156 #define pgd_ERROR(e) \ 745 157 pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) 746 158 747 - /* Encode and de-code a swap entry */ 748 - #define MAX_SWAPFILES_CHECK() do { \ 749 - BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS); \ 750 - /* \ 751 - * Don't have overlapping bits with _PAGE_HPTEFLAGS \ 752 - * We filter HPTEFLAGS on set_pte. 
\ 753 - */ \ 754 - BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \ 755 - BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \ 756 - } while (0) 757 - /* 758 - * on pte we don't need handle RADIX_TREE_EXCEPTIONAL_SHIFT; 759 - */ 760 - #define SWP_TYPE_BITS 5 761 - #define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \ 762 - & ((1UL << SWP_TYPE_BITS) - 1)) 763 - #define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PTE_RPN_SHIFT) 764 - #define __swp_entry(type, offset) ((swp_entry_t) { \ 765 - ((type) << _PAGE_BIT_SWAP_TYPE) \ 766 - | (((offset) << PTE_RPN_SHIFT) & PTE_RPN_MASK)}) 767 - /* 768 - * swp_entry_t must be independent of pte bits. We build a swp_entry_t from 769 - * swap type and offset we get from swap and convert that to pte to find a 770 - * matching pte in linux page table. 771 - * Clear bits not found in swap entries here. 772 - */ 773 - #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) & ~_PAGE_PTE }) 774 - #define __swp_entry_to_pte(x) __pte((x).val | _PAGE_PTE) 775 - 776 - #ifdef CONFIG_MEM_SOFT_DIRTY 777 - #define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE)) 778 - #else 779 - #define _PAGE_SWP_SOFT_DIRTY 0UL 780 - #endif /* CONFIG_MEM_SOFT_DIRTY */ 781 - 782 - #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY 783 - static inline pte_t pte_swp_mksoft_dirty(pte_t pte) 784 - { 785 - return __pte(pte_val(pte) | _PAGE_SWP_SOFT_DIRTY); 786 - } 787 - static inline bool pte_swp_soft_dirty(pte_t pte) 788 - { 789 - return !!(pte_val(pte) & _PAGE_SWP_SOFT_DIRTY); 790 - } 791 - static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) 792 - { 793 - return __pte(pte_val(pte) & ~_PAGE_SWP_SOFT_DIRTY); 794 - } 795 - #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ 796 - 797 159 void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); 798 160 void pgtable_cache_init(void); 799 161 162 + static inline int map_kernel_page(unsigned long ea, unsigned long pa, 163 + unsigned long flags) 164 + { 165 + if (radix_enabled()) { 166 + 
#if defined(CONFIG_PPC_RADIX_MMU) && defined(DEBUG_VM) 167 + unsigned long page_size = 1 << mmu_psize_defs[mmu_io_psize].shift; 168 + WARN((page_size != PAGE_SIZE), "I/O page size != PAGE_SIZE"); 169 + #endif 170 + return radix__map_kernel_page(ea, pa, __pgprot(flags), PAGE_SIZE); 171 + } 172 + return hash__map_kernel_page(ea, pa, flags); 173 + } 174 + 175 + static inline int __meminit vmemmap_create_mapping(unsigned long start, 176 + unsigned long page_size, 177 + unsigned long phys) 178 + { 179 + if (radix_enabled()) 180 + return radix__vmemmap_create_mapping(start, page_size, phys); 181 + return hash__vmemmap_create_mapping(start, page_size, phys); 182 + } 183 + 184 + #ifdef CONFIG_MEMORY_HOTPLUG 185 + static inline void vmemmap_remove_mapping(unsigned long start, 186 + unsigned long page_size) 187 + { 188 + if (radix_enabled()) 189 + return radix__vmemmap_remove_mapping(start, page_size); 190 + return hash__vmemmap_remove_mapping(start, page_size); 191 + } 192 + #endif 800 193 struct page *realmode_pfn_to_page(unsigned long pfn); 801 - 802 - #ifdef CONFIG_TRANSPARENT_HUGEPAGE 803 - extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); 804 - extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); 805 - extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); 806 - extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, 807 - pmd_t *pmdp, pmd_t pmd); 808 - extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, 809 - pmd_t *pmd); 810 - #define has_transparent_hugepage has_transparent_hugepage 811 - extern int has_transparent_hugepage(void); 812 - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 813 - 814 194 815 195 static inline pte_t pmd_pte(pmd_t pmd) 816 196 { ··· 794 238 { 795 239 return (pte_t *)pmd; 796 240 } 797 - 798 241 #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) 799 242 #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) 800 243 #define pmd_young(pmd) pte_young(pmd_pte(pmd)) ··· 820 265 #define __HAVE_ARCH_PMD_WRITE 821 266 
#define pmd_write(pmd) pte_write(pmd_pte(pmd)) 822 267 268 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 269 + extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); 270 + extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot); 271 + extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); 272 + extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, 273 + pmd_t *pmdp, pmd_t pmd); 274 + extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, 275 + pmd_t *pmd); 276 + extern int hash__has_transparent_hugepage(void); 277 + static inline int has_transparent_hugepage(void) 278 + { 279 + if (radix_enabled()) 280 + return radix__has_transparent_hugepage(); 281 + return hash__has_transparent_hugepage(); 282 + } 283 + #define has_transparent_hugepage has_transparent_hugepage 284 + 285 + static inline unsigned long 286 + pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, 287 + unsigned long clr, unsigned long set) 288 + { 289 + if (radix_enabled()) 290 + return radix__pmd_hugepage_update(mm, addr, pmdp, clr, set); 291 + return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); 292 + } 293 + 294 + static inline int pmd_large(pmd_t pmd) 295 + { 296 + return !!(pmd_val(pmd) & _PAGE_PTE); 297 + } 298 + 299 + static inline pmd_t pmd_mknotpresent(pmd_t pmd) 300 + { 301 + return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); 302 + } 303 + /* 304 + * For radix we should always find H_PAGE_HASHPTE zero. 
Hence 305 + * the below will work for radix too 306 + */ 307 + static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, 308 + unsigned long addr, pmd_t *pmdp) 309 + { 310 + unsigned long old; 311 + 312 + if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) 313 + return 0; 314 + old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); 315 + return ((old & _PAGE_ACCESSED) != 0); 316 + } 317 + 318 + #define __HAVE_ARCH_PMDP_SET_WRPROTECT 319 + static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, 320 + pmd_t *pmdp) 321 + { 322 + 323 + if ((pmd_val(*pmdp) & _PAGE_WRITE) == 0) 324 + return; 325 + 326 + pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); 327 + } 328 + 329 + static inline int pmd_trans_huge(pmd_t pmd) 330 + { 331 + if (radix_enabled()) 332 + return radix__pmd_trans_huge(pmd); 333 + return hash__pmd_trans_huge(pmd); 334 + } 335 + 336 + #define __HAVE_ARCH_PMD_SAME 337 + static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) 338 + { 339 + if (radix_enabled()) 340 + return radix__pmd_same(pmd_a, pmd_b); 341 + return hash__pmd_same(pmd_a, pmd_b); 342 + } 343 + 823 344 static inline pmd_t pmd_mkhuge(pmd_t pmd) 824 345 { 825 - return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_THP_HUGE)); 346 + if (radix_enabled()) 347 + return radix__pmd_mkhuge(pmd); 348 + return hash__pmd_mkhuge(pmd); 826 349 } 827 350 828 351 #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS ··· 911 278 #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG 912 279 extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, 913 280 unsigned long address, pmd_t *pmdp); 914 - #define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH 915 - extern int pmdp_clear_flush_young(struct vm_area_struct *vma, 916 - unsigned long address, pmd_t *pmdp); 917 281 918 282 #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR 919 - extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, 920 - unsigned long addr, pmd_t *pmdp); 283 + static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, 
284 + unsigned long addr, pmd_t *pmdp) 285 + { 286 + if (radix_enabled()) 287 + return radix__pmdp_huge_get_and_clear(mm, addr, pmdp); 288 + return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); 289 + } 921 290 922 - extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, 923 - unsigned long address, pmd_t *pmdp); 291 + static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, 292 + unsigned long address, pmd_t *pmdp) 293 + { 294 + if (radix_enabled()) 295 + return radix__pmdp_collapse_flush(vma, address, pmdp); 296 + return hash__pmdp_collapse_flush(vma, address, pmdp); 297 + } 924 298 #define pmdp_collapse_flush pmdp_collapse_flush 925 299 926 300 #define __HAVE_ARCH_PGTABLE_DEPOSIT 927 - extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 928 - pgtable_t pgtable); 301 + static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, 302 + pmd_t *pmdp, pgtable_t pgtable) 303 + { 304 + if (radix_enabled()) 305 + return radix__pgtable_trans_huge_deposit(mm, pmdp, pgtable); 306 + return hash__pgtable_trans_huge_deposit(mm, pmdp, pgtable); 307 + } 308 + 929 309 #define __HAVE_ARCH_PGTABLE_WITHDRAW 930 - extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); 310 + static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, 311 + pmd_t *pmdp) 312 + { 313 + if (radix_enabled()) 314 + return radix__pgtable_trans_huge_withdraw(mm, pmdp); 315 + return hash__pgtable_trans_huge_withdraw(mm, pmdp); 316 + } 931 317 932 318 #define __HAVE_ARCH_PMDP_INVALIDATE 933 319 extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, 934 320 pmd_t *pmdp); 935 321 936 322 #define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE 937 - extern void pmdp_huge_split_prepare(struct vm_area_struct *vma, 938 - unsigned long address, pmd_t *pmdp); 323 + static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, 324 + unsigned long address, pmd_t *pmdp) 325 + { 326 + if (radix_enabled()) 327 + 
return radix__pmdp_huge_split_prepare(vma, address, pmdp); 328 + return hash__pmdp_huge_split_prepare(vma, address, pmdp); 329 + } 939 330 940 331 #define pmd_move_must_withdraw pmd_move_must_withdraw 941 332 struct spinlock; 942 333 static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, 943 334 struct spinlock *old_pmd_ptl) 944 335 { 336 + if (radix_enabled()) 337 + return false; 945 338 /* 946 339 * Archs like ppc64 use pgtable to store per pmd 947 340 * specific information. So when we switch the pmd, ··· 975 316 */ 976 317 return true; 977 318 } 319 + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 978 320 #endif /* __ASSEMBLY__ */ 979 321 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
+12
arch/powerpc/include/asm/book3s/64/radix-4k.h
··· 1 + #ifndef _ASM_POWERPC_PGTABLE_RADIX_4K_H 2 + #define _ASM_POWERPC_PGTABLE_RADIX_4K_H 3 + 4 + /* 5 + * For 4K page size, the supported index sizes are 13/9/9/9 6 + */ 7 + #define RADIX_PTE_INDEX_SIZE 9 /* 2MB huge page */ 8 + #define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ 9 + #define RADIX_PUD_INDEX_SIZE 9 10 + #define RADIX_PGD_INDEX_SIZE 13 11 + 12 + #endif /* _ASM_POWERPC_PGTABLE_RADIX_4K_H */
+12
arch/powerpc/include/asm/book3s/64/radix-64k.h
··· 1 + #ifndef _ASM_POWERPC_PGTABLE_RADIX_64K_H 2 + #define _ASM_POWERPC_PGTABLE_RADIX_64K_H 3 + 4 + /* 5 + * For 64K page size, the supported index sizes are 13/9/9/5 6 + */ 7 + #define RADIX_PTE_INDEX_SIZE 5 /* 2MB huge page */ 8 + #define RADIX_PMD_INDEX_SIZE 9 /* 1G huge page */ 9 + #define RADIX_PUD_INDEX_SIZE 9 10 + #define RADIX_PGD_INDEX_SIZE 13 11 + 12 + #endif /* _ASM_POWERPC_PGTABLE_RADIX_64K_H */
+232
arch/powerpc/include/asm/book3s/64/radix.h
··· 1 + #ifndef _ASM_POWERPC_PGTABLE_RADIX_H 2 + #define _ASM_POWERPC_PGTABLE_RADIX_H 3 + 4 + #ifndef __ASSEMBLY__ 5 + #include <asm/cmpxchg.h> 6 + #endif 7 + 8 + #ifdef CONFIG_PPC_64K_PAGES 9 + #include <asm/book3s/64/radix-64k.h> 10 + #else 11 + #include <asm/book3s/64/radix-4k.h> 12 + #endif 13 + 14 + /* An empty PTE can still have a R or C writeback */ 15 + #define RADIX_PTE_NONE_MASK (_PAGE_DIRTY | _PAGE_ACCESSED) 16 + 17 + /* Bits to set in a RPMD/RPUD/RPGD */ 18 + #define RADIX_PMD_VAL_BITS (0x8000000000000000UL | RADIX_PTE_INDEX_SIZE) 19 + #define RADIX_PUD_VAL_BITS (0x8000000000000000UL | RADIX_PMD_INDEX_SIZE) 20 + #define RADIX_PGD_VAL_BITS (0x8000000000000000UL | RADIX_PUD_INDEX_SIZE) 21 + 22 + /* Don't have anything in the reserved bits and leaf bits */ 23 + #define RADIX_PMD_BAD_BITS 0x60000000000000e0UL 24 + #define RADIX_PUD_BAD_BITS 0x60000000000000e0UL 25 + #define RADIX_PGD_BAD_BITS 0x60000000000000e0UL 26 + 27 + /* 28 + * Size of EA range mapped by our pagetables. 29 + */ 30 + #define RADIX_PGTABLE_EADDR_SIZE (RADIX_PTE_INDEX_SIZE + RADIX_PMD_INDEX_SIZE + \ 31 + RADIX_PUD_INDEX_SIZE + RADIX_PGD_INDEX_SIZE + PAGE_SHIFT) 32 + #define RADIX_PGTABLE_RANGE (ASM_CONST(1) << RADIX_PGTABLE_EADDR_SIZE) 33 + 34 + /* 35 + * We support 52 bit address space, Use top bit for kernel 36 + * virtual mapping. Also make sure kernel fit in the top 37 + * quadrant. 38 + * 39 + * +------------------+ 40 + * +------------------+ Kernel virtual map (0xc008000000000000) 41 + * | | 42 + * | | 43 + * | | 44 + * 0b11......+------------------+ Kernel linear map (0xc....) 
45 + * | | 46 + * | 2 quadrant | 47 + * | | 48 + * 0b10......+------------------+ 49 + * | | 50 + * | 1 quadrant | 51 + * | | 52 + * 0b01......+------------------+ 53 + * | | 54 + * | 0 quadrant | 55 + * | | 56 + * 0b00......+------------------+ 57 + * 58 + * 59 + * 3rd quadrant expanded: 60 + * +------------------------------+ 61 + * | | 62 + * | | 63 + * | | 64 + * +------------------------------+ Kernel IO map end (0xc010000000000000) 65 + * | | 66 + * | | 67 + * | 1/2 of virtual map | 68 + * | | 69 + * | | 70 + * +------------------------------+ Kernel IO map start 71 + * | | 72 + * | 1/4 of virtual map | 73 + * | | 74 + * +------------------------------+ Kernel vmemap start 75 + * | | 76 + * | 1/4 of virtual map | 77 + * | | 78 + * +------------------------------+ Kernel virt start (0xc008000000000000) 79 + * | | 80 + * | | 81 + * | | 82 + * +------------------------------+ Kernel linear (0xc.....) 83 + */ 84 + 85 + #define RADIX_KERN_VIRT_START ASM_CONST(0xc008000000000000) 86 + #define RADIX_KERN_VIRT_SIZE ASM_CONST(0x0008000000000000) 87 + 88 + /* 89 + * The vmalloc space starts at the beginning of that region, and 90 + * occupies a quarter of it on radix config. 91 + * (we keep a quarter for the virtual memmap) 92 + */ 93 + #define RADIX_VMALLOC_START RADIX_KERN_VIRT_START 94 + #define RADIX_VMALLOC_SIZE (RADIX_KERN_VIRT_SIZE >> 2) 95 + #define RADIX_VMALLOC_END (RADIX_VMALLOC_START + RADIX_VMALLOC_SIZE) 96 + /* 97 + * Defines the address of the vmemap area, in its own region on 98 + * hash table CPUs. 
99 + */ 100 + #define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) 101 + 102 + #ifndef __ASSEMBLY__ 103 + #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) 104 + #define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE) 105 + #define RADIX_PUD_TABLE_SIZE (sizeof(pud_t) << RADIX_PUD_INDEX_SIZE) 106 + #define RADIX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE) 107 + 108 + static inline unsigned long radix__pte_update(struct mm_struct *mm, 109 + unsigned long addr, 110 + pte_t *ptep, unsigned long clr, 111 + unsigned long set, 112 + int huge) 113 + { 114 + pte_t pte; 115 + unsigned long old_pte, new_pte; 116 + 117 + do { 118 + pte = READ_ONCE(*ptep); 119 + old_pte = pte_val(pte); 120 + new_pte = (old_pte | set) & ~clr; 121 + 122 + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); 123 + 124 + /* We already do a sync in cmpxchg, is ptesync needed ?*/ 125 + asm volatile("ptesync" : : : "memory"); 126 + /* huge pages use the old page table lock */ 127 + if (!huge) 128 + assert_pte_locked(mm, addr); 129 + 130 + return old_pte; 131 + } 132 + 133 + /* 134 + * Set the dirty and/or accessed bits atomically in a linux PTE, this 135 + * function doesn't need to invalidate tlb. 
136 + */ 137 + static inline void radix__ptep_set_access_flags(pte_t *ptep, pte_t entry) 138 + { 139 + pte_t pte; 140 + unsigned long old_pte, new_pte; 141 + unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED | 142 + _PAGE_RW | _PAGE_EXEC); 143 + do { 144 + pte = READ_ONCE(*ptep); 145 + old_pte = pte_val(pte); 146 + new_pte = old_pte | set; 147 + 148 + } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); 149 + 150 + /* We already do a sync in cmpxchg, is ptesync needed ?*/ 151 + asm volatile("ptesync" : : : "memory"); 152 + } 153 + 154 + static inline int radix__pte_same(pte_t pte_a, pte_t pte_b) 155 + { 156 + return ((pte_raw(pte_a) ^ pte_raw(pte_b)) == 0); 157 + } 158 + 159 + static inline int radix__pte_none(pte_t pte) 160 + { 161 + return (pte_val(pte) & ~RADIX_PTE_NONE_MASK) == 0; 162 + } 163 + 164 + static inline void radix__set_pte_at(struct mm_struct *mm, unsigned long addr, 165 + pte_t *ptep, pte_t pte, int percpu) 166 + { 167 + *ptep = pte; 168 + asm volatile("ptesync" : : : "memory"); 169 + } 170 + 171 + static inline int radix__pmd_bad(pmd_t pmd) 172 + { 173 + return !!(pmd_val(pmd) & RADIX_PMD_BAD_BITS); 174 + } 175 + 176 + static inline int radix__pmd_same(pmd_t pmd_a, pmd_t pmd_b) 177 + { 178 + return ((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) == 0); 179 + } 180 + 181 + static inline int radix__pud_bad(pud_t pud) 182 + { 183 + return !!(pud_val(pud) & RADIX_PUD_BAD_BITS); 184 + } 185 + 186 + 187 + static inline int radix__pgd_bad(pgd_t pgd) 188 + { 189 + return !!(pgd_val(pgd) & RADIX_PGD_BAD_BITS); 190 + } 191 + 192 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 193 + 194 + static inline int radix__pmd_trans_huge(pmd_t pmd) 195 + { 196 + return !!(pmd_val(pmd) & _PAGE_PTE); 197 + } 198 + 199 + static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) 200 + { 201 + return __pmd(pmd_val(pmd) | _PAGE_PTE); 202 + } 203 + static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma, 204 + unsigned long address, pmd_t *pmdp) 205 + { 206 + /* 
Nothing to do for radix. */ 207 + return; 208 + } 209 + 210 + extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, 211 + pmd_t *pmdp, unsigned long clr, 212 + unsigned long set); 213 + extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, 214 + unsigned long address, pmd_t *pmdp); 215 + extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, 216 + pgtable_t pgtable); 217 + extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); 218 + extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, 219 + unsigned long addr, pmd_t *pmdp); 220 + extern int radix__has_transparent_hugepage(void); 221 + #endif 222 + 223 + extern int __meminit radix__vmemmap_create_mapping(unsigned long start, 224 + unsigned long page_size, 225 + unsigned long phys); 226 + extern void radix__vmemmap_remove_mapping(unsigned long start, 227 + unsigned long page_size); 228 + 229 + extern int radix__map_kernel_page(unsigned long ea, unsigned long pa, 230 + pgprot_t flags, unsigned int psz); 231 + #endif /* __ASSEMBLY__ */ 232 + #endif
+24 -17
arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
··· 1 1 #ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H 2 2 #define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H 3 3 4 - #define MMU_NO_CONTEXT 0 5 - 6 4 /* 7 5 * TLB flushing for 64-bit hash-MMU CPUs 8 6 */ ··· 27 29 28 30 static inline void arch_enter_lazy_mmu_mode(void) 29 31 { 30 - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); 32 + struct ppc64_tlb_batch *batch; 31 33 34 + if (radix_enabled()) 35 + return; 36 + batch = this_cpu_ptr(&ppc64_tlb_batch); 32 37 batch->active = 1; 33 38 } 34 39 35 40 static inline void arch_leave_lazy_mmu_mode(void) 36 41 { 37 - struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch); 42 + struct ppc64_tlb_batch *batch; 43 + 44 + if (radix_enabled()) 45 + return; 46 + batch = this_cpu_ptr(&ppc64_tlb_batch); 38 47 39 48 if (batch->index) 40 49 __flush_tlb_pending(batch); ··· 57 52 extern void flush_hash_hugepage(unsigned long vsid, unsigned long addr, 58 53 pmd_t *pmdp, unsigned int psize, int ssize, 59 54 unsigned long flags); 60 - 61 - static inline void local_flush_tlb_mm(struct mm_struct *mm) 55 + static inline void hash__local_flush_tlb_mm(struct mm_struct *mm) 62 56 { 63 57 } 64 58 65 - static inline void flush_tlb_mm(struct mm_struct *mm) 59 + static inline void hash__flush_tlb_mm(struct mm_struct *mm) 66 60 { 67 61 } 68 62 69 - static inline void local_flush_tlb_page(struct vm_area_struct *vma, 70 - unsigned long vmaddr) 63 + static inline void hash__local_flush_tlb_page(struct vm_area_struct *vma, 64 + unsigned long vmaddr) 71 65 { 72 66 } 73 67 74 - static inline void flush_tlb_page(struct vm_area_struct *vma, 75 - unsigned long vmaddr) 68 + static inline void hash__flush_tlb_page(struct vm_area_struct *vma, 69 + unsigned long vmaddr) 76 70 { 77 71 } 78 72 79 - static inline void flush_tlb_page_nohash(struct vm_area_struct *vma, 80 - unsigned long vmaddr) 73 + static inline void hash__flush_tlb_page_nohash(struct vm_area_struct *vma, 74 + unsigned long vmaddr) 81 75 { 82 76 } 83 77 84 - static inline void 
flush_tlb_range(struct vm_area_struct *vma, 85 - unsigned long start, unsigned long end) 78 + static inline void hash__flush_tlb_range(struct vm_area_struct *vma, 79 + unsigned long start, unsigned long end) 86 80 { 87 81 } 88 82 89 - static inline void flush_tlb_kernel_range(unsigned long start, 90 - unsigned long end) 83 + static inline void hash__flush_tlb_kernel_range(unsigned long start, 84 + unsigned long end) 91 85 { 92 86 } 93 87 88 + 89 + struct mmu_gather; 90 + extern void hash__tlb_flush(struct mmu_gather *tlb); 94 91 /* Private function for use by PCI IO mapping code */ 95 92 extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, 96 93 unsigned long end);
+33
arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
··· 1 + #ifndef _ASM_POWERPC_TLBFLUSH_RADIX_H 2 + #define _ASM_POWERPC_TLBFLUSH_RADIX_H 3 + 4 + struct vm_area_struct; 5 + struct mm_struct; 6 + struct mmu_gather; 7 + 8 + static inline int mmu_get_ap(int psize) 9 + { 10 + return mmu_psize_defs[psize].ap; 11 + } 12 + 13 + extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, 14 + unsigned long end); 15 + extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); 16 + 17 + extern void radix__local_flush_tlb_mm(struct mm_struct *mm); 18 + extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); 19 + extern void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, 20 + unsigned long ap, int nid); 21 + extern void radix__tlb_flush(struct mmu_gather *tlb); 22 + #ifdef CONFIG_SMP 23 + extern void radix__flush_tlb_mm(struct mm_struct *mm); 24 + extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); 25 + extern void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, 26 + unsigned long ap, int nid); 27 + #else 28 + #define radix__flush_tlb_mm(mm) radix__local_flush_tlb_mm(mm) 29 + #define radix__flush_tlb_page(vma,addr) radix__local_flush_tlb_page(vma,addr) 30 + #define radix___flush_tlb_page(mm,addr,p,i) radix___local_flush_tlb_page(mm,addr,p,i) 31 + #endif 32 + 33 + #endif
+76
arch/powerpc/include/asm/book3s/64/tlbflush.h
··· 1 + #ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H 2 + #define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H 3 + 4 + #define MMU_NO_CONTEXT ~0UL 5 + 6 + 7 + #include <asm/book3s/64/tlbflush-hash.h> 8 + #include <asm/book3s/64/tlbflush-radix.h> 9 + 10 + static inline void flush_tlb_range(struct vm_area_struct *vma, 11 + unsigned long start, unsigned long end) 12 + { 13 + if (radix_enabled()) 14 + return radix__flush_tlb_range(vma, start, end); 15 + return hash__flush_tlb_range(vma, start, end); 16 + } 17 + 18 + static inline void flush_tlb_kernel_range(unsigned long start, 19 + unsigned long end) 20 + { 21 + if (radix_enabled()) 22 + return radix__flush_tlb_kernel_range(start, end); 23 + return hash__flush_tlb_kernel_range(start, end); 24 + } 25 + 26 + static inline void local_flush_tlb_mm(struct mm_struct *mm) 27 + { 28 + if (radix_enabled()) 29 + return radix__local_flush_tlb_mm(mm); 30 + return hash__local_flush_tlb_mm(mm); 31 + } 32 + 33 + static inline void local_flush_tlb_page(struct vm_area_struct *vma, 34 + unsigned long vmaddr) 35 + { 36 + if (radix_enabled()) 37 + return radix__local_flush_tlb_page(vma, vmaddr); 38 + return hash__local_flush_tlb_page(vma, vmaddr); 39 + } 40 + 41 + static inline void flush_tlb_page_nohash(struct vm_area_struct *vma, 42 + unsigned long vmaddr) 43 + { 44 + if (radix_enabled()) 45 + return radix__flush_tlb_page(vma, vmaddr); 46 + return hash__flush_tlb_page_nohash(vma, vmaddr); 47 + } 48 + 49 + static inline void tlb_flush(struct mmu_gather *tlb) 50 + { 51 + if (radix_enabled()) 52 + return radix__tlb_flush(tlb); 53 + return hash__tlb_flush(tlb); 54 + } 55 + 56 + #ifdef CONFIG_SMP 57 + static inline void flush_tlb_mm(struct mm_struct *mm) 58 + { 59 + if (radix_enabled()) 60 + return radix__flush_tlb_mm(mm); 61 + return hash__flush_tlb_mm(mm); 62 + } 63 + 64 + static inline void flush_tlb_page(struct vm_area_struct *vma, 65 + unsigned long vmaddr) 66 + { 67 + if (radix_enabled()) 68 + return radix__flush_tlb_page(vma, vmaddr); 69 + return 
hash__flush_tlb_page(vma, vmaddr); 70 + } 71 + #else 72 + #define flush_tlb_mm(mm) local_flush_tlb_mm(mm) 73 + #define flush_tlb_page(vma, addr) local_flush_tlb_page(vma, addr) 74 + #endif /* CONFIG_SMP */ 75 + 76 + #endif /* _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
+19
arch/powerpc/include/asm/book3s/pgalloc.h
··· 1 + #ifndef _ASM_POWERPC_BOOK3S_PGALLOC_H 2 + #define _ASM_POWERPC_BOOK3S_PGALLOC_H 3 + 4 + #include <linux/mm.h> 5 + 6 + extern void tlb_remove_table(struct mmu_gather *tlb, void *table); 7 + static inline void tlb_flush_pgtable(struct mmu_gather *tlb, 8 + unsigned long address) 9 + { 10 + 11 + } 12 + 13 + #ifdef CONFIG_PPC64 14 + #include <asm/book3s/64/pgalloc.h> 15 + #else 16 + #include <asm/book3s/32/pgalloc.h> 17 + #endif 18 + 19 + #endif /* _ASM_POWERPC_BOOK3S_PGALLOC_H */
+14
arch/powerpc/include/asm/hugetlb.h
··· 8 8 extern struct kmem_cache *hugepte_cache; 9 9 10 10 #ifdef CONFIG_PPC_BOOK3S_64 11 + 12 + #include <asm/book3s/64/hugetlb-radix.h> 11 13 /* 12 14 * This should work for other subarchs too. But right now we use the 13 15 * new format only for 64bit book3s ··· 33 31 { 34 32 return mmu_psize_to_shift(hugepd_mmu_psize(hpd)); 35 33 } 34 + static inline void flush_hugetlb_page(struct vm_area_struct *vma, 35 + unsigned long vmaddr) 36 + { 37 + if (radix_enabled()) 38 + return radix__flush_hugetlb_page(vma, vmaddr); 39 + } 36 40 41 + static inline void __local_flush_hugetlb_page(struct vm_area_struct *vma, 42 + unsigned long vmaddr) 43 + { 44 + if (radix_enabled()) 45 + return radix__local_flush_hugetlb_page(vma, vmaddr); 46 + } 37 47 #else 38 48 39 49 static inline pte_t *hugepd_page(hugepd_t hpd)
+13 -23
arch/powerpc/include/asm/kvm_book3s_64.h
··· 276 276 return ptel; 277 277 } 278 278 279 - static inline int hpte_cache_flags_ok(unsigned long ptel, unsigned long io_type) 279 + static inline bool hpte_cache_flags_ok(unsigned long hptel, bool is_ci) 280 280 { 281 - unsigned int wimg = ptel & HPTE_R_WIMG; 281 + unsigned int wimg = hptel & HPTE_R_WIMG; 282 282 283 283 /* Handle SAO */ 284 284 if (wimg == (HPTE_R_W | HPTE_R_I | HPTE_R_M) && 285 285 cpu_has_feature(CPU_FTR_ARCH_206)) 286 286 wimg = HPTE_R_M; 287 287 288 - if (!io_type) 288 + if (!is_ci) 289 289 return wimg == HPTE_R_M; 290 - 291 - return (wimg & (HPTE_R_W | HPTE_R_I)) == io_type; 290 + /* 291 + * if the host is mapped cache inhibited, make sure the hptel also has 292 + * cache inhibited set. 293 + */ 294 + if (wimg & HPTE_R_W) /* FIXME!! is this ok for all guest. ? */ 295 + return false; 296 + return !!(wimg & HPTE_R_I); 292 297 } 293 298 294 299 /* ··· 310 305 */ 311 306 old_pte = READ_ONCE(*ptep); 312 307 /* 313 - * wait until _PAGE_BUSY is clear then set it atomically 308 + * wait until H_PAGE_BUSY is clear then set it atomically 314 309 */ 315 - if (unlikely(pte_val(old_pte) & _PAGE_BUSY)) { 310 + if (unlikely(pte_val(old_pte) & H_PAGE_BUSY)) { 316 311 cpu_relax(); 317 312 continue; 318 313 } ··· 324 319 if (writing && pte_write(old_pte)) 325 320 new_pte = pte_mkdirty(new_pte); 326 321 327 - if (pte_val(old_pte) == __cmpxchg_u64((unsigned long *)ptep, 328 - pte_val(old_pte), 329 - pte_val(new_pte))) { 322 + if (pte_xchg(ptep, old_pte, new_pte)) 330 323 break; 331 - } 332 324 } 333 325 return new_pte; 334 - } 335 - 336 - 337 - /* Return HPTE cache control bits corresponding to Linux pte bits */ 338 - static inline unsigned long hpte_cache_bits(unsigned long pte_val) 339 - { 340 - #if _PAGE_NO_CACHE == HPTE_R_I && _PAGE_WRITETHRU == HPTE_R_W 341 - return pte_val & (HPTE_R_W | HPTE_R_I); 342 - #else 343 - return ((pte_val & _PAGE_NO_CACHE) ? HPTE_R_I : 0) + 344 - ((pte_val & _PAGE_WRITETHRU) ? HPTE_R_W : 0); 345 - #endif 346 326 } 347 327 348 328 static inline bool hpte_read_permission(unsigned long pp, unsigned long key)
+1
arch/powerpc/include/asm/machdep.h
··· 256 256 #ifdef CONFIG_ARCH_RANDOM 257 257 int (*get_random_seed)(unsigned long *v); 258 258 #endif 259 + int (*update_partition_table)(u64); 259 260 }; 260 261 261 262 extern void e500_idle(void);
+39 -12
arch/powerpc/include/asm/mmu.h
··· 88 88 */ 89 89 #define MMU_FTR_1T_SEGMENT ASM_CONST(0x40000000) 90 90 91 + /* 92 + * Radix page table available 93 + */ 94 + #define MMU_FTR_RADIX ASM_CONST(0x80000000) 95 + 91 96 /* MMU feature bit sets for various CPUs */ 92 97 #define MMU_FTRS_DEFAULT_HPTE_ARCH_V2 \ 93 98 MMU_FTR_HPTE_TABLE | MMU_FTR_PPCAS_ARCH_V2 ··· 115 110 DECLARE_PER_CPU(int, next_tlbcam_idx); 116 111 #endif 117 112 113 + enum { 114 + MMU_FTRS_POSSIBLE = MMU_FTR_HPTE_TABLE | MMU_FTR_TYPE_8xx | 115 + MMU_FTR_TYPE_40x | MMU_FTR_TYPE_44x | MMU_FTR_TYPE_FSL_E | 116 + MMU_FTR_TYPE_47x | MMU_FTR_USE_HIGH_BATS | MMU_FTR_BIG_PHYS | 117 + MMU_FTR_USE_TLBIVAX_BCAST | MMU_FTR_USE_TLBILX | 118 + MMU_FTR_LOCK_BCAST_INVAL | MMU_FTR_NEED_DTLB_SW_LRU | 119 + MMU_FTR_USE_TLBRSRV | MMU_FTR_USE_PAIRED_MAS | 120 + MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL | 121 + MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | 122 + MMU_FTR_1T_SEGMENT | 123 + #ifdef CONFIG_PPC_RADIX_MMU 124 + MMU_FTR_RADIX | 125 + #endif 126 + 0, 127 + }; 128 + 118 129 static inline int mmu_has_feature(unsigned long feature) 119 130 { 120 - return (cur_cpu_spec->mmu_features & feature); 131 + return (MMU_FTRS_POSSIBLE & cur_cpu_spec->mmu_features & feature); 121 132 } 122 133 123 134 static inline void mmu_clear_feature(unsigned long feature) ··· 142 121 } 143 122 144 123 extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup; 145 - 146 - /* MMU initialization */ 147 - extern void early_init_mmu(void); 148 - extern void early_init_mmu_secondary(void); 149 - 150 - extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, 151 - phys_addr_t first_memblock_size); 152 124 153 125 #ifdef CONFIG_PPC64 154 126 /* This is our real memory area size on ppc64 server, on embedded, we ··· 195 181 196 182 #define MMU_PAGE_COUNT 15 197 183 198 - #if defined(CONFIG_PPC_STD_MMU_64) 199 - /* 64-bit classic hash table MMU */ 200 - #include <asm/book3s/64/mmu-hash.h> 201 - #elif defined(CONFIG_PPC_STD_MMU_32) 184 + 
#ifdef CONFIG_PPC_BOOK3S_64 185 + #include <asm/book3s/64/mmu.h> 186 + #else /* CONFIG_PPC_BOOK3S_64 */ 187 + 188 + #ifndef __ASSEMBLY__ 189 + /* MMU initialization */ 190 + extern void early_init_mmu(void); 191 + extern void early_init_mmu_secondary(void); 192 + extern void setup_initial_memory_limit(phys_addr_t first_memblock_base, 193 + phys_addr_t first_memblock_size); 194 + #endif /* __ASSEMBLY__ */ 195 + #endif 196 + 197 + #if defined(CONFIG_PPC_STD_MMU_32) 202 198 /* 32-bit classic hash table MMU */ 203 199 #include <asm/book3s/32/mmu-hash.h> 204 200 #elif defined(CONFIG_40x) ··· 225 201 # include <asm/mmu-8xx.h> 226 202 #endif 227 203 204 + #ifndef radix_enabled 205 + #define radix_enabled() (0) 206 + #endif 228 207 229 208 #endif /* __KERNEL__ */ 230 209 #endif /* _ASM_POWERPC_MMU_H_ */
+17 -12
arch/powerpc/include/asm/mmu_context.h
··· 33 33 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); 34 34 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); 35 35 #endif 36 - 37 - extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next); 38 36 extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm); 39 37 extern void set_context(unsigned long id, pgd_t *pgd); 40 38 41 39 #ifdef CONFIG_PPC_BOOK3S_64 40 + extern void radix__switch_mmu_context(struct mm_struct *prev, 41 + struct mm_struct *next); 42 + static inline void switch_mmu_context(struct mm_struct *prev, 43 + struct mm_struct *next, 44 + struct task_struct *tsk) 45 + { 46 + if (radix_enabled()) 47 + return radix__switch_mmu_context(prev, next); 48 + return switch_slb(tsk, next); 49 + } 50 + 42 51 extern int __init_new_context(void); 43 52 extern void __destroy_context(int context_id); 44 53 static inline void mmu_context_init(void) { } 45 54 #else 55 + extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, 56 + struct task_struct *tsk); 46 57 extern unsigned long __init_new_context(void); 47 58 extern void __destroy_context(unsigned long context_id); 48 59 extern void mmu_context_init(void); ··· 99 88 if (cpu_has_feature(CPU_FTR_ALTIVEC)) 100 89 asm volatile ("dssall"); 101 90 #endif /* CONFIG_ALTIVEC */ 102 - 103 - /* The actual HW switching method differs between the various 104 - * sub architectures. 91 + /* 92 + * The actual HW switching method differs between the various 93 + * sub architectures. Out of line for now 105 94 */ 106 - #ifdef CONFIG_PPC_STD_MMU_64 107 - switch_slb(tsk, next); 108 - #else 109 - /* Out of line for now */ 110 - switch_mmu_context(prev, next); 111 - #endif 112 - 95 + switch_mmu_context(prev, next, tsk); 113 96 } 114 97 115 98 #define deactivate_mm(tsk,mm) do { } while (0)
+7 -3
arch/powerpc/include/asm/nohash/64/pgtable.h
··· 108 108 #ifndef __ASSEMBLY__ 109 109 /* pte_clear moved to later in this file */ 110 110 111 - /* Pointers in the page table tree are virtual addresses */ 112 - #define __pgtable_ptr_val(ptr) ((unsigned long)(ptr)) 113 - 114 111 #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) 115 112 #define PUD_BAD_BITS (PMD_TABLE_SIZE-1) 116 113 ··· 359 362 360 363 void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); 361 364 void pgtable_cache_init(void); 365 + extern int map_kernel_page(unsigned long ea, unsigned long pa, 366 + unsigned long flags); 367 + extern int __meminit vmemmap_create_mapping(unsigned long start, 368 + unsigned long page_size, 369 + unsigned long phys); 370 + extern void vmemmap_remove_mapping(unsigned long start, 371 + unsigned long page_size); 362 372 #endif /* __ASSEMBLY__ */ 363 373 364 374 #endif /* _ASM_POWERPC_NOHASH_64_PGTABLE_H */
+23
arch/powerpc/include/asm/nohash/pgalloc.h
··· 1 + #ifndef _ASM_POWERPC_NOHASH_PGALLOC_H 2 + #define _ASM_POWERPC_NOHASH_PGALLOC_H 3 + 4 + #include <linux/mm.h> 5 + 6 + extern void tlb_remove_table(struct mmu_gather *tlb, void *table); 7 + #ifdef CONFIG_PPC64 8 + extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address); 9 + #else 10 + /* 44x etc which is BOOKE not BOOK3E */ 11 + static inline void tlb_flush_pgtable(struct mmu_gather *tlb, 12 + unsigned long address) 13 + { 14 + 15 + } 16 + #endif /* CONFIG_PPC64 */ 17 + 18 + #ifdef CONFIG_PPC64 19 + #include <asm/nohash/64/pgalloc.h> 20 + #else 21 + #include <asm/nohash/32/pgalloc.h> 22 + #endif 23 + #endif /* _ASM_POWERPC_NOHASH_PGALLOC_H */
+8 -8
arch/powerpc/include/asm/opal-api.h
··· 368 368 }; 369 369 370 370 enum opal_msg_type { 371 - OPAL_MSG_ASYNC_COMP = 0, /* params[0] = token, params[1] = rc, 371 + OPAL_MSG_ASYNC_COMP = 0, /* params[0] = token, params[1] = rc, 372 372 * additional params function-specific 373 373 */ 374 - OPAL_MSG_MEM_ERR, 375 - OPAL_MSG_EPOW, 376 - OPAL_MSG_SHUTDOWN, /* params[0] = 1 reboot, 0 shutdown */ 377 - OPAL_MSG_HMI_EVT, 378 - OPAL_MSG_DPO, 379 - OPAL_MSG_PRD, 380 - OPAL_MSG_OCC, 374 + OPAL_MSG_MEM_ERR = 1, 375 + OPAL_MSG_EPOW = 2, 376 + OPAL_MSG_SHUTDOWN = 3, /* params[0] = 1 reboot, 0 shutdown */ 377 + OPAL_MSG_HMI_EVT = 4, 378 + OPAL_MSG_DPO = 5, 379 + OPAL_MSG_PRD = 6, 380 + OPAL_MSG_OCC = 7, 381 381 OPAL_MSG_TYPE_MAX, 382 382 }; 383 383
+13 -1
arch/powerpc/include/asm/page.h
··· 288 288 289 289 #ifndef __ASSEMBLY__ 290 290 291 + #ifdef CONFIG_PPC_BOOK3S_64 292 + #include <asm/pgtable-be-types.h> 293 + #else 291 294 #include <asm/pgtable-types.h> 295 + #endif 292 296 293 297 typedef struct { signed long pd; } hugepd_t; 294 298 ··· 316 312 #endif 317 313 318 314 struct vm_area_struct; 319 - 315 + #ifdef CONFIG_PPC_BOOK3S_64 316 + /* 317 + * For BOOK3s 64 with 4k and 64K linux page size 318 + * we want to use pointers, because the page table 319 + * actually stores the pfn 320 + */ 321 + typedef pte_t *pgtable_t; 322 + #else 320 323 #if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC64) 321 324 typedef pte_t *pgtable_t; 322 325 #else 323 326 typedef struct page *pgtable_t; 327 + #endif 324 328 #endif 325 329 326 330 #include <asm-generic/memory_model.h>
+1 -4
arch/powerpc/include/asm/page_64.h
··· 93 93 94 94 #define SLICE_LOW_TOP (0x100000000ul) 95 95 #define SLICE_NUM_LOW (SLICE_LOW_TOP >> SLICE_LOW_SHIFT) 96 - #define SLICE_NUM_HIGH (PGTABLE_RANGE >> SLICE_HIGH_SHIFT) 96 + #define SLICE_NUM_HIGH (H_PGTABLE_RANGE >> SLICE_HIGH_SHIFT) 97 97 98 98 #define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT) 99 99 #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) ··· 128 128 extern void slice_set_range_psize(struct mm_struct *mm, unsigned long start, 129 129 unsigned long len, unsigned int psize); 130 130 131 - #define slice_mm_new_context(mm) ((mm)->context.id == MMU_NO_CONTEXT) 132 - 133 131 #endif /* __ASSEMBLY__ */ 134 132 #else 135 133 #define slice_init() ··· 149 151 150 152 #define slice_set_range_psize(mm, start, len, psize) \ 151 153 slice_set_user_psize((mm), (psize)) 152 - #define slice_mm_new_context(mm) 1 153 154 #endif /* CONFIG_PPC_MM_SLICES */ 154 155 155 156 #ifdef CONFIG_HUGETLB_PAGE
+22 -19
arch/powerpc/include/asm/pci-bridge.h
··· 17 17 * PCI controller operations 18 18 */ 19 19 struct pci_controller_ops { 20 - void (*dma_dev_setup)(struct pci_dev *dev); 20 + void (*dma_dev_setup)(struct pci_dev *pdev); 21 21 void (*dma_bus_setup)(struct pci_bus *bus); 22 22 23 - int (*probe_mode)(struct pci_bus *); 23 + int (*probe_mode)(struct pci_bus *bus); 24 24 25 25 /* Called when pci_enable_device() is called. Returns true to 26 26 * allow assignment/enabling of the device. */ 27 - bool (*enable_device_hook)(struct pci_dev *); 27 + bool (*enable_device_hook)(struct pci_dev *pdev); 28 28 29 - void (*disable_device)(struct pci_dev *); 29 + void (*disable_device)(struct pci_dev *pdev); 30 30 31 - void (*release_device)(struct pci_dev *); 31 + void (*release_device)(struct pci_dev *pdev); 32 32 33 33 /* Called during PCI resource reassignment */ 34 - resource_size_t (*window_alignment)(struct pci_bus *, unsigned long type); 34 + resource_size_t (*window_alignment)(struct pci_bus *bus, 35 + unsigned long type); 35 - void (*reset_secondary_bus)(struct pci_dev *dev); 36 + void (*reset_secondary_bus)(struct pci_dev *pdev); 36 37 37 38 #ifdef CONFIG_PCI_MSI 38 - int (*setup_msi_irqs)(struct pci_dev *dev, 39 + int (*setup_msi_irqs)(struct pci_dev *pdev, 39 40 int nvec, int type); 40 - void (*teardown_msi_irqs)(struct pci_dev *dev); 41 + void (*teardown_msi_irqs)(struct pci_dev *pdev); 41 42 #endif 42 43 43 - int (*dma_set_mask)(struct pci_dev *dev, u64 dma_mask); 44 - u64 (*dma_get_required_mask)(struct pci_dev *dev); 44 + int (*dma_set_mask)(struct pci_dev *pdev, u64 dma_mask); 45 + u64 (*dma_get_required_mask)(struct pci_dev *pdev); 45 46 46 - void (*shutdown)(struct pci_controller *); 47 + void (*shutdown)(struct pci_controller *hose); 47 48 }; 48 49 49 50 /* ··· 209 208 #ifdef CONFIG_EEH 210 209 struct eeh_dev *edev; /* eeh device */ 211 210 #endif 212 - #define IODA_INVALID_PE (-1) 211 + #define IODA_INVALID_PE 0xFFFFFFFF 213 212 #ifdef CONFIG_PPC_POWERNV 214 - int pe_number; 213 + unsigned int pe_number; 215 214 int vf_index; /* VF index in the PF */ 216 215 #ifdef CONFIG_PCI_IOV 217 216 u16 vfs_expanded; /* number of VFs IOV BAR expanded */ 218 217 u16 num_vfs; /* number of VFs enabled */ 219 - int *pe_num_map; /* PE# for the first VF PE or array */ 218 + unsigned int *pe_num_map; /* PE# for the first VF PE or array */ 220 219 bool m64_single_mode; /* Use M64 BAR in Single Mode */ 221 220 #define IODA_INVALID_M64 (-1) 222 221 int (*m64_map)[PCI_SRIOV_NUM_BARS]; ··· 235 234 extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev); 236 235 extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev); 237 236 extern void remove_dev_pci_data(struct pci_dev *pdev); 238 - extern void *update_dn_pci_info(struct device_node *dn, void *data); 237 + extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, 238 + struct device_node *dn); 239 + extern void pci_remove_device_node_info(struct device_node *dn); 239 240 240 241 static inline int pci_device_from_OF_node(struct device_node *np, 241 242 u8 *bus, u8 *devfn) ··· 259 256 #endif 260 257 261 258 /** Find the bus corresponding to the indicated device node */ 262 - extern struct pci_bus *pcibios_find_pci_bus(struct device_node *dn); 259 + extern struct pci_bus *pci_find_bus_by_node(struct device_node *dn); 263 260 264 261 /** Remove all of the PCI devices under this bus */ 265 - extern void pcibios_remove_pci_devices(struct pci_bus *bus); 262 + extern void pci_hp_remove_devices(struct pci_bus *bus); 266 263 267 264 /** Discover new pci devices under this bus, and add them */ 268 - extern void pcibios_add_pci_devices(struct pci_bus *bus); 265 + extern void pci_hp_add_devices(struct pci_bus *bus); 269 266 270 267 271 268 extern void isa_bridge_find_early(struct pci_controller *hose);
arch/powerpc/include/asm/pgalloc-32.h arch/powerpc/include/asm/nohash/32/pgalloc.h
+19 -73
arch/powerpc/include/asm/pgalloc-64.h arch/powerpc/include/asm/nohash/64/pgalloc.h
··· 53 53 54 54 #ifndef CONFIG_PPC_64K_PAGES 55 55 56 - #define pgd_populate(MM, PGD, PUD) pgd_set(PGD, __pgtable_ptr_val(PUD)) 56 + #define pgd_populate(MM, PGD, PUD) pgd_set(PGD, (unsigned long)PUD) 57 57 58 58 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) 59 59 { ··· 68 68 69 69 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) 70 70 { 71 - pud_set(pud, __pgtable_ptr_val(pmd)); 71 + pud_set(pud, (unsigned long)pmd); 72 72 } 73 73 74 74 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, 75 75 pte_t *pte) 76 76 { 77 - pmd_set(pmd, __pgtable_ptr_val(pte)); 77 + pmd_set(pmd, (unsigned long)pte); 78 78 } 79 79 80 80 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, 81 81 pgtable_t pte_page) 82 82 { 83 - pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page))); 83 + pmd_set(pmd, (unsigned long)page_address(pte_page)); 84 84 } 85 85 86 86 #define pmd_pgtable(pmd) pmd_page(pmd) ··· 119 119 __free_page(ptepage); 120 120 } 121 121 122 - static inline void pgtable_free(void *table, unsigned index_size) 123 - { 124 - if (!index_size) 125 - free_page((unsigned long)table); 126 - else { 127 - BUG_ON(index_size > MAX_PGTABLE_INDEX_SIZE); 128 - kmem_cache_free(PGT_CACHE(index_size), table); 129 - } 130 - } 131 - 122 + extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); 132 123 #ifdef CONFIG_SMP 133 - static inline void pgtable_free_tlb(struct mmu_gather *tlb, 134 - void *table, int shift) 135 - { 136 - unsigned long pgf = (unsigned long)table; 137 - BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE); 138 - pgf |= shift; 139 - tlb_remove_table(tlb, (void *)pgf); 140 - } 141 - 142 - static inline void __tlb_remove_table(void *_table) 143 - { 144 - void *table = (void *)((unsigned long)_table & ~MAX_PGTABLE_INDEX_SIZE); 145 - unsigned shift = (unsigned long)_table & MAX_PGTABLE_INDEX_SIZE; 146 - 147 - pgtable_free(table, shift); 148 - } 149 - #else /* !CONFIG_SMP */ 150 - 
static inline void pgtable_free_tlb(struct mmu_gather *tlb, 151 - void *table, int shift) 152 - { 153 - pgtable_free(table, shift); 154 - } 155 - #endif /* CONFIG_SMP */ 156 - 124 + extern void __tlb_remove_table(void *_table); 125 + #endif 157 126 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, 158 127 unsigned long address) 159 128 { 160 129 tlb_flush_pgtable(tlb, address); 161 - pgtable_page_dtor(table); 162 130 pgtable_free_tlb(tlb, page_address(table), 0); 163 131 } 164 132 165 133 #else /* if CONFIG_PPC_64K_PAGES */ 166 134 167 - extern pte_t *page_table_alloc(struct mm_struct *, unsigned long, int); 168 - extern void page_table_free(struct mm_struct *, unsigned long *, int); 135 + extern pte_t *pte_fragment_alloc(struct mm_struct *, unsigned long, int); 136 + extern void pte_fragment_free(unsigned long *, int); 169 137 extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift); 170 138 #ifdef CONFIG_SMP 171 139 extern void __tlb_remove_table(void *_table); 172 140 #endif 173 141 174 - #ifndef __PAGETABLE_PUD_FOLDED 175 - /* book3s 64 is 4 level page table */ 176 - static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) 177 - { 178 - pgd_set(pgd, __pgtable_ptr_val(pud)); 179 - } 180 - 181 - static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) 182 - { 183 - return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE), 184 - GFP_KERNEL|__GFP_REPEAT); 185 - } 186 - 187 - static inline void pud_free(struct mm_struct *mm, pud_t *pud) 188 - { 189 - kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud); 190 - } 191 - #endif 192 - 193 - static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) 194 - { 195 - pud_set(pud, __pgtable_ptr_val(pmd)); 196 - } 142 + #define pud_populate(mm, pud, pmd) pud_set(pud, (unsigned long)pmd) 197 143 198 144 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, 199 145 pte_t *pte) 200 146 { 201 - pmd_set(pmd, 
__pgtable_ptr_val(pte)); 147 + pmd_set(pmd, (unsigned long)pte); 202 148 } 203 149 204 150 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, 205 151 pgtable_t pte_page) 206 152 { 207 - pmd_set(pmd, __pgtable_ptr_val(pte_page)); 153 + pmd_set(pmd, (unsigned long)pte_page); 208 154 } 209 155 210 156 static inline pgtable_t pmd_pgtable(pmd_t pmd) 211 157 { 212 - return (pgtable_t)pmd_page_vaddr(pmd); 158 + return (pgtable_t)(pmd_val(pmd) & ~PMD_MASKED_BITS); 213 159 } 214 160 215 161 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, 216 162 unsigned long address) 217 163 { 218 - return (pte_t *)page_table_alloc(mm, address, 1); 164 + return (pte_t *)pte_fragment_alloc(mm, address, 1); 219 165 } 220 166 221 167 static inline pgtable_t pte_alloc_one(struct mm_struct *mm, 222 168 unsigned long address) 223 169 { 224 - return (pgtable_t)page_table_alloc(mm, address, 0); 170 + return (pgtable_t)pte_fragment_alloc(mm, address, 0); 225 171 } 226 172 227 173 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) 228 174 { 229 - page_table_free(mm, (unsigned long *)pte, 1); 175 + pte_fragment_free((unsigned long *)pte, 1); 230 176 } 231 177 232 178 static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) 233 179 { 234 - page_table_free(mm, (unsigned long *)ptepage, 0); 180 + pte_fragment_free((unsigned long *)ptepage, 0); 235 181 } 236 182 237 183 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, ··· 201 255 202 256 #define __pmd_free_tlb(tlb, pmd, addr) \ 203 257 pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX) 204 - #ifndef __PAGETABLE_PUD_FOLDED 258 + #ifndef CONFIG_PPC_64K_PAGES 205 259 #define __pud_free_tlb(tlb, pud, addr) \ 206 260 pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE) 207 261 208 262 #endif /* CONFIG_PPC_64K_PAGES */ 209 263 210 264 #define check_pgt_cache() do { } while (0) 211 265
+3 -16
arch/powerpc/include/asm/pgalloc.h
··· 1 1 #ifndef _ASM_POWERPC_PGALLOC_H 2 2 #define _ASM_POWERPC_PGALLOC_H 3 - #ifdef __KERNEL__ 4 3 5 4 #include <linux/mm.h> 6 5 7 - #ifdef CONFIG_PPC_BOOK3E 8 - extern void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address); 9 - #else /* CONFIG_PPC_BOOK3E */ 10 - static inline void tlb_flush_pgtable(struct mmu_gather *tlb, 11 - unsigned long address) 12 - { 13 - } 14 - #endif /* !CONFIG_PPC_BOOK3E */ 15 - 16 - extern void tlb_remove_table(struct mmu_gather *tlb, void *table); 17 - 18 - #ifdef CONFIG_PPC64 19 - #include <asm/pgalloc-64.h> 6 + #ifdef CONFIG_PPC_BOOK3S 7 + #include <asm/book3s/pgalloc.h> 20 8 #else 21 - #include <asm/pgalloc-32.h> 9 + #include <asm/nohash/pgalloc.h> 22 10 #endif 23 11 24 - #endif /* __KERNEL__ */ 25 12 #endif /* _ASM_POWERPC_PGALLOC_H */
+92
arch/powerpc/include/asm/pgtable-be-types.h
··· 1 + #ifndef _ASM_POWERPC_PGTABLE_BE_TYPES_H 2 + #define _ASM_POWERPC_PGTABLE_BE_TYPES_H 3 + 4 + #include <asm/cmpxchg.h> 5 + 6 + /* PTE level */ 7 + typedef struct { __be64 pte; } pte_t; 8 + #define __pte(x) ((pte_t) { cpu_to_be64(x) }) 9 + static inline unsigned long pte_val(pte_t x) 10 + { 11 + return be64_to_cpu(x.pte); 12 + } 13 + 14 + static inline __be64 pte_raw(pte_t x) 15 + { 16 + return x.pte; 17 + } 18 + 19 + /* PMD level */ 20 + #ifdef CONFIG_PPC64 21 + typedef struct { __be64 pmd; } pmd_t; 22 + #define __pmd(x) ((pmd_t) { cpu_to_be64(x) }) 23 + static inline unsigned long pmd_val(pmd_t x) 24 + { 25 + return be64_to_cpu(x.pmd); 26 + } 27 + 28 + static inline __be64 pmd_raw(pmd_t x) 29 + { 30 + return x.pmd; 31 + } 32 + 33 + /* 34 + * 64 bit hash always use 4 level table. Everybody else use 4 level 35 + * only for 4K page size. 36 + */ 37 + #if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) 38 + typedef struct { __be64 pud; } pud_t; 39 + #define __pud(x) ((pud_t) { cpu_to_be64(x) }) 40 + static inline unsigned long pud_val(pud_t x) 41 + { 42 + return be64_to_cpu(x.pud); 43 + } 44 + #endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ 45 + #endif /* CONFIG_PPC64 */ 46 + 47 + /* PGD level */ 48 + typedef struct { __be64 pgd; } pgd_t; 49 + #define __pgd(x) ((pgd_t) { cpu_to_be64(x) }) 50 + static inline unsigned long pgd_val(pgd_t x) 51 + { 52 + return be64_to_cpu(x.pgd); 53 + } 54 + 55 + /* Page protection bits */ 56 + typedef struct { unsigned long pgprot; } pgprot_t; 57 + #define pgprot_val(x) ((x).pgprot) 58 + #define __pgprot(x) ((pgprot_t) { (x) }) 59 + 60 + /* 61 + * With hash config 64k pages additionally define a bigger "real PTE" type that 62 + * gathers the "second half" part of the PTE for pseudo 64k pages 63 + */ 64 + #if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64) 65 + typedef struct { pte_t pte; unsigned long hidx; } real_pte_t; 66 + #else 67 + typedef struct { pte_t pte; } real_pte_t; 68 + #endif 
69 + 70 + static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) 71 + { 72 + unsigned long *p = (unsigned long *)ptep; 73 + __be64 prev; 74 + 75 + prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pte_raw(old), 76 + (__force unsigned long)pte_raw(new)); 77 + 78 + return pte_raw(old) == prev; 79 + } 80 + 81 + static inline bool pmd_xchg(pmd_t *pmdp, pmd_t old, pmd_t new) 82 + { 83 + unsigned long *p = (unsigned long *)pmdp; 84 + __be64 prev; 85 + 86 + prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pmd_raw(old), 87 + (__force unsigned long)pmd_raw(new)); 88 + 89 + return pmd_raw(old) == prev; 90 + } 91 + 92 + #endif /* _ASM_POWERPC_PGTABLE_BE_TYPES_H */
+12 -46
arch/powerpc/include/asm/pgtable-types.h
··· 1 1 #ifndef _ASM_POWERPC_PGTABLE_TYPES_H 2 2 #define _ASM_POWERPC_PGTABLE_TYPES_H 3 3 4 - #ifdef CONFIG_STRICT_MM_TYPECHECKS 5 - /* These are used to make use of C type-checking. */ 6 - 7 4 /* PTE level */ 8 5 typedef struct { pte_basic_t pte; } pte_t; 9 6 #define __pte(x) ((pte_t) { (x) }) ··· 45 48 #define pgprot_val(x) ((x).pgprot) 46 49 #define __pgprot(x) ((pgprot_t) { (x) }) 47 50 48 - #else 49 - 50 - /* 51 - * .. while these make it easier on the compiler 52 - */ 53 - 54 - typedef pte_basic_t pte_t; 55 - #define __pte(x) (x) 56 - static inline pte_basic_t pte_val(pte_t pte) 57 - { 58 - return pte; 59 - } 60 - 61 - #ifdef CONFIG_PPC64 62 - typedef unsigned long pmd_t; 63 - #define __pmd(x) (x) 64 - static inline unsigned long pmd_val(pmd_t pmd) 65 - { 66 - return pmd; 67 - } 68 - 69 - #if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES) 70 - typedef unsigned long pud_t; 71 - #define __pud(x) (x) 72 - static inline unsigned long pud_val(pud_t pud) 73 - { 74 - return pud; 75 - } 76 - #endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */ 77 - #endif /* CONFIG_PPC64 */ 78 - 79 - typedef unsigned long pgd_t; 80 - #define __pgd(x) (x) 81 - static inline unsigned long pgd_val(pgd_t pgd) 82 - { 83 - return pgd; 84 - } 85 - 86 - typedef unsigned long pgprot_t; 87 - #define pgprot_val(x) (x) 88 - #define __pgprot(x) (x) 89 - 90 - #endif /* CONFIG_STRICT_MM_TYPECHECKS */ 91 51 /* 92 52 * With hash config 64k pages additionally define a bigger "real PTE" type that 93 53 * gathers the "second half" part of the PTE for pseudo 64k pages ··· 54 100 #else 55 101 typedef struct { pte_t pte; } real_pte_t; 56 102 #endif 103 + 104 + #ifdef CONFIG_PPC_STD_MMU_64 105 + #include <asm/cmpxchg.h> 106 + 107 + static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) 108 + { 109 + unsigned long *p = (unsigned long *)ptep; 110 + 111 + return pte_val(old) == __cmpxchg_u64(p, pte_val(old), pte_val(new)); 112 + } 113 + #endif 114 + 57 115 #endif /* _ASM_POWERPC_PGTABLE_TYPES_H */
+2
arch/powerpc/include/asm/ppc-opcode.h
··· 131 131 /* sorted alphabetically */ 132 132 #define PPC_INST_BHRBE 0x7c00025c 133 133 #define PPC_INST_CLRBHRB 0x7c00035c 134 + #define PPC_INST_CP_ABORT 0x7c00068c 134 135 #define PPC_INST_DCBA 0x7c0005ec 135 136 #define PPC_INST_DCBA_MASK 0xfc0007fe 136 137 #define PPC_INST_DCBAL 0x7c2005ec ··· 286 285 #endif 287 286 288 287 /* Deal with instructions that older assemblers aren't aware of */ 288 + #define PPC_CP_ABORT stringify_in_c(.long PPC_INST_CP_ABORT) 289 289 #define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ 290 290 __PPC_RA(a) | __PPC_RB(b)) 291 291 #define PPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \
+3 -3
arch/powerpc/include/asm/ppc-pci.h
··· 33 33 struct device_node; 34 34 struct pci_dn; 35 35 36 - typedef void *(*traverse_func)(struct device_node *me, void *data); 37 - void *traverse_pci_devices(struct device_node *start, traverse_func pre, 38 - void *data); 36 + void *pci_traverse_device_nodes(struct device_node *start, 37 + void *(*fn)(struct device_node *, void *), 38 + void *data); 39 39 void *traverse_pci_dn(struct pci_dn *root, 40 40 void *(*fn)(struct pci_dn *, void *), 41 41 void *data);
+3
arch/powerpc/include/asm/ppc_asm.h
··· 427 427 li r4,1024; \ 428 428 mtctr r4; \ 429 429 lis r4,KERNELBASE@h; \ 430 + .machine push; \ 431 + .machine "power4"; \ 430 432 0: tlbie r4; \ 433 + .machine pop; \ 431 434 addi r4,r4,0x1000; \ 432 435 bdnz 0b 433 436 #endif
+19 -7
arch/powerpc/include/asm/pte-common.h
··· 76 76 */ 77 77 #ifndef __ASSEMBLY__ 78 78 extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); 79 + 80 + /* 81 + * Don't just check for any non zero bits in __PAGE_USER, since for book3e 82 + * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in 83 + * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. 84 + */ 85 + static inline bool pte_user(pte_t pte) 86 + { 87 + return (pte_val(pte) & _PAGE_USER) == _PAGE_USER; 88 + } 79 89 #endif /* __ASSEMBLY__ */ 80 90 81 91 /* Location of the PFN in the PTE. Most 32-bit platforms use the same ··· 194 184 /* Make modules code happy. We don't set RO yet */ 195 185 #define PAGE_KERNEL_EXEC PAGE_KERNEL_X 196 186 197 - /* 198 - * Don't just check for any non zero bits in __PAGE_USER, since for book3e 199 - * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in 200 - * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. 201 - */ 202 - #define pte_user(val) ((val & _PAGE_USER) == _PAGE_USER) 203 - 204 187 /* Advertise special mapping type for AGP */ 205 188 #define PAGE_AGP (PAGE_KERNEL_NC) 206 189 #define HAVE_PAGE_AGP ··· 201 198 /* Advertise support for _PAGE_SPECIAL */ 202 199 #define __HAVE_ARCH_PTE_SPECIAL 203 200 201 + #ifndef _PAGE_READ 202 + /* if not defined, we should not find _PAGE_WRITE too */ 203 + #define _PAGE_READ 0 204 + #define _PAGE_WRITE _PAGE_RW 205 + #endif 206 + 207 + #ifndef H_PAGE_4K_PFN 208 + #define H_PAGE_4K_PFN 0 209 + #endif
+3
arch/powerpc/include/asm/reg.h
··· 347 347 #define LPCR_LPES_SH 2 348 348 #define LPCR_RMI 0x00000002 /* real mode is cache inhibit */ 349 349 #define LPCR_HDICE 0x00000001 /* Hyp Decr enable (HV,PR,EE) */ 350 + #define LPCR_UPRT 0x00400000 /* Use Process Table (ISA 3) */ 350 351 #ifndef SPRN_LPID 351 352 #define SPRN_LPID 0x13F /* Logical Partition Identifier */ 352 353 #endif ··· 588 587 #define SPRN_PIR 0x3FF /* Processor Identification Register */ 589 588 #endif 590 589 #define SPRN_TIR 0x1BE /* Thread Identification Register */ 590 + #define SPRN_PTCR 0x1D0 /* Partition table control Register */ 591 591 #define SPRN_PSPB 0x09F /* Problem State Priority Boost reg */ 592 592 #define SPRN_PTEHI 0x3D5 /* 981 7450 PTE HI word (S/W TLB load) */ 593 593 #define SPRN_PTELO 0x3D6 /* 982 7450 PTE LO word (S/W TLB load) */ ··· 1184 1182 #define PVR_970GX 0x0045 1185 1183 #define PVR_POWER7p 0x004A 1186 1184 #define PVR_POWER8E 0x004B 1185 + #define PVR_POWER8NVL 0x004C 1187 1186 #define PVR_POWER8 0x004D 1188 1187 #define PVR_BE 0x0070 1189 1188 #define PVR_PA6T 0x0090
+2 -1
arch/powerpc/include/asm/tlbflush.h
··· 58 58 59 59 #elif defined(CONFIG_PPC_STD_MMU_32) 60 60 61 + #define MMU_NO_CONTEXT (0) 61 62 /* 62 63 * TLB flushing for "classic" hash-MMU 32-bit CPUs, 6xx, 7xx, 7xxx 63 64 */ ··· 79 78 } 80 79 81 80 #elif defined(CONFIG_PPC_STD_MMU_64) 82 - #include <asm/book3s/64/tlbflush-hash.h> 81 + #include <asm/book3s/64/tlbflush.h> 83 82 #else 84 83 #error Unsupported MMU type 85 84 #endif
+50
arch/powerpc/include/uapi/asm/perf_regs.h
··· 1 + #ifndef _UAPI_ASM_POWERPC_PERF_REGS_H 2 + #define _UAPI_ASM_POWERPC_PERF_REGS_H 3 + 4 + enum perf_event_powerpc_regs { 5 + PERF_REG_POWERPC_R0, 6 + PERF_REG_POWERPC_R1, 7 + PERF_REG_POWERPC_R2, 8 + PERF_REG_POWERPC_R3, 9 + PERF_REG_POWERPC_R4, 10 + PERF_REG_POWERPC_R5, 11 + PERF_REG_POWERPC_R6, 12 + PERF_REG_POWERPC_R7, 13 + PERF_REG_POWERPC_R8, 14 + PERF_REG_POWERPC_R9, 15 + PERF_REG_POWERPC_R10, 16 + PERF_REG_POWERPC_R11, 17 + PERF_REG_POWERPC_R12, 18 + PERF_REG_POWERPC_R13, 19 + PERF_REG_POWERPC_R14, 20 + PERF_REG_POWERPC_R15, 21 + PERF_REG_POWERPC_R16, 22 + PERF_REG_POWERPC_R17, 23 + PERF_REG_POWERPC_R18, 24 + PERF_REG_POWERPC_R19, 25 + PERF_REG_POWERPC_R20, 26 + PERF_REG_POWERPC_R21, 27 + PERF_REG_POWERPC_R22, 28 + PERF_REG_POWERPC_R23, 29 + PERF_REG_POWERPC_R24, 30 + PERF_REG_POWERPC_R25, 31 + PERF_REG_POWERPC_R26, 32 + PERF_REG_POWERPC_R27, 33 + PERF_REG_POWERPC_R28, 34 + PERF_REG_POWERPC_R29, 35 + PERF_REG_POWERPC_R30, 36 + PERF_REG_POWERPC_R31, 37 + PERF_REG_POWERPC_NIP, 38 + PERF_REG_POWERPC_MSR, 39 + PERF_REG_POWERPC_ORIG_R3, 40 + PERF_REG_POWERPC_CTR, 41 + PERF_REG_POWERPC_LINK, 42 + PERF_REG_POWERPC_XER, 43 + PERF_REG_POWERPC_CCR, 44 + PERF_REG_POWERPC_SOFTE, 45 + PERF_REG_POWERPC_TRAP, 46 + PERF_REG_POWERPC_DAR, 47 + PERF_REG_POWERPC_DSISR, 48 + PERF_REG_POWERPC_MAX, 49 + }; 50 + #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
+4
arch/powerpc/kernel/asm-offsets.c
··· 438 438 DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); 439 439 #endif 440 440 441 + #ifdef MAX_PGD_TABLE_SIZE 442 + DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE); 443 + #else 441 444 DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE); 445 + #endif 442 446 DEFINE(PTE_SIZE, sizeof(pte_t)); 443 447 444 448 #ifdef CONFIG_KVM
+1 -1
arch/powerpc/kernel/btext.c
··· 162 162 offset = ((unsigned long) dispDeviceBase) - base; 163 163 size = dispDeviceRowBytes * dispDeviceRect[3] + offset 164 164 + dispDeviceRect[0]; 165 - vbase = __ioremap(base, size, _PAGE_NO_CACHE); 165 + vbase = __ioremap(base, size, pgprot_val(pgprot_noncached_wc(__pgprot(0)))); 166 166 if (vbase == 0) 167 167 return; 168 168 logicalDisplayBase = vbase + offset;
-2
arch/powerpc/kernel/cputable.c
··· 63 63 extern void __setup_cpu_ppc970(unsigned long offset, struct cpu_spec* spec); 64 64 extern void __setup_cpu_ppc970MP(unsigned long offset, struct cpu_spec* spec); 65 65 extern void __setup_cpu_pa6t(unsigned long offset, struct cpu_spec* spec); 66 - extern void __setup_cpu_a2(unsigned long offset, struct cpu_spec* spec); 67 66 extern void __restore_cpu_pa6t(void); 68 67 extern void __restore_cpu_ppc970(void); 69 68 extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec); ··· 71 72 extern void __restore_cpu_power8(void); 72 73 extern void __setup_cpu_power9(unsigned long offset, struct cpu_spec* spec); 73 74 extern void __restore_cpu_power9(void); 74 - extern void __restore_cpu_a2(void); 75 75 extern void __flush_tlb_power7(unsigned int action); 76 76 extern void __flush_tlb_power8(unsigned int action); 77 77 extern void __flush_tlb_power9(unsigned int action);
+3 -6
arch/powerpc/kernel/eeh.c
··· 48 48 49 49 50 50 /** Overview: 51 - * EEH, or "Extended Error Handling" is a PCI bridge technology for 51 + * EEH, or "Enhanced Error Handling" is a PCI bridge technology for 52 52 * dealing with PCI bus errors that can't be dealt with within the 53 53 * usual PCI framework, except by check-stopping the CPU. Systems 54 54 * that are designed for high-availability/reliability cannot afford ··· 1068 1068 struct pci_controller *phb; 1069 1069 struct eeh_dev *edev = pdn_to_eeh_dev(pdn); 1070 1070 1071 - if (!edev || !eeh_enabled()) 1071 + if (!edev) 1072 1072 return; 1073 1073 1074 1074 if (!eeh_has_flag(EEH_PROBE_MODE_DEVTREE)) ··· 1336 1336 id->subdevice != pdev->subsystem_device) 1337 1337 continue; 1338 1338 1339 - goto reset; 1339 + return eeh_pe_reset_and_recover(pe); 1340 1340 } 1341 1341 } 1342 1342 1343 1343 return eeh_unfreeze_pe(pe, true); 1344 - 1345 - reset: 1346 - return eeh_pe_reset_and_recover(pe); 1347 1344 } 1348 1345 1349 1346 /**
+30 -16
arch/powerpc/kernel/eeh_driver.c
··· 171 171 if (!edev) 172 172 return NULL; 173 173 174 + /* 175 + * We cannot access the config space on some adapters. 176 + * Otherwise, it will cause fenced PHB. We don't save 177 + * the content in their config space and will restore 178 + * from the initial config space saved when the EEH 179 + * device is created. 180 + */ 181 + if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) 182 + return NULL; 183 + 174 184 pdev = eeh_dev_to_pci_dev(edev); 175 185 if (!pdev) 176 186 return NULL; ··· 321 311 322 312 if (!edev) 323 313 return NULL; 314 + 315 + /* 316 + * The content in the config space isn't saved because 317 + * the blocked config space on some adapters. We have 318 + * to restore the initial saved config space when the 319 + * EEH device is created. 320 + */ 321 + if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { 322 + if (list_is_last(&edev->list, &edev->pe->edevs)) 323 + eeh_pe_restore_bars(edev->pe); 324 + 325 + return NULL; 326 + } 324 327 325 328 pdev = eeh_dev_to_pci_dev(edev); 326 329 if (!pdev) ··· 575 552 576 553 int eeh_pe_reset_and_recover(struct eeh_pe *pe) 577 554 { 578 - int result, ret; 555 + int ret; 579 556 580 557 /* Bail if the PE is being recovered */ 581 558 if (pe->state & EEH_PE_RECOVERING) ··· 586 563 587 564 /* Save states */ 588 565 eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); 589 - 590 - /* Report error */ 591 - eeh_pe_dev_traverse(pe, eeh_report_error, &result); 592 566 593 567 /* Issue reset */ 594 568 ret = eeh_reset_pe(pe); ··· 601 581 return ret; 602 582 } 603 583 604 - /* Notify completion of reset */ 605 - eeh_pe_dev_traverse(pe, eeh_report_reset, &result); 606 - 607 584 /* Restore device state */ 608 585 eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); 609 - 610 - /* Resume */ 611 - eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); 612 586 613 587 /* Clear recovery mode */ 614 588 eeh_pe_state_clear(pe, EEH_PE_RECOVERING); ··· 635 621 * We don't remove the corresponding PE instances because 
636 622 * we need the information afterwards. The attached EEH 637 623 * devices are expected to be attached soon when calling 638 - * into pcibios_add_pci_devices(). 624 + * into pci_hp_add_devices(). 639 625 */ 640 626 eeh_pe_state_mark(pe, EEH_PE_KEEP); 641 627 if (bus) { ··· 644 630 } else { 645 631 eeh_pe_state_clear(pe, EEH_PE_PRI_BUS); 646 632 pci_lock_rescan_remove(); 647 - pcibios_remove_pci_devices(bus); 633 + pci_hp_remove_devices(bus); 648 634 pci_unlock_rescan_remove(); 649 635 } 650 636 } else if (frozen_bus) { ··· 695 681 if (pe->type & EEH_PE_VF) 696 682 eeh_add_virt_device(edev, NULL); 697 683 else 698 - pcibios_add_pci_devices(bus); 684 + pci_hp_add_devices(bus); 699 685 } else if (frozen_bus && rmv_data->removed) { 700 686 pr_info("EEH: Sleep 5s ahead of partial hotplug\n"); 701 687 ssleep(5); ··· 705 691 if (pe->type & EEH_PE_VF) 706 692 eeh_add_virt_device(edev, NULL); 707 693 else 708 - pcibios_add_pci_devices(frozen_bus); 694 + pci_hp_add_devices(frozen_bus); 709 695 } 710 696 eeh_pe_state_clear(pe, EEH_PE_KEEP); 711 697 ··· 910 896 eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED); 911 897 912 898 pci_lock_rescan_remove(); 913 - pcibios_remove_pci_devices(frozen_bus); 899 + pci_hp_remove_devices(frozen_bus); 914 900 pci_unlock_rescan_remove(); 915 901 } 916 902 } ··· 995 981 bus = eeh_pe_bus_get(phb_pe); 996 982 eeh_pe_dev_traverse(pe, 997 983 eeh_report_failure, NULL); 998 - pcibios_remove_pci_devices(bus); 984 + pci_hp_remove_devices(bus); 999 985 } 1000 986 pci_unlock_rescan_remove(); 1001 987 }
+1 -1
arch/powerpc/kernel/eeh_event.c
··· 36 36 37 37 static DEFINE_SPINLOCK(eeh_eventlist_lock); 38 38 static struct semaphore eeh_eventlist_sem; 39 - LIST_HEAD(eeh_eventlist); 39 + static LIST_HEAD(eeh_eventlist); 40 40 41 41 /** 42 42 * eeh_event_handler - Dispatch EEH events.
+1 -1
arch/powerpc/kernel/eeh_pe.c
··· 249 249 } else { 250 250 if (edev->pe_config_addr && 251 251 (edev->pe_config_addr == pe->addr)) 252 - return pe; 252 + return pe; 253 253 } 254 254 255 255 /* Try BDF address */
+14 -2
arch/powerpc/kernel/entry_64.S
··· 37 37 #include <asm/hw_irq.h> 38 38 #include <asm/context_tracking.h> 39 39 #include <asm/tm.h> 40 + #include <asm/ppc-opcode.h> 40 41 41 42 /* 42 43 * System calls. ··· 510 509 ldarx r6,0,r1 511 510 END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) 512 511 512 + BEGIN_FTR_SECTION 513 + /* 514 + * A cp_abort (copy paste abort) here ensures that when context switching, a 515 + * copy from one process can't leak into the paste of another. 516 + */ 517 + PPC_CP_ABORT 518 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) 519 + 513 520 #ifdef CONFIG_PPC_BOOK3S 514 521 /* Cancel all explict user streams as they will have no use after context 515 522 * switch and will stop the HW from creating streams itself ··· 529 520 std r6,PACACURRENT(r13) /* Set new 'current' */ 530 521 531 522 ld r8,KSP(r4) /* new stack pointer */ 532 - #ifdef CONFIG_PPC_BOOK3S 523 + #ifdef CONFIG_PPC_STD_MMU_64 524 + BEGIN_MMU_FTR_SECTION 525 + b 2f 526 + END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX) 533 527 BEGIN_FTR_SECTION 534 528 clrrdi r6,r8,28 /* get its ESID */ 535 529 clrrdi r9,r1,28 /* get current sp ESID */ ··· 578 566 slbmte r7,r0 579 567 isync 580 568 2: 581 - #endif /* !CONFIG_PPC_BOOK3S */ 569 + #endif /* CONFIG_PPC_STD_MMU_64 */ 582 570 583 571 CURRENT_THREAD_INFO(r7, r8) /* base of new stack */ 584 572 /* Note: this uses SWITCH_FRAME_SIZE rather than INT_FRAME_SIZE
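The cp_abort comment in the context-switch hunk above describes a state-leak hazard: Power9's copy instruction latches source data into per-core staging state that a later paste drains, so without an abort at switch time one process could paste what another copied. A toy C model of that hazard (the struct and helper names are invented for illustration, not kernel or ISA API):

```c
#include <assert.h>

/* Invented stand-in for the per-core copy/paste staging state. */
struct core { int staged; int valid; };

static void copy_from(struct core *c, int data)
{
	c->staged = data;          /* a process stages data for a paste */
	c->valid = 1;
}

static int paste_to(struct core *c, int *out)
{
	if (!c->valid)
		return 0;              /* nothing staged: paste fails */
	*out = c->staged;
	c->valid = 0;
	return 1;
}

static void context_switch(struct core *c)
{
	c->valid = 0;              /* the cp_abort: drop staged state */
}

static int demo(void)
{
	struct core c = { 0, 0 };
	int out = 0;

	copy_from(&c, 42);         /* outgoing process started a copy */
	context_switch(&c);        /* kernel switches processes */
	if (paste_to(&c, &out))    /* incoming process must see nothing */
		return 1;
	return 0;
}
```

Dropping the `context_switch()` call in `demo()` makes the paste succeed with the other process's data, which is exactly the leak the FTR section guards against on CPU_FTR_ARCH_300 parts.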
+40 -123
arch/powerpc/kernel/exceptions-64s.S
··· 189 189 #endif /* CONFIG_PPC_P7_NAP */ 190 190 EXCEPTION_PROLOG_0(PACA_EXMC) 191 191 BEGIN_FTR_SECTION 192 - b machine_check_pSeries_early 192 + b machine_check_powernv_early 193 193 FTR_SECTION_ELSE 194 194 b machine_check_pSeries_0 195 195 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) ··· 209 209 EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380) 210 210 std r3,PACA_EXSLB+EX_R3(r13) 211 211 mfspr r3,SPRN_DAR 212 - #ifdef __DISABLED__ 213 - /* Keep that around for when we re-implement dynamic VSIDs */ 214 - cmpdi r3,0 215 - bge slb_miss_user_pseries 216 - #endif /* __DISABLED__ */ 217 212 mfspr r12,SPRN_SRR1 218 213 #ifndef CONFIG_RELOCATABLE 219 214 b slb_miss_realmode ··· 235 240 EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480) 236 241 std r3,PACA_EXSLB+EX_R3(r13) 237 242 mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ 238 - #ifdef __DISABLED__ 239 - /* Keep that around for when we re-implement dynamic VSIDs */ 240 - cmpdi r3,0 241 - bge slb_miss_user_pseries 242 - #endif /* __DISABLED__ */ 243 243 mfspr r12,SPRN_SRR1 244 244 #ifndef CONFIG_RELOCATABLE 245 245 b slb_miss_realmode ··· 433 443 434 444 .align 7 435 445 /* moved from 0x200 */ 436 - machine_check_pSeries_early: 446 + machine_check_powernv_early: 437 447 BEGIN_FTR_SECTION 438 448 EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) 439 449 /* ··· 699 709 700 710 #endif /* CONFIG_PPC_PSERIES */ 701 711 702 - #ifdef __DISABLED__ 703 - /* 704 - * This is used for when the SLB miss handler has to go virtual, 705 - * which doesn't happen for now anymore but will once we re-implement 706 - * dynamic VSIDs for shared page tables 707 - */ 708 - slb_miss_user_pseries: 709 - std r10,PACA_EXGEN+EX_R10(r13) 710 - std r11,PACA_EXGEN+EX_R11(r13) 711 - std r12,PACA_EXGEN+EX_R12(r13) 712 - GET_SCRATCH0(r10) 713 - ld r11,PACA_EXSLB+EX_R9(r13) 714 - ld r12,PACA_EXSLB+EX_R3(r13) 715 - std r10,PACA_EXGEN+EX_R13(r13) 716 - std r11,PACA_EXGEN+EX_R9(r13) 717 - std r12,PACA_EXGEN+EX_R3(r13) 718 - clrrdi r12,r13,32 719 - mfmsr r10 720 - 
mfspr r11,SRR0 /* save SRR0 */ 721 - ori r12,r12,slb_miss_user_common@l /* virt addr of handler */ 722 - ori r10,r10,MSR_IR|MSR_DR|MSR_RI 723 - mtspr SRR0,r12 724 - mfspr r12,SRR1 /* and SRR1 */ 725 - mtspr SRR1,r10 726 - rfid 727 - b . /* prevent spec. execution */ 728 - #endif /* __DISABLED__ */ 729 - 730 712 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER 731 713 kvmppc_skip_interrupt: 732 714 /* ··· 726 764 #endif 727 765 728 766 /* 729 - * Code from here down to __end_handlers is invoked from the 730 - * exception prologs above. Because the prologs assemble the 731 - * addresses of these handlers using the LOAD_HANDLER macro, 732 - * which uses an ori instruction, these handlers must be in 733 - * the first 64k of the kernel image. 767 + * Ensure that any handlers that get invoked from the exception prologs 768 + * above are below the first 64KB (0x10000) of the kernel image because 769 + * the prologs assemble the addresses of these handlers using the 770 + * LOAD_HANDLER macro, which uses an ori instruction. 734 771 */ 735 772 736 773 /*** Common interrupt handlers ***/ ··· 914 953 #endif 915 954 STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist) 916 955 917 - /* Other future vectors */ 918 - .align 7 919 - .globl __end_interrupts 920 - __end_interrupts: 921 - 922 956 .align 7 923 957 system_call_entry: 924 958 b system_call_common ··· 939 983 ld r3,PACA_EXGEN+EX_DAR(r13) 940 984 lwz r4,PACA_EXGEN+EX_DSISR(r13) 941 985 li r5,0x300 986 + std r3,_DAR(r1) 987 + std r4,_DSISR(r1) 988 + BEGIN_MMU_FTR_SECTION 942 989 b do_hash_page /* Try to handle as hpte fault */ 990 + MMU_FTR_SECTION_ELSE 991 + b handle_page_fault 992 + ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) 943 993 944 994 .align 7 945 995 .globl h_data_storage_common ··· 970 1008 ld r3,_NIP(r1) 971 1009 andis. 
r4,r12,0x5820 972 1010 li r5,0x400 1011 + std r3,_DAR(r1) 1012 + std r4,_DSISR(r1) 1013 + BEGIN_MMU_FTR_SECTION 973 1014 b do_hash_page /* Try to handle as hpte fault */ 1015 + MMU_FTR_SECTION_ELSE 1016 + b handle_page_fault 1017 + ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_RADIX) 974 1018 975 1019 STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception) 976 - 977 - /* 978 - * Here is the common SLB miss user that is used when going to virtual 979 - * mode for SLB misses, that is currently not used 980 - */ 981 - #ifdef __DISABLED__ 982 - .align 7 983 - .globl slb_miss_user_common 984 - slb_miss_user_common: 985 - mflr r10 986 - std r3,PACA_EXGEN+EX_DAR(r13) 987 - stw r9,PACA_EXGEN+EX_CCR(r13) 988 - std r10,PACA_EXGEN+EX_LR(r13) 989 - std r11,PACA_EXGEN+EX_SRR0(r13) 990 - bl slb_allocate_user 991 - 992 - ld r10,PACA_EXGEN+EX_LR(r13) 993 - ld r3,PACA_EXGEN+EX_R3(r13) 994 - lwz r9,PACA_EXGEN+EX_CCR(r13) 995 - ld r11,PACA_EXGEN+EX_SRR0(r13) 996 - mtlr r10 997 - beq- slb_miss_fault 998 - 999 - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ 1000 - beq- unrecov_user_slb 1001 - mfmsr r10 1002 - 1003 - .machine push 1004 - .machine "power4" 1005 - mtcrf 0x80,r9 1006 - .machine pop 1007 - 1008 - clrrdi r10,r10,2 /* clear RI before setting SRR0/1 */ 1009 - mtmsrd r10,1 1010 - 1011 - mtspr SRR0,r11 1012 - mtspr SRR1,r12 1013 - 1014 - ld r9,PACA_EXGEN+EX_R9(r13) 1015 - ld r10,PACA_EXGEN+EX_R10(r13) 1016 - ld r11,PACA_EXGEN+EX_R11(r13) 1017 - ld r12,PACA_EXGEN+EX_R12(r13) 1018 - ld r13,PACA_EXGEN+EX_R13(r13) 1019 - rfid 1020 - b . 
1021 - 1022 - slb_miss_fault: 1023 - EXCEPTION_PROLOG_COMMON(0x380, PACA_EXGEN) 1024 - ld r4,PACA_EXGEN+EX_DAR(r13) 1025 - li r5,0 1026 - std r4,_DAR(r1) 1027 - std r5,_DSISR(r1) 1028 - b handle_page_fault 1029 - 1030 - unrecov_user_slb: 1031 - EXCEPTION_PROLOG_COMMON(0x4200, PACA_EXGEN) 1032 - RECONCILE_IRQ_STATE(r10, r11) 1033 - bl save_nvgprs 1034 - 1: addi r3,r1,STACK_FRAME_OVERHEAD 1035 - bl unrecoverable_exception 1036 - b 1b 1037 - 1038 - #endif /* __DISABLED__ */ 1039 - 1040 1020 1041 1021 /* 1042 1022 * Machine check is different because we use a different ··· 1134 1230 STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception) 1135 1231 STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception) 1136 1232 1137 - .align 7 1138 - .globl __end_handlers 1139 - __end_handlers: 1140 - 1141 1233 /* Equivalents to the above handlers for relocation-on interrupt vectors */ 1142 1234 STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist) 1143 1235 MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell) ··· 1143 1243 STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable) 1144 1244 STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable) 1145 1245 STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable) 1246 + 1247 + /* 1248 + * The __end_interrupts marker must be past the out-of-line (OOL) 1249 + * handlers, so that they are copied to real address 0x100 when running 1250 + * a relocatable kernel. This ensures they can be reached from the short 1251 + * trampoline handlers (like 0x4f00, 0x4f20, etc.) which branch 1252 + * directly, without using LOAD_HANDLER(). 1253 + */ 1254 + .align 7 1255 + .globl __end_interrupts 1256 + __end_interrupts: 1146 1257 1147 1258 #if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) 1148 1259 /* ··· 1387 1476 stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. 
frame */ 1388 1477 std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ 1389 1478 1479 + #ifdef CONFIG_PPC_STD_MMU_64 1480 + BEGIN_MMU_FTR_SECTION 1390 1481 bl slb_allocate_realmode 1391 - 1482 + END_MMU_FTR_SECTION_IFCLR(MMU_FTR_RADIX) 1483 + #endif 1392 1484 /* All done -- return from exception. */ 1393 1485 1394 1486 ld r10,PACA_EXSLB+EX_LR(r13) ··· 1399 1485 lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ 1400 1486 1401 1487 mtlr r10 1402 - 1488 + BEGIN_MMU_FTR_SECTION 1489 + b 2f 1490 + END_MMU_FTR_SECTION_IFSET(MMU_FTR_RADIX) 1403 1491 andi. r10,r12,MSR_RI /* check for unrecoverable exception */ 1404 1492 beq- 2f 1405 1493 ··· 1452 1536 */ 1453 1537 .align 7 1454 1538 do_hash_page: 1455 - std r3,_DAR(r1) 1456 - std r4,_DSISR(r1) 1457 - 1539 + #ifdef CONFIG_PPC_STD_MMU_64 1458 1540 andis. r0,r4,0xa410 /* weird error? */ 1459 1541 bne- handle_page_fault /* if not, try to insert a HPTE */ 1460 1542 andis. r0,r4,DSISR_DABRMATCH@h ··· 1480 1566 1481 1567 /* Error */ 1482 1568 blt- 13f 1569 + #endif /* CONFIG_PPC_STD_MMU_64 */ 1483 1570 1484 1571 /* Here we have a page fault that hash_page can't handle. */ 1485 1572 handle_page_fault: ··· 1507 1592 12: b ret_from_except_lite 1508 1593 1509 1594 1595 + #ifdef CONFIG_PPC_STD_MMU_64 1510 1596 /* We have a page fault that hash_page could handle but HV refused 1511 1597 * the PTE insertion 1512 1598 */ ··· 1517 1601 ld r4,_DAR(r1) 1518 1602 bl low_hash_fault 1519 1603 b ret_from_except 1604 + #endif 1520 1605 1521 1606 /* 1522 1607 * We come here as a result of a DSI at a point where we don't want
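The relocated `__end_interrupts` marker and the reworded comment above both rest on the same encoding detail: LOAD_HANDLER builds a handler address with an `ori`, whose immediate field is only 16 bits, so anything it must reach has to sit in the first 64KB (0x10000) of the kernel image. A hedged C sketch of that constraint (the function names are illustrative only):

```c
#include <assert.h>
#include <stdint.h>

/* Illustrative model of LOAD_HANDLER: the kernel base lives in a
 * register and the handler offset is folded in with `ori`, whose
 * immediate is 16 bits wide. Offsets >= 0x10000 cannot be encoded. */
static uint64_t load_handler(uint64_t kernel_base, uint32_t handler_off)
{
	uint16_t imm = (uint16_t)handler_off;   /* ori truncates to 16 bits */
	return kernel_base | imm;
}

/* Returns 1 when the ori-based sequence reproduces the real address,
 * i.e. when the handler lives in the first 64KB of the image. */
static int handler_reachable(uint64_t kernel_base, uint32_t handler_off)
{
	return load_handler(kernel_base, handler_off) ==
	       (kernel_base + handler_off);
}
```

With a 64KB-aligned base, a handler at offset 0x4e20 round-trips, while one at 0x12345 silently loses its high bits, which is why the OOL handlers must sit before `__end_interrupts`.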
+10
arch/powerpc/kernel/ftrace.c
··· 607 607 return sys_call_table[nr*2]; 608 608 } 609 609 #endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_PPC64 */ 610 + 611 + #if defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) 612 + char *arch_ftrace_match_adjust(char *str, const char *search) 613 + { 614 + if (str[0] == '.' && search[0] != '.') 615 + return str + 1; 616 + else 617 + return str; 618 + } 619 + #endif /* defined(CONFIG_PPC64) && (!defined(_CALL_ELF) || _CALL_ELF != 2) */
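The new arch_ftrace_match_adjust() above handles the ELFv1 convention on ppc64, where a function's entry symbol carries a leading dot (e.g. `.schedule`): when the user's filter pattern has no dot, matching should start one character into the symbol. A standalone sketch of the same logic:

```c
#include <assert.h>
#include <string.h>

/* Mirrors the arch_ftrace_match_adjust() added above: skip the ELFv1
 * dot prefix on the symbol when the search pattern itself has none,
 * so a filter like "sched*" still matches ".schedule". */
static char *match_adjust(char *str, const char *search)
{
	if (str[0] == '.' && search[0] != '.')
		return str + 1;
	else
		return str;
}
```

A dotted pattern is left alone, so users who explicitly write `.schedule` keep exact-prefix behavior.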
+9 -6
arch/powerpc/kernel/head_64.S
··· 973 973 * This stuff goes at the beginning of the bss, which is page-aligned. 974 974 */ 975 975 .section ".bss" 976 - 977 - .align PAGE_SHIFT 978 - 979 - .globl empty_zero_page 980 - empty_zero_page: 981 - .space PAGE_SIZE 976 + /* 977 + * pgd dir should be aligned to PGD_TABLE_SIZE which is 64K. 978 + * We will need to find a better way to fix this 979 + */ 980 + .align 16 982 981 983 982 .globl swapper_pg_dir 984 983 swapper_pg_dir: 985 984 .space PGD_TABLE_SIZE 985 + 986 + .globl empty_zero_page 987 + empty_zero_page: 988 + .space PAGE_SIZE
+1 -1
arch/powerpc/kernel/ibmebus.c
··· 408 408 return len+1; 409 409 } 410 410 411 - struct device_attribute ibmebus_bus_device_attrs[] = { 411 + static struct device_attribute ibmebus_bus_device_attrs[] = { 412 412 __ATTR_RO(devspec), 413 413 __ATTR_RO(name), 414 414 __ATTR_RO(modalias),
+2 -2
arch/powerpc/kernel/isa-bridge.c
··· 109 109 size = 0x10000; 110 110 111 111 __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, 112 - size, _PAGE_NO_CACHE|_PAGE_GUARDED); 112 + size, pgprot_val(pgprot_noncached(__pgprot(0)))); 113 113 return; 114 114 115 115 inval_range: 116 116 printk(KERN_ERR "no ISA IO ranges or unexpected isa range, " 117 117 "mapping 64k\n"); 118 118 __ioremap_at(phb_io_base_phys, (void *)ISA_IO_BASE, 119 - 0x10000, _PAGE_NO_CACHE|_PAGE_GUARDED); 119 + 0x10000, pgprot_val(pgprot_noncached(__pgprot(0)))); 120 120 } 121 121 122 122
+5 -13
arch/powerpc/kernel/machine_kexec.c
··· 228 228 229 229 static void __init export_crashk_values(struct device_node *node) 230 230 { 231 - struct property *prop; 232 - 233 231 /* There might be existing crash kernel properties, but we can't 234 232 * be sure what's in them, so remove them. */ 235 - prop = of_find_property(node, "linux,crashkernel-base", NULL); 236 - if (prop) 237 - of_remove_property(node, prop); 238 - 239 - prop = of_find_property(node, "linux,crashkernel-size", NULL); 240 - if (prop) 241 - of_remove_property(node, prop); 233 + of_remove_property(node, of_find_property(node, 234 + "linux,crashkernel-base", NULL)); 235 + of_remove_property(node, of_find_property(node, 236 + "linux,crashkernel-size", NULL)); 242 237 243 238 if (crashk_res.start != 0) { 244 239 crashk_base = cpu_to_be_ulong(crashk_res.start), ··· 253 258 static int __init kexec_setup(void) 254 259 { 255 260 struct device_node *node; 256 - struct property *prop; 257 261 258 262 node = of_find_node_by_path("/chosen"); 259 263 if (!node) 260 264 return -ENOENT; 261 265 262 266 /* remove any stale properties so ours can be found */ 263 - prop = of_find_property(node, kernel_end_prop.name, NULL); 264 - if (prop) 265 - of_remove_property(node, prop); 267 + of_remove_property(node, of_find_property(node, kernel_end_prop.name, NULL)); 266 268 267 269 /* information needed by userspace when using default_machine_kexec */ 268 270 kernel_end = cpu_to_be_ulong(__pa(_end));
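The consolidation above feeds of_find_property()'s result, which may be NULL when the property is absent, straight into of_remove_property(); that only reads cleanly if the remove helper fails gracefully on a NULL property rather than dereferencing it. A toy C model of that contract, using invented stand-in types (`struct prop`, `struct node`), not the kernel's OF structures:

```c
#include <assert.h>
#include <stddef.h>
#include <string.h>

/* Hypothetical stand-ins for an OF-style property list. */
struct prop { const char *name; struct prop *next; };
struct node { struct prop *props; };

static struct prop *find_property(struct node *n, const char *name)
{
	for (struct prop *p = n->props; p; p = p->next)
		if (strcmp(p->name, name) == 0)
			return p;
	return NULL;
}

/* The find-then-remove collapse is only safe because remove rejects
 * a NULL property instead of dereferencing it. */
static int remove_property(struct node *n, struct prop *prop)
{
	if (!prop)
		return -1;             /* stands in for -ENODEV */
	for (struct prop **pp = &n->props; *pp; pp = &(*pp)->next) {
		if (*pp == prop) {
			*pp = prop->next;  /* unlink from the list */
			return 0;
		}
	}
	return -1;
}

static int demo(void)
{
	struct prop p2 = { "linux,crashkernel-size", NULL };
	struct prop p1 = { "linux,crashkernel-base", &p2 };
	struct node n = { &p1 };

	if (remove_property(&n, find_property(&n, "linux,crashkernel-base")))
		return 1;
	/* A property that was never there must fail cleanly, not crash. */
	if (remove_property(&n, find_property(&n, "no-such-prop")) != -1)
		return 2;
	return 0;
}
```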
+6 -9
arch/powerpc/kernel/machine_kexec_64.c
··· 76 76 * end of the blocked region (begin >= high). Use the 77 77 * boolean identity !(a || b) === (!a && !b). 78 78 */ 79 + #ifdef CONFIG_PPC_STD_MMU_64 79 80 if (htab_address) { 80 81 low = __pa(htab_address); 81 82 high = low + htab_size_bytes; ··· 89 88 return -ETXTBSY; 90 89 } 91 90 } 91 + #endif /* CONFIG_PPC_STD_MMU_64 */ 92 92 93 93 /* We also should not overwrite the tce tables */ 94 94 for_each_node_by_type(node, "pci") { ··· 383 381 /* NOTREACHED */ 384 382 } 385 383 386 - #ifndef CONFIG_PPC_BOOK3E 384 + #ifdef CONFIG_PPC_STD_MMU_64 387 385 /* Values we need to export to the second kernel via the device tree. */ 388 386 static unsigned long htab_base; 389 387 static unsigned long htab_size; ··· 403 401 static int __init export_htab_values(void) 404 402 { 405 403 struct device_node *node; 406 - struct property *prop; 407 404 408 405 /* On machines with no htab htab_address is NULL */ 409 406 if (!htab_address) ··· 413 412 return -ENODEV; 414 413 415 414 /* remove any stale propertys so ours can be found */ 416 - prop = of_find_property(node, htab_base_prop.name, NULL); 417 - if (prop) 418 - of_remove_property(node, prop); 419 - prop = of_find_property(node, htab_size_prop.name, NULL); 420 - if (prop) 421 - of_remove_property(node, prop); 415 + of_remove_property(node, of_find_property(node, htab_base_prop.name, NULL)); 416 + of_remove_property(node, of_find_property(node, htab_size_prop.name, NULL)); 422 417 423 418 htab_base = cpu_to_be64(__pa(htab_address)); 424 419 of_add_property(node, &htab_base_prop); ··· 425 428 return 0; 426 429 } 427 430 late_initcall(export_htab_values); 428 - #endif /* !CONFIG_PPC_BOOK3E */ 431 + #endif /* CONFIG_PPC_STD_MMU_64 */
+1 -1
arch/powerpc/kernel/mce.c
··· 37 37 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue); 38 38 39 39 static void machine_check_process_queued_event(struct irq_work *work); 40 - struct irq_work mce_event_process_work = { 40 + static struct irq_work mce_event_process_work = { 41 41 .func = machine_check_process_queued_event, 42 42 }; 43 43
+13
arch/powerpc/kernel/mce_power.c
··· 72 72 73 73 void __flush_tlb_power9(unsigned int action) 74 74 { 75 + if (radix_enabled()) 76 + flush_tlb_206(POWER9_TLB_SETS_RADIX, action); 77 + 75 78 flush_tlb_206(POWER9_TLB_SETS_HASH, action); 76 79 } 77 80 78 81 79 82 /* flush SLBs and reload */ 83 + #ifdef CONFIG_PPC_STD_MMU_64 80 84 static void flush_and_reload_slb(void) 81 85 { 82 86 struct slb_shadow *slb; ··· 114 110 asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); 115 111 } 116 112 } 113 + #endif 117 114 118 115 static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) 119 116 { ··· 125 120 * reset the error bits whenever we handle them so that at the end 126 121 * we can check whether we handled all of them or not. 127 122 * */ 123 + #ifdef CONFIG_PPC_STD_MMU_64 128 124 if (dsisr & slb_error_bits) { 129 125 flush_and_reload_slb(); 130 126 /* reset error bits */ ··· 137 131 /* reset error bits */ 138 132 dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB; 139 133 } 134 + #endif 140 135 /* Any other errors we don't understand? */ 141 136 if (dsisr & 0xffffffffUL) 142 137 handled = 0; ··· 157 150 switch (P7_SRR1_MC_IFETCH(srr1)) { 158 151 case 0: 159 152 break; 153 + #ifdef CONFIG_PPC_STD_MMU_64 160 154 case P7_SRR1_MC_IFETCH_SLB_PARITY: 161 155 case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: 162 156 /* flush and reload SLBs for SLB errors. 
*/ ··· 170 162 handled = 1; 171 163 } 172 164 break; 165 + #endif 173 166 default: 174 167 break; 175 168 } ··· 184 175 185 176 handled = mce_handle_common_ierror(srr1); 186 177 178 + #ifdef CONFIG_PPC_STD_MMU_64 187 179 if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) { 188 180 flush_and_reload_slb(); 189 181 handled = 1; 190 182 } 183 + #endif 191 184 return handled; 192 185 } 193 186 ··· 332 321 333 322 handled = mce_handle_common_ierror(srr1); 334 323 324 + #ifdef CONFIG_PPC_STD_MMU_64 335 325 if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { 336 326 flush_and_reload_slb(); 337 327 handled = 1; 338 328 } 329 + #endif 339 330 return handled; 340 331 } 341 332
-6
arch/powerpc/kernel/misc_32.S
··· 599 599 mr r4,r10 600 600 blr 601 601 602 - _GLOBAL(abs) 603 - srawi r4,r3,31 604 - xor r3,r3,r4 605 - sub r3,r3,r4 606 - blr 607 - 608 602 #ifdef CONFIG_SMP 609 603 _GLOBAL(start_secondary_resume) 610 604 /* Reset stack */
+1 -11
arch/powerpc/kernel/nvram_64.c
··· 15 15 * parsing code. 16 16 */ 17 17 18 - #include <linux/module.h> 19 - 20 18 #include <linux/types.h> 21 19 #include <linux/errno.h> 22 20 #include <linux/fs.h> ··· 1229 1231 1230 1232 return rc; 1231 1233 } 1232 - 1233 - static void __exit nvram_cleanup(void) 1234 - { 1235 - misc_deregister( &nvram_dev ); 1236 - } 1237 - 1238 - module_init(nvram_init); 1239 - module_exit(nvram_cleanup); 1240 - MODULE_LICENSE("GPL"); 1234 + device_initcall(nvram_init);
+38 -9
arch/powerpc/kernel/pci-hotplug.c
··· 21 21 #include <asm/firmware.h> 22 22 #include <asm/eeh.h> 23 23 24 + static struct pci_bus *find_bus_among_children(struct pci_bus *bus, 25 + struct device_node *dn) 26 + { 27 + struct pci_bus *child = NULL; 28 + struct pci_bus *tmp; 29 + 30 + if (pci_bus_to_OF_node(bus) == dn) 31 + return bus; 32 + 33 + list_for_each_entry(tmp, &bus->children, node) { 34 + child = find_bus_among_children(tmp, dn); 35 + if (child) 36 + break; 37 + } 38 + 39 + return child; 40 + } 41 + 42 + struct pci_bus *pci_find_bus_by_node(struct device_node *dn) 43 + { 44 + struct pci_dn *pdn = PCI_DN(dn); 45 + 46 + if (!pdn || !pdn->phb || !pdn->phb->bus) 47 + return NULL; 48 + 49 + return find_bus_among_children(pdn->phb->bus, dn); 50 + } 51 + EXPORT_SYMBOL_GPL(pci_find_bus_by_node); 52 + 24 53 /** 25 54 * pcibios_release_device - release PCI device 26 55 * @dev: PCI device ··· 67 38 } 68 39 69 40 /** 70 - * pcibios_remove_pci_devices - remove all devices under this bus 41 + * pci_hp_remove_devices - remove all devices under this bus 71 42 * @bus: the indicated PCI bus 72 43 * 73 44 * Remove all of the PCI devices under this bus both from the 74 45 * linux pci device tree, and from the powerpc EEH address cache. 
75 46 */ 76 - void pcibios_remove_pci_devices(struct pci_bus *bus) 47 + void pci_hp_remove_devices(struct pci_bus *bus) 77 48 { 78 49 struct pci_dev *dev, *tmp; 79 50 struct pci_bus *child_bus; 80 51 81 52 /* First go down child busses */ 82 53 list_for_each_entry(child_bus, &bus->children, node) 83 - pcibios_remove_pci_devices(child_bus); 54 + pci_hp_remove_devices(child_bus); 84 55 85 56 pr_debug("PCI: Removing devices on bus %04x:%02x\n", 86 57 pci_domain_nr(bus), bus->number); ··· 89 60 pci_stop_and_remove_bus_device(dev); 90 61 } 91 62 } 92 - 93 - EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); 63 + EXPORT_SYMBOL_GPL(pci_hp_remove_devices); 94 64 95 65 /** 96 - * pcibios_add_pci_devices - adds new pci devices to bus 66 + * pci_hp_add_devices - adds new pci devices to bus 97 67 * @bus: the indicated PCI bus 98 68 * 99 69 * This routine will find and fixup new pci devices under ··· 102 74 * is how this routine differs from other, similar pcibios 103 75 * routines.) 104 76 */ 105 - void pcibios_add_pci_devices(struct pci_bus * bus) 77 + void pci_hp_add_devices(struct pci_bus *bus) 106 78 { 107 79 int slotno, mode, pass, max; 108 80 struct pci_dev *dev; ··· 120 92 if (mode == PCI_PROBE_DEVTREE) { 121 93 /* use ofdt-based probe */ 122 94 of_rescan_bus(dn, bus); 123 - } else if (mode == PCI_PROBE_NORMAL) { 95 + } else if (mode == PCI_PROBE_NORMAL && 96 + dn->child && PCI_DN(dn->child)) { 124 97 /* 125 98 * Use legacy probe. In the partial hotplug case, we 126 99 * probably have grandchildren devices unplugged. So ··· 143 114 } 144 115 pcibios_finish_adding_to_bus(bus); 145 116 } 146 - EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); 117 + EXPORT_SYMBOL_GPL(pci_hp_add_devices);
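The new pci_find_bus_by_node() above is a depth-first recursion over each bus's children, returning the first bus whose device node matches. A minimal C model of the same shape, with an integer id standing in for the device_node pointer and a fixed array standing in for the kernel's list_head of children (all names here are invented):

```c
#include <assert.h>
#include <stddef.h>

/* Toy bus: an id plus a NULL-terminated child array (max 4 shown). */
struct bus {
	int id;
	struct bus *children[4];
};

static struct bus *find_among_children(struct bus *bus, int id)
{
	if (!bus)
		return NULL;
	if (bus->id == id)          /* match at this level? */
		return bus;
	for (int i = 0; i < 4 && bus->children[i]; i++) {
		struct bus *child = find_among_children(bus->children[i], id);
		if (child)
			return child;       /* first hit in depth-first order */
	}
	return NULL;
}

static int demo(void)
{
	struct bus leaf = { 7, { NULL } };
	struct bus mid  = { 3, { &leaf, NULL } };
	struct bus root = { 1, { &mid, NULL } };

	if (find_among_children(&root, 7) != &leaf)
		return 1;
	if (find_among_children(&root, 9) != NULL)
		return 2;
	return 0;
}
```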
+3 -2
arch/powerpc/kernel/pci_64.c
··· 38 38 * ISA drivers use hard coded offsets. If no ISA bus exists nothing 39 39 * is mapped on the first 64K of IO space 40 40 */ 41 - unsigned long pci_io_base = ISA_IO_BASE; 41 + unsigned long pci_io_base; 42 42 EXPORT_SYMBOL(pci_io_base); 43 43 44 44 static int __init pcibios_init(void) ··· 47 47 48 48 printk(KERN_INFO "PCI: Probing PCI hardware\n"); 49 49 50 + pci_io_base = ISA_IO_BASE; 50 51 /* For now, override phys_mem_access_prot. If we need it,g 51 52 * later, we may move that initialization to each ppc_md 52 53 */ ··· 160 159 161 160 /* Establish the mapping */ 162 161 if (__ioremap_at(phys_page, area->addr, size_page, 163 - _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL) 162 + pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL) 164 163 return -ENOMEM; 165 164 166 165 /* Fixup hose IO resource */
+51 -15
arch/powerpc/kernel/pci_dn.c
··· 282 282 #endif /* CONFIG_PCI_IOV */ 283 283 } 284 284 285 - /* 286 - * Traverse_func that inits the PCI fields of the device node. 287 - * NOTE: this *must* be done before read/write config to the device. 288 - */ 289 - void *update_dn_pci_info(struct device_node *dn, void *data) 285 + struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, 286 + struct device_node *dn) 290 287 { 291 - struct pci_controller *phb = data; 292 288 const __be32 *type = of_get_property(dn, "ibm,pci-config-space-type", NULL); 293 289 const __be32 *regs; 294 290 struct device_node *parent; ··· 295 299 return NULL; 296 300 dn->data = pdn; 297 301 pdn->node = dn; 298 - pdn->phb = phb; 302 + pdn->phb = hose; 299 303 #ifdef CONFIG_PPC_POWERNV 300 304 pdn->pe_number = IODA_INVALID_PE; 301 305 #endif ··· 327 331 if (pdn->parent) 328 332 list_add_tail(&pdn->list, &pdn->parent->child_list); 329 333 330 - return NULL; 334 + return pdn; 331 335 } 336 + EXPORT_SYMBOL_GPL(pci_add_device_node_info); 337 + 338 + void pci_remove_device_node_info(struct device_node *dn) 339 + { 340 + struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL; 341 + #ifdef CONFIG_EEH 342 + struct eeh_dev *edev = pdn_to_eeh_dev(pdn); 343 + 344 + if (edev) 345 + edev->pdn = NULL; 346 + #endif 347 + 348 + if (!pdn) 349 + return; 350 + 351 + WARN_ON(!list_empty(&pdn->child_list)); 352 + list_del(&pdn->list); 353 + if (pdn->parent) 354 + of_node_put(pdn->parent->node); 355 + 356 + dn->data = NULL; 357 + kfree(pdn); 358 + } 359 + EXPORT_SYMBOL_GPL(pci_remove_device_node_info); 332 360 333 361 /* 334 362 * Traverse a device tree stopping each PCI device in the tree. ··· 372 352 * one of these nodes we also assume its siblings are non-pci for 373 353 * performance. 
374 354 */ 375 - void *traverse_pci_devices(struct device_node *start, traverse_func pre, 376 - void *data) 355 + void *pci_traverse_device_nodes(struct device_node *start, 356 + void *(*fn)(struct device_node *, void *), 357 + void *data) 377 358 { 378 359 struct device_node *dn, *nextdn; 379 360 void *ret; ··· 389 368 if (classp) 390 369 class = of_read_number(classp, 1); 391 370 392 - if (pre && ((ret = pre(dn, data)) != NULL)) 393 - return ret; 371 + if (fn) { 372 + ret = fn(dn, data); 373 + if (ret) 374 + return ret; 375 + } 394 376 395 377 /* If we are a PCI bridge, go down */ 396 378 if (dn->child && ((class >> 8) == PCI_CLASS_BRIDGE_PCI || ··· 415 391 } 416 392 return NULL; 417 393 } 394 + EXPORT_SYMBOL_GPL(pci_traverse_device_nodes); 418 395 419 396 static struct pci_dn *pci_dn_next_one(struct pci_dn *root, 420 397 struct pci_dn *pdn) ··· 457 432 return NULL; 458 433 } 459 434 435 + static void *add_pdn(struct device_node *dn, void *data) 436 + { 437 + struct pci_controller *hose = data; 438 + struct pci_dn *pdn; 439 + 440 + pdn = pci_add_device_node_info(hose, dn); 441 + if (!pdn) 442 + return ERR_PTR(-ENOMEM); 443 + 444 + return NULL; 445 + } 446 + 460 447 /** 461 448 * pci_devs_phb_init_dynamic - setup pci devices under this PHB 462 449 * phb: pci-to-host bridge (top-level bridge connecting to cpu) ··· 483 446 struct pci_dn *pdn; 484 447 485 448 /* PHB nodes themselves must not match */ 486 - update_dn_pci_info(dn, phb); 487 - pdn = dn->data; 449 + pdn = pci_add_device_node_info(phb, dn); 488 450 if (pdn) { 489 451 pdn->devfn = pdn->busno = -1; 490 452 pdn->vendor_id = pdn->device_id = pdn->class_code = 0; ··· 492 456 } 493 457 494 458 /* Update dn->phb ptrs for new phb and children devices */ 495 - traverse_pci_devices(dn, update_dn_pci_info, phb); 459 + pci_traverse_device_nodes(dn, add_pdn, phb); 496 460 } 497 461 498 462 /**
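The renamed pci_traverse_device_nodes() above tightens the callback contract: the walk stops the moment the callback returns non-NULL and propagates that value to the caller, which is how the new add_pdn() bails out with ERR_PTR(-ENOMEM). A small C sketch of that early-abort pattern over a flat array (the real code's tree-walk and bridge-descent details are elided, and these names are illustrative):

```c
#include <assert.h>
#include <stddef.h>

/* Visit each node in order; a non-NULL return from the callback
 * aborts the walk and is handed straight back to the caller. */
static void *traverse(int *nodes, int n,
		      void *(*fn)(int *node, void *data), void *data)
{
	for (int i = 0; i < n; i++) {
		void *ret = fn(&nodes[i], data);
		if (ret)
			return ret;   /* early abort, like add_pdn's ERR_PTR */
	}
	return NULL;
}

/* Example callback: count visits, stop at the first negative node. */
static void *stop_at_negative(int *node, void *data)
{
	int *visited = data;
	(*visited)++;
	return *node < 0 ? node : NULL;
}

static int demo(void)
{
	int nodes[] = { 1, 2, -3, 4 };
	int visited = 0;
	int *hit = traverse(nodes, 4, stop_at_negative, &visited);

	if (!hit || *hit != -3)
		return 1;
	if (visited != 3)             /* node 4 was never visited */
		return 2;
	return 0;
}
```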
+11 -6
arch/powerpc/kernel/process.c
···
 #include <linux/random.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/uaccess.h>
+#include <linux/elf-randomize.h>

 #include <asm/pgtable.h>
 #include <asm/io.h>
···
 #include <asm/firmware.h>
 #endif
 #include <asm/code-patching.h>
+#include <asm/exec.h>
 #include <asm/livepatch.h>

 #include <linux/kprobes.h>
···
 }
 #endif /* CONFIG_PPC64 */

-#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_PPC_STD_MMU_64
 	batch = this_cpu_ptr(&ppc64_tlb_batch);
 	if (batch->active) {
 		current_thread_info()->local_flags |= _TLF_LAZY_MMU;
···
 		__flush_tlb_pending(batch);
 		batch->active = 0;
 	}
-#endif /* CONFIG_PPC_BOOK3S_64 */
+#endif /* CONFIG_PPC_STD_MMU_64 */

 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 	switch_booke_debug_regs(&new->thread.debug);
···

 	last = _switch(old_thread, new_thread);

-#ifdef CONFIG_PPC_BOOK3S_64
+#ifdef CONFIG_PPC_STD_MMU_64
 	if (current_thread_info()->local_flags & _TLF_LAZY_MMU) {
 		current_thread_info()->local_flags &= ~_TLF_LAZY_MMU;
 		batch = this_cpu_ptr(&ppc64_tlb_batch);
···

 	if (current_thread_info()->task->thread.regs)
 		restore_math(current_thread_info()->task->thread.regs);
-
-#endif /* CONFIG_PPC_BOOK3S_64 */
+#endif /* CONFIG_PPC_STD_MMU_64 */

 	return last;
 }
···
 #ifdef CONFIG_PPC_STD_MMU_64
 	unsigned long sp_vsid;
 	unsigned long llp = mmu_psize_defs[mmu_linear_psize].sllp;
+
+	if (radix_enabled())
+		return;

 	if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
 		sp_vsid = get_kernel_vsid(sp, MMU_SEGSIZE_1T)
···
 	 * the heap, we can put it above 1TB so it is backed by a 1TB
 	 * segment. Otherwise the heap will be in the bottom 1TB
 	 * which always uses 256MB segments and this may result in a
-	 * performance penalty.
+	 * performance penalty. We don't need to worry about radix. For
+	 * radix, mmu_highuser_ssize remains unchanged from 256MB.
 	 */
 	if (!is_32bit_task() && (mmu_highuser_ssize == MMU_SEGSIZE_1T))
 		base = max_t(unsigned long, mm->brk, 1UL << SID_SHIFT_1T);
+2
arch/powerpc/kernel/prom.c
···
 #include <linux/of.h>
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
+#include <linux/cpu.h>

 #include <asm/prom.h>
 #include <asm/rtas.h>
···
 	 */
 	{CPU_FTR_TM_COMP, 0, 0,
 	 PPC_FEATURE2_HTM_COMP|PPC_FEATURE2_HTM_NOSC_COMP, 22, 0, 0},
+	{0, MMU_FTR_RADIX, 0, 0, 40, 0, 0},
 };

 static void __init scan_features(unsigned long node, const unsigned char *ftrs,
+1 -1
arch/powerpc/kernel/rtasd.c
···
 }

 static void rtas_event_scan(struct work_struct *w);
-DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);
+static DECLARE_DELAYED_WORK(event_scan_work, rtas_event_scan);

 /*
  * Delay should be at least one second since some machines have problems if
-6
arch/powerpc/kernel/setup-common.c
···
 	machine_shutdown();
 	if (ppc_md.restart)
 		ppc_md.restart(cmd);
-#ifdef CONFIG_SMP
 	smp_send_stop();
-#endif
 	printk(KERN_EMERG "System Halted, OK to turn off power\n");
 	local_irq_disable();
 	while (1) ;
···
 	machine_shutdown();
 	if (pm_power_off)
 		pm_power_off();
-#ifdef CONFIG_SMP
 	smp_send_stop();
-#endif
 	printk(KERN_EMERG "System Halted, OK to turn off power\n");
 	local_irq_disable();
 	while (1) ;
···
 	machine_shutdown();
 	if (ppc_md.halt)
 		ppc_md.halt();
-#ifdef CONFIG_SMP
 	smp_send_stop();
-#endif
 	printk(KERN_EMERG "System Halted, OK to turn off power\n");
 	local_irq_disable();
 	while (1) ;
+1 -1
arch/powerpc/kernel/swsusp.c
···
 void restore_processor_state(void)
 {
 #ifdef CONFIG_PPC32
-	switch_mmu_context(current->active_mm, current->active_mm);
+	switch_mmu_context(current->active_mm, current->active_mm, NULL);
 #endif
 }
+1
arch/powerpc/kernel/time.c
···
 #include <linux/delay.h>
 #include <linux/irq_work.h>
 #include <linux/clk-provider.h>
+#include <linux/suspend.h>
 #include <asm/trace.h>

 #include <asm/io.h>
+2 -2
arch/powerpc/kernel/vio.c
···
 * @curr: bytes currently allocated
 * @high: high water mark for IO data usage
 */
-struct vio_cmo {
+static struct vio_cmo {
 	spinlock_t lock;
 	struct delayed_work balance_q;
 	struct list_head device_list;
···
 	return dma_iommu_ops.get_required_mask(dev);
 }

-struct dma_map_ops vio_dma_mapping_ops = {
+static struct dma_map_ops vio_dma_mapping_ops = {
 	.alloc = vio_dma_iommu_alloc_coherent,
 	.free = vio_dma_iommu_free_coherent,
 	.mmap = dma_direct_mmap_coherent,
+5 -6
arch/powerpc/kvm/book3s_64_mmu_hv.c
···
 	struct revmap_entry *rev;
 	struct page *page, *pages[1];
 	long index, ret, npages;
-	unsigned long is_io;
+	bool is_ci;
 	unsigned int writing, write_ok;
 	struct vm_area_struct *vma;
 	unsigned long rcbits;
···
 	smp_rmb();

 	ret = -EFAULT;
-	is_io = 0;
+	is_ci = false;
 	pfn = 0;
 	page = NULL;
 	pte_size = PAGE_SIZE;
···
 			pfn = vma->vm_pgoff +
 				((hva - vma->vm_start) >> PAGE_SHIFT);
 			pte_size = psize;
-			is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
+			is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
 			write_ok = vma->vm_flags & VM_WRITE;
 		}
 	up_read(&current->mm->mmap_sem);
···
 		goto out_put;

 	/* Check WIMG vs. the actual page we're accessing */
-	if (!hpte_cache_flags_ok(r, is_io)) {
-		if (is_io)
+	if (!hpte_cache_flags_ok(r, is_ci)) {
+		if (is_ci)
 			goto out_put;
-
 		/*
 		 * Allow guest to map emulated device memory as
 		 * uncacheable, but actually make it cacheable.
+6
arch/powerpc/kvm/book3s_hv.c
···
 	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
 	    !cpu_has_feature(CPU_FTR_ARCH_206))
 		return -EIO;
+	/*
+	 * Disable KVM for Power9 until the required bits are merged.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return -EIO;
+
 	return 0;
 }
+6 -6
arch/powerpc/kvm/book3s_hv_rm_mmu.c
···
 	unsigned long g_ptel;
 	struct kvm_memory_slot *memslot;
 	unsigned hpage_shift;
-	unsigned long is_io;
+	bool is_ci;
 	unsigned long *rmap;
 	pte_t *ptep;
 	unsigned int writing;
···
 	gfn = gpa >> PAGE_SHIFT;
 	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
 	pa = 0;
-	is_io = ~0ul;
+	is_ci = false;
 	rmap = NULL;
 	if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
 		/* Emulated MMIO - mark this with key=31 */
···
 		if (writing && !pte_write(pte))
 			/* make the actual HPTE be read-only */
 			ptel = hpte_make_readonly(ptel);
-		is_io = hpte_cache_bits(pte_val(pte));
+		is_ci = pte_ci(pte);
 		pa = pte_pfn(pte) << PAGE_SHIFT;
 		pa |= hva & (host_pte_size - 1);
 		pa |= gpa & ~PAGE_MASK;
···
 	else
 		pteh |= HPTE_V_ABSENT;

-	/* Check WIMG */
-	if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
-		if (is_io)
+	/* If we had a host pte mapping then check WIMG */
+	if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) {
+		if (is_ci)
 			return H_PARAMETER;
 		/*
 		 * Allow guest to map emulated device memory as
+5 -1
arch/powerpc/kvm/book3s_pr.c
···
 static int kvmppc_core_check_processor_compat_pr(void)
 {
-	/* we are always compatible */
+	/*
+	 * Disable KVM for Power9 until the required bits are merged.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return -EIO;
 	return 0;
 }
+1 -1
arch/powerpc/lib/copy_32.S
···
 	bdnz	40b
65:	blr

-_GLOBAL(generic_memcpy)
+generic_memcpy:
 	srwi.	r7,r5,3
 	addi	r6,r3,-4
 	addi	r4,r4,-4
+5
arch/powerpc/lib/sstep.c
···
 		}
 	}
 #endif
+	break; /* illegal instruction */

 	case 31:
 		switch ((instr >> 1) & 0x3ff) {
···
 		case 4:
 			__get_user_asmx(val, op.ea, err, "lwarx");
 			break;
+#ifdef __powerpc64__
 		case 8:
 			__get_user_asmx(val, op.ea, err, "ldarx");
 			break;
+#endif
 		default:
 			return 0;
 		}
···
 		case 4:
 			__put_user_asmx(op.val, op.ea, err, "stwcx.", cr);
 			break;
+#ifdef __powerpc64__
 		case 8:
 			__put_user_asmx(op.val, op.ea, err, "stdcx.", cr);
 			break;
+#endif
 		default:
 			return 0;
 		}
+10
arch/powerpc/lib/xor_vmx.c
···
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
+
+/*
+ * Sparse (as at v0.5.0) gets very, very confused by this file.
+ * Make it a bit simpler for it.
+ */
+#if !defined(__CHECKER__)
 #include <altivec.h>
+#else
+#define vec_xor(a, b) a ^ b
+#define vector __attribute__((vector_size(16)))
+#endif

 #include <linux/preempt.h>
 #include <linux/export.h>
+6 -4
arch/powerpc/mm/Makefile
···
 				   tlb_nohash_low.o
 obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(CONFIG_WORD_SIZE)e.o
 hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
-obj-$(CONFIG_PPC_STD_MMU_64)	+= hash_utils_64.o slb_low.o slb.o $(hash64-y)
-obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o hash_low_32.o
-obj-$(CONFIG_PPC_STD_MMU)	+= tlb_hash$(CONFIG_WORD_SIZE).o \
-				   mmu_context_hash$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_PPC_BOOK3E_64)	+= pgtable-book3e.o
+obj-$(CONFIG_PPC_STD_MMU_64)	+= pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
+obj-$(CONFIG_PPC_RADIX_MMU)	+= pgtable-radix.o tlb-radix.o
+obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
+obj-$(CONFIG_PPC_STD_MMU)	+= tlb_hash$(CONFIG_WORD_SIZE).o
 ifeq ($(CONFIG_PPC_STD_MMU_64),y)
 obj-$(CONFIG_PPC_4K_PAGES)	+= hash64_4k.o
 obj-$(CONFIG_PPC_64K_PAGES)	+= hash64_64k.o
···
 obj-y				+= hugetlbpage.o
 ifeq ($(CONFIG_HUGETLB_PAGE),y)
 obj-$(CONFIG_PPC_STD_MMU_64)	+= hugetlbpage-hash64.o
+obj-$(CONFIG_PPC_RADIX_MMU)	+= hugetlbpage-radix.o
 obj-$(CONFIG_PPC_BOOK3E_MMU)	+= hugetlbpage-book3e.o
 endif
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += hugepage-hash64.o
+1 -1
arch/powerpc/mm/fsl_booke_mmu.c
···
 	TLBCAM[index].MAS7 = (u64)phys >> 32;

 	/* Below is unlikely -- only for large user pages or similar */
-	if (pte_user(flags)) {
+	if (pte_user(__pte(flags))) {
 		TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR;
 		TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0);
 	}
+15 -14
arch/powerpc/mm/hash64_4k.c
···
 		old_pte = pte_val(pte);
 		/* If PTE busy, retry the access */
-		if (unlikely(old_pte & _PAGE_BUSY))
+		if (unlikely(old_pte & H_PAGE_BUSY))
 			return 0;
 		/* If PTE permissions don't match, take page fault */
-		if (unlikely(access & ~old_pte))
+		if (unlikely(!check_pte_access(access, old_pte)))
 			return 1;
 		/*
 		 * Try to lock the PTE, add ACCESSED and DIRTY if it was
 		 * a write access. Since this is 4K insert of 64K page size
-		 * also add _PAGE_COMBO
+		 * also add H_PAGE_COMBO
 		 */
-		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
-		if (access & _PAGE_RW)
+		new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+		if (access & _PAGE_WRITE)
 			new_pte |= _PAGE_DIRTY;
-	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
-					  old_pte, new_pte));
+	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
 	/*
 	 * PP bits. _PAGE_USER is already PP bit 0x2, so we only
 	 * need to add in 0x1 if it's a read-only user page
···
 		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);

 	vpn = hpt_vpn(ea, vsid, ssize);
-	if (unlikely(old_pte & _PAGE_HASHPTE)) {
+	if (unlikely(old_pte & H_PAGE_HASHPTE)) {
 		/*
 		 * There MIGHT be an HPTE for this pte
 		 */
 		hash = hpt_hash(vpn, shift, ssize);
-		if (old_pte & _PAGE_F_SECOND)
+		if (old_pte & H_PAGE_F_SECOND)
 			hash = ~hash;
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT;
+		slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;

 		if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K,
 					 MMU_PAGE_4K, ssize, flags) == -1)
 			old_pte &= ~_PAGE_HPTEFLAGS;
 	}

-	if (likely(!(old_pte & _PAGE_HASHPTE))) {
+	if (likely(!(old_pte & H_PAGE_HASHPTE))) {

 		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 		hash = hpt_hash(vpn, shift, ssize);
···
 				   MMU_PAGE_4K, MMU_PAGE_4K, old_pte);
 			return -1;
 		}
-		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-		new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+		new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
+			(H_PAGE_F_SECOND | H_PAGE_F_GIX);
 	}
-	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
 }
+35 -36
arch/powerpc/mm/hash64_64k.c
···
 	unsigned long g_idx;
 	unsigned long ptev = pte_val(rpte.pte);

-	g_idx = (ptev & _PAGE_COMBO_VALID) >> _PAGE_F_GIX_SHIFT;
+	g_idx = (ptev & H_PAGE_COMBO_VALID) >> H_PAGE_F_GIX_SHIFT;
 	index = index >> 2;
 	if (g_idx & (0x1 << index))
 		return true;
···
 {
 	unsigned long g_idx;

-	if (!(ptev & _PAGE_COMBO))
+	if (!(ptev & H_PAGE_COMBO))
 		return ptev;
 	index = index >> 2;
 	g_idx = 0x1 << index;

-	return ptev | (g_idx << _PAGE_F_GIX_SHIFT);
+	return ptev | (g_idx << H_PAGE_F_GIX_SHIFT);
 }

 int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
···
 		old_pte = pte_val(pte);
 		/* If PTE busy, retry the access */
-		if (unlikely(old_pte & _PAGE_BUSY))
+		if (unlikely(old_pte & H_PAGE_BUSY))
 			return 0;
 		/* If PTE permissions don't match, take page fault */
-		if (unlikely(access & ~old_pte))
+		if (unlikely(!check_pte_access(access, old_pte)))
 			return 1;
 		/*
 		 * Try to lock the PTE, add ACCESSED and DIRTY if it was
 		 * a write access. Since this is 4K insert of 64K page size
-		 * also add _PAGE_COMBO
+		 * also add H_PAGE_COMBO
 		 */
-		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_COMBO;
-		if (access & _PAGE_RW)
+		new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED | H_PAGE_COMBO;
+		if (access & _PAGE_WRITE)
 			new_pte |= _PAGE_DIRTY;
-	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
-					  old_pte, new_pte));
+	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+
 	/*
 	 * Handle the subpage protection bits
 	 */
···
 	/*
 	 * None of the sub 4k pages is hashed
 	 */
-	if (!(old_pte & _PAGE_HASHPTE))
+	if (!(old_pte & H_PAGE_HASHPTE))
 		goto htab_insert_hpte;
 	/*
 	 * Check if the pte was already inserted into the hash table
 	 * as a 64k HW page, and invalidate the 64k HPTE if so.
 	 */
-	if (!(old_pte & _PAGE_COMBO)) {
+	if (!(old_pte & H_PAGE_COMBO)) {
 		flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
 		/*
 		 * Clear the old slot details from the old and new pte.
 		 * On hash insert failure we use the old pte value and we don't
 		 * want slot information there if we have an insert failure.
 		 */
-		old_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND);
-		new_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND);
+		old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
+		new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
 		goto htab_insert_hpte;
 	}
 	/*
···
 		if (ret == -1)
 			goto htab_insert_hpte;

-		*ptep = __pte(new_pte & ~_PAGE_BUSY);
+		*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 		return 0;
 	}

htab_insert_hpte:
 	/*
-	 * handle _PAGE_4K_PFN case
+	 * handle H_PAGE_4K_PFN case
 	 */
-	if (old_pte & _PAGE_4K_PFN) {
+	if (old_pte & H_PAGE_4K_PFN) {
 		/*
 		 * All the sub 4k page have the same
 		 * physical address.
···
 	}
 	/*
 	 * Insert slot number & secondary bit in PTE second half,
-	 * clear _PAGE_BUSY and set appropriate HPTE slot bit
-	 * Since we have _PAGE_BUSY set on ptep, we can be sure
+	 * clear H_PAGE_BUSY and set appropriate HPTE slot bit
+	 * Since we have H_PAGE_BUSY set on ptep, we can be sure
 	 * nobody is updating hidx.
 	 */
 	hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
 	rpte.hidx &= ~(0xfUL << (subpg_index << 2));
 	*hidxp = rpte.hidx | (slot << (subpg_index << 2));
 	new_pte = mark_subptegroup_valid(new_pte, subpg_index);
-	new_pte |= _PAGE_HASHPTE;
+	new_pte |= H_PAGE_HASHPTE;
 	/*
 	 * check __real_pte for details on matching smp_rmb()
 	 */
 	smp_wmb();
-	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
 }
···
 		   unsigned long vsid, pte_t *ptep, unsigned long trap,
 		   unsigned long flags, int ssize)
 {
-
 	unsigned long hpte_group;
 	unsigned long rflags, pa;
 	unsigned long old_pte, new_pte;
···
 		old_pte = pte_val(pte);
 		/* If PTE busy, retry the access */
-		if (unlikely(old_pte & _PAGE_BUSY))
+		if (unlikely(old_pte & H_PAGE_BUSY))
 			return 0;
 		/* If PTE permissions don't match, take page fault */
-		if (unlikely(access & ~old_pte))
+		if (unlikely(!check_pte_access(access, old_pte)))
 			return 1;
 		/*
 		 * Check if PTE has the cache-inhibit bit set
 		 * If so, bail out and refault as a 4k page
 		 */
 		if (!mmu_has_feature(MMU_FTR_CI_LARGE_PAGE) &&
-		    unlikely(old_pte & _PAGE_NO_CACHE))
+		    unlikely(pte_ci(pte)))
 			return 0;
 		/*
 		 * Try to lock the PTE, add ACCESSED and DIRTY if it was
 		 * a write access.
 		 */
-		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
-		if (access & _PAGE_RW)
+		new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
+		if (access & _PAGE_WRITE)
 			new_pte |= _PAGE_DIRTY;
-	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
-					  old_pte, new_pte));
+	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));

 	rflags = htab_convert_pte_flags(new_pte);
···
 		rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap);

 	vpn = hpt_vpn(ea, vsid, ssize);
-	if (unlikely(old_pte & _PAGE_HASHPTE)) {
+	if (unlikely(old_pte & H_PAGE_HASHPTE)) {
 		/*
 		 * There MIGHT be an HPTE for this pte
 		 */
 		hash = hpt_hash(vpn, shift, ssize);
-		if (old_pte & _PAGE_F_SECOND)
+		if (old_pte & H_PAGE_F_SECOND)
 			hash = ~hash;
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT;
+		slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;

 		if (ppc_md.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
 					 MMU_PAGE_64K, ssize, flags) == -1)
 			old_pte &= ~_PAGE_HPTEFLAGS;
 	}

-	if (likely(!(old_pte & _PAGE_HASHPTE))) {
+	if (likely(!(old_pte & H_PAGE_HASHPTE))) {

 		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;
 		hash = hpt_hash(vpn, shift, ssize);
···
 				   MMU_PAGE_64K, MMU_PAGE_64K, old_pte);
 			return -1;
 		}
-		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
-		new_pte |= (slot << _PAGE_F_GIX_SHIFT) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
+		new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
+			(H_PAGE_F_SECOND | H_PAGE_F_GIX);
 	}
-	*ptep = __pte(new_pte & ~_PAGE_BUSY);
+	*ptep = __pte(new_pte & ~H_PAGE_BUSY);
 	return 0;
 }
+10 -1
arch/powerpc/mm/hash_native_64.c
···
 		return -1;

 	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
+	hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;

 	if (!(vflags & HPTE_V_BOLTED)) {
 		DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n",
···
 	local_irq_restore(flags);
 }

+static int native_update_partition_table(u64 patb1)
+{
+	partition_tb->patb1 = cpu_to_be64(patb1);
+	return 0;
+}
+
 void __init hpte_init_native(void)
 {
 	ppc_md.hpte_invalidate = native_hpte_invalidate;
···
 	ppc_md.hpte_clear_all = native_hpte_clear;
 	ppc_md.flush_hash_range = native_flush_hash_range;
 	ppc_md.hugepage_invalidate = native_hugepage_invalidate;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		ppc_md.update_partition_table = native_update_partition_table;
 }
+149 -41
arch/powerpc/mm/hash_utils_64.c
···
 	if ((pteflags & _PAGE_EXEC) == 0)
 		rflags |= HPTE_R_N;
 	/*
-	 * PP bits:
+	 * PPP bits:
 	 * Linux uses slb key 0 for kernel and 1 for user.
-	 * kernel areas are mapped with PP=00
-	 * and there is no kernel RO (_PAGE_KERNEL_RO).
-	 * User area is mapped with PP=0x2 for read/write
-	 * or PP=0x3 for read-only (including writeable but clean pages).
+	 * kernel RW areas are mapped with PPP=0b000
+	 * User area is mapped with PPP=0b010 for read/write
+	 * or PPP=0b011 for read-only (including writeable but clean pages).
 	 */
-	if (pteflags & _PAGE_USER) {
-		rflags |= 0x2;
-		if (!((pteflags & _PAGE_RW) && (pteflags & _PAGE_DIRTY)))
+	if (pteflags & _PAGE_PRIVILEGED) {
+		/*
+		 * Kernel read only mapped with ppp bits 0b110
+		 */
+		if (!(pteflags & _PAGE_WRITE))
+			rflags |= (HPTE_R_PP0 | 0x2);
+	} else {
+		if (pteflags & _PAGE_RWX)
+			rflags |= 0x2;
+		if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
 			rflags |= 0x1;
 	}
 	/*
···
 	/*
 	 * Add in WIG bits
 	 */
-	if (pteflags & _PAGE_WRITETHRU)
-		rflags |= HPTE_R_W;
-	if (pteflags & _PAGE_NO_CACHE)
+
+	if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_TOLERANT)
 		rflags |= HPTE_R_I;
-	if (pteflags & _PAGE_GUARDED)
-		rflags |= HPTE_R_G;
+	if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_NON_IDEMPOTENT)
+		rflags |= (HPTE_R_I | HPTE_R_G);
+	if ((pteflags & _PAGE_CACHE_CTL) == _PAGE_SAO)
+		rflags |= (HPTE_R_I | HPTE_R_W);

 	return rflags;
 }
···
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */

+static void __init hash_init_partition_table(phys_addr_t hash_table,
+					     unsigned long pteg_count)
+{
+	unsigned long ps_field;
+	unsigned long htab_size;
+	unsigned long patb_size = 1UL << PATB_SIZE_SHIFT;
+
+	/*
+	 * slb llp encoding for the page size used in VPM real mode.
+	 * We can ignore that for lpid 0
+	 */
+	ps_field = 0;
+	htab_size = __ilog2(pteg_count) - 11;
+
+	BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large.");
+	partition_tb = __va(memblock_alloc_base(patb_size, patb_size,
+						MEMBLOCK_ALLOC_ANYWHERE));
+
+	/* Initialize the Partition Table with no entries */
+	memset((void *)partition_tb, 0, patb_size);
+	partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size);
+	/*
+	 * FIXME!! This should be done via update_partition table
+	 * For now UPRT is 0 for us.
+	 */
+	partition_tb->patb1 = 0;
+	DBG("Partition table %p\n", partition_tb);
+	/*
+	 * update partition table control register,
+	 * 64 K size.
+	 */
+	mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+
+}
+
 static void __init htab_initialize(void)
 {
 	unsigned long table;
···
 		/* Initialize the HPT with no entries */
 		memset((void *)table, 0, htab_size_bytes);

-		/* Set SDR1 */
-		mtspr(SPRN_SDR1, _SDR1);
+		if (!cpu_has_feature(CPU_FTR_ARCH_300))
+			/* Set SDR1 */
+			mtspr(SPRN_SDR1, _SDR1);
+		else
+			hash_init_partition_table(table, pteg_count);
 	}

 	prot = pgprot_val(PAGE_KERNEL);
···
 #undef KB
 #undef MB

-void __init early_init_mmu(void)
+void __init hash__early_init_mmu(void)
 {
+	/*
+	 * initialize page table size
+	 */
+	__pte_frag_nr = H_PTE_FRAG_NR;
+	__pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;
+
+	__pte_index_size = H_PTE_INDEX_SIZE;
+	__pmd_index_size = H_PMD_INDEX_SIZE;
+	__pud_index_size = H_PUD_INDEX_SIZE;
+	__pgd_index_size = H_PGD_INDEX_SIZE;
+	__pmd_cache_index = H_PMD_CACHE_INDEX;
+	__pte_table_size = H_PTE_TABLE_SIZE;
+	__pmd_table_size = H_PMD_TABLE_SIZE;
+	__pud_table_size = H_PUD_TABLE_SIZE;
+	__pgd_table_size = H_PGD_TABLE_SIZE;
+	/*
+	 * 4k uses the hugepd format, so for hash set these to
+	 * zero
+	 */
+	__pmd_val_bits = 0;
+	__pud_val_bits = 0;
+	__pgd_val_bits = 0;
+
+	__kernel_virt_start = H_KERN_VIRT_START;
+	__kernel_virt_size = H_KERN_VIRT_SIZE;
+	__vmalloc_start = H_VMALLOC_START;
+	__vmalloc_end = H_VMALLOC_END;
+	vmemmap = (struct page *)H_VMEMMAP_BASE;
+	ioremap_bot = IOREMAP_BASE;
+
 	/* Initialize the MMU Hash table and create the linear mapping
 	 * of memory. Has to be done before SLB initialization as this is
 	 * currently where the page size encoding is obtained.
···
 }

 #ifdef CONFIG_SMP
-void early_init_mmu_secondary(void)
+void hash__early_init_mmu_secondary(void)
 {
 	/* Initialize hash table for that CPU */
-	if (!firmware_has_feature(FW_FEATURE_LPAR))
-		mtspr(SPRN_SDR1, _SDR1);
-
+	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+		if (!cpu_has_feature(CPU_FTR_ARCH_300))
+			mtspr(SPRN_SDR1, _SDR1);
+		else
+			mtspr(SPRN_PTCR,
+			      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+	}
 	/* Initialize SLB */
 	slb_initialize();
 }
···
 * Userspace sets the subpage permissions using the subpage_prot system call.
 *
 * Result is 0: full permissions, _PAGE_RW: read-only,
- * _PAGE_USER or _PAGE_USER|_PAGE_RW: no access.
+ * _PAGE_RWX: no access.
 */
 static int subpage_protection(struct mm_struct *mm, unsigned long ea)
 {
···
 	/* extract 2-bit bitfield for this 4k subpage */
 	spp >>= 30 - 2 * ((ea >> 12) & 0xf);

-	/* turn 0,1,2,3 into combination of _PAGE_USER and _PAGE_RW */
-	spp = ((spp & 2) ? _PAGE_USER : 0) | ((spp & 1) ? _PAGE_RW : 0);
+	/*
+	 * 0 -> full permission
+	 * 1 -> Read only
+	 * 2 -> no access.
+	 * We return the flags that need to be cleared.
+	 */
+	spp = ((spp & 2) ? _PAGE_RWX : 0) | ((spp & 1) ? _PAGE_WRITE : 0);
 	return spp;
 }
···
 	/* Pre-check access permissions (will be re-checked atomically
 	 * in __hash_page_XX but this pre-check is a fast path
 	 */
-	if (access & ~pte_val(*ptep)) {
+	if (!check_pte_access(access, pte_val(*ptep))) {
 		DBG_LOW(" no access !\n");
 		rc = 1;
 		goto bail;
···
 #endif
 	/* Do actual hashing */
 #ifdef CONFIG_PPC_64K_PAGES
-	/* If _PAGE_4K_PFN is set, make sure this is a 4k segment */
-	if ((pte_val(*ptep) & _PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
+	/* If H_PAGE_4K_PFN is set, make sure this is a 4k segment */
+	if ((pte_val(*ptep) & H_PAGE_4K_PFN) && psize == MMU_PAGE_64K) {
 		demote_segment_4k(mm, ea);
 		psize = MMU_PAGE_4K;
 	}
···
 	/* If this PTE is non-cacheable and we have restrictions on
 	 * using non cacheable large pages, then we switch to 4k
 	 */
-	if (mmu_ci_restrictions && psize == MMU_PAGE_64K &&
-	    (pte_val(*ptep) & _PAGE_NO_CACHE)) {
+	if (mmu_ci_restrictions && psize == MMU_PAGE_64K && pte_ci(*ptep)) {
 		if (user_region) {
 			demote_segment_4k(mm, ea);
 			psize = MMU_PAGE_4K;
···
 int __hash_page(unsigned long ea, unsigned long msr, unsigned long trap,
 		unsigned long dsisr)
 {
-	unsigned long access = _PAGE_PRESENT;
+	unsigned long access = _PAGE_PRESENT | _PAGE_READ;
 	unsigned long flags = 0;
 	struct mm_struct *mm = current->mm;
···
 		flags |= HPTE_NOHPTE_UPDATE;

 	if (dsisr & DSISR_ISSTORE)
-		access |= _PAGE_RW;
+		access |= _PAGE_WRITE;
 	/*
-	 * We need to set the _PAGE_USER bit if MSR_PR is set or if we are
-	 * accessing a userspace segment (even from the kernel). We assume
-	 * kernel addresses always have the high bit set.
+	 * We set _PAGE_PRIVILEGED only when
+	 * kernel mode accesses kernel space.
+	 *
+	 * _PAGE_PRIVILEGED is NOT set
+	 * 1) when kernel mode accesses user space
+	 * 2) user space accesses kernel space.
 	 */
+	access |= _PAGE_PRIVILEGED;
 	if ((msr & MSR_PR) || (REGION_ID(ea) == USER_REGION_ID))
-		access |= _PAGE_USER;
+		access &= ~_PAGE_PRIVILEGED;

 	if (trap == 0x400)
 		access |= _PAGE_EXEC;

 	return hash_page_mm(mm, ea, access, trap, flags);
 }
+
+#ifdef CONFIG_PPC_MM_SLICES
+static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+	int psize = get_slice_psize(mm, ea);
+
+	/* We only prefault standard pages for now */
+	if (unlikely(psize != mm->context.user_psize))
+		return false;
+
+	/*
+	 * Don't prefault if subpage protection is enabled for the EA.
+	 */
+	if (unlikely((psize == MMU_PAGE_4K) && subpage_protection(mm, ea)))
+		return false;
+
+	return true;
+}
+#else
+static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
+{
+	return true;
+}
+#endif

 void hash_preload(struct mm_struct *mm, unsigned long ea,
 		  unsigned long access, unsigned long trap)
···
 	BUG_ON(REGION_ID(ea) != USER_REGION_ID);

-#ifdef CONFIG_PPC_MM_SLICES
-	/* We only prefault standard pages for now */
-	if (unlikely(get_slice_psize(mm, ea) != mm->context.user_psize))
+	if (!should_hash_preload(mm, ea))
 		return;
-#endif

 	DBG_LOW("hash_preload(mm=%p, mm->pgdir=%p, ea=%016lx, access=%lx,"
 		" trap=%lx\n", mm, mm->pgd, ea, access, trap);
···
 	WARN_ON(hugepage_shift);
 #ifdef CONFIG_PPC_64K_PAGES
-	/* If either _PAGE_4K_PFN or _PAGE_NO_CACHE is set (and we are on
+	/* If either H_PAGE_4K_PFN or cache inhibited is set (and we are on
 	 * a 64K kernel), then we don't preload, hash_page() will take
 	 * care of it once we actually try to access the page.
 	 * That way we don't have to duplicate all of the logic for segment
 	 * page size demotion here
 	 */
-	if (pte_val(*ptep) & (_PAGE_4K_PFN | _PAGE_NO_CACHE))
+	if ((pte_val(*ptep) & H_PAGE_4K_PFN) || pte_ci(*ptep))
 		goto out_exit;
 #endif /* CONFIG_PPC_64K_PAGES */
···
 }
 #endif /* CONFIG_DEBUG_PAGEALLOC */

-void setup_initial_memory_limit(phys_addr_t first_memblock_base,
-				phys_addr_t first_memblock_size)
+void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base,
+				      phys_addr_t first_memblock_size)
 {
 	/* We don't currently support the first MEMBLOCK not mapping 0
+11 -11
arch/powerpc/mm/hugepage-hash64.c
··· 37 37 38 38 old_pmd = pmd_val(pmd); 39 39 /* If PMD busy, retry the access */ 40 - if (unlikely(old_pmd & _PAGE_BUSY)) 40 + if (unlikely(old_pmd & H_PAGE_BUSY)) 41 41 return 0; 42 42 /* If PMD permissions don't match, take page fault */ 43 - if (unlikely(access & ~old_pmd)) 43 + if (unlikely(!check_pte_access(access, old_pmd))) 44 44 return 1; 45 45 /* 46 46 * Try to lock the PTE, add ACCESSED and DIRTY if it was 47 47 * a write access 48 48 */ 49 - new_pmd = old_pmd | _PAGE_BUSY | _PAGE_ACCESSED; 50 - if (access & _PAGE_RW) 49 + new_pmd = old_pmd | H_PAGE_BUSY | _PAGE_ACCESSED; 50 + if (access & _PAGE_WRITE) 51 51 new_pmd |= _PAGE_DIRTY; 52 - } while (old_pmd != __cmpxchg_u64((unsigned long *)pmdp, 53 - old_pmd, new_pmd)); 52 + } while (!pmd_xchg(pmdp, __pmd(old_pmd), __pmd(new_pmd))); 53 + 54 54 rflags = htab_convert_pte_flags(new_pmd); 55 55 56 56 #if 0 ··· 78 78 * base page size. This is because demote_segment won't flush 79 79 * hash page table entries. 80 80 */ 81 - if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) { 81 + if ((old_pmd & H_PAGE_HASHPTE) && !(old_pmd & H_PAGE_COMBO)) { 82 82 flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K, 83 83 ssize, flags); 84 84 /* ··· 125 125 hash = hpt_hash(vpn, shift, ssize); 126 126 /* insert new entry */ 127 127 pa = pmd_pfn(__pmd(old_pmd)) << PAGE_SHIFT; 128 - new_pmd |= _PAGE_HASHPTE; 128 + new_pmd |= H_PAGE_HASHPTE; 129 129 130 130 repeat: 131 131 hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL; ··· 169 169 mark_hpte_slot_valid(hpte_slot_array, index, slot); 170 170 } 171 171 /* 172 - * Mark the pte with _PAGE_COMBO, if we are trying to hash it with 172 + * Mark the pte with H_PAGE_COMBO, if we are trying to hash it with 173 173 * base page size 4k. 174 174 */ 175 175 if (psize == MMU_PAGE_4K) 176 - new_pmd |= _PAGE_COMBO; 176 + new_pmd |= H_PAGE_COMBO; 177 177 /* 178 178 * The hpte valid is stored in the pgtable whose address is in the 179 179 * second half of the PMD. 
Order this against clearing of the busy bit in 180 180 * huge pmd. 181 181 */ 182 182 smp_wmb(); 183 - *pmdp = __pmd(new_pmd & ~_PAGE_BUSY); 183 + *pmdp = __pmd(new_pmd & ~H_PAGE_BUSY); 184 184 return 0; 185 185 }
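The do/while loop in this hunk implements a small lock-free protocol: the hash fault handler atomically sets H_PAGE_BUSY with a compare-and-exchange, retrying if the entry changed underneath it and bailing out if another CPU already holds the busy bit. A userspace sketch of the same pattern using GCC atomics; the bit positions below are invented for this sketch, not the real ppc64 PTE layout:

```c
#include <stdint.h>

/* Bit positions invented for illustration only. */
#define H_PAGE_BUSY     (1UL << 7)
#define _PAGE_ACCESSED  (1UL << 8)
#define _PAGE_DIRTY     (1UL << 9)
#define _PAGE_WRITE     (1UL << 1)

/* Returns 0 if the entry was busy (caller retries the access),
 * 1 once H_PAGE_BUSY has been set atomically. */
static int lock_pte_for_hash(uint64_t *ptep, uint64_t access)
{
    uint64_t old_pte, new_pte;

    do {
        old_pte = __atomic_load_n(ptep, __ATOMIC_RELAXED);
        if (old_pte & H_PAGE_BUSY)
            return 0;               /* another CPU holds the entry */
        new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED;
        if (access & _PAGE_WRITE)
            new_pte |= _PAGE_DIRTY;
    } while (!__atomic_compare_exchange_n(ptep, &old_pte, new_pte, 0,
                                          __ATOMIC_ACQUIRE,
                                          __ATOMIC_RELAXED));
    return 1;
}
```

The new pmd_xchg()/pte_xchg() helpers in the diff wrap exactly this kind of compare-and-exchange, replacing the open-coded __cmpxchg_u64() calls.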
+15 -14
arch/powerpc/mm/hugetlbpage-hash64.c
··· 47 47 do { 48 48 old_pte = pte_val(*ptep); 49 49 /* If PTE busy, retry the access */ 50 - if (unlikely(old_pte & _PAGE_BUSY)) 50 + if (unlikely(old_pte & H_PAGE_BUSY)) 51 51 return 0; 52 52 /* If PTE permissions don't match, take page fault */ 53 - if (unlikely(access & ~old_pte)) 53 + if (unlikely(!check_pte_access(access, old_pte))) 54 54 return 1; 55 + 55 56 /* Try to lock the PTE, add ACCESSED and DIRTY if it was 56 57 * a write access */ 57 - new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; 58 - if (access & _PAGE_RW) 58 + new_pte = old_pte | H_PAGE_BUSY | _PAGE_ACCESSED; 59 + if (access & _PAGE_WRITE) 59 60 new_pte |= _PAGE_DIRTY; 60 - } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, 61 - old_pte, new_pte)); 61 + } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte))); 62 + 62 63 rflags = htab_convert_pte_flags(new_pte); 63 64 64 65 sz = ((1UL) << shift); ··· 69 68 rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); 70 69 71 70 /* Check if pte already has an hpte (case 2) */ 72 - if (unlikely(old_pte & _PAGE_HASHPTE)) { 71 + if (unlikely(old_pte & H_PAGE_HASHPTE)) { 73 72 /* There MIGHT be an HPTE for this pte */ 74 73 unsigned long hash, slot; 75 74 76 75 hash = hpt_hash(vpn, shift, ssize); 77 - if (old_pte & _PAGE_F_SECOND) 76 + if (old_pte & H_PAGE_F_SECOND) 78 77 hash = ~hash; 79 78 slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; 80 - slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT; 79 + slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; 81 80 82 81 if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize, 83 82 mmu_psize, ssize, flags) == -1) 84 83 old_pte &= ~_PAGE_HPTEFLAGS; 85 84 } 86 85 87 - if (likely(!(old_pte & _PAGE_HASHPTE))) { 86 + if (likely(!(old_pte & H_PAGE_HASHPTE))) { 88 87 unsigned long hash = hpt_hash(vpn, shift, ssize); 89 88 90 89 pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT; 91 90 92 91 /* clear HPTE slot informations in new PTE */ 93 - new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE; 92 + 
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE; 94 93 95 94 slot = hpte_insert_repeating(hash, vpn, pa, rflags, 0, 96 95 mmu_psize, ssize); ··· 106 105 return -1; 107 106 } 108 107 109 - new_pte |= (slot << _PAGE_F_GIX_SHIFT) & 110 - (_PAGE_F_SECOND | _PAGE_F_GIX); 108 + new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & 109 + (H_PAGE_F_SECOND | H_PAGE_F_GIX); 111 110 } 112 111 113 112 /* 114 113 * No need to use ldarx/stdcx here 115 114 */ 116 - *ptep = __pte(new_pte & ~_PAGE_BUSY); 115 + *ptep = __pte(new_pte & ~H_PAGE_BUSY); 117 116 return 0; 118 117 } 119 118
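Both fault handlers replace the open-coded permission test `access & ~old_pte` with check_pte_access(). A minimal model of what such a check expresses, assuming (hypothetically) that the access mask and the PTE use the same permission bit layout; the real helper also deals with privilege bits:

```c
#include <stdint.h>

/* Invented permission bits for this sketch. */
#define _PAGE_READ  (1UL << 0)
#define _PAGE_WRITE (1UL << 1)
#define _PAGE_EXEC  (1UL << 2)

/* The fault is only satisfiable if every permission bit being
 * demanded is already granted in the PTE; otherwise the caller
 * returns 1 and takes a page fault. */
static int toy_check_pte_access(uint64_t access, uint64_t pteval)
{
    return (access & ~pteval) == 0;
}
```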
+87
arch/powerpc/mm/hugetlbpage-radix.c
··· 1 + #include <linux/mm.h> 2 + #include <linux/hugetlb.h> 3 + #include <asm/pgtable.h> 4 + #include <asm/pgalloc.h> 5 + #include <asm/cacheflush.h> 6 + #include <asm/machdep.h> 7 + #include <asm/mman.h> 8 + 9 + void radix__flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) 10 + { 11 + unsigned long ap, shift; 12 + struct hstate *hstate = hstate_file(vma->vm_file); 13 + 14 + shift = huge_page_shift(hstate); 15 + if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) 16 + ap = mmu_get_ap(MMU_PAGE_2M); 17 + else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) 18 + ap = mmu_get_ap(MMU_PAGE_1G); 19 + else { 20 + WARN(1, "Wrong huge page shift\n"); 21 + return ; 22 + } 23 + radix___flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); 24 + } 25 + 26 + void radix__local_flush_hugetlb_page(struct vm_area_struct *vma, unsigned long vmaddr) 27 + { 28 + unsigned long ap, shift; 29 + struct hstate *hstate = hstate_file(vma->vm_file); 30 + 31 + shift = huge_page_shift(hstate); 32 + if (shift == mmu_psize_defs[MMU_PAGE_2M].shift) 33 + ap = mmu_get_ap(MMU_PAGE_2M); 34 + else if (shift == mmu_psize_defs[MMU_PAGE_1G].shift) 35 + ap = mmu_get_ap(MMU_PAGE_1G); 36 + else { 37 + WARN(1, "Wrong huge page shift\n"); 38 + return ; 39 + } 40 + radix___local_flush_tlb_page(vma->vm_mm, vmaddr, ap, 0); 41 + } 42 + 43 + /* 44 + * A vairant of hugetlb_get_unmapped_area doing topdown search 45 + * FIXME!! should we do as x86 does or non hugetlb area does ? 46 + * ie, use topdown or not based on mmap_is_legacy check ? 
47 + */ 48 + unsigned long 49 + radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 50 + unsigned long len, unsigned long pgoff, 51 + unsigned long flags) 52 + { 53 + struct mm_struct *mm = current->mm; 54 + struct vm_area_struct *vma; 55 + struct hstate *h = hstate_file(file); 56 + struct vm_unmapped_area_info info; 57 + 58 + if (len & ~huge_page_mask(h)) 59 + return -EINVAL; 60 + if (len > TASK_SIZE) 61 + return -ENOMEM; 62 + 63 + if (flags & MAP_FIXED) { 64 + if (prepare_hugepage_range(file, addr, len)) 65 + return -EINVAL; 66 + return addr; 67 + } 68 + 69 + if (addr) { 70 + addr = ALIGN(addr, huge_page_size(h)); 71 + vma = find_vma(mm, addr); 72 + if (TASK_SIZE - len >= addr && 73 + (!vma || addr + len <= vma->vm_start)) 74 + return addr; 75 + } 76 + /* 77 + * We are always doing an topdown search here. Slice code 78 + * does that too. 79 + */ 80 + info.flags = VM_UNMAPPED_AREA_TOPDOWN; 81 + info.length = len; 82 + info.low_limit = PAGE_SIZE; 83 + info.high_limit = current->mm->mmap_base; 84 + info.align_mask = PAGE_MASK & ~huge_page_mask(h); 85 + info.align_offset = 0; 86 + return vm_unmapped_area(&info); 87 + }
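radix__hugetlb_get_unmapped_area() delegates the actual search to vm_unmapped_area() with VM_UNMAPPED_AREA_TOPDOWN set: find the highest suitably aligned free range below mmap_base. A toy version of such a topdown gap search, with an invented free-gap list standing in for the kernel's VMA tree:

```c
#include <stdint.h>
#include <stddef.h>

struct gap { uint64_t start, end; };   /* [start, end) free range */

/* Scan from the highest gap down, mirroring VM_UNMAPPED_AREA_TOPDOWN:
 * place the mapping as high as possible below high_limit, aligned
 * down by align_mask (e.g. huge_page_size - 1). Gaps are sorted
 * ascending. Returns (uint64_t)-1 on failure, like -ENOMEM. */
static uint64_t topdown_search(const struct gap *gaps, size_t n,
                               uint64_t len, uint64_t high_limit,
                               uint64_t align_mask)
{
    for (size_t i = n; i-- > 0; ) {
        uint64_t end = gaps[i].end < high_limit ? gaps[i].end
                                                : high_limit;
        if (end < gaps[i].start + len)
            continue;                      /* gap too small */
        uint64_t addr = (end - len) & ~align_mask;
        if (addr >= gaps[i].start)
            return addr;
    }
    return (uint64_t)-1;
}
```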
+13 -7
arch/powerpc/mm/hugetlbpage.c
··· 711 711 struct hstate *hstate = hstate_file(file); 712 712 int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate)); 713 713 714 + if (radix_enabled()) 715 + return radix__hugetlb_get_unmapped_area(file, addr, len, 716 + pgoff, flags); 714 717 return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1); 715 718 } 716 719 #endif ··· 722 719 { 723 720 #ifdef CONFIG_PPC_MM_SLICES 724 721 unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start); 725 - 726 - return 1UL << mmu_psize_to_shift(psize); 727 - #else 722 + /* With radix we don't use slice, so derive it from vma*/ 723 + if (!radix_enabled()) 724 + return 1UL << mmu_psize_to_shift(psize); 725 + #endif 728 726 if (!is_vm_hugetlb_page(vma)) 729 727 return PAGE_SIZE; 730 728 731 729 return huge_page_size(hstate_vma(vma)); 732 - #endif 733 730 } 734 731 735 732 static inline bool is_power_of_4(unsigned long x) ··· 828 825 { 829 826 int psize; 830 827 831 - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) 828 + if (!radix_enabled() && !mmu_has_feature(MMU_FTR_16M_PAGE)) 832 829 return -ENODEV; 833 830 834 831 for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) { ··· 868 865 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift; 869 866 else if (mmu_psize_defs[MMU_PAGE_1M].shift) 870 867 HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift; 868 + else if (mmu_psize_defs[MMU_PAGE_2M].shift) 869 + HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_2M].shift; 870 + 871 871 872 872 return 0; 873 873 } ··· 1011 1005 end = pte_end; 1012 1006 1013 1007 pte = READ_ONCE(*ptep); 1014 - mask = _PAGE_PRESENT | _PAGE_USER; 1008 + mask = _PAGE_PRESENT | _PAGE_READ; 1015 1009 if (write) 1016 - mask |= _PAGE_RW; 1010 + mask |= _PAGE_WRITE; 1017 1011 1018 1012 if ((pte_val(pte) & mask) != mask) 1019 1013 return 0;
+2 -71
arch/powerpc/mm/init_64.c
··· 66 66 #include "mmu_decl.h" 67 67 68 68 #ifdef CONFIG_PPC_STD_MMU_64 69 - #if PGTABLE_RANGE > USER_VSID_RANGE 69 + #if H_PGTABLE_RANGE > USER_VSID_RANGE 70 70 #warning Limited user VSID range means pagetable space is wasted 71 71 #endif 72 72 73 - #if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) 73 + #if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) 74 74 #warning TASK_SIZE is smaller than it needs to be. 75 75 #endif 76 76 #endif /* CONFIG_PPC_STD_MMU_64 */ ··· 188 188 189 189 return 0; 190 190 } 191 - 192 - /* On hash-based CPUs, the vmemmap is bolted in the hash table. 193 - * 194 - * On Book3E CPUs, the vmemmap is currently mapped in the top half of 195 - * the vmalloc space using normal page tables, though the size of 196 - * pages encoded in the PTEs can be different 197 - */ 198 - 199 - #ifdef CONFIG_PPC_BOOK3E 200 - static int __meminit vmemmap_create_mapping(unsigned long start, 201 - unsigned long page_size, 202 - unsigned long phys) 203 - { 204 - /* Create a PTE encoding without page size */ 205 - unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | 206 - _PAGE_KERNEL_RW; 207 - 208 - /* PTEs only contain page size encodings up to 32M */ 209 - BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); 210 - 211 - /* Encode the size in the PTE */ 212 - flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; 213 - 214 - /* For each PTE for that area, map things. 
Note that we don't 215 - * increment phys because all PTEs are of the large size and 216 - * thus must have the low bits clear 217 - */ 218 - for (i = 0; i < page_size; i += PAGE_SIZE) 219 - BUG_ON(map_kernel_page(start + i, phys, flags)); 220 - 221 - return 0; 222 - } 223 - 224 - #ifdef CONFIG_MEMORY_HOTPLUG 225 - static void vmemmap_remove_mapping(unsigned long start, 226 - unsigned long page_size) 227 - { 228 - } 229 - #endif 230 - #else /* CONFIG_PPC_BOOK3E */ 231 - static int __meminit vmemmap_create_mapping(unsigned long start, 232 - unsigned long page_size, 233 - unsigned long phys) 234 - { 235 - int rc = htab_bolt_mapping(start, start + page_size, phys, 236 - pgprot_val(PAGE_KERNEL), 237 - mmu_vmemmap_psize, mmu_kernel_ssize); 238 - if (rc < 0) { 239 - int rc2 = htab_remove_mapping(start, start + page_size, 240 - mmu_vmemmap_psize, 241 - mmu_kernel_ssize); 242 - BUG_ON(rc2 && (rc2 != -ENOENT)); 243 - } 244 - return rc; 245 - } 246 - 247 - #ifdef CONFIG_MEMORY_HOTPLUG 248 - static void vmemmap_remove_mapping(unsigned long start, 249 - unsigned long page_size) 250 - { 251 - int rc = htab_remove_mapping(start, start + page_size, 252 - mmu_vmemmap_psize, 253 - mmu_kernel_ssize); 254 - BUG_ON((rc < 0) && (rc != -ENOENT)); 255 - WARN_ON(rc == -ENOENT); 256 - } 257 - #endif 258 - 259 - #endif /* CONFIG_PPC_BOOK3E */ 260 191 261 192 struct vmemmap_backing *vmemmap_list; 262 193 static struct vmemmap_backing *next;
+21 -19
arch/powerpc/mm/mem.c
··· 68 68 EXPORT_SYMBOL(kmap_pte); 69 69 pgprot_t kmap_prot; 70 70 EXPORT_SYMBOL(kmap_prot); 71 + #define TOP_ZONE ZONE_HIGHMEM 71 72 72 73 static inline pte_t *virt_to_kpte(unsigned long vaddr) 73 74 { 74 75 return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), 75 76 vaddr), vaddr), vaddr); 76 77 } 78 + #else 79 + #define TOP_ZONE ZONE_NORMAL 77 80 #endif 78 81 79 82 int page_is_ram(unsigned long pfn) ··· 270 267 */ 271 268 int dma_pfn_limit_to_zone(u64 pfn_limit) 272 269 { 273 - enum zone_type top_zone = ZONE_NORMAL; 274 270 int i; 275 271 276 - #ifdef CONFIG_HIGHMEM 277 - top_zone = ZONE_HIGHMEM; 278 - #endif 279 - 280 - for (i = top_zone; i >= 0; i--) { 272 + for (i = TOP_ZONE; i >= 0; i--) { 281 273 if (max_zone_pfns[i] <= pfn_limit) 282 274 return i; 283 275 } ··· 287 289 { 288 290 unsigned long long total_ram = memblock_phys_mem_size(); 289 291 phys_addr_t top_of_ram = memblock_end_of_DRAM(); 290 - enum zone_type top_zone; 291 292 292 293 #ifdef CONFIG_PPC32 293 294 unsigned long v = __fix_to_virt(__end_of_fixed_addresses - 1); ··· 310 313 (long int)((top_of_ram - total_ram) >> 20)); 311 314 312 315 #ifdef CONFIG_HIGHMEM 313 - top_zone = ZONE_HIGHMEM; 314 316 limit_zone_pfn(ZONE_NORMAL, lowmem_end_addr >> PAGE_SHIFT); 315 - #else 316 - top_zone = ZONE_NORMAL; 317 317 #endif 318 - 319 - limit_zone_pfn(top_zone, top_of_ram >> PAGE_SHIFT); 318 + limit_zone_pfn(TOP_ZONE, top_of_ram >> PAGE_SHIFT); 320 319 zone_limits_final = true; 321 320 free_area_init_nodes(max_zone_pfns); 322 321 ··· 491 498 * We don't need to worry about _PAGE_PRESENT here because we are 492 499 * called with either mm->page_table_lock held or ptl lock held 493 500 */ 494 - unsigned long access = 0, trap; 501 + unsigned long access, trap; 502 + 503 + if (radix_enabled()) 504 + return; 495 505 496 506 /* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */ 497 507 if (!pte_young(*ptep) || address >= TASK_SIZE) ··· 507 511 * 508 512 * We also avoid filling the hash 
if not coming from a fault 509 513 */ 510 - if (current->thread.regs == NULL) 514 + 515 + trap = current->thread.regs ? TRAP(current->thread.regs) : 0UL; 516 + switch (trap) { 517 + case 0x300: 518 + access = 0UL; 519 + break; 520 + case 0x400: 521 + access = _PAGE_EXEC; 522 + break; 523 + default: 511 524 return; 512 - trap = TRAP(current->thread.regs); 513 - if (trap == 0x400) 514 - access |= _PAGE_EXEC; 515 - else if (trap != 0x300) 516 - return; 525 + } 526 + 517 527 hash_preload(vma->vm_mm, address, access, trap); 518 528 #endif /* CONFIG_PPC_STD_MMU */ 519 529 #if (defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_FSL_BOOK3E)) \
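The rewritten update_mmu_cache() path converts the open-coded trap test into a switch: 0x300 (data storage interrupt) preloads with no extra access bits, 0x400 (instruction storage interrupt) demands execute permission, and anything else (including no saved regs, where trap reads as 0) skips the hash preload. A minimal model of that dispatch, with the flag value invented:

```c
#include <stdint.h>

#define _PAGE_EXEC (1UL << 2)   /* invented bit for this sketch */

/* Returns 0 and fills *access when the trap warrants a hash preload,
 * -1 when the preload should be skipped. */
static int toy_access_for_trap(unsigned long trap, uint64_t *access)
{
    switch (trap) {
    case 0x300:                 /* data storage interrupt */
        *access = 0;
        return 0;
    case 0x400:                 /* instruction storage interrupt */
        *access = _PAGE_EXEC;
        return 0;
    default:                    /* no regs or unexpected trap */
        return -1;
    }
}
```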
+110
arch/powerpc/mm/mmap.c
··· 26 26 #include <linux/mm.h> 27 27 #include <linux/random.h> 28 28 #include <linux/sched.h> 29 + #include <linux/elf-randomize.h> 30 + #include <linux/security.h> 31 + #include <linux/mman.h> 29 32 30 33 /* 31 34 * Top of mmap area (just below the process stack). ··· 81 78 return PAGE_ALIGN(TASK_SIZE - gap - rnd); 82 79 } 83 80 81 + #ifdef CONFIG_PPC_RADIX_MMU 82 + /* 83 + * Same function as generic code used only for radix, because we don't need to overload 84 + * the generic one. But we will have to duplicate, because hash select 85 + * HAVE_ARCH_UNMAPPED_AREA 86 + */ 87 + static unsigned long 88 + radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, 89 + unsigned long len, unsigned long pgoff, 90 + unsigned long flags) 91 + { 92 + struct mm_struct *mm = current->mm; 93 + struct vm_area_struct *vma; 94 + struct vm_unmapped_area_info info; 95 + 96 + if (len > TASK_SIZE - mmap_min_addr) 97 + return -ENOMEM; 98 + 99 + if (flags & MAP_FIXED) 100 + return addr; 101 + 102 + if (addr) { 103 + addr = PAGE_ALIGN(addr); 104 + vma = find_vma(mm, addr); 105 + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 106 + (!vma || addr + len <= vma->vm_start)) 107 + return addr; 108 + } 109 + 110 + info.flags = 0; 111 + info.length = len; 112 + info.low_limit = mm->mmap_base; 113 + info.high_limit = TASK_SIZE; 114 + info.align_mask = 0; 115 + return vm_unmapped_area(&info); 116 + } 117 + 118 + static unsigned long 119 + radix__arch_get_unmapped_area_topdown(struct file *filp, 120 + const unsigned long addr0, 121 + const unsigned long len, 122 + const unsigned long pgoff, 123 + const unsigned long flags) 124 + { 125 + struct vm_area_struct *vma; 126 + struct mm_struct *mm = current->mm; 127 + unsigned long addr = addr0; 128 + struct vm_unmapped_area_info info; 129 + 130 + /* requested length too big for entire address space */ 131 + if (len > TASK_SIZE - mmap_min_addr) 132 + return -ENOMEM; 133 + 134 + if (flags & MAP_FIXED) 135 + return addr; 136 + 137 + /* 
requesting a specific address */ 138 + if (addr) { 139 + addr = PAGE_ALIGN(addr); 140 + vma = find_vma(mm, addr); 141 + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 142 + (!vma || addr + len <= vma->vm_start)) 143 + return addr; 144 + } 145 + 146 + info.flags = VM_UNMAPPED_AREA_TOPDOWN; 147 + info.length = len; 148 + info.low_limit = max(PAGE_SIZE, mmap_min_addr); 149 + info.high_limit = mm->mmap_base; 150 + info.align_mask = 0; 151 + addr = vm_unmapped_area(&info); 152 + 153 + /* 154 + * A failed mmap() very likely causes application failure, 155 + * so fall back to the bottom-up function here. This scenario 156 + * can happen with large stack limits and large mmap() 157 + * allocations. 158 + */ 159 + if (addr & ~PAGE_MASK) { 160 + VM_BUG_ON(addr != -ENOMEM); 161 + info.flags = 0; 162 + info.low_limit = TASK_UNMAPPED_BASE; 163 + info.high_limit = TASK_SIZE; 164 + addr = vm_unmapped_area(&info); 165 + } 166 + 167 + return addr; 168 + } 169 + 170 + static void radix__arch_pick_mmap_layout(struct mm_struct *mm, 171 + unsigned long random_factor) 172 + { 173 + if (mmap_is_legacy()) { 174 + mm->mmap_base = TASK_UNMAPPED_BASE; 175 + mm->get_unmapped_area = radix__arch_get_unmapped_area; 176 + } else { 177 + mm->mmap_base = mmap_base(random_factor); 178 + mm->get_unmapped_area = radix__arch_get_unmapped_area_topdown; 179 + } 180 + } 181 + #else 182 + /* dummy */ 183 + extern void radix__arch_pick_mmap_layout(struct mm_struct *mm, 184 + unsigned long random_factor); 185 + #endif 84 186 /* 85 187 * This function, called very early during the creation of a new 86 188 * process VM image, sets up which VM layout function to use: ··· 197 89 if (current->flags & PF_RANDOMIZE) 198 90 random_factor = arch_mmap_rnd(); 199 91 92 + if (radix_enabled()) 93 + return radix__arch_pick_mmap_layout(mm, random_factor); 200 94 /* 201 95 * Fall back to the standard layout if the personality 202 96 * bit is set, or if the expected stack growth is unlimited:
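radix__arch_pick_mmap_layout() chooses between the bottom-up legacy layout and a randomized topdown layout. A simplified sketch of that decision; the constants are stand-ins and the legacy flag is passed in directly, whereas the real code consults mmap_is_legacy() and installs get_unmapped_area function pointers:

```c
#include <stdint.h>

#define TOY_TASK_SIZE     (1UL << 46)
#define TOY_UNMAPPED_BASE (TOY_TASK_SIZE / 4)
#define TOY_STACK_GAP     (1UL << 20)

struct toy_mm {
    uint64_t mmap_base;
    int topdown;    /* stands in for the function pointer choice */
};

static void toy_pick_mmap_layout(struct toy_mm *mm, int legacy,
                                 uint64_t random_factor)
{
    if (legacy) {
        /* bottom-up search starting at the unmapped base */
        mm->mmap_base = TOY_UNMAPPED_BASE;
        mm->topdown = 0;
    } else {
        /* topdown search below a randomized base under the stack */
        mm->mmap_base = TOY_TASK_SIZE - TOY_STACK_GAP - random_factor;
        mm->topdown = 1;
    }
}
```

Note the topdown variant in the diff also falls back to a bottom-up pass over the full address range when the first search fails, so large allocations under large stack limits still succeed.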
+44 -8
arch/powerpc/mm/mmu_context_hash64.c → arch/powerpc/mm/mmu_context_book3s64.c
··· 58 58 return index; 59 59 } 60 60 EXPORT_SYMBOL_GPL(__init_new_context); 61 + static int radix__init_new_context(struct mm_struct *mm, int index) 62 + { 63 + unsigned long rts_field; 64 + 65 + /* 66 + * set the process table entry, 67 + */ 68 + rts_field = 3ull << PPC_BITLSHIFT(2); 69 + process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); 70 + return 0; 71 + } 61 72 62 73 int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 63 74 { ··· 78 67 if (index < 0) 79 68 return index; 80 69 81 - /* The old code would re-promote on fork, we don't do that 82 - * when using slices as it could cause problem promoting slices 83 - * that have been forced down to 4K 84 - */ 85 - if (slice_mm_new_context(mm)) 86 - slice_set_user_psize(mm, mmu_virtual_psize); 87 - subpage_prot_init_new_context(mm); 70 + if (radix_enabled()) { 71 + radix__init_new_context(mm, index); 72 + } else { 73 + 74 + /* The old code would re-promote on fork, we don't do that 75 + * when using slices as it could cause problem promoting slices 76 + * that have been forced down to 4K 77 + * 78 + * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check 79 + * explicitly against context.id == 0. This ensures that we 80 + * properly initialize context slice details for newly allocated 81 + * mm's (which will have id == 0) and don't alter context slice 82 + * inherited via fork (which will have id != 0). 83 + * 84 + * We should not be calling init_new_context() on init_mm. Hence a 85 + * check against 0 is ok. 
86 + */ 87 + if (mm->context.id == 0) 88 + slice_set_user_psize(mm, mmu_virtual_psize); 89 + subpage_prot_init_new_context(mm); 90 + } 88 91 mm->context.id = index; 89 92 #ifdef CONFIG_PPC_ICSWX 90 93 mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); ··· 169 144 mm->context.cop_lockp = NULL; 170 145 #endif /* CONFIG_PPC_ICSWX */ 171 146 147 + if (radix_enabled()) 148 + process_tb[mm->context.id].prtb1 = 0; 149 + else 150 + subpage_prot_free(mm); 172 151 destroy_pagetable_page(mm); 173 152 __destroy_context(mm->context.id); 174 - subpage_prot_free(mm); 175 153 mm->context.id = MMU_NO_CONTEXT; 176 154 } 155 + 156 + #ifdef CONFIG_PPC_RADIX_MMU 157 + void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) 158 + { 159 + mtspr(SPRN_PID, next->context.id); 160 + asm volatile("isync": : :"memory"); 161 + } 162 + #endif
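radix__init_new_context() composes the process-table doubleword by OR-ing the RTS field, the physical PGD address, and the root index size, using IBM bit numbering (bit 0 is the MSB). A sketch of that bit composition; the index-size constant and the interpretation of the RTS encoding are assumptions here, so treat all values as illustrative:

```c
#include <stdint.h>

/* IBM bit numbering: bit 0 is the most significant bit. */
#define TOY_PPC_BITLSHIFT(be) (63 - (be))

/* Assumed root index size for illustration; the real
 * RADIX_PGD_INDEX_SIZE depends on the configured tree geometry. */
#define TOY_RADIX_PGD_INDEX_SIZE 0xdUL

/* Build a prtb0 doubleword the way radix__init_new_context() does:
 * rts_field = 3ull << PPC_BITLSHIFT(2), OR'd with the PGD physical
 * address and the root index size in the low bits. */
static uint64_t toy_make_prtb0(uint64_t pgd_phys)
{
    uint64_t rts_field = 3ULL << TOY_PPC_BITLSHIFT(2);
    return rts_field | pgd_phys | TOY_RADIX_PGD_INDEX_SIZE;
}
```

The real code additionally byte-swaps the result with cpu_to_be64() before storing it in process_tb[index].prtb0, since the process table is big-endian in memory.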
+3 -3
arch/powerpc/mm/mmu_context_nohash.c
··· 226 226 static void context_check_map(void) { } 227 227 #endif 228 228 229 - void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) 229 + void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, 230 + struct task_struct *tsk) 230 231 { 231 232 unsigned int i, id, cpu = smp_processor_id(); 232 233 unsigned long *map; ··· 335 334 mm->context.active = 0; 336 335 337 336 #ifdef CONFIG_PPC_MM_SLICES 338 - if (slice_mm_new_context(mm)) 339 - slice_set_user_psize(mm, mmu_virtual_psize); 337 + slice_set_user_psize(mm, mmu_virtual_psize); 340 338 #endif 341 339 342 340 return 0;
-5
arch/powerpc/mm/mmu_decl.h
··· 108 108 109 109 #endif /* CONFIG_PPC32 */ 110 110 111 - #ifdef CONFIG_PPC64 112 - extern int map_kernel_page(unsigned long ea, unsigned long pa, 113 - unsigned long flags); 114 - #endif /* CONFIG_PPC64 */ 115 - 116 111 extern unsigned long ioremap_bot; 117 112 extern unsigned long __max_low_memory; 118 113 extern phys_addr_t __initial_memory_limit_addr;
+122
arch/powerpc/mm/pgtable-book3e.c
··· 1 + /* 2 + * Copyright 2005, Paul Mackerras, IBM Corporation. 3 + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. 4 + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. 5 + * 6 + * This program is free software; you can redistribute it and/or 7 + * modify it under the terms of the GNU General Public License 8 + * as published by the Free Software Foundation; either version 9 + * 2 of the License, or (at your option) any later version. 10 + */ 11 + 12 + #include <linux/sched.h> 13 + #include <linux/memblock.h> 14 + #include <asm/pgalloc.h> 15 + #include <asm/tlb.h> 16 + #include <asm/dma.h> 17 + 18 + #include "mmu_decl.h" 19 + 20 + #ifdef CONFIG_SPARSEMEM_VMEMMAP 21 + /* 22 + * On Book3E CPUs, the vmemmap is currently mapped in the top half of 23 + * the vmalloc space using normal page tables, though the size of 24 + * pages encoded in the PTEs can be different 25 + */ 26 + int __meminit vmemmap_create_mapping(unsigned long start, 27 + unsigned long page_size, 28 + unsigned long phys) 29 + { 30 + /* Create a PTE encoding without page size */ 31 + unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED | 32 + _PAGE_KERNEL_RW; 33 + 34 + /* PTEs only contain page size encodings up to 32M */ 35 + BUG_ON(mmu_psize_defs[mmu_vmemmap_psize].enc > 0xf); 36 + 37 + /* Encode the size in the PTE */ 38 + flags |= mmu_psize_defs[mmu_vmemmap_psize].enc << 8; 39 + 40 + /* For each PTE for that area, map things. 
Note that we don't 41 + * increment phys because all PTEs are of the large size and 42 + * thus must have the low bits clear 43 + */ 44 + for (i = 0; i < page_size; i += PAGE_SIZE) 45 + BUG_ON(map_kernel_page(start + i, phys, flags)); 46 + 47 + return 0; 48 + } 49 + 50 + #ifdef CONFIG_MEMORY_HOTPLUG 51 + void vmemmap_remove_mapping(unsigned long start, 52 + unsigned long page_size) 53 + { 54 + } 55 + #endif 56 + #endif /* CONFIG_SPARSEMEM_VMEMMAP */ 57 + 58 + static __ref void *early_alloc_pgtable(unsigned long size) 59 + { 60 + void *pt; 61 + 62 + pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS))); 63 + memset(pt, 0, size); 64 + 65 + return pt; 66 + } 67 + 68 + /* 69 + * map_kernel_page currently only called by __ioremap 70 + * map_kernel_page adds an entry to the ioremap page table 71 + * and adds an entry to the HPT, possibly bolting it 72 + */ 73 + int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) 74 + { 75 + pgd_t *pgdp; 76 + pud_t *pudp; 77 + pmd_t *pmdp; 78 + pte_t *ptep; 79 + 80 + BUILD_BUG_ON(TASK_SIZE_USER64 > PGTABLE_RANGE); 81 + if (slab_is_available()) { 82 + pgdp = pgd_offset_k(ea); 83 + pudp = pud_alloc(&init_mm, pgdp, ea); 84 + if (!pudp) 85 + return -ENOMEM; 86 + pmdp = pmd_alloc(&init_mm, pudp, ea); 87 + if (!pmdp) 88 + return -ENOMEM; 89 + ptep = pte_alloc_kernel(pmdp, ea); 90 + if (!ptep) 91 + return -ENOMEM; 92 + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, 93 + __pgprot(flags))); 94 + } else { 95 + pgdp = pgd_offset_k(ea); 96 + #ifndef __PAGETABLE_PUD_FOLDED 97 + if (pgd_none(*pgdp)) { 98 + pudp = early_alloc_pgtable(PUD_TABLE_SIZE); 99 + BUG_ON(pudp == NULL); 100 + pgd_populate(&init_mm, pgdp, pudp); 101 + } 102 + #endif /* !__PAGETABLE_PUD_FOLDED */ 103 + pudp = pud_offset(pgdp, ea); 104 + if (pud_none(*pudp)) { 105 + pmdp = early_alloc_pgtable(PMD_TABLE_SIZE); 106 + BUG_ON(pmdp == NULL); 107 + pud_populate(&init_mm, pudp, pmdp); 108 + } 109 + pmdp = pmd_offset(pudp, ea); 110 + if 
(!pmd_present(*pmdp)) { 111 + ptep = early_alloc_pgtable(PAGE_SIZE); 112 + BUG_ON(ptep == NULL); 113 + pmd_populate_kernel(&init_mm, pmdp, ptep); 114 + } 115 + ptep = pte_offset_kernel(pmdp, ea); 116 + set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, 117 + __pgprot(flags))); 118 + } 119 + 120 + smp_wmb(); 121 + return 0; 122 + }
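map_kernel_page() walks the kernel page tables top-down, and when the slab allocator isn't up yet it allocates any missing level through early_alloc_pgtable(). A self-contained two-level toy of the same allocate-on-demand walk; calloc() stands in for early_alloc_pgtable(), and the index widths are invented:

```c
#include <stdint.h>
#include <stdlib.h>

#define TOY_PAGE_SHIFT 12
#define TOY_ENTRIES    512

struct toy_pt { uint64_t pte[TOY_ENTRIES]; };         /* leaf level */
struct toy_pd { struct toy_pt *pt[TOY_ENTRIES]; };    /* top level  */

/* Map one page: allocate the lower table if the upper entry is
 * empty, then write pa | flags into the leaf PTE. */
static int toy_map_page(struct toy_pd *pd, uint64_t ea, uint64_t pa,
                        uint64_t flags)
{
    unsigned int di = (ea >> (TOY_PAGE_SHIFT + 9)) & (TOY_ENTRIES - 1);
    unsigned int ti = (ea >> TOY_PAGE_SHIFT) & (TOY_ENTRIES - 1);

    if (!pd->pt[di]) {
        /* early_alloc_pgtable() stand-in: zeroed table memory */
        pd->pt[di] = calloc(1, sizeof(struct toy_pt));
        if (!pd->pt[di])
            return -1;
    }
    pd->pt[di]->pte[ti] =
        (pa & ~((1UL << TOY_PAGE_SHIFT) - 1)) | flags;
    return 0;
}
```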
+118
arch/powerpc/mm/pgtable-book3s64.c
··· 1 + /* 2 + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; either version 7 + * 2 of the License, or (at your option) any later version. 8 + */ 9 + 10 + #include <linux/sched.h> 11 + #include <asm/pgalloc.h> 12 + #include <asm/tlb.h> 13 + 14 + #include "mmu_decl.h" 15 + #include <trace/events/thp.h> 16 + 17 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 18 + /* 19 + * This is called when relaxing access to a hugepage. It's also called in the page 20 + * fault path when we don't hit any of the major fault cases, ie, a minor 21 + * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have 22 + * handled those two for us, we additionally deal with missing execute 23 + * permission here on some processors 24 + */ 25 + int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, 26 + pmd_t *pmdp, pmd_t entry, int dirty) 27 + { 28 + int changed; 29 + #ifdef CONFIG_DEBUG_VM 30 + WARN_ON(!pmd_trans_huge(*pmdp)); 31 + assert_spin_locked(&vma->vm_mm->page_table_lock); 32 + #endif 33 + changed = !pmd_same(*(pmdp), entry); 34 + if (changed) { 35 + __ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry)); 36 + /* 37 + * Since we are not supporting SW TLB systems, we don't 38 + * have any thing similar to flush_tlb_page_nohash() 39 + */ 40 + } 41 + return changed; 42 + } 43 + 44 + int pmdp_test_and_clear_young(struct vm_area_struct *vma, 45 + unsigned long address, pmd_t *pmdp) 46 + { 47 + return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); 48 + } 49 + /* 50 + * set a new huge pmd. We should not be called for updating 51 + * an existing pmd entry. That should go via pmd_hugepage_update. 
52 + */ 53 + void set_pmd_at(struct mm_struct *mm, unsigned long addr, 54 + pmd_t *pmdp, pmd_t pmd) 55 + { 56 + #ifdef CONFIG_DEBUG_VM 57 + WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp))); 58 + assert_spin_locked(&mm->page_table_lock); 59 + WARN_ON(!pmd_trans_huge(pmd)); 60 + #endif 61 + trace_hugepage_set_pmd(addr, pmd_val(pmd)); 62 + return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); 63 + } 64 + /* 65 + * We use this to invalidate a pmdp entry before switching from a 66 + * hugepte to regular pmd entry. 67 + */ 68 + void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, 69 + pmd_t *pmdp) 70 + { 71 + pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); 72 + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 73 + /* 74 + * This ensures that generic code that rely on IRQ disabling 75 + * to prevent a parallel THP split work as expected. 76 + */ 77 + kick_all_cpus_sync(); 78 + } 79 + 80 + static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) 81 + { 82 + return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); 83 + } 84 + 85 + pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) 86 + { 87 + unsigned long pmdv; 88 + 89 + pmdv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; 90 + return pmd_set_protbits(__pmd(pmdv), pgprot); 91 + } 92 + 93 + pmd_t mk_pmd(struct page *page, pgprot_t pgprot) 94 + { 95 + return pfn_pmd(page_to_pfn(page), pgprot); 96 + } 97 + 98 + pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) 99 + { 100 + unsigned long pmdv; 101 + 102 + pmdv = pmd_val(pmd); 103 + pmdv &= _HPAGE_CHG_MASK; 104 + return pmd_set_protbits(__pmd(pmdv), newprot); 105 + } 106 + 107 + /* 108 + * This is called at the end of handling a user page fault, when the 109 + * fault has been handled by updating a HUGE PMD entry in the linux page tables. 110 + * We use it to preload an HPTE into the hash table corresponding to 111 + * the updated linux HUGE PMD entry. 
112 + */ 113 + void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, 114 + pmd_t *pmd) 115 + { 116 + return; 117 + } 118 + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
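pfn_pmd() builds a huge-PMD value by shifting the frame number into place, masking it with PTE_RPN_MASK, and OR-ing in the protection bits; pmd_modify() keeps only the _HPAGE_CHG_MASK bits before applying a new protection. A toy version with assumed mask values, purely to show the bit arithmetic:

```c
#include <stdint.h>

#define TOY_PAGE_SHIFT 12
/* Assumed masks for illustration only. */
#define TOY_PTE_RPN_MASK 0x0000fffffffff000UL
#define TOY_CHG_MASK     0x0000fffffffff000UL  /* bits preserved on prot change */

typedef uint64_t toy_pmd_t;

/* Physical frame into the RPN field, protection into the low bits. */
static toy_pmd_t toy_pfn_pmd(uint64_t pfn, uint64_t prot)
{
    return ((pfn << TOY_PAGE_SHIFT) & TOY_PTE_RPN_MASK) | prot;
}

/* Keep the address bits, swap the protection bits. */
static toy_pmd_t toy_pmd_modify(toy_pmd_t pmd, uint64_t newprot)
{
    return (pmd & TOY_CHG_MASK) | newprot;
}
```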
+342
arch/powerpc/mm/pgtable-hash64.c
··· 1 + /* 2 + * Copyright 2005, Paul Mackerras, IBM Corporation. 3 + * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. 4 + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. 5 + * 6 + * This program is free software; you can redistribute it and/or 7 + * modify it under the terms of the GNU General Public License 8 + * as published by the Free Software Foundation; either version 9 + * 2 of the License, or (at your option) any later version. 10 + */ 11 + 12 + #include <linux/sched.h> 13 + #include <asm/pgalloc.h> 14 + #include <asm/tlb.h> 15 + 16 + #include "mmu_decl.h" 17 + 18 + #define CREATE_TRACE_POINTS 19 + #include <trace/events/thp.h> 20 + 21 + #ifdef CONFIG_SPARSEMEM_VMEMMAP 22 + /* 23 + * On hash-based CPUs, the vmemmap is bolted in the hash table. 24 + * 25 + */ 26 + int __meminit hash__vmemmap_create_mapping(unsigned long start, 27 + unsigned long page_size, 28 + unsigned long phys) 29 + { 30 + int rc = htab_bolt_mapping(start, start + page_size, phys, 31 + pgprot_val(PAGE_KERNEL), 32 + mmu_vmemmap_psize, mmu_kernel_ssize); 33 + if (rc < 0) { 34 + int rc2 = htab_remove_mapping(start, start + page_size, 35 + mmu_vmemmap_psize, 36 + mmu_kernel_ssize); 37 + BUG_ON(rc2 && (rc2 != -ENOENT)); 38 + } 39 + return rc; 40 + } 41 + 42 + #ifdef CONFIG_MEMORY_HOTPLUG 43 + void hash__vmemmap_remove_mapping(unsigned long start, 44 + unsigned long page_size) 45 + { 46 + int rc = htab_remove_mapping(start, start + page_size, 47 + mmu_vmemmap_psize, 48 + mmu_kernel_ssize); 49 + BUG_ON((rc < 0) && (rc != -ENOENT)); 50 + WARN_ON(rc == -ENOENT); 51 + } 52 + #endif 53 + #endif /* CONFIG_SPARSEMEM_VMEMMAP */ 54 + 55 + /* 56 + * map_kernel_page currently only called by __ioremap 57 + * map_kernel_page adds an entry to the ioremap page table 58 + * and adds an entry to the HPT, possibly bolting it 59 + */ 60 + int hash__map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags) 61 + { 62 + pgd_t *pgdp; 63 + pud_t *pudp; 64 + pmd_t *pmdp; 65 + pte_t 
*ptep;

	BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE);
	if (slab_is_available()) {
		pgdp = pgd_offset_k(ea);
		pudp = pud_alloc(&init_mm, pgdp, ea);
		if (!pudp)
			return -ENOMEM;
		pmdp = pmd_alloc(&init_mm, pudp, ea);
		if (!pmdp)
			return -ENOMEM;
		ptep = pte_alloc_kernel(pmdp, ea);
		if (!ptep)
			return -ENOMEM;
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
						       __pgprot(flags)));
	} else {
		/*
		 * If the mm subsystem is not fully up, we cannot create a
		 * linux page table entry for this mapping.  Simply bolt an
		 * entry in the hardware page table.
		 */
		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
				      mmu_io_psize, mmu_kernel_ssize)) {
			printk(KERN_ERR "Failed to do bolted mapping IO "
			       "memory at %016lx !\n", pa);
			return -ENOMEM;
		}
	}

	smp_wmb();
	return 0;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					pmd_t *pmdp, unsigned long clr,
					unsigned long set)
{
	__be64 old_be, tmp;
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!pmd_trans_huge(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

	__asm__ __volatile__(
	"1:	ldarx	%0,0,%3\n\
		and.	%1,%0,%6\n\
		bne-	1b \n\
		andc	%1,%0,%4 \n\
		or	%1,%1,%7\n\
		stdcx.	%1,0,%3 \n\
		bne-	1b"
	: "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp)
	: "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp),
	  "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set))
	: "cc" );

	old = be64_to_cpu(old_be);

	trace_hugepage_update(addr, old, clr, set);
	if (old & H_PAGE_HASHPTE)
		hpte_do_hugepage_flush(mm, addr, pmdp, old);
	return old;
}

pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));

	pmd = *pmdp;
	pmd_clear(pmdp);
	/*
	 * Wait for all pending hash_page to finish. This is needed
	 * in case of subpage collapse. When we collapse normal pages
	 * to a hugepage, we first clear the pmd, then invalidate all
	 * the PTE entries. The assumption here is that any low level
	 * page fault will see a none pmd and take the slow path that
	 * will wait on mmap_sem. But we could very well be in a
	 * hash_page with a local ptep pointer value. Such a hash page
	 * can result in adding new HPTE entries for normal subpages.
	 * That means we could be modifying the page content as we
	 * copy them to a huge page. So wait for parallel hash_page
	 * to finish before invalidating HPTE entries. We can do this
	 * by sending an IPI to all the cpus and executing a dummy
	 * function there.
	 */
	kick_all_cpus_sync();
	/*
	 * Now invalidate the hpte entries in the range
	 * covered by pmd. This makes sure we take a
	 * fault and will find the pmd as none, which will
	 * result in a major fault which takes mmap_sem and
	 * hence waits for collapse to complete. Without this
	 * the __collapse_huge_page_copy can result in copying
	 * the old content.
	 */
	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
	return pmd;
}

/*
 * We want to put the pgtable in pmd and use pgtable for tracking
 * the base page size hptes
 */
void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				      pgtable_t pgtable)
{
	pgtable_t *pgtable_slot;

	assert_spin_locked(&mm->page_table_lock);
	/*
	 * we store the pgtable in the second half of PMD
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	*pgtable_slot = pgtable;
	/*
	 * expose the deposited pgtable to other cpus.
	 * before we set the hugepage PTE at pmd level
	 * hash fault code looks at the deposited pgtable
	 * to store hash index values.
	 */
	smp_wmb();
}

pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;
	pgtable_t *pgtable_slot;

	assert_spin_locked(&mm->page_table_lock);
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Once we withdraw, mark the entry NULL.
	 */
	*pgtable_slot = NULL;
	/*
	 * We store HPTE information in the deposited PTE fragment.
	 * zero out the content on withdraw.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	return pgtable;
}

void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
				   unsigned long address, pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);

	/*
	 * We can't mark the pmd none here, because that will cause a race
	 * against exit_mmap. We need to continue marking the pmd TRANS HUGE
	 * while we split, but at the same time we want the rest of the ppc64
	 * code not to insert a hash pte on this, because we will be modifying
	 * the deposited pgtable in the caller of this function. Hence clear
	 * the _PAGE_USER so that we move the fault handling to a higher
	 * level function and that will serialize against the ptl.
	 * We need to flush existing hash pte entries here, even though
	 * the translation is still valid, because we will withdraw the
	 * pgtable_t after this.
	 */
	pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED);
}

/*
 * A linux hugepage PMD was changed and the corresponding hash table entries
 * need to be flushed.
 */
void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr,
			    pmd_t *pmdp, unsigned long old_pmd)
{
	int ssize;
	unsigned int psize;
	unsigned long vsid;
	unsigned long flags = 0;
	const struct cpumask *tmp;

	/* get the base page size, vsid and segment size */
#ifdef CONFIG_DEBUG_VM
	psize = get_slice_psize(mm, addr);
	BUG_ON(psize == MMU_PAGE_16M);
#endif
	if (old_pmd & H_PAGE_COMBO)
		psize = MMU_PAGE_4K;
	else
		psize = MMU_PAGE_64K;

	if (!is_kernel_addr(addr)) {
		ssize = user_segment_size(addr);
		vsid = get_vsid(mm->context.id, addr, ssize);
		WARN_ON(vsid == 0);
	} else {
		vsid = get_kernel_vsid(addr, mmu_kernel_ssize);
		ssize = mmu_kernel_ssize;
	}

	tmp = cpumask_of(smp_processor_id());
	if (cpumask_equal(mm_cpumask(mm), tmp))
		flags |= HPTE_LOCAL_UPDATE;

	return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags);
}

pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
				    unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	pgtable_t pgtable;
	unsigned long old;
	pgtable_t *pgtable_slot;

	old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * We have pmd == none and we are holding page_table_lock.
	 * So we can safely go and clear the pgtable hash
	 * index info.
	 */
	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
	pgtable = *pgtable_slot;
	/*
	 * Let's zero out the old valid and hash index details
	 * before hash fault looks at them.
	 */
	memset(pgtable, 0, PTE_FRAG_SIZE);
	/*
	 * Serialize against find_linux_pte_or_hugepte which does a lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since the format of pte_t is different from
	 * pmd_t we want to prevent transit from a pmd pointing to a page table
	 * to a pmd pointing to a huge page (and back) while interrupts are
	 * disabled. We clear the pmd to possibly replace it with a page table
	 * pointer in different code paths. So make sure we wait for the
	 * parallel find_linux_pte_or_hugepte to finish.
	 */
	kick_all_cpus_sync();
	return old_pmd;
}

int hash__has_transparent_hugepage(void)
{
	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
		return 0;
	/*
	 * We support THP only if PMD_SIZE is 16MB.
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT)
		return 0;
	/*
	 * We need to make sure that we support 16MB hugepages in a segment
	 * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE
	 * of 64K.
	 */
	/*
	 * If we have 64K HPTE, we will be using that by default
	 */
	if (mmu_psize_defs[MMU_PAGE_64K].shift &&
	    (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1))
		return 0;
	/*
	 * Ok we only have 4K HPTE
	 */
	if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1)
		return 0;

	return 1;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+526
arch/powerpc/mm/pgtable-radix.c
/*
 * Page table handling routines for radix page tables.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/sched.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>

#include <trace/events/thp.h>

static int native_update_partition_table(u64 patb1)
{
	partition_tb->patb1 = cpu_to_be64(patb1);
	return 0;
}

static __ref void *early_alloc_pgtable(unsigned long size)
{
	void *pt;

	pt = __va(memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE));
	memset(pt, 0, size);

	return pt;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			   pgprot_t flags,
			   unsigned int map_page_size)
{
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
	if (slab_is_available()) {
		pgdp = pgd_offset_k(ea);
		pudp = pud_alloc(&init_mm, pgdp, ea);
		if (!pudp)
			return -ENOMEM;
		if (map_page_size == PUD_SIZE) {
			ptep = (pte_t *)pudp;
			goto set_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, ea);
		if (!pmdp)
			return -ENOMEM;
		if (map_page_size == PMD_SIZE) {
			ptep = (pte_t *)pmdp;
			goto set_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, ea);
		if (!ptep)
			return -ENOMEM;
	} else {
		pgdp = pgd_offset_k(ea);
		if (pgd_none(*pgdp)) {
			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
			BUG_ON(pudp == NULL);
			pgd_populate(&init_mm, pgdp, pudp);
		}
		pudp = pud_offset(pgdp, ea);
		if (map_page_size == PUD_SIZE) {
			ptep = (pte_t *)pudp;
			goto set_the_pte;
		}
		if (pud_none(*pudp)) {
			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
			BUG_ON(pmdp == NULL);
			pud_populate(&init_mm, pudp, pmdp);
		}
		pmdp = pmd_offset(pudp, ea);
		if (map_page_size == PMD_SIZE) {
			ptep = (pte_t *)pmdp;
			goto set_the_pte;
		}
		if (!pmd_present(*pmdp)) {
			ptep = early_alloc_pgtable(PAGE_SIZE);
			BUG_ON(ptep == NULL);
			pmd_populate_kernel(&init_mm, pmdp, ptep);
		}
		ptep = pte_offset_kernel(pmdp, ea);
	}

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags));
	smp_wmb();
	return 0;
}

static void __init radix_init_pgtable(void)
{
	int loop_count;
	u64 base, end, start_addr;
	unsigned long rts_field;
	struct memblock_region *reg;
	unsigned long linear_page_size;

	/* We don't support slb for radix */
	mmu_slb_size = 0;
	/*
	 * Create the linear mapping, using standard page size for now
	 */
	loop_count = 0;
	for_each_memblock(memory, reg) {

		start_addr = reg->base;

redo:
		if (loop_count < 1 && mmu_psize_defs[MMU_PAGE_1G].shift)
			linear_page_size = PUD_SIZE;
		else if (loop_count < 2 && mmu_psize_defs[MMU_PAGE_2M].shift)
			linear_page_size = PMD_SIZE;
		else
			linear_page_size = PAGE_SIZE;

		base = _ALIGN_UP(start_addr, linear_page_size);
		end = _ALIGN_DOWN(reg->base + reg->size, linear_page_size);

		pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n",
			(unsigned long)base, (unsigned long)end,
			linear_page_size);

		while (base < end) {
			radix__map_kernel_page((unsigned long)__va(base),
					       base, PAGE_KERNEL_X,
					       linear_page_size);
			base += linear_page_size;
		}
		/*
		 * map the rest using lower page size
		 */
		if (end < reg->base + reg->size) {
			start_addr = end;
			loop_count++;
			goto redo;
		}
	}
	/*
	 * Allocate the partition table and process table for the
	 * host.
	 */
	BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 23), "Process table size too large.");
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT);
	/*
	 * Fill in the process table.
	 * we support 52 bits, hence 52-28 = 24, 11000
	 */
	rts_field = 3ull << PPC_BITLSHIFT(2);
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
	/*
	 * Fill in the partition table. We are supposed to use the effective
	 * address of the process table here. But our linear mapping also
	 * enables us to use a physical address here.
	 */
	ppc_md.update_partition_table(__pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR);
	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
}

static void __init radix_init_partition_table(void)
{
	unsigned long rts_field;
	/*
	 * we support 52 bits, hence 52-28 = 24, 11000
	 */
	rts_field = 3ull << PPC_BITLSHIFT(2);

	BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large.");
	partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT);
	partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) |
					  RADIX_PGD_INDEX_SIZE | PATB_HR);
	printk("Partition table %p\n", partition_tb);

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
	/*
	 * update partition table control register,
	 * 64 K size.
	 */
	mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
}

void __init radix_init_native(void)
{
	ppc_md.update_partition_table = native_update_partition_table;
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap = ap;
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

static void __init radix_init_page_sizes(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (rc != 0) /* Found */
		goto found;
	/*
	 * let's assume we have page 4k and 64k support
	 */
	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
		/*
		 * map vmemmap using 2M if available
		 */
		mmu_vmemmap_psize = MMU_PAGE_2M;
	}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
	return;
}

void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;
	/*
	 * setup LPCR UPRT based on mmu_features
	 */
	lpcr = mfspr(SPRN_LPCR);
	mtspr(SPRN_LPCR, lpcr | LPCR_UPRT);

#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap mapping */
	mmu_vmemmap_psize = mmu_virtual_psize;
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pmd_cache_index = RADIX_PMD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__kernel_virt_size = RADIX_KERN_VIRT_SIZE;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
	ioremap_bot = IOREMAP_BASE;
	/*
	 * For now radix also uses the same frag size
	 */
	__pte_frag_nr = H_PTE_FRAG_NR;
	__pte_frag_size_shift = H_PTE_FRAG_SIZE_SHIFT;

	radix_init_page_sizes();
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		radix_init_partition_table();

	radix_init_pgtable();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * setup LPCR UPRT based on mmu_features
	 */
	lpcr = mfspr(SPRN_LPCR);
	mtspr(SPRN_LPCR, lpcr | LPCR_UPRT);
	/*
	 * update partition table control register, 64 K size.
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR))
		mtspr(SPRN_PTCR,
		      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
}

void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				       phys_addr_t first_memblock_size)
{
	/* We don't currently support the first MEMBLOCK not mapping 0
	 * physical on those processors
	 */
	BUG_ON(first_memblock_base != 0);
	/*
	 * We limit the allocations that depend on ppc64_rma_size
	 * to first_memblock_size. We also clamp it to 1GB to
	 * avoid some funky things such as RTAS bugs.
	 *
	 * On radix config we really don't have a limitation
	 * on real mode access. But keeping it as above works
	 * well enough.
	 */
	ppc64_rma_size = min_t(u64, first_memblock_size, 0x40000000);
	/*
	 * Finally limit subsequent allocations. We really don't want
	 * to limit the memblock allocations to rma_size. FIXME!! should
	 * we even limit at all ?
	 */
	memblock_set_current_limit(first_memblock_base + first_memblock_size);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
int __meminit radix__vmemmap_create_mapping(unsigned long start,
					    unsigned long page_size,
					    unsigned long phys)
{
	/* Create a PTE encoding */
	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;

	BUG_ON(radix__map_kernel_page(start, phys, __pgprot(flags), page_size));
	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	/* FIXME!! intel does more. We should free page tables mapping vmemmap ? */
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
					 pmd_t *pmdp, unsigned long clr,
					 unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp));
	assert_spin_locked(&mm->page_table_lock);
#endif

	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
	trace_hugepage_update(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
				 pmd_t *pmdp)
{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	/*
	 * khugepaged calls this for a normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);
	/*FIXME!! Verify whether we need this kick below */
	kick_all_cpus_sync();
	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				       pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * Serialize against find_linux_pte_or_hugepte which does a lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since the format of pte_t is different from
	 * pmd_t we want to prevent transit from a pmd pointing to a page table
	 * to a pmd pointing to a huge page (and back) while interrupts are
	 * disabled. We clear the pmd to possibly replace it with a page table
	 * pointer in different code paths. So make sure we wait for the
	 * parallel find_linux_pte_or_hugepte to finish.
	 */
	kick_all_cpus_sync();
	return old_pmd;
}

int radix__has_transparent_hugepage(void)
{
	/* For radix 2M at PMD level means thp */
	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
		return 1;
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+19 -5
arch/powerpc/mm/pgtable.c
···
 
 /* We only try to do i/d cache coherency on stuff that looks like
  * reasonably "normal" PTEs. We currently require a PTE to be present
- * and we avoid _PAGE_SPECIAL and _PAGE_NO_CACHE. We also only do that
+ * and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that
  * on userspace PTEs
  */
 static inline int pte_looks_normal(pte_t pte)
 {
+
+#if defined(CONFIG_PPC_BOOK3S_64)
+	if ((pte_val(pte) & (_PAGE_PRESENT | _PAGE_SPECIAL)) == _PAGE_PRESENT) {
+		if (pte_ci(pte))
+			return 0;
+		if (pte_user(pte))
+			return 1;
+	}
+	return 0;
+#else
 	return (pte_val(pte) &
-	    (_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
-	    (_PAGE_PRESENT | _PAGE_USER);
+		(_PAGE_PRESENT | _PAGE_SPECIAL | _PAGE_NO_CACHE | _PAGE_USER)) ==
+		(_PAGE_PRESENT | _PAGE_USER);
+#endif
 }
 
 static struct page *maybe_pte_to_page(pte_t pte)
···
 
 static pte_t set_pte_filter(pte_t pte)
 {
+	if (radix_enabled())
+		return pte;
+
 	pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
 	if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) ||
 				       cpu_has_feature(CPU_FTR_NOEXECUTE))) {
···
 	 * _PAGE_PRESENT, but we can be sure that it is not in hpte.
 	 * Hence we can use set_pte_at for them.
 	 */
-	VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) ==
-		(_PAGE_PRESENT | _PAGE_USER));
+	VM_WARN_ON(pte_present(*ptep) && !pte_protnone(*ptep));
+
 	/*
 	 * Add the pte bit when trying to set a pte
 	 */
+69 -488
arch/powerpc/mm/pgtable_64.c
···
 
 #include "mmu_decl.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/thp.h>
-
-/* Some sanity checking */
-#if TASK_SIZE_USER64 > PGTABLE_RANGE
-#error TASK_SIZE_USER64 exceeds pagetable range
-#endif
-
 #ifdef CONFIG_PPC_STD_MMU_64
 #if TASK_SIZE_USER64 > (1UL << (ESID_BITS + SID_SHIFT))
 #error TASK_SIZE_USER64 exceeds user VSID range
 #endif
 #endif
 
-unsigned long ioremap_bot = IOREMAP_BASE;
-
-#ifdef CONFIG_PPC_MMU_NOHASH
-static __ref void *early_alloc_pgtable(unsigned long size)
-{
-	void *pt;
-
-	pt = __va(memblock_alloc_base(size, size, __pa(MAX_DMA_ADDRESS)));
-	memset(pt, 0, size);
-
-	return pt;
-}
-#endif /* CONFIG_PPC_MMU_NOHASH */
-
+#ifdef CONFIG_PPC_BOOK3S_64
 /*
- * map_kernel_page currently only called by __ioremap
- * map_kernel_page adds an entry to the ioremap page table
- * and adds an entry to the HPT, possibly bolting it
+ * partition table and process table for ISA 3.0
  */
-int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
-{
-	pgd_t *pgdp;
-	pud_t *pudp;
-	pmd_t *pmdp;
-	pte_t *ptep;
-
-	if (slab_is_available()) {
-		pgdp = pgd_offset_k(ea);
-		pudp = pud_alloc(&init_mm, pgdp, ea);
-		if (!pudp)
-			return -ENOMEM;
-		pmdp = pmd_alloc(&init_mm, pudp, ea);
-		if (!pmdp)
-			return -ENOMEM;
-		ptep = pte_alloc_kernel(pmdp, ea);
-		if (!ptep)
-			return -ENOMEM;
-		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
-						       __pgprot(flags)));
-	} else {
-#ifdef CONFIG_PPC_MMU_NOHASH
-		pgdp = pgd_offset_k(ea);
-#ifdef PUD_TABLE_SIZE
-		if (pgd_none(*pgdp)) {
-			pudp = early_alloc_pgtable(PUD_TABLE_SIZE);
-			BUG_ON(pudp == NULL);
-			pgd_populate(&init_mm, pgdp, pudp);
-		}
-#endif /* PUD_TABLE_SIZE */
-		pudp = pud_offset(pgdp, ea);
-		if (pud_none(*pudp)) {
-			pmdp = early_alloc_pgtable(PMD_TABLE_SIZE);
-			BUG_ON(pmdp == NULL);
-			pud_populate(&init_mm, pudp, pmdp);
-		}
-		pmdp = pmd_offset(pudp, ea);
-		if (!pmd_present(*pmdp)) {
-			ptep = early_alloc_pgtable(PAGE_SIZE);
-			BUG_ON(ptep == NULL);
-			pmd_populate_kernel(&init_mm, pmdp, ptep);
-		}
-		ptep = pte_offset_kernel(pmdp, ea);
-		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT,
-						       __pgprot(flags)));
-#else /* CONFIG_PPC_MMU_NOHASH */
-		/*
-		 * If the mm subsystem is not fully up, we cannot create a
-		 * linux page table entry for this mapping.  Simply bolt an
-		 * entry in the hardware page table.
-		 */
-		if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, flags,
-				      mmu_io_psize, mmu_kernel_ssize)) {
-			printk(KERN_ERR "Failed to do bolted mapping IO "
-			       "memory at %016lx !\n", pa);
-			return -ENOMEM;
-		}
-#endif /* !CONFIG_PPC_MMU_NOHASH */
-	}
-
-	smp_wmb();
-	return 0;
-}
-
+struct prtb_entry *process_tb;
+struct patb_entry *partition_tb;
+/*
+ * page table size
+ */
+unsigned long __pte_index_size;
+EXPORT_SYMBOL(__pte_index_size);
+unsigned long __pmd_index_size;
+EXPORT_SYMBOL(__pmd_index_size);
+unsigned long __pud_index_size;
+EXPORT_SYMBOL(__pud_index_size);
+unsigned long __pgd_index_size;
+EXPORT_SYMBOL(__pgd_index_size);
+unsigned long __pmd_cache_index;
+EXPORT_SYMBOL(__pmd_cache_index);
+unsigned long __pte_table_size;
+EXPORT_SYMBOL(__pte_table_size);
+unsigned long __pmd_table_size;
+EXPORT_SYMBOL(__pmd_table_size);
+unsigned long __pud_table_size;
+EXPORT_SYMBOL(__pud_table_size);
+unsigned long __pgd_table_size;
+EXPORT_SYMBOL(__pgd_table_size);
+unsigned long __pmd_val_bits;
+EXPORT_SYMBOL(__pmd_val_bits);
+unsigned long __pud_val_bits;
+EXPORT_SYMBOL(__pud_val_bits);
+unsigned long __pgd_val_bits;
+EXPORT_SYMBOL(__pgd_val_bits);
+unsigned long __kernel_virt_start;
+EXPORT_SYMBOL(__kernel_virt_start);
+unsigned long __kernel_virt_size;
+EXPORT_SYMBOL(__kernel_virt_size);
+unsigned long __vmalloc_start;
+EXPORT_SYMBOL(__vmalloc_start);
+unsigned long __vmalloc_end;
+EXPORT_SYMBOL(__vmalloc_end);
+struct page *vmemmap;
+EXPORT_SYMBOL(vmemmap);
+unsigned long __pte_frag_nr;
+EXPORT_SYMBOL(__pte_frag_nr);
+unsigned long __pte_frag_size_shift;
+EXPORT_SYMBOL(__pte_frag_size_shift);
+unsigned long ioremap_bot;
+#else /* !CONFIG_PPC_BOOK3S_64 */
+unsigned long ioremap_bot = IOREMAP_BASE;
+#endif
 
 /**
  * __ioremap_at - Low level function to establish the page tables
···
 	if ((flags & _PAGE_PRESENT) == 0)
 		flags |= pgprot_val(PAGE_KERNEL);
 
-	/* Non-cacheable page cannot be coherent */
-	if (flags & _PAGE_NO_CACHE)
-		flags &= ~_PAGE_COHERENT;
-
 	/* We don't support the 4K PFN hack with ioremap */
-	if (flags & _PAGE_4K_PFN)
+	if (flags & H_PAGE_4K_PFN)
 		return NULL;
 
 	WARN_ON(pa & ~PAGE_MASK);
···
 
 void __iomem * ioremap(phys_addr_t addr, unsigned long size)
 {
-	unsigned long flags = _PAGE_NO_CACHE | _PAGE_GUARDED;
+	unsigned long flags = pgprot_val(pgprot_noncached(__pgprot(0)));
 	void *caller = __builtin_return_address(0);
 
 	if (ppc_md.ioremap)
···
 
 void __iomem * ioremap_wc(phys_addr_t addr, unsigned long size)
 {
-	unsigned long flags = _PAGE_NO_CACHE;
+	unsigned long flags = pgprot_val(pgprot_noncached_wc(__pgprot(0)));
 	void *caller = __builtin_return_address(0);
 
 	if (ppc_md.ioremap)
···
 	void *caller = __builtin_return_address(0);
 
 	/* writeable implies dirty for kernel addresses */
-	if (flags & _PAGE_RW)
+	if (flags & _PAGE_WRITE)
 		flags |= _PAGE_DIRTY;
 
-	/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
-	flags &= ~(_PAGE_USER | _PAGE_EXEC);
+	/* we don't want to let _PAGE_EXEC leak out */
+	flags &= ~_PAGE_EXEC;
+	/*
+	 * Force kernel mapping.
+	 */
+#if defined(CONFIG_PPC_BOOK3S_64)
+	flags |= _PAGE_PRIVILEGED;
+#else
+	flags &= ~_PAGE_USER;
+#endif
+
 
 #ifdef _PAGE_BAP_SR
 	/* _PAGE_USER contains _PAGE_BAP_SR on BookE using the new PTE format
···
 	return (pte_t *)ret;
 }
 
-pte_t *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
+pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel)
 {
 	pte_t *pte;
 
···
 
 	return __alloc_for_cache(mm, kernel);
 }
+#endif /* CONFIG_PPC_64K_PAGES */
 
-void page_table_free(struct mm_struct *mm, unsigned long *table, int kernel)
+void pte_fragment_free(unsigned long *table, int kernel)
 {
 	struct page *page = virt_to_page(table);
 	if (put_page_testzero(page)) {
···
 }
 
 #ifdef CONFIG_SMP
-static void page_table_free_rcu(void *table)
-{
-	struct page *page = virt_to_page(table);
-	if (put_page_testzero(page)) {
-		pgtable_page_dtor(page);
-		free_hot_cold_page(page, 0);
-	}
-}
-
 void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift)
 {
 	unsigned long pgf = (unsigned long)table;
···
 
 	if (!shift)
 		/* PTE page needs special handling */
-		page_table_free_rcu(table);
+		pte_fragment_free(table, 0);
 	else {
 		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
 		kmem_cache_free(PGT_CACHE(shift), table);
···
 {
 	if (!shift) {
 		/* PTE page needs special handling */
-		struct page *page = virt_to_page(table);
-		if (put_page_testzero(page)) {
-			pgtable_page_dtor(page);
-			free_hot_cold_page(page, 0);
-		}
+		pte_fragment_free(table, 0);
 	} else {
 		BUG_ON(shift > MAX_PGTABLE_INDEX_SIZE);
 		kmem_cache_free(PGT_CACHE(shift), table);
 	}
 }
 #endif
-#endif /* CONFIG_PPC_64K_PAGES */
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-/*
- * This is called when relaxing access to a hugepage. It's also called in the page
- * fault path when we don't hit any of the major fault cases, ie, a minor
- * update of _PAGE_ACCESSED, _PAGE_DIRTY, etc... The generic code will have
- * handled those two for us, we additionally deal with missing execute
- * permission here on some processors
- */
-int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
-			  pmd_t *pmdp, pmd_t entry, int dirty)
-{
-	int changed;
-#ifdef CONFIG_DEBUG_VM
-	WARN_ON(!pmd_trans_huge(*pmdp));
-	assert_spin_locked(&vma->vm_mm->page_table_lock);
-#endif
-	changed = !pmd_same(*(pmdp), entry);
-	if (changed) {
-		__ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
-		/*
-		 * Since we are not supporting SW TLB systems, we don't
-		 * have any thing similar to flush_tlb_page_nohash()
-		 */
-	}
-	return changed;
-}
-
-unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
-				  pmd_t *pmdp, unsigned long clr,
-				  unsigned long set)
-{
-
-	unsigned long old, tmp;
-
-#ifdef CONFIG_DEBUG_VM
-	WARN_ON(!pmd_trans_huge(*pmdp));
-	assert_spin_locked(&mm->page_table_lock);
-#endif
-
-#ifdef PTE_ATOMIC_UPDATES
-	__asm__ __volatile__(
-	"1:	ldarx	%0,0,%3\n\
-		andi.	%1,%0,%6\n\
-		bne-	1b \n\
-		andc	%1,%0,%4 \n\
-		or	%1,%1,%7\n\
-		stdcx.	%1,0,%3 \n\
-		bne-	1b"
-	: "=&r" (old), "=&r" (tmp), "=m" (*pmdp)
-	: "r" (pmdp), "r" (clr), "m" (*pmdp), "i" (_PAGE_BUSY), "r" (set)
-	: "cc" );
-#else
-	old = pmd_val(*pmdp);
-	*pmdp = __pmd((old & ~clr) | set);
-#endif
-	trace_hugepage_update(addr, old, clr, set);
-	if (old & _PAGE_HASHPTE)
-		hpte_do_hugepage_flush(mm, addr, pmdp, old);
-	return old;
-}
-
-pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
-			  pmd_t *pmdp)
-{
-	pmd_t pmd;
-
-	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-	VM_BUG_ON(pmd_trans_huge(*pmdp));
-
-	pmd = *pmdp;
-	pmd_clear(pmdp);
-	/*
-	 * Wait for all pending hash_page to finish. This is needed
-	 * in case of subpage collapse. When we collapse normal pages
-	 * to hugepage, we first clear the pmd, then invalidate all
-	 * the PTE entries. The assumption here is that any low level
-	 * page fault will see a none pmd and take the slow path that
-	 * will wait on mmap_sem. But we could very well be in a
-	 * hash_page with local ptep pointer value. Such a hash page
-	 * can result in adding new HPTE entries for normal subpages.
-	 * That means we could be modifying the page content as we
-	 * copy them to a huge page. So wait for parallel hash_page
-	 * to finish before invalidating HPTE entries. We can do this
-	 * by sending an IPI to all the cpus and executing a dummy
-	 * function there.
-	 */
-	kick_all_cpus_sync();
-	/*
-	 * Now invalidate the hpte entries in the range
-	 * covered by pmd. This make sure we take a
-	 * fault and will find the pmd as none, which will
-	 * result in a major fault which takes mmap_sem and
-	 * hence wait for collapse to complete. Without this
-	 * the __collapse_huge_page_copy can result in copying
-	 * the old content.
-	 */
-	flush_tlb_pmd_range(vma->vm_mm, &pmd, address);
-	return pmd;
-}
-
-int pmdp_test_and_clear_young(struct vm_area_struct *vma,
-			      unsigned long address, pmd_t *pmdp)
-{
-	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
-}
-
-/*
- * We currently remove entries from the hashtable regardless of whether
- * the entry was young or dirty. The generic routines only flush if the
- * entry was young or dirty which is not good enough.
- *
- * We should be more intelligent about this but for the moment we override
- * these functions and force a tlb flush unconditionally
- */
-int pmdp_clear_flush_young(struct vm_area_struct *vma,
-			   unsigned long address, pmd_t *pmdp)
-{
-	return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp);
-}
-
-/*
- * We want to put the pgtable in pmd and use pgtable for tracking
- * the base page size hptes
- */
-void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
-				pgtable_t pgtable)
-{
-	pgtable_t *pgtable_slot;
-	assert_spin_locked(&mm->page_table_lock);
-	/*
-	 * we store the pgtable in the second half of PMD
-	 */
-	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
-	*pgtable_slot = pgtable;
-	/*
-	 * expose the deposited pgtable to other cpus.
-	 * before we set the hugepage PTE at pmd level
-	 * hash fault code looks at the deposted pgtable
-	 * to store hash index values.
-	 */
-	smp_wmb();
-}
-
-pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
-{
-	pgtable_t pgtable;
-	pgtable_t *pgtable_slot;
-
-	assert_spin_locked(&mm->page_table_lock);
-	pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD;
-	pgtable = *pgtable_slot;
-	/*
-	 * Once we withdraw, mark the entry NULL.
595 - */ 596 - *pgtable_slot = NULL; 597 - /* 598 - * We store HPTE information in the deposited PTE fragment. 599 - * zero out the content on withdraw. 600 - */ 601 - memset(pgtable, 0, PTE_FRAG_SIZE); 602 - return pgtable; 603 - } 604 - 605 - void pmdp_huge_split_prepare(struct vm_area_struct *vma, 606 - unsigned long address, pmd_t *pmdp) 607 - { 608 - VM_BUG_ON(address & ~HPAGE_PMD_MASK); 609 - VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); 610 - 611 - /* 612 - * We can't mark the pmd none here, because that will cause a race 613 - * against exit_mmap. We need to continue mark pmd TRANS HUGE, while 614 - * we spilt, but at the same time we wan't rest of the ppc64 code 615 - * not to insert hash pte on this, because we will be modifying 616 - * the deposited pgtable in the caller of this function. Hence 617 - * clear the _PAGE_USER so that we move the fault handling to 618 - * higher level function and that will serialize against ptl. 619 - * We need to flush existing hash pte entries here even though, 620 - * the translation is still valid, because we will withdraw 621 - * pgtable_t after this. 622 - */ 623 - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0); 624 - } 625 - 626 - 627 - /* 628 - * set a new huge pmd. We should not be called for updating 629 - * an existing pmd entry. That should go via pmd_hugepage_update. 630 - */ 631 - void set_pmd_at(struct mm_struct *mm, unsigned long addr, 632 - pmd_t *pmdp, pmd_t pmd) 633 - { 634 - #ifdef CONFIG_DEBUG_VM 635 - WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) == 636 - (_PAGE_PRESENT | _PAGE_USER)); 637 - assert_spin_locked(&mm->page_table_lock); 638 - WARN_ON(!pmd_trans_huge(pmd)); 639 - #endif 640 - trace_hugepage_set_pmd(addr, pmd_val(pmd)); 641 - return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); 642 - } 643 - 644 - /* 645 - * We use this to invalidate a pmdp entry before switching from a 646 - * hugepte to regular pmd entry. 
647 - */ 648 - void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, 649 - pmd_t *pmdp) 650 - { 651 - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); 652 - 653 - /* 654 - * This ensures that generic code that rely on IRQ disabling 655 - * to prevent a parallel THP split work as expected. 656 - */ 657 - kick_all_cpus_sync(); 658 - } 659 - 660 - /* 661 - * A linux hugepage PMD was changed and the corresponding hash table entries 662 - * neesd to be flushed. 663 - */ 664 - void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, 665 - pmd_t *pmdp, unsigned long old_pmd) 666 - { 667 - int ssize; 668 - unsigned int psize; 669 - unsigned long vsid; 670 - unsigned long flags = 0; 671 - const struct cpumask *tmp; 672 - 673 - /* get the base page size,vsid and segment size */ 674 - #ifdef CONFIG_DEBUG_VM 675 - psize = get_slice_psize(mm, addr); 676 - BUG_ON(psize == MMU_PAGE_16M); 677 - #endif 678 - if (old_pmd & _PAGE_COMBO) 679 - psize = MMU_PAGE_4K; 680 - else 681 - psize = MMU_PAGE_64K; 682 - 683 - if (!is_kernel_addr(addr)) { 684 - ssize = user_segment_size(addr); 685 - vsid = get_vsid(mm->context.id, addr, ssize); 686 - WARN_ON(vsid == 0); 687 - } else { 688 - vsid = get_kernel_vsid(addr, mmu_kernel_ssize); 689 - ssize = mmu_kernel_ssize; 690 - } 691 - 692 - tmp = cpumask_of(smp_processor_id()); 693 - if (cpumask_equal(mm_cpumask(mm), tmp)) 694 - flags |= HPTE_LOCAL_UPDATE; 695 - 696 - return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); 697 - } 698 - 699 - static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) 700 - { 701 - return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); 702 - } 703 - 704 - pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) 705 - { 706 - unsigned long pmdv; 707 - 708 - pmdv = (pfn << PTE_RPN_SHIFT) & PTE_RPN_MASK; 709 - return pmd_set_protbits(__pmd(pmdv), pgprot); 710 - } 711 - 712 - pmd_t mk_pmd(struct page *page, pgprot_t pgprot) 713 - { 714 - return pfn_pmd(page_to_pfn(page), 
pgprot); 715 - } 716 - 717 - pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) 718 - { 719 - unsigned long pmdv; 720 - 721 - pmdv = pmd_val(pmd); 722 - pmdv &= _HPAGE_CHG_MASK; 723 - return pmd_set_protbits(__pmd(pmdv), newprot); 724 - } 725 - 726 - /* 727 - * This is called at the end of handling a user page fault, when the 728 - * fault has been handled by updating a HUGE PMD entry in the linux page tables. 729 - * We use it to preload an HPTE into the hash table corresponding to 730 - * the updated linux HUGE PMD entry. 731 - */ 732 - void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, 733 - pmd_t *pmd) 734 - { 735 - return; 736 - } 737 - 738 - pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, 739 - unsigned long addr, pmd_t *pmdp) 740 - { 741 - pmd_t old_pmd; 742 - pgtable_t pgtable; 743 - unsigned long old; 744 - pgtable_t *pgtable_slot; 745 - 746 - old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); 747 - old_pmd = __pmd(old); 748 - /* 749 - * We have pmd == none and we are holding page_table_lock. 750 - * So we can safely go and clear the pgtable hash 751 - * index info. 752 - */ 753 - pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; 754 - pgtable = *pgtable_slot; 755 - /* 756 - * Let's zero out old valid and hash index details 757 - * hash fault look at them. 758 - */ 759 - memset(pgtable, 0, PTE_FRAG_SIZE); 760 - /* 761 - * Serialize against find_linux_pte_or_hugepte which does lock-less 762 - * lookup in page tables with local interrupts disabled. For huge pages 763 - * it casts pmd_t to pte_t. Since format of pte_t is different from 764 - * pmd_t we want to prevent transit from pmd pointing to page table 765 - * to pmd pointing to huge page (and back) while interrupts are disabled. 766 - * We clear pmd to possibly replace it with page table pointer in 767 - * different code paths. So make sure we wait for the parallel 768 - * find_linux_pte_or_hugepage to finish. 
769 - */ 770 - kick_all_cpus_sync(); 771 - return old_pmd; 772 - } 773 - 774 - int has_transparent_hugepage(void) 775 - { 776 - 777 - BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER, 778 - "hugepages can't be allocated by the buddy allocator"); 779 - 780 - BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2, 781 - "We need more than 2 pages to do deferred thp split"); 782 - 783 - if (!mmu_has_feature(MMU_FTR_16M_PAGE)) 784 - return 0; 785 - /* 786 - * We support THP only if PMD_SIZE is 16MB. 787 - */ 788 - if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) 789 - return 0; 790 - /* 791 - * We need to make sure that we support 16MB hugepage in a segement 792 - * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE 793 - * of 64K. 794 - */ 795 - /* 796 - * If we have 64K HPTE, we will be using that by default 797 - */ 798 - if (mmu_psize_defs[MMU_PAGE_64K].shift && 799 - (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) 800 - return 0; 801 - /* 802 - * Ok we only have 4K HPTE 803 - */ 804 - if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) 805 - return 0; 806 - 807 - return 1; 808 - } 809 - #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
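The `pmd_hugepage_update()` hunk removed above centres on an `ldarx`/`stdcx.` loop: spin while `_PAGE_BUSY` is held, then atomically compute `(old & ~clr) | set` and return the old value. A rough portable sketch of that contract using C11 atomics instead of PowerPC load-reserve/store-conditional (the `PAGE_BUSY` bit value here is illustrative, not the real `_PAGE_BUSY`):

```c
#include <stdatomic.h>
#include <stdint.h>

#define PAGE_BUSY (1UL << 0)   /* illustrative bit; the real _PAGE_BUSY differs */

/* Atomically compute new = (old & ~clr) | set, waiting out the busy
 * bit, and return the old value -- the same contract as the
 * ldarx/stdcx. loop in pmd_hugepage_update(). */
static uint64_t pmd_update_sketch(_Atomic uint64_t *pmdp,
                                  uint64_t clr, uint64_t set)
{
    uint64_t old, new;

    old = atomic_load(pmdp);
    do {
        while (old & PAGE_BUSY)        /* a concurrent updater holds the entry */
            old = atomic_load(pmdp);
        new = (old & ~clr) | set;
        /* on failure, old is refreshed and the busy check reruns */
    } while (!atomic_compare_exchange_weak(pmdp, &old, new));

    return old;
}
```

The weak compare-exchange mirrors `stdcx.` failing and branching back to `ldarx`; the non-atomic `#else` branch in the hunk is the same computation without the retry loop.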
-1
arch/powerpc/mm/slb.c
··· 32 32 }; 33 33 34 34 extern void slb_allocate_realmode(unsigned long ea); 35 - extern void slb_allocate_user(unsigned long ea); 36 35 37 36 static void slb_allocate(unsigned long ea) 38 37 {
+2 -52
arch/powerpc/mm/slb_low.S
··· 35 35 * check for bad kernel/user address 36 36 * (ea & ~REGION_MASK) >= PGTABLE_RANGE 37 37 */ 38 - rldicr. r9,r3,4,(63 - PGTABLE_EADDR_SIZE - 4) 38 + rldicr. r9,r3,4,(63 - H_PGTABLE_EADDR_SIZE - 4) 39 39 bne- 8f 40 40 41 41 srdi r9,r3,60 /* get region */ ··· 91 91 * can be demoted from 64K -> 4K dynamically on some machines 92 92 */ 93 93 clrldi r11,r10,48 94 - cmpldi r11,(VMALLOC_SIZE >> 28) - 1 94 + cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1 95 95 bgt 5f 96 96 lhz r11,PACAVMALLOCSLLP(r13) 97 97 b 6f ··· 178 178 li r9,0 /* BAD_VSID */ 179 179 li r11,SLB_VSID_USER /* flags don't much matter */ 180 180 b slb_finish_load 181 - 182 - #ifdef __DISABLED__ 183 - 184 - /* void slb_allocate_user(unsigned long ea); 185 - * 186 - * Create an SLB entry for the given EA (user or kernel). 187 - * r3 = faulting address, r13 = PACA 188 - * r9, r10, r11 are clobbered by this function 189 - * No other registers are examined or changed. 190 - * 191 - * It is called with translation enabled in order to be able to walk the 192 - * page tables. This is not currently used. 193 - */ 194 - _GLOBAL(slb_allocate_user) 195 - /* r3 = faulting address */ 196 - srdi r10,r3,28 /* get esid */ 197 - 198 - crset 4*cr7+lt /* set "user" flag for later */ 199 - 200 - /* check if we fit in the range covered by the pagetables*/ 201 - srdi. r9,r3,PGTABLE_EADDR_SIZE 202 - crnot 4*cr0+eq,4*cr0+eq 203 - beqlr 204 - 205 - /* now we need to get to the page tables in order to get the page 206 - * size encoding from the PMD. In the future, we'll be able to deal 207 - * with 1T segments too by getting the encoding from the PGD instead 208 - */ 209 - ld r9,PACAPGDIR(r13) 210 - cmpldi cr0,r9,0 211 - beqlr 212 - rlwinm r11,r10,8,25,28 213 - ldx r9,r9,r11 /* get pgd_t */ 214 - cmpldi cr0,r9,0 215 - beqlr 216 - rlwinm r11,r10,3,17,28 217 - ldx r9,r9,r11 /* get pmd_t */ 218 - cmpldi cr0,r9,0 219 - beqlr 220 - 221 - /* build vsid flags */ 222 - andi. 
r11,r9,SLB_VSID_LLP 223 - ori r11,r11,SLB_VSID_USER 224 - 225 - /* get context to calculate proto-VSID */ 226 - ld r9,PACACONTEXTID(r13) 227 - /* fall through slb_finish_load */ 228 - 229 - #endif /* __DISABLED__ */ 230 - 231 181 232 182 /* 233 183 * Finish loading of an SLB entry and return
+18 -2
arch/powerpc/mm/slice.c
··· 37 37 #include <asm/hugetlb.h> 38 38 39 39 /* some sanity checks */ 40 - #if (PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE 41 - #error PGTABLE_RANGE exceeds slice_mask high_slices size 40 + #if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE 41 + #error H_PGTABLE_RANGE exceeds slice_mask high_slices size 42 42 #endif 43 43 44 44 static DEFINE_SPINLOCK(slice_convert_lock); ··· 395 395 396 396 /* Sanity checks */ 397 397 BUG_ON(mm->task_size == 0); 398 + VM_BUG_ON(radix_enabled()); 398 399 399 400 slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize); 400 401 slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n", ··· 569 568 unsigned char *hpsizes; 570 569 int index, mask_index; 571 570 571 + /* 572 + * Radix doesn't use slice, but can get enabled along with MMU_SLICE 573 + */ 574 + if (radix_enabled()) { 575 + #ifdef CONFIG_PPC_64K_PAGES 576 + return MMU_PAGE_64K; 577 + #else 578 + return MMU_PAGE_4K; 579 + #endif 580 + } 572 581 if (addr < SLICE_LOW_TOP) { 573 582 u64 lpsizes; 574 583 lpsizes = mm->context.low_slices_psize; ··· 616 605 617 606 slice_dbg("slice_set_user_psize(mm=%p, psize=%d)\n", mm, psize); 618 607 608 + VM_BUG_ON(radix_enabled()); 619 609 spin_lock_irqsave(&slice_convert_lock, flags); 620 610 621 611 old_psize = mm->context.user_psize; ··· 661 649 { 662 650 struct slice_mask mask = slice_range_to_mask(start, len); 663 651 652 + VM_BUG_ON(radix_enabled()); 664 653 slice_convert(mm, mask, psize); 665 654 } 666 655 ··· 690 677 { 691 678 struct slice_mask mask, available; 692 679 unsigned int psize = mm->context.user_psize; 680 + 681 + if (radix_enabled()) 682 + return 0; 693 683 694 684 mask = slice_range_to_mask(addr, len); 695 685 available = slice_mask_for_size(mm, psize);
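The slice.c changes above bypass the slice machinery entirely under radix: `get_slice_psize()` gains an early return of the base page size, and the conversion paths assert `!radix_enabled()`. A minimal sketch of that bypass, with stand-in globals for the kernel's `radix_enabled()` test and `CONFIG_PPC_64K_PAGES`, and illustrative `MMU_PAGE_*` index values:

```c
#include <stdbool.h>

/* Illustrative indices; the real MMU_PAGE_* values are defined in the
 * powerpc mmu headers. */
enum { MMU_PAGE_4K = 0, MMU_PAGE_64K = 2 };

static bool radix_enabled;          /* stand-in for the kernel's feature test */
static bool config_ppc_64k_pages;   /* stand-in for CONFIG_PPC_64K_PAGES */

/* Mirrors the early return added to get_slice_psize(): radix does not
 * use slices, so the base page size is returned without consulting the
 * low/high slice masks. */
static int get_slice_psize_sketch(int slice_psize_from_masks)
{
    if (radix_enabled)
        return config_ppc_64k_pages ? MMU_PAGE_64K : MMU_PAGE_4K;
    return slice_psize_from_masks;  /* hash path: answer comes from the masks */
}
```

This is why `is_hugepage_only_range()` can simply return 0 under radix: with no slices there is no per-slice page-size mismatch to detect.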
+251
arch/powerpc/mm/tlb-radix.c
··· 1 + /* 2 + * TLB flush routines for radix kernels. 3 + * 4 + * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. 5 + * 6 + * This program is free software; you can redistribute it and/or 7 + * modify it under the terms of the GNU General Public License 8 + * as published by the Free Software Foundation; either version 9 + * 2 of the License, or (at your option) any later version. 10 + */ 11 + 12 + #include <linux/mm.h> 13 + #include <linux/hugetlb.h> 14 + #include <linux/memblock.h> 15 + 16 + #include <asm/tlb.h> 17 + #include <asm/tlbflush.h> 18 + 19 + static DEFINE_RAW_SPINLOCK(native_tlbie_lock); 20 + 21 + static inline void __tlbiel_pid(unsigned long pid, int set) 22 + { 23 + unsigned long rb,rs,ric,prs,r; 24 + 25 + rb = PPC_BIT(53); /* IS = 1 */ 26 + rb |= set << PPC_BITLSHIFT(51); 27 + rs = ((unsigned long)pid) << PPC_BITLSHIFT(31); 28 + prs = 1; /* process scoped */ 29 + r = 1; /* raidx format */ 30 + ric = 2; /* invalidate all the caches */ 31 + 32 + asm volatile("ptesync": : :"memory"); 33 + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" 34 + "(%2 << 17) | (%3 << 18) | (%4 << 21)" 35 + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); 36 + asm volatile("ptesync": : :"memory"); 37 + } 38 + 39 + /* 40 + * We use 128 set in radix mode and 256 set in hpt mode. 
41 + */ 42 + static inline void _tlbiel_pid(unsigned long pid) 43 + { 44 + int set; 45 + 46 + for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { 47 + __tlbiel_pid(pid, set); 48 + } 49 + return; 50 + } 51 + 52 + static inline void _tlbie_pid(unsigned long pid) 53 + { 54 + unsigned long rb,rs,ric,prs,r; 55 + 56 + rb = PPC_BIT(53); /* IS = 1 */ 57 + rs = pid << PPC_BITLSHIFT(31); 58 + prs = 1; /* process scoped */ 59 + r = 1; /* raidx format */ 60 + ric = 2; /* invalidate all the caches */ 61 + 62 + asm volatile("ptesync": : :"memory"); 63 + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" 64 + "(%2 << 17) | (%3 << 18) | (%4 << 21)" 65 + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); 66 + asm volatile("eieio; tlbsync; ptesync": : :"memory"); 67 + } 68 + 69 + static inline void _tlbiel_va(unsigned long va, unsigned long pid, 70 + unsigned long ap) 71 + { 72 + unsigned long rb,rs,ric,prs,r; 73 + 74 + rb = va & ~(PPC_BITMASK(52, 63)); 75 + rb |= ap << PPC_BITLSHIFT(58); 76 + rs = pid << PPC_BITLSHIFT(31); 77 + prs = 1; /* process scoped */ 78 + r = 1; /* raidx format */ 79 + ric = 0; /* no cluster flush yet */ 80 + 81 + asm volatile("ptesync": : :"memory"); 82 + asm volatile(".long 0x7c000224 | (%0 << 11) | (%1 << 16) |" 83 + "(%2 << 17) | (%3 << 18) | (%4 << 21)" 84 + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); 85 + asm volatile("ptesync": : :"memory"); 86 + } 87 + 88 + static inline void _tlbie_va(unsigned long va, unsigned long pid, 89 + unsigned long ap) 90 + { 91 + unsigned long rb,rs,ric,prs,r; 92 + 93 + rb = va & ~(PPC_BITMASK(52, 63)); 94 + rb |= ap << PPC_BITLSHIFT(58); 95 + rs = pid << PPC_BITLSHIFT(31); 96 + prs = 1; /* process scoped */ 97 + r = 1; /* raidx format */ 98 + ric = 0; /* no cluster flush yet */ 99 + 100 + asm volatile("ptesync": : :"memory"); 101 + asm volatile(".long 0x7c000264 | (%0 << 11) | (%1 << 16) |" 102 + "(%2 << 17) | (%3 << 18) | (%4 << 21)" 103 + : : "r"(rb), "i"(r), "i"(prs), "i"(ric), 
"r"(rs) : "memory"); 104 + asm volatile("eieio; tlbsync; ptesync": : :"memory"); 105 + } 106 + 107 + /* 108 + * Base TLB flushing operations: 109 + * 110 + * - flush_tlb_mm(mm) flushes the specified mm context TLB's 111 + * - flush_tlb_page(vma, vmaddr) flushes one page 112 + * - flush_tlb_range(vma, start, end) flushes a range of pages 113 + * - flush_tlb_kernel_range(start, end) flushes kernel pages 114 + * 115 + * - local_* variants of page and mm only apply to the current 116 + * processor 117 + */ 118 + void radix__local_flush_tlb_mm(struct mm_struct *mm) 119 + { 120 + unsigned int pid; 121 + 122 + preempt_disable(); 123 + pid = mm->context.id; 124 + if (pid != MMU_NO_CONTEXT) 125 + _tlbiel_pid(pid); 126 + preempt_enable(); 127 + } 128 + EXPORT_SYMBOL(radix__local_flush_tlb_mm); 129 + 130 + void radix___local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, 131 + unsigned long ap, int nid) 132 + { 133 + unsigned int pid; 134 + 135 + preempt_disable(); 136 + pid = mm ? mm->context.id : 0; 137 + if (pid != MMU_NO_CONTEXT) 138 + _tlbiel_va(vmaddr, pid, ap); 139 + preempt_enable(); 140 + } 141 + 142 + void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) 143 + { 144 + #ifdef CONFIG_HUGETLB_PAGE 145 + /* need the return fix for nohash.c */ 146 + if (vma && is_vm_hugetlb_page(vma)) 147 + return __local_flush_hugetlb_page(vma, vmaddr); 148 + #endif 149 + radix___local_flush_tlb_page(vma ? 
vma->vm_mm : NULL, vmaddr, 150 + mmu_get_ap(mmu_virtual_psize), 0); 151 + } 152 + EXPORT_SYMBOL(radix__local_flush_tlb_page); 153 + 154 + #ifdef CONFIG_SMP 155 + static int mm_is_core_local(struct mm_struct *mm) 156 + { 157 + return cpumask_subset(mm_cpumask(mm), 158 + topology_sibling_cpumask(smp_processor_id())); 159 + } 160 + 161 + void radix__flush_tlb_mm(struct mm_struct *mm) 162 + { 163 + unsigned int pid; 164 + 165 + preempt_disable(); 166 + pid = mm->context.id; 167 + if (unlikely(pid == MMU_NO_CONTEXT)) 168 + goto no_context; 169 + 170 + if (!mm_is_core_local(mm)) { 171 + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); 172 + 173 + if (lock_tlbie) 174 + raw_spin_lock(&native_tlbie_lock); 175 + _tlbie_pid(pid); 176 + if (lock_tlbie) 177 + raw_spin_unlock(&native_tlbie_lock); 178 + } else 179 + _tlbiel_pid(pid); 180 + no_context: 181 + preempt_enable(); 182 + } 183 + EXPORT_SYMBOL(radix__flush_tlb_mm); 184 + 185 + void radix___flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr, 186 + unsigned long ap, int nid) 187 + { 188 + unsigned int pid; 189 + 190 + preempt_disable(); 191 + pid = mm ? mm->context.id : 0; 192 + if (unlikely(pid == MMU_NO_CONTEXT)) 193 + goto bail; 194 + if (!mm_is_core_local(mm)) { 195 + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); 196 + 197 + if (lock_tlbie) 198 + raw_spin_lock(&native_tlbie_lock); 199 + _tlbie_va(vmaddr, pid, ap); 200 + if (lock_tlbie) 201 + raw_spin_unlock(&native_tlbie_lock); 202 + } else 203 + _tlbiel_va(vmaddr, pid, ap); 204 + bail: 205 + preempt_enable(); 206 + } 207 + 208 + void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr) 209 + { 210 + #ifdef CONFIG_HUGETLB_PAGE 211 + if (vma && is_vm_hugetlb_page(vma)) 212 + return flush_hugetlb_page(vma, vmaddr); 213 + #endif 214 + radix___flush_tlb_page(vma ? 
vma->vm_mm : NULL, vmaddr, 215 + mmu_get_ap(mmu_virtual_psize), 0); 216 + } 217 + EXPORT_SYMBOL(radix__flush_tlb_page); 218 + 219 + #endif /* CONFIG_SMP */ 220 + 221 + void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) 222 + { 223 + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); 224 + 225 + if (lock_tlbie) 226 + raw_spin_lock(&native_tlbie_lock); 227 + _tlbie_pid(0); 228 + if (lock_tlbie) 229 + raw_spin_unlock(&native_tlbie_lock); 230 + } 231 + EXPORT_SYMBOL(radix__flush_tlb_kernel_range); 232 + 233 + /* 234 + * Currently, for range flushing, we just do a full mm flush. Because 235 + * we use this in code path where we don' track the page size. 236 + */ 237 + void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, 238 + unsigned long end) 239 + 240 + { 241 + struct mm_struct *mm = vma->vm_mm; 242 + radix__flush_tlb_mm(mm); 243 + } 244 + EXPORT_SYMBOL(radix__flush_tlb_range); 245 + 246 + 247 + void radix__tlb_flush(struct mmu_gather *tlb) 248 + { 249 + struct mm_struct *mm = tlb->mm; 250 + radix__flush_tlb_mm(mm); 251 + }
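The RB/RS field setup in tlb-radix.c above (`rb = PPC_BIT(53)`, `rb |= set << PPC_BITLSHIFT(51)`, and so on) uses IBM big-endian bit numbering, where bit 0 is the most significant bit of the 64-bit word. A self-contained sketch of those helpers (matching their definitions in the powerpc bitops header) and of the RB value `__tlbiel_pid()` builds for an IS=1, per-set flush:

```c
#include <stdint.h>

/* IBM bit numbering: bit 0 is the MSB.  These match the PPC_BIT()/
 * PPC_BITLSHIFT()/PPC_BITMASK() helpers in the powerpc headers. */
#define BITS_PER_LONG       64
#define PPC_BITLSHIFT(be)   (BITS_PER_LONG - 1 - (be))
#define PPC_BIT(bit)        (1UL << PPC_BITLSHIFT(bit))
#define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) + PPC_BIT(bs))

/* RB for a PID-scoped tlbiel as built in __tlbiel_pid(): IS = 1 goes in
 * IBM bit 53, and the congruence-class set number ends at bit 51. */
static uint64_t tlbiel_rb(unsigned long set)
{
    uint64_t rb = PPC_BIT(53);               /* IS = 1 */
    rb |= (uint64_t)set << PPC_BITLSHIFT(51);
    return rb;
}
```

So `PPC_BIT(53)` is `1 << 10` in conventional (LSB-0) numbering, and the `va & ~(PPC_BITMASK(52, 63))` in `_tlbiel_va()` clears the low 12 bits of the address, i.e. the page offset.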
+3 -3
arch/powerpc/mm/tlb_hash64.c
··· 155 155 batch->index = 0; 156 156 } 157 157 158 - void tlb_flush(struct mmu_gather *tlb) 158 + void hash__tlb_flush(struct mmu_gather *tlb) 159 159 { 160 160 struct ppc64_tlb_batch *tlbbatch = &get_cpu_var(ppc64_tlb_batch); 161 161 ··· 218 218 pte = pte_val(*ptep); 219 219 if (is_thp) 220 220 trace_hugepage_invalidate(start, pte); 221 - if (!(pte & _PAGE_HASHPTE)) 221 + if (!(pte & H_PAGE_HASHPTE)) 222 222 continue; 223 223 if (unlikely(is_thp)) 224 224 hpte_do_hugepage_flush(mm, start, (pmd_t *)ptep, pte); ··· 248 248 start_pte = pte_offset_map(pmd, addr); 249 249 for (pte = start_pte; pte < start_pte + PTRS_PER_PTE; pte++) { 250 250 unsigned long pteval = pte_val(*pte); 251 - if (pteval & _PAGE_HASHPTE) 251 + if (pteval & H_PAGE_HASHPTE) 252 252 hpte_need_flush(mm, addr, pte, pteval, 0); 253 253 addr += PAGE_SIZE; 254 254 }
+1 -1
arch/powerpc/perf/Makefile
··· 1 1 subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror 2 2 3 - obj-$(CONFIG_PERF_EVENTS) += callchain.o 3 + obj-$(CONFIG_PERF_EVENTS) += callchain.o perf_regs.o 4 4 5 5 obj-$(CONFIG_PPC_PERF_CTRS) += core-book3s.o bhrb.o 6 6 obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \
+1 -1
arch/powerpc/perf/callchain.c
··· 137 137 offset = addr & ((1UL << shift) - 1); 138 138 139 139 pte = READ_ONCE(*ptep); 140 - if (!pte_present(pte) || !(pte_val(pte) & _PAGE_USER)) 140 + if (!pte_present(pte) || !pte_user(pte)) 141 141 goto err_out; 142 142 pfn = pte_pfn(pte); 143 143 if (!page_is_ram(pfn))
+104
arch/powerpc/perf/perf_regs.c
··· 1 + /* 2 + * Copyright 2016 Anju T, IBM Corporation. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; either version 7 + * 2 of the License, or (at your option) any later version. 8 + */ 9 + 10 + #include <linux/errno.h> 11 + #include <linux/kernel.h> 12 + #include <linux/sched.h> 13 + #include <linux/perf_event.h> 14 + #include <linux/bug.h> 15 + #include <linux/stddef.h> 16 + #include <asm/ptrace.h> 17 + #include <asm/perf_regs.h> 18 + 19 + #define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) 20 + 21 + #define REG_RESERVED (~((1ULL << PERF_REG_POWERPC_MAX) - 1)) 22 + 23 + static unsigned int pt_regs_offset[PERF_REG_POWERPC_MAX] = { 24 + PT_REGS_OFFSET(PERF_REG_POWERPC_R0, gpr[0]), 25 + PT_REGS_OFFSET(PERF_REG_POWERPC_R1, gpr[1]), 26 + PT_REGS_OFFSET(PERF_REG_POWERPC_R2, gpr[2]), 27 + PT_REGS_OFFSET(PERF_REG_POWERPC_R3, gpr[3]), 28 + PT_REGS_OFFSET(PERF_REG_POWERPC_R4, gpr[4]), 29 + PT_REGS_OFFSET(PERF_REG_POWERPC_R5, gpr[5]), 30 + PT_REGS_OFFSET(PERF_REG_POWERPC_R6, gpr[6]), 31 + PT_REGS_OFFSET(PERF_REG_POWERPC_R7, gpr[7]), 32 + PT_REGS_OFFSET(PERF_REG_POWERPC_R8, gpr[8]), 33 + PT_REGS_OFFSET(PERF_REG_POWERPC_R9, gpr[9]), 34 + PT_REGS_OFFSET(PERF_REG_POWERPC_R10, gpr[10]), 35 + PT_REGS_OFFSET(PERF_REG_POWERPC_R11, gpr[11]), 36 + PT_REGS_OFFSET(PERF_REG_POWERPC_R12, gpr[12]), 37 + PT_REGS_OFFSET(PERF_REG_POWERPC_R13, gpr[13]), 38 + PT_REGS_OFFSET(PERF_REG_POWERPC_R14, gpr[14]), 39 + PT_REGS_OFFSET(PERF_REG_POWERPC_R15, gpr[15]), 40 + PT_REGS_OFFSET(PERF_REG_POWERPC_R16, gpr[16]), 41 + PT_REGS_OFFSET(PERF_REG_POWERPC_R17, gpr[17]), 42 + PT_REGS_OFFSET(PERF_REG_POWERPC_R18, gpr[18]), 43 + PT_REGS_OFFSET(PERF_REG_POWERPC_R19, gpr[19]), 44 + PT_REGS_OFFSET(PERF_REG_POWERPC_R20, gpr[20]), 45 + PT_REGS_OFFSET(PERF_REG_POWERPC_R21, gpr[21]), 46 + PT_REGS_OFFSET(PERF_REG_POWERPC_R22, gpr[22]), 47 + 
PT_REGS_OFFSET(PERF_REG_POWERPC_R23, gpr[23]), 48 + PT_REGS_OFFSET(PERF_REG_POWERPC_R24, gpr[24]), 49 + PT_REGS_OFFSET(PERF_REG_POWERPC_R25, gpr[25]), 50 + PT_REGS_OFFSET(PERF_REG_POWERPC_R26, gpr[26]), 51 + PT_REGS_OFFSET(PERF_REG_POWERPC_R27, gpr[27]), 52 + PT_REGS_OFFSET(PERF_REG_POWERPC_R28, gpr[28]), 53 + PT_REGS_OFFSET(PERF_REG_POWERPC_R29, gpr[29]), 54 + PT_REGS_OFFSET(PERF_REG_POWERPC_R30, gpr[30]), 55 + PT_REGS_OFFSET(PERF_REG_POWERPC_R31, gpr[31]), 56 + PT_REGS_OFFSET(PERF_REG_POWERPC_NIP, nip), 57 + PT_REGS_OFFSET(PERF_REG_POWERPC_MSR, msr), 58 + PT_REGS_OFFSET(PERF_REG_POWERPC_ORIG_R3, orig_gpr3), 59 + PT_REGS_OFFSET(PERF_REG_POWERPC_CTR, ctr), 60 + PT_REGS_OFFSET(PERF_REG_POWERPC_LINK, link), 61 + PT_REGS_OFFSET(PERF_REG_POWERPC_XER, xer), 62 + PT_REGS_OFFSET(PERF_REG_POWERPC_CCR, ccr), 63 + #ifdef CONFIG_PPC64 64 + PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, softe), 65 + #else 66 + PT_REGS_OFFSET(PERF_REG_POWERPC_SOFTE, mq), 67 + #endif 68 + PT_REGS_OFFSET(PERF_REG_POWERPC_TRAP, trap), 69 + PT_REGS_OFFSET(PERF_REG_POWERPC_DAR, dar), 70 + PT_REGS_OFFSET(PERF_REG_POWERPC_DSISR, dsisr), 71 + }; 72 + 73 + u64 perf_reg_value(struct pt_regs *regs, int idx) 74 + { 75 + if (WARN_ON_ONCE(idx >= PERF_REG_POWERPC_MAX)) 76 + return 0; 77 + 78 + return regs_get_register(regs, pt_regs_offset[idx]); 79 + } 80 + 81 + int perf_reg_validate(u64 mask) 82 + { 83 + if (!mask || mask & REG_RESERVED) 84 + return -EINVAL; 85 + return 0; 86 + } 87 + 88 + u64 perf_reg_abi(struct task_struct *task) 89 + { 90 + #ifdef CONFIG_PPC64 91 + if (!test_tsk_thread_flag(task, TIF_32BIT)) 92 + return PERF_SAMPLE_REGS_ABI_64; 93 + else 94 + #endif 95 + return PERF_SAMPLE_REGS_ABI_32; 96 + } 97 + 98 + void perf_get_regs_user(struct perf_regs *regs_user, 99 + struct pt_regs *regs, 100 + struct pt_regs *regs_user_copy) 101 + { 102 + regs_user->regs = task_pt_regs(current); 103 + regs_user->abi = perf_reg_abi(current); 104 + }
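The `REG_RESERVED` mask in the new perf_regs.c above rejects any user-requested `sample_regs_user` bit beyond the last defined register. A standalone sketch of that validation logic; the register count here is illustrative, since the real `PERF_REG_POWERPC_MAX` is defined by the uapi perf_regs header:

```c
#include <stdint.h>
#include <errno.h>

/* Illustrative count; the real value is PERF_REG_POWERPC_MAX from the
 * uapi header added by this series. */
#define NUM_REGS 45

/* All mask bits at or above NUM_REGS are reserved. */
#define REG_RESERVED (~((1ULL << NUM_REGS) - 1))

/* Mirrors perf_reg_validate(): the mask must name at least one register
 * and must not name any reserved (undefined) register. */
static int validate_regs_mask(uint64_t mask)
{
    if (!mask || (mask & REG_RESERVED))
        return -EINVAL;
    return 0;
}
```

`perf_reg_value()` then only has to bounds-check the index once, because a mask that passed validation can never index past `pt_regs_offset[]`.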
+40
arch/powerpc/perf/power8-events-list.h
··· 49 49 EVENT(PM_DTLB_MISS, 0x300fc) 50 50 /* ITLB Reloaded */ 51 51 EVENT(PM_ITLB_MISS, 0x400fc) 52 + /* Run_Instructions */ 53 + EVENT(PM_RUN_INST_CMPL, 0x500fa) 54 + /* Alternate event code for PM_RUN_INST_CMPL */ 55 + EVENT(PM_RUN_INST_CMPL_ALT, 0x400fa) 56 + /* Run_cycles */ 57 + EVENT(PM_RUN_CYC, 0x600f4) 58 + /* Alternate event code for Run_cycles */ 59 + EVENT(PM_RUN_CYC_ALT, 0x200f4) 60 + /* Marked store completed */ 61 + EVENT(PM_MRK_ST_CMPL, 0x10134) 62 + /* Alternate event code for Marked store completed */ 63 + EVENT(PM_MRK_ST_CMPL_ALT, 0x301e2) 64 + /* Marked two path branch */ 65 + EVENT(PM_BR_MRK_2PATH, 0x10138) 66 + /* Alternate event code for PM_BR_MRK_2PATH */ 67 + EVENT(PM_BR_MRK_2PATH_ALT, 0x40138) 68 + /* L3 castouts in Mepf state */ 69 + EVENT(PM_L3_CO_MEPF, 0x18082) 70 + /* Alternate event code for PM_L3_CO_MEPF */ 71 + EVENT(PM_L3_CO_MEPF_ALT, 0x3e05e) 72 + /* Data cache was reloaded from a location other than L2 due to a marked load */ 73 + EVENT(PM_MRK_DATA_FROM_L2MISS, 0x1d14e) 74 + /* Alternate event code for PM_MRK_DATA_FROM_L2MISS */ 75 + EVENT(PM_MRK_DATA_FROM_L2MISS_ALT, 0x401e8) 76 + /* Alternate event code for PM_CMPLU_STALL */ 77 + EVENT(PM_CMPLU_STALL_ALT, 0x1e054) 78 + /* Two path branch */ 79 + EVENT(PM_BR_2PATH, 0x20036) 80 + /* Alternate event code for PM_BR_2PATH */ 81 + EVENT(PM_BR_2PATH_ALT, 0x40036) 82 + /* # PPC Dispatched */ 83 + EVENT(PM_INST_DISP, 0x200f2) 84 + /* Alternate event code for PM_INST_DISP */ 85 + EVENT(PM_INST_DISP_ALT, 0x300f2) 86 + /* Marked filter Match */ 87 + EVENT(PM_MRK_FILT_MATCH, 0x2013c) 88 + /* Alternate event code for PM_MRK_FILT_MATCH */ 89 + EVENT(PM_MRK_FILT_MATCH_ALT, 0x3012e) 90 + /* Alternate event code for PM_LD_MISS_L1 */ 91 + EVENT(PM_LD_MISS_L1_ALT, 0x400f0)
+21 -20
arch/powerpc/perf/power8-pmu.c
··· 274 274 /* Ignore Linux defined bits when checking event below */ 275 275 base_event = event & ~EVENT_LINUX_MASK; 276 276 277 - if (pmc >= 5 && base_event != 0x500fa && base_event != 0x600f4) 277 + if (pmc >= 5 && base_event != PM_RUN_INST_CMPL && 278 + base_event != PM_RUN_CYC) 278 279 return -1; 279 280 280 281 mask |= CNST_PMC_MASK(pmc); ··· 489 488 490 489 /* Table of alternatives, sorted by column 0 */ 491 490 static const unsigned int event_alternatives[][MAX_ALT] = { 492 - { 0x10134, 0x301e2 }, /* PM_MRK_ST_CMPL */ 493 - { 0x10138, 0x40138 }, /* PM_BR_MRK_2PATH */ 494 - { 0x18082, 0x3e05e }, /* PM_L3_CO_MEPF */ 495 - { 0x1d14e, 0x401e8 }, /* PM_MRK_DATA_FROM_L2MISS */ 496 - { 0x1e054, 0x4000a }, /* PM_CMPLU_STALL */ 497 - { 0x20036, 0x40036 }, /* PM_BR_2PATH */ 498 - { 0x200f2, 0x300f2 }, /* PM_INST_DISP */ 499 - { 0x200f4, 0x600f4 }, /* PM_RUN_CYC */ 500 - { 0x2013c, 0x3012e }, /* PM_MRK_FILT_MATCH */ 501 - { 0x3e054, 0x400f0 }, /* PM_LD_MISS_L1 */ 502 - { 0x400fa, 0x500fa }, /* PM_RUN_INST_CMPL */ 491 + { PM_MRK_ST_CMPL, PM_MRK_ST_CMPL_ALT }, 492 + { PM_BR_MRK_2PATH, PM_BR_MRK_2PATH_ALT }, 493 + { PM_L3_CO_MEPF, PM_L3_CO_MEPF_ALT }, 494 + { PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L2MISS_ALT }, 495 + { PM_CMPLU_STALL_ALT, PM_CMPLU_STALL }, 496 + { PM_BR_2PATH, PM_BR_2PATH_ALT }, 497 + { PM_INST_DISP, PM_INST_DISP_ALT }, 498 + { PM_RUN_CYC_ALT, PM_RUN_CYC }, 499 + { PM_MRK_FILT_MATCH, PM_MRK_FILT_MATCH_ALT }, 500 + { PM_LD_MISS_L1, PM_LD_MISS_L1_ALT }, 501 + { PM_RUN_INST_CMPL_ALT, PM_RUN_INST_CMPL }, 503 502 }; 504 503 505 504 /* ··· 547 546 j = num_alt; 548 547 for (i = 0; i < num_alt; ++i) { 549 548 switch (alt[i]) { 550 - case 0x1e: /* PM_CYC */ 551 - alt[j++] = 0x600f4; /* PM_RUN_CYC */ 549 + case PM_CYC: 550 + alt[j++] = PM_RUN_CYC; 552 551 break; 553 - case 0x600f4: /* PM_RUN_CYC */ 554 - alt[j++] = 0x1e; 552 + case PM_RUN_CYC: 553 + alt[j++] = PM_CYC; 555 554 break; 556 - case 0x2: /* PM_PPC_CMPL */ 557 - alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL 
*/ 555 + case PM_INST_CMPL: 556 + alt[j++] = PM_RUN_INST_CMPL; 558 557 break; 559 - case 0x500fa: /* PM_RUN_INST_CMPL */ 560 - alt[j++] = 0x2; /* PM_PPC_CMPL */ 558 + case PM_RUN_INST_CMPL: 559 + alt[j++] = PM_INST_CMPL; 561 560 break; 562 561 } 563 562 }
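The hunk above replaces raw event codes in the alternatives table with named constants; each row pairs an event with its equivalent encoding on another PMC column, sorted by column 0. A minimal standalone sketch of how such a pair table can be searched — note this uses a linear scan and illustrative helper names, whereas the kernel's own lookup differs:

```c
#include <assert.h>
#include <stddef.h>

/* Event codes as shown in the diff above (P8 raw encodings). */
#define PM_RUN_CYC        0x600f4
#define PM_RUN_CYC_ALT    0x200f4
#define PM_LD_MISS_L1     0x3e054
#define PM_LD_MISS_L1_ALT 0x400f0

/* Table sorted by column 0, mirroring event_alternatives[][]. */
static const unsigned int alt_table[][2] = {
	{ PM_RUN_CYC_ALT, PM_RUN_CYC },
	{ PM_LD_MISS_L1, PM_LD_MISS_L1_ALT },
};

#define ALT_ROWS (sizeof(alt_table) / sizeof(alt_table[0]))

/* Return the row whose first column matches `event`, or -1. */
static int find_alternative(unsigned int event)
{
	size_t i;

	for (i = 0; i < ALT_ROWS; i++) {
		if (alt_table[i][0] == event)
			return (int)i;
	}
	return -1;
}
```

Keeping the table sorted by column 0 is what allows the real kernel code to use a faster search than the linear scan sketched here.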
+10 -1
arch/powerpc/platforms/Kconfig.cputype
··· 72 72 select PPC_FPU 73 73 select PPC_HAVE_PMU_SUPPORT 74 74 select SYS_SUPPORTS_HUGETLBFS 75 - select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES 75 + select HAVE_ARCH_TRANSPARENT_HUGEPAGE 76 76 select ARCH_SUPPORTS_NUMA_BALANCING 77 77 select IRQ_WORK 78 78 ··· 330 330 config PPC_STD_MMU_64 331 331 def_bool y 332 332 depends on PPC_STD_MMU && PPC64 333 + 334 + config PPC_RADIX_MMU 335 + bool "Radix MMU Support" 336 + depends on PPC_BOOK3S_64 337 + default y 338 + help 339 + Enable support for the Power ISA 3.0 Radix style MMU. Currently this 340 + is only implemented by IBM Power9 CPUs, if you don't have one of them 341 + you can probably disable this. 333 342 334 343 config PPC_MMU_NOHASH 335 344 def_bool y
+3 -6
arch/powerpc/platforms/cell/spu_base.c
··· 24 24 25 25 #include <linux/interrupt.h> 26 26 #include <linux/list.h> 27 - #include <linux/module.h> 27 + #include <linux/init.h> 28 28 #include <linux/ptrace.h> 29 29 #include <linux/slab.h> 30 30 #include <linux/wait.h> ··· 197 197 (REGION_ID(ea) != USER_REGION_ID)) { 198 198 199 199 spin_unlock(&spu->register_lock); 200 - ret = hash_page(ea, _PAGE_PRESENT, 0x300, dsisr); 200 + ret = hash_page(ea, _PAGE_PRESENT | _PAGE_READ, 0x300, dsisr); 201 201 spin_lock(&spu->register_lock); 202 202 203 203 if (!ret) { ··· 805 805 out: 806 806 return ret; 807 807 } 808 - module_init(init_spu_base); 809 - 810 - MODULE_LICENSE("GPL"); 811 - MODULE_AUTHOR("Arnd Bergmann <arndb@de.ibm.com>"); 808 + device_initcall(init_spu_base);
+2 -2
arch/powerpc/platforms/cell/spufs/fault.c
··· 141 141 /* we must not hold the lock when entering copro_handle_mm_fault */ 142 142 spu_release(ctx); 143 143 144 - access = (_PAGE_PRESENT | _PAGE_USER); 145 - access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_RW : 0UL; 144 + access = (_PAGE_PRESENT | _PAGE_READ); 145 + access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_WRITE : 0UL; 146 146 local_irq_save(flags); 147 147 ret = hash_page(ea, access, 0x300, dsisr); 148 148 local_irq_restore(flags);
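The spufs fault hunk above switches the access mask from the old `_PAGE_USER`/`_PAGE_RW` pair to `_PAGE_READ`/`_PAGE_WRITE`. A self-contained sketch of that flag computation — the bit values below are illustrative placeholders, not the real powerpc pgtable definitions:

```c
#include <assert.h>

/* Illustrative bit values; the real ones live in the powerpc
 * pgtable headers and differ between MMU variants. */
#define _PAGE_PRESENT        0x001UL
#define _PAGE_READ           0x004UL
#define _PAGE_WRITE          0x008UL
#define MFC_DSISR_ACCESS_PUT 0x040UL

/* Build the hash_page() access mask from the MFC DSISR: every
 * fault needs present+read; a PUT (store) also needs write. */
static unsigned long spufs_access_flags(unsigned long dsisr)
{
	unsigned long access = _PAGE_PRESENT | _PAGE_READ;

	access |= (dsisr & MFC_DSISR_ACCESS_PUT) ? _PAGE_WRITE : 0UL;
	return access;
}
```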
+32 -37
arch/powerpc/platforms/powernv/eeh-powernv.c
··· 75 75 * and P7IOC separately. So we should regard 76 76 * PE#0 as valid for PHB3 and P7IOC. 77 77 */ 78 - if (phb->ioda.reserved_pe != 0) 78 + if (phb->ioda.reserved_pe_idx != 0) 79 79 eeh_add_flag(EEH_VALID_PE_ZERO); 80 80 81 81 break; ··· 1009 1009 static int pnv_eeh_reset(struct eeh_pe *pe, int option) 1010 1010 { 1011 1011 struct pci_controller *hose = pe->phb; 1012 + struct pnv_phb *phb; 1012 1013 struct pci_bus *bus; 1013 - int ret; 1014 + int64_t rc; 1014 1015 1015 1016 /* 1016 1017 * For PHB reset, we always have complete reset. For those PEs whose ··· 1027 1026 * reset. The side effect is that EEH core has to clear the frozen 1028 1027 * state explicitly after BAR restore. 1029 1028 */ 1030 - if (pe->type & EEH_PE_PHB) { 1031 - ret = pnv_eeh_phb_reset(hose, option); 1032 - } else { 1033 - struct pnv_phb *phb; 1034 - s64 rc; 1029 + if (pe->type & EEH_PE_PHB) 1030 + return pnv_eeh_phb_reset(hose, option); 1035 1031 1036 - /* 1037 - * The frozen PE might be caused by PAPR error injection 1038 - * registers, which are expected to be cleared after hitting 1039 - * frozen PE as stated in the hardware spec. Unfortunately, 1040 - * that's not true on P7IOC. So we have to clear it manually 1041 - * to avoid recursive EEH errors during recovery. 1042 - */ 1043 - phb = hose->private_data; 1044 - if (phb->model == PNV_PHB_MODEL_P7IOC && 1045 - (option == EEH_RESET_HOT || 1046 - option == EEH_RESET_FUNDAMENTAL)) { 1047 - rc = opal_pci_reset(phb->opal_id, 1048 - OPAL_RESET_PHB_ERROR, 1049 - OPAL_ASSERT_RESET); 1050 - if (rc != OPAL_SUCCESS) { 1051 - pr_warn("%s: Failure %lld clearing " 1052 - "error injection registers\n", 1053 - __func__, rc); 1054 - return -EIO; 1055 - } 1032 + /* 1033 + * The frozen PE might be caused by PAPR error injection 1034 + * registers, which are expected to be cleared after hitting 1035 + * frozen PE as stated in the hardware spec. Unfortunately, 1036 + * that's not true on P7IOC. 
So we have to clear it manually 1037 + * to avoid recursive EEH errors during recovery. 1038 + */ 1039 + phb = hose->private_data; 1040 + if (phb->model == PNV_PHB_MODEL_P7IOC && 1041 + (option == EEH_RESET_HOT || 1042 + option == EEH_RESET_FUNDAMENTAL)) { 1043 + rc = opal_pci_reset(phb->opal_id, 1044 + OPAL_RESET_PHB_ERROR, 1045 + OPAL_ASSERT_RESET); 1046 + if (rc != OPAL_SUCCESS) { 1047 + pr_warn("%s: Failure %lld clearing error injection registers\n", 1048 + __func__, rc); 1049 + return -EIO; 1056 1050 } 1057 - 1058 - bus = eeh_pe_bus_get(pe); 1059 - if (pe->type & EEH_PE_VF) 1060 - ret = pnv_eeh_reset_vf_pe(pe, option); 1061 - else if (pci_is_root_bus(bus) || 1062 - pci_is_root_bus(bus->parent)) 1063 - ret = pnv_eeh_root_reset(hose, option); 1064 - else 1065 - ret = pnv_eeh_bridge_reset(bus->self, option); 1066 1051 } 1067 1052 1068 - return ret; 1053 + bus = eeh_pe_bus_get(pe); 1054 + if (pe->type & EEH_PE_VF) 1055 + return pnv_eeh_reset_vf_pe(pe, option); 1056 + 1057 + if (pci_is_root_bus(bus) || 1058 + pci_is_root_bus(bus->parent)) 1059 + return pnv_eeh_root_reset(hose, option); 1060 + 1061 + return pnv_eeh_bridge_reset(bus->self, option); 1069 1062 } 1070 1063 1071 1064 /**
+152 -139
arch/powerpc/platforms/powernv/npu-dma.c
··· 12 12 #include <linux/export.h> 13 13 #include <linux/pci.h> 14 14 #include <linux/memblock.h> 15 + #include <linux/iommu.h> 15 16 16 17 #include <asm/iommu.h> 17 18 #include <asm/pnv-pci.h> ··· 26 25 * Other types of TCE cache invalidation are not functional in the 27 26 * hardware. 28 27 */ 29 - #define TCE_KILL_INVAL_ALL PPC_BIT(0) 30 - 31 28 static struct pci_dev *get_pci_dev(struct device_node *dn) 32 29 { 33 30 return PCI_DN(dn)->pcidev; ··· 137 138 struct pnv_ioda_pe *pe; 138 139 struct pci_dn *pdn; 139 140 140 - if (npe->flags & PNV_IODA_PE_PEER) { 141 - pe = npe->peers[0]; 142 - pdev = pe->pdev; 143 - } else { 144 - pdev = pnv_pci_get_gpu_dev(npe->pdev); 145 - if (!pdev) 146 - return NULL; 141 + pdev = pnv_pci_get_gpu_dev(npe->pdev); 142 + if (!pdev) 143 + return NULL; 147 144 148 - pdn = pci_get_pdn(pdev); 149 - if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) 150 - return NULL; 145 + pdn = pci_get_pdn(pdev); 146 + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) 147 + return NULL; 151 148 152 - hose = pci_bus_to_host(pdev->bus); 153 - phb = hose->private_data; 154 - pe = &phb->ioda.pe_array[pdn->pe_number]; 155 - } 149 + hose = pci_bus_to_host(pdev->bus); 150 + phb = hose->private_data; 151 + pe = &phb->ioda.pe_array[pdn->pe_number]; 156 152 157 153 if (gpdev) 158 154 *gpdev = pdev; ··· 155 161 return pe; 156 162 } 157 163 158 - void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe) 164 + long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, 165 + struct iommu_table *tbl) 159 166 { 160 167 struct pnv_phb *phb = npe->phb; 168 + int64_t rc; 169 + const unsigned long size = tbl->it_indirect_levels ? 
170 + tbl->it_level_size : tbl->it_size; 171 + const __u64 start_addr = tbl->it_offset << tbl->it_page_shift; 172 + const __u64 win_size = tbl->it_size << tbl->it_page_shift; 161 173 162 - if (WARN_ON(phb->type != PNV_PHB_NPU || 163 - !phb->ioda.tce_inval_reg || 164 - !(npe->flags & PNV_IODA_PE_DEV))) 165 - return; 174 + pe_info(npe, "Setting up window %llx..%llx pg=%lx\n", 175 + start_addr, start_addr + win_size - 1, 176 + IOMMU_PAGE_SIZE(tbl)); 166 177 167 - mb(); /* Ensure previous TCE table stores are visible */ 168 - __raw_writeq(cpu_to_be64(TCE_KILL_INVAL_ALL), 169 - phb->ioda.tce_inval_reg); 170 - } 171 - 172 - void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe, 173 - struct iommu_table *tbl, 174 - unsigned long index, 175 - unsigned long npages, 176 - bool rm) 177 - { 178 - struct pnv_phb *phb = npe->phb; 179 - 180 - /* We can only invalidate the whole cache on NPU */ 181 - unsigned long val = TCE_KILL_INVAL_ALL; 182 - 183 - if (WARN_ON(phb->type != PNV_PHB_NPU || 184 - !phb->ioda.tce_inval_reg || 185 - !(npe->flags & PNV_IODA_PE_DEV))) 186 - return; 187 - 188 - mb(); /* Ensure previous TCE table stores are visible */ 189 - if (rm) 190 - __raw_rm_writeq(cpu_to_be64(val), 191 - (__be64 __iomem *) phb->ioda.tce_inval_reg_phys); 192 - else 193 - __raw_writeq(cpu_to_be64(val), 194 - phb->ioda.tce_inval_reg); 195 - } 196 - 197 - void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe) 198 - { 199 - struct pnv_ioda_pe *gpe; 200 - struct pci_dev *gpdev; 201 - int i, avail = -1; 202 - 203 - if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV)) 204 - return; 205 - 206 - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); 207 - if (!gpe) 208 - return; 209 - 210 - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { 211 - /* Nothing to do if the PE is already connected. 
*/ 212 - if (gpe->peers[i] == npe) 213 - return; 214 - 215 - if (!gpe->peers[i]) 216 - avail = i; 178 + rc = opal_pci_map_pe_dma_window(phb->opal_id, 179 + npe->pe_number, 180 + npe->pe_number, 181 + tbl->it_indirect_levels + 1, 182 + __pa(tbl->it_base), 183 + size << 3, 184 + IOMMU_PAGE_SIZE(tbl)); 185 + if (rc) { 186 + pe_err(npe, "Failed to configure TCE table, err %lld\n", rc); 187 + return rc; 217 188 } 189 + pnv_pci_ioda2_tce_invalidate_entire(phb, false); 218 190 219 - if (WARN_ON(avail < 0)) 220 - return; 191 + /* Add the table to the list so its TCE cache will get invalidated */ 192 + pnv_pci_link_table_and_group(phb->hose->node, num, 193 + tbl, &npe->table_group); 221 194 222 - gpe->peers[avail] = npe; 223 - gpe->flags |= PNV_IODA_PE_PEER; 195 + return 0; 196 + } 224 197 225 - /* 226 - * We assume that the NPU devices only have a single peer PE 227 - * (the GPU PCIe device PE). 228 - */ 229 - npe->peers[0] = gpe; 230 - npe->flags |= PNV_IODA_PE_PEER; 198 + long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num) 199 + { 200 + struct pnv_phb *phb = npe->phb; 201 + int64_t rc; 202 + 203 + pe_info(npe, "Removing DMA window\n"); 204 + 205 + rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, 206 + npe->pe_number, 207 + 0/* levels */, 0/* table address */, 208 + 0/* table size */, 0/* page size */); 209 + if (rc) { 210 + pe_err(npe, "Unmapping failed, ret = %lld\n", rc); 211 + return rc; 212 + } 213 + pnv_pci_ioda2_tce_invalidate_entire(phb, false); 214 + 215 + pnv_pci_unlink_table_and_group(npe->table_group.tables[num], 216 + &npe->table_group); 217 + 218 + return 0; 231 219 } 232 220 233 221 /* 234 - * For the NPU we want to point the TCE table at the same table as the 235 - * real PCI device. 222 + * Enables 32 bit DMA on NPU. 
236 223 */ 237 - static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe) 224 + static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe) 238 225 { 239 - struct pnv_phb *phb = npe->phb; 240 226 struct pci_dev *gpdev; 241 227 struct pnv_ioda_pe *gpe; 242 - void *addr; 243 - unsigned int size; 244 228 int64_t rc; 245 229 246 230 /* ··· 232 260 if (!gpe) 233 261 return; 234 262 235 - addr = (void *)gpe->table_group.tables[0]->it_base; 236 - size = gpe->table_group.tables[0]->it_size << 3; 237 - rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number, 238 - npe->pe_number, 1, __pa(addr), 239 - size, 0x1000); 240 - if (rc != OPAL_SUCCESS) 241 - pr_warn("%s: Error %lld setting DMA window on PHB#%d-PE#%d\n", 242 - __func__, rc, phb->hose->global_number, npe->pe_number); 263 + rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]); 243 264 244 265 /* 245 266 * We don't initialise npu_pe->tce32_table as we always use ··· 242 277 } 243 278 244 279 /* 245 - * Enable/disable bypass mode on the NPU. The NPU only supports one 280 + * Enables bypass mode on the NPU. The NPU only supports one 246 281 * window per link, so bypass needs to be explicitly enabled or 247 282 * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be 248 283 * active at the same time. 
249 284 */ 250 - int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enable) 285 + static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe) 251 286 { 252 287 struct pnv_phb *phb = npe->phb; 253 288 int64_t rc = 0; 289 + phys_addr_t top = memblock_end_of_DRAM(); 254 290 255 291 if (phb->type != PNV_PHB_NPU || !npe->pdev) 256 292 return -EINVAL; 257 293 258 - if (enable) { 259 - /* Enable the bypass window */ 260 - phys_addr_t top = memblock_end_of_DRAM(); 294 + rc = pnv_npu_unset_window(npe, 0); 295 + if (rc != OPAL_SUCCESS) 296 + return rc; 261 297 262 - npe->tce_bypass_base = 0; 263 - top = roundup_pow_of_two(top); 264 - dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n", 265 - npe->pe_number); 266 - rc = opal_pci_map_pe_dma_window_real(phb->opal_id, 267 - npe->pe_number, npe->pe_number, 268 - npe->tce_bypass_base, top); 269 - } else { 270 - /* 271 - * Disable the bypass window by replacing it with the 272 - * TCE32 window. 273 - */ 274 - pnv_npu_disable_bypass(npe); 275 - } 298 + /* Enable the bypass window */ 299 + 300 + top = roundup_pow_of_two(top); 301 + dev_info(&npe->pdev->dev, "Enabling bypass for PE %d\n", 302 + npe->pe_number); 303 + rc = opal_pci_map_pe_dma_window_real(phb->opal_id, 304 + npe->pe_number, npe->pe_number, 305 + 0 /* bypass base */, top); 306 + 307 + if (rc == OPAL_SUCCESS) 308 + pnv_pci_ioda2_tce_invalidate_entire(phb, false); 276 309 277 310 return rc; 278 311 } 279 312 280 - int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask) 313 + void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass) 281 314 { 282 - struct pci_controller *hose = pci_bus_to_host(npdev->bus); 283 - struct pnv_phb *phb = hose->private_data; 284 - struct pci_dn *pdn = pci_get_pdn(npdev); 285 - struct pnv_ioda_pe *npe, *gpe; 286 - struct pci_dev *gpdev; 287 - uint64_t top; 288 - bool bypass = false; 315 + int i; 316 + struct pnv_phb *phb; 317 + struct pci_dn *pdn; 318 + struct pnv_ioda_pe *npe; 319 + struct pci_dev *npdev; 289 320 290 - 
if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) 291 - return -ENXIO; 321 + for (i = 0; ; ++i) { 322 + npdev = pnv_pci_get_npu_dev(gpdev, i); 292 323 293 - /* We only do bypass if it's enabled on the linked device */ 294 - npe = &phb->ioda.pe_array[pdn->pe_number]; 295 - gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); 296 - if (!gpe) 297 - return -ENODEV; 324 + if (!npdev) 325 + break; 298 326 299 - if (gpe->tce_bypass_enabled) { 300 - top = gpe->tce_bypass_base + memblock_end_of_DRAM() - 1; 301 - bypass = (dma_mask >= top); 327 + pdn = pci_get_pdn(npdev); 328 + if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) 329 + return; 330 + 331 + phb = pci_bus_to_host(npdev->bus)->private_data; 332 + 333 + /* We only do bypass if it's enabled on the linked device */ 334 + npe = &phb->ioda.pe_array[pdn->pe_number]; 335 + 336 + if (bypass) { 337 + dev_info(&npdev->dev, 338 + "Using 64-bit DMA iommu bypass\n"); 339 + pnv_npu_dma_set_bypass(npe); 340 + } else { 341 + dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); 342 + pnv_npu_dma_set_32(npe); 343 + } 344 + } 345 + } 346 + 347 + /* Switch ownership from platform code to external user (e.g. VFIO) */ 348 + void pnv_npu_take_ownership(struct pnv_ioda_pe *npe) 349 + { 350 + struct pnv_phb *phb = npe->phb; 351 + int64_t rc; 352 + 353 + /* 354 + * Note: NPU has just a single TVE in the hardware which means that 355 + * while used by the kernel, it can have either 32bit window or 356 + * DMA bypass but never both. So we deconfigure 32bit window only 357 + * if it was enabled at the moment of ownership change. 
358 + */ 359 + if (npe->table_group.tables[0]) { 360 + pnv_npu_unset_window(npe, 0); 361 + return; 302 362 } 303 363 304 - if (bypass) 305 - dev_info(&npdev->dev, "Using 64-bit DMA iommu bypass\n"); 306 - else 307 - dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n"); 364 + /* Disable bypass */ 365 + rc = opal_pci_map_pe_dma_window_real(phb->opal_id, 366 + npe->pe_number, npe->pe_number, 367 + 0 /* bypass base */, 0); 368 + if (rc) { 369 + pe_err(npe, "Failed to disable bypass, err %lld\n", rc); 370 + return; 371 + } 372 + pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); 373 + } 308 374 309 - pnv_npu_dma_set_bypass(npe, bypass); 310 - *npdev->dev.dma_mask = dma_mask; 375 + struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) 376 + { 377 + struct pnv_phb *phb = npe->phb; 378 + struct pci_bus *pbus = phb->hose->bus; 379 + struct pci_dev *npdev, *gpdev = NULL, *gptmp; 380 + struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev); 311 381 312 - return 0; 382 + if (!gpe || !gpdev) 383 + return NULL; 384 + 385 + list_for_each_entry(npdev, &pbus->devices, bus_list) { 386 + gptmp = pnv_pci_get_gpu_dev(npdev); 387 + 388 + if (gptmp != gpdev) 389 + continue; 390 + 391 + pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev)); 392 + iommu_group_add_device(gpe->table_group.group, &npdev->dev); 393 + } 394 + 395 + return gpe; 313 396 }
+5 -3
arch/powerpc/platforms/powernv/opal-hmi.c
··· 150 150 static void print_checkstop_reason(const char *level, 151 151 struct OpalHMIEvent *hmi_evt) 152 152 { 153 - switch (hmi_evt->u.xstop_error.xstop_type) { 153 + uint8_t type = hmi_evt->u.xstop_error.xstop_type; 154 + switch (type) { 154 155 case CHECKSTOP_TYPE_CORE: 155 156 print_core_checkstop_reason(level, hmi_evt); 156 157 break; 157 158 case CHECKSTOP_TYPE_NX: 158 159 print_nx_checkstop_reason(level, hmi_evt); 159 160 break; 160 - case CHECKSTOP_TYPE_UNKNOWN: 161 - printk("%s Unknown Malfunction Alert.\n", level); 161 + default: 162 + printk("%s Unknown Malfunction Alert of type %d\n", 163 + level, type); 162 164 break; 163 165 } 164 166 }
+575 -382
arch/powerpc/platforms/powernv/pci-ioda.c
··· 48 48 #include "powernv.h" 49 49 #include "pci.h" 50 50 51 - /* 256M DMA window, 4K TCE pages, 8 bytes TCE */ 52 - #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8) 51 + #define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */ 52 + #define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ 53 + #define PNV_IODA1_DMA32_SEGSIZE 0x10000000 53 54 54 55 #define POWERNV_IOMMU_DEFAULT_LEVELS 1 55 56 #define POWERNV_IOMMU_MAX_LEVELS 5 56 57 57 58 static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); 58 59 59 - static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, 60 + void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, 60 61 const char *fmt, ...) 61 62 { 62 63 struct va_format vaf; ··· 88 87 va_end(args); 89 88 } 90 89 91 - #define pe_err(pe, fmt, ...) \ 92 - pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__) 93 - #define pe_warn(pe, fmt, ...) \ 94 - pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__) 95 - #define pe_info(pe, fmt, ...) \ 96 - pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__) 97 - 98 90 static bool pnv_iommu_bypass_disabled __read_mostly; 99 91 100 92 static int __init iommu_setup(char *str) ··· 116 122 (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH)); 117 123 } 118 124 125 + static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no) 126 + { 127 + phb->ioda.pe_array[pe_no].phb = phb; 128 + phb->ioda.pe_array[pe_no].pe_number = pe_no; 129 + 130 + return &phb->ioda.pe_array[pe_no]; 131 + } 132 + 119 133 static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no) 120 134 { 121 - if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) { 135 + if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe_num)) { 122 136 pr_warn("%s: Invalid PE %d on PHB#%x\n", 123 137 __func__, pe_no, phb->hose->global_number); 124 138 return; ··· 136 134 pr_debug("%s: PE %d was reserved on PHB#%x\n", 137 135 __func__, pe_no, phb->hose->global_number); 138 136 139 - phb->ioda.pe_array[pe_no].phb = phb; 140 - 
phb->ioda.pe_array[pe_no].pe_number = pe_no; 137 + pnv_ioda_init_pe(phb, pe_no); 141 138 } 142 139 143 - static int pnv_ioda_alloc_pe(struct pnv_phb *phb) 140 + static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb) 144 141 { 145 142 unsigned long pe; 146 143 147 144 do { 148 145 pe = find_next_zero_bit(phb->ioda.pe_alloc, 149 - phb->ioda.total_pe, 0); 150 - if (pe >= phb->ioda.total_pe) 151 - return IODA_INVALID_PE; 146 + phb->ioda.total_pe_num, 0); 147 + if (pe >= phb->ioda.total_pe_num) 148 + return NULL; 152 149 } while(test_and_set_bit(pe, phb->ioda.pe_alloc)); 153 150 154 - phb->ioda.pe_array[pe].phb = phb; 155 - phb->ioda.pe_array[pe].pe_number = pe; 156 - return pe; 151 + return pnv_ioda_init_pe(phb, pe); 157 152 } 158 153 159 - static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe) 154 + static void pnv_ioda_free_pe(struct pnv_ioda_pe *pe) 160 155 { 161 - WARN_ON(phb->ioda.pe_array[pe].pdev); 156 + struct pnv_phb *phb = pe->phb; 162 157 163 - memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe)); 164 - clear_bit(pe, phb->ioda.pe_alloc); 158 + WARN_ON(pe->pdev); 159 + 160 + memset(pe, 0, sizeof(struct pnv_ioda_pe)); 161 + clear_bit(pe->pe_number, phb->ioda.pe_alloc); 165 162 } 166 163 167 164 /* The default M64 BAR is shared by all PEs */ ··· 200 199 * expected to be 0 or last one of PE capabicity. 
201 200 */ 202 201 r = &phb->hose->mem_resources[1]; 203 - if (phb->ioda.reserved_pe == 0) 202 + if (phb->ioda.reserved_pe_idx == 0) 204 203 r->start += phb->ioda.m64_segsize; 205 - else if (phb->ioda.reserved_pe == (phb->ioda.total_pe - 1)) 204 + else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) 206 205 r->end -= phb->ioda.m64_segsize; 207 206 else 208 207 pr_warn(" Cannot strip M64 segment for reserved PE#%d\n", 209 - phb->ioda.reserved_pe); 208 + phb->ioda.reserved_pe_idx); 210 209 211 210 return 0; 212 211 ··· 220 219 return -EIO; 221 220 } 222 221 223 - static void pnv_ioda2_reserve_dev_m64_pe(struct pci_dev *pdev, 222 + static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev, 224 223 unsigned long *pe_bitmap) 225 224 { 226 225 struct pci_controller *hose = pci_bus_to_host(pdev->bus); ··· 247 246 } 248 247 } 249 248 250 - static void pnv_ioda2_reserve_m64_pe(struct pci_bus *bus, 251 - unsigned long *pe_bitmap, 252 - bool all) 249 + static int pnv_ioda1_init_m64(struct pnv_phb *phb) 250 + { 251 + struct resource *r; 252 + int index; 253 + 254 + /* 255 + * There are 16 M64 BARs, each of which has 8 segments. So 256 + * there are as many M64 segments as the maximum number of 257 + * PEs, which is 128. 
258 + */ 259 + for (index = 0; index < PNV_IODA1_M64_NUM; index++) { 260 + unsigned long base, segsz = phb->ioda.m64_segsize; 261 + int64_t rc; 262 + 263 + base = phb->ioda.m64_base + 264 + index * PNV_IODA1_M64_SEGS * segsz; 265 + rc = opal_pci_set_phb_mem_window(phb->opal_id, 266 + OPAL_M64_WINDOW_TYPE, index, base, 0, 267 + PNV_IODA1_M64_SEGS * segsz); 268 + if (rc != OPAL_SUCCESS) { 269 + pr_warn(" Error %lld setting M64 PHB#%d-BAR#%d\n", 270 + rc, phb->hose->global_number, index); 271 + goto fail; 272 + } 273 + 274 + rc = opal_pci_phb_mmio_enable(phb->opal_id, 275 + OPAL_M64_WINDOW_TYPE, index, 276 + OPAL_ENABLE_M64_SPLIT); 277 + if (rc != OPAL_SUCCESS) { 278 + pr_warn(" Error %lld enabling M64 PHB#%d-BAR#%d\n", 279 + rc, phb->hose->global_number, index); 280 + goto fail; 281 + } 282 + } 283 + 284 + /* 285 + * Exclude the segment used by the reserved PE, which 286 + * is expected to be 0 or last supported PE#. 287 + */ 288 + r = &phb->hose->mem_resources[1]; 289 + if (phb->ioda.reserved_pe_idx == 0) 290 + r->start += phb->ioda.m64_segsize; 291 + else if (phb->ioda.reserved_pe_idx == (phb->ioda.total_pe_num - 1)) 292 + r->end -= phb->ioda.m64_segsize; 293 + else 294 + WARN(1, "Wrong reserved PE#%d on PHB#%d\n", 295 + phb->ioda.reserved_pe_idx, phb->hose->global_number); 296 + 297 + return 0; 298 + 299 + fail: 300 + for ( ; index >= 0; index--) 301 + opal_pci_phb_mmio_enable(phb->opal_id, 302 + OPAL_M64_WINDOW_TYPE, index, OPAL_DISABLE_M64); 303 + 304 + return -EIO; 305 + } 306 + 307 + static void pnv_ioda_reserve_m64_pe(struct pci_bus *bus, 308 + unsigned long *pe_bitmap, 309 + bool all) 253 310 { 254 311 struct pci_dev *pdev; 255 312 256 313 list_for_each_entry(pdev, &bus->devices, bus_list) { 257 - pnv_ioda2_reserve_dev_m64_pe(pdev, pe_bitmap); 314 + pnv_ioda_reserve_dev_m64_pe(pdev, pe_bitmap); 258 315 259 316 if (all && pdev->subordinate) 260 - pnv_ioda2_reserve_m64_pe(pdev->subordinate, 261 - pe_bitmap, all); 317 + 
pnv_ioda_reserve_m64_pe(pdev->subordinate, 318 + pe_bitmap, all); 262 319 } 263 320 } 264 321 265 - static int pnv_ioda2_pick_m64_pe(struct pci_bus *bus, bool all) 322 + static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all) 266 323 { 267 324 struct pci_controller *hose = pci_bus_to_host(bus); 268 325 struct pnv_phb *phb = hose->private_data; ··· 330 271 331 272 /* Root bus shouldn't use M64 */ 332 273 if (pci_is_root_bus(bus)) 333 - return IODA_INVALID_PE; 274 + return NULL; 334 275 335 276 /* Allocate bitmap */ 336 - size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long)); 277 + size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long)); 337 278 pe_alloc = kzalloc(size, GFP_KERNEL); 338 279 if (!pe_alloc) { 339 280 pr_warn("%s: Out of memory !\n", 340 281 __func__); 341 - return IODA_INVALID_PE; 282 + return NULL; 342 283 } 343 284 344 285 /* Figure out reserved PE numbers by the PE */ 345 - pnv_ioda2_reserve_m64_pe(bus, pe_alloc, all); 286 + pnv_ioda_reserve_m64_pe(bus, pe_alloc, all); 346 287 347 288 /* 348 289 * the current bus might not own M64 window and that's all 349 290 * contributed by its child buses. For the case, we needn't 350 291 * pick M64 dependent PE#. 
351 292 */ 352 - if (bitmap_empty(pe_alloc, phb->ioda.total_pe)) { 293 + if (bitmap_empty(pe_alloc, phb->ioda.total_pe_num)) { 353 294 kfree(pe_alloc); 354 - return IODA_INVALID_PE; 295 + return NULL; 355 296 } 356 297 357 298 /* ··· 360 301 */ 361 302 master_pe = NULL; 362 303 i = -1; 363 - while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe, i + 1)) < 364 - phb->ioda.total_pe) { 304 + while ((i = find_next_bit(pe_alloc, phb->ioda.total_pe_num, i + 1)) < 305 + phb->ioda.total_pe_num) { 365 306 pe = &phb->ioda.pe_array[i]; 366 307 308 + phb->ioda.m64_segmap[pe->pe_number] = pe->pe_number; 367 309 if (!master_pe) { 368 310 pe->flags |= PNV_IODA_PE_MASTER; 369 311 INIT_LIST_HEAD(&pe->slaves); ··· 374 314 pe->master = master_pe; 375 315 list_add_tail(&pe->list, &master_pe->slaves); 376 316 } 317 + 318 + /* 319 + * P7IOC supports M64DT, which helps mapping M64 segment 320 + * to one particular PE#. However, PHB3 has fixed mapping 321 + * between M64 segment and PE#. In order to have same logic 322 + * for P7IOC and PHB3, we enforce fixed mapping between M64 323 + * segment and PE# on P7IOC. 
324 + */ 325 + if (phb->type == PNV_PHB_IODA1) { 326 + int64_t rc; 327 + 328 + rc = opal_pci_map_pe_mmio_window(phb->opal_id, 329 + pe->pe_number, OPAL_M64_WINDOW_TYPE, 330 + pe->pe_number / PNV_IODA1_M64_SEGS, 331 + pe->pe_number % PNV_IODA1_M64_SEGS); 332 + if (rc != OPAL_SUCCESS) 333 + pr_warn("%s: Error %lld mapping M64 for PHB#%d-PE#%d\n", 334 + __func__, rc, phb->hose->global_number, 335 + pe->pe_number); 336 + } 377 337 } 378 338 379 339 kfree(pe_alloc); 380 - return master_pe->pe_number; 340 + return master_pe; 381 341 } 382 342 383 343 static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb) ··· 408 328 const u32 *r; 409 329 u64 pci_addr; 410 330 411 - /* FIXME: Support M64 for P7IOC */ 412 - if (phb->type != PNV_PHB_IODA2) { 331 + if (phb->type != PNV_PHB_IODA1 && phb->type != PNV_PHB_IODA2) { 413 332 pr_info(" Not support M64 window\n"); 414 333 return; 415 334 } ··· 434 355 hose->mem_offset[1] = res->start - pci_addr; 435 356 436 357 phb->ioda.m64_size = resource_size(res); 437 - phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe; 358 + phb->ioda.m64_segsize = phb->ioda.m64_size / phb->ioda.total_pe_num; 438 359 phb->ioda.m64_base = pci_addr; 439 360 440 361 pr_info(" MEM64 0x%016llx..0x%016llx -> 0x%016llx\n", ··· 442 363 443 364 /* Use last M64 BAR to cover M64 window */ 444 365 phb->ioda.m64_bar_idx = 15; 445 - phb->init_m64 = pnv_ioda2_init_m64; 446 - phb->reserve_m64_pe = pnv_ioda2_reserve_m64_pe; 447 - phb->pick_m64_pe = pnv_ioda2_pick_m64_pe; 366 + if (phb->type == PNV_PHB_IODA1) 367 + phb->init_m64 = pnv_ioda1_init_m64; 368 + else 369 + phb->init_m64 = pnv_ioda2_init_m64; 370 + phb->reserve_m64_pe = pnv_ioda_reserve_m64_pe; 371 + phb->pick_m64_pe = pnv_ioda_pick_m64_pe; 448 372 } 449 373 450 374 static void pnv_ioda_freeze_pe(struct pnv_phb *phb, int pe_no) ··· 538 456 s64 rc; 539 457 540 458 /* Sanity check on PE number */ 541 - if (pe_no < 0 || pe_no >= phb->ioda.total_pe) 459 + if (pe_no < 0 || pe_no >= 
phb->ioda.total_pe_num) 542 460 return OPAL_EEH_STOPPED_PERM_UNAVAIL; 543 461 544 462 /* ··· 890 808 return 0; 891 809 } 892 810 893 - static void pnv_ioda_link_pe_by_weight(struct pnv_phb *phb, 894 - struct pnv_ioda_pe *pe) 895 - { 896 - struct pnv_ioda_pe *lpe; 897 - 898 - list_for_each_entry(lpe, &phb->ioda.pe_dma_list, dma_link) { 899 - if (lpe->dma_weight < pe->dma_weight) { 900 - list_add_tail(&pe->dma_link, &lpe->dma_link); 901 - return; 902 - } 903 - } 904 - list_add_tail(&pe->dma_link, &phb->ioda.pe_dma_list); 905 - } 906 - 907 - static unsigned int pnv_ioda_dma_weight(struct pci_dev *dev) 908 - { 909 - /* This is quite simplistic. The "base" weight of a device 910 - * is 10. 0 means no DMA is to be accounted for it. 911 - */ 912 - 913 - /* If it's a bridge, no DMA */ 914 - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) 915 - return 0; 916 - 917 - /* Reduce the weight of slow USB controllers */ 918 - if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || 919 - dev->class == PCI_CLASS_SERIAL_USB_OHCI || 920 - dev->class == PCI_CLASS_SERIAL_USB_EHCI) 921 - return 3; 922 - 923 - /* Increase the weight of RAID (includes Obsidian) */ 924 - if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) 925 - return 15; 926 - 927 - /* Default */ 928 - return 10; 929 - } 930 - 931 811 #ifdef CONFIG_PCI_IOV 932 812 static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset) 933 813 { ··· 963 919 struct pnv_phb *phb = hose->private_data; 964 920 struct pci_dn *pdn = pci_get_pdn(dev); 965 921 struct pnv_ioda_pe *pe; 966 - int pe_num; 967 922 968 923 if (!pdn) { 969 924 pr_err("%s: Device tree node not associated properly\n", ··· 972 929 if (pdn->pe_number != IODA_INVALID_PE) 973 930 return NULL; 974 931 975 - pe_num = pnv_ioda_alloc_pe(phb); 976 - if (pe_num == IODA_INVALID_PE) { 932 + pe = pnv_ioda_alloc_pe(phb); 933 + if (!pe) { 977 934 pr_warning("%s: Not enough PE# available, disabling device\n", 978 935 pci_name(dev)); 979 936 return NULL; ··· 986 943 * 987 944 * At some 
point we want to remove the PDN completely anyways 988 945 */ 989 - pe = &phb->ioda.pe_array[pe_num]; 990 946 pci_dev_get(dev); 991 947 pdn->pcidev = dev; 992 - pdn->pe_number = pe_num; 948 + pdn->pe_number = pe->pe_number; 993 949 pe->flags = PNV_IODA_PE_DEV; 994 950 pe->pdev = dev; 995 951 pe->pbus = NULL; 996 - pe->tce32_seg = -1; 997 952 pe->mve_number = -1; 998 953 pe->rid = dev->bus->number << 8 | pdn->devfn; 999 954 ··· 999 958 1000 959 if (pnv_ioda_configure_pe(phb, pe)) { 1001 960 /* XXX What do we do here ? */ 1002 - if (pe_num) 1003 - pnv_ioda_free_pe(phb, pe_num); 961 + pnv_ioda_free_pe(pe); 1004 962 pdn->pe_number = IODA_INVALID_PE; 1005 963 pe->pdev = NULL; 1006 964 pci_dev_put(dev); 1007 965 return NULL; 1008 966 } 1009 967 1010 - /* Assign a DMA weight to the device */ 1011 - pe->dma_weight = pnv_ioda_dma_weight(dev); 1012 - if (pe->dma_weight != 0) { 1013 - phb->ioda.dma_weight += pe->dma_weight; 1014 - phb->ioda.dma_pe_count++; 1015 - } 1016 - 1017 - /* Link the PE */ 1018 - pnv_ioda_link_pe_by_weight(phb, pe); 968 + /* Put PE to the list */ 969 + list_add_tail(&pe->list, &phb->ioda.pe_list); 1019 970 1020 971 return pe; 1021 972 } ··· 1026 993 } 1027 994 pdn->pcidev = dev; 1028 995 pdn->pe_number = pe->pe_number; 1029 - pe->dma_weight += pnv_ioda_dma_weight(dev); 1030 996 if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate) 1031 997 pnv_ioda_setup_same_PE(dev->subordinate, pe); 1032 998 } ··· 1037 1005 * subordinate PCI devices and buses. The second type of PE is normally 1038 1006 * orgiriated by PCIe-to-PCI bridge or PLX switch downstream ports. 
1039 1007 */ 1040 - static void pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) 1008 + static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all) 1041 1009 { 1042 1010 struct pci_controller *hose = pci_bus_to_host(bus); 1043 1011 struct pnv_phb *phb = hose->private_data; 1044 - struct pnv_ioda_pe *pe; 1045 - int pe_num = IODA_INVALID_PE; 1012 + struct pnv_ioda_pe *pe = NULL; 1046 1013 1047 1014 /* Check if PE is determined by M64 */ 1048 1015 if (phb->pick_m64_pe) 1049 - pe_num = phb->pick_m64_pe(bus, all); 1016 + pe = phb->pick_m64_pe(bus, all); 1050 1017 1051 1018 /* The PE number isn't pinned by M64 */ 1052 - if (pe_num == IODA_INVALID_PE) 1053 - pe_num = pnv_ioda_alloc_pe(phb); 1019 + if (!pe) 1020 + pe = pnv_ioda_alloc_pe(phb); 1054 1021 1055 - if (pe_num == IODA_INVALID_PE) { 1022 + if (!pe) { 1056 1023 pr_warning("%s: Not enough PE# available for PCI bus %04x:%02x\n", 1057 1024 __func__, pci_domain_nr(bus), bus->number); 1058 - return; 1025 + return NULL; 1059 1026 } 1060 1027 1061 - pe = &phb->ioda.pe_array[pe_num]; 1062 1028 pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS); 1063 1029 pe->pbus = bus; 1064 1030 pe->pdev = NULL; 1065 - pe->tce32_seg = -1; 1066 1031 pe->mve_number = -1; 1067 1032 pe->rid = bus->busn_res.start << 8; 1068 - pe->dma_weight = 0; 1069 1033 1070 1034 if (all) 1071 1035 pe_info(pe, "Secondary bus %d..%d associated with PE#%d\n", 1072 - bus->busn_res.start, bus->busn_res.end, pe_num); 1036 + bus->busn_res.start, bus->busn_res.end, pe->pe_number); 1073 1037 else 1074 1038 pe_info(pe, "Secondary bus %d associated with PE#%d\n", 1075 - bus->busn_res.start, pe_num); 1039 + bus->busn_res.start, pe->pe_number); 1076 1040 1077 1041 if (pnv_ioda_configure_pe(phb, pe)) { 1078 1042 /* XXX What do we do here ? 
*/ 1079 - if (pe_num) 1080 - pnv_ioda_free_pe(phb, pe_num); 1043 + pnv_ioda_free_pe(pe); 1081 1044 pe->pbus = NULL; 1082 - return; 1045 + return NULL; 1083 1046 } 1084 1047 1085 1048 /* Associate it with all child devices */ ··· 1083 1056 /* Put PE to the list */ 1084 1057 list_add_tail(&pe->list, &phb->ioda.pe_list); 1085 1058 1086 - /* Account for one DMA PE if at least one DMA capable device exist 1087 - * below the bridge 1088 - */ 1089 - if (pe->dma_weight != 0) { 1090 - phb->ioda.dma_weight += pe->dma_weight; 1091 - phb->ioda.dma_pe_count++; 1092 - } 1093 - 1094 - /* Link the PE */ 1095 - pnv_ioda_link_pe_by_weight(phb, pe); 1059 + return pe; 1096 1060 } 1097 1061 1098 1062 static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev) ··· 1106 1088 * same GPU get assigned the same PE. 1107 1089 */ 1108 1090 gpu_pdev = pnv_pci_get_gpu_dev(npu_pdev); 1109 - for (pe_num = 0; pe_num < phb->ioda.total_pe; pe_num++) { 1091 + for (pe_num = 0; pe_num < phb->ioda.total_pe_num; pe_num++) { 1110 1092 pe = &phb->ioda.pe_array[pe_num]; 1111 1093 if (!pe->pdev) 1112 1094 continue; ··· 1124 1106 rid = npu_pdev->bus->number << 8 | npu_pdn->devfn; 1125 1107 npu_pdn->pcidev = npu_pdev; 1126 1108 npu_pdn->pe_number = pe_num; 1127 - pe->dma_weight += pnv_ioda_dma_weight(npu_pdev); 1128 1109 phb->ioda.pe_rmap[rid] = pe->pe_number; 1129 1110 1130 1111 /* Map the PE to this link */ ··· 1395 1378 1396 1379 pnv_ioda_deconfigure_pe(phb, pe); 1397 1380 1398 - pnv_ioda_free_pe(phb, pe->pe_number); 1381 + pnv_ioda_free_pe(pe); 1399 1382 } 1400 1383 } 1401 1384 ··· 1404 1387 struct pci_bus *bus; 1405 1388 struct pci_controller *hose; 1406 1389 struct pnv_phb *phb; 1390 + struct pnv_ioda_pe *pe; 1407 1391 struct pci_dn *pdn; 1408 1392 struct pci_sriov *iov; 1409 1393 u16 num_vfs, i; ··· 1429 1411 /* Release PE numbers */ 1430 1412 if (pdn->m64_single_mode) { 1431 1413 for (i = 0; i < num_vfs; i++) { 1432 - if (pdn->pe_num_map[i] != IODA_INVALID_PE) 1433 - pnv_ioda_free_pe(phb, 
pdn->pe_num_map[i]); 1414 + if (pdn->pe_num_map[i] == IODA_INVALID_PE) 1415 + continue; 1416 + 1417 + pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; 1418 + pnv_ioda_free_pe(pe); 1434 1419 } 1435 1420 } else 1436 1421 bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); ··· 1475 1454 pe->flags = PNV_IODA_PE_VF; 1476 1455 pe->pbus = NULL; 1477 1456 pe->parent_dev = pdev; 1478 - pe->tce32_seg = -1; 1479 1457 pe->mve_number = -1; 1480 1458 pe->rid = (pci_iov_virtfn_bus(pdev, vf_index) << 8) | 1481 1459 pci_iov_virtfn_devfn(pdev, vf_index); ··· 1486 1466 1487 1467 if (pnv_ioda_configure_pe(phb, pe)) { 1488 1468 /* XXX What do we do here ? */ 1489 - if (pe_num) 1490 - pnv_ioda_free_pe(phb, pe_num); 1469 + pnv_ioda_free_pe(pe); 1491 1470 pe->pdev = NULL; 1492 1471 continue; 1493 1472 } ··· 1505 1486 struct pci_bus *bus; 1506 1487 struct pci_controller *hose; 1507 1488 struct pnv_phb *phb; 1489 + struct pnv_ioda_pe *pe; 1508 1490 struct pci_dn *pdn; 1509 1491 int ret; 1510 1492 u16 i; ··· 1548 1528 /* Calculate available PE for required VFs */ 1549 1529 if (pdn->m64_single_mode) { 1550 1530 for (i = 0; i < num_vfs; i++) { 1551 - pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb); 1552 - if (pdn->pe_num_map[i] == IODA_INVALID_PE) { 1531 + pe = pnv_ioda_alloc_pe(phb); 1532 + if (!pe) { 1553 1533 ret = -EBUSY; 1554 1534 goto m64_failed; 1555 1535 } 1536 + 1537 + pdn->pe_num_map[i] = pe->pe_number; 1556 1538 } 1557 1539 } else { 1558 1540 mutex_lock(&phb->ioda.pe_alloc_mutex); 1559 1541 *pdn->pe_num_map = bitmap_find_next_zero_area( 1560 - phb->ioda.pe_alloc, phb->ioda.total_pe, 1542 + phb->ioda.pe_alloc, phb->ioda.total_pe_num, 1561 1543 0, num_vfs, 0); 1562 - if (*pdn->pe_num_map >= phb->ioda.total_pe) { 1544 + if (*pdn->pe_num_map >= phb->ioda.total_pe_num) { 1563 1545 mutex_unlock(&phb->ioda.pe_alloc_mutex); 1564 1546 dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs); 1565 1547 kfree(pdn->pe_num_map); ··· 1599 1577 m64_failed: 1600 1578 if (pdn->m64_single_mode) { 
1601 1579 for (i = 0; i < num_vfs; i++) { 1602 - if (pdn->pe_num_map[i] != IODA_INVALID_PE) 1603 - pnv_ioda_free_pe(phb, pdn->pe_num_map[i]); 1580 + if (pdn->pe_num_map[i] == IODA_INVALID_PE) 1581 + continue; 1582 + 1583 + pe = &phb->ioda.pe_array[pdn->pe_num_map[i]]; 1584 + pnv_ioda_free_pe(pe); 1604 1585 } 1605 1586 } else 1606 1587 bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs); ··· 1665 1640 struct pnv_ioda_pe *pe; 1666 1641 uint64_t top; 1667 1642 bool bypass = false; 1668 - struct pci_dev *linked_npu_dev; 1669 - int i; 1670 1643 1671 1644 if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE)) 1672 1645 return -ENODEV;; ··· 1685 1662 *pdev->dev.dma_mask = dma_mask; 1686 1663 1687 1664 /* Update peer npu devices */ 1688 - if (pe->flags & PNV_IODA_PE_PEER) 1689 - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { 1690 - if (!pe->peers[i]) 1691 - continue; 1692 - 1693 - linked_npu_dev = pe->peers[i]->pdev; 1694 - if (dma_get_mask(&linked_npu_dev->dev) != dma_mask) 1695 - dma_set_mask(&linked_npu_dev->dev, dma_mask); 1696 - } 1665 + pnv_npu_try_dma_set_bypass(pdev, bypass); 1697 1666 1698 1667 return 0; 1699 1668 } ··· 1826 1811 .get = pnv_tce_get, 1827 1812 }; 1828 1813 1829 - static inline void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_ioda_pe *pe) 1814 + #define TCE_KILL_INVAL_ALL PPC_BIT(0) 1815 + #define TCE_KILL_INVAL_PE PPC_BIT(1) 1816 + #define TCE_KILL_INVAL_TCE PPC_BIT(2) 1817 + 1818 + void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm) 1819 + { 1820 + const unsigned long val = TCE_KILL_INVAL_ALL; 1821 + 1822 + mb(); /* Ensure previous TCE table stores are visible */ 1823 + if (rm) 1824 + __raw_rm_writeq(cpu_to_be64(val), 1825 + (__be64 __iomem *) 1826 + phb->ioda.tce_inval_reg_phys); 1827 + else 1828 + __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); 1829 + } 1830 + 1831 + static inline void pnv_pci_ioda2_tce_invalidate_pe(struct pnv_ioda_pe *pe) 1830 1832 { 1831 1833 /* 01xb - invalidate TCEs that match the 
specified PE# */ 1832 - unsigned long val = (0x4ull << 60) | (pe->pe_number & 0xFF); 1834 + unsigned long val = TCE_KILL_INVAL_PE | (pe->pe_number & 0xFF); 1833 1835 struct pnv_phb *phb = pe->phb; 1834 - struct pnv_ioda_pe *npe; 1835 - int i; 1836 1836 1837 1837 if (!phb->ioda.tce_inval_reg) 1838 1838 return; 1839 1839 1840 1840 mb(); /* Ensure above stores are visible */ 1841 1841 __raw_writeq(cpu_to_be64(val), phb->ioda.tce_inval_reg); 1842 - 1843 - if (pe->flags & PNV_IODA_PE_PEER) 1844 - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { 1845 - npe = pe->peers[i]; 1846 - if (!npe || npe->phb->type != PNV_PHB_NPU) 1847 - continue; 1848 - 1849 - pnv_npu_tce_invalidate_entire(npe); 1850 - } 1851 1842 } 1852 1843 1853 1844 static void pnv_pci_ioda2_do_tce_invalidate(unsigned pe_number, bool rm, ··· 1863 1842 unsigned long start, end, inc; 1864 1843 1865 1844 /* We'll invalidate DMA address in PE scope */ 1866 - start = 0x2ull << 60; 1845 + start = TCE_KILL_INVAL_TCE; 1867 1846 start |= (pe_number & 0xFF); 1868 1847 end = start; 1869 1848 ··· 1888 1867 struct iommu_table_group_link *tgl; 1889 1868 1890 1869 list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { 1891 - struct pnv_ioda_pe *npe; 1892 1870 struct pnv_ioda_pe *pe = container_of(tgl->table_group, 1893 1871 struct pnv_ioda_pe, table_group); 1894 1872 __be64 __iomem *invalidate = rm ? 1895 1873 (__be64 __iomem *)pe->phb->ioda.tce_inval_reg_phys : 1896 1874 pe->phb->ioda.tce_inval_reg; 1897 - int i; 1898 1875 1876 + if (pe->phb->type == PNV_PHB_NPU) { 1877 + /* 1878 + * The NVLink hardware does not support TCE kill 1879 + * per TCE entry so we have to invalidate 1880 + * the entire cache for it. 
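The named TCE_KILL_INVAL_* constants introduced above replace the earlier magic shifts (the removed `0x4ull << 60` and, further down, `0x2ull << 60`). A minimal userspace sketch of the encoding, assuming the kernel's IBM-bit-numbered PPC_BIT() macro; `tce_kill_pe_val` is a hypothetical helper name, not a kernel function:

```c
#include <stdint.h>
#include <assert.h>

/* IBM bit numbering: bit 0 is the most-significant bit of the 64-bit
 * word, so this mirrors the kernel's PPC_BIT() macro. */
#define PPC_BIT(bit)		(1ULL << (63 - (bit)))

#define TCE_KILL_INVAL_ALL	PPC_BIT(0)
#define TCE_KILL_INVAL_PE	PPC_BIT(1)
#define TCE_KILL_INVAL_TCE	PPC_BIT(2)

/* Value written to the TCE kill register to flush every cached TCE
 * belonging to one PE; the low byte selects the PE number. */
static uint64_t tce_kill_pe_val(unsigned int pe_number)
{
	return TCE_KILL_INVAL_PE | (pe_number & 0xFF);
}
```

Note that PPC_BIT(1) and PPC_BIT(2) reproduce the old literals exactly, which is why the patch can swap them in without changing hardware behaviour.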
1881 + */ 1882 + pnv_pci_ioda2_tce_invalidate_entire(pe->phb, rm); 1883 + continue; 1884 + } 1899 1885 pnv_pci_ioda2_do_tce_invalidate(pe->pe_number, rm, 1900 1886 invalidate, tbl->it_page_shift, 1901 1887 index, npages); 1902 - 1903 - if (pe->flags & PNV_IODA_PE_PEER) 1904 - /* Invalidate PEs using the same TCE table */ 1905 - for (i = 0; i < PNV_IODA_MAX_PEER_PES; i++) { 1906 - npe = pe->peers[i]; 1907 - if (!npe || npe->phb->type != PNV_PHB_NPU) 1908 - continue; 1909 - 1910 - pnv_npu_tce_invalidate(npe, tbl, index, 1911 - npages, rm); 1912 - } 1913 1888 } 1914 1889 } 1915 1890 ··· 1962 1945 .free = pnv_ioda2_table_free, 1963 1946 }; 1964 1947 1965 - static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb, 1966 - struct pnv_ioda_pe *pe, unsigned int base, 1967 - unsigned int segs) 1948 + static int pnv_pci_ioda_dev_dma_weight(struct pci_dev *dev, void *data) 1949 + { 1950 + unsigned int *weight = (unsigned int *)data; 1951 + 1952 + /* This is quite simplistic. The "base" weight of a device 1953 + * is 10. 0 means no DMA is to be accounted for it. 
1954 + */ 1955 + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) 1956 + return 0; 1957 + 1958 + if (dev->class == PCI_CLASS_SERIAL_USB_UHCI || 1959 + dev->class == PCI_CLASS_SERIAL_USB_OHCI || 1960 + dev->class == PCI_CLASS_SERIAL_USB_EHCI) 1961 + *weight += 3; 1962 + else if ((dev->class >> 8) == PCI_CLASS_STORAGE_RAID) 1963 + *weight += 15; 1964 + else 1965 + *weight += 10; 1966 + 1967 + return 0; 1968 + } 1969 + 1970 + static unsigned int pnv_pci_ioda_pe_dma_weight(struct pnv_ioda_pe *pe) 1971 + { 1972 + unsigned int weight = 0; 1973 + 1974 + /* SRIOV VF has same DMA32 weight as its PF */ 1975 + #ifdef CONFIG_PCI_IOV 1976 + if ((pe->flags & PNV_IODA_PE_VF) && pe->parent_dev) { 1977 + pnv_pci_ioda_dev_dma_weight(pe->parent_dev, &weight); 1978 + return weight; 1979 + } 1980 + #endif 1981 + 1982 + if ((pe->flags & PNV_IODA_PE_DEV) && pe->pdev) { 1983 + pnv_pci_ioda_dev_dma_weight(pe->pdev, &weight); 1984 + } else if ((pe->flags & PNV_IODA_PE_BUS) && pe->pbus) { 1985 + struct pci_dev *pdev; 1986 + 1987 + list_for_each_entry(pdev, &pe->pbus->devices, bus_list) 1988 + pnv_pci_ioda_dev_dma_weight(pdev, &weight); 1989 + } else if ((pe->flags & PNV_IODA_PE_BUS_ALL) && pe->pbus) { 1990 + pci_walk_bus(pe->pbus, pnv_pci_ioda_dev_dma_weight, &weight); 1991 + } 1992 + 1993 + return weight; 1994 + } 1995 + 1996 + static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb, 1997 + struct pnv_ioda_pe *pe) 1968 1998 { 1969 1999 1970 2000 struct page *tce_mem = NULL; 1971 2001 struct iommu_table *tbl; 1972 - unsigned int i; 2002 + unsigned int weight, total_weight = 0; 2003 + unsigned int tce32_segsz, base, segs, avail, i; 1973 2004 int64_t rc; 1974 2005 void *addr; 1975 2006 1976 2007 /* XXX FIXME: Handle 64-bit only DMA devices */ 1977 2008 /* XXX FIXME: Provide 64-bit DMA facilities & non-4K TCE tables etc.. 
*/ 1978 2009 /* XXX FIXME: Allocate multi-level tables on PHB3 */ 1979 - 1980 - /* We shouldn't already have a 32-bit DMA associated */ 1981 - if (WARN_ON(pe->tce32_seg >= 0)) 2010 + weight = pnv_pci_ioda_pe_dma_weight(pe); 2011 + if (!weight) 1982 2012 return; 1983 2013 2014 + pci_walk_bus(phb->hose->bus, pnv_pci_ioda_dev_dma_weight, 2015 + &total_weight); 2016 + segs = (weight * phb->ioda.dma32_count) / total_weight; 2017 + if (!segs) 2018 + segs = 1; 2019 + 2020 + /* 2021 + * Allocate contiguous DMA32 segments. We begin with the expected 2022 + * number of segments. With one more attempt, the number of DMA32 2023 + * segments to be allocated is decreased by one until one segment 2024 + * is allocated successfully. 2025 + */ 2026 + do { 2027 + for (base = 0; base <= phb->ioda.dma32_count - segs; base++) { 2028 + for (avail = 0, i = base; i < base + segs; i++) { 2029 + if (phb->ioda.dma32_segmap[i] == 2030 + IODA_INVALID_PE) 2031 + avail++; 2032 + } 2033 + 2034 + if (avail == segs) 2035 + goto found; 2036 + } 2037 + } while (--segs); 2038 + 2039 + if (!segs) { 2040 + pe_warn(pe, "No available DMA32 segments\n"); 2041 + return; 2042 + } 2043 + 2044 + found: 1984 2045 tbl = pnv_pci_table_alloc(phb->hose->node); 1985 2046 iommu_register_group(&pe->table_group, phb->hose->global_number, 1986 2047 pe->pe_number); 1987 2048 pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); 1988 2049 1989 2050 /* Grab a 32-bit TCE table */ 1990 - pe->tce32_seg = base; 2051 + pe_info(pe, "DMA weight %d (%d), assigned (%d) %d DMA32 segments\n", 2052 + weight, total_weight, base, segs); 1991 2053 pe_info(pe, " Setting up 32-bit TCE table at %08x..%08x\n", 1992 - (base << 28), ((base + segs) << 28) - 1); 2054 + base * PNV_IODA1_DMA32_SEGSIZE, 2055 + (base + segs) * PNV_IODA1_DMA32_SEGSIZE - 1); 1993 2056 1994 2057 /* XXX Currently, we allocate one big contiguous table for the 1995 2058 * TCEs. 
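The contiguous-allocation loop above (try the full proportional request, shrink by one segment per failed pass, give up only when not even one segment fits) can be sketched outside the kernel. Assumptions: segment-map entries hold owning PE numbers with -1 standing in for IODA_INVALID_PE, and `find_dma32_segs` is a hypothetical name:

```c
#define SEG_FREE (-1)	/* stand-in for IODA_INVALID_PE */

/* Find *segs contiguous free entries in a DMA32 segment map of
 * `count` entries, shrinking the request on each failed pass as
 * pnv_pci_ioda1_setup_dma_pe() does.  Returns the base index and
 * leaves the achieved size in *segs, or returns -1 (with *segs == 0)
 * when not even a single segment is free. */
static int find_dma32_segs(const int *segmap, int count, int *segs)
{
	int base, i, avail;

	do {
		for (base = 0; base <= count - *segs; base++) {
			for (avail = 0, i = base; i < base + *segs; i++)
				if (segmap[i] == SEG_FREE)
					avail++;
			if (avail == *segs)
				return base;
		}
	} while (--(*segs));

	return -1;
}
```

This quadratic scan is acceptable here because IODA1 PHBs only have m32_pci_base / PNV_IODA1_DMA32_SEGSIZE segments, a small constant.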
We only really need one chunk per 256M of TCE space 1996 2059 * (ie per segment) but that's an optimization for later, it 1997 2060 * requires some added smarts with our get/put_tce implementation 2061 + * 2062 + * Each TCE page is 4KB in size and each TCE entry occupies 8 2063 + * bytes 1998 2064 */ 2065 + tce32_segsz = PNV_IODA1_DMA32_SEGSIZE >> (IOMMU_PAGE_SHIFT_4K - 3); 1999 2066 tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL, 2000 - get_order(TCE32_TABLE_SIZE * segs)); 2067 + get_order(tce32_segsz * segs)); 2001 2068 if (!tce_mem) { 2002 2069 pe_err(pe, " Failed to allocate a 32-bit TCE memory\n"); 2003 2070 goto fail; 2004 2071 } 2005 2072 addr = page_address(tce_mem); 2006 - memset(addr, 0, TCE32_TABLE_SIZE * segs); 2073 + memset(addr, 0, tce32_segsz * segs); 2007 2074 2008 2075 /* Configure HW */ 2009 2076 for (i = 0; i < segs; i++) { 2010 2077 rc = opal_pci_map_pe_dma_window(phb->opal_id, 2011 2078 pe->pe_number, 2012 2079 base + i, 1, 2013 - __pa(addr) + TCE32_TABLE_SIZE * i, 2014 - TCE32_TABLE_SIZE, 0x1000); 2080 + __pa(addr) + tce32_segsz * i, 2081 + tce32_segsz, IOMMU_PAGE_SIZE_4K); 2015 2082 if (rc) { 2016 2083 pe_err(pe, " Failed to configure 32-bit TCE table," 2017 2084 " err %ld\n", rc); ··· 2103 2002 } 2104 2003 } 2105 2004 2005 + /* Setup DMA32 segment mapping */ 2006 + for (i = base; i < base + segs; i++) 2007 + phb->ioda.dma32_segmap[i] = pe->pe_number; 2008 + 2106 2009 /* Setup linux iommu table */ 2107 - pnv_pci_setup_iommu_table(tbl, addr, TCE32_TABLE_SIZE * segs, 2108 - base << 28, IOMMU_PAGE_SHIFT_4K); 2010 + pnv_pci_setup_iommu_table(tbl, addr, tce32_segsz * segs, 2011 + base * PNV_IODA1_DMA32_SEGSIZE, 2012 + IOMMU_PAGE_SHIFT_4K); 2109 2013 2110 2014 /* OPAL variant of P7IOC SW invalidated TCEs */ 2111 2015 if (phb->ioda.tce_inval_reg) ··· 2137 2031 return; 2138 2032 fail: 2139 2033 /* XXX Failure: Try to fallback to 64-bit only ? 
*/ 2140 - if (pe->tce32_seg >= 0) 2141 - pe->tce32_seg = -1; 2142 2034 if (tce_mem) 2143 - __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs)); 2035 + __free_pages(tce_mem, get_order(tce32_segsz * segs)); 2144 2036 if (tbl) { 2145 2037 pnv_pci_unlink_table_and_group(tbl, &pe->table_group); 2146 2038 iommu_free_table(tbl, "pnv"); ··· 2179 2075 2180 2076 pnv_pci_link_table_and_group(phb->hose->node, num, 2181 2077 tbl, &pe->table_group); 2182 - pnv_pci_ioda2_tce_invalidate_entire(pe); 2078 + pnv_pci_ioda2_tce_invalidate_pe(pe); 2183 2079 2184 2080 return 0; 2185 2081 } ··· 2323 2219 if (ret) 2324 2220 pe_warn(pe, "Unmapping failed, ret = %ld\n", ret); 2325 2221 else 2326 - pnv_pci_ioda2_tce_invalidate_entire(pe); 2222 + pnv_pci_ioda2_tce_invalidate_pe(pe); 2327 2223 2328 2224 pnv_pci_unlink_table_and_group(table_group->tables[num], table_group); 2329 2225 ··· 2392 2288 .take_ownership = pnv_ioda2_take_ownership, 2393 2289 .release_ownership = pnv_ioda2_release_ownership, 2394 2290 }; 2291 + 2292 + static int gpe_table_group_to_npe_cb(struct device *dev, void *opaque) 2293 + { 2294 + struct pci_controller *hose; 2295 + struct pnv_phb *phb; 2296 + struct pnv_ioda_pe **ptmppe = opaque; 2297 + struct pci_dev *pdev = container_of(dev, struct pci_dev, dev); 2298 + struct pci_dn *pdn = pci_get_pdn(pdev); 2299 + 2300 + if (!pdn || pdn->pe_number == IODA_INVALID_PE) 2301 + return 0; 2302 + 2303 + hose = pci_bus_to_host(pdev->bus); 2304 + phb = hose->private_data; 2305 + if (phb->type != PNV_PHB_NPU) 2306 + return 0; 2307 + 2308 + *ptmppe = &phb->ioda.pe_array[pdn->pe_number]; 2309 + 2310 + return 1; 2311 + } 2312 + 2313 + /* 2314 + * This returns PE of associated NPU. 2315 + * This assumes that NPU is in the same IOMMU group with GPU and there is 2316 + * no other PEs. 
2317 + */ 2318 + static struct pnv_ioda_pe *gpe_table_group_to_npe( 2319 + struct iommu_table_group *table_group) 2320 + { 2321 + struct pnv_ioda_pe *npe = NULL; 2322 + int ret = iommu_group_for_each_dev(table_group->group, &npe, 2323 + gpe_table_group_to_npe_cb); 2324 + 2325 + BUG_ON(!ret || !npe); 2326 + 2327 + return npe; 2328 + } 2329 + 2330 + static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group, 2331 + int num, struct iommu_table *tbl) 2332 + { 2333 + long ret = pnv_pci_ioda2_set_window(table_group, num, tbl); 2334 + 2335 + if (ret) 2336 + return ret; 2337 + 2338 + ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl); 2339 + if (ret) 2340 + pnv_pci_ioda2_unset_window(table_group, num); 2341 + 2342 + return ret; 2343 + } 2344 + 2345 + static long pnv_pci_ioda2_npu_unset_window( 2346 + struct iommu_table_group *table_group, 2347 + int num) 2348 + { 2349 + long ret = pnv_pci_ioda2_unset_window(table_group, num); 2350 + 2351 + if (ret) 2352 + return ret; 2353 + 2354 + return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num); 2355 + } 2356 + 2357 + static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group) 2358 + { 2359 + /* 2360 + * Detach NPU first as pnv_ioda2_take_ownership() will destroy 2361 + * the iommu_table if 32bit DMA is enabled. 
2362 + */ 2363 + pnv_npu_take_ownership(gpe_table_group_to_npe(table_group)); 2364 + pnv_ioda2_take_ownership(table_group); 2365 + } 2366 + 2367 + static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = { 2368 + .get_table_size = pnv_pci_ioda2_get_table_size, 2369 + .create_table = pnv_pci_ioda2_create_table, 2370 + .set_window = pnv_pci_ioda2_npu_set_window, 2371 + .unset_window = pnv_pci_ioda2_npu_unset_window, 2372 + .take_ownership = pnv_ioda2_npu_take_ownership, 2373 + .release_ownership = pnv_ioda2_release_ownership, 2374 + }; 2375 + 2376 + static void pnv_pci_ioda_setup_iommu_api(void) 2377 + { 2378 + struct pci_controller *hose, *tmp; 2379 + struct pnv_phb *phb; 2380 + struct pnv_ioda_pe *pe, *gpe; 2381 + 2382 + /* 2383 + * Now we have all PHBs discovered, time to add NPU devices to 2384 + * the corresponding IOMMU groups. 2385 + */ 2386 + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { 2387 + phb = hose->private_data; 2388 + 2389 + if (phb->type != PNV_PHB_NPU) 2390 + continue; 2391 + 2392 + list_for_each_entry(pe, &phb->ioda.pe_list, list) { 2393 + gpe = pnv_pci_npu_setup_iommu(pe); 2394 + if (gpe) 2395 + gpe->table_group.ops = &pnv_pci_ioda2_npu_ops; 2396 + } 2397 + } 2398 + } 2399 + #else /* !CONFIG_IOMMU_API */ 2400 + static void pnv_pci_ioda_setup_iommu_api(void) { }; 2395 2401 #endif 2396 2402 2397 2403 static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb) ··· 2657 2443 { 2658 2444 int64_t rc; 2659 2445 2660 - /* We shouldn't already have a 32-bit DMA associated */ 2661 - if (WARN_ON(pe->tce32_seg >= 0)) 2662 - return; 2663 - 2664 2446 /* TVE #1 is selected by PCI address bit 59 */ 2665 2447 pe->tce_bypass_base = 1ull << 59; 2666 2448 ··· 2664 2454 pe->pe_number); 2665 2455 2666 2456 /* The PE will reserve all possible 32-bits space */ 2667 - pe->tce32_seg = 0; 2668 2457 pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n", 2669 2458 phb->ioda.m32_pci_base); 2670 2459 ··· 2679 2470 #endif 2680 2471 2681 2472 rc = 
pnv_pci_ioda2_setup_default_config(pe); 2682 - if (rc) { 2683 - if (pe->tce32_seg >= 0) 2684 - pe->tce32_seg = -1; 2473 + if (rc) 2685 2474 return; 2686 - } 2687 2475 2688 2476 if (pe->flags & PNV_IODA_PE_DEV) 2689 2477 iommu_add_device(&pe->pdev->dev); ··· 2691 2485 static void pnv_ioda_setup_dma(struct pnv_phb *phb) 2692 2486 { 2693 2487 struct pci_controller *hose = phb->hose; 2694 - unsigned int residual, remaining, segs, tw, base; 2695 2488 struct pnv_ioda_pe *pe; 2489 + unsigned int weight; 2696 2490 2697 2491 /* If we have more PE# than segments available, hand out one 2698 2492 * per PE until we run out and let the rest fail. If not, 2699 2493 * then we assign at least one segment per PE, plus more based 2700 2494 * on the amount of devices under that PE 2701 2495 */ 2702 - if (phb->ioda.dma_pe_count > phb->ioda.tce32_count) 2703 - residual = 0; 2704 - else 2705 - residual = phb->ioda.tce32_count - 2706 - phb->ioda.dma_pe_count; 2707 - 2708 - pr_info("PCI: Domain %04x has %ld available 32-bit DMA segments\n", 2709 - hose->global_number, phb->ioda.tce32_count); 2710 - pr_info("PCI: %d PE# for a total weight of %d\n", 2711 - phb->ioda.dma_pe_count, phb->ioda.dma_weight); 2496 + pr_info("PCI: Domain %04x has %d available 32-bit DMA segments\n", 2497 + hose->global_number, phb->ioda.dma32_count); 2712 2498 2713 2499 pnv_pci_ioda_setup_opal_tce_kill(phb); 2714 2500 2715 - /* Walk our PE list and configure their DMA segments, hand them 2716 - * out one base segment plus any residual segments based on 2717 - * weight 2718 - */ 2719 - remaining = phb->ioda.tce32_count; 2720 - tw = phb->ioda.dma_weight; 2721 - base = 0; 2722 - list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) { 2723 - if (!pe->dma_weight) 2501 + /* Walk our PE list and configure their DMA segments */ 2502 + list_for_each_entry(pe, &phb->ioda.pe_list, list) { 2503 + weight = pnv_pci_ioda_pe_dma_weight(pe); 2504 + if (!weight) 2724 2505 continue; 2725 - if (!remaining) { 2726 - pe_warn(pe, "No 
DMA32 resources available\n"); 2727 - continue; 2728 - } 2729 - segs = 1; 2730 - if (residual) { 2731 - segs += ((pe->dma_weight * residual) + (tw / 2)) / tw; 2732 - if (segs > remaining) 2733 - segs = remaining; 2734 - } 2735 2506 2736 2507 /* 2737 2508 * For IODA2 compliant PHB3, we needn't care about the weight. ··· 2716 2533 * the specific PE. 2717 2534 */ 2718 2535 if (phb->type == PNV_PHB_IODA1) { 2719 - pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n", 2720 - pe->dma_weight, segs); 2721 - pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs); 2536 + pnv_pci_ioda1_setup_dma_pe(phb, pe); 2722 2537 } else if (phb->type == PNV_PHB_IODA2) { 2723 2538 pe_info(pe, "Assign DMA32 space\n"); 2724 - segs = 0; 2725 2539 pnv_pci_ioda2_setup_dma_pe(phb, pe); 2726 2540 } else if (phb->type == PNV_PHB_NPU) { 2727 2541 /* ··· 2728 2548 * as the PHB3 TVT. 2729 2549 */ 2730 2550 } 2731 - 2732 - remaining -= segs; 2733 - base += segs; 2734 2551 } 2735 2552 } 2736 2553 ··· 3035 2858 pdn->m64_single_mode = false; 3036 2859 3037 2860 total_vfs = pci_sriov_get_totalvfs(pdev); 3038 - mul = phb->ioda.total_pe; 2861 + mul = phb->ioda.total_pe_num; 3039 2862 total_vf_bar_sz = 0; 3040 2863 3041 2864 for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) { ··· 3106 2929 } 3107 2930 #endif /* CONFIG_PCI_IOV */ 3108 2931 2932 + static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, 2933 + struct resource *res) 2934 + { 2935 + struct pnv_phb *phb = pe->phb; 2936 + struct pci_bus_region region; 2937 + int index; 2938 + int64_t rc; 2939 + 2940 + if (!res || !res->flags || res->start > res->end) 2941 + return; 2942 + 2943 + if (res->flags & IORESOURCE_IO) { 2944 + region.start = res->start - phb->ioda.io_pci_base; 2945 + region.end = res->end - phb->ioda.io_pci_base; 2946 + index = region.start / phb->ioda.io_segsize; 2947 + 2948 + while (index < phb->ioda.total_pe_num && 2949 + region.start <= region.end) { 2950 + phb->ioda.io_segmap[index] = pe->pe_number; 2951 + rc = 
opal_pci_map_pe_mmio_window(phb->opal_id, 2952 + pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); 2953 + if (rc != OPAL_SUCCESS) { 2954 + pr_err("%s: Error %lld mapping IO segment#%d to PE#%d\n", 2955 + __func__, rc, index, pe->pe_number); 2956 + break; 2957 + } 2958 + 2959 + region.start += phb->ioda.io_segsize; 2960 + index++; 2961 + } 2962 + } else if ((res->flags & IORESOURCE_MEM) && 2963 + !pnv_pci_is_mem_pref_64(res->flags)) { 2964 + region.start = res->start - 2965 + phb->hose->mem_offset[0] - 2966 + phb->ioda.m32_pci_base; 2967 + region.end = res->end - 2968 + phb->hose->mem_offset[0] - 2969 + phb->ioda.m32_pci_base; 2970 + index = region.start / phb->ioda.m32_segsize; 2971 + 2972 + while (index < phb->ioda.total_pe_num && 2973 + region.start <= region.end) { 2974 + phb->ioda.m32_segmap[index] = pe->pe_number; 2975 + rc = opal_pci_map_pe_mmio_window(phb->opal_id, 2976 + pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); 2977 + if (rc != OPAL_SUCCESS) { 2978 + pr_err("%s: Error %lld mapping M32 segment#%d to PE#%d", 2979 + __func__, rc, index, pe->pe_number); 2980 + break; 2981 + } 2982 + 2983 + region.start += phb->ioda.m32_segsize; 2984 + index++; 2985 + } 2986 + } 2987 + } 2988 + 3109 2989 /* 3110 2990 * This function is supposed to be called on basis of PE from top 3111 2991 * to bottom style. So the the I/O or MMIO segment assigned to 3112 2992 * parent PE could be overrided by its child PEs if necessary. 3113 2993 */ 3114 - static void pnv_ioda_setup_pe_seg(struct pci_controller *hose, 3115 - struct pnv_ioda_pe *pe) 2994 + static void pnv_ioda_setup_pe_seg(struct pnv_ioda_pe *pe) 3116 2995 { 3117 - struct pnv_phb *phb = hose->private_data; 3118 - struct pci_bus_region region; 3119 - struct resource *res; 3120 - int i, index; 3121 - int rc; 2996 + struct pci_dev *pdev; 2997 + int i; 3122 2998 3123 2999 /* 3124 3000 * NOTE: We only care PCI bus based PE for now. 
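The window-to-segment mapping that `pnv_ioda_setup_pe_res()` performs for both the IO and M32 branches reduces to "walk the region in segsize steps, claiming one segment index per step, bounded by total_pe_num". A standalone sketch (`claim_segments` is an illustrative name; the kernel writes straight into io_segmap/m32_segmap and calls OPAL per index instead of filling an array):

```c
/* Map a [start, end] bus-address window onto fixed-size segments,
 * recording each claimed segment index in out[].  Returns how many
 * segments were claimed; the walk stops early if the window runs
 * past the last segment (index >= total). */
static int claim_segments(unsigned long start, unsigned long end,
			  unsigned long segsize, int total, int *out)
{
	int index = start / segsize;
	int n = 0;

	while (index < total && start <= end) {
		out[n++] = index;
		start += segsize;
		index++;
	}
	return n;
}
```

Factoring this into a helper is what lets the patch call it per-device resource (and per bridge window for PNV_IODA_PE_BUS_ALL PEs) instead of duplicating the two loops inline.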
For PCI ··· 3180 2950 */ 3181 2951 BUG_ON(!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))); 3182 2952 3183 - pci_bus_for_each_resource(pe->pbus, res, i) { 3184 - if (!res || !res->flags || 3185 - res->start > res->end) 2953 + list_for_each_entry(pdev, &pe->pbus->devices, bus_list) { 2954 + for (i = 0; i <= PCI_ROM_RESOURCE; i++) 2955 + pnv_ioda_setup_pe_res(pe, &pdev->resource[i]); 2956 + 2957 + /* 2958 + * If the PE contains all subordinate PCI buses, the 2959 + * windows of the child bridges should be mapped to 2960 + * the PE as well. 2961 + */ 2962 + if (!(pe->flags & PNV_IODA_PE_BUS_ALL) || !pci_is_bridge(pdev)) 3186 2963 continue; 3187 - 3188 - if (res->flags & IORESOURCE_IO) { 3189 - region.start = res->start - phb->ioda.io_pci_base; 3190 - region.end = res->end - phb->ioda.io_pci_base; 3191 - index = region.start / phb->ioda.io_segsize; 3192 - 3193 - while (index < phb->ioda.total_pe && 3194 - region.start <= region.end) { 3195 - phb->ioda.io_segmap[index] = pe->pe_number; 3196 - rc = opal_pci_map_pe_mmio_window(phb->opal_id, 3197 - pe->pe_number, OPAL_IO_WINDOW_TYPE, 0, index); 3198 - if (rc != OPAL_SUCCESS) { 3199 - pr_err("%s: OPAL error %d when mapping IO " 3200 - "segment #%d to PE#%d\n", 3201 - __func__, rc, index, pe->pe_number); 3202 - break; 3203 - } 3204 - 3205 - region.start += phb->ioda.io_segsize; 3206 - index++; 3207 - } 3208 - } else if ((res->flags & IORESOURCE_MEM) && 3209 - !pnv_pci_is_mem_pref_64(res->flags)) { 3210 - region.start = res->start - 3211 - hose->mem_offset[0] - 3212 - phb->ioda.m32_pci_base; 3213 - region.end = res->end - 3214 - hose->mem_offset[0] - 3215 - phb->ioda.m32_pci_base; 3216 - index = region.start / phb->ioda.m32_segsize; 3217 - 3218 - while (index < phb->ioda.total_pe && 3219 - region.start <= region.end) { 3220 - phb->ioda.m32_segmap[index] = pe->pe_number; 3221 - rc = opal_pci_map_pe_mmio_window(phb->opal_id, 3222 - pe->pe_number, OPAL_M32_WINDOW_TYPE, 0, index); 3223 - if (rc != OPAL_SUCCESS) { 3224 - 
-            pr_err("%s: OPAL error %d when mapping M32 "
-                   "segment#%d to PE#%d",
-                   __func__, rc, index, pe->pe_number);
-            break;
-        }
-
-        region.start += phb->ioda.m32_segsize;
-        index++;
-    }
-    }
+    for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++)
+        pnv_ioda_setup_pe_res(pe,
+            &pdev->resource[PCI_BRIDGE_RESOURCES + i]);
     }
 }
···
             continue;
 
         list_for_each_entry(pe, &phb->ioda.pe_list, list) {
-            pnv_ioda_setup_pe_seg(hose, pe);
+            pnv_ioda_setup_pe_seg(pe);
         }
     }
 }
···
         phb = hose->private_data;
         phb->initialized = 1;
     }
+
+    pnv_pci_ioda_setup_iommu_api();
 }
 
 static void pnv_pci_ioda_create_dbgfs(void)
···
 #endif /* CONFIG_DEBUG_FS */
 }
 
-static void pnv_npu_ioda_fixup(void)
-{
-    bool enable_bypass;
-    struct pci_controller *hose, *tmp;
-    struct pnv_phb *phb;
-    struct pnv_ioda_pe *pe;
-
-    list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-        phb = hose->private_data;
-        if (phb->type != PNV_PHB_NPU)
-            continue;
-
-        list_for_each_entry(pe, &phb->ioda.pe_dma_list, dma_link) {
-            enable_bypass = dma_get_mask(&pe->pdev->dev) ==
-                DMA_BIT_MASK(64);
-            pnv_npu_init_dma_pe(pe);
-            pnv_npu_dma_set_bypass(pe, enable_bypass);
-        }
-    }
-}
-
 static void pnv_pci_ioda_fixup(void)
 {
     pnv_pci_ioda_setup_PEs();
···
     eeh_init();
     eeh_addr_cache_build();
 #endif
-
-    /* Link NPU IODA tables to their PCI devices. */
-    pnv_npu_ioda_fixup();
 }
 
 /*
···
     return true;
 }
 
-static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
-                   u32 devfn)
-{
-    return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
-}
-
 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 {
     struct pnv_phb *phb = hose->private_data;
···
 }
 
 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
-    .dma_dev_setup = pnv_pci_dma_dev_setup,
-    .dma_bus_setup = pnv_pci_dma_bus_setup,
+    .dma_dev_setup          = pnv_pci_dma_dev_setup,
+    .dma_bus_setup          = pnv_pci_dma_bus_setup,
 #ifdef CONFIG_PCI_MSI
-    .setup_msi_irqs = pnv_setup_msi_irqs,
-    .teardown_msi_irqs = pnv_teardown_msi_irqs,
+    .setup_msi_irqs         = pnv_setup_msi_irqs,
+    .teardown_msi_irqs      = pnv_teardown_msi_irqs,
 #endif
-    .enable_device_hook = pnv_pci_enable_device_hook,
-    .window_alignment = pnv_pci_window_alignment,
-    .reset_secondary_bus = pnv_pci_reset_secondary_bus,
-    .dma_set_mask = pnv_pci_ioda_dma_set_mask,
-    .dma_get_required_mask = pnv_pci_ioda_dma_get_required_mask,
-    .shutdown = pnv_pci_ioda_shutdown,
+    .enable_device_hook     = pnv_pci_enable_device_hook,
+    .window_alignment       = pnv_pci_window_alignment,
+    .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
+    .dma_set_mask           = pnv_pci_ioda_dma_set_mask,
+    .dma_get_required_mask  = pnv_pci_ioda_dma_get_required_mask,
+    .shutdown               = pnv_pci_ioda_shutdown,
 };
 
+static int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask)
+{
+    dev_err_once(&npdev->dev,
+            "%s operation unsupported for NVLink devices\n",
+            __func__);
+    return -EPERM;
+}
+
 static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
-    .dma_dev_setup = pnv_pci_dma_dev_setup,
+    .dma_dev_setup          = pnv_pci_dma_dev_setup,
 #ifdef CONFIG_PCI_MSI
-    .setup_msi_irqs = pnv_setup_msi_irqs,
-    .teardown_msi_irqs = pnv_teardown_msi_irqs,
+    .setup_msi_irqs         = pnv_setup_msi_irqs,
+    .teardown_msi_irqs      = pnv_teardown_msi_irqs,
 #endif
-    .enable_device_hook = pnv_pci_enable_device_hook,
-    .window_alignment = pnv_pci_window_alignment,
-    .reset_secondary_bus = pnv_pci_reset_secondary_bus,
-    .dma_set_mask = pnv_npu_dma_set_mask,
-    .shutdown = pnv_pci_ioda_shutdown,
+    .enable_device_hook     = pnv_pci_enable_device_hook,
+    .window_alignment       = pnv_pci_window_alignment,
+    .reset_secondary_bus    = pnv_pci_reset_secondary_bus,
+    .dma_set_mask           = pnv_npu_dma_set_mask,
+    .shutdown               = pnv_pci_ioda_shutdown,
 };
 
 static void __init pnv_pci_init_ioda_phb(struct device_node *np,
···
 {
     struct pci_controller *hose;
     struct pnv_phb *phb;
-    unsigned long size, m32map_off, pemap_off, iomap_off = 0;
+    unsigned long size, m64map_off, m32map_off, pemap_off;
+    unsigned long iomap_off = 0, dma32map_off = 0;
     const __be64 *prop64;
     const __be32 *prop32;
     int len;
+    unsigned int segno;
     u64 phb_id;
     void *aux;
     long rc;
···
         pr_err("  Failed to map registers !\n");
 
     /* Initialize more IODA stuff */
-    phb->ioda.total_pe = 1;
+    phb->ioda.total_pe_num = 1;
     prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
     if (prop32)
-        phb->ioda.total_pe = be32_to_cpup(prop32);
+        phb->ioda.total_pe_num = be32_to_cpup(prop32);
     prop32 = of_get_property(np, "ibm,opal-reserved-pe", NULL);
     if (prop32)
-        phb->ioda.reserved_pe = be32_to_cpup(prop32);
+        phb->ioda.reserved_pe_idx = be32_to_cpup(prop32);
 
     /* Parse 64-bit MMIO range */
     pnv_ioda_parse_m64_window(phb);
···
     /* FW Has already off top 64k of M32 space (MSI space) */
     phb->ioda.m32_size += 0x10000;
 
-    phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
+    phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe_num;
     phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
     phb->ioda.io_size = hose->pci_io_size;
-    phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
+    phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe_num;
     phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
 
+    /* Calculate how many 32-bit TCE segments we have */
+    phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+                PNV_IODA1_DMA32_SEGSIZE;
+
     /* Allocate aux data & arrays. We don't have IO ports on PHB3 */
-    size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
+    size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
+            sizeof(unsigned long));
+    m64map_off = size;
+    size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
     m32map_off = size;
-    size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]);
+    size += phb->ioda.total_pe_num * sizeof(phb->ioda.m32_segmap[0]);
     if (phb->type == PNV_PHB_IODA1) {
         iomap_off = size;
-        size += phb->ioda.total_pe * sizeof(phb->ioda.io_segmap[0]);
+        size += phb->ioda.total_pe_num * sizeof(phb->ioda.io_segmap[0]);
+        dma32map_off = size;
+        size += phb->ioda.dma32_count *
+            sizeof(phb->ioda.dma32_segmap[0]);
     }
     pemap_off = size;
-    size += phb->ioda.total_pe * sizeof(struct pnv_ioda_pe);
+    size += phb->ioda.total_pe_num * sizeof(struct pnv_ioda_pe);
     aux = memblock_virt_alloc(size, 0);
     phb->ioda.pe_alloc = aux;
+    phb->ioda.m64_segmap = aux + m64map_off;
     phb->ioda.m32_segmap = aux + m32map_off;
-    if (phb->type == PNV_PHB_IODA1)
+    for (segno = 0; segno < phb->ioda.total_pe_num; segno++) {
+        phb->ioda.m64_segmap[segno] = IODA_INVALID_PE;
+        phb->ioda.m32_segmap[segno] = IODA_INVALID_PE;
+    }
+    if (phb->type == PNV_PHB_IODA1) {
         phb->ioda.io_segmap = aux + iomap_off;
-    phb->ioda.pe_array = aux + pemap_off;
-    set_bit(phb->ioda.reserved_pe, phb->ioda.pe_alloc);
+        for (segno = 0; segno < phb->ioda.total_pe_num; segno++)
+            phb->ioda.io_segmap[segno] = IODA_INVALID_PE;
 
-    INIT_LIST_HEAD(&phb->ioda.pe_dma_list);
+        phb->ioda.dma32_segmap = aux + dma32map_off;
+        for (segno = 0; segno < phb->ioda.dma32_count; segno++)
+            phb->ioda.dma32_segmap[segno] = IODA_INVALID_PE;
+    }
+    phb->ioda.pe_array = aux + pemap_off;
+    set_bit(phb->ioda.reserved_pe_idx, phb->ioda.pe_alloc);
+
     INIT_LIST_HEAD(&phb->ioda.pe_list);
     mutex_init(&phb->ioda.pe_list_mutex);
 
     /* Calculate how many 32-bit TCE segments we have */
-    phb->ioda.tce32_count = phb->ioda.m32_pci_base >> 28;
+    phb->ioda.dma32_count = phb->ioda.m32_pci_base /
+                PNV_IODA1_DMA32_SEGSIZE;
 
 #if 0 /* We should really do that ... */
     rc = opal_pci_set_phb_mem_window(opal->phb_id,
···
 #endif
 
     pr_info("  %03d (%03d) PE's M32: 0x%x [segment=0x%x]\n",
-        phb->ioda.total_pe, phb->ioda.reserved_pe,
+        phb->ioda.total_pe_num, phb->ioda.reserved_pe_idx,
         phb->ioda.m32_size, phb->ioda.m32_segsize);
     if (phb->ioda.m64_size)
         pr_info("                 M64: 0x%lx [segment=0x%lx]\n",
···
     phb->freeze_pe = pnv_ioda_freeze_pe;
     phb->unfreeze_pe = pnv_ioda_unfreeze_pe;
 
-    /* Setup RID -> PE mapping function */
-    phb->bdfn_to_pe = pnv_ioda_bdfn_to_pe;
-
-    /* Setup TCEs */
-    phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
-
     /* Setup MSI support */
     pnv_pci_init_ioda_msis(phb);
 
···
      */
     ppc_md.pcibios_fixup = pnv_pci_ioda_fixup;
 
-    if (phb->type == PNV_PHB_NPU)
+    if (phb->type == PNV_PHB_NPU) {
         hose->controller_ops = pnv_npu_ioda_controller_ops;
-    else
+    } else {
+        phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
         hose->controller_ops = pnv_pci_ioda_controller_ops;
+    }
 
 #ifdef CONFIG_PCI_IOV
     ppc_md.pcibios_fixup_sriov = pnv_pci_ioda_fixup_iov_resources;
+8 -11
arch/powerpc/platforms/powernv/pci.c
···
     /* Delay in usec */
 #define PCI_RESET_DELAY_US  3000000
 
-#define cfg_dbg(fmt...) do { } while(0)
-//#define cfg_dbg(fmt...) printk(fmt)
-
 #ifdef CONFIG_PCI_MSI
 int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
 {
···
     struct pnv_phb *phb = pdn->phb->private_data;
     u8  fstate;
     __be16  pcierr;
-    int pe_no;
+    unsigned int pe_no;
     s64 rc;
 
     /*
···
      */
     pe_no = pdn->pe_number;
     if (pe_no == IODA_INVALID_PE) {
-        pe_no = phb->ioda.reserved_pe;
+        pe_no = phb->ioda.reserved_pe_idx;
     }
 
     /*
···
         }
     }
 
-    cfg_dbg(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
-        (pdn->busno << 8) | (pdn->devfn), pe_no, fstate);
+    pr_devel(" -> EEH check, bdfn=%04x PE#%d fstate=%x\n",
+         (pdn->busno << 8) | (pdn->devfn), pe_no, fstate);
 
     /* Clear the frozen state if applicable */
     if (fstate == OPAL_EEH_STOPPED_MMIO_FREEZE ||
···
         return PCIBIOS_FUNC_NOT_SUPPORTED;
     }
 
-    cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
-        __func__, pdn->busno, pdn->devfn, where, size, *val);
+    pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+         __func__, pdn->busno, pdn->devfn, where, size, *val);
     return PCIBIOS_SUCCESSFUL;
 }
 
···
     struct pnv_phb *phb = pdn->phb->private_data;
     u32 bdfn = (pdn->busno << 8) | pdn->devfn;
 
-    cfg_dbg("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
-        pdn->busno, pdn->devfn, where, size, val);
+    pr_devel("%s: bus: %x devfn: %x +%x/%x -> %08x\n",
+         __func__, pdn->busno, pdn->devfn, where, size, val);
     switch (size) {
     case 1:
         opal_pci_config_write_byte(phb->opal_id, bdfn, where, val);
+28 -44
arch/powerpc/platforms/powernv/pci.h
···
 #define PNV_IODA_PE_MASTER  (1 << 3)    /* Master PE in compound case   */
 #define PNV_IODA_PE_SLAVE   (1 << 4)    /* Slave PE in compound case    */
 #define PNV_IODA_PE_VF      (1 << 5)    /* PE for one VF                */
-#define PNV_IODA_PE_PEER    (1 << 6)    /* PE has peers                 */
 
 /* Data associated with a PE, including IOMMU tracking etc.. */
 struct pnv_phb;
 struct pnv_ioda_pe {
     unsigned long       flags;
     struct pnv_phb      *phb;
-
-#define PNV_IODA_MAX_PEER_PES   8
-    struct pnv_ioda_pe  *peers[PNV_IODA_MAX_PEER_PES];
 
     /* A PE can be associated with a single device or an
      * entire bus (& children). In the former case, pdev
···
     /* PE number */
     unsigned int        pe_number;
 
-    /* "Weight" assigned to the PE for the sake of DMA resource
-     * allocations
-     */
-    unsigned int        dma_weight;
-
     /* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
-    int         tce32_seg;
-    int         tce32_segcount;
     struct iommu_table_group table_group;
 
     /* 64-bit TCE bypass region */
···
     struct list_head    slaves;
 
     /* Link in list of PE#s */
-    struct list_head    dma_link;
     struct list_head    list;
 };
···
         unsigned int is_64, struct msi_msg *msg);
     void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
     void (*fixup_phb)(struct pci_controller *hose);
-    u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn);
     int (*init_m64)(struct pnv_phb *phb);
     void (*reserve_m64_pe)(struct pci_bus *bus,
                    unsigned long *pe_bitmap, bool all);
-    int (*pick_m64_pe)(struct pci_bus *bus, bool all);
+    struct pnv_ioda_pe *(*pick_m64_pe)(struct pci_bus *bus, bool all);
     int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
     void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
     int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
 
     struct {
         /* Global bridge info */
-        unsigned int        total_pe;
-        unsigned int        reserved_pe;
+        unsigned int        total_pe_num;
+        unsigned int        reserved_pe_idx;
 
         /* 32-bit MMIO window */
         unsigned int        m32_size;
···
         unsigned int        io_segsize;
         unsigned int        io_pci_base;
 
-        /* PE allocation bitmap */
-        unsigned long       *pe_alloc;
-        /* PE allocation mutex */
+        /* PE allocation */
         struct mutex        pe_alloc_mutex;
+        unsigned long       *pe_alloc;
+        struct pnv_ioda_pe  *pe_array;
 
         /* M32 & IO segment maps */
+        unsigned int        *m64_segmap;
         unsigned int        *m32_segmap;
         unsigned int        *io_segmap;
-        struct pnv_ioda_pe  *pe_array;
+
+        /* DMA32 segment maps - IODA1 only */
+        unsigned int        dma32_count;
+        unsigned int        *dma32_segmap;
 
         /* IRQ chip */
         int         irq_chip_init;
···
          * bus { bus, devfn }
          */
         unsigned char       pe_rmap[0x10000];
-
-        /* 32-bit TCE tables allocation */
-        unsigned long       tce32_count;
-
-        /* Total "weight" for the sake of DMA resources
-         * allocation
-         */
-        unsigned int        dma_weight;
-        unsigned int        dma_pe_count;
-
-        /* Sorted list of used PE's, sorted at
-         * boot for resource allocation purposes
-         */
-        struct list_head    pe_dma_list;
 
         /* TCE cache invalidate registers (physical and
          * remapped)
···
 extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
 extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 
+extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+                const char *fmt, ...);
+#define pe_err(pe, fmt, ...)                    \
+    pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
+#define pe_warn(pe, fmt, ...)                   \
+    pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define pe_info(pe, fmt, ...)                   \
+    pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
+
 /* Nvlink functions */
-extern void pnv_npu_tce_invalidate_entire(struct pnv_ioda_pe *npe);
-extern void pnv_npu_tce_invalidate(struct pnv_ioda_pe *npe,
-                   struct iommu_table *tbl,
-                   unsigned long index,
-                   unsigned long npages,
-                   bool rm);
-extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe);
-extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe);
-extern int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe, bool enabled);
-extern int pnv_npu_dma_set_mask(struct pci_dev *npdev, u64 dma_mask);
+extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
+extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
+extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
+extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
+        struct iommu_table *tbl);
+extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
+extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
+extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
 
 #endif /* __POWERNV_PCI_H */
+4 -1
arch/powerpc/platforms/powernv/setup.c
···
     if (!of_flat_dt_is_compatible(root, "ibm,powernv"))
         return 0;
 
-    hpte_init_native();
+    if (IS_ENABLED(CONFIG_PPC_RADIX_MMU) && radix_enabled())
+        radix_init_native();
+    else if (IS_ENABLED(CONFIG_PPC_STD_MMU_64))
+        hpte_init_native();
 
     if (firmware_has_feature(FW_FEATURE_OPAL))
         pnv_setup_machdep_opal();
+1 -1
arch/powerpc/platforms/ps3/htab.c
···
     vflags &= ~HPTE_V_SECONDARY;
 
     hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-    hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags;
+    hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize, ssize) | rflags;
 
     spin_lock_irqsave(&ps3_htab_lock, flags);
 
+2 -2
arch/powerpc/platforms/ps3/spu.c
···
 static int __init setup_areas(struct spu *spu)
 {
     struct table {char* name; unsigned long addr; unsigned long size;};
-    static const unsigned long shadow_flags = _PAGE_NO_CACHE | 3;
+    unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO));
 
     spu_pdata(spu)->shadow = __ioremap(spu_pdata(spu)->shadow_addr,
                        sizeof(struct spe_shadow),
···
     }
 
     spu->local_store = (__force void *)ioremap_prot(spu->local_store_phys,
-        LS_SIZE, _PAGE_NO_CACHE);
+        LS_SIZE, pgprot_val(pgprot_noncached_wc(__pgprot(0))));
 
     if (!spu->local_store) {
         pr_debug("%s:%d: ioremap local_store failed\n",
+181 -44
arch/powerpc/platforms/pseries/hotplug-memory.c
···
     return new_prop;
 }
 
+static void dlpar_update_drconf_property(struct device_node *dn,
+                     struct property *prop)
+{
+    struct of_drconf_cell *lmbs;
+    u32 num_lmbs, *p;
+    int i;
+
+    /* Convert the property back to BE */
+    p = prop->value;
+    num_lmbs = *p;
+    *p = cpu_to_be32(*p);
+    p++;
+
+    lmbs = (struct of_drconf_cell *)p;
+    for (i = 0; i < num_lmbs; i++) {
+        lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr);
+        lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index);
+        lmbs[i].flags = cpu_to_be32(lmbs[i].flags);
+    }
+
+    rtas_hp_event = true;
+    of_update_property(dn, prop);
+    rtas_hp_event = false;
+}
+
+static int dlpar_update_device_tree_lmb(struct of_drconf_cell *lmb)
+{
+    struct device_node *dn;
+    struct property *prop;
+    struct of_drconf_cell *lmbs;
+    u32 *p, num_lmbs;
+    int i;
+
+    dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+    if (!dn)
+        return -ENODEV;
+
+    prop = dlpar_clone_drconf_property(dn);
+    if (!prop) {
+        of_node_put(dn);
+        return -ENODEV;
+    }
+
+    p = prop->value;
+    num_lmbs = *p++;
+    lmbs = (struct of_drconf_cell *)p;
+
+    for (i = 0; i < num_lmbs; i++) {
+        if (lmbs[i].drc_index == lmb->drc_index) {
+            lmbs[i].flags = lmb->flags;
+            lmbs[i].aa_index = lmb->aa_index;
+
+            dlpar_update_drconf_property(dn, prop);
+            break;
+        }
+    }
+
+    of_node_put(dn);
+    return 0;
+}
+
+static u32 lookup_lmb_associativity_index(struct of_drconf_cell *lmb)
+{
+    struct device_node *parent, *lmb_node, *dr_node;
+    const u32 *lmb_assoc;
+    const u32 *assoc_arrays;
+    u32 aa_index;
+    int aa_arrays, aa_array_entries, aa_array_sz;
+    int i;
+
+    parent = of_find_node_by_path("/");
+    if (!parent)
+        return -ENODEV;
+
+    lmb_node = dlpar_configure_connector(cpu_to_be32(lmb->drc_index),
+                         parent);
+    of_node_put(parent);
+    if (!lmb_node)
+        return -EINVAL;
+
+    lmb_assoc = of_get_property(lmb_node, "ibm,associativity", NULL);
+    if (!lmb_assoc) {
+        dlpar_free_cc_nodes(lmb_node);
+        return -ENODEV;
+    }
+
+    dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+    if (!dr_node) {
+        dlpar_free_cc_nodes(lmb_node);
+        return -ENODEV;
+    }
+
+    assoc_arrays = of_get_property(dr_node,
+                       "ibm,associativity-lookup-arrays",
+                       NULL);
+    of_node_put(dr_node);
+    if (!assoc_arrays) {
+        dlpar_free_cc_nodes(lmb_node);
+        return -ENODEV;
+    }
+
+    /* The ibm,associativity-lookup-arrays property is defined to be
+     * a 32-bit value specifying the number of associativity arrays
+     * followed by a 32-bit value specifying the number of entries per
+     * array, followed by the associativity arrays.
+     */
+    aa_arrays = be32_to_cpu(assoc_arrays[0]);
+    aa_array_entries = be32_to_cpu(assoc_arrays[1]);
+    aa_array_sz = aa_array_entries * sizeof(u32);
+
+    aa_index = -1;
+    for (i = 0; i < aa_arrays; i++) {
+        int indx = (i * aa_array_entries) + 2;
+
+        if (memcmp(&assoc_arrays[indx], &lmb_assoc[1], aa_array_sz))
+            continue;
+
+        aa_index = i;
+        break;
+    }
+
+    dlpar_free_cc_nodes(lmb_node);
+    return aa_index;
+}
+
+static int dlpar_add_device_tree_lmb(struct of_drconf_cell *lmb)
+{
+    int aa_index;
+
+    lmb->flags |= DRCONF_MEM_ASSIGNED;
+
+    aa_index = lookup_lmb_associativity_index(lmb);
+    if (aa_index < 0) {
+        pr_err("Couldn't find associativity index for drc index %x\n",
+               lmb->drc_index);
+        return aa_index;
+    }
+
+    lmb->aa_index = aa_index;
+    return dlpar_update_device_tree_lmb(lmb);
+}
+
+static int dlpar_remove_device_tree_lmb(struct of_drconf_cell *lmb)
+{
+    lmb->flags &= ~DRCONF_MEM_ASSIGNED;
+    lmb->aa_index = 0xffffffff;
+    return dlpar_update_device_tree_lmb(lmb);
+}
+
 static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb)
 {
     unsigned long section_nr;
···
     memblock_remove(lmb->base_addr, block_sz);
 
     dlpar_release_drc(lmb->drc_index);
+    dlpar_remove_device_tree_lmb(lmb);
 
-    lmb->flags &= ~DRCONF_MEM_ASSIGNED;
     return 0;
 }
 
···
 
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
-static int dlpar_add_lmb(struct of_drconf_cell *lmb)
+static int dlpar_add_lmb_memory(struct of_drconf_cell *lmb)
 {
     struct memory_block *mem_block;
     unsigned long block_sz;
     int nid, rc;
 
-    if (lmb->flags & DRCONF_MEM_ASSIGNED)
-        return -EINVAL;
-
     block_sz = memory_block_size_bytes();
-
-    rc = dlpar_acquire_drc(lmb->drc_index);
-    if (rc)
-        return rc;
 
     /* Find the node id for this address */
     nid = memory_add_physaddr_to_nid(lmb->base_addr);
 
     /* Add the memory */
     rc = add_memory(nid, lmb->base_addr, block_sz);
-    if (rc) {
-        dlpar_release_drc(lmb->drc_index);
+    if (rc)
         return rc;
-    }
 
     /* Register this block of memory */
     rc = memblock_add(lmb->base_addr, block_sz);
     if (rc) {
         remove_memory(nid, lmb->base_addr, block_sz);
-        dlpar_release_drc(lmb->drc_index);
         return rc;
     }
 
     mem_block = lmb_to_memblock(lmb);
     if (!mem_block) {
         remove_memory(nid, lmb->base_addr, block_sz);
-        dlpar_release_drc(lmb->drc_index);
         return -EINVAL;
     }
 
···
     put_device(&mem_block->dev);
     if (rc) {
         remove_memory(nid, lmb->base_addr, block_sz);
-        dlpar_release_drc(lmb->drc_index);
         return rc;
     }
 
     lmb->flags |= DRCONF_MEM_ASSIGNED;
     return 0;
+}
+
+static int dlpar_add_lmb(struct of_drconf_cell *lmb)
+{
+    int rc;
+
+    if (lmb->flags & DRCONF_MEM_ASSIGNED)
+        return -EINVAL;
+
+    rc = dlpar_acquire_drc(lmb->drc_index);
+    if (rc)
+        return rc;
+
+    rc = dlpar_add_device_tree_lmb(lmb);
+    if (rc) {
+        pr_err("Couldn't update device tree for drc index %x\n",
+               lmb->drc_index);
+        dlpar_release_drc(lmb->drc_index);
+        return rc;
+    }
+
+    rc = dlpar_add_lmb_memory(lmb);
+    if (rc) {
+        dlpar_remove_device_tree_lmb(lmb);
+        dlpar_release_drc(lmb->drc_index);
+    }
+
+    return rc;
 }
 
 static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop)
···
     return rc;
 }
 
-static void dlpar_update_drconf_property(struct device_node *dn,
-                     struct property *prop)
-{
-    struct of_drconf_cell *lmbs;
-    u32 num_lmbs, *p;
-    int i;
-
-    /* Convert the property back to BE */
-    p = prop->value;
-    num_lmbs = *p;
-    *p = cpu_to_be32(*p);
-    p++;
-
-    lmbs = (struct of_drconf_cell *)p;
-    for (i = 0; i < num_lmbs; i++) {
-        lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr);
-        lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index);
-        lmbs[i].flags = cpu_to_be32(lmbs[i].flags);
-    }
-
-    rtas_hp_event = true;
-    of_update_property(dn, prop);
-    rtas_hp_event = false;
-}
-
 int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
 {
     struct device_node *dn;
···
         break;
     }
 
-    if (rc)
-        dlpar_free_drconf_property(prop);
-    else
-        dlpar_update_drconf_property(dn, prop);
+    dlpar_free_drconf_property(prop);
 
 dlpar_memory_out:
     of_node_put(dn);
+12 -12
arch/powerpc/platforms/pseries/iommu.c
···
 static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail,
              struct ddw_query_response *query)
 {
-    struct eeh_dev *edev;
+    struct device_node *dn;
+    struct pci_dn *pdn;
     u32 cfg_addr;
     u64 buid;
     int ret;
···
      * Retrieve them from the pci device, not the node with the
      * dma-window property
      */
-    edev = pci_dev_to_eeh_dev(dev);
-    cfg_addr = edev->config_addr;
-    if (edev->pe_config_addr)
-        cfg_addr = edev->pe_config_addr;
-    buid = edev->phb->buid;
+    dn = pci_device_to_OF_node(dev);
+    pdn = PCI_DN(dn);
+    buid = pdn->phb->buid;
+    cfg_addr = (pdn->busno << 8) | pdn->devfn;
 
     ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query,
             cfg_addr, BUID_HI(buid), BUID_LO(buid));
···
               struct ddw_create_response *create, int page_shift,
               int window_shift)
 {
-    struct eeh_dev *edev;
+    struct device_node *dn;
+    struct pci_dn *pdn;
     u32 cfg_addr;
     u64 buid;
     int ret;
···
      * Retrieve them from the pci device, not the node with the
      * dma-window property
      */
-    edev = pci_dev_to_eeh_dev(dev);
-    cfg_addr = edev->config_addr;
-    if (edev->pe_config_addr)
-        cfg_addr = edev->pe_config_addr;
-    buid = edev->phb->buid;
+    dn = pci_device_to_OF_node(dev);
+    pdn = PCI_DN(dn);
+    buid = pdn->phb->buid;
+    cfg_addr = (pdn->busno << 8) | pdn->devfn;
 
     do {
         /* extra outputs are LIOBN and dma-addr (hi, lo) */
+12 -8
arch/powerpc/platforms/pseries/lpar.c
···
             "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
         return;
     }
+
+#ifdef CONFIG_PPC_STD_MMU_64
     /*
      * PAPR says this feature is SLB-Buffer but firmware never
      * reports that.  All SPLPAR support SLB shadow buffer.
      */
-    addr = __pa(paca[cpu].slb_shadow_ptr);
-    if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
+    if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) {
+        addr = __pa(paca[cpu].slb_shadow_ptr);
         ret = register_slb_shadow(hwcpu, addr);
         if (ret)
             pr_err("WARNING: SLB shadow buffer registration for "
                    "cpu %d (hw %d) of area %lx failed with %ld\n",
                    cpu, hwcpu, addr, ret);
     }
+#endif /* CONFIG_PPC_STD_MMU_64 */
 
     /*
      * Register dispatch trace log, if one has been allocated.
···
     }
 }
 
+#ifdef CONFIG_PPC_STD_MMU_64
+
 static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
                      unsigned long vpn, unsigned long pa,
                      unsigned long rflags, unsigned long vflags,
···
          hpte_group, vpn, pa, rflags, vflags, psize);
 
     hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
-    hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;
+    hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags;
 
     if (!(vflags & HPTE_V_BOLTED))
         pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
···
     /* I-cache synchronize  = 0 */
     /* Exact = 0 */
     flags = 0;
-
-    /* Make pHyp happy */
-    if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
-        hpte_r &= ~HPTE_R_M;
 
     if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
         flags |= H_COALESCE_CAND;
···
 
 void arch_free_page(struct page *page, int order)
 {
+    if (radix_enabled())
+        return;
     if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
         return;
 
···
 }
 EXPORT_SYMBOL(arch_free_page);
 
-#endif
+#endif /* CONFIG_PPC_SMLPAR */
+#endif /* CONFIG_PPC_STD_MMU_64 */
 
 #ifdef CONFIG_TRACEPOINTS
 #ifdef HAVE_JUMP_LABEL
+2 -1
arch/powerpc/platforms/pseries/lparcfg.c
···
     seq_printf(m, "shared_processor_mode=%d\n",
            lppaca_shared_proc(get_lppaca()));
 
+#ifdef CONFIG_PPC_STD_MMU_64
     seq_printf(m, "slb_size=%d\n", mmu_slb_size);
-
+#endif
     parse_em_data(m);
 
     return 0;
+2 -2
arch/powerpc/platforms/pseries/mobility.c
···
             break;
 
         case 0x80000000:
-            prop = of_find_property(dn, prop_name, NULL);
-            of_remove_property(dn, prop);
+            of_remove_property(dn, of_find_property(dn,
+                        prop_name, NULL));
             prop = NULL;
             break;
 
+2 -2
arch/powerpc/platforms/pseries/msi.c
···
     memset(&counts, 0, sizeof(struct msi_counts));
 
     /* Work out how many devices we have below this PE */
-    traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts);
+    pci_traverse_device_nodes(pe_dn, count_non_bridge_devices, &counts);
 
     if (counts.num_devices == 0) {
         pr_err("rtas_msi: found 0 devices under PE for %s\n",
···
     /* else, we have some more calculating to do */
     counts.requestor = pci_device_to_OF_node(dev);
     counts.request = request;
-    traverse_pci_devices(pe_dn, count_spare_msis, &counts);
+    pci_traverse_device_nodes(pe_dn, count_spare_msis, &counts);
 
     /* If the quota isn't an integer multiple of the total, we can
      * use the remainder as spare MSIs for anyone that wants them. */
-32
arch/powerpc/platforms/pseries/pci_dlpar.c
···
 
 #include "pseries.h"
 
-static struct pci_bus *
-find_bus_among_children(struct pci_bus *bus,
-            struct device_node *dn)
-{
-    struct pci_bus *child = NULL;
-    struct pci_bus *tmp;
-    struct device_node *busdn;
-
-    busdn = pci_bus_to_OF_node(bus);
-    if (busdn == dn)
-        return bus;
-
-    list_for_each_entry(tmp, &bus->children, node) {
-        child = find_bus_among_children(tmp, dn);
-        if (child)
-            break;
-    };
-    return child;
-}
-
-struct pci_bus *
-pcibios_find_pci_bus(struct device_node *dn)
-{
-    struct pci_dn *pdn = dn->data;
-
-    if (!pdn || !pdn->phb || !pdn->phb->bus)
-        return NULL;
-
-    return find_bus_among_children(pdn->phb->bus, dn);
-}
-EXPORT_SYMBOL_GPL(pcibios_find_pci_bus);
-
 struct pci_controller *init_phb_dynamic(struct device_node *dn)
 {
     struct pci_controller *phb;
+1 -4
arch/powerpc/platforms/pseries/reconfig.c
···
 {
     struct device_node *np;
     char *tmp;
-    struct property *prop;
     buf = parse_node(buf, bufsize, &np);
 
     if (!np)
···
     if (strlen(buf) == 0)
         return -EINVAL;
 
-    prop = of_find_property(np, buf, NULL);
-
-    return of_remove_property(np, prop);
+    return of_remove_property(np, of_find_property(np, buf, NULL));
 }
 
 static int do_update_property(char *buf, size_t bufsize)
+3 -1
arch/powerpc/platforms/pseries/setup.c
···
 
     for_each_node_by_name(np, "interrupt-controller") {
         typep = of_get_property(np, "compatible", NULL);
+        if (!typep)
+            continue;
         if (strstr(typep, "open-pic")) {
             pSeries_mpic_node = of_node_get(np);
             ppc_md.init_IRQ       = pseries_mpic_init_IRQ;
···
     pdn = parent ? PCI_DN(parent) : NULL;
     if (pdn) {
         /* Create pdn and EEH device */
-        update_dn_pci_info(np, pdn->phb);
+        pci_add_device_node_info(pdn->phb, np);
         eeh_dev_init(PCI_DN(np), pdn->phb);
     }
 
+24
arch/powerpc/sysdev/fsl_pci.c
···
 #include <asm/pci-bridge.h>
 #include <asm/ppc-pci.h>
 #include <asm/machdep.h>
+#include <asm/mpc85xx.h>
 #include <asm/disassemble.h>
 #include <asm/ppc-opcode.h>
 #include <sysdev/fsl_soc.h>
···
     u8 hdr_type, progif;
     struct device_node *dev;
     struct ccsr_pci __iomem *pci;
+    u16 temp;
+    u32 svr = mfspr(SPRN_SVR);
 
     dev = pdev->dev.of_node;
 
···
             PPC_INDIRECT_TYPE_SURPRESS_PRIMARY_BUS;
         if (fsl_pcie_check_link(hose))
             hose->indirect_type |= PPC_INDIRECT_TYPE_NO_PCIE_LINK;
+    } else {
+        /*
+         * Set PBFR(PCI Bus Function Register)[10] = 1 to
+         * disable the combining of crossing cacheline
+         * boundary requests into one burst transaction.
+         * PCI-X operation is not affected.
+         * Fix erratum PCI 5 on MPC8548
+         */
+#define PCI_BUS_FUNCTION 0x44
+#define PCI_BUS_FUNCTION_MDS 0x400  /* Master disable streaming */
+        if (((SVR_SOC_VER(svr) == SVR_8543) ||
+             (SVR_SOC_VER(svr) == SVR_8545) ||
+             (SVR_SOC_VER(svr) == SVR_8547) ||
+             (SVR_SOC_VER(svr) == SVR_8548)) &&
+            !early_find_capability(hose, 0, 0, PCI_CAP_ID_PCIX)) {
+            early_read_config_word(hose, 0, 0,
+                    PCI_BUS_FUNCTION, &temp);
+            temp |= PCI_BUS_FUNCTION_MDS;
+            early_write_config_word(hose, 0, 0,
+                    PCI_BUS_FUNCTION, temp);
+        }
     }
 
     printk(KERN_INFO "Found FSL PCI host bridge at 0x%016llx. "
+8 -1
arch/powerpc/sysdev/mpic.c
···
 
 static int mpic_init_sys(void)
 {
+    int rc;
+
     register_syscore_ops(&mpic_syscore_ops);
-    subsys_system_register(&mpic_subsys, NULL);
+    rc = subsys_system_register(&mpic_subsys, NULL);
+    if (rc) {
+        unregister_syscore_ops(&mpic_syscore_ops);
+        pr_err("mpic: Failed to register subsystem!\n");
+        return rc;
+    }
 
     return 0;
 }
+1 -1
arch/powerpc/xmon/Makefile
··· 7 7 8 8 ccflags-$(CONFIG_PPC64) := $(NO_MINIMAL_TOC) 9 9 10 - obj-y += xmon.o nonstdio.o 10 + obj-y += xmon.o nonstdio.o spr_access.o 11 11 12 12 ifdef CONFIG_XMON_DISASSEMBLY 13 13 obj-y += ppc-dis.o ppc-opc.o
+45
arch/powerpc/xmon/spr_access.S
··· 1 + #include <asm/ppc_asm.h> 2 + 3 + /* unsigned long xmon_mfspr(sprn, default_value) */ 4 + _GLOBAL(xmon_mfspr) 5 + ld r5, .Lmfspr_table@got(r2) 6 + b xmon_mxspr 7 + 8 + /* void xmon_mtspr(sprn, new_value) */ 9 + _GLOBAL(xmon_mtspr) 10 + ld r5, .Lmtspr_table@got(r2) 11 + b xmon_mxspr 12 + 13 + /* 14 + * r3 = sprn 15 + * r4 = default or new value 16 + * r5 = table base 17 + */ 18 + xmon_mxspr: 19 + /* 20 + * To index into the table of mxsprs we need: 21 + * i = (sprn & 0x3ff) * 8 22 + * or using rlwinm: 23 + * i = (sprn << 3) & (0x3ff << 3) 24 + */ 25 + rlwinm r3, r3, 3, 0x3ff << 3 26 + add r5, r5, r3 27 + mtctr r5 28 + mr r3, r4 /* put default_value in r3 for mfspr */ 29 + bctr 30 + 31 + .Lmfspr_table: 32 + spr = 0 33 + .rept 1024 34 + mfspr r3, spr 35 + blr 36 + spr = spr + 1 37 + .endr 38 + 39 + .Lmtspr_table: 40 + spr = 0 41 + .rept 1024 42 + mtspr spr, r4 43 + blr 44 + spr = spr + 1 45 + .endr
+78 -62
arch/powerpc/xmon/xmon.c
··· 86 86 87 87 static long bus_error_jmp[JMP_BUF_LEN]; 88 88 static int catch_memory_errors; 89 + static int catch_spr_faults; 89 90 static long *xmon_fault_jmp[NR_CPUS]; 90 91 91 92 /* Breakpoint stuff */ ··· 148 147 static void flush_input(void); 149 148 static int inchar(void); 150 149 static void take_input(char *); 151 - static unsigned long read_spr(int); 150 + static int read_spr(int, unsigned long *); 152 151 static void write_spr(int, unsigned long); 153 152 static void super_regs(void); 154 153 static void remove_bpts(void); ··· 251 250 sdi # disassemble spu local store for spu # (in hex)\n" 252 251 #endif 253 252 " S print special registers\n\ 253 + Sa print all SPRs\n\ 254 + Sr # read SPR #\n\ 255 + Sw #v write v to SPR #\n\ 254 256 t print backtrace\n\ 255 257 x exit monitor and recover\n\ 256 258 X exit monitor and don't recover\n" ··· 446 442 #ifdef CONFIG_SMP 447 443 cpu = smp_processor_id(); 448 444 if (cpumask_test_cpu(cpu, &cpus_in_xmon)) { 445 + /* 446 + * We catch SPR read/write faults here because the 0x700, 0xf60 447 + * etc. handlers don't call debugger_fault_handler(). 
448 + */ 449 + if (catch_spr_faults) 450 + longjmp(bus_error_jmp, 1); 449 451 get_output_lock(); 450 452 excprint(regs); 451 453 printf("cpu 0x%x: Exception %lx %s in xmon, " ··· 1645 1635 catch_memory_errors = 0; 1646 1636 } 1647 1637 1648 - static unsigned long 1649 - read_spr(int n) 1638 + extern unsigned long xmon_mfspr(int spr, unsigned long default_value); 1639 + extern void xmon_mtspr(int spr, unsigned long value); 1640 + 1641 + static int 1642 + read_spr(int n, unsigned long *vp) 1650 1643 { 1651 - unsigned int instrs[2]; 1652 - unsigned long (*code)(void); 1653 1644 unsigned long ret = -1UL; 1654 - #ifdef CONFIG_PPC64 1655 - unsigned long opd[3]; 1656 - 1657 - opd[0] = (unsigned long)instrs; 1658 - opd[1] = 0; 1659 - opd[2] = 0; 1660 - code = (unsigned long (*)(void)) opd; 1661 - #else 1662 - code = (unsigned long (*)(void)) instrs; 1663 - #endif 1664 - 1665 - /* mfspr r3,n; blr */ 1666 - instrs[0] = 0x7c6002a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); 1667 - instrs[1] = 0x4e800020; 1668 - store_inst(instrs); 1669 - store_inst(instrs+1); 1645 + int ok = 0; 1670 1646 1671 1647 if (setjmp(bus_error_jmp) == 0) { 1672 - catch_memory_errors = 1; 1648 + catch_spr_faults = 1; 1673 1649 sync(); 1674 1650 1675 - ret = code(); 1651 + ret = xmon_mfspr(n, *vp); 1676 1652 1677 1653 sync(); 1678 - /* wait a little while to see if we get a machine check */ 1679 - __delay(200); 1680 - n = size; 1654 + *vp = ret; 1655 + ok = 1; 1681 1656 } 1657 + catch_spr_faults = 0; 1682 1658 1683 - return ret; 1659 + return ok; 1684 1660 } 1685 1661 1686 1662 static void 1687 1663 write_spr(int n, unsigned long val) 1688 1664 { 1689 - unsigned int instrs[2]; 1690 - unsigned long (*code)(unsigned long); 1691 - #ifdef CONFIG_PPC64 1692 - unsigned long opd[3]; 1693 - 1694 - opd[0] = (unsigned long)instrs; 1695 - opd[1] = 0; 1696 - opd[2] = 0; 1697 - code = (unsigned long (*)(unsigned long)) opd; 1698 - #else 1699 - code = (unsigned long (*)(unsigned long)) instrs; 1700 - #endif 1701 - 
1702 - instrs[0] = 0x7c6003a6 + ((n & 0x1F) << 16) + ((n & 0x3e0) << 6); 1703 - instrs[1] = 0x4e800020; 1704 - store_inst(instrs); 1705 - store_inst(instrs+1); 1706 - 1707 1665 if (setjmp(bus_error_jmp) == 0) { 1708 - catch_memory_errors = 1; 1666 + catch_spr_faults = 1; 1709 1667 sync(); 1710 1668 1711 - code(val); 1669 + xmon_mtspr(n, val); 1712 1670 1713 1671 sync(); 1714 - /* wait a little while to see if we get a machine check */ 1715 - __delay(200); 1716 - n = size; 1672 + } else { 1673 + printf("SPR 0x%03x (%4d) Faulted during write\n", n, n); 1717 1674 } 1675 + catch_spr_faults = 0; 1718 1676 } 1719 1677 1720 1678 static unsigned long regno; 1721 1679 extern char exc_prolog; 1722 1680 extern char dec_exc; 1723 1681 1682 + static void dump_one_spr(int spr, bool show_unimplemented) 1683 + { 1684 + unsigned long val; 1685 + 1686 + val = 0xdeadbeef; 1687 + if (!read_spr(spr, &val)) { 1688 + printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr); 1689 + return; 1690 + } 1691 + 1692 + if (val == 0xdeadbeef) { 1693 + /* Looks like read was a nop, confirm */ 1694 + val = 0x0badcafe; 1695 + if (!read_spr(spr, &val)) { 1696 + printf("SPR 0x%03x (%4d) Faulted during read\n", spr, spr); 1697 + return; 1698 + } 1699 + 1700 + if (val == 0x0badcafe) { 1701 + if (show_unimplemented) 1702 + printf("SPR 0x%03x (%4d) Unimplemented\n", spr, spr); 1703 + return; 1704 + } 1705 + } 1706 + 1707 + printf("SPR 0x%03x (%4d) = 0x%lx\n", spr, spr, val); 1708 + } 1709 + 1724 1710 static void super_regs(void) 1725 1711 { 1726 1712 int cmd; 1727 - unsigned long val; 1713 + int spr; 1728 1714 1729 1715 cmd = skipbl(); 1730 - if (cmd == '\n') { 1716 + 1717 + switch (cmd) { 1718 + case '\n': { 1731 1719 unsigned long sp, toc; 1732 1720 asm("mr %0,1" : "=r" (sp) :); 1733 1721 asm("mr %0,2" : "=r" (toc) :); ··· 1738 1730 mfspr(SPRN_DEC), mfspr(SPRN_SPRG2)); 1739 1731 printf("sp = "REG" sprg3= "REG"\n", sp, mfspr(SPRN_SPRG3)); 1740 1732 printf("toc = "REG" dar = "REG"\n", toc, 
mfspr(SPRN_DAR)); 1741 - 1742 1733 return; 1743 1734 } 1744 - 1745 - scanhex(&regno); 1746 - switch (cmd) { 1747 - case 'w': 1748 - val = read_spr(regno); 1735 + case 'w': { 1736 + unsigned long val; 1737 + scanhex(&regno); 1738 + val = 0; 1739 + read_spr(regno, &val); 1749 1740 scanhex(&val); 1750 1741 write_spr(regno, val); 1751 - /* fall through */ 1752 - case 'r': 1753 - printf("spr %lx = %lx\n", regno, read_spr(regno)); 1742 + dump_one_spr(regno, true); 1754 1743 break; 1755 1744 } 1745 + case 'r': 1746 + scanhex(&regno); 1747 + dump_one_spr(regno, true); 1748 + break; 1749 + case 'a': 1750 + /* dump ALL SPRs */ 1751 + for (spr = 1; spr < 1024; ++spr) 1752 + dump_one_spr(spr, false); 1753 + break; 1754 + } 1755 + 1756 1756 scannl(); 1757 1757 } 1758 1758 ··· 2929 2913 printf("%s", after); 2930 2914 } 2931 2915 2932 - #ifdef CONFIG_PPC_BOOK3S_64 2916 + #ifdef CONFIG_PPC_STD_MMU_64 2933 2917 void dump_segments(void) 2934 2918 { 2935 2919 int i;
+1 -1
drivers/cpufreq/pmac32-cpufreq.c
··· 300 300 _set_L3CR(save_l3cr); 301 301 302 302 /* Restore userland MMU context */ 303 - switch_mmu_context(NULL, current->active_mm); 303 + switch_mmu_context(NULL, current->active_mm, NULL); 304 304 305 305 #ifdef DEBUG_FREQ 306 306 printk(KERN_DEBUG "HID1, after: %x\n", mfspr(SPRN_HID1));
+1 -4
drivers/infiniband/hw/qib/qib_file_ops.c
··· 824 824 phys = dd->physaddr + piobufs; 825 825 826 826 #if defined(__powerpc__) 827 - /* There isn't a generic way to specify writethrough mappings */ 828 - pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE; 829 - pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU; 830 - pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED; 827 + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); 831 828 #endif 832 829 833 830 /*
-6
drivers/infiniband/hw/qib/qib_pcie.c
··· 144 144 addr = pci_resource_start(pdev, 0); 145 145 len = pci_resource_len(pdev, 0); 146 146 147 - #if defined(__powerpc__) 148 - /* There isn't a generic way to specify writethrough mappings */ 149 - dd->kregbase = __ioremap(addr, len, _PAGE_NO_CACHE | _PAGE_WRITETHRU); 150 - #else 151 147 dd->kregbase = ioremap_nocache(addr, len); 152 - #endif 153 - 154 148 if (!dd->kregbase) 155 149 return -ENOMEM; 156 150
+3 -2
drivers/macintosh/rack-meter.c
··· 154 154 DBDMA_DO_STOP(rm->dma_regs); 155 155 return; 156 156 } 157 - memset(rdma->buf1, 0, SAMPLE_COUNT & sizeof(u32)); 158 - memset(rdma->buf2, 0, SAMPLE_COUNT & sizeof(u32)); 157 + memset(rdma->buf1, 0, sizeof(rdma->buf1)); 158 + memset(rdma->buf2, 0, sizeof(rdma->buf2)); 159 159 160 160 rm->dma_buf_v->mark = 0; 161 161 ··· 227 227 228 228 total_idle_ticks = get_cpu_idle_time(cpu); 229 229 idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle); 230 + idle_ticks = min(idle_ticks, total_ticks); 230 231 rcpu->prev_idle = total_idle_ticks; 231 232 232 233 /* We do a very dumb calculation to update the LEDs for now,
+2 -2
drivers/macintosh/via-pmu.c
··· 1851 1851 _set_L2CR(save_l2cr); 1852 1852 1853 1853 /* Restore userland MMU context */ 1854 - switch_mmu_context(NULL, current->active_mm); 1854 + switch_mmu_context(NULL, current->active_mm, NULL); 1855 1855 1856 1856 /* Power things up */ 1857 1857 pmu_unlock(); ··· 1940 1940 _set_L3CR(save_l3cr); 1941 1941 1942 1942 /* Restore userland MMU context */ 1943 - switch_mmu_context(NULL, current->active_mm); 1943 + switch_mmu_context(NULL, current->active_mm, NULL); 1944 1944 1945 1945 /* Tell PMU we are ready */ 1946 1946 pmu_unlock();
+19 -9
drivers/misc/cxl/api.c
··· 68 68 } 69 69 EXPORT_SYMBOL_GPL(cxl_get_context); 70 70 71 - struct device *cxl_get_phys_dev(struct pci_dev *dev) 72 - { 73 - struct cxl_afu *afu; 74 - 75 - afu = cxl_pci_to_afu(dev); 76 - 77 - return afu->adapter->dev.parent; 78 - } 79 - 80 71 int cxl_release_context(struct cxl_context *ctx) 81 72 { 82 73 if (ctx->status >= STARTED) ··· 183 192 ctx->pid = get_task_pid(task, PIDTYPE_PID); 184 193 ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID); 185 194 kernel = false; 195 + ctx->real_mode = false; 186 196 } 187 197 188 198 cxl_ctx_get(); ··· 219 227 ctx->master = true; 220 228 } 221 229 EXPORT_SYMBOL_GPL(cxl_set_master); 230 + 231 + int cxl_set_translation_mode(struct cxl_context *ctx, bool real_mode) 232 + { 233 + if (ctx->status == STARTED) { 234 + /* 235 + * We could potentially update the PE and issue an update LLCMD 236 + * to support this, but it doesn't seem to have a good use case 237 + * since it's trivial to just create a second kernel context 238 + * with different translation modes, so until someone convinces 239 + * me otherwise: 240 + */ 241 + return -EBUSY; 242 + } 243 + 244 + ctx->real_mode = real_mode; 245 + return 0; 246 + } 247 + EXPORT_SYMBOL_GPL(cxl_set_translation_mode); 222 248 223 249 /* wrappers around afu_* file ops which are EXPORTED */ 224 250 int cxl_fd_open(struct inode *inode, struct file *file)
+1 -2
drivers/misc/cxl/context.c
··· 297 297 if (ctx->kernelapi) 298 298 kfree(ctx->mapping); 299 299 300 - if (ctx->irq_bitmap) 301 - kfree(ctx->irq_bitmap); 300 + kfree(ctx->irq_bitmap); 302 301 303 302 /* Drop ref to the afu device taken during cxl_context_init */ 304 303 cxl_afu_put(ctx->afu);
+5 -10
drivers/misc/cxl/cxl.h
··· 178 178 #define CXL_PSL_SR_An_MP (1ull << (63-62)) /* Master Process */ 179 179 #define CXL_PSL_SR_An_LE (1ull << (63-63)) /* Little Endian */ 180 180 181 - /****** CXL_PSL_LLCMD_An ****************************************************/ 182 - #define CXL_LLCMD_TERMINATE 0x0001000000000000ULL 183 - #define CXL_LLCMD_REMOVE 0x0002000000000000ULL 184 - #define CXL_LLCMD_SUSPEND 0x0003000000000000ULL 185 - #define CXL_LLCMD_RESUME 0x0004000000000000ULL 186 - #define CXL_LLCMD_ADD 0x0005000000000000ULL 187 - #define CXL_LLCMD_UPDATE 0x0006000000000000ULL 188 - #define CXL_LLCMD_HANDLE_MASK 0x000000000000ffffULL 189 - 190 181 /****** CXL_PSL_ID_An ****************************************************/ 191 182 #define CXL_PSL_ID_An_F (1ull << (63-31)) 192 183 #define CXL_PSL_ID_An_L (1ull << (63-30)) ··· 367 376 }; 368 377 369 378 struct cxl_afu_guest { 379 + struct cxl_afu *parent; 370 380 u64 handle; 371 381 phys_addr_t p2n_phys; 372 382 u64 p2n_size; 373 383 int max_ints; 374 - struct mutex recovery_lock; 384 + bool handle_err; 385 + struct delayed_work work_err; 375 386 int previous_state; 376 387 }; 377 388 ··· 517 524 bool pe_inserted; 518 525 bool master; 519 526 bool kernel; 527 + bool real_mode; 520 528 bool pending_irq; 521 529 bool pending_fault; 522 530 bool pending_afu_err; ··· 574 580 bool perst_loads_image; 575 581 bool perst_select_user; 576 582 bool perst_same_image; 583 + bool psl_timebase_synced; 577 584 }; 578 585 579 586 int cxl_pci_alloc_one_irq(struct cxl *adapter);
+6 -4
drivers/misc/cxl/fault.c
··· 149 149 * update_mmu_cache() will not have loaded the hash since current->trap 150 150 * is not a 0x400 or 0x300, so just call hash_page_mm() here. 151 151 */ 152 - access = _PAGE_PRESENT; 152 + access = _PAGE_PRESENT | _PAGE_READ; 153 153 if (dsisr & CXL_PSL_DSISR_An_S) 154 - access |= _PAGE_RW; 155 - if ((!ctx->kernel) || ~(dar & (1ULL << 63))) 156 - access |= _PAGE_USER; 154 + access |= _PAGE_WRITE; 155 + 156 + access |= _PAGE_PRIVILEGED; 157 + if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID)) 158 + access &= ~_PAGE_PRIVILEGED; 157 159 158 160 if (dsisr & DSISR_NOHPTE) 159 161 inv_flags |= HPTE_NOHPTE_UPDATE;
+51 -27
drivers/misc/cxl/guest.c
··· 178 178 u64 state; 179 179 int rc = 0; 180 180 181 + if (!afu) 182 + return -EIO; 183 + 181 184 rc = cxl_h_read_error_state(afu->guest->handle, &state); 182 185 if (!rc) { 183 186 WARN_ON(state != H_STATE_NORMAL && ··· 555 552 556 553 elem->common.sstp0 = cpu_to_be64(ctx->sstp0); 557 554 elem->common.sstp1 = cpu_to_be64(ctx->sstp1); 555 + 556 + /* 557 + * Ensure we have at least one interrupt allocated to take faults for 558 + * kernel contexts that may not have allocated any AFU IRQs at all: 559 + */ 560 + if (ctx->irqs.range[0] == 0) { 561 + rc = afu_register_irqs(ctx, 0); 562 + if (rc) 563 + goto out_free; 564 + } 565 + 558 566 for (r = 0; r < CXL_IRQ_RANGES; r++) { 559 567 for (i = 0; i < ctx->irqs.range[r]; i++) { 560 568 if (r == 0 && i == 0) { ··· 611 597 enable_afu_irqs(ctx); 612 598 } 613 599 600 + out_free: 614 601 free_page((u64)elem); 615 602 return rc; 616 603 } ··· 619 604 static int guest_attach_process(struct cxl_context *ctx, bool kernel, u64 wed, u64 amr) 620 605 { 621 606 pr_devel("in %s\n", __func__); 607 + 608 + if (ctx->real_mode) 609 + return -EPERM; 622 610 623 611 ctx->kernel = kernel; 624 612 if (ctx->afu->current_mode == CXL_MODE_DIRECTED) ··· 836 818 switch (cur_state) { 837 819 case H_STATE_NORMAL: 838 820 afu->guest->previous_state = cur_state; 839 - rc = 1; 840 821 break; 841 822 842 823 case H_STATE_DISABLE: ··· 851 834 pci_error_handlers(afu, CXL_SLOT_RESET_EVENT, 852 835 pci_channel_io_normal); 853 836 pci_error_handlers(afu, CXL_RESUME_EVENT, 0); 854 - rc = 1; 855 837 } 856 838 afu->guest->previous_state = 0; 857 839 break; ··· 875 859 return rc; 876 860 } 877 861 878 - static int afu_do_recovery(struct cxl_afu *afu) 862 + static void afu_handle_errstate(struct work_struct *work) 879 863 { 880 - int rc; 864 + struct cxl_afu_guest *afu_guest = 865 + container_of(to_delayed_work(work), struct cxl_afu_guest, work_err); 881 866 882 - /* many threads can arrive here, in case of detach_all for example. 
883 - * Only one needs to drive the recovery 884 - */ 885 - if (mutex_trylock(&afu->guest->recovery_lock)) { 886 - rc = afu_update_state(afu); 887 - mutex_unlock(&afu->guest->recovery_lock); 888 - return rc; 889 - } 890 - return 0; 867 + if (!afu_update_state(afu_guest->parent) && 868 + afu_guest->previous_state == H_STATE_PERM_UNAVAILABLE) 869 + return; 870 + 871 + if (afu_guest->handle_err == true) 872 + schedule_delayed_work(&afu_guest->work_err, 873 + msecs_to_jiffies(3000)); 891 874 } 892 875 893 876 static bool guest_link_ok(struct cxl *cxl, struct cxl_afu *afu) 894 877 { 895 878 int state; 896 879 897 - if (afu) { 898 - if (afu_read_error_state(afu, &state) || 899 - state != H_STATE_NORMAL) { 900 - if (afu_do_recovery(afu) > 0) { 901 - /* check again in case we've just fixed it */ 902 - if (!afu_read_error_state(afu, &state) && 903 - state == H_STATE_NORMAL) 904 - return true; 905 - } 906 - return false; 907 - } 880 + if (afu && (!afu_read_error_state(afu, &state))) { 881 + if (state == H_STATE_NORMAL) 882 + return true; 908 883 } 909 884 910 - return true; 885 + return false; 911 886 } 912 887 913 888 static int afu_properties_look_ok(struct cxl_afu *afu) ··· 935 928 kfree(afu); 936 929 return -ENOMEM; 937 930 } 938 - 939 - mutex_init(&afu->guest->recovery_lock); 940 931 941 932 if ((rc = dev_set_name(&afu->dev, "afu%i.%i", 942 933 adapter->adapter_num, ··· 991 986 992 987 afu->enabled = true; 993 988 989 + /* 990 + * wake up the cpu periodically to check the state 991 + * of the AFU using "afu" stored in the guest structure. 
992 + */ 993 + afu->guest->parent = afu; 994 + afu->guest->handle_err = true; 995 + INIT_DELAYED_WORK(&afu->guest->work_err, afu_handle_errstate); 996 + schedule_delayed_work(&afu->guest->work_err, msecs_to_jiffies(1000)); 997 + 994 998 if ((rc = cxl_pci_vphb_add(afu))) 995 999 dev_info(&afu->dev, "Can't register vPHB\n"); 996 1000 ··· 1027 1013 1028 1014 if (!afu) 1029 1015 return; 1016 + 1017 + /* flush and stop pending job */ 1018 + afu->guest->handle_err = false; 1019 + flush_delayed_work(&afu->guest->work_err); 1030 1020 1031 1021 cxl_pci_vphb_remove(afu); 1032 1022 cxl_sysfs_afu_remove(afu); ··· 1118 1100 adapter->dev.parent = &pdev->dev; 1119 1101 adapter->dev.release = release_adapter; 1120 1102 dev_set_drvdata(&pdev->dev, adapter); 1103 + 1104 + /* 1105 + * Hypervisor controls PSL timebase initialization (p1 register). 1106 + * On FW840, PSL is initialized. 1107 + */ 1108 + adapter->psl_timebase_synced = true; 1121 1109 1122 1110 if ((rc = cxl_of_read_adapter_handle(adapter, np))) 1123 1111 goto err1;
+24 -5
drivers/misc/cxl/native.c
··· 186 186 187 187 int cxl_alloc_spa(struct cxl_afu *afu) 188 188 { 189 + unsigned spa_size; 190 + 189 191 /* Work out how many pages to allocate */ 190 192 afu->native->spa_order = 0; 191 193 do { 192 194 afu->native->spa_order++; 193 - afu->native->spa_size = (1 << afu->native->spa_order) * PAGE_SIZE; 195 + spa_size = (1 << afu->native->spa_order) * PAGE_SIZE; 196 + 197 + if (spa_size > 0x100000) { 198 + dev_warn(&afu->dev, "num_of_processes too large for the SPA, limiting to %i (0x%x)\n", 199 + afu->native->spa_max_procs, afu->native->spa_size); 200 + afu->num_procs = afu->native->spa_max_procs; 201 + break; 202 + } 203 + 204 + afu->native->spa_size = spa_size; 194 205 afu->native->spa_max_procs = spa_max_procs(afu->native->spa_size); 195 206 } while (afu->native->spa_max_procs < afu->num_procs); 196 - 197 - WARN_ON(afu->native->spa_size > 0x100000); /* Max size supported by the hardware */ 198 207 199 208 if (!(afu->native->spa = (struct cxl_process_element *) 200 209 __get_free_pages(GFP_KERNEL | __GFP_ZERO, afu->native->spa_order))) { ··· 495 486 if (mfspr(SPRN_LPCR) & LPCR_TC) 496 487 sr |= CXL_PSL_SR_An_TC; 497 488 if (ctx->kernel) { 498 - sr |= CXL_PSL_SR_An_R | (mfmsr() & MSR_SF); 499 - sr |= CXL_PSL_SR_An_HV; 489 + if (!ctx->real_mode) 490 + sr |= CXL_PSL_SR_An_R; 491 + sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV; 500 492 } else { 501 493 sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R; 502 494 sr &= ~(CXL_PSL_SR_An_HV); ··· 535 525 536 526 ctx->elem->common.sstp0 = cpu_to_be64(ctx->sstp0); 537 527 ctx->elem->common.sstp1 = cpu_to_be64(ctx->sstp1); 528 + 529 + /* 530 + * Ensure we have the multiplexed PSL interrupt set up to take faults 531 + * for kernel contexts that may not have allocated any AFU IRQs at all: 532 + */ 533 + if (ctx->irqs.range[0] == 0) { 534 + ctx->irqs.offset[0] = ctx->afu->native->psl_hwirq; 535 + ctx->irqs.range[0] = 1; 536 + } 538 537 539 538 for (r = 0; r < CXL_IRQ_RANGES; r++) { 540 539 ctx->elem->ivte_offsets[r] = 
cpu_to_be16(ctx->irqs.offset[r]);
+54 -10
drivers/misc/cxl/pci.c
··· 21 21 #include <asm/msi_bitmap.h> 22 22 #include <asm/pnv-pci.h> 23 23 #include <asm/io.h> 24 + #include <asm/reg.h> 24 25 25 26 #include "cxl.h" 26 27 #include <misc/cxl.h> ··· 322 321 #undef show_reg 323 322 } 324 323 324 + #define CAPP_UNIT0_ID 0xBA 325 + #define CAPP_UNIT1_ID 0XBE 326 + 327 + static u64 get_capp_unit_id(struct device_node *np) 328 + { 329 + u32 phb_index; 330 + 331 + /* 332 + * For chips other than POWER8NVL, we only have CAPP 0, 333 + * irrespective of which PHB is used. 334 + */ 335 + if (!pvr_version_is(PVR_POWER8NVL)) 336 + return CAPP_UNIT0_ID; 337 + 338 + /* 339 + * For POWER8NVL, assume CAPP 0 is attached to PHB0 and 340 + * CAPP 1 is attached to PHB1. 341 + */ 342 + if (of_property_read_u32(np, "ibm,phb-index", &phb_index)) 343 + return 0; 344 + 345 + if (phb_index == 0) 346 + return CAPP_UNIT0_ID; 347 + 348 + if (phb_index == 1) 349 + return CAPP_UNIT1_ID; 350 + 351 + return 0; 352 + } 353 + 325 354 static int init_implementation_adapter_regs(struct cxl *adapter, struct pci_dev *dev) 326 355 { 327 356 struct device_node *np; 328 357 const __be32 *prop; 329 358 u64 psl_dsnctl; 330 359 u64 chipid; 360 + u64 capp_unit_id; 331 361 332 362 if (!(np = pnv_pci_get_phb_node(dev))) 333 363 return -ENODEV; ··· 368 336 if (!np) 369 337 return -ENODEV; 370 338 chipid = be32_to_cpup(prop); 339 + capp_unit_id = get_capp_unit_id(np); 371 340 of_node_put(np); 341 + if (!capp_unit_id) { 342 + pr_err("cxl: invalid capp unit id\n"); 343 + return -ENODEV; 344 + } 372 345 346 + psl_dsnctl = 0x0000900000000000ULL; /* pteupd ttype, scdone */ 347 + psl_dsnctl |= (0x2ULL << (63-38)); /* MMIO hang pulse: 256 us */ 373 348 /* Tell PSL where to route data to */ 374 - psl_dsnctl = 0x02E8900002000000ULL | (chipid << (63-5)); 349 + psl_dsnctl |= (chipid << (63-5)); 350 + psl_dsnctl |= (capp_unit_id << (63-13)); 351 + 375 352 cxl_p1_write(adapter, CXL_PSL_DSNDCTL, psl_dsnctl); 376 353 cxl_p1_write(adapter, CXL_PSL_RESLCKTO, 0x20000000200ULL); 377 354 /* snoop 
write mask */ ··· 396 355 #define TBSYNC_CNT(n) (((u64)n & 0x7) << (63-6)) 397 356 #define _2048_250MHZ_CYCLES 1 398 357 399 - static int cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev) 358 + static void cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev) 400 359 { 401 360 u64 psl_tb; 402 361 int delta; 403 362 unsigned int retry = 0; 404 363 struct device_node *np; 405 364 365 + adapter->psl_timebase_synced = false; 366 + 406 367 if (!(np = pnv_pci_get_phb_node(dev))) 407 - return -ENODEV; 368 + return; 408 369 409 370 /* Do not fail when CAPP timebase sync is not supported by OPAL */ 410 371 of_node_get(np); 411 372 if (! of_get_property(np, "ibm,capp-timebase-sync", NULL)) { 412 373 of_node_put(np); 413 - pr_err("PSL: Timebase sync: OPAL support missing\n"); 414 - return 0; 374 + dev_info(&dev->dev, "PSL timebase inactive: OPAL support missing\n"); 375 + return; 415 376 } 416 377 of_node_put(np); 417 378 ··· 432 389 do { 433 390 msleep(1); 434 391 if (retry++ > 5) { 435 - pr_err("PSL: Timebase sync: giving up!\n"); 436 - return -EIO; 392 + dev_info(&dev->dev, "PSL timebase can't synchronize\n"); 393 + return; 437 394 } 438 395 psl_tb = cxl_p1_read(adapter, CXL_PSL_Timebase); 439 396 delta = mftb() - psl_tb; ··· 441 398 delta = -delta; 442 399 } while (tb_to_ns(delta) > 16000); 443 400 444 - return 0; 401 + adapter->psl_timebase_synced = true; 402 + return; 445 403 } 446 404 447 405 static int init_implementation_afu_regs(struct cxl_afu *afu) ··· 1188 1144 if ((rc = pnv_phb_to_cxl_mode(dev, OPAL_PHB_CAPI_MODE_SNOOP_ON))) 1189 1145 goto err; 1190 1146 1191 - if ((rc = cxl_setup_psl_timebase(adapter, dev))) 1192 - goto err; 1147 + /* Ignore error, adapter init is not dependant on timebase sync */ 1148 + cxl_setup_psl_timebase(adapter, dev); 1193 1149 1194 1150 if ((rc = cxl_native_register_psl_err_irq(adapter))) 1195 1151 goto err;
+10
drivers/misc/cxl/sysfs.c
··· 57 57 return scnprintf(buf, PAGE_SIZE, "factory\n"); 58 58 } 59 59 60 + static ssize_t psl_timebase_synced_show(struct device *device, 61 + struct device_attribute *attr, 62 + char *buf) 63 + { 64 + struct cxl *adapter = to_cxl_adapter(device); 65 + 66 + return scnprintf(buf, PAGE_SIZE, "%i\n", adapter->psl_timebase_synced); 67 + } 68 + 60 69 static ssize_t reset_adapter_store(struct device *device, 61 70 struct device_attribute *attr, 62 71 const char *buf, size_t count) ··· 151 142 __ATTR_RO(psl_revision), 152 143 __ATTR_RO(base_image), 153 144 __ATTR_RO(image_loaded), 145 + __ATTR_RO(psl_timebase_synced), 154 146 __ATTR_RW(load_image_on_perst), 155 147 __ATTR_RW(perst_reloads_same_image), 156 148 __ATTR(reset, S_IWUSR, NULL, reset_adapter_store),
+3
drivers/of/base.c
··· 1777 1777 unsigned long flags; 1778 1778 int rc; 1779 1779 1780 + if (!prop) 1781 + return -ENODEV; 1782 + 1780 1783 mutex_lock(&of_mutex); 1781 1784 1782 1785 raw_spin_lock_irqsave(&devtree_lock, flags);
+4 -4
drivers/pci/hotplug/rpadlpar_core.c
··· 175 175 struct pci_dev *dev; 176 176 struct pci_controller *phb; 177 177 178 - if (pcibios_find_pci_bus(dn)) 178 + if (pci_find_bus_by_node(dn)) 179 179 return -EINVAL; 180 180 181 181 /* Add pci bus */ ··· 212 212 struct pci_dn *pdn; 213 213 int rc = 0; 214 214 215 - if (!pcibios_find_pci_bus(dn)) 215 + if (!pci_find_bus_by_node(dn)) 216 216 return -EINVAL; 217 217 218 218 /* If pci slot is hotpluggable, use hotplug to remove it */ ··· 356 356 357 357 pci_lock_rescan_remove(); 358 358 359 - bus = pcibios_find_pci_bus(dn); 359 + bus = pci_find_bus_by_node(dn); 360 360 if (!bus) { 361 361 ret = -EINVAL; 362 362 goto out; ··· 380 380 } 381 381 382 382 /* Remove all devices below slot */ 383 - pcibios_remove_pci_devices(bus); 383 + pci_hp_remove_devices(bus); 384 384 385 385 /* Unmap PCI IO space */ 386 386 if (pcibios_unmap_io_space(bus)) {
+2 -2
drivers/pci/hotplug/rpaphp_core.c
··· 404 404 405 405 if (state == PRESENT) { 406 406 pci_lock_rescan_remove(); 407 - pcibios_add_pci_devices(slot->bus); 407 + pci_hp_add_devices(slot->bus); 408 408 pci_unlock_rescan_remove(); 409 409 slot->state = CONFIGURED; 410 410 } else if (state == EMPTY) { ··· 426 426 return -EINVAL; 427 427 428 428 pci_lock_rescan_remove(); 429 - pcibios_remove_pci_devices(slot->bus); 429 + pci_hp_remove_devices(slot->bus); 430 430 pci_unlock_rescan_remove(); 431 431 vm_unmap_aliases(); 432 432
+2 -2
drivers/pci/hotplug/rpaphp_pci.c
··· 93 93 if (rc) 94 94 return rc; 95 95 96 - bus = pcibios_find_pci_bus(slot->dn); 96 + bus = pci_find_bus_by_node(slot->dn); 97 97 if (!bus) { 98 98 err("%s: no pci_bus for dn %s\n", __func__, slot->dn->full_name); 99 99 return -EINVAL; ··· 116 116 } 117 117 118 118 if (list_empty(&bus->devices)) 119 - pcibios_add_pci_devices(bus); 119 + pci_hp_add_devices(bus); 120 120 121 121 if (!list_empty(&bus->devices)) { 122 122 info->adapter_status = CONFIGURED;
+1 -1
drivers/pcmcia/electra_cf.c
··· 228 228 229 229 if (!cf->mem_base || !cf->io_virt || !cf->gpio_base || 230 230 (__ioremap_at(io.start, cf->io_virt, cf->io_size, 231 - _PAGE_NO_CACHE | _PAGE_GUARDED) == NULL)) { 231 + pgprot_val(pgprot_noncached(__pgprot(0)))) == NULL)) { 232 232 dev_err(device, "can't ioremap ranges\n"); 233 233 status = -ENOMEM; 234 234 goto fail1;
+2 -1
drivers/vfio/vfio_iommu_spapr_tce.c
··· 1188 1188 goto unlock_exit; 1189 1189 } 1190 1190 table_group_tmp = iommu_group_get_iommudata(tcegrp->grp); 1191 - if (table_group_tmp->ops != table_group->ops) { 1191 + if (table_group_tmp->ops->create_table != 1192 + table_group->ops->create_table) { 1192 1193 pr_warn("tce_vfio: Group %d is incompatible with group %d\n", 1193 1194 iommu_group_id(iommu_group), 1194 1195 iommu_group_id(tcegrp->grp));
+8
include/misc/cxl.h
··· 127 127 void cxl_set_master(struct cxl_context *ctx); 128 128 129 129 /* 130 + * Sets the context to use real mode memory accesses to operate with 131 + * translation disabled. Note that this only makes sense for kernel contexts 132 + * under bare metal, and will not work with virtualisation. May only be 133 + * performed on stopped contexts. 134 + */ 135 + int cxl_set_translation_mode(struct cxl_context *ctx, bool real_mode); 136 + 137 + /* 130 138 * Map and unmap the AFU Problem Space area. The amount and location mapped 131 139 * depends on if this context is a master or slave. 132 140 */
+12
kernel/trace/ftrace.c
··· 3456 3456 int type; 3457 3457 }; 3458 3458 3459 + /* 3460 + * If symbols in an architecture don't correspond exactly to the user-visible 3461 + * name of what they represent, it is possible to define this function to 3462 + * perform the necessary adjustments. 3463 + */ 3464 + char * __weak arch_ftrace_match_adjust(char *str, const char *search) 3465 + { 3466 + return str; 3467 + } 3468 + 3459 3469 static int ftrace_match(char *str, struct ftrace_glob *g) 3460 3470 { 3461 3471 int matched = 0; 3462 3472 int slen; 3473 + 3474 + str = arch_ftrace_match_adjust(str, g->search); 3463 3475 3464 3476 switch (g->type) { 3465 3477 case MATCH_FULL:
+69
tools/perf/arch/powerpc/include/perf_regs.h
··· 1 + #ifndef ARCH_PERF_REGS_H 2 + #define ARCH_PERF_REGS_H 3 + 4 + #include <stdlib.h> 5 + #include <linux/types.h> 6 + #include <asm/perf_regs.h> 7 + 8 + #define PERF_REGS_MASK ((1ULL << PERF_REG_POWERPC_MAX) - 1) 9 + #define PERF_REGS_MAX PERF_REG_POWERPC_MAX 10 + #ifdef __powerpc64__ 11 + #define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_64 12 + #else 13 + #define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_32 14 + #endif 15 + 16 + #define PERF_REG_IP PERF_REG_POWERPC_NIP 17 + #define PERF_REG_SP PERF_REG_POWERPC_R1 18 + 19 + static const char *reg_names[] = { 20 + [PERF_REG_POWERPC_R0] = "r0", 21 + [PERF_REG_POWERPC_R1] = "r1", 22 + [PERF_REG_POWERPC_R2] = "r2", 23 + [PERF_REG_POWERPC_R3] = "r3", 24 + [PERF_REG_POWERPC_R4] = "r4", 25 + [PERF_REG_POWERPC_R5] = "r5", 26 + [PERF_REG_POWERPC_R6] = "r6", 27 + [PERF_REG_POWERPC_R7] = "r7", 28 + [PERF_REG_POWERPC_R8] = "r8", 29 + [PERF_REG_POWERPC_R9] = "r9", 30 + [PERF_REG_POWERPC_R10] = "r10", 31 + [PERF_REG_POWERPC_R11] = "r11", 32 + [PERF_REG_POWERPC_R12] = "r12", 33 + [PERF_REG_POWERPC_R13] = "r13", 34 + [PERF_REG_POWERPC_R14] = "r14", 35 + [PERF_REG_POWERPC_R15] = "r15", 36 + [PERF_REG_POWERPC_R16] = "r16", 37 + [PERF_REG_POWERPC_R17] = "r17", 38 + [PERF_REG_POWERPC_R18] = "r18", 39 + [PERF_REG_POWERPC_R19] = "r19", 40 + [PERF_REG_POWERPC_R20] = "r20", 41 + [PERF_REG_POWERPC_R21] = "r21", 42 + [PERF_REG_POWERPC_R22] = "r22", 43 + [PERF_REG_POWERPC_R23] = "r23", 44 + [PERF_REG_POWERPC_R24] = "r24", 45 + [PERF_REG_POWERPC_R25] = "r25", 46 + [PERF_REG_POWERPC_R26] = "r26", 47 + [PERF_REG_POWERPC_R27] = "r27", 48 + [PERF_REG_POWERPC_R28] = "r28", 49 + [PERF_REG_POWERPC_R29] = "r29", 50 + [PERF_REG_POWERPC_R30] = "r30", 51 + [PERF_REG_POWERPC_R31] = "r31", 52 + [PERF_REG_POWERPC_NIP] = "nip", 53 + [PERF_REG_POWERPC_MSR] = "msr", 54 + [PERF_REG_POWERPC_ORIG_R3] = "orig_r3", 55 + [PERF_REG_POWERPC_CTR] = "ctr", 56 + [PERF_REG_POWERPC_LINK] = "link", 57 + [PERF_REG_POWERPC_XER] = "xer", 58 + [PERF_REG_POWERPC_CCR] = 
"ccr", 59 + [PERF_REG_POWERPC_SOFTE] = "softe", 60 + [PERF_REG_POWERPC_TRAP] = "trap", 61 + [PERF_REG_POWERPC_DAR] = "dar", 62 + [PERF_REG_POWERPC_DSISR] = "dsisr" 63 + }; 64 + 65 + static inline const char *perf_reg_name(int id) 66 + { 67 + return reg_names[id]; 68 + } 69 + #endif /* ARCH_PERF_REGS_H */
+2
tools/perf/arch/powerpc/util/Build
··· 1 1 libperf-y += header.o 2 2 libperf-y += sym-handling.o 3 3 libperf-y += kvm-stat.o 4 + libperf-y += perf_regs.o 4 5 5 6 libperf-$(CONFIG_DWARF) += dwarf-regs.o 6 7 libperf-$(CONFIG_DWARF) += skip-callchain-idx.o 8 + libperf-$(CONFIG_LIBUNWIND) += unwind-libunwind.o
+49
tools/perf/arch/powerpc/util/perf_regs.c
···
+#include "../../perf.h"
+#include "../../util/perf_regs.h"
+
+const struct sample_reg sample_reg_masks[] = {
+	SMPL_REG(r0, PERF_REG_POWERPC_R0),
+	SMPL_REG(r1, PERF_REG_POWERPC_R1),
+	SMPL_REG(r2, PERF_REG_POWERPC_R2),
+	SMPL_REG(r3, PERF_REG_POWERPC_R3),
+	SMPL_REG(r4, PERF_REG_POWERPC_R4),
+	SMPL_REG(r5, PERF_REG_POWERPC_R5),
+	SMPL_REG(r6, PERF_REG_POWERPC_R6),
+	SMPL_REG(r7, PERF_REG_POWERPC_R7),
+	SMPL_REG(r8, PERF_REG_POWERPC_R8),
+	SMPL_REG(r9, PERF_REG_POWERPC_R9),
+	SMPL_REG(r10, PERF_REG_POWERPC_R10),
+	SMPL_REG(r11, PERF_REG_POWERPC_R11),
+	SMPL_REG(r12, PERF_REG_POWERPC_R12),
+	SMPL_REG(r13, PERF_REG_POWERPC_R13),
+	SMPL_REG(r14, PERF_REG_POWERPC_R14),
+	SMPL_REG(r15, PERF_REG_POWERPC_R15),
+	SMPL_REG(r16, PERF_REG_POWERPC_R16),
+	SMPL_REG(r17, PERF_REG_POWERPC_R17),
+	SMPL_REG(r18, PERF_REG_POWERPC_R18),
+	SMPL_REG(r19, PERF_REG_POWERPC_R19),
+	SMPL_REG(r20, PERF_REG_POWERPC_R20),
+	SMPL_REG(r21, PERF_REG_POWERPC_R21),
+	SMPL_REG(r22, PERF_REG_POWERPC_R22),
+	SMPL_REG(r23, PERF_REG_POWERPC_R23),
+	SMPL_REG(r24, PERF_REG_POWERPC_R24),
+	SMPL_REG(r25, PERF_REG_POWERPC_R25),
+	SMPL_REG(r26, PERF_REG_POWERPC_R26),
+	SMPL_REG(r27, PERF_REG_POWERPC_R27),
+	SMPL_REG(r28, PERF_REG_POWERPC_R28),
+	SMPL_REG(r29, PERF_REG_POWERPC_R29),
+	SMPL_REG(r30, PERF_REG_POWERPC_R30),
+	SMPL_REG(r31, PERF_REG_POWERPC_R31),
+	SMPL_REG(nip, PERF_REG_POWERPC_NIP),
+	SMPL_REG(msr, PERF_REG_POWERPC_MSR),
+	SMPL_REG(orig_r3, PERF_REG_POWERPC_ORIG_R3),
+	SMPL_REG(ctr, PERF_REG_POWERPC_CTR),
+	SMPL_REG(link, PERF_REG_POWERPC_LINK),
+	SMPL_REG(xer, PERF_REG_POWERPC_XER),
+	SMPL_REG(ccr, PERF_REG_POWERPC_CCR),
+	SMPL_REG(softe, PERF_REG_POWERPC_SOFTE),
+	SMPL_REG(trap, PERF_REG_POWERPC_TRAP),
+	SMPL_REG(dar, PERF_REG_POWERPC_DAR),
+	SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
+	SMPL_REG_END
+};
+96
tools/perf/arch/powerpc/util/unwind-libunwind.c
···
+/*
+ * Copyright 2016 Chandan Kumar, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <errno.h>
+#include <libunwind.h>
+#include <asm/perf_regs.h>
+#include "../../util/unwind.h"
+#include "../../util/debug.h"
+
+int libunwind__arch_reg_id(int regnum)
+{
+	switch (regnum) {
+	case UNW_PPC64_R0:
+		return PERF_REG_POWERPC_R0;
+	case UNW_PPC64_R1:
+		return PERF_REG_POWERPC_R1;
+	case UNW_PPC64_R2:
+		return PERF_REG_POWERPC_R2;
+	case UNW_PPC64_R3:
+		return PERF_REG_POWERPC_R3;
+	case UNW_PPC64_R4:
+		return PERF_REG_POWERPC_R4;
+	case UNW_PPC64_R5:
+		return PERF_REG_POWERPC_R5;
+	case UNW_PPC64_R6:
+		return PERF_REG_POWERPC_R6;
+	case UNW_PPC64_R7:
+		return PERF_REG_POWERPC_R7;
+	case UNW_PPC64_R8:
+		return PERF_REG_POWERPC_R8;
+	case UNW_PPC64_R9:
+		return PERF_REG_POWERPC_R9;
+	case UNW_PPC64_R10:
+		return PERF_REG_POWERPC_R10;
+	case UNW_PPC64_R11:
+		return PERF_REG_POWERPC_R11;
+	case UNW_PPC64_R12:
+		return PERF_REG_POWERPC_R12;
+	case UNW_PPC64_R13:
+		return PERF_REG_POWERPC_R13;
+	case UNW_PPC64_R14:
+		return PERF_REG_POWERPC_R14;
+	case UNW_PPC64_R15:
+		return PERF_REG_POWERPC_R15;
+	case UNW_PPC64_R16:
+		return PERF_REG_POWERPC_R16;
+	case UNW_PPC64_R17:
+		return PERF_REG_POWERPC_R17;
+	case UNW_PPC64_R18:
+		return PERF_REG_POWERPC_R18;
+	case UNW_PPC64_R19:
+		return PERF_REG_POWERPC_R19;
+	case UNW_PPC64_R20:
+		return PERF_REG_POWERPC_R20;
+	case UNW_PPC64_R21:
+		return PERF_REG_POWERPC_R21;
+	case UNW_PPC64_R22:
+		return PERF_REG_POWERPC_R22;
+	case UNW_PPC64_R23:
+		return PERF_REG_POWERPC_R23;
+	case UNW_PPC64_R24:
+		return PERF_REG_POWERPC_R24;
+	case UNW_PPC64_R25:
+		return PERF_REG_POWERPC_R25;
+	case UNW_PPC64_R26:
+		return PERF_REG_POWERPC_R26;
+	case UNW_PPC64_R27:
+		return PERF_REG_POWERPC_R27;
+	case UNW_PPC64_R28:
+		return PERF_REG_POWERPC_R28;
+	case UNW_PPC64_R29:
+		return PERF_REG_POWERPC_R29;
+	case UNW_PPC64_R30:
+		return PERF_REG_POWERPC_R30;
+	case UNW_PPC64_R31:
+		return PERF_REG_POWERPC_R31;
+	case UNW_PPC64_LR:
+		return PERF_REG_POWERPC_LINK;
+	case UNW_PPC64_CTR:
+		return PERF_REG_POWERPC_CTR;
+	case UNW_PPC64_XER:
+		return PERF_REG_POWERPC_XER;
+	case UNW_PPC64_NIP:
+		return PERF_REG_POWERPC_NIP;
+	default:
+		pr_err("unwind: invalid reg id %d\n", regnum);
+		return -EINVAL;
+	}
+	return -EINVAL;
+}
+6
tools/perf/config/Makefile
···
 
 NO_PERF_REGS := 1
 
+# Additional ARCH settings for ppc
+ifeq ($(ARCH),powerpc)
+  NO_PERF_REGS := 0
+  LIBUNWIND_LIBS := -lunwind -lunwind-ppc64
+endif
+
 # Additional ARCH settings for x86
 ifeq ($(ARCH),x86)
   $(call detected,CONFIG_X86)
+4 -4
tools/perf/util/perf_regs.c
···
 	int i, idx = 0;
 	u64 mask = regs->mask;
 
-	if (regs->cache_mask & (1 << id))
+	if (regs->cache_mask & (1ULL << id))
 		goto out;
 
-	if (!(mask & (1 << id)))
+	if (!(mask & (1ULL << id)))
 		return -EINVAL;
 
 	for (i = 0; i < id; i++) {
-		if (mask & (1 << i))
+		if (mask & (1ULL << i))
 			idx++;
 	}
 
-	regs->cache_mask |= (1 << id);
+	regs->cache_mask |= (1ULL << id);
 	regs->cache_regs[id] = regs->regs[idx];
 
 out:
+1
tools/testing/selftests/powerpc/Makefile
···
 
 SUB_DIRS = benchmarks \
 	   copyloops \
+	   context_switch \
 	   dscr \
 	   mm \
 	   pmu \
+1
tools/testing/selftests/powerpc/context_switch/.gitignore
···
+cp_abort
+10
tools/testing/selftests/powerpc/context_switch/Makefile
···
+TEST_PROGS := cp_abort
+
+all: $(TEST_PROGS)
+
+$(TEST_PROGS): ../harness.c ../utils.c
+
+include ../../lib.mk
+
+clean:
+	rm -f $(TEST_PROGS)
+110
tools/testing/selftests/powerpc/context_switch/cp_abort.c
···
+/*
+ * Adapted from Anton Blanchard's context switch microbenchmark.
+ *
+ * Copyright 2009, Anton Blanchard, IBM Corporation.
+ * Copyright 2016, Mikey Neuling, Chris Smart, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This program tests the copy paste abort functionality of a P9
+ * (or later) by setting up two processes on the same CPU, one
+ * which executes the copy instruction and the other which
+ * executes paste.
+ *
+ * The paste instruction should never succeed, as the cp_abort
+ * instruction is called by the kernel during a context switch.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include "utils.h"
+#include <sched.h>
+
+#define READ_FD 0
+#define WRITE_FD 1
+
+#define NUM_LOOPS 1000
+
+/* This defines the "paste" instruction from Power ISA 3.0 Book II, section 4.4. */
+#define PASTE(RA, RB, L, RC) \
+	.long (0x7c00070c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10) | (RC) << (31-31))
+
+int paste(void *i)
+{
+	int cr;
+
+	asm volatile(str(PASTE(0, %1, 1, 1))";"
+			"mfcr %0;"
+			: "=r" (cr)
+			: "b" (i)
+			: "memory"
+		    );
+	return cr;
+}
+
+/* This defines the "copy" instruction from Power ISA 3.0 Book II, section 4.4. */
+#define COPY(RA, RB, L) \
+	.long (0x7c00060c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10))
+
+void copy(void *i)
+{
+	asm volatile(str(COPY(0, %0, 1))";"
+			:
+			: "b" (i)
+			: "memory"
+		    );
+}
+
+int test_cp_abort(void)
+{
+	/* 128 bytes for a full cache line */
+	char buf[128] __cacheline_aligned;
+	cpu_set_t cpuset;
+	int fd1[2], fd2[2], pid;
+	char c;
+
+	/* only run this test on a P9 or later */
+	SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_00));
+
+	/*
+	 * Run both processes on the same CPU, so that copy is more likely
+	 * to leak into a paste.
+	 */
+	CPU_ZERO(&cpuset);
+	CPU_SET(pick_online_cpu(), &cpuset);
+	FAIL_IF(sched_setaffinity(0, sizeof(cpuset), &cpuset));
+
+	FAIL_IF(pipe(fd1) || pipe(fd2));
+
+	pid = fork();
+	FAIL_IF(pid < 0);
+
+	if (!pid) {
+		for (int i = 0; i < NUM_LOOPS; i++) {
+			FAIL_IF((write(fd1[WRITE_FD], &c, 1)) != 1);
+			FAIL_IF((read(fd2[READ_FD], &c, 1)) != 1);
+			/* A paste succeeds if CR0 EQ bit is set */
+			FAIL_IF(paste(buf) & 0x20000000);
+		}
+	} else {
+		for (int i = 0; i < NUM_LOOPS; i++) {
+			FAIL_IF((read(fd1[READ_FD], &c, 1)) != 1);
+			copy(buf);
+			FAIL_IF((write(fd2[WRITE_FD], &c, 1) != 1));
+		}
+	}
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(test_cp_abort, "cp_abort");
+}
+10 -8
tools/testing/selftests/powerpc/mm/subpage_prot.c
···
 		want_fault |= (subpage == ((page + 1) % 16));
 
 		if (faulted != want_fault) {
-			printf("Failed at 0x%p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n",
+			printf("Failed at %p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n",
 			       addr, page, subpage, write,
 			       want_fault ? "fault" : "pass",
 			       faulted ? "fault" : "pass");
···
 
 		if (faulted) {
 			if (dar != addr) {
-				printf("Fault expected at 0x%p and happened at 0x%p !\n",
+				printf("Fault expected at %p and happened at %p !\n",
 				       addr, dar);
 			}
 			faulted = 0;
···
 
 	mallocblock = (void *)align;
 
-	printf("allocated malloc block of 0x%lx bytes at 0x%p\n",
+	printf("allocated malloc block of 0x%lx bytes at %p\n",
 	       mallocsize, mallocblock);
 
 	printf("testing malloc block...\n");
···
 		perror("failed to map file");
 		return 1;
 	}
-	printf("allocated %s for 0x%lx bytes at 0x%p\n",
+	printf("allocated %s for 0x%lx bytes at %p\n",
 	       file_name, filesize, fileblock);
 
 	printf("testing file map...\n");
···
 
 int main(int argc, char *argv[])
 {
-	test_harness(test_anon, "subpage_prot_anon");
+	int rc;
+
+	rc = test_harness(test_anon, "subpage_prot_anon");
+	if (rc)
+		return rc;
 
 	if (argc > 1)
 		file_name = argv[1];
 	else
 		file_name = "tempfile";
 
-	test_harness(test_file, "subpage_prot_file");
-
-	return 0;
+	return test_harness(test_file, "subpage_prot_file");
 }
-1
tools/testing/selftests/powerpc/pmu/ebb/ebb.c
···
 #include <sys/ioctl.h>
 
 #include "trace.h"
-#include "reg.h"
 #include "ebb.h"
 
 
+12 -6
tools/testing/selftests/powerpc/pmu/ebb/reg.h tools/testing/selftests/powerpc/reg.h
···
 #define __stringify_1(x)	#x
 #define __stringify(x)		__stringify_1(x)
 
-#define mfspr(rn)	({unsigned long rval; \
-			 asm volatile("mfspr %0," __stringify(rn) \
-				      : "=r" (rval)); rval; })
-#define mtspr(rn, v)	asm volatile("mtspr " __stringify(rn) ",%0" : \
-				     : "r" ((unsigned long)(v)) \
-				     : "memory")
+#define mfspr(rn)	({unsigned long rval; \
+			 asm volatile("mfspr %0," _str(rn) \
+				      : "=r" (rval)); rval; })
+#define mtspr(rn, v)	asm volatile("mtspr " _str(rn) ",%0" : \
+				     : "r" ((unsigned long)(v)) \
+				     : "memory")
 
 #define mb()		asm volatile("sync" : : : "memory");
 
···
 #define SPRN_SIAR	780
 #define SPRN_SDAR	781
 #define SPRN_SIER	768
+
+#define SPRN_TEXASR	0x82
+#define SPRN_TFIAR	0x81	/* Transaction Failure Inst Addr */
+#define SPRN_TFHAR	0x80	/* Transaction Failure Handler Addr */
+#define TEXASR_FS	0x08000000
+#define SPRN_TAR	0x32f
 
 #endif /* _SELFTESTS_POWERPC_REG_H */
-1
tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c
··· 7 7 #include <stdlib.h> 8 8 9 9 #include "ebb.h" 10 - #include "reg.h" 11 10 12 11 13 12 /*
+3
tools/testing/selftests/powerpc/tm/.gitignore
···
 tm-signal-msr-resv
 tm-signal-stack
 tm-vmxcopy
+tm-fork
+tm-tar
+tm-tmspr
+2 -1
tools/testing/selftests/powerpc/tm/Makefile
···
-TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy
+TEST_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack tm-vmxcopy tm-fork tm-tar tm-tmspr
 
 all: $(TEST_PROGS)
 
···
 
 tm-syscall: tm-syscall-asm.S
 tm-syscall: CFLAGS += -mhtm -I../../../../../usr/include
+tm-tmspr: CFLAGS += -pthread
 
 include ../../lib.mk
 
+42
tools/testing/selftests/powerpc/tm/tm-fork.c
···
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Edited: Rashmica Gupta, Nov 2015
+ *
+ * This test does a fork syscall inside a transaction. Basic sniff test
+ * to see if we can enter the kernel during a transaction.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "utils.h"
+#include "tm.h"
+
+int test_fork(void)
+{
+	SKIP_IF(!have_htm());
+
+	asm __volatile__(
+		"tbegin.;"
+		"blt    1f; "
+		"li     0, 2;"	/* fork syscall */
+		"sc  ;"
+		"tend.;"
+		"1: ;"
+		: : : "memory", "r0");
+	/* If we reach here, we've passed.  Otherwise we've probably crashed
+	 * the kernel */
+
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(test_fork, "tm_fork");
+}
+5 -11
tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
···
 #include "utils.h"
 #include "tm.h"
 
-#define TBEGIN          ".long 0x7C00051D ;"
-#define TEND            ".long 0x7C00055D ;"
-#define TCHECK          ".long 0x7C00059C ;"
-#define TSUSPEND        ".long 0x7C0005DD ;"
-#define TRESUME         ".long 0x7C2005DD ;"
-#define SPRN_TEXASR     0x82
 #define SPRN_DSCR       0x03
 
 int test_body(void)
···
 		"mtspr   %[sprn_dscr], 3;"
 
 		/* start and suspend a transaction */
-		TBEGIN
+		"tbegin.;"
 		"beq     1f;"
-		TSUSPEND
+		"tsuspend.;"
 
 		/* hard loop until the transaction becomes doomed */
 		"2: ;"
-		TCHECK
+		"tcheck 0;"
 		"bc      4, 0, 2b;"
 
 		/* record DSCR and TEXASR */
···
 		"mfspr   3, %[sprn_texasr];"
 		"std     3, %[texasr];"
 
-		TRESUME
-		TEND
+		"tresume.;"
+		"tend.;"
 		"li      %[rv], 0;"
 		"1: ;"
 		: [rv]"=r"(rv), [dscr2]"=m"(dscr2), [texasr]"=m"(texasr)
+2 -2
tools/testing/selftests/powerpc/tm/tm-signal-stack.c
···
 		exit(1);
 	asm volatile("li 1, 0 ;"		/* stack ptr == NULL */
 		     "1:"
-		     ".long 0x7C00051D ;"	/* tbegin */
+		     "tbegin.;"
 		     "beq 1b ;"			/* retry forever */
-		     ".long 0x7C0005DD ; ;"	/* tsuspend */
+		     "tsuspend.;"
 		     "ld 2, 0(1) ;"		/* trigger segv" */
 		     : : : "memory");
 
+90
tools/testing/selftests/powerpc/tm/tm-tar.c
···
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ * Original: Michael Neuling 19/7/2013
+ * Edited: Rashmica Gupta 01/12/2015
+ *
+ * Do some transactions, see if the TAR is corrupted.
+ * If the transaction is aborted, the TAR should be rolled back to the
+ * checkpointed value before the transaction began. The value written to
+ * TAR in suspended mode should only remain in TAR if the transaction
+ * completes.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "tm.h"
+#include "utils.h"
+
+int num_loops = 10000;
+
+int test_tar(void)
+{
+	int i;
+
+	SKIP_IF(!have_htm());
+
+	for (i = 0; i < num_loops; i++) {
+		uint64_t result = 0;
+		asm __volatile__(
+			"li	7, 1;"
+			"mtspr	%[tar], 7;"	/* tar = 1 */
+			"tbegin.;"
+			"beq	3f;"
+			"li	4, 0x7000;"	/* Loop lots, to use time */
+			"2:;"			/* Start loop */
+			"li	7, 2;"
+			"mtspr	%[tar], 7;"	/* tar = 2 */
+			"tsuspend.;"
+			"li	7, 3;"
+			"mtspr	%[tar], 7;"	/* tar = 3 */
+			"tresume.;"
+			"subi	4, 4, 1;"
+			"cmpdi	4, 0;"
+			"bne	2b;"
+			"tend.;"
+
+			/* Transaction success! TAR should be 3 */
+			"mfspr	7, %[tar];"
+			"ori	%[res], 7, 4;"	/* res = 3|4 = 7 */
+			"b	4f;"
+
+			/* Abort handler. TAR should be rolled back to 1 */
+			"3:;"
+			"mfspr	7, %[tar];"
+			"ori	%[res], 7, 8;"	/* res = 1|8 = 9 */
+			"4:;"
+
+			: [res]"=r"(result)
+			: [tar]"i"(SPRN_TAR)
+			: "memory", "r0", "r4", "r7");
+
+		/* If result is anything other than 7 or 9, the TAR
+		 * value must have been corrupted. */
+		if ((result != 7) && (result != 9))
+			return 1;
+	}
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	/* A low number of iterations (eg 100) can cause a false pass */
+	if (argc > 1) {
+		if (strcmp(argv[1], "-h") == 0) {
+			printf("Syntax:\n\t%s [<num loops>]\n",
+			       argv[0]);
+			return 1;
+		} else {
+			num_loops = atoi(argv[1]);
+		}
+	}
+
+	printf("Starting, %d loops\n", num_loops);
+
+	return test_harness(test_tar, "tm_tar");
+}
+143
tools/testing/selftests/powerpc/tm/tm-tmspr.c
···
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Original: Michael Neuling 3/4/2014
+ * Modified: Rashmica Gupta 8/12/2015
+ *
+ * Check if any of the Transactional Memory SPRs get corrupted.
+ * - TFIAR  - stores address of location of transaction failure
+ * - TFHAR  - stores address of software failure handler (if transaction
+ *            fails)
+ * - TEXASR - lots of info about the transaction(s)
+ *
+ * (1) create more threads than cpus
+ * (2) in each thread:
+ *	(a) set TFIAR and TFHAR to a unique value
+ *	(b) loop for a while, continually checking to see if
+ *	    either register has been corrupted.
+ *
+ * (3) Loop:
+ *	(a) begin transaction
+ *	(b) abort transaction
+ *	(c) check TEXASR to see if FS has been corrupted
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tm.h"
+
+int num_loops = 10000;
+int passed = 1;
+
+void tfiar_tfhar(void *in)
+{
+	int i, cpu;
+	unsigned long tfhar, tfhar_rd, tfiar, tfiar_rd;
+	cpu_set_t cpuset;
+
+	CPU_ZERO(&cpuset);
+	cpu = (unsigned long)in >> 1;
+	CPU_SET(cpu, &cpuset);
+	sched_setaffinity(0, sizeof(cpuset), &cpuset);
+
+	/* TFIAR: Last bit has to be high so userspace can read register */
+	tfiar = ((unsigned long)in) + 1;
+	tfiar += 2;
+	mtspr(SPRN_TFIAR, tfiar);
+
+	/* TFHAR: Last two bits are reserved */
+	tfhar = ((unsigned long)in);
+	tfhar &= ~0x3UL;
+	tfhar += 4;
+	mtspr(SPRN_TFHAR, tfhar);
+
+	for (i = 0; i < num_loops; i++) {
+		tfhar_rd = mfspr(SPRN_TFHAR);
+		tfiar_rd = mfspr(SPRN_TFIAR);
+		if ((tfhar != tfhar_rd) || (tfiar != tfiar_rd)) {
+			passed = 0;
+			return;
+		}
+	}
+	return;
+}
+
+void texasr(void *in)
+{
+	unsigned long i;
+	uint64_t result = 0;
+
+	for (i = 0; i < num_loops; i++) {
+		asm __volatile__(
+			"tbegin.;"
+			"beq	3f ;"
+			"tabort. 0 ;"
+			"tend.;"
+
+			/* Abort handler */
+			"3: ;"
+			::: "memory");
+
+		/* Check the TEXASR */
+		result = mfspr(SPRN_TEXASR);
+		if ((result & TEXASR_FS) == 0) {
+			passed = 0;
+			return;
+		}
+	}
+	return;
+}
+
+int test_tmspr(void)
+{
+	pthread_t	thread;
+	int		thread_num;
+	unsigned long	i;
+
+	SKIP_IF(!have_htm());
+
+	/* To cause some context switching */
+	thread_num = 10 * sysconf(_SC_NPROCESSORS_ONLN);
+
+	/* Test TFIAR and TFHAR */
+	for (i = 0; i < thread_num; i += 2) {
+		if (pthread_create(&thread, NULL, (void *)tfiar_tfhar,
+				   (void *)i))
+			return EXIT_FAILURE;
+	}
+	if (pthread_join(thread, NULL) != 0)
+		return EXIT_FAILURE;
+
+	/* Test TEXASR */
+	for (i = 0; i < thread_num; i++) {
+		if (pthread_create(&thread, NULL, (void *)texasr, (void *)i))
+			return EXIT_FAILURE;
+	}
+	if (pthread_join(thread, NULL) != 0)
+		return EXIT_FAILURE;
+
+	if (passed)
+		return 0;
+	else
+		return 1;
+}
+
+int main(int argc, char *argv[])
+{
+	if (argc > 1) {
+		if (strcmp(argv[1], "-h") == 0) {
+			printf("Syntax:\t [<num loops>]\n");
+			return 0;
+		} else {
+			num_loops = atoi(argv[1]);
+		}
+	}
+	return test_harness(test_tmspr, "tm_tmspr");
+}
+8
tools/testing/selftests/powerpc/utils.h
···
 #ifndef _SELFTESTS_POWERPC_UTILS_H
 #define _SELFTESTS_POWERPC_UTILS_H
 
+#define __cacheline_aligned __attribute__((aligned(128)))
+
 #include <stdint.h>
 #include <stdbool.h>
 #include <linux/auxvec.h>
+#include "reg.h"
 
 /* Avoid headaches with PRI?64 - just use %ll? always */
 typedef unsigned long long u64;
···
 
 #define _str(s) #s
 #define str(s) _str(s)
+
+/* POWER9 feature */
+#ifndef PPC_FEATURE2_ARCH_3_00
+#define PPC_FEATURE2_ARCH_3_00	0x00800000
+#endif
 
 #endif /* _SELFTESTS_POWERPC_UTILS_H */