
Merge tag 'powerpc-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc updates from Michael Ellerman:
"Highlights include:

- Larger virtual address space on 64-bit server CPUs. By default we
use a 128TB virtual address space, but a process can request access
to the full 512TB by passing a hint to mmap().

- Support for the new Power9 "XIVE" interrupt controller.

- TLB flushing optimisations for the radix MMU on Power9.

- Support for CAPI cards on Power9, using the "Coherent Accelerator
Interface Architecture 2.0".

- The ability to configure the mmap randomisation limits at build and
runtime.

- Several small fixes and cleanups to the kprobes code, as well as
support for KPROBES_ON_FTRACE.

- Major improvements to handling of system reset interrupts,
correctly treating them as NMIs, giving them a dedicated stack and
using a new hypervisor call to trigger them, all of which should
aid debugging and robustness.

- Many fixes and other minor enhancements.

Thanks to: Alastair D'Silva, Alexey Kardashevskiy, Alistair Popple,
Andrew Donnellan, Aneesh Kumar K.V, Anshuman Khandual, Anton
Blanchard, Balbir Singh, Ben Hutchings, Benjamin Herrenschmidt,
Bhupesh Sharma, Chris Packham, Christian Zigotzky, Christophe Leroy,
Christophe Lombard, Daniel Axtens, David Gibson, Gautham R. Shenoy,
Gavin Shan, Geert Uytterhoeven, Guilherme G. Piccoli, Hamish Martin,
Hari Bathini, Kees Cook, Laurent Dufour, Madhavan Srinivasan, Mahesh J
Salgaonkar, Mahesh Salgaonkar, Masami Hiramatsu, Matt Brown, Matthew
R. Ochs, Michael Neuling, Naveen N. Rao, Nicholas Piggin, Oliver
O'Halloran, Pan Xinhui, Paul Mackerras, Rashmica Gupta, Russell
Currey, Sukadev Bhattiprolu, Thadeu Lima de Souza Cascardo, Tobin C.
Harding, Tyrel Datwyler, Uma Krishnan, Vaibhav Jain, Vipin K Parashar,
Yang Shi"

* tag 'powerpc-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (214 commits)
powerpc/64s: Power9 has no LPCR[VRMASD] field so don't set it
powerpc/powernv: Fix TCE kill on NVLink2
powerpc/mm/radix: Drop support for CPUs without lockless tlbie
powerpc/book3s/mce: Move add_taint() later in virtual mode
powerpc/sysfs: Move #ifdef CONFIG_HOTPLUG_CPU out of the function body
powerpc/smp: Document irq enable/disable after migrating IRQs
powerpc/mpc52xx: Don't select user-visible RTAS_PROC
powerpc/powernv: Document cxl dependency on special case in pnv_eeh_reset()
powerpc/eeh: Clean up and document event handling functions
powerpc/eeh: Avoid use after free in eeh_handle_special_event()
cxl: Mask slice error interrupts after first occurrence
cxl: Route eeh events to all drivers in cxl_pci_error_detected()
cxl: Force context lock during EEH flow
powerpc/64: Allow CONFIG_RELOCATABLE if COMPILE_TEST
powerpc/xmon: Teach xmon oops about radix vectors
powerpc/mm/hash: Fix off-by-one in comment about kernel contexts ids
powerpc/pseries: Enable VFIO
powerpc/powernv: Fix iommu table size calculation hook for small tables
powerpc/powernv: Check kzalloc() return value in pnv_pci_table_alloc
powerpc: Add arch/powerpc/tools directory
...

+8702 -2794
+1 -1
Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
···
     |       nios2: |  TODO  |
     |    openrisc: |  TODO  |
     |      parisc: |  TODO  |
-    |     powerpc: |  TODO  |
+    |     powerpc: |  ok  |
     |        s390: |  TODO  |
     |       score: |  TODO  |
     |          sh: |  TODO  |
+13 -2
Documentation/powerpc/cxl.txt
···
 Hardware overview
 =================
 
-     POWER8               FPGA
+    POWER8/9              FPGA
   +----------+        +---------+
   |          |        |         |
   |   CPU    |        |   AFU   |
···
   |   | CAPP |<------>|         |
   +---+------+  PCIE  +---------+
 
-The POWER8 chip has a Coherently Attached Processor Proxy (CAPP)
+The POWER8/9 chip has a Coherently Attached Processor Proxy (CAPP)
 unit which is part of the PCIe Host Bridge (PHB). This is managed
 by Linux by calls into OPAL. Linux doesn't directly program the
 CAPP.
···
 the fault. The context to which this fault is serviced is based on
 who owns that acceleration function.
 
+POWER8 <-----> PSL Version 8 is compliant to the CAIA Version 1.0.
+POWER9 <-----> PSL Version 9 is compliant to the CAIA Version 2.0.
+This PSL Version 9 provides new features such as:
+* Interaction with the nest MMU on the P9 chip.
+* Native DMA support.
+* Supports sending ASB_Notify messages for host thread wakeup.
+* Supports Atomic operations.
+* ....
+
+Cards with a PSL9 won't work on a POWER8 system and cards with a
+PSL8 won't work on a POWER9 system.
 
 AFU Modes
 =========
+17 -17
Documentation/powerpc/firmware-assisted-dump.txt
···
 
 If there is no waiting dump data, then only the memory required
 to hold CPU state, HPTE region, boot memory dump and elfcore
-header, is reserved at the top of memory (see Fig. 1). This area
-is *not* released: this region will be kept permanently reserved,
-so that it can act as a receptacle for a copy of the boot memory
-content in addition to CPU state and HPTE region, in the case a
-crash does occur.
+header, is usually reserved at an offset greater than boot memory
+size (see Fig. 1). This area is *not* released: this region will
+be kept permanently reserved, so that it can act as a receptacle
+for a copy of the boot memory content in addition to CPU state
+and HPTE region, in the case a crash does occur.
 
 o Memory Reservation during first kernel
 
-  Low memory                                        Top of memory
+  Low memory                                         Top of memory
   0      boot memory size                                       |
-  |           |                  |<--Reserved dump area -->|
-  V           V                  |   Permanent Reservation  V
-  +-----------+----------/ /----------+---+----+-----------+----+
-  |           |                       |CPU|HPTE|  DUMP     |ELF |
-  +-----------+----------/ /----------+---+----+-----------+----+
+  |           |                |<--Reserved dump area -->|      |
+  V           V                |   Permanent Reservation |      V
+  +-----------+----------/ /---+---+----+-----------+----+------+
+  |           |                |CPU|HPTE|  DUMP     |ELF |      |
+  +-----------+----------/ /---+---+----+-----------+----+------+
         |                                           ^
         |                                           |
         \                                           /
···
   0      boot memory size                                       |
   |           |<------------- Reserved dump area ----------- -->|
   V           V                                                 V
-  +-----------+----------/ /----------+---+----+-----------+----+
-  |           |                       |CPU|HPTE|  DUMP     |ELF |
-  +-----------+----------/ /----------+---+----+-----------+----+
-        |                                                  |
-        V                                                  V
-        Used by second                                /proc/vmcore
+  +-----------+----------/ /---+---+----+-----------+----+------+
+  |           |                |CPU|HPTE|  DUMP     |ELF |      |
+  +-----------+----------/ /---+---+----+-----------+----+------+
+        |                                                  |
+        V                                                  V
+        Used by second                                /proc/vmcore
         kernel to boot
                    Fig. 2
 
+2 -1
MAINTAINERS
···
 L:	linuxppc-dev@lists.ozlabs.org
 L:	linux-arm-kernel@lists.infradead.org
 S:	Maintained
+F:	Documentation/devicetree/bindings/powerpc/fsl/
 F:	drivers/soc/fsl/
 F:	include/linux/fsl/
···
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
 S:	Supported
 F:	Documentation/ABI/stable/sysfs-firmware-opal-*
-F:	Documentation/devicetree/bindings/powerpc/opal/
+F:	Documentation/devicetree/bindings/powerpc/
 F:	Documentation/devicetree/bindings/rtc/rtc-opal.txt
 F:	Documentation/devicetree/bindings/i2c/i2c-opal.txt
 F:	Documentation/powerpc/
+63 -3
arch/powerpc/Kconfig
···
 	bool
 	default y
 
+config ARCH_MMAP_RND_BITS_MAX
+	# On Book3S 64, the default virtual address space for 64-bit processes
+	# is 2^47 (128TB). As a maximum, allow randomisation to consume up to
+	# 32T of address space (2^45), which should ensure a reasonable gap
+	# between bottom-up and top-down allocations for applications that
+	# consume "normal" amounts of address space. Book3S 64 only supports 64K
+	# and 4K page sizes.
+	default 29 if PPC_BOOK3S_64 && PPC_64K_PAGES	# 29 = 45 (32T) - 16 (64K)
+	default 33 if PPC_BOOK3S_64			# 33 = 45 (32T) - 12 (4K)
+	#
+	# On all other 64-bit platforms (currently only Book3E), the virtual
+	# address space is 2^46 (64TB). Allow randomisation to consume up to 16T
+	# of address space (2^44). Only 4K page sizes are supported.
+	default 32 if 64BIT	# 32 = 44 (16T) - 12 (4K)
+	#
+	# For 32-bit, use the compat values, as they're the same.
+	default ARCH_MMAP_RND_COMPAT_BITS_MAX
+
+config ARCH_MMAP_RND_BITS_MIN
+	# Allow randomisation to consume up to 1GB of address space (2^30).
+	default 14 if 64BIT && PPC_64K_PAGES	# 14 = 30 (1GB) - 16 (64K)
+	default 18 if 64BIT			# 18 = 30 (1GB) - 12 (4K)
+	#
+	# For 32-bit, use the compat values, as they're the same.
+	default ARCH_MMAP_RND_COMPAT_BITS_MIN
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+	# Total virtual address space for 32-bit processes is 2^31 (2GB).
+	# Allow randomisation to consume up to 512MB of address space (2^29).
+	default 11 if PPC_256K_PAGES	# 11 = 29 (512MB) - 18 (256K)
+	default 13 if PPC_64K_PAGES	# 13 = 29 (512MB) - 16 (64K)
+	default 15 if PPC_16K_PAGES	# 15 = 29 (512MB) - 14 (16K)
+	default 17			# 17 = 29 (512MB) - 12 (4K)
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+	# Total virtual address space for 32-bit processes is 2^31 (2GB).
+	# Allow randomisation to consume up to 8MB of address space (2^23).
+	default 5 if PPC_256K_PAGES	#  5 = 23 (8MB) - 18 (256K)
+	default 7 if PPC_64K_PAGES	#  7 = 23 (8MB) - 16 (64K)
+	default 9 if PPC_16K_PAGES	#  9 = 23 (8MB) - 14 (16K)
+	default 11			# 11 = 23 (8MB) - 12 (4K)
+
 config HAVE_SETUP_PER_CPU_AREA
 	def_bool PPC64
 
···
 	  can manage. Virtual interrupt numbers are what you see in
 	  /proc/interrupts. If you configure your system to have too few,
 	  drivers will fail to load or worse - handle with care.
 
+config NMI_IPI
+	bool
+	depends on SMP && (DEBUGGER || KEXEC_CORE)
+	default y
+
 config STACKTRACE_SUPPORT
 	bool
···
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_KGDB
+	select HAVE_ARCH_MMAP_RND_BITS
+	select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_CBPF_JIT if !PPC64
···
 	select HAVE_IRQ_EXIT_ON_IRQ_STACK
 	select HAVE_KERNEL_GZIP
 	select HAVE_KPROBES
+	select HAVE_KPROBES_ON_FTRACE
 	select HAVE_KRETPROBES
 	select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select HAVE_MEMBLOCK
···
 
 config RELOCATABLE
 	bool "Build a relocatable kernel"
-	depends on (PPC64 && !COMPILE_TEST) || (FLATMEM && (44x || FSL_BOOKE))
+	depends on PPC64 || (FLATMEM && (44x || FSL_BOOKE))
 	select NONSTATIC_KERNEL
 	select MODULE_REL_CRCS if MODVERSIONS
 	help
···
 config CRASH_DUMP
 	bool "Build a kdump crash kernel"
 	depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP)
-	select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE
+	select RELOCATABLE if PPC64 || 44x || FSL_BOOKE
 	help
 	  Build a kernel suitable for use as a kdump capture kernel.
 	  The same kernel binary can be used as production kernel and dump
···
 
 config ARCH_SPARSEMEM_DEFAULT
 	def_bool y
-	depends on (SMP && PPC_PSERIES) || PPC_PS3
+	depends on PPC_BOOK3S_64
 
 config SYS_SUPPORTS_HUGETLBFS
 	bool
···
 	  Say N unless you know what you are doing.
 
 endchoice
+
+config THREAD_SHIFT
+	int "Thread shift" if EXPERT
+	range 13 15
+	default "15" if PPC_256K_PAGES
+	default "14" if PPC64
+	default "13"
+	help
+	  Used to define the stack size. The default is almost always what you
+	  want. Only change this if you know what you are doing.
 
 config FORCE_MAX_ZONEORDER
 	int "Maximum zone order"
+1 -12
arch/powerpc/Makefile
···
 endif
 
 ifdef CONFIG_MPROFILE_KERNEL
-ifeq ($(shell $(srctree)/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK)
+ifeq ($(shell $(srctree)/arch/powerpc/tools/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK)
 CC_FLAGS_FTRACE := -pg -mprofile-kernel
 KBUILD_CPPFLAGS += -DCC_USING_MPROFILE_KERNEL
 else
···
 PHONY += $(BOOT_TARGETS1) $(BOOT_TARGETS2)
 
 boot := arch/$(ARCH)/boot
-
-ifeq ($(CONFIG_RELOCATABLE),y)
-quiet_cmd_relocs_check = CALL    $<
-      cmd_relocs_check = $(CONFIG_SHELL) $< "$(OBJDUMP)" "$(obj)/vmlinux"
-
-PHONY += relocs_check
-relocs_check: arch/powerpc/relocs_check.sh vmlinux
-	$(call cmd,relocs_check)
-
-zImage: relocs_check
-endif
 
 $(BOOT_TARGETS1): vmlinux
 	$(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
+3 -3
arch/powerpc/configs/powernv_defconfig
···
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
···
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
···
 CONFIG_CRYPTO_CCM=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
+3 -3
arch/powerpc/configs/ppc64_defconfig
···
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
···
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
···
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
+3 -3
arch/powerpc/configs/pseries_defconfig
···
 CONFIG_BPF_SYSCALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
 CONFIG_KPROBES=y
 CONFIG_JUMP_LABEL=y
 CONFIG_MODULES=y
···
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
 CONFIG_VFAT_FS=m
···
 CONFIG_CRYPTO_TEST=m
 CONFIG_CRYPTO_PCBC=m
 CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
 CONFIG_CRYPTO_MD5_PPC=m
 CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA256=y
+4
arch/powerpc/include/asm/asm-prototypes.h
···
 #include <asm/checksum.h>
 #include <linux/uaccess.h>
 #include <asm/epapr_hcalls.h>
+#include <asm/dcr.h>
+#include <asm/mmu_context.h>
 
 #include <uapi/asm/ucontext.h>
 
···
 extern int __cmpdi2(s64, s64);
 extern int __ucmpdi2(u64, u64);
 
+/* tracing */
 void _mcount(void);
+unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip);
 
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
+8
arch/powerpc/include/asm/bitops.h
···
 #define PPC_BITEXTRACT(bits, ppc_bit, dst_bit) \
 	((((bits) >> PPC_BITLSHIFT(ppc_bit)) & 1) << (dst_bit))
 
+#define PPC_BITLSHIFT32(be)	(32 - 1 - (be))
+#define PPC_BIT32(bit)		(1UL << PPC_BITLSHIFT32(bit))
+#define PPC_BITMASK32(bs, be)	((PPC_BIT32(bs) - PPC_BIT32(be))|PPC_BIT32(bs))
+
+#define PPC_BITLSHIFT8(be)	(8 - 1 - (be))
+#define PPC_BIT8(bit)		(1UL << PPC_BITLSHIFT8(bit))
+#define PPC_BITMASK8(bs, be)	((PPC_BIT8(bs) - PPC_BIT8(be))|PPC_BIT8(bs))
+
 #include <asm/barrier.h>
 
 /* Macro for generating the ***_bits() functions */
+1 -1
arch/powerpc/include/asm/book3s/64/hash-4k.h
···
 #define H_PTE_INDEX_SIZE  9
 #define H_PMD_INDEX_SIZE  7
 #define H_PUD_INDEX_SIZE  9
-#define H_PGD_INDEX_SIZE  9
+#define H_PGD_INDEX_SIZE  12
 
 #ifndef __ASSEMBLY__
 #define H_PTE_TABLE_SIZE	(sizeof(pte_t) << H_PTE_INDEX_SIZE)
+7 -3
arch/powerpc/include/asm/book3s/64/hash-64k.h
···
 #define H_PTE_INDEX_SIZE  8
 #define H_PMD_INDEX_SIZE  5
 #define H_PUD_INDEX_SIZE  5
-#define H_PGD_INDEX_SIZE  12
+#define H_PGD_INDEX_SIZE  15
 
-#define H_PAGE_COMBO	0x00001000 /* this is a combo 4k page */
-#define H_PAGE_4K_PFN	0x00002000 /* PFN is for a single 4k page */
+/*
+ * 64k aligned address free up few of the lower bits of RPN for us
+ * We steal that here. For more deatils look at pte_pfn/pfn_pte()
+ */
+#define H_PAGE_COMBO	_RPAGE_RPN0 /* this is a combo 4k page */
+#define H_PAGE_4K_PFN	_RPAGE_RPN1 /* PFN is for a single 4k page */
 /*
  * We need to differentiate between explicit huge page and THP huge
  * page, since THP huge page also need to track real subpage details
+5 -11
arch/powerpc/include/asm/book3s/64/hash.h
···
 * Common bits between 4K and 64K pages in a linux-style PTE.
 * Additional bits may be defined in pgtable-hash64-*.h
 *
- * Note: We only support user read/write permissions. Supervisor always
- * have full read/write to pages above PAGE_OFFSET (pages below that
- * always use the user access permissions).
- *
- * We could create separate kernel read-only if we used the 3 PP bits
- * combinations that newer processors provide but we currently don't.
 */
-#define H_PAGE_BUSY		0x00800 /* software: PTE & hash are busy */
 #define H_PTE_NONE_MASK		_PAGE_HPTEFLAGS
-#define H_PAGE_F_GIX_SHIFT	57
-#define H_PAGE_F_GIX		(7ul << 57)	/* HPTE index within HPTEG */
-#define H_PAGE_F_SECOND		(1ul << 60)	/* HPTE is in 2ndary HPTEG */
-#define H_PAGE_HASHPTE		(1ul << 61)	/* PTE has associated HPTE */
+#define H_PAGE_F_GIX_SHIFT	56
+#define H_PAGE_BUSY		_RPAGE_RSV1 /* software: PTE & hash are busy */
+#define H_PAGE_F_SECOND		_RPAGE_RSV2	/* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX		(_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
+#define H_PAGE_HASHPTE		_RPAGE_RPN43	/* PTE has associated HPTE */
 
 #ifdef CONFIG_PPC_64K_PAGES
 #include <asm/book3s/64/hash-64k.h>
+1 -1
arch/powerpc/include/asm/book3s/64/hugetlb.h
···
 	 */
 	VM_WARN_ON(page_shift == mmu_psize_defs[MMU_PAGE_1G].shift);
 	if (page_shift == mmu_psize_defs[MMU_PAGE_2M].shift)
-		return __pte(pte_val(entry) | _PAGE_LARGE);
+		return __pte(pte_val(entry) | R_PAGE_LARGE);
 	else
 		return entry;
 }
+126 -74
arch/powerpc/include/asm/book3s/64/mmu-hash.h
···
 
 /* Bits in the SLB VSID word */
 #define SLB_VSID_SHIFT		12
+#define SLB_VSID_SHIFT_256M	SLB_VSID_SHIFT
 #define SLB_VSID_SHIFT_1T	24
 #define SLB_VSID_SSIZE_SHIFT	62
 #define SLB_VSID_B		ASM_CONST(0xc000000000000000)
···
 static inline unsigned long hpt_hash(unsigned long vpn,
 				     unsigned int shift, int ssize)
 {
-	int mask;
+	unsigned long mask;
 	unsigned long hash, vsid;
 
 	/* VPN_SHIFT can be atmost 12 */
···
 * We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated
 * from mmu context id and effective segment id of the address.
 *
- * For user processes max context id is limited to ((1ul << 19) - 5)
- * for kernel space, we use the top 4 context ids to map address as below
+ * For user processes max context id is limited to MAX_USER_CONTEXT.
+
+ * For kernel space, we use context ids 1-4 to map addresses as below:
 * NOTE: each context only support 64TB now.
- * 0x7fffc -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x7fffd -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x7fffe -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x7ffff -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
+ * 0x00001 -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
+ * 0x00002 -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
+ * 0x00003 -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
+ * 0x00004 -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
 *
 * The proto-VSIDs are then scrambled into real VSIDs with the
 * multiplicative hash:
···
 * robust scattering in the hash table (at least based on some initial
 * results).
 *
- * We also consider VSID 0 special. We use VSID 0 for slb entries mapping
- * bad address. This enables us to consolidate bad address handling in
- * hash_page.
+ * We use VSID 0 to indicate an invalid VSID. The means we can't use context id
+ * 0, because a context id of 0 and an EA of 0 gives a proto-VSID of 0, which
+ * will produce a VSID of 0.
 *
 * We also need to avoid the last segment of the last context, because that
 * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
- * because of the modulo operation in vsid scramble. But the vmemmap
- * (which is what uses region 0xf) will never be close to 64TB in size
- * (it's 56 bytes per page of system memory).
+ * because of the modulo operation in vsid scramble.
 */
 
+/*
+ * Max Va bits we support as of now is 68 bits. We want 19 bit
+ * context ID.
+ * Restrictions:
+ * GPU has restrictions of not able to access beyond 128TB
+ * (47 bit effective address). We also cannot do more than 20bit PID.
+ * For p4 and p5 which can only do 65 bit VA, we restrict our CONTEXT_BITS
+ * to 16 bits (ie, we can only have 2^16 pids at the same time).
+ */
+#define VA_BITS			68
 #define CONTEXT_BITS		19
-#define ESID_BITS		18
-#define ESID_BITS_1T		6
+#define ESID_BITS		(VA_BITS - (SID_SHIFT + CONTEXT_BITS))
+#define ESID_BITS_1T		(VA_BITS - (SID_SHIFT_1T + CONTEXT_BITS))
 
 #define ESID_BITS_MASK		((1 << ESID_BITS) - 1)
 #define ESID_BITS_1T_MASK	((1 << ESID_BITS_1T) - 1)
···
 /*
 * 256MB segment
 * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments
- * available for user + kernel mapping. The top 4 contexts are used for
- * kernel mapping. Each segment contains 2^28 bytes. Each
- * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts
- * (19 == 37 + 28 - 46).
+ * available for user + kernel mapping. VSID 0 is reserved as invalid, contexts
+ * 1-4 are used for kernel mapping. Each segment contains 2^28 bytes. Each
+ * context maps 2^49 bytes (512TB).
+ *
+ * We also need to avoid the last segment of the last context, because that
+ * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
+ * because of the modulo operation in vsid scramble.
 */
-#define MAX_USER_CONTEXT	((ASM_CONST(1) << CONTEXT_BITS) - 5)
+#define MAX_USER_CONTEXT	((ASM_CONST(1) << CONTEXT_BITS) - 2)
+#define MIN_USER_CONTEXT	(5)
+
+/* Would be nice to use KERNEL_REGION_ID here */
+#define KERNEL_REGION_CONTEXT_OFFSET	(0xc - 1)
+
+/*
+ * For platforms that support on 65bit VA we limit the context bits
+ */
+#define MAX_USER_CONTEXT_65BIT_VA ((ASM_CONST(1) << (65 - (SID_SHIFT + ESID_BITS))) - 2)
 
 /*
 * This should be computed such that protovosid * vsid_mulitplier
- * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus
+ * doesn't overflow 64 bits. The vsid_mutliplier should also be
+ * co-prime to vsid_modulus. We also need to make sure that number
+ * of bits in multiplied result (dividend) is less than twice the number of
+ * protovsid bits for our modulus optmization to work.
+ *
+ * The below table shows the current values used.
+ * |-------+------------+----------------------+------------+-------------------|
+ * |       | Prime Bits | proto VSID_BITS_65VA | Total Bits | 2* prot VSID_BITS |
+ * |-------+------------+----------------------+------------+-------------------|
+ * | 1T    |         24 |                   25 |         49 |                50 |
+ * |-------+------------+----------------------+------------+-------------------|
+ * | 256MB |         24 |                   37 |         61 |                74 |
+ * |-------+------------+----------------------+------------+-------------------|
+ *
+ * |-------+------------+----------------------+------------+--------------------|
+ * |       | Prime Bits | proto VSID_BITS_68VA | Total Bits | 2* proto VSID_BITS |
+ * |-------+------------+----------------------+------------+--------------------|
+ * | 1T    |         24 |                   28 |         52 |                 56 |
+ * |-------+------------+----------------------+------------+--------------------|
+ * | 256MB |         24 |                   40 |         64 |                 80 |
+ * |-------+------------+----------------------+------------+--------------------|
+ *
 */
 #define VSID_MULTIPLIER_256M	ASM_CONST(12538073)	/* 24-bit prime */
-#define VSID_BITS_256M		(CONTEXT_BITS + ESID_BITS)
-#define VSID_MODULUS_256M	((1UL<<VSID_BITS_256M)-1)
+#define VSID_BITS_256M		(VA_BITS - SID_SHIFT)
+#define VSID_BITS_65_256M	(65 - SID_SHIFT)
+/*
+ * Modular multiplicative inverse of VSID_MULTIPLIER under modulo VSID_MODULUS
+ */
+#define VSID_MULINV_256M	ASM_CONST(665548017062)
 
 #define VSID_MULTIPLIER_1T	ASM_CONST(12538073)	/* 24-bit prime */
-#define VSID_BITS_1T		(CONTEXT_BITS + ESID_BITS_1T)
-#define VSID_MODULUS_1T		((1UL<<VSID_BITS_1T)-1)
+#define VSID_BITS_1T		(VA_BITS - SID_SHIFT_1T)
+#define VSID_BITS_65_1T		(65 - SID_SHIFT_1T)
+#define VSID_MULINV_1T		ASM_CONST(209034062)
 
-
+/* 1TB VSID reserved for VRMA */
+#define VRMA_VSID	0x1ffffffUL
 #define USER_VSID_RANGE	(1UL << (ESID_BITS + SID_SHIFT))
 
-/*
- * This macro generates asm code to compute the VSID scramble
- * function.  Used in slb_allocate() and do_stab_bolted.  The function
- * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
- *
- *	rt = register containing the proto-VSID and into which the
- *		VSID will be stored
- *	rx = scratch register (clobbered)
- *
- *	- rt and rx must be different registers
- *	- The answer will end up in the low VSID_BITS bits of rt.  The higher
- *	  bits may contain other garbage, so you may need to mask the
- *	  result.
- */
-#define ASM_VSID_SCRAMBLE(rt, rx, size)					\
-	lis	rx,VSID_MULTIPLIER_##size@h;				\
-	ori	rx,rx,VSID_MULTIPLIER_##size@l;				\
-	mulld	rt,rt,rx;		/* rt = rt * MULTIPLIER */	\
-									\
-	srdi	rx,rt,VSID_BITS_##size;					\
-	clrldi	rt,rt,(64-VSID_BITS_##size);				\
-	add	rt,rt,rx;		/* add high and low bits */	\
-	/* NOTE: explanation based on VSID_BITS_##size = 36		\
-	 * Now, r3 == VSID (mod 2^36-1), and lies between 0 and		\
-	 * 2^36-1+2^28-1.  That in particular means that if r3 >=	\
-	 * 2^36-1, then r3+1 has the 2^36 bit set.  So, if r3+1 has	\
-	 * the bit clear, r3 already has the answer we want, if it	\
-	 * doesn't, the answer is the low 36 bits of r3+1.  So in all	\
-	 * cases the answer is the low 36 bits of (r3 + ((r3+1) >> 36))*/\
-	addi	rx,rt,1;						\
-	srdi	rx,rx,VSID_BITS_##size;	/* extract 2^VSID_BITS bit */	\
-	add	rt,rt,rx
-
 /* 4 bits per slice and we have one slice per 1TB */
-#define SLICE_ARRAY_SIZE  (H_PGTABLE_RANGE >> 41)
+#define SLICE_ARRAY_SIZE	(H_PGTABLE_RANGE >> 41)
+#define TASK_SLICE_ARRAY_SZ(x)	((x)->context.addr_limit >> 41)
 
 #ifndef __ASSEMBLY__
 
···
 #define vsid_scramble(protovsid, size) \
 	((((protovsid) * VSID_MULTIPLIER_##size) % VSID_MODULUS_##size))
 
-#else /* 1 */
+/* simplified form avoiding mod operation */
 #define vsid_scramble(protovsid, size) \
 	({								 \
···
 		x = (x >> VSID_BITS_##size) + (x & VSID_MODULUS_##size); \
 		(x + ((x+1) >> VSID_BITS_##size)) & VSID_MODULUS_##size; \
 	})
+
+#else /* 1 */
+static inline unsigned long vsid_scramble(unsigned long protovsid,
+					  unsigned long vsid_multiplier, int vsid_bits)
+{
+	unsigned long vsid;
+	unsigned long vsid_modulus = ((1UL << vsid_bits) - 1);
+	/*
+	 * We have same multipler for both 256 and 1T segements now
+	 */
+	vsid = protovsid * vsid_multiplier;
+	vsid = (vsid >> vsid_bits) + (vsid & vsid_modulus);
+	return (vsid + ((vsid + 1) >> vsid_bits)) & vsid_modulus;
+}
+
 #endif /* 1 */
 
 /* Returns the segment size indicator for a user address */
···
 static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
 				     int ssize)
 {
+	unsigned long va_bits = VA_BITS;
+	unsigned long vsid_bits;
+	unsigned long protovsid;
+
 	/*
 	 * Bad address. We return VSID 0 for that
 	 */
 	if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE)
 		return 0;
 
-	if (ssize == MMU_SEGSIZE_256M)
-		return vsid_scramble((context << ESID_BITS)
-				     | ((ea >> SID_SHIFT) & ESID_BITS_MASK), 256M);
-	return vsid_scramble((context << ESID_BITS_1T)
-			     | ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK), 1T);
+	if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
+		va_bits = 65;
+
+	if (ssize == MMU_SEGSIZE_256M) {
+		vsid_bits = va_bits - SID_SHIFT;
+		protovsid = (context << ESID_BITS) |
+			((ea >> SID_SHIFT) & ESID_BITS_MASK);
+		return vsid_scramble(protovsid, VSID_MULTIPLIER_256M, vsid_bits);
+	}
+	/* 1T segment */
+	vsid_bits = va_bits - SID_SHIFT_1T;
+	protovsid = (context << ESID_BITS_1T) |
+		((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK);
+	return vsid_scramble(protovsid, VSID_MULTIPLIER_1T, vsid_bits);
 }
 
 /*
 * This is only valid for addresses >= PAGE_OFFSET
- *
- * For kernel space, we use the top 4 context ids to map address as below
- * 0x7fffc -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x7fffd -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x7fffe -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x7ffff -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
 */
 static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
 {
 	unsigned long context;
 
+	if (!is_kernel_addr(ea))
+		return 0;
+
 	/*
-	 * kernel take the top 4 context from the available range
+	 * For kernel space, we use context ids 1-4 to map the address space as
+	 * below:
+	 *
+	 * 0x00001 -  [ 0xc000000000000000 - 0xc0003fffffffffff ]
+	 * 0x00002 -  [ 0xd000000000000000 - 0xd0003fffffffffff ]
+	 * 0x00003 -  [ 0xe000000000000000 - 0xe0003fffffffffff ]
+	 * 0x00004 -  [ 0xf000000000000000 - 0xf0003fffffffffff ]
+	 *
+	 * So we can compute the context from the region (top nibble) by
+	 * subtracting 11, or 0xc - 1.
 	 */
-	context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
+	context = (ea >> 60) - KERNEL_REGION_CONTEXT_OFFSET;
+
 	return get_vsid(context, ea, ssize);
 }
+9
arch/powerpc/include/asm/book3s/64/mmu.h
···
 * MAX_USER_CONTEXT * 16 bytes of space.
 */
 #define PRTB_SIZE_SHIFT	(CONTEXT_BITS + 4)
+#define PRTB_ENTRIES	(1ul << CONTEXT_BITS)
+
 /*
 * Power9 currently only support 64K partition table size.
 */
···
 typedef unsigned long mm_context_id_t;
 struct spinlock;
 
+/* Maximum possible number of NPUs in a system. */
+#define NV_MAX_NPUS 8
+
 typedef struct {
 	mm_context_id_t id;
 	u16 user_psize;		/* page size index */
 
+	/* NPU NMMU context */
+	struct npu_context *npu_context;
+
 #ifdef CONFIG_PPC_MM_SLICES
 	u64 low_slices_psize;	/* SLB page size encodings */
 	unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
+	unsigned long addr_limit;
 #else
 	u16 sllp;		/* SLB page size encoding */
 #endif
+41 -17
arch/powerpc/include/asm/book3s/64/pgtable.h
···
  #define _PAGE_BIT_SWAP_TYPE	0

  #define _PAGE_RO		0
+ #define _PAGE_SHARED		0

  #define _PAGE_EXEC		0x00001 /* execute permission */
  #define _PAGE_WRITE		0x00002 /* write access allowed */
···
  #define _RPAGE_RSV3		0x0400000000000000UL
  #define _RPAGE_RSV4		0x0200000000000000UL

- #ifdef CONFIG_MEM_SOFT_DIRTY
- #define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
- #else
- #define _PAGE_SOFT_DIRTY	0x00000
- #endif
- #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
+ #define _PAGE_PTE		0x4000000000000000UL	/* distinguishes PTEs from pointers */
+ #define _PAGE_PRESENT		0x8000000000000000UL	/* pte contains a translation */

  /*
-  * For P9 DD1 only, we need to track whether the pte's huge.
+  * Top and bottom bits of RPN which can be used by hash
+  * translation mode, because we expect them to be zero
+  * otherwise.
   */
- #define _PAGE_LARGE	_RPAGE_RSV1
+ #define _RPAGE_RPN0		0x01000
+ #define _RPAGE_RPN1		0x02000
+ #define _RPAGE_RPN44		0x0100000000000000UL
+ #define _RPAGE_RPN43		0x0080000000000000UL
+ #define _RPAGE_RPN42		0x0040000000000000UL
+ #define _RPAGE_RPN41		0x0020000000000000UL

+ /* Max physical address bit as per radix table */
+ #define _RPAGE_PA_MAX		57

- #define _PAGE_PTE		(1ul << 62)	/* distinguishes PTEs from pointers */
- #define _PAGE_PRESENT		(1ul << 63)	/* pte contains a translation */
+ /*
+  * Max physical address bit we will use for now.
+  *
+  * This is mostly a hardware limitation and for now Power9 has
+  * a 51 bit limit.
+  *
+  * This is different from the number of physical bit required to address
+  * the last byte of memory. That is defined by MAX_PHYSMEM_BITS.
+  * MAX_PHYSMEM_BITS is a linux limitation imposed by the maximum
+  * number of sections we can support (SECTIONS_SHIFT).
+  *
+  * This is different from Radix page table limitation above and
+  * should always be less than that. The limit is done such that
+  * we can overload the bits between _RPAGE_PA_MAX and _PAGE_PA_MAX
+  * for hash linux page table specific bits.
+  *
+  * In order to be compatible with future hardware generations we keep
+  * some offsets and limit this for now to 53
+  */
+ #define _PAGE_PA_MAX		53
+
+ #define _PAGE_SOFT_DIRTY	_RPAGE_SW3 /* software: software dirty tracking */
+ #define _PAGE_SPECIAL		_RPAGE_SW2 /* software: special page */
  /*
   * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
   * Instead of fixing all of them, add an alternate define which
···
   */
  #define _PAGE_NO_CACHE		_PAGE_TOLERANT
  /*
-  * We support 57 bit real address in pte. Clear everything above 57, and
-  * every thing below PAGE_SHIFT;
+  * We support _RPAGE_PA_MAX bit real address in pte. On the linux side
+  * we are limited by _PAGE_PA_MAX. Clear everything above _PAGE_PA_MAX
+  * and every thing below PAGE_SHIFT;
   */
- #define PTE_RPN_MASK	(((1UL << 57) - 1) & (PAGE_MASK))
+ #define PTE_RPN_MASK	(((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK))
  /*
   * set of bits not changed in pmd_modify. Even though we have hash specific bits
   * in here, on radix we expect them to be zero.
···
  extern unsigned long __pte_frag_size_shift;
  #define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift
  #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
- /*
-  * Pgtable size used by swapper, init in asm code
-  */
- #define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)

  #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
  #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
+7 -1
arch/powerpc/include/asm/book3s/64/radix.h
···
  #include <asm/book3s/64/radix-4k.h>
  #endif

+ /*
+  * For P9 DD1 only, we need to track whether the pte's huge.
+  */
+ #define R_PAGE_LARGE	_RPAGE_RSV1
+
+
  #ifndef __ASSEMBLY__
  #include <asm/book3s/64/tlbflush-radix.h>
  #include <asm/cpu_has_feature.h>
···
  static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
  {
  	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
- 		return __pmd(pmd_val(pmd) | _PAGE_PTE | _PAGE_LARGE);
+ 		return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
  	return __pmd(pmd_val(pmd) | _PAGE_PTE);
  }
  static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
+41
arch/powerpc/include/asm/code-patching.h
···
  #include <asm/types.h>
  #include <asm/ppc-opcode.h>
+ #include <linux/string.h>
+ #include <linux/kallsyms.h>

  /* Flags for create_branch:
   * "b"   == create_branch(addr, target, 0);
···
  	/* All other cases there is no change vs ppc_function_entry() */
  	return ppc_function_entry(func);
  #endif
  }

+ /*
+  * Wrapper around kallsyms_lookup() to return function entry address:
+  * - For ABIv1, we lookup the dot variant.
+  * - For ABIv2, we return the local entry point.
+  */
+ static inline unsigned long ppc_kallsyms_lookup_name(const char *name)
+ {
+ 	unsigned long addr;
+ #ifdef PPC64_ELF_ABI_v1
+ 	/* check for dot variant */
+ 	char dot_name[1 + KSYM_NAME_LEN];
+ 	bool dot_appended = false;
+
+ 	if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
+ 		return 0;
+
+ 	if (name[0] != '.') {
+ 		dot_name[0] = '.';
+ 		dot_name[1] = '\0';
+ 		strlcat(dot_name, name, sizeof(dot_name));
+ 		dot_appended = true;
+ 	} else {
+ 		dot_name[0] = '\0';
+ 		strlcat(dot_name, name, sizeof(dot_name));
+ 	}
+ 	addr = kallsyms_lookup_name(dot_name);
+ 	if (!addr && dot_appended)
+ 		/* Let's try the original non-dot symbol lookup */
+ 		addr = kallsyms_lookup_name(name);
+ #elif defined(PPC64_ELF_ABI_v2)
+ 	addr = kallsyms_lookup_name(name);
+ 	if (addr)
+ 		addr = ppc_function_entry((void *)addr);
+ #else
+ 	addr = kallsyms_lookup_name(name);
+ #endif
+ 	return addr;
+ }

  #ifdef CONFIG_PPC64
+30 -3
arch/powerpc/include/asm/cpuidle.h
···
  #define _ASM_POWERPC_CPUIDLE_H

  #ifdef CONFIG_PPC_POWERNV
- /* Used in powernv idle state management */
+ /* Thread state used in powernv idle state management */
  #define PNV_THREAD_RUNNING              0
  #define PNV_THREAD_NAP                  1
  #define PNV_THREAD_SLEEP                2
  #define PNV_THREAD_WINKLE               3
- #define PNV_CORE_IDLE_LOCK_BIT          0x100
- #define PNV_CORE_IDLE_THREAD_BITS       0x0FF
+
+ /*
+  * Core state used in powernv idle for POWER8.
+  *
+  * The lock bit synchronizes updates to the state, as well as parts of the
+  * sleep/wake code (see kernel/idle_book3s.S).
+  *
+  * Bottom 8 bits track the idle state of each thread. Bit is cleared before
+  * the thread executes an idle instruction (nap/sleep/winkle).
+  *
+  * Then there is winkle tracking. A core does not lose complete state
+  * until every thread is in winkle. So the winkle count field counts the
+  * number of threads in winkle (small window of false positives is okay
+  * around the sleep/wake, so long as there are no false negatives).
+  *
+  * When the winkle count reaches 8 (the COUNT_ALL_BIT becomes set), then
+  * the THREAD_WINKLE_BITS are set, which indicate which threads have not
+  * yet woken from the winkle state.
+  */
+ #define PNV_CORE_IDLE_LOCK_BIT			0x10000000
+
+ #define PNV_CORE_IDLE_WINKLE_COUNT		0x00010000
+ #define PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT	0x00080000
+ #define PNV_CORE_IDLE_WINKLE_COUNT_BITS		0x000F0000
+ #define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT	8
+ #define PNV_CORE_IDLE_THREAD_WINKLE_BITS	0x0000FF00
+
+ #define PNV_CORE_IDLE_THREAD_BITS		0x000000FF

  /*
   * ============================ NOTE =================================
···
  extern u64 pnv_first_deep_stop_state;

+ unsigned long pnv_cpu_offline(unsigned int cpu);
  int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags);
  static inline void report_invalid_psscr_val(u64 psscr_val, int err)
  {
+3 -2
arch/powerpc/include/asm/cputable.h
···
  	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
  	    CPU_FTR_DSCR | CPU_FTR_SAO  | \
  	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
- 	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
+ 	    CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
  	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
  	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300)
- #define CPU_FTRS_POWER9_DD1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1)
+ #define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \
+ 			     (~CPU_FTR_SAO))
  #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
  	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
  	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
+30 -10
arch/powerpc/include/asm/dbell.h
···
  #ifdef CONFIG_PPC_BOOK3S

  #define PPC_DBELL_MSGTYPE		PPC_DBELL_SERVER
- #define SPRN_DOORBELL_CPUTAG		SPRN_TIR
- #define PPC_DBELL_TAG_MASK		0x7f

  static inline void _ppc_msgsnd(u32 msg)
  {
- 	if (cpu_has_feature(CPU_FTR_HVMODE))
- 		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
- 	else
- 		__asm__ __volatile__ (PPC_MSGSNDP(%0) : : "r" (msg));
+ 	__asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSND(%1), PPC_MSGSNDP(%1), %0)
+ 				: : "i" (CPU_FTR_HVMODE), "r" (msg));
+ }
+
+ /* sync before sending message */
+ static inline void ppc_msgsnd_sync(void)
+ {
+ 	__asm__ __volatile__ ("sync" : : : "memory");
+ }
+
+ /* sync after taking message interrupt */
+ static inline void ppc_msgsync(void)
+ {
+ 	/* sync is not required when taking messages from the same core */
+ 	__asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSYNC " ; lwsync", "", %0)
+ 				: : "i" (CPU_FTR_HVMODE|CPU_FTR_ARCH_300));
  }

  #else /* CONFIG_PPC_BOOK3S */

  #define PPC_DBELL_MSGTYPE		PPC_DBELL
- #define SPRN_DOORBELL_CPUTAG		SPRN_PIR
- #define PPC_DBELL_TAG_MASK		0x3fff

  static inline void _ppc_msgsnd(u32 msg)
  {
  	__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
  }

+ /* sync before sending message */
+ static inline void ppc_msgsnd_sync(void)
+ {
+ 	__asm__ __volatile__ ("sync" : : : "memory");
+ }
+
+ /* sync after taking message interrupt */
+ static inline void ppc_msgsync(void)
+ {
+ }
+
  #endif /* CONFIG_PPC_BOOK3S */

- extern void doorbell_cause_ipi(int cpu, unsigned long data);
+ extern void doorbell_global_ipi(int cpu);
+ extern void doorbell_core_ipi(int cpu);
+ extern int doorbell_try_core_ipi(int cpu);
  extern void doorbell_exception(struct pt_regs *regs);
- extern void doorbell_setup_this_cpu(void);

  static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag)
  {
-2
arch/powerpc/include/asm/debug.h
···

  struct pt_regs;

- extern struct dentry *powerpc_debugfs_root;
-
  #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)

  extern int (*__debugger)(struct pt_regs *regs);
+17
arch/powerpc/include/asm/debugfs.h
···
+ #ifndef _ASM_POWERPC_DEBUGFS_H
+ #define _ASM_POWERPC_DEBUGFS_H
+
+ /*
+  * Copyright 2017, Michael Ellerman, IBM Corporation.
+  *
+  * This program is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU General Public License
+  * as published by the Free Software Foundation; either version
+  * 2 of the License, or (at your option) any later version.
+  */
+
+ #include <linux/debugfs.h>
+
+ extern struct dentry *powerpc_debugfs_root;
+
+ #endif /* _ASM_POWERPC_DEBUGFS_H */
+54 -33
arch/powerpc/include/asm/exception-64s.h
···
  	std	ra,offset(r13);						\
  END_FTR_SECTION_NESTED(ftr,ftr,943)

- #define EXCEPTION_PROLOG_0_PACA(area)					\
+ #define EXCEPTION_PROLOG_0(area)					\
+ 	GET_PACA(r13);							\
  	std	r9,area+EX_R9(r13);	/* save r9 */			\
  	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);			\
  	HMT_MEDIUM;							\
  	std	r10,area+EX_R10(r13);	/* save r10 - r12 */		\
  	OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
-
- #define EXCEPTION_PROLOG_0(area)					\
- 	GET_PACA(r13);							\
- 	EXCEPTION_PROLOG_0_PACA(area)

  #define __EXCEPTION_PROLOG_1(area, extra, vec)				\
  	OPT_SAVE_REG_TO_PACA(area+EX_PPR, r9, CPU_FTR_HAS_PPR);		\
···
  #define EXCEPTION_PROLOG_PSERIES_1(label, h)				\
  	__EXCEPTION_PROLOG_PSERIES_1(label, h)

+ /* _NORI variant keeps MSR_RI clear */
+ #define __EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)			\
+ 	ld	r10,PACAKMSR(r13);	/* get MSR value for kernel */	\
+ 	xori	r10,r10,MSR_RI;		/* Clear MSR_RI */		\
+ 	mfspr	r11,SPRN_##h##SRR0;	/* save SRR0 */			\
+ 	LOAD_HANDLER(r12,label)						\
+ 	mtspr	SPRN_##h##SRR0,r12;					\
+ 	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\
+ 	mtspr	SPRN_##h##SRR1,r10;					\
+ 	h##rfid;							\
+ 	b	.	/* prevent speculative execution */
+
+ #define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)			\
+ 	__EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)
+
  #define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec)		\
  	EXCEPTION_PROLOG_0(area);					\
- 	EXCEPTION_PROLOG_1(area, extra, vec);				\
- 	EXCEPTION_PROLOG_PSERIES_1(label, h);
-
- /* Have the PACA in r13 already */
- #define EXCEPTION_PROLOG_PSERIES_PACA(area, label, h, extra, vec)	\
- 	EXCEPTION_PROLOG_0_PACA(area);					\
  	EXCEPTION_PROLOG_1(area, extra, vec);				\
  	EXCEPTION_PROLOG_PSERIES_1(label, h);
···
  	ld	r9,area+EX_R9(r13);					\
  	bctr

- #define BRANCH_TO_KVM(reg, label)					\
- 	__LOAD_FAR_HANDLER(reg, label);					\
- 	mtctr	reg;							\
- 	bctr
-
  #else
  #define BRANCH_TO_COMMON(reg, label)					\
  	b	label
···
  #define BRANCH_LINK_TO_FAR(label)					\
  	bl	label

- #define BRANCH_TO_KVM(reg, label)					\
- 	b	label
-
  #define __BRANCH_TO_KVM_EXIT(area, label)				\
  	ld	r9,area+EX_R9(r13);					\
  	b	label

  #endif
+
+ /* Do not enable RI */
+ #define EXCEPTION_PROLOG_PSERIES_NORI(area, label, h, extra, vec)	\
+ 	EXCEPTION_PROLOG_0(area);					\
+ 	EXCEPTION_PROLOG_1(area, extra, vec);				\
+ 	EXCEPTION_PROLOG_PSERIES_1_NORI(label, h);


  #define __KVM_HANDLER(area, h, n)					\
···

  #define NOTEST(n)

+ #define EXCEPTION_PROLOG_COMMON_1()					\
+ 	std	r9,_CCR(r1);		/* save CR in stackframe	*/ \
+ 	std	r11,_NIP(r1);		/* save SRR0 in stackframe	*/ \
+ 	std	r12,_MSR(r1);		/* save SRR1 in stackframe	*/ \
+ 	std	r10,0(r1);		/* make stack chain pointer	*/ \
+ 	std	r0,GPR0(r1);		/* save r0 in stackframe	*/ \
+ 	std	r10,GPR1(r1);		/* save r1 in stackframe	*/ \
+
+
  /*
   * The common exception prolog is used for all except a few exceptions
   * such as a segment miss on a kernel address. We have to be prepared
···
  	addi	r3,r13,area;		/* r3 -> where regs are saved*/	\
  	RESTORE_CTR(r1, area);						\
  	b	bad_stack;						\
- 3:	std	r9,_CCR(r1);		/* save CR in stackframe	*/ \
- 	std	r11,_NIP(r1);		/* save SRR0 in stackframe	*/ \
- 	std	r12,_MSR(r1);		/* save SRR1 in stackframe	*/ \
- 	std	r10,0(r1);		/* make stack chain pointer	*/ \
- 	std	r0,GPR0(r1);		/* save r0 in stackframe	*/ \
- 	std	r10,GPR1(r1);		/* save r1 in stackframe	*/ \
+ 3:	EXCEPTION_PROLOG_COMMON_1();					\
  	beq	4f;			/* if from kernel mode		*/ \
  	ACCOUNT_CPU_USER_ENTRY(r13, r9, r10);				\
  	SAVE_PPR(area, r9, r10);					\
···

  #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label)			\
  	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec);		\
- 	EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV)
+ 	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV)

  /*
   * Our exception common code can be passed various "additions"
···
  	beql	ppc64_runlatch_on_trampoline;				\
  	END_FTR_SECTION_IFSET(CPU_FTR_CTRL)

- #define EXCEPTION_COMMON(trap, label, hdlr, ret, additions)		\
- 	EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN);			\
+ #define EXCEPTION_COMMON(area, trap, label, hdlr, ret, additions)	\
+ 	EXCEPTION_PROLOG_COMMON(trap, area);				\
  	/* Volatile regs are potentially clobbered here */		\
  	additions;							\
  	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
  	bl	hdlr;							\
  	b	ret

+ /*
+  * Exception where stack is already set in r1, r1 is saved in r10, and it
+  * continues rather than returns.
+  */
+ #define EXCEPTION_COMMON_NORET_STACK(area, trap, label, hdlr, additions) \
+ 	EXCEPTION_PROLOG_COMMON_1();					\
+ 	EXCEPTION_PROLOG_COMMON_2(area);				\
+ 	EXCEPTION_PROLOG_COMMON_3(trap);				\
+ 	/* Volatile regs are potentially clobbered here */		\
+ 	additions;							\
+ 	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
+ 	bl	hdlr
+
  #define STD_EXCEPTION_COMMON(trap, label, hdlr)				\
- 	EXCEPTION_COMMON(trap, label, hdlr, ret_from_except,		\
- 			 ADD_NVGPRS;ADD_RECONCILE)
+ 	EXCEPTION_COMMON(PACA_EXGEN, trap, label, hdlr,			\
+ 			 ret_from_except, ADD_NVGPRS;ADD_RECONCILE)

  /*
   * Like STD_EXCEPTION_COMMON, but for exceptions that can occur
   * in the idle task and therefore need the special idle handling
   * (finish nap and runlatch)
   */
- #define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr)			\
- 	EXCEPTION_COMMON(trap, label, hdlr, ret_from_except_lite,	\
- 			 FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON)
+ #define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr)			\
+ 	EXCEPTION_COMMON(PACA_EXGEN, trap, label, hdlr,			\
+ 			 ret_from_except_lite, FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON)

  /*
   * When the idle code in power4_idle puts the CPU into NAP mode,
+3
arch/powerpc/include/asm/feature-fixups.h
···
  #define END_FTR_SECTION(msk, val)		\
  	END_FTR_SECTION_NESTED(msk, val, 97)

+ #define END_FTR_SECTION_NESTED_IFSET(msk, label)	\
+ 	END_FTR_SECTION_NESTED((msk), (msk), label)
+
  #define END_FTR_SECTION_IFSET(msk)	END_FTR_SECTION((msk), (msk))
  #define END_FTR_SECTION_IFCLR(msk)	END_FTR_SECTION((msk), 0)
+1
arch/powerpc/include/asm/head-64.h
···
  	USE_TEXT_SECTION();				\
  	.balign IFETCH_ALIGN_BYTES;			\
  	.global name;					\
+ 	_ASM_NOKPROBE_SYMBOL(name);			\
  	DEFINE_FIXED_SYMBOL(name);			\
  name:
-10
arch/powerpc/include/asm/hvcall.h
···
  long plpar_hcall9(unsigned long opcode, unsigned long *retbuf, ...);
  long plpar_hcall9_raw(unsigned long opcode, unsigned long *retbuf, ...);

- /* For hcall instrumentation. One structure per-hcall, per-CPU */
- struct hcall_stats {
- 	unsigned long	num_calls;	/* number of calls (on this CPU) */
- 	unsigned long	tb_total;	/* total wall time (mftb) of calls. */
- 	unsigned long	purr_total;	/* total cpu time (PURR) of calls. */
- 	unsigned long	tb_start;
- 	unsigned long	purr_start;
- };
- #define HCALL_STAT_ARRAY_SIZE	((MAX_HCALL_OPCODE >> 2) + 1)
-
  struct hvcall_mpp_data {
  	unsigned long entitled_mem;
  	unsigned long mapped_mem;
+53 -49
arch/powerpc/include/asm/io.h
···
  #endif

  #include <linux/device.h>
- #include <linux/io.h>
-
  #include <linux/compiler.h>
  #include <asm/page.h>
  #include <asm/byteorder.h>
···

  #endif /* __BIG_ENDIAN */

- /*
-  * Cache inhibitied accessors for use in real mode, you don't want to use these
-  * unless you know what you're doing.
-  *
-  * NB. These use the cpu byte ordering.
-  */
- DEF_MMIO_OUT_X(out_rm8,   8, stbcix);
- DEF_MMIO_OUT_X(out_rm16, 16, sthcix);
- DEF_MMIO_OUT_X(out_rm32, 32, stwcix);
- DEF_MMIO_IN_X(in_rm8,   8, lbzcix);
- DEF_MMIO_IN_X(in_rm16, 16, lhzcix);
- DEF_MMIO_IN_X(in_rm32, 32, lwzcix);
-
  #ifdef __powerpc64__
-
- DEF_MMIO_OUT_X(out_rm64, 64, stdcix);
- DEF_MMIO_IN_X(in_rm64, 64, ldcix);

  #ifdef __BIG_ENDIAN__
  DEF_MMIO_OUT_D(out_be64, 64, std);
···

  #endif
  #endif /* __powerpc64__ */
-
-
- /*
-  * Simple Cache inhibited accessors
-  * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
-  * barriers, callers need to manage memory barriers on their own.
-  * These can only be used in hypervisor real mode.
-  */
-
- static inline u32 _lwzcix(unsigned long addr)
- {
- 	u32 ret;
-
- 	__asm__ __volatile__("lwzcix %0,0, %1"
- 			     : "=r" (ret) : "r" (addr) : "memory");
- 	return ret;
- }
-
- static inline void _stbcix(u64 addr, u8 val)
- {
- 	__asm__ __volatile__("stbcix %0,0,%1"
- 		: : "r" (val), "r" (addr) : "memory");
- }
-
- static inline void _stwcix(u64 addr, u32 val)
- {
- 	__asm__ __volatile__("stwcix %0,0,%1"
- 		: : "r" (val), "r" (addr) : "memory");
- }

  /*
   * Low level IO stream instructions are defined out of line for now
···
  }

  /*
-  * Real mode version of the above. stdcix is only supposed to be used
-  * in hypervisor real mode as per the architecture spec.
+  * Real mode versions of the above. Those instructions are only supposed
+  * to be used in hypervisor real mode as per the architecture spec.
   */
+ static inline void __raw_rm_writeb(u8 val, volatile void __iomem *paddr)
+ {
+ 	__asm__ __volatile__("stbcix %0,0,%1"
+ 		: : "r" (val), "r" (paddr) : "memory");
+ }
+
+ static inline void __raw_rm_writew(u16 val, volatile void __iomem *paddr)
+ {
+ 	__asm__ __volatile__("sthcix %0,0,%1"
+ 		: : "r" (val), "r" (paddr) : "memory");
+ }
+
+ static inline void __raw_rm_writel(u32 val, volatile void __iomem *paddr)
+ {
+ 	__asm__ __volatile__("stwcix %0,0,%1"
+ 		: : "r" (val), "r" (paddr) : "memory");
+ }
+
  static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
  {
  	__asm__ __volatile__("stdcix %0,0,%1"
  		: : "r" (val), "r" (paddr) : "memory");
  }

+ static inline u8 __raw_rm_readb(volatile void __iomem *paddr)
+ {
+ 	u8 ret;
+ 	__asm__ __volatile__("lbzcix %0,0, %1"
+ 			     : "=r" (ret) : "r" (paddr) : "memory");
+ 	return ret;
+ }
+
+ static inline u16 __raw_rm_readw(volatile void __iomem *paddr)
+ {
+ 	u16 ret;
+ 	__asm__ __volatile__("lhzcix %0,0, %1"
+ 			     : "=r" (ret) : "r" (paddr) : "memory");
+ 	return ret;
+ }
+
+ static inline u32 __raw_rm_readl(volatile void __iomem *paddr)
+ {
+ 	u32 ret;
+ 	__asm__ __volatile__("lwzcix %0,0, %1"
+ 			     : "=r" (ret) : "r" (paddr) : "memory");
+ 	return ret;
+ }
+
+ static inline u64 __raw_rm_readq(volatile void __iomem *paddr)
+ {
+ 	u64 ret;
+ 	__asm__ __volatile__("ldcix %0,0, %1"
+ 			     : "=r" (ret) : "r" (paddr) : "memory");
+ 	return ret;
+ }
  #endif /* __powerpc64__ */

  /*
···
  extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size);
  #define ioremap_nocache(addr, size)	ioremap((addr), (size))
  #define ioremap_uc(addr, size)		ioremap((addr), (size))
+ #define ioremap_cache(addr, size) \
+ 	ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL))

  extern void iounmap(volatile void __iomem *addr);
+10 -2
arch/powerpc/include/asm/iommu.h
···
  			long index,
  			unsigned long *hpa,
  			enum dma_data_direction *direction);
+ 	/* Real mode */
+ 	int (*exchange_rm)(struct iommu_table *tbl,
+ 			long index,
+ 			unsigned long *hpa,
+ 			enum dma_data_direction *direction);
  #endif
  	void (*clear)(struct iommu_table *tbl,
  			long index, long npages);
···
  	struct list_head it_group_list;/* List of iommu_table_group_link */
  	unsigned long *it_userspace; /* userspace view of the table */
  	struct iommu_table_ops *it_ops;
+ 	struct kref    it_kref;
  };

  #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
···

  extern int dma_iommu_dma_supported(struct device *dev, u64 mask);

- /* Frees table for an individual device node */
- extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
+ extern struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl);
+ extern int iommu_tce_table_put(struct iommu_table *tbl);

  /* Initializes an iommu_table based in values set in the passed-in
   * structure
···
  extern void iommu_del_device(struct device *dev);
  extern int __init tce_iommu_bus_notifier_init(void);
  extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
+ 		unsigned long *hpa, enum dma_data_direction *direction);
+ extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
  		unsigned long *hpa, enum dma_data_direction *direction);
  #else
  static inline void iommu_register_group(struct iommu_table_group *table_group,
+10 -53
arch/powerpc/include/asm/kprobes.h
···
  #define MAX_OPTINSN_SIZE	(optprobe_template_end - optprobe_template_entry)
  #define RELATIVEJUMP_SIZE	sizeof(kprobe_opcode_t)	/* 4 bytes */

- #ifdef PPC64_ELF_ABI_v2
- /* PPC64 ABIv2 needs local entry point */
- #define kprobe_lookup_name(name, addr)					\
- {									\
- 	addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);		\
- 	if (addr)							\
- 		addr = (kprobe_opcode_t *)ppc_function_entry(addr);	\
- }
- #elif defined(PPC64_ELF_ABI_v1)
- /*
-  * 64bit powerpc ABIv1 uses function descriptors:
-  * - Check for the dot variant of the symbol first.
-  * - If that fails, try looking up the symbol provided.
-  *
-  * This ensures we always get to the actual symbol and not the descriptor.
-  * Also handle <module:symbol> format.
-  */
- #define kprobe_lookup_name(name, addr)					\
- {									\
- 	char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];		\
- 	const char *modsym;						\
- 	bool dot_appended = false;					\
- 	if ((modsym = strchr(name, ':')) != NULL) {			\
- 		modsym++;						\
- 		if (*modsym != '\0' && *modsym != '.') {		\
- 			/* Convert to <module:.symbol> */		\
- 			strncpy(dot_name, name, modsym - name);		\
- 			dot_name[modsym - name] = '.';			\
- 			dot_name[modsym - name + 1] = '\0';		\
- 			strncat(dot_name, modsym,			\
- 				sizeof(dot_name) - (modsym - name) - 2);\
- 			dot_appended = true;				\
- 		} else {						\
- 			dot_name[0] = '\0';				\
- 			strncat(dot_name, name, sizeof(dot_name) - 1);	\
- 		}							\
- 	} else if (name[0] != '.') {					\
- 		dot_name[0] = '.';					\
- 		dot_name[1] = '\0';					\
- 		strncat(dot_name, name, KSYM_NAME_LEN - 2);		\
- 		dot_appended = true;					\
- 	} else {							\
- 		dot_name[0] = '\0';					\
- 		strncat(dot_name, name, KSYM_NAME_LEN - 1);		\
- 	}								\
- 	addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);	\
- 	if (!addr && dot_appended) {					\
- 		/* Let's try the original non-dot symbol lookup */	\
- 		addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);	\
- 	}								\
- }
- #endif
-
  #define flush_insn_slot(p)	do { } while (0)
  #define kretprobe_blacklist_size 0
···
  extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
  extern int kprobe_handler(struct pt_regs *regs);
  extern int kprobe_post_handler(struct pt_regs *regs);
+ #ifdef CONFIG_KPROBES_ON_FTRACE
+ extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ 			   struct kprobe_ctlblk *kcb);
+ #else
+ static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ 				  struct kprobe_ctlblk *kcb)
+ {
+ 	return 0;
+ }
+ #endif
  #else
  static inline int kprobe_handler(struct pt_regs *regs) { return 0; }
  static inline int kprobe_post_handler(struct pt_regs *regs) { return 0; }
-2
arch/powerpc/include/asm/kvm_book3s_64.h
···
  #define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
  #endif

- #define VRMA_VSID	0x1ffffffUL	/* 1TB VSID reserved for VRMA */
-
  /*
   * We use a lock bit in HPTE dword 0 to synchronize updates and
   * accesses to each HPTE, and another bit to indicate non-present
+1 -1
arch/powerpc/include/asm/kvm_book3s_asm.h
···
  	u8 ptid;
  	struct kvm_vcpu *kvm_vcpu;
  	struct kvmppc_vcore *kvm_vcore;
- 	unsigned long xics_phys;
+ 	void __iomem *xics_phys;
  	u32 saved_xirr;
  	u64 dabr;
  	u64 host_mmcr[7];	/* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */
+1 -9
arch/powerpc/include/asm/kvm_ppc.h
···
  extern void kvm_cma_reserve(void) __init;
  static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
  {
- 	paca[cpu].kvm_hstate.xics_phys = addr;
+ 	paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr;
  }

  static inline u32 kvmppc_get_xics_latch(void)
···
  extern void kvmppc_free_pimap(struct kvm *kvm);
  extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
  extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
- extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
- extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
  extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
  extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
  extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
···
  static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
  	{ return 0; }
  static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
- static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
- 					 unsigned long server)
- 	{ return -EINVAL; }
- static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
- 					struct kvm_irq_level *args)
- 	{ return -ENOTTY; }
  static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
  	{ return 0; }
  #endif
+2 -92
arch/powerpc/include/asm/mce.h
···

  #include <linux/bitops.h>

- /*
-  * Machine Check bits on power7 and power8
-  */
- #define P7_SRR1_MC_LOADSTORE(srr1)	((srr1) & PPC_BIT(42))	/* P8 too */
-
- /* SRR1 bits for machine check (On Power7 and Power8) */
- #define P7_SRR1_MC_IFETCH(srr1)	((srr1) & PPC_BITMASK(43, 45))	/* P8 too */
-
- #define P7_SRR1_MC_IFETCH_UE			(0x1 << PPC_BITLSHIFT(45))	/* P8 too */
- #define P7_SRR1_MC_IFETCH_SLB_PARITY		(0x2 << PPC_BITLSHIFT(45))	/* P8 too */
- #define P7_SRR1_MC_IFETCH_SLB_MULTIHIT		(0x3 << PPC_BITLSHIFT(45))	/* P8 too */
- #define P7_SRR1_MC_IFETCH_SLB_BOTH		(0x4 << PPC_BITLSHIFT(45))
- #define P7_SRR1_MC_IFETCH_TLB_MULTIHIT		(0x5 << PPC_BITLSHIFT(45))	/* P8 too */
- #define P7_SRR1_MC_IFETCH_UE_TLB_RELOAD		(0x6 << PPC_BITLSHIFT(45))	/* P8 too */
- #define P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL	(0x7 << PPC_BITLSHIFT(45))
-
- /* SRR1 bits for machine check (On Power8) */
- #define P8_SRR1_MC_IFETCH_ERAT_MULTIHIT		(0x4 << PPC_BITLSHIFT(45))
-
- /* DSISR bits for machine check (On Power7 and Power8) */
- #define P7_DSISR_MC_UE				(PPC_BIT(48))	/* P8 too */
- #define P7_DSISR_MC_UE_TABLEWALK		(PPC_BIT(49))	/* P8 too */
- #define P7_DSISR_MC_ERAT_MULTIHIT		(PPC_BIT(52))	/* P8 too */
- #define P7_DSISR_MC_TLB_MULTIHIT_MFTLB		(PPC_BIT(53))	/* P8 too */
- #define P7_DSISR_MC_SLB_PARITY_MFSLB		(PPC_BIT(55))	/* P8 too */
- #define P7_DSISR_MC_SLB_MULTIHIT		(PPC_BIT(56))	/* P8 too */
- #define P7_DSISR_MC_SLB_MULTIHIT_PARITY		(PPC_BIT(57))	/* P8 too */
-
- /*
-  * DSISR bits for machine check (Power8) in addition to above.
-  * Secondary DERAT Multihit
-  */
- #define P8_DSISR_MC_ERAT_MULTIHIT_SEC		(PPC_BIT(54))
-
- /* SLB error bits */
- #define P7_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_ERAT_MULTIHIT | \
- 					 P7_DSISR_MC_SLB_PARITY_MFSLB | \
- 					 P7_DSISR_MC_SLB_MULTIHIT | \
- 					 P7_DSISR_MC_SLB_MULTIHIT_PARITY)
-
- #define P8_DSISR_MC_SLB_ERRORS		(P7_DSISR_MC_SLB_ERRORS | \
- 					 P8_DSISR_MC_ERAT_MULTIHIT_SEC)
-
- /*
-  * Machine Check bits on power9
-  */
- #define P9_SRR1_MC_LOADSTORE(srr1)	(((srr1) >> PPC_BITLSHIFT(42)) & 1)
-
- #define P9_SRR1_MC_IFETCH(srr1)	(	\
- 	PPC_BITEXTRACT(srr1, 45, 0) |	\
- 	PPC_BITEXTRACT(srr1, 44, 1) |	\
- 	PPC_BITEXTRACT(srr1, 43, 2) |	\
- 	PPC_BITEXTRACT(srr1, 36, 3) )
-
- /* 0 is reserved */
- #define P9_SRR1_MC_IFETCH_UE				1
- #define P9_SRR1_MC_IFETCH_SLB_PARITY			2
- #define P9_SRR1_MC_IFETCH_SLB_MULTIHIT			3
- #define P9_SRR1_MC_IFETCH_ERAT_MULTIHIT			4
- #define P9_SRR1_MC_IFETCH_TLB_MULTIHIT			5
- #define P9_SRR1_MC_IFETCH_UE_TLB_RELOAD			6
- /* 7 is reserved */
- #define P9_SRR1_MC_IFETCH_LINK_TIMEOUT			8
- #define P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT	9
- /* 10 ? */
- #define P9_SRR1_MC_IFETCH_RA				11
- #define P9_SRR1_MC_IFETCH_RA_TABLEWALK			12
- #define P9_SRR1_MC_IFETCH_RA_ASYNC_STORE		13
- #define P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT	14
- #define P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN		15
-
- /* DSISR bits for machine check (On Power9) */
- #define P9_DSISR_MC_UE				(PPC_BIT(48))
- #define P9_DSISR_MC_UE_TABLEWALK		(PPC_BIT(49))
- #define P9_DSISR_MC_LINK_LOAD_TIMEOUT		(PPC_BIT(50))
- #define P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT	(PPC_BIT(51))
- #define P9_DSISR_MC_ERAT_MULTIHIT		(PPC_BIT(52))
- #define P9_DSISR_MC_TLB_MULTIHIT_MFTLB		(PPC_BIT(53))
- #define P9_DSISR_MC_USER_TLBIE			(PPC_BIT(54))
- #define P9_DSISR_MC_SLB_PARITY_MFSLB		(PPC_BIT(55))
- #define P9_DSISR_MC_SLB_MULTIHIT_MFSLB		(PPC_BIT(56))
- #define P9_DSISR_MC_RA_LOAD			(PPC_BIT(57))
- #define P9_DSISR_MC_RA_TABLEWALK		(PPC_BIT(58))
- #define P9_DSISR_MC_RA_TABLEWALK_FOREIGN	(PPC_BIT(59))
- #define P9_DSISR_MC_RA_FOREIGN			(PPC_BIT(60))
-
- /* SLB error bits */
- #define P9_DSISR_MC_SLB_ERRORS		(P9_DSISR_MC_ERAT_MULTIHIT | \
- 					 P9_DSISR_MC_SLB_PARITY_MFSLB | \
- 					 P9_DSISR_MC_SLB_MULTIHIT_MFSLB)
-
  enum MCE_Version {
  	MCE_V1 = 1,
  };
···
  extern int get_mce_event(struct machine_check_event *mce, bool release);
  extern void release_mce_event(void);
  extern void machine_check_queue_event(void);
- extern void machine_check_print_event_info(struct machine_check_event *evt);
+ extern void machine_check_print_event_info(struct machine_check_event *evt,
+ 					   bool user_mode);
  extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);

  #endif /* __ASM_PPC64_MCE_H__ */
-5
arch/powerpc/include/asm/mmu-book3e.h
··· 229 229 unsigned int id; 230 230 unsigned int active; 231 231 unsigned long vdso_base; 232 - #ifdef CONFIG_PPC_MM_SLICES 233 - u64 low_slices_psize; /* SLB page size encodings */ 234 - u64 high_slices_psize; /* 4 bits per slice for now */ 235 - u16 user_psize; /* page size index */ 236 - #endif 237 232 #ifdef CONFIG_PPC_64K_PAGES 238 233 /* for 4K PTE fragment support */ 239 234 void *pte_frag;
+13 -6
arch/powerpc/include/asm/mmu.h
··· 29 29 */ 30 30 31 31 /* 32 + * Support for 68 bit VA space. We added that from ISA 2.05 33 + */ 34 + #define MMU_FTR_68_BIT_VA ASM_CONST(0x00002000) 35 + /* 32 36 * Kernel read only support. 33 37 * We added the ppp value 0b110 in ISA 2.04. 34 38 */ ··· 113 109 #define MMU_FTRS_POWER4 MMU_FTRS_DEFAULT_HPTE_ARCH_V2 114 110 #define MMU_FTRS_PPC970 MMU_FTRS_POWER4 | MMU_FTR_TLBIE_CROP_VA 115 111 #define MMU_FTRS_POWER5 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE 116 - #define MMU_FTRS_POWER6 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO 117 - #define MMU_FTRS_POWER7 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO 118 - #define MMU_FTRS_POWER8 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO 119 - #define MMU_FTRS_POWER9 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO 112 + #define MMU_FTRS_POWER6 MMU_FTRS_POWER5 | MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA 113 + #define MMU_FTRS_POWER7 MMU_FTRS_POWER6 114 + #define MMU_FTRS_POWER8 MMU_FTRS_POWER6 115 + #define MMU_FTRS_POWER9 MMU_FTRS_POWER6 120 116 #define MMU_FTRS_CELL MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \ 121 117 MMU_FTR_CI_LARGE_PAGE 122 118 #define MMU_FTRS_PA6T MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \ ··· 140 136 MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL | 141 137 MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | 142 138 MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA | 143 - MMU_FTR_KERNEL_RO | 139 + MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA | 144 140 #ifdef CONFIG_PPC_RADIX_MMU 145 141 MMU_FTR_TYPE_RADIX | 146 142 #endif ··· 294 290 #define MMU_PAGE_16G 14 295 291 #define MMU_PAGE_64G 15 296 292 297 - /* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */ 293 + /* 294 + * N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 295 + * Also we need to change the type of mm_context.low/high_slices_psize. 296 + */ 298 297 #define MMU_PAGE_COUNT 16 299 298 300 299 #ifdef CONFIG_PPC_BOOK3S_64
+21 -3
arch/powerpc/include/asm/mmu_context.h
··· 29 29 extern void mm_iommu_cleanup(struct mm_struct *mm); 30 30 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, 31 31 unsigned long ua, unsigned long size); 32 + extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm( 33 + struct mm_struct *mm, unsigned long ua, unsigned long size); 32 34 extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, 33 35 unsigned long ua, unsigned long entries); 34 36 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, 37 + unsigned long ua, unsigned long *hpa); 38 + extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, 35 39 unsigned long ua, unsigned long *hpa); 36 40 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); 37 41 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); ··· 55 51 return switch_slb(tsk, next); 56 52 } 57 53 58 - extern int __init_new_context(void); 54 + extern int hash__alloc_context_id(void); 55 + extern void hash__reserve_context_id(int id); 59 56 extern void __destroy_context(int context_id); 60 57 static inline void mmu_context_init(void) { } 61 58 #else ··· 75 70 * switch_mm is the entry point called from the architecture independent 76 71 * code in kernel/sched/core.c 77 72 */ 78 - static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 79 - struct task_struct *tsk) 73 + static inline void switch_mm_irqs_off(struct mm_struct *prev, 74 + struct mm_struct *next, 75 + struct task_struct *tsk) 80 76 { 81 77 /* Mark this context has been used on the new CPU */ 82 78 if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) ··· 115 109 */ 116 110 switch_mmu_context(prev, next, tsk); 117 111 } 112 + 113 + static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, 114 + struct task_struct *tsk) 115 + { 116 + unsigned long flags; 117 + 118 + local_irq_save(flags); 119 + switch_mm_irqs_off(prev, next, tsk); 120 + 
local_irq_restore(flags); 121 + } 122 + #define switch_mm_irqs_off switch_mm_irqs_off 123 + 118 124 119 125 #define deactivate_mm(tsk,mm) do { } while (0) 120 126
-5
arch/powerpc/include/asm/nohash/64/pgtable.h
··· 88 88 #include <asm/nohash/pte-book3e.h> 89 89 #include <asm/pte-common.h> 90 90 91 - #ifdef CONFIG_PPC_MM_SLICES 92 - #define HAVE_ARCH_UNMAPPED_AREA 93 - #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 94 - #endif /* CONFIG_PPC_MM_SLICES */ 95 - 96 91 #ifndef __ASSEMBLY__ 97 92 /* pte_clear moved to later in this file */ 98 93
+76 -1
arch/powerpc/include/asm/opal-api.h
··· 40 40 #define OPAL_I2C_ARBT_LOST -22 41 41 #define OPAL_I2C_NACK_RCVD -23 42 42 #define OPAL_I2C_STOP_ERR -24 43 + #define OPAL_XIVE_PROVISIONING -31 44 + #define OPAL_XIVE_FREE_ACTIVE -32 43 45 44 46 /* API Tokens (in r0) */ 45 47 #define OPAL_INVALID_CALL -1 ··· 170 168 #define OPAL_INT_SET_MFRR 125 171 169 #define OPAL_PCI_TCE_KILL 126 172 170 #define OPAL_NMMU_SET_PTCR 127 173 - #define OPAL_LAST 127 171 + #define OPAL_XIVE_RESET 128 172 + #define OPAL_XIVE_GET_IRQ_INFO 129 173 + #define OPAL_XIVE_GET_IRQ_CONFIG 130 174 + #define OPAL_XIVE_SET_IRQ_CONFIG 131 175 + #define OPAL_XIVE_GET_QUEUE_INFO 132 176 + #define OPAL_XIVE_SET_QUEUE_INFO 133 177 + #define OPAL_XIVE_DONATE_PAGE 134 178 + #define OPAL_XIVE_ALLOCATE_VP_BLOCK 135 179 + #define OPAL_XIVE_FREE_VP_BLOCK 136 180 + #define OPAL_XIVE_GET_VP_INFO 137 181 + #define OPAL_XIVE_SET_VP_INFO 138 182 + #define OPAL_XIVE_ALLOCATE_IRQ 139 183 + #define OPAL_XIVE_FREE_IRQ 140 184 + #define OPAL_XIVE_SYNC 141 185 + #define OPAL_XIVE_DUMP 142 186 + #define OPAL_XIVE_RESERVED3 143 187 + #define OPAL_XIVE_RESERVED4 144 188 + #define OPAL_NPU_INIT_CONTEXT 146 189 + #define OPAL_NPU_DESTROY_CONTEXT 147 190 + #define OPAL_NPU_MAP_LPAR 148 191 + #define OPAL_LAST 148 174 192 175 193 /* Device tree flags */ 176 194 ··· 948 926 OPAL_PCI_TCE_KILL_PAGES, 949 927 OPAL_PCI_TCE_KILL_PE, 950 928 OPAL_PCI_TCE_KILL_ALL, 929 + }; 930 + 931 + /* The xive operation mode indicates the active "API" and 932 + * corresponds to the "mode" parameter of the opal_xive_reset() 933 + * call 934 + */ 935 + enum { 936 + OPAL_XIVE_MODE_EMU = 0, 937 + OPAL_XIVE_MODE_EXPL = 1, 938 + }; 939 + 940 + /* Flags for OPAL_XIVE_GET_IRQ_INFO */ 941 + enum { 942 + OPAL_XIVE_IRQ_TRIGGER_PAGE = 0x00000001, 943 + OPAL_XIVE_IRQ_STORE_EOI = 0x00000002, 944 + OPAL_XIVE_IRQ_LSI = 0x00000004, 945 + OPAL_XIVE_IRQ_SHIFT_BUG = 0x00000008, 946 + OPAL_XIVE_IRQ_MASK_VIA_FW = 0x00000010, 947 + OPAL_XIVE_IRQ_EOI_VIA_FW = 0x00000020, 948 + }; 949 + 950 + /* Flags for 
OPAL_XIVE_GET/SET_QUEUE_INFO */ 951 + enum { 952 + OPAL_XIVE_EQ_ENABLED = 0x00000001, 953 + OPAL_XIVE_EQ_ALWAYS_NOTIFY = 0x00000002, 954 + OPAL_XIVE_EQ_ESCALATE = 0x00000004, 955 + }; 956 + 957 + /* Flags for OPAL_XIVE_GET/SET_VP_INFO */ 958 + enum { 959 + OPAL_XIVE_VP_ENABLED = 0x00000001, 960 + }; 961 + 962 + /* "Any chip" replacement for chip ID for allocation functions */ 963 + enum { 964 + OPAL_XIVE_ANY_CHIP = 0xffffffff, 965 + }; 966 + 967 + /* Xive sync options */ 968 + enum { 969 + /* These bits are cumulative, arg is a girq */ 970 + XIVE_SYNC_EAS = 0x00000001, /* Sync irq source */ 971 + XIVE_SYNC_QUEUE = 0x00000002, /* Sync irq target */ 972 + }; 973 + 974 + /* Dump options */ 975 + enum { 976 + XIVE_DUMP_TM_HYP = 0, 977 + XIVE_DUMP_TM_POOL = 1, 978 + XIVE_DUMP_TM_OS = 2, 979 + XIVE_DUMP_TM_USER = 3, 980 + XIVE_DUMP_VP = 4, 981 + XIVE_DUMP_EMU_STATE = 5, 951 982 };
+41
arch/powerpc/include/asm/opal.h
··· 29 29 30 30 /* API functions */ 31 31 int64_t opal_invalid_call(void); 32 + int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid, uint64_t bdf); 33 + int64_t opal_npu_init_context(uint64_t phb_id, int pasid, uint64_t msr, 34 + uint64_t bdf); 35 + int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid, 36 + uint64_t lpcr); 32 37 int64_t opal_console_write(int64_t term_number, __be64 *length, 33 38 const uint8_t *buffer); 34 39 int64_t opal_console_read(int64_t term_number, __be64 *length, ··· 231 226 uint32_t pe_num, uint32_t tce_size, 232 227 uint64_t dma_addr, uint32_t npages); 233 228 int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr); 229 + int64_t opal_xive_reset(uint64_t version); 230 + int64_t opal_xive_get_irq_info(uint32_t girq, 231 + __be64 *out_flags, 232 + __be64 *out_eoi_page, 233 + __be64 *out_trig_page, 234 + __be32 *out_esb_shift, 235 + __be32 *out_src_chip); 236 + int64_t opal_xive_get_irq_config(uint32_t girq, __be64 *out_vp, 237 + uint8_t *out_prio, __be32 *out_lirq); 238 + int64_t opal_xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio, 239 + uint32_t lirq); 240 + int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio, 241 + __be64 *out_qpage, 242 + __be64 *out_qsize, 243 + __be64 *out_qeoi_page, 244 + __be32 *out_escalate_irq, 245 + __be64 *out_qflags); 246 + int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio, 247 + uint64_t qpage, 248 + uint64_t qsize, 249 + uint64_t qflags); 250 + int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr); 251 + int64_t opal_xive_alloc_vp_block(uint32_t alloc_order); 252 + int64_t opal_xive_free_vp_block(uint64_t vp); 253 + int64_t opal_xive_get_vp_info(uint64_t vp, 254 + __be64 *out_flags, 255 + __be64 *out_cam_value, 256 + __be64 *out_report_cl_pair, 257 + __be32 *out_chip_id); 258 + int64_t opal_xive_set_vp_info(uint64_t vp, 259 + uint64_t flags, 260 + uint64_t report_cl_pair); 261 + int64_t opal_xive_allocate_irq(uint32_t chip_id); 262 + 
int64_t opal_xive_free_irq(uint32_t girq); 263 + int64_t opal_xive_sync(uint32_t type, uint32_t id); 264 + int64_t opal_xive_dump(uint32_t type, uint32_t id); 234 265 235 266 /* Internal functions */ 236 267 extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
+18 -20
arch/powerpc/include/asm/paca.h
··· 99 99 */ 100 100 /* used for most interrupts/exceptions */ 101 101 u64 exgen[13] __attribute__((aligned(0x80))); 102 - u64 exmc[13]; /* used for machine checks */ 103 102 u64 exslb[13]; /* used for SLB/segment table misses 104 103 * on the linear mapping */ 105 104 /* SLB related definitions */ ··· 138 139 #ifdef CONFIG_PPC_MM_SLICES 139 140 u64 mm_ctx_low_slices_psize; 140 141 unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE]; 142 + unsigned long addr_limit; 141 143 #else 142 144 u16 mm_ctx_user_psize; 143 145 u16 mm_ctx_sllp; ··· 172 172 u8 thread_mask; 173 173 /* Mask to denote subcore sibling threads */ 174 174 u8 subcore_sibling_mask; 175 + /* 176 + * Pointer to an array which contains pointer 177 + * to the sibling threads' paca. 178 + */ 179 + struct paca_struct **thread_sibling_pacas; 175 180 #endif 176 181 182 + #ifdef CONFIG_PPC_STD_MMU_64 183 + /* Non-maskable exceptions that are not performance critical */ 184 + u64 exnmi[13]; /* used for system reset (nmi) */ 185 + u64 exmc[13]; /* used for machine checks */ 186 + #endif 177 187 #ifdef CONFIG_PPC_BOOK3S_64 178 - /* Exclusive emergency stack pointer for machine check exception. */ 188 + /* Exclusive stacks for system reset and machine check exception. */ 189 + void *nmi_emergency_sp; 179 190 void *mc_emergency_sp; 191 + 192 + u16 in_nmi; /* In nmi handler */ 193 + 180 194 /* 181 195 * Flag to check whether we are in machine check early handler 182 196 * and already using emergency stack. 
183 197 */ 184 198 u16 in_mce; 185 - u8 hmi_event_available; /* HMI event is available */ 199 + u8 hmi_event_available; /* HMI event is available */ 186 200 #endif 187 201 188 202 /* Stuff for accurate time accounting */ ··· 220 206 #endif 221 207 }; 222 208 223 - #ifdef CONFIG_PPC_BOOK3S 224 - static inline void copy_mm_to_paca(mm_context_t *context) 225 - { 226 - get_paca()->mm_ctx_id = context->id; 227 - #ifdef CONFIG_PPC_MM_SLICES 228 - get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; 229 - memcpy(&get_paca()->mm_ctx_high_slices_psize, 230 - &context->high_slices_psize, SLICE_ARRAY_SIZE); 231 - #else 232 - get_paca()->mm_ctx_user_psize = context->user_psize; 233 - get_paca()->mm_ctx_sllp = context->sllp; 234 - #endif 235 - } 236 - #else 237 - static inline void copy_mm_to_paca(mm_context_t *context){} 238 - #endif 239 - 209 + extern void copy_mm_to_paca(struct mm_struct *mm); 240 210 extern struct paca_struct *paca; 241 211 extern void initialise_paca(struct paca_struct *new_paca, int cpu); 242 212 extern void setup_paca(struct paca_struct *new_paca);
-14
arch/powerpc/include/asm/page_64.h
··· 98 98 #define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT) 99 99 #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) 100 100 101 - /* 102 - * 1 bit per slice and we have one slice per 1TB 103 - * Right now we support only 64TB. 104 - * IF we change this we will have to change the type 105 - * of high_slices 106 - */ 107 - #define SLICE_MASK_SIZE 8 108 - 109 101 #ifndef __ASSEMBLY__ 110 - 111 - struct slice_mask { 112 - u16 low_slices; 113 - u64 high_slices; 114 - }; 115 - 116 102 struct mm_struct; 117 103 118 104 extern unsigned long slice_get_unmapped_area(unsigned long addr,
+3
arch/powerpc/include/asm/perf_event_server.h
··· 38 38 unsigned long *valp); 39 39 int (*get_alternatives)(u64 event_id, unsigned int flags, 40 40 u64 alt[]); 41 + void (*get_mem_data_src)(union perf_mem_data_src *dsrc, 42 + u32 flags, struct pt_regs *regs); 43 + void (*get_mem_weight)(u64 *weight); 41 44 u64 (*bhrb_filter_map)(u64 branch_sample_type); 42 45 void (*config_bhrb)(u64 pmu_bhrb_filter); 43 46 void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
+22
arch/powerpc/include/asm/powernv.h
··· 11 11 #define _ASM_POWERNV_H 12 12 13 13 #ifdef CONFIG_PPC_POWERNV 14 + #define NPU2_WRITE 1 14 15 extern void powernv_set_nmmu_ptcr(unsigned long ptcr); 16 + extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, 17 + unsigned long flags, 18 + struct npu_context *(*cb)(struct npu_context *, void *), 19 + void *priv); 20 + extern void pnv_npu2_destroy_context(struct npu_context *context, 21 + struct pci_dev *gpdev); 22 + extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, 23 + unsigned long *flags, unsigned long *status, 24 + int count); 15 25 #else 16 26 static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } 27 + static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, 28 + unsigned long flags, 29 + struct npu_context *(*cb)(struct npu_context *, void *), 30 + void *priv) { return ERR_PTR(-ENODEV); } 31 + static inline void pnv_npu2_destroy_context(struct npu_context *context, 32 + struct pci_dev *gpdev) { } 33 + 34 + static inline int pnv_npu2_handle_fault(struct npu_context *context, 35 + uintptr_t *ea, unsigned long *flags, 36 + unsigned long *status, int count) { 37 + return -ENODEV; 38 + } 17 39 #endif 18 40 19 41 #endif /* _ASM_POWERNV_H */
+2
arch/powerpc/include/asm/ppc-opcode.h
··· 161 161 #define PPC_INST_MFTMR 0x7c0002dc 162 162 #define PPC_INST_MSGSND 0x7c00019c 163 163 #define PPC_INST_MSGCLR 0x7c0001dc 164 + #define PPC_INST_MSGSYNC 0x7c0006ec 164 165 #define PPC_INST_MSGSNDP 0x7c00011c 165 166 #define PPC_INST_MTTMR 0x7c0003dc 166 167 #define PPC_INST_NOP 0x60000000 ··· 346 345 ___PPC_RB(b) | __PPC_EH(eh)) 347 346 #define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ 348 347 ___PPC_RB(b)) 348 + #define PPC_MSGSYNC stringify_in_c(.long PPC_INST_MSGSYNC) 349 349 #define PPC_MSGCLR(b) stringify_in_c(.long PPC_INST_MSGCLR | \ 350 350 ___PPC_RB(b)) 351 351 #define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \
+33 -8
arch/powerpc/include/asm/processor.h
··· 102 102 #endif 103 103 104 104 #ifdef CONFIG_PPC64 105 - /* 64-bit user address space is 46-bits (64TB user VM) */ 106 - #define TASK_SIZE_USER64 (0x0000400000000000UL) 105 + /* 106 + * 64-bit user address space can have multiple limits 107 + * For now supported values are: 108 + */ 109 + #define TASK_SIZE_64TB (0x0000400000000000UL) 110 + #define TASK_SIZE_128TB (0x0000800000000000UL) 111 + #define TASK_SIZE_512TB (0x0002000000000000UL) 107 112 108 - /* 109 - * 32-bit user address space is 4GB - 1 page 113 + #ifdef CONFIG_PPC_BOOK3S_64 114 + /* 115 + * Max value currently used: 116 + */ 117 + #define TASK_SIZE_USER64 TASK_SIZE_512TB 118 + #else 119 + #define TASK_SIZE_USER64 TASK_SIZE_64TB 120 + #endif 121 + 122 + /* 123 + * 32-bit user address space is 4GB - 1 page 110 124 * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT 111 125 */ 112 126 #define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE)) ··· 128 114 #define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ 129 115 TASK_SIZE_USER32 : TASK_SIZE_USER64) 130 116 #define TASK_SIZE TASK_SIZE_OF(current) 131 - 132 117 /* This decides where the kernel will search for a free chunk of vm 133 118 * space during mmap's. 134 119 */ 135 120 #define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) 136 - #define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_USER64 / 4)) 121 + #define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_128TB / 4)) 137 122 138 123 #define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \ 139 124 TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 ) 140 125 #endif 141 126 127 + /* 128 + * Initial task size value for user applications. For book3s 64 we start 129 + * with 128TB and conditionally enable up to 512TB 130 + */ 131 + #ifdef CONFIG_PPC_BOOK3S_64 132 + #define DEFAULT_MAP_WINDOW ((is_32bit_task()) ? 
\ 133 + TASK_SIZE_USER32 : TASK_SIZE_128TB) 134 + #else 135 + #define DEFAULT_MAP_WINDOW TASK_SIZE 136 + #endif 137 + 142 138 #ifdef __powerpc64__ 143 139 144 - #define STACK_TOP_USER64 TASK_SIZE_USER64 140 + /* Limit stack to 128TB */ 141 + #define STACK_TOP_USER64 TASK_SIZE_128TB 145 142 #define STACK_TOP_USER32 TASK_SIZE_USER32 146 143 147 144 #define STACK_TOP (is_32bit_task() ? \ 148 145 STACK_TOP_USER32 : STACK_TOP_USER64) 149 146 150 - #define STACK_TOP_MAX STACK_TOP_USER64 147 + #define STACK_TOP_MAX TASK_SIZE_USER64 151 148 152 149 #else /* __powerpc64__ */ 153 150
+4
arch/powerpc/include/asm/reg.h
··· 310 310 #define SPRN_PMCR 0x374 /* Power Management Control Register */ 311 311 312 312 /* HFSCR and FSCR bit numbers are the same */ 313 + #define FSCR_SCV_LG 12 /* Enable System Call Vectored */ 313 314 #define FSCR_MSGP_LG 10 /* Enable MSGP */ 314 315 #define FSCR_TAR_LG 8 /* Enable Target Address Register */ 315 316 #define FSCR_EBB_LG 7 /* Enable Event Based Branching */ ··· 321 320 #define FSCR_VECVSX_LG 1 /* Enable VMX/VSX */ 322 321 #define FSCR_FP_LG 0 /* Enable Floating Point */ 323 322 #define SPRN_FSCR 0x099 /* Facility Status & Control Register */ 323 + #define FSCR_SCV __MASK(FSCR_SCV_LG) 324 324 #define FSCR_TAR __MASK(FSCR_TAR_LG) 325 325 #define FSCR_EBB __MASK(FSCR_EBB_LG) 326 326 #define FSCR_DSCR __MASK(FSCR_DSCR_LG) ··· 367 365 #define LPCR_MER_SH 11 368 366 #define LPCR_GTSE ASM_CONST(0x0000000000000400) /* Guest Translation Shootdown Enable */ 369 367 #define LPCR_TC ASM_CONST(0x0000000000000200) /* Translation control */ 368 + #define LPCR_HEIC ASM_CONST(0x0000000000000010) /* Hypervisor External Interrupt Control */ 370 369 #define LPCR_LPES 0x0000000c 371 370 #define LPCR_LPES0 ASM_CONST(0x0000000000000008) /* LPAR Env selector 0 */ 372 371 #define LPCR_LPES1 ASM_CONST(0x0000000000000004) /* LPAR Env selector 1 */ ··· 659 656 #define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */ 660 657 #define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */ 661 658 #define SRR1_WAKEMASK_P8 0x003c0000 /* reason for wakeup on POWER8 and 9 */ 659 + #define SRR1_WAKEMCE_RESVD 0x003c0000 /* Unused/reserved value used by MCE wakeup to indicate cause to idle wakeup handler */ 662 660 #define SRR1_WAKESYSERR 0x00300000 /* System error */ 663 661 #define SRR1_WAKEEE 0x00200000 /* External interrupt */ 664 662 #define SRR1_WAKEHVI 0x00240000 /* Hypervisor Virtualization Interrupt (P9) */
+2
arch/powerpc/include/asm/sections.h
··· 6 6 #include <linux/uaccess.h> 7 7 #include <asm-generic/sections.h> 8 8 9 + extern char __head_end[]; 10 + 9 11 #ifdef __powerpc64__ 10 12 11 13 extern char __start_interrupts[];
+15 -6
arch/powerpc/include/asm/smp.h
··· 40 40 struct smp_ops_t { 41 41 void (*message_pass)(int cpu, int msg); 42 42 #ifdef CONFIG_PPC_SMP_MUXED_IPI 43 - void (*cause_ipi)(int cpu, unsigned long data); 43 + void (*cause_ipi)(int cpu); 44 44 #endif 45 + int (*cause_nmi_ipi)(int cpu); 45 46 void (*probe)(void); 46 47 int (*kick_cpu)(int nr); 48 + int (*prepare_cpu)(int nr); 47 49 void (*setup_cpu)(int nr); 48 50 void (*bringup_done)(void); 49 51 void (*take_timebase)(void); ··· 63 61 DECLARE_PER_CPU(unsigned int, cpu_pvr); 64 62 65 63 #ifdef CONFIG_HOTPLUG_CPU 66 - extern void migrate_irqs(void); 67 64 int generic_cpu_disable(void); 68 65 void generic_cpu_die(unsigned int cpu); 69 66 void generic_set_cpu_dead(unsigned int cpu); ··· 113 112 * 114 113 * Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up 115 114 * in /proc/interrupts will be wrong!!! --Troy */ 116 - #define PPC_MSG_CALL_FUNCTION 0 117 - #define PPC_MSG_RESCHEDULE 1 115 + #define PPC_MSG_CALL_FUNCTION 0 116 + #define PPC_MSG_RESCHEDULE 1 118 117 #define PPC_MSG_TICK_BROADCAST 2 119 - #define PPC_MSG_DEBUGGER_BREAK 3 118 + #define PPC_MSG_NMI_IPI 3 120 119 121 120 /* This is only used by the powernv kernel */ 122 121 #define PPC_MSG_RM_HOST_ACTION 4 122 + 123 + #define NMI_IPI_ALL_OTHERS -2 124 + 125 + #ifdef CONFIG_NMI_IPI 126 + extern int smp_handle_nmi_ipi(struct pt_regs *regs); 127 + #else 128 + static inline int smp_handle_nmi_ipi(struct pt_regs *regs) { return 0; } 129 + #endif 123 130 124 131 /* for irq controllers that have dedicated ipis per message (4) */ 125 132 extern int smp_request_message_ipi(int virq, int message); 126 133 extern const char *smp_ipi_name[]; 127 134 128 135 /* for irq controllers with only a single ipi */ 129 - extern void smp_muxed_ipi_set_data(int cpu, unsigned long data); 130 136 extern void smp_muxed_ipi_message_pass(int cpu, int msg); 131 137 extern void smp_muxed_ipi_set_message(int cpu, int msg); 132 138 extern irqreturn_t smp_ipi_demux(void); 139 + extern irqreturn_t 
smp_ipi_demux_relaxed(void); 133 140 134 141 void smp_init_pSeries(void); 135 142 void smp_init_cell(void);
+2 -2
arch/powerpc/include/asm/syscalls.h
··· 8 8 9 9 struct rtas_args; 10 10 11 - asmlinkage unsigned long sys_mmap(unsigned long addr, size_t len, 11 + asmlinkage long sys_mmap(unsigned long addr, size_t len, 12 12 unsigned long prot, unsigned long flags, 13 13 unsigned long fd, off_t offset); 14 - asmlinkage unsigned long sys_mmap2(unsigned long addr, size_t len, 14 + asmlinkage long sys_mmap2(unsigned long addr, size_t len, 15 15 unsigned long prot, unsigned long flags, 16 16 unsigned long fd, unsigned long pgoff); 17 17 asmlinkage long ppc64_personality(unsigned long personality);
+1 -9
arch/powerpc/include/asm/thread_info.h
··· 10 10 11 11 #ifdef __KERNEL__ 12 12 13 - /* We have 8k stacks on ppc32 and 16k on ppc64 */ 14 - 15 - #if defined(CONFIG_PPC64) 16 - #define THREAD_SHIFT 14 17 - #elif defined(CONFIG_PPC_256K_PAGES) 18 - #define THREAD_SHIFT 15 19 - #else 20 - #define THREAD_SHIFT 13 21 - #endif 13 + #define THREAD_SHIFT CONFIG_THREAD_SHIFT 22 14 23 15 #define THREAD_SIZE (1 << THREAD_SHIFT) 24 16
+1 -1
arch/powerpc/include/asm/xics.h
··· 57 57 void (*teardown_cpu)(void); 58 58 void (*flush_ipi)(void); 59 59 #ifdef CONFIG_SMP 60 - void (*cause_ipi)(int cpu, unsigned long data); 60 + void (*cause_ipi)(int cpu); 61 61 irq_handler_t ipi_action; 62 62 #endif 63 63 };
+97
arch/powerpc/include/asm/xive-regs.h
··· 1 + /* 2 + * Copyright 2016,2017 IBM Corporation. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; either version 7 + * 2 of the License, or (at your option) any later version. 8 + */ 9 + #ifndef _ASM_POWERPC_XIVE_REGS_H 10 + #define _ASM_POWERPC_XIVE_REGS_H 11 + 12 + /* 13 + * Thread Management (aka "TM") registers 14 + */ 15 + 16 + /* TM register offsets */ 17 + #define TM_QW0_USER 0x000 /* All rings */ 18 + #define TM_QW1_OS 0x010 /* Ring 0..2 */ 19 + #define TM_QW2_HV_POOL 0x020 /* Ring 0..1 */ 20 + #define TM_QW3_HV_PHYS 0x030 /* Ring 0..1 */ 21 + 22 + /* Byte offsets inside a QW QW0 QW1 QW2 QW3 */ 23 + #define TM_NSR 0x0 /* + + - + */ 24 + #define TM_CPPR 0x1 /* - + - + */ 25 + #define TM_IPB 0x2 /* - + + + */ 26 + #define TM_LSMFB 0x3 /* - + + + */ 27 + #define TM_ACK_CNT 0x4 /* - + - - */ 28 + #define TM_INC 0x5 /* - + - + */ 29 + #define TM_AGE 0x6 /* - + - + */ 30 + #define TM_PIPR 0x7 /* - + - + */ 31 + 32 + #define TM_WORD0 0x0 33 + #define TM_WORD1 0x4 34 + 35 + /* 36 + * QW word 2 contains the valid bit at the top and other fields 37 + * depending on the QW. 38 + */ 39 + #define TM_WORD2 0x8 40 + #define TM_QW0W2_VU PPC_BIT32(0) 41 + #define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ? 
42 + #define TM_QW1W2_VO PPC_BIT32(0) 43 + #define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31) 44 + #define TM_QW2W2_VP PPC_BIT32(0) 45 + #define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31) 46 + #define TM_QW3W2_VT PPC_BIT32(0) 47 + #define TM_QW3W2_LP PPC_BIT32(6) 48 + #define TM_QW3W2_LE PPC_BIT32(7) 49 + #define TM_QW3W2_T PPC_BIT32(31) 50 + 51 + /* 52 + * In addition to normal loads to "peek" and writes (only when invalid) 53 + * using 4 and 8 bytes accesses, the above registers support these 54 + * "special" byte operations: 55 + * 56 + * - Byte load from QW0[NSR] - User level NSR (EBB) 57 + * - Byte store to QW0[NSR] - User level NSR (EBB) 58 + * - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access 59 + * - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0 60 + * otherwise VT||0000000 61 + * - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present) 62 + * 63 + * Then we have all these "special" CI ops at these offset that trigger 64 + * all sorts of side effects: 65 + */ 66 + #define TM_SPC_ACK_EBB 0x800 /* Load8 ack EBB to reg*/ 67 + #define TM_SPC_ACK_OS_REG 0x810 /* Load16 ack OS irq to reg */ 68 + #define TM_SPC_PUSH_USR_CTX 0x808 /* Store32 Push/Validate user context */ 69 + #define TM_SPC_PULL_USR_CTX 0x808 /* Load32 Pull/Invalidate user context */ 70 + #define TM_SPC_SET_OS_PENDING 0x812 /* Store8 Set OS irq pending bit */ 71 + #define TM_SPC_PULL_OS_CTX 0x818 /* Load32/Load64 Pull/Invalidate OS context to reg */ 72 + #define TM_SPC_PULL_POOL_CTX 0x828 /* Load32/Load64 Pull/Invalidate Pool context to reg*/ 73 + #define TM_SPC_ACK_HV_REG 0x830 /* Load16 ack HV irq to reg */ 74 + #define TM_SPC_PULL_USR_CTX_OL 0xc08 /* Store8 Pull/Inval usr ctx to odd line */ 75 + #define TM_SPC_ACK_OS_EL 0xc10 /* Store8 ack OS irq to even line */ 76 + #define TM_SPC_ACK_HV_POOL_EL 0xc20 /* Store8 ack HV evt pool to even line */ 77 + #define TM_SPC_ACK_HV_EL 0xc30 /* Store8 ack HV irq to even line */ 78 + /* XXX more... 
*/ 79 + 80 + /* NSR fields for the various QW ack types */ 81 + #define TM_QW0_NSR_EB PPC_BIT8(0) 82 + #define TM_QW1_NSR_EO PPC_BIT8(0) 83 + #define TM_QW3_NSR_HE PPC_BITMASK8(0,1) 84 + #define TM_QW3_NSR_HE_NONE 0 85 + #define TM_QW3_NSR_HE_POOL 1 86 + #define TM_QW3_NSR_HE_PHYS 2 87 + #define TM_QW3_NSR_HE_LSI 3 88 + #define TM_QW3_NSR_I PPC_BIT8(2) 89 + #define TM_QW3_NSR_GRP_LVL PPC_BIT8(3,7) 90 + 91 + /* Utilities to manipulate these (originally from OPAL) */ 92 + #define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1) 93 + #define GETFIELD(m, v) (((v) & (m)) >> MASK_TO_LSH(m)) 94 + #define SETFIELD(m, v, val) \ 95 + (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m))) 96 + 97 + #endif /* _ASM_POWERPC_XIVE_REGS_H */
+163
arch/powerpc/include/asm/xive.h
··· 1 + /* 2 + * Copyright 2016,2017 IBM Corporation. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; either version 7 + * 2 of the License, or (at your option) any later version. 8 + */ 9 + #ifndef _ASM_POWERPC_XIVE_H 10 + #define _ASM_POWERPC_XIVE_H 11 + 12 + #define XIVE_INVALID_VP 0xffffffff 13 + 14 + #ifdef CONFIG_PPC_XIVE 15 + 16 + /* 17 + * Thread Interrupt Management Area (TIMA) 18 + * 19 + * This is a global MMIO region divided in 4 pages of varying access 20 + * permissions, providing access to per-cpu interrupt management 21 + * functions. It always identifies the CPU doing the access based 22 + * on the PowerBus initiator ID, thus we always access via the 23 + * same offset regardless of where the code is executing 24 + */ 25 + extern void __iomem *xive_tima; 26 + 27 + /* 28 + * Offset in the TM area of our current execution level (provided by 29 + * the backend) 30 + */ 31 + extern u32 xive_tima_offset; 32 + 33 + /* 34 + * Per-irq data (irq_get_handler_data for normal IRQs), IPIs 35 + * have it stored in the xive_cpu structure. We also cache 36 + * for normal interrupts the current target CPU. 37 + * 38 + * This structure is setup by the backend for each interrupt. 
39 + */ 40 + struct xive_irq_data { 41 + u64 flags; 42 + u64 eoi_page; 43 + void __iomem *eoi_mmio; 44 + u64 trig_page; 45 + void __iomem *trig_mmio; 46 + u32 esb_shift; 47 + int src_chip; 48 + 49 + /* Setup/used by frontend */ 50 + int target; 51 + bool saved_p; 52 + }; 53 + #define XIVE_IRQ_FLAG_STORE_EOI 0x01 54 + #define XIVE_IRQ_FLAG_LSI 0x02 55 + #define XIVE_IRQ_FLAG_SHIFT_BUG 0x04 56 + #define XIVE_IRQ_FLAG_MASK_FW 0x08 57 + #define XIVE_IRQ_FLAG_EOI_FW 0x10 58 + 59 + #define XIVE_INVALID_CHIP_ID -1 60 + 61 + /* A queue tracking structure in a CPU */ 62 + struct xive_q { 63 + __be32 *qpage; 64 + u32 msk; 65 + u32 idx; 66 + u32 toggle; 67 + u64 eoi_phys; 68 + u32 esc_irq; 69 + atomic_t count; 70 + atomic_t pending_count; 71 + }; 72 + 73 + /* 74 + * "magic" Event State Buffer (ESB) MMIO offsets. 75 + * 76 + * Each interrupt source has a 2-bit state machine called ESB 77 + * which can be controlled by MMIO. It's made of 2 bits, P and 78 + * Q. P indicates that an interrupt is pending (has been sent 79 + * to a queue and is waiting for an EOI). Q indicates that the 80 + * interrupt has been triggered while pending. 81 + * 82 + * This acts as a coalescing mechanism in order to guarantee 83 + * that a given interrupt only occurs at most once in a queue. 84 + * 85 + * When doing an EOI, the Q bit will indicate if the interrupt 86 + * needs to be re-triggered. 87 + * 88 + * The following offsets into the ESB MMIO allow to read or 89 + * manipulate the PQ bits. They must be used with an 8-bytes 90 + * load instruction. They all return the previous state of the 91 + * interrupt (atomically). 92 + * 93 + * Additionally, some ESB pages support doing an EOI via a 94 + * store at 0 and some ESBs support doing a trigger via a 95 + * separate trigger page. 
96 + */ 97 + #define XIVE_ESB_GET 0x800 98 + #define XIVE_ESB_SET_PQ_00 0xc00 99 + #define XIVE_ESB_SET_PQ_01 0xd00 100 + #define XIVE_ESB_SET_PQ_10 0xe00 101 + #define XIVE_ESB_SET_PQ_11 0xf00 102 + #define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01 103 + 104 + #define XIVE_ESB_VAL_P 0x2 105 + #define XIVE_ESB_VAL_Q 0x1 106 + 107 + /* Global enable flags for the XIVE support */ 108 + extern bool __xive_enabled; 109 + 110 + static inline bool xive_enabled(void) { return __xive_enabled; } 111 + 112 + extern bool xive_native_init(void); 113 + extern void xive_smp_probe(void); 114 + extern int xive_smp_prepare_cpu(unsigned int cpu); 115 + extern void xive_smp_setup_cpu(void); 116 + extern void xive_smp_disable_cpu(void); 117 + extern void xive_kexec_teardown_cpu(int secondary); 118 + extern void xive_shutdown(void); 119 + extern void xive_flush_interrupt(void); 120 + 121 + /* xmon hook */ 122 + extern void xmon_xive_do_dump(int cpu); 123 + 124 + /* APIs used by KVM */ 125 + extern u32 xive_native_default_eq_shift(void); 126 + extern u32 xive_native_alloc_vp_block(u32 max_vcpus); 127 + extern void xive_native_free_vp_block(u32 vp_base); 128 + extern int xive_native_populate_irq_data(u32 hw_irq, 129 + struct xive_irq_data *data); 130 + extern void xive_cleanup_irq_data(struct xive_irq_data *xd); 131 + extern u32 xive_native_alloc_irq(void); 132 + extern void xive_native_free_irq(u32 irq); 133 + extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); 134 + 135 + extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, 136 + __be32 *qpage, u32 order, bool can_escalate); 137 + extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); 138 + 139 + extern bool __xive_irq_trigger(struct xive_irq_data *xd); 140 + extern bool __xive_irq_retrigger(struct xive_irq_data *xd); 141 + extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd); 142 + 143 + extern bool is_xive_irq(struct irq_chip *chip); 144 + 145 + #else 
146 + 147 + static inline bool xive_enabled(void) { return false; } 148 + 149 + static inline bool xive_native_init(void) { return false; } 150 + static inline void xive_smp_probe(void) { } 151 + static inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; } 152 + static inline void xive_smp_setup_cpu(void) { } 153 + static inline void xive_smp_disable_cpu(void) { } 154 + static inline void xive_kexec_teardown_cpu(int secondary) { } 155 + static inline void xive_shutdown(void) { } 156 + static inline void xive_flush_interrupt(void) { } 157 + 158 + static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; } 159 + static inline void xive_native_free_vp_block(u32 vp_base) { } 160 + 161 + #endif 162 + 163 + #endif /* _ASM_POWERPC_XIVE_H */
+2
arch/powerpc/include/asm/xmon.h
··· 29 29 extern int cpus_are_in_xmon(void); 30 30 #endif 31 31 32 + extern void xmon_printf(const char *format, ...); 33 + 32 34 #endif /* __KERNEL __ */ 33 35 #endif /* __ASM_POWERPC_XMON_H */
+16
arch/powerpc/include/uapi/asm/mman.h
··· 29 29 #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ 30 30 #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ 31 31 32 + /* 33 + * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2), 34 + * encode the log2 of the huge page size. A value of zero indicates that the 35 + * default huge page size should be used. To use a non-default huge page size, 36 + * one of these defines can be used, or the size can be encoded by hand. Note 37 + * that on most systems only a subset, or possibly none, of these sizes will be 38 + * available. 39 + */ 40 + #define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT) /* 512KB HugeTLB Page */ 41 + #define MAP_HUGE_1MB (20 << MAP_HUGE_SHIFT) /* 1MB HugeTLB Page */ 42 + #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) /* 2MB HugeTLB Page */ 43 + #define MAP_HUGE_8MB (23 << MAP_HUGE_SHIFT) /* 8MB HugeTLB Page */ 44 + #define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT) /* 16MB HugeTLB Page */ 45 + #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) /* 1GB HugeTLB Page */ 46 + #define MAP_HUGE_16GB (34 << MAP_HUGE_SHIFT) /* 16GB HugeTLB Page */ 47 + 32 48 #endif /* _UAPI_ASM_POWERPC_MMAN_H */
+4 -8
arch/powerpc/kernel/Makefile
··· 25 25 CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) 26 26 CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) 27 27 CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) 28 - # do not trace tracer code 29 - CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) 30 28 # timers used by tracing 31 29 CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) 32 30 endif ··· 95 97 obj-$(CONFIG_SMP) += smp.o 96 98 obj-$(CONFIG_KPROBES) += kprobes.o 97 99 obj-$(CONFIG_OPTPROBES) += optprobes.o optprobes_head.o 100 + obj-$(CONFIG_KPROBES_ON_FTRACE) += kprobes-ftrace.o 98 101 obj-$(CONFIG_UPROBES) += uprobes.o 99 102 obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o 100 103 obj-$(CONFIG_STACKTRACE) += stacktrace.o ··· 117 118 118 119 obj-$(CONFIG_PPC_IO_WORKAROUNDS) += io-workarounds.o 119 120 120 - obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 121 - obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o 122 - obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o 123 - obj-$(CONFIG_TRACING) += trace_clock.o 121 + obj-y += trace/ 124 122 125 123 ifneq ($(CONFIG_PPC_INDIRECT_PIO),y) 126 124 obj-y += iomap.o ··· 138 142 # Disable GCOV & sanitizers in odd or sensitive code 139 143 GCOV_PROFILE_prom_init.o := n 140 144 UBSAN_SANITIZE_prom_init.o := n 141 - GCOV_PROFILE_ftrace.o := n 142 - UBSAN_SANITIZE_ftrace.o := n 143 145 GCOV_PROFILE_machine_kexec_64.o := n 144 146 UBSAN_SANITIZE_machine_kexec_64.o := n 145 147 GCOV_PROFILE_machine_kexec_32.o := n 146 148 UBSAN_SANITIZE_machine_kexec_32.o := n 147 149 GCOV_PROFILE_kprobes.o := n 148 150 UBSAN_SANITIZE_kprobes.o := n 151 + GCOV_PROFILE_kprobes-ftrace.o := n 152 + UBSAN_SANITIZE_kprobes-ftrace.o := n 149 153 UBSAN_SANITIZE_vdso.o := n 150 154 151 155 extra-$(CONFIG_PPC_FPU) += fpu.o
+7 -2
arch/powerpc/kernel/asm-offsets.c
··· 185 185 #ifdef CONFIG_PPC_MM_SLICES 186 186 OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize); 187 187 OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize); 188 + DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit)); 188 189 DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); 189 190 #endif /* CONFIG_PPC_MM_SLICES */ 190 191 #endif ··· 220 219 OFFSET(PACA_EXGEN, paca_struct, exgen); 221 220 OFFSET(PACA_EXMC, paca_struct, exmc); 222 221 OFFSET(PACA_EXSLB, paca_struct, exslb); 222 + OFFSET(PACA_EXNMI, paca_struct, exnmi); 223 223 OFFSET(PACALPPACAPTR, paca_struct, lppaca_ptr); 224 224 OFFSET(PACA_SLBSHADOWPTR, paca_struct, slb_shadow_ptr); 225 225 OFFSET(SLBSHADOW_STACKVSID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].vsid); ··· 234 232 OFFSET(PACAEMERGSP, paca_struct, emergency_sp); 235 233 #ifdef CONFIG_PPC_BOOK3S_64 236 234 OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp); 235 + OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp); 237 236 OFFSET(PACA_IN_MCE, paca_struct, in_mce); 237 + OFFSET(PACA_IN_NMI, paca_struct, in_nmi); 238 238 #endif 239 239 OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id); 240 240 OFFSET(PACAKEXECSTATE, paca_struct, kexec_state); ··· 403 399 DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry)); 404 400 #endif 405 401 406 - #ifdef MAX_PGD_TABLE_SIZE 407 - DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE); 402 + #ifdef CONFIG_PPC_BOOK3S_64 403 + DEFINE(PGD_TABLE_SIZE, (sizeof(pgd_t) << max(RADIX_PGD_INDEX_SIZE, H_PGD_INDEX_SIZE))); 408 404 #else 409 405 DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE); 410 406 #endif ··· 731 727 OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state); 732 728 OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); 733 729 OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); 730 + OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); 734 731 #endif 735 732 736 733 DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
+22 -14
arch/powerpc/kernel/cpu_setup_power.S
··· 29 29 li r0,0 30 30 mtspr SPRN_LPID,r0 31 31 mfspr r3,SPRN_LPCR 32 - bl __init_LPCR 32 + li r4,(LPCR_LPES1 >> LPCR_LPES_SH) 33 + bl __init_LPCR_ISA206 33 34 bl __init_tlb_power7 34 35 mtlr r11 35 36 blr ··· 43 42 li r0,0 44 43 mtspr SPRN_LPID,r0 45 44 mfspr r3,SPRN_LPCR 46 - bl __init_LPCR 45 + li r4,(LPCR_LPES1 >> LPCR_LPES_SH) 46 + bl __init_LPCR_ISA206 47 47 bl __init_tlb_power7 48 48 mtlr r11 49 49 blr ··· 61 59 mtspr SPRN_LPID,r0 62 60 mfspr r3,SPRN_LPCR 63 61 ori r3, r3, LPCR_PECEDH 64 - bl __init_LPCR 62 + li r4,0 /* LPES = 0 */ 63 + bl __init_LPCR_ISA206 65 64 bl __init_HFSCR 66 65 bl __init_tlb_power8 67 66 bl __init_PMU_HV ··· 83 80 mtspr SPRN_LPID,r0 84 81 mfspr r3,SPRN_LPCR 85 82 ori r3, r3, LPCR_PECEDH 86 - bl __init_LPCR 83 + li r4,0 /* LPES = 0 */ 84 + bl __init_LPCR_ISA206 87 85 bl __init_HFSCR 88 86 bl __init_tlb_power8 89 87 bl __init_PMU_HV ··· 103 99 mtspr SPRN_PSSCR,r0 104 100 mtspr SPRN_LPID,r0 105 101 mfspr r3,SPRN_LPCR 106 - LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE) 102 + LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) 107 103 or r3, r3, r4 108 104 LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR) 109 105 andc r3, r3, r4 110 - bl __init_LPCR 106 + li r4,0 /* LPES = 0 */ 107 + bl __init_LPCR_ISA300 111 108 bl __init_HFSCR 112 109 bl __init_tlb_power9 113 110 bl __init_PMU_HV ··· 127 122 mtspr SPRN_PSSCR,r0 128 123 mtspr SPRN_LPID,r0 129 124 mfspr r3,SPRN_LPCR 130 - LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE) 125 + LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) 131 126 or r3, r3, r4 132 127 LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR) 133 128 andc r3, r3, r4 134 - bl __init_LPCR 129 + li r4,0 /* LPES = 0 */ 130 + bl __init_LPCR_ISA300 135 131 bl __init_HFSCR 136 132 bl __init_tlb_power9 137 133 bl __init_PMU_HV ··· 150 144 std r5,CPU_SPEC_FEATURES(r4) 151 145 blr 152 146 153 - __init_LPCR: 147 + __init_LPCR_ISA206: 154 148 /* Setup a sane 
LPCR: 155 - * Called with initial LPCR in R3 149 + * Called with initial LPCR in R3 and desired LPES 2-bit value in R4 156 150 * 157 151 * LPES = 0b01 (HSRR0/1 used for 0x500) 158 152 * PECE = 0b111 ··· 163 157 * 164 158 * Other bits untouched for now 165 159 */ 166 - li r5,1 167 - rldimi r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2 160 + li r5,0x10 161 + rldimi r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5 162 + 163 + /* POWER9 has no VRMASD */ 164 + __init_LPCR_ISA300: 165 + rldimi r3,r4, LPCR_LPES_SH, 64-LPCR_LPES_SH-2 168 166 ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2) 169 167 li r5,4 170 168 rldimi r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3 171 169 clrrdi r3,r3,1 /* clear HDICE */ 172 170 li r5,4 173 171 rldimi r3,r5, LPCR_VC_SH, 0 174 - li r5,0x10 175 - rldimi r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5 176 172 mtspr SPRN_LPCR,r3 177 173 isync 178 174 blr
+52 -8
arch/powerpc/kernel/dbell.c
··· 20 20 #include <asm/kvm_ppc.h> 21 21 22 22 #ifdef CONFIG_SMP 23 - void doorbell_setup_this_cpu(void) 24 - { 25 - unsigned long tag = mfspr(SPRN_DOORBELL_CPUTAG) & PPC_DBELL_TAG_MASK; 26 23 27 - smp_muxed_ipi_set_data(smp_processor_id(), tag); 24 + /* 25 + * Doorbells must only be used if CPU_FTR_DBELL is available. 26 + * msgsnd is used in HV, and msgsndp is used in !HV. 27 + * 28 + * These should be used by platform code that is aware of restrictions. 29 + * Other arch code should use ->cause_ipi. 30 + * 31 + * doorbell_global_ipi() sends a dbell to any target CPU. 32 + * Must be used only by architectures that address msgsnd target 33 + * by PIR/get_hard_smp_processor_id. 34 + */ 35 + void doorbell_global_ipi(int cpu) 36 + { 37 + u32 tag = get_hard_smp_processor_id(cpu); 38 + 39 + kvmppc_set_host_ipi(cpu, 1); 40 + /* Order previous accesses vs. msgsnd, which is treated as a store */ 41 + ppc_msgsnd_sync(); 42 + ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); 28 43 } 29 44 30 - void doorbell_cause_ipi(int cpu, unsigned long data) 45 + /* 46 + * doorbell_core_ipi() sends a dbell to a target CPU in the same core. 47 + * Must be used only by architectures that address msgsnd target 48 + * by TIR/cpu_thread_in_core. 49 + */ 50 + void doorbell_core_ipi(int cpu) 31 51 { 52 + u32 tag = cpu_thread_in_core(cpu); 53 + 54 + kvmppc_set_host_ipi(cpu, 1); 32 55 /* Order previous accesses vs. msgsnd, which is treated as a store */ 33 - mb(); 34 - ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, data); 56 + ppc_msgsnd_sync(); 57 + ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); 58 + } 59 + 60 + /* 61 + * Attempt to cause a core doorbell if destination is on the same core. 62 + * Returns 1 on success, 0 on failure. 
63 + */ 64 + int doorbell_try_core_ipi(int cpu) 65 + { 66 + int this_cpu = get_cpu(); 67 + int ret = 0; 68 + 69 + if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) { 70 + doorbell_core_ipi(cpu); 71 + ret = 1; 72 + } 73 + 74 + put_cpu(); 75 + 76 + return ret; 35 77 } 36 78 37 79 void doorbell_exception(struct pt_regs *regs) ··· 82 40 83 41 irq_enter(); 84 42 43 + ppc_msgsync(); 44 + 85 45 may_hard_irq_enable(); 86 46 87 47 kvmppc_set_host_ipi(smp_processor_id(), 0); 88 48 __this_cpu_inc(irq_stat.doorbell_irqs); 89 49 90 - smp_ipi_demux(); 50 + smp_ipi_demux_relaxed(); /* already performed the barrier */ 91 51 92 52 irq_exit(); 93 53 set_irq_regs(old_regs);
+1 -2
arch/powerpc/kernel/eeh.c
··· 22 22 */ 23 23 24 24 #include <linux/delay.h> 25 - #include <linux/debugfs.h> 26 25 #include <linux/sched.h> 27 26 #include <linux/init.h> 28 27 #include <linux/list.h> ··· 36 37 #include <linux/of.h> 37 38 38 39 #include <linux/atomic.h> 39 - #include <asm/debug.h> 40 + #include <asm/debugfs.h> 40 41 #include <asm/eeh.h> 41 42 #include <asm/eeh_event.h> 42 43 #include <asm/io.h>
+39 -16
arch/powerpc/kernel/eeh_driver.c
··· 724 724 */ 725 725 #define MAX_WAIT_FOR_RECOVERY 300 726 726 727 - static void eeh_handle_normal_event(struct eeh_pe *pe) 727 + /** 728 + * eeh_handle_normal_event - Handle EEH events on a specific PE 729 + * @pe: EEH PE 730 + * 731 + * Attempts to recover the given PE. If recovery fails or the PE has failed 732 + * too many times, remove the PE. 733 + * 734 + * Returns true if @pe should no longer be used, else false. 735 + */ 736 + static bool eeh_handle_normal_event(struct eeh_pe *pe) 728 737 { 729 738 struct pci_bus *frozen_bus; 730 739 struct eeh_dev *edev, *tmp; ··· 745 736 if (!frozen_bus) { 746 737 pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n", 747 738 __func__, pe->phb->global_number, pe->addr); 748 - return; 739 + return false; 749 740 } 750 741 751 742 eeh_pe_update_time_stamp(pe); 752 743 pe->freeze_count++; 753 - if (pe->freeze_count > eeh_max_freezes) 754 - goto excess_failures; 744 + if (pe->freeze_count > eeh_max_freezes) { 745 + pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n" 746 + "last hour and has been permanently disabled.\n", 747 + pe->phb->global_number, pe->addr, 748 + pe->freeze_count); 749 + goto hard_fail; 750 + } 755 751 pr_warn("EEH: This PCI device has failed %d times in the last hour\n", 756 752 pe->freeze_count); 757 753 ··· 884 870 pr_info("EEH: Notify device driver to resume\n"); 885 871 eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); 886 872 887 - return; 873 + return false; 888 874 889 - excess_failures: 875 + hard_fail: 890 876 /* 891 877 * About 90% of all real-life EEH failures in the field 892 878 * are due to poorly seated PCI cards. Only 10% or so are 893 879 * due to actual, failed cards. 
894 880 */ 895 - pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n" 896 - "last hour and has been permanently disabled.\n" 897 - "Please try reseating or replacing it.\n", 898 - pe->phb->global_number, pe->addr, 899 - pe->freeze_count); 900 - goto perm_error; 901 - 902 - hard_fail: 903 881 pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n" 904 882 "Please try reseating or replacing it\n", 905 883 pe->phb->global_number, pe->addr); 906 884 907 - perm_error: 908 885 eeh_slot_error_detail(pe, EEH_LOG_PERM); 909 886 910 887 /* Notify all devices that they're about to go down. */ ··· 920 915 pci_lock_rescan_remove(); 921 916 pci_hp_remove_devices(frozen_bus); 922 917 pci_unlock_rescan_remove(); 918 + 919 + /* The passed PE should no longer be used */ 920 + return true; 923 921 } 924 922 } 923 + return false; 925 924 } 926 925 926 + /** 927 + * eeh_handle_special_event - Handle EEH events without a specific failing PE 928 + * 929 + * Called when an EEH event is detected but can't be narrowed down to a 930 + * specific PE. Iterates through possible failures and handles them as 931 + * necessary. 932 + */ 927 933 static void eeh_handle_special_event(void) 928 934 { 929 935 struct eeh_pe *pe, *phb_pe; ··· 998 982 */ 999 983 if (rc == EEH_NEXT_ERR_FROZEN_PE || 1000 984 rc == EEH_NEXT_ERR_FENCED_PHB) { 1001 - eeh_handle_normal_event(pe); 985 + /* 986 + * eeh_handle_normal_event() can make the PE stale if it 987 + * determines that the PE cannot possibly be recovered. 988 + * Don't modify the PE state if that's the case. 989 + */ 990 + if (eeh_handle_normal_event(pe)) 991 + continue; 992 + 1002 993 eeh_pe_state_clear(pe, EEH_PE_RECOVERING); 1003 994 } else { 1004 995 pci_lock_rescan_remove();
-107
arch/powerpc/kernel/entry_32.S
··· 31 31 #include <asm/ppc_asm.h> 32 32 #include <asm/asm-offsets.h> 33 33 #include <asm/unistd.h> 34 - #include <asm/ftrace.h> 35 34 #include <asm/ptrace.h> 36 35 #include <asm/export.h> 37 36 ··· 1314 1315 /* XXX load up BATs and panic */ 1315 1316 1316 1317 #endif /* CONFIG_PPC_RTAS */ 1317 - 1318 - #ifdef CONFIG_FUNCTION_TRACER 1319 - #ifdef CONFIG_DYNAMIC_FTRACE 1320 - _GLOBAL(mcount) 1321 - _GLOBAL(_mcount) 1322 - /* 1323 - * It is required that _mcount on PPC32 must preserve the 1324 - * link register. But we have r0 to play with. We use r0 1325 - * to push the return address back to the caller of mcount 1326 - * into the ctr register, restore the link register and 1327 - * then jump back using the ctr register. 1328 - */ 1329 - mflr r0 1330 - mtctr r0 1331 - lwz r0, 4(r1) 1332 - mtlr r0 1333 - bctr 1334 - 1335 - _GLOBAL(ftrace_caller) 1336 - MCOUNT_SAVE_FRAME 1337 - /* r3 ends up with link register */ 1338 - subi r3, r3, MCOUNT_INSN_SIZE 1339 - .globl ftrace_call 1340 - ftrace_call: 1341 - bl ftrace_stub 1342 - nop 1343 - #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1344 - .globl ftrace_graph_call 1345 - ftrace_graph_call: 1346 - b ftrace_graph_stub 1347 - _GLOBAL(ftrace_graph_stub) 1348 - #endif 1349 - MCOUNT_RESTORE_FRAME 1350 - /* old link register ends up in ctr reg */ 1351 - bctr 1352 - #else 1353 - _GLOBAL(mcount) 1354 - _GLOBAL(_mcount) 1355 - 1356 - MCOUNT_SAVE_FRAME 1357 - 1358 - subi r3, r3, MCOUNT_INSN_SIZE 1359 - LOAD_REG_ADDR(r5, ftrace_trace_function) 1360 - lwz r5,0(r5) 1361 - 1362 - mtctr r5 1363 - bctrl 1364 - nop 1365 - 1366 - #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1367 - b ftrace_graph_caller 1368 - #endif 1369 - MCOUNT_RESTORE_FRAME 1370 - bctr 1371 - #endif 1372 - EXPORT_SYMBOL(_mcount) 1373 - 1374 - _GLOBAL(ftrace_stub) 1375 - blr 1376 - 1377 - #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1378 - _GLOBAL(ftrace_graph_caller) 1379 - /* load r4 with local address */ 1380 - lwz r4, 44(r1) 1381 - subi r4, r4, MCOUNT_INSN_SIZE 1382 - 1383 - /* Grab the LR out 
of the caller stack frame */ 1384 - lwz r3,52(r1) 1385 - 1386 - bl prepare_ftrace_return 1387 - nop 1388 - 1389 - /* 1390 - * prepare_ftrace_return gives us the address we divert to. 1391 - * Change the LR in the callers stack frame to this. 1392 - */ 1393 - stw r3,52(r1) 1394 - 1395 - MCOUNT_RESTORE_FRAME 1396 - /* old link register ends up in ctr reg */ 1397 - bctr 1398 - 1399 - _GLOBAL(return_to_handler) 1400 - /* need to save return values */ 1401 - stwu r1, -32(r1) 1402 - stw r3, 20(r1) 1403 - stw r4, 16(r1) 1404 - stw r31, 12(r1) 1405 - mr r31, r1 1406 - 1407 - bl ftrace_return_to_handler 1408 - nop 1409 - 1410 - /* return value has real return address */ 1411 - mtlr r3 1412 - 1413 - lwz r3, 20(r1) 1414 - lwz r4, 16(r1) 1415 - lwz r31,12(r1) 1416 - lwz r1, 0(r1) 1417 - 1418 - /* Jump back to real return address */ 1419 - blr 1420 - #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 1421 - 1422 - #endif /* CONFIG_FUNCTION_TRACER */
-380
arch/powerpc/kernel/entry_64.S
··· 20 20 21 21 #include <linux/errno.h> 22 22 #include <linux/err.h> 23 - #include <linux/magic.h> 24 23 #include <asm/unistd.h> 25 24 #include <asm/processor.h> 26 25 #include <asm/page.h> ··· 32 33 #include <asm/bug.h> 33 34 #include <asm/ptrace.h> 34 35 #include <asm/irqflags.h> 35 - #include <asm/ftrace.h> 36 36 #include <asm/hw_irq.h> 37 37 #include <asm/context_tracking.h> 38 38 #include <asm/tm.h> ··· 1171 1173 ld r0,16(r1) 1172 1174 mtlr r0 1173 1175 blr 1174 - 1175 - #ifdef CONFIG_FUNCTION_TRACER 1176 - #ifdef CONFIG_DYNAMIC_FTRACE 1177 - _GLOBAL(mcount) 1178 - _GLOBAL(_mcount) 1179 - EXPORT_SYMBOL(_mcount) 1180 - mflr r12 1181 - mtctr r12 1182 - mtlr r0 1183 - bctr 1184 - 1185 - #ifndef CC_USING_MPROFILE_KERNEL 1186 - _GLOBAL_TOC(ftrace_caller) 1187 - /* Taken from output of objdump from lib64/glibc */ 1188 - mflr r3 1189 - ld r11, 0(r1) 1190 - stdu r1, -112(r1) 1191 - std r3, 128(r1) 1192 - ld r4, 16(r11) 1193 - subi r3, r3, MCOUNT_INSN_SIZE 1194 - .globl ftrace_call 1195 - ftrace_call: 1196 - bl ftrace_stub 1197 - nop 1198 - #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1199 - .globl ftrace_graph_call 1200 - ftrace_graph_call: 1201 - b ftrace_graph_stub 1202 - _GLOBAL(ftrace_graph_stub) 1203 - #endif 1204 - ld r0, 128(r1) 1205 - mtlr r0 1206 - addi r1, r1, 112 1207 - 1208 - #else /* CC_USING_MPROFILE_KERNEL */ 1209 - /* 1210 - * 1211 - * ftrace_caller() is the function that replaces _mcount() when ftrace is 1212 - * active. 1213 - * 1214 - * We arrive here after a function A calls function B, and we are the trace 1215 - * function for B. When we enter r1 points to A's stack frame, B has not yet 1216 - * had a chance to allocate one yet. 1217 - * 1218 - * Additionally r2 may point either to the TOC for A, or B, depending on 1219 - * whether B did a TOC setup sequence before calling us. 1220 - * 1221 - * On entry the LR points back to the _mcount() call site, and r0 holds the 1222 - * saved LR as it was on entry to B, ie. 
the original return address at the 1223 - * call site in A. 1224 - * 1225 - * Our job is to save the register state into a struct pt_regs (on the stack) 1226 - * and then arrange for the ftrace function to be called. 1227 - */ 1228 - _GLOBAL(ftrace_caller) 1229 - /* Save the original return address in A's stack frame */ 1230 - std r0,LRSAVE(r1) 1231 - 1232 - /* Create our stack frame + pt_regs */ 1233 - stdu r1,-SWITCH_FRAME_SIZE(r1) 1234 - 1235 - /* Save all gprs to pt_regs */ 1236 - SAVE_8GPRS(0,r1) 1237 - SAVE_8GPRS(8,r1) 1238 - SAVE_8GPRS(16,r1) 1239 - SAVE_8GPRS(24,r1) 1240 - 1241 - /* Load special regs for save below */ 1242 - mfmsr r8 1243 - mfctr r9 1244 - mfxer r10 1245 - mfcr r11 1246 - 1247 - /* Get the _mcount() call site out of LR */ 1248 - mflr r7 1249 - /* Save it as pt_regs->nip & pt_regs->link */ 1250 - std r7, _NIP(r1) 1251 - std r7, _LINK(r1) 1252 - 1253 - /* Save callee's TOC in the ABI compliant location */ 1254 - std r2, 24(r1) 1255 - ld r2,PACATOC(r13) /* get kernel TOC in r2 */ 1256 - 1257 - addis r3,r2,function_trace_op@toc@ha 1258 - addi r3,r3,function_trace_op@toc@l 1259 - ld r5,0(r3) 1260 - 1261 - #ifdef CONFIG_LIVEPATCH 1262 - mr r14,r7 /* remember old NIP */ 1263 - #endif 1264 - /* Calculate ip from nip-4 into r3 for call below */ 1265 - subi r3, r7, MCOUNT_INSN_SIZE 1266 - 1267 - /* Put the original return address in r4 as parent_ip */ 1268 - mr r4, r0 1269 - 1270 - /* Save special regs */ 1271 - std r8, _MSR(r1) 1272 - std r9, _CTR(r1) 1273 - std r10, _XER(r1) 1274 - std r11, _CCR(r1) 1275 - 1276 - /* Load &pt_regs in r6 for call below */ 1277 - addi r6, r1 ,STACK_FRAME_OVERHEAD 1278 - 1279 - /* ftrace_call(r3, r4, r5, r6) */ 1280 - .globl ftrace_call 1281 - ftrace_call: 1282 - bl ftrace_stub 1283 - nop 1284 - 1285 - /* Load ctr with the possibly modified NIP */ 1286 - ld r3, _NIP(r1) 1287 - mtctr r3 1288 - #ifdef CONFIG_LIVEPATCH 1289 - cmpd r14,r3 /* has NIP been altered? 
*/ 1290 - #endif 1291 - 1292 - /* Restore gprs */ 1293 - REST_8GPRS(0,r1) 1294 - REST_8GPRS(8,r1) 1295 - REST_8GPRS(16,r1) 1296 - REST_8GPRS(24,r1) 1297 - 1298 - /* Restore callee's TOC */ 1299 - ld r2, 24(r1) 1300 - 1301 - /* Pop our stack frame */ 1302 - addi r1, r1, SWITCH_FRAME_SIZE 1303 - 1304 - /* Restore original LR for return to B */ 1305 - ld r0, LRSAVE(r1) 1306 - mtlr r0 1307 - 1308 - #ifdef CONFIG_LIVEPATCH 1309 - /* Based on the cmpd above, if the NIP was altered handle livepatch */ 1310 - bne- livepatch_handler 1311 - #endif 1312 - 1313 - #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1314 - stdu r1, -112(r1) 1315 - .globl ftrace_graph_call 1316 - ftrace_graph_call: 1317 - b ftrace_graph_stub 1318 - _GLOBAL(ftrace_graph_stub) 1319 - addi r1, r1, 112 1320 - #endif 1321 - 1322 - ld r0,LRSAVE(r1) /* restore callee's lr at _mcount site */ 1323 - mtlr r0 1324 - bctr /* jump after _mcount site */ 1325 - #endif /* CC_USING_MPROFILE_KERNEL */ 1326 - 1327 - _GLOBAL(ftrace_stub) 1328 - blr 1329 - 1330 - #ifdef CONFIG_LIVEPATCH 1331 - /* 1332 - * This function runs in the mcount context, between two functions. As 1333 - * such it can only clobber registers which are volatile and used in 1334 - * function linkage. 1335 - * 1336 - * We get here when a function A, calls another function B, but B has 1337 - * been live patched with a new function C. 1338 - * 1339 - * On entry: 1340 - * - we have no stack frame and can not allocate one 1341 - * - LR points back to the original caller (in A) 1342 - * - CTR holds the new NIP in C 1343 - * - r0 & r12 are free 1344 - * 1345 - * r0 can't be used as the base register for a DS-form load or store, so 1346 - * we temporarily shuffle r1 (stack pointer) into r0 and then put it back. 
1347 - */ 1348 - livepatch_handler: 1349 - CURRENT_THREAD_INFO(r12, r1) 1350 - 1351 - /* Save stack pointer into r0 */ 1352 - mr r0, r1 1353 - 1354 - /* Allocate 3 x 8 bytes */ 1355 - ld r1, TI_livepatch_sp(r12) 1356 - addi r1, r1, 24 1357 - std r1, TI_livepatch_sp(r12) 1358 - 1359 - /* Save toc & real LR on livepatch stack */ 1360 - std r2, -24(r1) 1361 - mflr r12 1362 - std r12, -16(r1) 1363 - 1364 - /* Store stack end marker */ 1365 - lis r12, STACK_END_MAGIC@h 1366 - ori r12, r12, STACK_END_MAGIC@l 1367 - std r12, -8(r1) 1368 - 1369 - /* Restore real stack pointer */ 1370 - mr r1, r0 1371 - 1372 - /* Put ctr in r12 for global entry and branch there */ 1373 - mfctr r12 1374 - bctrl 1375 - 1376 - /* 1377 - * Now we are returning from the patched function to the original 1378 - * caller A. We are free to use r0 and r12, and we can use r2 until we 1379 - * restore it. 1380 - */ 1381 - 1382 - CURRENT_THREAD_INFO(r12, r1) 1383 - 1384 - /* Save stack pointer into r0 */ 1385 - mr r0, r1 1386 - 1387 - ld r1, TI_livepatch_sp(r12) 1388 - 1389 - /* Check stack marker hasn't been trashed */ 1390 - lis r2, STACK_END_MAGIC@h 1391 - ori r2, r2, STACK_END_MAGIC@l 1392 - ld r12, -8(r1) 1393 - 1: tdne r12, r2 1394 - EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0 1395 - 1396 - /* Restore LR & toc from livepatch stack */ 1397 - ld r12, -16(r1) 1398 - mtlr r12 1399 - ld r2, -24(r1) 1400 - 1401 - /* Pop livepatch stack frame */ 1402 - CURRENT_THREAD_INFO(r12, r0) 1403 - subi r1, r1, 24 1404 - std r1, TI_livepatch_sp(r12) 1405 - 1406 - /* Restore real stack pointer */ 1407 - mr r1, r0 1408 - 1409 - /* Return to original caller of live patched function */ 1410 - blr 1411 - #endif 1412 - 1413 - 1414 - #else 1415 - _GLOBAL_TOC(_mcount) 1416 - EXPORT_SYMBOL(_mcount) 1417 - /* Taken from output of objdump from lib64/glibc */ 1418 - mflr r3 1419 - ld r11, 0(r1) 1420 - stdu r1, -112(r1) 1421 - std r3, 128(r1) 1422 - ld r4, 16(r11) 1423 - 1424 - subi r3, r3, MCOUNT_INSN_SIZE 1425 - 
LOAD_REG_ADDR(r5,ftrace_trace_function) 1426 - ld r5,0(r5) 1427 - ld r5,0(r5) 1428 - mtctr r5 1429 - bctrl 1430 - nop 1431 - 1432 - 1433 - #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1434 - b ftrace_graph_caller 1435 - #endif 1436 - ld r0, 128(r1) 1437 - mtlr r0 1438 - addi r1, r1, 112 1439 - _GLOBAL(ftrace_stub) 1440 - blr 1441 - 1442 - #endif /* CONFIG_DYNAMIC_FTRACE */ 1443 - 1444 - #ifdef CONFIG_FUNCTION_GRAPH_TRACER 1445 - #ifndef CC_USING_MPROFILE_KERNEL 1446 - _GLOBAL(ftrace_graph_caller) 1447 - /* load r4 with local address */ 1448 - ld r4, 128(r1) 1449 - subi r4, r4, MCOUNT_INSN_SIZE 1450 - 1451 - /* Grab the LR out of the caller stack frame */ 1452 - ld r11, 112(r1) 1453 - ld r3, 16(r11) 1454 - 1455 - bl prepare_ftrace_return 1456 - nop 1457 - 1458 - /* 1459 - * prepare_ftrace_return gives us the address we divert to. 1460 - * Change the LR in the callers stack frame to this. 1461 - */ 1462 - ld r11, 112(r1) 1463 - std r3, 16(r11) 1464 - 1465 - ld r0, 128(r1) 1466 - mtlr r0 1467 - addi r1, r1, 112 1468 - blr 1469 - 1470 - #else /* CC_USING_MPROFILE_KERNEL */ 1471 - _GLOBAL(ftrace_graph_caller) 1472 - /* with -mprofile-kernel, parameter regs are still alive at _mcount */ 1473 - std r10, 104(r1) 1474 - std r9, 96(r1) 1475 - std r8, 88(r1) 1476 - std r7, 80(r1) 1477 - std r6, 72(r1) 1478 - std r5, 64(r1) 1479 - std r4, 56(r1) 1480 - std r3, 48(r1) 1481 - 1482 - /* Save callee's TOC in the ABI compliant location */ 1483 - std r2, 24(r1) 1484 - ld r2, PACATOC(r13) /* get kernel TOC in r2 */ 1485 - 1486 - mfctr r4 /* ftrace_caller has moved local addr here */ 1487 - std r4, 40(r1) 1488 - mflr r3 /* ftrace_caller has restored LR from stack */ 1489 - subi r4, r4, MCOUNT_INSN_SIZE 1490 - 1491 - bl prepare_ftrace_return 1492 - nop 1493 - 1494 - /* 1495 - * prepare_ftrace_return gives us the address we divert to. 1496 - * Change the LR to this. 
1497 - */ 1498 - mtlr r3 1499 - 1500 - ld r0, 40(r1) 1501 - mtctr r0 1502 - ld r10, 104(r1) 1503 - ld r9, 96(r1) 1504 - ld r8, 88(r1) 1505 - ld r7, 80(r1) 1506 - ld r6, 72(r1) 1507 - ld r5, 64(r1) 1508 - ld r4, 56(r1) 1509 - ld r3, 48(r1) 1510 - 1511 - /* Restore callee's TOC */ 1512 - ld r2, 24(r1) 1513 - 1514 - addi r1, r1, 112 1515 - mflr r0 1516 - std r0, LRSAVE(r1) 1517 - bctr 1518 - #endif /* CC_USING_MPROFILE_KERNEL */ 1519 - 1520 - _GLOBAL(return_to_handler) 1521 - /* need to save return values */ 1522 - std r4, -32(r1) 1523 - std r3, -24(r1) 1524 - /* save TOC */ 1525 - std r2, -16(r1) 1526 - std r31, -8(r1) 1527 - mr r31, r1 1528 - stdu r1, -112(r1) 1529 - 1530 - /* 1531 - * We might be called from a module. 1532 - * Switch to our TOC to run inside the core kernel. 1533 - */ 1534 - ld r2, PACATOC(r13) 1535 - 1536 - bl ftrace_return_to_handler 1537 - nop 1538 - 1539 - /* return value has real return address */ 1540 - mtlr r3 1541 - 1542 - ld r1, 0(r1) 1543 - ld r4, -32(r1) 1544 - ld r3, -24(r1) 1545 - ld r2, -16(r1) 1546 - ld r31, -8(r1) 1547 - 1548 - /* Jump back to real return address */ 1549 - blr 1550 - #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 1551 - #endif /* CONFIG_FUNCTION_TRACER */
+88 -100
arch/powerpc/kernel/exceptions-64s.S
···
 
 EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
 	SET_SCRATCH0(r13)
-	GET_PACA(r13)
-	clrrdi	r13,r13,1 /* Last bit of HSPRG0 is set if waking from winkle */
-	EXCEPTION_PROLOG_PSERIES_PACA(PACA_EXGEN, system_reset_common, EXC_STD,
+	/*
+	 * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is
+	 * being used, so a nested NMI exception would corrupt it.
+	 */
+	EXCEPTION_PROLOG_PSERIES_NORI(PACA_EXNMI, system_reset_common, EXC_STD,
 				 IDLETEST, 0x100)
 
 EXC_REAL_END(system_reset, 0x100, 0x100)
···
 #ifdef CONFIG_PPC_P7_NAP
 EXC_COMMON_BEGIN(system_reset_idle_common)
-BEGIN_FTR_SECTION
-	GET_PACA(r13) /* Restore HSPRG0 to get the winkle bit in r13 */
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
-	bl	pnv_restore_hyp_resource
-
-	li	r0,PNV_THREAD_RUNNING
-	stb	r0,PACA_THREAD_IDLE_STATE(r13)	/* Clear thread state */
-
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-	li	r0,KVM_HWTHREAD_IN_KERNEL
-	stb	r0,HSTATE_HWTHREAD_STATE(r13)
-	/* Order setting hwthread_state vs. testing hwthread_req */
-	sync
-	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
-	cmpwi	r0,0
-	beq	1f
-	BRANCH_TO_KVM(r10, kvm_start_guest)
-1:
+	b	pnv_powersave_wakeup
 #endif
 
-	/* Return SRR1 from power7_nap() */
-	mfspr	r3,SPRN_SRR1
-	blt	cr3,2f
-	b	pnv_wakeup_loss
-2:	b	pnv_wakeup_noloss
-#endif
+EXC_COMMON_BEGIN(system_reset_common)
+	/*
+	 * Increment paca->in_nmi then enable MSR_RI. SLB or MCE will be able
+	 * to recover, but nested NMI will notice in_nmi and not recover
+	 * because of the use of the NMI stack. in_nmi reentrancy is tested in
+	 * system_reset_exception.
+	 */
+	lhz	r10,PACA_IN_NMI(r13)
+	addi	r10,r10,1
+	sth	r10,PACA_IN_NMI(r13)
+	li	r10,MSR_RI
+	mtmsrd	r10,1
 
-EXC_COMMON(system_reset_common, 0x100, system_reset_exception)
+	mr	r10,r1
+	ld	r1,PACA_NMI_EMERG_SP(r13)
+	subi	r1,r1,INT_FRAME_SIZE
+	EXCEPTION_COMMON_NORET_STACK(PACA_EXNMI, 0x100,
+			system_reset, system_reset_exception,
+			ADD_NVGPRS;ADD_RECONCILE)
+
+	/*
+	 * The stack is no longer in use, decrement in_nmi.
+	 */
+	lhz	r10,PACA_IN_NMI(r13)
+	subi	r10,r10,1
+	sth	r10,PACA_IN_NMI(r13)
+
+	b	ret_from_except
 
 #ifdef CONFIG_PPC_PSERIES
 /*
···
  */
 TRAMP_REAL_BEGIN(system_reset_fwnmi)
 	SET_SCRATCH0(r13)		/* save r13 */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
-				 NOTEST, 0x100)
+	/* See comment at system_reset exception */
+	EXCEPTION_PROLOG_PSERIES_NORI(PACA_EXNMI, system_reset_common,
+					EXC_STD, NOTEST, 0x100)
 #endif /* CONFIG_PPC_PSERIES */
 
···
  * vector
  */
 	SET_SCRATCH0(r13)		/* save r13 */
-	/*
-	 * Running native on arch 2.06 or later, we may wakeup from winkle
-	 * inside machine check. If yes, then last bit of HSPRG0 would be set
-	 * to 1. Hence clear it unconditionally.
-	 */
-	GET_PACA(r13)
-	clrrdi	r13,r13,1
-	SET_PACA(r13)
 	EXCEPTION_PROLOG_0(PACA_EXMC)
 BEGIN_FTR_SECTION
 	b	machine_check_powernv_early
···
  * NOTE: We are here with MSR_ME=0 (off), which means we risk a
  * checkstop if we get another machine check exception before we do
  * rfid with MSR_ME=1.
+ *
+ * This interrupt can wake directly from idle. If that is the case,
+ * the machine check is handled then the idle wakeup code is called
+ * to restore state. In that case, the POWER9 DD1 idle PACA workaround
+ * is not applied in the early machine check code, which will cause
+ * bugs.
  */
 	mr	r11,r1			/* Save r1 */
 	lhz	r10,PACA_IN_MCE(r13)
···
 machine_check_pSeries_0:
 	EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
 	/*
-	 * The following is essentially EXCEPTION_PROLOG_PSERIES_1 with the
-	 * difference that MSR_RI is not enabled, because PACA_EXMC is being
-	 * used, so nested machine check corrupts it. machine_check_common
-	 * enables MSR_RI.
+	 * MSR_RI is not enabled, because PACA_EXMC is being used, so a
+	 * nested machine check corrupts it. machine_check_common enables
+	 * MSR_RI.
 	 */
-	ld	r10,PACAKMSR(r13)
-	xori	r10,r10,MSR_RI
-	mfspr	r11,SPRN_SRR0
-	LOAD_HANDLER(r12, machine_check_common)
-	mtspr	SPRN_SRR0,r12
-	mfspr	r12,SPRN_SRR1
-	mtspr	SPRN_SRR1,r10
-	rfid
-	b	.	/* prevent speculative execution */
+	EXCEPTION_PROLOG_PSERIES_1_NORI(machine_check_common, EXC_STD)
 
 TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
···
 	/* restore original r1. */			\
 	ld	r1,GPR1(r1)
 
+#ifdef CONFIG_PPC_P7_NAP
+/*
+ * This is an idle wakeup. Low level machine check has already been
+ * done. Queue the event then call the idle code to do the wake up.
+ */
+EXC_COMMON_BEGIN(machine_check_idle_common)
+	bl	machine_check_queue_event
+
+	/*
+	 * We have not used any non-volatile GPRs here, and as a rule
+	 * most exception code including machine check does not.
+	 * Therefore PACA_NAPSTATELOST does not need to be set. Idle
+	 * wakeup will restore volatile registers.
+	 *
+	 * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
+	 *
+	 * Then decrement MCE nesting after finishing with the stack.
+	 */
+	ld	r3,_MSR(r1)
+
+	lhz	r11,PACA_IN_MCE(r13)
+	subi	r11,r11,1
+	sth	r11,PACA_IN_MCE(r13)
+
+	/* Turn off the RI bit because SRR1 is used by idle wakeup code. */
+	/* Recoverability could be improved by reducing the use of SRR1. */
+	li	r11,0
+	mtmsrd	r11,1
+
+	b	pnv_powersave_wakeup_mce
+#endif
 /*
  * Handle machine check early in real mode. We come here with
  * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
···
 	bl	machine_check_early
 	std	r3,RESULT(r1)	/* Save result */
 	ld	r12,_MSR(r1)
+
 #ifdef CONFIG_PPC_P7_NAP
 	/*
 	 * Check if thread was in power saving mode. We come here when any
···
 	 *
 	 * Go back to nap/sleep/winkle mode again if (b) is true.
 	 */
-	rlwinm.	r11,r12,47-31,30,31	/* Was it in power saving mode? */
-	beq	4f			/* No, it wasn;t */
-	/* Thread was in power saving mode. Go back to nap again. */
-	cmpwi	r11,2
-	blt	3f
-	/* Supervisor/Hypervisor state loss */
-	li	r0,1
-	stb	r0,PACA_NAPSTATELOST(r13)
-3:	bl	machine_check_queue_event
-	MACHINE_CHECK_HANDLER_WINDUP
-	GET_PACA(r13)
-	ld	r1,PACAR1(r13)
-	/*
-	 * Check what idle state this CPU was in and go back to same mode
-	 * again.
-	 */
-	lbz	r3,PACA_THREAD_IDLE_STATE(r13)
-	cmpwi	r3,PNV_THREAD_NAP
-	bgt	10f
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
-	/* No return */
-10:
-	cmpwi	r3,PNV_THREAD_SLEEP
-	bgt	2f
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
-	/* No return */
-
-2:
-	/*
-	 * Go back to winkle. Please note that this thread was woken up in
-	 * machine check from winkle and have not restored the per-subcore
-	 * state. Hence before going back to winkle, set last bit of HSPRG0
-	 * to 1. This will make sure that if this thread gets woken up
-	 * again at reset vector 0x100 then it will get chance to restore
-	 * the subcore state.
-	 */
-	ori	r13,r13,1
-	SET_PACA(r13)
-	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
-	/* No return */
+BEGIN_FTR_SECTION
+	rlwinm.	r11,r12,47-31,30,31
+	beq-	4f
+	BRANCH_TO_COMMON(r10, machine_check_idle_common)
 4:
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif
+
 	/*
 	 * Check if we are coming from hypervisor userspace. If yes then we
 	 * continue in host kernel in V mode to deliver the MC event.
···
 TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
 TRAMP_REAL_BEGIN(hmi_exception_early)
 	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, 0xe60)
-	mr	r10,r1			/* Save r1 */
-	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack */
+	mr	r10,r1			/* Save r1 */
+	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack for realmode */
 	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame	*/
-	std	r9,_CCR(r1)		/* save CR in stackframe */
 	mfspr	r11,SPRN_HSRR0		/* Save HSRR0 */
-	std	r11,_NIP(r1)		/* save HSRR0 in stackframe */
-	mfspr	r12,SPRN_HSRR1		/* Save SRR1 */
-	std	r12,_MSR(r1)		/* save SRR1 in stackframe */
-	std	r10,0(r1)		/* make stack chain pointer */
-	std	r0,GPR0(r1)		/* save r0 in stackframe */
-	std	r10,GPR1(r1)		/* save r1 in stackframe */
+	mfspr	r12,SPRN_HSRR1		/* Save HSRR1 */
+	EXCEPTION_PROLOG_COMMON_1()
 	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
 	EXCEPTION_PROLOG_COMMON_3(0xe60)
 	addi	r3,r1,STACK_FRAME_OVERHEAD
+27 -9
arch/powerpc/kernel/fadump.c
···
 #include <linux/string.h>
 #include <linux/memblock.h>
 #include <linux/delay.h>
-#include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/crash_dump.h>
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
 
+#include <asm/debugfs.h>
 #include <asm/page.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
 #include <asm/fadump.h>
-#include <asm/debug.h>
 #include <asm/setup.h>
 
 static struct fw_dump fw_dump;
···
 		pr_debug("fadumphdr_addr = %p\n",
 				(void *) fw_dump.fadumphdr_addr);
 	} else {
-		/* Reserve the memory at the top of memory. */
 		size = get_fadump_area_size();
-		base = memory_boundary - size;
-		memblock_reserve(base, size);
-		printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
-				"for firmware-assisted dump\n",
-				(unsigned long)(size >> 20),
-				(unsigned long)(base >> 20));
+
+		/*
+		 * Reserve memory at an offset closer to bottom of the RAM to
+		 * minimize the impact of memory hot-remove operation. We can't
+		 * use memblock_find_in_range() here since it doesn't allocate
+		 * from bottom to top.
+		 */
+		for (base = fw_dump.boot_memory_size;
+		     base <= (memory_boundary - size);
+		     base += size) {
+			if (memblock_is_region_memory(base, size) &&
+			    !memblock_is_region_reserved(base, size))
+				break;
+		}
+		if ((base > (memory_boundary - size)) ||
+		    memblock_reserve(base, size)) {
+			pr_err("Failed to reserve memory\n");
+			return 0;
+		}
+
+		pr_info("Reserved %ldMB of memory at %ldMB for firmware-"
+			"assisted dump (System RAM: %ldMB)\n",
+			(unsigned long)(size >> 20),
+			(unsigned long)(base >> 20),
+			(unsigned long)(memblock_phys_mem_size() >> 20));
 	}
+
 	fw_dump.reserve_dump_area_start = base;
 	fw_dump.reserve_dump_area_size = size;
 	return 1;
+1
arch/powerpc/kernel/ftrace.c → arch/powerpc/kernel/trace/ftrace.c
···
 #include <linux/init.h>
 #include <linux/list.h>
 
+#include <asm/asm-prototypes.h>
 #include <asm/cacheflush.h>
 #include <asm/code-patching.h>
 #include <asm/ftrace.h>
+1 -15
arch/powerpc/kernel/head_32.S
···
 	EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_EE)
 	EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_EE)
 	EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_EE)
-	EXCEPTION(0x2f00, MOLTrampoline, unknown_exception, EXC_XFER_EE_LITE)
-
-	.globl mol_trampoline
-	.set mol_trampoline, i0x2f00
-	EXPORT_SYMBOL(mol_trampoline)
+	EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_EE)
 
 	. = 0x3000
 
···
 	.globl swapper_pg_dir
 swapper_pg_dir:
 	.space	PGD_TABLE_SIZE
-
-	.globl intercept_table
-intercept_table:
-	.long 0, 0, i0x200, i0x300, i0x400, 0, i0x600, i0x700
-	.long i0x800, 0, 0, 0, 0, i0xd00, 0, 0
-	.long 0, 0, 0, i0x1300, 0, 0, 0, 0
-	.long 0, 0, 0, 0, 0, 0, 0, 0
-	.long 0, 0, 0, 0, 0, 0, 0, 0
-	.long 0, 0, 0, 0, 0, 0, 0, 0
-EXPORT_SYMBOL(intercept_table)
 
 /* Room for two PTE pointers, usually the kernel and current user pointers
  * to their respective root page table.
+2 -1
arch/powerpc/kernel/head_64.S
···
 	LOAD_REG_ADDR(r3,init_thread_union)
 
 	/* set up a stack pointer */
-	addi	r1,r3,THREAD_SIZE
+	LOAD_REG_IMMEDIATE(r1,THREAD_SIZE)
+	add	r1,r3,r1
 	li	r0,0
 	stdu	r0,-STACK_FRAME_OVERHEAD(r1)
+225 -60
arch/powerpc/kernel/idle_book3s.S
···
 #include <asm/kvm_book3s_asm.h>
 #include <asm/opal.h>
 #include <asm/cpuidle.h>
+#include <asm/exception-64s.h>
 #include <asm/book3s/64/mmu-hash.h>
 #include <asm/mmu.h>
···
 core_idle_lock_held:
 	HMT_LOW
 3:	lwz	r15,0(r14)
-	andi.	r15,r15,PNV_CORE_IDLE_LOCK_BIT
+	andis.	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	bne	3b
 	HMT_MEDIUM
 	lwarx	r15,0,r14
-	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
-	bne	core_idle_lock_held
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bne-	core_idle_lock_held
 	blr
 
 /*
···
  *
  * Address to 'rfid' to in r5
  */
-_GLOBAL(pnv_powersave_common)
+pnv_powersave_common:
 	/* Use r3 to pass state nap/sleep/winkle */
 	/* NAP is a state loss, we create a regs frame on the
 	 * stack, fill it up with the state we care about and
···
 	/* The following store to HSTATE_HWTHREAD_STATE(r13)  */
 	/* MUST occur in real mode, i.e. with the MMU off,    */
 	/* and the MMU must stay off until we clear this flag */
-	/* and test HSTATE_HWTHREAD_REQ(r13) in the system    */
-	/* reset interrupt vector in exceptions-64s.S.        */
+	/* and test HSTATE_HWTHREAD_REQ(r13) in               */
+	/* pnv_powersave_wakeup in this file.                 */
 	/* The reason is that another thread can switch the   */
 	/* MMU to a guest context whenever this flag is set   */
 	/* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on,    */
···
 	/* Sleep or winkle */
 	lbz	r7,PACA_THREAD_MASK(r13)
 	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
+	li	r5,0
+	beq	cr3,3f
+	lis	r5,PNV_CORE_IDLE_WINKLE_COUNT@h
+3:
 lwarx_loop1:
 	lwarx	r15,0,r14
 
-	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
-	bnel	core_idle_lock_held
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bnel-	core_idle_lock_held
 
+	add	r15,r15,r5			/* Add if winkle */
 	andc	r15,r15,r7			/* Clear thread bit */
 
-	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
+	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
 
 	/*
 	 * If cr0 = 0, then current thread is the last thread of the core entering
···
 	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
 
 fastsleep_workaround_at_entry:
-	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
+	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	stwcx.	r15,0,r14
 	bne-	lwarx_loop1
 	isync
···
 	li	r4,1
 	bl	opal_config_cpu_idle_state
 
-	/* Clear Lock bit */
-	li	r0,0
+	/* Unlock */
+	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	lwsync
-	stw	r0,0(r14)
+	stw	r15,0(r14)
 	b	common_enter
 
 enter_winkle:
···
 
 lwarx_loop_stop:
 	lwarx	r15,0,r14
-	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
-	bnel	core_idle_lock_held
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bnel-	core_idle_lock_held
 	andc	r15,r15,r7			/* Clear thread bit */
 
 	stwcx.	r15,0,r14
···
 	li	r4,1
 	b	pnv_powersave_common
 	/* No return */
+
 /*
- * Called from reset vector. Check whether we have woken up with
- * hypervisor state loss. If yes, restore hypervisor state and return
- * back to reset vector.
+ * On waking up from stop 0,1,2 with ESL=1 on POWER9 DD1,
+ * HSPRG0 will be set to the HSPRG0 value of one of the
+ * threads in this core. Thus the value we have in r13
+ * may not be this thread's paca pointer.
  *
- * r13 - Contents of HSPRG0
+ * Fortunately, the TIR remains invariant. Since this thread's
+ * paca pointer is recorded in all its sibling's paca, we can
+ * correctly recover this thread's paca pointer if we
+ * know the index of this thread in the core.
+ *
+ * This index can be obtained from the TIR.
+ *
+ * i.e, thread's position in the core = TIR.
+ * If this value is i, then this thread's paca is
+ * paca->thread_sibling_pacas[i].
+ */
+power9_dd1_recover_paca:
+	mfspr	r4, SPRN_TIR
+	/*
+	 * Since each entry in thread_sibling_pacas is 8 bytes
+	 * we need to left-shift by 3 bits. Thus r4 = i * 8
+	 */
+	sldi	r4, r4, 3
+	/* Get &paca->thread_sibling_pacas[0] in r5 */
+	ld	r5, PACA_SIBLING_PACA_PTRS(r13)
+	/* Load paca->thread_sibling_pacas[i] into r13 */
+	ldx	r13, r4, r5
+	SET_PACA(r13)
+	/*
+	 * Indicate that we have lost NVGPR state
+	 * which needs to be restored from the stack.
+	 */
+	li	r3, 1
+	stb	r0,PACA_NAPSTATELOST(r13)
+	blr
+
+/*
+ * Called from machine check handler for powersave wakeups.
+ * Low level machine check processing has already been done. Now just
+ * go through the wake up path to get everything in order.
+ *
+ * r3 - The original SRR1 value.
+ * Original SRR[01] have been clobbered.
+ * MSR_RI is clear.
+ */
+.global pnv_powersave_wakeup_mce
+pnv_powersave_wakeup_mce:
+	/* Set cr3 for pnv_powersave_wakeup */
+	rlwinm	r11,r3,47-31,30,31
+	cmpwi	cr3,r11,2
+
+	/*
+	 * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake
+	 * reason into SRR1, which allows reuse of the system reset wakeup
+	 * code without being mistaken for another type of wakeup.
+	 */
+	oris	r3,r3,SRR1_WAKEMCE_RESVD@h
+	mtspr	SPRN_SRR1,r3
+
+	b	pnv_powersave_wakeup
+
+/*
+ * Called from reset vector for powersave wakeups.
  * cr3 - set to gt if waking up with partial/complete hypervisor state loss
  */
-_GLOBAL(pnv_restore_hyp_resource)
+.global pnv_powersave_wakeup
+pnv_powersave_wakeup:
+	ld	r2, PACATOC(r13)
+
 BEGIN_FTR_SECTION
-	ld	r2,PACATOC(r13);
+	BEGIN_FTR_SECTION_NESTED(70)
+	bl	power9_dd1_recover_paca
+	END_FTR_SECTION_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70)
+	bl	pnv_restore_hyp_resource_arch300
+FTR_SECTION_ELSE
+	bl	pnv_restore_hyp_resource_arch207
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
+
+	li	r0,PNV_THREAD_RUNNING
+	stb	r0,PACA_THREAD_IDLE_STATE(r13)	/* Clear thread state */
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	li	r0,KVM_HWTHREAD_IN_KERNEL
+	stb	r0,HSTATE_HWTHREAD_STATE(r13)
+	/* Order setting hwthread_state vs. testing hwthread_req */
+	sync
+	lbz	r0,HSTATE_HWTHREAD_REQ(r13)
+	cmpwi	r0,0
+	beq	1f
+	b	kvm_start_guest
+1:
+#endif
+
+	/* Return SRR1 from power7_nap() */
+	mfspr	r3,SPRN_SRR1
+	blt	cr3,pnv_wakeup_noloss
+	b	pnv_wakeup_loss
+
+/*
+ * Check whether we have woken up with hypervisor state loss.
+ * If yes, restore hypervisor state and return back to link.
+ *
+ * cr3 - set to gt if waking up with partial/complete hypervisor state loss
+ */
+pnv_restore_hyp_resource_arch300:
 	/*
 	 * POWER ISA 3. Use PSSCR to determine if we
 	 * are waking up from deep idle state
···
 	 */
 	rldicl	r5,r5,4,60
 	cmpd	cr4,r5,r4
-	bge	cr4,pnv_wakeup_tb_loss
-	/*
-	 * Waking up without hypervisor state loss. Return to
-	 * reset vector
-	 */
-	blr
+	bge	cr4,pnv_wakeup_tb_loss /* returns to caller */
 
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+	blr	/* Waking up without hypervisor state loss. */
 
+/* Same calling convention as arch300 */
+pnv_restore_hyp_resource_arch207:
 	/*
 	 * POWER ISA 2.07 or less.
-	 * Check if last bit of HSPGR0 is set. This indicates whether we are
-	 * waking up from winkle.
+	 * Check if we slept with sleep or winkle.
 	 */
-	clrldi	r5,r13,63
-	clrrdi	r13,r13,1
-
-	/* Now that we are sure r13 is corrected, load TOC */
-	ld	r2,PACATOC(r13);
-	cmpwi	cr4,r5,1
-	mtspr	SPRN_HSPRG0,r13
-
-	lbz	r0,PACA_THREAD_IDLE_STATE(r13)
-	cmpwi	cr2,r0,PNV_THREAD_NAP
-	bgt	cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
+	lbz	r4,PACA_THREAD_IDLE_STATE(r13)
+	cmpwi	cr2,r4,PNV_THREAD_NAP
+	bgt	cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
 
 	/*
 	 * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
···
 	 */
 	bgt	cr3,.
 
-	blr	/* Return back to System Reset vector from where
-		   pnv_restore_hyp_resource was invoked */
+	blr	/* Waking up without hypervisor state loss */
 
 /*
  * Called if waking up from idle state which can cause either partial or
···
  *
  * r13 - PACA
  * cr3 - gt if waking up with partial/complete hypervisor state loss
+ *
+ * If ISA300:
  * cr4 - gt or eq if waking up from complete hypervisor state loss.
+ *
+ * If ISA207:
+ * r4 - PACA_THREAD_IDLE_STATE
  */
-_GLOBAL(pnv_wakeup_tb_loss)
+pnv_wakeup_tb_loss:
 	ld	r1,PACAR1(r13)
 	/*
 	 * Before entering any idle state, the NVGPRs are saved in the stack.
···
 	 * is required to return back to reset vector after hypervisor state
 	 * restore is complete.
 	 */
+	mr	r18,r4
 	mflr	r17
 	mfspr	r16,SPRN_SRR1
 BEGIN_FTR_SECTION
 	CHECK_HMI_INTERRUPT
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 
-	lbz	r7,PACA_THREAD_MASK(r13)
 	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
-lwarx_loop2:
-	lwarx	r15,0,r14
-	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
+	lbz	r7,PACA_THREAD_MASK(r13)
+
 	/*
+	 * Take the core lock to synchronize against other threads.
+	 *
 	 * Lock bit is set in one of the 2 cases-
 	 * a. In the sleep/winkle enter path, the last thread is executing
 	 * fastsleep workaround code.
···
 	 * workaround undo code or resyncing timebase or restoring context
 	 * In either case loop until the lock bit is cleared.
 	 */
-	bnel	core_idle_lock_held
+1:
+	lwarx	r15,0,r14
+	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	bnel-	core_idle_lock_held
+	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
+	stwcx.	r15,0,r14
+	bne-	1b
+	isync
 
-	cmpwi	cr2,r15,0
+	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
+	cmpwi	cr2,r9,0
 
 	/*
 	 * At this stage
 	 * cr2 - eq if first thread to wakeup in core
 	 * cr3-  gt if waking up with partial/complete hypervisor state loss
+	 * ISA300:
 	 * cr4 - gt or eq if waking up from complete hypervisor state loss.
 	 */
 
-	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
-	stwcx.	r15,0,r14
-	bne-	lwarx_loop2
-	isync
-
 BEGIN_FTR_SECTION
+	/*
+	 * Were we in winkle?
+	 * If yes, check if all threads were in winkle, decrement our
+	 * winkle count, set all thread winkle bits if all were in winkle.
+	 * Check if our thread has a winkle bit set, and set cr4 accordingly
+	 * (to match ISA300, above). Pseudo-code for core idle state
+	 * transitions for ISA207 is as follows (everything happens atomically
+	 * due to store conditional and/or lock bit):
+	 *
+	 * nap_idle() { }
+	 * nap_wake() { }
+	 *
+	 * sleep_idle()
+	 * {
+	 *	core_idle_state &= ~thread_in_core
+	 * }
+	 *
+	 * sleep_wake()
+	 * {
+	 *	bool first_in_core, first_in_subcore;
+	 *
+	 *	first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
+	 *	first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
+	 *
+	 *	core_idle_state |= thread_in_core;
+	 * }
+	 *
+	 * winkle_idle()
+	 * {
+	 *	core_idle_state &= ~thread_in_core;
+	 *	core_idle_state += 1 << WINKLE_COUNT_SHIFT;
+	 * }
+	 *
+	 * winkle_wake()
+	 * {
+	 *	bool first_in_core, first_in_subcore, winkle_state_lost;
+	 *
+	 *	first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
+	 *	first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
+	 *
+	 *	core_idle_state |= thread_in_core;
+	 *
+	 *	if ((core_idle_state & WINKLE_MASK) == (8 << WINKLE_COUNT_SIHFT))
+	 *		core_idle_state |= THREAD_WINKLE_BITS;
+	 *	core_idle_state -= 1 << WINKLE_COUNT_SHIFT;
+	 *
+	 *	winkle_state_lost = core_idle_state &
+	 *				(thread_in_core << WINKLE_THREAD_SHIFT);
+	 *	core_idle_state &= ~(thread_in_core << WINKLE_THREAD_SHIFT);
+	 * }
+	 *
+	 */
+	cmpwi	r18,PNV_THREAD_WINKLE
+	bne	2f
+	andis.	r9,r15,PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT@h
+	subis	r15,r15,PNV_CORE_IDLE_WINKLE_COUNT@h
+	beq	2f
+	ori	r15,r15,PNV_CORE_IDLE_THREAD_WINKLE_BITS /* all were winkle */
+2:
+	/* Shift thread bit to winkle mask, then test if this thread is set,
+	 * and remove it from the winkle bits */
+	slwi	r8,r7,8
+	and	r8,r8,r15
+	andc	r15,r15,r8
+	cmpwi	cr4,r8,1 /* cr4 will be gt if our bit is set, lt if not */
+
 	lbz	r4,PACA_SUBCORE_SIBLING_MASK(r13)
 	and	r4,r4,r15
 	cmpwi	r4,0	/* Check if first in subcore */
···
 	mtspr	SPRN_WORC,r4
 
 clear_lock:
-	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
+	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
 	lwsync
 	stw	r15,0(r14)
 
···
 
 	mtspr	SPRN_SRR1,r16
 	mtlr	r17
-	blr	/* Return back to System Reset vector from where
-		   pnv_restore_hyp_resource was invoked */
+	blr		/* return to pnv_powersave_wakeup */
 
 fastsleep_workaround_at_exit:
 	li	r3,1
···
  * R3 here contains the value that will be returned to the caller
  * of power7_nap.
  */
-_GLOBAL(pnv_wakeup_loss)
+.global pnv_wakeup_loss
+pnv_wakeup_loss:
 	ld	r1,PACAR1(r13)
 BEGIN_FTR_SECTION
 	CHECK_HMI_INTERRUPT
···
  * R3 here contains the value that will be returned to the caller
  * of power7_nap.
  */
-_GLOBAL(pnv_wakeup_noloss)
+pnv_wakeup_noloss:
 	lbz	r0,PACA_NAPSTATELOST(r13)
 	cmpwi	r0,0
 	bne	pnv_wakeup_loss
+50 -4
arch/powerpc/kernel/iommu.c
···
 	return tbl;
 }
 
-void iommu_free_table(struct iommu_table *tbl, const char *node_name)
+static void iommu_table_free(struct kref *kref)
 {
 	unsigned long bitmap_sz;
 	unsigned int order;
+	struct iommu_table *tbl;
 
-	if (!tbl)
-		return;
+	tbl = container_of(kref, struct iommu_table, it_kref);
+
+	if (tbl->it_ops->free)
+		tbl->it_ops->free(tbl);
 
 	if (!tbl->it_map) {
 		kfree(tbl);
···
 
 	/* verify that table contains no entries */
 	if (!bitmap_empty(tbl->it_map, tbl->it_size))
-		pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
+		pr_warn("%s: Unexpected TCEs\n", __func__);
 
 	/* calculate bitmap size in bytes */
 	bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
···
 	/* free table */
 	kfree(tbl);
 }
+
+struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
+{
+	if (kref_get_unless_zero(&tbl->it_kref))
+		return tbl;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_get);
+
+int iommu_tce_table_put(struct iommu_table *tbl)
+{
+	if (WARN_ON(!tbl))
+		return 0;
+
+	return kref_put(&tbl->it_kref, iommu_table_free);
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_put);
 
 /* Creates TCEs for a user provided buffer. The user buffer must be
  * contiguous real kernel storage (not vmalloc). The address passed here
···
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iommu_tce_xchg);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
+		unsigned long *hpa, enum dma_data_direction *direction)
+{
+	long ret;
+
+	ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+
+	if (!ret && ((*direction == DMA_FROM_DEVICE) ||
+			(*direction == DMA_BIDIRECTIONAL))) {
+		struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT);
+
+		if (likely(pg)) {
+			SetPageDirty(pg);
+		} else {
+			tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+			ret = -EFAULT;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm);
+#endif
 
 int iommu_take_ownership(struct iommu_table *tbl)
 {
-41
arch/powerpc/kernel/irq.c
···
 #include <asm/machdep.h>
 #include <asm/udbg.h>
 #include <asm/smp.h>
-#include <asm/debug.h>
 #include <asm/livepatch.h>
 #include <asm/asm-prototypes.h>
 
···
 
 	return sum;
 }
-
-#ifdef CONFIG_HOTPLUG_CPU
-void migrate_irqs(void)
-{
-	struct irq_desc *desc;
-	unsigned int irq;
-	static int warned;
-	cpumask_var_t mask;
-	const struct cpumask *map = cpu_online_mask;
-
-	alloc_cpumask_var(&mask, GFP_KERNEL);
-
-	for_each_irq_desc(irq, desc) {
-		struct irq_data *data;
-		struct irq_chip *chip;
-
-		data = irq_desc_get_irq_data(desc);
-		if (irqd_is_per_cpu(data))
-			continue;
-
-		chip = irq_data_get_irq_chip(data);
-
-		cpumask_and(mask, irq_data_get_affinity_mask(data), map);
-		if (cpumask_any(mask) >= nr_cpu_ids) {
-			pr_warn("Breaking affinity for irq %i\n", irq);
-			cpumask_copy(mask, map);
-		}
-		if (chip->irq_set_affinity)
-			chip->irq_set_affinity(data, mask, true);
-		else if (desc->action && !(warned++))
-			pr_err("Cannot set affinity for irq %i\n", irq);
-	}
-
-	free_cpumask_var(mask);
-
-	local_irq_enable();
-	mdelay(1);
-	local_irq_disable();
-}
-#endif
 
 static inline void check_stack_overflow(void)
 {
+104
arch/powerpc/kernel/kprobes-ftrace.c
···
+ /*
+  * Dynamic Ftrace based Kprobes Optimization
+  *
+  * This program is free software; you can redistribute it and/or modify
+  * it under the terms of the GNU General Public License as published by
+  * the Free Software Foundation; either version 2 of the License, or
+  * (at your option) any later version.
+  *
+  * This program is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  * GNU General Public License for more details.
+  *
+  * You should have received a copy of the GNU General Public License
+  * along with this program; if not, write to the Free Software
+  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+  *
+  * Copyright (C) Hitachi Ltd., 2012
+  * Copyright 2016 Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+  *                IBM Corporation
+  */
+ #include <linux/kprobes.h>
+ #include <linux/ptrace.h>
+ #include <linux/hardirq.h>
+ #include <linux/preempt.h>
+ #include <linux/ftrace.h>
+
+ static nokprobe_inline
+ int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+                       struct kprobe_ctlblk *kcb, unsigned long orig_nip)
+ {
+         /*
+          * Emulate singlestep (and also recover regs->nip)
+          * as if there is a nop
+          */
+         regs->nip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+         if (unlikely(p->post_handler)) {
+                 kcb->kprobe_status = KPROBE_HIT_SSDONE;
+                 p->post_handler(p, regs, 0);
+         }
+         __this_cpu_write(current_kprobe, NULL);
+         if (orig_nip)
+                 regs->nip = orig_nip;
+         return 1;
+ }
+
+ int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+                     struct kprobe_ctlblk *kcb)
+ {
+         if (kprobe_ftrace(p))
+                 return __skip_singlestep(p, regs, kcb, 0);
+         else
+                 return 0;
+ }
+ NOKPROBE_SYMBOL(skip_singlestep);
+
+ /* Ftrace callback handler for kprobes */
+ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
+                            struct ftrace_ops *ops, struct pt_regs *regs)
+ {
+         struct kprobe *p;
+         struct kprobe_ctlblk *kcb;
+         unsigned long flags;
+
+         /* Disable irq for emulating a breakpoint and avoiding preempt */
+         local_irq_save(flags);
+         hard_irq_disable();
+
+         p = get_kprobe((kprobe_opcode_t *)nip);
+         if (unlikely(!p) || kprobe_disabled(p))
+                 goto end;
+
+         kcb = get_kprobe_ctlblk();
+         if (kprobe_running()) {
+                 kprobes_inc_nmissed_count(p);
+         } else {
+                 unsigned long orig_nip = regs->nip;
+
+                 /*
+                  * On powerpc, NIP is *before* this instruction for the
+                  * pre handler
+                  */
+                 regs->nip -= MCOUNT_INSN_SIZE;
+
+                 __this_cpu_write(current_kprobe, p);
+                 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+                 if (!p->pre_handler || !p->pre_handler(p, regs))
+                         __skip_singlestep(p, regs, kcb, orig_nip);
+                 /*
+                  * If pre_handler returns !0, it sets regs->nip and
+                  * resets current kprobe.
+                  */
+         }
+ end:
+         local_irq_restore(flags);
+ }
+ NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+ int arch_prepare_kprobe_ftrace(struct kprobe *p)
+ {
+         p->ainsn.insn = NULL;
+         p->ainsn.boostable = -1;
+         return 0;
+ }
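The ftrace-based handler above never single-steps: because the probed instruction is the ftrace call site, it can be "executed" by simply advancing the saved NIP past it by `MCOUNT_INSN_SIZE`, optionally restoring the original NIP afterwards. A minimal standalone sketch of that arithmetic, assuming a 4-byte instruction size (the `fake_regs` struct and function name are illustrative, not kernel API):

```c
#include <assert.h>
#include <stdint.h>

/* MCOUNT_INSN_SIZE is one 4-byte instruction on powerpc64. */
#define MCOUNT_INSN_SIZE 4

/* Illustrative stand-in for the kernel's pt_regs. */
struct fake_regs {
	uint64_t nip;
};

/*
 * Mirrors the core of __skip_singlestep(): treat the probed ftrace call
 * as a nop by pointing NIP just past it; if the caller saved the original
 * NIP (non-zero orig_nip), restore that instead.
 */
static void skip_singlestep_sketch(struct fake_regs *regs,
				   uint64_t probe_addr,
				   uint64_t orig_nip)
{
	regs->nip = probe_addr + MCOUNT_INSN_SIZE;
	if (orig_nip)
		regs->nip = orig_nip;
}
```

In the real handler, the zero `orig_nip` case corresponds to `skip_singlestep()` being reached via the breakpoint path, where NIP should land after the probed instruction.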
+170 -44
arch/powerpc/kernel/kprobes.c
···
  #include <asm/code-patching.h>
  #include <asm/cacheflush.h>
  #include <asm/sstep.h>
+ #include <asm/sections.h>
  #include <linux/uaccess.h>

  DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
···
  struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};

- int __kprobes arch_prepare_kprobe(struct kprobe *p)
+ bool arch_within_kprobe_blacklist(unsigned long addr)
+ {
+         return (addr >= (unsigned long)__kprobes_text_start &&
+                 addr < (unsigned long)__kprobes_text_end) ||
+                (addr >= (unsigned long)_stext &&
+                 addr < (unsigned long)__head_end);
+ }
+
+ kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
+ {
+         kprobe_opcode_t *addr;
+
+ #ifdef PPC64_ELF_ABI_v2
+         /* PPC64 ABIv2 needs local entry point */
+         addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+         if (addr && !offset) {
+ #ifdef CONFIG_KPROBES_ON_FTRACE
+                 unsigned long faddr;
+                 /*
+                  * Per livepatch.h, ftrace location is always within the first
+                  * 16 bytes of a function on powerpc with -mprofile-kernel.
+                  */
+                 faddr = ftrace_location_range((unsigned long)addr,
+                                               (unsigned long)addr + 16);
+                 if (faddr)
+                         addr = (kprobe_opcode_t *)faddr;
+                 else
+ #endif
+                         addr = (kprobe_opcode_t *)ppc_function_entry(addr);
+         }
+ #elif defined(PPC64_ELF_ABI_v1)
+         /*
+          * 64bit powerpc ABIv1 uses function descriptors:
+          * - Check for the dot variant of the symbol first.
+          * - If that fails, try looking up the symbol provided.
+          *
+          * This ensures we always get to the actual symbol and not
+          * the descriptor.
+          *
+          * Also handle <module:symbol> format.
+          */
+         char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];
+         const char *modsym;
+         bool dot_appended = false;
+         if ((modsym = strchr(name, ':')) != NULL) {
+                 modsym++;
+                 if (*modsym != '\0' && *modsym != '.') {
+                         /* Convert to <module:.symbol> */
+                         strncpy(dot_name, name, modsym - name);
+                         dot_name[modsym - name] = '.';
+                         dot_name[modsym - name + 1] = '\0';
+                         strncat(dot_name, modsym,
+                                 sizeof(dot_name) - (modsym - name) - 2);
+                         dot_appended = true;
+                 } else {
+                         dot_name[0] = '\0';
+                         strncat(dot_name, name, sizeof(dot_name) - 1);
+                 }
+         } else if (name[0] != '.') {
+                 dot_name[0] = '.';
+                 dot_name[1] = '\0';
+                 strncat(dot_name, name, KSYM_NAME_LEN - 2);
+                 dot_appended = true;
+         } else {
+                 dot_name[0] = '\0';
+                 strncat(dot_name, name, KSYM_NAME_LEN - 1);
+         }
+         addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);
+         if (!addr && dot_appended) {
+                 /* Let's try the original non-dot symbol lookup */
+                 addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+         }
+ #else
+         addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+ #endif
+
+         return addr;
+ }
+
+ int arch_prepare_kprobe(struct kprobe *p)
  {
          int ret = 0;
          kprobe_opcode_t insn = *p->addr;
···
          p->ainsn.boostable = 0;
          return ret;
  }
+ NOKPROBE_SYMBOL(arch_prepare_kprobe);

- void __kprobes arch_arm_kprobe(struct kprobe *p)
+ void arch_arm_kprobe(struct kprobe *p)
  {
          *p->addr = BREAKPOINT_INSTRUCTION;
          flush_icache_range((unsigned long) p->addr,
                             (unsigned long) p->addr + sizeof(kprobe_opcode_t));
  }
+ NOKPROBE_SYMBOL(arch_arm_kprobe);

- void __kprobes arch_disarm_kprobe(struct kprobe *p)
+ void arch_disarm_kprobe(struct kprobe *p)
  {
          *p->addr = p->opcode;
          flush_icache_range((unsigned long) p->addr,
                             (unsigned long) p->addr + sizeof(kprobe_opcode_t));
  }
+ NOKPROBE_SYMBOL(arch_disarm_kprobe);

- void __kprobes arch_remove_kprobe(struct kprobe *p)
+ void arch_remove_kprobe(struct kprobe *p)
  {
          if (p->ainsn.insn) {
                  free_insn_slot(p->ainsn.insn, 0);
                  p->ainsn.insn = NULL;
          }
  }
+ NOKPROBE_SYMBOL(arch_remove_kprobe);

- static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+ static nokprobe_inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
  {
          enable_single_step(regs);
···
          regs->nip = (unsigned long)p->ainsn.insn;
  }

- static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+ static nokprobe_inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
  {
          kcb->prev_kprobe.kp = kprobe_running();
          kcb->prev_kprobe.status = kcb->kprobe_status;
          kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr;
  }

- static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+ static nokprobe_inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
  {
          __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
          kcb->kprobe_status = kcb->prev_kprobe.status;
          kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr;
  }

- static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+ static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
                                  struct kprobe_ctlblk *kcb)
  {
          __this_cpu_write(current_kprobe, p);
          kcb->kprobe_saved_msr = regs->msr;
  }

- void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
-                                       struct pt_regs *regs)
+ bool arch_function_offset_within_entry(unsigned long offset)
+ {
+ #ifdef PPC64_ELF_ABI_v2
+ #ifdef CONFIG_KPROBES_ON_FTRACE
+         return offset <= 16;
+ #else
+         return offset <= 8;
+ #endif
+ #else
+         return !offset;
+ #endif
+ }
+
+ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
  {
          ri->ret_addr = (kprobe_opcode_t *)regs->link;

          /* Replace the return addr with trampoline addr */
          regs->link = (unsigned long)kretprobe_trampoline;
  }
+ NOKPROBE_SYMBOL(arch_prepare_kretprobe);

- int __kprobes kprobe_handler(struct pt_regs *regs)
+ int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
+ {
+         int ret;
+         unsigned int insn = *p->ainsn.insn;
+
+         /* regs->nip is also adjusted if emulate_step returns 1 */
+         ret = emulate_step(regs, insn);
+         if (ret > 0) {
+                 /*
+                  * Once this instruction has been boosted
+                  * successfully, set the boostable flag
+                  */
+                 if (unlikely(p->ainsn.boostable == 0))
+                         p->ainsn.boostable = 1;
+         } else if (ret < 0) {
+                 /*
+                  * We don't allow kprobes on mtmsr(d)/rfi(d), etc.
+                  * So, we should never get here... but, its still
+                  * good to catch them, just in case...
+                  */
+                 printk("Can't step on instruction %x\n", insn);
+                 BUG();
+         } else if (ret == 0)
+                 /* This instruction can't be boosted */
+                 p->ainsn.boostable = -1;
+
+         return ret;
+ }
+ NOKPROBE_SYMBOL(try_to_emulate);
+
+ int kprobe_handler(struct pt_regs *regs)
  {
          struct kprobe *p;
          int ret = 0;
···
                   */
                  save_previous_kprobe(kcb);
                  set_current_kprobe(p, regs, kcb);
-                 kcb->kprobe_saved_msr = regs->msr;
                  kprobes_inc_nmissed_count(p);
                  prepare_singlestep(p, regs);
                  kcb->kprobe_status = KPROBE_REENTER;
+                 if (p->ainsn.boostable >= 0) {
+                         ret = try_to_emulate(p, regs);
+
+                         if (ret > 0) {
+                                 restore_previous_kprobe(kcb);
+                                 return 1;
+                         }
+                 }
                  return 1;
          } else {
                  if (*addr != BREAKPOINT_INSTRUCTION) {
···
                  }
                  p = __this_cpu_read(current_kprobe);
                  if (p->break_handler && p->break_handler(p, regs)) {
-                         goto ss_probe;
+                         if (!skip_singlestep(p, regs, kcb))
+                                 goto ss_probe;
+                         ret = 1;
                  }
          }
          goto no_kprobe;
···

  ss_probe:
          if (p->ainsn.boostable >= 0) {
-                 unsigned int insn = *p->ainsn.insn;
+                 ret = try_to_emulate(p, regs);

-                 /* regs->nip is also adjusted if emulate_step returns 1 */
-                 ret = emulate_step(regs, insn);
                  if (ret > 0) {
-                         /*
-                          * Once this instruction has been boosted
-                          * successfully, set the boostable flag
-                          */
-                         if (unlikely(p->ainsn.boostable == 0))
-                                 p->ainsn.boostable = 1;
-
                          if (p->post_handler)
                                  p->post_handler(p, regs, 0);
···
                          reset_current_kprobe();
                          preempt_enable_no_resched();
                          return 1;
-                 } else if (ret < 0) {
-                         /*
-                          * We don't allow kprobes on mtmsr(d)/rfi(d), etc.
-                          * So, we should never get here... but, its still
-                          * good to catch them, just in case...
-                          */
-                         printk("Can't step on instruction %x\n", insn);
-                         BUG();
-                 } else if (ret == 0)
-                         /* This instruction can't be boosted */
-                         p->ainsn.boostable = -1;
+                 }
          }
          prepare_singlestep(p, regs);
          kcb->kprobe_status = KPROBE_HIT_SS;
···
          preempt_enable_no_resched();
          return ret;
  }
+ NOKPROBE_SYMBOL(kprobe_handler);

  /*
   * Function return probe trampoline:
···
  /*
   * Called when the probe at kretprobe trampoline is hit
   */
- static int __kprobes trampoline_probe_handler(struct kprobe *p,
-                                               struct pt_regs *regs)
+ static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
  {
          struct kretprobe_instance *ri = NULL;
          struct hlist_head *head, empty_rp;
···
           */
          return 1;
  }
+ NOKPROBE_SYMBOL(trampoline_probe_handler);

  /*
   * Called after single-stepping.  p->addr is the address of the
···
   * single-stepped a copy of the instruction.  The address of this
   * copy is p->ainsn.insn.
   */
- int __kprobes kprobe_post_handler(struct pt_regs *regs)
+ int kprobe_post_handler(struct pt_regs *regs)
  {
          struct kprobe *cur = kprobe_running();
          struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
···

          return 1;
  }
+ NOKPROBE_SYMBOL(kprobe_post_handler);

- int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+ int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
  {
          struct kprobe *cur = kprobe_running();
          struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
···
          }
          return 0;
  }
+ NOKPROBE_SYMBOL(kprobe_fault_handler);

  unsigned long arch_deref_entry_point(void *entry)
  {
          return ppc_global_function_entry(entry);
  }
+ NOKPROBE_SYMBOL(arch_deref_entry_point);

- int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
  {
          struct jprobe *jp = container_of(p, struct jprobe, kp);
          struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
···

          return 1;
  }
+ NOKPROBE_SYMBOL(setjmp_pre_handler);

- void __used __kprobes jprobe_return(void)
+ void __used jprobe_return(void)
  {
          asm volatile("trap" ::: "memory");
  }
+ NOKPROBE_SYMBOL(jprobe_return);

- static void __used __kprobes jprobe_return_end(void)
+ static void __used jprobe_return_end(void)
  {
- };
+ }
+ NOKPROBE_SYMBOL(jprobe_return_end);

- int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
  {
          struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
···
          preempt_enable_no_resched();
          return 1;
  }
+ NOKPROBE_SYMBOL(longjmp_break_handler);

  static struct kprobe trampoline_p = {
          .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
···
          return register_kprobe(&trampoline_p);
  }

- int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+ int arch_trampoline_kprobe(struct kprobe *p)
  {
          if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
                  return 1;

          return 0;
  }
+ NOKPROBE_SYMBOL(arch_trampoline_kprobe);
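For the 64-bit ELF ABIv1 case, `kprobe_lookup_name()` in the hunk above mangles the requested symbol into its dot-prefixed variant (the actual code entry point, as opposed to the function descriptor), including the `<module:symbol>` form. A standalone re-implementation of just that string mangling, for illustration (the function name is hypothetical, and the buffer length stands in for the kernel's `MODULE_NAME_LEN + 1 + KSYM_NAME_LEN`):

```c
#include <assert.h>
#include <string.h>

/*
 * Sketch of the ABIv1 dot-name mangling performed by kprobe_lookup_name():
 *   "symbol"          -> ".symbol"
 *   "module:symbol"   -> "module:.symbol"
 *   ".symbol" and "module:.symbol" are left unchanged.
 * 'len' is the size of the dot_name buffer.
 */
static void make_dot_name(const char *name, char *dot_name, size_t len)
{
	const char *modsym = strchr(name, ':');

	if (modsym) {
		modsym++;
		if (*modsym != '\0' && *modsym != '.') {
			/* Convert to <module:.symbol> */
			size_t pfx = (size_t)(modsym - name);

			memcpy(dot_name, name, pfx);
			dot_name[pfx] = '.';
			dot_name[pfx + 1] = '\0';
			strncat(dot_name, modsym, len - pfx - 2);
			return;
		}
	} else if (name[0] != '.') {
		dot_name[0] = '.';
		dot_name[1] = '\0';
		strncat(dot_name, name, len - 2);
		return;
	}
	/* Already dot-prefixed (or <module:.symbol>): copy as-is. */
	dot_name[0] = '\0';
	strncat(dot_name, name, len - 1);
}
```

The kernel version additionally falls back to the original, non-dot name if the dot-prefixed lookup fails, since only ABIv1 text symbols carry the dot prefix.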
+15 -3
arch/powerpc/kernel/mce.c
···
  {
          int index;

+         add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
          /*
           * For now just print it to console.
           * TODO: log this error event to FSP or nvram.
···
          while (__this_cpu_read(mce_queue_count) > 0) {
                  index = __this_cpu_read(mce_queue_count) - 1;
                  machine_check_print_event_info(
-                                 this_cpu_ptr(&mce_event_queue[index]));
+                                 this_cpu_ptr(&mce_event_queue[index]), false);
                  __this_cpu_dec(mce_queue_count);
          }
  }

- void machine_check_print_event_info(struct machine_check_event *evt)
+ void machine_check_print_event_info(struct machine_check_event *evt,
+                                     bool user_mode)
  {
          const char *level, *sevstr, *subtype;
          static const char *mc_ue_types[] = {
···

          printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
                 evt->disposition == MCE_DISPOSITION_RECOVERED ?
-                "Recovered" : "[Not recovered");
+                "Recovered" : "Not recovered");
+
+         if (user_mode) {
+                 printk("%s  NIP: [%016llx] PID: %d Comm: %s\n", level,
+                         evt->srr0, current->pid, current->comm);
+         } else {
+                 printk("%s  NIP [%016llx]: %pS\n", level, evt->srr0,
+                         (void *)evt->srr0);
+         }
+
          printk("%s  Initiator: %s\n", level,
                 evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
          switch (evt->error_type) {
+370 -410
arch/powerpc/kernel/mce_power.c
···
  void __flush_tlb_power9(unsigned int action)
  {
-         if (radix_enabled())
-                 flush_tlb_206(POWER9_TLB_SETS_RADIX, action);
+         unsigned int num_sets;

-         flush_tlb_206(POWER9_TLB_SETS_HASH, action);
+         if (radix_enabled())
+                 num_sets = POWER9_TLB_SETS_RADIX;
+         else
+                 num_sets = POWER9_TLB_SETS_HASH;
+
+         flush_tlb_206(num_sets, action);
  }

···
          return 0;
  }

- static int mce_handle_flush_derrors(uint64_t dsisr, uint64_t slb, uint64_t tlb, uint64_t erat)
+ #define SRR1_MC_LOADSTORE(srr1)   ((srr1) & PPC_BIT(42))
+
+ struct mce_ierror_table {
+         unsigned long srr1_mask;
+         unsigned long srr1_value;
+         bool nip_valid; /* nip is a valid indicator of faulting address */
+         unsigned int error_type;
+         unsigned int error_subtype;
+         unsigned int initiator;
+         unsigned int severity;
+ };
+
+ static const struct mce_ierror_table mce_p7_ierror_table[] = {
+ { 0x00000000001c0000, 0x0000000000040000, true,
+   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000001c0000, 0x0000000000080000, true,
+   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000001c0000, 0x00000000000c0000, true,
+   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000001c0000, 0x0000000000100000, true,
+   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000001c0000, 0x0000000000140000, true,
+   MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000001c0000, 0x0000000000180000, true,
+   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000001c0000, 0x00000000001c0000, true,
+   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0, 0, 0, 0, 0, 0 } };
+
+ static const struct mce_ierror_table mce_p8_ierror_table[] = {
+ { 0x00000000081c0000, 0x0000000000040000, true,
+   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000000080000, true,
+   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x00000000000c0000, true,
+   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000000100000, true,
+   MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000000140000, true,
+   MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000000180000, true,
+   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x00000000001c0000, true,
+   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000008000000, true,
+   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000008040000, true,
+   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0, 0, 0, 0, 0, 0 } };
+
+ static const struct mce_ierror_table mce_p9_ierror_table[] = {
+ { 0x00000000081c0000, 0x0000000000040000, true,
+   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000000080000, true,
+   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x00000000000c0000, true,
+   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000000100000, true,
+   MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000000140000, true,
+   MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000000180000, true,
+   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000008000000, true,
+   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000008040000, true,
+   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x00000000080c0000, true,
+   MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_IFETCH,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000008100000, true,
+   MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0x00000000081c0000, 0x0000000008140000, false,
+   MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_STORE,
+   MCE_INITIATOR_CPU,  MCE_SEV_FATAL, }, /* ASYNC is fatal */
+ { 0x00000000081c0000, 0x0000000008180000, false,
+   MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT,
+   MCE_INITIATOR_CPU,  MCE_SEV_FATAL, }, /* ASYNC is fatal */
+ { 0x00000000081c0000, 0x00000000081c0000, true,
+   MCE_ERROR_TYPE_RA,  MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN,
+   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+ { 0, 0, 0, 0, 0, 0 } };
+
+ struct mce_derror_table {
+         unsigned long dsisr_value;
+         bool dar_valid; /* dar is a valid indicator of faulting address */
+         unsigned int error_type;
+         unsigned int error_subtype;
+         unsigned int initiator;
+         unsigned int severity;
+ };
+
+ static const struct mce_derror_table mce_p7_derror_table[] = {
+ { 0x00008000, false,
+   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00004000, true,
+   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000800, true,
+   MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000400, true,
+   MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000100, true,
+   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000080, true,
+   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000040, true,
+   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0, false, 0, 0, 0, 0 } };
+
+ static const struct mce_derror_table mce_p8_derror_table[] = {
+ { 0x00008000, false,
+   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00004000, true,
+   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00002000, true,
+   MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00001000, true,
+   MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000800, true,
+   MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000400, true,
+   MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000200, true,
+   MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000100, true,
+   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000080, true,
+   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0, false, 0, 0, 0, 0 } };
+
+ static const struct mce_derror_table mce_p9_derror_table[] = {
+ { 0x00008000, false,
+   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_LOAD_STORE,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00004000, true,
+   MCE_ERROR_TYPE_UE,   MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00002000, true,
+   MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00001000, true,
+   MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000800, true,
+   MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000400, true,
+   MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000200, false,
+   MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000100, true,
+   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000080, true,
+   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000040, true,
+   MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000020, false,
+   MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000010, false,
+   MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0x00000008, false,
+   MCE_ERROR_TYPE_RA,   MCE_RA_ERROR_LOAD_STORE_FOREIGN,
+   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+ { 0, false, 0, 0, 0, 0 } };
+
+ static int mce_handle_ierror(struct pt_regs *regs,
+                 const struct mce_ierror_table table[],
+                 struct mce_error_info *mce_err, uint64_t *addr)
  {
-         if ((dsisr & slb) && mce_flush(MCE_FLUSH_SLB))
-                 dsisr &= ~slb;
-         if ((dsisr & erat) && mce_flush(MCE_FLUSH_ERAT))
-                 dsisr &= ~erat;
-         if ((dsisr & tlb) && mce_flush(MCE_FLUSH_TLB))
-                 dsisr &= ~tlb;
-         /* Any other errors we don't understand? */
-         if (dsisr)
-                 return 0;
-         return 1;
- }
+         uint64_t srr1 = regs->msr;
+         int handled = 0;
+         int i;

- static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
- {
-         long handled = 1;
+         *addr = 0;

-         /*
-          * flush and reload SLBs for SLB errors and flush TLBs for TLB errors.
-          * reset the error bits whenever we handle them so that at the end
-          * we can check whether we handled all of them or not.
-          * */
- #ifdef CONFIG_PPC_STD_MMU_64
-         if (dsisr & slb_error_bits) {
-                 flush_and_reload_slb();
-                 /* reset error bits */
-                 dsisr &= ~(slb_error_bits);
-         }
-         if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
-                 if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
-                         cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL);
-                 /* reset error bits */
-                 dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB;
-         }
- #endif
-         /* Any other errors we don't understand? */
-         if (dsisr & 0xffffffffUL)
-                 handled = 0;
+         for (i = 0; table[i].srr1_mask; i++) {
+                 if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
+                         continue;

-         return handled;
- }
-
- static long mce_handle_derror_p7(uint64_t dsisr)
- {
-         return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS);
- }
-
- static long mce_handle_common_ierror(uint64_t srr1)
- {
-         long handled = 0;
-
-         switch (P7_SRR1_MC_IFETCH(srr1)) {
-         case 0:
-                 break;
- #ifdef CONFIG_PPC_STD_MMU_64
-         case P7_SRR1_MC_IFETCH_SLB_PARITY:
-         case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
-                 /* flush and reload SLBs for SLB errors. */
-                 flush_and_reload_slb();
-                 handled = 1;
-                 break;
-         case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
-                 if (cur_cpu_spec && cur_cpu_spec->flush_tlb) {
-                         cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL);
-                         handled = 1;
+                 /* attempt to correct the error */
+                 switch (table[i].error_type) {
+                 case MCE_ERROR_TYPE_SLB:
+                         handled = mce_flush(MCE_FLUSH_SLB);
+                         break;
+                 case MCE_ERROR_TYPE_ERAT:
+                         handled = mce_flush(MCE_FLUSH_ERAT);
+                         break;
+                 case MCE_ERROR_TYPE_TLB:
+                         handled = mce_flush(MCE_FLUSH_TLB);
+                         break;
                  }
-                 break;
- #endif
-         default:
-                 break;
+
+                 /* now fill in mce_error_info */
+                 mce_err->error_type = table[i].error_type;
+                 switch (table[i].error_type) {
+                 case MCE_ERROR_TYPE_UE:
+                         mce_err->u.ue_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_SLB:
+                         mce_err->u.slb_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_ERAT:
+                         mce_err->u.erat_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_TLB:
+                         mce_err->u.tlb_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_USER:
+                         mce_err->u.user_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_RA:
+                         mce_err->u.ra_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_LINK:
+                         mce_err->u.link_error_type = table[i].error_subtype;
+                         break;
+                 }
+                 mce_err->severity = table[i].severity;
+                 mce_err->initiator = table[i].initiator;
+                 if (table[i].nip_valid)
+                         *addr = regs->nip;
+                 return handled;
          }

-         return handled;
+         mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+         mce_err->severity = MCE_SEV_ERROR_SYNC;
+         mce_err->initiator = MCE_INITIATOR_CPU;
+
+         return 0;
  }

- static long mce_handle_ierror_p7(uint64_t srr1)
+ static int mce_handle_derror(struct pt_regs *regs,
+                 const struct mce_derror_table table[],
+                 struct mce_error_info *mce_err, uint64_t *addr)
  {
-         long handled = 0;
+         uint64_t dsisr = regs->dsisr;
+         int handled = 0;
+         int found = 0;
+         int i;

-         handled = mce_handle_common_ierror(srr1);
+         *addr = 0;

- #ifdef CONFIG_PPC_STD_MMU_64
-         if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
-                 flush_and_reload_slb();
-                 handled = 1;
+         for (i = 0; table[i].dsisr_value; i++) {
+                 if (!(dsisr & table[i].dsisr_value))
+                         continue;
+
+                 /* attempt to correct the error */
+                 switch (table[i].error_type) {
+                 case MCE_ERROR_TYPE_SLB:
+                         if (mce_flush(MCE_FLUSH_SLB))
+                                 handled = 1;
+                         break;
+                 case MCE_ERROR_TYPE_ERAT:
+                         if (mce_flush(MCE_FLUSH_ERAT))
+                                 handled = 1;
+                         break;
+                 case MCE_ERROR_TYPE_TLB:
+                         if (mce_flush(MCE_FLUSH_TLB))
+                                 handled = 1;
+                         break;
+                 }
+
+                 /*
+                  * Attempt to handle multiple conditions, but only return
+                  * one. Ensure uncorrectable errors are first in the table
+                  * to match.
+                  */
+                 if (found)
+                         continue;
+
+                 /* now fill in mce_error_info */
+                 mce_err->error_type = table[i].error_type;
+                 switch (table[i].error_type) {
+                 case MCE_ERROR_TYPE_UE:
+                         mce_err->u.ue_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_SLB:
+                         mce_err->u.slb_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_ERAT:
+                         mce_err->u.erat_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_TLB:
+                         mce_err->u.tlb_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_USER:
+                         mce_err->u.user_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_RA:
+                         mce_err->u.ra_error_type = table[i].error_subtype;
+                         break;
+                 case MCE_ERROR_TYPE_LINK:
+                         mce_err->u.link_error_type = table[i].error_subtype;
+                         break;
+                 }
+                 mce_err->severity = table[i].severity;
+                 mce_err->initiator = table[i].initiator;
+                 if (table[i].dar_valid)
+                         *addr = regs->dar;
+
+                 found = 1;
          }
- #endif
-         return handled;
- }

- static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1)
- {
-         switch (P7_SRR1_MC_IFETCH(srr1)) {
-         case P7_SRR1_MC_IFETCH_SLB_PARITY:
-                 mce_err->error_type = MCE_ERROR_TYPE_SLB;
-                 mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
-                 break;
-         case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
-                 mce_err->error_type = MCE_ERROR_TYPE_SLB;
-                 mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
-                 break;
-         case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
-                 mce_err->error_type = MCE_ERROR_TYPE_TLB;
-                 mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
-                 break;
-         case P7_SRR1_MC_IFETCH_UE:
-         case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL:
-                 mce_err->error_type = MCE_ERROR_TYPE_UE;
-                 mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
-                 break;
-         case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD:
-                 mce_err->error_type = MCE_ERROR_TYPE_UE;
-                 mce_err->u.ue_error_type =
-                                 MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
-                 break;
-         }
- }
+         if (found)
+                 return handled;

- static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1)
- {
-         mce_get_common_ierror(mce_err, srr1);
-         if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
-                 mce_err->error_type = MCE_ERROR_TYPE_SLB;
-                 mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
-         }
- }
+         mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+         mce_err->severity = MCE_SEV_ERROR_SYNC;
+         mce_err->initiator = MCE_INITIATOR_CPU;

- static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr)
- {
-         if (dsisr & P7_DSISR_MC_UE) {
-                 mce_err->error_type = MCE_ERROR_TYPE_UE;
-                 mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
-         } else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) {
-                 mce_err->error_type = MCE_ERROR_TYPE_UE;
-                 mce_err->u.ue_error_type =
-                                 MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
-         } else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) {
-                 mce_err->error_type = MCE_ERROR_TYPE_ERAT;
-                 mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
-         } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) {
-                 mce_err->error_type = MCE_ERROR_TYPE_SLB;
-                 mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
-         } else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) {
-                 mce_err->error_type = MCE_ERROR_TYPE_SLB;
-                 mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
-         } else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
-                 mce_err->error_type = MCE_ERROR_TYPE_TLB;
-                 mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
-         } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) {
-                 mce_err->error_type = MCE_ERROR_TYPE_SLB;
-                 mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
-         }
+         return 0;
  }

  static long mce_handle_ue_error(struct pt_regs *regs)
···
          return handled;
  }

- long __machine_check_early_realmode_p7(struct pt_regs *regs)
+ static long mce_handle_error(struct pt_regs *regs,
+                 const struct mce_derror_table dtable[],
+                 const struct mce_ierror_table itable[])
  {
-         uint64_t srr1, nip, addr;
-         long handled = 1;
-         struct mce_error_info mce_error_info = { 0 };
+         struct mce_error_info mce_err = { 0 };
+         uint64_t addr;
+         uint64_t srr1 = regs->msr;
+         long handled;

-         mce_error_info.severity = MCE_SEV_ERROR_SYNC;
-         mce_error_info.initiator = MCE_INITIATOR_CPU;
+         if (SRR1_MC_LOADSTORE(srr1))
+                 handled = mce_handle_derror(regs, dtable, &mce_err, &addr);
+         else
+                 handled = mce_handle_ierror(regs, itable, &mce_err, &addr);

-         srr1 = regs->msr;
-         nip = regs->nip;
-
-         /*
-          * Handle memory errors depending whether this was a load/store or
-          * ifetch exception. Also, populate the mce error_type and
-          * type-specific error_type from either SRR1 or DSISR, depending
-          * whether this was a load/store or ifetch exception
-          */
-         if (P7_SRR1_MC_LOADSTORE(srr1)) {
-                 handled = mce_handle_derror_p7(regs->dsisr);
-                 mce_get_derror_p7(&mce_error_info, regs->dsisr);
-                 addr = regs->dar;
-         } else {
-                 handled = mce_handle_ierror_p7(srr1);
-                 mce_get_ierror_p7(&mce_error_info, srr1);
-                 addr = regs->nip;
-         }
-
-         /* Handle UE error.
*/ 562 - if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) 337 + if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) 563 338 handled = mce_handle_ue_error(regs); 564 339 565 - save_mce_event(regs, handled, &mce_error_info, nip, addr); 340 + save_mce_event(regs, handled, &mce_err, regs->nip, addr); 341 + 566 342 return handled; 567 343 } 568 344 569 - static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1) 345 + long __machine_check_early_realmode_p7(struct pt_regs *regs) 570 346 { 571 - mce_get_common_ierror(mce_err, srr1); 572 - if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { 573 - mce_err->error_type = MCE_ERROR_TYPE_ERAT; 574 - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; 575 - } 576 - } 347 + /* P7 DD1 leaves top bits of DSISR undefined */ 348 + regs->dsisr &= 0x0000ffff; 577 349 578 - static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr) 579 - { 580 - mce_get_derror_p7(mce_err, dsisr); 581 - if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) { 582 - mce_err->error_type = MCE_ERROR_TYPE_ERAT; 583 - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; 584 - } 585 - } 586 - 587 - static long mce_handle_ierror_p8(uint64_t srr1) 588 - { 589 - long handled = 0; 590 - 591 - handled = mce_handle_common_ierror(srr1); 592 - 593 - #ifdef CONFIG_PPC_STD_MMU_64 594 - if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { 595 - flush_and_reload_slb(); 596 - handled = 1; 597 - } 598 - #endif 599 - return handled; 600 - } 601 - 602 - static long mce_handle_derror_p8(uint64_t dsisr) 603 - { 604 - return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS); 350 + return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table); 605 351 } 606 352 607 353 long __machine_check_early_realmode_p8(struct pt_regs *regs) 608 354 { 609 - uint64_t srr1, nip, addr; 610 - long handled = 1; 611 - struct mce_error_info mce_error_info = { 0 }; 612 - 613 - mce_error_info.severity = MCE_SEV_ERROR_SYNC; 614 - 
mce_error_info.initiator = MCE_INITIATOR_CPU; 615 - 616 - srr1 = regs->msr; 617 - nip = regs->nip; 618 - 619 - if (P7_SRR1_MC_LOADSTORE(srr1)) { 620 - handled = mce_handle_derror_p8(regs->dsisr); 621 - mce_get_derror_p8(&mce_error_info, regs->dsisr); 622 - addr = regs->dar; 623 - } else { 624 - handled = mce_handle_ierror_p8(srr1); 625 - mce_get_ierror_p8(&mce_error_info, srr1); 626 - addr = regs->nip; 627 - } 628 - 629 - /* Handle UE error. */ 630 - if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) 631 - handled = mce_handle_ue_error(regs); 632 - 633 - save_mce_event(regs, handled, &mce_error_info, nip, addr); 634 - return handled; 635 - } 636 - 637 - static int mce_handle_derror_p9(struct pt_regs *regs) 638 - { 639 - uint64_t dsisr = regs->dsisr; 640 - 641 - return mce_handle_flush_derrors(dsisr, 642 - P9_DSISR_MC_SLB_PARITY_MFSLB | 643 - P9_DSISR_MC_SLB_MULTIHIT_MFSLB, 644 - 645 - P9_DSISR_MC_TLB_MULTIHIT_MFTLB, 646 - 647 - P9_DSISR_MC_ERAT_MULTIHIT); 648 - } 649 - 650 - static int mce_handle_ierror_p9(struct pt_regs *regs) 651 - { 652 - uint64_t srr1 = regs->msr; 653 - 654 - switch (P9_SRR1_MC_IFETCH(srr1)) { 655 - case P9_SRR1_MC_IFETCH_SLB_PARITY: 656 - case P9_SRR1_MC_IFETCH_SLB_MULTIHIT: 657 - return mce_flush(MCE_FLUSH_SLB); 658 - case P9_SRR1_MC_IFETCH_TLB_MULTIHIT: 659 - return mce_flush(MCE_FLUSH_TLB); 660 - case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT: 661 - return mce_flush(MCE_FLUSH_ERAT); 662 - default: 663 - return 0; 664 - } 665 - } 666 - 667 - static void mce_get_derror_p9(struct pt_regs *regs, 668 - struct mce_error_info *mce_err, uint64_t *addr) 669 - { 670 - uint64_t dsisr = regs->dsisr; 671 - 672 - mce_err->severity = MCE_SEV_ERROR_SYNC; 673 - mce_err->initiator = MCE_INITIATOR_CPU; 674 - 675 - if (dsisr & P9_DSISR_MC_USER_TLBIE) 676 - *addr = regs->nip; 677 - else 678 - *addr = regs->dar; 679 - 680 - if (dsisr & P9_DSISR_MC_UE) { 681 - mce_err->error_type = MCE_ERROR_TYPE_UE; 682 - mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; 683 - } else 
if (dsisr & P9_DSISR_MC_UE_TABLEWALK) { 684 - mce_err->error_type = MCE_ERROR_TYPE_UE; 685 - mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; 686 - } else if (dsisr & P9_DSISR_MC_LINK_LOAD_TIMEOUT) { 687 - mce_err->error_type = MCE_ERROR_TYPE_LINK; 688 - mce_err->u.link_error_type = MCE_LINK_ERROR_LOAD_TIMEOUT; 689 - } else if (dsisr & P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT) { 690 - mce_err->error_type = MCE_ERROR_TYPE_LINK; 691 - mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT; 692 - } else if (dsisr & P9_DSISR_MC_ERAT_MULTIHIT) { 693 - mce_err->error_type = MCE_ERROR_TYPE_ERAT; 694 - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; 695 - } else if (dsisr & P9_DSISR_MC_TLB_MULTIHIT_MFTLB) { 696 - mce_err->error_type = MCE_ERROR_TYPE_TLB; 697 - mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; 698 - } else if (dsisr & P9_DSISR_MC_USER_TLBIE) { 699 - mce_err->error_type = MCE_ERROR_TYPE_USER; 700 - mce_err->u.user_error_type = MCE_USER_ERROR_TLBIE; 701 - } else if (dsisr & P9_DSISR_MC_SLB_PARITY_MFSLB) { 702 - mce_err->error_type = MCE_ERROR_TYPE_SLB; 703 - mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; 704 - } else if (dsisr & P9_DSISR_MC_SLB_MULTIHIT_MFSLB) { 705 - mce_err->error_type = MCE_ERROR_TYPE_SLB; 706 - mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; 707 - } else if (dsisr & P9_DSISR_MC_RA_LOAD) { 708 - mce_err->error_type = MCE_ERROR_TYPE_RA; 709 - mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD; 710 - } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK) { 711 - mce_err->error_type = MCE_ERROR_TYPE_RA; 712 - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE; 713 - } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK_FOREIGN) { 714 - mce_err->error_type = MCE_ERROR_TYPE_RA; 715 - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; 716 - } else if (dsisr & P9_DSISR_MC_RA_FOREIGN) { 717 - mce_err->error_type = MCE_ERROR_TYPE_RA; 718 - mce_err->u.ra_error_type = 
MCE_RA_ERROR_LOAD_STORE_FOREIGN; 719 - } 720 - } 721 - 722 - static void mce_get_ierror_p9(struct pt_regs *regs, 723 - struct mce_error_info *mce_err, uint64_t *addr) 724 - { 725 - uint64_t srr1 = regs->msr; 726 - 727 - switch (P9_SRR1_MC_IFETCH(srr1)) { 728 - case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE: 729 - case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT: 730 - mce_err->severity = MCE_SEV_FATAL; 731 - break; 732 - default: 733 - mce_err->severity = MCE_SEV_ERROR_SYNC; 734 - break; 735 - } 736 - 737 - mce_err->initiator = MCE_INITIATOR_CPU; 738 - 739 - *addr = regs->nip; 740 - 741 - switch (P9_SRR1_MC_IFETCH(srr1)) { 742 - case P9_SRR1_MC_IFETCH_UE: 743 - mce_err->error_type = MCE_ERROR_TYPE_UE; 744 - mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH; 745 - break; 746 - case P9_SRR1_MC_IFETCH_SLB_PARITY: 747 - mce_err->error_type = MCE_ERROR_TYPE_SLB; 748 - mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; 749 - break; 750 - case P9_SRR1_MC_IFETCH_SLB_MULTIHIT: 751 - mce_err->error_type = MCE_ERROR_TYPE_SLB; 752 - mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; 753 - break; 754 - case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT: 755 - mce_err->error_type = MCE_ERROR_TYPE_ERAT; 756 - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; 757 - break; 758 - case P9_SRR1_MC_IFETCH_TLB_MULTIHIT: 759 - mce_err->error_type = MCE_ERROR_TYPE_TLB; 760 - mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; 761 - break; 762 - case P9_SRR1_MC_IFETCH_UE_TLB_RELOAD: 763 - mce_err->error_type = MCE_ERROR_TYPE_UE; 764 - mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; 765 - break; 766 - case P9_SRR1_MC_IFETCH_LINK_TIMEOUT: 767 - mce_err->error_type = MCE_ERROR_TYPE_LINK; 768 - mce_err->u.link_error_type = MCE_LINK_ERROR_IFETCH_TIMEOUT; 769 - break; 770 - case P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT: 771 - mce_err->error_type = MCE_ERROR_TYPE_LINK; 772 - mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT; 773 - break; 774 - case P9_SRR1_MC_IFETCH_RA: 775 - 
mce_err->error_type = MCE_ERROR_TYPE_RA; 776 - mce_err->u.ra_error_type = MCE_RA_ERROR_IFETCH; 777 - break; 778 - case P9_SRR1_MC_IFETCH_RA_TABLEWALK: 779 - mce_err->error_type = MCE_ERROR_TYPE_RA; 780 - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH; 781 - break; 782 - case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE: 783 - mce_err->error_type = MCE_ERROR_TYPE_RA; 784 - mce_err->u.ra_error_type = MCE_RA_ERROR_STORE; 785 - break; 786 - case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT: 787 - mce_err->error_type = MCE_ERROR_TYPE_LINK; 788 - mce_err->u.link_error_type = MCE_LINK_ERROR_STORE_TIMEOUT; 789 - break; 790 - case P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN: 791 - mce_err->error_type = MCE_ERROR_TYPE_RA; 792 - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN; 793 - break; 794 - default: 795 - break; 796 - } 355 + return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table); 797 356 } 798 357 799 358 long __machine_check_early_realmode_p9(struct pt_regs *regs) 800 359 { 801 - uint64_t nip, addr; 802 - long handled; 803 - struct mce_error_info mce_error_info = { 0 }; 804 - 805 - nip = regs->nip; 806 - 807 - if (P9_SRR1_MC_LOADSTORE(regs->msr)) { 808 - handled = mce_handle_derror_p9(regs); 809 - mce_get_derror_p9(regs, &mce_error_info, &addr); 810 - } else { 811 - handled = mce_handle_ierror_p9(regs); 812 - mce_get_ierror_p9(regs, &mce_error_info, &addr); 813 - } 814 - 815 - /* Handle UE error. */ 816 - if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) 817 - handled = mce_handle_ue_error(regs); 818 - 819 - save_mce_event(regs, handled, &mce_error_info, nip, addr); 820 - return handled; 360 + return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table); 821 361 }
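The machine-check rework above replaces the per-CPU if/else chains with per-CPU lookup tables walked by a common mce_handle_derror()/mce_handle_ierror(). A minimal userspace sketch of that table-driven shape follows; the bit values and enum names here are illustrative only, not the real SRR1/DSISR layout or the kernel's actual tables:

```c
#include <assert.h>
#include <stdint.h>

/* Illustrative stand-ins for the kernel's error-type enums. */
enum err_type { ERR_UNKNOWN, ERR_UE, ERR_SLB, ERR_TLB };

struct derror_entry {
	uint64_t dsisr_value;	/* bit to test; zero terminates the table */
	enum err_type error_type;
};

/* Hypothetical bit assignments. Uncorrectable errors come first so that,
 * as the kernel comment requires, they win when several bits are set. */
static const struct derror_entry table[] = {
	{ 0x8000, ERR_UE },
	{ 0x0040, ERR_SLB },
	{ 0x0010, ERR_TLB },
	{ 0, ERR_UNKNOWN },
};

/* First-match decode, mirroring the shape of mce_handle_derror(). */
static enum err_type decode_derror(uint64_t dsisr)
{
	int i;

	for (i = 0; table[i].dsisr_value; i++)
		if (dsisr & table[i].dsisr_value)
			return table[i].error_type;
	return ERR_UNKNOWN;
}
```

The payoff of this design is visible in the p7/p8/p9 entry points above: each collapses to a one-line call passing its own tables.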
+3 -3
arch/powerpc/kernel/optprobes.c
···
243 243	/*
244 244	 * 2. branch to optimized_callback() and emulate_step()
245 245	 */
246 -	kprobe_lookup_name("optimized_callback", op_callback_addr);
247 -	kprobe_lookup_name("emulate_step", emulate_step_addr);
246 +	op_callback_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("optimized_callback");
247 +	emulate_step_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("emulate_step");
248 248	if (!op_callback_addr || !emulate_step_addr) {
249 -		WARN(1, "kprobe_lookup_name() failed\n");
249 +		WARN(1, "Unable to lookup optimized_callback()/emulate_step()\n");
250 250		goto error;
251 251	}
252 252
+21
arch/powerpc/kernel/paca.c
···
245 245
246 246	free_lppacas();
247 247	}
248 +
249 +	void copy_mm_to_paca(struct mm_struct *mm)
250 +	{
251 +	#ifdef CONFIG_PPC_BOOK3S
252 +	mm_context_t *context = &mm->context;
253 +
254 +	get_paca()->mm_ctx_id = context->id;
255 +	#ifdef CONFIG_PPC_MM_SLICES
256 +	VM_BUG_ON(!mm->context.addr_limit);
257 +	get_paca()->addr_limit = mm->context.addr_limit;
258 +	get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
259 +	memcpy(&get_paca()->mm_ctx_high_slices_psize,
260 +	       &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm));
261 +	#else /* CONFIG_PPC_MM_SLICES */
262 +	get_paca()->mm_ctx_user_psize = context->user_psize;
263 +	get_paca()->mm_ctx_sllp = context->sllp;
264 +	#endif
265 +	#else /* CONFIG_PPC_BOOK3S */
266 +	return;
267 +	#endif
268 +	}
-1
arch/powerpc/kernel/prom.c
···
 55  55	#include <asm/kexec.h>
 56  56	#include <asm/opal.h>
 57  57	#include <asm/fadump.h>
 58 -	#include <asm/debug.h>
 59  58	#include <asm/epapr_hcalls.h>
 60  59	#include <asm/firmware.h>
 61  60
+1 -1
arch/powerpc/kernel/prom_init.c
···
815 815	.virt_base = cpu_to_be32(0xffffffff),
816 816	.virt_size = cpu_to_be32(0xffffffff),
817 817	.load_base = cpu_to_be32(0xffffffff),
818 -	.min_rma = cpu_to_be32(256),		/* 256MB min RMA */
818 +	.min_rma = cpu_to_be32(512),		/* 512MB min RMA */
819 819	.min_load = cpu_to_be32(0xffffffff),	/* full client load */
820 820	.min_rma_percent = 0,	/* min RMA percentage of total RAM */
821 821	.max_pft_size = 48,	/* max log_2(hash table size) */
+10 -1
arch/powerpc/kernel/setup-common.c
···
 31  31	#include <linux/unistd.h>
 32  32	#include <linux/serial.h>
 33  33	#include <linux/serial_8250.h>
 34 -	#include <linux/debugfs.h>
 35  34	#include <linux/percpu.h>
 36  35	#include <linux/memblock.h>
 37  36	#include <linux/of_platform.h>
 38  37	#include <linux/hugetlb.h>
 38 +	#include <asm/debugfs.h>
 39  39	#include <asm/io.h>
 40  40	#include <asm/paca.h>
 41  41	#include <asm/prom.h>
···
920 920	init_mm.end_code = (unsigned long) _etext;
921 921	init_mm.end_data = (unsigned long) _edata;
922 922	init_mm.brk = klimit;
923 +
924 +	#ifdef CONFIG_PPC_MM_SLICES
925 +	#ifdef CONFIG_PPC64
926 +	init_mm.context.addr_limit = TASK_SIZE_128TB;
927 +	#else
928 +	#error "context.addr_limit not initialized."
929 +	#endif
930 +	#endif
931 +
923 932	#ifdef CONFIG_PPC_64K_PAGES
924 933	init_mm.context.pte_frag = NULL;
925 934	#endif
+7 -2
arch/powerpc/kernel/setup_64.c
···
230 230	 * If we are not in hypervisor mode the job is done once for
231 231	 * the whole partition in configure_exceptions().
232 232	 */
233 -	if (early_cpu_has_feature(CPU_FTR_HVMODE) &&
234 -	    early_cpu_has_feature(CPU_FTR_ARCH_207S)) {
233 +	if (cpu_has_feature(CPU_FTR_HVMODE) &&
234 +	    cpu_has_feature(CPU_FTR_ARCH_207S)) {
235 235		unsigned long lpcr = mfspr(SPRN_LPCR);
236 236		mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
237 237	}
···
637 637	paca[i].emergency_sp = (void *)ti + THREAD_SIZE;
638 638
639 639	#ifdef CONFIG_PPC_BOOK3S_64
640 +	/* emergency stack for NMI exception handling. */
641 +	ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
642 +	klp_init_thread_info(ti);
643 +	paca[i].nmi_emergency_sp = (void *)ti + THREAD_SIZE;
644 +
640 645	/* emergency stack for machine check exception handling. */
641 646	ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
642 647	klp_init_thread_info(ti);
+248 -51
arch/powerpc/kernel/smp.c
···
 39  39	#include <asm/irq.h>
 40  40	#include <asm/hw_irq.h>
 41  41	#include <asm/kvm_ppc.h>
 42 +	#include <asm/dbell.h>
 42  43	#include <asm/page.h>
 43  44	#include <asm/pgtable.h>
 44  45	#include <asm/prom.h>
···
 86  85	volatile unsigned int cpu_callin_map[NR_CPUS];
 87  86
 88  87	int smt_enabled_at_boot = 1;
 89 -
 90 -	static void (*crash_ipi_function_ptr)(struct pt_regs *) = NULL;
 91  88
 92  89	/*
 93  90	 * Returns 1 if the specified cpu should be brought up during boot.
···
157 158	return IRQ_HANDLED;
158 159	}
159 160
160 -	static irqreturn_t debug_ipi_action(int irq, void *data)
161 +	#ifdef CONFIG_NMI_IPI
162 +	static irqreturn_t nmi_ipi_action(int irq, void *data)
161 163	{
162 -	if (crash_ipi_function_ptr) {
163 -		crash_ipi_function_ptr(get_irq_regs());
164 -		return IRQ_HANDLED;
165 -	}
166 -
167 -	#ifdef CONFIG_DEBUGGER
168 -	debugger_ipi(get_irq_regs());
169 -	#endif /* CONFIG_DEBUGGER */
170 -
164 +	smp_handle_nmi_ipi(get_irq_regs());
171 165	return IRQ_HANDLED;
172 166	}
167 +	#endif
173 168
174 169	static irq_handler_t smp_ipi_action[] = {
175 170		[PPC_MSG_CALL_FUNCTION] = call_function_action,
176 171		[PPC_MSG_RESCHEDULE] = reschedule_action,
177 172		[PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action,
178 -		[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
173 +	#ifdef CONFIG_NMI_IPI
174 +		[PPC_MSG_NMI_IPI] = nmi_ipi_action,
175 +	#endif
179 176	};
180 177
178 +	/*
179 +	 * The NMI IPI is a fallback and not truly non-maskable. It is simpler
180 +	 * than going through the call function infrastructure, and strongly
181 +	 * serialized, so it is more appropriate for debugging.
182 +	 */
181 183	const char *smp_ipi_name[] = {
182 184		[PPC_MSG_CALL_FUNCTION] = "ipi call function",
183 185		[PPC_MSG_RESCHEDULE] = "ipi reschedule",
184 186		[PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast",
185 -		[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
187 +		[PPC_MSG_NMI_IPI] = "nmi ipi",
186 188	};
187 189
188 190	/* optional function to request ipi, for controllers with >= 4 ipis */
···
191 191	{
192 192	int err;
193 193
194 -	if (msg < 0 || msg > PPC_MSG_DEBUGGER_BREAK) {
194 +	if (msg < 0 || msg > PPC_MSG_NMI_IPI)
195 195		return -EINVAL;
196 -	}
197 -	#if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC_CORE)
198 -	if (msg == PPC_MSG_DEBUGGER_BREAK) {
196 +	#ifndef CONFIG_NMI_IPI
197 +	if (msg == PPC_MSG_NMI_IPI)
199 198		return 1;
200 -	}
201 199	#endif
200 +
202 201	err = request_irq(virq, smp_ipi_action[msg],
203 202			IRQF_PERCPU | IRQF_NO_THREAD | IRQF_NO_SUSPEND,
204 203			smp_ipi_name[msg], NULL);
···
210 211	#ifdef CONFIG_PPC_SMP_MUXED_IPI
211 212	struct cpu_messages {
212 213		long messages;			/* current messages */
213 -		unsigned long data;		/* data for cause ipi */
214 214	};
215 215	static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
216 -
217 -	void smp_muxed_ipi_set_data(int cpu, unsigned long data)
218 -	{
219 -	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
220 -
221 -	info->data = data;
222 -	}
223 216
224 217	void smp_muxed_ipi_set_message(int cpu, int msg)
225 218	{
···
227 236
228 237	void smp_muxed_ipi_message_pass(int cpu, int msg)
229 238	{
230 -	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
231 -
232 239	smp_muxed_ipi_set_message(cpu, msg);
240 +
233 241	/*
234 242	 * cause_ipi functions are required to include a full barrier
235 243	 * before doing whatever causes the IPI.
236 244	 */
237 -	smp_ops->cause_ipi(cpu, info->data);
245 +	smp_ops->cause_ipi(cpu);
238 246	}
239 247
240 248	#ifdef __BIG_ENDIAN__
···
244 254
245 255	irqreturn_t smp_ipi_demux(void)
246 256	{
247 -	struct cpu_messages *info = this_cpu_ptr(&ipi_message);
248 -	unsigned long all;
249 -
250 257	mb();	/* order any irq clear */
251 258
259 +	return smp_ipi_demux_relaxed();
260 +	}
261 +
262 +	/* sync-free variant. Callers should ensure synchronization */
263 +	irqreturn_t smp_ipi_demux_relaxed(void)
264 +	{
265 +	struct cpu_messages *info;
266 +	unsigned long all;
267 +
268 +	info = this_cpu_ptr(&ipi_message);
252 269	do {
253 270		all = xchg(&info->messages, 0);
254 271	#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
···
275 278		scheduler_ipi();
276 279	if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
277 280		tick_broadcast_ipi_handler();
278 -	if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK))
279 -		debug_ipi_action(0, NULL);
281 +	#ifdef CONFIG_NMI_IPI
282 +	if (all & IPI_MESSAGE(PPC_MSG_NMI_IPI))
283 +		nmi_ipi_action(0, NULL);
284 +	#endif
280 285	} while (info->messages);
281 286
282 287	return IRQ_HANDLED;
···
315 316	do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
316 317	}
317 318
319 +	#ifdef CONFIG_NMI_IPI
320 +
321 +	/*
322 +	 * "NMI IPI" system.
323 +	 *
324 +	 * NMI IPIs may not be recoverable, so should not be used as ongoing part of
325 +	 * a running system. They can be used for crash, debug, halt/reboot, etc.
326 +	 *
327 +	 * NMI IPIs are globally single threaded. No more than one in progress at
328 +	 * any time.
329 +	 *
330 +	 * The IPI call waits with interrupts disabled until all targets enter the
331 +	 * NMI handler, then the call returns.
332 +	 *
333 +	 * No new NMI can be initiated until targets exit the handler.
334 +	 *
335 +	 * The IPI call may time out without all targets entering the NMI handler.
336 +	 * In that case, there is some logic to recover (and ignore subsequent
337 +	 * NMI interrupts that may eventually be raised), but the platform interrupt
338 +	 * handler may not be able to distinguish this from other exception causes,
339 +	 * which may cause a crash.
340 +	 */
341 +
342 +	static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0);
343 +	static struct cpumask nmi_ipi_pending_mask;
344 +	static int nmi_ipi_busy_count = 0;
345 +	static void (*nmi_ipi_function)(struct pt_regs *) = NULL;
346 +
347 +	static void nmi_ipi_lock_start(unsigned long *flags)
348 +	{
349 +	raw_local_irq_save(*flags);
350 +	hard_irq_disable();
351 +	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) {
352 +		raw_local_irq_restore(*flags);
353 +		cpu_relax();
354 +		raw_local_irq_save(*flags);
355 +		hard_irq_disable();
356 +	}
357 +	}
358 +
359 +	static void nmi_ipi_lock(void)
360 +	{
361 +	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1)
362 +		cpu_relax();
363 +	}
364 +
365 +	static void nmi_ipi_unlock(void)
366 +	{
367 +	smp_mb();
368 +	WARN_ON(atomic_read(&__nmi_ipi_lock) != 1);
369 +	atomic_set(&__nmi_ipi_lock, 0);
370 +	}
371 +
372 +	static void nmi_ipi_unlock_end(unsigned long *flags)
373 +	{
374 +	nmi_ipi_unlock();
375 +	raw_local_irq_restore(*flags);
376 +	}
377 +
378 +	/*
379 +	 * Platform NMI handler calls this to ack
380 +	 */
381 +	int smp_handle_nmi_ipi(struct pt_regs *regs)
382 +	{
383 +	void (*fn)(struct pt_regs *);
384 +	unsigned long flags;
385 +	int me = raw_smp_processor_id();
386 +	int ret = 0;
387 +
388 +	/*
389 +	 * Unexpected NMIs are possible here because the interrupt may not
390 +	 * be able to distinguish NMI IPIs from other types of NMIs, or
391 +	 * because the caller may have timed out.
392 +	 */
393 +	nmi_ipi_lock_start(&flags);
394 +	if (!nmi_ipi_busy_count)
395 +		goto out;
396 +	if (!cpumask_test_cpu(me, &nmi_ipi_pending_mask))
397 +		goto out;
398 +
399 +	fn = nmi_ipi_function;
400 +	if (!fn)
401 +		goto out;
402 +
403 +	cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
404 +	nmi_ipi_busy_count++;
405 +	nmi_ipi_unlock();
406 +
407 +	ret = 1;
408 +
409 +	fn(regs);
410 +
411 +	nmi_ipi_lock();
412 +	nmi_ipi_busy_count--;
413 +	out:
414 +	nmi_ipi_unlock_end(&flags);
415 +
416 +	return ret;
417 +	}
418 +
419 +	static void do_smp_send_nmi_ipi(int cpu)
420 +	{
421 +	if (smp_ops->cause_nmi_ipi && smp_ops->cause_nmi_ipi(cpu))
422 +		return;
423 +
424 +	if (cpu >= 0) {
425 +		do_message_pass(cpu, PPC_MSG_NMI_IPI);
426 +	} else {
427 +		int c;
428 +
429 +		for_each_online_cpu(c) {
430 +			if (c == raw_smp_processor_id())
431 +				continue;
432 +			do_message_pass(c, PPC_MSG_NMI_IPI);
433 +		}
434 +	}
435 +	}
436 +
437 +	/*
438 +	 * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS.
439 +	 * - fn is the target callback function.
440 +	 * - delay_us > 0 is the delay before giving up waiting for targets to
441 +	 *   enter the handler, == 0 specifies indefinite delay.
442 +	 */
443 +	static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
444 +	{
445 +	unsigned long flags;
446 +	int me = raw_smp_processor_id();
447 +	int ret = 1;
448 +
449 +	BUG_ON(cpu == me);
450 +	BUG_ON(cpu < 0 && cpu != NMI_IPI_ALL_OTHERS);
451 +
452 +	if (unlikely(!smp_ops))
453 +		return 0;
454 +
455 +	/* Take the nmi_ipi_busy count/lock with interrupts hard disabled */
456 +	nmi_ipi_lock_start(&flags);
457 +	while (nmi_ipi_busy_count) {
458 +		nmi_ipi_unlock_end(&flags);
459 +		cpu_relax();
460 +		nmi_ipi_lock_start(&flags);
461 +	}
462 +
463 +	nmi_ipi_function = fn;
464 +
465 +	if (cpu < 0) {
466 +		/* ALL_OTHERS */
467 +		cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask);
468 +		cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
469 +	} else {
470 +		/* cpumask starts clear */
471 +		cpumask_set_cpu(cpu, &nmi_ipi_pending_mask);
472 +	}
473 +	nmi_ipi_busy_count++;
474 +	nmi_ipi_unlock();
475 +
476 +	do_smp_send_nmi_ipi(cpu);
477 +
478 +	while (!cpumask_empty(&nmi_ipi_pending_mask)) {
479 +		udelay(1);
480 +		if (delay_us) {
481 +			delay_us--;
482 +			if (!delay_us)
483 +				break;
484 +		}
485 +	}
486 +
487 +	nmi_ipi_lock();
488 +	if (!cpumask_empty(&nmi_ipi_pending_mask)) {
489 +		/* Could not gather all CPUs */
490 +		ret = 0;
491 +		cpumask_clear(&nmi_ipi_pending_mask);
492 +	}
493 +	nmi_ipi_busy_count--;
494 +	nmi_ipi_unlock_end(&flags);
495 +
496 +	return ret;
497 +	}
498 +	#endif /* CONFIG_NMI_IPI */
499 +
318 500	#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
319 501	void tick_broadcast(const struct cpumask *mask)
320 502	{
···
506 326	}
507 327	#endif
508 328
509 -	#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
329 +	#ifdef CONFIG_DEBUGGER
330 +	void debugger_ipi_callback(struct pt_regs *regs)
331 +	{
332 +	debugger_ipi(regs);
333 +	}
334 +
510 335	void smp_send_debugger_break(void)
511 336	{
512 -	int cpu;
513 -	int me = raw_smp_processor_id();
514 -
515 -	if (unlikely(!smp_ops))
516 -		return;
517 -
518 -	for_each_online_cpu(cpu)
519 -		if (cpu != me)
520 -			do_message_pass(cpu, PPC_MSG_DEBUGGER_BREAK);
337 +	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000);
521 338	}
522 339	#endif
523 340
524 341	#ifdef CONFIG_KEXEC_CORE
525 342	void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
526 343	{
527 -	crash_ipi_function_ptr = crash_ipi_callback;
528 -	if (crash_ipi_callback) {
529 -		mb();
530 -		smp_send_debugger_break();
531 -	}
344 +	smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
532 345	}
533 346	#endif
···
612 439	#ifdef CONFIG_PPC64
613 440	vdso_data->processorCount--;
614 441	#endif
615 -	migrate_irqs();
442 +	/* Update affinity of all IRQs previously aimed at this CPU */
443 +	irq_migrate_all_off_this_cpu();
444 +
445 +	/*
446 +	 * Depending on the details of the interrupt controller, it's possible
447 +	 * that one of the interrupts we just migrated away from this CPU is
448 +	 * actually already pending on this CPU. If we leave it in that state
449 +	 * the interrupt will never be EOI'ed, and will never fire again. So
450 +	 * temporarily enable interrupts here, to allow any pending interrupt to
451 +	 * be received (and EOI'ed), before we take this CPU offline.
452 +	 */
453 +	local_irq_enable();
454 +	mdelay(1);
455 +	local_irq_disable();
456 +
616 457	return 0;
617 458	}
···
707 520	return -EINVAL;
708 521
709 522	cpu_idle_thread_init(cpu, tidle);
523 +
524 +	/*
525 +	 * The platform might need to allocate resources prior to bringing
526 +	 * up the CPU
527 +	 */
528 +	if (smp_ops->prepare_cpu) {
529 +		rc = smp_ops->prepare_cpu(cpu);
530 +		if (rc)
531 +			return rc;
532 +	}
710 533
711 534	/* Make sure callin-map entry is 0 (can be leftover a CPU
712 535	 * hotplug
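The NMI IPI machinery above serializes everything on __nmi_ipi_lock, taken with a bare atomic compare-and-swap loop rather than an ordinary spinlock, which would not be safe to acquire from an NMI. A rough userspace analogue of that acquire/release pattern using C11 atomics (toy names, and without the kernel's irq-disable and memory-barrier details):

```c
#include <assert.h>
#include <stdatomic.h>

/* Toy stand-in for __nmi_ipi_lock: 0 = free, 1 = held. */
static atomic_int toy_nmi_lock_word;

/* Spin until the lock word swings from 0 to 1, mirroring the
 * atomic_cmpxchg() loop in nmi_ipi_lock(). */
static void toy_nmi_lock(void)
{
	int expected = 0;

	while (!atomic_compare_exchange_strong(&toy_nmi_lock_word, &expected, 1))
		expected = 0;	/* CAS failed: reset expectation and retry */
}

/* Release: store 0 so another CPU's CAS can succeed. */
static void toy_nmi_unlock(void)
{
	atomic_store(&toy_nmi_lock_word, 0);
}
```

In the kernel version, nmi_ipi_lock_start() additionally hard-disables interrupts before each CAS attempt and re-enables them while spinning, so a CPU waiting for the lock can still take the very NMI the lock holder is sending.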
+8 -1
arch/powerpc/kernel/stacktrace.c
···
 59  59
 60  60	void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 61  61	{
 62 -	save_context_stack(trace, tsk->thread.ksp, tsk, 0);
 62 +	unsigned long sp;
 63 +
 64 +	if (tsk == current)
 65 +		sp = current_stack_pointer();
 66 +	else
 67 +		sp = tsk->thread.ksp;
 68 +
 69 +	save_context_stack(trace, sp, tsk, 0);
 63  70	}
 64  71	EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 65  72
+1
arch/powerpc/kernel/swsusp.c
···
 10  10	 */
 11  11
 12  12	#include <linux/sched.h>
 13 +	#include <linux/suspend.h>
 13  14	#include <asm/current.h>
 14  15	#include <asm/mmu_context.h>
 15  16	#include <asm/switch_to.h>
+8 -8
arch/powerpc/kernel/syscalls.c
···
 42  42	#include <asm/unistd.h>
 43  43	#include <asm/asm-prototypes.h>
 44  44
 45 -	static inline unsigned long do_mmap2(unsigned long addr, size_t len,
 45 +	static inline long do_mmap2(unsigned long addr, size_t len,
 46  46			unsigned long prot, unsigned long flags,
 47  47			unsigned long fd, unsigned long off, int shift)
 48  48	{
 49 -	unsigned long ret = -EINVAL;
 49 +	long ret = -EINVAL;
 50  50
 51  51	if (!arch_validate_prot(prot))
 52  52		goto out;
···
 62  62	return ret;
 63  63	}
 64  64
 65 -	unsigned long sys_mmap2(unsigned long addr, size_t len,
 66 -		unsigned long prot, unsigned long flags,
 67 -		unsigned long fd, unsigned long pgoff)
 65 +	SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len,
 66 +		unsigned long, prot, unsigned long, flags,
 67 +		unsigned long, fd, unsigned long, pgoff)
 68  68	{
 69  69	return do_mmap2(addr, len, prot, flags, fd, pgoff, PAGE_SHIFT-12);
 70  70	}
 71  71
 72 -	unsigned long sys_mmap(unsigned long addr, size_t len,
 73 -		unsigned long prot, unsigned long flags,
 74 -		unsigned long fd, off_t offset)
 72 +	SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len,
 73 +		unsigned long, prot, unsigned long, flags,
 74 +		unsigned long, fd, off_t, offset)
 75  75	{
 76  76	return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT);
 77  77	}
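The `PAGE_SHIFT-12` shift passed for mmap2 reflects that mmap2 offsets are counted in 4096-byte (2^12) units, so dropping PAGE_SHIFT-12 low bits yields an offset in pages whatever the kernel page size. A hypothetical illustration of that arithmetic (TOY_PAGE_SHIFT is made up here; it stands in for a 64K-page configuration, not any specific powerpc build):

```c
#include <assert.h>

/* Hypothetical 64K-page configuration: PAGE_SHIFT would be 16. */
#define TOY_PAGE_SHIFT 16

/* mmap2 offsets arrive in 4096-byte (2^12) units; converting them to
 * page units drops (TOY_PAGE_SHIFT - 12) low bits, which is the role of
 * the "shift" argument do_mmap2() receives. */
static unsigned long off4k_to_pages(unsigned long off4k)
{
	return off4k >> (TOY_PAGE_SHIFT - 12);
}
```

With 4K pages the shift is zero and the offset is already in pages, which is why plain mmap passes PAGE_SHIFT for its byte offset instead.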
+10 -2
arch/powerpc/kernel/sysfs.c
···
710 710	struct device_attribute *attrs, *pmc_attrs;
711 711	int i, nattrs;
712 712
713 +	/* For cpus present at boot a reference was already grabbed in register_cpu() */
714 +	if (!s->of_node)
715 +		s->of_node = of_get_cpu_node(cpu, NULL);
716 +
713 717	#ifdef CONFIG_PPC64
714 718	if (cpu_has_feature(CPU_FTR_SMT))
715 719		device_create_file(s, &dev_attr_smt_snooze_delay);
···
789 785	return 0;
790 786	}
791 787
788 +	#ifdef CONFIG_HOTPLUG_CPU
792 789	static int unregister_cpu_online(unsigned int cpu)
793 790	{
794 -	#ifdef CONFIG_HOTPLUG_CPU
795 791	struct cpu *c = &per_cpu(cpu_devices, cpu);
796 792	struct device *s = &c->dev;
797 793	struct device_attribute *attrs, *pmc_attrs;
···
868 864	}
869 865	#endif
870 866	cacheinfo_cpu_offline(cpu);
871 -	#endif /* CONFIG_HOTPLUG_CPU */
867 +	of_node_put(s->of_node);
868 +	s->of_node = NULL;
872 869	return 0;
873 870	}
871 +	#else /* !CONFIG_HOTPLUG_CPU */
872 +	#define unregister_cpu_online NULL
873 +	#endif
874 874
875 875	#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
876 876	ssize_t arch_cpu_probe(const char *buf, size_t count)
+29
arch/powerpc/kernel/trace/Makefile
@@
+#
+# Makefile for the powerpc trace subsystem
+#
+
+subdir-ccflags-$(CONFIG_PPC_WERROR)	:= -Werror
+
+ifdef CONFIG_FUNCTION_TRACER
+# do not trace tracer code
+CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
+endif
+
+obj32-$(CONFIG_FUNCTION_TRACER)		+= ftrace_32.o
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64.o
+ifdef CONFIG_MPROFILE_KERNEL
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64_mprofile.o
+else
+obj64-$(CONFIG_FUNCTION_TRACER)		+= ftrace_64_pg.o
+endif
+obj-$(CONFIG_DYNAMIC_FTRACE)		+= ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS)		+= ftrace.o
+obj-$(CONFIG_TRACING)			+= trace_clock.o
+
+obj-$(CONFIG_PPC64)			+= $(obj64-y)
+obj-$(CONFIG_PPC32)			+= $(obj32-y)
+
+# Disable GCOV & sanitizers in odd or sensitive code
+GCOV_PROFILE_ftrace.o := n
+UBSAN_SANITIZE_ftrace.o := n
+118
arch/powerpc/kernel/trace/ftrace_32.S
@@
+/*
+ * Split from entry_32.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/reg.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+	/*
+	 * It is required that _mcount on PPC32 must preserve the
+	 * link register. But we have r0 to play with. We use r0
+	 * to push the return address back to the caller of mcount
+	 * into the ctr register, restore the link register and
+	 * then jump back using the ctr register.
+	 */
+	mflr	r0
+	mtctr	r0
+	lwz	r0, 4(r1)
+	mtlr	r0
+	bctr
+
+_GLOBAL(ftrace_caller)
+	MCOUNT_SAVE_FRAME
+	/* r3 ends up with link register */
+	subi	r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+	MCOUNT_RESTORE_FRAME
+	/* old link register ends up in ctr reg */
+	bctr
+#else
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+
+	MCOUNT_SAVE_FRAME
+
+	subi	r3, r3, MCOUNT_INSN_SIZE
+	LOAD_REG_ADDR(r5, ftrace_trace_function)
+	lwz	r5,0(r5)
+
+	mtctr	r5
+	bctrl
+	nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	b	ftrace_graph_caller
+#endif
+	MCOUNT_RESTORE_FRAME
+	bctr
+#endif
+EXPORT_SYMBOL(_mcount)
+
+_GLOBAL(ftrace_stub)
+	blr
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	/* load r4 with local address */
+	lwz	r4, 44(r1)
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	/* Grab the LR out of the caller stack frame */
+	lwz	r3,52(r1)
+
+	bl	prepare_ftrace_return
+	nop
+
+	/*
+	 * prepare_ftrace_return gives us the address we divert to.
+	 * Change the LR in the callers stack frame to this.
+	 */
+	stw	r3,52(r1)
+
+	MCOUNT_RESTORE_FRAME
+	/* old link register ends up in ctr reg */
+	bctr
+
+_GLOBAL(return_to_handler)
+	/* need to save return values */
+	stwu	r1, -32(r1)
+	stw	r3, 20(r1)
+	stw	r4, 16(r1)
+	stw	r31, 12(r1)
+	mr	r31, r1
+
+	bl	ftrace_return_to_handler
+	nop
+
+	/* return value has real return address */
+	mtlr	r3
+
+	lwz	r3, 20(r1)
+	lwz	r4, 16(r1)
+	lwz	r31,12(r1)
+	lwz	r1, 0(r1)
+
+	/* Jump back to real return address */
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+85
arch/powerpc/kernel/trace/ftrace_64.S
@@
+/*
+ * Split from entry_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+EXPORT_SYMBOL(_mcount)
+	mflr	r12
+	mtctr	r12
+	mtlr	r0
+	bctr
+
+#else /* CONFIG_DYNAMIC_FTRACE */
+_GLOBAL_TOC(_mcount)
+EXPORT_SYMBOL(_mcount)
+	/* Taken from output of objdump from lib64/glibc */
+	mflr	r3
+	ld	r11, 0(r1)
+	stdu	r1, -112(r1)
+	std	r3, 128(r1)
+	ld	r4, 16(r11)
+
+	subi	r3, r3, MCOUNT_INSN_SIZE
+	LOAD_REG_ADDR(r5,ftrace_trace_function)
+	ld	r5,0(r5)
+	ld	r5,0(r5)
+	mtctr	r5
+	bctrl
+	nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	b	ftrace_graph_caller
+#endif
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+_GLOBAL(ftrace_stub)
+	blr
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(return_to_handler)
+	/* need to save return values */
+	std	r4,  -32(r1)
+	std	r3,  -24(r1)
+	/* save TOC */
+	std	r2,  -16(r1)
+	std	r31, -8(r1)
+	mr	r31, r1
+	stdu	r1, -112(r1)
+
+	/*
+	 * We might be called from a module.
+	 * Switch to our TOC to run inside the core kernel.
+	 */
+	ld	r2, PACATOC(r13)
+
+	bl	ftrace_return_to_handler
+	nop
+
+	/* return value has real return address */
+	mtlr	r3
+
+	ld	r1, 0(r1)
+	ld	r4,  -32(r1)
+	ld	r3,  -24(r1)
+	ld	r2,  -16(r1)
+	ld	r31, -8(r1)
+
+	/* Jump back to real return address */
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+272
arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@
+/*
+ * Split from ftrace_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+#include <asm/thread_info.h>
+#include <asm/bug.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ *
+ * ftrace_caller() is the function that replaces _mcount() when ftrace is
+ * active.
+ *
+ * We arrive here after a function A calls function B, and we are the trace
+ * function for B. When we enter r1 points to A's stack frame, B has not yet
+ * had a chance to allocate one yet.
+ *
+ * Additionally r2 may point either to the TOC for A, or B, depending on
+ * whether B did a TOC setup sequence before calling us.
+ *
+ * On entry the LR points back to the _mcount() call site, and r0 holds the
+ * saved LR as it was on entry to B, ie. the original return address at the
+ * call site in A.
+ *
+ * Our job is to save the register state into a struct pt_regs (on the stack)
+ * and then arrange for the ftrace function to be called.
+ */
+_GLOBAL(ftrace_caller)
+	/* Save the original return address in A's stack frame */
+	std	r0,LRSAVE(r1)
+
+	/* Create our stack frame + pt_regs */
+	stdu	r1,-SWITCH_FRAME_SIZE(r1)
+
+	/* Save all gprs to pt_regs */
+	SAVE_8GPRS(0,r1)
+	SAVE_8GPRS(8,r1)
+	SAVE_8GPRS(16,r1)
+	SAVE_8GPRS(24,r1)
+
+	/* Load special regs for save below */
+	mfmsr	r8
+	mfctr	r9
+	mfxer	r10
+	mfcr	r11
+
+	/* Get the _mcount() call site out of LR */
+	mflr	r7
+	/* Save it as pt_regs->nip */
+	std	r7, _NIP(r1)
+	/* Save the read LR in pt_regs->link */
+	std	r0, _LINK(r1)
+
+	/* Save callee's TOC in the ABI compliant location */
+	std	r2, 24(r1)
+	ld	r2,PACATOC(r13)	/* get kernel TOC in r2 */
+
+	addis	r3,r2,function_trace_op@toc@ha
+	addi	r3,r3,function_trace_op@toc@l
+	ld	r5,0(r3)
+
+#ifdef CONFIG_LIVEPATCH
+	mr	r14,r7		/* remember old NIP */
+#endif
+	/* Calculate ip from nip-4 into r3 for call below */
+	subi	r3, r7, MCOUNT_INSN_SIZE
+
+	/* Put the original return address in r4 as parent_ip */
+	mr	r4, r0
+
+	/* Save special regs */
+	std	r8, _MSR(r1)
+	std	r9, _CTR(r1)
+	std	r10, _XER(r1)
+	std	r11, _CCR(r1)
+
+	/* Load &pt_regs in r6 for call below */
+	addi	r6, r1 ,STACK_FRAME_OVERHEAD
+
+	/* ftrace_call(r3, r4, r5, r6) */
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+
+	/* Load ctr with the possibly modified NIP */
+	ld	r3, _NIP(r1)
+	mtctr	r3
+#ifdef CONFIG_LIVEPATCH
+	cmpd	r14,r3		/* has NIP been altered? */
+#endif
+
+	/* Restore gprs */
+	REST_8GPRS(0,r1)
+	REST_8GPRS(8,r1)
+	REST_8GPRS(16,r1)
+	REST_8GPRS(24,r1)
+
+	/* Restore possibly modified LR */
+	ld	r0, _LINK(r1)
+	mtlr	r0
+
+	/* Restore callee's TOC */
+	ld	r2, 24(r1)
+
+	/* Pop our stack frame */
+	addi	r1, r1, SWITCH_FRAME_SIZE
+
+#ifdef CONFIG_LIVEPATCH
+	/* Based on the cmpd above, if the NIP was altered handle livepatch */
+	bne-	livepatch_handler
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+
+	bctr			/* jump after _mcount site */
+
+_GLOBAL(ftrace_stub)
+	blr
+
+#ifdef CONFIG_LIVEPATCH
+	/*
+	 * This function runs in the mcount context, between two functions. As
+	 * such it can only clobber registers which are volatile and used in
+	 * function linkage.
+	 *
+	 * We get here when a function A, calls another function B, but B has
+	 * been live patched with a new function C.
+	 *
+	 * On entry:
+	 *  - we have no stack frame and can not allocate one
+	 *  - LR points back to the original caller (in A)
+	 *  - CTR holds the new NIP in C
+	 *  - r0 & r12 are free
+	 *
+	 * r0 can't be used as the base register for a DS-form load or store, so
+	 * we temporarily shuffle r1 (stack pointer) into r0 and then put it back.
+	 */
+livepatch_handler:
+	CURRENT_THREAD_INFO(r12, r1)
+
+	/* Save stack pointer into r0 */
+	mr	r0, r1
+
+	/* Allocate 3 x 8 bytes */
+	ld	r1, TI_livepatch_sp(r12)
+	addi	r1, r1, 24
+	std	r1, TI_livepatch_sp(r12)
+
+	/* Save toc & real LR on livepatch stack */
+	std	r2,  -24(r1)
+	mflr	r12
+	std	r12, -16(r1)
+
+	/* Store stack end marker */
+	lis	r12, STACK_END_MAGIC@h
+	ori	r12, r12, STACK_END_MAGIC@l
+	std	r12, -8(r1)
+
+	/* Restore real stack pointer */
+	mr	r1, r0
+
+	/* Put ctr in r12 for global entry and branch there */
+	mfctr	r12
+	bctrl
+
+	/*
+	 * Now we are returning from the patched function to the original
+	 * caller A. We are free to use r0 and r12, and we can use r2 until we
+	 * restore it.
+	 */
+
+	CURRENT_THREAD_INFO(r12, r1)
+
+	/* Save stack pointer into r0 */
+	mr	r0, r1
+
+	ld	r1, TI_livepatch_sp(r12)
+
+	/* Check stack marker hasn't been trashed */
+	lis	r2,  STACK_END_MAGIC@h
+	ori	r2,  r2, STACK_END_MAGIC@l
+	ld	r12, -8(r1)
+1:	tdne	r12, r2
+	EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
+
+	/* Restore LR & toc from livepatch stack */
+	ld	r12, -16(r1)
+	mtlr	r12
+	ld	r2,  -24(r1)
+
+	/* Pop livepatch stack frame */
+	CURRENT_THREAD_INFO(r12, r0)
+	subi	r1, r1, 24
+	std	r1, TI_livepatch_sp(r12)
+
+	/* Restore real stack pointer */
+	mr	r1, r0
+
+	/* Return to original caller of live patched function */
+	blr
+#endif /* CONFIG_LIVEPATCH */
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	stdu	r1, -112(r1)
+	/* with -mprofile-kernel, parameter regs are still alive at _mcount */
+	std	r10, 104(r1)
+	std	r9, 96(r1)
+	std	r8, 88(r1)
+	std	r7, 80(r1)
+	std	r6, 72(r1)
+	std	r5, 64(r1)
+	std	r4, 56(r1)
+	std	r3, 48(r1)
+
+	/* Save callee's TOC in the ABI compliant location */
+	std	r2, 24(r1)
+	ld	r2, PACATOC(r13)	/* get kernel TOC in r2 */
+
+	mfctr	r4		/* ftrace_caller has moved local addr here */
+	std	r4, 40(r1)
+	mflr	r3		/* ftrace_caller has restored LR from stack */
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	bl	prepare_ftrace_return
+	nop
+
+	/*
+	 * prepare_ftrace_return gives us the address we divert to.
+	 * Change the LR to this.
+	 */
+	mtlr	r3
+
+	ld	r0, 40(r1)
+	mtctr	r0
+	ld	r10, 104(r1)
+	ld	r9, 96(r1)
+	ld	r8, 88(r1)
+	ld	r7, 80(r1)
+	ld	r6, 72(r1)
+	ld	r5, 64(r1)
+	ld	r4, 56(r1)
+	ld	r3, 48(r1)
+
+	/* Restore callee's TOC */
+	ld	r2, 24(r1)
+
+	addi	r1, r1, 112
+	mflr	r0
+	std	r0, LRSAVE(r1)
+	bctr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+68
arch/powerpc/kernel/trace/ftrace_64_pg.S
@@
+/*
+ * Split from ftrace_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL_TOC(ftrace_caller)
+	/* Taken from output of objdump from lib64/glibc */
+	mflr	r3
+	ld	r11, 0(r1)
+	stdu	r1, -112(r1)
+	std	r3, 128(r1)
+	ld	r4, 16(r11)
+	subi	r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+	bl	ftrace_stub
+	nop
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+	b	ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+
+_GLOBAL(ftrace_stub)
+	blr
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+	/* load r4 with local address */
+	ld	r4, 128(r1)
+	subi	r4, r4, MCOUNT_INSN_SIZE
+
+	/* Grab the LR out of the caller stack frame */
+	ld	r11, 112(r1)
+	ld	r3, 16(r11)
+
+	bl	prepare_ftrace_return
+	nop
+
+	/*
+	 * prepare_ftrace_return gives us the address we divert to.
+	 * Change the LR in the callers stack frame to this.
+	 */
+	ld	r11, 112(r1)
+	std	r3, 16(r11)
+
+	ld	r0, 128(r1)
+	mtlr	r0
+	addi	r1, r1, 112
+	blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
arch/powerpc/kernel/trace_clock.c → arch/powerpc/kernel/trace/trace_clock.c
+23 -4
arch/powerpc/kernel/traps.c
@@
 #include <linux/backlight.h>
 #include <linux/bug.h>
 #include <linux/kdebug.h>
-#include <linux/debugfs.h>
 #include <linux/ratelimit.h>
 #include <linux/context_tracking.h>
 
 #include <asm/emulated_ops.h>
 #include <asm/pgtable.h>
 #include <linux/uaccess.h>
+#include <asm/debugfs.h>
 #include <asm/io.h>
 #include <asm/machdep.h>
 #include <asm/rtas.h>
@@
 
 void system_reset_exception(struct pt_regs *regs)
 {
+	/*
+	 * Avoid crashes in case of nested NMI exceptions. Recoverability
+	 * is determined by RI and in_nmi
+	 */
+	bool nested = in_nmi();
+	if (!nested)
+		nmi_enter();
+
 	/* See if any machine dependent calls */
 	if (ppc_md.system_reset_exception) {
 		if (ppc_md.system_reset_exception(regs))
-			return;
+			goto out;
 	}
 
 	die("System Reset", regs, SIGABRT);
 
+out:
+#ifdef CONFIG_PPC_BOOK3S_64
+	BUG_ON(get_paca()->in_nmi == 0);
+	if (get_paca()->in_nmi > 1)
+		panic("Unrecoverable nested System Reset");
+#endif
 	/* Must die if the interrupt is not recoverable */
 	if (!(regs->msr & MSR_RI))
 		panic("Unrecoverable System Reset");
+
+	if (!nested)
+		nmi_exit();
 
 	/* What should we do here? We could issue a shutdown or hard reset. */
 }
@@
 	long handled = 0;
 
 	__this_cpu_inc(irq_stat.mce_exceptions);
-
-	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
 	if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
 		handled = cur_cpu_spec->machine_check_early(regs);
@@
 	int recover = 0;
 
 	__this_cpu_inc(irq_stat.mce_exceptions);
+
+	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
 	/* See if any machine dependent calls. In theory, we would want
 	 * to call the CPU first, and call the ppc_md. one if the CPU
@@
 		[FSCR_TM_LG] = "TM",
 		[FSCR_EBB_LG] = "EBB",
 		[FSCR_TAR_LG] = "TAR",
+		[FSCR_MSGP_LG] = "MSGP",
+		[FSCR_SCV_LG] = "SCV",
 	};
 	char *facility = "unknown";
 	u64 value;
+2
arch/powerpc/kernel/vmlinux.lds.S
@@
 #endif
 } :kernel
 
+__head_end = .;
+
 /*
  * If the build dies here, it's likely code in head_64.S is referencing
  * labels it can't reach, and the linker inserting stubs without the
+4 -4
arch/powerpc/kvm/book3s.c
@@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/miscdevice.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@
 #include <asm/kvm_book3s.h>
 #include <asm/mmu_context.h>
 #include <asm/page.h>
-#include <linux/gfp.h>
-#include <linux/sched.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
 
 #include "book3s.h"
 #include "trace.h"
+8 -2
arch/powerpc/kvm/book3s_64_mmu_host.c
@@
 
 static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
 {
+	unsigned long vsid_bits = VSID_BITS_65_256M;
 	struct kvmppc_sid_map *map;
 	struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
 	u16 sid_map_mask;
@@
 		kvmppc_mmu_pte_flush(vcpu, 0, 0);
 		kvmppc_mmu_flush_segments(vcpu);
 	}
-	map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, 256M);
+
+	if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+		vsid_bits = VSID_BITS_256M;
+
+	map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++,
+				       VSID_MULTIPLIER_256M, vsid_bits);
 
 	map->guest_vsid = gvsid;
 	map->valid = true;
@@
 	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	int err;
 
-	err = __init_new_context();
+	err = hash__alloc_context_id();
 	if (err < 0)
 		return -1;
 	vcpu3s->context_id[0] = err;
+9 -9
arch/powerpc/kvm/book3s_hv.c
@@
 #include <linux/srcu.h>
 #include <linux/miscdevice.h>
 #include <linux/debugfs.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/of.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@
 #include <asm/mmu.h>
 #include <asm/opal.h>
 #include <asm/xics.h>
-#include <linux/gfp.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/hugetlb.h>
-#include <linux/kvm_irqfd.h>
-#include <linux/irqbypass.h>
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/of.h>
 
 #include "book3s.h"
 
+16 -13
arch/powerpc/kvm/book3s_hv_builtin.c
@@
 #include <asm/kvm_book3s.h>
 #include <asm/archrandom.h>
 #include <asm/xics.h>
+#include <asm/xive.h>
 #include <asm/dbell.h>
 #include <asm/cputhreads.h>
 #include <asm/io.h>
@@
 	return H_HARDWARE;
 }
 
-static inline void rm_writeb(unsigned long paddr, u8 val)
-{
-	__asm__ __volatile__("stbcix %0,0,%1"
-		: : "r" (val), "r" (paddr) : "memory");
-}
-
 /*
  * Send an interrupt or message to another CPU.
  * The caller needs to include any barrier needed to order writes
@@
  */
 void kvmhv_rm_send_ipi(int cpu)
 {
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 
 	/* On POWER9 we can use msgsnd for any destination cpu. */
@@
 		return;
 	}
 
+	/* We should never reach this */
+	if (WARN_ON_ONCE(xive_enabled()))
+		return;
+
 	/* Else poke the target with an IPI */
 	xics_phys = paca[cpu].kvm_hstate.xics_phys;
 	if (xics_phys)
-		rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+		__raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
 	else
 		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
 }
@@
 	long rc;
 	bool again;
 
+	if (xive_enabled())
+		return 1;
+
 	do {
 		again = false;
 		rc = kvmppc_read_one_intr(&again);
@@
 
 static long kvmppc_read_one_intr(bool *again)
 {
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 	u32 h_xirr;
 	__be32 xirr;
 	u32 xisr;
@@
 	if (!xics_phys)
 		rc = opal_int_get_xirr(&xirr, false);
 	else
-		xirr = _lwzcix(xics_phys + XICS_XIRR);
+		xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
 	if (rc < 0)
 		return 1;
 
@@
 	if (xisr == XICS_IPI) {
 		rc = 0;
 		if (xics_phys) {
-			_stbcix(xics_phys + XICS_MFRR, 0xff);
-			_stwcix(xics_phys + XICS_XIRR, xirr);
+			__raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
+			__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
 		} else {
 			opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
 			rc = opal_int_eoi(h_xirr);
@@
 	 * we need to resend that IPI, bummer
 	 */
 	if (xics_phys)
-		_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+		__raw_rm_writeb(IPI_PRIORITY,
+				xics_phys + XICS_MFRR);
 	else
 		opal_int_set_mfrr(hard_smp_processor_id(),
 				  IPI_PRIORITY);
+2 -3
arch/powerpc/kvm/book3s_hv_rm_xics.c
@@
 #include <asm/kvm_ppc.h>
 #include <asm/hvcall.h>
 #include <asm/xics.h>
-#include <asm/debug.h>
 #include <asm/synch.h>
 #include <asm/cputhreads.h>
 #include <asm/pgtable.h>
@@
 
 static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
 {
-	unsigned long xics_phys;
+	void __iomem *xics_phys;
 	int64_t rc;
 
 	rc = pnv_opal_pci_msi_eoi(c, hwirq);
@@
 	/* EOI it */
 	xics_phys = local_paca->kvm_hstate.xics_phys;
 	if (xics_phys) {
-		_stwcix(xics_phys + XICS_XIRR, xirr);
+		__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
 	} else {
 		rc = opal_int_eoi(be32_to_cpu(xirr));
 		*again = rc > 0;
+2 -3
arch/powerpc/kvm/book3s_xics.c
@@
 #include <asm/kvm_ppc.h>
 #include <asm/hvcall.h>
 #include <asm/xics.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
 #include <asm/time.h>
 
-#include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
 #include "book3s_xics.h"
@@
 	return xics->ics[icsid];
 }
 
-int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
+static int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
 {
 	struct kvmppc_icp *icp;
 
+3 -1
arch/powerpc/lib/code-patching.c
@@
  */
 
 #include <linux/kernel.h>
+#include <linux/kprobes.h>
 #include <linux/vmalloc.h>
 #include <linux/init.h>
 #include <linux/mm.h>
@@
  * Helper to check if a given instruction is a conditional branch
  * Derived from the conditional checks in analyse_instr()
  */
-bool __kprobes is_conditional_branch(unsigned int instr)
+bool is_conditional_branch(unsigned int instr)
 {
 	unsigned int opcode = instr >> 26;
 
@@
 	}
 	return false;
 }
+NOKPROBE_SYMBOL(is_conditional_branch);
 
 unsigned int create_branch(const unsigned int *addr,
 			   unsigned long target, int flags)
+44 -38
arch/powerpc/lib/sstep.c
@@
 /*
  * Emulate the truncation of 64 bit values in 32-bit mode.
  */
-static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val)
+static nokprobe_inline unsigned long truncate_if_32bit(unsigned long msr,
+						       unsigned long val)
 {
 #ifdef __powerpc64__
 	if ((msr & MSR_64BIT) == 0)
@@
 /*
  * Determine whether a conditional branch instruction would branch.
  */
-static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline int branch_taken(unsigned int instr, struct pt_regs *regs)
 {
 	unsigned int bo = (instr >> 21) & 0x1f;
 	unsigned int bi;
@@
 	return 1;
 }
 
-
-static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb)
+static nokprobe_inline long address_ok(struct pt_regs *regs, unsigned long ea, int nb)
 {
 	if (!user_mode(regs))
 		return 1;
@@
 /*
  * Calculate effective address for a D-form instruction
  */
-static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline unsigned long dform_ea(unsigned int instr, struct pt_regs *regs)
 {
 	int ra;
 	unsigned long ea;
@@
 /*
  * Calculate effective address for a DS-form instruction
  */
-static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline unsigned long dsform_ea(unsigned int instr, struct pt_regs *regs)
 {
 	int ra;
 	unsigned long ea;
@@
 /*
  * Calculate effective address for an X-form instruction
  */
-static unsigned long __kprobes xform_ea(unsigned int instr,
-					struct pt_regs *regs)
+static nokprobe_inline unsigned long xform_ea(unsigned int instr,
+					      struct pt_regs *regs)
 {
 	int ra, rb;
 	unsigned long ea;
@@
  * Return the largest power of 2, not greater than sizeof(unsigned long),
  * such that x is a multiple of it.
  */
-static inline unsigned long max_align(unsigned long x)
+static nokprobe_inline unsigned long max_align(unsigned long x)
 {
 	x |= sizeof(unsigned long);
 	return x & -x;		/* isolates rightmost bit */
 }
 
 
-static inline unsigned long byterev_2(unsigned long x)
+static nokprobe_inline unsigned long byterev_2(unsigned long x)
 {
 	return ((x >> 8) & 0xff) | ((x & 0xff) << 8);
 }
 
-static inline unsigned long byterev_4(unsigned long x)
+static nokprobe_inline unsigned long byterev_4(unsigned long x)
 {
 	return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) |
 		((x & 0xff00) << 8) | ((x & 0xff) << 24);
 }
 
 #ifdef __powerpc64__
-static inline unsigned long byterev_8(unsigned long x)
+static nokprobe_inline unsigned long byterev_8(unsigned long x)
 {
 	return (byterev_4(x) << 32) | byterev_4(x >> 32);
 }
 #endif
 
-static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea,
-				      int nb)
+static nokprobe_inline int read_mem_aligned(unsigned long *dest,
+					    unsigned long ea, int nb)
 {
 	int err = 0;
 	unsigned long x = 0;
@@
 	return err;
 }
 
-static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea,
-					int nb, struct pt_regs *regs)
+static nokprobe_inline int read_mem_unaligned(unsigned long *dest,
+				unsigned long ea, int nb, struct pt_regs *regs)
 {
 	int err;
 	unsigned long x, b, c;
@@
 /*
  * Read memory at address ea for nb bytes, return 0 for success
  * or -EFAULT if an error occurred.
  */
-static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb,
+static int read_mem(unsigned long *dest, unsigned long ea, int nb,
 			      struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea, nb))
@@
 		return read_mem_aligned(dest, ea, nb);
 	return read_mem_unaligned(dest, ea, nb, regs);
 }
+NOKPROBE_SYMBOL(read_mem);
 
-static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea,
-				       int nb)
+static nokprobe_inline int write_mem_aligned(unsigned long val,
+					     unsigned long ea, int nb)
 {
 	int err = 0;
 
@@
 	return err;
 }
 
-static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea,
-					 int nb, struct pt_regs *regs)
+static nokprobe_inline int write_mem_unaligned(unsigned long val,
+				unsigned long ea, int nb, struct pt_regs *regs)
 {
 	int err;
 	unsigned long c;
@@
 /*
  * Write memory at address ea for nb bytes, return 0 for success
  * or -EFAULT if an error occurred.
  */
-static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb,
+static int write_mem(unsigned long val, unsigned long ea, int nb,
 			       struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea, nb))
@@
 		return write_mem_aligned(val, ea, nb);
 	return write_mem_unaligned(val, ea, nb, regs);
 }
+NOKPROBE_SYMBOL(write_mem);
 
 #ifdef CONFIG_PPC_FPU
 /*
  * Check the address and alignment, and call func to do the actual
  * load or store.
  */
-static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long),
+static int do_fp_load(int rn, int (*func)(int, unsigned long),
 				unsigned long ea, int nb,
 				struct pt_regs *regs)
 {
@@
 		return err;
 	return (*func)(rn, ptr);
 }
+NOKPROBE_SYMBOL(do_fp_load);
 
-static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long),
+static int do_fp_store(int rn, int (*func)(int, unsigned long),
 				 unsigned long ea, int nb,
 				 struct pt_regs *regs)
 {
@@
 	}
 	return err;
 }
+NOKPROBE_SYMBOL(do_fp_store);
 #endif
 
 #ifdef CONFIG_ALTIVEC
 /* For Altivec/VMX, no need to worry about alignment */
-static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vec_load(int rn, int (*func)(int, unsigned long),
 				 unsigned long ea, struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea & ~0xfUL, 16))
@@
 	return (*func)(rn, ea);
 }
 
-static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vec_store(int rn, int (*func)(int, unsigned long),
 				  unsigned long ea, struct pt_regs *regs)
 {
 	if (!address_ok(regs, ea & ~0xfUL, 16))
@@
 #endif /* CONFIG_ALTIVEC */
 
 #ifdef CONFIG_VSX
-static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vsx_load(int rn, int (*func)(int, unsigned long),
 				 unsigned long ea, struct pt_regs *regs)
 {
 	int err;
@@
 	return err;
 }
 
-static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vsx_store(int rn, int (*func)(int, unsigned long),
 				  unsigned long ea, struct pt_regs *regs)
 {
 	int err;
@@
 		: "=r" (err)			\
 		: "r" (addr), "i" (-EFAULT), "0" (err))
 
-static void __kprobes set_cr0(struct pt_regs *regs, int rd)
+static nokprobe_inline void set_cr0(struct pt_regs *regs, int rd)
 {
 	long val = regs->gpr[rd];
 
@@
 		regs->ccr |= 0x20000000;
 }
 
-static void __kprobes add_with_carry(struct pt_regs *regs, int rd,
+static nokprobe_inline void add_with_carry(struct pt_regs *regs, int rd,
 				     unsigned long val1, unsigned long val2,
 				     unsigned long carry_in)
 {
@@
 		regs->xer &= ~XER_CA;
 }
 
-static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2,
+static nokprobe_inline void do_cmp_signed(struct pt_regs *regs, long v1, long v2,
 				    int crfld)
 {
 	unsigned int crval, shift;
@@
 	regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
 }
 
-static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
+static nokprobe_inline void do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
 				      unsigned long v2, int crfld)
 {
 	unsigned int crval, shift;
@@
 	regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
 }
 
-static int __kprobes trap_compare(long v1, long v2)
+static nokprobe_inline int trap_compare(long v1, long v2)
 {
 	int ret = 0;
 
@@
  * Returns 1 if the instruction has been executed, or 0 if not.
  * Sets *op to indicate what the instruction does.
  */
-int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs,
+int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
 			    unsigned int instr)
 {
 	unsigned int opcode, ra, rb, rd, spr, u;
@@
 #endif
 }
 EXPORT_SYMBOL_GPL(analyse_instr);
+NOKPROBE_SYMBOL(analyse_instr);
 
 /*
  * For PPC32 we always use stwu with r1 to change the stack pointer.
@@
  * don't emulate the real store operation. We will do real store
  * operation safely in exception return code by checking this flag.
  */
-static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs)
+static nokprobe_inline int handle_stack_update(unsigned long ea, struct pt_regs *regs)
 {
 #ifdef CONFIG_PPC32
 	/*
@@
 	return 0;
 }
 
-static __kprobes void do_signext(unsigned long *valp, int size)
+static nokprobe_inline void do_signext(unsigned long *valp, int size)
 {
 	switch (size) {
 	case 2:
@@
 	}
 }
 
-static __kprobes void do_byterev(unsigned long *valp, int size)
+static nokprobe_inline void do_byterev(unsigned long *valp, int size)
 {
 	switch (size) {
 	case 2:
@@
  * or -1 if the instruction is one that should not be stepped,
  * such as an rfid, or a mtmsrd that would clear MSR_RI.
  */
-int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
+int emulate_step(struct pt_regs *regs, unsigned int instr)
 {
 	struct instruction_op op;
 	int r, err, size;
@@
 	regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4);
 	return 1;
 }
+NOKPROBE_SYMBOL(emulate_step);
+1 -1
arch/powerpc/mm/dump_hashpagetable.c
··· 468 468 unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift; 469 469 470 470 for (addr = PAGE_OFFSET; addr < PAGE_OFFSET + 471 - memblock_phys_mem_size(); addr += psize) 471 + memblock_end_of_DRAM(); addr += psize) 472 472 hpte_find(st, addr, mmu_linear_psize); 473 473 } 474 474
+92 -14
arch/powerpc/mm/dump_linuxpagetables.c
··· 26 26 #include <asm/page.h> 27 27 #include <asm/pgalloc.h> 28 28 29 + #ifdef CONFIG_PPC32 30 + #define KERN_VIRT_START 0 31 + #endif 32 + 29 33 /* 30 34 * To visualise what is happening, 31 35 * ··· 60 56 struct seq_file *seq; 61 57 const struct addr_marker *marker; 62 58 unsigned long start_address; 59 + unsigned long start_pa; 60 + unsigned long last_pa; 63 61 unsigned int level; 64 62 u64 current_flags; 65 63 }; ··· 75 69 { 0, "Start of kernel VM" }, 76 70 { 0, "vmalloc() Area" }, 77 71 { 0, "vmalloc() End" }, 72 + #ifdef CONFIG_PPC64 78 73 { 0, "isa I/O start" }, 79 74 { 0, "isa I/O end" }, 80 75 { 0, "phb I/O start" }, ··· 83 76 { 0, "I/O remap start" }, 84 77 { 0, "I/O remap end" }, 85 78 { 0, "vmemmap start" }, 79 + #else 80 + { 0, "Early I/O remap start" }, 81 + { 0, "Early I/O remap end" }, 82 + #ifdef CONFIG_NOT_COHERENT_CACHE 83 + { 0, "Consistent mem start" }, 84 + { 0, "Consistent mem end" }, 85 + #endif 86 + #ifdef CONFIG_HIGHMEM 87 + { 0, "Highmem PTEs start" }, 88 + { 0, "Highmem PTEs end" }, 89 + #endif 90 + { 0, "Fixmap start" }, 91 + { 0, "Fixmap end" }, 92 + #endif 86 93 { -1, NULL }, 87 94 }; 88 95 ··· 121 100 .set = "user", 122 101 .clear = " ", 123 102 }, { 103 + #if _PAGE_RO == 0 124 104 .mask = _PAGE_RW, 125 105 .val = _PAGE_RW, 106 + #else 107 + .mask = _PAGE_RO, 108 + .val = 0, 109 + #endif 126 110 .set = "rw", 127 111 .clear = "ro", 128 112 }, { ··· 180 154 .clear = " ", 181 155 }, { 182 156 #endif 157 + #ifndef CONFIG_PPC_BOOK3S_64 183 158 .mask = _PAGE_NO_CACHE, 184 159 .val = _PAGE_NO_CACHE, 185 160 .set = "no cache", 186 161 .clear = " ", 187 162 }, { 163 + #else 164 + .mask = _PAGE_NON_IDEMPOTENT, 165 + .val = _PAGE_NON_IDEMPOTENT, 166 + .set = "non-idempotent", 167 + .clear = " ", 168 + }, { 169 + .mask = _PAGE_TOLERANT, 170 + .val = _PAGE_TOLERANT, 171 + .set = "tolerant", 172 + .clear = " ", 173 + }, { 174 + #endif 188 175 #ifdef CONFIG_PPC_BOOK3S_64 189 176 .mask = H_PAGE_BUSY, 190 177 .val = H_PAGE_BUSY, ··· 227 188 .mask = 
_PAGE_SPECIAL, 228 189 .val = _PAGE_SPECIAL, 229 190 .set = "special", 191 + }, { 192 + .mask = _PAGE_SHARED, 193 + .val = _PAGE_SHARED, 194 + .set = "shared", 230 195 } 231 196 }; 232 197 ··· 295 252 const char *unit = units; 296 253 unsigned long delta; 297 254 298 - seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); 255 + #ifdef CONFIG_PPC64 256 + seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); 257 + seq_printf(st->seq, "0x%016lx ", st->start_pa); 258 + #else 259 + seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1); 260 + seq_printf(st->seq, "0x%08lx ", st->start_pa); 261 + #endif 262 + 299 263 delta = (addr - st->start_address) >> 10; 300 264 /* Work out what appropriate unit to use */ 301 265 while (!(delta & 1023) && unit[1]) { ··· 317 267 unsigned int level, u64 val) 318 268 { 319 269 u64 flag = val & pg_level[level].mask; 270 + u64 pa = val & PTE_RPN_MASK; 271 + 320 272 /* At first no level is set */ 321 273 if (!st->level) { 322 274 st->level = level; 323 275 st->current_flags = flag; 324 276 st->start_address = addr; 277 + st->start_pa = pa; 278 + st->last_pa = pa; 325 279 seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); 326 280 /* 327 281 * Dump the section of virtual memory when: ··· 333 279 * - we change levels in the tree. 334 280 * - the address is in a different section of memory and is thus 335 281 * used for a different purpose, regardless of the flags. 
282 + * - the pa of this page is not adjacent to the last inspected page 336 283 */ 337 284 } else if (flag != st->current_flags || level != st->level || 338 - addr >= st->marker[1].start_address) { 285 + addr >= st->marker[1].start_address || 286 + pa != st->last_pa + PAGE_SIZE) { 339 287 340 288 /* Check the PTE flags */ 341 289 if (st->current_flags) { ··· 361 305 seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); 362 306 } 363 307 st->start_address = addr; 308 + st->start_pa = pa; 309 + st->last_pa = pa; 364 310 st->current_flags = flag; 365 311 st->level = level; 312 + } else { 313 + st->last_pa = pa; 366 314 } 367 315 } 368 316 ··· 437 377 438 378 static void populate_markers(void) 439 379 { 440 - address_markers[0].start_address = PAGE_OFFSET; 441 - address_markers[1].start_address = VMALLOC_START; 442 - address_markers[2].start_address = VMALLOC_END; 443 - address_markers[3].start_address = ISA_IO_BASE; 444 - address_markers[4].start_address = ISA_IO_END; 445 - address_markers[5].start_address = PHB_IO_BASE; 446 - address_markers[6].start_address = PHB_IO_END; 447 - address_markers[7].start_address = IOREMAP_BASE; 448 - address_markers[8].start_address = IOREMAP_END; 380 + int i = 0; 381 + 382 + address_markers[i++].start_address = PAGE_OFFSET; 383 + address_markers[i++].start_address = VMALLOC_START; 384 + address_markers[i++].start_address = VMALLOC_END; 385 + #ifdef CONFIG_PPC64 386 + address_markers[i++].start_address = ISA_IO_BASE; 387 + address_markers[i++].start_address = ISA_IO_END; 388 + address_markers[i++].start_address = PHB_IO_BASE; 389 + address_markers[i++].start_address = PHB_IO_END; 390 + address_markers[i++].start_address = IOREMAP_BASE; 391 + address_markers[i++].start_address = IOREMAP_END; 449 392 #ifdef CONFIG_PPC_STD_MMU_64 450 - address_markers[9].start_address = H_VMEMMAP_BASE; 393 + address_markers[i++].start_address = H_VMEMMAP_BASE; 451 394 #else 452 - address_markers[9].start_address = VMEMMAP_BASE; 395 + 
address_markers[i++].start_address = VMEMMAP_BASE; 453 396 #endif 397 + #else /* !CONFIG_PPC64 */ 398 + address_markers[i++].start_address = ioremap_bot; 399 + address_markers[i++].start_address = IOREMAP_TOP; 400 + #ifdef CONFIG_NOT_COHERENT_CACHE 401 + address_markers[i++].start_address = IOREMAP_TOP; 402 + address_markers[i++].start_address = IOREMAP_TOP + 403 + CONFIG_CONSISTENT_SIZE; 404 + #endif 405 + #ifdef CONFIG_HIGHMEM 406 + address_markers[i++].start_address = PKMAP_BASE; 407 + address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP); 408 + #endif 409 + address_markers[i++].start_address = FIXADDR_START; 410 + address_markers[i++].start_address = FIXADDR_TOP; 411 + #endif /* CONFIG_PPC64 */ 454 412 } 455 413 456 414 static int ptdump_show(struct seq_file *m, void *v) ··· 513 435 514 436 populate_markers(); 515 437 build_pgtable_complete_mask(); 516 - debugfs_file = debugfs_create_file("kernel_pagetables", 0400, NULL, 438 + debugfs_file = debugfs_create_file("kernel_page_tables", 0400, NULL, 517 439 NULL, &ptdump_fops); 518 440 return debugfs_file ? 0 : -ENOMEM; 519 441 }
+40 -42
arch/powerpc/mm/fault.c
··· 120 120 siginfo_t info; 121 121 unsigned int lsb = 0; 122 122 123 - up_read(&current->mm->mmap_sem); 124 - 125 123 if (!user_mode(regs)) 126 124 return MM_FAULT_ERR(SIGBUS); 127 125 ··· 152 154 * continue the pagefault. 153 155 */ 154 156 if (fatal_signal_pending(current)) { 155 - /* 156 - * If we have retry set, the mmap semaphore will have 157 - * alrady been released in __lock_page_or_retry(). Else 158 - * we release it now. 159 - */ 160 - if (!(fault & VM_FAULT_RETRY)) 161 - up_read(&current->mm->mmap_sem); 162 157 /* Coming from kernel, we need to deal with uaccess fixups */ 163 158 if (user_mode(regs)) 164 159 return MM_FAULT_RETURN; ··· 164 173 165 174 /* Out of memory */ 166 175 if (fault & VM_FAULT_OOM) { 167 - up_read(&current->mm->mmap_sem); 168 - 169 176 /* 170 177 * We ran out of memory, or some other thing happened to us that 171 178 * made us unable to handle the page fault gracefully. ··· 287 298 * can result in fault, which will cause a deadlock when called with 288 299 * mmap_sem held 289 300 */ 290 - if (user_mode(regs)) 301 + if (!is_exec && user_mode(regs)) 291 302 store_update_sp = store_updates_sp(regs); 292 303 293 304 if (user_mode(regs)) ··· 447 458 * the fault. 448 459 */ 449 460 fault = handle_mm_fault(vma, address, flags); 461 + 462 + /* 463 + * Handle the retry right now, the mmap_sem has been released in that 464 + * case. 465 + */ 466 + if (unlikely(fault & VM_FAULT_RETRY)) { 467 + /* We retry only once */ 468 + if (flags & FAULT_FLAG_ALLOW_RETRY) { 469 + /* 470 + * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk 471 + * of starvation. 
472 + */ 473 + flags &= ~FAULT_FLAG_ALLOW_RETRY; 474 + flags |= FAULT_FLAG_TRIED; 475 + if (!fatal_signal_pending(current)) 476 + goto retry; 477 + } 478 + /* We will enter mm_fault_error() below */ 479 + } else 480 + up_read(&current->mm->mmap_sem); 481 + 450 482 if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { 451 483 if (fault & VM_FAULT_SIGSEGV) 452 - goto bad_area; 484 + goto bad_area_nosemaphore; 453 485 rc = mm_fault_error(regs, address, fault); 454 486 if (rc >= MM_FAULT_RETURN) 455 487 goto bail; ··· 479 469 } 480 470 481 471 /* 482 - * Major/minor page fault accounting is only done on the 483 - * initial attempt. If we go through a retry, it is extremely 484 - * likely that the page will be found in page cache at that point. 472 + * Major/minor page fault accounting. 485 473 */ 486 - if (flags & FAULT_FLAG_ALLOW_RETRY) { 487 - if (fault & VM_FAULT_MAJOR) { 488 - current->maj_flt++; 489 - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 490 - regs, address); 474 + if (fault & VM_FAULT_MAJOR) { 475 + current->maj_flt++; 476 + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 477 + regs, address); 491 478 #ifdef CONFIG_PPC_SMLPAR 492 - if (firmware_has_feature(FW_FEATURE_CMO)) { 493 - u32 page_ins; 479 + if (firmware_has_feature(FW_FEATURE_CMO)) { 480 + u32 page_ins; 494 481 495 - preempt_disable(); 496 - page_ins = be32_to_cpu(get_lppaca()->page_ins); 497 - page_ins += 1 << PAGE_FACTOR; 498 - get_lppaca()->page_ins = cpu_to_be32(page_ins); 499 - preempt_enable(); 500 - } 482 + preempt_disable(); 483 + page_ins = be32_to_cpu(get_lppaca()->page_ins); 484 + page_ins += 1 << PAGE_FACTOR; 485 + get_lppaca()->page_ins = cpu_to_be32(page_ins); 486 + preempt_enable(); 487 + } 501 488 #endif /* CONFIG_PPC_SMLPAR */ 502 - } else { 503 - current->min_flt++; 504 - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 505 - regs, address); 506 - } 507 - if (fault & VM_FAULT_RETRY) { 508 - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk 509 - * of starvation. 
*/ 510 - flags &= ~FAULT_FLAG_ALLOW_RETRY; 511 - flags |= FAULT_FLAG_TRIED; 512 - goto retry; 513 - } 489 + } else { 490 + current->min_flt++; 491 + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 492 + regs, address); 514 493 } 515 494 516 - up_read(&mm->mmap_sem); 517 495 goto bail; 518 496 519 497 bad_area:
-2
arch/powerpc/mm/hash_low_32.S
··· 31 31 #ifdef CONFIG_SMP 32 32 .section .bss 33 33 .align 2 34 - .globl mmu_hash_lock 35 34 mmu_hash_lock: 36 35 .space 4 37 - EXPORT_SYMBOL(mmu_hash_lock) 38 36 #endif /* CONFIG_SMP */ 39 37 40 38 /*
+16 -10
arch/powerpc/mm/hash_utils_64.c
··· 35 35 #include <linux/memblock.h> 36 36 #include <linux/context_tracking.h> 37 37 #include <linux/libfdt.h> 38 - #include <linux/debugfs.h> 39 38 40 - #include <asm/debug.h> 39 + #include <asm/debugfs.h> 41 40 #include <asm/processor.h> 42 41 #include <asm/pgtable.h> 43 42 #include <asm/mmu.h> ··· 926 927 } 927 928 #endif /* CONFIG_DEBUG_PAGEALLOC */ 928 929 929 - /* On U3 based machines, we need to reserve the DART area and 930 - * _NOT_ map it to avoid cache paradoxes as it's remapped non 931 - * cacheable later on 932 - */ 933 - 934 930 /* create bolted the linear mapping in the hash table */ 935 931 for_each_memblock(memory, reg) { 936 932 base = (unsigned long)__va(reg->base); ··· 975 981 976 982 void __init hash__early_init_mmu(void) 977 983 { 984 + /* 985 + * We have code in __hash_page_64K() and elsewhere, which assumes it can 986 + * do the following: 987 + * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); 988 + * 989 + * Where the slot number is between 0-15, and values of 8-15 indicate 990 + * the secondary bucket. For that code to work H_PAGE_F_SECOND and 991 + * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and 992 + * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here 993 + * with a BUILD_BUG_ON(). 
994 + */ 995 + BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3))); 996 + 978 997 htab_init_page_sizes(); 979 998 980 999 /* ··· 1127 1120 copro_flush_all_slbs(mm); 1128 1121 if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { 1129 1122 1130 - copy_mm_to_paca(&mm->context); 1123 + copy_mm_to_paca(mm); 1131 1124 slb_flush_and_rebolt(); 1132 1125 } 1133 1126 } ··· 1199 1192 { 1200 1193 if (user_region) { 1201 1194 if (psize != get_paca_psize(ea)) { 1202 - copy_mm_to_paca(&mm->context); 1195 + copy_mm_to_paca(mm); 1203 1196 slb_flush_and_rebolt(); 1204 1197 } 1205 1198 } else if (get_paca()->vmalloc_sllp != ··· 1862 1855 return 0; 1863 1856 } 1864 1857 machine_device_initcall(pseries, hash64_debugfs); 1865 - 1866 1858 #endif /* CONFIG_DEBUG_FS */
-7
arch/powerpc/mm/hugetlbpage-book3e.c
··· 148 148 149 149 mm = vma->vm_mm; 150 150 151 - #ifdef CONFIG_PPC_MM_SLICES 152 - psize = get_slice_psize(mm, ea); 153 - tsize = mmu_get_tsize(psize); 154 - shift = mmu_psize_defs[psize].shift; 155 - #else 156 151 psize = vma_mmu_pagesize(vma); 157 152 shift = __ilog2(psize); 158 153 tsize = shift - 10; 159 - #endif 160 - 161 154 /* 162 155 * We can't be interrupted while we're setting up the MAS 163 156 * regusters or after we've confirmed that no tlb exists.
+9 -2
arch/powerpc/mm/hugetlbpage-radix.c
··· 50 50 struct hstate *h = hstate_file(file); 51 51 struct vm_unmapped_area_info info; 52 52 53 + if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE)) 54 + mm->context.addr_limit = TASK_SIZE; 55 + 53 56 if (len & ~huge_page_mask(h)) 54 57 return -EINVAL; 55 - if (len > TASK_SIZE) 58 + if (len > mm->task_size) 56 59 return -ENOMEM; 57 60 58 61 if (flags & MAP_FIXED) { ··· 67 64 if (addr) { 68 65 addr = ALIGN(addr, huge_page_size(h)); 69 66 vma = find_vma(mm, addr); 70 - if (TASK_SIZE - len >= addr && 67 + if (mm->task_size - len >= addr && 71 68 (!vma || addr + len <= vma->vm_start)) 72 69 return addr; 73 70 } ··· 81 78 info.high_limit = current->mm->mmap_base; 82 79 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 83 80 info.align_offset = 0; 81 + 82 + if (addr > DEFAULT_MAP_WINDOW) 83 + info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW; 84 + 84 85 return vm_unmapped_area(&info); 85 86 }
+18
arch/powerpc/mm/hugetlbpage.c
··· 753 753 if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) 754 754 return -EINVAL; 755 755 756 + #ifdef CONFIG_PPC_BOOK3S_64 757 + /* 758 + * We need to make sure that for different page sizes reported by 759 + * firmware we only add hugetlb support for page sizes that can be 760 + * supported by linux page table layout. 761 + * For now we have 762 + * Radix: 2M 763 + * Hash: 16M and 16G 764 + */ 765 + if (radix_enabled()) { 766 + if (mmu_psize != MMU_PAGE_2M) 767 + return -EINVAL; 768 + } else { 769 + if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) 770 + return -EINVAL; 771 + } 772 + #endif 773 + 756 774 BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); 757 775 758 776 /* Return if huge page size has already been setup */
-4
arch/powerpc/mm/init_64.c
··· 71 71 #if H_PGTABLE_RANGE > USER_VSID_RANGE 72 72 #warning Limited user VSID range means pagetable space is wasted 73 73 #endif 74 - 75 - #if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) 76 - #warning TASK_SIZE is smaller than it needs to be. 77 - #endif 78 74 #endif /* CONFIG_PPC_STD_MMU_64 */ 79 75 80 76 phys_addr_t memstart_addr = ~0;
+33 -20
arch/powerpc/mm/mmap.c
··· 59 59 60 60 unsigned long arch_mmap_rnd(void) 61 61 { 62 - unsigned long rnd; 62 + unsigned long shift, rnd; 63 63 64 - /* 8MB for 32bit, 1GB for 64bit */ 64 + shift = mmap_rnd_bits; 65 + #ifdef CONFIG_COMPAT 65 66 if (is_32bit_task()) 66 - rnd = get_random_long() % (1<<(23-PAGE_SHIFT)); 67 - else 68 - rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT)); 67 + shift = mmap_rnd_compat_bits; 68 + #endif 69 + rnd = get_random_long() % (1ul << shift); 69 70 70 71 return rnd << PAGE_SHIFT; 71 72 } ··· 80 79 else if (gap > MAX_GAP) 81 80 gap = MAX_GAP; 82 81 83 - return PAGE_ALIGN(TASK_SIZE - gap - rnd); 82 + return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd); 84 83 } 85 84 86 85 #ifdef CONFIG_PPC_RADIX_MMU ··· 98 97 struct vm_area_struct *vma; 99 98 struct vm_unmapped_area_info info; 100 99 101 - if (len > TASK_SIZE - mmap_min_addr) 100 + if (unlikely(addr > mm->context.addr_limit && 101 + mm->context.addr_limit != TASK_SIZE)) 102 + mm->context.addr_limit = TASK_SIZE; 103 + 104 + if (len > mm->task_size - mmap_min_addr) 102 105 return -ENOMEM; 103 106 104 107 if (flags & MAP_FIXED) ··· 111 106 if (addr) { 112 107 addr = PAGE_ALIGN(addr); 113 108 vma = find_vma(mm, addr); 114 - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 109 + if (mm->task_size - len >= addr && addr >= mmap_min_addr && 115 110 (!vma || addr + len <= vma->vm_start)) 116 111 return addr; 117 112 } ··· 119 114 info.flags = 0; 120 115 info.length = len; 121 116 info.low_limit = mm->mmap_base; 122 - info.high_limit = TASK_SIZE; 123 117 info.align_mask = 0; 118 + 119 + if (unlikely(addr > DEFAULT_MAP_WINDOW)) 120 + info.high_limit = mm->context.addr_limit; 121 + else 122 + info.high_limit = DEFAULT_MAP_WINDOW; 123 + 124 124 return vm_unmapped_area(&info); 125 125 } 126 126 ··· 141 131 unsigned long addr = addr0; 142 132 struct vm_unmapped_area_info info; 143 133 134 + if (unlikely(addr > mm->context.addr_limit && 135 + mm->context.addr_limit != TASK_SIZE)) 136 + mm->context.addr_limit = TASK_SIZE; 
137 + 144 138 /* requested length too big for entire address space */ 145 - if (len > TASK_SIZE - mmap_min_addr) 139 + if (len > mm->task_size - mmap_min_addr) 146 140 return -ENOMEM; 147 141 148 142 if (flags & MAP_FIXED) ··· 156 142 if (addr) { 157 143 addr = PAGE_ALIGN(addr); 158 144 vma = find_vma(mm, addr); 159 - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && 145 + if (mm->task_size - len >= addr && addr >= mmap_min_addr && 160 146 (!vma || addr + len <= vma->vm_start)) 161 147 return addr; 162 148 } ··· 166 152 info.low_limit = max(PAGE_SIZE, mmap_min_addr); 167 153 info.high_limit = mm->mmap_base; 168 154 info.align_mask = 0; 155 + 156 + if (addr > DEFAULT_MAP_WINDOW) 157 + info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW; 158 + 169 159 addr = vm_unmapped_area(&info); 160 + if (!(addr & ~PAGE_MASK)) 161 + return addr; 162 + VM_BUG_ON(addr != -ENOMEM); 170 163 171 164 /* 172 165 * A failed mmap() very likely causes application failure, ··· 181 160 * can happen with large stack limits and large mmap() 182 161 * allocations. 183 162 */ 184 - if (addr & ~PAGE_MASK) { 185 - VM_BUG_ON(addr != -ENOMEM); 186 - info.flags = 0; 187 - info.low_limit = TASK_UNMAPPED_BASE; 188 - info.high_limit = TASK_SIZE; 189 - addr = vm_unmapped_area(&info); 190 - } 191 - 192 - return addr; 163 + return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 193 164 } 194 165 195 166 static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
+86 -30
arch/powerpc/mm/mmu_context_book3s64.c
··· 30 30 static DEFINE_SPINLOCK(mmu_context_lock); 31 31 static DEFINE_IDA(mmu_context_ida); 32 32 33 - int __init_new_context(void) 33 + static int alloc_context_id(int min_id, int max_id) 34 34 { 35 - int index; 36 - int err; 35 + int index, err; 37 36 38 37 again: 39 38 if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) 40 39 return -ENOMEM; 41 40 42 41 spin_lock(&mmu_context_lock); 43 - err = ida_get_new_above(&mmu_context_ida, 1, &index); 42 + err = ida_get_new_above(&mmu_context_ida, min_id, &index); 44 43 spin_unlock(&mmu_context_lock); 45 44 46 45 if (err == -EAGAIN) ··· 47 48 else if (err) 48 49 return err; 49 50 50 - if (index > MAX_USER_CONTEXT) { 51 + if (index > max_id) { 51 52 spin_lock(&mmu_context_lock); 52 53 ida_remove(&mmu_context_ida, index); 53 54 spin_unlock(&mmu_context_lock); ··· 56 57 57 58 return index; 58 59 } 59 - EXPORT_SYMBOL_GPL(__init_new_context); 60 - static int radix__init_new_context(struct mm_struct *mm, int index) 60 + 61 + void hash__reserve_context_id(int id) 62 + { 63 + int rc, result = 0; 64 + 65 + do { 66 + if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) 67 + break; 68 + 69 + spin_lock(&mmu_context_lock); 70 + rc = ida_get_new_above(&mmu_context_ida, id, &result); 71 + spin_unlock(&mmu_context_lock); 72 + } while (rc == -EAGAIN); 73 + 74 + WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result); 75 + } 76 + 77 + int hash__alloc_context_id(void) 78 + { 79 + unsigned long max; 80 + 81 + if (mmu_has_feature(MMU_FTR_68_BIT_VA)) 82 + max = MAX_USER_CONTEXT; 83 + else 84 + max = MAX_USER_CONTEXT_65BIT_VA; 85 + 86 + return alloc_context_id(MIN_USER_CONTEXT, max); 87 + } 88 + EXPORT_SYMBOL_GPL(hash__alloc_context_id); 89 + 90 + static int hash__init_new_context(struct mm_struct *mm) 91 + { 92 + int index; 93 + 94 + index = hash__alloc_context_id(); 95 + if (index < 0) 96 + return index; 97 + 98 + /* 99 + * We do switch_slb() early in fork, even before we setup the 100 + * mm->context.addr_limit. 
Default to max task size so that we copy the 101 + * default values to paca which will help us to handle slb miss early. 102 + */ 103 + mm->context.addr_limit = TASK_SIZE_128TB; 104 + 105 + /* 106 + * The old code would re-promote on fork, we don't do that when using 107 + * slices as it could cause problem promoting slices that have been 108 + * forced down to 4K. 109 + * 110 + * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check 111 + * explicitly against context.id == 0. This ensures that we properly 112 + * initialize context slice details for newly allocated mm's (which will 113 + * have id == 0) and don't alter context slice inherited via fork (which 114 + * will have id != 0). 115 + * 116 + * We should not be calling init_new_context() on init_mm. Hence a 117 + * check against 0 is OK. 118 + */ 119 + if (mm->context.id == 0) 120 + slice_set_user_psize(mm, mmu_virtual_psize); 121 + 122 + subpage_prot_init_new_context(mm); 123 + 124 + return index; 125 + } 126 + 127 + static int radix__init_new_context(struct mm_struct *mm) 61 128 { 62 129 unsigned long rts_field; 130 + int index; 131 + 132 + index = alloc_context_id(1, PRTB_ENTRIES - 1); 133 + if (index < 0) 134 + return index; 63 135 64 136 /* 65 137 * set the process table entry, 66 138 */ 67 139 rts_field = radix__get_tree_size(); 68 140 process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); 69 - return 0; 141 + 142 + mm->context.npu_context = NULL; 143 + 144 + return index; 70 145 } 71 146 72 147 int init_new_context(struct task_struct *tsk, struct mm_struct *mm) 73 148 { 74 149 int index; 75 150 76 - index = __init_new_context(); 151 + if (radix_enabled()) 152 + index = radix__init_new_context(mm); 153 + else 154 + index = hash__init_new_context(mm); 155 + 77 156 if (index < 0) 78 157 return index; 79 158 80 - if (radix_enabled()) { 81 - radix__init_new_context(mm, index); 82 - } else { 83 - 84 - /* The old code would re-promote on fork, we don't do that 85 - * 
when using slices as it could cause problem promoting slices 86 - * that have been forced down to 4K 87 - * 88 - * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check 89 - * explicitly against context.id == 0. This ensures that we 90 - * properly initialize context slice details for newly allocated 91 - * mm's (which will have id == 0) and don't alter context slice 92 - * inherited via fork (which will have id != 0). 93 - * 94 - * We should not be calling init_new_context() on init_mm. Hence a 95 - * check against 0 is ok. 96 - */ 97 - if (mm->context.id == 0) 98 - slice_set_user_psize(mm, mmu_virtual_psize); 99 - subpage_prot_init_new_context(mm); 100 - } 101 159 mm->context.id = index; 102 160 #ifdef CONFIG_PPC_ICSWX 103 161 mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
+41 -2
arch/powerpc/mm/mmu_context_iommu.c
··· 81 81 gfp_t gfp_mask = GFP_USER; 82 82 struct page *new_page; 83 83 84 - if (PageHuge(page) || PageTransHuge(page) || PageCompound(page)) 84 + if (PageCompound(page)) 85 85 return NULL; 86 86 87 87 if (PageHighMem(page)) ··· 100 100 LIST_HEAD(cma_migrate_pages); 101 101 102 102 /* Ignore huge pages for now */ 103 - if (PageHuge(page) || PageTransHuge(page) || PageCompound(page)) 103 + if (PageCompound(page)) 104 104 return -EBUSY; 105 105 106 106 lru_add_drain(); ··· 314 314 } 315 315 EXPORT_SYMBOL_GPL(mm_iommu_lookup); 316 316 317 + struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, 318 + unsigned long ua, unsigned long size) 319 + { 320 + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; 321 + 322 + list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, 323 + next) { 324 + if ((mem->ua <= ua) && 325 + (ua + size <= mem->ua + 326 + (mem->entries << PAGE_SHIFT))) { 327 + ret = mem; 328 + break; 329 + } 330 + } 331 + 332 + return ret; 333 + } 334 + EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm); 335 + 317 336 struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, 318 337 unsigned long ua, unsigned long entries) 319 338 { ··· 363 344 return 0; 364 345 } 365 346 EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); 347 + 348 + long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, 349 + unsigned long ua, unsigned long *hpa) 350 + { 351 + const long entry = (ua - mem->ua) >> PAGE_SHIFT; 352 + void *va = &mem->hpas[entry]; 353 + unsigned long *pa; 354 + 355 + if (entry >= mem->entries) 356 + return -EFAULT; 357 + 358 + pa = (void *) vmalloc_to_phys(va); 359 + if (!pa) 360 + return -EFAULT; 361 + 362 + *hpa = *pa | (ua & ~PAGE_MASK); 363 + 364 + return 0; 365 + } 366 + EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm); 366 367 367 368 long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) 368 369 {
-5
arch/powerpc/mm/mmu_context_nohash.c
··· 333 333 334 334 mm->context.id = MMU_NO_CONTEXT; 335 335 mm->context.active = 0; 336 - 337 - #ifdef CONFIG_PPC_MM_SLICES 338 - slice_set_user_psize(mm, mmu_virtual_psize); 339 - #endif 340 - 341 336 return 0; 342 337 } 343 338
-7
arch/powerpc/mm/numa.c
··· 875 875 void *nd; 876 876 int tnid; 877 877 878 - if (spanned_pages) 879 - pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", 880 - nid, start_pfn << PAGE_SHIFT, 881 - (end_pfn << PAGE_SHIFT) - 1); 882 - else 883 - pr_info("Initmem setup node %d\n", nid); 884 - 885 878 nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); 886 879 nd = __va(nd_pa); 887 880
+2 -2
arch/powerpc/mm/slb.c
··· 131 131 "slbmte %2,%3\n" 132 132 "isync" 133 133 :: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)), 134 - "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)), 134 + "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, VMALLOC_INDEX)), 135 135 "r"(ksp_vsid_data), 136 136 "r"(ksp_esid_data) 137 137 : "memory"); ··· 229 229 asm volatile("slbie %0" : : "r" (slbie_data)); 230 230 231 231 get_paca()->slb_cache_ptr = 0; 232 - copy_mm_to_paca(&mm->context); 232 + copy_mm_to_paca(mm); 233 233 234 234 /* 235 235 * preload some userspace segments into the SLB.
+57 -25
arch/powerpc/mm/slb_low.S
···
 #include <asm/pgtable.h>
 #include <asm/firmware.h>
 
+/*
+ * This macro generates asm code to compute the VSID scramble
+ * function.  Used in slb_allocate() and do_stab_bolted.  The function
+ * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
+ *
+ *	rt = register containing the proto-VSID and into which the
+ *		VSID will be stored
+ *	rx = scratch register (clobbered)
+ *	rf = flags
+ *
+ *	- rt and rx must be different registers
+ *	- The answer will end up in the low VSID_BITS bits of rt.  The higher
+ *	  bits may contain other garbage, so you may need to mask the
+ *	  result.
+ */
+#define ASM_VSID_SCRAMBLE(rt, rx, rf, size)				\
+	lis	rx,VSID_MULTIPLIER_##size@h;				\
+	ori	rx,rx,VSID_MULTIPLIER_##size@l;				\
+	mulld	rt,rt,rx;		/* rt = rt * MULTIPLIER */	\
+/*									\
+ * powermac get slb fault before feature fixup, so make 65 bit part	\
+ * the default part of feature fixup					\
+ */									\
+BEGIN_MMU_FTR_SECTION							\
+	srdi	rx,rt,VSID_BITS_65_##size;				\
+	clrldi	rt,rt,(64-VSID_BITS_65_##size);				\
+	add	rt,rt,rx;						\
+	addi	rx,rt,1;						\
+	srdi	rx,rx,VSID_BITS_65_##size;				\
+	add	rt,rt,rx;						\
+	rldimi	rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \
+MMU_FTR_SECTION_ELSE							\
+	srdi	rx,rt,VSID_BITS_##size;					\
+	clrldi	rt,rt,(64-VSID_BITS_##size);				\
+	add	rt,rt,rx;		/* add high and low bits */	\
+	addi	rx,rt,1;						\
+	srdi	rx,rx,VSID_BITS_##size;	/* extract 2^VSID_BITS bit */	\
+	add	rt,rt,rx;						\
+	rldimi	rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA)
+
+
 /* void slb_allocate_realmode(unsigned long ea);
  *
  * Create an SLB entry for the given EA (user or kernel).
···
 	/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
 	blt	cr7,0f			/* user or kernel? */
 
-	/* kernel address: proto-VSID = ESID */
-	/* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
-	 * this code will generate the protoVSID 0xfffffffff for the
-	 * top segment.  That's ok, the scramble below will translate
-	 * it to VSID 0, which is reserved as a bad VSID - one which
-	 * will never have any pages in it.  */
-
 	/* Check if hitting the linear mapping or some other kernel space
 	*/
 	bne	cr7,1f
···
 slb_miss_kernel_load_linear:
 	li	r11,0
 	/*
-	 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+	 * context = (ea >> 60) - (0xc - 1)
 	 * r9 = region id.
 	 */
-	addis	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
-	addi	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
-
+	subi	r9,r9,KERNEL_REGION_CONTEXT_OFFSET
 
 BEGIN_FTR_SECTION
 	b	.Lslb_finish_load
···
 
 1:
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-	/* Check virtual memmap region. To be patches at kernel boot */
 	cmpldi	cr0,r9,0xf
 	bne	1f
+	/* Check virtual memmap region. To be patched at kernel boot */
 .globl slb_miss_kernel_load_vmemmap
 slb_miss_kernel_load_vmemmap:
 	li	r11,0
···
 	li	r11,0
 6:
 	/*
-	 * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+	 * context = (ea >> 60) - (0xc - 1)
 	 * r9 = region id.
 	 */
-	addis	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
-	addi	r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+	subi	r9,r9,KERNEL_REGION_CONTEXT_OFFSET
 
 BEGIN_FTR_SECTION
 	b	.Lslb_finish_load
···
 	 * For userspace addresses, make sure this is region 0.
 	 */
 	cmpdi	r9, 0
-	bne	8f
+	bne-	8f
+	/*
+	 * user space make sure we are within the allowed limit
+	 */
+	ld	r11,PACA_ADDR_LIMIT(r13)
+	cmpld	r3,r11
+	bge-	8f
 
 	/* when using slices, we extract the psize off the slice bitmaps
 	 * and then we need to get the sllp encoding off the mmu_psize_defs
···
 	 */
 .Lslb_finish_load:
 	rldimi	r10,r9,ESID_BITS,0
-	ASM_VSID_SCRAMBLE(r10,r9,256M)
-	/*
-	 * bits above VSID_BITS_256M need to be ignored from r10
-	 * also combine VSID and flags
-	 */
-	rldimi	r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M))
-
+	ASM_VSID_SCRAMBLE(r10,r9,r11,256M)
 	/* r3 = EA, r11 = VSID data */
 	/*
 	 * Find a slot, round robin. Previously we tried to find a
···
 .Lslb_finish_load_1T:
 	srdi	r10,r10,(SID_SHIFT_1T - SID_SHIFT)	/* get 1T ESID */
 	rldimi	r10,r9,ESID_BITS_1T,0
-	ASM_VSID_SCRAMBLE(r10,r9,1T)
 	/*
 	 * bits above VSID_BITS_1T need to be ignored from r10
 	 * also combine VSID and flags
 	 */
-	rldimi	r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T))
+	ASM_VSID_SCRAMBLE(r10,r9,r11,1T)
+
 	li	r10,MMU_SEGSIZE_1T
 	rldimi	r11,r10,SLB_VSID_SSIZE_SHIFT,0	/* insert segment size */
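The ASM_VSID_SCRAMBLE macro above computes (protovsid * VSID_MULTIPLIER) % VSID_MODULUS without a divide: the modulus is 2^VSID_BITS - 1, so the high bits of the product can be folded back into the low bits with shifts and adds. A minimal C sketch of that reduction (names and the test multiplier are illustrative, not the kernel's macros):

```c
#include <assert.h>
#include <stdint.h>

/*
 * Sketch of the fold-based reduction mod M = 2^bits - 1 that
 * ASM_VSID_SCRAMBLE performs with srdi/clrldi/add.  Valid for
 * bits >= 33 on a 64-bit input, which covers the VSID_BITS_* values.
 */
static uint64_t fold_mod(uint64_t x, unsigned int bits)
{
	uint64_t mask = (((uint64_t)1) << bits) - 1;
	uint64_t t = (x & mask) + (x >> bits);	/* congruent to x mod M */
	t += (t + 1) >> bits;			/* fold [M, 2M-1] back into range */
	return t & mask;			/* mask high garbage, per the macro's comment */
}

static uint64_t vsid_scramble(uint64_t protovsid, uint64_t multiplier,
			      unsigned int bits)
{
	/* mulld keeps only the low 64 bits of the product; so does C */
	return fold_mod(protovsid * multiplier, bits);
}
```

Two folds suffice for any 64-bit input: after the first fold the value is below 2M, and the second fold maps [M, 2M-1] (including exactly M, which must scramble to 0) back into [0, M-1].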
+157 -101
arch/powerpc/mm/slice.c
···
 #include <asm/copro.h>
 #include <asm/hugetlb.h>
 
-/* some sanity checks */
-#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
-#error H_PGTABLE_RANGE exceeds slice_mask high_slices size
-#endif
-
 static DEFINE_SPINLOCK(slice_convert_lock);
-
+/*
+ * One bit per slice. We have lower slices which cover 256MB segments
+ * up to the 4G range. That gets us 16 low slices. For the rest we track
+ * slices in 1TB size.
+ */
+struct slice_mask {
+	u64 low_slices;
+	DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH);
+};
 
 #ifdef DEBUG
 int _slice_debug = 1;
 
 static void slice_print_mask(const char *label, struct slice_mask mask)
 {
-	char	*p, buf[16 + 3 + 64 + 1];
-	int	i;
-
 	if (!_slice_debug)
 		return;
-	p = buf;
-	for (i = 0; i < SLICE_NUM_LOW; i++)
-		*(p++) = (mask.low_slices & (1 << i)) ? '1' : '0';
-	*(p++) = ' ';
-	*(p++) = '-';
-	*(p++) = ' ';
-	for (i = 0; i < SLICE_NUM_HIGH; i++)
-		*(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
-	*(p++) = 0;
-
-	printk(KERN_DEBUG "%s:%s\n", label, buf);
+	pr_devel("%s low_slice: %*pbl\n", label, (int)SLICE_NUM_LOW, &mask.low_slices);
+	pr_devel("%s high_slice: %*pbl\n", label, (int)SLICE_NUM_HIGH, mask.high_slices);
 }
 
-#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0)
+#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0)
 
 #else
···
 
 #endif
 
-static struct slice_mask slice_range_to_mask(unsigned long start,
-					     unsigned long len)
+static void slice_range_to_mask(unsigned long start, unsigned long len,
+				struct slice_mask *ret)
 {
 	unsigned long end = start + len - 1;
-	struct slice_mask ret = { 0, 0 };
+
+	ret->low_slices = 0;
+	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
 
 	if (start < SLICE_LOW_TOP) {
-		unsigned long mend = min(end, SLICE_LOW_TOP);
-		unsigned long mstart = min(start, SLICE_LOW_TOP);
+		unsigned long mend = min(end, (SLICE_LOW_TOP - 1));
 
-		ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
-			- (1u << GET_LOW_SLICE_INDEX(mstart));
+		ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+			- (1u << GET_LOW_SLICE_INDEX(start));
 	}
 
-	if ((start + len) > SLICE_LOW_TOP)
-		ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1))
-			- (1ul << GET_HIGH_SLICE_INDEX(start));
+	if ((start + len) > SLICE_LOW_TOP) {
+		unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
+		unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
+		unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
 
-	return ret;
+		bitmap_set(ret->high_slices, start_index, count);
+	}
 }
 
 static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
···
 	return !slice_area_is_free(mm, start, end - start);
 }
 
-static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
+static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
 {
-	struct slice_mask ret = { 0, 0 };
 	unsigned long i;
+
+	ret->low_slices = 0;
+	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
 
 	for (i = 0; i < SLICE_NUM_LOW; i++)
 		if (!slice_low_has_vma(mm, i))
-			ret.low_slices |= 1u << i;
+			ret->low_slices |= 1u << i;
 
 	if (mm->task_size <= SLICE_LOW_TOP)
-		return ret;
+		return;
 
-	for (i = 0; i < SLICE_NUM_HIGH; i++)
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
 		if (!slice_high_has_vma(mm, i))
-			ret.high_slices |= 1ul << i;
-
-	return ret;
+			__set_bit(i, ret->high_slices);
 }
 
-static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
+static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret)
 {
 	unsigned char *hpsizes;
 	int index, mask_index;
-	struct slice_mask ret = { 0, 0 };
 	unsigned long i;
 	u64 lpsizes;
+
+	ret->low_slices = 0;
+	bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
 
 	lpsizes = mm->context.low_slices_psize;
 	for (i = 0; i < SLICE_NUM_LOW; i++)
 		if (((lpsizes >> (i * 4)) & 0xf) == psize)
-			ret.low_slices |= 1u << i;
+			ret->low_slices |= 1u << i;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < SLICE_NUM_HIGH; i++) {
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
 		if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
-			ret.high_slices |= 1ul << i;
+			__set_bit(i, ret->high_slices);
 	}
-
-	return ret;
 }
 
-static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
+static int slice_check_fit(struct mm_struct *mm,
+			   struct slice_mask mask, struct slice_mask available)
 {
+	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+	unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
+
+	bitmap_and(result, mask.high_slices,
+		   available.high_slices, slice_count);
+
 	return (mask.low_slices & available.low_slices) == mask.low_slices &&
-		(mask.high_slices & available.high_slices) == mask.high_slices;
+		bitmap_equal(result, mask.high_slices, slice_count);
 }
 
 static void slice_flush_segments(void *parm)
···
 	if (mm != current->active_mm)
 		return;
 
-	copy_mm_to_paca(&current->active_mm->context);
+	copy_mm_to_paca(current->active_mm);
 
 	local_irq_save(flags);
 	slb_flush_and_rebolt();
···
 	mm->context.low_slices_psize = lpsizes;
 
 	hpsizes = mm->context.high_slices_psize;
-	for (i = 0; i < SLICE_NUM_HIGH; i++) {
+	for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
 		mask_index = i & 0x1;
 		index = i >> 1;
-		if (mask.high_slices & (1ul << i))
+		if (test_bit(i, mask.high_slices))
 			hpsizes[index] = (hpsizes[index] &
 					  ~(0xf << (mask_index * 4))) |
 				(((unsigned long)psize) << (mask_index * 4));
 	}
 
 	slice_dbg(" lsps=%lx, hsps=%lx\n",
-		  mm->context.low_slices_psize,
-		  mm->context.high_slices_psize);
+		  (unsigned long)mm->context.low_slices_psize,
+		  (unsigned long)mm->context.high_slices_psize);
 
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
···
 		slice = GET_HIGH_SLICE_INDEX(addr);
 		*boundary_addr = (slice + end) ?
 			((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
-		return !!(available.high_slices & (1ul << slice));
+		return !!test_bit(slice, available.high_slices);
 	}
 }
 
 static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
 					      unsigned long len,
 					      struct slice_mask available,
-					      int psize)
+					      int psize, unsigned long high_limit)
 {
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	unsigned long addr, found, next_end;
···
 	info.align_offset = 0;
 
 	addr = TASK_UNMAPPED_BASE;
-	while (addr < TASK_SIZE) {
+	/*
+	 * Check till the allowed max value for this mmap request
+	 */
+	while (addr < high_limit) {
 		info.low_limit = addr;
 		if (!slice_scan_available(addr, available, 1, &addr))
 			continue;
···
 		 * Check if we need to reduce the range, or if we can
 		 * extend it to cover the next available slice.
 		 */
-		if (addr >= TASK_SIZE)
-			addr = TASK_SIZE;
+		if (addr >= high_limit)
+			addr = high_limit;
 		else if (slice_scan_available(addr, available, 1, &next_end)) {
 			addr = next_end;
 			goto next_slice;
···
 static unsigned long slice_find_area_topdown(struct mm_struct *mm,
 					     unsigned long len,
 					     struct slice_mask available,
-					     int psize)
+					     int psize, unsigned long high_limit)
 {
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	unsigned long addr, found, prev;
···
 	info.align_offset = 0;
 
 	addr = mm->mmap_base;
+	/*
+	 * If we are trying to allocate above DEFAULT_MAP_WINDOW
+	 * add the difference to the mmap_base.
+	 * Only for that request for which high_limit is above
+	 * DEFAULT_MAP_WINDOW we should apply this.
+	 */
+	if (high_limit > DEFAULT_MAP_WINDOW)
+		addr += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
 	while (addr > PAGE_SIZE) {
 		info.high_limit = addr;
 		if (!slice_scan_available(addr - 1, available, 0, &addr))
···
 	 * can happen with large stack limits and large mmap()
 	 * allocations.
 	 */
-	return slice_find_area_bottomup(mm, len, available, psize);
+	return slice_find_area_bottomup(mm, len, available, psize, high_limit);
 }
 
 
 static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
 				     struct slice_mask mask, int psize,
-				     int topdown)
+				     int topdown, unsigned long high_limit)
 {
 	if (topdown)
-		return slice_find_area_topdown(mm, len, mask, psize);
+		return slice_find_area_topdown(mm, len, mask, psize, high_limit);
 	else
-		return slice_find_area_bottomup(mm, len, mask, psize);
+		return slice_find_area_bottomup(mm, len, mask, psize, high_limit);
 }
 
-#define or_mask(dst, src)	do {			\
-	(dst).low_slices |= (src).low_slices;		\
-	(dst).high_slices |= (src).high_slices;		\
-} while (0)
+static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src)
+{
+	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
 
-#define andnot_mask(dst, src)	do {			\
-	(dst).low_slices &= ~(src).low_slices;		\
-	(dst).high_slices &= ~(src).high_slices;	\
-} while (0)
+	dst->low_slices |= src->low_slices;
+	bitmap_or(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+	bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+}
+
+static inline void slice_andnot_mask(struct slice_mask *dst, struct slice_mask *src)
+{
+	DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+
+	dst->low_slices &= ~src->low_slices;
+
+	bitmap_andnot(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+	bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+}
 
 #ifdef CONFIG_PPC_64K_PAGES
 #define MMU_PAGE_BASE	MMU_PAGE_64K
···
 				      unsigned long flags, unsigned int psize,
 				      int topdown)
 {
-	struct slice_mask mask = {0, 0};
+	struct slice_mask mask;
 	struct slice_mask good_mask;
-	struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
-	struct slice_mask compat_mask = {0, 0};
+	struct slice_mask potential_mask;
+	struct slice_mask compat_mask;
 	int fixed = (flags & MAP_FIXED);
 	int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
 	struct mm_struct *mm = current->mm;
 	unsigned long newaddr;
+	unsigned long high_limit;
+
+	/*
+	 * Check if we need to expand the slice area.
+	 */
+	if (unlikely(addr > mm->context.addr_limit &&
+		     mm->context.addr_limit != TASK_SIZE)) {
+		mm->context.addr_limit = TASK_SIZE;
+		on_each_cpu(slice_flush_segments, mm, 1);
+	}
+	/*
+	 * This mmap request can allocate up to 512TB
+	 */
+	if (addr > DEFAULT_MAP_WINDOW)
+		high_limit = mm->context.addr_limit;
+	else
+		high_limit = DEFAULT_MAP_WINDOW;
+	/*
+	 * init different masks
+	 */
+	mask.low_slices = 0;
+	bitmap_zero(mask.high_slices, SLICE_NUM_HIGH);
+
+	/* silence stupid warning */;
+	potential_mask.low_slices = 0;
+	bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH);
+
+	compat_mask.low_slices = 0;
+	bitmap_zero(compat_mask.high_slices, SLICE_NUM_HIGH);
 
 	/* Sanity checks */
 	BUG_ON(mm->task_size == 0);
···
 	/* First make up a "good" mask of slices that have the right size
 	 * already
 	 */
-	good_mask = slice_mask_for_size(mm, psize);
+	slice_mask_for_size(mm, psize, &good_mask);
 	slice_print_mask(" good_mask", good_mask);
 
 	/*
···
 #ifdef CONFIG_PPC_64K_PAGES
 	/* If we support combo pages, we can allow 64k pages in 4k slices */
 	if (psize == MMU_PAGE_64K) {
-		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
 		if (fixed)
-			or_mask(good_mask, compat_mask);
+			slice_or_mask(&good_mask, &compat_mask);
 	}
 #endif
 
 	/* First check hint if it's valid or if we have MAP_FIXED */
 	if (addr != 0 || fixed) {
 		/* Build a mask for the requested range */
-		mask = slice_range_to_mask(addr, len);
+		slice_range_to_mask(addr, len, &mask);
 		slice_print_mask(" mask", mask);
 
 		/* Check if we fit in the good mask. If we do, we just return,
 		 * nothing else to do
 		 */
-		if (slice_check_fit(mask, good_mask)) {
+		if (slice_check_fit(mm, mask, good_mask)) {
 			slice_dbg(" fits good !\n");
 			return addr;
 		}
···
 		/* Now let's see if we can find something in the existing
 		 * slices for that size
 		 */
-		newaddr = slice_find_area(mm, len, good_mask, psize, topdown);
+		newaddr = slice_find_area(mm, len, good_mask,
+					  psize, topdown, high_limit);
 		if (newaddr != -ENOMEM) {
 			/* Found within the good mask, we don't have to setup,
 			 * we thus return directly
···
 	/* We don't fit in the good mask, check what other slices are
 	 * empty and thus can be converted
 	 */
-	potential_mask = slice_mask_for_free(mm);
-	or_mask(potential_mask, good_mask);
+	slice_mask_for_free(mm, &potential_mask);
+	slice_or_mask(&potential_mask, &good_mask);
 	slice_print_mask(" potential", potential_mask);
 
-	if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) {
+	if ((addr != 0 || fixed) && slice_check_fit(mm, mask, potential_mask)) {
 		slice_dbg(" fits potential !\n");
 		goto convert;
 	}
···
 	 * anywhere in the good area.
 	 */
 	if (addr) {
-		addr = slice_find_area(mm, len, good_mask, psize, topdown);
+		addr = slice_find_area(mm, len, good_mask,
+				       psize, topdown, high_limit);
 		if (addr != -ENOMEM) {
 			slice_dbg(" found area at 0x%lx\n", addr);
 			return addr;
···
 	/* Now let's see if we can find something in the existing slices
 	 * for that size plus free slices
 	 */
-	addr = slice_find_area(mm, len, potential_mask, psize, topdown);
+	addr = slice_find_area(mm, len, potential_mask,
+			       psize, topdown, high_limit);
 
 #ifdef CONFIG_PPC_64K_PAGES
 	if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
 		/* retry the search with 4k-page slices included */
-		or_mask(potential_mask, compat_mask);
-		addr = slice_find_area(mm, len, potential_mask, psize,
-				       topdown);
+		slice_or_mask(&potential_mask, &compat_mask);
+		addr = slice_find_area(mm, len, potential_mask,
+				       psize, topdown, high_limit);
 	}
 #endif
 
 	if (addr == -ENOMEM)
 		return -ENOMEM;
 
-	mask = slice_range_to_mask(addr, len);
+	slice_range_to_mask(addr, len, &mask);
 	slice_dbg(" found potential area at 0x%lx\n", addr);
 	slice_print_mask(" mask", mask);
 
 convert:
-	andnot_mask(mask, good_mask);
-	andnot_mask(mask, compat_mask);
-	if (mask.low_slices || mask.high_slices) {
+	slice_andnot_mask(&mask, &good_mask);
+	slice_andnot_mask(&mask, &compat_mask);
+	if (mask.low_slices || !bitmap_empty(mask.high_slices, SLICE_NUM_HIGH)) {
 		slice_convert(mm, mask, psize);
 		if (psize > MMU_PAGE_BASE)
 			on_each_cpu(slice_flush_segments, mm, 1);
···
 
 
 	slice_dbg(" lsps=%lx, hsps=%lx\n",
-		  mm->context.low_slices_psize,
-		  mm->context.high_slices_psize);
+		  (unsigned long)mm->context.low_slices_psize,
+		  (unsigned long)mm->context.high_slices_psize);
 
 bail:
 	spin_unlock_irqrestore(&slice_convert_lock, flags);
···
 void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
 			   unsigned long len, unsigned int psize)
 {
-	struct slice_mask mask = slice_range_to_mask(start, len);
+	struct slice_mask mask;
 
 	VM_BUG_ON(radix_enabled());
+
+	slice_range_to_mask(start, len, &mask);
 	slice_convert(mm, mask, psize);
 }
···
 	if (radix_enabled())
 		return 0;
 
-	mask = slice_range_to_mask(addr, len);
-	available = slice_mask_for_size(mm, psize);
+	slice_range_to_mask(addr, len, &mask);
+	slice_mask_for_size(mm, psize, &available);
 #ifdef CONFIG_PPC_64K_PAGES
 	/* We need to account for 4k slices too */
 	if (psize == MMU_PAGE_64K) {
 		struct slice_mask compat_mask;
-		compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
-		or_mask(available, compat_mask);
+		slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
+		slice_or_mask(&available, &compat_mask);
 	}
 #endif
···
 	slice_print_mask(" mask", mask);
 	slice_print_mask(" available", available);
 #endif
-	return !slice_check_fit(mask, available);
+	return !slice_check_fit(mm, mask, available);
 }
 #endif
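The slice masks above mark one bit per 256MB low slice (16 of them below 4GB) and one bit per 1TB high slice. A small self-contained sketch of the low-slice half of slice_range_to_mask(), using illustrative constants rather than the kernel's GET_LOW_SLICE_INDEX/SLICE_LOW_TOP macros:

```c
#include <assert.h>
#include <stdint.h>

#define LOW_SLICE_SHIFT	28	/* 256MB slices, as in the comment above */

static unsigned int low_slice_index(uint64_t addr)
{
	return (unsigned int)(addr >> LOW_SLICE_SHIFT);
}

/*
 * Bits low_slice_index(start) .. low_slice_index(start + len - 1),
 * inclusive: subtracting the two powers of two yields a run of 1 bits
 * covering exactly the slices the range touches.
 */
static uint32_t low_slice_mask(uint64_t start, uint64_t len)
{
	uint64_t end = start + len - 1;

	return (1u << (low_slice_index(end) + 1)) - (1u << low_slice_index(start));
}
```

The same subtraction trick is what the kernel keeps for low_slices; only the high slices moved to a bitmap, because 512TB of 1TB slices no longer fits in a u64.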
+2 -1
arch/powerpc/mm/subpage-prot.c
···
 
 	/* Check parameters */
 	if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
-	    addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE)
+	    addr >= mm->task_size || len >= mm->task_size ||
+	    addr + len > mm->task_size)
 		return -EINVAL;
 
 	if (is_hugepage_only_range(mm, addr, len))
+48 -45
arch/powerpc/mm/tlb-radix.c
···
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
 
 #define RIC_FLUSH_TLB 0
 #define RIC_FLUSH_PWC 1
···
 	prs = 1; /* process scoped */
 	r = 1;   /* raidx format */
 
-	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
 		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
-	asm volatile("ptesync": : :"memory");
 }
 
 /*
···
 {
 	int set;
 
-	for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
+	asm volatile("ptesync": : :"memory");
+
+	/*
+	 * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+	 * also flush the entire Page Walk Cache.
+	 */
+	__tlbiel_pid(pid, 0, ric);
+
+	if (ric == RIC_FLUSH_ALL)
+		/* For the remaining sets, just flush the TLB */
+		ric = RIC_FLUSH_TLB;
+
+	for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
 		__tlbiel_pid(pid, set, ric);
-	}
+
+	asm volatile("ptesync": : :"memory");
+	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void tlbiel_pwc(unsigned long pid)
+{
+	asm volatile("ptesync": : :"memory");
+
+	/* For PWC flush, we don't look at set number */
+	__tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+
+	asm volatile("ptesync": : :"memory");
 	asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
 }
···
 {
 	unsigned long pid;
 	struct mm_struct *mm = tlb->mm;
+	/*
+	 * If we are doing a full mm flush, we will do a tlb flush
+	 * with RIC_FLUSH_ALL later.
+	 */
+	if (tlb->fullmm)
+		return;
 
 	preempt_disable();
 
 	pid = mm->context.id;
 	if (pid != MMU_NO_CONTEXT)
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
+		tlbiel_pwc(pid);
 
 	preempt_enable();
 }
···
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto no_context;
 
-	if (!mm_is_thread_local(mm)) {
-		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-		if (lock_tlbie)
-			raw_spin_lock(&native_tlbie_lock);
+	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_ALL);
-		if (lock_tlbie)
-			raw_spin_unlock(&native_tlbie_lock);
-	} else
+	else
 		_tlbiel_pid(pid, RIC_FLUSH_ALL);
 no_context:
 	preempt_enable();
···
 	unsigned long pid;
 	struct mm_struct *mm = tlb->mm;
 
+	/*
+	 * If we are doing a full mm flush, we will do a tlb flush
+	 * with RIC_FLUSH_ALL later.
+	 */
+	if (tlb->fullmm)
+		return;
 	preempt_disable();
 
 	pid = mm->context.id;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto no_context;
 
-	if (!mm_is_thread_local(mm)) {
-		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-		if (lock_tlbie)
-			raw_spin_lock(&native_tlbie_lock);
+	if (!mm_is_thread_local(mm))
 		_tlbie_pid(pid, RIC_FLUSH_PWC);
-		if (lock_tlbie)
-			raw_spin_unlock(&native_tlbie_lock);
-	} else
-		_tlbiel_pid(pid, RIC_FLUSH_PWC);
+	else
+		tlbiel_pwc(pid);
 no_context:
 	preempt_enable();
 }
···
 	pid = mm ? mm->context.id : 0;
 	if (unlikely(pid == MMU_NO_CONTEXT))
 		goto bail;
-	if (!mm_is_thread_local(mm)) {
-		int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-		if (lock_tlbie)
-			raw_spin_lock(&native_tlbie_lock);
+	if (!mm_is_thread_local(mm))
 		_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
-		if (lock_tlbie)
-			raw_spin_unlock(&native_tlbie_lock);
-	} else
+	else
 		_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
 bail:
 	preempt_enable();
···
 
 void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
-	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
-	if (lock_tlbie)
-		raw_spin_lock(&native_tlbie_lock);
 	_tlbie_pid(0, RIC_FLUSH_ALL);
-	if (lock_tlbie)
-		raw_spin_unlock(&native_tlbie_lock);
 }
 EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
···
 	unsigned long addr;
 	int local = mm_is_thread_local(mm);
 	unsigned long ap = mmu_get_ap(psize);
-	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
 	unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
 
···
 
 		if (local)
 			_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-		else {
-			if (lock_tlbie)
-				raw_spin_lock(&native_tlbie_lock);
+		else
 			_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
-			if (lock_tlbie)
-				raw_spin_unlock(&native_tlbie_lock);
-		}
 	}
 err_out:
 	preempt_enable();
···
 		return;
 	}
 
-	if (old_pte & _PAGE_LARGE)
+	if (old_pte & R_PAGE_LARGE)
 		radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M);
 	else
 		radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
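The reworked _tlbiel_pid() above relies on the Page Walk Cache not being indexed by TLB set: a RIC_FLUSH_ALL is only issued for set 0, and the remaining POWER9 radix sets get a plain TLB flush. A sketch of that control flow, with a counter standing in for the tlbiel instruction (all names here are illustrative, not the kernel's):

```c
#include <assert.h>

enum { RIC_FLUSH_TLB = 0, RIC_FLUSH_PWC = 1, RIC_FLUSH_ALL = 2 };
#define TLB_SETS 128	/* stand-in for POWER9_TLB_SETS_RADIX */

static int all_flushes, tlb_flushes;

/* stand-in for the tlbiel instruction __tlbiel_pid() emits */
static void mock_tlbiel(int set, int ric)
{
	(void)set;
	if (ric == RIC_FLUSH_ALL)
		all_flushes++;
	else
		tlb_flushes++;
}

static void flush_pid(int ric)
{
	/* set 0 may carry the PWC invalidation as well */
	mock_tlbiel(0, ric);

	if (ric == RIC_FLUSH_ALL)
		ric = RIC_FLUSH_TLB;	/* PWC already gone; TLB-only from here */

	for (int set = 1; set < TLB_SETS; set++)
		mock_tlbiel(set, ric);
}
```

For a full flush this issues one RIC_FLUSH_ALL and 127 RIC_FLUSH_TLB operations instead of 128 RIC_FLUSH_ALLs, which is the optimisation the diff implements.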
+1 -1
arch/powerpc/mm/tlb_nohash.c
···
 	 * avoid going over total available memory just in case...
 	 */
 #ifdef CONFIG_PPC_FSL_BOOK3E
-	if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+	if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
 		unsigned long linear_sz;
 		unsigned int num_cams;
 
+8
arch/powerpc/perf/core-book3s.c
···
 		data.br_stack = &cpuhw->bhrb_stack;
 	}
 
+	if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC &&
+						ppmu->get_mem_data_src)
+		ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs);
+
+	if (event->attr.sample_type & PERF_SAMPLE_WEIGHT &&
+						ppmu->get_mem_weight)
+		ppmu->get_mem_weight(&data.weight);
+
 	if (perf_event_overflow(event, &data, regs))
 		power_pmu_stop(event, 0);
 }
+82
arch/powerpc/perf/isa207-common.c
···
 	return true;
 }
 
+static inline u64 isa207_find_source(u64 idx, u32 sub_idx)
+{
+	u64 ret = PERF_MEM_NA;
+
+	switch(idx) {
+	case 0:
+		/* Nothing to do */
+		break;
+	case 1:
+		ret = PH(LVL, L1);
+		break;
+	case 2:
+		ret = PH(LVL, L2);
+		break;
+	case 3:
+		ret = PH(LVL, L3);
+		break;
+	case 4:
+		if (sub_idx <= 1)
+			ret = PH(LVL, LOC_RAM);
+		else if (sub_idx > 1 && sub_idx <= 2)
+			ret = PH(LVL, REM_RAM1);
+		else
+			ret = PH(LVL, REM_RAM2);
+		ret |= P(SNOOP, HIT);
+		break;
+	case 5:
+		ret = PH(LVL, REM_CCE1);
+		if ((sub_idx == 0) || (sub_idx == 2) || (sub_idx == 4))
+			ret |= P(SNOOP, HIT);
+		else if ((sub_idx == 1) || (sub_idx == 3) || (sub_idx == 5))
+			ret |= P(SNOOP, HITM);
+		break;
+	case 6:
+		ret = PH(LVL, REM_CCE2);
+		if ((sub_idx == 0) || (sub_idx == 2))
+			ret |= P(SNOOP, HIT);
+		else if ((sub_idx == 1) || (sub_idx == 3))
+			ret |= P(SNOOP, HITM);
+		break;
+	case 7:
+		ret = PM(LVL, L1);
+		break;
+	}
+
+	return ret;
+}
+
+void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
+							struct pt_regs *regs)
+{
+	u64 idx;
+	u32 sub_idx;
+	u64 sier;
+	u64 val;
+
+	/* Skip if no SIER support */
+	if (!(flags & PPMU_HAS_SIER)) {
+		dsrc->val = 0;
+		return;
+	}
+
+	sier = mfspr(SPRN_SIER);
+	val = (sier & ISA207_SIER_TYPE_MASK) >> ISA207_SIER_TYPE_SHIFT;
+	if (val == 1 || val == 2) {
+		idx = (sier & ISA207_SIER_LDST_MASK) >> ISA207_SIER_LDST_SHIFT;
+		sub_idx = (sier & ISA207_SIER_DATA_SRC_MASK) >> ISA207_SIER_DATA_SRC_SHIFT;
+
+		dsrc->val = isa207_find_source(idx, sub_idx);
+		dsrc->val |= (val == 1) ? P(OP, LOAD) : P(OP, STORE);
+	}
+}
+
+void isa207_get_mem_weight(u64 *weight)
+{
+	u64 mmcra = mfspr(SPRN_MMCRA);
+	u64 exp = MMCRA_THR_CTR_EXP(mmcra);
+	u64 mantissa = MMCRA_THR_CTR_MANT(mmcra);
+
+	*weight = mantissa << (2 * exp);
+}
+
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
 {
 	unsigned int unit, pmc, cache, ebb;
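isa207_get_mem_weight() above decodes MMCRA's threshold event counter as a 7-bit mantissa scaled by 4^exp, i.e. mantissa << (2 * exp). A sketch of that decode, with the field positions taken from the MMCRA_THR_CTR_* definitions this series adds to isa207-common.h:

```c
#include <assert.h>
#include <stdint.h>

/* Field positions match MMCRA_THR_CTR_MANT/EXP in isa207-common.h */
#define THR_CTR_MANT_SHIFT	19
#define THR_CTR_MANT_MASK	0x7full
#define THR_CTR_EXP_SHIFT	27
#define THR_CTR_EXP_MASK	0x7ull

/* Decode the threshold counter: weight = mantissa * 4^exp */
static uint64_t mem_weight(uint64_t mmcra)
{
	uint64_t mant = (mmcra >> THR_CTR_MANT_SHIFT) & THR_CTR_MANT_MASK;
	uint64_t exp = (mmcra >> THR_CTR_EXP_SHIFT) & THR_CTR_EXP_MASK;

	return mant << (2 * exp);
}
```

Shifting left by 2*exp is the same as multiplying by 4^exp; with a 7-bit mantissa and a 3-bit exponent the largest representable weight is 127 * 4^7.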
+25 -1
arch/powerpc/perf/isa207-common.h
···
 #define MMCRA_SDAR_MODE_TLB		(1ull << MMCRA_SDAR_MODE_SHIFT)
 #define MMCRA_SDAR_MODE_NO_UPDATES	~(0x3ull << MMCRA_SDAR_MODE_SHIFT)
 #define MMCRA_IFM_SHIFT			30
+#define MMCRA_THR_CTR_MANT_SHIFT	19
+#define MMCRA_THR_CTR_MANT_MASK		0x7Ful
+#define MMCRA_THR_CTR_MANT(v)		(((v) >> MMCRA_THR_CTR_MANT_SHIFT) &\
+						MMCRA_THR_CTR_MANT_MASK)
+
+#define MMCRA_THR_CTR_EXP_SHIFT		27
+#define MMCRA_THR_CTR_EXP_MASK		0x7ul
+#define MMCRA_THR_CTR_EXP(v)		(((v) >> MMCRA_THR_CTR_EXP_SHIFT) &\
+						MMCRA_THR_CTR_EXP_MASK)
 
 /* MMCR1 Threshold Compare bit constant for power9 */
 #define p9_MMCRA_THR_CMP_SHIFT	45
···
 #define MAX_ALT				2
 #define MAX_PMU_COUNTERS		6
 
+#define ISA207_SIER_TYPE_SHIFT		15
+#define ISA207_SIER_TYPE_MASK		(0x7ull << ISA207_SIER_TYPE_SHIFT)
+
+#define ISA207_SIER_LDST_SHIFT		1
+#define ISA207_SIER_LDST_MASK		(0x7ull << ISA207_SIER_LDST_SHIFT)
+
+#define ISA207_SIER_DATA_SRC_SHIFT	53
+#define ISA207_SIER_DATA_SRC_MASK	(0x7ull << ISA207_SIER_DATA_SRC_SHIFT)
+
+#define P(a, b)				PERF_MEM_S(a, b)
+#define PH(a, b)			(P(LVL, HIT) | P(a, b))
+#define PM(a, b)			(P(LVL, MISS) | P(a, b))
+
 int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp);
 int isa207_compute_mmcr(u64 event[], int n_ev,
 				unsigned int hwc[], unsigned long mmcr[],
···
 void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]);
 int isa207_get_alternatives(u64 event, u64 alt[],
 				const unsigned int ev_alt[][MAX_ALT], int size);
-
+void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
+							struct pt_regs *regs);
+void isa207_get_mem_weight(u64 *weight);
 
 #endif
+6
arch/powerpc/perf/power8-events-list.h
···
 EVENT(PM_MRK_FILT_MATCH_ALT,			0x3012e)
 /* Alternate event code for PM_LD_MISS_L1 */
 EVENT(PM_LD_MISS_L1_ALT,			0x400f0)
+/*
+ * Memory Access Event -- mem_access
+ * Primary PMU event used here is PM_MRK_INST_CMPL, along with
+ * Random Load/Store Facility Sampling (RIS) in Random sampling mode (MMCRA[SM]).
+ */
+EVENT(MEM_ACCESS,				0x10401e0)
+4
arch/powerpc/perf/power8-pmu.c
···
 GENERIC_EVENT_ATTR(branch-misses,		PM_BR_MPRED_CMPL);
 GENERIC_EVENT_ATTR(cache-references,		PM_LD_REF_L1);
 GENERIC_EVENT_ATTR(cache-misses,		PM_LD_MISS_L1);
+GENERIC_EVENT_ATTR(mem_access,			MEM_ACCESS);
 
 CACHE_EVENT_ATTR(L1-dcache-load-misses,		PM_LD_MISS_L1);
 CACHE_EVENT_ATTR(L1-dcache-loads,		PM_LD_REF_L1);
···
 	GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
 	GENERIC_EVENT_PTR(PM_LD_REF_L1),
 	GENERIC_EVENT_PTR(PM_LD_MISS_L1),
+	GENERIC_EVENT_PTR(MEM_ACCESS),
 
 	CACHE_EVENT_PTR(PM_LD_MISS_L1),
 	CACHE_EVENT_PTR(PM_LD_REF_L1),
···
 	.bhrb_filter_map	= power8_bhrb_filter_map,
 	.get_constraint		= isa207_get_constraint,
 	.get_alternatives	= power8_get_alternatives,
+	.get_mem_data_src	= isa207_get_mem_data_src,
+	.get_mem_weight		= isa207_get_mem_weight,
 	.disable_pmc		= isa207_disable_pmc,
 	.flags			= PPMU_HAS_SIER | PPMU_ARCH_207S,
 	.n_generic		= ARRAY_SIZE(power8_generic_events),
+2
arch/powerpc/perf/power9-pmu.c
··· 427 427 .bhrb_filter_map = power9_bhrb_filter_map, 428 428 .get_constraint = isa207_get_constraint, 429 429 .get_alternatives = power9_get_alternatives, 430 + .get_mem_data_src = isa207_get_mem_data_src, 431 + .get_mem_weight = isa207_get_mem_weight, 430 432 .disable_pmc = isa207_disable_pmc, 431 433 .flags = PPMU_HAS_SIER | PPMU_ARCH_207S, 432 434 .n_generic = ARRAY_SIZE(power9_generic_events),
+1 -1
arch/powerpc/platforms/44x/sam440ep.c
··· 70 70 .irq = -1, 71 71 }; 72 72 73 - static int sam440ep_setup_rtc(void) 73 + static int __init sam440ep_setup_rtc(void) 74 74 { 75 75 return i2c_register_board_info(0, &sam440ep_rtc_info, 1); 76 76 }
-1
arch/powerpc/platforms/52xx/Kconfig
··· 33 33 bool "bPlan Efika 5k2. MPC5200B based computer" 34 34 depends on PPC_MPC52xx 35 35 select PPC_RTAS 36 - select RTAS_PROC 37 36 select PPC_NATIVE 38 37 39 38 config PPC_LITE5200
+3 -9
arch/powerpc/platforms/85xx/smp.c
··· 344 344 } 345 345 346 346 struct smp_ops_t smp_85xx_ops = { 347 + .cause_nmi_ipi = NULL, 347 348 .kick_cpu = smp_85xx_kick_cpu, 348 349 .cpu_bootable = smp_generic_cpu_bootable, 349 350 #ifdef CONFIG_HOTPLUG_CPU ··· 462 461 } 463 462 #endif /* CONFIG_KEXEC_CORE */ 464 463 465 - static void smp_85xx_basic_setup(int cpu_nr) 466 - { 467 - if (cpu_has_feature(CPU_FTR_DBELL)) 468 - doorbell_setup_this_cpu(); 469 - } 470 - 471 464 static void smp_85xx_setup_cpu(int cpu_nr) 472 465 { 473 466 mpic_setup_this_cpu(); 474 - smp_85xx_basic_setup(cpu_nr); 475 467 } 476 468 477 469 void __init mpc85xx_smp_init(void) ··· 478 484 smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu; 479 485 smp_85xx_ops.message_pass = smp_mpic_message_pass; 480 486 } else 481 - smp_85xx_ops.setup_cpu = smp_85xx_basic_setup; 487 + smp_85xx_ops.setup_cpu = NULL; 482 488 483 489 if (cpu_has_feature(CPU_FTR_DBELL)) { 484 490 /* ··· 486 492 * smp_muxed_ipi_message_pass 487 493 */ 488 494 smp_85xx_ops.message_pass = NULL; 489 - smp_85xx_ops.cause_ipi = doorbell_cause_ipi; 495 + smp_85xx_ops.cause_ipi = doorbell_global_ipi; 490 496 smp_85xx_ops.probe = NULL; 491 497 } 492 498
+1
arch/powerpc/platforms/86xx/mpc86xx_smp.c
··· 105 105 106 106 107 107 struct smp_ops_t smp_86xx_ops = { 108 + .cause_nmi_ipi = NULL, 108 109 .message_pass = smp_mpic_message_pass, 109 110 .probe = smp_mpic_probe, 110 111 .kick_cpu = smp_86xx_kick_cpu,
+11 -3
arch/powerpc/platforms/Kconfig.cputype
··· 279 279 280 280 This option enables kernel support for the PowerPC Initiate 281 281 Coprocessor Store Word (icswx) coprocessor instruction on POWER7 282 - or newer processors. 282 + and POWER8 processors. POWER9 uses new copy/paste instructions 283 + to invoke the coprocessor. 283 284 284 285 This option is only useful if you have a processor that supports 285 286 the icswx coprocessor instruction. It does not have any effect ··· 360 359 361 360 config PPC_MM_SLICES 362 361 bool 363 - default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES) 362 + default y if PPC_STD_MMU_64 364 363 default n 365 364 366 365 config PPC_HAVE_PMU_SUPPORT ··· 372 371 help 373 372 This enables the powerpc-specific perf_event back-end. 374 373 374 + config FORCE_SMP 375 + # Allow platforms to force SMP=y by selecting this 376 + bool 377 + default n 378 + select SMP 379 + 375 380 config SMP 376 381 depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE || PPC_47x 377 - bool "Symmetric multi-processing support" 382 + select GENERIC_IRQ_MIGRATION 383 + bool "Symmetric multi-processing support" if !FORCE_SMP 378 384 ---help--- 379 385 This enables support for systems with more than one CPU. If you have 380 386 a system with only one CPU, say N. If you have a system with more
+1 -1
arch/powerpc/platforms/cell/axon_msi.c
··· 15 15 #include <linux/msi.h> 16 16 #include <linux/export.h> 17 17 #include <linux/of_platform.h> 18 - #include <linux/debugfs.h> 19 18 #include <linux/slab.h> 20 19 20 + #include <asm/debugfs.h> 21 21 #include <asm/dcr.h> 22 22 #include <asm/machdep.h> 23 23 #include <asm/prom.h>
+1 -1
arch/powerpc/platforms/cell/interrupt.c
··· 211 211 iic_request_ipi(PPC_MSG_CALL_FUNCTION); 212 212 iic_request_ipi(PPC_MSG_RESCHEDULE); 213 213 iic_request_ipi(PPC_MSG_TICK_BROADCAST); 214 - iic_request_ipi(PPC_MSG_DEBUGGER_BREAK); 214 + iic_request_ipi(PPC_MSG_NMI_IPI); 215 215 } 216 216 217 217 #endif /* CONFIG_SMP */
+7 -4
arch/powerpc/platforms/cell/pervasive.c
··· 88 88 static int cbe_system_reset_exception(struct pt_regs *regs) 89 89 { 90 90 switch (regs->msr & SRR1_WAKEMASK) { 91 - case SRR1_WAKEEE: 92 - do_IRQ(regs); 93 - break; 94 91 case SRR1_WAKEDEC: 95 - timer_interrupt(regs); 92 + set_dec(1); 93 + case SRR1_WAKEEE: 94 + /* 95 + * Handle these when interrupts get re-enabled and we take 96 + * them as regular exceptions. We are in an NMI context 97 + * and can't handle these here. 98 + */ 96 99 break; 97 100 case SRR1_WAKEMT: 98 101 return cbe_sysreset_hack();
+1
arch/powerpc/platforms/chrp/smp.c
··· 44 44 45 45 /* CHRP with openpic */ 46 46 struct smp_ops_t chrp_smp_ops = { 47 + .cause_nmi_ipi = NULL, 47 48 .message_pass = smp_mpic_message_pass, 48 49 .probe = smp_mpic_probe, 49 50 .kick_cpu = smp_chrp_kick_cpu,
+7 -4
arch/powerpc/platforms/pasemi/idle.c
··· 53 53 regs->nip = regs->link; 54 54 55 55 switch (regs->msr & SRR1_WAKEMASK) { 56 - case SRR1_WAKEEE: 57 - do_IRQ(regs); 58 - break; 59 56 case SRR1_WAKEDEC: 60 - timer_interrupt(regs); 57 + set_dec(1); 58 + case SRR1_WAKEEE: 59 + /* 60 + * Handle these when interrupts get re-enabled and we take 61 + * them as regular exceptions. We are in an NMI context 62 + * and can't handle these here. 63 + */ 61 64 break; 62 65 default: 63 66 /* do system reset */
+2 -1
arch/powerpc/platforms/powermac/smp.c
··· 172 172 return IRQ_HANDLED; 173 173 } 174 174 175 - static void smp_psurge_cause_ipi(int cpu, unsigned long data) 175 + static void smp_psurge_cause_ipi(int cpu) 176 176 { 177 177 psurge_set_ipi(cpu); 178 178 } ··· 447 447 struct smp_ops_t psurge_smp_ops = { 448 448 .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ 449 449 .cause_ipi = smp_psurge_cause_ipi, 450 + .cause_nmi_ipi = NULL, 450 451 .probe = smp_psurge_probe, 451 452 .kick_cpu = smp_psurge_kick_cpu, 452 453 .setup_cpu = smp_psurge_setup_cpu,
+3
arch/powerpc/platforms/powernv/Kconfig
··· 4 4 select PPC_NATIVE 5 5 select PPC_XICS 6 6 select PPC_ICP_NATIVE 7 + select PPC_XIVE_NATIVE 7 8 select PPC_P7_NAP 8 9 select PCI 9 10 select PCI_MSI ··· 20 19 select CPU_FREQ_GOV_ONDEMAND 21 20 select CPU_FREQ_GOV_CONSERVATIVE 22 21 select PPC_DOORBELL 22 + select MMU_NOTIFIER 23 + select FORCE_SMP 23 24 default y 24 25 25 26 config OPAL_PRD
+7
arch/powerpc/platforms/powernv/eeh-powernv.c
··· 1102 1102 return -EIO; 1103 1103 } 1104 1104 1105 + /* 1106 + * If dealing with the root bus (or the bus underneath the 1107 + * root port), we reset the bus underneath the root port. 1108 + * 1109 + * The cxl driver depends on this behaviour for bi-modal card 1110 + * switching. 1111 + */ 1105 1112 if (pci_is_root_bus(bus) || 1106 1113 pci_is_root_bus(bus->parent)) 1107 1114 return pnv_eeh_root_reset(hose, option);
+80 -29
arch/powerpc/platforms/powernv/idle.c
··· 53 53 uint64_t pir = get_hard_smp_processor_id(cpu); 54 54 uint64_t hsprg0_val = (uint64_t)&paca[cpu]; 55 55 56 - if (!cpu_has_feature(CPU_FTR_ARCH_300)) { 57 - /* 58 - * HSPRG0 is used to store the cpu's pointer to paca. 59 - * Hence last 3 bits are guaranteed to be 0. Program 60 - * slw to restore HSPRG0 with 63rd bit set, so that 61 - * when a thread wakes up at 0x100 we can use this bit 62 - * to distinguish between fastsleep and deep winkle. 63 - * This is not necessary with stop/psscr since PLS 64 - * field of psscr indicates which state we are waking 65 - * up from. 66 - */ 67 - hsprg0_val |= 1; 68 - } 69 56 rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val); 70 57 if (rc != 0) 71 58 return rc; ··· 109 122 for (i = 0; i < nr_cores; i++) { 110 123 int first_cpu = i * threads_per_core; 111 124 int node = cpu_to_node(first_cpu); 125 + size_t paca_ptr_array_size; 112 126 113 127 core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); 114 128 *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; 129 + paca_ptr_array_size = (threads_per_core * 130 + sizeof(struct paca_struct *)); 115 131 116 132 for (j = 0; j < threads_per_core; j++) { 117 133 int cpu = first_cpu + j; ··· 122 132 paca[cpu].core_idle_state_ptr = core_idle_state; 123 133 paca[cpu].thread_idle_state = PNV_THREAD_RUNNING; 124 134 paca[cpu].thread_mask = 1 << j; 135 + if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) 136 + continue; 137 + paca[cpu].thread_sibling_pacas = 138 + kmalloc_node(paca_ptr_array_size, 139 + GFP_KERNEL, node); 125 140 } 126 141 } 127 142 ··· 141 146 return supported_cpuidle_states; 142 147 } 143 148 EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); 144 - 145 149 146 150 static void pnv_fastsleep_workaround_apply(void *info) 147 151 ··· 235 241 * The default stop state that will be used by ppc_md.power_save 236 242 * function on platforms that support stop instruction.
237 243 */ 238 - u64 pnv_default_stop_val; 239 - u64 pnv_default_stop_mask; 244 + static u64 pnv_default_stop_val; 245 + static u64 pnv_default_stop_mask; 246 + static bool default_stop_found; 240 247 241 248 /* 242 249 * Used for ppc_md.power_save which needs a function with no parameters ··· 257 262 * psscr value and mask of the deepest stop idle state. 258 263 * Used when a cpu is offlined. 259 264 */ 260 - u64 pnv_deepest_stop_psscr_val; 261 - u64 pnv_deepest_stop_psscr_mask; 265 + static u64 pnv_deepest_stop_psscr_val; 266 + static u64 pnv_deepest_stop_psscr_mask; 267 + static bool deepest_stop_found; 268 + 269 + /* 270 + * pnv_cpu_offline: A function that puts the CPU into the deepest 271 + * available platform idle state on a CPU-Offline. 272 + */ 273 + unsigned long pnv_cpu_offline(unsigned int cpu) 274 + { 275 + unsigned long srr1; 276 + 277 + u32 idle_states = pnv_get_supported_cpuidle_states(); 278 + 279 + if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { 280 + srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, 281 + pnv_deepest_stop_psscr_mask); 282 + } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { 283 + srr1 = power7_winkle(); 284 + } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || 285 + (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { 286 + srr1 = power7_sleep(); 287 + } else if (idle_states & OPAL_PM_NAP_ENABLED) { 288 + srr1 = power7_nap(1); 289 + } else { 290 + /* This is the fallback method. We emulate snooze */ 291 + while (!generic_check_cpu_restart(cpu)) { 292 + HMT_low(); 293 + HMT_very_low(); 294 + } 295 + srr1 = 0; 296 + HMT_medium(); 297 + } 298 + 299 + return srr1; 300 + } 262 301 263 302 /* 264 303 * Power ISA 3.0 idle initialization.
··· 381 352 u32 *residency_ns = NULL; 382 353 u64 max_residency_ns = 0; 383 354 int rc = 0, i; 384 - bool default_stop_found = false, deepest_stop_found = false; 385 355 386 356 psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL); 387 357 psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL); ··· 460 432 } 461 433 } 462 434 463 - if (!default_stop_found) { 464 - pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL; 465 - pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK; 466 - pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", 435 + if (unlikely(!default_stop_found)) { 436 + pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n"); 437 + } else { 438 + ppc_md.power_save = power9_idle; 439 + pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n", 467 440 pnv_default_stop_val, pnv_default_stop_mask); 468 441 } 469 442 470 - if (!deepest_stop_found) { 471 - pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL; 472 - pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK; 473 - pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", 443 + if (unlikely(!deepest_stop_found)) { 444 + pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait"); 445 + } else { 446 + pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n", 474 447 pnv_deepest_stop_psscr_val, 475 448 pnv_deepest_stop_psscr_mask); 476 449 } 477 450 451 + pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n", 452 + pnv_first_deep_stop_state); 478 453 out: 479 454 kfree(psscr_val); 480 455 kfree(psscr_mask); ··· 555 524 556 525 pnv_alloc_idle_core_states(); 557 526 527 + /* 528 + * For each CPU, record its PACA address in each of its 529 + * sibling thread's PACA at the slot corresponding to this 530 + * CPU's index in the core.
531 + */ 532 + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { 533 + int cpu; 534 + 535 + pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n"); 536 + for_each_possible_cpu(cpu) { 537 + int base_cpu = cpu_first_thread_sibling(cpu); 538 + int idx = cpu_thread_in_core(cpu); 539 + int i; 540 + 541 + for (i = 0; i < threads_per_core; i++) { 542 + int j = base_cpu + i; 543 + 544 + paca[j].thread_sibling_pacas[idx] = &paca[cpu]; 545 + } 546 + } 547 + } 548 + 558 549 if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) 559 550 ppc_md.power_save = power7_idle; 560 - else if (supported_cpuidle_states & OPAL_PM_STOP_INST_FAST) 561 - ppc_md.power_save = power9_idle; 562 551 563 552 out: 564 553 return 0;
+466 -4
arch/powerpc/platforms/powernv/npu-dma.c
··· 9 9 * License as published by the Free Software Foundation. 10 10 */ 11 11 12 + #include <linux/slab.h> 13 + #include <linux/mmu_notifier.h> 14 + #include <linux/mmu_context.h> 15 + #include <linux/of.h> 12 16 #include <linux/export.h> 13 17 #include <linux/pci.h> 14 18 #include <linux/memblock.h> 15 19 #include <linux/iommu.h> 16 20 21 + #include <asm/tlb.h> 22 + #include <asm/powernv.h> 23 + #include <asm/reg.h> 24 + #include <asm/opal.h> 25 + #include <asm/io.h> 17 26 #include <asm/iommu.h> 18 27 #include <asm/pnv-pci.h> 19 28 #include <asm/msi_bitmap.h> ··· 30 21 31 22 #include "powernv.h" 32 23 #include "pci.h" 24 + 25 + #define npu_to_phb(x) container_of(x, struct pnv_phb, npu) 33 26 34 27 /* 35 28 * Other types of TCE cache invalidation are not functional in the ··· 47 36 { 48 37 struct device_node *dn; 49 38 struct pci_dev *gpdev; 39 + 40 + if (WARN_ON(!npdev)) 41 + return NULL; 42 + 43 + if (WARN_ON(!npdev->dev.of_node)) 44 + return NULL; 50 45 51 46 /* Get associated PCI device */ 52 47 dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0); ··· 71 54 { 72 55 struct device_node *dn; 73 56 struct pci_dev *npdev; 57 + 58 + if (WARN_ON(!gpdev)) 59 + return NULL; 60 + 61 + if (WARN_ON(!gpdev->dev.of_node)) 62 + return NULL; 74 63 75 64 /* Get associated PCI device */ 76 65 dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index); ··· 203 180 pe_err(npe, "Failed to configure TCE table, err %lld\n", rc); 204 181 return rc; 205 182 } 206 - pnv_pci_phb3_tce_invalidate_entire(phb, false); 183 + pnv_pci_ioda2_tce_invalidate_entire(phb, false); 207 184 208 185 /* Add the table to the list so its TCE cache will get invalidated */ 209 186 pnv_pci_link_table_and_group(phb->hose->node, num, ··· 227 204 pe_err(npe, "Unmapping failed, ret = %lld\n", rc); 228 205 return rc; 229 206 } 230 - pnv_pci_phb3_tce_invalidate_entire(phb, false); 207 + pnv_pci_ioda2_tce_invalidate_entire(phb, false); 231 208 232 209
pnv_pci_unlink_table_and_group(npe->table_group.tables[num], 233 210 &npe->table_group); ··· 293 270 0 /* bypass base */, top); 294 271 295 272 if (rc == OPAL_SUCCESS) 296 - pnv_pci_phb3_tce_invalidate_entire(phb, false); 273 + pnv_pci_ioda2_tce_invalidate_entire(phb, false); 297 274 298 275 return rc; 299 276 } ··· 357 334 pe_err(npe, "Failed to disable bypass, err %lld\n", rc); 358 335 return; 359 336 } 360 - pnv_pci_phb3_tce_invalidate_entire(npe->phb, false); 337 + pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); 361 338 362 339 struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) ··· 381 358 } 382 359 383 360 return gpe; 361 + } 362 + 363 + /* Maximum number of nvlinks per npu */ 364 + #define NV_MAX_LINKS 6 365 + 366 + /* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */ 367 + static int max_npu2_index; 368 + 369 + struct npu_context { 370 + struct mm_struct *mm; 371 + struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS]; 372 + struct mmu_notifier mn; 373 + struct kref kref; 374 + 375 + /* Callback to stop translation requests on a given GPU */ 376 + struct npu_context *(*release_cb)(struct npu_context *, void *); 377 + 378 + /* 379 + * Private pointer passed to the above callback for usage by 380 + * device drivers. 381 + */ 382 + void *priv; 383 + }; 384 + 385 + /* 386 + * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
388 + */ 389 + static int get_mmio_atsd_reg(struct npu *npu) 390 + { 391 + int i; 392 + 393 + for (i = 0; i < npu->mmio_atsd_count; i++) { 394 + if (!test_and_set_bit(i, &npu->mmio_atsd_usage)) 395 + return i; 396 + } 397 + 398 + return -ENOSPC; 399 + } 400 + 401 + static void put_mmio_atsd_reg(struct npu *npu, int reg) 402 + { 403 + clear_bit(reg, &npu->mmio_atsd_usage); 404 + } 405 + 406 + /* MMIO ATSD register offsets */ 407 + #define XTS_ATSD_AVA 1 408 + #define XTS_ATSD_STAT 2 409 + 410 + static int mmio_launch_invalidate(struct npu *npu, unsigned long launch, 411 + unsigned long va) 412 + { 413 + int mmio_atsd_reg; 414 + 415 + do { 416 + mmio_atsd_reg = get_mmio_atsd_reg(npu); 417 + cpu_relax(); 418 + } while (mmio_atsd_reg < 0); 419 + 420 + __raw_writeq(cpu_to_be64(va), 421 + npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA); 422 + eieio(); 423 + __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]); 424 + 425 + return mmio_atsd_reg; 426 + } 427 + 428 + static int mmio_invalidate_pid(struct npu *npu, unsigned long pid) 429 + { 430 + unsigned long launch; 431 + 432 + /* IS set to invalidate matching PID */ 433 + launch = PPC_BIT(12); 434 + 435 + /* PRS set to process-scoped */ 436 + launch |= PPC_BIT(13); 437 + 438 + /* AP */ 439 + launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); 440 + 441 + /* PID */ 442 + launch |= pid << PPC_BITLSHIFT(38); 443 + 444 + /* Invalidating the entire process doesn't use a va */ 445 + return mmio_launch_invalidate(npu, launch, 0); 446 + } 447 + 448 + static int mmio_invalidate_va(struct npu *npu, unsigned long va, 449 + unsigned long pid) 450 + { 451 + unsigned long launch; 452 + 453 + /* IS set to invalidate target VA */ 454 + launch = 0; 455 + 456 + /* PRS set to process scoped */ 457 + launch |= PPC_BIT(13); 458 + 459 + /* AP */ 460 + launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); 461 + 462 + /* PID */ 463 + launch |= pid << PPC_BITLSHIFT(38); 464 + 465 + return
mmio_launch_invalidate(npu, launch, va); 466 + } 467 + 468 + #define mn_to_npu_context(x) container_of(x, struct npu_context, mn) 469 + 470 + /* 471 + * Invalidate either a single address or an entire PID depending on 472 + * the value of va. 473 + */ 474 + static void mmio_invalidate(struct npu_context *npu_context, int va, 475 + unsigned long address) 476 + { 477 + int i, j, reg; 478 + struct npu *npu; 479 + struct pnv_phb *nphb; 480 + struct pci_dev *npdev; 481 + struct { 482 + struct npu *npu; 483 + int reg; 484 + } mmio_atsd_reg[NV_MAX_NPUS]; 485 + unsigned long pid = npu_context->mm->context.id; 486 + 487 + /* 488 + * Loop over all the NPUs this process is active on and launch 489 + * an invalidate. 490 + */ 491 + for (i = 0; i <= max_npu2_index; i++) { 492 + mmio_atsd_reg[i].reg = -1; 493 + for (j = 0; j < NV_MAX_LINKS; j++) { 494 + npdev = npu_context->npdev[i][j]; 495 + if (!npdev) 496 + continue; 497 + 498 + nphb = pci_bus_to_host(npdev->bus)->private_data; 499 + npu = &nphb->npu; 500 + mmio_atsd_reg[i].npu = npu; 501 + 502 + if (va) 503 + mmio_atsd_reg[i].reg = 504 + mmio_invalidate_va(npu, address, pid); 505 + else 506 + mmio_atsd_reg[i].reg = 507 + mmio_invalidate_pid(npu, pid); 508 + 509 + /* 510 + * The NPU hardware forwards the shootdown to all GPUs 511 + * so we only have to launch one shootdown per NPU. 512 + */ 513 + break; 514 + } 515 + } 516 + 517 + /* 518 + * Unfortunately the nest mmu does not support flushing specific 519 + * addresses so we have to flush the whole mm.
520 + */ 521 + flush_tlb_mm(npu_context->mm); 522 + 523 + /* Wait for all invalidations to complete */ 524 + for (i = 0; i <= max_npu2_index; i++) { 525 + if (mmio_atsd_reg[i].reg < 0) 526 + continue; 527 + 528 + /* Wait for completion */ 529 + npu = mmio_atsd_reg[i].npu; 530 + reg = mmio_atsd_reg[i].reg; 531 + while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) 532 + cpu_relax(); 533 + put_mmio_atsd_reg(npu, reg); 534 + } 535 + } 536 + 537 + static void pnv_npu2_mn_release(struct mmu_notifier *mn, 538 + struct mm_struct *mm) 539 + { 540 + struct npu_context *npu_context = mn_to_npu_context(mn); 541 + 542 + /* Call into device driver to stop requests to the NMMU */ 543 + if (npu_context->release_cb) 544 + npu_context->release_cb(npu_context, npu_context->priv); 545 + 546 + /* 547 + * There should be no more translation requests for this PID, but we 548 + * need to ensure any entries for it are removed from the TLB. 549 + */ 550 + mmio_invalidate(npu_context, 0, 0); 551 + } 552 + 553 + static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, 554 + struct mm_struct *mm, 555 + unsigned long address, 556 + pte_t pte) 557 + { 558 + struct npu_context *npu_context = mn_to_npu_context(mn); 559 + 560 + mmio_invalidate(npu_context, 1, address); 561 + } 562 + 563 + static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, 564 + struct mm_struct *mm, 565 + unsigned long address) 566 + { 567 + struct npu_context *npu_context = mn_to_npu_context(mn); 568 + 569 + mmio_invalidate(npu_context, 1, address); 570 + } 571 + 572 + static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, 573 + struct mm_struct *mm, 574 + unsigned long start, unsigned long end) 575 + { 576 + struct npu_context *npu_context = mn_to_npu_context(mn); 577 + unsigned long address; 578 + 579 + for (address = start; address <= end; address += PAGE_SIZE) 580 + mmio_invalidate(npu_context, 1, address); 581 + } 582 + 583 + static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
584 + .release = pnv_npu2_mn_release, 585 + .change_pte = pnv_npu2_mn_change_pte, 586 + .invalidate_page = pnv_npu2_mn_invalidate_page, 587 + .invalidate_range = pnv_npu2_mn_invalidate_range, 588 + }; 589 + 590 + /* 591 + * Call into OPAL to setup the nmmu context for the current task in 592 + * the NPU. This must be called to setup the context tables before the 593 + * GPU issues ATRs. pdev should be a pointer to a PCIe GPU device. 594 + * 595 + * A release callback should be registered to allow a device driver to 596 + * be notified that it should not launch any new translation requests 597 + * as the final TLB invalidate is about to occur. 598 + * 599 + * Returns an error if no contexts are currently available, or a 600 + * npu_context which should be passed to pnv_npu2_handle_fault(). 601 + * 602 + * mmap_sem must be held in write mode. 603 + */ 604 + struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, 605 + unsigned long flags, 606 + struct npu_context *(*cb)(struct npu_context *, void *), 607 + void *priv) 608 + { 609 + int rc; 610 + u32 nvlink_index; 611 + struct device_node *nvlink_dn; 612 + struct mm_struct *mm = current->mm; 613 + struct pnv_phb *nphb; 614 + struct npu *npu; 615 + struct npu_context *npu_context; 616 + 617 + /* 618 + * At present we don't support GPUs connected to multiple NPUs and I'm 619 + * not sure the hardware does either. 620 + */ 621 + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); 622 + 623 + if (!firmware_has_feature(FW_FEATURE_OPAL)) 624 + return ERR_PTR(-ENODEV); 625 + 626 + if (!npdev) 627 + /* No nvlink associated with this GPU device */ 628 + return ERR_PTR(-ENODEV); 629 + 630 + if (!mm) { 631 + /* kernel thread contexts are not supported */ 632 + return ERR_PTR(-EINVAL); 633 + } 634 + 635 + nphb = pci_bus_to_host(npdev->bus)->private_data; 636 + npu = &nphb->npu; 637 + 638 + /* 639 + * Setup the NPU context table for a particular GPU.
These need to be 640 + * per-GPU as we need the tables to filter ATSDs when there are no 641 + * active contexts on a particular GPU. 642 + */ 643 + rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags, 644 + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); 645 + if (rc < 0) 646 + return ERR_PTR(-ENOSPC); 647 + 648 + /* 649 + * We store the npu pci device so we can more easily get at the 650 + * associated npus. 651 + */ 652 + npu_context = mm->context.npu_context; 653 + if (!npu_context) { 654 + npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL); 655 + if (!npu_context) 656 + return ERR_PTR(-ENOMEM); 657 + 658 + mm->context.npu_context = npu_context; 659 + npu_context->mm = mm; 660 + npu_context->mn.ops = &nv_nmmu_notifier_ops; 661 + __mmu_notifier_register(&npu_context->mn, mm); 662 + kref_init(&npu_context->kref); 663 + } else { 664 + kref_get(&npu_context->kref); 665 + } 666 + 667 + npu_context->release_cb = cb; 668 + npu_context->priv = priv; 669 + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); 670 + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", 671 + &nvlink_index))) 672 + return ERR_PTR(-ENODEV); 673 + npu_context->npdev[npu->index][nvlink_index] = npdev; 674 + 675 + return npu_context; 676 + } 677 + EXPORT_SYMBOL(pnv_npu2_init_context); 678 + 679 + static void pnv_npu2_release_context(struct kref *kref) 680 + { 681 + struct npu_context *npu_context = 682 + container_of(kref, struct npu_context, kref); 683 + 684 + npu_context->mm->context.npu_context = NULL; 685 + mmu_notifier_unregister(&npu_context->mn, 686 + npu_context->mm); 687 + 688 + kfree(npu_context); 689 + } 690 + 691 + void pnv_npu2_destroy_context(struct npu_context *npu_context, 692 + struct pci_dev *gpdev) 693 + { 694 + struct pnv_phb *nphb, *phb; 695 + struct npu *npu; 696 + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); 697 + struct device_node *nvlink_dn; 698 + u32 nvlink_index; 699 + 700 + if (WARN_ON(!npdev)) 701 + return;
702 + 703 + if (!firmware_has_feature(FW_FEATURE_OPAL)) 704 + return; 705 + 706 + nphb = pci_bus_to_host(npdev->bus)->private_data; 707 + npu = &nphb->npu; 708 + phb = pci_bus_to_host(gpdev->bus)->private_data; 709 + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); 710 + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", 711 + &nvlink_index))) 712 + return; 713 + npu_context->npdev[npu->index][nvlink_index] = NULL; 714 + opal_npu_destroy_context(phb->opal_id, npu_context->mm->context.id, 715 + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); 716 + kref_put(&npu_context->kref, pnv_npu2_release_context); 717 + } 718 + EXPORT_SYMBOL(pnv_npu2_destroy_context); 719 + 720 + /* 721 + * Assumes mmap_sem is held for the context's associated mm. 722 + */ 723 + int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, 724 + unsigned long *flags, unsigned long *status, int count) 725 + { 726 + u64 rc = 0, result = 0; 727 + int i, is_write; 728 + struct page *page[1]; 729 + 730 + /* mmap_sem should be held so the struct_mm must be present */ 731 + struct mm_struct *mm = context->mm; 732 + 733 + if (!firmware_has_feature(FW_FEATURE_OPAL)) 734 + return -ENODEV; 735 + 736 + WARN_ON(!rwsem_is_locked(&mm->mmap_sem)); 737 + 738 + for (i = 0; i < count; i++) { 739 + is_write = flags[i] & NPU2_WRITE; 740 + rc = get_user_pages_remote(NULL, mm, ea[i], 1, 741 + is_write ? FOLL_WRITE : 0, 742 + page, NULL, NULL); 743 + 744 + /* 745 + * To support virtualised environments we will have to do an 746 + * access to the page to ensure it gets faulted into the 747 + * hypervisor. For the moment virtualisation is not supported in 748 + * other areas so leave the access out.
749 + */ 750 + if (rc != 1) { 751 + status[i] = rc; 752 + result = -EFAULT; 753 + continue; 754 + } 755 + 756 + status[i] = 0; 757 + put_page(page[0]); 758 + } 759 + 760 + return result; 761 + } 762 + EXPORT_SYMBOL(pnv_npu2_handle_fault); 763 + 764 + int pnv_npu2_init(struct pnv_phb *phb) 765 + { 766 + unsigned int i; 767 + u64 mmio_atsd; 768 + struct device_node *dn; 769 + struct pci_dev *gpdev; 770 + static int npu_index; 771 + uint64_t rc = 0; 772 + 773 + for_each_child_of_node(phb->hose->dn, dn) { 774 + gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); 775 + if (gpdev) { 776 + rc = opal_npu_map_lpar(phb->opal_id, 777 + PCI_DEVID(gpdev->bus->number, gpdev->devfn), 778 + 0, 0); 779 + if (rc) 780 + dev_err(&gpdev->dev, 781 + "Error %lld mapping device to LPAR\n", 782 + rc); 783 + } 784 + } 785 + 786 + for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd", 787 + i, &mmio_atsd); i++) 788 + phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32); 789 + 790 + pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i); 791 + phb->npu.mmio_atsd_count = i; 792 + phb->npu.mmio_atsd_usage = 0; 793 + npu_index++; 794 + if (WARN_ON(npu_index >= NV_MAX_NPUS)) 795 + return -ENOSPC; 796 + max_npu2_index = npu_index; 797 + phb->npu.index = npu_index; 798 + 799 + return 0; 384 800 }
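The launch words built in mmio_invalidate_pid()/mmio_invalidate_va() above use IBM (big-endian) bit numbering, where bit 0 is the most significant bit of the 64-bit register. A self-contained model of the PID-scoped encoding, with PPC_BIT/PPC_BITLSHIFT reproduced from the kernel's conventions (atsd_launch_pid is an illustrative helper, and the AP field is taken as a raw argument rather than via mmu_get_ap()):

```c
#include <stdint.h>

/* IBM bit numbering: bit 0 is the MSB of a 64-bit register. */
#define PPC_BITLSHIFT(be)	(63 - (be))
#define PPC_BIT(bit)		(1ULL << PPC_BITLSHIFT(bit))

/* Compose a PID-scoped ATSD launch word, as mmio_invalidate_pid() does. */
static uint64_t atsd_launch_pid(uint64_t ap, uint64_t pid)
{
	uint64_t launch;

	launch  = PPC_BIT(12);			/* IS: invalidate matching PID */
	launch |= PPC_BIT(13);			/* PRS: process-scoped */
	launch |= ap << PPC_BITLSHIFT(17);	/* AP: actual page size */
	launch |= pid << PPC_BITLSHIFT(38);	/* PID to match */

	return launch;
}
```

The VA-scoped variant differs only in leaving the IS bit clear and passing the target address to the launch register.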
+1 -2
arch/powerpc/platforms/powernv/opal-lpc.c
··· 12 12 #include <linux/kernel.h> 13 13 #include <linux/of.h> 14 14 #include <linux/bug.h> 15 - #include <linux/debugfs.h> 16 15 #include <linux/io.h> 17 16 #include <linux/slab.h> 18 17 ··· 20 21 #include <asm/opal.h> 21 22 #include <asm/prom.h> 22 23 #include <linux/uaccess.h> 23 - #include <asm/debug.h> 24 + #include <asm/debugfs.h> 24 25 #include <asm/isa-bridge.h> 25 26 26 27 static int opal_lpc_chip_id = -1;
+4
arch/powerpc/platforms/powernv/opal-sensor.c
··· 64 64 *sensor_data = be32_to_cpu(data); 65 65 break; 66 66 67 + case OPAL_WRONG_STATE: 68 + ret = -EIO; 69 + break; 70 + 67 71 default: 68 72 ret = opal_error_code(ret); 69 73 break;
+40 -31
arch/powerpc/platforms/powernv/opal-wrappers.S
··· 50 50 #define OPAL_BRANCH(LABEL) 51 51 #endif 52 52 53 - /* TODO: 54 - * 55 - * - Trace irqs in/off (needs saving/restoring all args, argh...) 56 - * - Get r11 feed up by Dave so I can have better register usage 53 + /* 54 + * DO_OPAL_CALL assumes: 55 + * r0 = opal call token 56 + * r12 = msr 57 + * LR has been saved 57 58 */ 58 - 59 - #define OPAL_CALL(name, token) \ 60 - _GLOBAL_TOC(name); \ 61 - mfmsr r12; \ 62 - mflr r0; \ 63 - andi. r11,r12,MSR_IR|MSR_DR; \ 64 - std r0,PPC_LR_STKOFF(r1); \ 65 - li r0,token; \ 66 - beq opal_real_call; \ 67 - OPAL_BRANCH(opal_tracepoint_entry) \ 59 + #define DO_OPAL_CALL() \ 68 60 mfcr r11; \ 69 61 stw r11,8(r1); \ 70 62 li r11,0; \ ··· 74 82 ld r2,0(r11); \ 75 83 mtspr SPRN_HSRR0,r12; \ 76 84 hrfid 85 + 86 + #define OPAL_CALL(name, token) \ 87 + _GLOBAL_TOC(name); \ 88 + mfmsr r12; \ 89 + mflr r0; \ 90 + andi. r11,r12,MSR_IR|MSR_DR; \ 91 + std r0,PPC_LR_STKOFF(r1); \ 92 + li r0,token; \ 93 + beq opal_real_call; \ 94 + OPAL_BRANCH(opal_tracepoint_entry) \ 95 + DO_OPAL_CALL() 96 + 77 97 78 98 opal_return: 79 99 /* ··· 152 148 ld r8,STK_REG(R29)(r1) 153 149 ld r9,STK_REG(R30)(r1) 154 150 ld r10,STK_REG(R31)(r1) 151 + 152 + /* setup LR so we return via tracepoint_return */ 155 153 LOAD_REG_ADDR(r11,opal_tracepoint_return) 156 - mfcr r12 157 154 std r11,16(r1) 158 - stw r12,8(r1) 159 - li r11,0 155 + 160 156 mfmsr r12 161 - ori r11,r11,MSR_EE 162 - std r12,PACASAVEDMSR(r13) 163 - andc r12,r12,r11 164 - mtmsrd r12,1 165 - LOAD_REG_ADDR(r11,opal_return) 166 - mtlr r11 167 - li r11,MSR_DR|MSR_IR|MSR_LE 168 - andc r12,r12,r11 169 - mtspr SPRN_HSRR1,r12 170 - LOAD_REG_ADDR(r11,opal) 171 - ld r12,8(r11) 172 - ld r2,0(r11) 173 - mtspr SPRN_HSRR0,r12 174 - hrfid 157 + DO_OPAL_CALL() 175 158 176 159 opal_tracepoint_return: 177 160 std r3,STK_REG(R31)(r1) ··· 292 301 OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); 293 302 OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); 294 303 OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); 304 +
OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); 305 + OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); 306 + OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); 307 + OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); 308 + OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); 309 + OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); 310 + OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); 311 + OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); 312 + OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); 313 + OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); 314 + OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); 315 + OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); 316 + OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); 317 + OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); 318 + OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); 319 + OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); 320 + OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); 321 + OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
+17 -10
arch/powerpc/platforms/powernv/opal-xscom.c
··· 73 73 74 74 static u64 opal_scom_unmangle(u64 addr) 75 75 { 76 + u64 tmp; 77 + 76 78 /* 77 - * XSCOM indirect addresses have the top bit set. Additionally 78 - * the rest of the top 3 nibbles is always 0. 79 + * XSCOM addresses use the top nibble to set indirect mode and 80 + * its form. Bits 4-11 are always 0. 79 81 * 80 82 * Because the debugfs interface uses signed offsets and shifts 81 83 * the address left by 3, we basically cannot use the top 4 bits 82 84 * of the 64-bit address, and thus cannot use the indirect bit. 83 85 * 84 - * To deal with that, we support the indirect bit being in bit 85 - * 4 (IBM notation) instead of bit 0 in this API, we do the 86 - * conversion here. To leave room for further xscom address 87 - * expansion, we only clear out the top byte 86 + * To deal with that, we support the indirect bits being in 87 + * bits 4-7 (IBM notation) instead of bit 0-3 in this API, we 88 + * do the conversion here. 88 89 * 89 - * For in-kernel use, we also support the real indirect bit, so 90 - * we test for any of the top 5 bits 90 + * For in-kernel use, we don't need to do this mangling. In 91 + * kernel won't have bits 4-7 set. 91 92 * 93 + * So: 94 + * debugfs will always set 0-3 = 0 and clear 4-7 95 + * kernel will always clear 0-3 = 0 and set 4-7 92 96 */ 93 - if (addr & (0x1full << 59)) 94 - addr = (addr & ~(0xffull << 56)) | (1ull << 63); 97 + tmp = addr; 98 + tmp &= 0x0f00000000000000; 99 + addr &= 0xf0ffffffffffffff; 100 + addr |= tmp << 4; 101 + 95 102 return addr; 96 103 } 97 104
+78 -1
arch/powerpc/platforms/powernv/opal.c
··· 435 435 evt.version); 436 436 return 0; 437 437 } 438 - machine_check_print_event_info(&evt); 438 + machine_check_print_event_info(&evt, user_mode(regs)); 439 439 440 440 if (opal_recover_mce(regs, &evt)) 441 441 return 1; ··· 595 595 pr_warn("Error %d creating OPAL symbols file\n", rc); 596 596 } 597 597 598 + static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, 599 + struct bin_attribute *bin_attr, char *buf, 600 + loff_t off, size_t count) 601 + { 602 + return memory_read_from_buffer(buf, count, &off, bin_attr->private, 603 + bin_attr->size); 604 + } 605 + 606 + /* 607 + * opal_export_attrs: creates a sysfs node for each property listed in 608 + * the device-tree under /ibm,opal/firmware/exports/ 609 + * All new sysfs nodes are created under /opal/exports/. 610 + * This allows for reserved memory regions (e.g. HDAT) to be read. 611 + * The new sysfs nodes are only readable by root. 612 + */ 613 + static void opal_export_attrs(void) 614 + { 615 + struct bin_attribute *attr; 616 + struct device_node *np; 617 + struct property *prop; 618 + struct kobject *kobj; 619 + u64 vals[2]; 620 + int rc; 621 + 622 + np = of_find_node_by_path("/ibm,opal/firmware/exports"); 623 + if (!np) 624 + return; 625 + 626 + /* Create new 'exports' directory - /sys/firmware/opal/exports */ 627 + kobj = kobject_create_and_add("exports", opal_kobj); 628 + if (!kobj) { 629 + pr_warn("kobject_create_and_add() of exports failed\n"); 630 + return; 631 + } 632 + 633 + for_each_property_of_node(np, prop) { 634 + if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle")) 635 + continue; 636 + 637 + if (of_property_read_u64_array(np, prop->name, &vals[0], 2)) 638 + continue; 639 + 640 + attr = kzalloc(sizeof(*attr), GFP_KERNEL); 641 + 642 + if (attr == NULL) { 643 + pr_warn("Failed kmalloc for bin_attribute!"); 644 + continue; 645 + } 646 + 647 + sysfs_bin_attr_init(attr); 648 + attr->attr.name = kstrdup(prop->name, GFP_KERNEL); 649 + attr->attr.mode = 0400; 650 + 
attr->read = export_attr_read; 651 + attr->private = __va(vals[0]); 652 + attr->size = vals[1]; 653 + 654 + if (attr->attr.name == NULL) { 655 + pr_warn("Failed kstrdup for bin_attribute attr.name"); 656 + kfree(attr); 657 + continue; 658 + } 659 + 660 + rc = sysfs_create_bin_file(kobj, attr); 661 + if (rc) { 662 + pr_warn("Error %d creating OPAL sysfs exports/%s file\n", 663 + rc, prop->name); 664 + kfree(attr->attr.name); 665 + kfree(attr); 666 + } 667 + } 668 + 669 + of_node_put(np); 670 + } 671 + 598 672 static void __init opal_dump_region_init(void) 599 673 { 600 674 void *addr; ··· 806 732 /* Setup message log sysfs interface. */ 807 733 opal_msglog_sysfs_init(); 808 734 } 735 + 736 + /* Export all properties */ 737 + opal_export_attrs(); 809 738 810 739 /* Initialize platform devices: IPMI backend, PRD & flash interface */ 811 740 opal_pdev_init("ibm,opal-ipmi");
+52 -19
arch/powerpc/platforms/powernv/pci-ioda.c
··· 14 14 #include <linux/kernel.h> 15 15 #include <linux/pci.h> 16 16 #include <linux/crash_dump.h> 17 - #include <linux/debugfs.h> 18 17 #include <linux/delay.h> 19 18 #include <linux/string.h> 20 19 #include <linux/init.h> ··· 37 38 #include <asm/iommu.h> 38 39 #include <asm/tce.h> 39 40 #include <asm/xics.h> 40 - #include <asm/debug.h> 41 + #include <asm/debugfs.h> 41 42 #include <asm/firmware.h> 42 43 #include <asm/pnv-pci.h> 43 44 #include <asm/mmzone.h> ··· 1261 1262 /* PE#0 is needed for error reporting */ 1262 1263 pnv_ioda_reserve_pe(phb, 0); 1263 1264 pnv_ioda_setup_npu_PEs(hose->bus); 1265 + if (phb->model == PNV_PHB_MODEL_NPU2) 1266 + pnv_npu2_init(phb); 1264 1267 } 1265 1268 } 1266 1269 } ··· 1425 1424 iommu_group_put(pe->table_group.group); 1426 1425 BUG_ON(pe->table_group.group); 1427 1426 } 1428 - pnv_pci_ioda2_table_free_pages(tbl); 1429 - iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); 1427 + iommu_tce_table_put(tbl); 1430 1428 } 1431 1429 1432 1430 static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) ··· 1860 1860 1861 1861 return ret; 1862 1862 } 1863 + 1864 + static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, 1865 + unsigned long *hpa, enum dma_data_direction *direction) 1866 + { 1867 + long ret = pnv_tce_xchg(tbl, index, hpa, direction); 1868 + 1869 + if (!ret) 1870 + pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); 1871 + 1872 + return ret; 1873 + } 1863 1874 #endif 1864 1875 1865 1876 static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, ··· 1885 1874 .set = pnv_ioda1_tce_build, 1886 1875 #ifdef CONFIG_IOMMU_API 1887 1876 .exchange = pnv_ioda1_tce_xchg, 1877 + .exchange_rm = pnv_ioda1_tce_xchg_rm, 1888 1878 #endif 1889 1879 .clear = pnv_ioda1_tce_free, 1890 1880 .get = pnv_tce_get, ··· 1895 1883 #define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1) 1896 1884 #define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2) 1897 1885 1898 - void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm) 1886 + static 
void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm) 1899 1887 { 1900 1888 __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm); 1901 1889 const unsigned long val = PHB3_TCE_KILL_INVAL_ALL; ··· 1960 1948 { 1961 1949 struct iommu_table_group_link *tgl; 1962 1950 1963 - list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { 1951 + list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) { 1964 1952 struct pnv_ioda_pe *pe = container_of(tgl->table_group, 1965 1953 struct pnv_ioda_pe, table_group); 1966 1954 struct pnv_phb *phb = pe->phb; ··· 1991 1979 } 1992 1980 } 1993 1981 1982 + void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm) 1983 + { 1984 + if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3) 1985 + pnv_pci_phb3_tce_invalidate_entire(phb, rm); 1986 + else 1987 + opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0); 1988 + } 1989 + 1994 1990 static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, 1995 1991 long npages, unsigned long uaddr, 1996 1992 enum dma_data_direction direction, ··· 2024 2004 2025 2005 return ret; 2026 2006 } 2007 + 2008 + static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, 2009 + unsigned long *hpa, enum dma_data_direction *direction) 2010 + { 2011 + long ret = pnv_tce_xchg(tbl, index, hpa, direction); 2012 + 2013 + if (!ret) 2014 + pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); 2015 + 2016 + return ret; 2017 + } 2027 2018 #endif 2028 2019 2029 2020 static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, ··· 2048 2017 static void pnv_ioda2_table_free(struct iommu_table *tbl) 2049 2018 { 2050 2019 pnv_pci_ioda2_table_free_pages(tbl); 2051 - iommu_free_table(tbl, "pnv"); 2052 2020 } 2053 2021 2054 2022 static struct iommu_table_ops pnv_ioda2_iommu_ops = { 2055 2023 .set = pnv_ioda2_tce_build, 2056 2024 #ifdef CONFIG_IOMMU_API 2057 2025 .exchange = pnv_ioda2_tce_xchg, 2026 + .exchange_rm = pnv_ioda2_tce_xchg_rm, 2058 
2027 #endif 2059 2028 .clear = pnv_ioda2_tce_free, 2060 2029 .get = pnv_tce_get, ··· 2159 2128 2160 2129 found: 2161 2130 tbl = pnv_pci_table_alloc(phb->hose->node); 2131 + if (WARN_ON(!tbl)) 2132 + return; 2133 + 2162 2134 iommu_register_group(&pe->table_group, phb->hose->global_number, 2163 2135 pe->pe_number); 2164 2136 pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group); ··· 2237 2203 __free_pages(tce_mem, get_order(tce32_segsz * segs)); 2238 2204 if (tbl) { 2239 2205 pnv_pci_unlink_table_and_group(tbl, &pe->table_group); 2240 - iommu_free_table(tbl, "pnv"); 2206 + iommu_tce_table_put(tbl); 2241 2207 } 2242 2208 } 2243 2209 ··· 2327 2293 if (!tbl) 2328 2294 return -ENOMEM; 2329 2295 2296 + tbl->it_ops = &pnv_ioda2_iommu_ops; 2297 + 2330 2298 ret = pnv_pci_ioda2_table_alloc_pages(nid, 2331 2299 bus_offset, page_shift, window_size, 2332 2300 levels, tbl); 2333 2301 if (ret) { 2334 - iommu_free_table(tbl, "pnv"); 2302 + iommu_tce_table_put(tbl); 2335 2303 return ret; 2336 2304 } 2337 - 2338 - tbl->it_ops = &pnv_ioda2_iommu_ops; 2339 2305 2340 2306 *ptbl = tbl; 2341 2307 ··· 2377 2343 if (rc) { 2378 2344 pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", 2379 2345 rc); 2380 - pnv_ioda2_table_free(tbl); 2346 + iommu_tce_table_put(tbl); 2381 2347 return rc; 2382 2348 } 2383 2349 ··· 2448 2414 2449 2415 tce_table_size /= direct_table_size; 2450 2416 tce_table_size <<= 3; 2451 - tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size); 2417 + tce_table_size = max_t(unsigned long, 2418 + tce_table_size, direct_table_size); 2452 2419 } 2453 2420 2454 2421 return bytes; ··· 2466 2431 pnv_pci_ioda2_unset_window(&pe->table_group, 0); 2467 2432 if (pe->pbus) 2468 2433 pnv_ioda_setup_bus_dma(pe, pe->pbus, false); 2469 - pnv_ioda2_table_free(tbl); 2434 + iommu_tce_table_put(tbl); 2470 2435 } 2471 2436 2472 2437 static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) ··· 2770 2735 if (rc) 2771 2736 return; 2772 2737 
2773 - if (pe->flags & PNV_IODA_PE_DEV) 2774 - iommu_add_device(&pe->pdev->dev); 2775 - else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) 2738 + if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) 2776 2739 pnv_ioda_setup_bus_dma(pe, pe->pbus, true); 2777 2740 } 2778 2741 ··· 3439 3406 } 3440 3407 3441 3408 free_pages(tbl->it_base, get_order(tbl->it_size << 3)); 3442 - iommu_free_table(tbl, "pnv"); 3409 + iommu_tce_table_put(tbl); 3443 3410 } 3444 3411 3445 3412 static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) ··· 3466 3433 } 3467 3434 3468 3435 pnv_pci_ioda2_table_free_pages(tbl); 3469 - iommu_free_table(tbl, "pnv"); 3436 + iommu_tce_table_put(tbl); 3470 3437 } 3471 3438 3472 3439 static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
+5 -1
arch/powerpc/platforms/powernv/pci.c
··· 758 758 759 759 unsigned long pnv_tce_get(struct iommu_table *tbl, long index) 760 760 { 761 - return *(pnv_tce(tbl, index - tbl->it_offset)); 761 + return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset))); 762 762 } 763 763 764 764 struct iommu_table *pnv_pci_table_alloc(int nid) ··· 766 766 struct iommu_table *tbl; 767 767 768 768 tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid); 769 + if (!tbl) 770 + return NULL; 771 + 769 772 INIT_LIST_HEAD_RCU(&tbl->it_group_list); 773 + kref_init(&tbl->it_kref); 770 774 771 775 return tbl; 772 776 }
+15 -2
arch/powerpc/platforms/powernv/pci.h
··· 7 7 8 8 struct pci_dn; 9 9 10 + /* Maximum possible number of ATSD MMIO registers per NPU */ 11 + #define NV_NMMU_ATSD_REGS 8 12 + 10 13 enum pnv_phb_type { 11 14 PNV_PHB_IODA1 = 0, 12 15 PNV_PHB_IODA2 = 1, ··· 177 174 struct OpalIoP7IOCErrorData hub_diag; 178 175 } diag; 179 176 177 + /* Nvlink2 data */ 178 + struct npu { 179 + int index; 180 + __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS]; 181 + unsigned int mmio_atsd_count; 182 + 183 + /* Bitmask for MMIO register usage */ 184 + unsigned long mmio_atsd_usage; 185 + } npu; 186 + 180 187 #ifdef CONFIG_CXL_BASE 181 188 struct cxl_afu *cxl_afu; 182 189 #endif ··· 242 229 243 230 /* Nvlink functions */ 244 231 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); 245 - extern void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm); 232 + extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); 246 233 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); 247 234 extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, 248 235 struct iommu_table *tbl); 249 236 extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); 250 237 extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); 251 238 extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); 252 - 239 + extern int pnv_npu2_init(struct pnv_phb *phb); 253 240 254 241 /* cxl functions */ 255 242 extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev);
-2
arch/powerpc/platforms/powernv/powernv.h
··· 18 18 #endif 19 19 20 20 extern u32 pnv_get_supported_cpuidle_states(void); 21 - extern u64 pnv_deepest_stop_psscr_val; 22 - extern u64 pnv_deepest_stop_psscr_mask; 23 21 24 22 extern void pnv_lpc_init(void); 25 23
+1 -1
arch/powerpc/platforms/powernv/rng.c
··· 62 62 63 63 rng = raw_cpu_read(powernv_rng); 64 64 65 - *v = rng_whiten(rng, in_rm64(rng->regs_real)); 65 + *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real)); 66 66 67 67 return 1; 68 68 }
+16 -3
arch/powerpc/platforms/powernv/setup.c
··· 32 32 #include <asm/machdep.h> 33 33 #include <asm/firmware.h> 34 34 #include <asm/xics.h> 35 + #include <asm/xive.h> 35 36 #include <asm/opal.h> 36 37 #include <asm/kexec.h> 37 38 #include <asm/smp.h> ··· 77 76 78 77 static void __init pnv_init_IRQ(void) 79 78 { 80 - xics_init(); 79 + /* Try using a XIVE if available, otherwise use a XICS */ 80 + if (!xive_native_init()) 81 + xics_init(); 81 82 82 83 WARN_ON(!ppc_md.get_irq); 83 84 } ··· 98 95 else 99 96 seq_printf(m, "firmware\t: BML\n"); 100 97 of_node_put(root); 98 + if (radix_enabled()) 99 + seq_printf(m, "MMU\t\t: Radix\n"); 100 + else 101 + seq_printf(m, "MMU\t\t: Hash\n"); 101 102 } 102 103 103 104 static void pnv_prepare_going_down(void) ··· 225 218 226 219 static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) 227 220 { 228 - xics_kexec_teardown_cpu(secondary); 221 + if (xive_enabled()) 222 + xive_kexec_teardown_cpu(secondary); 223 + else 224 + xics_kexec_teardown_cpu(secondary); 229 225 230 226 /* On OPAL, we return all CPUs to firmware */ 231 - 232 227 if (!firmware_has_feature(FW_FEATURE_OPAL)) 233 228 return; 234 229 ··· 245 236 } else { 246 237 /* Primary waits for the secondaries to have reached OPAL */ 247 238 pnv_kexec_wait_secondaries_down(); 239 + 240 + /* Switch XIVE back to emulation mode */ 241 + if (xive_enabled()) 242 + xive_shutdown(); 248 243 249 244 /* 250 245 * We might be running as little-endian - now that interrupts
+78 -29
arch/powerpc/platforms/powernv/smp.c
··· 29 29 #include <asm/vdso_datapage.h> 30 30 #include <asm/cputhreads.h> 31 31 #include <asm/xics.h> 32 + #include <asm/xive.h> 32 33 #include <asm/opal.h> 33 34 #include <asm/runlatch.h> 34 35 #include <asm/code-patching.h> 35 36 #include <asm/dbell.h> 36 37 #include <asm/kvm_ppc.h> 37 38 #include <asm/ppc-opcode.h> 39 + #include <asm/cpuidle.h> 38 40 39 41 #include "powernv.h" 40 42 ··· 49 47 50 48 static void pnv_smp_setup_cpu(int cpu) 51 49 { 52 - if (cpu != boot_cpuid) 50 + if (xive_enabled()) 51 + xive_smp_setup_cpu(); 52 + else if (cpu != boot_cpuid) 53 53 xics_setup_cpu(); 54 - 55 - #ifdef CONFIG_PPC_DOORBELL 56 - if (cpu_has_feature(CPU_FTR_DBELL)) 57 - doorbell_setup_this_cpu(); 58 - #endif 59 54 } 60 55 61 56 static int pnv_smp_kick_cpu(int nr) ··· 131 132 vdso_data->processorCount--; 132 133 if (cpu == boot_cpuid) 133 134 boot_cpuid = cpumask_any(cpu_online_mask); 134 - xics_migrate_irqs_away(); 135 + if (xive_enabled()) 136 + xive_smp_disable_cpu(); 137 + else 138 + xics_migrate_irqs_away(); 135 139 return 0; 136 140 } 137 141 ··· 142 140 { 143 141 unsigned int cpu; 144 142 unsigned long srr1, wmask; 145 - u32 idle_states; 146 143 147 144 /* Standard hot unplug procedure */ 148 145 local_irq_disable(); ··· 155 154 wmask = SRR1_WAKEMASK; 156 155 if (cpu_has_feature(CPU_FTR_ARCH_207S)) 157 156 wmask = SRR1_WAKEMASK_P8; 158 - 159 - idle_states = pnv_get_supported_cpuidle_states(); 160 157 161 158 /* We don't want to take decrementer interrupts while we are offline, 162 159 * so clear LPCR:PECE1. 
We keep PECE2 (and LPCR_PECE_HVEE on P9) ··· 183 184 kvmppc_set_host_ipi(cpu, 0); 184 185 185 186 ppc64_runlatch_off(); 186 - 187 - if (cpu_has_feature(CPU_FTR_ARCH_300)) { 188 - srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, 189 - pnv_deepest_stop_psscr_mask); 190 - } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { 191 - srr1 = power7_winkle(); 192 - } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || 193 - (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { 194 - srr1 = power7_sleep(); 195 - } else { 196 - srr1 = power7_nap(1); 197 - } 198 - 187 + srr1 = pnv_cpu_offline(cpu); 199 188 ppc64_runlatch_on(); 200 189 201 190 /* ··· 200 213 if (((srr1 & wmask) == SRR1_WAKEEE) || 201 214 ((srr1 & wmask) == SRR1_WAKEHVI) || 202 215 (local_paca->irq_happened & PACA_IRQ_EE)) { 203 - if (cpu_has_feature(CPU_FTR_ARCH_300)) 204 - icp_opal_flush_interrupt(); 205 - else 216 + if (cpu_has_feature(CPU_FTR_ARCH_300)) { 217 + if (xive_enabled()) 218 + xive_flush_interrupt(); 219 + else 220 + icp_opal_flush_interrupt(); 221 + } else 206 222 icp_native_flush_interrupt(); 207 223 } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) { 208 224 unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); ··· 242 252 return smp_generic_cpu_bootable(nr); 243 253 } 244 254 255 + static int pnv_smp_prepare_cpu(int cpu) 256 + { 257 + if (xive_enabled()) 258 + return xive_smp_prepare_cpu(cpu); 259 + return 0; 260 + } 261 + 262 + /* Cause IPI as setup by the interrupt controller (xics or xive) */ 263 + static void (*ic_cause_ipi)(int cpu); 264 + 265 + static void pnv_cause_ipi(int cpu) 266 + { 267 + if (doorbell_try_core_ipi(cpu)) 268 + return; 269 + 270 + ic_cause_ipi(cpu); 271 + } 272 + 273 + static void pnv_p9_dd1_cause_ipi(int cpu) 274 + { 275 + int this_cpu = get_cpu(); 276 + 277 + /* 278 + * POWER9 DD1 has a global addressed msgsnd, but for now we restrict 279 + * IPIs to same core, because it requires additional synchronization 280 + * for inter-core doorbells which we do not implement. 
281 + */ 282 + if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) 283 + doorbell_global_ipi(cpu); 284 + else 285 + ic_cause_ipi(cpu); 286 + 287 + put_cpu(); 288 + } 289 + 290 + static void __init pnv_smp_probe(void) 291 + { 292 + if (xive_enabled()) 293 + xive_smp_probe(); 294 + else 295 + xics_smp_probe(); 296 + 297 + if (cpu_has_feature(CPU_FTR_DBELL)) { 298 + ic_cause_ipi = smp_ops->cause_ipi; 299 + WARN_ON(!ic_cause_ipi); 300 + 301 + if (cpu_has_feature(CPU_FTR_ARCH_300)) { 302 + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) 303 + smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi; 304 + else 305 + smp_ops->cause_ipi = doorbell_global_ipi; 306 + } else { 307 + smp_ops->cause_ipi = pnv_cause_ipi; 308 + } 309 + } 310 + } 311 + 245 312 static struct smp_ops_t pnv_smp_ops = { 246 - .message_pass = smp_muxed_ipi_message_pass, 247 - .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ 248 - .probe = xics_smp_probe, 313 + .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ 314 + .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */ 315 + .cause_nmi_ipi = NULL, 316 + .probe = pnv_smp_probe, 317 + .prepare_cpu = pnv_smp_prepare_cpu, 249 318 .kick_cpu = pnv_smp_kick_cpu, 250 319 .setup_cpu = pnv_smp_setup_cpu, 251 320 .cpu_bootable = pnv_cpu_bootable,
+2 -2
arch/powerpc/platforms/ps3/smp.c
··· 77 77 BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION != 0); 78 78 BUILD_BUG_ON(PPC_MSG_RESCHEDULE != 1); 79 79 BUILD_BUG_ON(PPC_MSG_TICK_BROADCAST != 2); 80 - BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK != 3); 80 + BUILD_BUG_ON(PPC_MSG_NMI_IPI != 3); 81 81 82 82 for (i = 0; i < MSG_COUNT; i++) { 83 83 result = ps3_event_receive_port_setup(cpu, &virqs[i]); ··· 96 96 ps3_register_ipi_irq(cpu, virqs[i]); 97 97 } 98 98 99 - ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_DEBUGGER_BREAK]); 99 + ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_NMI_IPI]); 100 100 101 101 DBG(" <- %s:%d: (%d)\n", __func__, __LINE__, cpu); 102 102 }
+2 -1
arch/powerpc/platforms/pseries/Kconfig
··· 17 17 select PPC_UDBG_16550 18 18 select PPC_NATIVE 19 19 select PPC_DOORBELL 20 - select HOTPLUG_CPU if SMP 20 + select HOTPLUG_CPU 21 21 select ARCH_RANDOM 22 22 select PPC_DOORBELL 23 + select FORCE_SMP 23 24 default y 24 25 25 26 config PPC_SPLPAR
-1
arch/powerpc/platforms/pseries/dlpar.c
··· 288 288 if (rc) 289 289 return rc; 290 290 291 - of_node_put(dn); /* Must decrement the refcount */ 292 291 return 0; 293 292 } 294 293
+1 -2
arch/powerpc/platforms/pseries/dtl.c
··· 21 21 */ 22 22 23 23 #include <linux/slab.h> 24 - #include <linux/debugfs.h> 25 24 #include <linux/spinlock.h> 26 25 #include <asm/smp.h> 27 26 #include <linux/uaccess.h> 28 27 #include <asm/firmware.h> 29 28 #include <asm/lppaca.h> 30 - #include <asm/debug.h> 29 + #include <asm/debugfs.h> 31 30 #include <asm/plpar_wrappers.h> 32 31 #include <asm/machdep.h> 33 32
+10
arch/powerpc/platforms/pseries/hvCall_inst.c
··· 29 29 #include <asm/trace.h> 30 30 #include <asm/machdep.h> 31 31 32 + /* For hcall instrumentation. One structure per-hcall, per-CPU */ 33 + struct hcall_stats { 34 + unsigned long num_calls; /* number of calls (on this CPU) */ 35 + unsigned long tb_total; /* total wall time (mftb) of calls. */ 36 + unsigned long purr_total; /* total cpu time (PURR) of calls. */ 37 + unsigned long tb_start; 38 + unsigned long purr_start; 39 + }; 40 + #define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1) 41 + 32 42 DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); 33 43 34 44 /*
+40 -3
arch/powerpc/platforms/pseries/iommu.c
··· 74 74 goto fail_exit; 75 75 76 76 INIT_LIST_HEAD_RCU(&tbl->it_group_list); 77 + kref_init(&tbl->it_kref); 77 78 tgl->table_group = table_group; 78 79 list_add_rcu(&tgl->next, &tbl->it_group_list); 79 80 ··· 116 115 BUG_ON(table_group->group); 117 116 } 118 117 #endif 119 - iommu_free_table(tbl, node_name); 118 + iommu_tce_table_put(tbl); 120 119 121 120 kfree(table_group); 122 121 } ··· 551 550 static void iommu_table_setparms_lpar(struct pci_controller *phb, 552 551 struct device_node *dn, 553 552 struct iommu_table *tbl, 553 + struct iommu_table_group *table_group, 554 554 const __be32 *dma_window) 555 555 { 556 556 unsigned long offset, size; ··· 565 563 tbl->it_type = TCE_PCI; 566 564 tbl->it_offset = offset >> tbl->it_page_shift; 567 565 tbl->it_size = size >> tbl->it_page_shift; 566 + 567 + table_group->tce32_start = offset; 568 + table_group->tce32_size = size; 568 569 } 569 570 570 571 struct iommu_table_ops iommu_table_pseries_ops = { ··· 656 651 pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size); 657 652 } 658 653 654 + #ifdef CONFIG_IOMMU_API 655 + static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned 656 + long *tce, enum dma_data_direction *direction) 657 + { 658 + long rc; 659 + unsigned long ioba = (unsigned long) index << tbl->it_page_shift; 660 + unsigned long flags, oldtce = 0; 661 + u64 proto_tce = iommu_direction_to_tce_perm(*direction); 662 + unsigned long newtce = *tce | proto_tce; 663 + 664 + spin_lock_irqsave(&tbl->large_pool.lock, flags); 665 + 666 + rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce); 667 + if (!rc) 668 + rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce); 669 + 670 + if (!rc) { 671 + *direction = iommu_tce_direction(oldtce); 672 + *tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); 673 + } 674 + 675 + spin_unlock_irqrestore(&tbl->large_pool.lock, flags); 676 + 677 + return rc; 678 + } 679 + #endif 680 + 659 681 struct iommu_table_ops iommu_table_lpar_multi_ops = { 660 682 
.set = tce_buildmulti_pSeriesLP, 683 + #ifdef CONFIG_IOMMU_API 684 + .exchange = tce_exchange_pseries, 685 + #endif 661 686 .clear = tce_freemulti_pSeriesLP, 662 687 .get = tce_get_pSeriesLP 663 688 }; ··· 724 689 if (!ppci->table_group) { 725 690 ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node); 726 691 tbl = ppci->table_group->tables[0]; 727 - iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); 692 + iommu_table_setparms_lpar(ppci->phb, pdn, tbl, 693 + ppci->table_group, dma_window); 728 694 tbl->it_ops = &iommu_table_lpar_multi_ops; 729 695 iommu_init_table(tbl, ppci->phb->node); 730 696 iommu_register_group(ppci->table_group, ··· 1179 1143 if (!pci->table_group) { 1180 1144 pci->table_group = iommu_pseries_alloc_group(pci->phb->node); 1181 1145 tbl = pci->table_group->tables[0]; 1182 - iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); 1146 + iommu_table_setparms_lpar(pci->phb, pdn, tbl, 1147 + pci->table_group, dma_window); 1183 1148 tbl->it_ops = &iommu_table_lpar_multi_ops; 1184 1149 iommu_init_table(tbl, pci->phb->node); 1185 1150 iommu_register_group(pci->table_group,
+61
arch/powerpc/platforms/pseries/lpar.c
··· 958 958 959 959 return rc; 960 960 } 961 + 962 + static unsigned long vsid_unscramble(unsigned long vsid, int ssize) 963 + { 964 + unsigned long protovsid; 965 + unsigned long va_bits = VA_BITS; 966 + unsigned long modinv, vsid_modulus; 967 + unsigned long max_mod_inv, tmp_modinv; 968 + 969 + if (!mmu_has_feature(MMU_FTR_68_BIT_VA)) 970 + va_bits = 65; 971 + 972 + if (ssize == MMU_SEGSIZE_256M) { 973 + modinv = VSID_MULINV_256M; 974 + vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1); 975 + } else { 976 + modinv = VSID_MULINV_1T; 977 + vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1); 978 + } 979 + 980 + /* 981 + * vsid outside our range. 982 + */ 983 + if (vsid >= vsid_modulus) 984 + return 0; 985 + 986 + /* 987 + * If modinv is the modular multiplicate inverse of (x % vsid_modulus) 988 + * and vsid = (protovsid * x) % vsid_modulus, then we say: 989 + * protovsid = (vsid * modinv) % vsid_modulus 990 + */ 991 + 992 + /* Check if (vsid * modinv) overflow (63 bits) */ 993 + max_mod_inv = 0x7fffffffffffffffull / vsid; 994 + if (modinv < max_mod_inv) 995 + return (vsid * modinv) % vsid_modulus; 996 + 997 + tmp_modinv = modinv/max_mod_inv; 998 + modinv %= max_mod_inv; 999 + 1000 + protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus; 1001 + protovsid = (protovsid + vsid * modinv) % vsid_modulus; 1002 + 1003 + return protovsid; 1004 + } 1005 + 1006 + static int __init reserve_vrma_context_id(void) 1007 + { 1008 + unsigned long protovsid; 1009 + 1010 + /* 1011 + * Reserve context ids which map to reserved virtual addresses. For now 1012 + * we only reserve the context id which maps to the VRMA VSID. We ignore 1013 + * the addresses in "ibm,adjunct-virtual-addresses" because we don't 1014 + * enable adjunct support via the "ibm,client-architecture-support" 1015 + * interface. 
1016 + */ 1017 + protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T); 1018 + hash__reserve_context_id(protovsid >> ESID_BITS_1T); 1019 + return 0; 1020 + } 1021 + machine_device_initcall(pseries, reserve_vrma_context_id);
+4
arch/powerpc/platforms/pseries/ras.c
··· 386 386 } 387 387 fwnmi_release_errinfo(); 388 388 } 389 + 390 + if (smp_handle_nmi_ipi(regs)) 391 + return 1; 392 + 389 393 return 0; /* need to perform reset */ 390 394 } 391 395
+4
arch/powerpc/platforms/pseries/setup.c
··· 87 87 model = of_get_property(root, "model", NULL); 88 88 seq_printf(m, "machine\t\t: CHRP %s\n", model); 89 89 of_node_put(root); 90 + if (radix_enabled()) 91 + seq_printf(m, "MMU\t\t: Radix\n"); 92 + else 93 + seq_printf(m, "MMU\t\t: Hash\n"); 90 94 } 91 95 92 96 /* Initialize firmware assisted non-maskable interrupts if
+32 -17
arch/powerpc/platforms/pseries/smp.c
··· 55 55 */ 56 56 static cpumask_var_t of_spin_mask; 57 57 58 - /* 59 - * If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here 60 - */ 61 - static void (*xics_cause_ipi)(int cpu, unsigned long data); 62 - 63 58 /* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */ 64 59 int smp_query_cpu_stopped(unsigned int pcpu) 65 60 { ··· 138 143 { 139 144 if (cpu != boot_cpuid) 140 145 xics_setup_cpu(); 141 - if (cpu_has_feature(CPU_FTR_DBELL)) 142 - doorbell_setup_this_cpu(); 143 146 144 147 if (firmware_has_feature(FW_FEATURE_SPLPAR)) 145 148 vpa_init(cpu); ··· 180 187 return 0; 181 188 } 182 189 183 - /* Only used on systems that support multiple IPI mechanisms */ 184 - static void pSeries_cause_ipi_mux(int cpu, unsigned long data) 190 + static void smp_pseries_cause_ipi(int cpu) 185 191 { 186 - if (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id()))) 187 - doorbell_cause_ipi(cpu, data); 188 - else 189 - xics_cause_ipi(cpu, data); 192 + /* POWER9 should not use this handler */ 193 + if (doorbell_try_core_ipi(cpu)) 194 + return; 195 + 196 + icp_ops->cause_ipi(cpu); 197 + } 198 + 199 + static int pseries_cause_nmi_ipi(int cpu) 200 + { 201 + int hwcpu; 202 + 203 + if (cpu == NMI_IPI_ALL_OTHERS) { 204 + hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS; 205 + } else { 206 + if (cpu < 0) { 207 + WARN_ONCE(true, "incorrect cpu parameter %d", cpu); 208 + return 0; 209 + } 210 + 211 + hwcpu = get_hard_smp_processor_id(cpu); 212 + } 213 + 214 + if (plapr_signal_sys_reset(hwcpu) == H_SUCCESS) 215 + return 1; 216 + 217 + return 0; 190 218 } 191 219 192 220 static __init void pSeries_smp_probe(void) 193 221 { 194 222 xics_smp_probe(); 195 223 196 - if (cpu_has_feature(CPU_FTR_DBELL)) { 197 - xics_cause_ipi = smp_ops->cause_ipi; 198 - smp_ops->cause_ipi = pSeries_cause_ipi_mux; 199 - } 224 + if (cpu_has_feature(CPU_FTR_DBELL)) 225 + smp_ops->cause_ipi = smp_pseries_cause_ipi; 226 + else 227 + smp_ops->cause_ipi = icp_ops->cause_ipi; 200 228 } 
201 229 202 230 static struct smp_ops_t pseries_smp_ops = { 203 231 .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ 204 232 .cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */ 233 + .cause_nmi_ipi = pseries_cause_nmi_ipi, 205 234 .probe = pSeries_smp_probe, 206 235 .kick_cpu = smp_pSeries_kick_cpu, 207 236 .setup_cpu = smp_setup_cpu,
+1 -1
arch/powerpc/platforms/pseries/vio.c
··· 1318 1318 struct iommu_table *tbl = get_iommu_table_base(dev); 1319 1319 1320 1320 if (tbl) 1321 - iommu_free_table(tbl, of_node_full_name(dev->of_node)); 1321 + iommu_tce_table_put(tbl); 1322 1322 of_node_put(dev->of_node); 1323 1323 kfree(to_vio_dev(dev)); 1324 1324 }
arch/powerpc/relocs_check.sh arch/powerpc/tools/relocs_check.sh
arch/powerpc/scripts/gcc-check-mprofile-kernel.sh arch/powerpc/tools/gcc-check-mprofile-kernel.sh
+1
arch/powerpc/sysdev/Kconfig
··· 28 28 default y if PPC_POWERNV 29 29 30 30 source "arch/powerpc/sysdev/xics/Kconfig" 31 + source "arch/powerpc/sysdev/xive/Kconfig" 31 32 32 33 config PPC_SCOM 33 34 bool
+1
arch/powerpc/sysdev/Makefile
··· 71 71 subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror 72 72 73 73 obj-$(CONFIG_PPC_XICS) += xics/ 74 + obj-$(CONFIG_PPC_XIVE) += xive/ 74 75 75 76 obj-$(CONFIG_GE_FPGA) += ge/
+1 -2
arch/powerpc/sysdev/scom.c
··· 19 19 */ 20 20 21 21 #include <linux/kernel.h> 22 - #include <linux/debugfs.h> 23 22 #include <linux/slab.h> 24 23 #include <linux/export.h> 25 - #include <asm/debug.h> 24 + #include <asm/debugfs.h> 26 25 #include <asm/prom.h> 27 26 #include <asm/scom.h> 28 27 #include <linux/uaccess.h>
+1 -1
arch/powerpc/sysdev/xics/icp-hv.c
··· 138 138 139 139 #ifdef CONFIG_SMP 140 140 141 - static void icp_hv_cause_ipi(int cpu, unsigned long data) 141 + static void icp_hv_cause_ipi(int cpu) 142 142 { 143 143 icp_hv_set_qirr(cpu, IPI_PRIORITY); 144 144 }
+5 -15
arch/powerpc/sysdev/xics/icp-native.c
··· 143 143 144 144 #ifdef CONFIG_SMP 145 145 146 - static void icp_native_cause_ipi(int cpu, unsigned long data) 146 + static void icp_native_cause_ipi(int cpu) 147 147 { 148 148 kvmppc_set_host_ipi(cpu, 1); 149 - #ifdef CONFIG_PPC_DOORBELL 150 - if (cpu_has_feature(CPU_FTR_DBELL)) { 151 - if (cpumask_test_cpu(cpu, cpu_sibling_mask(get_cpu()))) { 152 - doorbell_cause_ipi(cpu, data); 153 - put_cpu(); 154 - return; 155 - } 156 - put_cpu(); 157 - } 158 - #endif 159 149 icp_native_set_qirr(cpu, IPI_PRIORITY); 160 150 } 161 151 ··· 158 168 * Need the physical address of the XICS to be 159 169 * previously saved in kvm_hstate in the paca. 160 170 */ 161 - unsigned long xics_phys; 171 + void __iomem *xics_phys; 162 172 163 173 /* 164 174 * Just like the cause_ipi functions, it is required to 165 - * include a full barrier (out8 includes a sync) before 166 - * causing the IPI. 175 + * include a full barrier before causing the IPI. 167 176 */ 168 177 xics_phys = paca[cpu].kvm_hstate.xics_phys; 169 - out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY); 178 + mb(); 179 + __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR); 170 180 } 171 181 #endif 172 182
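The rewritten KVM path above keeps the rule its comment spells out: a full barrier must come before the store that raises the IPI, so the message written earlier is visible to the target CPU. A userspace sketch with C11 atomics, where the seq_cst fence stands in for mb() and the relaxed store for the __raw_rm_writeb() to the MFRR (the flag values are illustrative):

```c
#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

#define IPI_PRIORITY 4

static _Atomic uint8_t mfrr = 0xff;	/* 0xff: no IPI pending */
static uint32_t ipi_message;

/* Order matters: publish the message, fence, then poke the "MFRR". */
static void model_send_ipi(uint32_t msg)
{
	ipi_message = msg;
	atomic_thread_fence(memory_order_seq_cst);	/* stands in for mb() */
	atomic_store_explicit(&mfrr, IPI_PRIORITY,	/* the MMIO write */
			      memory_order_relaxed);
}
```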
+1 -1
arch/powerpc/sysdev/xics/icp-opal.c
··· 126 126 127 127 #ifdef CONFIG_SMP 128 128 129 - static void icp_opal_cause_ipi(int cpu, unsigned long data) 129 + static void icp_opal_cause_ipi(int cpu) 130 130 { 131 131 int hw_cpu = get_hard_smp_processor_id(cpu); 132 132
+3 -3
arch/powerpc/sysdev/xics/xics-common.c
··· 143 143 144 144 void __init xics_smp_probe(void) 145 145 { 146 - /* Setup cause_ipi callback based on which ICP is used */ 147 - smp_ops->cause_ipi = icp_ops->cause_ipi; 148 - 149 146 /* Register all the IPIs */ 150 147 xics_request_ipi(); 148 + 149 + /* Setup cause_ipi callback based on which ICP is used */ 150 + smp_ops->cause_ipi = icp_ops->cause_ipi; 151 151 } 152 152 153 153 #endif /* CONFIG_SMP */
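The xics-common reorder above registers the IPI handler before publishing the cause_ipi callback, so no CPU can be asked to send an IPI that nothing would yet handle. A minimal sketch of that publish-last pattern (all names invented; real delivery is of course asynchronous):

```c
#include <assert.h>
#include <stddef.h>

static int ipis_handled;
static int handler_registered;
static void (*model_cause_ipi)(int);	/* NULL until probe finishes */

/* Delivery only works once a handler has been registered. */
static void model_icp_cause_ipi(int cpu)
{
	(void)cpu;
	if (handler_registered)
		ipis_handled++;
}

/* Mirrors xics_smp_probe(): request the IPI first, publish second. */
static void model_smp_probe(void)
{
	handler_registered = 1;			/* xics_request_ipi() */
	model_cause_ipi = model_icp_cause_ipi;	/* now safe to expose */
}
```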
+11
arch/powerpc/sysdev/xive/Kconfig
··· 1 + config PPC_XIVE 2 + bool 3 + default n 4 + select PPC_SMP_MUXED_IPI 5 + select HARDIRQS_SW_RESEND 6 + 7 + config PPC_XIVE_NATIVE 8 + bool 9 + default n 10 + select PPC_XIVE 11 + depends on PPC_POWERNV
+4
arch/powerpc/sysdev/xive/Makefile
··· 1 + subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror 2 + 3 + obj-y += common.o 4 + obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o
+1302
arch/powerpc/sysdev/xive/common.c
··· 1 + /* 2 + * Copyright 2016,2017 IBM Corporation. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; either version 7 + * 2 of the License, or (at your option) any later version. 8 + */ 9 + 10 + #define pr_fmt(fmt) "xive: " fmt 11 + 12 + #include <linux/types.h> 13 + #include <linux/threads.h> 14 + #include <linux/kernel.h> 15 + #include <linux/irq.h> 16 + #include <linux/debugfs.h> 17 + #include <linux/smp.h> 18 + #include <linux/interrupt.h> 19 + #include <linux/seq_file.h> 20 + #include <linux/init.h> 21 + #include <linux/cpu.h> 22 + #include <linux/of.h> 23 + #include <linux/slab.h> 24 + #include <linux/spinlock.h> 25 + #include <linux/msi.h> 26 + 27 + #include <asm/prom.h> 28 + #include <asm/io.h> 29 + #include <asm/smp.h> 30 + #include <asm/machdep.h> 31 + #include <asm/irq.h> 32 + #include <asm/errno.h> 33 + #include <asm/xive.h> 34 + #include <asm/xive-regs.h> 35 + #include <asm/xmon.h> 36 + 37 + #include "xive-internal.h" 38 + 39 + #undef DEBUG_FLUSH 40 + #undef DEBUG_ALL 41 + 42 + #ifdef DEBUG_ALL 43 + #define DBG_VERBOSE(fmt...) pr_devel(fmt) 44 + #else 45 + #define DBG_VERBOSE(fmt...) 
do { } while(0) 46 + #endif 47 + 48 + bool __xive_enabled; 49 + bool xive_cmdline_disabled; 50 + 51 + /* We use only one priority for now */ 52 + static u8 xive_irq_priority; 53 + 54 + /* TIMA */ 55 + void __iomem *xive_tima; 56 + u32 xive_tima_offset; 57 + 58 + /* Backend ops */ 59 + static const struct xive_ops *xive_ops; 60 + 61 + /* Our global interrupt domain */ 62 + static struct irq_domain *xive_irq_domain; 63 + 64 + #ifdef CONFIG_SMP 65 + /* The IPIs all use the same logical irq number */ 66 + static u32 xive_ipi_irq; 67 + #endif 68 + 69 + /* Xive state for each CPU */ 70 + static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu); 71 + 72 + /* 73 + * A "disabled" interrupt should never fire, to catch problems 74 + * we set its logical number to this 75 + */ 76 + #define XIVE_BAD_IRQ 0x7fffffff 77 + #define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1) 78 + 79 + /* An invalid CPU target */ 80 + #define XIVE_INVALID_TARGET (-1) 81 + 82 + /* 83 + * Read the next entry in a queue, return its content if it's valid 84 + * or 0 if there is no new entry. 85 + * 86 + * The queue pointer is moved forward unless "just_peek" is set 87 + */ 88 + static u32 xive_read_eq(struct xive_q *q, bool just_peek) 89 + { 90 + u32 cur; 91 + 92 + if (!q->qpage) 93 + return 0; 94 + cur = be32_to_cpup(q->qpage + q->idx); 95 + 96 + /* Check valid bit (31) vs current toggle polarity */ 97 + if ((cur >> 31) == q->toggle) 98 + return 0; 99 + 100 + /* If consuming from the queue ... */ 101 + if (!just_peek) { 102 + /* Next entry */ 103 + q->idx = (q->idx + 1) & q->msk; 104 + 105 + /* Wrap around: flip valid toggle */ 106 + if (q->idx == 0) 107 + q->toggle ^= 1; 108 + } 109 + /* Mask out the valid bit (31) */ 110 + return cur & 0x7fffffff; 111 + } 112 + 113 + /* 114 + * Scans all the queue that may have interrupts in them 115 + * (based on "pending_prio") in priority order until an 116 + * interrupt is found or all the queues are empty. 
117 + * 118 + * Then updates the CPPR (Current Processor Priority 119 + * Register) based on the most favored interrupt found 120 + * (0xff if none) and returns what was found (0 if none). 121 + * 122 + * If just_peek is set, return the most favored pending 123 + * interrupt if any but don't update the queue pointers. 124 + * 125 + * Note: This function can operate generically on any number 126 + * of queues (up to 8). The current implementation of the XIVE 127 + * driver only uses a single queue however. 128 + * 129 + * Note2: This will also "flush" the "pending_count" of a queue 130 + * into the "count" when that queue is observed to be empty. 131 + * This is used to keep track of the number of interrupts 132 + * targeting a queue. When an interrupt is moved away from 133 + * a queue, we only decrement that queue count once the queue 134 + * has been observed empty to avoid races. 135 + */ 136 + static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) 137 + { 138 + u32 irq = 0; 139 + u8 prio; 140 + 141 + /* Find highest pending priority */ 142 + while (xc->pending_prio != 0) { 143 + struct xive_q *q; 144 + 145 + prio = ffs(xc->pending_prio) - 1; 146 + DBG_VERBOSE("scan_irq: trying prio %d\n", prio); 147 + 148 + /* Try to fetch */ 149 + irq = xive_read_eq(&xc->queue[prio], just_peek); 150 + 151 + /* Found something ? That's it */ 152 + if (irq) 153 + break; 154 + 155 + /* Clear pending bits */ 156 + xc->pending_prio &= ~(1 << prio); 157 + 158 + /* 159 + * Check if the queue count needs adjusting due to 160 + * interrupts being moved away.
See description of 161 + * xive_dec_target_count() 162 + */ 163 + q = &xc->queue[prio]; 164 + if (atomic_read(&q->pending_count)) { 165 + int p = atomic_xchg(&q->pending_count, 0); 166 + if (p) { 167 + WARN_ON(p > atomic_read(&q->count)); 168 + atomic_sub(p, &q->count); 169 + } 170 + } 171 + } 172 + 173 + /* If nothing was found, set CPPR to 0xff */ 174 + if (irq == 0) 175 + prio = 0xff; 176 + 177 + /* Update HW CPPR to match if necessary */ 178 + if (prio != xc->cppr) { 179 + DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio); 180 + xc->cppr = prio; 181 + out_8(xive_tima + xive_tima_offset + TM_CPPR, prio); 182 + } 183 + 184 + return irq; 185 + } 186 + 187 + /* 188 + * This is used to perform the magic loads from an ESB 189 + * described in xive.h 190 + */ 191 + static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset) 192 + { 193 + u64 val; 194 + 195 + /* Handle HW errata */ 196 + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG) 197 + offset |= offset << 4; 198 + 199 + val = in_be64(xd->eoi_mmio + offset); 200 + 201 + return (u8)val; 202 + } 203 + 204 + #ifdef CONFIG_XMON 205 + static void xive_dump_eq(const char *name, struct xive_q *q) 206 + { 207 + u32 i0, i1, idx; 208 + 209 + if (!q->qpage) 210 + return; 211 + idx = q->idx; 212 + i0 = be32_to_cpup(q->qpage + idx); 213 + idx = (idx + 1) & q->msk; 214 + i1 = be32_to_cpup(q->qpage + idx); 215 + xmon_printf(" %s Q T=%d %08x %08x ...\n", name, 216 + q->toggle, i0, i1); 217 + } 218 + 219 + void xmon_xive_do_dump(int cpu) 220 + { 221 + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); 222 + 223 + xmon_printf("XIVE state for CPU %d:\n", cpu); 224 + xmon_printf(" pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr); 225 + xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]); 226 + #ifdef CONFIG_SMP 227 + { 228 + u64 val = xive_poke_esb(&xc->ipi_data, XIVE_ESB_GET); 229 + xmon_printf(" IPI state: %x:%c%c\n", xc->hw_ipi, 230 + val & XIVE_ESB_VAL_P ? 'P' : 'p', 231 + val & XIVE_ESB_VAL_P ? 
'Q' : 'q'); 232 + } 233 + #endif 234 + } 235 + #endif /* CONFIG_XMON */ 236 + 237 + static unsigned int xive_get_irq(void) 238 + { 239 + struct xive_cpu *xc = __this_cpu_read(xive_cpu); 240 + u32 irq; 241 + 242 + /* 243 + * This can be called either as a result of a HW interrupt or 244 + * as a "replay" because EOI decided there was still something 245 + * in one of the queues. 246 + * 247 + * First we perform an ACK cycle in order to update our mask 248 + * of pending priorities. This will also have the effect of 249 + * updating the CPPR to the most favored pending interrupt. 250 + * 251 + * In the future, if we have a way to differentiate a first 252 + * entry (on HW interrupt) from a replay triggered by EOI, 253 + * we could skip this on replays unless the soft-mask tells us 254 + * that a new HW interrupt occurred. 255 + */ 256 + xive_ops->update_pending(xc); 257 + 258 + DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio); 259 + 260 + /* Scan our queue(s) for interrupts */ 261 + irq = xive_scan_interrupts(xc, false); 262 + 263 + DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n", 264 + irq, xc->pending_prio); 265 + 266 + /* Return pending interrupt if any */ 267 + if (irq == XIVE_BAD_IRQ) 268 + return 0; 269 + return irq; 270 + } 271 + 272 + /* 273 + * After EOI'ing an interrupt, we need to re-check the queue 274 + * to see if another interrupt is pending since multiple 275 + * interrupts can coalesce into a single notification to the 276 + * CPU. 277 + * 278 + * If we find that there is indeed more in there, we call 279 + * force_external_irq_replay() to make Linux synthesize an 280 + * external interrupt on the next call to local_irq_restore(). 281 + */ 282 + static void xive_do_queue_eoi(struct xive_cpu *xc) 283 + { 284 + if (xive_scan_interrupts(xc, true) != 0) { 285 + DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio); 286 + force_external_irq_replay(); 287 + } 288 + } 289 + 290 + /* 291 + * EOI an interrupt at the source.
There are several methods 292 + * to do this depending on the HW version and source type 293 + */ 294 + void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) 295 + { 296 + /* If the XIVE supports the new "store EOI" facility, use it */ 297 + if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) 298 + out_be64(xd->eoi_mmio, 0); 299 + else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { 300 + /* 301 + * The FW told us to call it. This happens for some 302 + * interrupt sources that need additional HW whacking 303 + * beyond the ESB manipulation. For example LPC interrupts 304 + * on P9 DD1.0 need a latch to be cleared in the LPC bridge 305 + * itself. The Firmware will take care of it. 306 + */ 307 + if (WARN_ON_ONCE(!xive_ops->eoi)) 308 + return; 309 + xive_ops->eoi(hw_irq); 310 + } else { 311 + u8 eoi_val; 312 + 313 + /* 314 + * Otherwise for EOI, we use the special MMIO that does 315 + * a clear of both P and Q and returns the old Q, 316 + * except for LSIs where we use the "EOI cycle" special 317 + * load. 318 + * 319 + * This allows us to then do a re-trigger if Q was set 320 + * rather than synthesizing an interrupt in software. 321 + * 322 + * For LSIs, using the HW EOI cycle works around a problem 323 + * on P9 DD1 PHBs where the other ESB accesses don't work 324 + * properly.
325 + */ 326 + if (xd->flags & XIVE_IRQ_FLAG_LSI) 327 + in_be64(xd->eoi_mmio); 328 + else { 329 + eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); 330 + DBG_VERBOSE("eoi_val=0x%x\n", eoi_val); 331 + 332 + /* Re-trigger if needed */ 333 + if ((eoi_val & XIVE_ESB_VAL_Q) && xd->trig_mmio) 334 + out_be64(xd->trig_mmio, 0); 335 + } 336 + } 337 + } 338 + 339 + /* irq_chip eoi callback */ 340 + static void xive_irq_eoi(struct irq_data *d) 341 + { 342 + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 343 + struct xive_cpu *xc = __this_cpu_read(xive_cpu); 344 + 345 + DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n", 346 + d->irq, irqd_to_hwirq(d), xc->pending_prio); 347 + 348 + /* EOI the source if it hasn't been disabled */ 349 + if (!irqd_irq_disabled(d)) 350 + xive_do_source_eoi(irqd_to_hwirq(d), xd); 351 + 352 + /* 353 + * Clear saved_p to indicate that it's no longer occupying 354 + * a queue slot on the target queue 355 + */ 356 + xd->saved_p = false; 357 + 358 + /* Check for more work in the queue */ 359 + xive_do_queue_eoi(xc); 360 + } 361 + 362 + /* 363 + * Helper used to mask and unmask an interrupt source. This 364 + * is only called for normal interrupts that do not require 365 + * masking/unmasking via firmware. 366 + */ 367 + static void xive_do_source_set_mask(struct xive_irq_data *xd, 368 + bool mask) 369 + { 370 + u64 val; 371 + 372 + /* 373 + * If the interrupt had P set, it may be in a queue. 374 + * 375 + * We need to make sure we don't re-enable it until it 376 + * has been fetched from that queue and EOId. We keep 377 + * a copy of that P state and use it to restore the 378 + * ESB accordingly on unmask. 379 + */ 380 + if (mask) { 381 + val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_01); 382 + xd->saved_p = !!(val & XIVE_ESB_VAL_P); 383 + } else if (xd->saved_p) 384 + xive_poke_esb(xd, XIVE_ESB_SET_PQ_10); 385 + else 386 + xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); 387 + } 388 + 389 + /* 390 + * Try to choose "cpu" as a new interrupt target.
Increments 391 + * the queue accounting for that target if it's not already 392 + * full. 393 + */ 394 + static bool xive_try_pick_target(int cpu) 395 + { 396 + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); 397 + struct xive_q *q = &xc->queue[xive_irq_priority]; 398 + int max; 399 + 400 + /* 401 + * Calculate max number of interrupts in that queue. 402 + * 403 + * We leave a gap of 1 just in case... 404 + */ 405 + max = (q->msk + 1) - 1; 406 + return !!atomic_add_unless(&q->count, 1, max); 407 + } 408 + 409 + /* 410 + * Un-account an interrupt for a target CPU. We don't directly 411 + * decrement q->count since the interrupt might still be present 412 + * in the queue. 413 + * 414 + * Instead increment a separate counter "pending_count" which 415 + * will be subtracted from "count" later when that CPU observes 416 + * the queue to be empty. 417 + */ 418 + static void xive_dec_target_count(int cpu) 419 + { 420 + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); 421 + struct xive_q *q = &xc->queue[xive_irq_priority]; 422 + 423 + if (unlikely(WARN_ON(cpu < 0 || !xc))) { 424 + pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc); 425 + return; 426 + } 427 + 428 + /* 429 + * We increment the "pending count" which will be used 430 + * to decrement the target queue count whenever it's next 431 + * processed and found empty. This ensures that we don't 432 + * decrement while we still have the interrupt there 433 + * occupying a slot.
434 + */ 435 + atomic_inc(&q->pending_count); 436 + } 437 + 438 + /* Find a tentative CPU target in a CPU mask */ 439 + static int xive_find_target_in_mask(const struct cpumask *mask, 440 + unsigned int fuzz) 441 + { 442 + int cpu, first, num, i; 443 + 444 + /* Pick up a starting point CPU in the mask based on fuzz */ 445 + num = cpumask_weight(mask); 446 + first = fuzz % num; 447 + 448 + /* Locate it */ 449 + cpu = cpumask_first(mask); 450 + for (i = 0; i < first && cpu < nr_cpu_ids; i++) 451 + cpu = cpumask_next(cpu, mask); 452 + 453 + /* Sanity check */ 454 + if (WARN_ON(cpu >= nr_cpu_ids)) 455 + cpu = cpumask_first(cpu_online_mask); 456 + 457 + /* Remember first one to handle wrap-around */ 458 + first = cpu; 459 + 460 + /* 461 + * Now go through the entire mask until we find a valid 462 + * target. 463 + */ 464 + for (;;) { 465 + /* 466 + * We re-check online as the fallback case passes us 467 + * an untested affinity mask 468 + */ 469 + if (cpu_online(cpu) && xive_try_pick_target(cpu)) 470 + return cpu; 471 + cpu = cpumask_next(cpu, mask); 472 + if (cpu == first) 473 + break; 474 + /* Wrap around */ 475 + if (cpu >= nr_cpu_ids) 476 + cpu = cpumask_first(mask); 477 + } 478 + return -1; 479 + } 480 + 481 + /* 482 + * Pick a target CPU for an interrupt. This is done at 483 + * startup or if the affinity is changed in a way that 484 + * invalidates the current target. 
485 + */ 486 + static int xive_pick_irq_target(struct irq_data *d, 487 + const struct cpumask *affinity) 488 + { 489 + static unsigned int fuzz; 490 + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 491 + cpumask_var_t mask; 492 + int cpu = -1; 493 + 494 + /* 495 + * If we have chip IDs, first we try to build a mask of 496 + * CPUs matching the CPU and find a target in there 497 + */ 498 + if (xd->src_chip != XIVE_INVALID_CHIP_ID && 499 + zalloc_cpumask_var(&mask, GFP_ATOMIC)) { 500 + /* Build a mask of matching chip IDs */ 501 + for_each_cpu_and(cpu, affinity, cpu_online_mask) { 502 + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); 503 + if (xc->chip_id == xd->src_chip) 504 + cpumask_set_cpu(cpu, mask); 505 + } 506 + /* Try to find a target */ 507 + if (cpumask_empty(mask)) 508 + cpu = -1; 509 + else 510 + cpu = xive_find_target_in_mask(mask, fuzz++); 511 + free_cpumask_var(mask); 512 + if (cpu >= 0) 513 + return cpu; 514 + fuzz--; 515 + } 516 + 517 + /* No chip IDs, fallback to using the affinity mask */ 518 + return xive_find_target_in_mask(affinity, fuzz++); 519 + } 520 + 521 + static unsigned int xive_irq_startup(struct irq_data *d) 522 + { 523 + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 524 + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 525 + int target, rc; 526 + 527 + pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", 528 + d->irq, hw_irq, d); 529 + 530 + #ifdef CONFIG_PCI_MSI 531 + /* 532 + * The generic MSI code returns with the interrupt disabled on the 533 + * card, using the MSI mask bits. Firmware doesn't appear to unmask 534 + * at that level, so we do it here by hand. 
535 + */ 536 + if (irq_data_get_msi_desc(d)) 537 + pci_msi_unmask_irq(d); 538 + #endif 539 + 540 + /* Pick a target */ 541 + target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d)); 542 + if (target == XIVE_INVALID_TARGET) { 543 + /* Try again breaking affinity */ 544 + target = xive_pick_irq_target(d, cpu_online_mask); 545 + if (target == XIVE_INVALID_TARGET) 546 + return -ENXIO; 547 + pr_warn("irq %d started with broken affinity\n", d->irq); 548 + } 549 + 550 + /* Sanity check */ 551 + if (WARN_ON(target == XIVE_INVALID_TARGET || 552 + target >= nr_cpu_ids)) 553 + target = smp_processor_id(); 554 + 555 + xd->target = target; 556 + 557 + /* 558 + * Configure the logical number to be the Linux IRQ number 559 + * and set the target queue 560 + */ 561 + rc = xive_ops->configure_irq(hw_irq, 562 + get_hard_smp_processor_id(target), 563 + xive_irq_priority, d->irq); 564 + if (rc) 565 + return rc; 566 + 567 + /* Unmask the ESB */ 568 + xive_do_source_set_mask(xd, false); 569 + 570 + return 0; 571 + } 572 + 573 + static void xive_irq_shutdown(struct irq_data *d) 574 + { 575 + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 576 + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 577 + 578 + pr_devel("xive_irq_shutdown: irq %d [0x%x] data @%p\n", 579 + d->irq, hw_irq, d); 580 + 581 + if (WARN_ON(xd->target == XIVE_INVALID_TARGET)) 582 + return; 583 + 584 + /* Mask the interrupt at the source */ 585 + xive_do_source_set_mask(xd, true); 586 + 587 + /* 588 + * The above may have set saved_p. We clear it otherwise it 589 + * will prevent re-enabling later on. 
It is ok to forget the 590 + * fact that the interrupt might be in a queue because we are 591 + * accounting that already in xive_dec_target_count() and will 592 + * be re-routing it to a new queue with proper accounting when 593 + * it's started up again 594 + */ 595 + xd->saved_p = false; 596 + 597 + /* 598 + * Mask the interrupt in HW in the IVT/EAS and set the number 599 + * to be the "bad" IRQ number 600 + */ 601 + xive_ops->configure_irq(hw_irq, 602 + get_hard_smp_processor_id(xd->target), 603 + 0xff, XIVE_BAD_IRQ); 604 + 605 + xive_dec_target_count(xd->target); 606 + xd->target = XIVE_INVALID_TARGET; 607 + } 608 + 609 + static void xive_irq_unmask(struct irq_data *d) 610 + { 611 + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 612 + 613 + pr_devel("xive_irq_unmask: irq %d data @%p\n", d->irq, xd); 614 + 615 + /* 616 + * This is a workaround for PCI LSI problems on P9, for 617 + * these, we call FW to set the mask. The problems might 618 + * be fixed by P9 DD2.0, if that is the case, firmware 619 + * will no longer set that flag. 620 + */ 621 + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) { 622 + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 623 + xive_ops->configure_irq(hw_irq, 624 + get_hard_smp_processor_id(xd->target), 625 + xive_irq_priority, d->irq); 626 + return; 627 + } 628 + 629 + xive_do_source_set_mask(xd, false); 630 + } 631 + 632 + static void xive_irq_mask(struct irq_data *d) 633 + { 634 + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 635 + 636 + pr_devel("xive_irq_mask: irq %d data @%p\n", d->irq, xd); 637 + 638 + /* 639 + * This is a workaround for PCI LSI problems on P9, for 640 + * these, we call OPAL to set the mask. The problems might 641 + * be fixed by P9 DD2.0, if that is the case, firmware 642 + * will no longer set that flag. 
643 + */ 644 + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) { 645 + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 646 + xive_ops->configure_irq(hw_irq, 647 + get_hard_smp_processor_id(xd->target), 648 + 0xff, d->irq); 649 + return; 650 + } 651 + 652 + xive_do_source_set_mask(xd, true); 653 + } 654 + 655 + static int xive_irq_set_affinity(struct irq_data *d, 656 + const struct cpumask *cpumask, 657 + bool force) 658 + { 659 + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 660 + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 661 + u32 target, old_target; 662 + int rc = 0; 663 + 664 + pr_devel("xive_irq_set_affinity: irq %d\n", d->irq); 665 + 666 + /* Is this valid ? */ 667 + if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) 668 + return -EINVAL; 669 + 670 + /* 671 + * If existing target is already in the new mask, and is 672 + * online then do nothing. 673 + */ 674 + if (xd->target != XIVE_INVALID_TARGET && 675 + cpu_online(xd->target) && 676 + cpumask_test_cpu(xd->target, cpumask)) 677 + return IRQ_SET_MASK_OK; 678 + 679 + /* Pick a new target */ 680 + target = xive_pick_irq_target(d, cpumask); 681 + 682 + /* No target found */ 683 + if (target == XIVE_INVALID_TARGET) 684 + return -ENXIO; 685 + 686 + /* Sanity check */ 687 + if (WARN_ON(target >= nr_cpu_ids)) 688 + target = smp_processor_id(); 689 + 690 + old_target = xd->target; 691 + 692 + rc = xive_ops->configure_irq(hw_irq, 693 + get_hard_smp_processor_id(target), 694 + xive_irq_priority, d->irq); 695 + if (rc < 0) { 696 + pr_err("Error %d reconfiguring irq %d\n", rc, d->irq); 697 + return rc; 698 + } 699 + 700 + pr_devel(" target: 0x%x\n", target); 701 + xd->target = target; 702 + 703 + /* Give up previous target */ 704 + if (old_target != XIVE_INVALID_TARGET) 705 + xive_dec_target_count(old_target); 706 + 707 + return IRQ_SET_MASK_OK; 708 + } 709 + 710 + static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type) 711 + { 712 + struct xive_irq_data *xd = 
irq_data_get_irq_handler_data(d); 713 + 714 + /* 715 + * We only support these. This has no real effect other than setting 716 + * the corresponding descriptor bits, but those will in turn 717 + * affect the resend function when re-enabling an edge interrupt. 718 + * 719 + * We set the default to edge as explained in map(). 720 + */ 721 + if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE) 722 + flow_type = IRQ_TYPE_EDGE_RISING; 723 + 724 + if (flow_type != IRQ_TYPE_EDGE_RISING && 725 + flow_type != IRQ_TYPE_LEVEL_LOW) 726 + return -EINVAL; 727 + 728 + irqd_set_trigger_type(d, flow_type); 729 + 730 + /* 731 + * Double check it matches what the FW thinks 732 + * 733 + * NOTE: We don't know yet if the PAPR interface will provide 734 + * the LSI vs MSI information apart from the device-tree so 735 + * this check might have to move into an optional backend call 736 + * that is specific to the native backend 737 + */ 738 + if ((flow_type == IRQ_TYPE_LEVEL_LOW) != 739 + !!(xd->flags & XIVE_IRQ_FLAG_LSI)) { 740 + pr_warn("Interrupt %d (HW 0x%x) type mismatch, Linux says %s, FW says %s\n", 741 + d->irq, (u32)irqd_to_hwirq(d), 742 + (flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge", 743 + (xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge"); 744 + } 745 + 746 + return IRQ_SET_MASK_OK_NOCOPY; 747 + } 748 + 749 + static int xive_irq_retrigger(struct irq_data *d) 750 + { 751 + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); 752 + 753 + /* This should be only for MSIs */ 754 + if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) 755 + return 0; 756 + 757 + /* 758 + * To perform a retrigger, we first set the PQ bits to 759 + * 11, then perform an EOI. 760 + */ 761 + xive_poke_esb(xd, XIVE_ESB_SET_PQ_11); 762 + 763 + /* 764 + * Note: We pass "0" to the hw_irq argument in order to 765 + * avoid calling into the backend EOI code which we don't 766 + * want to do in the case of a re-trigger. Backends typically 767 + * only do EOI for LSIs anyway.
768 + */ 769 + xive_do_source_eoi(0, xd); 770 + 771 + return 1; 772 + } 773 + 774 + static struct irq_chip xive_irq_chip = { 775 + .name = "XIVE-IRQ", 776 + .irq_startup = xive_irq_startup, 777 + .irq_shutdown = xive_irq_shutdown, 778 + .irq_eoi = xive_irq_eoi, 779 + .irq_mask = xive_irq_mask, 780 + .irq_unmask = xive_irq_unmask, 781 + .irq_set_affinity = xive_irq_set_affinity, 782 + .irq_set_type = xive_irq_set_type, 783 + .irq_retrigger = xive_irq_retrigger, 784 + }; 785 + 786 + bool is_xive_irq(struct irq_chip *chip) 787 + { 788 + return chip == &xive_irq_chip; 789 + } 790 + 791 + void xive_cleanup_irq_data(struct xive_irq_data *xd) 792 + { 793 + if (xd->eoi_mmio) { 794 + iounmap(xd->eoi_mmio); 795 + if (xd->eoi_mmio == xd->trig_mmio) 796 + xd->trig_mmio = NULL; 797 + xd->eoi_mmio = NULL; 798 + } 799 + if (xd->trig_mmio) { 800 + iounmap(xd->trig_mmio); 801 + xd->trig_mmio = NULL; 802 + } 803 + } 804 + 805 + static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) 806 + { 807 + struct xive_irq_data *xd; 808 + int rc; 809 + 810 + xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL); 811 + if (!xd) 812 + return -ENOMEM; 813 + rc = xive_ops->populate_irq_data(hw, xd); 814 + if (rc) { 815 + kfree(xd); 816 + return rc; 817 + } 818 + xd->target = XIVE_INVALID_TARGET; 819 + irq_set_handler_data(virq, xd); 820 + 821 + return 0; 822 + } 823 + 824 + static void xive_irq_free_data(unsigned int virq) 825 + { 826 + struct xive_irq_data *xd = irq_get_handler_data(virq); 827 + 828 + if (!xd) 829 + return; 830 + irq_set_handler_data(virq, NULL); 831 + xive_cleanup_irq_data(xd); 832 + kfree(xd); 833 + } 834 + 835 + #ifdef CONFIG_SMP 836 + 837 + static void xive_cause_ipi(int cpu) 838 + { 839 + struct xive_cpu *xc; 840 + struct xive_irq_data *xd; 841 + 842 + xc = per_cpu(xive_cpu, cpu); 843 + 844 + DBG_VERBOSE("IPI CPU %d -> %d (HW IRQ 0x%x)\n", 845 + smp_processor_id(), cpu, xc->hw_ipi); 846 + 847 + xd = &xc->ipi_data; 848 + if (WARN_ON(!xd->trig_mmio)) 849 + 
return; 850 + out_be64(xd->trig_mmio, 0); 851 + } 852 + 853 + static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id) 854 + { 855 + return smp_ipi_demux(); 856 + } 857 + 858 + static void xive_ipi_eoi(struct irq_data *d) 859 + { 860 + struct xive_cpu *xc = __this_cpu_read(xive_cpu); 861 + 862 + /* Handle possible race with unplug and drop stale IPIs */ 863 + if (!xc) 864 + return; 865 + xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data); 866 + xive_do_queue_eoi(xc); 867 + } 868 + 869 + static void xive_ipi_do_nothing(struct irq_data *d) 870 + { 871 + /* 872 + * Nothing to do, we never mask/unmask IPIs, but the callback 873 + * has to exist for the struct irq_chip. 874 + */ 875 + } 876 + 877 + static struct irq_chip xive_ipi_chip = { 878 + .name = "XIVE-IPI", 879 + .irq_eoi = xive_ipi_eoi, 880 + .irq_mask = xive_ipi_do_nothing, 881 + .irq_unmask = xive_ipi_do_nothing, 882 + }; 883 + 884 + static void __init xive_request_ipi(void) 885 + { 886 + unsigned int virq; 887 + 888 + /* 889 + * If initialization failed, move on; we might manage to 890 + * reach the point where we display our errors before 891 + * the system falls apart 892 + */ 893 + if (!xive_irq_domain) 894 + return; 895 + 896 + /* Initialize it */ 897 + virq = irq_create_mapping(xive_irq_domain, 0); 898 + xive_ipi_irq = virq; 899 + 900 + WARN_ON(request_irq(virq, xive_muxed_ipi_action, 901 + IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); 902 + } 903 + 904 + static int xive_setup_cpu_ipi(unsigned int cpu) 905 + { 906 + struct xive_cpu *xc; 907 + int rc; 908 + 909 + pr_debug("Setting up IPI for CPU %d\n", cpu); 910 + 911 + xc = per_cpu(xive_cpu, cpu); 912 + 913 + /* Check if we are already set up */ 914 + if (xc->hw_ipi != 0) 915 + return 0; 916 + 917 + /* Grab an IPI from the backend, this will populate xc->hw_ipi */ 918 + if (xive_ops->get_ipi(cpu, xc)) 919 + return -EIO; 920 + 921 + /* 922 + * Populate the IRQ data in the xive_cpu structure and 923 + * configure the HW / enable the IPIs.
924 + */ 925 + rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data); 926 + if (rc) { 927 + pr_err("Failed to populate IPI data on CPU %d\n", cpu); 928 + return -EIO; 929 + } 930 + rc = xive_ops->configure_irq(xc->hw_ipi, 931 + get_hard_smp_processor_id(cpu), 932 + xive_irq_priority, xive_ipi_irq); 933 + if (rc) { 934 + pr_err("Failed to map IPI CPU %d\n", cpu); 935 + return -EIO; 936 + } 937 + pr_devel("CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu, 938 + xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio); 939 + 940 + /* Unmask it */ 941 + xive_do_source_set_mask(&xc->ipi_data, false); 942 + 943 + return 0; 944 + } 945 + 946 + static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc) 947 + { 948 + /* Disable the IPI and free the IRQ data */ 949 + 950 + /* Already cleaned up ? */ 951 + if (xc->hw_ipi == 0) 952 + return; 953 + 954 + /* Mask the IPI */ 955 + xive_do_source_set_mask(&xc->ipi_data, true); 956 + 957 + /* 958 + * Note: We don't call xive_cleanup_irq_data() to free 959 + * the mappings as this is called from an IPI on kexec 960 + * which is not a safe environment to call iounmap() 961 + */ 962 + 963 + /* Deconfigure/mask in the backend */ 964 + xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(), 965 + 0xff, xive_ipi_irq); 966 + 967 + /* Free the IPIs in the backend */ 968 + xive_ops->put_ipi(cpu, xc); 969 + } 970 + 971 + void __init xive_smp_probe(void) 972 + { 973 + smp_ops->cause_ipi = xive_cause_ipi; 974 + 975 + /* Register the IPI */ 976 + xive_request_ipi(); 977 + 978 + /* Allocate and setup IPI for the boot CPU */ 979 + xive_setup_cpu_ipi(smp_processor_id()); 980 + } 981 + 982 + #endif /* CONFIG_SMP */ 983 + 984 + static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, 985 + irq_hw_number_t hw) 986 + { 987 + int rc; 988 + 989 + /* 990 + * Mark interrupts as edge sensitive by default so that resend 991 + * actually works. Will fix that up below if needed. 
992 + */ 993 + irq_clear_status_flags(virq, IRQ_LEVEL); 994 + 995 + #ifdef CONFIG_SMP 996 + /* IPIs are special and come up with HW number 0 */ 997 + if (hw == 0) { 998 + /* 999 + * IPIs are marked per-cpu. We use separate HW interrupts under 1000 + * the hood but associated with the same "linux" interrupt 1001 + */ 1002 + irq_set_chip_and_handler(virq, &xive_ipi_chip, 1003 + handle_percpu_irq); 1004 + return 0; 1005 + } 1006 + #endif 1007 + 1008 + rc = xive_irq_alloc_data(virq, hw); 1009 + if (rc) 1010 + return rc; 1011 + 1012 + irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq); 1013 + 1014 + return 0; 1015 + } 1016 + 1017 + static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq) 1018 + { 1019 + struct irq_data *data = irq_get_irq_data(virq); 1020 + unsigned int hw_irq; 1021 + 1022 + /* XXX Assign BAD number */ 1023 + if (!data) 1024 + return; 1025 + hw_irq = (unsigned int)irqd_to_hwirq(data); 1026 + if (hw_irq) 1027 + xive_irq_free_data(virq); 1028 + } 1029 + 1030 + static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct, 1031 + const u32 *intspec, unsigned int intsize, 1032 + irq_hw_number_t *out_hwirq, unsigned int *out_flags) 1033 + 1034 + { 1035 + *out_hwirq = intspec[0]; 1036 + 1037 + /* 1038 + * If intsize is at least 2, we look for the type in the second cell, 1039 + * we assume the LSB indicates a level interrupt. 
1040 + */ 1041 + if (intsize > 1) { 1042 + if (intspec[1] & 1) 1043 + *out_flags = IRQ_TYPE_LEVEL_LOW; 1044 + else 1045 + *out_flags = IRQ_TYPE_EDGE_RISING; 1046 + } else 1047 + *out_flags = IRQ_TYPE_LEVEL_LOW; 1048 + 1049 + return 0; 1050 + } 1051 + 1052 + static int xive_irq_domain_match(struct irq_domain *h, struct device_node *node, 1053 + enum irq_domain_bus_token bus_token) 1054 + { 1055 + return xive_ops->match(node); 1056 + } 1057 + 1058 + static const struct irq_domain_ops xive_irq_domain_ops = { 1059 + .match = xive_irq_domain_match, 1060 + .map = xive_irq_domain_map, 1061 + .unmap = xive_irq_domain_unmap, 1062 + .xlate = xive_irq_domain_xlate, 1063 + }; 1064 + 1065 + static void __init xive_init_host(void) 1066 + { 1067 + xive_irq_domain = irq_domain_add_nomap(NULL, XIVE_MAX_IRQ, 1068 + &xive_irq_domain_ops, NULL); 1069 + if (WARN_ON(xive_irq_domain == NULL)) 1070 + return; 1071 + irq_set_default_host(xive_irq_domain); 1072 + } 1073 + 1074 + static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) 1075 + { 1076 + if (xc->queue[xive_irq_priority].qpage) 1077 + xive_ops->cleanup_queue(cpu, xc, xive_irq_priority); 1078 + } 1079 + 1080 + static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) 1081 + { 1082 + int rc = 0; 1083 + 1084 + /* We setup 1 queues for now with a 64k page */ 1085 + if (!xc->queue[xive_irq_priority].qpage) 1086 + rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority); 1087 + 1088 + return rc; 1089 + } 1090 + 1091 + static int xive_prepare_cpu(unsigned int cpu) 1092 + { 1093 + struct xive_cpu *xc; 1094 + 1095 + xc = per_cpu(xive_cpu, cpu); 1096 + if (!xc) { 1097 + struct device_node *np; 1098 + 1099 + xc = kzalloc_node(sizeof(struct xive_cpu), 1100 + GFP_KERNEL, cpu_to_node(cpu)); 1101 + if (!xc) 1102 + return -ENOMEM; 1103 + np = of_get_cpu_node(cpu, NULL); 1104 + if (np) 1105 + xc->chip_id = of_get_ibm_chip_id(np); 1106 + of_node_put(np); 1107 + 1108 + per_cpu(xive_cpu, cpu) = xc; 1109 + } 1110 + 1111 
+ /* Setup EQs if not already */ 1112 + return xive_setup_cpu_queues(cpu, xc); 1113 + } 1114 + 1115 + static void xive_setup_cpu(void) 1116 + { 1117 + struct xive_cpu *xc = __this_cpu_read(xive_cpu); 1118 + 1119 + /* Debug: Dump the TM state */ 1120 + pr_devel("CPU %d [HW 0x%02x] VT=%02x\n", 1121 + smp_processor_id(), hard_smp_processor_id(), 1122 + in_8(xive_tima + xive_tima_offset + TM_WORD2)); 1123 + 1124 + /* The backend might have additional things to do */ 1125 + if (xive_ops->setup_cpu) 1126 + xive_ops->setup_cpu(smp_processor_id(), xc); 1127 + 1128 + /* Set CPPR to 0xff to enable flow of interrupts */ 1129 + xc->cppr = 0xff; 1130 + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff); 1131 + } 1132 + 1133 + #ifdef CONFIG_SMP 1134 + void xive_smp_setup_cpu(void) 1135 + { 1136 + pr_devel("SMP setup CPU %d\n", smp_processor_id()); 1137 + 1138 + /* This will have already been done on the boot CPU */ 1139 + if (smp_processor_id() != boot_cpuid) 1140 + xive_setup_cpu(); 1141 + 1142 + } 1143 + 1144 + int xive_smp_prepare_cpu(unsigned int cpu) 1145 + { 1146 + int rc; 1147 + 1148 + /* Allocate per-CPU data and queues */ 1149 + rc = xive_prepare_cpu(cpu); 1150 + if (rc) 1151 + return rc; 1152 + 1153 + /* Allocate and setup IPI for the new CPU */ 1154 + return xive_setup_cpu_ipi(cpu); 1155 + } 1156 + 1157 + #ifdef CONFIG_HOTPLUG_CPU 1158 + static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc) 1159 + { 1160 + u32 irq; 1161 + 1162 + /* We assume local irqs are disabled */ 1163 + WARN_ON(!irqs_disabled()); 1164 + 1165 + /* Check what's already in the CPU queue */ 1166 + while ((irq = xive_scan_interrupts(xc, false)) != 0) { 1167 + /* 1168 + * We need to re-route that interrupt to its new destination. 
1169 + * First get and lock the descriptor 1170 + */ 1171 + struct irq_desc *desc = irq_to_desc(irq); 1172 + struct irq_data *d = irq_desc_get_irq_data(desc); 1173 + struct xive_irq_data *xd; 1174 + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); 1175 + 1176 + /* 1177 + * Ignore anything that isn't a XIVE irq and ignore 1178 + * IPIs, so can just be dropped. 1179 + */ 1180 + if (d->domain != xive_irq_domain || hw_irq == 0) 1181 + continue; 1182 + 1183 + /* 1184 + * The IRQ should have already been re-routed, it's just a 1185 + * stale in the old queue, so re-trigger it in order to make 1186 + * it reach is new destination. 1187 + */ 1188 + #ifdef DEBUG_FLUSH 1189 + pr_info("CPU %d: Got irq %d while offline, re-sending...\n", 1190 + cpu, irq); 1191 + #endif 1192 + raw_spin_lock(&desc->lock); 1193 + xd = irq_desc_get_handler_data(desc); 1194 + 1195 + /* 1196 + * For LSIs, we EOI, this will cause a resend if it's 1197 + * still asserted. Otherwise do an MSI retrigger. 1198 + */ 1199 + if (xd->flags & XIVE_IRQ_FLAG_LSI) 1200 + xive_do_source_eoi(irqd_to_hwirq(d), xd); 1201 + else 1202 + xive_irq_retrigger(d); 1203 + 1204 + raw_spin_unlock(&desc->lock); 1205 + } 1206 + } 1207 + 1208 + void xive_smp_disable_cpu(void) 1209 + { 1210 + struct xive_cpu *xc = __this_cpu_read(xive_cpu); 1211 + unsigned int cpu = smp_processor_id(); 1212 + 1213 + /* Migrate interrupts away from the CPU */ 1214 + irq_migrate_all_off_this_cpu(); 1215 + 1216 + /* Set CPPR to 0 to disable flow of interrupts */ 1217 + xc->cppr = 0; 1218 + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0); 1219 + 1220 + /* Flush everything still in the queue */ 1221 + xive_flush_cpu_queue(cpu, xc); 1222 + 1223 + /* Re-enable CPPR */ 1224 + xc->cppr = 0xff; 1225 + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff); 1226 + } 1227 + 1228 + void xive_flush_interrupt(void) 1229 + { 1230 + struct xive_cpu *xc = __this_cpu_read(xive_cpu); 1231 + unsigned int cpu = smp_processor_id(); 1232 + 1233 + /* Called if an 
interrupt occurs while the CPU is hot unplugged */ 1234 + xive_flush_cpu_queue(cpu, xc); 1235 + } 1236 + 1237 + #endif /* CONFIG_HOTPLUG_CPU */ 1238 + 1239 + #endif /* CONFIG_SMP */ 1240 + 1241 + void xive_kexec_teardown_cpu(int secondary) 1242 + { 1243 + struct xive_cpu *xc = __this_cpu_read(xive_cpu); 1244 + unsigned int cpu = smp_processor_id(); 1245 + 1246 + /* Set CPPR to 0 to disable flow of interrupts */ 1247 + xc->cppr = 0; 1248 + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0); 1249 + 1250 + /* Backend cleanup if any */ 1251 + if (xive_ops->teardown_cpu) 1252 + xive_ops->teardown_cpu(cpu, xc); 1253 + 1254 + #ifdef CONFIG_SMP 1255 + /* Get rid of IPI */ 1256 + xive_cleanup_cpu_ipi(cpu, xc); 1257 + #endif 1258 + 1259 + /* Disable and free the queues */ 1260 + xive_cleanup_cpu_queues(cpu, xc); 1261 + } 1262 + 1263 + void xive_shutdown(void) 1264 + { 1265 + xive_ops->shutdown(); 1266 + } 1267 + 1268 + bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, 1269 + u8 max_prio) 1270 + { 1271 + xive_tima = area; 1272 + xive_tima_offset = offset; 1273 + xive_ops = ops; 1274 + xive_irq_priority = max_prio; 1275 + 1276 + ppc_md.get_irq = xive_get_irq; 1277 + __xive_enabled = true; 1278 + 1279 + pr_devel("Initializing host..\n"); 1280 + xive_init_host(); 1281 + 1282 + pr_devel("Initializing boot CPU..\n"); 1283 + 1284 + /* Allocate per-CPU data and queues */ 1285 + xive_prepare_cpu(smp_processor_id()); 1286 + 1287 + /* Get ready for interrupts */ 1288 + xive_setup_cpu(); 1289 + 1290 + pr_info("Interrupt handling initialized with %s backend\n", 1291 + xive_ops->name); 1292 + pr_info("Using priority %d for all interrupts\n", max_prio); 1293 + 1294 + return true; 1295 + } 1296 + 1297 + static int __init xive_off(char *arg) 1298 + { 1299 + xive_cmdline_disabled = true; 1300 + return 0; 1301 + } 1302 + __setup("xive=off", xive_off);
+640
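The CPPR handling in core.c above (writing 0xff to TM_CPPR to enable the flow of interrupts and 0 to disable it, as in xive_setup_cpu() and xive_smp_disable_cpu()) relies on XIVE's convention that lower priority numbers are more favored. A minimal standalone sketch of that acceptance rule — a hypothetical helper for illustration, not kernel code:

```c
#include <stdbool.h>
#include <stdint.h>

/* Sketch of the presentation rule assumed by the code above: an
 * interrupt of priority "prio" is presented to a thread only if it
 * is more favored (numerically lower) than the thread's Current
 * Processor Priority Register (CPPR). CPPR = 0xff therefore accepts
 * any priority 0..254, while CPPR = 0 masks everything. */
static bool xive_would_present(uint8_t prio, uint8_t cppr)
{
    return prio < cppr;
}
```

This is why both the hot-unplug path and the kexec teardown path start by writing 0 to TM_CPPR: once the CPPR is 0, no priority can be more favored, so no further interrupts are presented while the queues are flushed.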
arch/powerpc/sysdev/xive/native.c
··· 1 + /* 2 + * Copyright 2016,2017 IBM Corporation. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; either version 7 + * 2 of the License, or (at your option) any later version. 8 + */ 9 + 10 + #define pr_fmt(fmt) "xive: " fmt 11 + 12 + #include <linux/types.h> 13 + #include <linux/irq.h> 14 + #include <linux/debugfs.h> 15 + #include <linux/smp.h> 16 + #include <linux/interrupt.h> 17 + #include <linux/seq_file.h> 18 + #include <linux/init.h> 19 + #include <linux/of.h> 20 + #include <linux/slab.h> 21 + #include <linux/spinlock.h> 22 + #include <linux/delay.h> 23 + #include <linux/cpumask.h> 24 + #include <linux/mm.h> 25 + 26 + #include <asm/prom.h> 27 + #include <asm/io.h> 28 + #include <asm/smp.h> 29 + #include <asm/irq.h> 30 + #include <asm/errno.h> 31 + #include <asm/xive.h> 32 + #include <asm/xive-regs.h> 33 + #include <asm/opal.h> 34 + 35 + #include "xive-internal.h" 36 + 37 + 38 + static u32 xive_provision_size; 39 + static u32 *xive_provision_chips; 40 + static u32 xive_provision_chip_count; 41 + static u32 xive_queue_shift; 42 + static u32 xive_pool_vps = XIVE_INVALID_VP; 43 + static struct kmem_cache *xive_provision_cache; 44 + 45 + int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) 46 + { 47 + __be64 flags, eoi_page, trig_page; 48 + __be32 esb_shift, src_chip; 49 + u64 opal_flags; 50 + s64 rc; 51 + 52 + memset(data, 0, sizeof(*data)); 53 + 54 + rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page, 55 + &esb_shift, &src_chip); 56 + if (rc) { 57 + pr_err("opal_xive_get_irq_info(0x%x) returned %lld\n", 58 + hw_irq, rc); 59 + return -EINVAL; 60 + } 61 + 62 + opal_flags = be64_to_cpu(flags); 63 + if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI) 64 + data->flags |= XIVE_IRQ_FLAG_STORE_EOI; 65 + if (opal_flags & OPAL_XIVE_IRQ_LSI) 66 + data->flags |= XIVE_IRQ_FLAG_LSI; 67 + if (opal_flags & 
OPAL_XIVE_IRQ_SHIFT_BUG) 68 + data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG; 69 + if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW) 70 + data->flags |= XIVE_IRQ_FLAG_MASK_FW; 71 + if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW) 72 + data->flags |= XIVE_IRQ_FLAG_EOI_FW; 73 + data->eoi_page = be64_to_cpu(eoi_page); 74 + data->trig_page = be64_to_cpu(trig_page); 75 + data->esb_shift = be32_to_cpu(esb_shift); 76 + data->src_chip = be32_to_cpu(src_chip); 77 + 78 + data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift); 79 + if (!data->eoi_mmio) { 80 + pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq); 81 + return -ENOMEM; 82 + } 83 + 84 + if (!data->trig_page) 85 + return 0; 86 + if (data->trig_page == data->eoi_page) { 87 + data->trig_mmio = data->eoi_mmio; 88 + return 0; 89 + } 90 + 91 + data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift); 92 + if (!data->trig_mmio) { 93 + pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq); 94 + return -ENOMEM; 95 + } 96 + return 0; 97 + } 98 + 99 + int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) 100 + { 101 + s64 rc; 102 + 103 + for (;;) { 104 + rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq); 105 + if (rc != OPAL_BUSY) 106 + break; 107 + msleep(1); 108 + } 109 + return rc == 0 ? 0 : -ENXIO; 110 + } 111 + 112 + /* This can be called multiple time to change a queue configuration */ 113 + int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, 114 + __be32 *qpage, u32 order, bool can_escalate) 115 + { 116 + s64 rc = 0; 117 + __be64 qeoi_page_be; 118 + __be32 esc_irq_be; 119 + u64 flags, qpage_phys; 120 + 121 + /* If there's an actual queue page, clean it */ 122 + if (order) { 123 + if (WARN_ON(!qpage)) 124 + return -EINVAL; 125 + qpage_phys = __pa(qpage); 126 + } else 127 + qpage_phys = 0; 128 + 129 + /* Initialize the rest of the fields */ 130 + q->msk = order ? 
((1u << (order - 2)) - 1) : 0; 131 + q->idx = 0; 132 + q->toggle = 0; 133 + 134 + rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL, 135 + &qeoi_page_be, 136 + &esc_irq_be, 137 + NULL); 138 + if (rc) { 139 + pr_err("Error %lld getting queue info prio %d\n", rc, prio); 140 + rc = -EIO; 141 + goto fail; 142 + } 143 + q->eoi_phys = be64_to_cpu(qeoi_page_be); 144 + 145 + /* Default flags */ 146 + flags = OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED; 147 + 148 + /* Escalation needed ? */ 149 + if (can_escalate) { 150 + q->esc_irq = be32_to_cpu(esc_irq_be); 151 + flags |= OPAL_XIVE_EQ_ESCALATE; 152 + } 153 + 154 + /* Configure and enable the queue in HW */ 155 + for (;;) { 156 + rc = opal_xive_set_queue_info(vp_id, prio, qpage_phys, order, flags); 157 + if (rc != OPAL_BUSY) 158 + break; 159 + msleep(1); 160 + } 161 + if (rc) { 162 + pr_err("Error %lld setting queue for prio %d\n", rc, prio); 163 + rc = -EIO; 164 + } else { 165 + /* 166 + * KVM code requires all of the above to be visible before 167 + * q->qpage is set due to how it manages IPI EOIs 168 + */ 169 + wmb(); 170 + q->qpage = qpage; 171 + } 172 + fail: 173 + return rc; 174 + } 175 + 176 + static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) 177 + { 178 + s64 rc; 179 + 180 + /* Disable the queue in HW */ 181 + for (;;) { 182 + rc = opal_xive_set_queue_info(vp_id, prio, 0, 0, 0); 183 + if (rc != OPAL_BUSY) 184 + break; 185 + msleep(1); 186 + } 187 + if (rc) 188 + pr_err("Error %lld disabling queue for prio %d\n", rc, prio); 189 + } 190 + 191 + void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) 192 + { 193 + __xive_native_disable_queue(vp_id, q, prio); 194 + } 195 + 196 + static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) 197 + { 198 + struct xive_q *q = &xc->queue[prio]; 199 + unsigned int alloc_order; 200 + struct page *pages; 201 + __be32 *qpage; 202 + 203 + alloc_order = (xive_queue_shift > PAGE_SHIFT) ? 
204 + (xive_queue_shift - PAGE_SHIFT) : 0; 205 + pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order); 206 + if (!pages) 207 + return -ENOMEM; 208 + qpage = (__be32 *)page_address(pages); 209 + memset(qpage, 0, 1 << xive_queue_shift); 210 + return xive_native_configure_queue(get_hard_smp_processor_id(cpu), 211 + q, prio, qpage, xive_queue_shift, false); 212 + } 213 + 214 + static void xive_native_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) 215 + { 216 + struct xive_q *q = &xc->queue[prio]; 217 + unsigned int alloc_order; 218 + 219 + /* 220 + * We use the variant with no iounmap as this is called on exec 221 + * from an IPI and iounmap isn't safe 222 + */ 223 + __xive_native_disable_queue(get_hard_smp_processor_id(cpu), q, prio); 224 + alloc_order = (xive_queue_shift > PAGE_SHIFT) ? 225 + (xive_queue_shift - PAGE_SHIFT) : 0; 226 + free_pages((unsigned long)q->qpage, alloc_order); 227 + q->qpage = NULL; 228 + } 229 + 230 + static bool xive_native_match(struct device_node *node) 231 + { 232 + return of_device_is_compatible(node, "ibm,opal-xive-vc"); 233 + } 234 + 235 + #ifdef CONFIG_SMP 236 + static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) 237 + { 238 + struct device_node *np; 239 + unsigned int chip_id; 240 + s64 irq; 241 + 242 + /* Find the chip ID */ 243 + np = of_get_cpu_node(cpu, NULL); 244 + if (np) { 245 + if (of_property_read_u32(np, "ibm,chip-id", &chip_id) < 0) 246 + chip_id = 0; 247 + } 248 + 249 + /* Allocate an IPI and populate info about it */ 250 + for (;;) { 251 + irq = opal_xive_allocate_irq(chip_id); 252 + if (irq == OPAL_BUSY) { 253 + msleep(1); 254 + continue; 255 + } 256 + if (irq < 0) { 257 + pr_err("Failed to allocate IPI on CPU %d\n", cpu); 258 + return -ENXIO; 259 + } 260 + xc->hw_ipi = irq; 261 + break; 262 + } 263 + return 0; 264 + } 265 + 266 + u32 xive_native_alloc_irq(void) 267 + { 268 + s64 rc; 269 + 270 + for (;;) { 271 + rc = opal_xive_allocate_irq(OPAL_XIVE_ANY_CHIP); 272 + if (rc 
!= OPAL_BUSY) 273 + break; 274 + msleep(1); 275 + } 276 + if (rc < 0) 277 + return 0; 278 + return rc; 279 + } 280 + 281 + void xive_native_free_irq(u32 irq) 282 + { 283 + for (;;) { 284 + s64 rc = opal_xive_free_irq(irq); 285 + if (rc != OPAL_BUSY) 286 + break; 287 + msleep(1); 288 + } 289 + } 290 + 291 + static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc) 292 + { 293 + s64 rc; 294 + 295 + /* Free the IPI */ 296 + if (!xc->hw_ipi) 297 + return; 298 + for (;;) { 299 + rc = opal_xive_free_irq(xc->hw_ipi); 300 + if (rc == OPAL_BUSY) { 301 + msleep(1); 302 + continue; 303 + } 304 + xc->hw_ipi = 0; 305 + break; 306 + } 307 + } 308 + #endif /* CONFIG_SMP */ 309 + 310 + static void xive_native_shutdown(void) 311 + { 312 + /* Switch the XIVE to emulation mode */ 313 + opal_xive_reset(OPAL_XIVE_MODE_EMU); 314 + } 315 + 316 + /* 317 + * Perform an "ack" cycle on the current thread, thus 318 + * grabbing the pending active priorities and updating 319 + * the CPPR to the most favored one. 320 + */ 321 + static void xive_native_update_pending(struct xive_cpu *xc) 322 + { 323 + u8 he, cppr; 324 + u16 ack; 325 + 326 + /* Perform the acknowledge hypervisor to register cycle */ 327 + ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_HV_REG)); 328 + 329 + /* Synchronize subsequent queue accesses */ 330 + mb(); 331 + 332 + /* 333 + * Grab the CPPR and the "HE" field which indicates the source 334 + * of the hypervisor interrupt (if any) 335 + */ 336 + cppr = ack & 0xff; 337 + he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8)); 338 + switch(he) { 339 + case TM_QW3_NSR_HE_NONE: /* Nothing to see here */ 340 + break; 341 + case TM_QW3_NSR_HE_PHYS: /* Physical thread interrupt */ 342 + if (cppr == 0xff) 343 + return; 344 + /* Mark the priority pending */ 345 + xc->pending_prio |= 1 << cppr; 346 + 347 + /* 348 + * A new interrupt should never have a CPPR less favored 349 + * than our current one. 
350 + */ 351 + if (cppr >= xc->cppr) 352 + pr_err("CPU %d odd ack CPPR, got %d at %d\n", 353 + smp_processor_id(), cppr, xc->cppr); 354 + 355 + /* Update our idea of what the CPPR is */ 356 + xc->cppr = cppr; 357 + break; 358 + case TM_QW3_NSR_HE_POOL: /* HV Pool interrupt (unused) */ 359 + case TM_QW3_NSR_HE_LSI: /* Legacy FW LSI (unused) */ 360 + pr_err("CPU %d got unexpected interrupt type HE=%d\n", 361 + smp_processor_id(), he); 362 + return; 363 + } 364 + } 365 + 366 + static void xive_native_eoi(u32 hw_irq) 367 + { 368 + /* 369 + * Not normally used except if specific interrupts need 370 + * a workaround on EOI. 371 + */ 372 + opal_int_eoi(hw_irq); 373 + } 374 + 375 + static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc) 376 + { 377 + s64 rc; 378 + u32 vp; 379 + __be64 vp_cam_be; 380 + u64 vp_cam; 381 + 382 + if (xive_pool_vps == XIVE_INVALID_VP) 383 + return; 384 + 385 + /* Enable the pool VP */ 386 + vp = xive_pool_vps + get_hard_smp_processor_id(cpu); 387 + pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp); 388 + for (;;) { 389 + rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0); 390 + if (rc != OPAL_BUSY) 391 + break; 392 + msleep(1); 393 + } 394 + if (rc) { 395 + pr_err("Failed to enable pool VP on CPU %d\n", cpu); 396 + return; 397 + } 398 + 399 + /* Grab it's CAM value */ 400 + rc = opal_xive_get_vp_info(vp, NULL, &vp_cam_be, NULL, NULL); 401 + if (rc) { 402 + pr_err("Failed to get pool VP info CPU %d\n", cpu); 403 + return; 404 + } 405 + vp_cam = be64_to_cpu(vp_cam_be); 406 + 407 + pr_debug("VP CAM = %llx\n", vp_cam); 408 + 409 + /* Push it on the CPU (set LSMFB to 0xff to skip backlog scan) */ 410 + pr_debug("(Old HW value: %08x)\n", 411 + in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2)); 412 + out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD0, 0xff); 413 + out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2, 414 + TM_QW2W2_VP | vp_cam); 415 + pr_debug("(New HW value: %08x)\n", 416 + in_be32(xive_tima + TM_QW2_HV_POOL + 
TM_WORD2)); 417 + } 418 + 419 + static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc) 420 + { 421 + s64 rc; 422 + u32 vp; 423 + 424 + if (xive_pool_vps == XIVE_INVALID_VP) 425 + return; 426 + 427 + /* Pull the pool VP from the CPU */ 428 + in_be64(xive_tima + TM_SPC_PULL_POOL_CTX); 429 + 430 + /* Disable it */ 431 + vp = xive_pool_vps + get_hard_smp_processor_id(cpu); 432 + for (;;) { 433 + rc = opal_xive_set_vp_info(vp, 0, 0); 434 + if (rc != OPAL_BUSY) 435 + break; 436 + msleep(1); 437 + } 438 + } 439 + 440 + static void xive_native_sync_source(u32 hw_irq) 441 + { 442 + opal_xive_sync(XIVE_SYNC_EAS, hw_irq); 443 + } 444 + 445 + static const struct xive_ops xive_native_ops = { 446 + .populate_irq_data = xive_native_populate_irq_data, 447 + .configure_irq = xive_native_configure_irq, 448 + .setup_queue = xive_native_setup_queue, 449 + .cleanup_queue = xive_native_cleanup_queue, 450 + .match = xive_native_match, 451 + .shutdown = xive_native_shutdown, 452 + .update_pending = xive_native_update_pending, 453 + .eoi = xive_native_eoi, 454 + .setup_cpu = xive_native_setup_cpu, 455 + .teardown_cpu = xive_native_teardown_cpu, 456 + .sync_source = xive_native_sync_source, 457 + #ifdef CONFIG_SMP 458 + .get_ipi = xive_native_get_ipi, 459 + .put_ipi = xive_native_put_ipi, 460 + #endif /* CONFIG_SMP */ 461 + .name = "native", 462 + }; 463 + 464 + static bool xive_parse_provisioning(struct device_node *np) 465 + { 466 + int rc; 467 + 468 + if (of_property_read_u32(np, "ibm,xive-provision-page-size", 469 + &xive_provision_size) < 0) 470 + return true; 471 + rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4); 472 + if (rc < 0) { 473 + pr_err("Error %d getting provision chips array\n", rc); 474 + return false; 475 + } 476 + xive_provision_chip_count = rc; 477 + if (rc == 0) 478 + return true; 479 + 480 + xive_provision_chips = kzalloc(4 * xive_provision_chip_count, 481 + GFP_KERNEL); 482 + if (WARN_ON(!xive_provision_chips)) 483 + return 
false; 484 + 485 + rc = of_property_read_u32_array(np, "ibm,xive-provision-chips", 486 + xive_provision_chips, 487 + xive_provision_chip_count); 488 + if (rc < 0) { 489 + pr_err("Error %d reading provision chips array\n", rc); 490 + return false; 491 + } 492 + 493 + xive_provision_cache = kmem_cache_create("xive-provision", 494 + xive_provision_size, 495 + xive_provision_size, 496 + 0, NULL); 497 + if (!xive_provision_cache) { 498 + pr_err("Failed to allocate provision cache\n"); 499 + return false; 500 + } 501 + return true; 502 + } 503 + 504 + u32 xive_native_default_eq_shift(void) 505 + { 506 + return xive_queue_shift; 507 + } 508 + 509 + bool xive_native_init(void) 510 + { 511 + struct device_node *np; 512 + struct resource r; 513 + void __iomem *tima; 514 + struct property *prop; 515 + u8 max_prio = 7; 516 + const __be32 *p; 517 + u32 val; 518 + s64 rc; 519 + 520 + if (xive_cmdline_disabled) 521 + return false; 522 + 523 + pr_devel("xive_native_init()\n"); 524 + np = of_find_compatible_node(NULL, NULL, "ibm,opal-xive-pe"); 525 + if (!np) { 526 + pr_devel("not found !\n"); 527 + return false; 528 + } 529 + pr_devel("Found %s\n", np->full_name); 530 + 531 + /* Resource 1 is HV window */ 532 + if (of_address_to_resource(np, 1, &r)) { 533 + pr_err("Failed to get thread mgmnt area resource\n"); 534 + return false; 535 + } 536 + tima = ioremap(r.start, resource_size(&r)); 537 + if (!tima) { 538 + pr_err("Failed to map thread mgmnt area\n"); 539 + return false; 540 + } 541 + 542 + /* Read number of priorities */ 543 + if (of_property_read_u32(np, "ibm,xive-#priorities", &val) == 0) 544 + max_prio = val - 1; 545 + 546 + /* Iterate the EQ sizes and pick one */ 547 + of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) { 548 + xive_queue_shift = val; 549 + if (val == PAGE_SHIFT) 550 + break; 551 + } 552 + 553 + /* Grab size of provisioning pages */ 554 + xive_parse_provisioning(np); 555 + 556 + /* Switch the XIVE to exploitation mode */ 557 + rc = 
opal_xive_reset(OPAL_XIVE_MODE_EXPL); 558 + if (rc) { 559 + pr_err("Switch to exploitation mode failed with error %lld\n", rc); 560 + return false; 561 + } 562 + 563 + /* Initialize XIVE core with our backend */ 564 + if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS, 565 + max_prio)) { 566 + opal_xive_reset(OPAL_XIVE_MODE_EMU); 567 + return false; 568 + } 569 + pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10)); 570 + return true; 571 + } 572 + 573 + static bool xive_native_provision_pages(void) 574 + { 575 + u32 i; 576 + void *p; 577 + 578 + for (i = 0; i < xive_provision_chip_count; i++) { 579 + u32 chip = xive_provision_chips[i]; 580 + 581 + /* 582 + * XXX TODO: Try to make the allocation local to the node where 583 + * the chip resides. 584 + */ 585 + p = kmem_cache_alloc(xive_provision_cache, GFP_KERNEL); 586 + if (!p) { 587 + pr_err("Failed to allocate provisioning page\n"); 588 + return false; 589 + } 590 + opal_xive_donate_page(chip, __pa(p)); 591 + } 592 + return true; 593 + } 594 + 595 + u32 xive_native_alloc_vp_block(u32 max_vcpus) 596 + { 597 + s64 rc; 598 + u32 order; 599 + 600 + order = fls(max_vcpus) - 1; 601 + if (max_vcpus > (1 << order)) 602 + order++; 603 + 604 + pr_info("VP block alloc, for max VCPUs %d use order %d\n", 605 + max_vcpus, order); 606 + 607 + for (;;) { 608 + rc = opal_xive_alloc_vp_block(order); 609 + switch (rc) { 610 + case OPAL_BUSY: 611 + msleep(1); 612 + break; 613 + case OPAL_XIVE_PROVISIONING: 614 + if (!xive_native_provision_pages()) 615 + return XIVE_INVALID_VP; 616 + break; 617 + default: 618 + if (rc < 0) { 619 + pr_err("OPAL failed to allocate VCPUs order %d, err %lld\n", 620 + order, rc); 621 + return XIVE_INVALID_VP; 622 + } 623 + return rc; 624 + } 625 + } 626 + } 627 + EXPORT_SYMBOL_GPL(xive_native_alloc_vp_block); 628 + 629 + void xive_native_free_vp_block(u32 vp_base) 630 + { 631 + s64 rc; 632 + 633 + if (vp_base == XIVE_INVALID_VP) 634 + return; 635 + 636 + rc = 
opal_xive_free_vp_block(vp_base); 637 + if (rc < 0) 638 + pr_warn("OPAL error %lld freeing VP block\n", rc); 639 + } 640 + EXPORT_SYMBOL_GPL(xive_native_free_vp_block);
+62
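xive_native_alloc_vp_block() above derives the OPAL allocation order from max_vcpus with fls(), rounding up to the next power of two when needed. A self-contained sketch of just that order computation, substituting GCC's __builtin_clz for the kernel's fls() and assuming max_vcpus > 0:

```c
#include <stdint.h>

/* Mirrors the order computation in xive_native_alloc_vp_block():
 * order = fls(max_vcpus) - 1, bumped by one when max_vcpus is not
 * already a power of two, so the allocated block covers all
 * requested virtual processors. */
static unsigned int vp_block_order(uint32_t max_vcpus)
{
    /* fls(x) - 1 == index of the highest set bit */
    unsigned int order = 31u - (unsigned int)__builtin_clz(max_vcpus);

    if (max_vcpus > (1u << order))
        order++;
    return order;
}
```

For example, 4 VCPUs need order 2, while 5 VCPUs round up to order 3 (a block of 8).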
arch/powerpc/sysdev/xive/xive-internal.h
··· 1 + /* 2 + * Copyright 2016,2017 IBM Corporation. 3 + * 4 + * This program is free software; you can redistribute it and/or 5 + * modify it under the terms of the GNU General Public License 6 + * as published by the Free Software Foundation; either version 7 + * 2 of the License, or (at your option) any later version. 8 + */ 9 + #ifndef __XIVE_INTERNAL_H 10 + #define __XIVE_INTERNAL_H 11 + 12 + /* Each CPU carries one of these with various per-CPU state */ 13 + struct xive_cpu { 14 + #ifdef CONFIG_SMP 15 + /* HW irq number and data of IPI */ 16 + u32 hw_ipi; 17 + struct xive_irq_data ipi_data; 18 + #endif /* CONFIG_SMP */ 19 + 20 + int chip_id; 21 + 22 + /* Queue data. Only one queue is populated */ 23 + #define XIVE_MAX_QUEUES 8 24 + struct xive_q queue[XIVE_MAX_QUEUES]; 25 + 26 + /* 27 + * Pending mask. Each bit corresponds to a priority that 28 + * potentially has pending interrupts. 29 + */ 30 + u8 pending_prio; 31 + 32 + /* Cache of HW CPPR */ 33 + u8 cppr; 34 + }; 35 + 36 + /* Backend ops */ 37 + struct xive_ops { 38 + int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data); 39 + int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); 40 + int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); 41 + void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); 42 + void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc); 43 + void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc); 44 + bool (*match)(struct device_node *np); 45 + void (*shutdown)(void); 46 + 47 + void (*update_pending)(struct xive_cpu *xc); 48 + void (*eoi)(u32 hw_irq); 49 + void (*sync_source)(u32 hw_irq); 50 + #ifdef CONFIG_SMP 51 + int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc); 52 + void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc); 53 + #endif 54 + const char *name; 55 + }; 56 + 57 + bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, 58 + u8 max_prio); 59 + 60 + extern bool xive_cmdline_disabled; 61 + 62 
+ #endif /* __XIVE_INTERNAL_H */
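The pending_prio field declared above is used in core.c as a one-bit-per-priority mask (xive_native_update_pending() does `xc->pending_prio |= 1 << cppr`). A tiny illustration of that encoding, assuming at most 8 priorities as the u8 and XIVE_MAX_QUEUES imply — the helper names here are hypothetical, not kernel functions:

```c
#include <stdint.h>

/* pending_prio encoding: bit N set means priority N may have
 * interrupts queued. A u8 mask caps the usable priorities at 8,
 * matching XIVE_MAX_QUEUES in struct xive_cpu. */
static uint8_t mark_prio_pending(uint8_t pending, uint8_t prio)
{
    return (uint8_t)(pending | (1u << prio));
}

/* Most favored pending priority = lowest set bit; -1 if none. */
static int most_favored_pending(uint8_t pending)
{
    return pending ? __builtin_ctz(pending) : -1;
}
```

The queue-scanning code can then always service the lowest-numbered (most favored) pending priority first.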
+195 -46
arch/powerpc/xmon/xmon.c
··· 29 29 #include <linux/nmi.h> 30 30 #include <linux/ctype.h> 31 31 32 + #include <asm/debugfs.h> 32 33 #include <asm/ptrace.h> 34 + #include <asm/smp.h> 33 35 #include <asm/string.h> 34 36 #include <asm/prom.h> 35 37 #include <asm/machdep.h> ··· 50 48 #include <asm/reg.h> 51 49 #include <asm/debug.h> 52 50 #include <asm/hw_breakpoint.h> 53 - 51 + #include <asm/xive.h> 54 52 #include <asm/opal.h> 55 53 #include <asm/firmware.h> 56 54 ··· 78 76 #endif /* CONFIG_SMP */ 79 77 80 78 static unsigned long in_xmon __read_mostly = 0; 79 + static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT); 81 80 82 81 static unsigned long adrs; 83 82 static int size = 1; ··· 187 184 static void dump_tlb_book3e(void); 188 185 #endif 189 186 190 - static int xmon_no_auto_backtrace; 191 - 192 187 #ifdef CONFIG_PPC64 193 188 #define REG "%.16lx" 194 189 #else ··· 233 232 "\ 234 233 dr dump stream of raw bytes\n\ 235 234 dt dump the tracing buffers (uses printk)\n\ 236 - e print exception information\n\ 235 + " 236 + #ifdef CONFIG_PPC_POWERNV 237 + " dx# dump xive on CPU #\n\ 238 + dxi# dump xive irq state #\n\ 239 + dxa dump xive on all CPUs\n" 240 + #endif 241 + " e print exception information\n\ 237 242 f flush cache\n\ 238 243 la lookup symbol+offset of specified address\n\ 239 244 ls lookup address of specified symbol\n\ ··· 418 411 { 419 412 return !cpumask_empty(&cpus_in_xmon); 420 413 } 421 - #endif 414 + 415 + static bool wait_for_other_cpus(int ncpus) 416 + { 417 + unsigned long timeout; 418 + 419 + /* We wait for 2s, which is a metric "little while" */ 420 + for (timeout = 20000; timeout != 0; --timeout) { 421 + if (cpumask_weight(&cpus_in_xmon) >= ncpus) 422 + return true; 423 + udelay(100); 424 + barrier(); 425 + } 426 + 427 + return false; 428 + } 429 + #endif /* CONFIG_SMP */ 422 430 423 431 static inline int unrecoverable_excp(struct pt_regs *regs) 424 432 { ··· 455 433 #ifdef CONFIG_SMP 456 434 int cpu; 457 435 int secondary; 458 - unsigned long timeout; 459 436 #endif 460 
437 461 438 local_irq_save(flags); ··· 541 520 xmon_owner = cpu; 542 521 mb(); 543 522 if (ncpus > 1) { 544 - smp_send_debugger_break(); 545 - /* wait for other cpus to come in */ 546 - for (timeout = 100000000; timeout != 0; --timeout) { 547 - if (cpumask_weight(&cpus_in_xmon) >= ncpus) 548 - break; 549 - barrier(); 550 - } 523 + /* 524 + * A system reset (trap == 0x100) can be triggered on 525 + * all CPUs, so when we come in via 0x100 try waiting 526 + * for the other CPUs to come in before we send the 527 + * debugger break (IPI). This is similar to 528 + * crash_kexec_secondary(). 529 + */ 530 + if (TRAP(regs) != 0x100 || !wait_for_other_cpus(ncpus)) 531 + smp_send_debugger_break(); 532 + 533 + wait_for_other_cpus(ncpus); 551 534 } 552 535 remove_bpts(); 553 536 disable_surveillance(); ··· 909 884 last_cmd = NULL; 910 885 xmon_regs = excp; 911 886 912 - if (!xmon_no_auto_backtrace) { 913 - xmon_no_auto_backtrace = 1; 914 - xmon_show_stack(excp->gpr[1], excp->link, excp->nip); 915 - } 887 + xmon_show_stack(excp->gpr[1], excp->link, excp->nip); 916 888 917 889 for(;;) { 918 890 #ifdef CONFIG_SMP ··· 1369 1347 case 0x100: ret = "(System Reset)"; break; 1370 1348 case 0x200: ret = "(Machine Check)"; break; 1371 1349 case 0x300: ret = "(Data Access)"; break; 1372 - case 0x380: ret = "(Data SLB Access)"; break; 1350 + case 0x380: 1351 + if (radix_enabled()) 1352 + ret = "(Data Access Out of Range)"; 1353 + else 1354 + ret = "(Data SLB Access)"; 1355 + break; 1373 1356 case 0x400: ret = "(Instruction Access)"; break; 1374 - case 0x480: ret = "(Instruction SLB Access)"; break; 1357 + case 0x480: 1358 + if (radix_enabled()) 1359 + ret = "(Instruction Access Out of Range)"; 1360 + else 1361 + ret = "(Instruction SLB Access)"; 1362 + break; 1375 1363 case 0x500: ret = "(Hardware Interrupt)"; break; 1376 1364 case 0x600: ret = "(Alignment)"; break; 1377 1365 case 0x700: ret = "(Program Check)"; break; ··· 2263 2231 DUMP(p, kernel_msr, "lx"); 2264 2232 DUMP(p, 
emergency_sp, "p"); 2265 2233 #ifdef CONFIG_PPC_BOOK3S_64 2234 + DUMP(p, nmi_emergency_sp, "p"); 2266 2235 DUMP(p, mc_emergency_sp, "p"); 2236 + DUMP(p, in_nmi, "x"); 2267 2237 DUMP(p, in_mce, "x"); 2268 2238 DUMP(p, hmi_event_available, "x"); 2269 2239 #endif ··· 2372 2338 } 2373 2339 #endif 2374 2340 2341 + #ifdef CONFIG_PPC_POWERNV 2342 + static void dump_one_xive(int cpu) 2343 + { 2344 + unsigned int hwid = get_hard_smp_processor_id(cpu); 2345 + 2346 + opal_xive_dump(XIVE_DUMP_TM_HYP, hwid); 2347 + opal_xive_dump(XIVE_DUMP_TM_POOL, hwid); 2348 + opal_xive_dump(XIVE_DUMP_TM_OS, hwid); 2349 + opal_xive_dump(XIVE_DUMP_TM_USER, hwid); 2350 + opal_xive_dump(XIVE_DUMP_VP, hwid); 2351 + opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid); 2352 + 2353 + if (setjmp(bus_error_jmp) != 0) { 2354 + catch_memory_errors = 0; 2355 + printf("*** Error dumping xive on cpu %d\n", cpu); 2356 + return; 2357 + } 2358 + 2359 + catch_memory_errors = 1; 2360 + sync(); 2361 + xmon_xive_do_dump(cpu); 2362 + sync(); 2363 + __delay(200); 2364 + catch_memory_errors = 0; 2365 + } 2366 + 2367 + static void dump_all_xives(void) 2368 + { 2369 + int cpu; 2370 + 2371 + if (num_possible_cpus() == 0) { 2372 + printf("No possible cpus, use 'dx #' to dump individual cpus\n"); 2373 + return; 2374 + } 2375 + 2376 + for_each_possible_cpu(cpu) 2377 + dump_one_xive(cpu); 2378 + } 2379 + 2380 + static void dump_one_xive_irq(u32 num) 2381 + { 2382 + s64 rc; 2383 + __be64 vp; 2384 + u8 prio; 2385 + __be32 lirq; 2386 + 2387 + rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq); 2388 + xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n", 2389 + num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc); 2390 + } 2391 + 2392 + static void dump_xives(void) 2393 + { 2394 + unsigned long num; 2395 + int c; 2396 + 2397 + c = inchar(); 2398 + if (c == 'a') { 2399 + dump_all_xives(); 2400 + return; 2401 + } else if (c == 'i') { 2402 + if (scanhex(&num)) 2403 + dump_one_xive_irq(num); 2404 + return; 2405 + } 2406 + 
2407 + termch = c; /* Put c back, it wasn't 'a' */ 2408 + 2409 + if (scanhex(&num)) 2410 + dump_one_xive(num); 2411 + else 2412 + dump_one_xive(xmon_owner); 2413 + } 2414 + #endif /* CONFIG_PPC_POWERNV */ 2415 + 2375 2416 static void dump_by_size(unsigned long addr, long count, int size) 2376 2417 { 2377 2418 unsigned char temp[16]; ··· 2491 2382 if (c == 'p') { 2492 2383 xmon_start_pagination(); 2493 2384 dump_pacas(); 2385 + xmon_end_pagination(); 2386 + return; 2387 + } 2388 + #endif 2389 + #ifdef CONFIG_PPC_POWERNV 2390 + if (c == 'x') { 2391 + xmon_start_pagination(); 2392 + dump_xives(); 2494 2393 xmon_end_pagination(); 2495 2394 return; 2496 2395 } ··· 3187 3070 for (i = 0; i < mmu_slb_size; i++) { 3188 3071 asm volatile("slbmfee %0,%1" : "=r" (esid) : "r" (i)); 3189 3072 asm volatile("slbmfev %0,%1" : "=r" (vsid) : "r" (i)); 3190 - if (esid || vsid) { 3191 - printf("%02d %016lx %016lx", i, esid, vsid); 3192 - if (esid & SLB_ESID_V) { 3193 - llp = vsid & SLB_VSID_LLP; 3194 - if (vsid & SLB_VSID_B_1T) { 3195 - printf(" 1T ESID=%9lx VSID=%13lx LLP:%3lx \n", 3196 - GET_ESID_1T(esid), 3197 - (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, 3198 - llp); 3199 - } else { 3200 - printf(" 256M ESID=%9lx VSID=%13lx LLP:%3lx \n", 3201 - GET_ESID(esid), 3202 - (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT, 3203 - llp); 3204 - } 3205 - } else 3206 - printf("\n"); 3073 + 3074 + if (!esid && !vsid) 3075 + continue; 3076 + 3077 + printf("%02d %016lx %016lx", i, esid, vsid); 3078 + 3079 + if (!(esid & SLB_ESID_V)) { 3080 + printf("\n"); 3081 + continue; 3082 + } 3083 + 3084 + llp = vsid & SLB_VSID_LLP; 3085 + if (vsid & SLB_VSID_B_1T) { 3086 + printf(" 1T ESID=%9lx VSID=%13lx LLP:%3lx \n", 3087 + GET_ESID_1T(esid), 3088 + (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T, 3089 + llp); 3090 + } else { 3091 + printf(" 256M ESID=%9lx VSID=%13lx LLP:%3lx \n", 3092 + GET_ESID(esid), 3093 + (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT, 3094 + llp); 3207 3095 } 3208 3096 } 3209 3097 } ··· 3424 3302 /* ensure 
xmon is enabled */ 3425 3303 xmon_init(1); 3426 3304 debugger(get_irq_regs()); 3305 + if (!xmon_on) 3306 + xmon_init(0); 3427 3307 } 3428 3308 3429 3309 static struct sysrq_key_op sysrq_xmon_op = { ··· 3439 3315 register_sysrq_key('x', &sysrq_xmon_op); 3440 3316 return 0; 3441 3317 } 3442 - __initcall(setup_xmon_sysrq); 3318 + device_initcall(setup_xmon_sysrq); 3443 3319 #endif /* CONFIG_MAGIC_SYSRQ */ 3444 3320 3445 - static int __initdata xmon_early, xmon_off; 3321 + #ifdef CONFIG_DEBUG_FS 3322 + static int xmon_dbgfs_set(void *data, u64 val) 3323 + { 3324 + xmon_on = !!val; 3325 + xmon_init(xmon_on); 3326 + 3327 + return 0; 3328 + } 3329 + 3330 + static int xmon_dbgfs_get(void *data, u64 *val) 3331 + { 3332 + *val = xmon_on; 3333 + return 0; 3334 + } 3335 + 3336 + DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get, 3337 + xmon_dbgfs_set, "%llu\n"); 3338 + 3339 + static int __init setup_xmon_dbgfs(void) 3340 + { 3341 + debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL, 3342 + &xmon_dbgfs_ops); 3343 + return 0; 3344 + } 3345 + device_initcall(setup_xmon_dbgfs); 3346 + #endif /* CONFIG_DEBUG_FS */ 3347 + 3348 + static int xmon_early __initdata; 3446 3349 3447 3350 static int __init early_parse_xmon(char *p) 3448 3351 { ··· 3477 3326 /* just "xmon" is equivalent to "xmon=early" */ 3478 3327 xmon_init(1); 3479 3328 xmon_early = 1; 3480 - } else if (strncmp(p, "on", 2) == 0) 3329 + xmon_on = 1; 3330 + } else if (strncmp(p, "on", 2) == 0) { 3481 3331 xmon_init(1); 3482 - else if (strncmp(p, "off", 3) == 0) 3483 - xmon_off = 1; 3484 - else if (strncmp(p, "nobt", 4) == 0) 3485 - xmon_no_auto_backtrace = 1; 3332 + xmon_on = 1; 3333 + } else if (strncmp(p, "off", 3) == 0) 3334 + xmon_on = 0; 3486 3335 else 3487 3336 return 1; 3488 3337 ··· 3492 3341 3493 3342 void __init xmon_setup(void) 3494 3343 { 3495 - #ifdef CONFIG_XMON_DEFAULT 3496 - if (!xmon_off) 3344 + if (xmon_on) 3497 3345 xmon_init(1); 3498 - #endif 3499 3346 if (xmon_early) 3500 3347 
debugger(NULL); 3501 3348 }
+14 -3
drivers/misc/cxl/api.c
··· 14 14 #include <linux/msi.h> 15 15 #include <linux/module.h> 16 16 #include <linux/mount.h> 17 + #include <linux/sched/mm.h> 17 18 18 19 #include "cxl.h" 19 20 ··· 322 321 323 322 if (task) { 324 323 ctx->pid = get_task_pid(task, PIDTYPE_PID); 325 - ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID); 326 324 kernel = false; 327 325 ctx->real_mode = false; 326 + 327 + /* acquire a reference to the task's mm */ 328 + ctx->mm = get_task_mm(current); 329 + 330 + /* ensure this mm_struct can't be freed */ 331 + cxl_context_mm_count_get(ctx); 332 + 333 + /* decrement the use count */ 334 + if (ctx->mm) 335 + mmput(ctx->mm); 328 336 } 329 337 330 338 cxl_ctx_get(); 331 339 332 340 if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) { 333 - put_pid(ctx->glpid); 334 341 put_pid(ctx->pid); 335 - ctx->glpid = ctx->pid = NULL; 342 + ctx->pid = NULL; 336 343 cxl_adapter_context_put(ctx->afu->adapter); 337 344 cxl_ctx_put(); 345 + if (task) 346 + cxl_context_mm_count_put(ctx); 338 347 goto out; 339 348 } 340 349
+51 -17
drivers/misc/cxl/context.c
··· 17 17 #include <linux/debugfs.h> 18 18 #include <linux/slab.h> 19 19 #include <linux/idr.h> 20 + #include <linux/sched/mm.h> 20 21 #include <asm/cputable.h> 21 22 #include <asm/current.h> 22 23 #include <asm/copro.h> ··· 39 38 { 40 39 int i; 41 40 42 - spin_lock_init(&ctx->sste_lock); 43 41 ctx->afu = afu; 44 42 ctx->master = master; 45 - ctx->pid = ctx->glpid = NULL; /* Set in start work ioctl */ 43 + ctx->pid = NULL; /* Set in start work ioctl */ 46 44 mutex_init(&ctx->mapping_lock); 47 45 ctx->mapping = NULL; 48 46 49 - /* 50 - * Allocate the segment table before we put it in the IDR so that we 51 - * can always access it when dereferenced from IDR. For the same 52 - * reason, the segment table is only destroyed after the context is 53 - * removed from the IDR. Access to this in the IOCTL is protected by 54 - * Linux filesytem symantics (can't IOCTL until open is complete). 55 - */ 56 - i = cxl_alloc_sst(ctx); 57 - if (i) 58 - return i; 47 + if (cxl_is_psl8(afu)) { 48 + spin_lock_init(&ctx->sste_lock); 49 + 50 + /* 51 + * Allocate the segment table before we put it in the IDR so that we 52 + * can always access it when dereferenced from IDR. For the same 53 + * reason, the segment table is only destroyed after the context is 54 + * removed from the IDR. Access to this in the IOCTL is protected by 55 + * Linux filesytem symantics (can't IOCTL until open is complete). 56 + */ 57 + i = cxl_alloc_sst(ctx); 58 + if (i) 59 + return i; 60 + } 59 61 60 62 INIT_WORK(&ctx->fault_work, cxl_handle_fault); 61 63 ··· 188 184 if (ctx->afu->current_mode == CXL_MODE_DEDICATED) { 189 185 if (start + len > ctx->afu->adapter->ps_size) 190 186 return -EINVAL; 187 + 188 + if (cxl_is_psl9(ctx->afu)) { 189 + /* 190 + * Make sure there is a valid problem state 191 + * area space for this AFU. 
192 + */ 193 + if (ctx->master && !ctx->afu->psa) { 194 + pr_devel("AFU doesn't support mmio space\n"); 195 + return -EINVAL; 196 + } 197 + 198 + /* Can't mmap until the AFU is enabled */ 199 + if (!ctx->afu->enabled) 200 + return -EBUSY; 201 + } 191 202 } else { 192 203 if (start + len > ctx->psn_size) 193 204 return -EINVAL; 194 - } 195 205 196 - if (ctx->afu->current_mode != CXL_MODE_DEDICATED) { 197 - /* make sure there is a valid per process space for this AFU */ 206 + /* Make sure there is a valid per process space for this AFU */ 198 207 if ((ctx->master && !ctx->afu->psa) || (!ctx->afu->pp_psa)) { 199 208 pr_devel("AFU doesn't support mmio space\n"); 200 209 return -EINVAL; ··· 259 242 260 243 /* release the reference to the group leader and mm handling pid */ 261 244 put_pid(ctx->pid); 262 - put_pid(ctx->glpid); 263 245 264 246 cxl_ctx_put(); 265 247 266 248 /* Decrease the attached context count on the adapter */ 267 249 cxl_adapter_context_put(ctx->afu->adapter); 250 + 251 + /* Decrease the mm count on the context */ 252 + cxl_context_mm_count_put(ctx); 253 + ctx->mm = NULL; 254 + 268 255 return 0; 269 256 } 270 257 ··· 324 303 { 325 304 struct cxl_context *ctx = container_of(rcu, struct cxl_context, rcu); 326 305 327 - free_page((u64)ctx->sstp); 306 + if (cxl_is_psl8(ctx->afu)) 307 + free_page((u64)ctx->sstp); 328 308 if (ctx->ff_page) 329 309 __free_page(ctx->ff_page); 330 310 ctx->sstp = NULL; ··· 346 324 idr_remove(&ctx->afu->contexts_idr, ctx->pe); 347 325 mutex_unlock(&ctx->afu->contexts_lock); 348 326 call_rcu(&ctx->rcu, reclaim_ctx); 327 + } 328 + 329 + void cxl_context_mm_count_get(struct cxl_context *ctx) 330 + { 331 + if (ctx->mm) 332 + atomic_inc(&ctx->mm->mm_count); 333 + } 334 + 335 + void cxl_context_mm_count_put(struct cxl_context *ctx) 336 + { 337 + if (ctx->mm) 338 + mmdrop(ctx->mm); 349 339 }
+206 -57
drivers/misc/cxl/cxl.h
··· 63 63 /* Memory maps. Ref CXL Appendix A */ 64 64 65 65 /* PSL Privilege 1 Memory Map */ 66 - /* Configuration and Control area */ 66 + /* Configuration and Control area - CAIA 1&2 */ 67 67 static const cxl_p1_reg_t CXL_PSL_CtxTime = {0x0000}; 68 68 static const cxl_p1_reg_t CXL_PSL_ErrIVTE = {0x0008}; 69 69 static const cxl_p1_reg_t CXL_PSL_KEY1 = {0x0010}; ··· 73 73 static const cxl_p1_reg_t CXL_PSL_DLCNTL = {0x0060}; 74 74 static const cxl_p1_reg_t CXL_PSL_DLADDR = {0x0068}; 75 75 76 - /* PSL Lookaside Buffer Management Area */ 76 + /* PSL Lookaside Buffer Management Area - CAIA 1 */ 77 77 static const cxl_p1_reg_t CXL_PSL_LBISEL = {0x0080}; 78 78 static const cxl_p1_reg_t CXL_PSL_SLBIE = {0x0088}; 79 79 static const cxl_p1_reg_t CXL_PSL_SLBIA = {0x0090}; ··· 82 82 static const cxl_p1_reg_t CXL_PSL_AFUSEL = {0x00B0}; 83 83 84 84 /* 0x00C0:7EFF Implementation dependent area */ 85 - /* PSL registers */ 85 + /* PSL registers - CAIA 1 */ 86 86 static const cxl_p1_reg_t CXL_PSL_FIR1 = {0x0100}; 87 87 static const cxl_p1_reg_t CXL_PSL_FIR2 = {0x0108}; 88 88 static const cxl_p1_reg_t CXL_PSL_Timebase = {0x0110}; ··· 98 98 static const cxl_p1_reg_t CXL_XSL_TB_CTLSTAT = {0x0108}; 99 99 static const cxl_p1_reg_t CXL_XSL_FEC = {0x0158}; 100 100 static const cxl_p1_reg_t CXL_XSL_DSNCTL = {0x0168}; 101 + /* PSL registers - CAIA 2 */ 102 + static const cxl_p1_reg_t CXL_PSL9_CONTROL = {0x0020}; 103 + static const cxl_p1_reg_t CXL_XSL9_DSNCTL = {0x0168}; 104 + static const cxl_p1_reg_t CXL_PSL9_FIR1 = {0x0300}; 105 + static const cxl_p1_reg_t CXL_PSL9_FIR2 = {0x0308}; 106 + static const cxl_p1_reg_t CXL_PSL9_Timebase = {0x0310}; 107 + static const cxl_p1_reg_t CXL_PSL9_DEBUG = {0x0320}; 108 + static const cxl_p1_reg_t CXL_PSL9_FIR_CNTL = {0x0348}; 109 + static const cxl_p1_reg_t CXL_PSL9_DSNDCTL = {0x0350}; 110 + static const cxl_p1_reg_t CXL_PSL9_TB_CTLSTAT = {0x0340}; 111 + static const cxl_p1_reg_t CXL_PSL9_TRACECFG = {0x0368}; 112 + static const cxl_p1_reg_t 
CXL_PSL9_APCDEDALLOC = {0x0378}; 113 + static const cxl_p1_reg_t CXL_PSL9_APCDEDTYPE = {0x0380}; 114 + static const cxl_p1_reg_t CXL_PSL9_TNR_ADDR = {0x0388}; 115 + static const cxl_p1_reg_t CXL_PSL9_GP_CT = {0x0398}; 116 + static const cxl_p1_reg_t CXL_XSL9_IERAT = {0x0588}; 117 + static const cxl_p1_reg_t CXL_XSL9_ILPP = {0x0590}; 118 + 101 119 /* 0x7F00:7FFF Reserved PCIe MSI-X Pending Bit Array area */ 102 120 /* 0x8000:FFFF Reserved PCIe MSI-X Table Area */ 103 121 104 122 /* PSL Slice Privilege 1 Memory Map */ 105 - /* Configuration Area */ 123 + /* Configuration Area - CAIA 1&2 */ 106 124 static const cxl_p1n_reg_t CXL_PSL_SR_An = {0x00}; 107 125 static const cxl_p1n_reg_t CXL_PSL_LPID_An = {0x08}; 108 126 static const cxl_p1n_reg_t CXL_PSL_AMBAR_An = {0x10}; 109 127 static const cxl_p1n_reg_t CXL_PSL_SPOffset_An = {0x18}; 110 128 static const cxl_p1n_reg_t CXL_PSL_ID_An = {0x20}; 111 129 static const cxl_p1n_reg_t CXL_PSL_SERR_An = {0x28}; 112 - /* Memory Management and Lookaside Buffer Management */ 130 + /* Memory Management and Lookaside Buffer Management - CAIA 1*/ 113 131 static const cxl_p1n_reg_t CXL_PSL_SDR_An = {0x30}; 132 + /* Memory Management and Lookaside Buffer Management - CAIA 1&2 */ 114 133 static const cxl_p1n_reg_t CXL_PSL_AMOR_An = {0x38}; 115 - /* Pointer Area */ 134 + /* Pointer Area - CAIA 1&2 */ 116 135 static const cxl_p1n_reg_t CXL_HAURP_An = {0x80}; 117 136 static const cxl_p1n_reg_t CXL_PSL_SPAP_An = {0x88}; 118 137 static const cxl_p1n_reg_t CXL_PSL_LLCMD_An = {0x90}; 119 - /* Control Area */ 138 + /* Control Area - CAIA 1&2 */ 120 139 static const cxl_p1n_reg_t CXL_PSL_SCNTL_An = {0xA0}; 121 140 static const cxl_p1n_reg_t CXL_PSL_CtxTime_An = {0xA8}; 122 141 static const cxl_p1n_reg_t CXL_PSL_IVTE_Offset_An = {0xB0}; 123 142 static const cxl_p1n_reg_t CXL_PSL_IVTE_Limit_An = {0xB8}; 124 - /* 0xC0:FF Implementation Dependent Area */ 143 + /* 0xC0:FF Implementation Dependent Area - CAIA 1&2 */ 125 144 static const cxl_p1n_reg_t 
CXL_PSL_FIR_SLICE_An = {0xC0}; 126 145 static const cxl_p1n_reg_t CXL_AFU_DEBUG_An = {0xC8}; 146 + /* 0xC0:FF Implementation Dependent Area - CAIA 1 */ 127 147 static const cxl_p1n_reg_t CXL_PSL_APCALLOC_A = {0xD0}; 128 148 static const cxl_p1n_reg_t CXL_PSL_COALLOC_A = {0xD8}; 129 149 static const cxl_p1n_reg_t CXL_PSL_RXCTL_A = {0xE0}; 130 150 static const cxl_p1n_reg_t CXL_PSL_SLICE_TRACE = {0xE8}; 131 151 132 152 /* PSL Slice Privilege 2 Memory Map */ 133 - /* Configuration and Control Area */ 153 + /* Configuration and Control Area - CAIA 1&2 */ 134 154 static const cxl_p2n_reg_t CXL_PSL_PID_TID_An = {0x000}; 135 155 static const cxl_p2n_reg_t CXL_CSRP_An = {0x008}; 156 + /* Configuration and Control Area - CAIA 1 */ 136 157 static const cxl_p2n_reg_t CXL_AURP0_An = {0x010}; 137 158 static const cxl_p2n_reg_t CXL_AURP1_An = {0x018}; 138 159 static const cxl_p2n_reg_t CXL_SSTP0_An = {0x020}; 139 160 static const cxl_p2n_reg_t CXL_SSTP1_An = {0x028}; 161 + /* Configuration and Control Area - CAIA 1 */ 140 162 static const cxl_p2n_reg_t CXL_PSL_AMR_An = {0x030}; 141 - /* Segment Lookaside Buffer Management */ 163 + /* Segment Lookaside Buffer Management - CAIA 1 */ 142 164 static const cxl_p2n_reg_t CXL_SLBIE_An = {0x040}; 143 165 static const cxl_p2n_reg_t CXL_SLBIA_An = {0x048}; 144 166 static const cxl_p2n_reg_t CXL_SLBI_Select_An = {0x050}; 145 - /* Interrupt Registers */ 167 + /* Interrupt Registers - CAIA 1&2 */ 146 168 static const cxl_p2n_reg_t CXL_PSL_DSISR_An = {0x060}; 147 169 static const cxl_p2n_reg_t CXL_PSL_DAR_An = {0x068}; 148 170 static const cxl_p2n_reg_t CXL_PSL_DSR_An = {0x070}; 149 171 static const cxl_p2n_reg_t CXL_PSL_TFC_An = {0x078}; 150 172 static const cxl_p2n_reg_t CXL_PSL_PEHandle_An = {0x080}; 151 173 static const cxl_p2n_reg_t CXL_PSL_ErrStat_An = {0x088}; 152 - /* AFU Registers */ 174 + /* AFU Registers - CAIA 1&2 */ 153 175 static const cxl_p2n_reg_t CXL_AFU_Cntl_An = {0x090}; 154 176 static const cxl_p2n_reg_t CXL_AFU_ERR_An = 
{0x098}; 155 - /* Work Element Descriptor */ 177 + /* Work Element Descriptor - CAIA 1&2 */ 156 178 static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0}; 157 179 /* 0x0C0:FFF Implementation Dependent Area */ 158 180 ··· 201 179 #define CXL_PSL_SR_An_SF MSR_SF /* 64bit */ 202 180 #define CXL_PSL_SR_An_TA (1ull << (63-1)) /* Tags active, GA1: 0 */ 203 181 #define CXL_PSL_SR_An_HV MSR_HV /* Hypervisor, GA1: 0 */ 182 + #define CXL_PSL_SR_An_XLAT_hpt (0ull << (63-6))/* Hashed page table (HPT) mode */ 183 + #define CXL_PSL_SR_An_XLAT_roh (2ull << (63-6))/* Radix on HPT mode */ 184 + #define CXL_PSL_SR_An_XLAT_ror (3ull << (63-6))/* Radix on Radix mode */ 185 + #define CXL_PSL_SR_An_BOT (1ull << (63-10)) /* Use the in-memory segment table */ 204 186 #define CXL_PSL_SR_An_PR MSR_PR /* Problem state, GA1: 1 */ 205 187 #define CXL_PSL_SR_An_ISL (1ull << (63-53)) /* Ignore Segment Large Page */ 206 188 #define CXL_PSL_SR_An_TC (1ull << (63-54)) /* Page Table secondary hash */ ··· 228 202 #define CXL_PSL_SERR_An_llcmdto (1ull << (63-6)) 229 203 #define CXL_PSL_SERR_An_afupar (1ull << (63-7)) 230 204 #define CXL_PSL_SERR_An_afudup (1ull << (63-8)) 205 + #define CXL_PSL_SERR_An_IRQS ( \ 206 + CXL_PSL_SERR_An_afuto | CXL_PSL_SERR_An_afudis | CXL_PSL_SERR_An_afuov | \ 207 + CXL_PSL_SERR_An_badsrc | CXL_PSL_SERR_An_badctx | CXL_PSL_SERR_An_llcmdis | \ 208 + CXL_PSL_SERR_An_llcmdto | CXL_PSL_SERR_An_afupar | CXL_PSL_SERR_An_afudup) 209 + #define CXL_PSL_SERR_An_afuto_mask (1ull << (63-32)) 210 + #define CXL_PSL_SERR_An_afudis_mask (1ull << (63-33)) 211 + #define CXL_PSL_SERR_An_afuov_mask (1ull << (63-34)) 212 + #define CXL_PSL_SERR_An_badsrc_mask (1ull << (63-35)) 213 + #define CXL_PSL_SERR_An_badctx_mask (1ull << (63-36)) 214 + #define CXL_PSL_SERR_An_llcmdis_mask (1ull << (63-37)) 215 + #define CXL_PSL_SERR_An_llcmdto_mask (1ull << (63-38)) 216 + #define CXL_PSL_SERR_An_afupar_mask (1ull << (63-39)) 217 + #define CXL_PSL_SERR_An_afudup_mask (1ull << (63-40)) 218 + #define 
CXL_PSL_SERR_An_IRQ_MASKS ( \ 219 + CXL_PSL_SERR_An_afuto_mask | CXL_PSL_SERR_An_afudis_mask | CXL_PSL_SERR_An_afuov_mask | \ 220 + CXL_PSL_SERR_An_badsrc_mask | CXL_PSL_SERR_An_badctx_mask | CXL_PSL_SERR_An_llcmdis_mask | \ 221 + CXL_PSL_SERR_An_llcmdto_mask | CXL_PSL_SERR_An_afupar_mask | CXL_PSL_SERR_An_afudup_mask) 222 + 231 223 #define CXL_PSL_SERR_An_AE (1ull << (63-30)) 232 224 233 225 /****** CXL_PSL_SCNTL_An ****************************************************/ ··· 301 257 #define CXL_SSTP1_An_STVA_L_MASK (~((1ull << (63-55))-1)) 302 258 #define CXL_SSTP1_An_V (1ull << (63-63)) 303 259 304 - /****** CXL_PSL_SLBIE_[An] **************************************************/ 260 + /****** CXL_PSL_SLBIE_[An] - CAIA 1 **************************************************/ 305 261 /* write: */ 306 262 #define CXL_SLBIE_C PPC_BIT(36) /* Class */ 307 263 #define CXL_SLBIE_SS PPC_BITMASK(37, 38) /* Segment Size */ ··· 311 267 #define CXL_SLBIE_MAX PPC_BITMASK(24, 31) 312 268 #define CXL_SLBIE_PENDING PPC_BITMASK(56, 63) 313 269 314 - /****** Common to all CXL_TLBIA/SLBIA_[An] **********************************/ 270 + /****** Common to all CXL_TLBIA/SLBIA_[An] - CAIA 1 **********************************/ 315 271 #define CXL_TLB_SLB_P (1ull) /* Pending (read) */ 316 272 317 - /****** Common to all CXL_TLB/SLB_IA/IE_[An] registers **********************/ 273 + /****** Common to all CXL_TLB/SLB_IA/IE_[An] registers - CAIA 1 **********************/ 318 274 #define CXL_TLB_SLB_IQ_ALL (0ull) /* Inv qualifier */ 319 275 #define CXL_TLB_SLB_IQ_LPID (1ull) /* Inv qualifier */ 320 276 #define CXL_TLB_SLB_IQ_LPIDPID (3ull) /* Inv qualifier */ ··· 322 278 /****** CXL_PSL_AFUSEL ******************************************************/ 323 279 #define CXL_PSL_AFUSEL_A (1ull << (63-55)) /* Adapter wide invalidates affect all AFUs */ 324 280 325 - /****** CXL_PSL_DSISR_An ****************************************************/ 281 + /****** CXL_PSL_DSISR_An - CAIA 1 
****************************************************/ 326 282 #define CXL_PSL_DSISR_An_DS (1ull << (63-0)) /* Segment not found */ 327 283 #define CXL_PSL_DSISR_An_DM (1ull << (63-1)) /* PTE not found (See also: M) or protection fault */ 328 284 #define CXL_PSL_DSISR_An_ST (1ull << (63-2)) /* Segment Table PTE not found */ ··· 339 295 #define CXL_PSL_DSISR_An_S DSISR_ISSTORE /* Access was afu_wr or afu_zero */ 340 296 #define CXL_PSL_DSISR_An_K DSISR_KEYFAULT /* Access not permitted by virtual page class key protection */ 341 297 298 + /****** CXL_PSL_DSISR_An - CAIA 2 ****************************************************/ 299 + #define CXL_PSL9_DSISR_An_TF (1ull << (63-3)) /* Translation fault */ 300 + #define CXL_PSL9_DSISR_An_PE (1ull << (63-4)) /* PSL Error (implementation specific) */ 301 + #define CXL_PSL9_DSISR_An_AE (1ull << (63-5)) /* AFU Error */ 302 + #define CXL_PSL9_DSISR_An_OC (1ull << (63-6)) /* OS Context Warning */ 303 + #define CXL_PSL9_DSISR_An_S (1ull << (63-38)) /* TF for a write operation */ 304 + #define CXL_PSL9_DSISR_PENDING (CXL_PSL9_DSISR_An_TF | CXL_PSL9_DSISR_An_PE | CXL_PSL9_DSISR_An_AE | CXL_PSL9_DSISR_An_OC) 305 + /* 306 + * NOTE: Bits 56:63 (Checkout Response Status) are valid when DSISR_An[TF] = 1 307 + * Status (0:7) Encoding 308 + */ 309 + #define CXL_PSL9_DSISR_An_CO_MASK 0x00000000000000ffULL 310 + #define CXL_PSL9_DSISR_An_SF 0x0000000000000080ULL /* Segment Fault 0b10000000 */ 311 + #define CXL_PSL9_DSISR_An_PF_SLR 0x0000000000000088ULL /* PTE not found (Single Level Radix) 0b10001000 */ 312 + #define CXL_PSL9_DSISR_An_PF_RGC 0x000000000000008CULL /* PTE not found (Radix Guest (child)) 0b10001100 */ 313 + #define CXL_PSL9_DSISR_An_PF_RGP 0x0000000000000090ULL /* PTE not found (Radix Guest (parent)) 0b10010000 */ 314 + #define CXL_PSL9_DSISR_An_PF_HRH 0x0000000000000094ULL /* PTE not found (HPT/Radix Host) 0b10010100 */ 315 + #define CXL_PSL9_DSISR_An_PF_STEG 0x000000000000009CULL /* PTE not found (STEG VA) 0b10011100 */ 316 + 
342 317 /****** CXL_PSL_TFC_An ******************************************************/ 343 318 #define CXL_PSL_TFC_An_A (1ull << (63-28)) /* Acknowledge non-translation fault */ 344 319 #define CXL_PSL_TFC_An_C (1ull << (63-29)) /* Continue (abort transaction) */ 345 320 #define CXL_PSL_TFC_An_AE (1ull << (63-30)) /* Restart PSL with address error */ 346 321 #define CXL_PSL_TFC_An_R (1ull << (63-31)) /* Restart PSL transaction */ 322 + 323 + /****** CXL_XSL9_IERAT_ERAT - CAIA 2 **********************************/ 324 + #define CXL_XSL9_IERAT_MLPID (1ull << (63-0)) /* Match LPID */ 325 + #define CXL_XSL9_IERAT_MPID (1ull << (63-1)) /* Match PID */ 326 + #define CXL_XSL9_IERAT_PRS (1ull << (63-4)) /* PRS bit for Radix invalidations */ 327 + #define CXL_XSL9_IERAT_INVR (1ull << (63-3)) /* Invalidate Radix */ 328 + #define CXL_XSL9_IERAT_IALL (1ull << (63-8)) /* Invalidate All */ 329 + #define CXL_XSL9_IERAT_IINPROG (1ull << (63-63)) /* Invalidate in progress */ 347 330 348 331 /* cxl_process_element->software_status */ 349 332 #define CXL_PE_SOFTWARE_STATE_V (1ul << (31 - 0)) /* Valid */ ··· 553 482 unsigned int sst_size, sst_lru; 554 483 555 484 wait_queue_head_t wq; 556 - /* pid of the group leader associated with the pid */ 557 - struct pid *glpid; 558 485 /* use mm context associated with this pid for ds faults */ 559 486 struct pid *pid; 560 487 spinlock_t lock; /* Protects pending_irq_mask, pending_fault and fault_addr */ ··· 620 551 * CX4 only: 621 552 */ 622 553 struct list_head extra_irq_contexts; 554 + 555 + struct mm_struct *mm; 623 556 }; 557 + 558 + struct cxl_irq_info; 624 559 625 560 struct cxl_service_layer_ops { 626 561 int (*adapter_regs_init)(struct cxl *adapter, struct pci_dev *dev); 562 + int (*invalidate_all)(struct cxl *adapter); 627 563 int (*afu_regs_init)(struct cxl_afu *afu); 564 + int (*sanitise_afu_regs)(struct cxl_afu *afu); 628 565 int (*register_serr_irq)(struct cxl_afu *afu); 629 566 void (*release_serr_irq)(struct cxl_afu *afu); 630 - 
void (*debugfs_add_adapter_sl_regs)(struct cxl *adapter, struct dentry *dir); 631 - void (*debugfs_add_afu_sl_regs)(struct cxl_afu *afu, struct dentry *dir); 567 + irqreturn_t (*handle_interrupt)(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info); 568 + irqreturn_t (*fail_irq)(struct cxl_afu *afu, struct cxl_irq_info *irq_info); 569 + int (*activate_dedicated_process)(struct cxl_afu *afu); 570 + int (*attach_afu_directed)(struct cxl_context *ctx, u64 wed, u64 amr); 571 + int (*attach_dedicated_process)(struct cxl_context *ctx, u64 wed, u64 amr); 572 + void (*update_dedicated_ivtes)(struct cxl_context *ctx); 573 + void (*debugfs_add_adapter_regs)(struct cxl *adapter, struct dentry *dir); 574 + void (*debugfs_add_afu_regs)(struct cxl_afu *afu, struct dentry *dir); 632 575 void (*psl_irq_dump_registers)(struct cxl_context *ctx); 633 576 void (*err_irq_dump_registers)(struct cxl *adapter); 634 577 void (*debugfs_stop_trace)(struct cxl *adapter); ··· 722 641 void cxl_pci_release_afu(struct device *dev); 723 642 ssize_t cxl_pci_read_adapter_vpd(struct cxl *adapter, void *buf, size_t len); 724 643 725 - /* common == phyp + powernv */ 644 + /* common == phyp + powernv - CAIA 1&2 */ 726 645 struct cxl_process_element_common { 727 646 __be32 tid; 728 647 __be32 pid; 729 648 __be64 csrp; 730 - __be64 aurp0; 731 - __be64 aurp1; 732 - __be64 sstp0; 733 - __be64 sstp1; 649 + union { 650 + struct { 651 + __be64 aurp0; 652 + __be64 aurp1; 653 + __be64 sstp0; 654 + __be64 sstp1; 655 + } psl8; /* CAIA 1 */ 656 + struct { 657 + u8 reserved2[8]; 658 + u8 reserved3[8]; 659 + u8 reserved4[8]; 660 + u8 reserved5[8]; 661 + } psl9; /* CAIA 2 */ 662 + } u; 734 663 __be64 amr; 735 - u8 reserved3[4]; 664 + u8 reserved6[4]; 736 665 __be64 wed; 737 666 } __packed; 738 667 739 - /* just powernv */ 668 + /* just powernv - CAIA 1&2 */ 740 669 struct cxl_process_element { 741 670 __be64 sr; 742 671 __be64 SPOffset; 743 - __be64 sdr; 672 + union { 673 + __be64 sdr; /* CAIA 1 */ 674 + 
u8 reserved1[8]; /* CAIA 2 */ 675 + } u; 744 676 __be64 haurp; 745 677 __be32 ctxtime; 746 678 __be16 ivte_offsets[4]; ··· 833 739 return ~0ULL; 834 740 } 835 741 742 + static inline bool cxl_is_power8(void) 743 + { 744 + if ((pvr_version_is(PVR_POWER8E)) || 745 + (pvr_version_is(PVR_POWER8NVL)) || 746 + (pvr_version_is(PVR_POWER8))) 747 + return true; 748 + return false; 749 + } 750 + 751 + static inline bool cxl_is_power9(void) 752 + { 753 + /* intermediate solution */ 754 + if (!cxl_is_power8() && 755 + (cpu_has_feature(CPU_FTRS_POWER9) || 756 + cpu_has_feature(CPU_FTR_POWER9_DD1))) 757 + return true; 758 + return false; 759 + } 760 + 761 + static inline bool cxl_is_psl8(struct cxl_afu *afu) 762 + { 763 + if (afu->adapter->caia_major == 1) 764 + return true; 765 + return false; 766 + } 767 + 768 + static inline bool cxl_is_psl9(struct cxl_afu *afu) 769 + { 770 + if (afu->adapter->caia_major == 2) 771 + return true; 772 + return false; 773 + } 774 + 836 775 ssize_t cxl_pci_afu_read_err_buffer(struct cxl_afu *afu, char *buf, 837 776 loff_t off, size_t count); 838 777 ··· 892 765 893 766 void cxl_remove_adapter_nr(struct cxl *adapter); 894 767 895 - int cxl_alloc_spa(struct cxl_afu *afu); 896 768 void cxl_release_spa(struct cxl_afu *afu); 897 769 898 770 dev_t cxl_get_dev(void); ··· 929 803 void afu_release_irqs(struct cxl_context *ctx, void *cookie); 930 804 void afu_irq_name_free(struct cxl_context *ctx); 931 805 806 + int cxl_attach_afu_directed_psl9(struct cxl_context *ctx, u64 wed, u64 amr); 807 + int cxl_attach_afu_directed_psl8(struct cxl_context *ctx, u64 wed, u64 amr); 808 + int cxl_activate_dedicated_process_psl9(struct cxl_afu *afu); 809 + int cxl_activate_dedicated_process_psl8(struct cxl_afu *afu); 810 + int cxl_attach_dedicated_process_psl9(struct cxl_context *ctx, u64 wed, u64 amr); 811 + int cxl_attach_dedicated_process_psl8(struct cxl_context *ctx, u64 wed, u64 amr); 812 + void cxl_update_dedicated_ivtes_psl9(struct cxl_context *ctx); 813 + void 
+ void cxl_update_dedicated_ivtes_psl8(struct cxl_context *ctx);
+
  #ifdef CONFIG_DEBUG_FS

  int cxl_debugfs_init(void);
···
  void cxl_debugfs_adapter_remove(struct cxl *adapter);
  int cxl_debugfs_afu_add(struct cxl_afu *afu);
  void cxl_debugfs_afu_remove(struct cxl_afu *afu);
- void cxl_stop_trace(struct cxl *cxl);
- void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir);
- void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir);
- void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir);
+ void cxl_stop_trace_psl9(struct cxl *cxl);
+ void cxl_stop_trace_psl8(struct cxl *cxl);
+ void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir);
+ void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir);
+ void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir);
+ void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir);
+ void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir);

  #else /* CONFIG_DEBUG_FS */
···
  {
  }

- static inline void cxl_stop_trace(struct cxl *cxl)
+ static inline void cxl_stop_trace_psl9(struct cxl *cxl)
  {
  }

- static inline void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter,
+ static inline void cxl_stop_trace_psl8(struct cxl *cxl)
+ {
+ }
+
+ static inline void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter,
  					struct dentry *dir)
  {
  }

- static inline void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter,
+ static inline void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter,
  					struct dentry *dir)
  {
  }

- static inline void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir)
+ static inline void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter,
+ 					struct dentry *dir)
+ {
+ }
+
+ static inline void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir)
+ {
+ }
+
+ static inline void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir)
  {
  }
···
  /*
   * This must match the layout of the H_COLLECT_CA_INT_INFO retbuf defined
   * in PAPR.
-  * A word about endianness: a pointer to this structure is passed when
-  * calling the hcall. However, it is not a block of memory filled up by
-  * the hypervisor. The return values are found in registers, and copied
-  * one by one when returning from the hcall. See the end of the call to
-  * plpar_hcall9() in hvCall.S
-  * As a consequence:
-  * - we don't need to do any endianness conversion
-  * - the pid and tid are an exception. They are 32-bit values returned in
-  *   the same 64-bit register. So we do need to worry about byte ordering.
+  * Field pid_tid is now 'reserved' because it's no more used on bare-metal.
+  * On a guest environment, PSL_PID_An is located on the upper 32 bits and
+  * PSL_TID_An register in the lower 32 bits.
   */
  struct cxl_irq_info {
  	u64 dsisr;
  	u64 dar;
  	u64 dsr;
- #ifndef CONFIG_CPU_LITTLE_ENDIAN
- 	u32 pid;
- 	u32 tid;
- #else
- 	u32 tid;
- 	u32 pid;
- #endif
+ 	u64 reserved;
  	u64 afu_err;
  	u64 errstat;
  	u64 proc_handle;
···
  };

  void cxl_assign_psn_space(struct cxl_context *ctx);
- irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+ int cxl_invalidate_all_psl9(struct cxl *adapter);
+ int cxl_invalidate_all_psl8(struct cxl *adapter);
+ irqreturn_t cxl_irq_psl9(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+ irqreturn_t cxl_irq_psl8(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+ irqreturn_t cxl_fail_irq_psl(struct cxl_afu *afu, struct cxl_irq_info *irq_info);
  int cxl_register_one_irq(struct cxl *adapter, irq_handler_t handler,
  			void *cookie, irq_hw_number_t *dest_hwirq,
  			unsigned int *dest_virq, const char *name);

  int cxl_check_error(struct cxl_afu *afu);
  int cxl_afu_slbia(struct cxl_afu *afu);
- int cxl_tlb_slb_invalidate(struct cxl *adapter);
  int cxl_data_cache_flush(struct cxl *adapter);
  int cxl_afu_disable(struct cxl_afu *afu);
  int cxl_psl_purge(struct cxl_afu *afu);

- void cxl_native_psl_irq_dump_regs(struct cxl_context *ctx);
+ void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx);
+ void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx);
  void cxl_native_err_irq_dump_regs(struct cxl *adapter);
  int cxl_pci_vphb_add(struct cxl_afu *afu);
  void cxl_pci_vphb_remove(struct cxl_afu *afu);
···
  /* Unlock the contexts-lock if taken. Warn and force unlock otherwise */
  void cxl_adapter_context_unlock(struct cxl *adapter);
+
+ /* Increases the reference count to "struct mm_struct" */
+ void cxl_context_mm_count_get(struct cxl_context *ctx);
+
+ /* Decrements the reference count to "struct mm_struct" */
+ void cxl_context_mm_count_put(struct cxl_context *ctx);

  #endif
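The reworked `struct cxl_irq_info` comment above notes that, in a guest, the hypervisor returns PSL_PID_An in the upper 32 bits and PSL_TID_An in the lower 32 bits of the single 64-bit word now held in the `reserved` field. A minimal user-space sketch of that split (the helper names are illustrative, not part of the driver):

```c
#include <stdint.h>

/* Hypothetical helpers, not driver code: split the packed 64-bit
 * pid_tid value, PID in the upper 32 bits, TID in the lower 32 bits. */
static inline uint32_t pid_from_pid_tid(uint64_t pid_tid)
{
	return (uint32_t)(pid_tid >> 32);
}

static inline uint32_t tid_from_pid_tid(uint64_t pid_tid)
{
	return (uint32_t)(pid_tid & 0xffffffffu);
}
```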
+31 -10
drivers/misc/cxl/debugfs.c
···

  static struct dentry *cxl_debugfs;

- void cxl_stop_trace(struct cxl *adapter)
+ void cxl_stop_trace_psl9(struct cxl *adapter)
+ {
+ 	/* Stop the trace */
+ 	cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x4480000000000000ULL);
+ }
+
+ void cxl_stop_trace_psl8(struct cxl *adapter)
  {
  	int slice;

···
  				(void __force *)value, &fops_io_x64);
  }

- void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir)
+ void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir)
+ {
+ 	debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR1));
+ 	debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR2));
+ 	debugfs_create_io_x64("fir_cntl", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR_CNTL));
+ 	debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_TRACECFG));
+ }
+
+ void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir)
  {
  	debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_FIR1));
  	debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_FIR2));
···
  	debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_TRACE));
  }

- void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir)
+ void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir)
  {
  	debugfs_create_io_x64("fec", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_XSL_FEC));
  }
···
  	debugfs_create_io_x64("err_ivte", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_ErrIVTE));

- 	if (adapter->native->sl_ops->debugfs_add_adapter_sl_regs)
- 		adapter->native->sl_ops->debugfs_add_adapter_sl_regs(adapter, dir);
+ 	if (adapter->native->sl_ops->debugfs_add_adapter_regs)
+ 		adapter->native->sl_ops->debugfs_add_adapter_regs(adapter, dir);
  	return 0;
  }
···
  	debugfs_remove_recursive(adapter->debugfs);
  }

- void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir)
+ void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir)
  {
+ 	debugfs_create_io_x64("serr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SERR_An));
+ }
+
+ void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir)
+ {
+ 	debugfs_create_io_x64("sstp0", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP0_An));
+ 	debugfs_create_io_x64("sstp1", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP1_An));
+
  	debugfs_create_io_x64("fir", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_FIR_SLICE_An));
  	debugfs_create_io_x64("serr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SERR_An));
  	debugfs_create_io_x64("afu_debug", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_AFU_DEBUG_An));
···
  	debugfs_create_io_x64("sr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SR_An));
  	debugfs_create_io_x64("dsisr", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_DSISR_An));
  	debugfs_create_io_x64("dar", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_DAR_An));
- 	debugfs_create_io_x64("sstp0", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP0_An));
- 	debugfs_create_io_x64("sstp1", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP1_An));
+
  	debugfs_create_io_x64("err_status", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_ErrStat_An));

- 	if (afu->adapter->native->sl_ops->debugfs_add_afu_sl_regs)
- 		afu->adapter->native->sl_ops->debugfs_add_afu_sl_regs(afu, dir);
+ 	if (afu->adapter->native->sl_ops->debugfs_add_afu_regs)
+ 		afu->adapter->native->sl_ops->debugfs_add_afu_regs(afu, dir);

  	return 0;
  }
+46 -88
drivers/misc/cxl/fault.c
···
  		return cxl_ack_ae(ctx);
  	}

- 	/*
- 	 * update_mmu_cache() will not have loaded the hash since current->trap
- 	 * is not a 0x400 or 0x300, so just call hash_page_mm() here.
- 	 */
- 	access = _PAGE_PRESENT | _PAGE_READ;
- 	if (dsisr & CXL_PSL_DSISR_An_S)
- 		access |= _PAGE_WRITE;
+ 	if (!radix_enabled()) {
+ 		/*
+ 		 * update_mmu_cache() will not have loaded the hash since current->trap
+ 		 * is not a 0x400 or 0x300, so just call hash_page_mm() here.
+ 		 */
+ 		access = _PAGE_PRESENT | _PAGE_READ;
+ 		if (dsisr & CXL_PSL_DSISR_An_S)
+ 			access |= _PAGE_WRITE;

- 	access |= _PAGE_PRIVILEGED;
- 	if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID))
- 		access &= ~_PAGE_PRIVILEGED;
+ 		access |= _PAGE_PRIVILEGED;
+ 		if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID))
+ 			access &= ~_PAGE_PRIVILEGED;

- 	if (dsisr & DSISR_NOHPTE)
- 		inv_flags |= HPTE_NOHPTE_UPDATE;
+ 		if (dsisr & DSISR_NOHPTE)
+ 			inv_flags |= HPTE_NOHPTE_UPDATE;

- 	local_irq_save(flags);
- 	hash_page_mm(mm, dar, access, 0x300, inv_flags);
- 	local_irq_restore(flags);
-
+ 		local_irq_save(flags);
+ 		hash_page_mm(mm, dar, access, 0x300, inv_flags);
+ 		local_irq_restore(flags);
+ 	}
  	pr_devel("Page fault successfully handled for pe: %i!\n", ctx->pe);
  	cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_R, 0);
  }

  /*
- * Returns the mm_struct corresponding to the context ctx via ctx->pid
- * In case the task has exited we use the task group leader accessible
- * via ctx->glpid to find the next task in the thread group that has a
- * valid mm_struct associated with it. If a task with valid mm_struct
- * is found the ctx->pid is updated to use the task struct for subsequent
- * translations. In case no valid mm_struct is found in the task group to
- * service the fault a NULL is returned.
+ * Returns the mm_struct corresponding to the context ctx.
+ * mm_users == 0, the context may be in the process of being closed.
  */
  static struct mm_struct *get_mem_context(struct cxl_context *ctx)
  {
- 	struct task_struct *task = NULL;
- 	struct mm_struct *mm = NULL;
- 	struct pid *old_pid = ctx->pid;
-
- 	if (old_pid == NULL) {
- 		pr_warn("%s: Invalid context for pe=%d\n",
- 			__func__, ctx->pe);
+ 	if (ctx->mm == NULL)
  		return NULL;
- 	}

- 	task = get_pid_task(old_pid, PIDTYPE_PID);
+ 	if (!atomic_inc_not_zero(&ctx->mm->mm_users))
+ 		return NULL;

- 	/*
- 	 * pid_alive may look racy but this saves us from costly
- 	 * get_task_mm when the task is a zombie. In worst case
- 	 * we may think a task is alive, which is about to die
- 	 * but get_task_mm will return NULL.
- 	 */
- 	if (task != NULL && pid_alive(task))
- 		mm = get_task_mm(task);
-
- 	/* release the task struct that was taken earlier */
- 	if (task)
- 		put_task_struct(task);
- 	else
- 		pr_devel("%s: Context owning pid=%i for pe=%i dead\n",
- 			__func__, pid_nr(old_pid), ctx->pe);
-
- 	/*
- 	 * If we couldn't find the mm context then use the group
- 	 * leader to iterate over the task group and find a task
- 	 * that gives us mm_struct.
- 	 */
- 	if (unlikely(mm == NULL && ctx->glpid != NULL)) {
-
- 		rcu_read_lock();
- 		task = pid_task(ctx->glpid, PIDTYPE_PID);
- 		if (task)
- 			do {
- 				mm = get_task_mm(task);
- 				if (mm) {
- 					ctx->pid = get_task_pid(task,
- 							PIDTYPE_PID);
- 					break;
- 				}
- 				task = next_thread(task);
- 			} while (task && !thread_group_leader(task));
- 		rcu_read_unlock();
-
- 		/* check if we switched pid */
- 		if (ctx->pid != old_pid) {
- 			if (mm)
- 				pr_devel("%s:pe=%i switch pid %i->%i\n",
- 					__func__, ctx->pe, pid_nr(old_pid),
- 					pid_nr(ctx->pid));
- 			else
- 				pr_devel("%s:Cannot find mm for pid=%i\n",
- 					__func__, pid_nr(old_pid));
-
- 			/* drop the reference to older pid */
- 			put_pid(old_pid);
- 		}
- 	}
-
- 	return mm;
+ 	return ctx->mm;
  }

+ static bool cxl_is_segment_miss(struct cxl_context *ctx, u64 dsisr)
+ {
+ 	if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DS))
+ 		return true;
+
+ 	return false;
+ }
+
+ static bool cxl_is_page_fault(struct cxl_context *ctx, u64 dsisr)
+ {
+ 	if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DM))
+ 		return true;
+
+ 	if ((cxl_is_psl9(ctx->afu)) &&
+ 	    ((dsisr & CXL_PSL9_DSISR_An_CO_MASK) &
+ 	     (CXL_PSL9_DSISR_An_PF_SLR | CXL_PSL9_DSISR_An_PF_RGC |
+ 	      CXL_PSL9_DSISR_An_PF_RGP | CXL_PSL9_DSISR_An_PF_HRH |
+ 	      CXL_PSL9_DSISR_An_PF_STEG)))
+ 		return true;
+
+ 	return false;
+ }

  void cxl_handle_fault(struct work_struct *fault_work)
  {
···
  	if (!ctx->kernel) {

  		mm = get_mem_context(ctx);
- 		/* indicates all the thread in task group have exited */
  		if (mm == NULL) {
  			pr_devel("%s: unable to get mm for pe=%d pid=%i\n",
  				 __func__, ctx->pe, pid_nr(ctx->pid));
···
  		}
  	}

- 	if (dsisr & CXL_PSL_DSISR_An_DS)
+ 	if (cxl_is_segment_miss(ctx, dsisr))
  		cxl_handle_segment_miss(ctx, mm, dar);
- 	else if (dsisr & CXL_PSL_DSISR_An_DM)
+ 	else if (cxl_is_page_fault(ctx, dsisr))
  		cxl_handle_page_fault(ctx, mm, dsisr, dar);
  	else
  		WARN(1, "cxl_handle_fault has nothing to handle\n");
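The simplified `get_mem_context()` above relies on `atomic_inc_not_zero(&mm->mm_users)`: a reference is taken only if the count has not already dropped to zero, i.e. the mm is not mid-teardown. A self-contained user-space model of that pattern, using C11 atomics (illustrative only, not the kernel's implementation):

```c
#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative model of the atomic_inc_not_zero() pattern used by
 * get_mem_context(): bump the count only if it is still nonzero. */
static bool inc_not_zero(atomic_int *count)
{
	int old = atomic_load(count);

	while (old != 0) {
		/* Try old -> old + 1; on failure 'old' is reloaded. */
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return true;
	}
	return false;	/* count hit zero: too late to take a reference */
}

/* Tiny self-check for the sketch. */
static bool demo_inc_not_zero(void)
{
	atomic_int live = 2, dead = 0;

	return inc_not_zero(&live) && atomic_load(&live) == 3 &&
	       !inc_not_zero(&dead) && atomic_load(&dead) == 0;
}
```

The compare-and-swap loop is what makes the "not zero" check and the increment one atomic step; a plain load-test-increment would race with the final `mmput()`.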
+12 -3
drivers/misc/cxl/file.c
···
  #include <linux/fs.h>
  #include <linux/mm.h>
  #include <linux/slab.h>
+ #include <linux/sched/mm.h>
  #include <asm/cputable.h>
  #include <asm/current.h>
  #include <asm/copro.h>
···
  	 * process is still accessible.
  	 */
  	ctx->pid = get_task_pid(current, PIDTYPE_PID);
- 	ctx->glpid = get_task_pid(current->group_leader, PIDTYPE_PID);

+ 	/* acquire a reference to the task's mm */
+ 	ctx->mm = get_task_mm(current);
+
+ 	/* ensure this mm_struct can't be freed */
+ 	cxl_context_mm_count_get(ctx);
+
+ 	/* decrement the use count */
+ 	if (ctx->mm)
+ 		mmput(ctx->mm);

  	trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr);

···
  			amr))) {
  		afu_release_irqs(ctx, ctx);
  		cxl_adapter_context_put(ctx->afu->adapter);
- 		put_pid(ctx->glpid);
  		put_pid(ctx->pid);
- 		ctx->glpid = ctx->pid = NULL;
+ 		ctx->pid = NULL;
+ 		cxl_context_mm_count_put(ctx);
  		goto out;
  	}
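The attach path above juggles the mm's two counters: `get_task_mm()` briefly raises the use count (`mm_users`), `cxl_context_mm_count_get()` takes a long-lived structure reference (`mm_count`), and `mmput()` drops the use count again, so the address space can die while the `mm_struct` itself stays valid. A toy model of that sequence (plain ints, names illustrative, no concurrency):

```c
#include <stdbool.h>

/* Toy model of an mm's two counters: 'users' keeps the address space
 * alive, 'count' keeps only the mm_struct allocation alive. */
struct mm_model {
	int users;	/* like mm_users */
	int count;	/* like mm_count */
};

static void attach(struct mm_model *mm)
{
	mm->users++;	/* get_task_mm(): pin the address space briefly */
	mm->count++;	/* long-lived reference to the struct itself */
	mm->users--;	/* mmput(): address space may go, struct stays */
}

/* Self-check: after attach, users is unchanged and count is pinned. */
static bool attach_leaves_struct_pinned(void)
{
	struct mm_model mm = { .users = 1, .count = 1 };

	attach(&mm);
	return mm.users == 1 && mm.count == 2;
}
```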
+5 -5
drivers/misc/cxl/guest.c
···
  		return IRQ_HANDLED;
  	}

- 	rc = cxl_irq(irq, ctx, &irq_info);
+ 	rc = cxl_irq_psl8(irq, ctx, &irq_info);
  	return rc;
  }
···
  	elem->common.tid = cpu_to_be32(0); /* Unused */
  	elem->common.pid = cpu_to_be32(pid);
  	elem->common.csrp = cpu_to_be64(0); /* disable */
- 	elem->common.aurp0 = cpu_to_be64(0); /* disable */
- 	elem->common.aurp1 = cpu_to_be64(0); /* disable */
+ 	elem->common.u.psl8.aurp0 = cpu_to_be64(0); /* disable */
+ 	elem->common.u.psl8.aurp1 = cpu_to_be64(0); /* disable */

  	cxl_prefault(ctx, wed);

- 	elem->common.sstp0 = cpu_to_be64(ctx->sstp0);
- 	elem->common.sstp1 = cpu_to_be64(ctx->sstp1);
+ 	elem->common.u.psl8.sstp0 = cpu_to_be64(ctx->sstp0);
+ 	elem->common.u.psl8.sstp1 = cpu_to_be64(ctx->sstp1);

  	/*
  	 * Ensure we have at least one interrupt allocated to take faults for
+3 -3
drivers/misc/cxl/hcalls.c
···
  	switch (rc) {
  	case H_SUCCESS:     /* The interrupt info is returned in return registers. */
- 		pr_devel("dsisr:%#llx, dar:%#llx, dsr:%#llx, pid:%u, tid:%u, afu_err:%#llx, errstat:%#llx\n",
- 			info->dsisr, info->dar, info->dsr, info->pid,
- 			info->tid, info->afu_err, info->errstat);
+ 		pr_devel("dsisr:%#llx, dar:%#llx, dsr:%#llx, pid_tid:%#llx, afu_err:%#llx, errstat:%#llx\n",
+ 			info->dsisr, info->dar, info->dsr, info->reserved,
+ 			info->afu_err, info->errstat);
  		return 0;
  	case H_PARAMETER:   /* An incorrect parameter was supplied. */
  		return -EINVAL;
+52 -1
drivers/misc/cxl/irq.c
···
  	return IRQ_HANDLED;
  }

- irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
+ irqreturn_t cxl_irq_psl9(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
+ {
+ 	u64 dsisr, dar;
+
+ 	dsisr = irq_info->dsisr;
+ 	dar = irq_info->dar;
+
+ 	trace_cxl_psl9_irq(ctx, irq, dsisr, dar);
+
+ 	pr_devel("CXL interrupt %i for afu pe: %i DSISR: %#llx DAR: %#llx\n", irq, ctx->pe, dsisr, dar);
+
+ 	if (dsisr & CXL_PSL9_DSISR_An_TF) {
+ 		pr_devel("CXL interrupt: Scheduling translation fault handling for later (pe: %i)\n", ctx->pe);
+ 		return schedule_cxl_fault(ctx, dsisr, dar);
+ 	}
+
+ 	if (dsisr & CXL_PSL9_DSISR_An_PE)
+ 		return cxl_ops->handle_psl_slice_error(ctx, dsisr,
+ 						irq_info->errstat);
+ 	if (dsisr & CXL_PSL9_DSISR_An_AE) {
+ 		pr_devel("CXL interrupt: AFU Error 0x%016llx\n", irq_info->afu_err);
+
+ 		if (ctx->pending_afu_err) {
+ 			/*
+ 			 * This shouldn't happen - the PSL treats these errors
+ 			 * as fatal and will have reset the AFU, so there's not
+ 			 * much point buffering multiple AFU errors.
+ 			 * OTOH if we DO ever see a storm of these come in it's
+ 			 * probably best that we log them somewhere:
+ 			 */
+ 			dev_err_ratelimited(&ctx->afu->dev, "CXL AFU Error undelivered to pe %i: 0x%016llx\n",
+ 					    ctx->pe, irq_info->afu_err);
+ 		} else {
+ 			spin_lock(&ctx->lock);
+ 			ctx->afu_err = irq_info->afu_err;
+ 			ctx->pending_afu_err = 1;
+ 			spin_unlock(&ctx->lock);
+
+ 			wake_up_all(&ctx->wq);
+ 		}
+
+ 		cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_A, 0);
+ 		return IRQ_HANDLED;
+ 	}
+ 	if (dsisr & CXL_PSL9_DSISR_An_OC)
+ 		pr_devel("CXL interrupt: OS Context Warning\n");
+
+ 	WARN(1, "Unhandled CXL PSL IRQ\n");
+ 	return IRQ_HANDLED;
+ }
+
+ irqreturn_t cxl_irq_psl8(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
  {
  	u64 dsisr, dar;
+2 -10
drivers/misc/cxl/main.c
···
  static inline void _cxl_slbia(struct cxl_context *ctx, struct mm_struct *mm)
  {
- 	struct task_struct *task;
  	unsigned long flags;
- 	if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) {
- 		pr_devel("%s unable to get task %i\n",
- 			 __func__, pid_nr(ctx->pid));
- 		return;
- 	}

- 	if (task->mm != mm)
- 		goto out_put;
+ 	if (ctx->mm != mm)
+ 		return;

  	pr_devel("%s matched mm - card: %i afu: %i pe: %i\n", __func__,
  		 ctx->afu->adapter->adapter_num, ctx->afu->slice, ctx->pe);
···
  	spin_unlock_irqrestore(&ctx->sste_lock, flags);
  	mb();
  	cxl_afu_slbia(ctx->afu);
- out_put:
- 	put_task_struct(task);
  }

  static inline void cxl_slbia_core(struct mm_struct *mm)
+287 -52
drivers/misc/cxl/native.c
···
  /* This will disable as well as reset */
  static int native_afu_reset(struct cxl_afu *afu)
  {
+ 	int rc;
+ 	u64 serr;
+
  	pr_devel("AFU reset request\n");

- 	return afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
+ 	rc = afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
  		    CXL_AFU_Cntl_An_RS_Complete | CXL_AFU_Cntl_An_ES_Disabled,
  		    CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK,
  		    false);
+
+ 	/* Re-enable any masked interrupts */
+ 	serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
+ 	serr &= ~CXL_PSL_SERR_An_IRQ_MASKS;
+ 	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
+
+ 	return rc;
  }

  static int native_afu_check_and_enable(struct cxl_afu *afu)
···
  	u64 AFU_Cntl = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
  	u64 dsisr, dar;
  	u64 start, end;
+ 	u64 trans_fault = 0x0ULL;
  	unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
  	int rc = 0;

  	trace_cxl_psl_ctrl(afu, CXL_PSL_SCNTL_An_Pc);

  	pr_devel("PSL purge request\n");
+
+ 	if (cxl_is_psl8(afu))
+ 		trans_fault = CXL_PSL_DSISR_TRANS;
+ 	if (cxl_is_psl9(afu))
+ 		trans_fault = CXL_PSL9_DSISR_An_TF;

  	if (!cxl_ops->link_ok(afu->adapter, afu)) {
  		dev_warn(&afu->dev, "PSL Purge called with link down, ignoring\n");
···
  	}

  	dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
- 	pr_devel_ratelimited("PSL purging... PSL_CNTL: 0x%016llx PSL_DSISR: 0x%016llx\n", PSL_CNTL, dsisr);
- 	if (dsisr & CXL_PSL_DSISR_TRANS) {
+ 	pr_devel_ratelimited("PSL purging... PSL_CNTL: 0x%016llx PSL_DSISR: 0x%016llx\n",
+ 			     PSL_CNTL, dsisr);
+
+ 	if (dsisr & trans_fault) {
  		dar = cxl_p2n_read(afu, CXL_PSL_DAR_An);
- 		dev_notice(&afu->dev, "PSL purge terminating pending translation, DSISR: 0x%016llx, DAR: 0x%016llx\n", dsisr, dar);
+ 		dev_notice(&afu->dev, "PSL purge terminating pending translation, DSISR: 0x%016llx, DAR: 0x%016llx\n",
+ 			   dsisr, dar);
  		cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
  	} else if (dsisr) {
- 		dev_notice(&afu->dev, "PSL purge acknowledging pending non-translation fault, DSISR: 0x%016llx\n", dsisr);
+ 		dev_notice(&afu->dev, "PSL purge acknowledging pending non-translation fault, DSISR: 0x%016llx\n",
+ 			   dsisr);
  		cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
  	} else {
  		cpu_relax();
···
  	return ((spa_size / 8) - 96) / 17;
  }

- int cxl_alloc_spa(struct cxl_afu *afu)
+ static int cxl_alloc_spa(struct cxl_afu *afu, int mode)
  {
  	unsigned spa_size;
···
  		if (spa_size > 0x100000) {
  			dev_warn(&afu->dev, "num_of_processes too large for the SPA, limiting to %i (0x%x)\n",
  					afu->native->spa_max_procs, afu->native->spa_size);
- 			afu->num_procs = afu->native->spa_max_procs;
+ 			if (mode != CXL_MODE_DEDICATED)
+ 				afu->num_procs = afu->native->spa_max_procs;
  			break;
  		}
···
  	}
  }

- int cxl_tlb_slb_invalidate(struct cxl *adapter)
+ /*
+  * Invalidation of all ERAT entries is no longer required by CAIA2. Use
+  * only for debug.
+  */
+ int cxl_invalidate_all_psl9(struct cxl *adapter)
+ {
+ 	unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
+ 	u64 ierat;
+
+ 	pr_devel("CXL adapter - invalidation of all ERAT entries\n");
+
+ 	/* Invalidates all ERAT entries for Radix or HPT */
+ 	ierat = CXL_XSL9_IERAT_IALL;
+ 	if (radix_enabled())
+ 		ierat |= CXL_XSL9_IERAT_INVR;
+ 	cxl_p1_write(adapter, CXL_XSL9_IERAT, ierat);
+
+ 	while (cxl_p1_read(adapter, CXL_XSL9_IERAT) & CXL_XSL9_IERAT_IINPROG) {
+ 		if (time_after_eq(jiffies, timeout)) {
+ 			dev_warn(&adapter->dev,
+ 			"WARNING: CXL adapter invalidation of all ERAT entries timed out!\n");
+ 			return -EBUSY;
+ 		}
+ 		if (!cxl_ops->link_ok(adapter, NULL))
+ 			return -EIO;
+ 		cpu_relax();
+ 	}
+ 	return 0;
+ }
+
+ int cxl_invalidate_all_psl8(struct cxl *adapter)
  {
  	unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
···
  	if (!rc)
  		ctx->pe_inserted = false;
- 	slb_invalid(ctx);
+ 	if (cxl_is_power8())
+ 		slb_invalid(ctx);
  	pr_devel("%s Remove pe: %i finished\n", __func__, ctx->pe);
  	mutex_unlock(&ctx->afu->native->spa_mutex);
···
  	afu->num_procs = afu->max_procs_virtualised;
  	if (afu->native->spa == NULL) {
- 		if (cxl_alloc_spa(afu))
+ 		if (cxl_alloc_spa(afu, CXL_MODE_DIRECTED))
  			return -ENOMEM;
  	}
  	attach_spa(afu);

  	cxl_p1n_write(afu, CXL_PSL_SCNTL_An, CXL_PSL_SCNTL_An_PM_AFU);
- 	cxl_p1n_write(afu, CXL_PSL_AMOR_An, 0xFFFFFFFFFFFFFFFFULL);
+ 	if (cxl_is_power8())
+ 		cxl_p1n_write(afu, CXL_PSL_AMOR_An, 0xFFFFFFFFFFFFFFFFULL);
  	cxl_p1n_write(afu, CXL_PSL_ID_An, CXL_PSL_ID_An_F | CXL_PSL_ID_An_L);

  	afu->current_mode = CXL_MODE_DIRECTED;
···
  		sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV;
  	} else {
  		sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R;
- 		sr &= ~(CXL_PSL_SR_An_HV);
+ 		if (radix_enabled())
+ 			sr |= CXL_PSL_SR_An_HV;
+ 		else
+ 			sr &= ~(CXL_PSL_SR_An_HV);
  		if (!test_tsk_thread_flag(current, TIF_32BIT))
  			sr |= CXL_PSL_SR_An_SF;
+ 	}
+ 	if (cxl_is_psl9(ctx->afu)) {
+ 		if (radix_enabled())
+ 			sr |= CXL_PSL_SR_An_XLAT_ror;
+ 		else
+ 			sr |= CXL_PSL_SR_An_XLAT_hpt;
  	}
  	return sr;
  }
···
  	WARN_ON(add_process_element(ctx));
  }

- static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
+ static int process_element_entry_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
+ {
+ 	u32 pid;
+
+ 	cxl_assign_psn_space(ctx);
+
+ 	ctx->elem->ctxtime = 0; /* disable */
+ 	ctx->elem->lpid = cpu_to_be32(mfspr(SPRN_LPID));
+ 	ctx->elem->haurp = 0; /* disable */
+
+ 	if (ctx->kernel)
+ 		pid = 0;
+ 	else {
+ 		if (ctx->mm == NULL) {
+ 			pr_devel("%s: unable to get mm for pe=%d pid=%i\n",
+ 				__func__, ctx->pe, pid_nr(ctx->pid));
+ 			return -EINVAL;
+ 		}
+ 		pid = ctx->mm->context.id;
+ 	}
+
+ 	ctx->elem->common.tid = 0;
+ 	ctx->elem->common.pid = cpu_to_be32(pid);
+
+ 	ctx->elem->sr = cpu_to_be64(calculate_sr(ctx));
+
+ 	ctx->elem->common.csrp = 0; /* disable */
+
+ 	cxl_prefault(ctx, wed);
+
+ 	/*
+ 	 * Ensure we have the multiplexed PSL interrupt set up to take faults
+ 	 * for kernel contexts that may not have allocated any AFU IRQs at all:
+ 	 */
+ 	if (ctx->irqs.range[0] == 0) {
+ 		ctx->irqs.offset[0] = ctx->afu->native->psl_hwirq;
+ 		ctx->irqs.range[0] = 1;
+ 	}
+
+ 	ctx->elem->common.amr = cpu_to_be64(amr);
+ 	ctx->elem->common.wed = cpu_to_be64(wed);
+
+ 	return 0;
+ }
+
+ int cxl_attach_afu_directed_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
+ {
+ 	int result;
+
+ 	/* fill the process element entry */
+ 	result = process_element_entry_psl9(ctx, wed, amr);
+ 	if (result)
+ 		return result;
+
+ 	update_ivtes_directed(ctx);
+
+ 	/* first guy needs to enable */
+ 	result = cxl_ops->afu_check_and_enable(ctx->afu);
+ 	if (result)
+ 		return result;
+
+ 	return add_process_element(ctx);
+ }
+
+ int cxl_attach_afu_directed_psl8(struct cxl_context *ctx, u64 wed, u64 amr)
  {
  	u32 pid;
  	int result;
···
  	ctx->elem->ctxtime = 0; /* disable */
  	ctx->elem->lpid = cpu_to_be32(mfspr(SPRN_LPID));
  	ctx->elem->haurp = 0; /* disable */
- 	ctx->elem->sdr = cpu_to_be64(mfspr(SPRN_SDR1));
+ 	ctx->elem->u.sdr = cpu_to_be64(mfspr(SPRN_SDR1));

  	pid = current->pid;
  	if (ctx->kernel)
···
  	ctx->elem->sr = cpu_to_be64(calculate_sr(ctx));

  	ctx->elem->common.csrp = 0; /* disable */
- 	ctx->elem->common.aurp0 = 0; /* disable */
- 	ctx->elem->common.aurp1 = 0; /* disable */
+ 	ctx->elem->common.u.psl8.aurp0 = 0; /* disable */
+ 	ctx->elem->common.u.psl8.aurp1 = 0; /* disable */

  	cxl_prefault(ctx, wed);

- 	ctx->elem->common.sstp0 = cpu_to_be64(ctx->sstp0);
- 	ctx->elem->common.sstp1 = cpu_to_be64(ctx->sstp1);
+ 	ctx->elem->common.u.psl8.sstp0 = cpu_to_be64(ctx->sstp0);
+ 	ctx->elem->common.u.psl8.sstp1 = cpu_to_be64(ctx->sstp1);

  	/*
  	 * Ensure we have the multiplexed PSL interrupt set up to take faults
···
  	return 0;
  }

- static int activate_dedicated_process(struct cxl_afu *afu)
+ int cxl_activate_dedicated_process_psl9(struct cxl_afu *afu)
+ {
+ 	dev_info(&afu->dev, "Activating dedicated process mode\n");
+
+ 	/*
+ 	 * If XSL is set to dedicated mode (Set in PSL_SCNTL reg), the
+ 	 * XSL and AFU are programmed to work with a single context.
+ 	 * The context information should be configured in the SPA area
+ 	 * index 0 (so PSL_SPAP must be configured before enabling the
+ 	 * AFU).
+ 	 */
+ 	afu->num_procs = 1;
+ 	if (afu->native->spa == NULL) {
+ 		if (cxl_alloc_spa(afu, CXL_MODE_DEDICATED))
+ 			return -ENOMEM;
+ 	}
+ 	attach_spa(afu);
+
+ 	cxl_p1n_write(afu, CXL_PSL_SCNTL_An, CXL_PSL_SCNTL_An_PM_Process);
+ 	cxl_p1n_write(afu, CXL_PSL_ID_An, CXL_PSL_ID_An_F | CXL_PSL_ID_An_L);
+
+ 	afu->current_mode = CXL_MODE_DEDICATED;
+
+ 	return cxl_chardev_d_afu_add(afu);
+ }
+
+ int cxl_activate_dedicated_process_psl8(struct cxl_afu *afu)
  {
  	dev_info(&afu->dev, "Activating dedicated process mode\n");
···
  	return cxl_chardev_d_afu_add(afu);
  }

- static void update_ivtes_dedicated(struct cxl_context *ctx)
+ void cxl_update_dedicated_ivtes_psl9(struct cxl_context *ctx)
+ {
+ 	int r;
+
+ 	for (r = 0; r < CXL_IRQ_RANGES; r++) {
+ 		ctx->elem->ivte_offsets[r] = cpu_to_be16(ctx->irqs.offset[r]);
+ 		ctx->elem->ivte_ranges[r] = cpu_to_be16(ctx->irqs.range[r]);
+ 	}
+ }
+
+ void cxl_update_dedicated_ivtes_psl8(struct cxl_context *ctx)
  {
  	struct cxl_afu *afu = ctx->afu;
···
  			((u64)ctx->irqs.range[3] & 0xffff));
  }

- static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
+ int cxl_attach_dedicated_process_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
+ {
+ 	struct cxl_afu *afu = ctx->afu;
+ 	int result;
+
+ 	/* fill the process element entry */
+ 	result = process_element_entry_psl9(ctx, wed, amr);
+ 	if (result)
+ 		return result;
+
+ 	if (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes)
+ 		afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);
+
+ 	result = cxl_ops->afu_reset(afu);
+ 	if (result)
+ 		return result;
+
+ 	return afu_enable(afu);
+ }
+
+ int cxl_attach_dedicated_process_psl8(struct cxl_context *ctx, u64 wed, u64 amr)
  {
  	struct cxl_afu *afu = ctx->afu;
  	u64 pid;
···
  	cxl_prefault(ctx, wed);

- 	update_ivtes_dedicated(ctx);
+ 	if (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes)
+ 		afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);

  	cxl_p2n_write(afu, CXL_PSL_AMR_An, amr);
···
  	if (mode == CXL_MODE_DIRECTED)
  		return activate_afu_directed(afu);
- 	if (mode == CXL_MODE_DEDICATED)
- 		return activate_dedicated_process(afu);
+ 	if ((mode == CXL_MODE_DEDICATED) &&
+ 	    (afu->adapter->native->sl_ops->activate_dedicated_process))
+ 		return afu->adapter->native->sl_ops->activate_dedicated_process(afu);

  	return -EINVAL;
  }
···
  	}

  	ctx->kernel = kernel;
- 	if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
- 		return attach_afu_directed(ctx, wed, amr);
+ 	if ((ctx->afu->current_mode == CXL_MODE_DIRECTED) &&
+ 	    (ctx->afu->adapter->native->sl_ops->attach_afu_directed))
+ 		return ctx->afu->adapter->native->sl_ops->attach_afu_directed(ctx, wed, amr);

- 	if (ctx->afu->current_mode == CXL_MODE_DEDICATED)
- 		return attach_dedicated(ctx, wed, amr);
+ 	if ((ctx->afu->current_mode == CXL_MODE_DEDICATED) &&
+ 	    (ctx->afu->adapter->native->sl_ops->attach_dedicated_process))
+ 		return ctx->afu->adapter->native->sl_ops->attach_dedicated_process(ctx, wed, amr);

  	return -EINVAL;
  }
···
  {
  	if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
  		return update_ivtes_directed(ctx);
- 	if (ctx->afu->current_mode == CXL_MODE_DEDICATED)
- 		return update_ivtes_dedicated(ctx);
+ 	if ((ctx->afu->current_mode == CXL_MODE_DEDICATED) &&
+ 	    (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes))
+ 		return ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);
  	WARN(1, "native_update_ivtes: Bad mode\n");
  }
···
  static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
  {
- 	u64 pidtid;
-
  	/* If the adapter has gone away, we can't get any meaningful
  	 * information.
  	 */
···
  	info->dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
  	info->dar = cxl_p2n_read(afu, CXL_PSL_DAR_An);
- 	info->dsr = cxl_p2n_read(afu, CXL_PSL_DSR_An);
- 	pidtid = cxl_p2n_read(afu, CXL_PSL_PID_TID_An);
- 	info->pid = pidtid >> 32;
- 	info->tid = pidtid & 0xffffffff;
+ 	if (cxl_is_power8())
+ 		info->dsr = cxl_p2n_read(afu, CXL_PSL_DSR_An);
  	info->afu_err = cxl_p2n_read(afu, CXL_AFU_ERR_An);
  	info->errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
  	info->proc_handle = 0;
···
  	return 0;
  }

- void cxl_native_psl_irq_dump_regs(struct cxl_context *ctx)
+ void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx)
+ {
+ 	u64 fir1, fir2, serr;
+
+ 	fir1 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR1);
+ 	fir2 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR2);
+
+ 	dev_crit(&ctx->afu->dev, "PSL_FIR1: 0x%016llx\n", fir1);
+ 	dev_crit(&ctx->afu->dev, "PSL_FIR2: 0x%016llx\n", fir2);
+ 	if (ctx->afu->adapter->native->sl_ops->register_serr_irq) {
+ 		serr = cxl_p1n_read(ctx->afu, CXL_PSL_SERR_An);
+ 		cxl_afu_decode_psl_serr(ctx->afu, serr);
+ 	}
+ }
+
+ void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx)
  {
  	u64 fir1, fir2, fir_slice, serr, afu_debug;
···
  	return cxl_ops->ack_irq(ctx, 0, errstat);
  }

- static irqreturn_t fail_psl_irq(struct cxl_afu *afu, struct cxl_irq_info *irq_info)
+ static bool cxl_is_translation_fault(struct cxl_afu *afu, u64 dsisr)
  {
- 	if (irq_info->dsisr & CXL_PSL_DSISR_TRANS)
+ 	if ((cxl_is_psl8(afu)) && (dsisr & CXL_PSL_DSISR_TRANS))
+ 		return true;
+
+ 	if ((cxl_is_psl9(afu)) && (dsisr & CXL_PSL9_DSISR_An_TF))
+ 		return true;
+
+ 	return false;
+ }
+
+ irqreturn_t cxl_fail_irq_psl(struct cxl_afu *afu, struct cxl_irq_info *irq_info)
+ {
+ 	if (cxl_is_translation_fault(afu, irq_info->dsisr))
  		cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
  	else
  		cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
···
  	struct cxl_context *ctx;
  	struct cxl_irq_info irq_info;
  	u64 phreg = cxl_p2n_read(afu, CXL_PSL_PEHandle_An);
- 	int ph, ret;
+ 	int ph, ret = IRQ_HANDLED, res;

  	/* check if eeh kicked in while the interrupt was in flight */
  	if (unlikely(phreg == ~0ULL)) {
···
  	}
  	/* Mask the pe-handle from register value */
  	ph = phreg & 0xffff;
- 	if ((ret = native_get_irq_info(afu, &irq_info))) {
- 		WARN(1, "Unable to get CXL IRQ Info: %i\n", ret);
- 		return fail_psl_irq(afu, &irq_info);
+ 	if ((res = native_get_irq_info(afu, &irq_info))) {
+ 		WARN(1, "Unable to get CXL IRQ Info: %i\n", res);
+ 		if (afu->adapter->native->sl_ops->fail_irq)
+ 			return afu->adapter->native->sl_ops->fail_irq(afu, &irq_info);
+ 		return ret;
  	}

  	rcu_read_lock();
  	ctx = idr_find(&afu->contexts_idr, ph);
  	if (ctx) {
- 		ret = cxl_irq(irq, ctx, &irq_info);
+ 		if (afu->adapter->native->sl_ops->handle_interrupt)
+ 			ret = afu->adapter->native->sl_ops->handle_interrupt(irq, ctx, &irq_info);
  		rcu_read_unlock();
  		return ret;
  	}
···
  		" %016llx\n(Possible AFU HW issue - was a term/remove acked"
  		" with outstanding transactions?)\n", ph, irq_info.dsisr,
  		irq_info.dar);
- 	return fail_psl_irq(afu, &irq_info);
+ 	if (afu->adapter->native->sl_ops->fail_irq)
+ 		ret = afu->adapter->native->sl_ops->fail_irq(afu, &irq_info);
+ 	return ret;
  }

  static void native_irq_wait(struct cxl_context *ctx)
···
if (ph != ctx->pe) 1195 980 return; 1196 981 dsisr = cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An); 1197 - if ((dsisr & CXL_PSL_DSISR_PENDING) == 0) 982 + if (cxl_is_psl8(ctx->afu) && 983 + ((dsisr & CXL_PSL_DSISR_PENDING) == 0)) 984 + return; 985 + if (cxl_is_psl9(ctx->afu) && 986 + ((dsisr & CXL_PSL9_DSISR_PENDING) == 0)) 1198 987 return; 1199 988 /* 1200 989 * We are waiting for the workqueue to process our ··· 1215 996 static irqreturn_t native_slice_irq_err(int irq, void *data) 1216 997 { 1217 998 struct cxl_afu *afu = data; 1218 - u64 fir_slice, errstat, serr, afu_debug, afu_error, dsisr; 999 + u64 errstat, serr, afu_error, dsisr; 1000 + u64 fir_slice, afu_debug, irq_mask; 1219 1001 1220 1002 /* 1221 1003 * slice err interrupt is only used with full PSL (no XSL) 1222 1004 */ 1223 1005 serr = cxl_p1n_read(afu, CXL_PSL_SERR_An); 1224 - fir_slice = cxl_p1n_read(afu, CXL_PSL_FIR_SLICE_An); 1225 1006 errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An); 1226 - afu_debug = cxl_p1n_read(afu, CXL_AFU_DEBUG_An); 1227 1007 afu_error = cxl_p2n_read(afu, CXL_AFU_ERR_An); 1228 1008 dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An); 1229 1009 cxl_afu_decode_psl_serr(afu, serr); 1230 - dev_crit(&afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice); 1010 + 1011 + if (cxl_is_power8()) { 1012 + fir_slice = cxl_p1n_read(afu, CXL_PSL_FIR_SLICE_An); 1013 + afu_debug = cxl_p1n_read(afu, CXL_AFU_DEBUG_An); 1014 + dev_crit(&afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice); 1015 + dev_crit(&afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug); 1016 + } 1231 1017 dev_crit(&afu->dev, "CXL_PSL_ErrStat_An: 0x%016llx\n", errstat); 1232 - dev_crit(&afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug); 1233 1018 dev_crit(&afu->dev, "AFU_ERR_An: 0x%.16llx\n", afu_error); 1234 1019 dev_crit(&afu->dev, "PSL_DSISR_An: 0x%.16llx\n", dsisr); 1235 1020 1021 + /* mask off the IRQ so it won't retrigger until the AFU is reset */ 1022 + irq_mask = (serr & CXL_PSL_SERR_An_IRQS) >> 32; 1023 + serr |= 
irq_mask; 1236 1024 cxl_p1n_write(afu, CXL_PSL_SERR_An, serr); 1025 + dev_info(&afu->dev, "Further such interrupts will be masked until the AFU is reset\n"); 1237 1026 1238 1027 return IRQ_HANDLED; 1239 1028 } ··· 1330 1103 } 1331 1104 1332 1105 serr = cxl_p1n_read(afu, CXL_PSL_SERR_An); 1333 - serr = (serr & 0x00ffffffffff0000ULL) | (afu->serr_hwirq & 0xffff); 1106 + if (cxl_is_power8()) 1107 + serr = (serr & 0x00ffffffffff0000ULL) | (afu->serr_hwirq & 0xffff); 1108 + if (cxl_is_power9()) { 1109 + /* 1110 + * By default, all errors are masked. So don't set all masks. 1111 + * Slice errors will be transfered. 1112 + */ 1113 + serr = (serr & ~0xff0000007fffffffULL) | (afu->serr_hwirq & 0xffff); 1114 + } 1334 1115 cxl_p1n_write(afu, CXL_PSL_SERR_An, serr); 1335 1116 1336 1117 return 0;
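The native.c hunks above repeatedly convert direct calls (`update_ivtes_dedicated`, `attach_afu_directed`, `fail_psl_irq`) into NULL-checked dispatch through the adapter's `sl_ops` table, so PSL8, PSL9 and XSL hardware can each install only the hooks they implement. A minimal user-space sketch of that pattern (names and return values are illustrative, not the real cxl API):

```c
#include <assert.h>
#include <stddef.h>

/* Miniature of the cxl sl_ops pattern: per-hardware-generation callbacks
 * live in an ops table, and every call site checks the pointer before
 * dispatching, because some generations leave hooks unset. */
struct sl_ops {
    int (*attach_dedicated)(int pe);   /* may be NULL */
};

static int attach_dedicated_p9(int pe)
{
    return 1000 + pe;                  /* stand-in for real MMIO setup */
}

static const struct sl_ops psl9_like_ops = {
    .attach_dedicated = attach_dedicated_p9,
};

static const struct sl_ops xsl_like_ops = {
    .attach_dedicated = NULL,          /* hook not implemented */
};

/* Dispatch helper: returns -22 (-EINVAL) when no handler is installed,
 * mirroring the "return -EINVAL" fallthrough in native_afu_activate_mode. */
static int attach(const struct sl_ops *ops, int pe)
{
    if (ops->attach_dedicated)
        return ops->attach_dedicated(pe);
    return -22;
}
```

The same shape covers `fail_irq` and `handle_interrupt` in the interrupt path, where a missing hook falls back to `IRQ_HANDLED` instead of an error.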
+355 -62
drivers/misc/cxl/pci.c
··· 60 60 #define CXL_VSEC_PROTOCOL_MASK 0xe0 61 61 #define CXL_VSEC_PROTOCOL_1024TB 0x80 62 62 #define CXL_VSEC_PROTOCOL_512TB 0x40 63 - #define CXL_VSEC_PROTOCOL_256TB 0x20 /* Power 8 uses this */ 63 + #define CXL_VSEC_PROTOCOL_256TB 0x20 /* Power 8/9 uses this */ 64 64 #define CXL_VSEC_PROTOCOL_ENABLE 0x01 65 65 66 66 #define CXL_READ_VSEC_PSL_REVISION(dev, vsec, dest) \ ··· 123 123 { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x044b), }, 124 124 { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x04cf), }, 125 125 { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0601), }, 126 + { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0623), }, 127 + { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0628), }, 126 128 { PCI_DEVICE_CLASS(0x120000, ~0), }, 127 129 128 130 { } ··· 326 324 #undef show_reg 327 325 } 328 326 329 - #define CAPP_UNIT0_ID 0xBA 330 - #define CAPP_UNIT1_ID 0XBE 327 + #define P8_CAPP_UNIT0_ID 0xBA 328 + #define P8_CAPP_UNIT1_ID 0XBE 329 + #define P9_CAPP_UNIT0_ID 0xC0 330 + #define P9_CAPP_UNIT1_ID 0xE0 331 331 332 - static u64 get_capp_unit_id(struct device_node *np) 332 + static int get_phb_index(struct device_node *np, u32 *phb_index) 333 333 { 334 - u32 phb_index; 334 + if (of_property_read_u32(np, "ibm,phb-index", phb_index)) 335 + return -ENODEV; 336 + return 0; 337 + } 338 + 339 + static u64 get_capp_unit_id(struct device_node *np, u32 phb_index) 340 + { 341 + /* 342 + * POWER 8: 343 + * - For chips other than POWER8NVL, we only have CAPP 0, 344 + * irrespective of which PHB is used. 345 + * - For POWER8NVL, assume CAPP 0 is attached to PHB0 and 346 + * CAPP 1 is attached to PHB1. 347 + */ 348 + if (cxl_is_power8()) { 349 + if (!pvr_version_is(PVR_POWER8NVL)) 350 + return P8_CAPP_UNIT0_ID; 351 + 352 + if (phb_index == 0) 353 + return P8_CAPP_UNIT0_ID; 354 + 355 + if (phb_index == 1) 356 + return P8_CAPP_UNIT1_ID; 357 + } 335 358 336 359 /* 337 - * For chips other than POWER8NVL, we only have CAPP 0, 338 - * irrespective of which PHB is used. 360 + * POWER 9: 361 + * PEC0 (PHB0). 
Capp ID = CAPP0 (0b1100_0000) 362 + * PEC1 (PHB1 - PHB2). No capi mode 363 + * PEC2 (PHB3 - PHB4 - PHB5): Capi mode on PHB3 only. Capp ID = CAPP1 (0b1110_0000) 339 364 */ 340 - if (!pvr_version_is(PVR_POWER8NVL)) 341 - return CAPP_UNIT0_ID; 365 + if (cxl_is_power9()) { 366 + if (phb_index == 0) 367 + return P9_CAPP_UNIT0_ID; 342 368 343 - /* 344 - * For POWER8NVL, assume CAPP 0 is attached to PHB0 and 345 - * CAPP 1 is attached to PHB1. 346 - */ 347 - if (of_property_read_u32(np, "ibm,phb-index", &phb_index)) 348 - return 0; 349 - 350 - if (phb_index == 0) 351 - return CAPP_UNIT0_ID; 352 - 353 - if (phb_index == 1) 354 - return CAPP_UNIT1_ID; 369 + if (phb_index == 3) 370 + return P9_CAPP_UNIT1_ID; 371 + } 355 372 356 373 return 0; 357 374 } 358 375 359 - static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, u64 *capp_unit_id) 376 + static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, 377 + u32 *phb_index, u64 *capp_unit_id) 360 378 { 379 + int rc; 361 380 struct device_node *np; 362 381 const __be32 *prop; 363 382 ··· 389 366 np = of_get_next_parent(np); 390 367 if (!np) 391 368 return -ENODEV; 369 + 392 370 *chipid = be32_to_cpup(prop); 393 - *capp_unit_id = get_capp_unit_id(np); 371 + 372 + rc = get_phb_index(np, phb_index); 373 + if (rc) { 374 + pr_err("cxl: invalid phb index\n"); 375 + return rc; 376 + } 377 + 378 + *capp_unit_id = get_capp_unit_id(np, *phb_index); 394 379 of_node_put(np); 395 380 if (!*capp_unit_id) { 396 381 pr_err("cxl: invalid capp unit id\n"); ··· 408 377 return 0; 409 378 } 410 379 411 - static int init_implementation_adapter_psl_regs(struct cxl *adapter, struct pci_dev *dev) 380 + static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci_dev *dev) 412 381 { 413 - u64 psl_dsnctl, psl_fircntl; 382 + u64 xsl_dsnctl, psl_fircntl; 414 383 u64 chipid; 384 + u32 phb_index; 415 385 u64 capp_unit_id; 416 386 int rc; 417 387 418 - rc = calc_capp_routing(dev, &chipid, &capp_unit_id); 388 + rc = 
calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); 389 + if (rc) 390 + return rc; 391 + 392 + /* 393 + * CAPI Identifier bits [0:7] 394 + * bit 61:60 MSI bits --> 0 395 + * bit 59 TVT selector --> 0 396 + */ 397 + 398 + /* 399 + * Tell XSL where to route data to. 400 + * The field chipid should match the PHB CAPI_CMPM register 401 + */ 402 + xsl_dsnctl = ((u64)0x2 << (63-7)); /* Bit 57 */ 403 + xsl_dsnctl |= (capp_unit_id << (63-15)); 404 + 405 + /* nMMU_ID Defaults to: b’000001001’*/ 406 + xsl_dsnctl |= ((u64)0x09 << (63-28)); 407 + 408 + if (cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1)) { 409 + /* 410 + * Used to identify CAPI packets which should be sorted into 411 + * the Non-Blocking queues by the PHB. This field should match 412 + * the PHB PBL_NBW_CMPM register 413 + * nbwind=0x03, bits [57:58], must include capi indicator. 414 + * Not supported on P9 DD1. 415 + */ 416 + xsl_dsnctl |= ((u64)0x03 << (63-47)); 417 + 418 + /* 419 + * Upper 16b address bits of ASB_Notify messages sent to the 420 + * system. Need to match the PHB’s ASN Compare/Mask Register. 421 + * Not supported on P9 DD1. 422 + */ 423 + xsl_dsnctl |= ((u64)0x04 << (63-55)); 424 + } 425 + 426 + cxl_p1_write(adapter, CXL_XSL9_DSNCTL, xsl_dsnctl); 427 + 428 + /* Set fir_cntl to recommended value for production env */ 429 + psl_fircntl = (0x2ULL << (63-3)); /* ce_report */ 430 + psl_fircntl |= (0x1ULL << (63-6)); /* FIR_report */ 431 + psl_fircntl |= 0x1ULL; /* ce_thresh */ 432 + cxl_p1_write(adapter, CXL_PSL9_FIR_CNTL, psl_fircntl); 433 + 434 + /* vccredits=0x1 pcklat=0x4 */ 435 + cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x0000000000001810ULL); 436 + 437 + /* 438 + * For debugging with trace arrays. 439 + * Configure RX trace 0 segmented mode. 440 + * Configure CT trace 0 segmented mode. 441 + * Configure LA0 trace 0 segmented mode. 442 + * Configure LA1 trace 0 segmented mode. 
443 + */ 444 + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000000ULL); 445 + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000003ULL); 446 + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000005ULL); 447 + cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000006ULL); 448 + 449 + /* 450 + * A response to an ASB_Notify request is returned by the 451 + * system as an MMIO write to the address defined in 452 + * the PSL_TNR_ADDR register 453 + */ 454 + /* PSL_TNR_ADDR */ 455 + 456 + /* NORST */ 457 + cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x8000000000000000ULL); 458 + 459 + /* allocate the apc machines */ 460 + cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x40000003FFFF0000ULL); 461 + 462 + /* Disable vc dd1 fix */ 463 + if ((cxl_is_power9() && cpu_has_feature(CPU_FTR_POWER9_DD1))) 464 + cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0400000000000001ULL); 465 + 466 + return 0; 467 + } 468 + 469 + static int init_implementation_adapter_regs_psl8(struct cxl *adapter, struct pci_dev *dev) 470 + { 471 + u64 psl_dsnctl, psl_fircntl; 472 + u64 chipid; 473 + u32 phb_index; 474 + u64 capp_unit_id; 475 + int rc; 476 + 477 + rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); 419 478 if (rc) 420 479 return rc; 421 480 ··· 530 409 return 0; 531 410 } 532 411 533 - static int init_implementation_adapter_xsl_regs(struct cxl *adapter, struct pci_dev *dev) 412 + static int init_implementation_adapter_regs_xsl(struct cxl *adapter, struct pci_dev *dev) 534 413 { 535 414 u64 xsl_dsnctl; 536 415 u64 chipid; 416 + u32 phb_index; 537 417 u64 capp_unit_id; 538 418 int rc; 539 419 540 - rc = calc_capp_routing(dev, &chipid, &capp_unit_id); 420 + rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id); 541 421 if (rc) 542 422 return rc; 543 423 ··· 556 434 /* For the PSL this is a multiple for 0 < n <= 7: */ 557 435 #define PSL_2048_250MHZ_CYCLES 1 558 436 559 - static void write_timebase_ctrl_psl(struct cxl *adapter) 437 + static void 
write_timebase_ctrl_psl9(struct cxl *adapter) 438 + { 439 + cxl_p1_write(adapter, CXL_PSL9_TB_CTLSTAT, 440 + TBSYNC_CNT(2 * PSL_2048_250MHZ_CYCLES)); 441 + } 442 + 443 + static void write_timebase_ctrl_psl8(struct cxl *adapter) 560 444 { 561 445 cxl_p1_write(adapter, CXL_PSL_TB_CTLSTAT, 562 446 TBSYNC_CNT(2 * PSL_2048_250MHZ_CYCLES)); ··· 583 455 TBSYNC_CNT(XSL_4000_CLOCKS)); 584 456 } 585 457 586 - static u64 timebase_read_psl(struct cxl *adapter) 458 + static u64 timebase_read_psl9(struct cxl *adapter) 459 + { 460 + return cxl_p1_read(adapter, CXL_PSL9_Timebase); 461 + } 462 + 463 + static u64 timebase_read_psl8(struct cxl *adapter) 587 464 { 588 465 return cxl_p1_read(adapter, CXL_PSL_Timebase); 589 466 } ··· 646 513 return; 647 514 } 648 515 649 - static int init_implementation_afu_psl_regs(struct cxl_afu *afu) 516 + static int init_implementation_afu_regs_psl9(struct cxl_afu *afu) 517 + { 518 + return 0; 519 + } 520 + 521 + static int init_implementation_afu_regs_psl8(struct cxl_afu *afu) 650 522 { 651 523 /* read/write masks for this slice */ 652 524 cxl_p1n_write(afu, CXL_PSL_APCALLOC_A, 0xFFFFFFFEFEFEFEFEULL); ··· 749 611 /* 750 612 * BAR 4/5 has a special meaning for CXL and must be programmed with a 751 613 * special value corresponding to the CXL protocol address range. 
752 - * For POWER 8 that means bits 48:49 must be set to 10 614 + * For POWER 8/9 that means bits 48:49 must be set to 10 753 615 */ 754 616 pci_write_config_dword(dev, PCI_BASE_ADDRESS_4, 0x00000000); 755 617 pci_write_config_dword(dev, PCI_BASE_ADDRESS_5, 0x00020000); ··· 1106 968 } 1107 969 1108 970 if (afu->pp_psa && (afu->pp_size < PAGE_SIZE)) 1109 - dev_warn(&afu->dev, "AFU uses < PAGE_SIZE per-process PSA!"); 971 + dev_warn(&afu->dev, "AFU uses pp_size(%#016llx) < PAGE_SIZE per-process PSA!\n", afu->pp_size); 1110 972 1111 973 for (i = 0; i < afu->crs_num; i++) { 1112 974 rc = cxl_ops->afu_cr_read32(afu, i, 0, &val); ··· 1134 996 return 0; 1135 997 } 1136 998 1137 - static int sanitise_afu_regs(struct cxl_afu *afu) 999 + static int sanitise_afu_regs_psl9(struct cxl_afu *afu) 1000 + { 1001 + u64 reg; 1002 + 1003 + /* 1004 + * Clear out any regs that contain either an IVTE or address or may be 1005 + * waiting on an acknowledgment to try to be a bit safer as we bring 1006 + * it online 1007 + */ 1008 + reg = cxl_p2n_read(afu, CXL_AFU_Cntl_An); 1009 + if ((reg & CXL_AFU_Cntl_An_ES_MASK) != CXL_AFU_Cntl_An_ES_Disabled) { 1010 + dev_warn(&afu->dev, "WARNING: AFU was not disabled: %#016llx\n", reg); 1011 + if (cxl_ops->afu_reset(afu)) 1012 + return -EIO; 1013 + if (cxl_afu_disable(afu)) 1014 + return -EIO; 1015 + if (cxl_psl_purge(afu)) 1016 + return -EIO; 1017 + } 1018 + cxl_p1n_write(afu, CXL_PSL_SPAP_An, 0x0000000000000000); 1019 + cxl_p1n_write(afu, CXL_PSL_AMBAR_An, 0x0000000000000000); 1020 + reg = cxl_p2n_read(afu, CXL_PSL_DSISR_An); 1021 + if (reg) { 1022 + dev_warn(&afu->dev, "AFU had pending DSISR: %#016llx\n", reg); 1023 + if (reg & CXL_PSL9_DSISR_An_TF) 1024 + cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE); 1025 + else 1026 + cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A); 1027 + } 1028 + if (afu->adapter->native->sl_ops->register_serr_irq) { 1029 + reg = cxl_p1n_read(afu, CXL_PSL_SERR_An); 1030 + if (reg) { 1031 + if (reg & 
~0x000000007fffffff) 1032 + dev_warn(&afu->dev, "AFU had pending SERR: %#016llx\n", reg); 1033 + cxl_p1n_write(afu, CXL_PSL_SERR_An, reg & ~0xffff); 1034 + } 1035 + } 1036 + reg = cxl_p2n_read(afu, CXL_PSL_ErrStat_An); 1037 + if (reg) { 1038 + dev_warn(&afu->dev, "AFU had pending error status: %#016llx\n", reg); 1039 + cxl_p2n_write(afu, CXL_PSL_ErrStat_An, reg); 1040 + } 1041 + 1042 + return 0; 1043 + } 1044 + 1045 + static int sanitise_afu_regs_psl8(struct cxl_afu *afu) 1138 1046 { 1139 1047 u64 reg; 1140 1048 ··· 1286 1102 if ((rc = pci_map_slice_regs(afu, adapter, dev))) 1287 1103 return rc; 1288 1104 1289 - if ((rc = sanitise_afu_regs(afu))) 1290 - goto err1; 1105 + if (adapter->native->sl_ops->sanitise_afu_regs) { 1106 + rc = adapter->native->sl_ops->sanitise_afu_regs(afu); 1107 + if (rc) 1108 + goto err1; 1109 + } 1291 1110 1292 1111 /* We need to reset the AFU before we can read the AFU descriptor */ 1293 1112 if ((rc = cxl_ops->afu_reset(afu))) ··· 1435 1248 1436 1249 dev_info(&dev->dev, "CXL reset\n"); 1437 1250 1438 - /* the adapter is about to be reset, so ignore errors */ 1439 - cxl_data_cache_flush(adapter); 1251 + /* 1252 + * The adapter is about to be reset, so ignore errors. 1253 + * Not supported on P9 DD1 1254 + */ 1255 + if ((cxl_is_power8()) || 1256 + ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1)))) 1257 + cxl_data_cache_flush(adapter); 1440 1258 1441 1259 /* pcie_warm_reset requests a fundamental pci reset which includes a 1442 1260 * PERST assert/deassert. 
PERST triggers a loading of the image ··· 1524 1332 CXL_READ_VSEC_IMAGE_STATE(dev, vsec, &image_state); 1525 1333 adapter->user_image_loaded = !!(image_state & CXL_VSEC_USER_IMAGE_LOADED); 1526 1334 adapter->perst_select_user = !!(image_state & CXL_VSEC_USER_IMAGE_LOADED); 1335 + adapter->perst_loads_image = !!(image_state & CXL_VSEC_PERST_LOADS_IMAGE); 1527 1336 1528 1337 CXL_READ_VSEC_NAFUS(dev, vsec, &adapter->slices); 1529 1338 CXL_READ_VSEC_AFU_DESC_OFF(dev, vsec, &afu_desc_off); ··· 1571 1378 pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, data); 1572 1379 } 1573 1380 1381 + static bool cxl_compatible_caia_version(struct cxl *adapter) 1382 + { 1383 + if (cxl_is_power8() && (adapter->caia_major == 1)) 1384 + return true; 1385 + 1386 + if (cxl_is_power9() && (adapter->caia_major == 2)) 1387 + return true; 1388 + 1389 + return false; 1390 + } 1391 + 1574 1392 static int cxl_vsec_looks_ok(struct cxl *adapter, struct pci_dev *dev) 1575 1393 { 1576 1394 if (adapter->vsec_status & CXL_STATUS_SECOND_PORT) ··· 1590 1386 if (adapter->vsec_status & CXL_UNSUPPORTED_FEATURES) { 1591 1387 dev_err(&dev->dev, "ABORTING: CXL requires unsupported features\n"); 1592 1388 return -EINVAL; 1389 + } 1390 + 1391 + if (!cxl_compatible_caia_version(adapter)) { 1392 + dev_info(&dev->dev, "Ignoring card. 
PSL type is not supported (caia version: %d)\n", 1393 + adapter->caia_major); 1394 + return -ENODEV; 1593 1395 } 1594 1396 1595 1397 if (!adapter->slices) { ··· 1641 1431 1642 1432 static int sanitise_adapter_regs(struct cxl *adapter) 1643 1433 { 1434 + int rc = 0; 1435 + 1644 1436 /* Clear PSL tberror bit by writing 1 to it */ 1645 1437 cxl_p1_write(adapter, CXL_PSL_ErrIVTE, CXL_PSL_ErrIVTE_tberror); 1646 - return cxl_tlb_slb_invalidate(adapter); 1438 + 1439 + if (adapter->native->sl_ops->invalidate_all) { 1440 + /* do not invalidate ERAT entries when not reloading on PERST */ 1441 + if (cxl_is_power9() && (adapter->perst_loads_image)) 1442 + return 0; 1443 + rc = adapter->native->sl_ops->invalidate_all(adapter); 1444 + } 1445 + 1446 + return rc; 1647 1447 } 1648 1448 1649 1449 /* This should contain *only* operations that can safely be done in ··· 1716 1496 if ((rc = cxl_native_register_psl_err_irq(adapter))) 1717 1497 goto err; 1718 1498 1719 - /* Release the context lock as adapter is configured */ 1720 - cxl_adapter_context_unlock(adapter); 1721 1499 return 0; 1722 1500 1723 1501 err: ··· 1734 1516 pci_disable_device(pdev); 1735 1517 } 1736 1518 1737 - static const struct cxl_service_layer_ops psl_ops = { 1738 - .adapter_regs_init = init_implementation_adapter_psl_regs, 1739 - .afu_regs_init = init_implementation_afu_psl_regs, 1519 + static const struct cxl_service_layer_ops psl9_ops = { 1520 + .adapter_regs_init = init_implementation_adapter_regs_psl9, 1521 + .invalidate_all = cxl_invalidate_all_psl9, 1522 + .afu_regs_init = init_implementation_afu_regs_psl9, 1523 + .sanitise_afu_regs = sanitise_afu_regs_psl9, 1740 1524 .register_serr_irq = cxl_native_register_serr_irq, 1741 1525 .release_serr_irq = cxl_native_release_serr_irq, 1742 - .debugfs_add_adapter_sl_regs = cxl_debugfs_add_adapter_psl_regs, 1743 - .debugfs_add_afu_sl_regs = cxl_debugfs_add_afu_psl_regs, 1744 - .psl_irq_dump_registers = cxl_native_psl_irq_dump_regs, 1526 + .handle_interrupt = 
cxl_irq_psl9, 1527 + .fail_irq = cxl_fail_irq_psl, 1528 + .activate_dedicated_process = cxl_activate_dedicated_process_psl9, 1529 + .attach_afu_directed = cxl_attach_afu_directed_psl9, 1530 + .attach_dedicated_process = cxl_attach_dedicated_process_psl9, 1531 + .update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl9, 1532 + .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl9, 1533 + .debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl9, 1534 + .psl_irq_dump_registers = cxl_native_irq_dump_regs_psl9, 1745 1535 .err_irq_dump_registers = cxl_native_err_irq_dump_regs, 1746 - .debugfs_stop_trace = cxl_stop_trace, 1747 - .write_timebase_ctrl = write_timebase_ctrl_psl, 1748 - .timebase_read = timebase_read_psl, 1536 + .debugfs_stop_trace = cxl_stop_trace_psl9, 1537 + .write_timebase_ctrl = write_timebase_ctrl_psl9, 1538 + .timebase_read = timebase_read_psl9, 1539 + .capi_mode = OPAL_PHB_CAPI_MODE_CAPI, 1540 + .needs_reset_before_disable = true, 1541 + }; 1542 + 1543 + static const struct cxl_service_layer_ops psl8_ops = { 1544 + .adapter_regs_init = init_implementation_adapter_regs_psl8, 1545 + .invalidate_all = cxl_invalidate_all_psl8, 1546 + .afu_regs_init = init_implementation_afu_regs_psl8, 1547 + .sanitise_afu_regs = sanitise_afu_regs_psl8, 1548 + .register_serr_irq = cxl_native_register_serr_irq, 1549 + .release_serr_irq = cxl_native_release_serr_irq, 1550 + .handle_interrupt = cxl_irq_psl8, 1551 + .fail_irq = cxl_fail_irq_psl, 1552 + .activate_dedicated_process = cxl_activate_dedicated_process_psl8, 1553 + .attach_afu_directed = cxl_attach_afu_directed_psl8, 1554 + .attach_dedicated_process = cxl_attach_dedicated_process_psl8, 1555 + .update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl8, 1556 + .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl8, 1557 + .debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl8, 1558 + .psl_irq_dump_registers = cxl_native_irq_dump_regs_psl8, 1559 + .err_irq_dump_registers = cxl_native_err_irq_dump_regs, 1560 + 
.debugfs_stop_trace = cxl_stop_trace_psl8, 1561 + .write_timebase_ctrl = write_timebase_ctrl_psl8, 1562 + .timebase_read = timebase_read_psl8, 1749 1563 .capi_mode = OPAL_PHB_CAPI_MODE_CAPI, 1750 1564 .needs_reset_before_disable = true, 1751 1565 }; 1752 1566 1753 1567 static const struct cxl_service_layer_ops xsl_ops = { 1754 - .adapter_regs_init = init_implementation_adapter_xsl_regs, 1755 - .debugfs_add_adapter_sl_regs = cxl_debugfs_add_adapter_xsl_regs, 1568 + .adapter_regs_init = init_implementation_adapter_regs_xsl, 1569 + .invalidate_all = cxl_invalidate_all_psl8, 1570 + .sanitise_afu_regs = sanitise_afu_regs_psl8, 1571 + .handle_interrupt = cxl_irq_psl8, 1572 + .fail_irq = cxl_fail_irq_psl, 1573 + .activate_dedicated_process = cxl_activate_dedicated_process_psl8, 1574 + .attach_afu_directed = cxl_attach_afu_directed_psl8, 1575 + .attach_dedicated_process = cxl_attach_dedicated_process_psl8, 1576 + .update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl8, 1577 + .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_xsl, 1756 1578 .write_timebase_ctrl = write_timebase_ctrl_xsl, 1757 1579 .timebase_read = timebase_read_xsl, 1758 1580 .capi_mode = OPAL_PHB_CAPI_MODE_DMA, ··· 1806 1548 adapter->native->sl_ops = &xsl_ops; 1807 1549 adapter->min_pe = 1; /* Workaround for CX-4 hardware bug */ 1808 1550 } else { 1809 - dev_info(&dev->dev, "Device uses a PSL\n"); 1810 - adapter->native->sl_ops = &psl_ops; 1551 + if (cxl_is_power8()) { 1552 + dev_info(&dev->dev, "Device uses a PSL8\n"); 1553 + adapter->native->sl_ops = &psl8_ops; 1554 + } else { 1555 + dev_info(&dev->dev, "Device uses a PSL9\n"); 1556 + adapter->native->sl_ops = &psl9_ops; 1557 + } 1811 1558 } 1812 1559 } 1813 1560 ··· 1859 1596 if ((rc = cxl_sysfs_adapter_add(adapter))) 1860 1597 goto err_put1; 1861 1598 1599 + /* Release the context lock as adapter is configured */ 1600 + cxl_adapter_context_unlock(adapter); 1601 + 1862 1602 return adapter; 1863 1603 1864 1604 err_put1: ··· 1885 1619 
cxl_sysfs_adapter_remove(adapter); 1886 1620 cxl_debugfs_adapter_remove(adapter); 1887 1621 1888 - /* Flush adapter datacache as its about to be removed */ 1889 - cxl_data_cache_flush(adapter); 1622 + /* 1623 + * Flush adapter datacache as its about to be removed. 1624 + * Not supported on P9 DD1. 1625 + */ 1626 + if ((cxl_is_power8()) || 1627 + ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1)))) 1628 + cxl_data_cache_flush(adapter); 1890 1629 1891 1630 cxl_deconfigure_adapter(adapter); 1892 1631 ··· 1975 1704 return -ENODEV; 1976 1705 } 1977 1706 1707 + if (cxl_is_power9() && !radix_enabled()) { 1708 + dev_info(&dev->dev, "Only Radix mode supported\n"); 1709 + return -ENODEV; 1710 + } 1711 + 1978 1712 if (cxl_verbose) 1979 1713 dump_cxl_config_space(dev); 1980 1714 ··· 2057 1781 { 2058 1782 struct cxl *adapter = pci_get_drvdata(pdev); 2059 1783 struct cxl_afu *afu; 2060 - pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET; 1784 + pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result; 2061 1785 int i; 2062 1786 2063 1787 /* At this point, we could still have an interrupt pending. 
··· 2161 1885 for (i = 0; i < adapter->slices; i++) { 2162 1886 afu = adapter->afu[i]; 2163 1887 2164 - result = cxl_vphb_error_detected(afu, state); 2165 - 2166 - /* Only continue if everyone agrees on NEED_RESET */ 2167 - if (result != PCI_ERS_RESULT_NEED_RESET) 2168 - return result; 1888 + afu_result = cxl_vphb_error_detected(afu, state); 2169 1889 2170 1890 cxl_context_detach_all(afu); 2171 1891 cxl_ops->afu_deactivate_mode(afu, afu->current_mode); 2172 1892 pci_deconfigure_afu(afu); 1893 + 1894 + /* Disconnect trumps all, NONE trumps NEED_RESET */ 1895 + if (afu_result == PCI_ERS_RESULT_DISCONNECT) 1896 + result = PCI_ERS_RESULT_DISCONNECT; 1897 + else if ((afu_result == PCI_ERS_RESULT_NONE) && 1898 + (result == PCI_ERS_RESULT_NEED_RESET)) 1899 + result = PCI_ERS_RESULT_NONE; 2173 1900 } 1901 + 1902 + /* should take the context lock here */ 1903 + if (cxl_adapter_context_lock(adapter) != 0) 1904 + dev_warn(&adapter->dev, 1905 + "Couldn't take context lock with %d active-contexts\n", 1906 + atomic_read(&adapter->contexts_num)); 1907 + 2174 1908 cxl_deconfigure_adapter(adapter); 2175 1909 2176 1910 return result; ··· 2198 1912 2199 1913 if (cxl_configure_adapter(adapter, pdev)) 2200 1914 goto err; 1915 + 1916 + /* 1917 + * Unlock context activation for the adapter. Ideally this should be 1918 + * done in cxl_pci_resume but cxlflash module tries to activate the 1919 + * master context as part of slot_reset callback. 1920 + */ 1921 + cxl_adapter_context_unlock(adapter); 2201 1922 2202 1923 for (i = 0; i < adapter->slices; i++) { 2203 1924 afu = adapter->afu[i];
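The `xsl_dsnctl` setup in `init_implementation_adapter_regs_psl9` above builds the register with shifts of the form `(63 - n)`. That is IBM bit numbering, where bit 0 is the most-significant bit of a 64-bit register, converted to a conventional left-shift count. A small worked sketch of that convention (analogous to the kernel's `PPC_BIT` helper; the field placement mirrors the hunk, the values are only illustrative):

```c
#include <assert.h>
#include <stdint.h>

/* IBM register documentation numbers bits from the most-significant end:
 * "bit 0" is the MSB. Converting an IBM bit position to a shift count: */
static uint64_t ibm_bit(unsigned int b)
{
    return 1ULL << (63 - b);
}

/* Building a value the way the PSL9 code does: 0x2 shifted so its low bit
 * lands at IBM bit 7, then the CAPP unit id field ending at IBM bit 15. */
static uint64_t demo_dsnctl(uint64_t capp_unit_id)
{
    uint64_t v = (uint64_t)0x2 << (63 - 7);  /* protocol field */
    v |= capp_unit_id << (63 - 15);          /* CAPP unit id, e.g. 0xC0 */
    return v;
}
```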
+43
drivers/misc/cxl/trace.h
··· 17 17 18 18 #include "cxl.h" 19 19 20 + #define dsisr_psl9_flags(flags) \ 21 + __print_flags(flags, "|", \ 22 + { CXL_PSL9_DSISR_An_CO_MASK, "FR" }, \ 23 + { CXL_PSL9_DSISR_An_TF, "TF" }, \ 24 + { CXL_PSL9_DSISR_An_PE, "PE" }, \ 25 + { CXL_PSL9_DSISR_An_AE, "AE" }, \ 26 + { CXL_PSL9_DSISR_An_OC, "OC" }, \ 27 + { CXL_PSL9_DSISR_An_S, "S" }) 28 + 20 29 #define DSISR_FLAGS \ 21 30 { CXL_PSL_DSISR_An_DS, "DS" }, \ 22 31 { CXL_PSL_DSISR_An_DM, "DM" }, \ ··· 160 151 __entry->afu_irq, 161 152 __entry->virq, 162 153 __entry->hwirq 154 + ) 155 + ); 156 + 157 + TRACE_EVENT(cxl_psl9_irq, 158 + TP_PROTO(struct cxl_context *ctx, int irq, u64 dsisr, u64 dar), 159 + 160 + TP_ARGS(ctx, irq, dsisr, dar), 161 + 162 + TP_STRUCT__entry( 163 + __field(u8, card) 164 + __field(u8, afu) 165 + __field(u16, pe) 166 + __field(int, irq) 167 + __field(u64, dsisr) 168 + __field(u64, dar) 169 + ), 170 + 171 + TP_fast_assign( 172 + __entry->card = ctx->afu->adapter->adapter_num; 173 + __entry->afu = ctx->afu->slice; 174 + __entry->pe = ctx->pe; 175 + __entry->irq = irq; 176 + __entry->dsisr = dsisr; 177 + __entry->dar = dar; 178 + ), 179 + 180 + TP_printk("afu%i.%i pe=%i irq=%i dsisr=0x%016llx dsisr=%s dar=0x%016llx", 181 + __entry->card, 182 + __entry->afu, 183 + __entry->pe, 184 + __entry->irq, 185 + __entry->dsisr, 186 + dsisr_psl9_flags(__entry->dsisr), 187 + __entry->dar 163 188 ) 164 189 ); 165 190
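The new `dsisr_psl9_flags()` macro above relies on ftrace's `__print_flags()` to render the set DSISR bits as a `"TF|AE|..."` string in trace output. A rough user-space analogue of that decoding step (flag values here are illustrative, not the real PSL9 DSISR layout):

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct flag_name { uint64_t mask; const char *name; };

/* Walk a mask/name table and join the names of the set bits with '|',
 * which is essentially what __print_flags() does at trace-read time. */
static void decode_flags(uint64_t val, const struct flag_name *tbl,
                         size_t n, char *out, size_t outsz)
{
    out[0] = '\0';
    for (size_t i = 0; i < n; i++) {
        if (val & tbl[i].mask) {
            if (out[0])
                strncat(out, "|", outsz - strlen(out) - 1);
            strncat(out, tbl[i].name, outsz - strlen(out) - 1);
        }
    }
}

/* Demo table with made-up bit positions. */
static const struct flag_name demo_flags[] = {
    { 1ULL << 0, "TF" },
    { 1ULL << 1, "PE" },
    { 1ULL << 2, "AE" },
};

static const char *demo_decode(uint64_t v)
{
    static char buf[64];
    decode_flags(v, demo_flags, 3, buf, sizeof(buf));
    return buf;
}
```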
+31
drivers/of/base.c
··· 1213 1213 EXPORT_SYMBOL_GPL(of_property_read_u32_index); 1214 1214 1215 1215 /** 1216 + * of_property_read_u64_index - Find and read a u64 from a multi-value property. 1217 + * 1218 + * @np: device node from which the property value is to be read. 1219 + * @propname: name of the property to be searched. 1220 + * @index: index of the u64 in the list of values 1221 + * @out_value: pointer to return value, modified only if no error. 1222 + * 1223 + * Search for a property in a device node and read nth 64-bit value from 1224 + * it. Returns 0 on success, -EINVAL if the property does not exist, 1225 + * -ENODATA if property does not have a value, and -EOVERFLOW if the 1226 + * property data isn't large enough. 1227 + * 1228 + * The out_value is modified only if a valid u64 value can be decoded. 1229 + */ 1230 + int of_property_read_u64_index(const struct device_node *np, 1231 + const char *propname, 1232 + u32 index, u64 *out_value) 1233 + { 1234 + const u64 *val = of_find_property_value_of_size(np, propname, 1235 + ((index + 1) * sizeof(*out_value)), 1236 + 0, NULL); 1237 + 1238 + if (IS_ERR(val)) 1239 + return PTR_ERR(val); 1240 + 1241 + *out_value = be64_to_cpup(((__be64 *)val) + index); 1242 + return 0; 1243 + } 1244 + EXPORT_SYMBOL_GPL(of_property_read_u64_index); 1245 + 1246 + /** 1216 1247 * of_property_read_variable_u8_array - Find and read an array of u8 from a 1217 1248 * property, with bounds on the minimum and maximum array size. 1218 1249 *
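The new `of_property_read_u64_index()` above bounds-checks the property for `(index + 1)` u64 cells and then byte-swaps the nth big-endian cell. A user-space sketch of that core logic, operating on raw property bytes (helper names here are illustrative, only the error code matches the real API):

```c
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

/* Device tree property data is big-endian regardless of host order. */
static uint64_t be64_to_cpu_sketch(const uint8_t *p)
{
    uint64_t v = 0;
    for (int i = 0; i < 8; i++)
        v = (v << 8) | p[i];
    return v;
}

/* Read the nth u64 cell, as of_property_read_u64_index does after it has
 * located the property: size check first, then decode. */
static int read_u64_index(const uint8_t *prop, size_t prop_len,
                          uint32_t index, uint64_t *out)
{
    if (prop_len < (size_t)(index + 1) * sizeof(uint64_t))
        return -75;                    /* -EOVERFLOW, as the real API returns */
    *out = be64_to_cpu_sketch(prop + index * sizeof(uint64_t));
    return 0;
}
```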
+2 -2
drivers/pcmcia/electra_cf.c
··· 207 207 return -ENOMEM; 208 208 209 209 setup_timer(&cf->timer, electra_cf_timer, (unsigned long)cf); 210 - cf->irq = NO_IRQ; 210 + cf->irq = 0; 211 211 212 212 cf->ofdev = ofdev; 213 213 cf->mem_phys = mem.start; ··· 313 313 fail2: 314 314 release_mem_region(cf->mem_phys, cf->mem_size); 315 315 fail1: 316 - if (cf->irq != NO_IRQ) 316 + if (cf->irq) 317 317 free_irq(cf->irq, cf); 318 318 319 319 if (cf->io_virt)
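The electra_cf hunk above drops the legacy `NO_IRQ` macro in favour of plain `0`: in modern kernels a mapped IRQ number is never 0, so "no irq" checks reduce to simple truth tests. A toy cleanup path in that style (the struct and stub are illustrative stand-ins, not the driver's real types):

```c
#include <assert.h>

struct cf_dev { int irq; };      /* stand-in for struct electra_cf_socket */

static int freed;

static void free_irq_stub(int irq) { freed = irq; }

/* Error-path teardown: only release the IRQ if one was ever requested. */
static void cleanup(struct cf_dev *cf)
{
    if (cf->irq)                 /* was: if (cf->irq != NO_IRQ) */
        free_irq_stub(cf->irq);
}
```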
+1 -1
drivers/vfio/vfio_iommu_spapr_tce.c
··· 685 685 unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; 686 686 687 687 tce_iommu_userspace_view_free(tbl, container->mm); 688 - tbl->it_ops->free(tbl); 688 + iommu_tce_table_put(tbl); 689 689 decrement_locked_vm(container->mm, pages); 690 690 } 691 691
+1
include/linux/kprobes.h
··· 381 381 return this_cpu_ptr(&kprobe_ctlblk); 382 382 } 383 383 384 + kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset); 384 385 int register_kprobe(struct kprobe *p); 385 386 void unregister_kprobe(struct kprobe *p); 386 387 int register_kprobes(struct kprobe **kps, int num);
+3
include/linux/of.h
··· 294 294 extern int of_property_read_u32_index(const struct device_node *np, 295 295 const char *propname, 296 296 u32 index, u32 *out_value); 297 + extern int of_property_read_u64_index(const struct device_node *np, 298 + const char *propname, 299 + u32 index, u64 *out_value); 297 300 extern int of_property_read_variable_u8_array(const struct device_node *np, 298 301 const char *propname, u8 *out_values, 299 302 size_t sz_min, size_t sz_max);
+16
include/uapi/linux/perf_event.h
··· 922 922 #define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ 923 923 #define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ 924 924 925 + #if defined(__LITTLE_ENDIAN_BITFIELD) 925 926 union perf_mem_data_src { 926 927 __u64 val; 927 928 struct { ··· 934 933 mem_rsvd:31; 935 934 }; 936 935 }; 936 + #elif defined(__BIG_ENDIAN_BITFIELD) 937 + union perf_mem_data_src { 938 + __u64 val; 939 + struct { 940 + __u64 mem_rsvd:31, 941 + mem_dtlb:7, /* tlb access */ 942 + mem_lock:2, /* lock instr */ 943 + mem_snoop:5, /* snoop mode */ 944 + mem_lvl:14, /* memory hierarchy level */ 945 + mem_op:5; /* type of opcode */ 946 + }; 947 + }; 948 + #else 949 + #error "Unknown endianness" 950 + #endif 937 951 938 952 /* type of opcode (load/store/prefetch,code) */ 939 953 #define PERF_MEM_OP_NA 0x01 /* not available */
+18 -14
kernel/kprobes.c
···
#define KPROBE_TABLE_SIZE	(1 << KPROBE_HASH_BITS)


-/*
- * Some oddball architectures like 64bit powerpc have function descriptors
- * so this must be overridable.
- */
-#ifndef kprobe_lookup_name
-#define kprobe_lookup_name(name, addr) \
-	addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
-#endif
-
static int kprobes_initialized;
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
···
static struct {
	raw_spinlock_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+
+kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
+					    unsigned int __unused)
+{
+	return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
+}

static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
{
···
	arch_remove_optimized_kprobe(op);
}

+static inline
+void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
+{
+	if (!kprobe_ftrace(p))
+		arch_prepare_optimized_kprobe(op, p);
+}
+
/* Try to prepare optimized instructions */
static void prepare_optimized_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

	op = container_of(p, struct optimized_kprobe, kp);
-	arch_prepare_optimized_kprobe(op, p);
+	__prepare_optimized_kprobe(op, p);
}

/* Allocate new optimized_kprobe and try to prepare optimized instructions */
···
	INIT_LIST_HEAD(&op->list);
	op->kp.addr = p->addr;
-	arch_prepare_optimized_kprobe(op, p);
+	__prepare_optimized_kprobe(op, p);

	return &op->kp;
}
···
		goto invalid;

	if (symbol_name) {
-		kprobe_lookup_name(symbol_name, addr);
+		addr = kprobe_lookup_name(symbol_name, offset);
		if (!addr)
			return ERR_PTR(-ENOENT);
	}
···
	if (kretprobe_blacklist_size) {
		/* lookup the function address from its name */
		for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
-			kprobe_lookup_name(kretprobe_blacklist[i].name,
-					   kretprobe_blacklist[i].addr);
+			kretprobe_blacklist[i].addr =
+				kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
			if (!kretprobe_blacklist[i].addr)
				printk("kretprobe: lookup failed: %s\n",
				       kretprobe_blacklist[i].name);
+16
tools/include/uapi/linux/perf_event.h
···
#define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
#define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */

+#if defined(__LITTLE_ENDIAN_BITFIELD)
union perf_mem_data_src {
	__u64 val;
	struct {
···
			mem_rsvd:31;
	};
};
+#elif defined(__BIG_ENDIAN_BITFIELD)
+union perf_mem_data_src {
+	__u64 val;
+	struct {
+		__u64	mem_rsvd:31,
+			mem_dtlb:7,	/* tlb access */
+			mem_lock:2,	/* lock instr */
+			mem_snoop:5,	/* snoop mode */
+			mem_lvl:14,	/* memory hierarchy level */
+			mem_op:5;	/* type of opcode */
+	};
+};
+#else
+#error "Unknown endianness"
+#endif

/* type of opcode (load/store/prefetch,code) */
#define PERF_MEM_OP_NA		0x01 /* not available */
+1
tools/testing/selftests/powerpc/Makefile
···

SUB_DIRS = alignment		\
	   benchmarks		\
+	   cache_shape		\
	   copyloops		\
	   context_switch	\
	   dscr			\
+1
tools/testing/selftests/powerpc/cache_shape/.gitignore
···
+cache_shape
+10
tools/testing/selftests/powerpc/cache_shape/Makefile
···
+TEST_PROGS := cache_shape
+
+all: $(TEST_PROGS)
+
+$(TEST_PROGS): ../harness.c ../utils.c
+
+include ../../lib.mk
+
+clean:
+	rm -f $(TEST_PROGS) *.o
+125
tools/testing/selftests/powerpc/cache_shape/cache_shape.c
···
+/*
+ * Copyright 2017, Michael Ellerman, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <link.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+#ifndef AT_L1I_CACHESIZE
+#define AT_L1I_CACHESIZE	40
+#define AT_L1I_CACHEGEOMETRY	41
+#define AT_L1D_CACHESIZE	42
+#define AT_L1D_CACHEGEOMETRY	43
+#define AT_L2_CACHESIZE		44
+#define AT_L2_CACHEGEOMETRY	45
+#define AT_L3_CACHESIZE		46
+#define AT_L3_CACHEGEOMETRY	47
+#endif
+
+static void print_size(const char *label, uint32_t val)
+{
+	printf("%s cache size: %#10x %10dB %10dK\n", label, val, val, val / 1024);
+}
+
+static void print_geo(const char *label, uint32_t val)
+{
+	uint16_t assoc;
+
+	printf("%s line size: %#10x ", label, val & 0xFFFF);
+
+	assoc = val >> 16;
+	if (assoc)
+		printf("%u-way", assoc);
+	else
+		printf("fully");
+
+	printf(" associative\n");
+}
+
+static int test_cache_shape()
+{
+	static char buffer[4096];
+	ElfW(auxv_t) *p;
+	int found;
+
+	FAIL_IF(read_auxv(buffer, sizeof(buffer)));
+
+	found = 0;
+
+	p = find_auxv_entry(AT_L1I_CACHESIZE, buffer);
+	if (p) {
+		found++;
+		print_size("L1I ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L1I_CACHEGEOMETRY, buffer);
+	if (p) {
+		found++;
+		print_geo("L1I ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L1D_CACHESIZE, buffer);
+	if (p) {
+		found++;
+		print_size("L1D ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L1D_CACHEGEOMETRY, buffer);
+	if (p) {
+		found++;
+		print_geo("L1D ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L2_CACHESIZE, buffer);
+	if (p) {
+		found++;
+		print_size("L2 ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L2_CACHEGEOMETRY, buffer);
+	if (p) {
+		found++;
+		print_geo("L2 ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L3_CACHESIZE, buffer);
+	if (p) {
+		found++;
+		print_size("L3 ", (uint32_t)p->a_un.a_val);
+	}
+
+	p = find_auxv_entry(AT_L3_CACHEGEOMETRY, buffer);
+	if (p) {
+		found++;
+		print_geo("L3 ", (uint32_t)p->a_un.a_val);
+	}
+
+	/* If we found none we're probably on a system where they don't exist */
+	SKIP_IF(found == 0);
+
+	/* But if we found any, we expect to find them all */
+	FAIL_IF(found != 8);
+
+	return 0;
+}
+
+int main(void)
+{
+	return test_harness(test_cache_shape, "cache_shape");
+}
+5 -1
tools/testing/selftests/powerpc/include/utils.h
···

void test_harness_set_timeout(uint64_t time);
int test_harness(int (test_function)(void), char *name);
-extern void *get_auxv_entry(int type);
+
+int read_auxv(char *buf, ssize_t buf_size);
+void *find_auxv_entry(int type, char *auxv);
+void *get_auxv_entry(int type);
+
int pick_online_cpu(void);

static inline bool have_hwcap(unsigned long ftr)
+36 -17
tools/testing/selftests/powerpc/utils.c
···

static char auxv[4096];

-void *get_auxv_entry(int type)
+int read_auxv(char *buf, ssize_t buf_size)
{
-	ElfW(auxv_t) *p;
-	void *result;
	ssize_t num;
-	int fd;
+	int rc, fd;

	fd = open("/proc/self/auxv", O_RDONLY);
	if (fd == -1) {
		perror("open");
-		return NULL;
+		return -errno;
	}

-	result = NULL;
-
-	num = read(fd, auxv, sizeof(auxv));
+	num = read(fd, buf, buf_size);
	if (num < 0) {
		perror("read");
+		rc = -EIO;
		goto out;
	}

-	if (num > sizeof(auxv)) {
-		printf("Overflowed auxv buffer\n");
+	if (num > buf_size) {
+		printf("overflowed auxv buffer\n");
+		rc = -EOVERFLOW;
		goto out;
	}
+
+	rc = 0;
+out:
+	close(fd);
+	return rc;
+}
+
+void *find_auxv_entry(int type, char *auxv)
+{
+	ElfW(auxv_t) *p;

	p = (ElfW(auxv_t) *)auxv;

	while (p->a_type != AT_NULL) {
-		if (p->a_type == type) {
-			result = (void *)p->a_un.a_val;
-			break;
-		}
+		if (p->a_type == type)
+			return p;

		p++;
	}
-out:
-	close(fd);
-	return result;
+
+	return NULL;
+}
+
+void *get_auxv_entry(int type)
+{
+	ElfW(auxv_t) *p;
+
+	if (read_auxv(auxv, sizeof(auxv)))
+		return NULL;
+
+	p = find_auxv_entry(type, auxv);
+	if (p)
+		return (void *)p->a_un.a_val;
+
+	return NULL;
}

int pick_online_cpu(void)