Merge tag 'hyperv-next-signed' of git://git.kernel.org/pub/scm/linux/kernel/git/hyperv/linux

+1

MAINTAINERS

··· 7924 7924 F: drivers/scsi/storvsc_drv.c 7925 7925 F: drivers/uio/uio_hv_generic.c 7926 7926 F: drivers/video/fbdev/hyperv_fb.c 7927 + F: include/asm-generic/hyperv-tlfs.h 7927 7928 F: include/asm-generic/mshyperv.h 7928 7929 F: include/clocksource/hyperv_timer.h 7929 7930 F: include/linux/hyperv.h

+36 -436

arch/x86/include/asm/hyperv-tlfs.h

··· 11 11 12 12 #include <linux/types.h> 13 13 #include <asm/page.h> 14 - 15 - /* 16 - * While not explicitly listed in the TLFS, Hyper-V always runs with a page size 17 - * of 4096. These definitions are used when communicating with Hyper-V using 18 - * guest physical pages and guest physical page addresses, since the guest page 19 - * size may not be 4096 on all architectures. 20 - */ 21 - #define HV_HYP_PAGE_SHIFT 12 22 - #define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) 23 - #define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) 24 - 25 14 /* 26 15 * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent 27 16 * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). ··· 28 39 #define HYPERV_CPUID_MAX 0x4000ffff 29 40 30 41 /* 31 - * Feature identification. EAX indicates which features are available 32 - * to the partition based upon the current partition privileges. 33 - * These are HYPERV_CPUID_FEATURES.EAX bits. 42 + * Aliases for Group A features that have X64 in the name. 43 + * On x86/x64 these are HYPERV_CPUID_FEATURES.EAX bits. 
34 44 */ 35 45 36 - /* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ 37 - #define HV_X64_MSR_VP_RUNTIME_AVAILABLE BIT(0) 38 - /* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ 39 - #define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) 40 - /* 41 - * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM 42 - * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available 43 - */ 44 - #define HV_X64_MSR_SYNIC_AVAILABLE BIT(2) 45 - /* 46 - * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through 47 - * HV_X64_MSR_STIMER3_COUNT) available 48 - */ 49 - #define HV_MSR_SYNTIMER_AVAILABLE BIT(3) 50 - /* 51 - * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) 52 - * are available 53 - */ 54 - #define HV_X64_MSR_APIC_ACCESS_AVAILABLE BIT(4) 55 - /* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ 56 - #define HV_X64_MSR_HYPERCALL_AVAILABLE BIT(5) 57 - /* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ 58 - #define HV_X64_MSR_VP_INDEX_AVAILABLE BIT(6) 59 - /* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ 60 - #define HV_X64_MSR_RESET_AVAILABLE BIT(7) 61 - /* 62 - * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, 63 - * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, 64 - * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available 65 - */ 66 - #define HV_X64_MSR_STAT_PAGES_AVAILABLE BIT(8) 67 - /* Partition reference TSC MSR is available */ 68 - #define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) 69 - /* Partition Guest IDLE MSR is available */ 70 - #define HV_X64_MSR_GUEST_IDLE_AVAILABLE BIT(10) 71 - /* 72 - * There is a single feature flag that signifies if the partition has access 73 - * to MSRs with local APIC and TSC frequencies. 
74 - */ 75 - #define HV_X64_ACCESS_FREQUENCY_MSRS BIT(11) 76 - /* AccessReenlightenmentControls privilege */ 77 - #define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) 78 - /* AccessTscInvariantControls privilege */ 79 - #define HV_X64_ACCESS_TSC_INVARIANT BIT(15) 46 + #define HV_X64_MSR_VP_RUNTIME_AVAILABLE \ 47 + HV_MSR_VP_RUNTIME_AVAILABLE 48 + #define HV_X64_MSR_SYNIC_AVAILABLE \ 49 + HV_MSR_SYNIC_AVAILABLE 50 + #define HV_X64_MSR_APIC_ACCESS_AVAILABLE \ 51 + HV_MSR_APIC_ACCESS_AVAILABLE 52 + #define HV_X64_MSR_HYPERCALL_AVAILABLE \ 53 + HV_MSR_HYPERCALL_AVAILABLE 54 + #define HV_X64_MSR_VP_INDEX_AVAILABLE \ 55 + HV_MSR_VP_INDEX_AVAILABLE 56 + #define HV_X64_MSR_RESET_AVAILABLE \ 57 + HV_MSR_RESET_AVAILABLE 58 + #define HV_X64_MSR_GUEST_IDLE_AVAILABLE \ 59 + HV_MSR_GUEST_IDLE_AVAILABLE 60 + #define HV_X64_ACCESS_FREQUENCY_MSRS \ 61 + HV_ACCESS_FREQUENCY_MSRS 62 + #define HV_X64_ACCESS_REENLIGHTENMENT \ 63 + HV_ACCESS_REENLIGHTENMENT 64 + #define HV_X64_ACCESS_TSC_INVARIANT \ 65 + HV_ACCESS_TSC_INVARIANT 80 66 81 67 /* 82 - * Feature identification: indicates which flags were specified at partition 83 - * creation. The format is the same as the partition creation flag structure 84 - * defined in section Partition Creation Flags. 85 - * These are HYPERV_CPUID_FEATURES.EBX bits. 68 + * Aliases for Group B features that have X64 in the name. 69 + * On x86/x64 these are HYPERV_CPUID_FEATURES.EBX bits. 
86 70 */ 87 - #define HV_X64_CREATE_PARTITIONS BIT(0) 88 - #define HV_X64_ACCESS_PARTITION_ID BIT(1) 89 - #define HV_X64_ACCESS_MEMORY_POOL BIT(2) 90 - #define HV_X64_ADJUST_MESSAGE_BUFFERS BIT(3) 91 - #define HV_X64_POST_MESSAGES BIT(4) 92 - #define HV_X64_SIGNAL_EVENTS BIT(5) 93 - #define HV_X64_CREATE_PORT BIT(6) 94 - #define HV_X64_CONNECT_PORT BIT(7) 95 - #define HV_X64_ACCESS_STATS BIT(8) 96 - #define HV_X64_DEBUGGING BIT(11) 97 - #define HV_X64_CPU_POWER_MANAGEMENT BIT(12) 71 + #define HV_X64_POST_MESSAGES HV_POST_MESSAGES 72 + #define HV_X64_SIGNAL_EVENTS HV_SIGNAL_EVENTS 98 73 99 74 /* 100 - * Feature identification. EDX indicates which miscellaneous features 101 - * are available to the partition. 102 - * These are HYPERV_CPUID_FEATURES.EDX bits. 75 + * Group D Features. The bit assignments are custom to each architecture. 76 + * On x86/x64 these are HYPERV_CPUID_FEATURES.EDX bits. 103 77 */ 104 78 /* The MWAIT instruction is available (per section MONITOR / MWAIT) */ 105 79 #define HV_X64_MWAIT_AVAILABLE BIT(0) ··· 139 187 * processor, except for virtual processors that are reported as sibling SMT 140 188 * threads. 141 189 */ 142 - #define HV_X64_NO_NONARCH_CORESHARING BIT(18) 190 + #define HV_X64_NO_NONARCH_CORESHARING BIT(18) 143 191 144 192 /* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ 145 193 #define HV_X64_NESTED_DIRECT_FLUSH BIT(17) ··· 247 295 } __packed; 248 296 }; 249 297 250 - /* 251 - * TSC page layout. 252 - */ 253 - struct ms_hyperv_tsc_page { 254 - volatile u32 tsc_sequence; 255 - u32 reserved1; 256 - volatile u64 tsc_scale; 257 - volatile s64 tsc_offset; 258 - u64 reserved2[509]; 259 - } __packed; 260 - 261 - /* 262 - * The guest OS needs to register the guest ID with the hypervisor. 
263 - * The guest ID is a 64 bit entity and the structure of this ID is 264 - * specified in the Hyper-V specification: 265 - * 266 - * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx 267 - * 268 - * While the current guideline does not specify how Linux guest ID(s) 269 - * need to be generated, our plan is to publish the guidelines for 270 - * Linux and other guest operating systems that currently are hosted 271 - * on Hyper-V. The implementation here conforms to this yet 272 - * unpublished guidelines. 273 - * 274 - * 275 - * Bit(s) 276 - * 63 - Indicates if the OS is Open Source or not; 1 is Open Source 277 - * 62:56 - Os Type; Linux is 0x100 278 - * 55:48 - Distro specific identification 279 - * 47:16 - Linux kernel version number 280 - * 15:0 - Distro specific identification 281 - * 282 - * 283 - */ 284 - 285 - #define HV_LINUX_VENDOR_ID 0x8100 286 - 287 298 struct hv_reenlightenment_control { 288 299 __u64 vector:8; 289 300 __u64 reserved1:8; ··· 270 355 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ 271 356 (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) 272 357 273 - /* 274 - * Crash notification (HV_X64_MSR_CRASH_CTL) flags. 275 - */ 276 - #define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) 277 - #define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) 278 358 #define HV_X64_MSR_CRASH_PARAMS \ 279 359 (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) 280 360 281 361 #define HV_IPI_LOW_VECTOR 0x10 282 362 #define HV_IPI_HIGH_VECTOR 0xff 283 - 284 - /* Declare the various hypercall operations. 
*/ 285 - #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 286 - #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 287 - #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 288 - #define HVCALL_SEND_IPI 0x000b 289 - #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 290 - #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 291 - #define HVCALL_SEND_IPI_EX 0x0015 292 - #define HVCALL_POST_MESSAGE 0x005c 293 - #define HVCALL_SIGNAL_EVENT 0x005d 294 - #define HVCALL_RETARGET_INTERRUPT 0x007e 295 - #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af 296 - #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 297 363 298 364 #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 299 365 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 ··· 287 391 #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 288 392 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 289 393 290 - #define HV_PROCESSOR_POWER_STATE_C0 0 291 - #define HV_PROCESSOR_POWER_STATE_C1 1 292 - #define HV_PROCESSOR_POWER_STATE_C2 2 293 - #define HV_PROCESSOR_POWER_STATE_C3 3 294 - 295 - #define HV_FLUSH_ALL_PROCESSORS BIT(0) 296 - #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) 297 - #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) 298 - #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) 299 - 300 - enum HV_GENERIC_SET_FORMAT { 301 - HV_GENERIC_SET_SPARSE_4K, 302 - HV_GENERIC_SET_ALL, 303 - }; 304 - 305 - #define HV_PARTITION_ID_SELF ((u64)-1) 306 - 307 - #define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) 308 - #define HV_HYPERCALL_FAST_BIT BIT(16) 309 - #define HV_HYPERCALL_VARHEAD_OFFSET 17 310 - #define HV_HYPERCALL_REP_COMP_OFFSET 32 311 - #define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) 312 - #define HV_HYPERCALL_REP_START_OFFSET 48 313 - #define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) 314 - 315 - /* hypercall status code */ 316 - #define HV_STATUS_SUCCESS 0 317 - #define HV_STATUS_INVALID_HYPERCALL_CODE 2 318 - #define HV_STATUS_INVALID_HYPERCALL_INPUT 3 319 - #define 
HV_STATUS_INVALID_ALIGNMENT 4 320 - #define HV_STATUS_INVALID_PARAMETER 5 321 - #define HV_STATUS_INSUFFICIENT_MEMORY 11 322 - #define HV_STATUS_INVALID_PORT_ID 17 323 - #define HV_STATUS_INVALID_CONNECTION_ID 18 324 - #define HV_STATUS_INSUFFICIENT_BUFFERS 19 325 - 326 - /* 327 - * The Hyper-V TimeRefCount register and the TSC 328 - * page provide a guest VM clock with 100ns tick rate 329 - */ 330 - #define HV_CLOCK_HZ (NSEC_PER_SEC/100) 331 - 332 - typedef struct _HV_REFERENCE_TSC_PAGE { 333 - __u32 tsc_sequence; 334 - __u32 res1; 335 - __u64 tsc_scale; 336 - __s64 tsc_offset; 337 - } __packed HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; 338 - 339 - /* Define the number of synthetic interrupt sources. */ 340 - #define HV_SYNIC_SINT_COUNT (16) 341 - /* Define the expected SynIC version. */ 342 - #define HV_SYNIC_VERSION_1 (0x1) 343 - /* Valid SynIC vectors are 16-255. */ 344 - #define HV_SYNIC_FIRST_VALID_VECTOR (16) 345 - 346 - #define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) 347 - #define HV_SYNIC_SIMP_ENABLE (1ULL << 0) 348 - #define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) 349 - #define HV_SYNIC_SINT_MASKED (1ULL << 16) 350 - #define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) 351 - #define HV_SYNIC_SINT_VECTOR_MASK (0xFF) 352 - 353 - #define HV_SYNIC_STIMER_COUNT (4) 354 - 355 - /* Define synthetic interrupt controller message constants. */ 356 - #define HV_MESSAGE_SIZE (256) 357 - #define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) 358 - #define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) 359 394 360 395 /* Define hypervisor message types. */ 361 396 enum hv_message_type { ··· 297 470 HVMSG_GPA_INTERCEPT = 0x80000001, 298 471 299 472 /* Timer notification messages. */ 300 - HVMSG_TIMER_EXPIRED = 0x80000010, 473 + HVMSG_TIMER_EXPIRED = 0x80000010, 301 474 302 475 /* Error messages. 
*/ 303 476 HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020, 304 477 HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021, 305 - HVMSG_UNSUPPORTED_FEATURE = 0x80000022, 478 + HVMSG_UNSUPPORTED_FEATURE = 0x80000022, 306 479 307 480 /* Trace buffer complete messages. */ 308 481 HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040, 309 482 310 483 /* Platform-specific processor intercept messages. */ 311 - HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, 484 + HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, 312 485 HVMSG_X64_MSR_INTERCEPT = 0x80010001, 313 - HVMSG_X64_CPUID_INTERCEPT = 0x80010002, 486 + HVMSG_X64_CPUID_INTERCEPT = 0x80010002, 314 487 HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003, 315 - HVMSG_X64_APIC_EOI = 0x80010004, 316 - HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 488 + HVMSG_X64_APIC_EOI = 0x80010004, 489 + HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 317 490 }; 318 - 319 - /* Define synthetic interrupt controller message flags. */ 320 - union hv_message_flags { 321 - __u8 asu8; 322 - struct { 323 - __u8 msg_pending:1; 324 - __u8 reserved:7; 325 - } __packed; 326 - }; 327 - 328 - /* Define port identifier type. */ 329 - union hv_port_id { 330 - __u32 asu32; 331 - struct { 332 - __u32 id:24; 333 - __u32 reserved:8; 334 - } __packed u; 335 - }; 336 - 337 - /* Define synthetic interrupt controller message header. */ 338 - struct hv_message_header { 339 - __u32 message_type; 340 - __u8 payload_size; 341 - union hv_message_flags message_flags; 342 - __u8 reserved[2]; 343 - union { 344 - __u64 sender; 345 - union hv_port_id port; 346 - }; 347 - } __packed; 348 - 349 - /* Define synthetic interrupt controller message format. */ 350 - struct hv_message { 351 - struct hv_message_header header; 352 - union { 353 - __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; 354 - } u; 355 - } __packed; 356 - 357 - /* Define the synthetic interrupt message page layout. 
*/ 358 - struct hv_message_page { 359 - struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; 360 - } __packed; 361 - 362 - /* Define timer message payload structure. */ 363 - struct hv_timer_message_payload { 364 - __u32 timer_index; 365 - __u32 reserved; 366 - __u64 expiration_time; /* When the timer expired */ 367 - __u64 delivery_time; /* When the message was delivered */ 368 - } __packed; 369 491 370 492 struct hv_nested_enlightenments_control { 371 493 struct { ··· 543 767 544 768 #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF 545 769 546 - /* Define synthetic interrupt controller flag constants. */ 547 - #define HV_EVENT_FLAGS_COUNT (256 * 8) 548 - #define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) 549 - 550 - /* 551 - * Synthetic timer configuration. 552 - */ 553 - union hv_stimer_config { 554 - u64 as_uint64; 555 - struct { 556 - u64 enable:1; 557 - u64 periodic:1; 558 - u64 lazy:1; 559 - u64 auto_enable:1; 560 - u64 apic_vector:8; 561 - u64 direct_mode:1; 562 - u64 reserved_z0:3; 563 - u64 sintx:4; 564 - u64 reserved_z1:44; 565 - } __packed; 566 - }; 567 - 568 - 569 - /* Define the synthetic interrupt controller event flags format. */ 570 - union hv_synic_event_flags { 571 - unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; 572 - }; 573 - 574 - /* Define SynIC control register. */ 575 - union hv_synic_scontrol { 576 - u64 as_uint64; 577 - struct { 578 - u64 enable:1; 579 - u64 reserved:63; 580 - } __packed; 581 - }; 582 - 583 - /* Define synthetic interrupt source. 
*/ 584 - union hv_synic_sint { 585 - u64 as_uint64; 586 - struct { 587 - u64 vector:8; 588 - u64 reserved1:8; 589 - u64 masked:1; 590 - u64 auto_eoi:1; 591 - u64 polling:1; 592 - u64 reserved2:45; 593 - } __packed; 594 - }; 595 - 596 - /* Define the format of the SIMP register */ 597 - union hv_synic_simp { 598 - u64 as_uint64; 599 - struct { 600 - u64 simp_enabled:1; 601 - u64 preserved:11; 602 - u64 base_simp_gpa:52; 603 - } __packed; 604 - }; 605 - 606 - /* Define the format of the SIEFP register */ 607 - union hv_synic_siefp { 608 - u64 as_uint64; 609 - struct { 610 - u64 siefp_enabled:1; 611 - u64 preserved:11; 612 - u64 base_siefp_gpa:52; 613 - } __packed; 614 - }; 615 - 616 - struct hv_vpset { 617 - u64 format; 618 - u64 valid_bank_mask; 619 - u64 bank_contents[]; 620 - } __packed; 621 - 622 - /* HvCallSendSyntheticClusterIpi hypercall */ 623 - struct hv_send_ipi { 624 - u32 vector; 625 - u32 reserved; 626 - u64 cpu_mask; 627 - } __packed; 628 - 629 - /* HvCallSendSyntheticClusterIpiEx hypercall */ 630 - struct hv_send_ipi_ex { 631 - u32 vector; 632 - u32 reserved; 633 - struct hv_vpset vp_set; 634 - } __packed; 635 - 636 - /* HvFlushGuestPhysicalAddressSpace hypercalls */ 637 - struct hv_guest_mapping_flush { 638 - u64 address_space; 639 - u64 flags; 640 - } __packed; 641 - 642 - /* 643 - * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited 644 - * by the bitwidth of "additional_pages" in union hv_gpa_page_range. 645 - */ 646 - #define HV_MAX_FLUSH_PAGES (2048) 647 - 648 - /* HvFlushGuestPhysicalAddressList hypercall */ 649 - union hv_gpa_page_range { 650 - u64 address_space; 651 - struct { 652 - u64 additional_pages:11; 653 - u64 largepage:1; 654 - u64 basepfn:52; 655 - } page; 656 - }; 657 - 658 - /* 659 - * All input flush parameters should be in single page. The max flush 660 - * count is equal with how many entries of union hv_gpa_page_range can 661 - * be populated into the input parameter page. 
662 - */ 663 - #define HV_MAX_FLUSH_REP_COUNT ((HV_HYP_PAGE_SIZE - 2 * sizeof(u64)) / \ 664 - sizeof(union hv_gpa_page_range)) 665 - 666 - struct hv_guest_mapping_flush_list { 667 - u64 address_space; 668 - u64 flags; 669 - union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; 670 - }; 671 - 672 - /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ 673 - struct hv_tlb_flush { 674 - u64 address_space; 675 - u64 flags; 676 - u64 processor_mask; 677 - u64 gva_list[]; 678 - } __packed; 679 - 680 - /* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ 681 - struct hv_tlb_flush_ex { 682 - u64 address_space; 683 - u64 flags; 684 - struct hv_vpset hv_vp_set; 685 - u64 gva_list[]; 686 - } __packed; 687 - 688 770 struct hv_partition_assist_pg { 689 771 u32 tlb_lock_count; 690 772 }; 691 773 692 - union hv_msi_entry { 693 - u64 as_uint64; 694 - struct { 695 - u32 address; 696 - u32 data; 697 - } __packed; 698 - }; 699 774 700 - struct hv_interrupt_entry { 701 - u32 source; /* 1 for MSI(-X) */ 702 - u32 reserved1; 703 - union hv_msi_entry msi_entry; 704 - } __packed; 775 + #include <asm-generic/hyperv-tlfs.h> 705 776 706 - /* 707 - * flags for hv_device_interrupt_target.flags 708 - */ 709 - #define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 710 - #define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 711 - 712 - struct hv_device_interrupt_target { 713 - u32 vector; 714 - u32 flags; 715 - union { 716 - u64 vp_mask; 717 - struct hv_vpset vp_set; 718 - }; 719 - } __packed; 720 - 721 - /* HvRetargetDeviceInterrupt hypercall */ 722 - struct hv_retarget_device_interrupt { 723 - u64 partition_id; /* use "self" */ 724 - u64 device_id; 725 - struct hv_interrupt_entry int_entry; 726 - u64 reserved2; 727 - struct hv_device_interrupt_target int_target; 728 - } __packed __aligned(8); 729 777 #endif

+1 -1

arch/x86/include/asm/kvm_host.h

··· 866 866 u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; 867 867 u64 hv_crash_ctl; 868 868 869 - HV_REFERENCE_TSC_PAGE tsc_ref; 869 + struct ms_hyperv_tsc_page tsc_ref; 870 870 871 871 struct idr conn_to_evt; 872 872

+2 -2

arch/x86/kvm/hyperv.c

··· 900 900 * These two equivalencies are implemented in this function. 901 901 */ 902 902 static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, 903 - HV_REFERENCE_TSC_PAGE *tsc_ref) 903 + struct ms_hyperv_tsc_page *tsc_ref) 904 904 { 905 905 u64 max_mul; 906 906 ··· 941 941 u64 gfn; 942 942 943 943 BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); 944 - BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0); 944 + BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); 945 945 946 946 if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) 947 947 return;

+41 -17

drivers/hv/channel.c

··· 290 290 EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request); 291 291 292 292 /* 293 + * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt. 294 + * 295 + * CHANNELMSG_MODIFYCHANNEL messages are asynchronous. Also, Hyper-V does not 296 + * ACK such messages. IOW we can't know when the host will stop interrupting 297 + * the "old" vCPU and start interrupting the "new" vCPU for the given channel. 298 + * 299 + * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version 300 + * VERSION_WIN10_V4_1. 301 + */ 302 + int vmbus_send_modifychannel(u32 child_relid, u32 target_vp) 303 + { 304 + struct vmbus_channel_modifychannel conn_msg; 305 + int ret; 306 + 307 + memset(&conn_msg, 0, sizeof(conn_msg)); 308 + conn_msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL; 309 + conn_msg.child_relid = child_relid; 310 + conn_msg.target_vp = target_vp; 311 + 312 + ret = vmbus_post_msg(&conn_msg, sizeof(conn_msg), true); 313 + 314 + trace_vmbus_send_modifychannel(&conn_msg, ret); 315 + 316 + return ret; 317 + } 318 + EXPORT_SYMBOL_GPL(vmbus_send_modifychannel); 319 + 320 + /* 293 321 * create_gpadl_header - Creates a gpadl for the specified buffer 294 322 */ 295 323 static int create_gpadl_header(void *kbuffer, u32 size, ··· 622 594 } 623 595 EXPORT_SYMBOL_GPL(vmbus_teardown_gpadl); 624 596 625 - static void reset_channel_cb(void *arg) 626 - { 627 - struct vmbus_channel *channel = arg; 628 - 629 - channel->onchannel_callback = NULL; 630 - } 631 - 632 597 void vmbus_reset_channel_cb(struct vmbus_channel *channel) 633 598 { 599 + unsigned long flags; 600 + 634 601 /* 635 602 * vmbus_on_event(), running in the per-channel tasklet, can race 636 603 * with vmbus_close_internal() in the case of SMP guest, e.g., when 637 604 * the former is accessing channel->inbound.ring_buffer, the latter 638 605 * could be freeing the ring_buffer pages, so here we must stop it 639 606 * first. 
607 + * 608 + * vmbus_chan_sched() might call the netvsc driver callback function 609 + * that ends up scheduling NAPI work that accesses the ring buffer. 610 + * At this point, we have to ensure that any such work is completed 611 + * and that the channel ring buffer is no longer being accessed, cf. 612 + * the calls to napi_disable() in netvsc_device_remove(). 640 613 */ 641 614 tasklet_disable(&channel->callback_event); 642 615 643 - channel->sc_creation_callback = NULL; 616 + /* See the inline comments in vmbus_chan_sched(). */ 617 + spin_lock_irqsave(&channel->sched_lock, flags); 618 + channel->onchannel_callback = NULL; 619 + spin_unlock_irqrestore(&channel->sched_lock, flags); 644 620 645 - /* Stop the callback asap */ 646 - if (channel->target_cpu != get_cpu()) { 647 - put_cpu(); 648 - smp_call_function_single(channel->target_cpu, reset_channel_cb, 649 - channel, true); 650 - } else { 651 - reset_channel_cb(channel); 652 - put_cpu(); 653 - } 621 + channel->sc_creation_callback = NULL; 654 622 655 623 /* Re-enable tasklet for use on re-open */ 656 624 tasklet_enable(&channel->callback_event);

+230 -209

drivers/hv/channel_mgmt.c

··· 18 18 #include <linux/module.h> 19 19 #include <linux/completion.h> 20 20 #include <linux/delay.h> 21 + #include <linux/cpu.h> 21 22 #include <linux/hyperv.h> 22 23 #include <asm/mshyperv.h> 23 24 24 25 #include "hyperv_vmbus.h" 25 26 26 - static void init_vp_index(struct vmbus_channel *channel, u16 dev_type); 27 + static void init_vp_index(struct vmbus_channel *channel); 27 28 28 - static const struct vmbus_device vmbus_devs[] = { 29 + const struct vmbus_device vmbus_devs[] = { 29 30 /* IDE */ 30 31 { .dev_type = HV_IDE, 31 32 HV_IDE_GUID, ··· 316 315 if (!channel) 317 316 return NULL; 318 317 318 + spin_lock_init(&channel->sched_lock); 319 319 spin_lock_init(&channel->lock); 320 320 init_completion(&channel->rescind_event); 321 321 322 322 INIT_LIST_HEAD(&channel->sc_list); 323 - INIT_LIST_HEAD(&channel->percpu_list); 324 323 325 324 tasklet_init(&channel->callback_event, 326 325 vmbus_on_event, (unsigned long)channel); ··· 341 340 kobject_put(&channel->kobj); 342 341 } 343 342 344 - static void percpu_channel_enq(void *arg) 343 + void vmbus_channel_map_relid(struct vmbus_channel *channel) 345 344 { 346 - struct vmbus_channel *channel = arg; 347 - struct hv_per_cpu_context *hv_cpu 348 - = this_cpu_ptr(hv_context.cpu_context); 349 - 350 - list_add_tail_rcu(&channel->percpu_list, &hv_cpu->chan_list); 345 + if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS)) 346 + return; 347 + /* 348 + * The mapping of the channel's relid is visible from the CPUs that 349 + * execute vmbus_chan_sched() by the time that vmbus_chan_sched() will 350 + * execute: 351 + * 352 + * (a) In the "normal (i.e., not resuming from hibernation)" path, 353 + * the full barrier in smp_store_mb() guarantees that the store 354 + * is propagated to all CPUs before the add_channel_work work 355 + * is queued. 
In turn, add_channel_work is queued before the 356 + * channel's ring buffer is allocated/initialized and the 357 + * OPENCHANNEL message for the channel is sent in vmbus_open(). 358 + * Hyper-V won't start sending the interrupts for the channel 359 + * before the OPENCHANNEL message is acked. The memory barrier 360 + * in vmbus_chan_sched() -> sync_test_and_clear_bit() ensures 361 + * that vmbus_chan_sched() must find the channel's relid in 362 + * recv_int_page before retrieving the channel pointer from the 363 + * array of channels. 364 + * 365 + * (b) In the "resuming from hibernation" path, the smp_store_mb() 366 + * guarantees that the store is propagated to all CPUs before 367 + * the VMBus connection is marked as ready for the resume event 368 + * (cf. check_ready_for_resume_event()). The interrupt handler 369 + * of the VMBus driver and vmbus_chan_sched() can not run before 370 + * vmbus_bus_resume() has completed execution (cf. resume_noirq). 371 + */ 372 + smp_store_mb( 373 + vmbus_connection.channels[channel->offermsg.child_relid], 374 + channel); 351 375 } 352 376 353 - static void percpu_channel_deq(void *arg) 377 + void vmbus_channel_unmap_relid(struct vmbus_channel *channel) 354 378 { 355 - struct vmbus_channel *channel = arg; 356 - 357 - list_del_rcu(&channel->percpu_list); 379 + if (WARN_ON(channel->offermsg.child_relid >= MAX_CHANNEL_RELIDS)) 380 + return; 381 + WRITE_ONCE( 382 + vmbus_connection.channels[channel->offermsg.child_relid], 383 + NULL); 358 384 } 359 - 360 385 361 386 static void vmbus_release_relid(u32 relid) 362 387 { ··· 400 373 401 374 void hv_process_channel_removal(struct vmbus_channel *channel) 402 375 { 403 - struct vmbus_channel *primary_channel; 404 376 unsigned long flags; 405 377 406 - BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); 378 + lockdep_assert_held(&vmbus_connection.channel_mutex); 407 379 BUG_ON(!channel->rescind); 408 380 409 - if (channel->target_cpu != get_cpu()) { 410 - put_cpu(); 411 - 
smp_call_function_single(channel->target_cpu, 412 - percpu_channel_deq, channel, true); 413 - } else { 414 - percpu_channel_deq(channel); 415 - put_cpu(); 416 - } 381 + /* 382 + * hv_process_channel_removal() could find INVALID_RELID only for 383 + * hv_sock channels. See the inline comments in vmbus_onoffer(). 384 + */ 385 + WARN_ON(channel->offermsg.child_relid == INVALID_RELID && 386 + !is_hvsock_channel(channel)); 387 + 388 + /* 389 + * Upon suspend, an in-use hv_sock channel is removed from the array of 390 + * channels and the relid is invalidated. After hibernation, when the 391 + * user-space application destroys the channel, it's unnecessary and 392 + * unsafe to remove the channel from the array of channels. See also 393 + * the inline comments before the call of vmbus_release_relid() below. 394 + */ 395 + if (channel->offermsg.child_relid != INVALID_RELID) 396 + vmbus_channel_unmap_relid(channel); 417 397 418 398 if (channel->primary_channel == NULL) { 419 399 list_del(&channel->listentry); 420 - 421 - primary_channel = channel; 422 400 } else { 423 - primary_channel = channel->primary_channel; 401 + struct vmbus_channel *primary_channel = channel->primary_channel; 424 402 spin_lock_irqsave(&primary_channel->lock, flags); 425 403 list_del(&channel->sc_list); 426 404 spin_unlock_irqrestore(&primary_channel->lock, flags); 427 405 } 428 406 429 407 /* 430 - * We need to free the bit for init_vp_index() to work in the case 431 - * of sub-channel, when we reload drivers like hv_netvsc. 408 + * If this is a "perf" channel, updates the hv_numa_map[] masks so that 409 + * init_vp_index() can (re-)use the CPU. 
432 410 */ 433 - if (channel->affinity_policy == HV_LOCALIZED) 434 - cpumask_clear_cpu(channel->target_cpu, 435 - &primary_channel->alloced_cpus_in_node); 411 + if (hv_is_perf_channel(channel)) 412 + hv_clear_alloced_cpu(channel->target_cpu); 436 413 437 414 /* 438 415 * Upon suspend, an in-use hv_sock channel is marked as "rescinded" and ··· 471 440 container_of(work, struct vmbus_channel, add_channel_work); 472 441 struct vmbus_channel *primary_channel = newchannel->primary_channel; 473 442 unsigned long flags; 474 - u16 dev_type; 475 443 int ret; 476 - 477 - dev_type = hv_get_dev_type(newchannel); 478 - 479 - init_vp_index(newchannel, dev_type); 480 - 481 - if (newchannel->target_cpu != get_cpu()) { 482 - put_cpu(); 483 - smp_call_function_single(newchannel->target_cpu, 484 - percpu_channel_enq, 485 - newchannel, true); 486 - } else { 487 - percpu_channel_enq(newchannel); 488 - put_cpu(); 489 - } 490 444 491 445 /* 492 446 * This state is used to indicate a successful open ··· 504 488 if (!newchannel->device_obj) 505 489 goto err_deq_chan; 506 490 507 - newchannel->device_obj->device_id = dev_type; 491 + newchannel->device_obj->device_id = newchannel->device_id; 508 492 /* 509 493 * Add the new device to the bus. This will kick off device-driver 510 494 * binding which eventually invokes the device driver's AddDevice() ··· 539 523 spin_unlock_irqrestore(&primary_channel->lock, flags); 540 524 } 541 525 542 - mutex_unlock(&vmbus_connection.channel_mutex); 526 + /* vmbus_process_offer() has mapped the channel. 
*/ 527 + vmbus_channel_unmap_relid(newchannel); 543 528 544 - if (newchannel->target_cpu != get_cpu()) { 545 - put_cpu(); 546 - smp_call_function_single(newchannel->target_cpu, 547 - percpu_channel_deq, 548 - newchannel, true); 549 - } else { 550 - percpu_channel_deq(newchannel); 551 - put_cpu(); 552 - } 529 + mutex_unlock(&vmbus_connection.channel_mutex); 553 530 554 531 vmbus_release_relid(newchannel->offermsg.child_relid); 555 532 ··· 560 551 unsigned long flags; 561 552 bool fnew = true; 562 553 554 + /* 555 + * Synchronize vmbus_process_offer() and CPU hotplugging: 556 + * 557 + * CPU1 CPU2 558 + * 559 + * [vmbus_process_offer()] [Hot removal of the CPU] 560 + * 561 + * CPUS_READ_LOCK CPUS_WRITE_LOCK 562 + * LOAD cpu_online_mask SEARCH chn_list 563 + * STORE target_cpu LOAD target_cpu 564 + * INSERT chn_list STORE cpu_online_mask 565 + * CPUS_READ_UNLOCK CPUS_WRITE_UNLOCK 566 + * 567 + * Forbids: CPU1's LOAD from *not* seeing CPU2's STORE && 568 + * CPU2's SEARCH from *not* seeing CPU1's INSERT 569 + * 570 + * Forbids: CPU2's SEARCH from seeing CPU1's INSERT && 571 + * CPU2's LOAD from *not* seeing CPU1's STORE 572 + */ 573 + cpus_read_lock(); 574 + 575 + /* 576 + * Serializes the modifications of the chn_list list as well as 577 + * the accesses to next_numa_node_id in init_vp_index(). 578 + */ 563 579 mutex_lock(&vmbus_connection.channel_mutex); 580 + 581 + init_vp_index(newchannel); 564 582 565 583 /* Remember the channels that should be cleaned up upon suspend. */ 566 584 if (is_hvsock_channel(newchannel) || is_sub_channel(newchannel)) ··· 635 599 spin_unlock_irqrestore(&channel->lock, flags); 636 600 } 637 601 602 + vmbus_channel_map_relid(newchannel); 603 + 638 604 mutex_unlock(&vmbus_connection.channel_mutex); 605 + cpus_read_unlock(); 639 606 640 607 /* 641 608 * vmbus_process_offer() mustn't call channel->sc_creation_callback() ··· 671 632 * We use this state to statically distribute the channel interrupt load. 
672 633 */ 673 634 static int next_numa_node_id; 674 - /* 675 - * init_vp_index() accesses global variables like next_numa_node_id, and 676 - * it can run concurrently for primary channels and sub-channels: see 677 - * vmbus_process_offer(), so we need the lock to protect the global 678 - * variables. 679 - */ 680 - static DEFINE_SPINLOCK(bind_channel_to_cpu_lock); 681 635 682 636 /* 683 637 * Starting with Win8, we can statically distribute the incoming 684 638 * channel interrupt load by binding a channel to VCPU. 685 - * We distribute the interrupt loads to one or more NUMA nodes based on 686 - * the channel's affinity_policy. 687 639 * 688 640 * For pre-win8 hosts or non-performance critical channels we assign the 689 - * first CPU in the first NUMA node. 641 + * VMBUS_CONNECT_CPU. 642 + * 643 + * Starting with win8, performance critical channels will be distributed 644 + * evenly among all the available NUMA nodes. Once the node is assigned, 645 + * we will assign the CPU based on a simple round robin scheme. 690 646 */ 691 - static void init_vp_index(struct vmbus_channel *channel, u16 dev_type) 647 + static void init_vp_index(struct vmbus_channel *channel) 692 648 { 693 - u32 cur_cpu; 694 - bool perf_chn = vmbus_devs[dev_type].perf_device; 695 - struct vmbus_channel *primary = channel->primary_channel; 696 - int next_node; 649 + bool perf_chn = hv_is_perf_channel(channel); 697 650 cpumask_var_t available_mask; 698 651 struct cpumask *alloced_mask; 652 + u32 target_cpu; 653 + int numa_node; 699 654 700 655 if ((vmbus_proto_version == VERSION_WS2008) || 701 656 (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) || 702 657 !alloc_cpumask_var(&available_mask, GFP_KERNEL)) { 703 658 /* 704 659 * Prior to win8, all channel interrupts are 705 - * delivered on cpu 0. 660 + * delivered on VMBUS_CONNECT_CPU. 706 661 * Also if the channel is not a performance critical 707 - * channel, bind it to cpu 0. 708 - * In case alloc_cpumask_var() fails, bind it to cpu 0. 
662 + * channel, bind it to VMBUS_CONNECT_CPU. 663 + * In case alloc_cpumask_var() fails, bind it to 664 + * VMBUS_CONNECT_CPU. 709 665 */ 710 - channel->numa_node = 0; 711 - channel->target_cpu = 0; 712 - channel->target_vp = hv_cpu_number_to_vp_number(0); 666 + channel->numa_node = cpu_to_node(VMBUS_CONNECT_CPU); 667 + channel->target_cpu = VMBUS_CONNECT_CPU; 668 + channel->target_vp = 669 + hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); 670 + if (perf_chn) 671 + hv_set_alloced_cpu(VMBUS_CONNECT_CPU); 713 672 return; 714 673 } 715 674 716 - spin_lock(&bind_channel_to_cpu_lock); 717 - 718 - /* 719 - * Based on the channel affinity policy, we will assign the NUMA 720 - * nodes. 721 - */ 722 - 723 - if ((channel->affinity_policy == HV_BALANCED) || (!primary)) { 724 - while (true) { 725 - next_node = next_numa_node_id++; 726 - if (next_node == nr_node_ids) { 727 - next_node = next_numa_node_id = 0; 728 - continue; 729 - } 730 - if (cpumask_empty(cpumask_of_node(next_node))) 731 - continue; 732 - break; 675 + while (true) { 676 + numa_node = next_numa_node_id++; 677 + if (numa_node == nr_node_ids) { 678 + next_numa_node_id = 0; 679 + continue; 733 680 } 734 - channel->numa_node = next_node; 735 - primary = channel; 681 + if (cpumask_empty(cpumask_of_node(numa_node))) 682 + continue; 683 + break; 736 684 } 737 - alloced_mask = &hv_context.hv_numa_map[primary->numa_node]; 685 + channel->numa_node = numa_node; 686 + alloced_mask = &hv_context.hv_numa_map[numa_node]; 738 687 739 688 if (cpumask_weight(alloced_mask) == 740 - cpumask_weight(cpumask_of_node(primary->numa_node))) { 689 + cpumask_weight(cpumask_of_node(numa_node))) { 741 690 /* 742 691 * We have cycled through all the CPUs in the node; 743 692 * reset the alloced map. 
··· 733 706 cpumask_clear(alloced_mask); 734 707 } 735 708 736 - cpumask_xor(available_mask, alloced_mask, 737 - cpumask_of_node(primary->numa_node)); 709 + cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node)); 738 710 739 - cur_cpu = -1; 711 + target_cpu = cpumask_first(available_mask); 712 + cpumask_set_cpu(target_cpu, alloced_mask); 740 713 741 - if (primary->affinity_policy == HV_LOCALIZED) { 742 - /* 743 - * Normally Hyper-V host doesn't create more subchannels 744 - * than there are VCPUs on the node but it is possible when not 745 - * all present VCPUs on the node are initialized by guest. 746 - * Clear the alloced_cpus_in_node to start over. 747 - */ 748 - if (cpumask_equal(&primary->alloced_cpus_in_node, 749 - cpumask_of_node(primary->numa_node))) 750 - cpumask_clear(&primary->alloced_cpus_in_node); 751 - } 752 - 753 - while (true) { 754 - cur_cpu = cpumask_next(cur_cpu, available_mask); 755 - if (cur_cpu >= nr_cpu_ids) { 756 - cur_cpu = -1; 757 - cpumask_copy(available_mask, 758 - cpumask_of_node(primary->numa_node)); 759 - continue; 760 - } 761 - 762 - if (primary->affinity_policy == HV_LOCALIZED) { 763 - /* 764 - * NOTE: in the case of sub-channel, we clear the 765 - * sub-channel related bit(s) in 766 - * primary->alloced_cpus_in_node in 767 - * hv_process_channel_removal(), so when we 768 - * reload drivers like hv_netvsc in SMP guest, here 769 - * we're able to re-allocate 770 - * bit from primary->alloced_cpus_in_node. 
771 - */ 772 - if (!cpumask_test_cpu(cur_cpu, 773 - &primary->alloced_cpus_in_node)) { 774 - cpumask_set_cpu(cur_cpu, 775 - &primary->alloced_cpus_in_node); 776 - cpumask_set_cpu(cur_cpu, alloced_mask); 777 - break; 778 - } 779 - } else { 780 - cpumask_set_cpu(cur_cpu, alloced_mask); 781 - break; 782 - } 783 - } 784 - 785 - channel->target_cpu = cur_cpu; 786 - channel->target_vp = hv_cpu_number_to_vp_number(cur_cpu); 787 - 788 - spin_unlock(&bind_channel_to_cpu_lock); 714 + channel->target_cpu = target_cpu; 715 + channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); 789 716 790 717 free_cpumask_var(available_mask); 791 718 } ··· 871 890 sizeof(struct vmbus_channel_offer_channel)); 872 891 channel->monitor_grp = (u8)offer->monitorid / 32; 873 892 channel->monitor_bit = (u8)offer->monitorid % 32; 893 + channel->device_id = hv_get_dev_type(channel); 874 894 } 875 895 876 896 /* ··· 922 940 oldchannel = find_primary_channel_by_offer(offer); 923 941 924 942 if (oldchannel != NULL) { 925 - atomic_dec(&vmbus_connection.offer_in_progress); 926 - 927 943 /* 928 944 * We're resuming from hibernation: all the sub-channel and 929 945 * hv_sock channels we had before the hibernation should have ··· 929 949 * primary channel that we had before the hibernation. 930 950 */ 931 951 952 + /* 953 + * { Initially: channel relid = INVALID_RELID, 954 + * channels[valid_relid] = NULL } 955 + * 956 + * CPU1 CPU2 957 + * 958 + * [vmbus_onoffer()] [vmbus_device_release()] 959 + * 960 + * LOCK channel_mutex LOCK channel_mutex 961 + * STORE channel relid = valid_relid LOAD r1 = channel relid 962 + * MAP_RELID channel if (r1 != INVALID_RELID) 963 + * UNLOCK channel_mutex UNMAP_RELID channel 964 + * UNLOCK channel_mutex 965 + * 966 + * Forbids: r1 == valid_relid && 967 + * channels[valid_relid] == channel 968 + * 969 + * Note. r1 can be INVALID_RELID only for an hv_sock channel. 
970 + * None of the hv_sock channels which were present before the 971 + * suspend are re-offered upon the resume. See the WARN_ON() 972 + * in hv_process_channel_removal(). 973 + */ 974 + mutex_lock(&vmbus_connection.channel_mutex); 975 + 976 + atomic_dec(&vmbus_connection.offer_in_progress); 977 + 932 978 WARN_ON(oldchannel->offermsg.child_relid != INVALID_RELID); 933 979 /* Fix up the relid. */ 934 980 oldchannel->offermsg.child_relid = offer->child_relid; 935 981 936 982 offer_sz = sizeof(*offer); 937 - if (memcmp(offer, &oldchannel->offermsg, offer_sz) == 0) { 938 - check_ready_for_resume_event(); 939 - return; 983 + if (memcmp(offer, &oldchannel->offermsg, offer_sz) != 0) { 984 + /* 985 + * This is not an error, since the host can also change 986 + * the other field(s) of the offer, e.g. on WS RS5 987 + * (Build 17763), the offer->connection_id of the 988 + * Mellanox VF vmbus device can change when the host 989 + * reoffers the device upon resume. 990 + */ 991 + pr_debug("vmbus offer changed: relid=%d\n", 992 + offer->child_relid); 993 + 994 + print_hex_dump_debug("Old vmbus offer: ", 995 + DUMP_PREFIX_OFFSET, 16, 4, 996 + &oldchannel->offermsg, offer_sz, 997 + false); 998 + print_hex_dump_debug("New vmbus offer: ", 999 + DUMP_PREFIX_OFFSET, 16, 4, 1000 + offer, offer_sz, false); 1001 + 1002 + /* Fix up the old channel. */ 1003 + vmbus_setup_channel_state(oldchannel, offer); 940 1004 } 941 1005 942 - /* 943 - * This is not an error, since the host can also change the 944 - * other field(s) of the offer, e.g. on WS RS5 (Build 17763), 945 - * the offer->connection_id of the Mellanox VF vmbus device 946 - * can change when the host reoffers the device upon resume. 
947 - */ 948 - pr_debug("vmbus offer changed: relid=%d\n", 949 - offer->child_relid); 950 - 951 - print_hex_dump_debug("Old vmbus offer: ", DUMP_PREFIX_OFFSET, 952 - 16, 4, &oldchannel->offermsg, offer_sz, 953 - false); 954 - print_hex_dump_debug("New vmbus offer: ", DUMP_PREFIX_OFFSET, 955 - 16, 4, offer, offer_sz, false); 956 - 957 - /* Fix up the old channel. */ 958 - vmbus_setup_channel_state(oldchannel, offer); 959 - 1006 + /* Add the channel back to the array of channels. */ 1007 + vmbus_channel_map_relid(oldchannel); 960 1008 check_ready_for_resume_event(); 961 1009 1010 + mutex_unlock(&vmbus_connection.channel_mutex); 962 1011 return; 963 1012 } 964 1013 ··· 1037 1028 * offer comes in first and then the rescind. 1038 1029 * Since we process these events in work elements, 1039 1030 * and with preemption, we may end up processing 1040 - * the events out of order. Given that we handle these 1041 - * work elements on the same CPU, this is possible only 1042 - * in the case of preemption. In any case wait here 1043 - * until the offer processing has moved beyond the 1044 - * point where the channel is discoverable. 1031 + * the events out of order. 
We rely on the synchronization 1032 + * provided by offer_in_progress and by channel_mutex for 1033 + * ordering these events: 1034 + * 1035 + * { Initially: offer_in_progress = 1 } 1036 + * 1037 + * CPU1 CPU2 1038 + * 1039 + * [vmbus_onoffer()] [vmbus_onoffer_rescind()] 1040 + * 1041 + * LOCK channel_mutex WAIT_ON offer_in_progress == 0 1042 + * DECREMENT offer_in_progress LOCK channel_mutex 1043 + * STORE channels[] LOAD channels[] 1044 + * UNLOCK channel_mutex UNLOCK channel_mutex 1045 + * 1046 + * Forbids: CPU2's LOAD from *not* seeing CPU1's STORE 1045 1047 */ 1046 1048 1047 1049 while (atomic_read(&vmbus_connection.offer_in_progress) != 0) { ··· 1352 1332 /* Channel message dispatch table */ 1353 1333 const struct vmbus_channel_message_table_entry 1354 1334 channel_message_table[CHANNELMSG_COUNT] = { 1355 - { CHANNELMSG_INVALID, 0, NULL }, 1356 - { CHANNELMSG_OFFERCHANNEL, 0, vmbus_onoffer }, 1357 - { CHANNELMSG_RESCIND_CHANNELOFFER, 0, vmbus_onoffer_rescind }, 1358 - { CHANNELMSG_REQUESTOFFERS, 0, NULL }, 1359 - { CHANNELMSG_ALLOFFERS_DELIVERED, 1, vmbus_onoffers_delivered }, 1360 - { CHANNELMSG_OPENCHANNEL, 0, NULL }, 1361 - { CHANNELMSG_OPENCHANNEL_RESULT, 1, vmbus_onopen_result }, 1362 - { CHANNELMSG_CLOSECHANNEL, 0, NULL }, 1363 - { CHANNELMSG_GPADL_HEADER, 0, NULL }, 1364 - { CHANNELMSG_GPADL_BODY, 0, NULL }, 1365 - { CHANNELMSG_GPADL_CREATED, 1, vmbus_ongpadl_created }, 1366 - { CHANNELMSG_GPADL_TEARDOWN, 0, NULL }, 1367 - { CHANNELMSG_GPADL_TORNDOWN, 1, vmbus_ongpadl_torndown }, 1368 - { CHANNELMSG_RELID_RELEASED, 0, NULL }, 1369 - { CHANNELMSG_INITIATE_CONTACT, 0, NULL }, 1370 - { CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response }, 1371 - { CHANNELMSG_UNLOAD, 0, NULL }, 1372 - { CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response }, 1373 - { CHANNELMSG_18, 0, NULL }, 1374 - { CHANNELMSG_19, 0, NULL }, 1375 - { CHANNELMSG_20, 0, NULL }, 1376 - { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL }, 1377 - { CHANNELMSG_22, 0, NULL }, 1378 - { 
CHANNELMSG_TL_CONNECT_RESULT, 0, NULL }, 1335 + { CHANNELMSG_INVALID, 0, NULL, 0}, 1336 + { CHANNELMSG_OFFERCHANNEL, 0, vmbus_onoffer, 1337 + sizeof(struct vmbus_channel_offer_channel)}, 1338 + { CHANNELMSG_RESCIND_CHANNELOFFER, 0, vmbus_onoffer_rescind, 1339 + sizeof(struct vmbus_channel_rescind_offer) }, 1340 + { CHANNELMSG_REQUESTOFFERS, 0, NULL, 0}, 1341 + { CHANNELMSG_ALLOFFERS_DELIVERED, 1, vmbus_onoffers_delivered, 0}, 1342 + { CHANNELMSG_OPENCHANNEL, 0, NULL, 0}, 1343 + { CHANNELMSG_OPENCHANNEL_RESULT, 1, vmbus_onopen_result, 1344 + sizeof(struct vmbus_channel_open_result)}, 1345 + { CHANNELMSG_CLOSECHANNEL, 0, NULL, 0}, 1346 + { CHANNELMSG_GPADL_HEADER, 0, NULL, 0}, 1347 + { CHANNELMSG_GPADL_BODY, 0, NULL, 0}, 1348 + { CHANNELMSG_GPADL_CREATED, 1, vmbus_ongpadl_created, 1349 + sizeof(struct vmbus_channel_gpadl_created)}, 1350 + { CHANNELMSG_GPADL_TEARDOWN, 0, NULL, 0}, 1351 + { CHANNELMSG_GPADL_TORNDOWN, 1, vmbus_ongpadl_torndown, 1352 + sizeof(struct vmbus_channel_gpadl_torndown) }, 1353 + { CHANNELMSG_RELID_RELEASED, 0, NULL, 0}, 1354 + { CHANNELMSG_INITIATE_CONTACT, 0, NULL, 0}, 1355 + { CHANNELMSG_VERSION_RESPONSE, 1, vmbus_onversion_response, 1356 + sizeof(struct vmbus_channel_version_response)}, 1357 + { CHANNELMSG_UNLOAD, 0, NULL, 0}, 1358 + { CHANNELMSG_UNLOAD_RESPONSE, 1, vmbus_unload_response, 0}, 1359 + { CHANNELMSG_18, 0, NULL, 0}, 1360 + { CHANNELMSG_19, 0, NULL, 0}, 1361 + { CHANNELMSG_20, 0, NULL, 0}, 1362 + { CHANNELMSG_TL_CONNECT_REQUEST, 0, NULL, 0}, 1363 + { CHANNELMSG_MODIFYCHANNEL, 0, NULL, 0}, 1364 + { CHANNELMSG_TL_CONNECT_RESULT, 0, NULL, 0}, 1379 1365 }; 1380 1366 1381 1367 /* ··· 1389 1363 * 1390 1364 * This is invoked in the vmbus worker thread context. 
1391 1365 */ 1392 - void vmbus_onmessage(void *context) 1366 + void vmbus_onmessage(struct vmbus_channel_message_header *hdr) 1393 1367 { 1394 - struct hv_message *msg = context; 1395 - struct vmbus_channel_message_header *hdr; 1396 - 1397 - hdr = (struct vmbus_channel_message_header *)msg->u.payload; 1398 - 1399 1368 trace_vmbus_on_message(hdr); 1400 1369 1401 1370 /*

+12 -46

drivers/hv/connection.c

··· 69 69 int vmbus_negotiate_version(struct vmbus_channel_msginfo *msginfo, u32 version) 70 70 { 71 71 int ret = 0; 72 - unsigned int cur_cpu; 73 72 struct vmbus_channel_initiate_contact *msg; 74 73 unsigned long flags; 75 74 ··· 101 102 102 103 msg->monitor_page1 = virt_to_phys(vmbus_connection.monitor_pages[0]); 103 104 msg->monitor_page2 = virt_to_phys(vmbus_connection.monitor_pages[1]); 104 - /* 105 - * We want all channel messages to be delivered on CPU 0. 106 - * This has been the behavior pre-win8. This is not 107 - * perf issue and having all channel messages delivered on CPU 0 108 - * would be ok. 109 - * For post win8 hosts, we support receiving channel messagges on 110 - * all the CPUs. This is needed for kexec to work correctly where 111 - * the CPU attempting to connect may not be CPU 0. 112 - */ 113 - if (version >= VERSION_WIN8_1) { 114 - cur_cpu = get_cpu(); 115 - msg->target_vcpu = hv_cpu_number_to_vp_number(cur_cpu); 116 - vmbus_connection.connect_cpu = cur_cpu; 117 - put_cpu(); 118 - } else { 119 - msg->target_vcpu = 0; 120 - vmbus_connection.connect_cpu = 0; 121 - } 105 + msg->target_vcpu = hv_cpu_number_to_vp_number(VMBUS_CONNECT_CPU); 122 106 123 107 /* 124 108 * Add to list before we send the request since we may ··· 248 266 pr_info("Vmbus version:%d.%d\n", 249 267 version >> 16, version & 0xFFFF); 250 268 269 + vmbus_connection.channels = kcalloc(MAX_CHANNEL_RELIDS, 270 + sizeof(struct vmbus_channel *), 271 + GFP_KERNEL); 272 + if (vmbus_connection.channels == NULL) { 273 + ret = -ENOMEM; 274 + goto cleanup; 275 + } 276 + 251 277 kfree(msginfo); 252 278 return 0; 253 279 ··· 303 313 */ 304 314 struct vmbus_channel *relid2channel(u32 relid) 305 315 { 306 - struct vmbus_channel *channel; 307 - struct vmbus_channel *found_channel = NULL; 308 - struct list_head *cur, *tmp; 309 - struct vmbus_channel *cur_sc; 310 - 311 - BUG_ON(!mutex_is_locked(&vmbus_connection.channel_mutex)); 312 - 313 - list_for_each_entry(channel, 
&vmbus_connection.chn_list, listentry) { 314 - if (channel->offermsg.child_relid == relid) { 315 - found_channel = channel; 316 - break; 317 - } else if (!list_empty(&channel->sc_list)) { 318 - /* 319 - * Deal with sub-channels. 320 - */ 321 - list_for_each_safe(cur, tmp, &channel->sc_list) { 322 - cur_sc = list_entry(cur, struct vmbus_channel, 323 - sc_list); 324 - if (cur_sc->offermsg.child_relid == relid) { 325 - found_channel = cur_sc; 326 - break; 327 - } 328 - } 329 - } 330 - } 331 - 332 - return found_channel; 316 + if (WARN_ON(relid >= MAX_CHANNEL_RELIDS)) 317 + return NULL; 318 + return READ_ONCE(vmbus_connection.channels[relid]); 333 319 } 334 320 335 321 /*

+11 -5

drivers/hv/hv.c

··· 117 117 pr_err("Unable to allocate post msg page\n"); 118 118 goto err; 119 119 } 120 - 121 - INIT_LIST_HEAD(&hv_cpu->chan_list); 122 120 } 123 121 124 122 return 0; ··· 244 246 unsigned long flags; 245 247 246 248 /* 249 + * Hyper-V does not provide a way to change the connect CPU once 250 + * it is set; we must prevent the connect CPU from going offline. 251 + */ 252 + if (cpu == VMBUS_CONNECT_CPU) 253 + return -EBUSY; 254 + 255 + /* 247 256 * Search for channels which are bound to the CPU we're about to 248 - * cleanup. In case we find one and vmbus is still connected we need to 249 - * fail, this will effectively prevent CPU offlining. There is no way 250 - * we can re-bind channels to different CPUs for now. 257 + * cleanup. In case we find one and vmbus is still connected, we 258 + * fail; this will effectively prevent CPU offlining. 259 + * 260 + * TODO: Re-bind the channels to different CPUs. 251 261 */ 252 262 mutex_lock(&vmbus_connection.channel_mutex); 253 263 list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {

+1 -1

drivers/hv/hv_fcopy.c

··· 71 71 { 72 72 /* Transaction is finished, reset the state here to avoid races. */ 73 73 fcopy_transaction.state = HVUTIL_READY; 74 - hv_fcopy_onchannelcallback(channel); 74 + tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event); 75 75 } 76 76 77 77 static void fcopy_timeout_func(struct work_struct *dummy)

+1 -1

drivers/hv/hv_snapshot.c

··· 80 80 { 81 81 /* Transaction is finished, reset the state here to avoid races. */ 82 82 vss_transaction.state = HVUTIL_READY; 83 - hv_vss_onchannelcallback(channel); 83 + tasklet_schedule(&((struct vmbus_channel *)channel)->callback_event); 84 84 } 85 85 86 86 /*

+21 -4

drivers/hv/hv_trace.h

··· 44 44 __entry->monitorid = offer->monitorid; 45 45 __entry->is_ddc_int = offer->is_dedicated_interrupt; 46 46 __entry->connection_id = offer->connection_id; 47 - memcpy(__entry->if_type, 48 - &offer->offer.if_type.b, 16); 49 - memcpy(__entry->if_instance, 50 - &offer->offer.if_instance.b, 16); 47 + export_guid(__entry->if_type, &offer->offer.if_type); 48 + export_guid(__entry->if_instance, &offer->offer.if_instance); 51 49 __entry->chn_flags = offer->offer.chn_flags; 52 50 __entry->mmio_mb = offer->offer.mmio_megabytes; 53 51 __entry->sub_idx = offer->offer.sub_channel_index; ··· 291 293 TP_printk("sending guest_endpoint_id %pUl, host_service_id %pUl, " 292 294 "ret %d", 293 295 __entry->guest_id, __entry->host_id, __entry->ret 296 + ) 297 + ); 298 + 299 + TRACE_EVENT(vmbus_send_modifychannel, 300 + TP_PROTO(const struct vmbus_channel_modifychannel *msg, 301 + int ret), 302 + TP_ARGS(msg, ret), 303 + TP_STRUCT__entry( 304 + __field(u32, child_relid) 305 + __field(u32, target_vp) 306 + __field(int, ret) 307 + ), 308 + TP_fast_assign( 309 + __entry->child_relid = msg->child_relid; 310 + __entry->target_vp = msg->target_vp; 311 + __entry->ret = ret; 312 + ), 313 + TP_printk("binding child_relid 0x%x to target_vp 0x%x, ret %d", 314 + __entry->child_relid, __entry->target_vp, __entry->ret 294 315 ) 295 316 ); 296 317

+64 -17

drivers/hv/hyperv_vmbus.h

··· 132 132 * basis. 133 133 */ 134 134 struct tasklet_struct msg_dpc; 135 - 136 - /* 137 - * To optimize the mapping of relid to channel, maintain 138 - * per-cpu list of the channels based on their CPU affinity. 139 - */ 140 - struct list_head chan_list; 141 135 }; 142 136 143 137 struct hv_context { ··· 196 202 /* TODO: Need to make this configurable */ 197 203 #define MAX_NUM_CHANNELS_SUPPORTED 256 198 204 205 + #define MAX_CHANNEL_RELIDS \ 206 + max(MAX_NUM_CHANNELS_SUPPORTED, HV_EVENT_FLAGS_COUNT) 199 207 200 208 enum vmbus_connect_state { 201 209 DISCONNECTED, ··· 208 212 209 213 #define MAX_SIZE_CHANNEL_MESSAGE HV_MESSAGE_PAYLOAD_BYTE_COUNT 210 214 211 - struct vmbus_connection { 212 - /* 213 - * CPU on which the initial host contact was made. 214 - */ 215 - int connect_cpu; 215 + /* 216 + * The CPU that Hyper-V will interrupt for VMBUS messages, such as 217 + * CHANNELMSG_OFFERCHANNEL and CHANNELMSG_RESCIND_CHANNELOFFER. 218 + */ 219 + #define VMBUS_CONNECT_CPU 0 216 220 221 + struct vmbus_connection { 217 222 u32 msg_conn_id; 218 223 219 224 atomic_t offer_in_progress; ··· 246 249 /* List of channels */ 247 250 struct list_head chn_list; 248 251 struct mutex channel_mutex; 252 + 253 + /* Array of channels */ 254 + struct vmbus_channel **channels; 249 255 250 256 /* 251 257 * An offer message is handled first on the work_queue, and then ··· 317 317 enum vmbus_channel_message_type message_type; 318 318 enum vmbus_message_handler_type handler_type; 319 319 void (*message_handler)(struct vmbus_channel_message_header *msg); 320 + u32 min_payload_len; 320 321 }; 321 322 322 323 extern const struct vmbus_channel_message_table_entry ··· 336 335 struct vmbus_channel *channel); 337 336 338 337 void vmbus_remove_channel_attr_group(struct vmbus_channel *channel); 338 + 339 + void vmbus_channel_map_relid(struct vmbus_channel *channel); 340 + void vmbus_channel_unmap_relid(struct vmbus_channel *channel); 339 341 340 342 struct vmbus_channel *relid2channel(u32 relid); 
341 343 ··· 378 374 { 379 375 if (!channel) 380 376 return; 381 - 382 - if (in_interrupt() && (channel->target_cpu == smp_processor_id())) { 383 - cb(channel); 384 - return; 385 - } 386 - smp_call_function_single(channel->target_cpu, cb, channel, true); 377 + cb(channel); 387 378 } 388 379 389 380 enum hvutil_device_state { ··· 394 395 INTERRUPT_DELAY = 0, 395 396 MESSAGE_DELAY = 1, 396 397 }; 398 + 399 + extern const struct vmbus_device vmbus_devs[]; 400 + 401 + static inline bool hv_is_perf_channel(struct vmbus_channel *channel) 402 + { 403 + return vmbus_devs[channel->device_id].perf_device; 404 + } 405 + 406 + static inline bool hv_is_alloced_cpu(unsigned int cpu) 407 + { 408 + struct vmbus_channel *channel, *sc; 409 + 410 + lockdep_assert_held(&vmbus_connection.channel_mutex); 411 + /* 412 + * List additions/deletions as well as updates of the target CPUs are 413 + * protected by channel_mutex. 414 + */ 415 + list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { 416 + if (!hv_is_perf_channel(channel)) 417 + continue; 418 + if (channel->target_cpu == cpu) 419 + return true; 420 + list_for_each_entry(sc, &channel->sc_list, sc_list) { 421 + if (sc->target_cpu == cpu) 422 + return true; 423 + } 424 + } 425 + return false; 426 + } 427 + 428 + static inline void hv_set_alloced_cpu(unsigned int cpu) 429 + { 430 + cpumask_set_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]); 431 + } 432 + 433 + static inline void hv_clear_alloced_cpu(unsigned int cpu) 434 + { 435 + if (hv_is_alloced_cpu(cpu)) 436 + return; 437 + cpumask_clear_cpu(cpu, &hv_context.hv_numa_map[cpu_to_node(cpu)]); 438 + } 439 + 440 + static inline void hv_update_alloced_cpus(unsigned int old_cpu, 441 + unsigned int new_cpu) 442 + { 443 + hv_set_alloced_cpu(new_cpu); 444 + hv_clear_alloced_cpu(old_cpu); 445 + } 397 446 398 447 #ifdef CONFIG_HYPERV_TESTING 399 448

+243 -71

drivers/hv/vmbus_drv.c

··· 117 117 return 0; 118 118 } 119 119 120 - #define VMBUS_ALIAS_LEN ((sizeof((struct hv_vmbus_device_id *)0)->guid) * 2) 121 - static void print_alias_name(struct hv_device *hv_dev, char *alias_name) 122 - { 123 - int i; 124 - for (i = 0; i < VMBUS_ALIAS_LEN; i += 2) 125 - sprintf(&alias_name[i], "%02x", hv_dev->dev_type.b[i/2]); 126 - } 127 - 128 120 static u8 channel_monitor_group(const struct vmbus_channel *channel) 129 121 { 130 122 return (u8)channel->offermsg.monitorid / 32; ··· 193 201 if (!hv_dev->channel) 194 202 return -ENODEV; 195 203 return sprintf(buf, "{%pUl}\n", 196 - hv_dev->channel->offermsg.offer.if_type.b); 204 + &hv_dev->channel->offermsg.offer.if_type); 197 205 } 198 206 static DEVICE_ATTR_RO(class_id); 199 207 ··· 205 213 if (!hv_dev->channel) 206 214 return -ENODEV; 207 215 return sprintf(buf, "{%pUl}\n", 208 - hv_dev->channel->offermsg.offer.if_instance.b); 216 + &hv_dev->channel->offermsg.offer.if_instance); 209 217 } 210 218 static DEVICE_ATTR_RO(device_id); 211 219 ··· 213 221 struct device_attribute *dev_attr, char *buf) 214 222 { 215 223 struct hv_device *hv_dev = device_to_hv_device(dev); 216 - char alias_name[VMBUS_ALIAS_LEN + 1]; 217 224 218 - print_alias_name(hv_dev, alias_name); 219 - return sprintf(buf, "vmbus:%s\n", alias_name); 225 + return sprintf(buf, "vmbus:%*phN\n", UUID_SIZE, &hv_dev->dev_type); 220 226 } 221 227 static DEVICE_ATTR_RO(modalias); 222 228 ··· 683 693 static int vmbus_uevent(struct device *device, struct kobj_uevent_env *env) 684 694 { 685 695 struct hv_device *dev = device_to_hv_device(device); 686 - int ret; 687 - char alias_name[VMBUS_ALIAS_LEN + 1]; 696 + const char *format = "MODALIAS=vmbus:%*phN"; 688 697 689 - print_alias_name(dev, alias_name); 690 - ret = add_uevent_var(env, "MODALIAS=vmbus:%s", alias_name); 691 - return ret; 698 + return add_uevent_var(env, format, UUID_SIZE, &dev->dev_type); 692 699 } 693 700 694 701 static const struct hv_vmbus_device_id * ··· 1020 1033 1021 1034 struct 
onmessage_work_context { 1022 1035 struct work_struct work; 1023 - struct hv_message msg; 1036 + struct { 1037 + struct hv_message_header header; 1038 + u8 payload[]; 1039 + } msg; 1024 1040 }; 1025 1041 1026 1042 static void vmbus_onmessage_work(struct work_struct *work) ··· 1036 1046 1037 1047 ctx = container_of(work, struct onmessage_work_context, 1038 1048 work); 1039 - vmbus_onmessage(&ctx->msg); 1049 + vmbus_onmessage((struct vmbus_channel_message_header *) 1050 + &ctx->msg.payload); 1040 1051 kfree(ctx); 1041 1052 } 1042 1053 ··· 1052 1061 struct onmessage_work_context *ctx; 1053 1062 u32 message_type = msg->header.message_type; 1054 1063 1064 + /* 1065 + * 'enum vmbus_channel_message_type' is supposed to always be 'u32' as 1066 + * it is being used in 'struct vmbus_channel_message_header' definition 1067 + * which is supposed to match hypervisor ABI. 1068 + */ 1069 + BUILD_BUG_ON(sizeof(enum vmbus_channel_message_type) != sizeof(u32)); 1070 + 1055 1071 if (message_type == HVMSG_NONE) 1056 1072 /* no msg */ 1057 1073 return; ··· 1072 1074 goto msg_handled; 1073 1075 } 1074 1076 1077 + if (msg->header.payload_size > HV_MESSAGE_PAYLOAD_BYTE_COUNT) { 1078 + WARN_ONCE(1, "payload size is too large (%d)\n", 1079 + msg->header.payload_size); 1080 + goto msg_handled; 1081 + } 1082 + 1075 1083 entry = &channel_message_table[hdr->msgtype]; 1076 1084 1077 1085 if (!entry->message_handler) 1078 1086 goto msg_handled; 1079 1087 1088 + if (msg->header.payload_size < entry->min_payload_len) { 1089 + WARN_ONCE(1, "message too short: msgtype=%d len=%d\n", 1090 + hdr->msgtype, msg->header.payload_size); 1091 + goto msg_handled; 1092 + } 1093 + 1080 1094 if (entry->handler_type == VMHT_BLOCKING) { 1081 - ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC); 1095 + ctx = kmalloc(sizeof(*ctx) + msg->header.payload_size, 1096 + GFP_ATOMIC); 1082 1097 if (ctx == NULL) 1083 1098 return; 1084 1099 1085 1100 INIT_WORK(&ctx->work, vmbus_onmessage_work); 1086 - memcpy(&ctx->msg, msg, 
sizeof(*msg)); 1101 + memcpy(&ctx->msg, msg, sizeof(msg->header) + 1102 + msg->header.payload_size); 1087 1103 1088 1104 /* 1089 1105 * The host can generate a rescind message while we 1090 1106 * may still be handling the original offer. We deal with 1091 - * this condition by ensuring the processing is done on the 1092 - * same CPU. 1107 + * this condition by relying on the synchronization provided 1108 + * by offer_in_progress and by channel_mutex. See also the 1109 + * inline comments in vmbus_onoffer_rescind(). 1093 1110 */ 1094 1111 switch (hdr->msgtype) { 1095 1112 case CHANNELMSG_RESCIND_CHANNELOFFER: 1096 1113 /* 1097 1114 * If we are handling the rescind message; 1098 1115 * schedule the work on the global work queue. 1116 + * 1117 + * The OFFER message and the RESCIND message should 1118 + * not be handled by the same serialized work queue, 1119 + * because the OFFER handler may call vmbus_open(), 1120 + * which tries to open the channel by sending an 1121 + * OPEN_CHANNEL message to the host and waits for 1122 + * the host's response; however, if the host has 1123 + * rescinded the channel before it receives the 1124 + * OPEN_CHANNEL message, the host just silently 1125 + * ignores the OPEN_CHANNEL message; as a result, 1126 + * the guest's OFFER handler hangs for ever, if we 1127 + * handle the RESCIND message in the same serialized 1128 + * work queue: the RESCIND handler can not start to 1129 + * run before the OFFER handler finishes. 1099 1130 */ 1100 - schedule_work_on(vmbus_connection.connect_cpu, 1101 - &ctx->work); 1131 + schedule_work(&ctx->work); 1102 1132 break; 1103 1133 1104 1134 case CHANNELMSG_OFFERCHANNEL: 1135 + /* 1136 + * The host sends the offer message of a given channel 1137 + * before sending the rescind message of the same 1138 + * channel. 
These messages are sent to the guest's 1139 + * connect CPU; the guest then starts processing them 1140 + * in the tasklet handler on this CPU: 1141 + * 1142 + * VMBUS_CONNECT_CPU 1143 + * 1144 + * [vmbus_on_msg_dpc()] 1145 + * atomic_inc() // CHANNELMSG_OFFERCHANNEL 1146 + * queue_work() 1147 + * ... 1148 + * [vmbus_on_msg_dpc()] 1149 + * schedule_work() // CHANNELMSG_RESCIND_CHANNELOFFER 1150 + * 1151 + * We rely on the memory-ordering properties of the 1152 + * queue_work() and schedule_work() primitives, which 1153 + * guarantee that the atomic increment will be visible 1154 + * to the CPUs which will execute the offer & rescind 1155 + * works by the time these works will start execution. 1156 + */ 1105 1157 atomic_inc(&vmbus_connection.offer_in_progress); 1106 - queue_work_on(vmbus_connection.connect_cpu, 1107 - vmbus_connection.work_queue, 1108 - &ctx->work); 1109 - break; 1158 + fallthrough; 1110 1159 1111 1160 default: 1112 1161 queue_work(vmbus_connection.work_queue, &ctx->work); ··· 1178 1133 WARN_ON(!is_hvsock_channel(channel)); 1179 1134 1180 1135 /* 1181 - * sizeof(*ctx) is small and the allocation should really not fail, 1136 + * Allocation size is small and the allocation should really not fail, 1182 1137 * otherwise the state of the hv_sock connections ends up in limbo. 1183 1138 */ 1184 - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); 1139 + ctx = kzalloc(sizeof(*ctx) + sizeof(*rescind), 1140 + GFP_KERNEL | __GFP_NOFAIL); 1185 1141 1186 1142 /* 1187 1143 * So far, these are not really used by Linux. Just set them to the ··· 1192 1146 ctx->msg.header.payload_size = sizeof(*rescind); 1193 1147 1194 1148 /* These values are actually used by Linux. 
*/ 1195 - rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.u.payload; 1149 + rescind = (struct vmbus_channel_rescind_offer *)ctx->msg.payload; 1196 1150 rescind->header.msgtype = CHANNELMSG_RESCIND_CHANNELOFFER; 1197 1151 rescind->child_relid = channel->offermsg.child_relid; 1198 1152 1199 1153 INIT_WORK(&ctx->work, vmbus_onmessage_work); 1200 1154 1201 - queue_work_on(vmbus_connection.connect_cpu, 1202 - vmbus_connection.work_queue, 1203 - &ctx->work); 1155 + queue_work(vmbus_connection.work_queue, &ctx->work); 1204 1156 } 1205 1157 #endif /* CONFIG_PM_SLEEP */ 1206 - 1207 - /* 1208 - * Direct callback for channels using other deferred processing 1209 - */ 1210 - static void vmbus_channel_isr(struct vmbus_channel *channel) 1211 - { 1212 - void (*callback_fn)(void *); 1213 - 1214 - callback_fn = READ_ONCE(channel->onchannel_callback); 1215 - if (likely(callback_fn != NULL)) 1216 - (*callback_fn)(channel->channel_callback_context); 1217 - } 1218 1158 1219 1159 /* 1220 1160 * Schedule all channels with events pending ··· 1232 1200 return; 1233 1201 1234 1202 for_each_set_bit(relid, recv_int_page, maxbits) { 1203 + void (*callback_fn)(void *context); 1235 1204 struct vmbus_channel *channel; 1236 1205 1237 1206 if (!sync_test_and_clear_bit(relid, recv_int_page)) ··· 1242 1209 if (relid == 0) 1243 1210 continue; 1244 1211 1212 + /* 1213 + * Pairs with the kfree_rcu() in vmbus_chan_release(). 1214 + * Guarantees that the channel data structure doesn't 1215 + * get freed while the channel pointer below is being 1216 + * dereferenced. 
1217 + */ 1245 1218 rcu_read_lock(); 1246 1219 1247 1220 /* Find channel based on relid */ 1248 - list_for_each_entry_rcu(channel, &hv_cpu->chan_list, percpu_list) { 1249 - if (channel->offermsg.child_relid != relid) 1250 - continue; 1221 + channel = relid2channel(relid); 1222 + if (channel == NULL) 1223 + goto sched_unlock_rcu; 1251 1224 1252 - if (channel->rescind) 1253 - continue; 1225 + if (channel->rescind) 1226 + goto sched_unlock_rcu; 1254 1227 1255 - trace_vmbus_chan_sched(channel); 1228 + /* 1229 + * Make sure that the ring buffer data structure doesn't get 1230 + * freed while we dereference the ring buffer pointer. Test 1231 + * for the channel's onchannel_callback being NULL within a 1232 + * sched_lock critical section. See also the inline comments 1233 + * in vmbus_reset_channel_cb(). 1234 + */ 1235 + spin_lock(&channel->sched_lock); 1256 1236 1257 - ++channel->interrupts; 1237 + callback_fn = channel->onchannel_callback; 1238 + if (unlikely(callback_fn == NULL)) 1239 + goto sched_unlock; 1258 1240 1259 - switch (channel->callback_mode) { 1260 - case HV_CALL_ISR: 1261 - vmbus_channel_isr(channel); 1262 - break; 1241 + trace_vmbus_chan_sched(channel); 1263 1242 1264 - case HV_CALL_BATCHED: 1265 - hv_begin_read(&channel->inbound); 1266 - /* fallthrough */ 1267 - case HV_CALL_DIRECT: 1268 - tasklet_schedule(&channel->callback_event); 1269 - } 1243 + ++channel->interrupts; 1244 + 1245 + switch (channel->callback_mode) { 1246 + case HV_CALL_ISR: 1247 + (*callback_fn)(channel->channel_callback_context); 1248 + break; 1249 + 1250 + case HV_CALL_BATCHED: 1251 + hv_begin_read(&channel->inbound); 1252 + fallthrough; 1253 + case HV_CALL_DIRECT: 1254 + tasklet_schedule(&channel->callback_event); 1270 1255 } 1271 1256 1257 + sched_unlock: 1258 + spin_unlock(&channel->sched_lock); 1259 + sched_unlock_rcu: 1272 1260 rcu_read_unlock(); 1273 1261 } 1274 1262 } ··· 1418 1364 { 1419 1365 int ret; 1420 1366 1421 - /* Hypervisor initialization...setup hypercall page..etc 
*/ 1422 1367 ret = hv_init(); 1423 1368 if (ret != 0) { 1424 1369 pr_err("Unable to initialize the hypervisor - 0x%x\n", ret); ··· 1606 1553 return attribute->show(chan, buf); 1607 1554 } 1608 1555 1556 + static ssize_t vmbus_chan_attr_store(struct kobject *kobj, 1557 + struct attribute *attr, const char *buf, 1558 + size_t count) 1559 + { 1560 + const struct vmbus_chan_attribute *attribute 1561 + = container_of(attr, struct vmbus_chan_attribute, attr); 1562 + struct vmbus_channel *chan 1563 + = container_of(kobj, struct vmbus_channel, kobj); 1564 + 1565 + if (!attribute->store) 1566 + return -EIO; 1567 + 1568 + return attribute->store(chan, buf, count); 1569 + } 1570 + 1609 1571 static const struct sysfs_ops vmbus_chan_sysfs_ops = { 1610 1572 .show = vmbus_chan_attr_show, 1573 + .store = vmbus_chan_attr_store, 1611 1574 }; 1612 1575 1613 1576 static ssize_t out_mask_show(struct vmbus_channel *channel, char *buf) ··· 1694 1625 } 1695 1626 static VMBUS_CHAN_ATTR_RO(write_avail); 1696 1627 1697 - static ssize_t show_target_cpu(struct vmbus_channel *channel, char *buf) 1628 + static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf) 1698 1629 { 1699 1630 return sprintf(buf, "%u\n", channel->target_cpu); 1700 1631 } 1701 - static VMBUS_CHAN_ATTR(cpu, S_IRUGO, show_target_cpu, NULL); 1632 + static ssize_t target_cpu_store(struct vmbus_channel *channel, 1633 + const char *buf, size_t count) 1634 + { 1635 + u32 target_cpu, origin_cpu; 1636 + ssize_t ret = count; 1637 + 1638 + if (vmbus_proto_version < VERSION_WIN10_V4_1) 1639 + return -EIO; 1640 + 1641 + if (sscanf(buf, "%uu", &target_cpu) != 1) 1642 + return -EIO; 1643 + 1644 + /* Validate target_cpu for the cpumask_test_cpu() operation below. */ 1645 + if (target_cpu >= nr_cpumask_bits) 1646 + return -EINVAL; 1647 + 1648 + /* No CPUs should come up or down during this. 
*/ 1649 + cpus_read_lock(); 1650 + 1651 + if (!cpumask_test_cpu(target_cpu, cpu_online_mask)) { 1652 + cpus_read_unlock(); 1653 + return -EINVAL; 1654 + } 1655 + 1656 + /* 1657 + * Synchronizes target_cpu_store() and channel closure: 1658 + * 1659 + * { Initially: state = CHANNEL_OPENED } 1660 + * 1661 + * CPU1 CPU2 1662 + * 1663 + * [target_cpu_store()] [vmbus_disconnect_ring()] 1664 + * 1665 + * LOCK channel_mutex LOCK channel_mutex 1666 + * LOAD r1 = state LOAD r2 = state 1667 + * IF (r1 == CHANNEL_OPENED) IF (r2 == CHANNEL_OPENED) 1668 + * SEND MODIFYCHANNEL STORE state = CHANNEL_OPEN 1669 + * [...] SEND CLOSECHANNEL 1670 + * UNLOCK channel_mutex UNLOCK channel_mutex 1671 + * 1672 + * Forbids: r1 == r2 == CHANNEL_OPENED (i.e., CPU1's LOCK precedes 1673 + * CPU2's LOCK) && CPU2's SEND precedes CPU1's SEND 1674 + * 1675 + * Note. The host processes the channel messages "sequentially", in 1676 + * the order in which they are received on a per-partition basis. 1677 + */ 1678 + mutex_lock(&vmbus_connection.channel_mutex); 1679 + 1680 + /* 1681 + * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels; 1682 + * avoid sending the message and fail here for such channels. 1683 + */ 1684 + if (channel->state != CHANNEL_OPENED_STATE) { 1685 + ret = -EIO; 1686 + goto cpu_store_unlock; 1687 + } 1688 + 1689 + origin_cpu = channel->target_cpu; 1690 + if (target_cpu == origin_cpu) 1691 + goto cpu_store_unlock; 1692 + 1693 + if (vmbus_send_modifychannel(channel->offermsg.child_relid, 1694 + hv_cpu_number_to_vp_number(target_cpu))) { 1695 + ret = -EIO; 1696 + goto cpu_store_unlock; 1697 + } 1698 + 1699 + /* 1700 + * Warning. At this point, there is *no* guarantee that the host will 1701 + * have successfully processed the vmbus_send_modifychannel() request. 1702 + * See the header comment of vmbus_send_modifychannel() for more info. 
1703 + * 1704 + * Lags in the processing of the above vmbus_send_modifychannel() can 1705 + * result in missed interrupts if the "old" target CPU is taken offline 1706 + * before Hyper-V starts sending interrupts to the "new" target CPU. 1707 + * But apart from this offlining scenario, the code tolerates such 1708 + * lags. It will function correctly even if a channel interrupt comes 1709 + * in on a CPU that is different from the channel target_cpu value. 1710 + */ 1711 + 1712 + channel->target_cpu = target_cpu; 1713 + channel->target_vp = hv_cpu_number_to_vp_number(target_cpu); 1714 + channel->numa_node = cpu_to_node(target_cpu); 1715 + 1716 + /* See init_vp_index(). */ 1717 + if (hv_is_perf_channel(channel)) 1718 + hv_update_alloced_cpus(origin_cpu, target_cpu); 1719 + 1720 + /* Currently set only for storvsc channels. */ 1721 + if (channel->change_target_cpu_callback) { 1722 + (*channel->change_target_cpu_callback)(channel, 1723 + origin_cpu, target_cpu); 1724 + } 1725 + 1726 + cpu_store_unlock: 1727 + mutex_unlock(&vmbus_connection.channel_mutex); 1728 + cpus_read_unlock(); 1729 + return ret; 1730 + } 1731 + static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store); 1702 1732 1703 1733 static ssize_t channel_pending_show(struct vmbus_channel *channel, 1704 1734 char *buf) ··· 1998 1830 int ret; 1999 1831 2000 1832 dev_set_name(&child_device_obj->device, "%pUl", 2001 - child_device_obj->channel->offermsg.offer.if_instance.b); 1833 + &child_device_obj->channel->offermsg.offer.if_instance); 2002 1834 2003 1835 child_device_obj->device.bus = &hv_bus; 2004 1836 child_device_obj->device.parent = &hv_acpi_dev->dev; ··· 2389 2221 2390 2222 list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) { 2391 2223 /* 2392 - * Invalidate the field. Upon resume, vmbus_onoffer() will fix 2393 - * up the field, and the other fields (if necessary). 2224 + * Remove the channel from the array of channels and invalidate 2225 + * the channel's relid. 
Upon resume, vmbus_onoffer() will fix 2226 + * up the relid (and other fields, if necessary) and add the 2227 + * channel back to the array. 2394 2228 */ 2229 + vmbus_channel_unmap_relid(channel); 2395 2230 channel->offermsg.child_relid = INVALID_RELID; 2396 2231 2397 2232 if (is_hvsock_channel(channel)) { ··· 2641 2470 hv_debug_rm_all_dir(); 2642 2471 2643 2472 vmbus_free_channels(); 2473 + kfree(vmbus_connection.channels); 2644 2474 2645 2475 if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE) { 2646 2476 kmsg_dump_unregister(&hv_kmsg_dumper);

+5 -2

drivers/net/hyperv/netvsc.c

··· 636 636 637 637 RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); 638 638 639 - /* And disassociate NAPI context from device */ 640 - for (i = 0; i < net_device->num_chn; i++) 639 + /* Disable NAPI and disassociate its context from the device. */ 640 + for (i = 0; i < net_device->num_chn; i++) { 641 + /* See also vmbus_reset_channel_cb(). */ 642 + napi_disable(&net_device->chan_table[i].napi); 641 643 netif_napi_del(&net_device->chan_table[i].napi); 644 + } 642 645 643 646 /* 644 647 * At this point, no one should be accessing net_device

+28 -16

drivers/pci/controller/pci-hyperv.c

··· 1356 1356 { 1357 1357 struct irq_cfg *cfg = irqd_cfg(data); 1358 1358 struct hv_pcibus_device *hbus; 1359 + struct vmbus_channel *channel; 1359 1360 struct hv_pci_dev *hpdev; 1360 1361 struct pci_bus *pbus; 1361 1362 struct pci_dev *pdev; 1362 1363 struct cpumask *dest; 1363 - unsigned long flags; 1364 1364 struct compose_comp_ctxt comp; 1365 1365 struct tran_int_desc *int_desc; 1366 1366 struct { ··· 1378 1378 dest = irq_data_get_effective_affinity_mask(data); 1379 1379 pbus = pdev->bus; 1380 1380 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); 1381 + channel = hbus->hdev->channel; 1381 1382 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn)); 1382 1383 if (!hpdev) 1383 1384 goto return_null_message; ··· 1437 1436 } 1438 1437 1439 1438 /* 1439 + * Prevents hv_pci_onchannelcallback() from running concurrently 1440 + * in the tasklet. 1441 + */ 1442 + tasklet_disable(&channel->callback_event); 1443 + 1444 + /* 1440 1445 * Since this function is called with IRQ locks held, can't 1441 1446 * do normal wait for completion; instead poll. 1442 1447 */ 1443 1448 while (!try_wait_for_completion(&comp.comp_pkt.host_event)) { 1449 + unsigned long flags; 1450 + 1444 1451 /* 0xFFFF means an invalid PCI VENDOR ID. */ 1445 1452 if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) { 1446 1453 dev_err_once(&hbus->hdev->device, 1447 1454 "the device has gone\n"); 1448 - goto free_int_desc; 1455 + goto enable_tasklet; 1449 1456 } 1450 1457 1451 1458 /* 1452 - * When the higher level interrupt code calls us with 1453 - * interrupt disabled, we must poll the channel by calling 1454 - * the channel callback directly when channel->target_cpu is 1455 - * the current CPU. When the higher level interrupt code 1456 - * calls us with interrupt enabled, let's add the 1457 - * local_irq_save()/restore() to avoid race: 1458 - * hv_pci_onchannelcallback() can also run in tasklet. 
1459 + * Make sure that the ring buffer data structure doesn't get 1460 + * freed while we dereference the ring buffer pointer. Test 1461 + * for the channel's onchannel_callback being NULL within a 1462 + * sched_lock critical section. See also the inline comments 1463 + * in vmbus_reset_channel_cb(). 1459 1464 */ 1460 - local_irq_save(flags); 1461 - 1462 - if (hbus->hdev->channel->target_cpu == smp_processor_id()) 1463 - hv_pci_onchannelcallback(hbus); 1464 - 1465 - local_irq_restore(flags); 1465 + spin_lock_irqsave(&channel->sched_lock, flags); 1466 + if (unlikely(channel->onchannel_callback == NULL)) { 1467 + spin_unlock_irqrestore(&channel->sched_lock, flags); 1468 + goto enable_tasklet; 1469 + } 1470 + hv_pci_onchannelcallback(hbus); 1471 + spin_unlock_irqrestore(&channel->sched_lock, flags); 1466 1472 1467 1473 if (hpdev->state == hv_pcichild_ejecting) { 1468 1474 dev_err_once(&hbus->hdev->device, 1469 1475 "the device is being ejected\n"); 1470 - goto free_int_desc; 1476 + goto enable_tasklet; 1471 1477 } 1472 1478 1473 1479 udelay(100); 1474 1480 } 1481 + 1482 + tasklet_enable(&channel->callback_event); 1475 1483 1476 1484 if (comp.comp_pkt.completion_status < 0) { 1477 1485 dev_err(&hbus->hdev->device, ··· 1505 1495 put_pcichild(hpdev); 1506 1496 return; 1507 1497 1498 + enable_tasklet: 1499 + tasklet_enable(&channel->callback_event); 1508 1500 free_int_desc: 1509 1501 kfree(int_desc); 1510 1502 drop_reference:

+88 -8

drivers/scsi/storvsc_drv.c

··· 621 621 622 622 } 623 623 624 + static void storvsc_change_target_cpu(struct vmbus_channel *channel, u32 old, 625 + u32 new) 626 + { 627 + struct storvsc_device *stor_device; 628 + struct vmbus_channel *cur_chn; 629 + bool old_is_alloced = false; 630 + struct hv_device *device; 631 + unsigned long flags; 632 + int cpu; 633 + 634 + device = channel->primary_channel ? 635 + channel->primary_channel->device_obj 636 + : channel->device_obj; 637 + stor_device = get_out_stor_device(device); 638 + if (!stor_device) 639 + return; 640 + 641 + /* See storvsc_do_io() -> get_og_chn(). */ 642 + spin_lock_irqsave(&device->channel->lock, flags); 643 + 644 + /* 645 + * Determines if the storvsc device has other channels assigned to 646 + * the "old" CPU to update the alloced_cpus mask and the stor_chns 647 + * array. 648 + */ 649 + if (device->channel != channel && device->channel->target_cpu == old) { 650 + cur_chn = device->channel; 651 + old_is_alloced = true; 652 + goto old_is_alloced; 653 + } 654 + list_for_each_entry(cur_chn, &device->channel->sc_list, sc_list) { 655 + if (cur_chn == channel) 656 + continue; 657 + if (cur_chn->target_cpu == old) { 658 + old_is_alloced = true; 659 + goto old_is_alloced; 660 + } 661 + } 662 + 663 + old_is_alloced: 664 + if (old_is_alloced) 665 + WRITE_ONCE(stor_device->stor_chns[old], cur_chn); 666 + else 667 + cpumask_clear_cpu(old, &stor_device->alloced_cpus); 668 + 669 + /* "Flush" the stor_chns array. 
*/ 670 + for_each_possible_cpu(cpu) { 671 + if (stor_device->stor_chns[cpu] && !cpumask_test_cpu( 672 + cpu, &stor_device->alloced_cpus)) 673 + WRITE_ONCE(stor_device->stor_chns[cpu], NULL); 674 + } 675 + 676 + WRITE_ONCE(stor_device->stor_chns[new], channel); 677 + cpumask_set_cpu(new, &stor_device->alloced_cpus); 678 + 679 + spin_unlock_irqrestore(&device->channel->lock, flags); 680 + } 681 + 624 682 static void handle_sc_creation(struct vmbus_channel *new_sc) 625 683 { 626 684 struct hv_device *device = new_sc->primary_channel->device_obj; ··· 705 647 dev_err(dev, "Failed to open sub-channel: err=%d\n", ret); 706 648 return; 707 649 } 650 + 651 + new_sc->change_target_cpu_callback = storvsc_change_target_cpu; 708 652 709 653 /* Add the sub-channel to the array of available channels. */ 710 654 stor_device->stor_chns[new_sc->target_cpu] = new_sc; ··· 935 875 GFP_KERNEL); 936 876 if (stor_device->stor_chns == NULL) 937 877 return -ENOMEM; 878 + 879 + device->channel->change_target_cpu_callback = storvsc_change_target_cpu; 938 880 939 881 stor_device->stor_chns[device->channel->target_cpu] = device->channel; 940 882 cpumask_set_cpu(device->channel->target_cpu, ··· 1310 1248 const struct cpumask *node_mask; 1311 1249 int num_channels, tgt_cpu; 1312 1250 1313 - if (stor_device->num_sc == 0) 1251 + if (stor_device->num_sc == 0) { 1252 + stor_device->stor_chns[q_num] = stor_device->device->channel; 1314 1253 return stor_device->device->channel; 1254 + } 1315 1255 1316 1256 /* 1317 1257 * Our channel array is sparsely populated and we ··· 1322 1258 * The strategy is simple: 1323 1259 * I. Ensure NUMA locality 1324 1260 * II. Distribute evenly (best effort) 1325 - * III. Mapping is persistent.
1326 1261 */ 1327 1262 1328 1263 node_mask = cpumask_of_node(cpu_to_node(q_num)); ··· 1331 1268 if (cpumask_test_cpu(tgt_cpu, node_mask)) 1332 1269 num_channels++; 1333 1270 } 1334 - if (num_channels == 0) 1271 + if (num_channels == 0) { 1272 + stor_device->stor_chns[q_num] = stor_device->device->channel; 1335 1273 return stor_device->device->channel; 1274 + } 1336 1275 1337 1276 hash_qnum = q_num; 1338 1277 while (hash_qnum >= num_channels) ··· 1360 1295 struct storvsc_device *stor_device; 1361 1296 struct vstor_packet *vstor_packet; 1362 1297 struct vmbus_channel *outgoing_channel, *channel; 1298 + unsigned long flags; 1363 1299 int ret = 0; 1364 1300 const struct cpumask *node_mask; 1365 1301 int tgt_cpu; ··· 1374 1308 1375 1309 request->device = device; 1376 1310 /* 1377 - * Select an an appropriate channel to send the request out. 1311 + * Select an appropriate channel to send the request out. 1378 1312 */ 1379 - if (stor_device->stor_chns[q_num] != NULL) { 1380 - outgoing_channel = stor_device->stor_chns[q_num]; 1313 + /* See storvsc_change_target_cpu(). 
*/ 1314 + outgoing_channel = READ_ONCE(stor_device->stor_chns[q_num]); 1315 + if (outgoing_channel != NULL) { 1381 1316 if (outgoing_channel->target_cpu == q_num) { 1382 1317 /* 1383 1318 * Ideally, we want to pick a different channel if ··· 1391 1324 continue; 1392 1325 if (tgt_cpu == q_num) 1393 1326 continue; 1394 - channel = stor_device->stor_chns[tgt_cpu]; 1327 + channel = READ_ONCE( 1328 + stor_device->stor_chns[tgt_cpu]); 1329 + if (channel == NULL) 1330 + continue; 1395 1331 if (hv_get_avail_to_write_percent( 1396 1332 &channel->outbound) 1397 1333 > ring_avail_percent_lowater) { ··· 1420 1350 for_each_cpu(tgt_cpu, &stor_device->alloced_cpus) { 1421 1351 if (cpumask_test_cpu(tgt_cpu, node_mask)) 1422 1352 continue; 1423 - channel = stor_device->stor_chns[tgt_cpu]; 1353 + channel = READ_ONCE( 1354 + stor_device->stor_chns[tgt_cpu]); 1355 + if (channel == NULL) 1356 + continue; 1424 1357 if (hv_get_avail_to_write_percent( 1425 1358 &channel->outbound) 1426 1359 > ring_avail_percent_lowater) { ··· 1433 1360 } 1434 1361 } 1435 1362 } else { 1363 + spin_lock_irqsave(&device->channel->lock, flags); 1364 + outgoing_channel = stor_device->stor_chns[q_num]; 1365 + if (outgoing_channel != NULL) { 1366 + spin_unlock_irqrestore(&device->channel->lock, flags); 1367 + goto found_channel; 1368 + } 1436 1369 outgoing_channel = get_og_chn(stor_device, q_num); 1370 + spin_unlock_irqrestore(&device->channel->lock, flags); 1437 1371 } 1438 1372 1439 1373 found_channel:

+493

include/asm-generic/hyperv-tlfs.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + 3 + /* 4 + * This file contains definitions from Hyper-V Hypervisor Top-Level Functional 5 + * Specification (TLFS): 6 + * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs 7 + */ 8 + 9 + #ifndef _ASM_GENERIC_HYPERV_TLFS_H 10 + #define _ASM_GENERIC_HYPERV_TLFS_H 11 + 12 + #include <linux/types.h> 13 + #include <linux/bits.h> 14 + #include <linux/time64.h> 15 + 16 + /* 17 + * While not explicitly listed in the TLFS, Hyper-V always runs with a page size 18 + * of 4096. These definitions are used when communicating with Hyper-V using 19 + * guest physical pages and guest physical page addresses, since the guest page 20 + * size may not be 4096 on all architectures. 21 + */ 22 + #define HV_HYP_PAGE_SHIFT 12 23 + #define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) 24 + #define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) 25 + 26 + /* 27 + * Hyper-V provides two categories of flags relevant to guest VMs. The 28 + * "Features" category indicates specific functionality that is available 29 + * to guests on this particular instance of Hyper-V. The "Features" 30 + * are presented in four groups, each of which is 32 bits. The group A 31 + * and B definitions are common across architectures and are listed here. 32 + * However, not all flags are relevant on all architectures. 33 + * 34 + * Groups C and D vary across architectures and are listed in the 35 + * architecture specific portion of hyperv-tlfs.h. Some of these flags exist 36 + * on multiple architectures, but the bit positions are different so they 37 + * cannot appear in the generic portion of hyperv-tlfs.h. 38 + * 39 + * The "Enlightenments" category provides recommendations on whether to use 40 + * specific enlightenments that are available. The Enlightenments are a single 41 + * group of 32 bits, but they vary across architectures and are listed in 42 + * the architecture specific portion of hyperv-tlfs.h.
43 + */ 44 + 45 + /* 46 + * Group A Features. 47 + */ 48 + 49 + /* VP Runtime register available */ 50 + #define HV_MSR_VP_RUNTIME_AVAILABLE BIT(0) 51 + /* Partition Reference Counter available*/ 52 + #define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) 53 + /* Basic SynIC register available */ 54 + #define HV_MSR_SYNIC_AVAILABLE BIT(2) 55 + /* Synthetic Timer registers available */ 56 + #define HV_MSR_SYNTIMER_AVAILABLE BIT(3) 57 + /* Virtual APIC assist and VP assist page registers available */ 58 + #define HV_MSR_APIC_ACCESS_AVAILABLE BIT(4) 59 + /* Hypercall and Guest OS ID registers available*/ 60 + #define HV_MSR_HYPERCALL_AVAILABLE BIT(5) 61 + /* Access virtual processor index register available*/ 62 + #define HV_MSR_VP_INDEX_AVAILABLE BIT(6) 63 + /* Virtual system reset register available*/ 64 + #define HV_MSR_RESET_AVAILABLE BIT(7) 65 + /* Access statistics page registers available */ 66 + #define HV_MSR_STAT_PAGES_AVAILABLE BIT(8) 67 + /* Partition reference TSC register is available */ 68 + #define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) 69 + /* Partition Guest IDLE register is available */ 70 + #define HV_MSR_GUEST_IDLE_AVAILABLE BIT(10) 71 + /* Partition local APIC and TSC frequency registers available */ 72 + #define HV_ACCESS_FREQUENCY_MSRS BIT(11) 73 + /* AccessReenlightenmentControls privilege */ 74 + #define HV_ACCESS_REENLIGHTENMENT BIT(13) 75 + /* AccessTscInvariantControls privilege */ 76 + #define HV_ACCESS_TSC_INVARIANT BIT(15) 77 + 78 + /* 79 + * Group B features. 80 + */ 81 + #define HV_CREATE_PARTITIONS BIT(0) 82 + #define HV_ACCESS_PARTITION_ID BIT(1) 83 + #define HV_ACCESS_MEMORY_POOL BIT(2) 84 + #define HV_ADJUST_MESSAGE_BUFFERS BIT(3) 85 + #define HV_POST_MESSAGES BIT(4) 86 + #define HV_SIGNAL_EVENTS BIT(5) 87 + #define HV_CREATE_PORT BIT(6) 88 + #define HV_CONNECT_PORT BIT(7) 89 + #define HV_ACCESS_STATS BIT(8) 90 + #define HV_DEBUGGING BIT(11) 91 + #define HV_CPU_POWER_MANAGEMENT BIT(12) 92 + 93 + 94 + /* 95 + * TSC page layout. 
96 + */ 97 + struct ms_hyperv_tsc_page { 98 + volatile u32 tsc_sequence; 99 + u32 reserved1; 100 + volatile u64 tsc_scale; 101 + volatile s64 tsc_offset; 102 + } __packed; 103 + 104 + /* 105 + * The guest OS needs to register the guest ID with the hypervisor. 106 + * The guest ID is a 64 bit entity and the structure of this ID is 107 + * specified in the Hyper-V specification: 108 + * 109 + * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx 110 + * 111 + * While the current guideline does not specify how Linux guest ID(s) 112 + * need to be generated, our plan is to publish the guidelines for 113 + * Linux and other guest operating systems that currently are hosted 114 + * on Hyper-V. The implementation here conforms to these yet 115 + * unpublished guidelines. 116 + * 117 + * 118 + * Bit(s) 119 + * 63 - Indicates if the OS is Open Source or not; 1 is Open Source 120 + * 62:56 - Os Type; Linux is 0x100 121 + * 55:48 - Distro specific identification 122 + * 47:16 - Linux kernel version number 123 + * 15:0 - Distro specific identification 124 + * 125 + * 126 + */ 127 + 128 + #define HV_LINUX_VENDOR_ID 0x8100 129 + 130 + /* 131 + * Crash notification flags. 132 + */ 133 + #define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) 134 + #define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) 135 + 136 + /* Declare the various hypercall operations.
*/ 137 + #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 138 + #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 139 + #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 140 + #define HVCALL_SEND_IPI 0x000b 141 + #define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 142 + #define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 143 + #define HVCALL_SEND_IPI_EX 0x0015 144 + #define HVCALL_GET_VP_REGISTERS 0x0050 145 + #define HVCALL_SET_VP_REGISTERS 0x0051 146 + #define HVCALL_POST_MESSAGE 0x005c 147 + #define HVCALL_SIGNAL_EVENT 0x005d 148 + #define HVCALL_RETARGET_INTERRUPT 0x007e 149 + #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af 150 + #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 151 + 152 + #define HV_FLUSH_ALL_PROCESSORS BIT(0) 153 + #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) 154 + #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) 155 + #define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) 156 + 157 + enum HV_GENERIC_SET_FORMAT { 158 + HV_GENERIC_SET_SPARSE_4K, 159 + HV_GENERIC_SET_ALL, 160 + }; 161 + 162 + #define HV_PARTITION_ID_SELF ((u64)-1) 163 + #define HV_VP_INDEX_SELF ((u32)-2) 164 + 165 + #define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) 166 + #define HV_HYPERCALL_FAST_BIT BIT(16) 167 + #define HV_HYPERCALL_VARHEAD_OFFSET 17 168 + #define HV_HYPERCALL_REP_COMP_OFFSET 32 169 + #define HV_HYPERCALL_REP_COMP_1 BIT_ULL(32) 170 + #define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) 171 + #define HV_HYPERCALL_REP_START_OFFSET 48 172 + #define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) 173 + 174 + /* hypercall status code */ 175 + #define HV_STATUS_SUCCESS 0 176 + #define HV_STATUS_INVALID_HYPERCALL_CODE 2 177 + #define HV_STATUS_INVALID_HYPERCALL_INPUT 3 178 + #define HV_STATUS_INVALID_ALIGNMENT 4 179 + #define HV_STATUS_INVALID_PARAMETER 5 180 + #define HV_STATUS_INSUFFICIENT_MEMORY 11 181 + #define HV_STATUS_INVALID_PORT_ID 17 182 + #define HV_STATUS_INVALID_CONNECTION_ID 18 183 + #define HV_STATUS_INSUFFICIENT_BUFFERS 19 184 + 185 + 
/* 186 + * The Hyper-V TimeRefCount register and the TSC 187 + * page provide a guest VM clock with 100ns tick rate 188 + */ 189 + #define HV_CLOCK_HZ (NSEC_PER_SEC/100) 190 + 191 + /* Define the number of synthetic interrupt sources. */ 192 + #define HV_SYNIC_SINT_COUNT (16) 193 + /* Define the expected SynIC version. */ 194 + #define HV_SYNIC_VERSION_1 (0x1) 195 + /* Valid SynIC vectors are 16-255. */ 196 + #define HV_SYNIC_FIRST_VALID_VECTOR (16) 197 + 198 + #define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) 199 + #define HV_SYNIC_SIMP_ENABLE (1ULL << 0) 200 + #define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) 201 + #define HV_SYNIC_SINT_MASKED (1ULL << 16) 202 + #define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) 203 + #define HV_SYNIC_SINT_VECTOR_MASK (0xFF) 204 + 205 + #define HV_SYNIC_STIMER_COUNT (4) 206 + 207 + /* Define synthetic interrupt controller message constants. */ 208 + #define HV_MESSAGE_SIZE (256) 209 + #define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) 210 + #define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) 211 + 212 + /* Define synthetic interrupt controller message flags. */ 213 + union hv_message_flags { 214 + __u8 asu8; 215 + struct { 216 + __u8 msg_pending:1; 217 + __u8 reserved:7; 218 + } __packed; 219 + }; 220 + 221 + /* Define port identifier type. */ 222 + union hv_port_id { 223 + __u32 asu32; 224 + struct { 225 + __u32 id:24; 226 + __u32 reserved:8; 227 + } __packed u; 228 + }; 229 + 230 + /* Define synthetic interrupt controller message header. */ 231 + struct hv_message_header { 232 + __u32 message_type; 233 + __u8 payload_size; 234 + union hv_message_flags message_flags; 235 + __u8 reserved[2]; 236 + union { 237 + __u64 sender; 238 + union hv_port_id port; 239 + }; 240 + } __packed; 241 + 242 + /* Define synthetic interrupt controller message format. 
*/ 243 + struct hv_message { 244 + struct hv_message_header header; 245 + union { 246 + __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; 247 + } u; 248 + } __packed; 249 + 250 + /* Define the synthetic interrupt message page layout. */ 251 + struct hv_message_page { 252 + struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; 253 + } __packed; 254 + 255 + /* Define timer message payload structure. */ 256 + struct hv_timer_message_payload { 257 + __u32 timer_index; 258 + __u32 reserved; 259 + __u64 expiration_time; /* When the timer expired */ 260 + __u64 delivery_time; /* When the message was delivered */ 261 + } __packed; 262 + 263 + 264 + /* Define synthetic interrupt controller flag constants. */ 265 + #define HV_EVENT_FLAGS_COUNT (256 * 8) 266 + #define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) 267 + 268 + /* 269 + * Synthetic timer configuration. 270 + */ 271 + union hv_stimer_config { 272 + u64 as_uint64; 273 + struct { 274 + u64 enable:1; 275 + u64 periodic:1; 276 + u64 lazy:1; 277 + u64 auto_enable:1; 278 + u64 apic_vector:8; 279 + u64 direct_mode:1; 280 + u64 reserved_z0:3; 281 + u64 sintx:4; 282 + u64 reserved_z1:44; 283 + } __packed; 284 + }; 285 + 286 + 287 + /* Define the synthetic interrupt controller event flags format. */ 288 + union hv_synic_event_flags { 289 + unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; 290 + }; 291 + 292 + /* Define SynIC control register. */ 293 + union hv_synic_scontrol { 294 + u64 as_uint64; 295 + struct { 296 + u64 enable:1; 297 + u64 reserved:63; 298 + } __packed; 299 + }; 300 + 301 + /* Define synthetic interrupt source. 
*/ 302 + union hv_synic_sint { 303 + u64 as_uint64; 304 + struct { 305 + u64 vector:8; 306 + u64 reserved1:8; 307 + u64 masked:1; 308 + u64 auto_eoi:1; 309 + u64 polling:1; 310 + u64 reserved2:45; 311 + } __packed; 312 + }; 313 + 314 + /* Define the format of the SIMP register */ 315 + union hv_synic_simp { 316 + u64 as_uint64; 317 + struct { 318 + u64 simp_enabled:1; 319 + u64 preserved:11; 320 + u64 base_simp_gpa:52; 321 + } __packed; 322 + }; 323 + 324 + /* Define the format of the SIEFP register */ 325 + union hv_synic_siefp { 326 + u64 as_uint64; 327 + struct { 328 + u64 siefp_enabled:1; 329 + u64 preserved:11; 330 + u64 base_siefp_gpa:52; 331 + } __packed; 332 + }; 333 + 334 + struct hv_vpset { 335 + u64 format; 336 + u64 valid_bank_mask; 337 + u64 bank_contents[]; 338 + } __packed; 339 + 340 + /* HvCallSendSyntheticClusterIpi hypercall */ 341 + struct hv_send_ipi { 342 + u32 vector; 343 + u32 reserved; 344 + u64 cpu_mask; 345 + } __packed; 346 + 347 + /* HvCallSendSyntheticClusterIpiEx hypercall */ 348 + struct hv_send_ipi_ex { 349 + u32 vector; 350 + u32 reserved; 351 + struct hv_vpset vp_set; 352 + } __packed; 353 + 354 + /* HvFlushGuestPhysicalAddressSpace hypercalls */ 355 + struct hv_guest_mapping_flush { 356 + u64 address_space; 357 + u64 flags; 358 + } __packed; 359 + 360 + /* 361 + * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited 362 + * by the bitwidth of "additional_pages" in union hv_gpa_page_range. 363 + */ 364 + #define HV_MAX_FLUSH_PAGES (2048) 365 + 366 + /* HvFlushGuestPhysicalAddressList hypercall */ 367 + union hv_gpa_page_range { 368 + u64 address_space; 369 + struct { 370 + u64 additional_pages:11; 371 + u64 largepage:1; 372 + u64 basepfn:52; 373 + } page; 374 + }; 375 + 376 + /* 377 + * All input flush parameters should be in single page. The max flush 378 + * count is equal with how many entries of union hv_gpa_page_range can 379 + * be populated into the input parameter page. 
380 + */ 381 + #define HV_MAX_FLUSH_REP_COUNT ((HV_HYP_PAGE_SIZE - 2 * sizeof(u64)) / \ 382 + sizeof(union hv_gpa_page_range)) 383 + 384 + struct hv_guest_mapping_flush_list { 385 + u64 address_space; 386 + u64 flags; 387 + union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; 388 + }; 389 + 390 + /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ 391 + struct hv_tlb_flush { 392 + u64 address_space; 393 + u64 flags; 394 + u64 processor_mask; 395 + u64 gva_list[]; 396 + } __packed; 397 + 398 + /* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ 399 + struct hv_tlb_flush_ex { 400 + u64 address_space; 401 + u64 flags; 402 + struct hv_vpset hv_vp_set; 403 + u64 gva_list[]; 404 + } __packed; 405 + 406 + /* HvRetargetDeviceInterrupt hypercall */ 407 + union hv_msi_entry { 408 + u64 as_uint64; 409 + struct { 410 + u32 address; 411 + u32 data; 412 + } __packed; 413 + }; 414 + 415 + struct hv_interrupt_entry { 416 + u32 source; /* 1 for MSI(-X) */ 417 + u32 reserved1; 418 + union hv_msi_entry msi_entry; 419 + } __packed; 420 + 421 + /* 422 + * flags for hv_device_interrupt_target.flags 423 + */ 424 + #define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 425 + #define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 426 + 427 + struct hv_device_interrupt_target { 428 + u32 vector; 429 + u32 flags; 430 + union { 431 + u64 vp_mask; 432 + struct hv_vpset vp_set; 433 + }; 434 + } __packed; 435 + 436 + struct hv_retarget_device_interrupt { 437 + u64 partition_id; /* use "self" */ 438 + u64 device_id; 439 + struct hv_interrupt_entry int_entry; 440 + u64 reserved2; 441 + struct hv_device_interrupt_target int_target; 442 + } __packed __aligned(8); 443 + 444 + 445 + /* HvGetVpRegisters hypercall input with variable size reg name list*/ 446 + struct hv_get_vp_registers_input { 447 + struct { 448 + u64 partitionid; 449 + u32 vpindex; 450 + u8 inputvtl; 451 + u8 padding[3]; 452 + } header; 453 + struct input { 454 + u32 name0; 455 + u32 name1; 456 + } 
element[]; 457 + } __packed; 458 + 459 + 460 + /* HvGetVpRegisters returns an array of these output elements */ 461 + struct hv_get_vp_registers_output { 462 + union { 463 + struct { 464 + u32 a; 465 + u32 b; 466 + u32 c; 467 + u32 d; 468 + } as32 __packed; 469 + struct { 470 + u64 low; 471 + u64 high; 472 + } as64 __packed; 473 + }; 474 + }; 475 + 476 + /* HvSetVpRegisters hypercall with variable size reg name/value list*/ 477 + struct hv_set_vp_registers_input { 478 + struct { 479 + u64 partitionid; 480 + u32 vpindex; 481 + u8 inputvtl; 482 + u8 padding[3]; 483 + } header; 484 + struct { 485 + u32 name; 486 + u32 padding1; 487 + u64 padding2; 488 + u64 valuelow; 489 + u64 valuehigh; 490 + } element[]; 491 + } __packed; 492 + 493 + #endif

+31 -39

include/linux/hyperv.h

··· 117 117 * Ring data starts here + RingDataStartOffset 118 118 * !!! DO NOT place any fields below this !!! 119 119 */ 120 - u8 buffer[0]; 120 + u8 buffer[]; 121 121 } __packed; 122 122 123 123 struct hv_ring_buffer_info { ··· 313 313 struct gpa_range { 314 314 u32 byte_count; 315 315 u32 byte_offset; 316 - u64 pfn_array[0]; 316 + u64 pfn_array[]; 317 317 }; 318 318 319 319 /* ··· 425 425 CHANNELMSG_19 = 19, 426 426 CHANNELMSG_20 = 20, 427 427 CHANNELMSG_TL_CONNECT_REQUEST = 21, 428 - CHANNELMSG_22 = 22, 428 + CHANNELMSG_MODIFYCHANNEL = 22, 429 429 CHANNELMSG_TL_CONNECT_RESULT = 23, 430 430 CHANNELMSG_COUNT 431 431 }; ··· 563 563 u32 gpadl; 564 564 u16 range_buflen; 565 565 u16 rangecount; 566 - struct gpa_range range[0]; 566 + struct gpa_range range[]; 567 567 } __packed; 568 568 569 569 /* This is the followup packet that contains more PFNs. */ ··· 571 571 struct vmbus_channel_message_header header; 572 572 u32 msgnumber; 573 573 u32 gpadl; 574 - u64 pfn[0]; 574 + u64 pfn[]; 575 575 } __packed; 576 576 577 577 struct vmbus_channel_gpadl_created { ··· 618 618 struct vmbus_channel_message_header header; 619 619 guid_t guest_endpoint_id; 620 620 guid_t host_service_id; 621 + } __packed; 622 + 623 + /* Modify Channel parameters, cf. vmbus_send_modifychannel() */ 624 + struct vmbus_channel_modifychannel { 625 + struct vmbus_channel_message_header header; 626 + u32 child_relid; 627 + u32 target_vp; 621 628 } __packed; 622 629 623 630 struct vmbus_channel_version_response { ··· 679 672 * The channel message that goes out on the "wire". 
680 673 * It will contain at minimum the VMBUS_CHANNEL_MESSAGE_HEADER header 681 674 */ 682 - unsigned char msg[0]; 675 + unsigned char msg[]; 683 676 }; 684 677 685 678 struct vmbus_close_msg { ··· 694 687 u32 id:24; 695 688 u32 reserved:8; 696 689 } u; 697 - }; 698 - 699 - enum hv_numa_policy { 700 - HV_BALANCED = 0, 701 - HV_LOCALIZED, 702 690 }; 703 691 704 692 enum vmbus_device_type { ··· 773 771 void (*onchannel_callback)(void *context); 774 772 void *channel_callback_context; 775 773 774 + void (*change_target_cpu_callback)(struct vmbus_channel *channel, 775 + u32 old, u32 new); 776 + 777 + /* 778 + * Synchronize channel scheduling and channel removal; see the inline 779 + * comments in vmbus_chan_sched() and vmbus_reset_channel_cb(). 780 + */ 781 + spinlock_t sched_lock; 782 + 776 783 /* 777 784 * A channel can be marked for one of three modes of reading: 778 785 * BATCHED - callback called from tasklet and should read ··· 813 802 u32 target_vp; 814 803 /* The corresponding CPUID in the guest */ 815 804 u32 target_cpu; 816 - /* 817 - * State to manage the CPU affiliation of channels. 818 - */ 819 - struct cpumask alloced_cpus_in_node; 820 805 int numa_node; 821 806 /* 822 807 * Support for sub-channels. For high performance devices, ··· 861 854 * Support per-channel state for use by vmbus drivers. 862 855 */ 863 856 void *per_channel_state; 864 - /* 865 - * To support per-cpu lookup mapping of relid to channel, 866 - * link up channels based on their CPU affinity. 867 - */ 868 - struct list_head percpu_list; 869 857 870 858 /* 871 859 * Defer freeing channel until after all cpu's have ··· 899 897 */ 900 898 bool low_latency; 901 899 902 - /* 903 - * NUMA distribution policy: 904 - * We support two policies: 905 - * 1) Balanced: Here all performance critical channels are 906 - * distributed evenly amongst all the NUMA nodes. 907 - * This policy will be the default policy.
908 - * 2) Localized: All channels of a given instance of a 909 - * performance critical service will be assigned CPUs 910 - * within a selected NUMA node. 911 - */ 912 - enum hv_numa_policy affinity_policy; 913 - 914 900 bool probe_done; 901 + 902 + /* 903 + * Cache the device ID here for easy access; this is useful, in 904 + * particular, in situations where the channel's device_obj has 905 + * not been allocated/initialized yet. 906 + */ 907 + u16 device_id; 915 908 916 909 /* 917 910 * We must offload the handling of the primary/sub channels ··· 961 964 return c->offermsg.offer.sub_channel_index != 0; 962 965 } 963 966 964 - static inline void set_channel_affinity_state(struct vmbus_channel *c, 965 - enum hv_numa_policy policy) 966 - { 967 - c->affinity_policy = policy; 968 - } 969 - 970 967 static inline void set_channel_read_mode(struct vmbus_channel *c, 971 968 enum hv_callback_mode mode) 972 969 { ··· 1008 1017 c->low_latency = false; 1009 1018 } 1010 1019 1011 - void vmbus_onmessage(void *context); 1020 + void vmbus_onmessage(struct vmbus_channel_message_header *hdr); 1012 1021 1013 1022 int vmbus_request_offers(void); 1014 1023 ··· 1522 1531 1523 1532 int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id, 1524 1533 const guid_t *shv_host_servie_id); 1534 + int vmbus_send_modifychannel(u32 child_relid, u32 target_vp); 1525 1535 void vmbus_set_event(struct vmbus_channel *channel); 1526 1536 1527 1537 /* Get the start of the ring buffer. */

+1 -1

include/linux/mod_devicetable.h

··· 434 434 * For Hyper-V devices we use the device guid as the id. 435 435 */ 436 436 struct hv_vmbus_device_id { 437 - uuid_le guid; 437 + guid_t guid; 438 438 kernel_ulong_t driver_data; /* Data private to the driver */ 439 439 }; 440 440