// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <linux/memblock.h>
#include <linux/pgtable.h>
#include <linux/sched/hotplug.h>
#include <asm/apic.h>
#include <asm/barrier.h>
#include <asm/init.h>
#include <asm/intel_pt.h>
#include <asm/nmi.h>
#include <asm/processor.h>
#include <asm/reboot.h>

/* Physical address of the Multiprocessor Wakeup Structure mailbox */
static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;

/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;

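/*
 * Page table root and reset vector address that asm_acpi_mp_play_dead()
 * consumes when a CPU is handed back to the firmware.
 */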
static u64 acpi_mp_pgd __ro_after_init;
static u64 acpi_mp_reset_vector_paddr __ro_after_init;

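/*
 * asm_acpi_mp_play_dead() switches to the identity-mapped page tables built
 * by acpi_mp_setup_reset() and jumps to the firmware reset vector. It does
 * not return.
 */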
static void acpi_mp_stop_this_cpu(void)
{
	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}

static void acpi_mp_play_dead(void)
{
	play_dead_common();
	asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}

static void acpi_mp_cpu_die(unsigned int cpu)
{
	u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
	unsigned long timeout;

	/*
	 * Use the TEST mailbox command to prove that the BIOS has taken
	 * control of the CPU before declaring it dead.
	 *
	 * The BIOS has to clear the 'command' field of the mailbox.
	 */
	acpi_mp_wake_mailbox->apic_id = apicid;
	smp_store_release(&acpi_mp_wake_mailbox->command,
			  ACPI_MP_WAKE_COMMAND_TEST);

	/* Don't wait longer than a second. */
	timeout = USEC_PER_SEC;
	while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
		udelay(1);

	if (!timeout)
		pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
}

/* The argument is required to match the type of x86_mapping_info::alloc_pgt_page */
static void __init *alloc_pgt_page(void *dummy)
{
	return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
}

static void __init free_pgt_page(void *pgt, void *dummy)
{
	memblock_free(pgt, PAGE_SIZE);
}

/*
 * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
 * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
 * to the identity mapping and the function has to be present at the same spot
 * in the virtual address space before and after switching page tables.
 */
static int __init init_transition_pgtable(pgd_t *pgd)
{
	pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
	unsigned long vaddr, paddr;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	vaddr = (unsigned long)asm_acpi_mp_play_dead;
	pgd += pgd_index(vaddr);
	if (!pgd_present(*pgd)) {
		p4d = (p4d_t *)alloc_pgt_page(NULL);
		if (!p4d)
			return -ENOMEM;
		set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
	}
	p4d = p4d_offset(pgd, vaddr);
	if (!p4d_present(*p4d)) {
		pud = (pud_t *)alloc_pgt_page(NULL);
		if (!pud)
			return -ENOMEM;
		set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
	}
	pud = pud_offset(p4d, vaddr);
	if (!pud_present(*pud)) {
		pmd = (pmd_t *)alloc_pgt_page(NULL);
		if (!pmd)
			return -ENOMEM;
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
	}
	pmd = pmd_offset(pud, vaddr);
	if (!pmd_present(*pmd)) {
		pte = (pte_t *)alloc_pgt_page(NULL);
		if (!pte)
			return -ENOMEM;
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
	}
	pte = pte_offset_kernel(pmd, vaddr);

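	/*
	 * Map asm_acpi_mp_play_dead() at its kernel virtual address so that
	 * execution can continue there right after CR3 switches to the
	 * identity-mapped page tables.
	 */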
	paddr = __pa(vaddr);
	set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));

	return 0;
}

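/*
 * Build identity-mapped page tables covering all directly mapped memory plus
 * the page containing the reset vector, then install the MADT-specific CPU
 * offlining callbacks. A dying CPU runs asm_acpi_mp_play_dead() on these
 * page tables to reach the firmware reset vector.
 */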
static int __init acpi_mp_setup_reset(u64 reset_vector)
{
	struct x86_mapping_info info = {
		.alloc_pgt_page = alloc_pgt_page,
		.free_pgt_page = free_pgt_page,
		.page_flag = __PAGE_KERNEL_LARGE_EXEC,
		.kernpg_flag = _KERNPG_TABLE_NOENC,
	};
	pgd_t *pgd;

	pgd = alloc_pgt_page(NULL);
	if (!pgd)
		return -ENOMEM;

	for (int i = 0; i < nr_pfn_mapped; i++) {
		unsigned long mstart, mend;

		mstart = pfn_mapped[i].start << PAGE_SHIFT;
		mend = pfn_mapped[i].end << PAGE_SHIFT;
		if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
			kernel_ident_mapping_free(&info, pgd);
			return -ENOMEM;
		}
	}

	if (kernel_ident_mapping_init(&info, pgd,
				      PAGE_ALIGN_DOWN(reset_vector),
				      PAGE_ALIGN(reset_vector + 1))) {
		kernel_ident_mapping_free(&info, pgd);
		return -ENOMEM;
	}

	if (init_transition_pgtable(pgd)) {
		kernel_ident_mapping_free(&info, pgd);
		return -ENOMEM;
	}

	smp_ops.play_dead = acpi_mp_play_dead;
	smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
	smp_ops.cpu_die = acpi_mp_cpu_die;

	acpi_mp_reset_vector_paddr = reset_vector;
	acpi_mp_pgd = __pa(pgd);

	return 0;
}

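/*
 * Wake up a secondary CPU via the ACPI MADT mailbox: fill in the target APIC
 * ID and the startup vector, issue the WAKEUP command, and wait for the
 * firmware to acknowledge it.
 */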
static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
	if (!acpi_mp_wake_mailbox_paddr) {
		pr_warn_once("No MADT mailbox: cannot bring up secondary CPUs. Booting with kexec?\n");
		return -EOPNOTSUPP;
	}

	/*
	 * Remap mailbox memory only for the first call to acpi_wakeup_cpu().
	 *
	 * Wakeup of secondary CPUs is fully serialized in the core code.
	 * No need to protect acpi_mp_wake_mailbox from concurrent accesses.
	 */
	if (!acpi_mp_wake_mailbox) {
		acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
						sizeof(*acpi_mp_wake_mailbox),
						MEMREMAP_WB);
		if (!acpi_mp_wake_mailbox)
			return -ENOMEM;
	}

	/*
	 * Mailbox memory is shared between the firmware and the OS. Firmware
	 * listens on the mailbox command address and, once it receives the
	 * wakeup command, boots the CPU associated with the given apicid.
	 *
	 * The values of 'apic_id' and 'wakeup_vector' must be visible to the
	 * firmware before the wakeup command is visible. smp_store_release()
	 * ensures ordering and visibility.
	 */
	acpi_mp_wake_mailbox->apic_id = apicid;
	acpi_mp_wake_mailbox->wakeup_vector = start_ip;
	smp_store_release(&acpi_mp_wake_mailbox->command,
			  ACPI_MP_WAKE_COMMAND_WAKEUP);

	/*
	 * Wait for the CPU to wake up.
	 *
	 * The CPU being woken up is essentially in a spin loop waiting to be
	 * woken up. It should not take long for it to wake up and acknowledge
	 * by zeroing out ->command.
	 *
	 * The ACPI specification doesn't provide any guidance on how long the
	 * kernel has to wait for a wakeup acknowledgment. It also doesn't
	 * provide a way to cancel a wakeup request if it takes too long.
	 *
	 * In a TDX environment, the VMM has control over how long it takes to
	 * wake up a secondary CPU. It can postpone scheduling the secondary
	 * vCPU indefinitely. Giving up on the wakeup request and reporting an
	 * error opens a possible attack vector for the VMM: it can wake up a
	 * secondary CPU when the kernel doesn't expect it. Wait until the
	 * wakeup request succeeds.
	 */
	while (READ_ONCE(acpi_mp_wake_mailbox->command))
		cpu_relax();

	return 0;
}

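/*
 * CPU offlining requires V1 of the MADT wakeup structure. When it is not
 * available, pin all CPUs online and hide the mailbox from kexec kernels.
 */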
static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
{
	cpu_hotplug_disable_offlining();

	/*
	 * ACPI MADT doesn't allow a CPU to be offlined after it has been
	 * onlined. This limits kexec: the second kernel won't be able to use
	 * more than one CPU.
	 *
	 * To prevent a kexec kernel from onlining secondary CPUs, invalidate
	 * the mailbox address in the ACPI MADT wakeup structure so that a
	 * kexec kernel cannot use it.
	 *
	 * This is safe as the booting kernel has the mailbox address cached
	 * already and acpi_wakeup_cpu() uses the cached value to bring up the
	 * secondary CPUs.
	 *
	 * Note: This is a Linux-specific convention and not covered by the
	 * ACPI specification.
	 */
	mp_wake->mailbox_address = 0;
}

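/*
 * Parse the MADT Multiprocessor Wakeup entry: record the mailbox address,
 * set up CPU offlining via the reset vector for V1 entries (or disable
 * offlining otherwise), and route secondary CPU bringup through
 * acpi_wakeup_cpu().
 */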
int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
			      const unsigned long end)
{
	struct acpi_madt_multiproc_wakeup *mp_wake;

	mp_wake = (struct acpi_madt_multiproc_wakeup *)header;

	/*
	 * Cannot use the standard BAD_MADT_ENTRY() to sanity check the
	 * @mp_wake entry. sizeof(struct acpi_madt_multiproc_wakeup) can be
	 * larger than the actual size of the MP wakeup entry in the ACPI
	 * table because the 'reset_vector' is only available in the V1 MP
	 * wakeup structure.
	 */
	if (!mp_wake)
		return -EINVAL;
	if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
		return -EINVAL;
	if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
		return -EINVAL;

	acpi_table_print_madt_entry(&header->common);

	acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;

	if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
	    mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
		if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
			pr_warn("Failed to setup MADT reset vector\n");
			acpi_mp_disable_offlining(mp_wake);
		}
	} else {
		/*
		 * CPU offlining requires version 1 of the ACPI MADT wakeup
		 * structure.
		 */
		acpi_mp_disable_offlining(mp_wake);
	}

	apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);

	return 0;
}