// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <linux/kexec.h>
#include <linux/memblock.h>
#include <linux/pgtable.h>
#include <linux/sched/hotplug.h>
#include <asm/apic.h>
#include <asm/barrier.h>
#include <asm/init.h>
#include <asm/intel_pt.h>
#include <asm/nmi.h>
#include <asm/processor.h>
#include <asm/reboot.h>
/* Physical address of the Multiprocessor Wakeup Structure mailbox */
static u64 acpi_mp_wake_mailbox_paddr __ro_after_init;
/* Virtual address of the Multiprocessor Wakeup Structure mailbox */
static struct acpi_madt_multiproc_wakeup_mailbox *acpi_mp_wake_mailbox;
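/*
 * Identity-mapped page table (physical address of its PGD) and reset vector
 * used by asm_acpi_mp_play_dead() when handing an offlined CPU back to the
 * firmware.
 */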
static u64 acpi_mp_pgd __ro_after_init;
static u64 acpi_mp_reset_vector_paddr __ro_after_init;
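/*
 * Both smp_ops.stop_this_cpu() and smp_ops.play_dead() park the CPU in
 * asm_acpi_mp_play_dead(), which switches to the identity-mapped page table
 * and jumps to the firmware reset vector.
 */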
static void acpi_mp_stop_this_cpu(void)
{
asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}
static void acpi_mp_play_dead(void)
{
play_dead_common();
asm_acpi_mp_play_dead(acpi_mp_reset_vector_paddr, acpi_mp_pgd);
}
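/*
 * Called on the control CPU after @cpu has been offlined: use the TEST
 * mailbox command to confirm that the firmware has taken the CPU back.
 */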
static void acpi_mp_cpu_die(unsigned int cpu)
{
u32 apicid = per_cpu(x86_cpu_to_apicid, cpu);
unsigned long timeout;
	/*
	 * Use the TEST mailbox command to prove that the BIOS got control
	 * over the CPU before declaring it dead.
	 *
	 * The BIOS has to clear the 'command' field of the mailbox.
	 */
acpi_mp_wake_mailbox->apic_id = apicid;
smp_store_release(&acpi_mp_wake_mailbox->command,
ACPI_MP_WAKE_COMMAND_TEST);
/* Don't wait longer than a second. */
timeout = USEC_PER_SEC;
while (READ_ONCE(acpi_mp_wake_mailbox->command) && --timeout)
udelay(1);
if (!timeout)
pr_err("Failed to hand over CPU %d to BIOS\n", cpu);
}
/* The argument is required to match the type of x86_mapping_info::alloc_pgt_page. */
static void __init *alloc_pgt_page(void *dummy)
{
return memblock_alloc(PAGE_SIZE, PAGE_SIZE);
}
static void __init free_pgt_page(void *pgt, void *dummy)
{
return memblock_free(pgt, PAGE_SIZE);
}
/*
 * Make sure asm_acpi_mp_play_dead() is present in the identity mapping at
 * the same place as in the kernel page tables. asm_acpi_mp_play_dead() switches
 * to the identity mapping and the function has to be present at the same spot
 * in the virtual address space before and after switching page tables.
 */
static int __init init_transition_pgtable(pgd_t *pgd)
{
pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
unsigned long vaddr, paddr;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
vaddr = (unsigned long)asm_acpi_mp_play_dead;
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
p4d = (p4d_t *)alloc_pgt_page(NULL);
if (!p4d)
return -ENOMEM;
set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
}
p4d = p4d_offset(pgd, vaddr);
if (!p4d_present(*p4d)) {
pud = (pud_t *)alloc_pgt_page(NULL);
if (!pud)
return -ENOMEM;
set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
pud = pud_offset(p4d, vaddr);
if (!pud_present(*pud)) {
pmd = (pmd_t *)alloc_pgt_page(NULL);
if (!pmd)
return -ENOMEM;
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
}
pmd = pmd_offset(pud, vaddr);
if (!pmd_present(*pmd)) {
pte = (pte_t *)alloc_pgt_page(NULL);
if (!pte)
return -ENOMEM;
set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
}
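	/* Map the page that holds asm_acpi_mp_play_dead() executable and unencrypted. */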
pte = pte_offset_kernel(pmd, vaddr);
paddr = __pa(vaddr);
set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
return 0;
}
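/*
 * Build an identity-mapped page table that covers all direct-mapped memory
 * plus the page containing the firmware reset vector, map
 * asm_acpi_mp_play_dead() into it, and install the CPU offlining callbacks
 * that hand CPUs back to the firmware.
 */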
static int __init acpi_mp_setup_reset(u64 reset_vector)
{
struct x86_mapping_info info = {
.alloc_pgt_page = alloc_pgt_page,
.free_pgt_page = free_pgt_page,
.page_flag = __PAGE_KERNEL_LARGE_EXEC,
.kernpg_flag = _KERNPG_TABLE_NOENC,
};
pgd_t *pgd;
pgd = alloc_pgt_page(NULL);
if (!pgd)
return -ENOMEM;
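	/* Identity-map every range the kernel has direct-mapped. */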
for (int i = 0; i < nr_pfn_mapped; i++) {
unsigned long mstart, mend;
mstart = pfn_mapped[i].start << PAGE_SHIFT;
mend = pfn_mapped[i].end << PAGE_SHIFT;
if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
kernel_ident_mapping_free(&info, pgd);
return -ENOMEM;
}
}
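	/* Also identity-map the page that contains the firmware reset vector. */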
if (kernel_ident_mapping_init(&info, pgd,
PAGE_ALIGN_DOWN(reset_vector),
PAGE_ALIGN(reset_vector + 1))) {
kernel_ident_mapping_free(&info, pgd);
return -ENOMEM;
}
if (init_transition_pgtable(pgd)) {
kernel_ident_mapping_free(&info, pgd);
return -ENOMEM;
}
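	/* Point the SMP offline hooks at the MADT wakeup based implementation. */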
smp_ops.play_dead = acpi_mp_play_dead;
smp_ops.stop_this_cpu = acpi_mp_stop_this_cpu;
smp_ops.cpu_die = acpi_mp_cpu_die;
acpi_mp_reset_vector_paddr = reset_vector;
acpi_mp_pgd = __pa(pgd);
return 0;
}
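/*
 * Wake a secondary CPU through the ACPI MADT wakeup mailbox. Installed as the
 * wakeup_secondary_cpu_64 APIC callback.
 */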
static int acpi_wakeup_cpu(u32 apicid, unsigned long start_ip)
{
if (!acpi_mp_wake_mailbox_paddr) {
pr_warn_once("No MADT mailbox: cannot bringup secondary CPUs. Booting with kexec?\n");
return -EOPNOTSUPP;
}
/*
* Remap mailbox memory only for the first call to acpi_wakeup_cpu().
*
* Wakeup of secondary CPUs is fully serialized in the core code.
* No need to protect acpi_mp_wake_mailbox from concurrent accesses.
*/
	if (!acpi_mp_wake_mailbox) {
		acpi_mp_wake_mailbox = memremap(acpi_mp_wake_mailbox_paddr,
						sizeof(*acpi_mp_wake_mailbox),
						MEMREMAP_WB);
		if (!acpi_mp_wake_mailbox)
			return -ENOMEM;
	}
	/*
	 * Mailbox memory is shared between the firmware and OS. The firmware
	 * listens on the mailbox command address, and once it receives the
	 * wakeup command, the CPU associated with the given apicid will be
	 * booted.
	 *
	 * The value of 'apic_id' and 'wakeup_vector' must be visible to the
	 * firmware before the wakeup command is visible. smp_store_release()
	 * ensures ordering and visibility.
	 */
acpi_mp_wake_mailbox->apic_id = apicid;
acpi_mp_wake_mailbox->wakeup_vector = start_ip;
smp_store_release(&acpi_mp_wake_mailbox->command,
ACPI_MP_WAKE_COMMAND_WAKEUP);
	/*
	 * Wait for the CPU to wake up.
	 *
	 * The CPU being woken up is essentially in a spin loop waiting to be
	 * woken up. It should not take long for it to wake up and acknowledge
	 * by zeroing out ->command.
	 *
	 * The ACPI specification doesn't provide any guidance on how long the
	 * kernel has to wait for a wake up acknowledgment. It also doesn't
	 * provide a way to cancel a wake up request if it takes too long.
	 *
	 * In a TDX environment, the VMM has control over how long it takes to
	 * wake up a secondary CPU. It can postpone scheduling the secondary
	 * vCPU indefinitely. Giving up on the wake up request and reporting an
	 * error opens a possible attack vector for the VMM: it can wake up a
	 * secondary CPU when the kernel doesn't expect it. Wait until there is
	 * a positive result of the wake up request.
	 */
while (READ_ONCE(acpi_mp_wake_mailbox->command))
cpu_relax();
return 0;
}
static void acpi_mp_disable_offlining(struct acpi_madt_multiproc_wakeup *mp_wake)
{
cpu_hotplug_disable_offlining();
	/*
	 * ACPI MADT doesn't allow a CPU to be offlined after it has been
	 * onlined. This limits kexec: the second kernel won't be able to use
	 * more than one CPU.
	 *
	 * To prevent a kexec kernel from onlining secondary CPUs, invalidate
	 * the mailbox address in the ACPI MADT wakeup structure, which
	 * prevents the kexec kernel from using it.
	 *
	 * This is safe as the booting kernel has the mailbox address cached
	 * already and acpi_wakeup_cpu() uses the cached value to bring up the
	 * secondary CPUs.
	 *
	 * Note: This is a Linux specific convention and not covered by the
	 * ACPI specification.
	 */
mp_wake->mailbox_address = 0;
}
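/*
 * Parse the ACPI MADT Multiprocessor Wakeup entry and switch secondary CPU
 * bringup to the mailbox based method it describes.
 */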
int __init acpi_parse_mp_wake(union acpi_subtable_headers *header,
const unsigned long end)
{
struct acpi_madt_multiproc_wakeup *mp_wake;
mp_wake = (struct acpi_madt_multiproc_wakeup *)header;
	/*
	 * Cannot use the standard BAD_MADT_ENTRY() to sanity check the @mp_wake
	 * entry. 'sizeof(struct acpi_madt_multiproc_wakeup)' can be larger than
	 * the actual size of the MP wakeup entry in the ACPI table because the
	 * 'reset_vector' is only available in the V1 MP wakeup structure.
	 */
if (!mp_wake)
return -EINVAL;
if (end - (unsigned long)mp_wake < ACPI_MADT_MP_WAKEUP_SIZE_V0)
return -EINVAL;
if (mp_wake->header.length < ACPI_MADT_MP_WAKEUP_SIZE_V0)
return -EINVAL;
acpi_table_print_madt_entry(&header->common);
acpi_mp_wake_mailbox_paddr = mp_wake->mailbox_address;
if (mp_wake->version >= ACPI_MADT_MP_WAKEUP_VERSION_V1 &&
mp_wake->header.length >= ACPI_MADT_MP_WAKEUP_SIZE_V1) {
if (acpi_mp_setup_reset(mp_wake->reset_vector)) {
pr_warn("Failed to setup MADT reset vector\n");
acpi_mp_disable_offlining(mp_wake);
}
} else {
/*
* CPU offlining requires version 1 of the ACPI MADT wakeup
* structure.
*/
acpi_mp_disable_offlining(mp_wake);
}
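	/* Switch the 64-bit secondary CPU wakeup path to the mailbox based method. */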
apic_update_callback(wakeup_secondary_cpu_64, acpi_wakeup_cpu);
return 0;
}