| // SPDX-License-Identifier: GPL-2.0-or-later |
| /* |
| * Copyright 2020-21 IBM Corp. |
| */ |
| |
| #define pr_fmt(fmt) "vas: " fmt |
| |
| #include <linux/module.h> |
| #include <linux/kernel.h> |
| #include <linux/export.h> |
| #include <linux/types.h> |
| #include <linux/delay.h> |
| #include <linux/slab.h> |
| #include <linux/interrupt.h> |
| #include <linux/irqdomain.h> |
| #include <asm/machdep.h> |
| #include <asm/hvcall.h> |
| #include <asm/plpar_wrappers.h> |
| #include <asm/firmware.h> |
| #include <asm/vphn.h> |
| #include <asm/vas.h> |
| #include "vas.h" |
| |
| #define VAS_INVALID_WIN_ADDRESS 0xFFFFFFFFFFFFFFFFul |
| #define VAS_DEFAULT_DOMAIN_ID 0xFFFFFFFFFFFFFFFFul |
| /* The hypervisor allows one credit per window right now */ |
| #define DEF_WIN_CREDS 1 |
| |
| static struct vas_all_caps caps_all; |
| static bool copypaste_feat; |
| static struct hv_vas_cop_feat_caps hv_cop_caps; |
| |
| static struct vas_caps vascaps[VAS_MAX_FEAT_TYPE]; |
| static DEFINE_MUTEX(vas_pseries_mutex); |
| static bool migration_in_progress; |
| |
| static long hcall_return_busy_check(long rc) |
| { |
| /* Check if we are stalled for some time */ |
| if (H_IS_LONG_BUSY(rc)) { |
| unsigned int ms; |
| /* |
| * Allocate, Modify and Deallocate HCALLs returns |
| * H_LONG_BUSY_ORDER_1_MSEC or H_LONG_BUSY_ORDER_10_MSEC |
| * for the long delay. So the sleep time should always |
| * be either 1 or 10msecs, but in case if the HCALL |
| * returns the long delay > 10 msecs, clamp the sleep |
| * time to 10msecs. |
| */ |
| ms = clamp(get_longbusy_msecs(rc), 1, 10); |
| |
| /* |
| * msleep() will often sleep at least 20 msecs even |
| * though the hypervisor suggests that the OS reissue |
| * HCALLs after 1 or 10msecs. Also the delay hint from |
| * the HCALL is just a suggestion. So OK to pause for |
| * less time than the hinted delay. Use usleep_range() |
| * to ensure we don't sleep much longer than actually |
| * needed. |
| */ |
| usleep_range(ms * (USEC_PER_MSEC / 10), ms * USEC_PER_MSEC); |
| rc = H_BUSY; |
| } else if (rc == H_BUSY) { |
| cond_resched(); |
| } |
| |
| return rc; |
| } |
| |
| /* |
| * Allocate VAS window hcall |
| */ |
| static int h_allocate_vas_window(struct pseries_vas_window *win, u64 *domain, |
| u8 wintype, u16 credits) |
| { |
| long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; |
| long rc; |
| |
| do { |
| rc = plpar_hcall9(H_ALLOCATE_VAS_WINDOW, retbuf, wintype, |
| credits, domain[0], domain[1], domain[2], |
| domain[3], domain[4], domain[5]); |
| |
| rc = hcall_return_busy_check(rc); |
| } while (rc == H_BUSY); |
| |
| if (rc == H_SUCCESS) { |
| if (win->win_addr == VAS_INVALID_WIN_ADDRESS) { |
| pr_err("H_ALLOCATE_VAS_WINDOW: COPY/PASTE is not supported\n"); |
| return -ENOTSUPP; |
| } |
| win->vas_win.winid = retbuf[0]; |
| win->win_addr = retbuf[1]; |
| win->complete_irq = retbuf[2]; |
| win->fault_irq = retbuf[3]; |
| return 0; |
| } |
| |
| pr_err("H_ALLOCATE_VAS_WINDOW error: %ld, wintype: %u, credits: %u\n", |
| rc, wintype, credits); |
| |
| return -EIO; |
| } |
| |
| /* |
| * Deallocate VAS window hcall. |
| */ |
| static int h_deallocate_vas_window(u64 winid) |
| { |
| long rc; |
| |
| do { |
| rc = plpar_hcall_norets(H_DEALLOCATE_VAS_WINDOW, winid); |
| |
| rc = hcall_return_busy_check(rc); |
| } while (rc == H_BUSY); |
| |
| if (rc == H_SUCCESS) |
| return 0; |
| |
| pr_err("H_DEALLOCATE_VAS_WINDOW error: %ld, winid: %llu\n", |
| rc, winid); |
| return -EIO; |
| } |
| |
| /* |
| * Modify VAS window. |
| * After the window is opened with allocate window hcall, configure it |
| * with flags and LPAR PID before using. |
| */ |
| static int h_modify_vas_window(struct pseries_vas_window *win) |
| { |
| long rc; |
| |
| /* |
| * AMR value is not supported in Linux VAS implementation. |
| * The hypervisor ignores it if 0 is passed. |
| */ |
| do { |
| rc = plpar_hcall_norets(H_MODIFY_VAS_WINDOW, |
| win->vas_win.winid, win->pid, 0, |
| VAS_MOD_WIN_FLAGS, 0); |
| |
| rc = hcall_return_busy_check(rc); |
| } while (rc == H_BUSY); |
| |
| if (rc == H_SUCCESS) |
| return 0; |
| |
| pr_err("H_MODIFY_VAS_WINDOW error: %ld, winid %u pid %u\n", |
| rc, win->vas_win.winid, win->pid); |
| return -EIO; |
| } |
| |
| /* |
| * This hcall is used to determine the capabilities from the hypervisor. |
| * @hcall: H_QUERY_VAS_CAPABILITIES or H_QUERY_NX_CAPABILITIES |
| * @query_type: If 0 is passed, the hypervisor returns the overall |
| * capabilities which provides all feature(s) that are |
| * available. Then query the hypervisor to get the |
| * corresponding capabilities for the specific feature. |
| * Example: H_QUERY_VAS_CAPABILITIES provides VAS GZIP QoS |
| * and VAS GZIP Default capabilities. |
| * H_QUERY_NX_CAPABILITIES provides NX GZIP |
| * capabilities. |
| * @result: Return buffer to save capabilities. |
| */ |
| int h_query_vas_capabilities(const u64 hcall, u8 query_type, u64 result) |
| { |
| long rc; |
| |
| rc = plpar_hcall_norets(hcall, query_type, result); |
| |
| if (rc == H_SUCCESS) |
| return 0; |
| |
| /* H_FUNCTION means HV does not support VAS so don't print an error */ |
| if (rc != H_FUNCTION) { |
| pr_err("%s error %ld, query_type %u, result buffer 0x%llx\n", |
| (hcall == H_QUERY_VAS_CAPABILITIES) ? |
| "H_QUERY_VAS_CAPABILITIES" : |
| "H_QUERY_NX_CAPABILITIES", |
| rc, query_type, result); |
| } |
| |
| return -EIO; |
| } |
| EXPORT_SYMBOL_GPL(h_query_vas_capabilities); |
| |
| /* |
| * hcall to get fault CRB from the hypervisor. |
| */ |
| static int h_get_nx_fault(u32 winid, u64 buffer) |
| { |
| long rc; |
| |
| rc = plpar_hcall_norets(H_GET_NX_FAULT, winid, buffer); |
| |
| if (rc == H_SUCCESS) |
| return 0; |
| |
| pr_err("H_GET_NX_FAULT error: %ld, winid %u, buffer 0x%llx\n", |
| rc, winid, buffer); |
| return -EIO; |
| |
| } |
| |
| /* |
| * Handle the fault interrupt. |
| * When the fault interrupt is received for each window, query the |
| * hypervisor to get the fault CRB on the specific fault. Then |
| * process the CRB by updating CSB or send signal if the user space |
| * CSB is invalid. |
| * Note: The hypervisor forwards an interrupt for each fault request. |
| * So one fault CRB to process for each H_GET_NX_FAULT hcall. |
| */ |
| static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) |
| { |
| struct pseries_vas_window *txwin = data; |
| struct coprocessor_request_block crb; |
| struct vas_user_win_ref *tsk_ref; |
| int rc; |
| |
| while (atomic_read(&txwin->pending_faults)) { |
| rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb)); |
| if (!rc) { |
| tsk_ref = &txwin->vas_win.task_ref; |
| vas_dump_crb(&crb); |
| vas_update_csb(&crb, tsk_ref); |
| } |
| atomic_dec(&txwin->pending_faults); |
| } |
| |
| return IRQ_HANDLED; |
| } |
| |
| /* |
| * irq_default_primary_handler() can be used only with IRQF_ONESHOT |
| * which disables IRQ before executing the thread handler and enables |
| * it after. But this disabling interrupt sets the VAS IRQ OFF |
| * state in the hypervisor. If the NX generates fault interrupt |
| * during this window, the hypervisor will not deliver this |
| * interrupt to the LPAR. So use VAS specific IRQ handler instead |
| * of calling the default primary handler. |
| */ |
| static irqreturn_t pseries_vas_irq_handler(int irq, void *data) |
| { |
| struct pseries_vas_window *txwin = data; |
| |
| /* |
| * The thread handler will process this interrupt if it is |
| * already running. |
| */ |
| atomic_inc(&txwin->pending_faults); |
| |
| return IRQ_WAKE_THREAD; |
| } |
| |
| /* |
| * Allocate window and setup IRQ mapping. |
| */ |
| static int allocate_setup_window(struct pseries_vas_window *txwin, |
| u64 *domain, u8 wintype) |
| { |
| int rc; |
| |
| rc = h_allocate_vas_window(txwin, domain, wintype, DEF_WIN_CREDS); |
| if (rc) |
| return rc; |
| /* |
| * On PowerVM, the hypervisor setup and forwards the fault |
| * interrupt per window. So the IRQ setup and fault handling |
| * will be done for each open window separately. |
| */ |
| txwin->fault_virq = irq_create_mapping(NULL, txwin->fault_irq); |
| if (!txwin->fault_virq) { |
| pr_err("Failed irq mapping %d\n", txwin->fault_irq); |
| rc = -EINVAL; |
| goto out_win; |
| } |
| |
| txwin->name = kasprintf(GFP_KERNEL, "vas-win-%d", |
| txwin->vas_win.winid); |
| if (!txwin->name) { |
| rc = -ENOMEM; |
| goto out_irq; |
| } |
| |
| rc = request_threaded_irq(txwin->fault_virq, |
| pseries_vas_irq_handler, |
| pseries_vas_fault_thread_fn, 0, |
| txwin->name, txwin); |
| if (rc) { |
| pr_err("VAS-Window[%d]: Request IRQ(%u) failed with %d\n", |
| txwin->vas_win.winid, txwin->fault_virq, rc); |
| goto out_free; |
| } |
| |
| txwin->vas_win.wcreds_max = DEF_WIN_CREDS; |
| |
| return 0; |
| out_free: |
| kfree(txwin->name); |
| out_irq: |
| irq_dispose_mapping(txwin->fault_virq); |
| out_win: |
| h_deallocate_vas_window(txwin->vas_win.winid); |
| return rc; |
| } |
| |
| static inline void free_irq_setup(struct pseries_vas_window *txwin) |
| { |
| free_irq(txwin->fault_virq, txwin); |
| kfree(txwin->name); |
| irq_dispose_mapping(txwin->fault_virq); |
| } |
| |
| static struct vas_window *vas_allocate_window(int vas_id, u64 flags, |
| enum vas_cop_type cop_type) |
| { |
| long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID}; |
| struct vas_cop_feat_caps *cop_feat_caps; |
| struct vas_caps *caps; |
| struct pseries_vas_window *txwin; |
| int rc; |
| |
| txwin = kzalloc(sizeof(*txwin), GFP_KERNEL); |
| if (!txwin) |
| return ERR_PTR(-ENOMEM); |
| |
| /* |
| * A VAS window can have many credits which means that many |
| * requests can be issued simultaneously. But the hypervisor |
| * restricts one credit per window. |
| * The hypervisor introduces 2 different types of credits: |
| * Default credit type (Uses normal priority FIFO): |
| * A limited number of credits are assigned to partitions |
| * based on processor entitlement. But these credits may be |
| * over-committed on a system depends on whether the CPUs |
| * are in shared or dedicated modes - that is, more requests |
| * may be issued across the system than NX can service at |
| * once which can result in paste command failure (RMA_busy). |
| * Then the process has to resend requests or fall-back to |
| * SW compression. |
| * Quality of Service (QoS) credit type (Uses high priority FIFO): |
| * To avoid NX HW contention, the system admins can assign |
| * QoS credits for each LPAR so that this partition is |
| * guaranteed access to NX resources. These credits are |
| * assigned to partitions via the HMC. |
| * Refer PAPR for more information. |
| * |
| * Allocate window with QoS credits if user requested. Otherwise |
| * default credits are used. |
| */ |
| if (flags & VAS_TX_WIN_FLAG_QOS_CREDIT) |
| caps = &vascaps[VAS_GZIP_QOS_FEAT_TYPE]; |
| else |
| caps = &vascaps[VAS_GZIP_DEF_FEAT_TYPE]; |
| |
| cop_feat_caps = &caps->caps; |
| |
| if (atomic_inc_return(&cop_feat_caps->nr_used_credits) > |
| atomic_read(&cop_feat_caps->nr_total_credits)) { |
| pr_err_ratelimited("Credits are not available to allocate window\n"); |
| rc = -EINVAL; |
| goto out; |
| } |
| |
| if (vas_id == -1) { |
| /* |
| * The user space is requesting to allocate a window on |
| * a VAS instance where the process is executing. |
| * On PowerVM, domain values are passed to the hypervisor |
| * to select VAS instance. Useful if the process is |
| * affinity to NUMA node. |
| * The hypervisor selects VAS instance if |
| * VAS_DEFAULT_DOMAIN_ID (-1) is passed for domain values. |
| * The h_allocate_vas_window hcall is defined to take a |
| * domain values as specified by h_home_node_associativity, |
| * So no unpacking needs to be done. |
| */ |
| rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, domain, |
| VPHN_FLAG_VCPU, hard_smp_processor_id()); |
| if (rc != H_SUCCESS) { |
| pr_err("H_HOME_NODE_ASSOCIATIVITY error: %d\n", rc); |
| goto out; |
| } |
| } |
| |
| txwin->pid = mfspr(SPRN_PID); |
| |
| /* |
| * Allocate / Deallocate window hcalls and setup / free IRQs |
| * have to be protected with mutex. |
| * Open VAS window: Allocate window hcall and setup IRQ |
| * Close VAS window: Deallocate window hcall and free IRQ |
| * The hypervisor waits until all NX requests are |
| * completed before closing the window. So expects OS |
| * to handle NX faults, means IRQ can be freed only |
| * after the deallocate window hcall is returned. |
| * So once the window is closed with deallocate hcall before |
| * the IRQ is freed, it can be assigned to new allocate |
| * hcall with the same fault IRQ by the hypervisor. It can |
| * result in setup IRQ fail for the new window since the |
| * same fault IRQ is not freed by the OS before. |
| */ |
| mutex_lock(&vas_pseries_mutex); |
| if (migration_in_progress) { |
| rc = -EBUSY; |
| } else { |
| rc = allocate_setup_window(txwin, (u64 *)&domain[0], |
| cop_feat_caps->win_type); |
| if (!rc) |
| caps->nr_open_wins_progress++; |
| } |
| |
| mutex_unlock(&vas_pseries_mutex); |
| if (rc) |
| goto out; |
| |
| /* |
| * Modify window and it is ready to use. |
| */ |
| rc = h_modify_vas_window(txwin); |
| if (!rc) |
| rc = get_vas_user_win_ref(&txwin->vas_win.task_ref); |
| if (rc) |
| goto out_free; |
| |
| txwin->win_type = cop_feat_caps->win_type; |
| |
| /* |
| * The migration SUSPEND thread sets migration_in_progress and |
| * closes all open windows from the list. But the window is |
| * added to the list after open and modify HCALLs. So possible |
| * that migration_in_progress is set before modify HCALL which |
| * may cause some windows are still open when the hypervisor |
| * initiates the migration. |
| * So checks the migration_in_progress flag again and close all |
| * open windows. |
| * |
| * Possible to lose the acquired credit with DLPAR core |
| * removal after the window is opened. So if there are any |
| * closed windows (means with lost credits), do not give new |
| * window to user space. New windows will be opened only |
| * after the existing windows are reopened when credits are |
| * available. |
| */ |
| mutex_lock(&vas_pseries_mutex); |
| if (!caps->nr_close_wins && !migration_in_progress) { |
| list_add(&txwin->win_list, &caps->list); |
| caps->nr_open_windows++; |
| caps->nr_open_wins_progress--; |
| mutex_unlock(&vas_pseries_mutex); |
| vas_user_win_add_mm_context(&txwin->vas_win.task_ref); |
| return &txwin->vas_win; |
| } |
| mutex_unlock(&vas_pseries_mutex); |
| |
| put_vas_user_win_ref(&txwin->vas_win.task_ref); |
| rc = -EBUSY; |
| pr_err_ratelimited("No credit is available to allocate window\n"); |
| |
| out_free: |
| /* |
| * Window is not operational. Free IRQ before closing |
| * window so that do not have to hold mutex. |
| */ |
| free_irq_setup(txwin); |
| h_deallocate_vas_window(txwin->vas_win.winid); |
| /* |
| * Hold mutex and reduce nr_open_wins_progress counter. |
| */ |
| mutex_lock(&vas_pseries_mutex); |
| caps->nr_open_wins_progress--; |
| mutex_unlock(&vas_pseries_mutex); |
| out: |
| atomic_dec(&cop_feat_caps->nr_used_credits); |
| kfree(txwin); |
| return ERR_PTR(rc); |
| } |
| |
| static u64 vas_paste_address(struct vas_window *vwin) |
| { |
| struct pseries_vas_window *win; |
| |
| win = container_of(vwin, struct pseries_vas_window, vas_win); |
| return win->win_addr; |
| } |
| |
| static int deallocate_free_window(struct pseries_vas_window *win) |
| { |
| int rc = 0; |
| |
| /* |
| * The hypervisor waits for all requests including faults |
| * are processed before closing the window - Means all |
| * credits have to be returned. In the case of fault |
| * request, a credit is returned after OS issues |
| * H_GET_NX_FAULT hcall. |
| * So free IRQ after executing H_DEALLOCATE_VAS_WINDOW |
| * hcall. |
| */ |
| rc = h_deallocate_vas_window(win->vas_win.winid); |
| if (!rc) |
| free_irq_setup(win); |
| |
| return rc; |
| } |
| |
| static int vas_deallocate_window(struct vas_window *vwin) |
| { |
| struct pseries_vas_window *win; |
| struct vas_cop_feat_caps *caps; |
| int rc = 0; |
| |
| if (!vwin) |
| return -EINVAL; |
| |
| win = container_of(vwin, struct pseries_vas_window, vas_win); |
| |
| /* Should not happen */ |
| if (win->win_type >= VAS_MAX_FEAT_TYPE) { |
| pr_err("Window (%u): Invalid window type %u\n", |
| vwin->winid, win->win_type); |
| return -EINVAL; |
| } |
| |
| caps = &vascaps[win->win_type].caps; |
| mutex_lock(&vas_pseries_mutex); |
| /* |
| * VAS window is already closed in the hypervisor when |
| * lost the credit or with migration. So just remove the entry |
| * from the list, remove task references and free vas_window |
| * struct. |
| */ |
| if (!(win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) && |
| !(win->vas_win.status & VAS_WIN_MIGRATE_CLOSE)) { |
| rc = deallocate_free_window(win); |
| if (rc) { |
| mutex_unlock(&vas_pseries_mutex); |
| return rc; |
| } |
| } else |
| vascaps[win->win_type].nr_close_wins--; |
| |
| list_del(&win->win_list); |
| atomic_dec(&caps->nr_used_credits); |
| vascaps[win->win_type].nr_open_windows--; |
| mutex_unlock(&vas_pseries_mutex); |
| |
| mm_context_remove_vas_window(vwin->task_ref.mm); |
| put_vas_user_win_ref(&vwin->task_ref); |
| |
| kfree(win); |
| return 0; |
| } |
| |
| static const struct vas_user_win_ops vops_pseries = { |
| .open_win = vas_allocate_window, /* Open and configure window */ |
| .paste_addr = vas_paste_address, /* To do copy/paste */ |
| .close_win = vas_deallocate_window, /* Close window */ |
| }; |
| |
| /* |
| * Supporting only nx-gzip coprocessor type now, but this API code |
| * extended to other coprocessor types later. |
| */ |
| int vas_register_api_pseries(struct module *mod, enum vas_cop_type cop_type, |
| const char *name) |
| { |
| if (!copypaste_feat) |
| return -ENOTSUPP; |
| |
| return vas_register_coproc_api(mod, cop_type, name, &vops_pseries); |
| } |
| EXPORT_SYMBOL_GPL(vas_register_api_pseries); |
| |
/*
 * Counterpart of vas_register_api_pseries(): unregister the coprocessor
 * API.
 */
void vas_unregister_api_pseries(void)
{
	vas_unregister_coproc_api();
}
EXPORT_SYMBOL_GPL(vas_unregister_api_pseries);
| |
| /* |
| * Get the specific capabilities based on the feature type. |
| * Right now supports GZIP default and GZIP QoS capabilities. |
| */ |
| static int __init get_vas_capabilities(u8 feat, enum vas_cop_feat_type type, |
| struct hv_vas_cop_feat_caps *hv_caps) |
| { |
| struct vas_cop_feat_caps *caps; |
| struct vas_caps *vcaps; |
| int rc = 0; |
| |
| vcaps = &vascaps[type]; |
| memset(vcaps, 0, sizeof(*vcaps)); |
| INIT_LIST_HEAD(&vcaps->list); |
| |
| vcaps->feat = feat; |
| caps = &vcaps->caps; |
| |
| rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, feat, |
| (u64)virt_to_phys(hv_caps)); |
| if (rc) |
| return rc; |
| |
| caps->user_mode = hv_caps->user_mode; |
| if (!(caps->user_mode & VAS_COPY_PASTE_USER_MODE)) { |
| pr_err("User space COPY/PASTE is not supported\n"); |
| return -ENOTSUPP; |
| } |
| |
| caps->descriptor = be64_to_cpu(hv_caps->descriptor); |
| caps->win_type = hv_caps->win_type; |
| if (caps->win_type >= VAS_MAX_FEAT_TYPE) { |
| pr_err("Unsupported window type %u\n", caps->win_type); |
| return -EINVAL; |
| } |
| caps->max_lpar_creds = be16_to_cpu(hv_caps->max_lpar_creds); |
| caps->max_win_creds = be16_to_cpu(hv_caps->max_win_creds); |
| atomic_set(&caps->nr_total_credits, |
| be16_to_cpu(hv_caps->target_lpar_creds)); |
| if (feat == VAS_GZIP_DEF_FEAT) { |
| caps->def_lpar_creds = be16_to_cpu(hv_caps->def_lpar_creds); |
| |
| if (caps->max_win_creds < DEF_WIN_CREDS) { |
| pr_err("Window creds(%u) > max allowed window creds(%u)\n", |
| DEF_WIN_CREDS, caps->max_win_creds); |
| return -EINVAL; |
| } |
| } |
| |
| rc = sysfs_add_vas_caps(caps); |
| if (rc) |
| return rc; |
| |
| copypaste_feat = true; |
| |
| return 0; |
| } |
| |
| /* |
| * VAS windows can be closed due to lost credits when the core is |
| * removed. So reopen them if credits are available due to DLPAR |
| * core add and set the window active status. When NX sees the page |
| * fault on the unmapped paste address, the kernel handles the fault |
| * by setting the remapping to new paste address if the window is |
| * active. |
| */ |
| static int reconfig_open_windows(struct vas_caps *vcaps, int creds, |
| bool migrate) |
| { |
| long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID}; |
| struct vas_cop_feat_caps *caps = &vcaps->caps; |
| struct pseries_vas_window *win = NULL, *tmp; |
| int rc, mv_ents = 0; |
| int flag; |
| |
| /* |
| * Nothing to do if there are no closed windows. |
| */ |
| if (!vcaps->nr_close_wins) |
| return 0; |
| |
| /* |
| * For the core removal, the hypervisor reduces the credits |
| * assigned to the LPAR and the kernel closes VAS windows |
| * in the hypervisor depends on reduced credits. The kernel |
| * uses LIFO (the last windows that are opened will be closed |
| * first) and expects to open in the same order when credits |
| * are available. |
| * For example, 40 windows are closed when the LPAR lost 2 cores |
| * (dedicated). If 1 core is added, this LPAR can have 20 more |
| * credits. It means the kernel can reopen 20 windows. So move |
| * 20 entries in the VAS windows lost and reopen next 20 windows. |
| * For partition migration, reopen all windows that are closed |
| * during resume. |
| */ |
| if ((vcaps->nr_close_wins > creds) && !migrate) |
| mv_ents = vcaps->nr_close_wins - creds; |
| |
| list_for_each_entry_safe(win, tmp, &vcaps->list, win_list) { |
| if (!mv_ents) |
| break; |
| |
| mv_ents--; |
| } |
| |
| /* |
| * Open windows if they are closed only with migration or |
| * DLPAR (lost credit) before. |
| */ |
| if (migrate) |
| flag = VAS_WIN_MIGRATE_CLOSE; |
| else |
| flag = VAS_WIN_NO_CRED_CLOSE; |
| |
| list_for_each_entry_safe_from(win, tmp, &vcaps->list, win_list) { |
| /* |
| * This window is closed with DLPAR and migration events. |
| * So reopen the window with the last event. |
| * The user space is not suspended with the current |
| * migration notifier. So the user space can issue DLPAR |
| * CPU hotplug while migration in progress. In this case |
| * this window will be opened with the last event. |
| */ |
| if ((win->vas_win.status & VAS_WIN_NO_CRED_CLOSE) && |
| (win->vas_win.status & VAS_WIN_MIGRATE_CLOSE)) { |
| win->vas_win.status &= ~flag; |
| continue; |
| } |
| |
| /* |
| * Nothing to do on this window if it is not closed |
| * with this flag |
| */ |
| if (!(win->vas_win.status & flag)) |
| continue; |
| |
| rc = allocate_setup_window(win, (u64 *)&domain[0], |
| caps->win_type); |
| if (rc) |
| return rc; |
| |
| rc = h_modify_vas_window(win); |
| if (rc) |
| goto out; |
| |
| mutex_lock(&win->vas_win.task_ref.mmap_mutex); |
| /* |
| * Set window status to active |
| */ |
| win->vas_win.status &= ~flag; |
| mutex_unlock(&win->vas_win.task_ref.mmap_mutex); |
| win->win_type = caps->win_type; |
| if (!--vcaps->nr_close_wins) |
| break; |
| } |
| |
| return 0; |
| out: |
| /* |
| * Window modify HCALL failed. So close the window to the |
| * hypervisor and return. |
| */ |
| free_irq_setup(win); |
| h_deallocate_vas_window(win->vas_win.winid); |
| return rc; |
| } |
| |
| /* |
| * The hypervisor reduces the available credits if the LPAR lost core. It |
| * means the excessive windows should not be active and the user space |
| * should not be using these windows to send compression requests to NX. |
| * So the kernel closes the excessive windows and unmap the paste address |
| * such that the user space receives paste instruction failure. Then up to |
| * the user space to fall back to SW compression and manage with the |
| * existing windows. |
| */ |
| static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, |
| bool migrate) |
| { |
| struct pseries_vas_window *win, *tmp; |
| struct vas_user_win_ref *task_ref; |
| struct vm_area_struct *vma; |
| int rc = 0, flag; |
| |
| if (migrate) |
| flag = VAS_WIN_MIGRATE_CLOSE; |
| else |
| flag = VAS_WIN_NO_CRED_CLOSE; |
| |
| list_for_each_entry_safe(win, tmp, &vcap->list, win_list) { |
| /* |
| * This window is already closed due to lost credit |
| * or for migration before. Go for next window. |
| * For migration, nothing to do since this window |
| * closed for DLPAR and will be reopened even on |
| * the destination system with other DLPAR operation. |
| */ |
| if ((win->vas_win.status & VAS_WIN_MIGRATE_CLOSE) || |
| (win->vas_win.status & VAS_WIN_NO_CRED_CLOSE)) { |
| win->vas_win.status |= flag; |
| continue; |
| } |
| |
| task_ref = &win->vas_win.task_ref; |
| /* |
| * VAS mmap (coproc_mmap()) and its fault handler |
| * (vas_mmap_fault()) are called after holding mmap lock. |
| * So hold mmap mutex after mmap_lock to avoid deadlock. |
| */ |
| mmap_write_lock(task_ref->mm); |
| mutex_lock(&task_ref->mmap_mutex); |
| vma = task_ref->vma; |
| /* |
| * Number of available credits are reduced, So select |
| * and close windows. |
| */ |
| win->vas_win.status |= flag; |
| |
| /* |
| * vma is set in the original mapping. But this mapping |
| * is done with mmap() after the window is opened with ioctl. |
| * so we may not see the original mapping if the core remove |
| * is done before the original mmap() and after the ioctl. |
| */ |
| if (vma) |
| zap_vma_pages(vma); |
| |
| mutex_unlock(&task_ref->mmap_mutex); |
| mmap_write_unlock(task_ref->mm); |
| /* |
| * Close VAS window in the hypervisor, but do not |
| * free vas_window struct since it may be reused |
| * when the credit is available later (DLPAR with |
| * adding cores). This struct will be used |
| * later when the process issued with close(FD). |
| */ |
| rc = deallocate_free_window(win); |
| /* |
| * This failure is from the hypervisor. |
| * No way to stop migration for these failures. |
| * So ignore error and continue closing other windows. |
| */ |
| if (rc && !migrate) |
| return rc; |
| |
| vcap->nr_close_wins++; |
| |
| /* |
| * For migration, do not depend on lpar_creds in case if |
| * mismatch with the hypervisor value (should not happen). |
| * So close all active windows in the list and will be |
| * reopened windows based on the new lpar_creds on the |
| * destination system during resume. |
| */ |
| if (!migrate && !--excess_creds) |
| break; |
| } |
| |
| return 0; |
| } |
| |
| /* |
| * Get new VAS capabilities when the core add/removal configuration |
| * changes. Reconfig window configurations based on the credits |
| * availability from this new capabilities. |
| */ |
| int vas_reconfig_capabilties(u8 type, int new_nr_creds) |
| { |
| struct vas_cop_feat_caps *caps; |
| int old_nr_creds; |
| struct vas_caps *vcaps; |
| int rc = 0, nr_active_wins; |
| |
| if (type >= VAS_MAX_FEAT_TYPE) { |
| pr_err("Invalid credit type %d\n", type); |
| return -EINVAL; |
| } |
| |
| vcaps = &vascaps[type]; |
| caps = &vcaps->caps; |
| |
| mutex_lock(&vas_pseries_mutex); |
| |
| old_nr_creds = atomic_read(&caps->nr_total_credits); |
| |
| atomic_set(&caps->nr_total_credits, new_nr_creds); |
| /* |
| * The total number of available credits may be decreased or |
| * increased with DLPAR operation. Means some windows have to be |
| * closed / reopened. Hold the vas_pseries_mutex so that the |
| * user space can not open new windows. |
| */ |
| if (old_nr_creds < new_nr_creds) { |
| /* |
| * If the existing target credits is less than the new |
| * target, reopen windows if they are closed due to |
| * the previous DLPAR (core removal). |
| */ |
| rc = reconfig_open_windows(vcaps, new_nr_creds - old_nr_creds, |
| false); |
| } else { |
| /* |
| * # active windows is more than new LPAR available |
| * credits. So close the excessive windows. |
| * On pseries, each window will have 1 credit. |
| */ |
| nr_active_wins = vcaps->nr_open_windows - vcaps->nr_close_wins; |
| if (nr_active_wins > new_nr_creds) |
| rc = reconfig_close_windows(vcaps, |
| nr_active_wins - new_nr_creds, |
| false); |
| } |
| |
| mutex_unlock(&vas_pseries_mutex); |
| return rc; |
| } |
| |
| int pseries_vas_dlpar_cpu(void) |
| { |
| int new_nr_creds, rc; |
| |
| /* |
| * NX-GZIP is not enabled. Nothing to do for DLPAR event |
| */ |
| if (!copypaste_feat) |
| return 0; |
| |
| |
| rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, |
| vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, |
| (u64)virt_to_phys(&hv_cop_caps)); |
| if (!rc) { |
| new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); |
| rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, new_nr_creds); |
| } |
| |
| if (rc) |
| pr_err("Failed reconfig VAS capabilities with DLPAR\n"); |
| |
| return rc; |
| } |
| |
| /* |
| * Total number of default credits available (target_credits) |
| * in LPAR depends on number of cores configured. It varies based on |
| * whether processors are in shared mode or dedicated mode. |
| * Get the notifier when CPU configuration is changed with DLPAR |
| * operation so that get the new target_credits (vas default capabilities) |
| * and then update the existing windows usage if needed. |
| */ |
| static int pseries_vas_notifier(struct notifier_block *nb, |
| unsigned long action, void *data) |
| { |
| struct of_reconfig_data *rd = data; |
| struct device_node *dn = rd->dn; |
| const __be32 *intserv = NULL; |
| int len; |
| |
| /* |
| * For shared CPU partition, the hypervisor assigns total credits |
| * based on entitled core capacity. So updating VAS windows will |
| * be called from lparcfg_write(). |
| */ |
| if (is_shared_processor()) |
| return NOTIFY_OK; |
| |
| if ((action == OF_RECONFIG_ATTACH_NODE) || |
| (action == OF_RECONFIG_DETACH_NODE)) |
| intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", |
| &len); |
| /* |
| * Processor config is not changed |
| */ |
| if (!intserv) |
| return NOTIFY_OK; |
| |
| return pseries_vas_dlpar_cpu(); |
| } |
| |
| static struct notifier_block pseries_vas_nb = { |
| .notifier_call = pseries_vas_notifier, |
| }; |
| |
| /* |
| * For LPM, all windows have to be closed on the source partition |
| * before migration and reopen them on the destination partition |
| * after migration. So closing windows during suspend and |
| * reopen them during resume. |
| */ |
| int vas_migration_handler(int action) |
| { |
| struct vas_cop_feat_caps *caps; |
| int old_nr_creds, new_nr_creds = 0; |
| struct vas_caps *vcaps; |
| int i, rc = 0; |
| |
| pr_info("VAS migration event %d\n", action); |
| |
| /* |
| * NX-GZIP is not enabled. Nothing to do for migration. |
| */ |
| if (!copypaste_feat) |
| return rc; |
| |
| if (action == VAS_SUSPEND) |
| migration_in_progress = true; |
| else |
| migration_in_progress = false; |
| |
| for (i = 0; i < VAS_MAX_FEAT_TYPE; i++) { |
| vcaps = &vascaps[i]; |
| caps = &vcaps->caps; |
| old_nr_creds = atomic_read(&caps->nr_total_credits); |
| |
| rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, |
| vcaps->feat, |
| (u64)virt_to_phys(&hv_cop_caps)); |
| if (!rc) { |
| new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); |
| /* |
| * Should not happen. But incase print messages, close |
| * all windows in the list during suspend and reopen |
| * windows based on new lpar_creds on the destination |
| * system. |
| */ |
| if (old_nr_creds != new_nr_creds) { |
| pr_err("Target credits mismatch with the hypervisor\n"); |
| pr_err("state(%d): lpar creds: %d HV lpar creds: %d\n", |
| action, old_nr_creds, new_nr_creds); |
| pr_err("Used creds: %d, Active creds: %d\n", |
| atomic_read(&caps->nr_used_credits), |
| vcaps->nr_open_windows - vcaps->nr_close_wins); |
| } |
| } else { |
| pr_err("state(%d): Get VAS capabilities failed with %d\n", |
| action, rc); |
| /* |
| * We can not stop migration with the current lpm |
| * implementation. So continue closing all windows in |
| * the list (during suspend) and return without |
| * opening windows (during resume) if VAS capabilities |
| * HCALL failed. |
| */ |
| if (action == VAS_RESUME) |
| goto out; |
| } |
| |
| switch (action) { |
| case VAS_SUSPEND: |
| mutex_lock(&vas_pseries_mutex); |
| rc = reconfig_close_windows(vcaps, vcaps->nr_open_windows, |
| true); |
| /* |
| * Windows are included in the list after successful |
| * open. So wait for closing these in-progress open |
| * windows in vas_allocate_window() which will be |
| * done if the migration_in_progress is set. |
| */ |
| while (vcaps->nr_open_wins_progress) { |
| mutex_unlock(&vas_pseries_mutex); |
| msleep(10); |
| mutex_lock(&vas_pseries_mutex); |
| } |
| mutex_unlock(&vas_pseries_mutex); |
| break; |
| case VAS_RESUME: |
| mutex_lock(&vas_pseries_mutex); |
| atomic_set(&caps->nr_total_credits, new_nr_creds); |
| rc = reconfig_open_windows(vcaps, new_nr_creds, true); |
| mutex_unlock(&vas_pseries_mutex); |
| break; |
| default: |
| /* should not happen */ |
| pr_err("Invalid migration action %d\n", action); |
| rc = -EINVAL; |
| goto out; |
| } |
| |
| /* |
| * Ignore errors during suspend and return for resume. |
| */ |
| if (rc && (action == VAS_RESUME)) |
| goto out; |
| } |
| |
| pr_info("VAS migration event (%d) successful\n", action); |
| |
| out: |
| return rc; |
| } |
| |
| static int __init pseries_vas_init(void) |
| { |
| struct hv_vas_all_caps *hv_caps; |
| int rc = 0; |
| |
| /* |
| * Linux supports user space COPY/PASTE only with Radix |
| */ |
| if (!radix_enabled()) { |
| copypaste_feat = false; |
| pr_err("API is supported only with radix page tables\n"); |
| return -ENOTSUPP; |
| } |
| |
| hv_caps = kmalloc(sizeof(*hv_caps), GFP_KERNEL); |
| if (!hv_caps) |
| return -ENOMEM; |
| /* |
| * Get VAS overall capabilities by passing 0 to feature type. |
| */ |
| rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, 0, |
| (u64)virt_to_phys(hv_caps)); |
| if (rc) |
| goto out; |
| |
| caps_all.descriptor = be64_to_cpu(hv_caps->descriptor); |
| caps_all.feat_type = be64_to_cpu(hv_caps->feat_type); |
| |
| sysfs_pseries_vas_init(&caps_all); |
| |
| /* |
| * QOS capabilities available |
| */ |
| if (caps_all.feat_type & VAS_GZIP_QOS_FEAT_BIT) { |
| rc = get_vas_capabilities(VAS_GZIP_QOS_FEAT, |
| VAS_GZIP_QOS_FEAT_TYPE, &hv_cop_caps); |
| |
| if (rc) |
| goto out; |
| } |
| /* |
| * Default capabilities available |
| */ |
| if (caps_all.feat_type & VAS_GZIP_DEF_FEAT_BIT) |
| rc = get_vas_capabilities(VAS_GZIP_DEF_FEAT, |
| VAS_GZIP_DEF_FEAT_TYPE, &hv_cop_caps); |
| |
| if (!rc && copypaste_feat) { |
| if (firmware_has_feature(FW_FEATURE_LPAR)) |
| of_reconfig_notifier_register(&pseries_vas_nb); |
| |
| pr_info("GZIP feature is available\n"); |
| } else { |
| /* |
| * Should not happen, but only when get default |
| * capabilities HCALL failed. So disable copy paste |
| * feature. |
| */ |
| copypaste_feat = false; |
| } |
| |
| out: |
| kfree(hv_caps); |
| return rc; |
| } |
| machine_device_initcall(pseries, pseries_vas_init); |