// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/gfp.h>
#include <rdma/ib_verbs.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/resource.h>

#include "siw.h"
#include "siw_mem.h"

/*
 * STag lookup is based on its index part only (24 bits).
 * The code avoids the special STag of zero and tries to randomize
 * STag values between 1 and SIW_STAG_MAX_INDEX.
 */
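/*
 * The 24-bit index occupies bits 31..8 of the STag, leaving the low
 * 8 bits for the STag key. E.g. an xarray id of 0x000123 yields the
 * STag base 0x00012300; lookups reverse this by shifting the STag
 * right by 8 bits.
 */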
int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
{
        struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
        u32 id, next;

        get_random_bytes(&next, 4);
        next &= 0x00ffffff;

        if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next,
                            GFP_KERNEL) < 0)
                return -ENOMEM;

        /* Set the STag index part */
        m->stag = id << 8;

        siw_dbg_mem(m, "new MEM object\n");

        return 0;
}

/*
 * siw_mem_id2obj()
 *
 * Resolves memory from the STag index given by @stag_index. Might be
 * called from:
 * o process context before sending out of sgl, or
 * o in softirq when resolving target memory
 */
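/*
 * The object is looked up under rcu_read_lock() and only returned if
 * a reference can still be taken via kref_get_unless_zero(), which
 * guards against racing with a final siw_mem_put().
 */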
struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
{
        struct siw_mem *mem;

        rcu_read_lock();
        mem = xa_load(&sdev->mem_xa, stag_index);
        if (likely(mem && kref_get_unless_zero(&mem->ref))) {
                rcu_read_unlock();
                return mem;
        }
        rcu_read_unlock();

        return NULL;
}

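/*
 * Release the pinned pages of one page chunk. If @dirty is set, pages
 * not yet marked dirty are released via put_user_pages_dirty_lock(),
 * so data placed into them gets written back.
 */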
static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
                           bool dirty)
{
        struct page **p = chunk->plist;

        while (num_pages--) {
                if (!PageDirty(*p) && dirty)
                        put_user_pages_dirty_lock(p, 1);
                else
                        put_user_page(*p);
                p++;
        }
}

void siw_umem_release(struct siw_umem *umem, bool dirty)
{
        struct mm_struct *mm_s = umem->owning_mm;
        int i, num_pages = umem->num_pages;

        for (i = 0; num_pages; i++) {
                int to_free = min_t(int, PAGES_PER_CHUNK, num_pages);

                siw_free_plist(&umem->page_chunk[i], to_free,
                               umem->writable && dirty);
                kfree(umem->page_chunk[i].plist);
                num_pages -= to_free;
        }
        atomic64_sub(umem->num_pages, &mm_s->pinned_vm);

        mmdrop(mm_s);
        kfree(umem->page_chunk);
        kfree(umem);
}

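/*
 * siw_mr_add_mem()
 *
 * Creates a new siw_mem object for a memory region, inserts it into
 * the device's STag lookup table and attaches it to @mr. On success,
 * lkey and rkey of the base MR are set to the new, not yet valid STag.
 */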
int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
                   u64 start, u64 len, int rights)
{
        struct siw_device *sdev = to_siw_dev(pd->device);
        struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
        struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
        u32 id, next;

        if (!mem)
                return -ENOMEM;

        mem->mem_obj = mem_obj;
        mem->stag_valid = 0;
        mem->sdev = sdev;
        mem->va = start;
        mem->len = len;
        mem->pd = pd;
        mem->perms = rights & IWARP_ACCESS_MASK;
        kref_init(&mem->ref);

        mr->mem = mem;

        get_random_bytes(&next, 4);
        next &= 0x00ffffff;

        if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
                            GFP_KERNEL) < 0) {
                kfree(mem);
                return -ENOMEM;
        }
        /* Set the STag index part */
        mem->stag = id << 8;
        mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;

        return 0;
}

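/*
 * siw_mr_drop_mem()
 *
 * Invalidates the STag, removes the memory object from the device
 * lookup table and drops the reference taken at insertion time. The
 * memory barrier makes the cleared stag_valid visible before the
 * object disappears from the table.
 */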
void siw_mr_drop_mem(struct siw_mr *mr)
{
        struct siw_mem *mem = mr->mem, *found;

        mem->stag_valid = 0;

        /* make STag invalid visible asap */
        smp_mb();

        found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
        WARN_ON(found != mem);
        siw_mem_put(mem);
}

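/*
 * siw_free_mem()
 *
 * Final kref release function: frees the memory object and, unless it
 * describes a memory window, releases the attached umem or PBL.
 */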
void siw_free_mem(struct kref *ref)
{
        struct siw_mem *mem = container_of(ref, struct siw_mem, ref);

        siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");

        if (!mem->is_mw && mem->mem_obj) {
                if (mem->is_pbl == 0)
                        siw_umem_release(mem->umem, true);
                else
                        kfree(mem->pbl);
        }
        kfree(mem);
}

/*
 * siw_check_mem()
 *
 * Check protection domain, STag state, access permissions and
 * address range for memory object.
 *
 * @pd: Protection Domain memory should belong to
 * @mem: memory to be checked
 * @addr: starting address of the memory access
 * @perms: requested access permissions
 * @len: length of memory interval to be checked
 */
int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
                  enum ib_access_flags perms, int len)
{
        if (!mem->stag_valid) {
                siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
                return -E_STAG_INVALID;
        }
        if (mem->pd != pd) {
                siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
                return -E_PD_MISMATCH;
        }
        /*
         * check access permissions
         */
        if ((mem->perms & perms) < perms) {
                siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
                           mem->perms, perms);
                return -E_ACCESS_PERM;
        }
        /*
         * Check if access falls into valid memory interval.
         */
        if (addr < mem->va || addr + len > mem->va + mem->len) {
                siw_dbg_pd(pd, "MEM interval len %d\n", len);
                siw_dbg_pd(pd, "[0x%pK, 0x%pK] out of bounds\n",
                           (void *)(uintptr_t)addr,
                           (void *)(uintptr_t)(addr + len));
                siw_dbg_pd(pd, "[0x%pK, 0x%pK] STag=0x%08x\n",
                           (void *)(uintptr_t)mem->va,
                           (void *)(uintptr_t)(mem->va + mem->len),
                           mem->stag);

                return -E_BASE_BOUNDS;
        }
        return E_ACCESS_OK;
}

/*
 * siw_check_sge()
 *
 * Check SGE for access rights in given interval
 *
 * @pd: Protection Domain memory should belong to
 * @sge: SGE to be checked
 * @mem: location of memory reference within array
 * @perms: requested access permissions
 * @off: starting offset in SGE
 * @len: length of memory interval to be checked
 *
 * NOTE: Function references the SGE's memory object (*mem)
 * if not yet done. The new reference is kept if the check went ok and
 * released if the check failed. If *mem is already valid, no new
 * lookup is done and the reference is not released if the check fails.
 */
int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
                  enum ib_access_flags perms, u32 off, int len)
{
        struct siw_device *sdev = to_siw_dev(pd->device);
        struct siw_mem *new = NULL;
        int rv = E_ACCESS_OK;

        if (len + off > sge->length) {
                rv = -E_BASE_BOUNDS;
                goto fail;
        }
        if (*mem == NULL) {
                new = siw_mem_id2obj(sdev, sge->lkey >> 8);
                if (unlikely(!new)) {
                        siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
                        rv = -E_STAG_INVALID;
                        goto fail;
                }
                *mem = new;
        }
        /* Check if user re-registered with different STag key */
        if (unlikely((*mem)->stag != sge->lkey)) {
                siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
                rv = -E_STAG_INVALID;
                goto fail;
        }
        rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
        if (unlikely(rv))
                goto fail;

        return 0;

fail:
        if (new) {
                *mem = NULL;
                siw_mem_put(new);
        }
        return rv;
}

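/*
 * siw_wqe_put_mem()
 *
 * Drop the memory references a work queue element may hold, depending
 * on its opcode. Inline sends and operations which never take memory
 * references are skipped.
 */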
void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
{
        switch (op) {
        case SIW_OP_SEND:
        case SIW_OP_WRITE:
        case SIW_OP_SEND_WITH_IMM:
        case SIW_OP_SEND_REMOTE_INV:
        case SIW_OP_READ:
        case SIW_OP_READ_LOCAL_INV:
                if (!(wqe->sqe.flags & SIW_WQE_INLINE))
                        siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
                break;

        case SIW_OP_RECEIVE:
                siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
                break;

        case SIW_OP_READ_RESPONSE:
                siw_unref_mem_sgl(wqe->mem, 1);
                break;

        default:
                /*
                 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
                 * do not hold memory references
                 */
                break;
        }
}

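/*
 * siw_invalidate_stag()
 *
 * Looks up the memory object for @stag and marks its STag invalid,
 * provided the object belongs to the given protection domain.
 */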
int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
{
        struct siw_device *sdev = to_siw_dev(pd->device);
        struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8);
        int rv = 0;

        if (unlikely(!mem)) {
                siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
                return -EINVAL;
        }
        if (unlikely(mem->pd != pd)) {
                siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
                rv = -EACCES;
                goto out;
        }
        /*
         * Per RDMA verbs definition, an STag may already be in invalid
         * state if invalidation is requested. So no state check here.
         */
        mem->stag_valid = 0;

        siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
out:
        siw_mem_put(mem);
        return rv;
}

/*
 * Gets physical address backed by PBL element. Address is referenced
 * by linear byte offset into list of variably sized PB elements.
 * Optionally, provides remaining len within current element, and
 * current PBL index for later resume at same element.
 */
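/*
 * Example: for a PBL with pbe[0] = { .pbl_off = 0, .size = 0x1000 }
 * and pbe[1] = { .pbl_off = 0x1000, .size = 0x2000 }, an offset of
 * 0x1800 resolves to pbe[1].addr + 0x800, with *len set to 0x1800.
 */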
dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
{
        int i = idx ? *idx : 0;

        while (i < pbl->num_buf) {
                struct siw_pble *pble = &pbl->pbe[i];

                if (pble->pbl_off + pble->size > off) {
                        u64 pble_off = off - pble->pbl_off;

                        if (len)
                                *len = pble->size - pble_off;
                        if (idx)
                                *idx = i;

                        return pble->addr + pble_off;
                }
                i++;
        }
        if (len)
                *len = 0;
        return 0;
}

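/*
 * siw_pbl_alloc()
 *
 * Allocates a physical buffer list with room for @num_buf entries.
 * The allocation size accounts for the siw_pble element embedded in
 * struct siw_pbl, hence num_buf - 1 additional entries.
 */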
struct siw_pbl *siw_pbl_alloc(u32 num_buf)
{
        struct siw_pbl *pbl;
        int buf_size = sizeof(*pbl);

        if (num_buf == 0)
                return ERR_PTR(-EINVAL);

        buf_size += ((num_buf - 1) * sizeof(struct siw_pble));

        pbl = kzalloc(buf_size, GFP_KERNEL);
        if (!pbl)
                return ERR_PTR(-ENOMEM);

        pbl->max_buf = num_buf;

        return pbl;
}

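/*
 * siw_umem_get()
 *
 * Pins the user memory range [start, start + len) and keeps the page
 * pointers in chunks of up to PAGES_PER_CHUNK entries. Pinned pages
 * are charged against the RLIMIT_MEMLOCK limit of the owning mm.
 */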
struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
{
        struct siw_umem *umem;
        struct mm_struct *mm_s;
        u64 first_page_va;
        unsigned long mlock_limit;
        unsigned int foll_flags = FOLL_WRITE;
        int num_pages, num_chunks, i, rv = 0;

        if (!can_do_mlock())
                return ERR_PTR(-EPERM);

        if (!len)
                return ERR_PTR(-EINVAL);

        first_page_va = start & PAGE_MASK;
        num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
        num_chunks = (num_pages >> CHUNK_SHIFT) + 1;

        umem = kzalloc(sizeof(*umem), GFP_KERNEL);
        if (!umem)
                return ERR_PTR(-ENOMEM);

        mm_s = current->mm;
        umem->owning_mm = mm_s;
        umem->writable = writable;

        mmgrab(mm_s);

        if (!writable)
                foll_flags |= FOLL_FORCE;

        down_read(&mm_s->mmap_sem);

        mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
                rv = -ENOMEM;
                goto out_sem_up;
        }
        umem->fp_addr = first_page_va;

        umem->page_chunk =
                kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
        if (!umem->page_chunk) {
                rv = -ENOMEM;
                goto out_sem_up;
        }
        for (i = 0; num_pages; i++) {
                int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK);

                umem->page_chunk[i].plist =
                        kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
                if (!umem->page_chunk[i].plist) {
                        rv = -ENOMEM;
                        goto out_sem_up;
                }
                got = 0;
                while (nents) {
                        struct page **plist = &umem->page_chunk[i].plist[got];

                        rv = get_user_pages(first_page_va, nents,
                                            foll_flags | FOLL_LONGTERM,
                                            plist, NULL);
                        if (rv < 0)
                                goto out_sem_up;

                        umem->num_pages += rv;
                        atomic64_add(rv, &mm_s->pinned_vm);
                        first_page_va += rv * PAGE_SIZE;
                        nents -= rv;
                        got += rv;
                }
                num_pages -= got;
        }
out_sem_up:
        up_read(&mm_s->mmap_sem);

        if (rv > 0)
                return umem;

        siw_umem_release(umem, false);

        return ERR_PTR(rv);
}